LCOV - code coverage report
Current view: top level - net/packet - af_packet.c (source / functions) Hit Total Coverage
Test: landlock.info Lines: 393 2450 16.0 %
Date: 2021-04-22 12:43:58 Functions: 22 106 20.8 %

          Line data    Source code
       1             : // SPDX-License-Identifier: GPL-2.0-or-later
       2             : /*
       3             :  * INET         An implementation of the TCP/IP protocol suite for the LINUX
       4             :  *              operating system.  INET is implemented using the  BSD Socket
       5             :  *              interface as the means of communication with the user level.
       6             :  *
       7             :  *              PACKET - implements raw packet sockets.
       8             :  *
       9             :  * Authors:     Ross Biro
      10             :  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
      11             :  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
      12             :  *
      13             :  * Fixes:
      14             :  *              Alan Cox        :       verify_area() now used correctly
      15             :  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
      16             :  *              Alan Cox        :       tidied skbuff lists.
      17             :  *              Alan Cox        :       Now uses generic datagram routines I
      18             :  *                                      added. Also fixed the peek/read crash
      19             :  *                                      from all old Linux datagram code.
      20             :  *              Alan Cox        :       Uses the improved datagram code.
      21             :  *              Alan Cox        :       Added NULL's for socket options.
      22             :  *              Alan Cox        :       Re-commented the code.
      23             :  *              Alan Cox        :       Use new kernel side addressing
      24             :  *              Rob Janssen     :       Correct MTU usage.
      25             :  *              Dave Platt      :       Counter leaks caused by incorrect
      26             :  *                                      interrupt locking and some slightly
      27             :  *                                      dubious gcc output. Can you read
      28             :  *                                      compiler: it said _VOLATILE_
      29             :  *      Richard Kooijman        :       Timestamp fixes.
      30             :  *              Alan Cox        :       New buffers. Use sk->mac.raw.
      31             :  *              Alan Cox        :       sendmsg/recvmsg support.
      32             :  *              Alan Cox        :       Protocol setting support
      33             :  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
      34             :  *      Cyrus Durgin            :       Fixed kerneld for kmod.
      35             :  *      Michal Ostrowski        :       Module initialization cleanup.
      36             :  *         Ulises Alonso        :       Frame number limit removal and
      37             :  *                                      packet_set_ring memory leak.
      38             :  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
      39             :  *                                      The convention is that longer addresses
      40             :  *                                      will simply extend the hardware address
      41             :  *                                      byte arrays at the end of sockaddr_ll
      42             :  *                                      and packet_mreq.
      43             :  *              Johann Baudy    :       Added TX RING.
      44             :  *              Chetan Loke     :       Implemented TPACKET_V3 block abstraction
      45             :  *                                      layer.
      46             :  *                                      Copyright (C) 2011, <lokec@ccs.neu.edu>
      47             :  */
      48             : 
      49             : #include <linux/ethtool.h>
      50             : #include <linux/types.h>
      51             : #include <linux/mm.h>
      52             : #include <linux/capability.h>
      53             : #include <linux/fcntl.h>
      54             : #include <linux/socket.h>
      55             : #include <linux/in.h>
      56             : #include <linux/inet.h>
      57             : #include <linux/netdevice.h>
      58             : #include <linux/if_packet.h>
      59             : #include <linux/wireless.h>
      60             : #include <linux/kernel.h>
      61             : #include <linux/kmod.h>
      62             : #include <linux/slab.h>
      63             : #include <linux/vmalloc.h>
      64             : #include <net/net_namespace.h>
      65             : #include <net/ip.h>
      66             : #include <net/protocol.h>
      67             : #include <linux/skbuff.h>
      68             : #include <net/sock.h>
      69             : #include <linux/errno.h>
      70             : #include <linux/timer.h>
      71             : #include <linux/uaccess.h>
      72             : #include <asm/ioctls.h>
      73             : #include <asm/page.h>
      74             : #include <asm/cacheflush.h>
      75             : #include <asm/io.h>
      76             : #include <linux/proc_fs.h>
      77             : #include <linux/seq_file.h>
      78             : #include <linux/poll.h>
      79             : #include <linux/module.h>
      80             : #include <linux/init.h>
      81             : #include <linux/mutex.h>
      82             : #include <linux/if_vlan.h>
      83             : #include <linux/virtio_net.h>
      84             : #include <linux/errqueue.h>
      85             : #include <linux/net_tstamp.h>
      86             : #include <linux/percpu.h>
      87             : #ifdef CONFIG_INET
      88             : #include <net/inet_common.h>
      89             : #endif
      90             : #include <linux/bpf.h>
      91             : #include <net/compat.h>
      92             : 
      93             : #include "internal.h"
      94             : 
      95             : /*
      96             :    Assumptions:
      97             :    - If the device has no dev->header_ops->create, there is no LL header
      98             :      visible above the device. In this case, its hard_header_len should be 0.
      99             :      The device may prepend its own header internally. In this case, its
     100             :      needed_headroom should be set to the space needed for it to add its
     101             :      internal header.
     102             :      For example, a WiFi driver pretending to be an Ethernet driver should
     103             :      set its hard_header_len to be the Ethernet header length, and set its
     104             :      needed_headroom to be (the real WiFi header length - the fake Ethernet
     105             :      header length).
     106             :    - packet socket receives packets with pulled ll header,
     107             :      so that SOCK_RAW should push it back.
     108             : 
     109             : On receive:
     110             : -----------
     111             : 
     112             : Incoming, dev_has_header(dev) == true
     113             :    mac_header -> ll header
     114             :    data       -> data
     115             : 
     116             : Outgoing, dev_has_header(dev) == true
     117             :    mac_header -> ll header
     118             :    data       -> ll header
     119             : 
     120             : Incoming, dev_has_header(dev) == false
     121             :    mac_header -> data
     122             :      However drivers often make it point to the ll header.
     123             :      This is incorrect because the ll header should be invisible to us.
     124             :    data       -> data
     125             : 
     126             : Outgoing, dev_has_header(dev) == false
     127             :    mac_header -> data. ll header is invisible to us.
     128             :    data       -> data
     129             : 
     130             : Summary
     131             :   If dev_has_header(dev) == false we are unable to restore the ll header,
     132             :     because it is invisible to us.
     133             : 
     134             : 
     135             : On transmit:
     136             : ------------
     137             : 
     138             : dev_has_header(dev) == true
     139             :    mac_header -> ll header
     140             :    data       -> ll header
     141             : 
     142             : dev_has_header(dev) == false (ll header is invisible to us)
     143             :    mac_header -> data
     144             :    data       -> data
     145             : 
     146             :    We should set network_header on output to the correct position,
     147             :    packet classifier depends on it.
     148             :  */
     149             : 
     150             : /* Private packet socket structures. */
     151             : 
     152             : /* Identical to struct packet_mreq except that mr_address is
     153             :  * sized to MAX_ADDR_LEN, giving it a longer address field.
     154             :  */
     155             : struct packet_mreq_max {
     156             :         int             mr_ifindex;
     157             :         unsigned short  mr_type;
     158             :         unsigned short  mr_alen;
     159             :         unsigned char   mr_address[MAX_ADDR_LEN];       /* address bytes, up to MAX_ADDR_LEN of them */
     160             : };
     161             : /* One frame pointer viewed as a TPACKET_V1 (h1), TPACKET_V2 (h2) or TPACKET_V3 (h3) header, or as untyped memory (raw). */
     162             : union tpacket_uhdr {
     163             :         struct tpacket_hdr  *h1;
     164             :         struct tpacket2_hdr *h2;
     165             :         struct tpacket3_hdr *h3;
     166             :         void *raw;
     167             : };
     168             : /* Forward declaration so packet_set_ring() can be referenced ahead of its definition. */
     169             : static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
     170             :                 int closing, int tx_ring);
     171             : /* Alignment, in bytes, that BLK_HDR_LEN and BLK_PLUS_PRIV round up to. */
     172             : #define V3_ALIGNMENT    (8)
     173             : /* Size of struct tpacket_block_desc rounded up to V3_ALIGNMENT. */
     174             : #define BLK_HDR_LEN     (ALIGN(sizeof(struct tpacket_block_desc), V3_ALIGNMENT))
     175             : /* Aligned block header plus a private area of sz_of_priv bytes, itself rounded up to V3_ALIGNMENT. */
     176             : #define BLK_PLUS_PRIV(sz_of_priv) \
     177             :         (BLK_HDR_LEN + ALIGN((sz_of_priv), V3_ALIGNMENT))
     178             : /* Accessors for the hdr.bh1 fields and offset_to_priv of block descriptor x. */
     179             : #define BLOCK_STATUS(x) ((x)->hdr.bh1.block_status)
     180             : #define BLOCK_NUM_PKTS(x)       ((x)->hdr.bh1.num_pkts)
     181             : #define BLOCK_O2FP(x)           ((x)->hdr.bh1.offset_to_first_pkt)
     182             : #define BLOCK_LEN(x)            ((x)->hdr.bh1.blk_len)
     183             : #define BLOCK_SNUM(x)           ((x)->hdr.bh1.seq_num)
     184             : #define BLOCK_O2PRIV(x) ((x)->offset_to_priv)
     185             : /* Forward declarations for the receive handler, ring helpers and prb_* block helpers that are referenced before their definitions. */
     186             : struct packet_sock;
     187             : static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
     188             :                        struct packet_type *pt, struct net_device *orig_dev);
     189             : 
     190             : static void *packet_previous_frame(struct packet_sock *po,
     191             :                 struct packet_ring_buffer *rb,
     192             :                 int status);
     193             : static void packet_increment_head(struct packet_ring_buffer *buff);
     194             : static int prb_curr_blk_in_use(struct tpacket_block_desc *);
     195             : static void *prb_dispatch_next_block(struct tpacket_kbdq_core *,
     196             :                         struct packet_sock *);
     197             : static void prb_retire_current_block(struct tpacket_kbdq_core *,
     198             :                 struct packet_sock *, unsigned int status);
     199             : static int prb_queue_frozen(struct tpacket_kbdq_core *);
     200             : static void prb_open_block(struct tpacket_kbdq_core *,
     201             :                 struct tpacket_block_desc *);
     202             : static void prb_retire_rx_blk_timer_expired(struct timer_list *);
     203             : static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *);
     204             : static void prb_fill_rxhash(struct tpacket_kbdq_core *, struct tpacket3_hdr *);
     205             : static void prb_clear_rxhash(struct tpacket_kbdq_core *,
     206             :                 struct tpacket3_hdr *);
     207             : static void prb_fill_vlan_info(struct tpacket_kbdq_core *,
     208             :                 struct tpacket3_hdr *);
     209             : static void packet_flush_mclist(struct sock *sk);
     210             : static u16 packet_pick_tx_queue(struct sk_buff *skb);  /* used by packet_direct_xmit() before its definition */
     211             : /* Per-skb private data laid over skb->cb (see PACKET_SKB_CB()): either a sockaddr_pkt, or a sockaddr_ll that shares storage with origlen. */
     212             : struct packet_skb_cb {
     213             :         union {
     214             :                 struct sockaddr_pkt pkt;
     215             :                 union {
     216             :                         /* Trick: alias skb original length with
     217             :                          * ll.sll_family and ll.sll_protocol in order
     218             :                          * to save room.
     219             :                          */
     220             :                         unsigned int origlen;
     221             :                         struct sockaddr_ll ll;
     222             :                 };
     223             :         } sa;
     224             : };
     225             : /* Shorthand for virtio_legacy_is_little_endian(). */
     226             : #define vio_le() virtio_legacy_is_little_endian()
     227             : /* View an skb's control buffer (skb->cb) as struct packet_skb_cb. */
     228             : #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
     229             : /* Accessors for a ring's block-descriptor queue core (prb_bdqc) and for the block descriptors stored in its pkbdq[] buffers. */
     230             : #define GET_PBDQC_FROM_RB(x)    ((struct tpacket_kbdq_core *)(&(x)->prb_bdqc))
     231             : #define GET_PBLOCK_DESC(x, bid) \
     232             :         ((struct tpacket_block_desc *)((x)->pkbdq[(bid)].buffer))
     233             : #define GET_CURR_PBLOCK_DESC_FROM_CORE(x)       \
     234             :         ((struct tpacket_block_desc *)((x)->pkbdq[(x)->kactive_blk_num].buffer))
     235             : #define GET_NEXT_PRB_BLK_NUM(x) \
     236             :         (((x)->kactive_blk_num < ((x)->knum_blocks-1)) ? \
     237             :         ((x)->kactive_blk_num+1) : 0)   /* index after the active block, wrapping to 0 past knum_blocks-1 */
     238             : /* Forward declarations for the fanout helpers called from __register_prot_hook() and __unregister_prot_hook(). */
     239             : static void __fanout_unlink(struct sock *sk, struct packet_sock *po);
     240             : static void __fanout_link(struct sock *sk, struct packet_sock *po);
     241             : /* Transmit skb with dev_direct_xmit() on the TX queue picked by packet_pick_tx_queue(); returns that call's result. */
     242           0 : static int packet_direct_xmit(struct sk_buff *skb)
     243             : {
     244           0 :         return dev_direct_xmit(skb, packet_pick_tx_queue(skb));
     245             : }
     246             : /* Return po->cached_dev with a reference taken by dev_hold(), or NULL when no device is cached. */
     247           2 : static struct net_device *packet_cached_dev_get(struct packet_sock *po)
     248             : {
     249           2 :         struct net_device *dev;
     250             : 
     251           2 :         rcu_read_lock();        /* the pointer is loaded and the reference taken inside one RCU read-side section */
     252           2 :         dev = rcu_dereference(po->cached_dev);
     253           2 :         if (likely(dev))
     254           2 :                 dev_hold(dev);
     255           2 :         rcu_read_unlock();
     256             : 
     257           2 :         return dev;
     258             : }
     259             : /* Publish dev as the socket's cached device using rcu_assign_pointer(). */
     260           1 : static void packet_cached_dev_assign(struct packet_sock *po,
     261             :                                      struct net_device *dev)
     262             : {
     263           2 :         rcu_assign_pointer(po->cached_dev, dev);
     264           1 : }
     265             : /* Set the socket's cached device pointer to NULL with RCU_INIT_POINTER(). */
     266           1 : static void packet_cached_dev_reset(struct packet_sock *po)
     267             : {
     268           1 :         RCU_INIT_POINTER(po->cached_dev, NULL);
     269           0 : }
     270             : /* Report whether the socket's xmit callback is packet_direct_xmit(). */
     271           0 : static bool packet_use_direct_xmit(const struct packet_sock *po)
     272             : {
     273           0 :         return po->xmit == packet_direct_xmit;
     274             : }
     275             : /* Pick the TX queue index for skb on skb->dev: the driver's ndo_select_queue() result capped by netdev_cap_txqueue() if the driver provides that op, else netdev_pick_tx(). */
     276           0 : static u16 packet_pick_tx_queue(struct sk_buff *skb)
     277             : {
     278           0 :         struct net_device *dev = skb->dev;
     279           0 :         const struct net_device_ops *ops = dev->netdev_ops;
     280           0 :         int cpu = raw_smp_processor_id();
     281           0 :         u16 queue_index;
     282             : 
     283             : #ifdef CONFIG_XPS
     284           0 :         skb->sender_cpu = cpu + 1;      /* NOTE(review): the +1 presumably keeps 0 meaning unset - confirm */
     285             : #endif
     286           0 :         skb_record_rx_queue(skb, cpu % dev->real_num_tx_queues);        /* queue hint derived from the current CPU, folded into the real TX queue count */
     287           0 :         if (ops->ndo_select_queue) {
     288           0 :                 queue_index = ops->ndo_select_queue(dev, skb, NULL);
     289           0 :                 queue_index = netdev_cap_txqueue(dev, queue_index);     /* the driver's choice is passed through netdev_cap_txqueue() before use */
     290             :         } else {
     291           0 :                 queue_index = netdev_pick_tx(dev, skb, NULL);
     292             :         }
     293             : 
     294           0 :         return queue_index;
     295             : }
     296             : 
     297             : /* __register_prot_hook must be invoked through register_prot_hook
     298             :  * or from a context in which asynchronous accesses to the packet
     299             :  * socket are not possible (packet_create()).
     300             :  */
     301           2 : static void __register_prot_hook(struct sock *sk)
     302             : {
     303           2 :         struct packet_sock *po = pkt_sk(sk);
     304             : 
     305           2 :         if (!po->running) {     /* nothing to do when the hook is already registered */
     306           2 :                 if (po->fanout)
     307           0 :                         __fanout_link(sk, po);
     308             :                 else
     309           2 :                         dev_add_pack(&po->prot_hook);
     310             : 
     311           2 :                 sock_hold(sk);  /* reference kept while running; released by __sock_put() in __unregister_prot_hook() */
     312           2 :                 po->running = 1;
     313             :         }
     314           2 : }
     315             : /* Assert that po->bind_lock is held, then register the hook via __register_prot_hook(). */
     316           1 : static void register_prot_hook(struct sock *sk)
     317             : {
     318           2 :         lockdep_assert_held_once(&pkt_sk(sk)->bind_lock);
     319           1 :         __register_prot_hook(sk);
     320           1 : }
     321             : 
     322             : /* If the sync parameter is true, we will temporarily drop
     323             :  * the po->bind_lock and do a synchronize_net to make sure no
     324             :  * asynchronous packet processing paths still refer to the elements
     325             :  * of po->prot_hook.  If the sync parameter is false, it is the
     326             :  * caller's responsibility to take care of this.
     327             :  */
     328           1 : static void __unregister_prot_hook(struct sock *sk, bool sync)
     329             : {
     330           1 :         struct packet_sock *po = pkt_sk(sk);
     331             : 
     332           2 :         lockdep_assert_held_once(&po->bind_lock);
     333             : 
     334           1 :         po->running = 0;        /* cleared before the hook is removed */
     335             : 
     336           1 :         if (po->fanout)
     337           0 :                 __fanout_unlink(sk, po);
     338             :         else
     339           1 :                 __dev_remove_pack(&po->prot_hook);
     340             : 
     341           1 :         __sock_put(sk);         /* drops the reference taken by sock_hold() in __register_prot_hook() */
     342             : 
     343           1 :         if (sync) {
     344           1 :                 spin_unlock(&po->bind_lock);    /* bind_lock is released around synchronize_net() ... */
     345           1 :                 synchronize_net();
     346           1 :                 spin_lock(&po->bind_lock);      /* ... and re-taken, so the caller still holds it on return */
     347             :         }
     348           1 : }
     349             : /* Unregister the hook only if the socket is currently running; sync is passed through to __unregister_prot_hook(). */
     350           0 : static void unregister_prot_hook(struct sock *sk, bool sync)
     351             : {
     352           0 :         struct packet_sock *po = pkt_sk(sk);
     353             : 
     354           0 :         if (po->running)
     355           0 :                 __unregister_prot_hook(sk, sync);
     356             : }
     357             : /* Map addr to its struct page: vmalloc_to_page() for vmalloc addresses, virt_to_page() for everything else. */
     358           0 : static inline struct page * __pure pgv_to_page(void *addr)
     359             : {
     360           0 :         if (is_vmalloc_addr(addr))
     361           0 :                 return vmalloc_to_page(addr);
     362           0 :         return virt_to_page(addr);
     363             : }
     364             : /* Store status in tp_status of the frame header matching po->tp_version, flush the dcache page holding that field, then issue smp_wmb(). */
     365           0 : static void __packet_set_status(struct packet_sock *po, void *frame, int status)
     366             : {
     367           0 :         union tpacket_uhdr h;
     368             : 
     369           0 :         h.raw = frame;          /* the switch below reads it back through the header type for the active version */
     370           0 :         switch (po->tp_version) {
     371           0 :         case TPACKET_V1:
     372           0 :                 h.h1->tp_status = status;
     373           0 :                 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
     374             :                 break;
     375           0 :         case TPACKET_V2:
     376           0 :                 h.h2->tp_status = status;
     377           0 :                 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
     378             :                 break;
     379           0 :         case TPACKET_V3:
     380           0 :                 h.h3->tp_status = status;
     381           0 :                 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
     382             :                 break;
     383             :         default:
     384           0 :                 WARN(1, "TPACKET version not supported.\n");
     385           0 :                 BUG();          /* an unknown version is treated as fatal */
     386             :         }
     387             : 
     388           0 :         smp_wmb();              /* write barrier issued after the status store and flush */
     389           0 : }
     390             : /* Issue smp_rmb(), then flush the dcache page and return tp_status from the frame header matching po->tp_version. */
     391           0 : static int __packet_get_status(const struct packet_sock *po, void *frame)
     392             : {
     393           0 :         union tpacket_uhdr h;
     394             : 
     395           0 :         smp_rmb();              /* read barrier issued before the status field is examined */
     396             : 
     397           0 :         h.raw = frame;
     398           0 :         switch (po->tp_version) {
     399           0 :         case TPACKET_V1:
     400           0 :                 flush_dcache_page(pgv_to_page(&h.h1->tp_status));
     401           0 :                 return h.h1->tp_status;
     402           0 :         case TPACKET_V2:
     403           0 :                 flush_dcache_page(pgv_to_page(&h.h2->tp_status));
     404           0 :                 return h.h2->tp_status;
     405           0 :         case TPACKET_V3:
     406           0 :                 flush_dcache_page(pgv_to_page(&h.h3->tp_status));
     407           0 :                 return h.h3->tp_status;
     408             :         default:
     409           0 :                 WARN(1, "TPACKET version not supported.\n");
     410           0 :                 BUG();
     411             :                 return 0;       /* follows BUG(); NOTE(review): presumably only present so every path returns a value - confirm */
     412             :         }
     413             : }
     414             : /* Fill *ts from the skb and return TP_STATUS_TS_RAW_HARDWARE or TP_STATUS_TS_SOFTWARE to say which source was used, or 0 if neither conversion succeeded. */
     415           0 : static __u32 tpacket_get_timestamp(struct sk_buff *skb, struct timespec64 *ts,
     416             :                                    unsigned int flags)
     417             : {
     418           0 :         struct skb_shared_hwtstamps *shhwtstamps = skb_hwtstamps(skb);
     419             : 
     420           0 :         if (shhwtstamps &&
     421           0 :             (flags & SOF_TIMESTAMPING_RAW_HARDWARE) &&  /* the hardware stamp is only tried when the caller asked for it in flags */
     422           0 :             ktime_to_timespec64_cond(shhwtstamps->hwtstamp, ts))
     423           0 :                 return TP_STATUS_TS_RAW_HARDWARE;
     424             : 
     425           0 :         if (ktime_to_timespec64_cond(skb->tstamp, ts))  /* otherwise fall back to skb->tstamp */
     426           0 :                 return TP_STATUS_TS_SOFTWARE;
     427             : 
     428             :         return 0;
     429             : }
     430             : /* Write the skb's timestamp into the frame header for po->tp_version; returns the flag from tpacket_get_timestamp(), or 0 (frame untouched) when no timestamp was obtained. */
     431           0 : static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
     432             :                                     struct sk_buff *skb)
     433             : {
     434           0 :         union tpacket_uhdr h;
     435           0 :         struct timespec64 ts;
     436           0 :         __u32 ts_status;
     437             : 
     438           0 :         if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
     439             :                 return 0;
     440             : 
     441           0 :         h.raw = frame;
     442             :         /*
     443             :          * versions 1 through 3 overflow the timestamps in y2106, since they
     444             :          * all store the seconds in a 32-bit unsigned integer.
     445             :          * If we create a version 4, that should have a 64-bit timestamp,
     446             :          * either 64-bit seconds + 32-bit nanoseconds, or just 64-bit
     447             :          * nanoseconds.
     448             :          */
     449           0 :         switch (po->tp_version) {
     450           0 :         case TPACKET_V1:
     451           0 :                 h.h1->tp_sec = ts.tv_sec;
     452           0 :                 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;     /* V1 stores microseconds; V2 and V3 below store nanoseconds */
     453           0 :                 break;
     454           0 :         case TPACKET_V2:
     455           0 :                 h.h2->tp_sec = ts.tv_sec;
     456           0 :                 h.h2->tp_nsec = ts.tv_nsec;
     457           0 :                 break;
     458           0 :         case TPACKET_V3:
     459           0 :                 h.h3->tp_sec = ts.tv_sec;
     460           0 :                 h.h3->tp_nsec = ts.tv_nsec;
     461           0 :                 break;
     462             :         default:
     463           0 :                 WARN(1, "TPACKET version not supported.\n");
     464           0 :                 BUG();
     465             :         }
     466             : 
     467             :         /* one flush is safe, as both fields always lie on the same cacheline */
     468           0 :         flush_dcache_page(pgv_to_page(&h.h1->tp_sec));  /* the h1 view is used for every version; NOTE(review): assumes tp_sec of each layout lands on the same page - confirm */
     469           0 :         smp_wmb();
     470             : 
     471           0 :         return ts_status;
     472             : }
     473             : /* Return the frame at index position of ring rb if its current status equals status, otherwise NULL. */
     474           0 : static void *packet_lookup_frame(const struct packet_sock *po,
     475             :                                  const struct packet_ring_buffer *rb,
     476             :                                  unsigned int position,
     477             :                                  int status)
     478             : {
     479           0 :         unsigned int pg_vec_pos, frame_offset;
     480           0 :         union tpacket_uhdr h;
     481             : 
     482           0 :         pg_vec_pos = position / rb->frames_per_block;   /* which pg_vec entry holds the frame */
     483           0 :         frame_offset = position % rb->frames_per_block; /* index of the frame inside that entry */
     484             : 
     485           0 :         h.raw = rb->pg_vec[pg_vec_pos].buffer +
     486           0 :                 (frame_offset * rb->frame_size);        /* byte offset of the frame within the entry's buffer */
     487             : 
     488           0 :         if (status != __packet_get_status(po, h.raw))
     489           0 :                 return NULL;
     490             : 
     491             :         return h.raw;
     492             : }
     493             : /* Look up the frame at rb->head and return it only if it has the given status (NULL otherwise). */
     494           0 : static void *packet_current_frame(struct packet_sock *po,
     495             :                 struct packet_ring_buffer *rb,
     496             :                 int status)
     497             : {
     498           0 :         return packet_lookup_frame(po, rb, rb->head, status);
     499             : }
     500             : /* Delete the retire-block timer of pkc with del_timer_sync(). */
     501           0 : static void prb_del_retire_blk_timer(struct tpacket_kbdq_core *pkc)
     502             : {
     503           0 :         del_timer_sync(&pkc->retire_blk_timer);
     504             : }
     505             : /* Set delete_blk_timer on the RX ring's block-queue core under rb_queue->lock, then delete its retire-block timer. */
     506           0 : static void prb_shutdown_retire_blk_timer(struct packet_sock *po,
     507             :                 struct sk_buff_head *rb_queue)
     508             : {
     509           0 :         struct tpacket_kbdq_core *pkc;
     510             : 
     511           0 :         pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
     512             : 
     513           0 :         spin_lock_bh(&rb_queue->lock);
     514           0 :         pkc->delete_blk_timer = 1;      /* NOTE(review): presumably read by the timer callback so it stops re-arming - confirm */
     515           0 :         spin_unlock_bh(&rb_queue->lock);
     516             : 
     517           0 :         prb_del_retire_blk_timer(pkc);  /* called after the lock has been dropped */
     518           0 : }
     519             : /* Initialise the RX ring's retire-block timer with prb_retire_rx_blk_timer_expired() as callback and set its expiry to the current jiffies; the timer is not armed here. */
     520           0 : static void prb_setup_retire_blk_timer(struct packet_sock *po)
     521             : {
     522           0 :         struct tpacket_kbdq_core *pkc;
     523             : 
     524           0 :         pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
     525           0 :         timer_setup(&pkc->retire_blk_timer, prb_retire_rx_blk_timer_expired,
     526             :                     0);
     527           0 :         pkc->retire_blk_timer.expires = jiffies;
     528           0 : }
     529             : /* Derive a block-retire timeout from the bound device's link speed and the block size; returns DEFAULT_PRB_RETIRE_TOV when the device is missing, link settings cannot be read, or the speed is unknown or below SPEED_1000. The caller feeds the result to msecs_to_jiffies(). */
     530           0 : static int prb_calc_retire_blk_tmo(struct packet_sock *po,
     531             :                                 int blk_size_in_bytes)
     532             : {
     533           0 :         struct net_device *dev;
     534           0 :         unsigned int mbits, div;
     535           0 :         struct ethtool_link_ksettings ecmd;
     536           0 :         int err;
     537             : 
     538           0 :         rtnl_lock();            /* the device lookup and link query both run under the RTNL lock */
     539           0 :         dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
     540           0 :         if (unlikely(!dev)) {
     541           0 :                 rtnl_unlock();
     542           0 :                 return DEFAULT_PRB_RETIRE_TOV;
     543             :         }
     544           0 :         err = __ethtool_get_link_ksettings(dev, &ecmd);
     545           0 :         rtnl_unlock();
     546           0 :         if (err)
     547             :                 return DEFAULT_PRB_RETIRE_TOV;
     548             : 
     549             :         /* If the link speed is so slow you don't really
     550             :          * need to worry about perf anyways
     551             :          */
     552           0 :         if (ecmd.base.speed < SPEED_1000 ||
     553             :             ecmd.base.speed == SPEED_UNKNOWN)
     554             :                 return DEFAULT_PRB_RETIRE_TOV;
     555             : 
     556           0 :         div = ecmd.base.speed / 1000;   /* link speed expressed in multiples of 1000 */
     557           0 :         mbits = (blk_size_in_bytes * 8) / (1024 * 1024);        /* block size in Mbit; NOTE(review): the multiply is done in int and could overflow for very large blocks - confirm callers bound it */
     558             : 
     559           0 :         if (div)                /* NOTE(review): div looks always non-zero here if SPEED_1000 is 1000, which would make this test and the next redundant - confirm */
     560           0 :                 mbits /= div;
     561             : 
     562           0 :         if (div)
     563           0 :                 return mbits + 1;
     564             :         return mbits;
     565             : }
     566             : /* Copy the requested feature word from req_u->req3 into the block-queue core p1. */
     567           0 : static void prb_init_ft_ops(struct tpacket_kbdq_core *p1,
     568             :                         union tpacket_req_u *req_u)
     569             : {
     570           0 :         p1->feature_req_word = req_u->req3.tp_feature_req_word;
     571             : }
     572             : 
     573           0 : static void init_prb_bdqc(struct packet_sock *po,
     574             :                         struct packet_ring_buffer *rb,
     575             :                         struct pgv *pg_vec,
     576             :                         union tpacket_req_u *req_u)
     577             : {
     578           0 :         struct tpacket_kbdq_core *p1 = GET_PBDQC_FROM_RB(rb);
     579           0 :         struct tpacket_block_desc *pbd;
     580             : 
     581           0 :         memset(p1, 0x0, sizeof(*p1));
     582             : 
     583           0 :         p1->knxt_seq_num = 1;
     584           0 :         p1->pkbdq = pg_vec;
     585           0 :         pbd = (struct tpacket_block_desc *)pg_vec[0].buffer;
     586           0 :         p1->pkblk_start      = pg_vec[0].buffer;
     587           0 :         p1->kblk_size = req_u->req3.tp_block_size;
     588           0 :         p1->knum_blocks      = req_u->req3.tp_block_nr;
     589           0 :         p1->hdrlen = po->tp_hdrlen;
     590           0 :         p1->version = po->tp_version;
     591           0 :         p1->last_kactive_blk_num = 0;
     592           0 :         po->stats.stats3.tp_freeze_q_cnt = 0;
     593           0 :         if (req_u->req3.tp_retire_blk_tov)
     594           0 :                 p1->retire_blk_tov = req_u->req3.tp_retire_blk_tov;
     595             :         else
     596           0 :                 p1->retire_blk_tov = prb_calc_retire_blk_tmo(po,
     597           0 :                                                 req_u->req3.tp_block_size);
     598           0 :         p1->tov_in_jiffies = msecs_to_jiffies(p1->retire_blk_tov);
     599           0 :         p1->blk_sizeof_priv = req_u->req3.tp_sizeof_priv;
     600           0 :         rwlock_init(&p1->blk_fill_in_prog_lock);
     601             : 
     602           0 :         p1->max_frame_len = p1->kblk_size - BLK_PLUS_PRIV(p1->blk_sizeof_priv);
     603           0 :         prb_init_ft_ops(p1, req_u);
     604           0 :         prb_setup_retire_blk_timer(po);
     605           0 :         prb_open_block(p1, pbd);
     606           0 : }
     607             : 
     608             : /*  Do NOT update the last_blk_num first.
     609             :  *  Assumes sk_buff_head lock is held.
     610             :  */
     611           0 : static void _prb_refresh_rx_retire_blk_timer(struct tpacket_kbdq_core *pkc)
     612             : {
     613           0 :         mod_timer(&pkc->retire_blk_timer,
     614           0 :                         jiffies + pkc->tov_in_jiffies);
     615           0 :         pkc->last_kactive_blk_num = pkc->kactive_blk_num;
     616           0 : }
     617             : 
     618             : /*
     619             :  * Timer logic:
     620             :  * 1) We refresh the timer only when we open a block.
     621             :  *    By doing this we don't waste cycles refreshing the timer
     622             :  *    on a packet-by-packet basis.
     623             :  *
     624             :  * With a 1MB block-size, on a 1Gbps line, it will take
     625             :  * i) ~8 ms to fill a block + ii) memcpy etc.
     626             :  * In this cut we are not accounting for the memcpy time.
     627             :  *
     628             :  * So, if the user sets the 'tmo' to 10ms then the timer
     629             :  * will never fire while the block is still getting filled
     630             :  * (which is what we want). However, the user could choose
     631             :  * to close a block early and that's fine.
     632             :  *
     633             :  * But when the timer does fire, we check whether or not to refresh it.
     634             :  * Since the tmo granularity is in msecs, it is not too expensive
     635             :  * to refresh the timer, lets say every '8' msecs.
     636             :  * Either the user can set the 'tmo' or we can derive it based on
     637             :  * a) line-speed and b) block-size.
     638             :  * prb_calc_retire_blk_tmo() calculates the tmo.
     639             :  *
     640             :  */
     641           0 : static void prb_retire_rx_blk_timer_expired(struct timer_list *t)
     642             : {
     643           0 :         struct packet_sock *po =
     644           0 :                 from_timer(po, t, rx_ring.prb_bdqc.retire_blk_timer);
     645           0 :         struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
     646           0 :         unsigned int frozen;
     647           0 :         struct tpacket_block_desc *pbd;
     648             : 
     649           0 :         spin_lock(&po->sk.sk_receive_queue.lock);
     650             : 
     651           0 :         frozen = prb_queue_frozen(pkc);
     652           0 :         pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
     653             : 
     654           0 :         if (unlikely(pkc->delete_blk_timer))
     655           0 :                 goto out;
     656             : 
     657             :         /* We only need to plug the race when the block is partially filled.
     658             :          * tpacket_rcv:
     659             :          *              lock(); increment BLOCK_NUM_PKTS; unlock()
     660             :          *              copy_bits() is in progress ...
     661             :          *              timer fires on other cpu:
     662             :          *              we can't retire the current block because copy_bits
     663             :          *              is in progress.
     664             :          *
     665             :          */
     666           0 :         if (BLOCK_NUM_PKTS(pbd)) {
     667             :                 /* Waiting for skb_copy_bits to finish... */
     668           0 :                 write_lock(&pkc->blk_fill_in_prog_lock);
     669           0 :                 write_unlock(&pkc->blk_fill_in_prog_lock);
     670             :         }
     671             : 
     672           0 :         if (pkc->last_kactive_blk_num == pkc->kactive_blk_num) {
     673           0 :                 if (!frozen) {
     674           0 :                         if (!BLOCK_NUM_PKTS(pbd)) {
     675             :                                 /* An empty block. Just refresh the timer. */
     676           0 :                                 goto refresh_timer;
     677             :                         }
     678           0 :                         prb_retire_current_block(pkc, po, TP_STATUS_BLK_TMO);
     679           0 :                         if (!prb_dispatch_next_block(pkc, po))
     680           0 :                                 goto refresh_timer;
     681             :                         else
     682           0 :                                 goto out;
     683             :                 } else {
     684             :                         /* Case 1. Queue was frozen because user-space was
     685             :                          *         lagging behind.
     686             :                          */
     687           0 :                         if (prb_curr_blk_in_use(pbd)) {
     688             :                                 /*
     689             :                                  * Ok, user-space is still behind.
     690             :                                  * So just refresh the timer.
     691             :                                  */
     692           0 :                                 goto refresh_timer;
     693             :                         } else {
     694             :                                /* Case 2. Queue was frozen, user-space caught up,
     695             :                                 * now the link went idle && the timer fired.
     696             :                                 * We don't have a block to close. So we open this
     697             :                                 * block and restart the timer.
     698             :                                 * Opening a block thaws the queue and restarts the
     699             :                                 * timer; thawing/timer-refresh is a side effect.
     700             :                                 */
     701           0 :                                 prb_open_block(pkc, pbd);
     702           0 :                                 goto out;
     703             :                         }
     704             :                 }
     705             :         }
     706             : 
     707           0 : refresh_timer:
     708           0 :         _prb_refresh_rx_retire_blk_timer(pkc);
     709             : 
     710           0 : out:
     711           0 :         spin_unlock(&po->sk.sk_receive_queue.lock);
     712           0 : }
     713             : 
     714           0 : static void prb_flush_block(struct tpacket_kbdq_core *pkc1,
     715             :                 struct tpacket_block_desc *pbd1, __u32 status)
     716             : {
     717             :         /* Flush everything minus the block header */
     718             : 
     719             : #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
     720             :         u8 *start, *end;
     721             : 
     722             :         start = (u8 *)pbd1;
     723             : 
     724             :         /* Skip the block header (we know the header WILL fit in 4K) */
     725             :         start += PAGE_SIZE;
     726             : 
     727             :         end = (u8 *)PAGE_ALIGN((unsigned long)pkc1->pkblk_end);
     728             :         for (; start < end; start += PAGE_SIZE)
     729             :                 flush_dcache_page(pgv_to_page(start));
     730             : 
     731             :         smp_wmb();
     732             : #endif
     733             : 
     734             :         /* Now update the block status. */
     735             : 
     736           0 :         BLOCK_STATUS(pbd1) = status;
     737             : 
     738             :         /* Flush the block header */
     739             : 
     740             : #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
     741             :         start = (u8 *)pbd1;
     742             :         flush_dcache_page(pgv_to_page(start));
     743             : 
     744             :         smp_wmb();
     745             : #endif
     746             : }
     747             : 
     748             : /*
     749             :  * Side effect:
     750             :  *
     751             :  * 1) flush the block
     752             :  * 2) Increment active_blk_num
     753             :  *
     754             :  * Note: We DON'T refresh the timer on purpose.
     755             :  *      Because almost always the next block will be opened.
     756             :  */
     757           0 : static void prb_close_block(struct tpacket_kbdq_core *pkc1,
     758             :                 struct tpacket_block_desc *pbd1,
     759             :                 struct packet_sock *po, unsigned int stat)
     760             : {
     761           0 :         __u32 status = TP_STATUS_USER | stat;
     762             : 
     763           0 :         struct tpacket3_hdr *last_pkt;
     764           0 :         struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
     765           0 :         struct sock *sk = &po->sk;
     766             : 
     767           0 :         if (atomic_read(&po->tp_drops))
     768           0 :                 status |= TP_STATUS_LOSING;
     769             : 
     770           0 :         last_pkt = (struct tpacket3_hdr *)pkc1->prev;
     771           0 :         last_pkt->tp_next_offset = 0;
     772             : 
     773             :         /* Get the ts of the last pkt */
     774           0 :         if (BLOCK_NUM_PKTS(pbd1)) {
     775           0 :                 h1->ts_last_pkt.ts_sec = last_pkt->tp_sec;
     776           0 :                 h1->ts_last_pkt.ts_nsec      = last_pkt->tp_nsec;
     777             :         } else {
     778             :                 /* Ok, we tmo'd - so get the current time.
     779             :                  *
     780             :                  * It shouldn't really happen as we don't close empty
     781             :                  * blocks. See prb_retire_rx_blk_timer_expired().
     782             :                  */
     783           0 :                 struct timespec64 ts;
     784           0 :                 ktime_get_real_ts64(&ts);
     785           0 :                 h1->ts_last_pkt.ts_sec = ts.tv_sec;
     786           0 :                 h1->ts_last_pkt.ts_nsec      = ts.tv_nsec;
     787             :         }
     788             : 
     789           0 :         smp_wmb();
     790             : 
     791             :         /* Flush the block */
     792           0 :         prb_flush_block(pkc1, pbd1, status);
     793             : 
     794           0 :         sk->sk_data_ready(sk);
     795             : 
     796           0 :         pkc1->kactive_blk_num = GET_NEXT_PRB_BLK_NUM(pkc1);
     797           0 : }
     798             : 
     799           0 : static void prb_thaw_queue(struct tpacket_kbdq_core *pkc)
     800             : {
     801           0 :         pkc->reset_pending_on_curr_blk = 0;
     802             : }
     803             : 
     804             : /*
     805             :  * Side effect of opening a block:
     806             :  *
     807             :  * 1) prb_queue is thawed.
     808             :  * 2) retire_blk_timer is refreshed.
     809             :  *
     810             :  */
     811           0 : static void prb_open_block(struct tpacket_kbdq_core *pkc1,
     812             :         struct tpacket_block_desc *pbd1)
     813             : {
     814           0 :         struct timespec64 ts;
     815           0 :         struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
     816             : 
     817           0 :         smp_rmb();
     818             : 
     819             :         /* We could have just memset this but we will lose the
     820             :          * flexibility of making the priv area sticky
     821             :          */
     822             : 
     823           0 :         BLOCK_SNUM(pbd1) = pkc1->knxt_seq_num++;
     824           0 :         BLOCK_NUM_PKTS(pbd1) = 0;
     825           0 :         BLOCK_LEN(pbd1) = BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
     826             : 
     827           0 :         ktime_get_real_ts64(&ts);
     828             : 
     829           0 :         h1->ts_first_pkt.ts_sec = ts.tv_sec;
     830           0 :         h1->ts_first_pkt.ts_nsec = ts.tv_nsec;
     831             : 
     832           0 :         pkc1->pkblk_start = (char *)pbd1;
     833           0 :         pkc1->nxt_offset = pkc1->pkblk_start + BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
     834             : 
     835           0 :         BLOCK_O2FP(pbd1) = (__u32)BLK_PLUS_PRIV(pkc1->blk_sizeof_priv);
     836           0 :         BLOCK_O2PRIV(pbd1) = BLK_HDR_LEN;
     837             : 
     838           0 :         pbd1->version = pkc1->version;
     839           0 :         pkc1->prev = pkc1->nxt_offset;
     840           0 :         pkc1->pkblk_end = pkc1->pkblk_start + pkc1->kblk_size;
     841             : 
     842           0 :         prb_thaw_queue(pkc1);
     843           0 :         _prb_refresh_rx_retire_blk_timer(pkc1);
     844             : 
     845           0 :         smp_wmb();
     846           0 : }
     847             : 
     848             : /*
     849             :  * Queue freeze logic:
     850             :  * 1) Assume tp_block_nr = 8 blocks.
     851             :  * 2) At time 't0', user opens Rx ring.
     852             :  * 3) Some time past 't0', kernel starts filling blocks starting from 0 .. 7
     853             :  * 4) user-space is either sleeping or processing block '0'.
     854             :  * 5) tpacket_rcv is currently filling block '7', since there is no space left,
     855             :  *    it will close block-7, loop around and try to fill block '0'.
     856             :  *    call-flow:
     857             :  *    __packet_lookup_frame_in_block
     858             :  *      prb_retire_current_block()
     859             :  *      prb_dispatch_next_block()
     860             :  *        |->(BLOCK_STATUS == USER) evaluates to true
     861             :  *    5.1) Since block-0 is currently in-use, we just freeze the queue.
     862             :  * 6) Now there are two cases:
     863             :  *    6.1) Link goes idle right after the queue is frozen.
     864             :  *         But remember, the last open_block() refreshed the timer.
     865             :  *         When this timer expires, it will refresh itself so that we can
     866             :  *         re-open block-0 in near future.
     867             :  *    6.2) Link is busy and keeps on receiving packets. This is a simple
     868             :  *         case and __packet_lookup_frame_in_block will check if block-0
     869             :  *         is free and can now be re-used.
     870             :  */
     871           0 : static void prb_freeze_queue(struct tpacket_kbdq_core *pkc,
     872             :                                   struct packet_sock *po)
     873             : {
     874           0 :         pkc->reset_pending_on_curr_blk = 1;
     875           0 :         po->stats.stats3.tp_freeze_q_cnt++;
     876             : }
     877             : 
     878             : #define TOTAL_PKT_LEN_INCL_ALIGN(length) (ALIGN((length), V3_ALIGNMENT))
     879             : 
     880             : /*
     881             :  * If the next block is free then we will dispatch it
     882             :  * and return a good offset.
     883             :  * Else, we will freeze the queue.
     884             :  * So, caller must check the return value.
     885             :  */
     886           0 : static void *prb_dispatch_next_block(struct tpacket_kbdq_core *pkc,
     887             :                 struct packet_sock *po)
     888             : {
     889           0 :         struct tpacket_block_desc *pbd;
     890             : 
     891           0 :         smp_rmb();
     892             : 
     893             :         /* 1. Get current block num */
     894           0 :         pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
     895             : 
     896             :         /* 2. If this block is currently in_use then freeze the queue */
     897           0 :         if (TP_STATUS_USER & BLOCK_STATUS(pbd)) {
     898           0 :                 prb_freeze_queue(pkc, po);
     899           0 :                 return NULL;
     900             :         }
     901             : 
     902             :         /*
     903             :          * 3.
     904             :          * open this block and return the offset where the first packet
     905             :          * needs to get stored.
     906             :          */
     907           0 :         prb_open_block(pkc, pbd);
     908           0 :         return (void *)pkc->nxt_offset;
     909             : }
     910             : 
     911           0 : static void prb_retire_current_block(struct tpacket_kbdq_core *pkc,
     912             :                 struct packet_sock *po, unsigned int status)
     913             : {
     914           0 :         struct tpacket_block_desc *pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
     915             : 
     916             :         /* retire/close the current block */
     917           0 :         if (likely(TP_STATUS_KERNEL == BLOCK_STATUS(pbd))) {
     918             :                 /*
     919             :                  * Plug the case where copy_bits() is in progress on
     920             :                  * cpu-0 and tpacket_rcv() got invoked on cpu-1, didn't
     921             :                  * have space to copy the pkt in the current block and
     922             :                  * called prb_retire_current_block()
     923             :                  *
     924             :                  * We don't need to worry about the TMO case because
     925             :                  * the timer-handler already handled this case.
     926             :                  */
     927           0 :                 if (!(status & TP_STATUS_BLK_TMO)) {
     928             :                         /* Waiting for skb_copy_bits to finish... */
     929           0 :                         write_lock(&pkc->blk_fill_in_prog_lock);
     930           0 :                         write_unlock(&pkc->blk_fill_in_prog_lock);
     931             :                 }
     932           0 :                 prb_close_block(pkc, pbd, po, status);
     933           0 :                 return;
     934             :         }
     935             : }
     936             : 
     937           0 : static int prb_curr_blk_in_use(struct tpacket_block_desc *pbd)
     938             : {
     939           0 :         return TP_STATUS_USER & BLOCK_STATUS(pbd);
     940             : }
     941             : 
     942           0 : static int prb_queue_frozen(struct tpacket_kbdq_core *pkc)
     943             : {
     944           0 :         return pkc->reset_pending_on_curr_blk;
     945             : }
     946             : 
     947           0 : static void prb_clear_blk_fill_status(struct packet_ring_buffer *rb)
     948             :         __releases(&pkc->blk_fill_in_prog_lock)
     949             : {
     950           0 :         struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
     951             : 
     952           0 :         read_unlock(&pkc->blk_fill_in_prog_lock);
     953           0 : }
     954             : 
     955           0 : static void prb_fill_rxhash(struct tpacket_kbdq_core *pkc,
     956             :                         struct tpacket3_hdr *ppd)
     957             : {
     958           0 :         ppd->hv1.tp_rxhash = skb_get_hash(pkc->skb);
     959           0 : }
     960             : 
     961           0 : static void prb_clear_rxhash(struct tpacket_kbdq_core *pkc,
     962             :                         struct tpacket3_hdr *ppd)
     963             : {
     964           0 :         ppd->hv1.tp_rxhash = 0;
     965           0 : }
     966             : 
     967           0 : static void prb_fill_vlan_info(struct tpacket_kbdq_core *pkc,
     968             :                         struct tpacket3_hdr *ppd)
     969             : {
     970           0 :         if (skb_vlan_tag_present(pkc->skb)) {
     971           0 :                 ppd->hv1.tp_vlan_tci = skb_vlan_tag_get(pkc->skb);
     972           0 :                 ppd->hv1.tp_vlan_tpid = ntohs(pkc->skb->vlan_proto);
     973           0 :                 ppd->tp_status = TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
     974             :         } else {
     975           0 :                 ppd->hv1.tp_vlan_tci = 0;
     976           0 :                 ppd->hv1.tp_vlan_tpid = 0;
     977           0 :                 ppd->tp_status = TP_STATUS_AVAILABLE;
     978             :         }
     979           0 : }
     980             : 
     981           0 : static void prb_run_all_ft_ops(struct tpacket_kbdq_core *pkc,
     982             :                         struct tpacket3_hdr *ppd)
     983             : {
     984           0 :         ppd->hv1.tp_padding = 0;
     985           0 :         prb_fill_vlan_info(pkc, ppd);
     986             : 
     987           0 :         if (pkc->feature_req_word & TP_FT_REQ_FILL_RXHASH)
     988           0 :                 prb_fill_rxhash(pkc, ppd);
     989             :         else
     990           0 :                 prb_clear_rxhash(pkc, ppd);
     991           0 : }
     992             : 
     993           0 : static void prb_fill_curr_block(char *curr,
     994             :                                 struct tpacket_kbdq_core *pkc,
     995             :                                 struct tpacket_block_desc *pbd,
     996             :                                 unsigned int len)
     997             :         __acquires(&pkc->blk_fill_in_prog_lock)
     998             : {
     999           0 :         struct tpacket3_hdr *ppd;
    1000             : 
    1001           0 :         ppd  = (struct tpacket3_hdr *)curr;
    1002           0 :         ppd->tp_next_offset = TOTAL_PKT_LEN_INCL_ALIGN(len);
    1003           0 :         pkc->prev = curr;
    1004           0 :         pkc->nxt_offset += TOTAL_PKT_LEN_INCL_ALIGN(len);
    1005           0 :         BLOCK_LEN(pbd) += TOTAL_PKT_LEN_INCL_ALIGN(len);
    1006           0 :         BLOCK_NUM_PKTS(pbd) += 1;
    1007           0 :         read_lock(&pkc->blk_fill_in_prog_lock);
    1008           0 :         prb_run_all_ft_ops(pkc, ppd);
    1009           0 : }
    1010             : 
    1011             : /* Assumes caller holds sk->sk_receive_queue.lock */
    1012           0 : static void *__packet_lookup_frame_in_block(struct packet_sock *po,
    1013             :                                             struct sk_buff *skb,
    1014             :                                             unsigned int len
    1015             :                                             )
    1016             : {
    1017           0 :         struct tpacket_kbdq_core *pkc;
    1018           0 :         struct tpacket_block_desc *pbd;
    1019           0 :         char *curr, *end;
    1020             : 
    1021           0 :         pkc = GET_PBDQC_FROM_RB(&po->rx_ring);
    1022           0 :         pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
    1023             : 
    1024             :         /* Queue is frozen when user space is lagging behind */
    1025           0 :         if (prb_queue_frozen(pkc)) {
    1026             :                 /*
    1027             :                  * Check if that last block which caused the queue to freeze,
    1028             :                  * is still in_use by user-space.
    1029             :                  */
    1030           0 :                 if (prb_curr_blk_in_use(pbd)) {
    1031             :                         /* Can't record this packet */
    1032             :                         return NULL;
    1033             :                 } else {
    1034             :                         /*
    1035             :                          * Ok, the block was released by user-space.
    1036             :                          * Now let's open that block.
    1037             :                          * opening a block also thaws the queue.
    1038             :                          * Thawing is a side effect.
    1039             :                          */
    1040           0 :                         prb_open_block(pkc, pbd);
    1041             :                 }
    1042             :         }
    1043             : 
    1044           0 :         smp_mb();
    1045           0 :         curr = pkc->nxt_offset;
    1046           0 :         pkc->skb = skb;
    1047           0 :         end = (char *)pbd + pkc->kblk_size;
    1048             : 
    1049             :         /* first try the current block */
    1050           0 :         if (curr+TOTAL_PKT_LEN_INCL_ALIGN(len) < end) {
    1051           0 :                 prb_fill_curr_block(curr, pkc, pbd, len);
    1052           0 :                 return (void *)curr;
    1053             :         }
    1054             : 
    1055             :         /* Ok, close the current block */
    1056           0 :         prb_retire_current_block(pkc, po, 0);
    1057             : 
    1058             :         /* Now, try to dispatch the next block */
    1059           0 :         curr = (char *)prb_dispatch_next_block(pkc, po);
    1060           0 :         if (curr) {
    1061           0 :                 pbd = GET_CURR_PBLOCK_DESC_FROM_CORE(pkc);
    1062           0 :                 prb_fill_curr_block(curr, pkc, pbd, len);
    1063           0 :                 return (void *)curr;
    1064             :         }
    1065             : 
    1066             :         /*
    1067             :          * No free blocks are available. User-space hasn't caught up yet.
    1068             :          * Queue was just frozen and now this packet will get dropped.
    1069             :          */
    1070             :         return NULL;
    1071             : }
    1072             : 
    1073           0 : static void *packet_current_rx_frame(struct packet_sock *po,
    1074             :                                             struct sk_buff *skb,
    1075             :                                             int status, unsigned int len)
    1076             : {
    1077           0 :         char *curr = NULL;
    1078           0 :         switch (po->tp_version) {
    1079           0 :         case TPACKET_V1:
    1080             :         case TPACKET_V2:
    1081           0 :                 curr = packet_lookup_frame(po, &po->rx_ring,
    1082             :                                         po->rx_ring.head, status);
    1083           0 :                 return curr;
    1084           0 :         case TPACKET_V3:
    1085           0 :                 return __packet_lookup_frame_in_block(po, skb, len);
    1086             :         default:
    1087           0 :                 WARN(1, "TPACKET version not supported\n");
    1088           0 :                 BUG();
    1089             :                 return NULL;
    1090             :         }
    1091             : }
    1092             : 
    1093           0 : static void *prb_lookup_block(const struct packet_sock *po,
    1094             :                               const struct packet_ring_buffer *rb,
    1095             :                               unsigned int idx,
    1096             :                               int status)
    1097             : {
    1098           0 :         struct tpacket_kbdq_core *pkc  = GET_PBDQC_FROM_RB(rb);
    1099           0 :         struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
    1100             : 
    1101           0 :         if (status != BLOCK_STATUS(pbd))
    1102           0 :                 return NULL;
    1103             :         return pbd;
    1104             : }
    1105             : 
    1106           0 : static int prb_previous_blk_num(struct packet_ring_buffer *rb)
    1107             : {
    1108           0 :         unsigned int prev;
    1109           0 :         if (rb->prb_bdqc.kactive_blk_num)
    1110           0 :                 prev = rb->prb_bdqc.kactive_blk_num-1;
    1111             :         else
    1112           0 :                 prev = rb->prb_bdqc.knum_blocks-1;
    1113           0 :         return prev;
    1114             : }
    1115             : 
    1116             : /* Assumes caller holds the sk_receive_queue lock */
    1117           0 : static void *__prb_previous_block(struct packet_sock *po,
    1118             :                                          struct packet_ring_buffer *rb,
    1119             :                                          int status)
    1120             : {
    1121           0 :         unsigned int previous = prb_previous_blk_num(rb);
    1122           0 :         return prb_lookup_block(po, rb, previous, status);
    1123             : }
    1124             : 
    1125           0 : static void *packet_previous_rx_frame(struct packet_sock *po,
    1126             :                                              struct packet_ring_buffer *rb,
    1127             :                                              int status)
    1128             : {
    1129           0 :         if (po->tp_version <= TPACKET_V2)
    1130           0 :                 return packet_previous_frame(po, rb, status);
    1131             : 
    1132           0 :         return __prb_previous_block(po, rb, status);
    1133             : }
    1134             : 
    1135           0 : static void packet_increment_rx_head(struct packet_sock *po,
    1136             :                                             struct packet_ring_buffer *rb)
    1137             : {
    1138           0 :         switch (po->tp_version) {
    1139             :         case TPACKET_V1:
    1140             :         case TPACKET_V2:
    1141           0 :                 return packet_increment_head(rb);
    1142             :         case TPACKET_V3:
    1143             :         default:
    1144           0 :                 WARN(1, "TPACKET version not supported.\n");
    1145           0 :                 BUG();
    1146           0 :                 return;
    1147             :         }
    1148             : }
    1149             : 
    1150           0 : static void *packet_previous_frame(struct packet_sock *po,
    1151             :                 struct packet_ring_buffer *rb,
    1152             :                 int status)
    1153             : {
    1154           0 :         unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
    1155           0 :         return packet_lookup_frame(po, rb, previous, status);
    1156             : }
    1157             : 
    1158           0 : static void packet_increment_head(struct packet_ring_buffer *buff)
    1159             : {
    1160           0 :         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
    1161             : }
    1162             : 
    1163           0 : static void packet_inc_pending(struct packet_ring_buffer *rb)
    1164             : {
    1165           0 :         this_cpu_inc(*rb->pending_refcnt);
    1166             : }
    1167             : 
    1168           0 : static void packet_dec_pending(struct packet_ring_buffer *rb)
    1169             : {
    1170           0 :         this_cpu_dec(*rb->pending_refcnt);
    1171             : }
    1172             : 
    1173           0 : static unsigned int packet_read_pending(const struct packet_ring_buffer *rb)
    1174             : {
    1175           0 :         unsigned int refcnt = 0;
    1176           0 :         int cpu;
    1177             : 
    1178             :         /* We don't use pending refcount in rx_ring. */
    1179           0 :         if (rb->pending_refcnt == NULL)
    1180             :                 return 0;
    1181             : 
    1182           0 :         for_each_possible_cpu(cpu)
    1183           0 :                 refcnt += *per_cpu_ptr(rb->pending_refcnt, cpu);
    1184             : 
    1185             :         return refcnt;
    1186             : }
    1187             : 
    1188           1 : static int packet_alloc_pending(struct packet_sock *po)
    1189             : {
    1190           1 :         po->rx_ring.pending_refcnt = NULL;
    1191             : 
    1192           1 :         po->tx_ring.pending_refcnt = alloc_percpu(unsigned int);
    1193           1 :         if (unlikely(po->tx_ring.pending_refcnt == NULL))
    1194           0 :                 return -ENOBUFS;
    1195             : 
    1196             :         return 0;
    1197             : }
    1198             : 
    1199           0 : static void packet_free_pending(struct packet_sock *po)
    1200             : {
    1201           0 :         free_percpu(po->tx_ring.pending_refcnt);
    1202             : }
    1203             : 
/*
 * ROOM_POW_OFF is used as a right-shift amount (capacity >> 2, i.e. one
 * quarter) when deciding whether a receiver still has "normal" room.
 */
#define ROOM_POW_OFF	2
/* Room levels reported by __packet_rcv_has_room(). */
#define ROOM_NONE	0x0
#define ROOM_LOW	0x1
#define ROOM_NORMAL	0x2
    1208             : 
    1209           0 : static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
    1210             : {
    1211           0 :         int idx, len;
    1212             : 
    1213           0 :         len = READ_ONCE(po->rx_ring.frame_max) + 1;
    1214           0 :         idx = READ_ONCE(po->rx_ring.head);
    1215           0 :         if (pow_off)
    1216           0 :                 idx += len >> pow_off;
    1217           0 :         if (idx >= len)
    1218           0 :                 idx -= len;
    1219           0 :         return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
    1220             : }
    1221             : 
    1222           0 : static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
    1223             : {
    1224           0 :         int idx, len;
    1225             : 
    1226           0 :         len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
    1227           0 :         idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
    1228           0 :         if (pow_off)
    1229           0 :                 idx += len >> pow_off;
    1230           0 :         if (idx >= len)
    1231           0 :                 idx -= len;
    1232           0 :         return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
    1233             : }
    1234             : 
    1235           0 : static int __packet_rcv_has_room(const struct packet_sock *po,
    1236             :                                  const struct sk_buff *skb)
    1237             : {
    1238           0 :         const struct sock *sk = &po->sk;
    1239           0 :         int ret = ROOM_NONE;
    1240             : 
    1241           0 :         if (po->prot_hook.func != tpacket_rcv) {
    1242           0 :                 int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
    1243           0 :                 int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
    1244           0 :                                    - (skb ? skb->truesize : 0);
    1245             : 
    1246           0 :                 if (avail > (rcvbuf >> ROOM_POW_OFF))
    1247             :                         return ROOM_NORMAL;
    1248           0 :                 else if (avail > 0)
    1249             :                         return ROOM_LOW;
    1250             :                 else
    1251           0 :                         return ROOM_NONE;
    1252             :         }
    1253             : 
    1254           0 :         if (po->tp_version == TPACKET_V3) {
    1255           0 :                 if (__tpacket_v3_has_room(po, ROOM_POW_OFF))
    1256             :                         ret = ROOM_NORMAL;
    1257           0 :                 else if (__tpacket_v3_has_room(po, 0))
    1258           0 :                         ret = ROOM_LOW;
    1259             :         } else {
    1260           0 :                 if (__tpacket_has_room(po, ROOM_POW_OFF))
    1261             :                         ret = ROOM_NORMAL;
    1262           0 :                 else if (__tpacket_has_room(po, 0))
    1263           0 :                         ret = ROOM_LOW;
    1264             :         }
    1265             : 
    1266             :         return ret;
    1267             : }
    1268             : 
    1269           0 : static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
    1270             : {
    1271           0 :         int pressure, ret;
    1272             : 
    1273           0 :         ret = __packet_rcv_has_room(po, skb);
    1274           0 :         pressure = ret != ROOM_NORMAL;
    1275             : 
    1276           0 :         if (READ_ONCE(po->pressure) != pressure)
    1277           0 :                 WRITE_ONCE(po->pressure, pressure);
    1278             : 
    1279           0 :         return ret;
    1280             : }
    1281             : 
    1282          10 : static void packet_rcv_try_clear_pressure(struct packet_sock *po)
    1283             : {
    1284          10 :         if (READ_ONCE(po->pressure) &&
    1285           0 :             __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
    1286           0 :                 WRITE_ONCE(po->pressure,  0);
    1287          10 : }
    1288             : 
    1289           0 : static void packet_sock_destruct(struct sock *sk)
    1290             : {
    1291           0 :         skb_queue_purge(&sk->sk_error_queue);
    1292             : 
    1293           0 :         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
    1294           0 :         WARN_ON(refcount_read(&sk->sk_wmem_alloc));
    1295             : 
    1296           0 :         if (!sock_flag(sk, SOCK_DEAD)) {
    1297           0 :                 pr_err("Attempt to release alive packet socket: %p\n", sk);
    1298           0 :                 return;
    1299             :         }
    1300             : 
    1301           0 :         sk_refcnt_debug_dec(sk);
    1302             : }
    1303             : 
    1304           0 : static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb)
    1305             : {
    1306           0 :         u32 *history = po->rollover->history;
    1307           0 :         u32 victim, rxhash;
    1308           0 :         int i, count = 0;
    1309             : 
    1310           0 :         rxhash = skb_get_hash(skb);
    1311           0 :         for (i = 0; i < ROLLOVER_HLEN; i++)
    1312           0 :                 if (READ_ONCE(history[i]) == rxhash)
    1313           0 :                         count++;
    1314             : 
    1315           0 :         victim = prandom_u32() % ROLLOVER_HLEN;
    1316             : 
    1317             :         /* Avoid dirtying the cache line if possible */
    1318           0 :         if (READ_ONCE(history[victim]) != rxhash)
    1319           0 :                 WRITE_ONCE(history[victim], rxhash);
    1320             : 
    1321           0 :         return count > (ROLLOVER_HLEN >> 1);
    1322             : }
    1323             : 
    1324           0 : static unsigned int fanout_demux_hash(struct packet_fanout *f,
    1325             :                                       struct sk_buff *skb,
    1326             :                                       unsigned int num)
    1327             : {
    1328           0 :         return reciprocal_scale(__skb_get_hash_symmetric(skb), num);
    1329             : }
    1330             : 
    1331           0 : static unsigned int fanout_demux_lb(struct packet_fanout *f,
    1332             :                                     struct sk_buff *skb,
    1333             :                                     unsigned int num)
    1334             : {
    1335           0 :         unsigned int val = atomic_inc_return(&f->rr_cur);
    1336             : 
    1337           0 :         return val % num;
    1338             : }
    1339             : 
    1340           0 : static unsigned int fanout_demux_cpu(struct packet_fanout *f,
    1341             :                                      struct sk_buff *skb,
    1342             :                                      unsigned int num)
    1343             : {
    1344           0 :         return smp_processor_id() % num;
    1345             : }
    1346             : 
    1347           0 : static unsigned int fanout_demux_rnd(struct packet_fanout *f,
    1348             :                                      struct sk_buff *skb,
    1349             :                                      unsigned int num)
    1350             : {
    1351           0 :         return prandom_u32_max(num);
    1352             : }
    1353             : 
/*
 * Pick the fanout member that should receive @skb, rolling over to another
 * socket when the preferred one is short on room.
 *
 * @f:        fanout group; f->arr[] holds the member sockets
 * @skb:      packet being delivered
 * @idx:      index of the preferred member in f->arr[]
 * @try_self: when true, test the preferred member first and exclude it from
 *            the scan if it is rejected
 * @num:      number of members to scan
 *
 * Returns the index of the chosen member.  Falls back to @idx when no
 * scanned member reports ROOM_NORMAL.
 */
static unsigned int fanout_demux_rollover(struct packet_fanout *f,
					  struct sk_buff *skb,
					  unsigned int idx, bool try_self,
					  unsigned int num)
{
	struct packet_sock *po, *po_next, *po_skip = NULL;
	unsigned int i, j, room = ROOM_NONE;

	po = pkt_sk(f->arr[idx]);

	if (try_self) {
		room = packet_rcv_has_room(po, skb);
		/* Stay on the preferred socket unless room is low AND the flow is huge. */
		if (room == ROOM_NORMAL ||
		    (room == ROOM_LOW && !fanout_flow_is_huge(po, skb)))
			return idx;
		/* Preferred socket was just rejected; do not reconsider it below. */
		po_skip = po;
	}

	/* Start at the remembered rollover target, clamped to the last valid index. */
	i = j = min_t(int, po->rollover->sock, num - 1);
	do {
		po_next = pkt_sk(f->arr[i]);
		/* Cheap pressure flag check first, full room check only if it is clear. */
		if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
		    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
			/* Remember a new target so the next scan starts here. */
			if (i != j)
				po->rollover->sock = i;
			atomic_long_inc(&po->rollover->num);
			/* ROOM_LOW here means the self-test rejected a huge flow. */
			if (room == ROOM_LOW)
				atomic_long_inc(&po->rollover->num_huge);
			return i;
		}

		/* Advance with wrap-around. */
		if (++i == num)
			i = 0;
	} while (i != j);

	/* Full circle without finding room: count the failure, keep @idx. */
	atomic_long_inc(&po->rollover->num_failed);
	return idx;
}
    1392             : 
    1393           0 : static unsigned int fanout_demux_qm(struct packet_fanout *f,
    1394             :                                     struct sk_buff *skb,
    1395             :                                     unsigned int num)
    1396             : {
    1397           0 :         return skb_get_queue_mapping(skb) % num;
    1398             : }
    1399             : 
    1400           0 : static unsigned int fanout_demux_bpf(struct packet_fanout *f,
    1401             :                                      struct sk_buff *skb,
    1402             :                                      unsigned int num)
    1403             : {
    1404           0 :         struct bpf_prog *prog;
    1405           0 :         unsigned int ret = 0;
    1406             : 
    1407           0 :         rcu_read_lock();
    1408           0 :         prog = rcu_dereference(f->bpf_prog);
    1409           0 :         if (prog)
    1410           0 :                 ret = bpf_prog_run_clear_cb(prog, skb) % num;
    1411           0 :         rcu_read_unlock();
    1412             : 
    1413           0 :         return ret;
    1414             : }
    1415             : 
    1416           0 : static bool fanout_has_flag(struct packet_fanout *f, u16 flag)
    1417             : {
    1418           0 :         return f->flags & (flag >> 8);
    1419             : }
    1420             : 
/*
 * Receive hook installed for a fanout group: selects one member socket for
 * @skb and forwards the packet to that socket's own receive hook.
 *
 * @skb:      packet to deliver
 * @dev:      device the packet arrived on
 * @pt:       the group's packet_type; pt->af_packet_priv is the group
 * @orig_dev: passed through unchanged to the member's hook
 *
 * Returns 0 when the packet is dropped here (wrong netns, empty group) or
 * when ip_check_defrag() returns no skb; otherwise the return value of the
 * selected member's hook.
 */
static int packet_rcv_fanout(struct sk_buff *skb, struct net_device *dev,
			     struct packet_type *pt, struct net_device *orig_dev)
{
	struct packet_fanout *f = pt->af_packet_priv;
	/* Snapshot the member count once; every selector below uses this value. */
	unsigned int num = READ_ONCE(f->num_members);
	struct net *net = read_pnet(&f->net);
	struct packet_sock *po;
	unsigned int idx;

	/* Drop packets from a foreign namespace and packets for an empty group. */
	if (!net_eq(dev_net(dev), net) || !num) {
		kfree_skb(skb);
		return 0;
	}

	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_DEFRAG)) {
		skb = ip_check_defrag(net, skb, IP_DEFRAG_AF_PACKET);
		/* No skb returned: nothing to deliver now. */
		if (!skb)
			return 0;
	}
	/* Hash-based selection doubles as the fallback for unknown types. */
	switch (f->type) {
	case PACKET_FANOUT_HASH:
	default:
		idx = fanout_demux_hash(f, skb, num);
		break;
	case PACKET_FANOUT_LB:
		idx = fanout_demux_lb(f, skb, num);
		break;
	case PACKET_FANOUT_CPU:
		idx = fanout_demux_cpu(f, skb, num);
		break;
	case PACKET_FANOUT_RND:
		idx = fanout_demux_rnd(f, skb, num);
		break;
	case PACKET_FANOUT_QM:
		idx = fanout_demux_qm(f, skb, num);
		break;
	case PACKET_FANOUT_ROLLOVER:
		/* Pure rollover mode: start from member 0 without a self-test. */
		idx = fanout_demux_rollover(f, skb, 0, false, num);
		break;
	case PACKET_FANOUT_CBPF:
	case PACKET_FANOUT_EBPF:
		idx = fanout_demux_bpf(f, skb, num);
		break;
	}

	/* Rollover as a flag: refine the choice made by the base selector. */
	if (fanout_has_flag(f, PACKET_FANOUT_FLAG_ROLLOVER))
		idx = fanout_demux_rollover(f, skb, idx, true, num);

	po = pkt_sk(f->arr[idx]);
	return po->prot_hook.func(skb, dev, &po->prot_hook, orig_dev);
}
    1472             : 
/* Taken by fanout_add() around group lookup/creation; exported (GPL-only). */
DEFINE_MUTEX(fanout_mutex);
EXPORT_SYMBOL_GPL(fanout_mutex);
/* List of existing fanout groups, searched by id and network namespace. */
static LIST_HEAD(fanout_list);
/* First id that fanout_find_new_id() tries when allocating a unique id. */
static u16 fanout_next_id;
    1477             : 
/*
 * Append socket @sk to the member array of its fanout group (po->fanout).
 * The group's packet hook is registered when the first member is added.
 * Runs under the group's spinlock.
 */
static void __fanout_link(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;

	spin_lock(&f->lock);
	f->arr[f->num_members] = sk;
	/*
	 * Order the array store before the count update.
	 * NOTE(review): presumably so that a lockless reader which sees the
	 * new num_members also sees the new slot - confirm against readers.
	 */
	smp_wmb();
	f->num_members++;
	/* First member: start receiving packets for the group. */
	if (f->num_members == 1)
		dev_add_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}
    1490             : 
/*
 * Remove socket @sk from the member array of its fanout group (po->fanout).
 * The freed slot is filled with the last member so the array stays dense.
 * The group's packet hook is unregistered when the last member leaves.
 * Runs under the group's spinlock; BUGs if @sk is not a member.
 */
static void __fanout_unlink(struct sock *sk, struct packet_sock *po)
{
	struct packet_fanout *f = po->fanout;
	int i;

	spin_lock(&f->lock);
	/* Linear search for the socket's slot. */
	for (i = 0; i < f->num_members; i++) {
		if (f->arr[i] == sk)
			break;
	}
	BUG_ON(i >= f->num_members);
	/* Move the last member into the vacated slot, then shrink the count. */
	f->arr[i] = f->arr[f->num_members - 1];
	f->num_members--;
	if (f->num_members == 0)
		__dev_remove_pack(&f->prot_hook);
	spin_unlock(&f->lock);
}
    1508             : 
    1509           0 : static bool match_fanout_group(struct packet_type *ptype, struct sock *sk)
    1510             : {
    1511           0 :         if (sk->sk_family != PF_PACKET)
    1512             :                 return false;
    1513             : 
    1514           0 :         return ptype->af_packet_priv == pkt_sk(sk)->fanout;
    1515             : }
    1516             : 
    1517           0 : static void fanout_init_data(struct packet_fanout *f)
    1518             : {
    1519           0 :         switch (f->type) {
    1520           0 :         case PACKET_FANOUT_LB:
    1521           0 :                 atomic_set(&f->rr_cur, 0);
    1522             :                 break;
    1523             :         case PACKET_FANOUT_CBPF:
    1524             :         case PACKET_FANOUT_EBPF:
    1525           0 :                 RCU_INIT_POINTER(f->bpf_prog, NULL);
    1526           0 :                 break;
    1527             :         }
    1528           0 : }
    1529             : 
    1530           0 : static void __fanout_set_data_bpf(struct packet_fanout *f, struct bpf_prog *new)
    1531             : {
    1532           0 :         struct bpf_prog *old;
    1533             : 
    1534           0 :         spin_lock(&f->lock);
    1535           0 :         old = rcu_dereference_protected(f->bpf_prog, lockdep_is_held(&f->lock));
    1536           0 :         rcu_assign_pointer(f->bpf_prog, new);
    1537           0 :         spin_unlock(&f->lock);
    1538             : 
    1539           0 :         if (old) {
    1540           0 :                 synchronize_net();
    1541           0 :                 bpf_prog_destroy(old);
    1542             :         }
    1543           0 : }
    1544             : 
    1545           0 : static int fanout_set_data_cbpf(struct packet_sock *po, sockptr_t data,
    1546             :                                 unsigned int len)
    1547             : {
    1548           0 :         struct bpf_prog *new;
    1549           0 :         struct sock_fprog fprog;
    1550           0 :         int ret;
    1551             : 
    1552           0 :         if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
    1553             :                 return -EPERM;
    1554             : 
    1555           0 :         ret = copy_bpf_fprog_from_user(&fprog, data, len);
    1556           0 :         if (ret)
    1557             :                 return ret;
    1558             : 
    1559           0 :         ret = bpf_prog_create_from_user(&new, &fprog, NULL, false);
    1560           0 :         if (ret)
    1561             :                 return ret;
    1562             : 
    1563           0 :         __fanout_set_data_bpf(po->fanout, new);
    1564           0 :         return 0;
    1565             : }
    1566             : 
    1567           0 : static int fanout_set_data_ebpf(struct packet_sock *po, sockptr_t data,
    1568             :                                 unsigned int len)
    1569             : {
    1570           0 :         struct bpf_prog *new;
    1571           0 :         u32 fd;
    1572             : 
    1573           0 :         if (sock_flag(&po->sk, SOCK_FILTER_LOCKED))
    1574             :                 return -EPERM;
    1575           0 :         if (len != sizeof(fd))
    1576             :                 return -EINVAL;
    1577           0 :         if (copy_from_sockptr(&fd, data, len))
    1578             :                 return -EFAULT;
    1579             : 
    1580           0 :         new = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
    1581           0 :         if (IS_ERR(new))
    1582           0 :                 return PTR_ERR(new);
    1583             : 
    1584             :         __fanout_set_data_bpf(po->fanout, new);
    1585             :         return 0;
    1586             : }
    1587             : 
    1588           0 : static int fanout_set_data(struct packet_sock *po, sockptr_t data,
    1589             :                            unsigned int len)
    1590             : {
    1591           0 :         switch (po->fanout->type) {
    1592           0 :         case PACKET_FANOUT_CBPF:
    1593           0 :                 return fanout_set_data_cbpf(po, data, len);
    1594           0 :         case PACKET_FANOUT_EBPF:
    1595           0 :                 return fanout_set_data_ebpf(po, data, len);
    1596             :         default:
    1597             :                 return -EINVAL;
    1598             :         }
    1599             : }
    1600             : 
    1601           0 : static void fanout_release_data(struct packet_fanout *f)
    1602             : {
    1603           0 :         switch (f->type) {
    1604           0 :         case PACKET_FANOUT_CBPF:
    1605             :         case PACKET_FANOUT_EBPF:
    1606           0 :                 __fanout_set_data_bpf(f, NULL);
    1607             :         }
    1608           0 : }
    1609             : 
    1610           0 : static bool __fanout_id_is_free(struct sock *sk, u16 candidate_id)
    1611             : {
    1612           0 :         struct packet_fanout *f;
    1613             : 
    1614           0 :         list_for_each_entry(f, &fanout_list, list) {
    1615           0 :                 if (f->id == candidate_id &&
    1616           0 :                     read_pnet(&f->net) == sock_net(sk)) {
    1617             :                         return false;
    1618             :                 }
    1619             :         }
    1620             :         return true;
    1621             : }
    1622             : 
    1623           0 : static bool fanout_find_new_id(struct sock *sk, u16 *new_id)
    1624             : {
    1625           0 :         u16 id = fanout_next_id;
    1626             : 
    1627           0 :         do {
    1628           0 :                 if (__fanout_id_is_free(sk, id)) {
    1629           0 :                         *new_id = id;
    1630           0 :                         fanout_next_id = id + 1;
    1631           0 :                         return true;
    1632             :                 }
    1633             : 
    1634           0 :                 id++;
    1635           0 :         } while (id != fanout_next_id);
    1636             : 
    1637             :         return false;
    1638             : }
    1639             : 
    1640           0 : static int fanout_add(struct sock *sk, struct fanout_args *args)
    1641             : {
    1642           0 :         struct packet_rollover *rollover = NULL;
    1643           0 :         struct packet_sock *po = pkt_sk(sk);
    1644           0 :         u16 type_flags = args->type_flags;
    1645           0 :         struct packet_fanout *f, *match;
    1646           0 :         u8 type = type_flags & 0xff;
    1647           0 :         u8 flags = type_flags >> 8;
    1648           0 :         u16 id = args->id;
    1649           0 :         int err;
    1650             : 
    1651           0 :         switch (type) {
    1652           0 :         case PACKET_FANOUT_ROLLOVER:
    1653           0 :                 if (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)
    1654             :                         return -EINVAL;
    1655             :         case PACKET_FANOUT_HASH:
    1656             :         case PACKET_FANOUT_LB:
    1657             :         case PACKET_FANOUT_CPU:
    1658             :         case PACKET_FANOUT_RND:
    1659             :         case PACKET_FANOUT_QM:
    1660             :         case PACKET_FANOUT_CBPF:
    1661             :         case PACKET_FANOUT_EBPF:
    1662           0 :                 break;
    1663             :         default:
    1664             :                 return -EINVAL;
    1665             :         }
    1666             : 
    1667           0 :         mutex_lock(&fanout_mutex);
    1668             : 
    1669           0 :         err = -EALREADY;
    1670           0 :         if (po->fanout)
    1671           0 :                 goto out;
    1672             : 
    1673           0 :         if (type == PACKET_FANOUT_ROLLOVER ||
    1674             :             (type_flags & PACKET_FANOUT_FLAG_ROLLOVER)) {
    1675           0 :                 err = -ENOMEM;
    1676           0 :                 rollover = kzalloc(sizeof(*rollover), GFP_KERNEL);
    1677           0 :                 if (!rollover)
    1678           0 :                         goto out;
    1679           0 :                 atomic_long_set(&rollover->num, 0);
    1680           0 :                 atomic_long_set(&rollover->num_huge, 0);
    1681           0 :                 atomic_long_set(&rollover->num_failed, 0);
    1682             :         }
    1683             : 
    1684           0 :         if (type_flags & PACKET_FANOUT_FLAG_UNIQUEID) {
    1685           0 :                 if (id != 0) {
    1686           0 :                         err = -EINVAL;
    1687           0 :                         goto out;
    1688             :                 }
    1689           0 :                 if (!fanout_find_new_id(sk, &id)) {
    1690           0 :                         err = -ENOMEM;
    1691           0 :                         goto out;
    1692             :                 }
    1693             :                 /* ephemeral flag for the first socket in the group: drop it */
    1694           0 :                 flags &= ~(PACKET_FANOUT_FLAG_UNIQUEID >> 8);
    1695             :         }
    1696             : 
    1697           0 :         match = NULL;
    1698           0 :         list_for_each_entry(f, &fanout_list, list) {
    1699           0 :                 if (f->id == id &&
    1700           0 :                     read_pnet(&f->net) == sock_net(sk)) {
    1701           0 :                         match = f;
    1702           0 :                         break;
    1703             :                 }
    1704             :         }
    1705           0 :         err = -EINVAL;
    1706           0 :         if (match) {
    1707           0 :                 if (match->flags != flags)
    1708           0 :                         goto out;
    1709           0 :                 if (args->max_num_members &&
    1710           0 :                     args->max_num_members != match->max_num_members)
    1711           0 :                         goto out;
    1712             :         } else {
    1713           0 :                 if (args->max_num_members > PACKET_FANOUT_MAX)
    1714           0 :                         goto out;
    1715           0 :                 if (!args->max_num_members)
    1716             :                         /* legacy PACKET_FANOUT_MAX */
    1717           0 :                         args->max_num_members = 256;
    1718           0 :                 err = -ENOMEM;
    1719           0 :                 match = kvzalloc(struct_size(match, arr, args->max_num_members),
    1720             :                                  GFP_KERNEL);
    1721           0 :                 if (!match)
    1722           0 :                         goto out;
    1723           0 :                 write_pnet(&match->net, sock_net(sk));
    1724           0 :                 match->id = id;
    1725           0 :                 match->type = type;
    1726           0 :                 match->flags = flags;
    1727           0 :                 INIT_LIST_HEAD(&match->list);
    1728           0 :                 spin_lock_init(&match->lock);
    1729           0 :                 refcount_set(&match->sk_ref, 0);
    1730           0 :                 fanout_init_data(match);
    1731           0 :                 match->prot_hook.type = po->prot_hook.type;
    1732           0 :                 match->prot_hook.dev = po->prot_hook.dev;
    1733           0 :                 match->prot_hook.func = packet_rcv_fanout;
    1734           0 :                 match->prot_hook.af_packet_priv = match;
    1735           0 :                 match->prot_hook.id_match = match_fanout_group;
    1736           0 :                 match->max_num_members = args->max_num_members;
    1737           0 :                 list_add(&match->list, &fanout_list);
    1738             :         }
    1739           0 :         err = -EINVAL;
    1740             : 
    1741           0 :         spin_lock(&po->bind_lock);
    1742           0 :         if (po->running &&
    1743           0 :             match->type == type &&
    1744           0 :             match->prot_hook.type == po->prot_hook.type &&
    1745           0 :             match->prot_hook.dev == po->prot_hook.dev) {
    1746           0 :                 err = -ENOSPC;
    1747           0 :                 if (refcount_read(&match->sk_ref) < match->max_num_members) {
    1748           0 :                         __dev_remove_pack(&po->prot_hook);
    1749           0 :                         po->fanout = match;
    1750           0 :                         po->rollover = rollover;
    1751           0 :                         rollover = NULL;
    1752           0 :                         refcount_set(&match->sk_ref, refcount_read(&match->sk_ref) + 1);
    1753           0 :                         __fanout_link(sk, po);
    1754           0 :                         err = 0;
    1755             :                 }
    1756             :         }
    1757           0 :         spin_unlock(&po->bind_lock);
    1758             : 
    1759           0 :         if (err && !refcount_read(&match->sk_ref)) {
    1760           0 :                 list_del(&match->list);
    1761           0 :                 kvfree(match);
    1762             :         }
    1763             : 
    1764           0 : out:
    1765           0 :         kfree(rollover);
    1766           0 :         mutex_unlock(&fanout_mutex);
    1767           0 :         return err;
    1768             : }
    1769             : 
    1770             : /* If pkt_sk(sk)->fanout->sk_ref is zero, this function removes
    1771             :  * pkt_sk(sk)->fanout from fanout_list and returns pkt_sk(sk)->fanout.
    1772             :  * It is the responsibility of the caller to call fanout_release_data() and
    1773             :  * free the returned packet_fanout (after synchronize_net())
    1774             :  */
    1775           0 : static struct packet_fanout *fanout_release(struct sock *sk)
    1776             : {
    1777           0 :         struct packet_sock *po = pkt_sk(sk);
    1778           0 :         struct packet_fanout *f;
    1779             : 
    1780           0 :         mutex_lock(&fanout_mutex);
    1781           0 :         f = po->fanout;
    1782           0 :         if (f) {
    1783           0 :                 po->fanout = NULL;
    1784             : 
    1785           0 :                 if (refcount_dec_and_test(&f->sk_ref))
    1786           0 :                         list_del(&f->list);
    1787             :                 else
    1788             :                         f = NULL;
    1789             :         }
    1790           0 :         mutex_unlock(&fanout_mutex);
    1791             : 
    1792           0 :         return f;
    1793             : }
    1794             : 
    1795           0 : static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
    1796             :                                           struct sk_buff *skb)
    1797             : {
    1798             :         /* Earlier code assumed this would be a VLAN pkt, double-check
    1799             :          * this now that we have the actual packet in hand. We can only
    1800             :          * do this check on Ethernet devices.
    1801             :          */
    1802           0 :         if (unlikely(dev->type != ARPHRD_ETHER))
    1803             :                 return false;
    1804             : 
    1805           0 :         skb_reset_mac_header(skb);
    1806           0 :         return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
    1807             : }
    1808             : 
    1809             : static const struct proto_ops packet_ops;
    1810             : 
    1811             : static const struct proto_ops packet_ops_spkt;
    1812             : 
    1813           0 : static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
    1814             :                            struct packet_type *pt, struct net_device *orig_dev)
    1815             : {
    1816           0 :         struct sock *sk;
    1817           0 :         struct sockaddr_pkt *spkt;
    1818             : 
    1819             :         /*
    1820             :          *      When we registered the protocol we saved the socket in the data
    1821             :          *      field for just this event.
    1822             :          */
    1823             : 
    1824           0 :         sk = pt->af_packet_priv;
    1825             : 
    1826             :         /*
    1827             :          *      Yank back the headers [hope the device set this
    1828             :          *      right or kerboom...]
    1829             :          *
    1830             :          *      Incoming packets have ll header pulled,
    1831             :          *      push it back.
    1832             :          *
    1833             :          *      For outgoing ones skb->data == skb_mac_header(skb)
    1834             :          *      so that this procedure is noop.
    1835             :          */
    1836             : 
    1837           0 :         if (skb->pkt_type == PACKET_LOOPBACK)
    1838           0 :                 goto out;
    1839             : 
    1840           0 :         if (!net_eq(dev_net(dev), sock_net(sk)))
    1841             :                 goto out;
    1842             : 
    1843           0 :         skb = skb_share_check(skb, GFP_ATOMIC);
    1844           0 :         if (skb == NULL)
    1845           0 :                 goto oom;
    1846             : 
    1847             :         /* drop any routing info */
    1848           0 :         skb_dst_drop(skb);
    1849             : 
    1850             :         /* drop conntrack reference */
    1851           0 :         nf_reset_ct(skb);
    1852             : 
    1853           0 :         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
    1854             : 
    1855           0 :         skb_push(skb, skb->data - skb_mac_header(skb));
    1856             : 
    1857             :         /*
    1858             :          *      The SOCK_PACKET socket receives _all_ frames.
    1859             :          */
    1860             : 
    1861           0 :         spkt->spkt_family = dev->type;
    1862           0 :         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
    1863           0 :         spkt->spkt_protocol = skb->protocol;
    1864             : 
    1865             :         /*
    1866             :          *      Charge the memory to the socket. This is done specifically
    1867             :          *      to prevent sockets using all the memory up.
    1868             :          */
    1869             : 
    1870           0 :         if (sock_queue_rcv_skb(sk, skb) == 0)
    1871             :                 return 0;
    1872             : 
    1873           0 : out:
    1874           0 :         kfree_skb(skb);
    1875             : oom:
    1876             :         return 0;
    1877             : }
    1878             : 
    1879           2 : static void packet_parse_headers(struct sk_buff *skb, struct socket *sock)
    1880             : {
    1881           2 :         if ((!skb->protocol || skb->protocol == htons(ETH_P_ALL)) &&
    1882           2 :             sock->type == SOCK_RAW) {
    1883           2 :                 skb_reset_mac_header(skb);
    1884           2 :                 skb->protocol = dev_parse_header_protocol(skb);
    1885             :         }
    1886             : 
    1887           2 :         skb_probe_transport_header(skb);
    1888           2 : }
    1889             : 
    1890             : /*
    1891             :  *      Output a raw packet to a device layer. This bypasses all the other
    1892             :  *      protocol layers and you must therefore supply it with a complete frame
    1893             :  */
    1894             : 
    1895           0 : static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg,
    1896             :                                size_t len)
    1897             : {
    1898           0 :         struct sock *sk = sock->sk;
    1899           0 :         DECLARE_SOCKADDR(struct sockaddr_pkt *, saddr, msg->msg_name);
    1900           0 :         struct sk_buff *skb = NULL;
    1901           0 :         struct net_device *dev;
    1902           0 :         struct sockcm_cookie sockc;
    1903           0 :         __be16 proto = 0;
    1904           0 :         int err;
    1905           0 :         int extra_len = 0;
    1906             : 
    1907             :         /*
    1908             :          *      Get and verify the address.
    1909             :          */
    1910             : 
    1911           0 :         if (saddr) {
    1912           0 :                 if (msg->msg_namelen < sizeof(struct sockaddr))
    1913             :                         return -EINVAL;
    1914           0 :                 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
    1915           0 :                         proto = saddr->spkt_protocol;
    1916             :         } else
    1917             :                 return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
    1918             : 
    1919             :         /*
    1920             :          *      Find the device first to size check it
    1921             :          */
    1922             : 
    1923           0 :         saddr->spkt_device[sizeof(saddr->spkt_device) - 1] = 0;
    1924           0 : retry:
    1925           0 :         rcu_read_lock();
    1926           0 :         dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
    1927           0 :         err = -ENODEV;
    1928           0 :         if (dev == NULL)
    1929           0 :                 goto out_unlock;
    1930             : 
    1931           0 :         err = -ENETDOWN;
    1932           0 :         if (!(dev->flags & IFF_UP))
    1933           0 :                 goto out_unlock;
    1934             : 
    1935             :         /*
    1936             :          * You may not queue a frame bigger than the mtu. This is the lowest level
    1937             :          * raw protocol and you must do your own fragmentation at this level.
    1938             :          */
    1939             : 
    1940           0 :         if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
    1941           0 :                 if (!netif_supports_nofcs(dev)) {
    1942           0 :                         err = -EPROTONOSUPPORT;
    1943           0 :                         goto out_unlock;
    1944             :                 }
    1945             :                 extra_len = 4; /* We're doing our own CRC */
    1946             :         }
    1947             : 
    1948           0 :         err = -EMSGSIZE;
    1949           0 :         if (len > dev->mtu + dev->hard_header_len + VLAN_HLEN + extra_len)
    1950           0 :                 goto out_unlock;
    1951             : 
    1952           0 :         if (!skb) {
    1953           0 :                 size_t reserved = LL_RESERVED_SPACE(dev);
    1954           0 :                 int tlen = dev->needed_tailroom;
    1955           0 :                 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
    1956             : 
    1957           0 :                 rcu_read_unlock();
    1958           0 :                 skb = sock_wmalloc(sk, len + reserved + tlen, 0, GFP_KERNEL);
    1959           0 :                 if (skb == NULL)
    1960             :                         return -ENOBUFS;
    1961             :                 /* FIXME: Save some space for broken drivers that write a hard
    1962             :                  * header at transmission time by themselves. PPP is the notable
    1963             :                  * one here. This should really be fixed at the driver level.
    1964             :                  */
    1965           0 :                 skb_reserve(skb, reserved);
    1966           0 :                 skb_reset_network_header(skb);
    1967             : 
    1968             :                 /* Try to align data part correctly */
    1969           0 :                 if (hhlen) {
    1970           0 :                         skb->data -= hhlen;
    1971           0 :                         skb->tail -= hhlen;
    1972           0 :                         if (len < hhlen)
    1973           0 :                                 skb_reset_network_header(skb);
    1974             :                 }
    1975           0 :                 err = memcpy_from_msg(skb_put(skb, len), msg, len);
    1976           0 :                 if (err)
    1977           0 :                         goto out_free;
    1978           0 :                 goto retry;
    1979             :         }
    1980             : 
    1981           0 :         if (!dev_validate_header(dev, skb->data, len)) {
    1982           0 :                 err = -EINVAL;
    1983           0 :                 goto out_unlock;
    1984             :         }
    1985           0 :         if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
    1986           0 :             !packet_extra_vlan_len_allowed(dev, skb)) {
    1987           0 :                 err = -EMSGSIZE;
    1988           0 :                 goto out_unlock;
    1989             :         }
    1990             : 
    1991           0 :         sockcm_init(&sockc, sk);
    1992           0 :         if (msg->msg_controllen) {
    1993           0 :                 err = sock_cmsg_send(sk, msg, &sockc);
    1994           0 :                 if (unlikely(err))
    1995           0 :                         goto out_unlock;
    1996             :         }
    1997             : 
    1998           0 :         skb->protocol = proto;
    1999           0 :         skb->dev = dev;
    2000           0 :         skb->priority = sk->sk_priority;
    2001           0 :         skb->mark = sk->sk_mark;
    2002           0 :         skb->tstamp = sockc.transmit_time;
    2003             : 
    2004           0 :         skb_setup_tx_timestamp(skb, sockc.tsflags);
    2005             : 
    2006           0 :         if (unlikely(extra_len == 4))
    2007           0 :                 skb->no_fcs = 1;
    2008             : 
    2009           0 :         packet_parse_headers(skb, sock);
    2010             : 
    2011           0 :         dev_queue_xmit(skb);
    2012           0 :         rcu_read_unlock();
    2013           0 :         return len;
    2014             : 
    2015           0 : out_unlock:
    2016           0 :         rcu_read_unlock();
    2017           0 : out_free:
    2018           0 :         kfree_skb(skb);
    2019           0 :         return err;
    2020             : }
    2021             : 
    2022         908 : static unsigned int run_filter(struct sk_buff *skb,
    2023             :                                const struct sock *sk,
    2024             :                                unsigned int res)
    2025             : {
    2026         908 :         struct sk_filter *filter;
    2027             : 
    2028         908 :         rcu_read_lock();
    2029         908 :         filter = rcu_dereference(sk->sk_filter);
    2030         908 :         if (filter != NULL)
    2031         908 :                 res = bpf_prog_run_clear_cb(filter->prog, skb);
    2032         908 :         rcu_read_unlock();
    2033             : 
    2034         908 :         return res;
    2035             : }
    2036             : 
    2037           0 : static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
    2038             :                            size_t *len)
    2039             : {
    2040           0 :         struct virtio_net_hdr vnet_hdr;
    2041             : 
    2042           0 :         if (*len < sizeof(vnet_hdr))
    2043             :                 return -EINVAL;
    2044           0 :         *len -= sizeof(vnet_hdr);
    2045             : 
    2046           0 :         if (virtio_net_hdr_from_skb(skb, &vnet_hdr, vio_le(), true, 0))
    2047             :                 return -EINVAL;
    2048             : 
    2049           0 :         return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
    2050             : }
    2051             : 
    2052             : /*
    2053             :  * This function makes lazy skb cloning in hope that most of packets
    2054             :  * are discarded by BPF.
    2055             :  *
    2056             :  * Note tricky part: we DO mangle shared skb! skb->data, skb->len
    2057             :  * and skb->cb are mangled. It works because (and until) packets
    2058             :  * falling here are owned by current CPU. Output packets are cloned
    2059             :  * by dev_queue_xmit_nit(), input packets are processed by net_bh
    2060             :  * sequencially, so that if we return skb to original state on exit,
    2061             :  * we will not harm anyone.
    2062             :  */
    2063             : 
    2064         908 : static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
    2065             :                       struct packet_type *pt, struct net_device *orig_dev)
    2066             : {
    2067         908 :         struct sock *sk;
    2068         908 :         struct sockaddr_ll *sll;
    2069         908 :         struct packet_sock *po;
    2070         908 :         u8 *skb_head = skb->data;
    2071         908 :         int skb_len = skb->len;
    2072         908 :         unsigned int snaplen, res;
    2073         908 :         bool is_drop_n_account = false;
    2074             : 
    2075         908 :         if (skb->pkt_type == PACKET_LOOPBACK)
    2076           0 :                 goto drop;
    2077             : 
    2078         908 :         sk = pt->af_packet_priv;
    2079         908 :         po = pkt_sk(sk);
    2080             : 
    2081         908 :         if (!net_eq(dev_net(dev), sock_net(sk)))
    2082             :                 goto drop;
    2083             : 
    2084         908 :         skb->dev = dev;
    2085             : 
    2086        1816 :         if (dev_has_header(dev)) {
    2087             :                 /* The device has an explicit notion of ll header,
    2088             :                  * exported to higher levels.
    2089             :                  *
    2090             :                  * Otherwise, the device hides details of its frame
    2091             :                  * structure, so that corresponding packet head is
    2092             :                  * never delivered to user.
    2093             :                  */
    2094         908 :                 if (sk->sk_type != SOCK_DGRAM)
    2095         908 :                         skb_push(skb, skb->data - skb_mac_header(skb));
    2096           0 :                 else if (skb->pkt_type == PACKET_OUTGOING) {
    2097             :                         /* Special case: outgoing packets have ll header at head */
    2098           0 :                         skb_pull(skb, skb_network_offset(skb));
    2099             :                 }
    2100             :         }
    2101             : 
    2102         908 :         snaplen = skb->len;
    2103             : 
    2104         908 :         res = run_filter(skb, sk, snaplen);
    2105         908 :         if (!res)
    2106         906 :                 goto drop_n_restore;
    2107           2 :         if (snaplen > res)
    2108             :                 snaplen = res;
    2109             : 
    2110           2 :         if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
    2111           0 :                 goto drop_n_acct;
    2112             : 
    2113           2 :         if (skb_shared(skb)) {
    2114           2 :                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
    2115           2 :                 if (nskb == NULL)
    2116           0 :                         goto drop_n_acct;
    2117             : 
    2118           2 :                 if (skb_head != skb->data) {
    2119           2 :                         skb->data = skb_head;
    2120           2 :                         skb->len = skb_len;
    2121             :                 }
    2122           2 :                 consume_skb(skb);
    2123           2 :                 skb = nskb;
    2124             :         }
    2125             : 
    2126           2 :         sock_skb_cb_check_size(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8);
    2127             : 
    2128           2 :         sll = &PACKET_SKB_CB(skb)->sa.ll;
    2129           2 :         sll->sll_hatype = dev->type;
    2130           2 :         sll->sll_pkttype = skb->pkt_type;
    2131           2 :         if (unlikely(po->origdev))
    2132           0 :                 sll->sll_ifindex = orig_dev->ifindex;
    2133             :         else
    2134           2 :                 sll->sll_ifindex = dev->ifindex;
    2135             : 
    2136           2 :         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
    2137             : 
    2138             :         /* sll->sll_family and sll->sll_protocol are set in packet_recvmsg().
    2139             :          * Use their space for storing the original skb length.
    2140             :          */
    2141           2 :         PACKET_SKB_CB(skb)->sa.origlen = skb->len;
    2142             : 
    2143           2 :         if (pskb_trim(skb, snaplen))
    2144           0 :                 goto drop_n_acct;
    2145             : 
    2146           2 :         skb_set_owner_r(skb, sk);
    2147           2 :         skb->dev = NULL;
    2148           2 :         skb_dst_drop(skb);
    2149             : 
    2150             :         /* drop conntrack reference */
    2151           2 :         nf_reset_ct(skb);
    2152             : 
    2153           2 :         spin_lock(&sk->sk_receive_queue.lock);
    2154           2 :         po->stats.stats1.tp_packets++;
    2155           2 :         sock_skb_set_dropcount(sk, skb);
    2156           2 :         __skb_queue_tail(&sk->sk_receive_queue, skb);
    2157           2 :         spin_unlock(&sk->sk_receive_queue.lock);
    2158           2 :         sk->sk_data_ready(sk);
    2159           2 :         return 0;
    2160             : 
    2161           0 : drop_n_acct:
    2162           0 :         is_drop_n_account = true;
    2163           0 :         atomic_inc(&po->tp_drops);
    2164           0 :         atomic_inc(&sk->sk_drops);
    2165             : 
    2166         906 : drop_n_restore:
    2167        1363 :         if (skb_head != skb->data && skb_shared(skb)) {
    2168         457 :                 skb->data = skb_head;
    2169         457 :                 skb->len = skb_len;
    2170             :         }
    2171         449 : drop:
    2172         906 :         if (!is_drop_n_account)
    2173         906 :                 consume_skb(skb);
    2174             :         else
    2175           0 :                 kfree_skb(skb);
    2176             :         return 0;
    2177             : }
    2178             : 
    2179           0 : static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
    2180             :                        struct packet_type *pt, struct net_device *orig_dev)
    2181             : {
    2182           0 :         struct sock *sk;
    2183           0 :         struct packet_sock *po;
    2184           0 :         struct sockaddr_ll *sll;
    2185           0 :         union tpacket_uhdr h;
    2186           0 :         u8 *skb_head = skb->data;
    2187           0 :         int skb_len = skb->len;
    2188           0 :         unsigned int snaplen, res;
    2189           0 :         unsigned long status = TP_STATUS_USER;
    2190           0 :         unsigned short macoff, hdrlen;
    2191           0 :         unsigned int netoff;
    2192           0 :         struct sk_buff *copy_skb = NULL;
    2193           0 :         struct timespec64 ts;
    2194           0 :         __u32 ts_status;
    2195           0 :         bool is_drop_n_account = false;
    2196           0 :         unsigned int slot_id = 0;
    2197           0 :         bool do_vnet = false;
    2198             : 
    2199             :         /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
    2200             :          * We may add members to them until current aligned size without forcing
    2201             :          * userspace to call getsockopt(..., PACKET_HDRLEN, ...).
    2202             :          */
    2203           0 :         BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h2)) != 32);
    2204           0 :         BUILD_BUG_ON(TPACKET_ALIGN(sizeof(*h.h3)) != 48);
    2205             : 
    2206           0 :         if (skb->pkt_type == PACKET_LOOPBACK)
    2207           0 :                 goto drop;
    2208             : 
    2209           0 :         sk = pt->af_packet_priv;
    2210           0 :         po = pkt_sk(sk);
    2211             : 
    2212           0 :         if (!net_eq(dev_net(dev), sock_net(sk)))
    2213             :                 goto drop;
    2214             : 
    2215           0 :         if (dev_has_header(dev)) {
    2216           0 :                 if (sk->sk_type != SOCK_DGRAM)
    2217           0 :                         skb_push(skb, skb->data - skb_mac_header(skb));
    2218           0 :                 else if (skb->pkt_type == PACKET_OUTGOING) {
    2219             :                         /* Special case: outgoing packets have ll header at head */
    2220           0 :                         skb_pull(skb, skb_network_offset(skb));
    2221             :                 }
    2222             :         }
    2223             : 
    2224           0 :         snaplen = skb->len;
    2225             : 
    2226           0 :         res = run_filter(skb, sk, snaplen);
    2227           0 :         if (!res)
    2228           0 :                 goto drop_n_restore;
    2229             : 
    2230             :         /* If we are flooded, just give up */
    2231           0 :         if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
    2232           0 :                 atomic_inc(&po->tp_drops);
    2233           0 :                 goto drop_n_restore;
    2234             :         }
    2235             : 
    2236           0 :         if (skb->ip_summed == CHECKSUM_PARTIAL)
    2237             :                 status |= TP_STATUS_CSUMNOTREADY;
    2238           0 :         else if (skb->pkt_type != PACKET_OUTGOING &&
    2239           0 :                  (skb->ip_summed == CHECKSUM_COMPLETE ||
    2240           0 :                   skb_csum_unnecessary(skb)))
    2241             :                 status |= TP_STATUS_CSUM_VALID;
    2242             : 
    2243           0 :         if (snaplen > res)
    2244             :                 snaplen = res;
    2245             : 
    2246           0 :         if (sk->sk_type == SOCK_DGRAM) {
    2247           0 :                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
    2248           0 :                                   po->tp_reserve;
    2249             :         } else {
    2250           0 :                 unsigned int maclen = skb_network_offset(skb);
    2251           0 :                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
    2252             :                                        (maclen < 16 ? 16 : maclen)) +
    2253           0 :                                        po->tp_reserve;
    2254           0 :                 if (po->has_vnet_hdr) {
    2255           0 :                         netoff += sizeof(struct virtio_net_hdr);
    2256           0 :                         do_vnet = true;
    2257             :                 }
    2258           0 :                 macoff = netoff - maclen;
    2259             :         }
    2260           0 :         if (netoff > USHRT_MAX) {
    2261           0 :                 atomic_inc(&po->tp_drops);
    2262           0 :                 goto drop_n_restore;
    2263             :         }
    2264           0 :         if (po->tp_version <= TPACKET_V2) {
    2265           0 :                 if (macoff + snaplen > po->rx_ring.frame_size) {
    2266           0 :                         if (po->copy_thresh &&
    2267           0 :                             atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
    2268           0 :                                 if (skb_shared(skb)) {
    2269           0 :                                         copy_skb = skb_clone(skb, GFP_ATOMIC);
    2270             :                                 } else {
    2271           0 :                                         copy_skb = skb_get(skb);
    2272           0 :                                         skb_head = skb->data;
    2273             :                                 }
    2274           0 :                                 if (copy_skb)
    2275           0 :                                         skb_set_owner_r(copy_skb, sk);
    2276             :                         }
    2277           0 :                         snaplen = po->rx_ring.frame_size - macoff;
    2278           0 :                         if ((int)snaplen < 0) {
    2279           0 :                                 snaplen = 0;
    2280           0 :                                 do_vnet = false;
    2281             :                         }
    2282             :                 }
    2283           0 :         } else if (unlikely(macoff + snaplen >
    2284             :                             GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len)) {
    2285           0 :                 u32 nval;
    2286             : 
    2287           0 :                 nval = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len - macoff;
    2288           0 :                 pr_err_once("tpacket_rcv: packet too big, clamped from %u to %u. macoff=%u\n",
    2289             :                             snaplen, nval, macoff);
    2290           0 :                 snaplen = nval;
    2291           0 :                 if (unlikely((int)snaplen < 0)) {
    2292           0 :                         snaplen = 0;
    2293           0 :                         macoff = GET_PBDQC_FROM_RB(&po->rx_ring)->max_frame_len;
    2294           0 :                         do_vnet = false;
    2295             :                 }
    2296             :         }
    2297           0 :         spin_lock(&sk->sk_receive_queue.lock);
    2298           0 :         h.raw = packet_current_rx_frame(po, skb,
    2299             :                                         TP_STATUS_KERNEL, (macoff+snaplen));
    2300           0 :         if (!h.raw)
    2301           0 :                 goto drop_n_account;
    2302             : 
    2303           0 :         if (po->tp_version <= TPACKET_V2) {
    2304           0 :                 slot_id = po->rx_ring.head;
    2305           0 :                 if (test_bit(slot_id, po->rx_ring.rx_owner_map))
    2306           0 :                         goto drop_n_account;
    2307           0 :                 __set_bit(slot_id, po->rx_ring.rx_owner_map);
    2308             :         }
    2309             : 
    2310           0 :         if (do_vnet &&
    2311           0 :             virtio_net_hdr_from_skb(skb, h.raw + macoff -
    2312             :                                     sizeof(struct virtio_net_hdr),
    2313             :                                     vio_le(), true, 0)) {
    2314           0 :                 if (po->tp_version == TPACKET_V3)
    2315           0 :                         prb_clear_blk_fill_status(&po->rx_ring);
    2316           0 :                 goto drop_n_account;
    2317             :         }
    2318             : 
    2319           0 :         if (po->tp_version <= TPACKET_V2) {
    2320           0 :                 packet_increment_rx_head(po, &po->rx_ring);
    2321             :         /*
    2322             :          * LOSING will be reported till you read the stats,
    2323             :          * because it's COR - Clear On Read.
    2324             :          * Anyways, moving it for V1/V2 only as V3 doesn't need this
    2325             :          * at packet level.
    2326             :          */
    2327           0 :                 if (atomic_read(&po->tp_drops))
    2328           0 :                         status |= TP_STATUS_LOSING;
    2329             :         }
    2330             : 
    2331           0 :         po->stats.stats1.tp_packets++;
    2332           0 :         if (copy_skb) {
    2333           0 :                 status |= TP_STATUS_COPY;
    2334           0 :                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
    2335             :         }
    2336           0 :         spin_unlock(&sk->sk_receive_queue.lock);
    2337             : 
    2338           0 :         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
    2339             : 
    2340           0 :         if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
    2341           0 :                 ktime_get_real_ts64(&ts);
    2342             : 
    2343           0 :         status |= ts_status;
    2344             : 
    2345           0 :         switch (po->tp_version) {
    2346           0 :         case TPACKET_V1:
    2347           0 :                 h.h1->tp_len = skb->len;
    2348           0 :                 h.h1->tp_snaplen = snaplen;
    2349           0 :                 h.h1->tp_mac = macoff;
    2350           0 :                 h.h1->tp_net = netoff;
    2351           0 :                 h.h1->tp_sec = ts.tv_sec;
    2352           0 :                 h.h1->tp_usec = ts.tv_nsec / NSEC_PER_USEC;
    2353           0 :                 hdrlen = sizeof(*h.h1);
    2354           0 :                 break;
    2355           0 :         case TPACKET_V2:
    2356           0 :                 h.h2->tp_len = skb->len;
    2357           0 :                 h.h2->tp_snaplen = snaplen;
    2358           0 :                 h.h2->tp_mac = macoff;
    2359           0 :                 h.h2->tp_net = netoff;
    2360           0 :                 h.h2->tp_sec = ts.tv_sec;
    2361           0 :                 h.h2->tp_nsec = ts.tv_nsec;
    2362           0 :                 if (skb_vlan_tag_present(skb)) {
    2363           0 :                         h.h2->tp_vlan_tci = skb_vlan_tag_get(skb);
    2364           0 :                         h.h2->tp_vlan_tpid = ntohs(skb->vlan_proto);
    2365           0 :                         status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
    2366             :                 } else {
    2367           0 :                         h.h2->tp_vlan_tci = 0;
    2368           0 :                         h.h2->tp_vlan_tpid = 0;
    2369             :                 }
    2370           0 :                 memset(h.h2->tp_padding, 0, sizeof(h.h2->tp_padding));
    2371           0 :                 hdrlen = sizeof(*h.h2);
    2372           0 :                 break;
    2373           0 :         case TPACKET_V3:
    2374             :                 /* tp_nxt_offset,vlan are already populated above.
    2375             :                  * So DONT clear those fields here
    2376             :                  */
    2377           0 :                 h.h3->tp_status |= status;
    2378           0 :                 h.h3->tp_len = skb->len;
    2379           0 :                 h.h3->tp_snaplen = snaplen;
    2380           0 :                 h.h3->tp_mac = macoff;
    2381           0 :                 h.h3->tp_net = netoff;
    2382           0 :                 h.h3->tp_sec  = ts.tv_sec;
    2383           0 :                 h.h3->tp_nsec = ts.tv_nsec;
    2384           0 :                 memset(h.h3->tp_padding, 0, sizeof(h.h3->tp_padding));
    2385           0 :                 hdrlen = sizeof(*h.h3);
    2386           0 :                 break;
    2387           0 :         default:
    2388           0 :                 BUG();
    2389             :         }
    2390             : 
    2391           0 :         sll = h.raw + TPACKET_ALIGN(hdrlen);
    2392           0 :         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
    2393           0 :         sll->sll_family = AF_PACKET;
    2394           0 :         sll->sll_hatype = dev->type;
    2395           0 :         sll->sll_protocol = skb->protocol;
    2396           0 :         sll->sll_pkttype = skb->pkt_type;
    2397           0 :         if (unlikely(po->origdev))
    2398           0 :                 sll->sll_ifindex = orig_dev->ifindex;
    2399             :         else
    2400           0 :                 sll->sll_ifindex = dev->ifindex;
    2401             : 
    2402           0 :         smp_mb();
    2403             : 
    2404             : #if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
    2405             :         if (po->tp_version <= TPACKET_V2) {
    2406             :                 u8 *start, *end;
    2407             : 
    2408             :                 end = (u8 *) PAGE_ALIGN((unsigned long) h.raw +
    2409             :                                         macoff + snaplen);
    2410             : 
    2411             :                 for (start = h.raw; start < end; start += PAGE_SIZE)
    2412             :                         flush_dcache_page(pgv_to_page(start));
    2413             :         }
    2414             :         smp_wmb();
    2415             : #endif
    2416             : 
    2417           0 :         if (po->tp_version <= TPACKET_V2) {
    2418           0 :                 spin_lock(&sk->sk_receive_queue.lock);
    2419           0 :                 __packet_set_status(po, h.raw, status);
    2420           0 :                 __clear_bit(slot_id, po->rx_ring.rx_owner_map);
    2421           0 :                 spin_unlock(&sk->sk_receive_queue.lock);
    2422           0 :                 sk->sk_data_ready(sk);
    2423           0 :         } else if (po->tp_version == TPACKET_V3) {
    2424           0 :                 prb_clear_blk_fill_status(&po->rx_ring);
    2425             :         }
    2426             : 
    2427           0 : drop_n_restore:
    2428           0 :         if (skb_head != skb->data && skb_shared(skb)) {
    2429           0 :                 skb->data = skb_head;
    2430           0 :                 skb->len = skb_len;
    2431             :         }
    2432           0 : drop:
    2433           0 :         if (!is_drop_n_account)
    2434           0 :                 consume_skb(skb);
    2435             :         else
    2436           0 :                 kfree_skb(skb);
    2437           0 :         return 0;
    2438             : 
    2439           0 : drop_n_account:
    2440           0 :         spin_unlock(&sk->sk_receive_queue.lock);
    2441           0 :         atomic_inc(&po->tp_drops);
    2442           0 :         is_drop_n_account = true;
    2443             : 
    2444           0 :         sk->sk_data_ready(sk);
    2445           0 :         kfree_skb(copy_skb);
    2446           0 :         goto drop_n_restore;
    2447             : }
    2448             : 
    2449           0 : static void tpacket_destruct_skb(struct sk_buff *skb)
    2450             : {
    2451           0 :         struct packet_sock *po = pkt_sk(skb->sk);
    2452             : 
    2453           0 :         if (likely(po->tx_ring.pg_vec)) {
    2454           0 :                 void *ph;
    2455           0 :                 __u32 ts;
    2456             : 
    2457           0 :                 ph = skb_zcopy_get_nouarg(skb);
    2458           0 :                 packet_dec_pending(&po->tx_ring);
    2459             : 
    2460           0 :                 ts = __packet_set_timestamp(po, ph, skb);
    2461           0 :                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
    2462             : 
    2463           0 :                 if (!packet_read_pending(&po->tx_ring))
    2464           0 :                         complete(&po->skb_completion);
    2465             :         }
    2466             : 
    2467           0 :         sock_wfree(skb);
    2468           0 : }
    2469             : 
    2470           0 : static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
    2471             : {
    2472           0 :         if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
    2473           0 :             (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
    2474           0 :              __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
    2475           0 :               __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
    2476           0 :                 vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
    2477           0 :                          __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
    2478           0 :                         __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
    2479             : 
    2480           0 :         if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
    2481           0 :                 return -EINVAL;
    2482             : 
    2483             :         return 0;
    2484             : }
    2485             : 
    2486           0 : static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
    2487             :                                  struct virtio_net_hdr *vnet_hdr)
    2488             : {
    2489           0 :         if (*len < sizeof(*vnet_hdr))
    2490             :                 return -EINVAL;
    2491           0 :         *len -= sizeof(*vnet_hdr);
    2492             : 
    2493           0 :         if (!copy_from_iter_full(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter))
    2494           0 :                 return -EFAULT;
    2495             : 
    2496           0 :         return __packet_snd_vnet_parse(vnet_hdr, *len);
    2497             : }
    2498             : 
    2499           0 : static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
    2500             :                 void *frame, struct net_device *dev, void *data, int tp_len,
    2501             :                 __be16 proto, unsigned char *addr, int hlen, int copylen,
    2502             :                 const struct sockcm_cookie *sockc)
    2503             : {
    2504           0 :         union tpacket_uhdr ph;
    2505           0 :         int to_write, offset, len, nr_frags, len_max;
    2506           0 :         struct socket *sock = po->sk.sk_socket;
    2507           0 :         struct page *page;
    2508           0 :         int err;
    2509             : 
    2510           0 :         ph.raw = frame;
    2511             : 
    2512           0 :         skb->protocol = proto;
    2513           0 :         skb->dev = dev;
    2514           0 :         skb->priority = po->sk.sk_priority;
    2515           0 :         skb->mark = po->sk.sk_mark;
    2516           0 :         skb->tstamp = sockc->transmit_time;
    2517           0 :         skb_setup_tx_timestamp(skb, sockc->tsflags);
    2518           0 :         skb_zcopy_set_nouarg(skb, ph.raw);
    2519             : 
    2520           0 :         skb_reserve(skb, hlen);
    2521           0 :         skb_reset_network_header(skb);
    2522             : 
    2523           0 :         to_write = tp_len;
    2524             : 
    2525           0 :         if (sock->type == SOCK_DGRAM) {
    2526           0 :                 err = dev_hard_header(skb, dev, ntohs(proto), addr,
    2527             :                                 NULL, tp_len);
    2528           0 :                 if (unlikely(err < 0))
    2529             :                         return -EINVAL;
    2530           0 :         } else if (copylen) {
    2531           0 :                 int hdrlen = min_t(int, copylen, tp_len);
    2532             : 
    2533           0 :                 skb_push(skb, dev->hard_header_len);
    2534           0 :                 skb_put(skb, copylen - dev->hard_header_len);
    2535           0 :                 err = skb_store_bits(skb, 0, data, hdrlen);
    2536           0 :                 if (unlikely(err))
    2537             :                         return err;
    2538           0 :                 if (!dev_validate_header(dev, skb->data, hdrlen))
    2539             :                         return -EINVAL;
    2540             : 
    2541           0 :                 data += hdrlen;
    2542           0 :                 to_write -= hdrlen;
    2543             :         }
    2544             : 
    2545           0 :         offset = offset_in_page(data);
    2546           0 :         len_max = PAGE_SIZE - offset;
    2547           0 :         len = ((to_write > len_max) ? len_max : to_write);
    2548             : 
    2549           0 :         skb->data_len = to_write;
    2550           0 :         skb->len += to_write;
    2551           0 :         skb->truesize += to_write;
    2552           0 :         refcount_add(to_write, &po->sk.sk_wmem_alloc);
    2553             : 
    2554           0 :         while (likely(to_write)) {
    2555           0 :                 nr_frags = skb_shinfo(skb)->nr_frags;
    2556             : 
    2557           0 :                 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
    2558           0 :                         pr_err("Packet exceed the number of skb frags(%lu)\n",
    2559             :                                MAX_SKB_FRAGS);
    2560           0 :                         return -EFAULT;
    2561             :                 }
    2562             : 
    2563           0 :                 page = pgv_to_page(data);
    2564           0 :                 data += len;
    2565           0 :                 flush_dcache_page(page);
    2566           0 :                 get_page(page);
    2567           0 :                 skb_fill_page_desc(skb, nr_frags, page, offset, len);
    2568           0 :                 to_write -= len;
    2569           0 :                 offset = 0;
    2570           0 :                 len_max = PAGE_SIZE;
    2571           0 :                 len = ((to_write > len_max) ? len_max : to_write);
    2572             :         }
    2573             : 
    2574           0 :         packet_parse_headers(skb, sock);
    2575             : 
    2576           0 :         return tp_len;
    2577             : }
    2578             : 
    2579           0 : static int tpacket_parse_header(struct packet_sock *po, void *frame,
    2580             :                                 int size_max, void **data)
    2581             : {
    2582           0 :         union tpacket_uhdr ph;
    2583           0 :         int tp_len, off;
    2584             : 
    2585           0 :         ph.raw = frame;
    2586             : 
    2587           0 :         switch (po->tp_version) {
    2588           0 :         case TPACKET_V3:
    2589           0 :                 if (ph.h3->tp_next_offset != 0) {
    2590           0 :                         pr_warn_once("variable sized slot not supported");
    2591           0 :                         return -EINVAL;
    2592             :                 }
    2593           0 :                 tp_len = ph.h3->tp_len;
    2594           0 :                 break;
    2595           0 :         case TPACKET_V2:
    2596           0 :                 tp_len = ph.h2->tp_len;
    2597           0 :                 break;
    2598           0 :         default:
    2599           0 :                 tp_len = ph.h1->tp_len;
    2600           0 :                 break;
    2601             :         }
    2602           0 :         if (unlikely(tp_len > size_max)) {
    2603           0 :                 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
    2604           0 :                 return -EMSGSIZE;
    2605             :         }
    2606             : 
    2607           0 :         if (unlikely(po->tp_tx_has_off)) {
    2608           0 :                 int off_min, off_max;
    2609             : 
    2610           0 :                 off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
    2611           0 :                 off_max = po->tx_ring.frame_size - tp_len;
    2612           0 :                 if (po->sk.sk_type == SOCK_DGRAM) {
    2613           0 :                         switch (po->tp_version) {
    2614           0 :                         case TPACKET_V3:
    2615           0 :                                 off = ph.h3->tp_net;
    2616           0 :                                 break;
    2617           0 :                         case TPACKET_V2:
    2618           0 :                                 off = ph.h2->tp_net;
    2619           0 :                                 break;
    2620           0 :                         default:
    2621           0 :                                 off = ph.h1->tp_net;
    2622           0 :                                 break;
    2623             :                         }
    2624             :                 } else {
    2625           0 :                         switch (po->tp_version) {
    2626           0 :                         case TPACKET_V3:
    2627           0 :                                 off = ph.h3->tp_mac;
    2628           0 :                                 break;
    2629           0 :                         case TPACKET_V2:
    2630           0 :                                 off = ph.h2->tp_mac;
    2631           0 :                                 break;
    2632           0 :                         default:
    2633           0 :                                 off = ph.h1->tp_mac;
    2634           0 :                                 break;
    2635             :                         }
    2636             :                 }
    2637           0 :                 if (unlikely((off < off_min) || (off_max < off)))
    2638             :                         return -EINVAL;
    2639             :         } else {
    2640           0 :                 off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
    2641             :         }
    2642             : 
    2643           0 :         *data = frame + off;
    2644           0 :         return tp_len;
    2645             : }
    2646             : 
    2647           0 : static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
    2648             : {
    2649           0 :         struct sk_buff *skb = NULL;
    2650           0 :         struct net_device *dev;
    2651           0 :         struct virtio_net_hdr *vnet_hdr = NULL;
    2652           0 :         struct sockcm_cookie sockc;
    2653           0 :         __be16 proto;
    2654           0 :         int err, reserve = 0;
    2655           0 :         void *ph;
    2656           0 :         DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
    2657           0 :         bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
    2658           0 :         unsigned char *addr = NULL;
    2659           0 :         int tp_len, size_max;
    2660           0 :         void *data;
    2661           0 :         int len_sum = 0;
    2662           0 :         int status = TP_STATUS_AVAILABLE;
    2663           0 :         int hlen, tlen, copylen = 0;
    2664           0 :         long timeo = 0;
    2665             : 
    2666           0 :         mutex_lock(&po->pg_vec_lock);
    2667             : 
    2668             :         /* packet_sendmsg() check on tx_ring.pg_vec was lockless,
    2669             :          * we need to confirm it under protection of pg_vec_lock.
    2670             :          */
    2671           0 :         if (unlikely(!po->tx_ring.pg_vec)) {
    2672           0 :                 err = -EBUSY;
    2673           0 :                 goto out;
    2674             :         }
    2675           0 :         if (likely(saddr == NULL)) {
    2676           0 :                 dev     = packet_cached_dev_get(po);
    2677           0 :                 proto   = po->num;
    2678             :         } else {
    2679           0 :                 err = -EINVAL;
    2680           0 :                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
    2681           0 :                         goto out;
    2682           0 :                 if (msg->msg_namelen < (saddr->sll_halen
    2683           0 :                                         + offsetof(struct sockaddr_ll,
    2684             :                                                 sll_addr)))
    2685           0 :                         goto out;
    2686           0 :                 proto   = saddr->sll_protocol;
    2687           0 :                 dev = dev_get_by_index(sock_net(&po->sk), saddr->sll_ifindex);
    2688           0 :                 if (po->sk.sk_socket->type == SOCK_DGRAM) {
    2689           0 :                         if (dev && msg->msg_namelen < dev->addr_len +
    2690             :                                    offsetof(struct sockaddr_ll, sll_addr))
    2691           0 :                                 goto out_put;
    2692           0 :                         addr = saddr->sll_addr;
    2693             :                 }
    2694             :         }
    2695             : 
    2696           0 :         err = -ENXIO;
    2697           0 :         if (unlikely(dev == NULL))
    2698           0 :                 goto out;
    2699           0 :         err = -ENETDOWN;
    2700           0 :         if (unlikely(!(dev->flags & IFF_UP)))
    2701           0 :                 goto out_put;
    2702             : 
    2703           0 :         sockcm_init(&sockc, &po->sk);
    2704           0 :         if (msg->msg_controllen) {
    2705           0 :                 err = sock_cmsg_send(&po->sk, msg, &sockc);
    2706           0 :                 if (unlikely(err))
    2707           0 :                         goto out_put;
    2708             :         }
    2709             : 
    2710           0 :         if (po->sk.sk_socket->type == SOCK_RAW)
    2711           0 :                 reserve = dev->hard_header_len;
    2712           0 :         size_max = po->tx_ring.frame_size
    2713           0 :                 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
    2714             : 
    2715           0 :         if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
    2716           0 :                 size_max = dev->mtu + reserve + VLAN_HLEN;
    2717             : 
    2718           0 :         reinit_completion(&po->skb_completion);
    2719             : 
    2720           0 :         do {
    2721           0 :                 ph = packet_current_frame(po, &po->tx_ring,
    2722             :                                           TP_STATUS_SEND_REQUEST);
    2723           0 :                 if (unlikely(ph == NULL)) {
    2724           0 :                         if (need_wait && skb) {
    2725           0 :                                 timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
    2726           0 :                                 timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
    2727           0 :                                 if (timeo <= 0) {
    2728           0 :                                         err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
    2729           0 :                                         goto out_put;
    2730             :                                 }
    2731             :                         }
    2732             :                         /* check for additional frames */
    2733           0 :                         continue;
    2734             :                 }
    2735             : 
    2736           0 :                 skb = NULL;
    2737           0 :                 tp_len = tpacket_parse_header(po, ph, size_max, &data);
    2738           0 :                 if (tp_len < 0)
    2739           0 :                         goto tpacket_error;
    2740             : 
    2741           0 :                 status = TP_STATUS_SEND_REQUEST;
    2742           0 :                 hlen = LL_RESERVED_SPACE(dev);
    2743           0 :                 tlen = dev->needed_tailroom;
    2744           0 :                 if (po->has_vnet_hdr) {
    2745           0 :                         vnet_hdr = data;
    2746           0 :                         data += sizeof(*vnet_hdr);
    2747           0 :                         tp_len -= sizeof(*vnet_hdr);
    2748           0 :                         if (tp_len < 0 ||
    2749           0 :                             __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
    2750           0 :                                 tp_len = -EINVAL;
    2751           0 :                                 goto tpacket_error;
    2752             :                         }
    2753           0 :                         copylen = __virtio16_to_cpu(vio_le(),
    2754           0 :                                                     vnet_hdr->hdr_len);
    2755             :                 }
    2756           0 :                 copylen = max_t(int, copylen, dev->hard_header_len);
    2757           0 :                 skb = sock_alloc_send_skb(&po->sk,
    2758           0 :                                 hlen + tlen + sizeof(struct sockaddr_ll) +
    2759           0 :                                 (copylen - dev->hard_header_len),
    2760           0 :                                 !need_wait, &err);
    2761             : 
    2762           0 :                 if (unlikely(skb == NULL)) {
    2763             :                         /* we assume the socket was initially writeable ... */
    2764           0 :                         if (likely(len_sum > 0))
    2765           0 :                                 err = len_sum;
    2766           0 :                         goto out_status;
    2767             :                 }
    2768           0 :                 tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
    2769             :                                           addr, hlen, copylen, &sockc);
    2770           0 :                 if (likely(tp_len >= 0) &&
    2771           0 :                     tp_len > dev->mtu + reserve &&
    2772           0 :                     !po->has_vnet_hdr &&
    2773           0 :                     !packet_extra_vlan_len_allowed(dev, skb))
    2774             :                         tp_len = -EMSGSIZE;
    2775             : 
    2776           0 :                 if (unlikely(tp_len < 0)) {
    2777           0 : tpacket_error:
    2778           0 :                         if (po->tp_loss) {
    2779           0 :                                 __packet_set_status(po, ph,
    2780             :                                                 TP_STATUS_AVAILABLE);
    2781           0 :                                 packet_increment_head(&po->tx_ring);
    2782           0 :                                 kfree_skb(skb);
    2783           0 :                                 continue;
    2784             :                         } else {
    2785           0 :                                 status = TP_STATUS_WRONG_FORMAT;
    2786           0 :                                 err = tp_len;
    2787           0 :                                 goto out_status;
    2788             :                         }
    2789             :                 }
    2790             : 
    2791           0 :                 if (po->has_vnet_hdr) {
    2792           0 :                         if (virtio_net_hdr_to_skb(skb, vnet_hdr, vio_le())) {
    2793           0 :                                 tp_len = -EINVAL;
    2794           0 :                                 goto tpacket_error;
    2795             :                         }
    2796           0 :                         virtio_net_hdr_set_proto(skb, vnet_hdr);
    2797             :                 }
    2798             : 
    2799           0 :                 skb->destructor = tpacket_destruct_skb;
    2800           0 :                 __packet_set_status(po, ph, TP_STATUS_SENDING);
    2801           0 :                 packet_inc_pending(&po->tx_ring);
    2802             : 
    2803           0 :                 status = TP_STATUS_SEND_REQUEST;
    2804           0 :                 err = po->xmit(skb);
    2805           0 :                 if (unlikely(err > 0)) {
    2806           0 :                         err = net_xmit_errno(err);
    2807           0 :                         if (err && __packet_get_status(po, ph) ==
    2808             :                                    TP_STATUS_AVAILABLE) {
    2809             :                                 /* skb was destructed already */
    2810           0 :                                 skb = NULL;
    2811           0 :                                 goto out_status;
    2812             :                         }
    2813             :                         /*
    2814             :                          * skb was dropped but not destructed yet;
    2815             :                          * let's treat it like congestion or err < 0
    2816             :                          */
    2817           0 :                         err = 0;
    2818             :                 }
    2819           0 :                 packet_increment_head(&po->tx_ring);
    2820           0 :                 len_sum += tp_len;
    2821           0 :         } while (likely((ph != NULL) ||
    2822             :                 /* Note: packet_read_pending() might be slow if we have
    2823             :                  * to call it as it's per_cpu variable, but in fast-path
    2824             :                  * we already short-circuit the loop with the first
    2825             :                  * condition, and luckily don't have to go that path
    2826             :                  * anyway.
    2827             :                  */
    2828             :                  (need_wait && packet_read_pending(&po->tx_ring))));
    2829             : 
    2830           0 :         err = len_sum;
    2831           0 :         goto out_put;
    2832             : 
    2833           0 : out_status:
    2834           0 :         __packet_set_status(po, ph, status);
    2835           0 :         kfree_skb(skb);
    2836           0 : out_put:
    2837           0 :         dev_put(dev);
    2838           0 : out:
    2839           0 :         mutex_unlock(&po->pg_vec_lock);
    2840           0 :         return err;
    2841             : }
    2842             : 
    2843           2 : static struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
    2844             :                                         size_t reserve, size_t len,
    2845             :                                         size_t linear, int noblock,
    2846             :                                         int *err)
    2847             : {
    2848           2 :         struct sk_buff *skb;
    2849             : 
    2850             :         /* Under a page?  Don't bother with paged skb. */
    2851           2 :         if (prepad + len < PAGE_SIZE || !linear)
    2852           2 :                 linear = len;
    2853             : 
    2854           2 :         skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
    2855             :                                    err, 0);
    2856           2 :         if (!skb)
    2857             :                 return NULL;
    2858             : 
    2859           2 :         skb_reserve(skb, reserve);
    2860           2 :         skb_put(skb, linear);
    2861           2 :         skb->data_len = len - linear;
    2862           2 :         skb->len += len - linear;
    2863             : 
    2864           2 :         return skb;
    2865             : }
    2866             : 
    2867           2 : static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
    2868             : {
    2869           2 :         struct sock *sk = sock->sk;
    2870           2 :         DECLARE_SOCKADDR(struct sockaddr_ll *, saddr, msg->msg_name);
    2871           2 :         struct sk_buff *skb;
    2872           2 :         struct net_device *dev;
    2873           2 :         __be16 proto;
    2874           2 :         unsigned char *addr = NULL;
    2875           2 :         int err, reserve = 0;
    2876           2 :         struct sockcm_cookie sockc;
    2877           2 :         struct virtio_net_hdr vnet_hdr = { 0 };
    2878           2 :         int offset = 0;
    2879           2 :         struct packet_sock *po = pkt_sk(sk);
    2880           2 :         bool has_vnet_hdr = false;
    2881           2 :         int hlen, tlen, linear;
    2882           2 :         int extra_len = 0;
    2883             : 
    2884             :         /*
    2885             :          *      Get and verify the address.
    2886             :          */
    2887             : 
    2888           2 :         if (likely(saddr == NULL)) {
    2889           2 :                 dev     = packet_cached_dev_get(po);
    2890           2 :                 proto   = po->num;
    2891             :         } else {
    2892           0 :                 err = -EINVAL;
    2893           0 :                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
    2894           0 :                         goto out;
    2895           0 :                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
    2896           0 :                         goto out;
    2897           0 :                 proto   = saddr->sll_protocol;
    2898           0 :                 dev = dev_get_by_index(sock_net(sk), saddr->sll_ifindex);
    2899           0 :                 if (sock->type == SOCK_DGRAM) {
    2900           0 :                         if (dev && msg->msg_namelen < dev->addr_len +
    2901             :                                    offsetof(struct sockaddr_ll, sll_addr))
    2902           0 :                                 goto out_unlock;
    2903           0 :                         addr = saddr->sll_addr;
    2904             :                 }
    2905             :         }
    2906             : 
    2907           2 :         err = -ENXIO;
    2908           2 :         if (unlikely(dev == NULL))
    2909           0 :                 goto out_unlock;
    2910           2 :         err = -ENETDOWN;
    2911           2 :         if (unlikely(!(dev->flags & IFF_UP)))
    2912           0 :                 goto out_unlock;
    2913             : 
    2914           2 :         sockcm_init(&sockc, sk);
    2915           2 :         sockc.mark = sk->sk_mark;
    2916           2 :         if (msg->msg_controllen) {
    2917           0 :                 err = sock_cmsg_send(sk, msg, &sockc);
    2918           0 :                 if (unlikely(err))
    2919           0 :                         goto out_unlock;
    2920             :         }
    2921             : 
    2922           2 :         if (sock->type == SOCK_RAW)
    2923           2 :                 reserve = dev->hard_header_len;
    2924           2 :         if (po->has_vnet_hdr) {
    2925           0 :                 err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
    2926           0 :                 if (err)
    2927           0 :                         goto out_unlock;
    2928             :                 has_vnet_hdr = true;
    2929             :         }
    2930             : 
    2931           2 :         if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
    2932           0 :                 if (!netif_supports_nofcs(dev)) {
    2933           0 :                         err = -EPROTONOSUPPORT;
    2934           0 :                         goto out_unlock;
    2935             :                 }
    2936             :                 extra_len = 4; /* We're doing our own CRC */
    2937             :         }
    2938             : 
    2939           2 :         err = -EMSGSIZE;
    2940           2 :         if (!vnet_hdr.gso_type &&
    2941           2 :             (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
    2942           0 :                 goto out_unlock;
    2943             : 
    2944           2 :         err = -ENOBUFS;
    2945           2 :         hlen = LL_RESERVED_SPACE(dev);
    2946           2 :         tlen = dev->needed_tailroom;
    2947           2 :         linear = __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len);
    2948           2 :         linear = max(linear, min_t(int, len, dev->hard_header_len));
    2949           4 :         skb = packet_alloc_skb(sk, hlen + tlen, hlen, len, linear,
    2950           2 :                                msg->msg_flags & MSG_DONTWAIT, &err);
    2951           2 :         if (skb == NULL)
    2952           0 :                 goto out_unlock;
    2953             : 
    2954           2 :         skb_reset_network_header(skb);
    2955             : 
    2956           2 :         err = -EINVAL;
    2957           2 :         if (sock->type == SOCK_DGRAM) {
    2958           0 :                 offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
    2959           0 :                 if (unlikely(offset < 0))
    2960           0 :                         goto out_free;
    2961           2 :         } else if (reserve) {
    2962           2 :                 skb_reserve(skb, -reserve);
    2963           2 :                 if (len < reserve + sizeof(struct ipv6hdr) &&
    2964           0 :                     dev->min_header_len != dev->hard_header_len)
    2965           0 :                         skb_reset_network_header(skb);
    2966             :         }
    2967             : 
    2968             :         /* Returns -EFAULT on error */
    2969           2 :         err = skb_copy_datagram_from_iter(skb, offset, &msg->msg_iter, len);
    2970           2 :         if (err)
    2971           0 :                 goto out_free;
    2972             : 
    2973           4 :         if (sock->type == SOCK_RAW &&
    2974           2 :             !dev_validate_header(dev, skb->data, len)) {
    2975           0 :                 err = -EINVAL;
    2976           0 :                 goto out_free;
    2977             :         }
    2978             : 
    2979           2 :         skb_setup_tx_timestamp(skb, sockc.tsflags);
    2980             : 
    2981           2 :         if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
    2982           0 :             !packet_extra_vlan_len_allowed(dev, skb)) {
    2983           0 :                 err = -EMSGSIZE;
    2984           0 :                 goto out_free;
    2985             :         }
    2986             : 
    2987           2 :         skb->protocol = proto;
    2988           2 :         skb->dev = dev;
    2989           2 :         skb->priority = sk->sk_priority;
    2990           2 :         skb->mark = sockc.mark;
    2991           2 :         skb->tstamp = sockc.transmit_time;
    2992             : 
    2993           2 :         if (has_vnet_hdr) {
    2994           0 :                 err = virtio_net_hdr_to_skb(skb, &vnet_hdr, vio_le());
    2995           0 :                 if (err)
    2996           0 :                         goto out_free;
    2997           0 :                 len += sizeof(vnet_hdr);
    2998           0 :                 virtio_net_hdr_set_proto(skb, &vnet_hdr);
    2999             :         }
    3000             : 
    3001           2 :         packet_parse_headers(skb, sock);
    3002             : 
    3003           2 :         if (unlikely(extra_len == 4))
    3004           0 :                 skb->no_fcs = 1;
    3005             : 
    3006           2 :         err = po->xmit(skb);
    3007           2 :         if (err > 0 && (err = net_xmit_errno(err)) != 0)
    3008           0 :                 goto out_unlock;
    3009             : 
    3010           2 :         dev_put(dev);
    3011             : 
    3012           2 :         return len;
    3013             : 
    3014           0 : out_free:
    3015           0 :         kfree_skb(skb);
    3016           0 : out_unlock:
    3017           0 :         if (dev)
    3018           0 :                 dev_put(dev);
    3019           0 : out:
    3020           0 :         return err;
    3021             : }
    3022             : 
    3023           2 : static int packet_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
    3024             : {
    3025           2 :         struct sock *sk = sock->sk;
    3026           2 :         struct packet_sock *po = pkt_sk(sk);
    3027             : 
    3028           2 :         if (po->tx_ring.pg_vec)
    3029           0 :                 return tpacket_snd(po, msg);
    3030             :         else
    3031           2 :                 return packet_snd(sock, msg, len);
    3032             : }
    3033             : 
    3034             : /*
    3035             :  *      Close a PACKET socket. This is fairly simple. We immediately go
    3036             :  *      to 'closed' state and remove our protocol entry in the device list.
    3037             :  */
    3038             : 
    3039           0 : static int packet_release(struct socket *sock)
    3040             : {
    3041           0 :         struct sock *sk = sock->sk;
    3042           0 :         struct packet_sock *po;
    3043           0 :         struct packet_fanout *f;
    3044           0 :         struct net *net;
    3045           0 :         union tpacket_req_u req_u;
    3046             : 
    3047           0 :         if (!sk)
    3048             :                 return 0;
    3049             : 
    3050           0 :         net = sock_net(sk);
    3051           0 :         po = pkt_sk(sk);
    3052             : 
    3053           0 :         mutex_lock(&net->packet.sklist_lock);
    3054           0 :         sk_del_node_init_rcu(sk);
    3055           0 :         mutex_unlock(&net->packet.sklist_lock);
    3056             : 
    3057           0 :         preempt_disable();
    3058           0 :         sock_prot_inuse_add(net, sk->sk_prot, -1);
    3059           0 :         preempt_enable();
    3060             : 
    3061           0 :         spin_lock(&po->bind_lock);
    3062           0 :         unregister_prot_hook(sk, false);
    3063           0 :         packet_cached_dev_reset(po);
    3064             : 
    3065           0 :         if (po->prot_hook.dev) {
    3066           0 :                 dev_put(po->prot_hook.dev);
    3067           0 :                 po->prot_hook.dev = NULL;
    3068             :         }
    3069           0 :         spin_unlock(&po->bind_lock);
    3070             : 
    3071           0 :         packet_flush_mclist(sk);
    3072             : 
    3073           0 :         lock_sock(sk);
    3074           0 :         if (po->rx_ring.pg_vec) {
    3075           0 :                 memset(&req_u, 0, sizeof(req_u));
    3076           0 :                 packet_set_ring(sk, &req_u, 1, 0);
    3077             :         }
    3078             : 
    3079           0 :         if (po->tx_ring.pg_vec) {
    3080           0 :                 memset(&req_u, 0, sizeof(req_u));
    3081           0 :                 packet_set_ring(sk, &req_u, 1, 1);
    3082             :         }
    3083           0 :         release_sock(sk);
    3084             : 
    3085           0 :         f = fanout_release(sk);
    3086             : 
    3087           0 :         synchronize_net();
    3088             : 
    3089           0 :         kfree(po->rollover);
    3090           0 :         if (f) {
    3091           0 :                 fanout_release_data(f);
    3092           0 :                 kvfree(f);
    3093             :         }
    3094             :         /*
    3095             :          *      Now the socket is dead. No more input will appear.
    3096             :          */
    3097           0 :         sock_orphan(sk);
    3098           0 :         sock->sk = NULL;
    3099             : 
    3100             :         /* Purge queues */
    3101             : 
    3102           0 :         skb_queue_purge(&sk->sk_receive_queue);
    3103           0 :         packet_free_pending(po);
    3104           0 :         sk_refcnt_debug_release(sk);
    3105             : 
    3106           0 :         sock_put(sk);
    3107           0 :         return 0;
    3108             : }
    3109             : 
    3110             : /*
    3111             :  *      Attach a packet hook.
    3112             :  */
    3113             : 
    3114           1 : static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
    3115             :                           __be16 proto)
    3116             : {
    3117           1 :         struct packet_sock *po = pkt_sk(sk);
    3118           1 :         struct net_device *dev_curr;
    3119           1 :         __be16 proto_curr;
    3120           1 :         bool need_rehook;
    3121           1 :         struct net_device *dev = NULL;
    3122           1 :         int ret = 0;
    3123           1 :         bool unlisted = false;
    3124             : 
    3125           1 :         lock_sock(sk);
    3126           1 :         spin_lock(&po->bind_lock);
    3127           1 :         rcu_read_lock();
    3128             : 
    3129           1 :         if (po->fanout) {
    3130           0 :                 ret = -EINVAL;
    3131           0 :                 goto out_unlock;
    3132             :         }
    3133             : 
    3134           1 :         if (name) {
    3135           0 :                 dev = dev_get_by_name_rcu(sock_net(sk), name);
    3136           0 :                 if (!dev) {
    3137           0 :                         ret = -ENODEV;
    3138           0 :                         goto out_unlock;
    3139             :                 }
    3140           1 :         } else if (ifindex) {
    3141           1 :                 dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
    3142           1 :                 if (!dev) {
    3143           0 :                         ret = -ENODEV;
    3144           0 :                         goto out_unlock;
    3145             :                 }
    3146             :         }
    3147             : 
    3148           1 :         if (dev)
    3149           1 :                 dev_hold(dev);
    3150             : 
    3151           1 :         proto_curr = po->prot_hook.type;
    3152           1 :         dev_curr = po->prot_hook.dev;
    3153             : 
    3154           1 :         need_rehook = proto_curr != proto || dev_curr != dev;
    3155             : 
    3156           1 :         if (need_rehook) {
    3157           1 :                 if (po->running) {
    3158           1 :                         rcu_read_unlock();
    3159             :                         /* prevents packet_notifier() from calling
    3160             :                          * register_prot_hook()
    3161             :                          */
    3162           1 :                         po->num = 0;
    3163           1 :                         __unregister_prot_hook(sk, true);
    3164           1 :                         rcu_read_lock();
    3165           1 :                         dev_curr = po->prot_hook.dev;
    3166           1 :                         if (dev)
    3167           1 :                                 unlisted = !dev_get_by_index_rcu(sock_net(sk),
    3168             :                                                                  dev->ifindex);
    3169             :                 }
    3170             : 
    3171           1 :                 BUG_ON(po->running);
    3172           1 :                 po->num = proto;
    3173           1 :                 po->prot_hook.type = proto;
    3174             : 
    3175           1 :                 if (unlikely(unlisted)) {
    3176           0 :                         dev_put(dev);
    3177           0 :                         po->prot_hook.dev = NULL;
    3178           0 :                         po->ifindex = -1;
    3179           0 :                         packet_cached_dev_reset(po);
    3180             :                 } else {
    3181           1 :                         po->prot_hook.dev = dev;
    3182           1 :                         po->ifindex = dev ? dev->ifindex : 0;
    3183           1 :                         packet_cached_dev_assign(po, dev);
    3184             :                 }
    3185             :         }
    3186           1 :         if (dev_curr)
    3187           0 :                 dev_put(dev_curr);
    3188             : 
    3189           1 :         if (proto == 0 || !need_rehook)
    3190           0 :                 goto out_unlock;
    3191             : 
    3192           1 :         if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
    3193           1 :                 register_prot_hook(sk);
    3194             :         } else {
    3195           0 :                 sk->sk_err = ENETDOWN;
    3196           0 :                 if (!sock_flag(sk, SOCK_DEAD))
    3197           0 :                         sk->sk_error_report(sk);
    3198             :         }
    3199             : 
    3200           0 : out_unlock:
    3201           1 :         rcu_read_unlock();
    3202           1 :         spin_unlock(&po->bind_lock);
    3203           1 :         release_sock(sk);
    3204           1 :         return ret;
    3205             : }
    3206             : 
    3207             : /*
    3208             :  *      Bind a packet socket to a device
    3209             :  */
    3210             : 
    3211           0 : static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
    3212             :                             int addr_len)
    3213             : {
    3214           0 :         struct sock *sk = sock->sk;
    3215           0 :         char name[sizeof(uaddr->sa_data) + 1];
    3216             : 
    3217             :         /*
    3218             :          *      Check legality
    3219             :          */
    3220             : 
    3221           0 :         if (addr_len != sizeof(struct sockaddr))
    3222             :                 return -EINVAL;
    3223             :         /* uaddr->sa_data comes from the userspace, it's not guaranteed to be
    3224             :          * zero-terminated.
    3225             :          */
    3226           0 :         memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data));
    3227           0 :         name[sizeof(uaddr->sa_data)] = 0;
    3228             : 
    3229           0 :         return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
    3230             : }
    3231             : 
    3232           1 : static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
    3233             : {
    3234           1 :         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
    3235           1 :         struct sock *sk = sock->sk;
    3236             : 
    3237             :         /*
    3238             :          *      Check legality
    3239             :          */
    3240             : 
    3241           1 :         if (addr_len < sizeof(struct sockaddr_ll))
    3242             :                 return -EINVAL;
    3243           1 :         if (sll->sll_family != AF_PACKET)
    3244             :                 return -EINVAL;
    3245             : 
    3246           1 :         return packet_do_bind(sk, NULL, sll->sll_ifindex,
    3247           1 :                               sll->sll_protocol ? : pkt_sk(sk)->num);
    3248             : }
    3249             : 
    3250             : static struct proto packet_proto = {
    3251             :         .name     = "PACKET",
    3252             :         .owner    = THIS_MODULE,
    3253             :         .obj_size = sizeof(struct packet_sock),
    3254             : };
    3255             : 
    3256             : /*
    3257             :  *      Create a packet of type SOCK_PACKET.
    3258             :  */
    3259             : 
    3260           1 : static int packet_create(struct net *net, struct socket *sock, int protocol,
    3261             :                          int kern)
    3262             : {
    3263           1 :         struct sock *sk;
    3264           1 :         struct packet_sock *po;
    3265           1 :         __be16 proto = (__force __be16)protocol; /* weird, but documented */
    3266           1 :         int err;
    3267             : 
    3268           1 :         if (!ns_capable(net->user_ns, CAP_NET_RAW))
    3269             :                 return -EPERM;
    3270           1 :         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
    3271             :             sock->type != SOCK_PACKET)
    3272             :                 return -ESOCKTNOSUPPORT;
    3273             : 
    3274           1 :         sock->state = SS_UNCONNECTED;
    3275             : 
    3276           1 :         err = -ENOBUFS;
    3277           1 :         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, kern);
    3278           1 :         if (sk == NULL)
    3279           0 :                 goto out;
    3280             : 
    3281           1 :         sock->ops = &packet_ops;
    3282           1 :         if (sock->type == SOCK_PACKET)
    3283           0 :                 sock->ops = &packet_ops_spkt;
    3284             : 
    3285           1 :         sock_init_data(sock, sk);
    3286             : 
    3287           1 :         po = pkt_sk(sk);
    3288           1 :         init_completion(&po->skb_completion);
    3289           1 :         sk->sk_family = PF_PACKET;
    3290           1 :         po->num = proto;
    3291           1 :         po->xmit = dev_queue_xmit;
    3292             : 
    3293           1 :         err = packet_alloc_pending(po);
    3294           1 :         if (err)
    3295           0 :                 goto out2;
    3296             : 
    3297           1 :         packet_cached_dev_reset(po);
    3298             : 
    3299           1 :         sk->sk_destruct = packet_sock_destruct;
    3300           1 :         sk_refcnt_debug_inc(sk);
    3301             : 
    3302             :         /*
    3303             :          *      Attach a protocol block
    3304             :          */
    3305             : 
    3306           1 :         spin_lock_init(&po->bind_lock);
    3307           1 :         mutex_init(&po->pg_vec_lock);
    3308           1 :         po->rollover = NULL;
    3309           1 :         po->prot_hook.func = packet_rcv;
    3310             : 
    3311           1 :         if (sock->type == SOCK_PACKET)
    3312           0 :                 po->prot_hook.func = packet_rcv_spkt;
    3313             : 
    3314           1 :         po->prot_hook.af_packet_priv = sk;
    3315             : 
    3316           1 :         if (proto) {
    3317           1 :                 po->prot_hook.type = proto;
    3318           1 :                 __register_prot_hook(sk);
    3319             :         }
    3320             : 
    3321           1 :         mutex_lock(&net->packet.sklist_lock);
    3322           1 :         sk_add_node_tail_rcu(sk, &net->packet.sklist);
    3323           1 :         mutex_unlock(&net->packet.sklist_lock);
    3324             : 
    3325           1 :         preempt_disable();
    3326           1 :         sock_prot_inuse_add(net, &packet_proto, 1);
    3327           1 :         preempt_enable();
    3328             : 
    3329           1 :         return 0;
    3330           0 : out2:
    3331           0 :         sk_free(sk);
    3332             : out:
    3333             :         return err;
    3334             : }
    3335             : 
    3336             : /*
    3337             :  *      Pull a packet from our receive queue and hand it to the user.
    3338             :  *      If necessary we block.
    3339             :  */
    3340             : 
    3341           2 : static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
    3342             :                           int flags)
    3343             : {
    3344           2 :         struct sock *sk = sock->sk;
    3345           2 :         struct sk_buff *skb;
    3346           2 :         int copied, err;
    3347           2 :         int vnet_hdr_len = 0;
    3348           2 :         unsigned int origlen = 0;
    3349             : 
    3350           2 :         err = -EINVAL;
    3351           2 :         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT|MSG_ERRQUEUE))
    3352           0 :                 goto out;
    3353             : 
    3354             : #if 0
    3355             :         /* What error should we return now? EUNATTACH? */
    3356             :         if (pkt_sk(sk)->ifindex < 0)
    3357             :                 return -ENODEV;
    3358             : #endif
    3359             : 
    3360           2 :         if (flags & MSG_ERRQUEUE) {
    3361           0 :                 err = sock_recv_errqueue(sk, msg, len,
    3362             :                                          SOL_PACKET, PACKET_TX_TIMESTAMP);
    3363           0 :                 goto out;
    3364             :         }
    3365             : 
    3366             :         /*
    3367             :          *      Call the generic datagram receiver. This handles all sorts
    3368             :          *      of horrible races and re-entrancy so we can forget about it
    3369             :          *      in the protocol layers.
    3370             :          *
    3371             :          *      Now it will return ENETDOWN, if device have just gone down,
    3372             :          *      but then it will block.
    3373             :          */
    3374             : 
    3375           2 :         skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
    3376             : 
    3377             :         /*
    3378             :          *      An error occurred so return it. Because skb_recv_datagram()
    3379             :          *      handles the blocking we don't see and worry about blocking
    3380             :          *      retries.
    3381             :          */
    3382             : 
    3383           2 :         if (skb == NULL)
    3384           0 :                 goto out;
    3385             : 
    3386           2 :         packet_rcv_try_clear_pressure(pkt_sk(sk));
    3387             : 
    3388           2 :         if (pkt_sk(sk)->has_vnet_hdr) {
    3389           0 :                 err = packet_rcv_vnet(msg, skb, &len);
    3390           0 :                 if (err)
    3391           0 :                         goto out_free;
    3392             :                 vnet_hdr_len = sizeof(struct virtio_net_hdr);
    3393             :         }
    3394             : 
    3395             :         /* You lose any data beyond the buffer you gave. If it worries
    3396             :          * a user program they can ask the device for its MTU
    3397             :          * anyway.
    3398             :          */
    3399           2 :         copied = skb->len;
    3400           2 :         if (copied > len) {
    3401           0 :                 copied = len;
    3402           0 :                 msg->msg_flags |= MSG_TRUNC;
    3403             :         }
    3404             : 
    3405           2 :         err = skb_copy_datagram_msg(skb, 0, msg, copied);
    3406           2 :         if (err)
    3407           0 :                 goto out_free;
    3408             : 
    3409           2 :         if (sock->type != SOCK_PACKET) {
    3410           2 :                 struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
    3411             : 
    3412             :                 /* Original length was stored in sockaddr_ll fields */
    3413           2 :                 origlen = PACKET_SKB_CB(skb)->sa.origlen;
    3414           2 :                 sll->sll_family = AF_PACKET;
    3415           2 :                 sll->sll_protocol = skb->protocol;
    3416             :         }
    3417             : 
    3418           2 :         sock_recv_ts_and_drops(msg, sk, skb);
    3419             : 
    3420           2 :         if (msg->msg_name) {
    3421           2 :                 int copy_len;
    3422             : 
    3423             :                 /* If the address length field is there to be filled
    3424             :                  * in, we fill it in now.
    3425             :                  */
    3426           2 :                 if (sock->type == SOCK_PACKET) {
    3427           0 :                         __sockaddr_check_size(sizeof(struct sockaddr_pkt));
    3428           0 :                         msg->msg_namelen = sizeof(struct sockaddr_pkt);
    3429           0 :                         copy_len = msg->msg_namelen;
    3430             :                 } else {
    3431           2 :                         struct sockaddr_ll *sll = &PACKET_SKB_CB(skb)->sa.ll;
    3432             : 
    3433           2 :                         msg->msg_namelen = sll->sll_halen +
    3434             :                                 offsetof(struct sockaddr_ll, sll_addr);
    3435           2 :                         copy_len = msg->msg_namelen;
    3436           2 :                         if (msg->msg_namelen < sizeof(struct sockaddr_ll)) {
    3437           2 :                                 memset(msg->msg_name +
    3438             :                                        offsetof(struct sockaddr_ll, sll_addr),
    3439             :                                        0, sizeof(sll->sll_addr));
    3440           2 :                                 msg->msg_namelen = sizeof(struct sockaddr_ll);
    3441             :                         }
    3442             :                 }
    3443           2 :                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa, copy_len);
    3444             :         }
    3445             : 
    3446           2 :         if (pkt_sk(sk)->auxdata) {
    3447           2 :                 struct tpacket_auxdata aux;
    3448             : 
    3449           2 :                 aux.tp_status = TP_STATUS_USER;
    3450           2 :                 if (skb->ip_summed == CHECKSUM_PARTIAL)
    3451           0 :                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
    3452           2 :                 else if (skb->pkt_type != PACKET_OUTGOING &&
    3453           2 :                          (skb->ip_summed == CHECKSUM_COMPLETE ||
    3454           2 :                           skb_csum_unnecessary(skb)))
    3455           2 :                         aux.tp_status |= TP_STATUS_CSUM_VALID;
    3456             : 
    3457           2 :                 aux.tp_len = origlen;
    3458           2 :                 aux.tp_snaplen = skb->len;
    3459           2 :                 aux.tp_mac = 0;
    3460           2 :                 aux.tp_net = skb_network_offset(skb);
    3461           2 :                 if (skb_vlan_tag_present(skb)) {
    3462           0 :                         aux.tp_vlan_tci = skb_vlan_tag_get(skb);
    3463           0 :                         aux.tp_vlan_tpid = ntohs(skb->vlan_proto);
    3464           0 :                         aux.tp_status |= TP_STATUS_VLAN_VALID | TP_STATUS_VLAN_TPID_VALID;
    3465             :                 } else {
    3466           2 :                         aux.tp_vlan_tci = 0;
    3467           2 :                         aux.tp_vlan_tpid = 0;
    3468             :                 }
    3469           2 :                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
    3470             :         }
    3471             : 
    3472             :         /*
    3473             :          *      Free or return the buffer as appropriate. Again this
    3474             :          *      hides all the races and re-entrancy issues from us.
    3475             :          */
    3476           2 :         err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
    3477             : 
    3478           2 : out_free:
    3479           2 :         skb_free_datagram(sk, skb);
    3480           2 : out:
    3481           2 :         return err;
    3482             : }
    3483             : 
    /*
     * Report the local name of the socket in the generic struct sockaddr
     * layout (presumably the SOCK_PACKET flavour of getname -- confirm
     * against the proto_ops table that references this function).
     *
     * sa_family is set to AF_PACKET and sa_data receives the name of the
     * device whose index is pkt_sk(sk)->ifindex.  sa_data is left all-zero
     * when no such device is found in the socket's network namespace.
     *
     * Returns -EOPNOTSUPP when @peer is non-zero, otherwise sizeof(*uaddr).
     */
    3484           0 : static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
    3485             :                                int peer)
    3486             : {
    3487           0 :         struct net_device *dev;
    3488           0 :         struct sock *sk = sock->sk;
    3489             : 
    3490           0 :         if (peer)
    3491             :                 return -EOPNOTSUPP;
    3492             : 
    3493           0 :         uaddr->sa_family = AF_PACKET;
                               /* Clear the whole name field first so that the
                                * "device not found" path returns zeroes.
                                */
    3494           0 :         memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
                               /* The lookup and the copy of dev->name both happen
                                * inside the RCU read-side section.
                                */
    3495           0 :         rcu_read_lock();
    3496           0 :         dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
    3497           0 :         if (dev)
                                       /* Copy is bounded by sizeof(uaddr->sa_data). */
    3498           0 :                 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
    3499           0 :         rcu_read_unlock();
    3500             : 
    3501           0 :         return sizeof(*uaddr);
    3502             : }
    3503             : 
    /*
     * Report the local name of the socket as a struct sockaddr_ll written
     * through @uaddr.
     *
     * sll_family is AF_PACKET, sll_ifindex and sll_protocol come from the
     * packet socket (po->ifindex, po->num) and sll_pkttype is 0.  When the
     * device with index po->ifindex exists in the socket's network
     * namespace, its type, address length and hardware address are copied
     * in; otherwise sll_hatype and sll_halen are both 0.
     *
     * Returns -EOPNOTSUPP when @peer is non-zero, otherwise the number of
     * bytes that are meaningful: the offset of sll_addr plus sll_halen.
     */
    3504           0 : static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
    3505             :                           int peer)
    3506             : {
    3507           0 :         struct net_device *dev;
    3508           0 :         struct sock *sk = sock->sk;
    3509           0 :         struct packet_sock *po = pkt_sk(sk);
    3510           0 :         DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
    3511             : 
    3512           0 :         if (peer)
    3513             :                 return -EOPNOTSUPP;
    3514             : 
    3515           0 :         sll->sll_family = AF_PACKET;
    3516           0 :         sll->sll_ifindex = po->ifindex;
    3517           0 :         sll->sll_protocol = po->num;
    3518           0 :         sll->sll_pkttype = 0;
                               /* Device lookup and the reads of dev->type, dev->addr_len
                                * and dev->dev_addr all stay inside the RCU read-side
                                * section.
                                */
    3519           0 :         rcu_read_lock();
    3520           0 :         dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
    3521           0 :         if (dev) {
    3522           0 :                 sll->sll_hatype = dev->type;
    3523           0 :                 sll->sll_halen = dev->addr_len;
                                       /* NOTE(review): dev->addr_len bytes are copied with no
                                        * bound check visible in this function; confirm that
                                        * the storage behind @uaddr is large enough for the
                                        * largest device address, since it may exceed
                                        * sizeof(sll->sll_addr).
                                        */
    3524           0 :                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
    3525             :         } else {
    3526           0 :                 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
    3527           0 :                 sll->sll_halen = 0;
    3528             :         }
    3529           0 :         rcu_read_unlock();
    3530             : 
    3531           0 :         return offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
    3532             : }
    3533             : 
    /*
     * Apply or undo one membership entry @i on device @dev.
     *
     * For PACKET_MR_MULTICAST and PACKET_MR_UNICAST, @what > 0 adds i->addr
     * to the device (dev_mc_add / dev_uc_add) and any other value removes it
     * (dev_mc_del / dev_uc_del).  For PACKET_MR_PROMISC and
     * PACKET_MR_ALLMULTI, @what is handed unchanged to dev_set_promiscuity()
     * or dev_set_allmulti(); the callers in this file pass 1 or -1.
     *
     * Returns -EINVAL when an address-type entry has i->alen different from
     * dev->addr_len, the result of the device helper otherwise, and 0 for an
     * entry type that is not handled here.
     */
    3534           0 : static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
    3535             :                          int what)
    3536             : {
    3537           0 :         switch (i->type) {
    3538           0 :         case PACKET_MR_MULTICAST:
    3539           0 :                 if (i->alen != dev->addr_len)
    3540             :                         return -EINVAL;
    3541           0 :                 if (what > 0)
    3542           0 :                         return dev_mc_add(dev, i->addr);
    3543             :                 else
    3544           0 :                         return dev_mc_del(dev, i->addr);
                                       /* Not reachable: both arms above return. */
    3545           0 :                 break;
    3546           0 :         case PACKET_MR_PROMISC:
    3547           0 :                 return dev_set_promiscuity(dev, what);
    3548           0 :         case PACKET_MR_ALLMULTI:
    3549           0 :                 return dev_set_allmulti(dev, what);
    3550           0 :         case PACKET_MR_UNICAST:
    3551           0 :                 if (i->alen != dev->addr_len)
    3552             :                         return -EINVAL;
    3553           0 :                 if (what > 0)
    3554           0 :                         return dev_uc_add(dev, i->addr);
    3555             :                 else
    3556           0 :                         return dev_uc_del(dev, i->addr);
                                       /* Not reachable: both arms above return. */
    3557             :                 break;
    3558             :         default:
    3559             :                 break;
    3560             :         }
                               /* Only the default case falls through to here. */
    3561             :         return 0;
    3562             : }
    3563             : 
    /*
     * Remove from the list headed at *@mlp every entry whose ifindex equals
     * dev->ifindex.  Each such entry is undone on @dev with
     * packet_dev_mc(dev, ml, -1), unlinked and freed.  The return value of
     * packet_dev_mc() is not checked.
     */
    3564           0 : static void packet_dev_mclist_delete(struct net_device *dev,
    3565             :                                      struct packet_mclist **mlp)
    3566             : {
    3567           0 :         struct packet_mclist *ml;
    3568             : 
                               /* @mlp always points at the link that leads to the entry
                                * being examined, so unlinking is a single store.
                                */
    3569           0 :         while ((ml = *mlp) != NULL) {
    3570           0 :                 if (ml->ifindex == dev->ifindex) {
    3571           0 :                         packet_dev_mc(dev, ml, -1);
                                               /* After this store *mlp already names the next
                                                * entry, so mlp itself is not advanced.
                                                */
    3572           0 :                         *mlp = ml->next;
    3573           0 :                         kfree(ml);
    3574             :                 } else
    3575           0 :                         mlp = &ml->next;
    3576             :         }
    3577           0 : }
    3578             : 
    /*
     * Add the membership described by @mreq to the socket's list
     * (pkt_sk(sk)->mclist) and apply it to the device.
     *
     * The whole operation runs under rtnl_lock().  If an entry with the same
     * ifindex, type, address length and address already exists, only its
     * reference count is incremented.  Otherwise a new entry with count 1 is
     * pushed on the head of the list and applied with
     * packet_dev_mc(dev, i, 1); if that fails the entry is unlinked and
     * freed again.
     *
     * Returns 0 on success, -ENODEV when mreq->mr_ifindex names no device in
     * the socket's network namespace, -EINVAL when mreq->mr_alen exceeds
     * dev->addr_len, -ENOBUFS when the allocation fails, or the error
     * returned by packet_dev_mc().
     */
    3579           0 : static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
    3580             : {
    3581           0 :         struct packet_sock *po = pkt_sk(sk);
    3582           0 :         struct packet_mclist *ml, *i;
    3583           0 :         struct net_device *dev;
    3584           0 :         int err;
    3585             : 
    3586           0 :         rtnl_lock();
    3587             : 
                               /* err is preset before each check so that every
                                * "goto done" returns the matching error code.
                                */
    3588           0 :         err = -ENODEV;
    3589           0 :         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
    3590           0 :         if (!dev)
    3591           0 :                 goto done;
    3592             : 
    3593           0 :         err = -EINVAL;
    3594           0 :         if (mreq->mr_alen > dev->addr_len)
    3595           0 :                 goto done;
    3596             : 
                               /* The new entry is allocated before the duplicate search;
                                * the duplicate path below frees it again.
                                */
    3597           0 :         err = -ENOBUFS;
    3598           0 :         i = kmalloc(sizeof(*i), GFP_KERNEL);
    3599           0 :         if (i == NULL)
    3600           0 :                 goto done;
    3601             : 
    3602           0 :         err = 0;
    3603           0 :         for (ml = po->mclist; ml; ml = ml->next) {
    3604           0 :                 if (ml->ifindex == mreq->mr_ifindex &&
    3605           0 :                     ml->type == mreq->mr_type &&
    3606           0 :                     ml->alen == mreq->mr_alen &&
    3607           0 :                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
    3608           0 :                         ml->count++;
    3609             :                         /* Free the new element ... */
    3610           0 :                         kfree(i);
    3611           0 :                         goto done;
    3612             :                 }
    3613             :         }
    3614             : 
    3615           0 :         i->type = mreq->mr_type;
    3616           0 :         i->ifindex = mreq->mr_ifindex;
    3617           0 :         i->alen = mreq->mr_alen;
                               /* NOTE(review): the only bound on mr_alen visible in this
                                * function is dev->addr_len; the copy and the zero-fill of
                                * the tail below assume that value also fits in i->addr --
                                * confirm.
                                */
    3618           0 :         memcpy(i->addr, mreq->mr_address, i->alen);
    3619           0 :         memset(i->addr + i->alen, 0, sizeof(i->addr) - i->alen);
    3620           0 :         i->count = 1;
                               /* Link the entry first, then apply it to the device. */
    3621           0 :         i->next = po->mclist;
    3622           0 :         po->mclist = i;
    3623           0 :         err = packet_dev_mc(dev, i, 1);
    3624           0 :         if (err) {
                                       /* Roll back: the entry is still the list head. */
    3625           0 :                 po->mclist = i->next;
    3626           0 :                 kfree(i);
    3627             :         }
    3628             : 
    3629           0 : done:
    3630           0 :         rtnl_unlock();
    3631           0 :         return err;
    3632             : }
    3633             : 
    /*
     * Drop one reference on the membership described by @mreq.
     *
     * Under rtnl_lock(), the first list entry whose ifindex, type, address
     * length and address match @mreq has its count decremented.  When the
     * count reaches zero the entry is unlinked, undone on the device with
     * packet_dev_mc(dev, ml, -1) if the device can still be found, and
     * freed.
     *
     * Always returns 0, including when no entry matches.
     */
    3634           0 : static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
    3635             : {
    3636           0 :         struct packet_mclist *ml, **mlp;
    3637             : 
    3638           0 :         rtnl_lock();
    3639             : 
    3640           0 :         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
    3641           0 :                 if (ml->ifindex == mreq->mr_ifindex &&
    3642           0 :                     ml->type == mreq->mr_type &&
    3643           0 :                     ml->alen == mreq->mr_alen &&
    3644           0 :                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
    3645           0 :                         if (--ml->count == 0) {
    3646           0 :                                 struct net_device *dev;
    3647           0 :                                 *mlp = ml->next;
    3648           0 :                                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
    3649           0 :                                 if (dev)
    3650           0 :                                         packet_dev_mc(dev, ml, -1);
    3651           0 :                                 kfree(ml);
    3652             :                         }
                                               /* Leave the loop here: the increment expression
                                                * reads ml->next and ml may just have been freed.
                                                */
    3653             :                         break;
    3654             :                 }
    3655             :         }
    3656           0 :         rtnl_unlock();
    3657           0 :         return 0;
    3658             : }
    3659             : 
    /*
     * Release every membership entry on the socket's list.
     *
     * Returns immediately when the list is empty; that test is made before
     * rtnl_lock() is taken.  Otherwise, under rtnl_lock(), entries are
     * popped from the head one at a time, undone on their device with
     * packet_dev_mc(dev, ml, -1) when the device can still be found, and
     * freed.
     */
    3660           0 : static void packet_flush_mclist(struct sock *sk)
    3661             : {
    3662           0 :         struct packet_sock *po = pkt_sk(sk);
    3663           0 :         struct packet_mclist *ml;
    3664             : 
    3665           0 :         if (!po->mclist)
    3666             :                 return;
    3667             : 
    3668           0 :         rtnl_lock();
    3669           0 :         while ((ml = po->mclist) != NULL) {
    3670           0 :                 struct net_device *dev;
    3671             : 
                                       /* Unlink before touching the device or freeing. */
    3672           0 :                 po->mclist = ml->next;
    3673           0 :                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
    3674           0 :                 if (dev != NULL)
    3675           0 :                         packet_dev_mc(dev, ml, -1);
    3676           0 :                 kfree(ml);
    3677             :         }
    3678           0 :         rtnl_unlock();
    3679             : }
    3680             : 
    /*
     * Set a SOL_PACKET option on a packet socket.
     *
     * Any other @level, and any @optname without a case below, yields
     * -ENOPROTOOPT.  Each case validates @optlen, copies the value in with
     * copy_from_sockptr() (-EFAULT on failure) and applies it.
     *
     * Two conventions differ between cases and are worth knowing when
     * adding a new one:
     *  - most integer options require optlen to equal sizeof(val), while
     *    PACKET_AUXDATA, PACKET_ORIGDEV and PACKET_VNET_HDR only require it
     *    to be at least sizeof(val);
     *  - options that are refused with -EBUSY once po->rx_ring.pg_vec or
     *    po->tx_ring.pg_vec is set make that test under lock_sock().
     *
     * Returns 0 on success or a negative errno.
     */
    3681             : static int
    3682           1 : packet_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval,
    3683             :                   unsigned int optlen)
    3684             : {
    3685           1 :         struct sock *sk = sock->sk;
    3686           1 :         struct packet_sock *po = pkt_sk(sk);
    3687           1 :         int ret;
    3688             : 
    3689           1 :         if (level != SOL_PACKET)
    3690             :                 return -ENOPROTOOPT;
    3691             : 
    3692           1 :         switch (optname) {
    3693           0 :         case PACKET_ADD_MEMBERSHIP:
    3694             :         case PACKET_DROP_MEMBERSHIP:
    3695             :         {
    3696           0 :                 struct packet_mreq_max mreq;
    3697           0 :                 int len = optlen;
                                       /* Zero the request first: a caller may supply fewer
                                        * bytes than sizeof(mreq) and the rest must not be
                                        * stack garbage.
                                        */
    3698           0 :                 memset(&mreq, 0, sizeof(mreq));
    3699           0 :                 if (len < sizeof(struct packet_mreq))
    3700             :                         return -EINVAL;
                                       /* Longer input is accepted but clamped to sizeof(mreq). */
    3701           0 :                 if (len > sizeof(mreq))
    3702             :                         len = sizeof(mreq);
    3703           0 :                 if (copy_from_sockptr(&mreq, optval, len))
    3704             :                         return -EFAULT;
                                       /* The bytes supplied must cover the mr_alen address
                                        * bytes the request claims to carry.
                                        */
    3705           0 :                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
    3706             :                         return -EINVAL;
    3707           0 :                 if (optname == PACKET_ADD_MEMBERSHIP)
    3708           0 :                         ret = packet_mc_add(sk, &mreq);
    3709             :                 else
    3710           0 :                         ret = packet_mc_drop(sk, &mreq);
    3711             :                 return ret;
    3712             :         }
    3713             : 
    3714             :         case PACKET_RX_RING:
    3715             :         case PACKET_TX_RING:
    3716             :         {
    3717           0 :                 union tpacket_req_u req_u;
    3718           0 :                 int len;
    3719             : 
                                       /* po->tp_version is read under the socket lock; it
                                        * selects which member of the request union, and so
                                        * how many bytes, the caller must supply.
                                        */
    3720           0 :                 lock_sock(sk);
    3721           0 :                 switch (po->tp_version) {
    3722             :                 case TPACKET_V1:
    3723             :                 case TPACKET_V2:
    3724             :                         len = sizeof(req_u.req);
    3725             :                         break;
    3726           0 :                 case TPACKET_V3:
    3727             :                 default:
    3728           0 :                         len = sizeof(req_u.req3);
    3729           0 :                         break;
    3730             :                 }
    3731           0 :                 if (optlen < len) {
    3732             :                         ret = -EINVAL;
    3733             :                 } else {
                                               /* &req_u.req is the start of the union, so this
                                                * also fills req3 when len is sizeof(req_u.req3).
                                                */
    3734           0 :                         if (copy_from_sockptr(&req_u.req, optval, len))
    3735             :                                 ret = -EFAULT;
    3736             :                         else
                                                       /* The last argument is non-zero for the
                                                        * TX ring and zero for the RX ring.
                                                        */
    3737           0 :                                 ret = packet_set_ring(sk, &req_u, 0,
    3738             :                                                     optname == PACKET_TX_RING);
    3739             :                 }
    3740           0 :                 release_sock(sk);
    3741           0 :                 return ret;
    3742             :         }
    3743           0 :         case PACKET_COPY_THRESH:
    3744             :         {
    3745           0 :                 int val;
    3746             : 
    3747           0 :                 if (optlen != sizeof(val))
    3748             :                         return -EINVAL;
    3749           0 :                 if (copy_from_sockptr(&val, optval, sizeof(val)))
    3750             :                         return -EFAULT;
    3751             : 
                                       /* Stored as-is, without taking the socket lock. */
    3752           0 :                 pkt_sk(sk)->copy_thresh = val;
    3753           0 :                 return 0;
    3754             :         }
    3755           0 :         case PACKET_VERSION:
    3756             :         {
    3757           0 :                 int val;
    3758             : 
    3759           0 :                 if (optlen != sizeof(val))
    3760             :                         return -EINVAL;
    3761           0 :                 if (copy_from_sockptr(&val, optval, sizeof(val)))
    3762             :                         return -EFAULT;
                                       /* Only the three known ring versions are accepted. */
    3763           0 :                 switch (val) {
    3764             :                 case TPACKET_V1:
    3765             :                 case TPACKET_V2:
    3766             :                 case TPACKET_V3:
    3767           0 :                         break;
    3768             :                 default:
    3769             :                         return -EINVAL;
    3770             :                 }
    3771           0 :                 lock_sock(sk);
                                       /* Refused once either ring has been set up. */
    3772           0 :                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
    3773             :                         ret = -EBUSY;
    3774             :                 } else {
    3775           0 :                         po->tp_version = val;
    3776           0 :                         ret = 0;
    3777             :                 }
    3778           0 :                 release_sock(sk);
    3779           0 :                 return ret;
    3780             :         }
    3781           0 :         case PACKET_RESERVE:
    3782             :         {
    3783           0 :                 unsigned int val;
    3784             : 
    3785           0 :                 if (optlen != sizeof(val))
    3786             :                         return -EINVAL;
    3787           0 :                 if (copy_from_sockptr(&val, optval, sizeof(val)))
    3788             :                         return -EFAULT;
                                       /* val is unsigned; values above INT_MAX are rejected. */
    3789           0 :                 if (val > INT_MAX)
    3790             :                         return -EINVAL;
    3791           0 :                 lock_sock(sk);
    3792           0 :                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
    3793             :                         ret = -EBUSY;
    3794             :                 } else {
    3795           0 :                         po->tp_reserve = val;
    3796           0 :                         ret = 0;
    3797             :                 }
    3798           0 :                 release_sock(sk);
    3799           0 :                 return ret;
    3800             :         }
    3801           0 :         case PACKET_LOSS:
    3802             :         {
    3803           0 :                 unsigned int val;
    3804             : 
    3805           0 :                 if (optlen != sizeof(val))
    3806             :                         return -EINVAL;
    3807           0 :                 if (copy_from_sockptr(&val, optval, sizeof(val)))
    3808             :                         return -EFAULT;
    3809             : 
    3810           0 :                 lock_sock(sk);
    3811           0 :                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
    3812             :                         ret = -EBUSY;
    3813             :                 } else {
                                               /* Any non-zero value is normalised to 1. */
    3814           0 :                         po->tp_loss = !!val;
    3815           0 :                         ret = 0;
    3816             :                 }
    3817           0 :                 release_sock(sk);
    3818           0 :                 return ret;
    3819             :         }
    3820           1 :         case PACKET_AUXDATA:
    3821             :         {
    3822           1 :                 int val;
    3823             : 
                                       /* "<" rather than "!=": a larger optlen is accepted,
                                        * but only sizeof(val) bytes are read.
                                        */
    3824           1 :                 if (optlen < sizeof(val))
    3825             :                         return -EINVAL;
    3826           1 :                 if (copy_from_sockptr(&val, optval, sizeof(val)))
    3827             :                         return -EFAULT;
    3828             : 
    3829           1 :                 lock_sock(sk);
    3830           1 :                 po->auxdata = !!val;
    3831           1 :                 release_sock(sk);
    3832           1 :                 return 0;
    3833             :         }
    3834           0 :         case PACKET_ORIGDEV:
    3835             :         {
    3836           0 :                 int val;
    3837             : 
                                       /* Same relaxed length check as PACKET_AUXDATA. */
    3838           0 :                 if (optlen < sizeof(val))
    3839             :                         return -EINVAL;
    3840           0 :                 if (copy_from_sockptr(&val, optval, sizeof(val)))
    3841             :                         return -EFAULT;
    3842             : 
    3843           0 :                 lock_sock(sk);
    3844           0 :                 po->origdev = !!val;
    3845           0 :                 release_sock(sk);
    3846           0 :                 return 0;
    3847             :         }
    3848           0 :         case PACKET_VNET_HDR:
    3849             :         {
    3850           0 :                 int val;
    3851             : 
                                       /* Only SOCK_RAW sockets may change this option. */
    3852           0 :                 if (sock->type != SOCK_RAW)
    3853             :                         return -EINVAL;
    3854           0 :                 if (optlen < sizeof(val))
    3855             :                         return -EINVAL;
    3856           0 :                 if (copy_from_sockptr(&val, optval, sizeof(val)))
    3857             :                         return -EFAULT;
    3858             : 
    3859           0 :                 lock_sock(sk);
    3860           0 :                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
    3861             :                         ret = -EBUSY;
    3862             :                 } else {
    3863           0 :                         po->has_vnet_hdr = !!val;
    3864           0 :                         ret = 0;
    3865             :                 }
    3866           0 :                 release_sock(sk);
    3867           0 :                 return ret;
    3868             :         }
    3869           0 :         case PACKET_TIMESTAMP:
    3870             :         {
    3871           0 :                 int val;
    3872             : 
    3873           0 :                 if (optlen != sizeof(val))
    3874             :                         return -EINVAL;
    3875           0 :                 if (copy_from_sockptr(&val, optval, sizeof(val)))
    3876             :                         return -EFAULT;
    3877             : 
                                       /* Stored as-is, without taking the socket lock. */
    3878           0 :                 po->tp_tstamp = val;
    3879           0 :                 return 0;
    3880             :         }
    3881           0 :         case PACKET_FANOUT:
    3882             :         {
                                       /* Zero-initialised so that the short, int-sized form
                                        * leaves the remaining bytes of args as zero.
                                        */
    3883           0 :                 struct fanout_args args = { 0 };
    3884             : 
                                       /* Exactly two input sizes are accepted. */
    3885           0 :                 if (optlen != sizeof(int) && optlen != sizeof(args))
    3886             :                         return -EINVAL;
    3887           0 :                 if (copy_from_sockptr(&args, optval, optlen))
    3888             :                         return -EFAULT;
    3889             : 
    3890           0 :                 return fanout_add(sk, &args);
    3891             :         }
    3892           0 :         case PACKET_FANOUT_DATA:
    3893             :         {
                                       /* Rejected unless po->fanout is already set. */
    3894           0 :                 if (!po->fanout)
    3895             :                         return -EINVAL;
    3896             : 
    3897           0 :                 return fanout_set_data(po, optval, optlen);
    3898             :         }
    3899           0 :         case PACKET_IGNORE_OUTGOING:
    3900             :         {
    3901           0 :                 int val;
    3902             : 
    3903           0 :                 if (optlen != sizeof(val))
    3904             :                         return -EINVAL;
    3905           0 :                 if (copy_from_sockptr(&val, optval, sizeof(val)))
    3906             :                         return -EFAULT;
                                       /* Unlike the other boolean options, anything other
                                        * than 0 or 1 is an error here.
                                        */
    3907           0 :                 if (val < 0 || val > 1)
    3908             :                         return -EINVAL;
    3909             : 
    3910           0 :                 po->prot_hook.ignore_outgoing = !!val;
    3911           0 :                 return 0;
    3912             :         }
    3913           0 :         case PACKET_TX_HAS_OFF:
    3914             :         {
    3915           0 :                 unsigned int val;
    3916             : 
    3917           0 :                 if (optlen != sizeof(val))
    3918             :                         return -EINVAL;
    3919           0 :                 if (copy_from_sockptr(&val, optval, sizeof(val)))
    3920             :                         return -EFAULT;
    3921             : 
    3922           0 :                 lock_sock(sk);
    3923           0 :                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec) {
    3924           0 :                         ret = -EBUSY;
    3925             :                 } else {
    3926           0 :                         po->tp_tx_has_off = !!val;
    3927           0 :                         ret = 0;
    3928             :                 }
    3929           0 :                 release_sock(sk);
                                       /* NOTE(review): ret is computed above (-EBUSY when a
                                        * ring exists) but this case returns a literal 0, so
                                        * the request is silently ignored in that situation.
                                        * The sibling cases (PACKET_LOSS, PACKET_RESERVE, ...)
                                        * return ret.  Confirm whether this is intentional
                                        * before changing it: returning -EBUSY would be
                                        * visible to user space.
                                        */
    3930           0 :                 return 0;
    3931             :         }
    3932           0 :         case PACKET_QDISC_BYPASS:
    3933             :         {
    3934           0 :                 int val;
    3935             : 
    3936           0 :                 if (optlen != sizeof(val))
    3937             :                         return -EINVAL;
    3938           0 :                 if (copy_from_sockptr(&val, optval, sizeof(val)))
    3939             :                         return -EFAULT;
    3940             : 
                                       /* Non-zero selects packet_direct_xmit as the transmit
                                        * function, zero selects dev_queue_xmit.
                                        */
    3941           0 :                 po->xmit = val ? packet_direct_xmit : dev_queue_xmit;
    3942           0 :                 return 0;
    3943             :         }
    3944             :         default:
    3945             :                 return -ENOPROTOOPT;
    3946             :         }
    3947             : }
    3948             : 
    3949           0 : static int packet_getsockopt(struct socket *sock, int level, int optname,
    3950             :                              char __user *optval, int __user *optlen)
    3951             : {
    3952           0 :         int len;
    3953           0 :         int val, lv = sizeof(val);
    3954           0 :         struct sock *sk = sock->sk;
    3955           0 :         struct packet_sock *po = pkt_sk(sk);
    3956           0 :         void *data = &val;
    3957           0 :         union tpacket_stats_u st;
    3958           0 :         struct tpacket_rollover_stats rstats;
    3959           0 :         int drops;
    3960             : 
    3961           0 :         if (level != SOL_PACKET)
    3962             :                 return -ENOPROTOOPT;
    3963             : 
    3964           0 :         if (get_user(len, optlen))
    3965             :                 return -EFAULT;
    3966             : 
    3967           0 :         if (len < 0)
    3968             :                 return -EINVAL;
    3969             : 
    3970           0 :         switch (optname) {
    3971           0 :         case PACKET_STATISTICS:
    3972           0 :                 spin_lock_bh(&sk->sk_receive_queue.lock);
    3973           0 :                 memcpy(&st, &po->stats, sizeof(st));
    3974           0 :                 memset(&po->stats, 0, sizeof(po->stats));
    3975           0 :                 spin_unlock_bh(&sk->sk_receive_queue.lock);
    3976           0 :                 drops = atomic_xchg(&po->tp_drops, 0);
    3977             : 
    3978           0 :                 if (po->tp_version == TPACKET_V3) {
    3979           0 :                         lv = sizeof(struct tpacket_stats_v3);
    3980           0 :                         st.stats3.tp_drops = drops;
    3981           0 :                         st.stats3.tp_packets += drops;
    3982           0 :                         data = &st.stats3;
    3983             :                 } else {
    3984           0 :                         lv = sizeof(struct tpacket_stats);
    3985           0 :                         st.stats1.tp_drops = drops;
    3986           0 :                         st.stats1.tp_packets += drops;
    3987           0 :                         data = &st.stats1;
    3988             :                 }
    3989             : 
    3990             :                 break;
    3991           0 :         case PACKET_AUXDATA:
    3992           0 :                 val = po->auxdata;
    3993           0 :                 break;
    3994           0 :         case PACKET_ORIGDEV:
    3995           0 :                 val = po->origdev;
    3996           0 :                 break;
    3997           0 :         case PACKET_VNET_HDR:
    3998           0 :                 val = po->has_vnet_hdr;
    3999           0 :                 break;
    4000           0 :         case PACKET_VERSION:
    4001           0 :                 val = po->tp_version;
    4002           0 :                 break;
    4003           0 :         case PACKET_HDRLEN:
    4004           0 :                 if (len > sizeof(int))
    4005           0 :                         len = sizeof(int);
    4006           0 :                 if (len < sizeof(int))
    4007             :                         return -EINVAL;
    4008           0 :                 if (copy_from_user(&val, optval, len))
    4009             :                         return -EFAULT;
    4010           0 :                 switch (val) {
    4011           0 :                 case TPACKET_V1:
    4012           0 :                         val = sizeof(struct tpacket_hdr);
    4013           0 :                         break;
    4014           0 :                 case TPACKET_V2:
    4015           0 :                         val = sizeof(struct tpacket2_hdr);
    4016           0 :                         break;
    4017           0 :                 case TPACKET_V3:
    4018           0 :                         val = sizeof(struct tpacket3_hdr);
    4019           0 :                         break;
    4020             :                 default:
    4021             :                         return -EINVAL;
    4022             :                 }
    4023             :                 break;
    4024           0 :         case PACKET_RESERVE:
    4025           0 :                 val = po->tp_reserve;
    4026           0 :                 break;
    4027           0 :         case PACKET_LOSS:
    4028           0 :                 val = po->tp_loss;
    4029           0 :                 break;
    4030           0 :         case PACKET_TIMESTAMP:
    4031           0 :                 val = po->tp_tstamp;
    4032           0 :                 break;
    4033           0 :         case PACKET_FANOUT:
    4034           0 :                 val = (po->fanout ?
    4035           0 :                        ((u32)po->fanout->id |
    4036           0 :                         ((u32)po->fanout->type << 16) |
    4037           0 :                         ((u32)po->fanout->flags << 24)) :
    4038             :                        0);
    4039           0 :                 break;
    4040           0 :         case PACKET_IGNORE_OUTGOING:
    4041           0 :                 val = po->prot_hook.ignore_outgoing;
    4042           0 :                 break;
    4043           0 :         case PACKET_ROLLOVER_STATS:
    4044           0 :                 if (!po->rollover)
    4045             :                         return -EINVAL;
    4046           0 :                 rstats.tp_all = atomic_long_read(&po->rollover->num);
    4047           0 :                 rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
    4048           0 :                 rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
    4049           0 :                 data = &rstats;
    4050           0 :                 lv = sizeof(rstats);
    4051           0 :                 break;
    4052           0 :         case PACKET_TX_HAS_OFF:
    4053           0 :                 val = po->tp_tx_has_off;
    4054           0 :                 break;
    4055             :         case PACKET_QDISC_BYPASS:
    4056           0 :                 val = packet_use_direct_xmit(po);
    4057           0 :                 break;
    4058             :         default:
    4059             :                 return -ENOPROTOOPT;
    4060             :         }
    4061             : 
    4062           0 :         if (len > lv)
    4063             :                 len = lv;
    4064           0 :         if (put_user(len, optlen))
    4065             :                 return -EFAULT;
    4066           0 :         if (copy_to_user(optval, data, len))
    4067           0 :                 return -EFAULT;
    4068             :         return 0;
    4069             : }
    4070             : 
    4071           6 : static int packet_notifier(struct notifier_block *this,
    4072             :                            unsigned long msg, void *ptr)
    4073             : {
                               /*
                                * Netdevice notifier callback. For the device carried in @ptr,
                                * walk every packet socket on that device's netns sklist (under
                                * rcu_read_lock) and react to the event in @msg.
                                * Always returns NOTIFY_DONE.
                                */
    4074           6 :         struct sock *sk;
    4075           6 :         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
    4076           6 :         struct net *net = dev_net(dev);
    4077             : 
    4078           6 :         rcu_read_lock();
    4079          12 :         sk_for_each_rcu(sk, &net->packet.sklist) {
    4080           0 :                 struct packet_sock *po = pkt_sk(sk);
    4081             : 
    4082           0 :                 switch (msg) {
    4083           0 :                 case NETDEV_UNREGISTER:
                                               /*
                                                * Prune this socket's mclist for the device (see
                                                * packet_dev_mclist_delete), then continue with
                                                * the NETDEV_DOWN handling below.
                                                */
    4084           0 :                         if (po->mclist)
    4085           0 :                                 packet_dev_mclist_delete(dev, &po->mclist);
    4086           0 :                         fallthrough;
    4087             : 
    4088             :                 case NETDEV_DOWN:
                                               /* Only sockets bound to this device are affected. */
    4089           0 :                         if (dev->ifindex == po->ifindex) {
    4090           0 :                                 spin_lock(&po->bind_lock);
    4091           0 :                                 if (po->running) {
                                                               /*
                                                                * Detach the protocol hook and flag
                                                                * ENETDOWN; the error is reported
                                                                * unless the socket is already dead.
                                                                */
    4092           0 :                                         __unregister_prot_hook(sk, false);
    4093           0 :                                         sk->sk_err = ENETDOWN;
    4094           0 :                                         if (!sock_flag(sk, SOCK_DEAD))
    4095           0 :                                                 sk->sk_error_report(sk);
    4096             :                                 }
    4097           0 :                                 if (msg == NETDEV_UNREGISTER) {
                                                               /*
                                                                * Device is going away: forget the
                                                                * binding (ifindex = -1) and drop the
                                                                * device reference kept in prot_hook.
                                                                */
    4098           0 :                                         packet_cached_dev_reset(po);
    4099           0 :                                         po->ifindex = -1;
    4100           0 :                                         if (po->prot_hook.dev)
    4101           0 :                                                 dev_put(po->prot_hook.dev);
    4102           0 :                                         po->prot_hook.dev = NULL;
    4103             :                                 }
    4104           0 :                                 spin_unlock(&po->bind_lock);
    4105             :                         }
    4106             :                         break;
    4107           0 :                 case NETDEV_UP:
                                               /*
                                                * Re-attach the hook for sockets bound to this device,
                                                * but only if a protocol is set (po->num != 0).
                                                */
    4108           0 :                         if (dev->ifindex == po->ifindex) {
    4109           0 :                                 spin_lock(&po->bind_lock);
    4110           0 :                                 if (po->num)
    4111           0 :                                         register_prot_hook(sk);
    4112           0 :                                 spin_unlock(&po->bind_lock);
    4113             :                         }
    4114             :                         break;
    4115             :                 }
    4116             :         }
    4117           6 :         rcu_read_unlock();
    4118           6 :         return NOTIFY_DONE;
    4119             : }
    4120             : 
    4121             : 
    4122           1 : static int packet_ioctl(struct socket *sock, unsigned int cmd,
    4123             :                         unsigned long arg)
    4124             : {
                               /*
                                * ioctl handler for packet sockets.
                                *
                                * SIOCOUTQ: store sk_wmem_alloc_get(sk) in the user int at @arg.
                                * SIOCINQ:  store the length of the first queued skb (0 if the
                                *           receive queue is empty) in the user int at @arg.
                                * With CONFIG_INET, the listed routing/ARP/interface-address
                                * commands are forwarded to inet_dgram_ops.ioctl().
                                * Anything else yields -ENOIOCTLCMD.
                                */
    4125           1 :         struct sock *sk = sock->sk;
    4126             : 
    4127           1 :         switch (cmd) {
    4128             :         case SIOCOUTQ:
    4129             :         {
    4130           0 :                 int amount = sk_wmem_alloc_get(sk);
    4131             : 
    4132           0 :                 return put_user(amount, (int __user *)arg);
    4133             :         }
    4134           0 :         case SIOCINQ:
    4135             :         {
    4136           0 :                 struct sk_buff *skb;
    4137           0 :                 int amount = 0;
    4138             : 
                                       /* Peek under the queue lock so the skb cannot go away. */
    4139           0 :                 spin_lock_bh(&sk->sk_receive_queue.lock);
    4140           0 :                 skb = skb_peek(&sk->sk_receive_queue);
    4141           0 :                 if (skb)
    4142           0 :                         amount = skb->len;
    4143           0 :                 spin_unlock_bh(&sk->sk_receive_queue.lock);
    4144           0 :                 return put_user(amount, (int __user *)arg);
    4145             :         }
    4146             : #ifdef CONFIG_INET
    4147           0 :         case SIOCADDRT:
    4148             :         case SIOCDELRT:
    4149             :         case SIOCDARP:
    4150             :         case SIOCGARP:
    4151             :         case SIOCSARP:
    4152             :         case SIOCGIFADDR:
    4153             :         case SIOCSIFADDR:
    4154             :         case SIOCGIFBRDADDR:
    4155             :         case SIOCSIFBRDADDR:
    4156             :         case SIOCGIFNETMASK:
    4157             :         case SIOCSIFNETMASK:
    4158             :         case SIOCGIFDSTADDR:
    4159             :         case SIOCSIFDSTADDR:
    4160             :         case SIOCSIFFLAGS:
    4161           0 :                 return inet_dgram_ops.ioctl(sock, cmd, arg);
    4162             : #endif
    4163             : 
    4164             :         default:
    4165             :                 return -ENOIOCTLCMD;
    4166             :         }
                               /* Not reached: every switch arm above returns. */
    4167             :         return 0;
    4168             : }
    4169             : 
    4170           8 : static __poll_t packet_poll(struct file *file, struct socket *sock,
    4171             :                                 poll_table *wait)
    4172             : {
                               /*
                                * poll() handler: start from the generic datagram_poll() mask and
                                * add readiness derived from the mmap'ed RX/TX rings, if present.
                                */
    4173           8 :         struct sock *sk = sock->sk;
    4174           8 :         struct packet_sock *po = pkt_sk(sk);
    4175           8 :         __poll_t mask = datagram_poll(file, sock, wait);
    4176             : 
    4177           8 :         spin_lock_bh(&sk->sk_receive_queue.lock);
    4178           8 :         if (po->rx_ring.pg_vec) {
                                       /*
                                        * Readable when the previous RX frame lookup with
                                        * TP_STATUS_KERNEL fails.
                                        * NOTE(review): presumably that means the frame has been
                                        * handed to user space - confirm against
                                        * packet_previous_rx_frame().
                                        */
    4179           0 :                 if (!packet_previous_rx_frame(po, &po->rx_ring,
    4180             :                         TP_STATUS_KERNEL))
    4181           0 :                         mask |= EPOLLIN | EPOLLRDNORM;
    4182             :         }
                               /* Called for every socket, ring or not, while the RX lock is held. */
    4183           8 :         packet_rcv_try_clear_pressure(po);
    4184           8 :         spin_unlock_bh(&sk->sk_receive_queue.lock);
    4185           8 :         spin_lock_bh(&sk->sk_write_queue.lock);
    4186           8 :         if (po->tx_ring.pg_vec) {
                                       /* Writable when the current TX frame is TP_STATUS_AVAILABLE. */
    4187           0 :                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
    4188           0 :                         mask |= EPOLLOUT | EPOLLWRNORM;
    4189             :         }
    4190           8 :         spin_unlock_bh(&sk->sk_write_queue.lock);
    4191           8 :         return mask;
    4192             : }
    4193             : 
    4194             : 
    4195             : /* Dirty? Well, I still have not learned a better way to account
    4196             :  * for user mmaps.
    4197             :  */
    4198             : 
    4199           0 : static void packet_mm_open(struct vm_area_struct *vma)
    4200             : {
                               /*
                                * vm_operations .open hook: count one more user mapping on the
                                * packet socket behind @vma's file. Nothing is counted when the
                                * socket has no sock attached (sock->sk == NULL).
                                */
    4201           0 :         struct file *file = vma->vm_file;
    4202           0 :         struct socket *sock = file->private_data;
    4203           0 :         struct sock *sk = sock->sk;
    4204             : 
    4205           0 :         if (sk)
    4206           0 :                 atomic_inc(&pkt_sk(sk)->mapped);
    4207           0 : }
    4208             : 
    4209           0 : static void packet_mm_close(struct vm_area_struct *vma)
    4210             : {
                               /*
                                * vm_operations .close hook: drop one user mapping from the
                                * packet socket behind @vma's file. Nothing is done when the
                                * socket has no sock attached (sock->sk == NULL).
                                */
    4211           0 :         struct file *file = vma->vm_file;
    4212           0 :         struct socket *sock = file->private_data;
    4213           0 :         struct sock *sk = sock->sk;
    4214             : 
    4215           0 :         if (sk)
    4216           0 :                 atomic_dec(&pkt_sk(sk)->mapped);
    4217           0 : }
    4218             : 
    4219             : static const struct vm_operations_struct packet_mmap_ops = {
                               /* VMA callbacks that keep packet_sock->mapped in step with user mappings. */
    4220             :         .open   =       packet_mm_open,
    4221             :         .close  =       packet_mm_close,
    4222             : };
    4223             : 
    4224           0 : static void free_pg_vec(struct pgv *pg_vec, unsigned int order,
    4225             :                         unsigned int len)
    4226             : {
                               /*
                                * Release a ring's block vector: free each of the @len block
                                * buffers, then the @pg_vec array itself. @order is the page
                                * order used for buffers that came from the page allocator.
                                *
                                * NOTE(review): i is a signed int compared against unsigned @len.
                                */
    4227           0 :         int i;
    4228             : 
    4229           0 :         for (i = 0; i < len; i++) {
                                       /* Entries may be NULL (e.g. after a partial allocation). */
    4230           0 :                 if (likely(pg_vec[i].buffer)) {
                                               /* Buffers are either vmalloc'ed or page-allocated. */
    4231           0 :                         if (is_vmalloc_addr(pg_vec[i].buffer))
    4232           0 :                                 vfree(pg_vec[i].buffer);
    4233             :                         else
    4234           0 :                                 free_pages((unsigned long)pg_vec[i].buffer,
    4235             :                                            order);
    4236           0 :                         pg_vec[i].buffer = NULL;
    4237             :                 }
    4238             :         }
    4239           0 :         kfree(pg_vec);
    4240           0 : }
    4241             : 
    4242           0 : static char *alloc_one_pg_vec_page(unsigned long order)
    4243             : {
                               /*
                                * Allocate one zeroed ring block of (1 << @order) pages.
                                * Three attempts are made in turn:
                                *   1. __get_free_pages() with __GFP_NORETRY,
                                *   2. vzalloc() of the same size,
                                *   3. __get_free_pages() again without __GFP_NORETRY.
                                * Returns the buffer, or NULL if all three fail.
                                */
    4244           0 :         char *buffer;
    4245           0 :         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP |
    4246             :                           __GFP_ZERO | __GFP_NOWARN | __GFP_NORETRY;
    4247             : 
    4248           0 :         buffer = (char *) __get_free_pages(gfp_flags, order);
    4249           0 :         if (buffer)
    4250             :                 return buffer;
    4251             : 
    4252             :         /* __get_free_pages failed, fall back to vmalloc */
    4253           0 :         buffer = vzalloc(array_size((1 << order), PAGE_SIZE));
    4254           0 :         if (buffer)
    4255             :                 return buffer;
    4256             : 
    4257             :         /* vmalloc failed, lets dig into swap here */
    4258           0 :         gfp_flags &= ~__GFP_NORETRY;
    4259           0 :         buffer = (char *) __get_free_pages(gfp_flags, order);
    4260           0 :         if (buffer)
    4261           0 :                 return buffer;
    4262             : 
    4263             :         /* complete and utter failure */
    4264             :         return NULL;
    4265             : }
    4266             : 
    4267           0 : static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
    4268             : {
                               /*
                                * Allocate a block vector with req->tp_block_nr entries, each
                                * backed by a buffer of page order @order.
                                * Returns the vector, or NULL if the array or any block could
                                * not be allocated; a partially built vector is freed first.
                                */
    4269           0 :         unsigned int block_nr = req->tp_block_nr;
    4270           0 :         struct pgv *pg_vec;
    4271           0 :         int i;
    4272             : 
                               /* kcalloc zeroes the array, so unfilled entries stay NULL. */
    4273           0 :         pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL | __GFP_NOWARN);
    4274           0 :         if (unlikely(!pg_vec))
    4275           0 :                 goto out;
    4276             : 
    4277           0 :         for (i = 0; i < block_nr; i++) {
    4278           0 :                 pg_vec[i].buffer = alloc_one_pg_vec_page(order);
    4279           0 :                 if (unlikely(!pg_vec[i].buffer))
    4280           0 :                         goto out_free_pgvec;
    4281             :         }
    4282             : 
    4283           0 : out:
    4284           0 :         return pg_vec;
    4285             : 
    4286           0 : out_free_pgvec:
                               /* free_pg_vec() skips the NULL entries that were never filled. */
    4287           0 :         free_pg_vec(pg_vec, order, block_nr);
    4288           0 :         pg_vec = NULL;
    4289           0 :         goto out;
    4290             : }
    4291             : 
    4292           0 : static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
    4293             :                 int closing, int tx_ring)
    4294             : {
                               /*
                                * Install, replace or tear down the RX (@tx_ring == 0) or TX
                                * (@tx_ring != 0) ring of a packet socket.
                                *
                                * A request with tp_block_nr != 0 is validated and a new ring is
                                * allocated; tp_block_nr == 0 (with tp_frame_nr == 0) removes the
                                * ring. @closing skips the "mapped / reads pending" busy checks.
                                *
                                * Returns 0 on success, or -EBUSY, -EINVAL or -ENOMEM.
                                * Note that req->tp_block_nr is overwritten (swapped with the
                                * previous ring length) when the new state is installed.
                                */
    4295           0 :         struct pgv *pg_vec = NULL;
    4296           0 :         struct packet_sock *po = pkt_sk(sk);
    4297           0 :         unsigned long *rx_owner_map = NULL;
    4298           0 :         int was_running, order = 0;
    4299           0 :         struct packet_ring_buffer *rb;
    4300           0 :         struct sk_buff_head *rb_queue;
    4301           0 :         __be16 num;
    4302           0 :         int err;
    4303             :         /* Common tpacket_req view of req_u; kept to minimize code churn */
    4304           0 :         struct tpacket_req *req = &req_u->req;
    4305             : 
    4306           0 :         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
    4307           0 :         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
    4308             : 
                               /* Unless closing, refuse while mapped or while reads are pending. */
    4309           0 :         err = -EBUSY;
    4310           0 :         if (!closing) {
    4311           0 :                 if (atomic_read(&po->mapped))
    4312           0 :                         goto out;
    4313           0 :                 if (packet_read_pending(rb))
    4314           0 :                         goto out;
    4315             :         }
    4316             : 
    4317           0 :         if (req->tp_block_nr) {
    4318           0 :                 unsigned int min_frame_size;
    4319             : 
    4320             :                 /* Sanity tests and some calculations */
    4321           0 :                 err = -EBUSY;
                                       /* A ring is already installed; it must be removed first. */
    4322           0 :                 if (unlikely(rb->pg_vec))
    4323           0 :                         goto out;
    4324             : 
    4325           0 :                 switch (po->tp_version) {
    4326           0 :                 case TPACKET_V1:
    4327           0 :                         po->tp_hdrlen = TPACKET_HDRLEN;
    4328           0 :                         break;
    4329           0 :                 case TPACKET_V2:
    4330           0 :                         po->tp_hdrlen = TPACKET2_HDRLEN;
    4331           0 :                         break;
    4332           0 :                 case TPACKET_V3:
    4333           0 :                         po->tp_hdrlen = TPACKET3_HDRLEN;
    4334           0 :                         break;
    4335             :                 }
    4336             : 
                                       /*
                                        * Geometry checks: block size positive as an int and page
                                        * aligned; frame size at least header + reserve and
                                        * TPACKET_ALIGNMENT aligned; for V3 the block must also hold
                                        * the private area plus one minimal frame.
                                        */
    4337           0 :                 err = -EINVAL;
    4338           0 :                 if (unlikely((int)req->tp_block_size <= 0))
    4339           0 :                         goto out;
    4340           0 :                 if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
    4341           0 :                         goto out;
    4342           0 :                 min_frame_size = po->tp_hdrlen + po->tp_reserve;
    4343           0 :                 if (po->tp_version >= TPACKET_V3 &&
    4344             :                     req->tp_block_size <
    4345           0 :                     BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv) + min_frame_size)
    4346           0 :                         goto out;
    4347           0 :                 if (unlikely(req->tp_frame_size < min_frame_size))
    4348           0 :                         goto out;
    4349           0 :                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
    4350           0 :                         goto out;
    4351             : 
                                       /*
                                        * frames_per_block * tp_block_nr must not overflow an
                                        * unsigned int and must equal the requested tp_frame_nr.
                                        */
    4352           0 :                 rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
    4353           0 :                 if (unlikely(rb->frames_per_block == 0))
    4354           0 :                         goto out;
    4355           0 :                 if (unlikely(rb->frames_per_block > UINT_MAX / req->tp_block_nr))
    4356           0 :                         goto out;
    4357           0 :                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
    4358             :                                         req->tp_frame_nr))
    4359           0 :                         goto out;
    4360             : 
    4361           0 :                 err = -ENOMEM;
    4362           0 :                 order = get_order(req->tp_block_size);
    4363           0 :                 pg_vec = alloc_pg_vec(req, order);
    4364           0 :                 if (unlikely(!pg_vec))
    4365           0 :                         goto out;
    4366           0 :                 switch (po->tp_version) {
    4367           0 :                 case TPACKET_V3:
    4368             :                         /* Block transmit is not supported yet */
    4369           0 :                         if (!tx_ring) {
    4370           0 :                                 init_prb_bdqc(po, rb, pg_vec, req_u);
    4371             :                         } else {
    4372           0 :                                 struct tpacket_req3 *req3 = &req_u->req3;
    4373             : 
                                                       /* V3-only request fields must be zero for a TX ring. */
    4374           0 :                                 if (req3->tp_retire_blk_tov ||
    4375           0 :                                     req3->tp_sizeof_priv ||
    4376           0 :                                     req3->tp_feature_req_word) {
    4377           0 :                                         err = -EINVAL;
    4378           0 :                                         goto out_free_pg_vec;
    4379             :                                 }
    4380             :                         }
    4381             :                         break;
    4382           0 :                 default:
                                               /* V1/V2 RX rings get a bitmap with one bit per frame. */
    4383           0 :                         if (!tx_ring) {
    4384           0 :                                 rx_owner_map = bitmap_alloc(req->tp_frame_nr,
    4385             :                                         GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
    4386           0 :                                 if (!rx_owner_map)
    4387           0 :                                         goto out_free_pg_vec;
    4388             :                         }
    4389             :                         break;
    4390             :                 }
    4391             :         }
    4392             :         /* Done */
    4393             :         else {
                                       /* Teardown request: tp_frame_nr must be zero as well. */
    4394           0 :                 err = -EINVAL;
    4395           0 :                 if (unlikely(req->tp_frame_nr))
    4396           0 :                         goto out;
    4397             :         }
    4398             : 
    4399             : 
    4400             :         /* Detach socket from network */
    4401           0 :         spin_lock(&po->bind_lock);
    4402           0 :         was_running = po->running;
    4403           0 :         num = po->num;
    4404           0 :         if (was_running) {
    4405           0 :                 po->num = 0;
    4406           0 :                 __unregister_prot_hook(sk, false);
    4407             :         }
    4408           0 :         spin_unlock(&po->bind_lock);
    4409             : 
    4410           0 :         synchronize_net();
    4411             : 
    4412           0 :         err = -EBUSY;
    4413           0 :         mutex_lock(&po->pg_vec_lock);
    4414           0 :         if (closing || atomic_read(&po->mapped) == 0) {
                                       /*
                                        * Install the new state. After the swaps the locals (and
                                        * req->tp_block_nr) hold the OLD ring's pg_vec, owner map,
                                        * order and length, which are freed at the end.
                                        */
    4415           0 :                 err = 0;
    4416           0 :                 spin_lock_bh(&rb_queue->lock);
    4417           0 :                 swap(rb->pg_vec, pg_vec);
    4418           0 :                 if (po->tp_version <= TPACKET_V2)
    4419           0 :                         swap(rb->rx_owner_map, rx_owner_map);
    4420           0 :                 rb->frame_max = (req->tp_frame_nr - 1);
    4421           0 :                 rb->head = 0;
    4422           0 :                 rb->frame_size = req->tp_frame_size;
    4423           0 :                 spin_unlock_bh(&rb_queue->lock);
    4424             : 
    4425           0 :                 swap(rb->pg_vec_order, order);
    4426           0 :                 swap(rb->pg_vec_len, req->tp_block_nr);
    4427             : 
    4428           0 :                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
                                       /* Receive via the ring only while an RX ring exists. */
    4429           0 :                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
    4430           0 :                                                 tpacket_rcv : packet_rcv;
    4431           0 :                 skb_queue_purge(rb_queue);
    4432           0 :                 if (atomic_read(&po->mapped))
    4433           0 :                         pr_err("packet_mmap: vma is busy: %d\n",
    4434             :                                atomic_read(&po->mapped));
    4435             :         }
    4436           0 :         mutex_unlock(&po->pg_vec_lock);
    4437             : 
                               /* Re-attach the socket if it was running before. */
    4438           0 :         spin_lock(&po->bind_lock);
    4439           0 :         if (was_running) {
    4440           0 :                 po->num = num;
    4441           0 :                 register_prot_hook(sk);
    4442             :         }
    4443           0 :         spin_unlock(&po->bind_lock);
    4444           0 :         if (pg_vec && (po->tp_version > TPACKET_V2)) {
    4445             :                 /* Because we don't support block-based V3 on tx-ring */
    4446           0 :                 if (!tx_ring)
    4447           0 :                         prb_shutdown_retire_blk_timer(po, rb_queue);
    4448             :         }
    4449             : 
    4450           0 : out_free_pg_vec:
                               /*
                                * Frees whatever the locals hold now: the old ring after a
                                * successful swap, or the unused new allocation on failure.
                                *
                                * NOTE(review): rx_owner_map is freed even when pg_vec is NULL.
                                * Confirm rb->rx_owner_map cannot hold stale non-bitmap state
                                * (e.g. after a version switch) when it is swapped out above.
                                */
    4451           0 :         bitmap_free(rx_owner_map);
    4452           0 :         if (pg_vec)
    4453           0 :                 free_pg_vec(pg_vec, order, req->tp_block_nr);
    4454           0 : out:
    4455           0 :         return err;
    4456             : }
    4457             : 
    4458           0 : static int packet_mmap(struct file *file, struct socket *sock,
    4459             :                 struct vm_area_struct *vma)
    4460             : {
                               /*
                                * mmap() handler: map the socket's RX ring followed by its TX
                                * ring (whichever exist) into @vma, one page at a time.
                                *
                                * Returns 0 on success; -EINVAL if vm_pgoff is non-zero, no ring
                                * exists, or the VMA size differs from the combined ring size;
                                * otherwise the error returned by vm_insert_page().
                                */
    4461           0 :         struct sock *sk = sock->sk;
    4462           0 :         struct packet_sock *po = pkt_sk(sk);
    4463           0 :         unsigned long size, expected_size;
    4464           0 :         struct packet_ring_buffer *rb;
    4465           0 :         unsigned long start;
    4466           0 :         int err = -EINVAL;
    4467           0 :         int i;
    4468             : 
    4469           0 :         if (vma->vm_pgoff)
    4470             :                 return -EINVAL;
    4471             : 
    4472           0 :         mutex_lock(&po->pg_vec_lock);
    4473             : 
                               /*
                                * Sum the byte size of every ring that is set up.
                                * NOTE(review): the pointer walk assumes rx_ring is laid out
                                * directly before tx_ring in struct packet_sock - confirm.
                                */
    4474           0 :         expected_size = 0;
    4475           0 :         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
    4476           0 :                 if (rb->pg_vec) {
    4477           0 :                         expected_size += rb->pg_vec_len
    4478           0 :                                                 * rb->pg_vec_pages
    4479           0 :                                                 * PAGE_SIZE;
    4480             :                 }
    4481             :         }
    4482             : 
    4483           0 :         if (expected_size == 0)
    4484           0 :                 goto out;
    4485             : 
                               /* The mapping must cover the rings exactly. */
    4486           0 :         size = vma->vm_end - vma->vm_start;
    4487           0 :         if (size != expected_size)
    4488           0 :                 goto out;
    4489             : 
    4490             :         start = vma->vm_start;
    4491           0 :         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
    4492           0 :                 if (rb->pg_vec == NULL)
    4493           0 :                         continue;
    4494             : 
    4495           0 :                 for (i = 0; i < rb->pg_vec_len; i++) {
    4496           0 :                         struct page *page;
    4497           0 :                         void *kaddr = rb->pg_vec[i].buffer;
    4498           0 :                         int pg_num;
    4499             : 
                                                       /* Insert each page of this block at the next user address. */
    4500           0 :                         for (pg_num = 0; pg_num < rb->pg_vec_pages; pg_num++) {
    4501           0 :                                 page = pgv_to_page(kaddr);
    4502           0 :                                 err = vm_insert_page(vma, start, page);
    4503           0 :                                 if (unlikely(err))
    4504           0 :                                         goto out;
    4505           0 :                                 start += PAGE_SIZE;
    4506           0 :                                 kaddr += PAGE_SIZE;
    4507             :                         }
    4508             :                 }
    4509             :         }
    4510             : 
                               /* Count the mapping; later VMA open/close go through packet_mmap_ops. */
    4511           0 :         atomic_inc(&po->mapped);
    4512           0 :         vma->vm_ops = &packet_mmap_ops;
    4513           0 :         err = 0;
    4514             : 
    4515           0 : out:
    4516           0 :         mutex_unlock(&po->pg_vec_lock);
    4517           0 :         return err;
    4518             : }
    4519             : 
    4520             : static const struct proto_ops packet_ops_spkt = {
                               /*
                                * Socket operations table using the *_spkt bind/getname/sendmsg
                                * handlers. This table sets no setsockopt/getsockopt, uses the
                                * generic datagram_poll, and does not support mmap.
                                */
    4521             :         .family =       PF_PACKET,
    4522             :         .owner =        THIS_MODULE,
    4523             :         .release =      packet_release,
    4524             :         .bind =         packet_bind_spkt,
    4525             :         .connect =      sock_no_connect,
    4526             :         .socketpair =   sock_no_socketpair,
    4527             :         .accept =       sock_no_accept,
    4528             :         .getname =      packet_getname_spkt,
    4529             :         .poll =         datagram_poll,
    4530             :         .ioctl =        packet_ioctl,
    4531             :         .gettstamp =    sock_gettstamp,
    4532             :         .listen =       sock_no_listen,
    4533             :         .shutdown =     sock_no_shutdown,
    4534             :         .sendmsg =      packet_sendmsg_spkt,
    4535             :         .recvmsg =      packet_recvmsg,
    4536             :         .mmap =         sock_no_mmap,
    4537             :         .sendpage =     sock_no_sendpage,
    4538             : };
    4539             : 
    4540             : static const struct proto_ops packet_ops = {
                               /*
                                * Socket operations table with the full packet socket feature
                                * set: ring-aware poll, set/getsockopt and mmap of the rings.
                                */
    4541             :         .family =       PF_PACKET,
    4542             :         .owner =        THIS_MODULE,
    4543             :         .release =      packet_release,
    4544             :         .bind =         packet_bind,
    4545             :         .connect =      sock_no_connect,
    4546             :         .socketpair =   sock_no_socketpair,
    4547             :         .accept =       sock_no_accept,
    4548             :         .getname =      packet_getname,
    4549             :         .poll =         packet_poll,
    4550             :         .ioctl =        packet_ioctl,
    4551             :         .gettstamp =    sock_gettstamp,
    4552             :         .listen =       sock_no_listen,
    4553             :         .shutdown =     sock_no_shutdown,
    4554             :         .setsockopt =   packet_setsockopt,
    4555             :         .getsockopt =   packet_getsockopt,
    4556             :         .sendmsg =      packet_sendmsg,
    4557             :         .recvmsg =      packet_recvmsg,
    4558             :         .mmap =         packet_mmap,
    4559             :         .sendpage =     sock_no_sendpage,
    4560             : };
    4561             : 
    4562             : static const struct net_proto_family packet_family_ops = {
                               /* Protocol family descriptor: PF_PACKET sockets are created by packet_create(). */
    4563             :         .family =       PF_PACKET,
    4564             :         .create =       packet_create,
    4565             :         .owner  =       THIS_MODULE,
    4566             : };
    4567             : 
    4568             : static struct notifier_block packet_netdev_notifier = {
                               /* Notifier block whose callback is packet_notifier(). */
    4569             :         .notifier_call =        packet_notifier,
    4570             : };
    4571             : 
    4572             : #ifdef CONFIG_PROC_FS
    4573             : 
    4574           0 : static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
    4575             :         __acquires(RCU)
    4576             : {
                               /*
                                * seq_file start hook: take rcu_read_lock() and return the entry
                                * at *pos of this netns' packet socket list (header included).
                                * The RCU read lock stays held on return; packet_seq_stop()
                                * releases it.
                                */
    4577           0 :         struct net *net = seq_file_net(seq);
    4578             : 
    4579           0 :         rcu_read_lock();
    4580           0 :         return seq_hlist_start_head_rcu(&net->packet.sklist, *pos);
    4581             : }
    4582             : 
    4583           0 : static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
    4584             : {
    4585           0 :         struct net *net = seq_file_net(seq);
    4586           0 :         return seq_hlist_next_rcu(v, &net->packet.sklist, pos);
    4587             : }
    4588             : 
    4589           0 : static void packet_seq_stop(struct seq_file *seq, void *v)
    4590             :         __releases(RCU)
    4591             : {
    4592           0 :         rcu_read_unlock();
    4593           0 : }
    4594             : 
    4595           0 : static int packet_seq_show(struct seq_file *seq, void *v)
    4596             : {
    4597           0 :         if (v == SEQ_START_TOKEN)
    4598           0 :                 seq_printf(seq,
    4599             :                            "%*sRefCnt Type Proto  Iface R Rmem   User   Inode\n",
    4600             :                            IS_ENABLED(CONFIG_64BIT) ? -17 : -9, "sk");
    4601             :         else {
    4602           0 :                 struct sock *s = sk_entry(v);
    4603           0 :                 const struct packet_sock *po = pkt_sk(s);
    4604             : 
    4605           0 :                 seq_printf(seq,
    4606             :                            "%pK %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
    4607             :                            s,
    4608           0 :                            refcount_read(&s->sk_refcnt),
    4609           0 :                            s->sk_type,
    4610           0 :                            ntohs(po->num),
    4611             :                            po->ifindex,
    4612             :                            po->running,
    4613           0 :                            atomic_read(&s->sk_rmem_alloc),
    4614             :                            from_kuid_munged(seq_user_ns(seq), sock_i_uid(s)),
    4615             :                            sock_i_ino(s));
    4616             :         }
    4617             : 
    4618           0 :         return 0;
    4619             : }
    4620             : 
    4621             : static const struct seq_operations packet_seq_ops = {
    4622             :         .start  = packet_seq_start,
    4623             :         .next   = packet_seq_next,
    4624             :         .stop   = packet_seq_stop,
    4625             :         .show   = packet_seq_show,
    4626             : };
    4627             : #endif
    4628             : 
    4629           1 : static int __net_init packet_net_init(struct net *net)
    4630             : {
    4631           1 :         mutex_init(&net->packet.sklist_lock);
    4632           1 :         INIT_HLIST_HEAD(&net->packet.sklist);
    4633             : 
    4634             : #ifdef CONFIG_PROC_FS
    4635           1 :         if (!proc_create_net("packet", 0, net->proc_net, &packet_seq_ops,
    4636             :                         sizeof(struct seq_net_private)))
    4637           0 :                 return -ENOMEM;
    4638             : #endif /* CONFIG_PROC_FS */
    4639             : 
    4640             :         return 0;
    4641             : }
    4642             : 
    4643           0 : static void __net_exit packet_net_exit(struct net *net)
    4644             : {
    4645           0 :         remove_proc_entry("packet", net->proc_net);
    4646           0 :         WARN_ON_ONCE(!hlist_empty(&net->packet.sklist));
    4647           0 : }
    4648             : 
    4649             : static struct pernet_operations packet_net_ops = {
    4650             :         .init = packet_net_init,
    4651             :         .exit = packet_net_exit,
    4652             : };
    4653             : 
    4654             : 
    4655           0 : static void __exit packet_exit(void)
    4656             : {
    4657           0 :         unregister_netdevice_notifier(&packet_netdev_notifier);
    4658           0 :         unregister_pernet_subsys(&packet_net_ops);
    4659           0 :         sock_unregister(PF_PACKET);
    4660           0 :         proto_unregister(&packet_proto);
    4661           0 : }
    4662             : 
    4663           1 : static int __init packet_init(void)
    4664             : {
    4665           1 :         int rc;
    4666             : 
    4667           1 :         rc = proto_register(&packet_proto, 0);
    4668           1 :         if (rc)
    4669           0 :                 goto out;
    4670           1 :         rc = sock_register(&packet_family_ops);
    4671           1 :         if (rc)
    4672           0 :                 goto out_proto;
    4673           1 :         rc = register_pernet_subsys(&packet_net_ops);
    4674           1 :         if (rc)
    4675           0 :                 goto out_sock;
    4676           1 :         rc = register_netdevice_notifier(&packet_netdev_notifier);
    4677           1 :         if (rc)
    4678           0 :                 goto out_pernet;
    4679             : 
    4680             :         return 0;
    4681             : 
    4682           0 : out_pernet:
    4683           0 :         unregister_pernet_subsys(&packet_net_ops);
    4684           0 : out_sock:
    4685           0 :         sock_unregister(PF_PACKET);
    4686           0 : out_proto:
    4687           0 :         proto_unregister(&packet_proto);
    4688             : out:
    4689             :         return rc;
    4690             : }
    4691             : 
    4692             : module_init(packet_init);
    4693             : module_exit(packet_exit);
    4694             : MODULE_LICENSE("GPL");
    4695             : MODULE_ALIAS_NETPROTO(PF_PACKET);

Generated by: LCOV version 1.14