Line data Source code
1 : // SPDX-License-Identifier: GPL-2.0-or-later
2 : /*
3 : * NET3 Protocol independent device support routines.
4 : *
5 : * Derived from the non IP parts of dev.c 1.0.19
6 : * Authors: Ross Biro
7 : * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
8 : * Mark Evans, <evansmp@uhura.aston.ac.uk>
9 : *
10 : * Additional Authors:
11 : * Florian la Roche <rzsfl@rz.uni-sb.de>
12 : * Alan Cox <gw4pts@gw4pts.ampr.org>
13 : * David Hinds <dahinds@users.sourceforge.net>
14 : * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
15 : * Adam Sulmicki <adam@cfar.umd.edu>
16 : * Pekka Riikonen <priikone@poesidon.pspt.fi>
17 : *
18 : * Changes:
19 : * D.J. Barrow : Fixed bug where dev->refcnt gets set
20 : * to 2 if register_netdev gets called
21 : * before net_dev_init & also removed a
22 : * few lines of code in the process.
23 : * Alan Cox : device private ioctl copies fields back.
24 : * Alan Cox : Transmit queue code does relevant
25 : * stunts to keep the queue safe.
26 : * Alan Cox : Fixed double lock.
27 : * Alan Cox : Fixed promisc NULL pointer trap
28 : * ???????? : Support the full private ioctl range
29 : * Alan Cox : Moved ioctl permission check into
30 : * drivers
31 : * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
32 : * Alan Cox : 100 backlog just doesn't cut it when
33 : * you start doing multicast video 8)
34 : * Alan Cox : Rewrote net_bh and list manager.
35 : * Alan Cox : Fix ETH_P_ALL echoback lengths.
36 : * Alan Cox : Took out transmit every packet pass
37 : * Saved a few bytes in the ioctl handler
38 : * Alan Cox : Network driver sets packet type before
39 : * calling netif_rx. Saves a function
40 : * call a packet.
41 : * Alan Cox : Hashed net_bh()
42 : * Richard Kooijman: Timestamp fixes.
43 : * Alan Cox : Wrong field in SIOCGIFDSTADDR
44 : * Alan Cox : Device lock protection.
45 : * Alan Cox : Fixed nasty side effect of device close
46 : * changes.
47 : * Rudi Cilibrasi : Pass the right thing to
48 : * set_mac_address()
49 : * Dave Miller : 32bit quantity for the device lock to
50 : * make it work out on a Sparc.
51 : * Bjorn Ekwall : Added KERNELD hack.
52 : * Alan Cox : Cleaned up the backlog initialise.
53 : * Craig Metz : SIOCGIFCONF fix if space for under
54 : * 1 device.
55 : * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
56 : * is no device open function.
57 : * Andi Kleen : Fix error reporting for SIOCGIFCONF
58 : * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
59 : * Cyrus Durgin : Cleaned for KMOD
60 : * Adam Sulmicki : Bug Fix : Network Device Unload
61 : * A network device unload needs to purge
62 : * the backlog queue.
63 : * Paul Rusty Russell : SIOCSIFNAME
64 : * Pekka Riikonen : Netdev boot-time settings code
65 : * Andrew Morton : Make unregister_netdevice wait
66 : * indefinitely on dev->refcnt
67 : * J Hadi Salim : - Backlog queue sampling
68 : * - netif_rx() feedback
69 : */
70 :
71 : #include <linux/uaccess.h>
72 : #include <linux/bitops.h>
73 : #include <linux/capability.h>
74 : #include <linux/cpu.h>
75 : #include <linux/types.h>
76 : #include <linux/kernel.h>
77 : #include <linux/hash.h>
78 : #include <linux/slab.h>
79 : #include <linux/sched.h>
80 : #include <linux/sched/mm.h>
81 : #include <linux/mutex.h>
82 : #include <linux/rwsem.h>
83 : #include <linux/string.h>
84 : #include <linux/mm.h>
85 : #include <linux/socket.h>
86 : #include <linux/sockios.h>
87 : #include <linux/errno.h>
88 : #include <linux/interrupt.h>
89 : #include <linux/if_ether.h>
90 : #include <linux/netdevice.h>
91 : #include <linux/etherdevice.h>
92 : #include <linux/ethtool.h>
93 : #include <linux/skbuff.h>
94 : #include <linux/kthread.h>
95 : #include <linux/bpf.h>
96 : #include <linux/bpf_trace.h>
97 : #include <net/net_namespace.h>
98 : #include <net/sock.h>
99 : #include <net/busy_poll.h>
100 : #include <linux/rtnetlink.h>
101 : #include <linux/stat.h>
102 : #include <net/dsa.h>
103 : #include <net/dst.h>
104 : #include <net/dst_metadata.h>
105 : #include <net/gro.h>
106 : #include <net/pkt_sched.h>
107 : #include <net/pkt_cls.h>
108 : #include <net/checksum.h>
109 : #include <net/xfrm.h>
110 : #include <linux/highmem.h>
111 : #include <linux/init.h>
112 : #include <linux/module.h>
113 : #include <linux/netpoll.h>
114 : #include <linux/rcupdate.h>
115 : #include <linux/delay.h>
116 : #include <net/iw_handler.h>
117 : #include <asm/current.h>
118 : #include <linux/audit.h>
119 : #include <linux/dmaengine.h>
120 : #include <linux/err.h>
121 : #include <linux/ctype.h>
122 : #include <linux/if_arp.h>
123 : #include <linux/if_vlan.h>
124 : #include <linux/ip.h>
125 : #include <net/ip.h>
126 : #include <net/mpls.h>
127 : #include <linux/ipv6.h>
128 : #include <linux/in.h>
129 : #include <linux/jhash.h>
130 : #include <linux/random.h>
131 : #include <trace/events/napi.h>
132 : #include <trace/events/net.h>
133 : #include <trace/events/skb.h>
134 : #include <linux/inetdevice.h>
135 : #include <linux/cpu_rmap.h>
136 : #include <linux/static_key.h>
137 : #include <linux/hashtable.h>
138 : #include <linux/vmalloc.h>
139 : #include <linux/if_macvlan.h>
140 : #include <linux/errqueue.h>
141 : #include <linux/hrtimer.h>
142 : #include <linux/netfilter_ingress.h>
143 : #include <linux/crash_dump.h>
144 : #include <linux/sctp.h>
145 : #include <net/udp_tunnel.h>
146 : #include <linux/net_namespace.h>
147 : #include <linux/indirect_call_wrapper.h>
148 : #include <net/devlink.h>
149 : #include <linux/pm_runtime.h>
150 : #include <linux/prandom.h>
151 :
152 : #include "net-sysfs.h"
153 :
154 : #define MAX_GRO_SKBS 8
155 :
156 : /* This should be increased if a protocol with a bigger head is added. */
157 : #define GRO_MAX_HEAD (MAX_HEADER + 128)
158 :
159 : static DEFINE_SPINLOCK(ptype_lock);
160 : static DEFINE_SPINLOCK(offload_lock);
161 : struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
162 : struct list_head ptype_all __read_mostly; /* Taps */
163 : static struct list_head offload_base __read_mostly;
164 :
165 : static int netif_rx_internal(struct sk_buff *skb);
166 : static int call_netdevice_notifiers_info(unsigned long val,
167 : struct netdev_notifier_info *info);
168 : static int call_netdevice_notifiers_extack(unsigned long val,
169 : struct net_device *dev,
170 : struct netlink_ext_ack *extack);
171 : static struct napi_struct *napi_by_id(unsigned int napi_id);
172 :
173 : /*
174 : * The @dev_base_head list is protected by @dev_base_lock and the rtnl
175 : * semaphore.
176 : *
177 : * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
178 : *
179 : * Writers must hold the rtnl semaphore while they loop through the
180 : * dev_base_head list, and hold dev_base_lock for writing when they do the
181 : * actual updates. This allows pure readers to access the list even
182 : * while a writer is preparing to update it.
183 : *
184 : * To put it another way, dev_base_lock is held for writing only to
185 : * protect against pure readers; the rtnl semaphore provides the
186 : * protection against other writers.
187 : *
188 : * See, for example usages, register_netdevice() and
189 : * unregister_netdevice(), which must be called with the rtnl
190 : * semaphore held.
191 : */
192 : DEFINE_RWLOCK(dev_base_lock);
193 : EXPORT_SYMBOL(dev_base_lock);
194 :
195 : static DEFINE_MUTEX(ifalias_mutex);
196 :
197 : /* protects napi_hash addition/deletion and napi_gen_id */
198 : static DEFINE_SPINLOCK(napi_hash_lock);
199 :
200 : static unsigned int napi_gen_id = NR_CPUS;
201 : static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
202 :
203 : static DECLARE_RWSEM(devnet_rename_sem);
204 :
205 0 : static inline void dev_base_seq_inc(struct net *net)
206 : {
207 2 : while (++net->dev_base_seq == 0)
208 2 : ;
209 : }
210 :
211 48 : static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
212 : {
213 48 : unsigned int hash = full_name_hash(net, name, strnlen(name, IFNAMSIZ));
214 :
215 48 : return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
216 : }
217 :
218 25 : static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
219 : {
220 25 : return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
221 : }
222 :
223 0 : static inline void rps_lock(struct softnet_data *sd)
224 : {
225 : #ifdef CONFIG_RPS
226 0 : spin_lock(&sd->input_pkt_queue.lock);
227 : #endif
228 : }
229 :
230 0 : static inline void rps_unlock(struct softnet_data *sd)
231 : {
232 : #ifdef CONFIG_RPS
233 0 : spin_unlock(&sd->input_pkt_queue.lock);
234 : #endif
235 : }
236 :
237 2 : static struct netdev_name_node *netdev_name_node_alloc(struct net_device *dev,
238 : const char *name)
239 : {
240 2 : struct netdev_name_node *name_node;
241 :
242 2 : name_node = kmalloc(sizeof(*name_node), GFP_KERNEL);
243 2 : if (!name_node)
244 : return NULL;
245 2 : INIT_HLIST_NODE(&name_node->hlist);
246 2 : name_node->dev = dev;
247 2 : name_node->name = name;
248 2 : return name_node;
249 : }
250 :
251 : static struct netdev_name_node *
252 2 : netdev_name_node_head_alloc(struct net_device *dev)
253 : {
254 2 : struct netdev_name_node *name_node;
255 :
256 2 : name_node = netdev_name_node_alloc(dev, dev->name);
257 2 : if (!name_node)
258 : return NULL;
259 2 : INIT_LIST_HEAD(&name_node->list);
260 2 : return name_node;
261 : }
262 :
263 0 : static void netdev_name_node_free(struct netdev_name_node *name_node)
264 : {
265 0 : kfree(name_node);
266 : }
267 :
268 2 : static void netdev_name_node_add(struct net *net,
269 : struct netdev_name_node *name_node)
270 : {
271 2 : hlist_add_head_rcu(&name_node->hlist,
272 : dev_name_hash(net, name_node->name));
273 2 : }
274 :
275 0 : static void netdev_name_node_del(struct netdev_name_node *name_node)
276 : {
277 0 : hlist_del_rcu(&name_node->hlist);
278 : }
279 :
280 18 : static struct netdev_name_node *netdev_name_node_lookup(struct net *net,
281 : const char *name)
282 : {
283 18 : struct hlist_head *head = dev_name_hash(net, name);
284 18 : struct netdev_name_node *name_node;
285 :
286 36 : hlist_for_each_entry(name_node, head, hlist)
287 8 : if (!strcmp(name_node->name, name))
288 8 : return name_node;
289 : return NULL;
290 : }
291 :
292 28 : static struct netdev_name_node *netdev_name_node_lookup_rcu(struct net *net,
293 : const char *name)
294 : {
295 28 : struct hlist_head *head = dev_name_hash(net, name);
296 28 : struct netdev_name_node *name_node;
297 :
298 56 : hlist_for_each_entry_rcu(name_node, head, hlist)
299 26 : if (!strcmp(name_node->name, name))
300 26 : return name_node;
301 : return NULL;
302 : }
303 :
304 0 : int netdev_name_node_alt_create(struct net_device *dev, const char *name)
305 : {
306 0 : struct netdev_name_node *name_node;
307 0 : struct net *net = dev_net(dev);
308 :
309 0 : name_node = netdev_name_node_lookup(net, name);
310 0 : if (name_node)
311 : return -EEXIST;
312 0 : name_node = netdev_name_node_alloc(dev, name);
313 0 : if (!name_node)
314 : return -ENOMEM;
315 0 : netdev_name_node_add(net, name_node);
316 : /* The node that holds dev->name acts as a head of per-device list. */
317 0 : list_add_tail(&name_node->list, &dev->name_node->list);
318 :
319 0 : return 0;
320 : }
321 : EXPORT_SYMBOL(netdev_name_node_alt_create);
322 :
323 0 : static void __netdev_name_node_alt_destroy(struct netdev_name_node *name_node)
324 : {
325 0 : list_del(&name_node->list);
326 0 : netdev_name_node_del(name_node);
327 0 : kfree(name_node->name);
328 0 : netdev_name_node_free(name_node);
329 0 : }
330 :
331 0 : int netdev_name_node_alt_destroy(struct net_device *dev, const char *name)
332 : {
333 0 : struct netdev_name_node *name_node;
334 0 : struct net *net = dev_net(dev);
335 :
336 0 : name_node = netdev_name_node_lookup(net, name);
337 0 : if (!name_node)
338 : return -ENOENT;
339 : /* lookup might have found our primary name or a name belonging
340 : * to another device.
341 : */
342 0 : if (name_node == dev->name_node || name_node->dev != dev)
343 : return -EINVAL;
344 :
345 0 : __netdev_name_node_alt_destroy(name_node);
346 :
347 0 : return 0;
348 : }
349 : EXPORT_SYMBOL(netdev_name_node_alt_destroy);
350 :
351 0 : static void netdev_name_node_alt_flush(struct net_device *dev)
352 : {
353 0 : struct netdev_name_node *name_node, *tmp;
354 :
355 0 : list_for_each_entry_safe(name_node, tmp, &dev->name_node->list, list)
356 0 : __netdev_name_node_alt_destroy(name_node);
357 0 : }
358 :
359 : /* Device list insertion */
360 2 : static void list_netdevice(struct net_device *dev)
361 : {
362 2 : struct net *net = dev_net(dev);
363 :
364 2 : ASSERT_RTNL();
365 :
366 2 : write_lock_bh(&dev_base_lock);
367 2 : list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
368 2 : netdev_name_node_add(net, dev->name_node);
369 2 : hlist_add_head_rcu(&dev->index_hlist,
370 : dev_index_hash(net, dev->ifindex));
371 2 : write_unlock_bh(&dev_base_lock);
372 :
373 2 : dev_base_seq_inc(net);
374 2 : }
375 :
376 : /* Device list removal
377 : * caller must respect an RCU grace period before freeing/reusing dev
378 : */
379 0 : static void unlist_netdevice(struct net_device *dev)
380 : {
381 0 : ASSERT_RTNL();
382 :
383 : /* Unlink dev from the device chain */
384 0 : write_lock_bh(&dev_base_lock);
385 0 : list_del_rcu(&dev->dev_list);
386 0 : netdev_name_node_del(dev->name_node);
387 0 : hlist_del_rcu(&dev->index_hlist);
388 0 : write_unlock_bh(&dev_base_lock);
389 :
390 0 : dev_base_seq_inc(dev_net(dev));
391 0 : }
392 :
393 : /*
394 : * Our notifier list
395 : */
396 :
397 : static RAW_NOTIFIER_HEAD(netdev_chain);
398 :
399 : /*
400 : * Device drivers call our routines to queue packets here. We empty the
401 : * queue in the local softnet handler.
402 : */
403 :
404 : DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
405 : EXPORT_PER_CPU_SYMBOL(softnet_data);
406 :
407 : #ifdef CONFIG_LOCKDEP
408 : /*
409 : * register_netdevice() inits txq->_xmit_lock and sets lockdep class
410 : * according to dev->type
411 : */
412 : static const unsigned short netdev_lock_type[] = {
413 : ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
414 : ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
415 : ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
416 : ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
417 : ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
418 : ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
419 : ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
420 : ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
421 : ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
422 : ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
423 : ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
424 : ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
425 : ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
426 : ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
427 : ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
428 :
429 : static const char *const netdev_lock_name[] = {
430 : "_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
431 : "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
432 : "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
433 : "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
434 : "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
435 : "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
436 : "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
437 : "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
438 : "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
439 : "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
440 : "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
441 : "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
442 : "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
443 : "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
444 : "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
445 :
446 : static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
447 : static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
448 :
449 5 : static inline unsigned short netdev_lock_pos(unsigned short dev_type)
450 : {
451 5 : int i;
452 :
453 106 : for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
454 106 : if (netdev_lock_type[i] == dev_type)
455 5 : return i;
456 : /* the last key is used by default */
457 : return ARRAY_SIZE(netdev_lock_type) - 1;
458 : }
459 :
460 3 : static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
461 : unsigned short dev_type)
462 : {
463 3 : int i;
464 :
465 3 : i = netdev_lock_pos(dev_type);
466 3 : lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
467 : netdev_lock_name[i]);
468 3 : }
469 :
470 2 : static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
471 : {
472 2 : int i;
473 :
474 2 : i = netdev_lock_pos(dev->type);
475 2 : lockdep_set_class_and_name(&dev->addr_list_lock,
476 : &netdev_addr_lock_key[i],
477 : netdev_lock_name[i]);
478 2 : }
479 : #else
480 : static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
481 : unsigned short dev_type)
482 : {
483 : }
484 :
485 : static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
486 : {
487 : }
488 : #endif
489 :
490 : /*******************************************************************************
491 : *
492 : * Protocol management and registration routines
493 : *
494 : *******************************************************************************/
495 :
496 :
497 : /*
498 : * Add a protocol ID to the list. Now that the input handler is
499 : * smarter we can dispense with all the messy stuff that used to be
500 : * here.
501 : *
502 : * BEWARE!!! Protocol handlers, mangling input packets,
503 : * MUST BE last in hash buckets and checking protocol handlers
504 : * MUST start from promiscuous ptype_all chain in net_bh.
505 : * It is true now, do not change it.
506 : * Explanation follows: if a protocol handler that mangles the packet
507 : * is first on the list, it cannot sense that the packet is cloned
508 : * and should be copied-on-write, so it will change it and subsequent
509 : * readers will get a broken packet.
510 : * --ANK (980803)
511 : */
512 :
513 5 : static inline struct list_head *ptype_head(const struct packet_type *pt)
514 : {
515 5 : if (pt->type == htons(ETH_P_ALL))
516 3 : return pt->dev ? &pt->dev->ptype_all : &ptype_all;
517 : else
518 2 : return pt->dev ? &pt->dev->ptype_specific :
519 2 : &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
520 : }
521 :
522 : /**
523 : * dev_add_pack - add packet handler
524 : * @pt: packet type declaration
525 : *
526 : * Add a protocol handler to the networking stack. The passed &packet_type
527 : * is linked into kernel lists and may not be freed until it has been
528 : * removed from the kernel lists.
529 : *
530 : * This call does not sleep, therefore it cannot
531 : * guarantee that all CPUs that are in the middle of receiving packets
532 : * will see the new packet type (until the next received packet).
533 : */
534 :
535 4 : void dev_add_pack(struct packet_type *pt)
536 : {
537 4 : struct list_head *head = ptype_head(pt);
538 :
539 4 : spin_lock(&ptype_lock);
540 4 : list_add_rcu(&pt->list, head);
541 4 : spin_unlock(&ptype_lock);
542 4 : }
543 : EXPORT_SYMBOL(dev_add_pack);
544 :
545 : /**
546 : * __dev_remove_pack - remove packet handler
547 : * @pt: packet type declaration
548 : *
549 : * Remove a protocol handler that was previously added to the kernel
550 : * protocol handlers by dev_add_pack(). The passed &packet_type is removed
551 : * from the kernel lists and can be freed or reused once this function
552 : * returns.
553 : *
554 : * The packet type might still be in use by receivers
555 : * and must not be freed until after all the CPUs have gone
556 : * through a quiescent state.
557 : */
558 1 : void __dev_remove_pack(struct packet_type *pt)
559 : {
560 1 : struct list_head *head = ptype_head(pt);
561 1 : struct packet_type *pt1;
562 :
563 1 : spin_lock(&ptype_lock);
564 :
565 1 : list_for_each_entry(pt1, head, list) {
566 1 : if (pt == pt1) {
567 1 : list_del_rcu(&pt->list);
568 1 : goto out;
569 : }
570 : }
571 :
572 0 : pr_warn("dev_remove_pack: %p not found\n", pt);
573 1 : out:
574 1 : spin_unlock(&ptype_lock);
575 1 : }
576 : EXPORT_SYMBOL(__dev_remove_pack);
577 :
578 : /**
579 : * dev_remove_pack - remove packet handler
580 : * @pt: packet type declaration
581 : *
582 : * Remove a protocol handler that was previously added to the kernel
583 : * protocol handlers by dev_add_pack(). The passed &packet_type is removed
584 : * from the kernel lists and can be freed or reused once this function
585 : * returns.
586 : *
587 : * This call sleeps to guarantee that no CPU is looking at the packet
588 : * type after return.
589 : */
590 0 : void dev_remove_pack(struct packet_type *pt)
591 : {
592 0 : __dev_remove_pack(pt);
593 :
594 0 : synchronize_net();
595 0 : }
596 : EXPORT_SYMBOL(dev_remove_pack);
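/* A minimal tap sketch (illustrative, not part of dev.c): registering a
 * packet_type with dev_add_pack() and removing it with dev_remove_pack().
 * The names example_tap_rcv/example_tap are hypothetical. A handler owns the
 * skb reference it is given and must free or consume it; an ETH_P_ALL tap
 * sees shared packet data and must not modify it.
 */
static int example_tap_rcv(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        /* Inspect skb here (read-only), then release our reference. */
        kfree_skb(skb);
        return NET_RX_SUCCESS;
}

static struct packet_type example_tap __read_mostly = {
        .type = cpu_to_be16(ETH_P_ALL), /* tap every protocol */
        .func = example_tap_rcv,
};

/* dev_add_pack(&example_tap) starts delivery; dev_remove_pack(&example_tap)
 * stops it and sleeps until no CPU can still be using the handler.
 */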
597 :
598 :
599 : /**
600 : * dev_add_offload - register offload handlers
601 : * @po: protocol offload declaration
602 : *
603 : * Add protocol offload handlers to the networking stack. The passed
604 : * &proto_offload is linked into kernel lists and may not be freed until
605 : * it has been removed from the kernel lists.
606 : *
607 : * This call does not sleep, therefore it cannot
608 : * guarantee that all CPUs that are in the middle of receiving packets
609 : * will see the new offload handlers (until the next received packet).
610 : */
611 3 : void dev_add_offload(struct packet_offload *po)
612 : {
613 3 : struct packet_offload *elem;
614 :
615 3 : spin_lock(&offload_lock);
616 4 : list_for_each_entry(elem, &offload_base, list) {
617 3 : if (po->priority < elem->priority)
618 : break;
619 : }
620 3 : list_add_rcu(&po->list, elem->list.prev);
621 3 : spin_unlock(&offload_lock);
622 3 : }
623 : EXPORT_SYMBOL(dev_add_offload);
624 :
625 : /**
626 : * __dev_remove_offload - remove offload handler
627 : * @po: packet offload declaration
628 : *
629 : * Remove a protocol offload handler that was previously added to the
630 : * kernel offload handlers by dev_add_offload(). The passed &offload_type
631 : * is removed from the kernel lists and can be freed or reused once this
632 : * function returns.
633 : *
634 : * The packet type might still be in use by receivers
635 : * and must not be freed until after all the CPUs have gone
636 : * through a quiescent state.
637 : */
638 0 : static void __dev_remove_offload(struct packet_offload *po)
639 : {
640 0 : struct list_head *head = &offload_base;
641 0 : struct packet_offload *po1;
642 :
643 0 : spin_lock(&offload_lock);
644 :
645 0 : list_for_each_entry(po1, head, list) {
646 0 : if (po == po1) {
647 0 : list_del_rcu(&po->list);
648 0 : goto out;
649 : }
650 : }
651 :
652 0 : pr_warn("dev_remove_offload: %p not found\n", po);
653 0 : out:
654 0 : spin_unlock(&offload_lock);
655 0 : }
656 :
657 : /**
658 : * dev_remove_offload - remove packet offload handler
659 : * @po: packet offload declaration
660 : *
661 : * Remove a packet offload handler that was previously added to the kernel
662 : * offload handlers by dev_add_offload(). The passed &offload_type is
663 : * removed from the kernel lists and can be freed or reused once this
664 : * function returns.
665 : *
666 : * This call sleeps to guarantee that no CPU is looking at the packet
667 : * type after return.
668 : */
669 0 : void dev_remove_offload(struct packet_offload *po)
670 : {
671 0 : __dev_remove_offload(po);
672 :
673 0 : synchronize_net();
674 0 : }
675 : EXPORT_SYMBOL(dev_remove_offload);
676 :
677 : /******************************************************************************
678 : *
679 : * Device Boot-time Settings Routines
680 : *
681 : ******************************************************************************/
682 :
683 : /* Boot time configuration table */
684 : static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
685 :
686 : /**
687 : * netdev_boot_setup_add - add new setup entry
688 : * @name: name of the device
689 : * @map: configured settings for the device
690 : *
691 : * Adds new setup entry to the dev_boot_setup list. The function
692 : * returns 0 on error and 1 on success. This is a generic routine for
693 : * all netdevices.
694 : */
695 0 : static int netdev_boot_setup_add(char *name, struct ifmap *map)
696 : {
697 0 : struct netdev_boot_setup *s;
698 0 : int i;
699 :
700 0 : s = dev_boot_setup;
701 0 : for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
702 0 : if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
703 0 : memset(s[i].name, 0, sizeof(s[i].name));
704 0 : strlcpy(s[i].name, name, IFNAMSIZ);
705 0 : memcpy(&s[i].map, map, sizeof(s[i].map));
706 0 : break;
707 : }
708 : }
709 :
710 0 : return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
711 : }
712 :
713 : /**
714 : * netdev_boot_setup_check - check boot time settings
715 : * @dev: the netdevice
716 : *
717 : * Check boot time settings for the device.
718 : * The found settings are set for the device to be used
719 : * later in the device probing.
720 : * Returns 0 if no settings are found, 1 if they are.
721 : */
722 0 : int netdev_boot_setup_check(struct net_device *dev)
723 : {
724 0 : struct netdev_boot_setup *s = dev_boot_setup;
725 0 : int i;
726 :
727 0 : for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
728 0 : if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
729 0 : !strcmp(dev->name, s[i].name)) {
730 0 : dev->irq = s[i].map.irq;
731 0 : dev->base_addr = s[i].map.base_addr;
732 0 : dev->mem_start = s[i].map.mem_start;
733 0 : dev->mem_end = s[i].map.mem_end;
734 0 : return 1;
735 : }
736 : }
737 : return 0;
738 : }
739 : EXPORT_SYMBOL(netdev_boot_setup_check);
740 :
741 :
742 : /**
743 : * netdev_boot_base - get address from boot time settings
744 : * @prefix: prefix for network device
745 : * @unit: id for network device
746 : *
747 : * Check boot time settings for the base address of the device.
748 : * The found settings are set for the device to be used
749 : * later in the device probing.
750 : * Returns 0 if no settings found.
751 : */
752 8 : unsigned long netdev_boot_base(const char *prefix, int unit)
753 : {
754 8 : const struct netdev_boot_setup *s = dev_boot_setup;
755 8 : char name[IFNAMSIZ];
756 8 : int i;
757 :
758 8 : sprintf(name, "%s%d", prefix, unit);
759 :
760 : /*
761 : * If the device is already registered then return a base of 1
762 : * to indicate not to probe for this interface
763 : */
764 80 : if (__dev_get_by_name(&init_net, name))
765 : return 1;
766 :
767 72 : for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
768 64 : if (!strcmp(name, s[i].name))
769 0 : return s[i].map.base_addr;
770 : return 0;
771 : }
772 :
773 : /*
774 : * Saves at boot time configured settings for any netdevice.
775 : */
776 0 : int __init netdev_boot_setup(char *str)
777 : {
778 0 : int ints[5];
779 0 : struct ifmap map;
780 :
781 0 : str = get_options(str, ARRAY_SIZE(ints), ints);
782 0 : if (!str || !*str)
783 : return 0;
784 :
785 : /* Save settings */
786 0 : memset(&map, 0, sizeof(map));
787 0 : if (ints[0] > 0)
788 0 : map.irq = ints[1];
789 0 : if (ints[0] > 1)
790 0 : map.base_addr = ints[2];
791 0 : if (ints[0] > 2)
792 0 : map.mem_start = ints[3];
793 0 : if (ints[0] > 3)
794 0 : map.mem_end = ints[4];
795 :
796 : /* Add new entry to the list */
797 0 : return netdev_boot_setup_add(str, &map);
798 : }
799 :
800 : __setup("netdev=", netdev_boot_setup);
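/* Usage note (hedged, derived from the parser above): "netdev=" accepts up
 * to four integers, stored as irq, base_addr, mem_start and mem_end, followed
 * by the interface name, e.g. on the kernel command line:
 *
 *      netdev=9,0x300,0,0,eth1
 */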
801 :
802 : /*******************************************************************************
803 : *
804 : * Device Interface Subroutines
805 : *
806 : *******************************************************************************/
807 :
808 : /**
809 : * dev_get_iflink - get 'iflink' value of an interface
810 : * @dev: targeted interface
811 : *
812 : * Indicates the ifindex the interface is linked to.
813 : * Physical interfaces have the same 'ifindex' and 'iflink' values.
814 : */
815 :
816 18 : int dev_get_iflink(const struct net_device *dev)
817 : {
818 18 : if (dev->netdev_ops && dev->netdev_ops->ndo_get_iflink)
819 0 : return dev->netdev_ops->ndo_get_iflink(dev);
820 :
821 18 : return dev->ifindex;
822 : }
823 : EXPORT_SYMBOL(dev_get_iflink);
824 :
825 : /**
826 : * dev_fill_metadata_dst - Retrieve tunnel egress information.
827 : * @dev: targeted interface
828 : * @skb: The packet.
829 : *
830 : * For better visibility of tunnel traffic OVS needs to retrieve
831 : * egress tunnel information for a packet. The following API allows
832 : * the user to get this info.
833 : */
834 0 : int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb)
835 : {
836 0 : struct ip_tunnel_info *info;
837 :
838 0 : if (!dev->netdev_ops || !dev->netdev_ops->ndo_fill_metadata_dst)
839 : return -EINVAL;
840 :
841 0 : info = skb_tunnel_info_unclone(skb);
842 0 : if (!info)
843 : return -ENOMEM;
844 0 : if (unlikely(!(info->mode & IP_TUNNEL_INFO_TX)))
845 : return -EINVAL;
846 :
847 0 : return dev->netdev_ops->ndo_fill_metadata_dst(dev, skb);
848 : }
849 : EXPORT_SYMBOL_GPL(dev_fill_metadata_dst);
850 :
851 : /**
852 : * __dev_get_by_name - find a device by its name
853 : * @net: the applicable net namespace
854 : * @name: name to find
855 : *
856 : * Find an interface by name. Must be called under RTNL semaphore
857 : * or @dev_base_lock. If the name is found a pointer to the device
858 : * is returned. If the name is not found then %NULL is returned. The
859 : * reference counters are not incremented so the caller must be
860 : * careful with locks.
861 : */
862 :
863 18 : struct net_device *__dev_get_by_name(struct net *net, const char *name)
864 : {
865 18 : struct netdev_name_node *node_name;
866 :
867 16 : node_name = netdev_name_node_lookup(net, name);
868 18 : return node_name ? node_name->dev : NULL;
869 : }
870 : EXPORT_SYMBOL(__dev_get_by_name);
871 :
872 : /**
873 : * dev_get_by_name_rcu - find a device by its name
874 : * @net: the applicable net namespace
875 : * @name: name to find
876 : *
877 : * Find an interface by name.
878 : * If the name is found a pointer to the device is returned.
879 : * If the name is not found then %NULL is returned.
880 : * The reference counters are not incremented so the caller must be
881 : * careful with locks. The caller must hold RCU lock.
882 : */
883 :
884 28 : struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
885 : {
886 28 : struct netdev_name_node *node_name;
887 :
888 27 : node_name = netdev_name_node_lookup_rcu(net, name);
889 28 : return node_name ? node_name->dev : NULL;
890 : }
891 : EXPORT_SYMBOL(dev_get_by_name_rcu);
892 :
893 : /**
894 : * dev_get_by_name - find a device by its name
895 : * @net: the applicable net namespace
896 : * @name: name to find
897 : *
898 : * Find an interface by name. This can be called from any
899 : * context and does its own locking. The returned handle has
900 : * the usage count incremented and the caller must use dev_put() to
901 : * release it when it is no longer needed. %NULL is returned if no
902 : * matching device is found.
903 : */
904 :
905 0 : struct net_device *dev_get_by_name(struct net *net, const char *name)
906 : {
907 0 : struct net_device *dev;
908 :
909 0 : rcu_read_lock();
910 0 : dev = dev_get_by_name_rcu(net, name);
911 0 : if (dev)
912 0 : dev_hold(dev);
913 0 : rcu_read_unlock();
914 0 : return dev;
915 : }
916 : EXPORT_SYMBOL(dev_get_by_name);
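/* A minimal lookup sketch (example_find_mtu is hypothetical): dev_get_by_name()
 * takes a reference on success, so the caller must balance it with dev_put().
 */
static int example_find_mtu(struct net *net, const char *ifname)
{
        struct net_device *dev = dev_get_by_name(net, ifname);
        int mtu;

        if (!dev)
                return -ENODEV;
        mtu = dev->mtu;
        dev_put(dev);
        return mtu;
}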
917 :
918 : /**
919 : * __dev_get_by_index - find a device by its ifindex
920 : * @net: the applicable net namespace
921 : * @ifindex: index of device
922 : *
923 : * Search for an interface by index. Returns a pointer to the device,
924 : * or %NULL if it is not found. The device has not
925 : * had its reference counter increased so the caller must be careful
926 : * about locking. The caller must hold either the RTNL semaphore
927 : * or @dev_base_lock.
928 : */
929 :
930 7 : struct net_device *__dev_get_by_index(struct net *net, int ifindex)
931 : {
932 7 : struct net_device *dev;
933 7 : struct hlist_head *head = dev_index_hash(net, ifindex);
934 :
935 14 : hlist_for_each_entry(dev, head, index_hlist)
936 5 : if (dev->ifindex == ifindex)
937 5 : return dev;
938 :
939 : return NULL;
940 : }
941 : EXPORT_SYMBOL(__dev_get_by_index);
942 :
943 : /**
944 : * dev_get_by_index_rcu - find a device by its ifindex
945 : * @net: the applicable net namespace
946 : * @ifindex: index of device
947 : *
948 : * Search for an interface by index. Returns a pointer to the device,
949 : * or %NULL if it is not found. The device has not
950 : * had its reference counter increased so the caller must be careful
951 : * about locking. The caller must hold RCU lock.
952 : */
953 :
954 16 : struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
955 : {
956 16 : struct net_device *dev;
957 16 : struct hlist_head *head = dev_index_hash(net, ifindex);
958 :
959 32 : hlist_for_each_entry_rcu(dev, head, index_hlist)
960 16 : if (dev->ifindex == ifindex)
961 16 : return dev;
962 :
963 : return NULL;
964 : }
965 : EXPORT_SYMBOL(dev_get_by_index_rcu);
966 :
967 :
968 : /**
969 : * dev_get_by_index - find a device by its ifindex
970 : * @net: the applicable net namespace
971 : * @ifindex: index of device
972 : *
973 : * Search for an interface by index. Returns a pointer to the device,
974 : * or NULL if it is not found. The device returned has
975 : * had a reference added and the pointer is safe until the user calls
976 : * dev_put to indicate they have finished with it.
977 : */
978 :
979 4 : struct net_device *dev_get_by_index(struct net *net, int ifindex)
980 : {
981 4 : struct net_device *dev;
982 :
983 4 : rcu_read_lock();
984 4 : dev = dev_get_by_index_rcu(net, ifindex);
985 4 : if (dev)
986 4 : dev_hold(dev);
987 4 : rcu_read_unlock();
988 4 : return dev;
989 : }
990 : EXPORT_SYMBOL(dev_get_by_index);
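/* A similar sketch for index lookups (example_name_by_index is hypothetical):
 * resolve an ifindex to its name, then drop the reference taken by
 * dev_get_by_index(). @buf is assumed to be at least IFNAMSIZ bytes.
 */
static int example_name_by_index(struct net *net, int ifindex, char *buf)
{
        struct net_device *dev = dev_get_by_index(net, ifindex);

        if (!dev)
                return -ENODEV;
        strlcpy(buf, dev->name, IFNAMSIZ);
        dev_put(dev);
        return 0;
}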
991 :
992 : /**
993 : * dev_get_by_napi_id - find a device by napi_id
994 : * @napi_id: ID of the NAPI struct
995 : *
996 : * Search for an interface by NAPI ID. Returns a pointer to the device,
997 : * or %NULL if it is not found. The device has not had
998 : * its reference counter increased so the caller must be careful
999 : * about locking. The caller must hold RCU lock.
1000 : */
1001 :
1002 0 : struct net_device *dev_get_by_napi_id(unsigned int napi_id)
1003 : {
1004 0 : struct napi_struct *napi;
1005 :
1006 0 : WARN_ON_ONCE(!rcu_read_lock_held());
1007 :
1008 0 : if (napi_id < MIN_NAPI_ID)
1009 : return NULL;
1010 :
1011 0 : napi = napi_by_id(napi_id);
1012 :
1013 0 : return napi ? napi->dev : NULL;
1014 : }
1015 : EXPORT_SYMBOL(dev_get_by_napi_id);
1016 :
1017 : /**
1018 : * netdev_get_name - get a netdevice name, knowing its ifindex.
1019 : * @net: network namespace
1020 : * @name: a pointer to the buffer where the name will be stored.
1021 : * @ifindex: the ifindex of the interface to get the name from.
1022 : */
1023 3 : int netdev_get_name(struct net *net, char *name, int ifindex)
1024 : {
1025 3 : struct net_device *dev;
1026 3 : int ret;
1027 :
1028 3 : down_read(&devnet_rename_sem);
1029 3 : rcu_read_lock();
1030 :
1031 3 : dev = dev_get_by_index_rcu(net, ifindex);
1032 3 : if (!dev) {
1033 0 : ret = -ENODEV;
1034 0 : goto out;
1035 : }
1036 :
1037 3 : strcpy(name, dev->name);
1038 :
1039 3 : ret = 0;
1040 3 : out:
1041 3 : rcu_read_unlock();
1042 3 : up_read(&devnet_rename_sem);
1043 3 : return ret;
1044 : }
1045 :
1046 : /**
1047 : * dev_getbyhwaddr_rcu - find a device by its hardware address
1048 : * @net: the applicable net namespace
1049 : * @type: media type of device
1050 : * @ha: hardware address
1051 : *
1052 : * Search for an interface by MAC address. Returns a pointer to the
1053 : * device, or NULL if it is not found.
1054 : * The caller must hold RCU or RTNL.
1055 : * The returned device has not had its ref count increased
1056 : * and the caller must therefore be careful about locking
1057 : *
1058 : */
1059 :
1060 0 : struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
1061 : const char *ha)
1062 : {
1063 0 : struct net_device *dev;
1064 :
1065 0 : for_each_netdev_rcu(net, dev)
1066 0 : if (dev->type == type &&
1067 0 : !memcmp(dev->dev_addr, ha, dev->addr_len))
1068 0 : return dev;
1069 :
1070 : return NULL;
1071 : }
1072 : EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
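/* An RCU lookup sketch (example_has_hwaddr is hypothetical): the caller
 * supplies the read-side critical section and must not dereference the
 * result outside it without taking its own reference.
 */
static bool example_has_hwaddr(struct net *net, const char *mac)
{
        struct net_device *dev;
        bool found;

        rcu_read_lock();
        dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
        found = dev != NULL;
        rcu_read_unlock();

        return found;
}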
1073 :
1074 0 : struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
1075 : {
1076 0 : struct net_device *dev, *ret = NULL;
1077 :
1078 0 : rcu_read_lock();
1079 0 : for_each_netdev_rcu(net, dev)
1080 0 : if (dev->type == type) {
1081 0 : dev_hold(dev);
1082 0 : ret = dev;
1083 0 : break;
1084 : }
1085 0 : rcu_read_unlock();
1086 0 : return ret;
1087 : }
1088 : EXPORT_SYMBOL(dev_getfirstbyhwtype);
1089 :
1090 : /**
1091 : * __dev_get_by_flags - find any device with given flags
1092 : * @net: the applicable net namespace
1093 : * @if_flags: IFF_* values
1094 : * @mask: bitmask of bits in if_flags to check
1095 : *
1096 : * Search for any interface with the given flags. Returns a pointer to
1097 : * the device, or NULL if none is found. Must be called inside
1098 : * rtnl_lock(), and result refcount is unchanged.
1099 : */
1100 :
1101 0 : struct net_device *__dev_get_by_flags(struct net *net, unsigned short if_flags,
1102 : unsigned short mask)
1103 : {
1104 0 : struct net_device *dev, *ret;
1105 :
1106 0 : ASSERT_RTNL();
1107 :
1108 0 : ret = NULL;
1109 0 : for_each_netdev(net, dev) {
1110 0 : if (((dev->flags ^ if_flags) & mask) == 0) {
1111 : ret = dev;
1112 : break;
1113 : }
1114 : }
1115 0 : return ret;
1116 : }
1117 : EXPORT_SYMBOL(__dev_get_by_flags);
1118 :
1119 : /**
1120 : * dev_valid_name - check if name is okay for network device
1121 : * @name: name string
1122 : *
1123 : * Network device names need to be valid file names to
1124 : * allow sysfs to work. We also disallow any kind of
1125 : * whitespace.
1126 : */
1127 3 : bool dev_valid_name(const char *name)
1128 : {
1129 3 : if (*name == '\0')
1130 : return false;
1131 3 : if (strnlen(name, IFNAMSIZ) == IFNAMSIZ)
1132 : return false;
1133 3 : if (!strcmp(name, ".") || !strcmp(name, ".."))
1134 : return false;
1135 :
1136 15 : while (*name) {
1137 12 : if (*name == '/' || *name == ':' || isspace(*name))
1138 : return false;
1139 12 : name++;
1140 : }
1141 : return true;
1142 : }
1143 : EXPORT_SYMBOL(dev_valid_name);
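/* Illustrative outcomes implied by the checks above (not exhaustive):
 *
 *      dev_valid_name("eth0")   -> true
 *      dev_valid_name("")       -> false (empty)
 *      dev_valid_name("..")     -> false (reserved)
 *      dev_valid_name("a/b")    -> false ('/')
 *      dev_valid_name("eth 0")  -> false (whitespace)
 *      any name of IFNAMSIZ or more characters -> false
 */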
1144 :
1145 : /**
1146 : * __dev_alloc_name - allocate a name for a device
1147 : * @net: network namespace to allocate the device name in
1148 : * @name: name format string
1149 : * @buf: scratch buffer and result name string
1150 : *
1151 : * Passed a format string, e.g. "lt%d", it will try to find a suitable
1152 : * id. It scans the list of devices to build up a free map, then chooses
1153 : * the first empty slot. The caller must hold the dev_base or rtnl lock
1154 : * while allocating the name and adding the device in order to avoid
1155 : * duplicates.
1156 : * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1157 : * Returns the number of the unit assigned or a negative errno code.
1158 : */
1159 :
1160 1 : static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1161 : {
1162 1 : int i = 0;
1163 1 : const char *p;
1164 1 : const int max_netdevices = 8*PAGE_SIZE;
1165 1 : unsigned long *inuse;
1166 1 : struct net_device *d;
1167 :
1168 1 : if (!dev_valid_name(name))
1169 : return -EINVAL;
1170 :
1171 1 : p = strchr(name, '%');
1172 1 : if (p) {
1173 : /*
1174 : * Verify the string as this thing may have come from
1175 : * the user. There must be either one "%d" and no other "%"
1176 : * characters.
1177 : */
1178 1 : if (p[1] != 'd' || strchr(p + 2, '%'))
1179 : return -EINVAL;
1180 :
1181 : /* Use one page as a bit array of possible slots */
1182 1 : inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1183 1 : if (!inuse)
1184 : return -ENOMEM;
1185 :
1186 2 : for_each_netdev(net, d) {
1187 1 : if (!sscanf(d->name, name, &i))
1188 1 : continue;
1189 0 : if (i < 0 || i >= max_netdevices)
1190 0 : continue;
1191 :
1192 : /* avoid cases where sscanf is not exact inverse of printf */
1193 0 : snprintf(buf, IFNAMSIZ, name, i);
1194 0 : if (!strncmp(buf, d->name, IFNAMSIZ))
1195 0 : set_bit(i, inuse);
1196 : }
1197 :
1198 1 : i = find_first_zero_bit(inuse, max_netdevices);
1199 1 : free_page((unsigned long) inuse);
1200 : }
1201 :
1202 1 : snprintf(buf, IFNAMSIZ, name, i);
1203 1 : if (!__dev_get_by_name(net, buf))
1204 1 : return i;
1205 :
1206 : /* It is possible to run out of possible slots
1207 : * when the name is long and there isn't enough space left
1208 : * for the digits, or if all bits are used.
1209 : */
1210 : return -ENFILE;
1211 : }
1212 :
1213 1 : static int dev_alloc_name_ns(struct net *net,
1214 : struct net_device *dev,
1215 : const char *name)
1216 : {
1217 1 : char buf[IFNAMSIZ];
1218 1 : int ret;
1219 :
1220 1 : BUG_ON(!net);
1221 1 : ret = __dev_alloc_name(net, name, buf);
1222 1 : if (ret >= 0)
1223 1 : strlcpy(dev->name, buf, IFNAMSIZ);
1224 1 : return ret;
1225 : }
1226 :
1227 : /**
1228 : * dev_alloc_name - allocate a name for a device
1229 : * @dev: device
1230 : * @name: name format string
1231 : *
1232 : * Passed a format string, e.g. "lt%d", it will try to find a suitable
1233 : * id. It scans the list of devices to build up a free map, then chooses
1234 : * the first empty slot. The caller must hold the dev_base or rtnl lock
1235 : * while allocating the name and adding the device in order to avoid
1236 : * duplicates.
1237 : * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1238 : * Returns the number of the unit assigned or a negative errno code.
1239 : */
1240 :
1241 0 : int dev_alloc_name(struct net_device *dev, const char *name)
1242 : {
1243 0 : return dev_alloc_name_ns(dev_net(dev), dev, name);
1244 : }
1245 : EXPORT_SYMBOL(dev_alloc_name);
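/* A minimal naming sketch (example_pick_name and the "example%d" pattern are
 * hypothetical): the chosen name, e.g. "example0", is written into dev->name
 * and the unit number is returned, or a negative errno such as -ENFILE.
 */
static int example_pick_name(struct net_device *dev)
{
        int unit;

        ASSERT_RTNL();          /* hold rtnl while naming, per the comment above */
        unit = dev_alloc_name(dev, "example%d");
        if (unit < 0)
                return unit;
        netdev_info(dev, "assigned unit %d\n", unit);
        return 0;
}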
1246 :
1247 2 : static int dev_get_valid_name(struct net *net, struct net_device *dev,
1248 : const char *name)
1249 : {
1250 2 : BUG_ON(!net);
1251 :
1252 2 : if (!dev_valid_name(name))
1253 : return -EINVAL;
1254 :
1255 2 : if (strchr(name, '%'))
1256 1 : return dev_alloc_name_ns(net, dev, name);
1257 1 : else if (__dev_get_by_name(net, name))
1258 : return -EEXIST;
1259 1 : else if (dev->name != name)
1260 0 : strlcpy(dev->name, name, IFNAMSIZ);
1261 :
1262 : return 0;
1263 : }
1264 :
1265 : /**
1266 : * dev_change_name - change name of a device
1267 : * @dev: device
1268 : * @newname: name (or format string) must be at least IFNAMSIZ
1269 : *
1270 : * Change the name of a device. A format string such as "eth%d"
1271 : * can be passed for wildcarding.
1272 : */
1273 0 : int dev_change_name(struct net_device *dev, const char *newname)
1274 : {
1275 0 : unsigned char old_assign_type;
1276 0 : char oldname[IFNAMSIZ];
1277 0 : int err = 0;
1278 0 : int ret;
1279 0 : struct net *net;
1280 :
1281 0 : ASSERT_RTNL();
1282 0 : BUG_ON(!dev_net(dev));
1283 :
1284 0 : net = dev_net(dev);
1285 :
1286 : /* Some auto-enslaved devices e.g. failover slaves are
1287 : * special, as userspace might rename the device after
1288 : * the interface had been brought up and running since
1289 : * the point kernel initiated auto-enslavement. Allow
1290 : * live name change even when these slave devices are
1291 : * up and running.
1292 : *
1293 : * Typically, users of these auto-enslaving devices
1294 : * don't actually care about slave name change, as
1295 : * they are supposed to operate on master interface
1296 : * directly.
1297 : */
1298 0 : if (dev->flags & IFF_UP &&
1299 0 : likely(!(dev->priv_flags & IFF_LIVE_RENAME_OK)))
1300 : return -EBUSY;
1301 :
1302 0 : down_write(&devnet_rename_sem);
1303 :
1304 0 : if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1305 0 : up_write(&devnet_rename_sem);
1306 0 : return 0;
1307 : }
1308 :
1309 0 : memcpy(oldname, dev->name, IFNAMSIZ);
1310 :
1311 0 : err = dev_get_valid_name(net, dev, newname);
1312 0 : if (err < 0) {
1313 0 : up_write(&devnet_rename_sem);
1314 0 : return err;
1315 : }
1316 :
1317 0 : if (oldname[0] && !strchr(oldname, '%'))
1318 0 : netdev_info(dev, "renamed from %s\n", oldname);
1319 :
1320 0 : old_assign_type = dev->name_assign_type;
1321 0 : dev->name_assign_type = NET_NAME_RENAMED;
1322 :
1323 0 : rollback:
1324 0 : ret = device_rename(&dev->dev, dev->name);
1325 0 : if (ret) {
1326 0 : memcpy(dev->name, oldname, IFNAMSIZ);
1327 0 : dev->name_assign_type = old_assign_type;
1328 0 : up_write(&devnet_rename_sem);
1329 0 : return ret;
1330 : }
1331 :
1332 0 : up_write(&devnet_rename_sem);
1333 :
1334 0 : netdev_adjacent_rename_links(dev, oldname);
1335 :
1336 0 : write_lock_bh(&dev_base_lock);
1337 0 : netdev_name_node_del(dev->name_node);
1338 0 : write_unlock_bh(&dev_base_lock);
1339 :
1340 0 : synchronize_rcu();
1341 :
1342 0 : write_lock_bh(&dev_base_lock);
1343 0 : netdev_name_node_add(net, dev->name_node);
1344 0 : write_unlock_bh(&dev_base_lock);
1345 :
1346 0 : ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1347 0 : ret = notifier_to_errno(ret);
1348 :
1349 0 : if (ret) {
1350 : /* err >= 0 after dev_alloc_name() or stores the first errno */
1351 0 : if (err >= 0) {
1352 0 : err = ret;
1353 0 : down_write(&devnet_rename_sem);
1354 0 : memcpy(dev->name, oldname, IFNAMSIZ);
1355 0 : memcpy(oldname, newname, IFNAMSIZ);
1356 0 : dev->name_assign_type = old_assign_type;
1357 0 : old_assign_type = NET_NAME_RENAMED;
1358 0 : goto rollback;
1359 : } else {
1360 0 : pr_err("%s: name change rollback failed: %d\n",
1361 : dev->name, ret);
1362 : }
1363 : }
1364 :
1365 : return err;
1366 : }
1367 :
1368 : /**
1369 : * dev_set_alias - change ifalias of a device
1370 : * @dev: device
1371 : * @alias: name up to IFALIASZ
1372 : * @len: limit of bytes to copy from info
1373 : *
1374 : * Set ifalias for a device.
1375 : */
1376 0 : int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1377 : {
1378 0 : struct dev_ifalias *new_alias = NULL;
1379 :
1380 0 : if (len >= IFALIASZ)
1381 : return -EINVAL;
1382 :
1383 0 : if (len) {
1384 0 : new_alias = kmalloc(sizeof(*new_alias) + len + 1, GFP_KERNEL);
1385 0 : if (!new_alias)
1386 : return -ENOMEM;
1387 :
1388 0 : memcpy(new_alias->ifalias, alias, len);
1389 0 : new_alias->ifalias[len] = 0;
1390 : }
1391 :
1392 0 : mutex_lock(&ifalias_mutex);
1393 0 : new_alias = rcu_replace_pointer(dev->ifalias, new_alias,
1394 : mutex_is_locked(&ifalias_mutex));
1395 0 : mutex_unlock(&ifalias_mutex);
1396 :
1397 0 : if (new_alias)
1398 0 : kfree_rcu(new_alias, rcuhead);
1399 :
1400 0 : return len;
1401 : }
1402 : EXPORT_SYMBOL(dev_set_alias);
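/* A minimal alias sketch (example_label_device is hypothetical):
 * dev_set_alias() stores at most IFALIASZ - 1 bytes and returns the stored
 * length, or a negative errno on failure.
 */
static int example_label_device(struct net_device *dev, const char *label)
{
        int ret = dev_set_alias(dev, label, strlen(label));

        return ret < 0 ? ret : 0;
}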
1403 :
1404 : /**
1405 : * dev_get_alias - get ifalias of a device
1406 : * @dev: device
1407 : * @name: buffer to store name of ifalias
1408 : * @len: size of buffer
1409 : *
1410 : * Get ifalias for a device. The caller must make sure dev cannot go
1411 : * away, e.g. by holding the RCU read lock or a reference count on the device.
1412 : */
1413 16 : int dev_get_alias(const struct net_device *dev, char *name, size_t len)
1414 : {
1415 16 : const struct dev_ifalias *alias;
1416 16 : int ret = 0;
1417 :
1418 16 : rcu_read_lock();
1419 16 : alias = rcu_dereference(dev->ifalias);
1420 16 : if (alias)
1421 0 : ret = snprintf(name, len, "%s", alias->ifalias);
1422 16 : rcu_read_unlock();
1423 :
1424 16 : return ret;
1425 : }
1426 :
1427 : /**
1428 : * netdev_features_change - device changes features
1429 : * @dev: device to cause notification
1430 : *
1431 : * Called to indicate a device has changed features.
1432 : */
1433 0 : void netdev_features_change(struct net_device *dev)
1434 : {
1435 0 : call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1436 0 : }
1437 : EXPORT_SYMBOL(netdev_features_change);
1438 :
1439 : /**
1440 : * netdev_state_change - device changes state
1441 : * @dev: device to cause notification
1442 : *
1443 : * Called to indicate a device has changed state. This function calls
1444 : * the notifier chains for netdev_chain and sends a NEWLINK message
1445 : * to the routing socket.
1446 : */
1447 0 : void netdev_state_change(struct net_device *dev)
1448 : {
1449 0 : if (dev->flags & IFF_UP) {
1450 0 : struct netdev_notifier_change_info change_info = {
1451 : .info.dev = dev,
1452 : };
1453 :
1454 0 : call_netdevice_notifiers_info(NETDEV_CHANGE,
1455 : &change_info.info);
1456 0 : rtmsg_ifinfo(RTM_NEWLINK, dev, 0, GFP_KERNEL);
1457 : }
1458 0 : }
1459 : EXPORT_SYMBOL(netdev_state_change);
1460 :
1461 : /**
1462 : * __netdev_notify_peers - notify network peers about existence of @dev,
1463 : * to be called when rtnl lock is already held.
1464 : * @dev: network device
1465 : *
1466 : * Generate traffic such that interested network peers are aware of
1467 : * @dev, such as by generating a gratuitous ARP. This may be used when
1468 : * a device wants to inform the rest of the network about some sort of
1469 : * reconfiguration such as a failover event or virtual machine
1470 : * migration.
1471 : */
1472 0 : void __netdev_notify_peers(struct net_device *dev)
1473 : {
1474 0 : ASSERT_RTNL();
1475 0 : call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1476 0 : call_netdevice_notifiers(NETDEV_RESEND_IGMP, dev);
1477 0 : }
1478 : EXPORT_SYMBOL(__netdev_notify_peers);
1479 :
1480 : /**
1481 : * netdev_notify_peers - notify network peers about existence of @dev
1482 : * @dev: network device
1483 : *
1484 : * Generate traffic such that interested network peers are aware of
1485 : * @dev, such as by generating a gratuitous ARP. This may be used when
1486 : * a device wants to inform the rest of the network about some sort of
1487 : * reconfiguration such as a failover event or virtual machine
1488 : * migration.
1489 : */
1490 0 : void netdev_notify_peers(struct net_device *dev)
1491 : {
1492 0 : rtnl_lock();
1493 0 : __netdev_notify_peers(dev);
1494 0 : rtnl_unlock();
1495 0 : }
1496 : EXPORT_SYMBOL(netdev_notify_peers);
1497 :
1498 : static int napi_threaded_poll(void *data);
1499 :
1500 0 : static int napi_kthread_create(struct napi_struct *n)
1501 : {
1502 0 : int err = 0;
1503 :
1504 : /* Create and wake up the kthread once to put it in
1505 : * TASK_INTERRUPTIBLE mode to avoid the blocked task
1506 : * warning and work with loadavg.
1507 : */
1508 0 : n->thread = kthread_run(napi_threaded_poll, n, "napi/%s-%d",
1509 : n->dev->name, n->napi_id);
1510 0 : if (IS_ERR(n->thread)) {
1511 0 : err = PTR_ERR(n->thread);
1512 0 : pr_err("kthread_run failed with err %d\n", err);
1513 0 : n->thread = NULL;
1514 : }
1515 :
1516 0 : return err;
1517 : }
1518 :
1519 2 : static int __dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1520 : {
1521 2 : const struct net_device_ops *ops = dev->netdev_ops;
1522 2 : int ret;
1523 :
1524 2 : ASSERT_RTNL();
1525 :
1526 2 : if (!netif_device_present(dev)) {
1527 : /* may be detached because parent is runtime-suspended */
1528 0 : if (dev->dev.parent)
1529 0 : pm_runtime_resume(dev->dev.parent);
1530 0 : if (!netif_device_present(dev))
1531 : return -ENODEV;
1532 : }
1533 :
1534 : /* Block netpoll from trying to do any rx path servicing.
1535 : * If we don't do this there is a chance ndo_poll_controller
1536 : * or ndo_poll may be running while we open the device
1537 : */
1538 2 : netpoll_poll_disable(dev);
1539 :
1540 4 : ret = call_netdevice_notifiers_extack(NETDEV_PRE_UP, dev, extack);
1541 2 : ret = notifier_to_errno(ret);
1542 0 : if (ret)
1543 0 : return ret;
1544 :
1545 2 : set_bit(__LINK_STATE_START, &dev->state);
1546 :
1547 2 : if (ops->ndo_validate_addr)
1548 1 : ret = ops->ndo_validate_addr(dev);
1549 :
1550 2 : if (!ret && ops->ndo_open)
1551 1 : ret = ops->ndo_open(dev);
1552 :
1553 2 : netpoll_poll_enable(dev);
1554 :
1555 2 : if (ret)
1556 0 : clear_bit(__LINK_STATE_START, &dev->state);
1557 : else {
1558 2 : dev->flags |= IFF_UP;
1559 2 : dev_set_rx_mode(dev);
1560 2 : dev_activate(dev);
1561 2 : add_device_randomness(dev->dev_addr, dev->addr_len);
1562 : }
1563 :
1564 : return ret;
1565 : }
1566 :
1567 : /**
1568 : * dev_open - prepare an interface for use.
1569 : * @dev: device to open
1570 : * @extack: netlink extended ack
1571 : *
1572 : * Takes a device from down to up state. The device's private open
1573 : * function is invoked and then the multicast lists are loaded. Finally
1574 : * the device is moved into the up state and a %NETDEV_UP message is
1575 : * sent to the netdev notifier chain.
1576 : *
1577 : * Calling this function on an active interface is a nop. On a failure
1578 : * a negative errno code is returned.
1579 : */
1580 0 : int dev_open(struct net_device *dev, struct netlink_ext_ack *extack)
1581 : {
1582 0 : int ret;
1583 :
1584 0 : if (dev->flags & IFF_UP)
1585 : return 0;
1586 :
1587 0 : ret = __dev_open(dev, extack);
1588 0 : if (ret < 0)
1589 : return ret;
1590 :
1591 0 : rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1592 0 : call_netdevice_notifiers(NETDEV_UP, dev);
1593 :
1594 0 : return ret;
1595 : }
1596 : EXPORT_SYMBOL(dev_open);
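/* A minimal bring-up sketch (example_bring_up is hypothetical): dev_open()
 * must run under RTNL, and a NULL extack is acceptable when extended error
 * reporting is not needed.
 */
static int example_bring_up(struct net_device *dev)
{
        int err;

        rtnl_lock();
        err = dev_open(dev, NULL);
        rtnl_unlock();

        return err;
}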
1597 :
1598 0 : static void __dev_close_many(struct list_head *head)
1599 : {
1600 0 : struct net_device *dev;
1601 :
1602 0 : ASSERT_RTNL();
1603 0 : might_sleep();
1604 :
1605 0 : list_for_each_entry(dev, head, close_list) {
1606 : /* Temporarily disable netpoll until the interface is down */
1607 0 : netpoll_poll_disable(dev);
1608 :
1609 0 : call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1610 :
1611 0 : clear_bit(__LINK_STATE_START, &dev->state);
1612 :
1613 : * Synchronize to scheduled poll. We cannot touch the poll list, it
1614 : * can even be on a different cpu. So just clear netif_running().
1615 : *
1616 : * dev->stop() will invoke napi_disable() on all of its
1617 : * napi_struct instances on this device.
1618 : */
1619 0 : smp_mb__after_atomic(); /* Commit netif_running(). */
1620 : }
1621 :
1622 0 : dev_deactivate_many(head);
1623 :
1624 0 : list_for_each_entry(dev, head, close_list) {
1625 0 : const struct net_device_ops *ops = dev->netdev_ops;
1626 :
1627 : /*
1628 : * Call the device specific close. This cannot fail.
1629 : * Only if device is UP
1630 : *
1631 : * We allow it to be called even after a DETACH hot-plug
1632 : * event.
1633 : */
1634 0 : if (ops->ndo_stop)
1635 0 : ops->ndo_stop(dev);
1636 :
1637 0 : dev->flags &= ~IFF_UP;
1638 0 : netpoll_poll_enable(dev);
1639 : }
1640 0 : }
1641 :
1642 0 : static void __dev_close(struct net_device *dev)
1643 : {
1644 0 : LIST_HEAD(single);
1645 :
1646 0 : list_add(&dev->close_list, &single);
1647 0 : __dev_close_many(&single);
1648 0 : list_del(&single);
1649 0 : }
1650 :
1651 0 : void dev_close_many(struct list_head *head, bool unlink)
1652 : {
1653 0 : struct net_device *dev, *tmp;
1654 :
1655 : /* Remove the devices that don't need to be closed */
1656 0 : list_for_each_entry_safe(dev, tmp, head, close_list)
1657 0 : if (!(dev->flags & IFF_UP))
1658 0 : list_del_init(&dev->close_list);
1659 :
1660 0 : __dev_close_many(head);
1661 :
1662 0 : list_for_each_entry_safe(dev, tmp, head, close_list) {
1663 0 : rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING, GFP_KERNEL);
1664 0 : call_netdevice_notifiers(NETDEV_DOWN, dev);
1665 0 : if (unlink)
1666 0 : list_del_init(&dev->close_list);
1667 : }
1668 0 : }
1669 : EXPORT_SYMBOL(dev_close_many);
1670 :
1671 : /**
1672 : * dev_close - shutdown an interface.
1673 : * @dev: device to shutdown
1674 : *
1675 : * This function moves an active device into down state. A
1676 : * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1677 : * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1678 : * chain.
1679 : */
1680 0 : void dev_close(struct net_device *dev)
1681 : {
1682 0 : if (dev->flags & IFF_UP) {
1683 0 : LIST_HEAD(single);
1684 :
1685 0 : list_add(&dev->close_list, &single);
1686 0 : dev_close_many(&single, true);
1687 0 : list_del(&single);
1688 : }
1689 0 : }
1690 : EXPORT_SYMBOL(dev_close);
1691 :
1692 :
1693 : /**
1694 : * dev_disable_lro - disable Large Receive Offload on a device
1695 : * @dev: device
1696 : *
1697 : * Disable Large Receive Offload (LRO) on a net device. Must be
1698 : * called under RTNL. This is needed if received packets may be
1699 : * forwarded to another interface.
1700 : */
1701 0 : void dev_disable_lro(struct net_device *dev)
1702 : {
1703 0 : struct net_device *lower_dev;
1704 0 : struct list_head *iter;
1705 :
1706 0 : dev->wanted_features &= ~NETIF_F_LRO;
1707 0 : netdev_update_features(dev);
1708 :
1709 0 : if (unlikely(dev->features & NETIF_F_LRO))
1710 0 : netdev_WARN(dev, "failed to disable LRO!\n");
1711 :
1712 0 : netdev_for_each_lower_dev(dev, lower_dev, iter)
1713 0 : dev_disable_lro(lower_dev);
1714 0 : }
1715 : EXPORT_SYMBOL(dev_disable_lro);
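/* A forwarding setup sketch (example_enable_forwarding is hypothetical):
 * callers that will forward packets received on @dev disable LRO first,
 * under RTNL, as required by the comment above.
 */
static void example_enable_forwarding(struct net_device *dev)
{
        ASSERT_RTNL();
        dev_disable_lro(dev);
        /* ... attach dev to the forwarding path ... */
}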
1716 :
1717 : /**
1718 : * dev_disable_gro_hw - disable HW Generic Receive Offload on a device
1719 : * @dev: device
1720 : *
1721 : * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be
1722 : * called under RTNL. This is needed if Generic XDP is installed on
1723 : * the device.
1724 : */
1725 0 : static void dev_disable_gro_hw(struct net_device *dev)
1726 : {
1727 0 : dev->wanted_features &= ~NETIF_F_GRO_HW;
1728 0 : netdev_update_features(dev);
1729 :
1730 0 : if (unlikely(dev->features & NETIF_F_GRO_HW))
1731 0 : netdev_WARN(dev, "failed to disable GRO_HW!\n");
1732 0 : }
1733 :
1734 0 : const char *netdev_cmd_to_name(enum netdev_cmd cmd)
1735 : {
1736 : #define N(val) \
1737 : case NETDEV_##val: \
1738 : return "NETDEV_" __stringify(val);
1739 0 : switch (cmd) {
1740 : N(UP) N(DOWN) N(REBOOT) N(CHANGE) N(REGISTER) N(UNREGISTER)
1741 : N(CHANGEMTU) N(CHANGEADDR) N(GOING_DOWN) N(CHANGENAME) N(FEAT_CHANGE)
1742 : N(BONDING_FAILOVER) N(PRE_UP) N(PRE_TYPE_CHANGE) N(POST_TYPE_CHANGE)
1743 : N(POST_INIT) N(RELEASE) N(NOTIFY_PEERS) N(JOIN) N(CHANGEUPPER)
1744 : N(RESEND_IGMP) N(PRECHANGEMTU) N(CHANGEINFODATA) N(BONDING_INFO)
1745 : N(PRECHANGEUPPER) N(CHANGELOWERSTATE) N(UDP_TUNNEL_PUSH_INFO)
1746 : N(UDP_TUNNEL_DROP_INFO) N(CHANGE_TX_QUEUE_LEN)
1747 : N(CVLAN_FILTER_PUSH_INFO) N(CVLAN_FILTER_DROP_INFO)
1748 : N(SVLAN_FILTER_PUSH_INFO) N(SVLAN_FILTER_DROP_INFO)
1749 : N(PRE_CHANGEADDR)
1750 : }
1751 : #undef N
1752 : return "UNKNOWN_NETDEV_EVENT";
1753 : }
1754 : EXPORT_SYMBOL_GPL(netdev_cmd_to_name);
1755 :
1756 8 : static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val,
1757 : struct net_device *dev)
1758 : {
1759 8 : struct netdev_notifier_info info = {
1760 : .dev = dev,
1761 : };
1762 :
1763 8 : return nb->notifier_call(nb, val, &info);
1764 : }
1765 :
1766 8 : static int call_netdevice_register_notifiers(struct notifier_block *nb,
1767 : struct net_device *dev)
1768 : {
1769 8 : int err;
1770 :
1771 8 : err = call_netdevice_notifier(nb, NETDEV_REGISTER, dev);
1772 8 : err = notifier_to_errno(err);
1773 0 : if (err)
1774 0 : return err;
1775 :
1776 8 : if (!(dev->flags & IFF_UP))
1777 : return 0;
1778 :
1779 0 : call_netdevice_notifier(nb, NETDEV_UP, dev);
1780 0 : return 0;
1781 : }
1782 :
1783 0 : static void call_netdevice_unregister_notifiers(struct notifier_block *nb,
1784 : struct net_device *dev)
1785 : {
1786 0 : if (dev->flags & IFF_UP) {
1787 0 : call_netdevice_notifier(nb, NETDEV_GOING_DOWN,
1788 : dev);
1789 0 : call_netdevice_notifier(nb, NETDEV_DOWN, dev);
1790 : }
1791 0 : call_netdevice_notifier(nb, NETDEV_UNREGISTER, dev);
1792 0 : }
1793 :
1794 6 : static int call_netdevice_register_net_notifiers(struct notifier_block *nb,
1795 : struct net *net)
1796 : {
1797 6 : struct net_device *dev;
1798 6 : int err;
1799 :
1800 14 : for_each_netdev(net, dev) {
1801 8 : err = call_netdevice_register_notifiers(nb, dev);
1802 8 : if (err)
1803 0 : goto rollback;
1804 : }
1805 : return 0;
1806 :
1807 0 : rollback:
1808 0 : for_each_netdev_continue_reverse(net, dev)
1809 0 : call_netdevice_unregister_notifiers(nb, dev);
1810 : return err;
1811 : }
1812 :
1813 0 : static void call_netdevice_unregister_net_notifiers(struct notifier_block *nb,
1814 : struct net *net)
1815 : {
1816 0 : struct net_device *dev;
1817 :
1818 0 : for_each_netdev(net, dev)
1819 0 : call_netdevice_unregister_notifiers(nb, dev);
1820 0 : }
1821 :
1822 : static int dev_boot_phase = 1;
1823 :
1824 : /**
1825 : * register_netdevice_notifier - register a network notifier block
1826 : * @nb: notifier
1827 : *
1828 : * Register a notifier to be called when network device events occur.
1829 : * The notifier passed is linked into the kernel structures and must
1830 : * not be reused until it has been unregistered. A negative errno code
1831 : * is returned on a failure.
1832 : *
1833             :  * When registered, all registration and up events are replayed
1834             :  * to the new notifier to allow the device to have a race-free
1835             :  * view of the network device list.
1836 : */
1837 :
1838 7 : int register_netdevice_notifier(struct notifier_block *nb)
1839 : {
1840 7 : struct net *net;
1841 7 : int err;
1842 :
1843 : /* Close race with setup_net() and cleanup_net() */
1844 7 : down_write(&pernet_ops_rwsem);
1845 7 : rtnl_lock();
1846 7 : err = raw_notifier_chain_register(&netdev_chain, nb);
1847 7 : if (err)
1848 0 : goto unlock;
1849 7 : if (dev_boot_phase)
1850 1 : goto unlock;
1851 12 : for_each_net(net) {
1852 6 : err = call_netdevice_register_net_notifiers(nb, net);
1853 6 : if (err)
1854 0 : goto rollback;
1855 : }
1856 :
1857 6 : unlock:
1858 7 : rtnl_unlock();
1859 7 : up_write(&pernet_ops_rwsem);
1860 7 : return err;
1861 :
1862 0 : rollback:
1863 0 : for_each_net_continue_reverse(net)
1864 0 : call_netdevice_unregister_net_notifiers(nb, net);
1865 :
1866 0 : raw_notifier_chain_unregister(&netdev_chain, nb);
1867 0 : goto unlock;
1868 : }
1869 : EXPORT_SYMBOL(register_netdevice_notifier);
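
/*
 * Minimal notifier usage sketch.  The handler and block names are
 * illustrative; NETDEV_REGISTER/NETDEV_UP replay for already-existing
 * devices happens automatically at registration time, as described above.
 */
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	switch (event) {
	case NETDEV_UP:
		netdev_info(dev, "is up\n");
		break;
	case NETDEV_GOING_DOWN:
		netdev_info(dev, "is going down\n");
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_nb = {
	.notifier_call = example_netdev_event,
};

/* From module init: register_netdevice_notifier(&example_netdev_nb); */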
1870 :
1871 : /**
1872 : * unregister_netdevice_notifier - unregister a network notifier block
1873 : * @nb: notifier
1874 : *
1875 : * Unregister a notifier previously registered by
1876             :  * register_netdevice_notifier(). The notifier is unlinked from the
1877             :  * kernel structures and may then be reused. A negative errno code
1878             :  * is returned on a failure.
1879             :  *
1880             :  * After unregistering, unregister and down device events are synthesized
1881             :  * for all devices on the device list for the removed notifier, removing
1882             :  * the need for special-case cleanup code.
1883 : */
1884 :
1885 0 : int unregister_netdevice_notifier(struct notifier_block *nb)
1886 : {
1887 0 : struct net *net;
1888 0 : int err;
1889 :
1890 : /* Close race with setup_net() and cleanup_net() */
1891 0 : down_write(&pernet_ops_rwsem);
1892 0 : rtnl_lock();
1893 0 : err = raw_notifier_chain_unregister(&netdev_chain, nb);
1894 0 : if (err)
1895 0 : goto unlock;
1896 :
1897 0 : for_each_net(net)
1898 0 : call_netdevice_unregister_net_notifiers(nb, net);
1899 :
1900 0 : unlock:
1901 0 : rtnl_unlock();
1902 0 : up_write(&pernet_ops_rwsem);
1903 0 : return err;
1904 : }
1905 : EXPORT_SYMBOL(unregister_netdevice_notifier);
1906 :
1907 0 : static int __register_netdevice_notifier_net(struct net *net,
1908 : struct notifier_block *nb,
1909 : bool ignore_call_fail)
1910 : {
1911 0 : int err;
1912 :
1913 0 : err = raw_notifier_chain_register(&net->netdev_chain, nb);
1914 0 : if (err)
1915 : return err;
1916 0 : if (dev_boot_phase)
1917 : return 0;
1918 :
1919 0 : err = call_netdevice_register_net_notifiers(nb, net);
1920 0 : if (err && !ignore_call_fail)
1921 0 : goto chain_unregister;
1922 :
1923 : return 0;
1924 :
1925 0 : chain_unregister:
1926 0 : raw_notifier_chain_unregister(&net->netdev_chain, nb);
1927 0 : return err;
1928 : }
1929 :
1930 0 : static int __unregister_netdevice_notifier_net(struct net *net,
1931 : struct notifier_block *nb)
1932 : {
1933 0 : int err;
1934 :
1935 0 : err = raw_notifier_chain_unregister(&net->netdev_chain, nb);
1936 0 : if (err)
1937 : return err;
1938 :
1939 0 : call_netdevice_unregister_net_notifiers(nb, net);
1940 0 : return 0;
1941 : }
1942 :
1943 : /**
1944 : * register_netdevice_notifier_net - register a per-netns network notifier block
1945 : * @net: network namespace
1946 : * @nb: notifier
1947 : *
1948 : * Register a notifier to be called when network device events occur.
1949 : * The notifier passed is linked into the kernel structures and must
1950 : * not be reused until it has been unregistered. A negative errno code
1951 : * is returned on a failure.
1952             :  * When registered, all registration and up events are replayed
1953             :  * to the new notifier to allow the device to have a race-free
1954             :  * view of the network device list.
1955 : * view of the network device list.
1956 : */
1957 :
1958 0 : int register_netdevice_notifier_net(struct net *net, struct notifier_block *nb)
1959 : {
1960 0 : int err;
1961 :
1962 0 : rtnl_lock();
1963 0 : err = __register_netdevice_notifier_net(net, nb, false);
1964 0 : rtnl_unlock();
1965 0 : return err;
1966 : }
1967 : EXPORT_SYMBOL(register_netdevice_notifier_net);
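
/*
 * Per-netns variant, sketched with the same illustrative notifier
 * block as above; only devices in the given namespace (init_net here,
 * as an example) generate events for this notifier.
 */
static int example_register_in_init_net(void)
{
	return register_netdevice_notifier_net(&init_net, &example_netdev_nb);
}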
1968 :
1969 : /**
1970 : * unregister_netdevice_notifier_net - unregister a per-netns
1971 : * network notifier block
1972 : * @net: network namespace
1973 : * @nb: notifier
1974 : *
1975 : * Unregister a notifier previously registered by
1976             :  * register_netdevice_notifier_net(). The notifier is unlinked from the
1977             :  * kernel structures and may then be reused. A negative errno code
1978             :  * is returned on a failure.
1979             :  *
1980             :  * After unregistering, unregister and down device events are synthesized
1981             :  * for all devices on the device list for the removed notifier, removing
1982             :  * the need for special-case cleanup code.
1983 : */
1984 :
1985 0 : int unregister_netdevice_notifier_net(struct net *net,
1986 : struct notifier_block *nb)
1987 : {
1988 0 : int err;
1989 :
1990 0 : rtnl_lock();
1991 0 : err = __unregister_netdevice_notifier_net(net, nb);
1992 0 : rtnl_unlock();
1993 0 : return err;
1994 : }
1995 : EXPORT_SYMBOL(unregister_netdevice_notifier_net);
1996 :
1997 0 : int register_netdevice_notifier_dev_net(struct net_device *dev,
1998 : struct notifier_block *nb,
1999 : struct netdev_net_notifier *nn)
2000 : {
2001 0 : int err;
2002 :
2003 0 : rtnl_lock();
2004 0 : err = __register_netdevice_notifier_net(dev_net(dev), nb, false);
2005 0 : if (!err) {
2006 0 : nn->nb = nb;
2007 0 : list_add(&nn->list, &dev->net_notifier_list);
2008 : }
2009 0 : rtnl_unlock();
2010 0 : return err;
2011 : }
2012 : EXPORT_SYMBOL(register_netdevice_notifier_dev_net);
2013 :
2014 0 : int unregister_netdevice_notifier_dev_net(struct net_device *dev,
2015 : struct notifier_block *nb,
2016 : struct netdev_net_notifier *nn)
2017 : {
2018 0 : int err;
2019 :
2020 0 : rtnl_lock();
2021 0 : list_del(&nn->list);
2022 0 : err = __unregister_netdevice_notifier_net(dev_net(dev), nb);
2023 0 : rtnl_unlock();
2024 0 : return err;
2025 : }
2026 : EXPORT_SYMBOL(unregister_netdevice_notifier_dev_net);
2027 :
2028 : static void move_netdevice_notifiers_dev_net(struct net_device *dev,
2029 : struct net *net)
2030 : {
2031 : struct netdev_net_notifier *nn;
2032 :
2033 : list_for_each_entry(nn, &dev->net_notifier_list, list) {
2034 : __unregister_netdevice_notifier_net(dev_net(dev), nn->nb);
2035 : __register_netdevice_notifier_net(net, nn->nb, true);
2036 : }
2037 : }
2038 :
2039 : /**
2040 : * call_netdevice_notifiers_info - call all network notifier blocks
2041 : * @val: value passed unmodified to notifier function
2042 : * @info: notifier information data
2043 : *
2044 : * Call all network notifier blocks. Parameters and return value
2045 : * are as for raw_notifier_call_chain().
2046 : */
2047 :
2048 8 : static int call_netdevice_notifiers_info(unsigned long val,
2049 : struct netdev_notifier_info *info)
2050 : {
2051 8 : struct net *net = dev_net(info->dev);
2052 8 : int ret;
2053 :
2054 8 : ASSERT_RTNL();
2055 :
2056 : /* Run per-netns notifier block chain first, then run the global one.
2057             : 	 * Hopefully, one day, the global chain will be removed once all
2058             : 	 * notifier block registrants have been converted to be per-netns.
2059 : */
2060 8 : ret = raw_notifier_call_chain(&net->netdev_chain, val, info);
2061 8 : if (ret & NOTIFY_STOP_MASK)
2062 : return ret;
2063 8 : return raw_notifier_call_chain(&netdev_chain, val, info);
2064 : }
2065 :
2066 8 : static int call_netdevice_notifiers_extack(unsigned long val,
2067 : struct net_device *dev,
2068 : struct netlink_ext_ack *extack)
2069 : {
2070 8 : struct netdev_notifier_info info = {
2071 : .dev = dev,
2072 : .extack = extack,
2073 : };
2074 :
2075 2 : return call_netdevice_notifiers_info(val, &info);
2076 : }
2077 :
2078 : /**
2079 : * call_netdevice_notifiers - call all network notifier blocks
2080 : * @val: value passed unmodified to notifier function
2081 : * @dev: net_device pointer passed unmodified to notifier function
2082 : *
2083 : * Call all network notifier blocks. Parameters and return value
2084 : * are as for raw_notifier_call_chain().
2085 : */
2086 :
2087 6 : int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
2088 : {
2089 2 : return call_netdevice_notifiers_extack(val, dev, NULL);
2090 : }
2091 : EXPORT_SYMBOL(call_netdevice_notifiers);
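
/*
 * Usage sketch: core code (and a few drivers via wrappers) announce a
 * state change on @dev to all listeners.  Callers hold the RTNL; the
 * wrapper name is illustrative.
 */
static void example_announce_change(struct net_device *dev)
{
	ASSERT_RTNL();
	call_netdevice_notifiers(NETDEV_CHANGE, dev);
}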
2092 :
2093 : /**
2094 : * call_netdevice_notifiers_mtu - call all network notifier blocks
2095 : * @val: value passed unmodified to notifier function
2096 : * @dev: net_device pointer passed unmodified to notifier function
2097 : * @arg: additional u32 argument passed to the notifier function
2098 : *
2099 : * Call all network notifier blocks. Parameters and return value
2100 : * are as for raw_notifier_call_chain().
2101 : */
2102 0 : static int call_netdevice_notifiers_mtu(unsigned long val,
2103 : struct net_device *dev, u32 arg)
2104 : {
2105 0 : struct netdev_notifier_info_ext info = {
2106 : .info.dev = dev,
2107 : .ext.mtu = arg,
2108 : };
2109 :
2110 0 : BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0);
2111 :
2112 0 : return call_netdevice_notifiers_info(val, &info.info);
2113 : }
2114 :
2115 : #ifdef CONFIG_NET_INGRESS
2116 : static DEFINE_STATIC_KEY_FALSE(ingress_needed_key);
2117 :
2118 : void net_inc_ingress_queue(void)
2119 : {
2120 : static_branch_inc(&ingress_needed_key);
2121 : }
2122 : EXPORT_SYMBOL_GPL(net_inc_ingress_queue);
2123 :
2124 : void net_dec_ingress_queue(void)
2125 : {
2126 : static_branch_dec(&ingress_needed_key);
2127 : }
2128 : EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
2129 : #endif
2130 :
2131 : #ifdef CONFIG_NET_EGRESS
2132 : static DEFINE_STATIC_KEY_FALSE(egress_needed_key);
2133 :
2134 : void net_inc_egress_queue(void)
2135 : {
2136 : static_branch_inc(&egress_needed_key);
2137 : }
2138 : EXPORT_SYMBOL_GPL(net_inc_egress_queue);
2139 :
2140 : void net_dec_egress_queue(void)
2141 : {
2142 : static_branch_dec(&egress_needed_key);
2143 : }
2144 : EXPORT_SYMBOL_GPL(net_dec_egress_queue);
2145 : #endif
2146 :
2147 : static DEFINE_STATIC_KEY_FALSE(netstamp_needed_key);
2148 : #ifdef CONFIG_JUMP_LABEL
2149 : static atomic_t netstamp_needed_deferred;
2150 : static atomic_t netstamp_wanted;
2151 : static void netstamp_clear(struct work_struct *work)
2152 : {
2153 : int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
2154 : int wanted;
2155 :
2156 : wanted = atomic_add_return(deferred, &netstamp_wanted);
2157 : if (wanted > 0)
2158 : static_branch_enable(&netstamp_needed_key);
2159 : else
2160 : static_branch_disable(&netstamp_needed_key);
2161 : }
2162 : static DECLARE_WORK(netstamp_work, netstamp_clear);
2163 : #endif
2164 :
2165 0 : void net_enable_timestamp(void)
2166 : {
2167 : #ifdef CONFIG_JUMP_LABEL
2168 : int wanted;
2169 :
2170 : while (1) {
2171 : wanted = atomic_read(&netstamp_wanted);
2172 : if (wanted <= 0)
2173 : break;
2174 : if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
2175 : return;
2176 : }
2177 : atomic_inc(&netstamp_needed_deferred);
2178 : schedule_work(&netstamp_work);
2179 : #else
2180 0 : static_branch_inc(&netstamp_needed_key);
2181 : #endif
2182 0 : }
2183 : EXPORT_SYMBOL(net_enable_timestamp);
2184 :
2185 0 : void net_disable_timestamp(void)
2186 : {
2187 : #ifdef CONFIG_JUMP_LABEL
2188 : int wanted;
2189 :
2190 : while (1) {
2191 : wanted = atomic_read(&netstamp_wanted);
2192 : if (wanted <= 1)
2193 : break;
2194 : if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
2195 : return;
2196 : }
2197 : atomic_dec(&netstamp_needed_deferred);
2198 : schedule_work(&netstamp_work);
2199 : #else
2200 0 : static_branch_dec(&netstamp_needed_key);
2201 : #endif
2202 0 : }
2203 : EXPORT_SYMBOL(net_disable_timestamp);
2204 :
2205 446 : static inline void net_timestamp_set(struct sk_buff *skb)
2206 : {
2207 446 : skb->tstamp = 0;
2208 446 : if (static_branch_unlikely(&netstamp_needed_key))
2209 0 : __net_timestamp(skb);
2210 446 : }
2211 :
2212 : #define net_timestamp_check(COND, SKB) \
2213 : if (static_branch_unlikely(&netstamp_needed_key)) { \
2214 : if ((COND) && !(SKB)->tstamp) \
2215 : __net_timestamp(SKB); \
2216 : } \
2217 :
2218 0 : bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb)
2219 : {
2220 0 : return __is_skb_forwardable(dev, skb, true);
2221 : }
2222 : EXPORT_SYMBOL_GPL(is_skb_forwardable);
2223 :
2224 0 : static int __dev_forward_skb2(struct net_device *dev, struct sk_buff *skb,
2225 : bool check_mtu)
2226 : {
2227 0 : int ret = ____dev_forward_skb(dev, skb, check_mtu);
2228 :
2229 0 : if (likely(!ret)) {
2230 0 : skb->protocol = eth_type_trans(skb, dev);
2231 0 : skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
2232 : }
2233 :
2234 0 : return ret;
2235 : }
2236 :
2237 0 : int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2238 : {
2239 0 : return __dev_forward_skb2(dev, skb, true);
2240 : }
2241 : EXPORT_SYMBOL_GPL(__dev_forward_skb);
2242 :
2243 : /**
2244 : * dev_forward_skb - loopback an skb to another netif
2245 : *
2246 : * @dev: destination network device
2247 : * @skb: buffer to forward
2248 : *
2249 : * return values:
2250 : * NET_RX_SUCCESS (no congestion)
2251 : * NET_RX_DROP (packet was dropped, but freed)
2252 : *
2253 : * dev_forward_skb can be used for injecting an skb from the
2254 : * start_xmit function of one device into the receive queue
2255 : * of another device.
2256 : *
2257 : * The receiving device may be in another namespace, so
2258 : * we have to clear all information in the skb that could
2259 : * impact namespace isolation.
2260 : */
2261 0 : int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
2262 : {
2263 0 : return __dev_forward_skb(dev, skb) ?: netif_rx_internal(skb);
2264 : }
2265 : EXPORT_SYMBOL_GPL(dev_forward_skb);
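
/*
 * Sketch of the typical caller, e.g. a veth-style driver looping a
 * transmitted skb into its peer's receive path.  The helper name and
 * the simplified stats accounting are illustrative only.
 */
static void example_loop_to_peer(struct sk_buff *skb, struct net_device *peer)
{
	/* On NET_RX_DROP the skb has already been freed for us. */
	if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
		peer->stats.rx_packets++;
}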
2266 :
2267 0 : int dev_forward_skb_nomtu(struct net_device *dev, struct sk_buff *skb)
2268 : {
2269 0 : return __dev_forward_skb2(dev, skb, false) ?: netif_rx_internal(skb);
2270 : }
2271 :
2272 456 : static inline int deliver_skb(struct sk_buff *skb,
2273 : struct packet_type *pt_prev,
2274 : struct net_device *orig_dev)
2275 : {
2276 456 : if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
2277 : return -ENOMEM;
2278 456 : refcount_inc(&skb->users);
2279 456 : return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2280 : }
2281 :
2282 912 : static inline void deliver_ptype_list_skb(struct sk_buff *skb,
2283 : struct packet_type **pt,
2284 : struct net_device *orig_dev,
2285 : __be16 type,
2286 : struct list_head *ptype_list)
2287 : {
2288 912 : struct packet_type *ptype, *pt_prev = *pt;
2289 :
2290 1368 : list_for_each_entry_rcu(ptype, ptype_list, list) {
2291 456 : if (ptype->type != type)
2292 0 : continue;
2293 456 : if (pt_prev)
2294 456 : deliver_skb(skb, pt_prev, orig_dev);
2295 : pt_prev = ptype;
2296 : }
2297 912 : *pt = pt_prev;
2298 912 : }
2299 :
2300 448 : static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
2301 : {
2302 448 : if (!ptype->af_packet_priv || !skb->sk)
2303 : return false;
2304 :
2305 445 : if (ptype->id_match)
2306 0 : return ptype->id_match(ptype, skb->sk);
2307 445 : else if ((struct sock *)ptype->af_packet_priv == skb->sk)
2308 2 : return true;
2309 :
2310 : return false;
2311 : }
2312 :
2313 : /**
2314 : * dev_nit_active - return true if any network interface taps are in use
2315 : *
2316 : * @dev: network device to check for the presence of taps
2317 : */
2318 448 : bool dev_nit_active(struct net_device *dev)
2319 : {
2320 448 : return !list_empty(&ptype_all) || !list_empty(&dev->ptype_all);
2321 : }
2322 : EXPORT_SYMBOL_GPL(dev_nit_active);
2323 :
2324 : /*
2325 : * Support routine. Sends outgoing frames to any network
2326 : * taps currently in use.
2327 : */
2328 :
2329 448 : void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
2330 : {
2331 448 : struct packet_type *ptype;
2332 448 : struct sk_buff *skb2 = NULL;
2333 448 : struct packet_type *pt_prev = NULL;
2334 448 : struct list_head *ptype_list = &ptype_all;
2335 :
2336 448 : rcu_read_lock();
2337 448 : again:
2338 1344 : list_for_each_entry_rcu(ptype, ptype_list, list) {
2339 448 : if (ptype->ignore_outgoing)
2340 0 : continue;
2341 :
2342 : /* Never send packets back to the socket
2343 : * they originated from - MvS (miquels@drinkel.ow.org)
2344 : */
2345 448 : if (skb_loop_sk(ptype, skb))
2346 2 : continue;
2347 :
2348 446 : if (pt_prev) {
2349 0 : deliver_skb(skb2, pt_prev, skb->dev);
2350 0 : pt_prev = ptype;
2351 0 : continue;
2352 : }
2353 :
2354 : /* need to clone skb, done only once */
2355 446 : skb2 = skb_clone(skb, GFP_ATOMIC);
2356 446 : if (!skb2)
2357 0 : goto out_unlock;
2358 :
2359 446 : net_timestamp_set(skb2);
2360 :
2361             : 		/* The network header should already be correctly set by the
2362             : 		 * sender, so the check below is just protection against
2363             : 		 * buggy protocols.
2364 : */
2365 446 : skb_reset_mac_header(skb2);
2366 :
2367 446 : if (skb_network_header(skb2) < skb2->data ||
2368 446 : skb_network_header(skb2) > skb_tail_pointer(skb2)) {
2369 0 : net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
2370 : ntohs(skb2->protocol),
2371 : dev->name);
2372 0 : skb_reset_network_header(skb2);
2373 : }
2374 :
2375 446 : skb2->transport_header = skb2->network_header;
2376 446 : skb2->pkt_type = PACKET_OUTGOING;
2377 446 : pt_prev = ptype;
2378 : }
2379 :
2380 896 : if (ptype_list == &ptype_all) {
2381 448 : ptype_list = &dev->ptype_all;
2382 448 : goto again;
2383 : }
2384 448 : out_unlock:
2385 448 : if (pt_prev) {
2386 446 : if (!skb_orphan_frags_rx(skb2, GFP_ATOMIC))
2387 446 : pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
2388 : else
2389 0 : kfree_skb(skb2);
2390 : }
2391 448 : rcu_read_unlock();
2392 448 : }
2393 : EXPORT_SYMBOL_GPL(dev_queue_xmit_nit);
2394 :
2395 : /**
2396 : * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
2397 : * @dev: Network device
2398 : * @txq: number of queues available
2399 : *
2400 : * If real_num_tx_queues is changed the tc mappings may no longer be
2401             :  * valid. To resolve this, verify that each tc mapping remains valid and,
2402             :  * if not, zero the mapping. With no priorities mapping to this
2403             :  * offset/count pair it will no longer be used. In the worst case, if TC0
2404             :  * is invalid, nothing can be done, so priority mappings are disabled. It
2405             :  * is expected that drivers will fix this mapping, if they can, before
2406             :  * calling netif_set_real_num_tx_queues.
2407 : */
2408 0 : static void netif_setup_tc(struct net_device *dev, unsigned int txq)
2409 : {
2410 0 : int i;
2411 0 : struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2412 :
2413 : /* If TC0 is invalidated disable TC mapping */
2414 0 : if (tc->offset + tc->count > txq) {
2415 0 : pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
2416 0 : dev->num_tc = 0;
2417 0 : return;
2418 : }
2419 :
2420 : /* Invalidated prio to tc mappings set to TC0 */
2421 0 : for (i = 1; i < TC_BITMASK + 1; i++) {
2422 0 : int q = netdev_get_prio_tc_map(dev, i);
2423 :
2424 0 : tc = &dev->tc_to_txq[q];
2425 0 : if (tc->offset + tc->count > txq) {
2426 0 : pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
2427 : i, q);
2428 0 : netdev_set_prio_tc_map(dev, i, 0);
2429 : }
2430 : }
2431 : }
2432 :
2433 0 : int netdev_txq_to_tc(struct net_device *dev, unsigned int txq)
2434 : {
2435 0 : if (dev->num_tc) {
2436 0 : struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
2437 0 : int i;
2438 :
2439 : /* walk through the TCs and see if it falls into any of them */
2440 0 : for (i = 0; i < TC_MAX_QUEUE; i++, tc++) {
2441 0 : if ((txq - tc->offset) < tc->count)
2442 0 : return i;
2443 : }
2444 :
2445 : /* didn't find it, just return -1 to indicate no match */
2446 : return -1;
2447 : }
2448 :
2449 : return 0;
2450 : }
2451 : EXPORT_SYMBOL(netdev_txq_to_tc);
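
/*
 * Sketch of how a driver populates the tc_to_txq and prio_tc_map tables
 * that netdev_txq_to_tc()/netif_setup_tc() consult: two traffic classes
 * on an eight-queue device, split 4/4.  The counts and the wrapper name
 * are illustrative; the caller holds the RTNL.
 */
static void example_setup_two_tcs(struct net_device *dev)
{
	netdev_set_num_tc(dev, 2);
	netdev_set_tc_queue(dev, 0, 4, 0);	/* TC0 -> queues 0-3 */
	netdev_set_tc_queue(dev, 1, 4, 4);	/* TC1 -> queues 4-7 */
	netdev_set_prio_tc_map(dev, 0, 0);	/* priority 0 -> TC0 */
	netdev_set_prio_tc_map(dev, 1, 1);	/* priority 1 -> TC1 */
}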
2452 :
2453 : #ifdef CONFIG_XPS
2454 : struct static_key xps_needed __read_mostly;
2455 : EXPORT_SYMBOL(xps_needed);
2456 : struct static_key xps_rxqs_needed __read_mostly;
2457 : EXPORT_SYMBOL(xps_rxqs_needed);
2458 : static DEFINE_MUTEX(xps_map_mutex);
2459 : #define xmap_dereference(P) \
2460 : rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
2461 :
2462 0 : static bool remove_xps_queue(struct xps_dev_maps *dev_maps,
2463 : int tci, u16 index)
2464 : {
2465 0 : struct xps_map *map = NULL;
2466 0 : int pos;
2467 :
2468 0 : if (dev_maps)
2469 0 : map = xmap_dereference(dev_maps->attr_map[tci]);
2470 0 : if (!map)
2471 0 : return false;
2472 :
2473 0 : for (pos = map->len; pos--;) {
2474 0 : if (map->queues[pos] != index)
2475 0 : continue;
2476 :
2477 0 : if (map->len > 1) {
2478 0 : map->queues[pos] = map->queues[--map->len];
2479 0 : break;
2480 : }
2481 :
2482 0 : RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL);
2483 0 : kfree_rcu(map, rcu);
2484 0 : return false;
2485 : }
2486 :
2487 : return true;
2488 : }
2489 :
2490 0 : static bool remove_xps_queue_cpu(struct net_device *dev,
2491 : struct xps_dev_maps *dev_maps,
2492 : int cpu, u16 offset, u16 count)
2493 : {
2494 0 : int num_tc = dev->num_tc ? : 1;
2495 0 : bool active = false;
2496 0 : int tci;
2497 :
2498 0 : for (tci = cpu * num_tc; num_tc--; tci++) {
2499 0 : int i, j;
2500 :
2501 0 : for (i = count, j = offset; i--; j++) {
2502 0 : if (!remove_xps_queue(dev_maps, tci, j))
2503 : break;
2504 : }
2505 :
2506 0 : active |= i < 0;
2507 : }
2508 :
2509 0 : return active;
2510 : }
2511 :
2512 0 : static void reset_xps_maps(struct net_device *dev,
2513 : struct xps_dev_maps *dev_maps,
2514 : bool is_rxqs_map)
2515 : {
2516 0 : if (is_rxqs_map) {
2517 0 : static_key_slow_dec_cpuslocked(&xps_rxqs_needed);
2518 0 : RCU_INIT_POINTER(dev->xps_rxqs_map, NULL);
2519 : } else {
2520 0 : RCU_INIT_POINTER(dev->xps_cpus_map, NULL);
2521 : }
2522 0 : static_key_slow_dec_cpuslocked(&xps_needed);
2523 0 : kfree_rcu(dev_maps, rcu);
2524 0 : }
2525 :
2526 0 : static void clean_xps_maps(struct net_device *dev, const unsigned long *mask,
2527 : struct xps_dev_maps *dev_maps, unsigned int nr_ids,
2528 : u16 offset, u16 count, bool is_rxqs_map)
2529 : {
2530 0 : bool active = false;
2531 0 : int i, j;
2532 :
2533 0 : for (j = -1; j = netif_attrmask_next(j, mask, nr_ids),
2534 : j < nr_ids;)
2535 0 : active |= remove_xps_queue_cpu(dev, dev_maps, j, offset,
2536 : count);
2537 0 : if (!active)
2538 0 : reset_xps_maps(dev, dev_maps, is_rxqs_map);
2539 :
2540 0 : if (!is_rxqs_map) {
2541 0 : for (i = offset + (count - 1); count--; i--) {
2542 0 : netdev_queue_numa_node_write(
2543 : netdev_get_tx_queue(dev, i),
2544 : NUMA_NO_NODE);
2545 : }
2546 : }
2547 0 : }
2548 :
2549 0 : static void netif_reset_xps_queues(struct net_device *dev, u16 offset,
2550 : u16 count)
2551 : {
2552 0 : const unsigned long *possible_mask = NULL;
2553 0 : struct xps_dev_maps *dev_maps;
2554 0 : unsigned int nr_ids;
2555 :
2556 0 : if (!static_key_false(&xps_needed))
2557 : return;
2558 :
2559 0 : cpus_read_lock();
2560 0 : mutex_lock(&xps_map_mutex);
2561 :
2562 0 : if (static_key_false(&xps_rxqs_needed)) {
2563 0 : dev_maps = xmap_dereference(dev->xps_rxqs_map);
2564 0 : if (dev_maps) {
2565 0 : nr_ids = dev->num_rx_queues;
2566 0 : clean_xps_maps(dev, possible_mask, dev_maps, nr_ids,
2567 : offset, count, true);
2568 : }
2569 : }
2570 :
2571 0 : dev_maps = xmap_dereference(dev->xps_cpus_map);
2572 0 : if (!dev_maps)
2573 0 : goto out_no_maps;
2574 :
2575 0 : if (num_possible_cpus() > 1)
2576 0 : possible_mask = cpumask_bits(cpu_possible_mask);
2577 0 : nr_ids = nr_cpu_ids;
2578 0 : clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count,
2579 : false);
2580 :
2581 0 : out_no_maps:
2582 0 : mutex_unlock(&xps_map_mutex);
2583 0 : cpus_read_unlock();
2584 : }
2585 :
2586 0 : static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
2587 : {
2588 0 : netif_reset_xps_queues(dev, index, dev->num_tx_queues - index);
2589 0 : }
2590 :
2591 4 : static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index,
2592 : u16 index, bool is_rxqs_map)
2593 : {
2594 4 : struct xps_map *new_map;
2595 4 : int alloc_len = XPS_MIN_MAP_ALLOC;
2596 4 : int i, pos;
2597 :
2598 4 : for (pos = 0; map && pos < map->len; pos++) {
2599 0 : if (map->queues[pos] != index)
2600 0 : continue;
2601 : return map;
2602 : }
2603 :
2604 : /* Need to add tx-queue to this CPU's/rx-queue's existing map */
2605 4 : if (map) {
2606 0 : if (pos < map->alloc_len)
2607 : return map;
2608 :
2609 0 : alloc_len = map->alloc_len * 2;
2610 : }
2611 :
2612 : /* Need to allocate new map to store tx-queue on this CPU's/rx-queue's
2613 : * map
2614 : */
2615 4 : if (is_rxqs_map)
2616 0 : new_map = kzalloc(XPS_MAP_SIZE(alloc_len), GFP_KERNEL);
2617 : else
2618 4 : new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
2619 : cpu_to_node(attr_index));
2620 4 : if (!new_map)
2621 : return NULL;
2622 :
2623 4 : for (i = 0; i < pos; i++)
2624 0 : new_map->queues[i] = map->queues[i];
2625 4 : new_map->alloc_len = alloc_len;
2626 4 : new_map->len = pos;
2627 :
2628 4 : return new_map;
2629 : }
2630 :
2631 : /* Must be called under cpus_read_lock */
2632 1 : int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
2633 : u16 index, bool is_rxqs_map)
2634 : {
2635 1 : const unsigned long *online_mask = NULL, *possible_mask = NULL;
2636 1 : struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
2637 1 : int i, j, tci, numa_node_id = -2;
2638 1 : int maps_sz, num_tc = 1, tc = 0;
2639 1 : struct xps_map *map, *new_map;
2640 1 : bool active = false;
2641 1 : unsigned int nr_ids;
2642 :
2643 1 : if (dev->num_tc) {
2644 : /* Do not allow XPS on subordinate device directly */
2645 0 : num_tc = dev->num_tc;
2646 0 : if (num_tc < 0)
2647 : return -EINVAL;
2648 :
2649 : /* If queue belongs to subordinate dev use its map */
2650 0 : dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
2651 :
2652 0 : tc = netdev_txq_to_tc(dev, index);
2653 0 : if (tc < 0)
2654 : return -EINVAL;
2655 : }
2656 :
2657 1 : mutex_lock(&xps_map_mutex);
2658 1 : if (is_rxqs_map) {
2659 0 : maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues);
2660 0 : dev_maps = xmap_dereference(dev->xps_rxqs_map);
2661 0 : nr_ids = dev->num_rx_queues;
2662 : } else {
2663 1 : maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc);
2664 1 : if (num_possible_cpus() > 1) {
2665 1 : online_mask = cpumask_bits(cpu_online_mask);
2666 1 : possible_mask = cpumask_bits(cpu_possible_mask);
2667 : }
2668 2 : dev_maps = xmap_dereference(dev->xps_cpus_map);
2669 1 : nr_ids = nr_cpu_ids;
2670 : }
2671 :
2672 1 : if (maps_sz < L1_CACHE_BYTES)
2673 : maps_sz = L1_CACHE_BYTES;
2674 :
2675 : /* allocate memory for queue storage */
2676 6 : for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids),
2677 : j < nr_ids;) {
2678 4 : if (!new_dev_maps)
2679 1 : new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2680 4 : if (!new_dev_maps) {
2681 0 : mutex_unlock(&xps_map_mutex);
2682 0 : return -ENOMEM;
2683 : }
2684 :
2685 4 : tci = j * num_tc + tc;
2686 4 : map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) :
2687 : NULL;
2688 :
2689 4 : map = expand_xps_map(map, j, index, is_rxqs_map);
2690 4 : if (!map)
2691 0 : goto error;
2692 :
2693 5 : RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2694 : }
2695 :
2696 1 : if (!new_dev_maps)
2697 0 : goto out_no_new_maps;
2698 :
2699 1 : if (!dev_maps) {
2700 : /* Increment static keys at most once per type */
2701 1 : static_key_slow_inc_cpuslocked(&xps_needed);
2702 1 : if (is_rxqs_map)
2703 0 : static_key_slow_inc_cpuslocked(&xps_rxqs_needed);
2704 : }
2705 :
2706 5 : for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2707 : j < nr_ids;) {
2708 : /* copy maps belonging to foreign traffic classes */
2709 4 : for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) {
2710 : /* fill in the new device map from the old device map */
2711 0 : map = xmap_dereference(dev_maps->attr_map[tci]);
2712 0 : RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2713 : }
2714 :
2715             : 		/* We need to explicitly update tci as the previous loop
2716             : 		 * could break out early if dev_maps is NULL.
2717 : */
2718 4 : tci = j * num_tc + tc;
2719 :
2720 4 : if (netif_attr_test_mask(j, mask, nr_ids) &&
2721 8 : netif_attr_test_online(j, online_mask, nr_ids)) {
2722 : /* add tx-queue to CPU/rx-queue maps */
2723 4 : int pos = 0;
2724 :
2725 8 : map = xmap_dereference(new_dev_maps->attr_map[tci]);
2726 4 : while ((pos < map->len) && (map->queues[pos] != index))
2727 0 : pos++;
2728 :
2729 4 : if (pos == map->len)
2730 4 : map->queues[map->len++] = index;
2731 : #ifdef CONFIG_NUMA
2732 4 : if (!is_rxqs_map) {
2733 4 : if (numa_node_id == -2)
2734 1 : numa_node_id = cpu_to_node(j);
2735 3 : else if (numa_node_id != cpu_to_node(j))
2736 0 : numa_node_id = -1;
2737 : }
2738 : #endif
2739 0 : } else if (dev_maps) {
2740 : /* fill in the new device map from the old device map */
2741 0 : map = xmap_dereference(dev_maps->attr_map[tci]);
2742 0 : RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2743 : }
2744 :
2745 : /* copy maps belonging to foreign traffic classes */
2746 4 : for (i = num_tc - tc, tci++; dev_maps && --i; tci++) {
2747 : /* fill in the new device map from the old device map */
2748 0 : map = xmap_dereference(dev_maps->attr_map[tci]);
2749 0 : RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map);
2750 : }
2751 : }
2752 :
2753 1 : if (is_rxqs_map)
2754 0 : rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps);
2755 : else
2756 1 : rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps);
2757 :
2758 : /* Cleanup old maps */
2759 1 : if (!dev_maps)
2760 1 : goto out_no_old_maps;
2761 :
2762 0 : for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2763 : j < nr_ids;) {
2764 0 : for (i = num_tc, tci = j * num_tc; i--; tci++) {
2765 0 : new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2766 0 : map = xmap_dereference(dev_maps->attr_map[tci]);
2767 0 : if (map && map != new_map)
2768 0 : kfree_rcu(map, rcu);
2769 : }
2770 : }
2771 :
2772 0 : kfree_rcu(dev_maps, rcu);
2773 :
2774 : out_no_old_maps:
2775 : dev_maps = new_dev_maps;
2776 : active = true;
2777 :
2778 1 : out_no_new_maps:
2779 1 : if (!is_rxqs_map) {
2780 : /* update Tx queue numa node */
2781 1 : netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2782 : (numa_node_id >= 0) ?
2783 : numa_node_id : NUMA_NO_NODE);
2784 : }
2785 :
2786 1 : if (!dev_maps)
2787 0 : goto out_no_maps;
2788 :
2789 : /* removes tx-queue from unused CPUs/rx-queues */
2790 5 : for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2791 : j < nr_ids;) {
2792 4 : for (i = tc, tci = j * num_tc; i--; tci++)
2793 0 : active |= remove_xps_queue(dev_maps, tci, index);
2794 4 : if (!netif_attr_test_mask(j, mask, nr_ids) ||
2795 8 : !netif_attr_test_online(j, online_mask, nr_ids))
2796 0 : active |= remove_xps_queue(dev_maps, tci, index);
2797 4 : for (i = num_tc - tc, tci++; --i; tci++)
2798 0 : active |= remove_xps_queue(dev_maps, tci, index);
2799 : }
2800 :
2801 : /* free map if not active */
2802 1 : if (!active)
2803 0 : reset_xps_maps(dev, dev_maps, is_rxqs_map);
2804 :
2805 1 : out_no_maps:
2806 1 : mutex_unlock(&xps_map_mutex);
2807 :
2808 1 : return 0;
2809 0 : error:
2810 : /* remove any maps that we added */
2811 0 : for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids),
2812 : j < nr_ids;) {
2813 0 : for (i = num_tc, tci = j * num_tc; i--; tci++) {
2814 0 : new_map = xmap_dereference(new_dev_maps->attr_map[tci]);
2815 0 : map = dev_maps ?
2816 0 : xmap_dereference(dev_maps->attr_map[tci]) :
2817 : NULL;
2818 0 : if (new_map && new_map != map)
2819 0 : kfree(new_map);
2820 : }
2821 : }
2822 :
2823 0 : mutex_unlock(&xps_map_mutex);
2824 :
2825 0 : kfree(new_dev_maps);
2826 0 : return -ENOMEM;
2827 : }
2828 : EXPORT_SYMBOL_GPL(__netif_set_xps_queue);
2829 :
2830 0 : int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
2831 : u16 index)
2832 : {
2833 0 : int ret;
2834 :
2835 0 : cpus_read_lock();
2836 0 : ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false);
2837 0 : cpus_read_unlock();
2838 :
2839 0 : return ret;
2840 : }
2841 : EXPORT_SYMBOL(netif_set_xps_queue);
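
/*
 * Usage sketch: pin transmit queue @qid to a single CPU, as a driver
 * might do from its IRQ-affinity handling.  Real drivers usually derive
 * the mask from the interrupt's affinity; the wrapper name and the
 * single-CPU mask are illustrative.
 */
static void example_pin_txq_to_cpu(struct net_device *dev, u16 qid, int cpu)
{
	cpumask_var_t mask;

	if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
		return;
	cpumask_set_cpu(cpu, mask);
	netif_set_xps_queue(dev, mask, qid);
	free_cpumask_var(mask);
}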
2842 :
2843 : #endif
2844 0 : static void netdev_unbind_all_sb_channels(struct net_device *dev)
2845 : {
2846 0 : struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2847 :
2848 : /* Unbind any subordinate channels */
2849 0 : while (txq-- != &dev->_tx[0]) {
2850 0 : if (txq->sb_dev)
2851 0 : netdev_unbind_sb_channel(dev, txq->sb_dev);
2852 : }
2853 0 : }
2854 :
2855 0 : void netdev_reset_tc(struct net_device *dev)
2856 : {
2857 : #ifdef CONFIG_XPS
2858 0 : netif_reset_xps_queues_gt(dev, 0);
2859 : #endif
2860 0 : netdev_unbind_all_sb_channels(dev);
2861 :
2862 : /* Reset TC configuration of device */
2863 0 : dev->num_tc = 0;
2864 0 : memset(dev->tc_to_txq, 0, sizeof(dev->tc_to_txq));
2865 0 : memset(dev->prio_tc_map, 0, sizeof(dev->prio_tc_map));
2866 0 : }
2867 : EXPORT_SYMBOL(netdev_reset_tc);
2868 :
2869 0 : int netdev_set_tc_queue(struct net_device *dev, u8 tc, u16 count, u16 offset)
2870 : {
2871 0 : if (tc >= dev->num_tc)
2872 : return -EINVAL;
2873 :
2874 : #ifdef CONFIG_XPS
2875 0 : netif_reset_xps_queues(dev, offset, count);
2876 : #endif
2877 0 : dev->tc_to_txq[tc].count = count;
2878 0 : dev->tc_to_txq[tc].offset = offset;
2879 0 : return 0;
2880 : }
2881 : EXPORT_SYMBOL(netdev_set_tc_queue);
2882 :
2883 0 : int netdev_set_num_tc(struct net_device *dev, u8 num_tc)
2884 : {
2885 0 : if (num_tc > TC_MAX_QUEUE)
2886 : return -EINVAL;
2887 :
2888 : #ifdef CONFIG_XPS
2889 0 : netif_reset_xps_queues_gt(dev, 0);
2890 : #endif
2891 0 : netdev_unbind_all_sb_channels(dev);
2892 :
2893 0 : dev->num_tc = num_tc;
2894 0 : return 0;
2895 : }
2896 : EXPORT_SYMBOL(netdev_set_num_tc);
2897 :
2898 0 : void netdev_unbind_sb_channel(struct net_device *dev,
2899 : struct net_device *sb_dev)
2900 : {
2901 0 : struct netdev_queue *txq = &dev->_tx[dev->num_tx_queues];
2902 :
2903 : #ifdef CONFIG_XPS
2904 0 : netif_reset_xps_queues_gt(sb_dev, 0);
2905 : #endif
2906 0 : memset(sb_dev->tc_to_txq, 0, sizeof(sb_dev->tc_to_txq));
2907 0 : memset(sb_dev->prio_tc_map, 0, sizeof(sb_dev->prio_tc_map));
2908 :
2909 0 : while (txq-- != &dev->_tx[0]) {
2910 0 : if (txq->sb_dev == sb_dev)
2911 0 : txq->sb_dev = NULL;
2912 : }
2913 0 : }
2914 : EXPORT_SYMBOL(netdev_unbind_sb_channel);
2915 :
2916 0 : int netdev_bind_sb_channel_queue(struct net_device *dev,
2917 : struct net_device *sb_dev,
2918 : u8 tc, u16 count, u16 offset)
2919 : {
2920 : /* Make certain the sb_dev and dev are already configured */
2921 0 : if (sb_dev->num_tc >= 0 || tc >= dev->num_tc)
2922 : return -EINVAL;
2923 :
2924 : /* We cannot hand out queues we don't have */
2925 0 : if ((offset + count) > dev->real_num_tx_queues)
2926 : return -EINVAL;
2927 :
2928 : /* Record the mapping */
2929 0 : sb_dev->tc_to_txq[tc].count = count;
2930 0 : sb_dev->tc_to_txq[tc].offset = offset;
2931 :
2932 : /* Provide a way for Tx queue to find the tc_to_txq map or
2933 : * XPS map for itself.
2934 : */
2935 0 : while (count--)
2936 0 : netdev_get_tx_queue(dev, count + offset)->sb_dev = sb_dev;
2937 :
2938 : return 0;
2939 : }
2940 : EXPORT_SYMBOL(netdev_bind_sb_channel_queue);
2941 :
2942 0 : int netdev_set_sb_channel(struct net_device *dev, u16 channel)
2943 : {
2944 : /* Do not use a multiqueue device to represent a subordinate channel */
2945 0 : if (netif_is_multiqueue(dev))
2946 : return -ENODEV;
2947 :
2948 : /* We allow channels 1 - 32767 to be used for subordinate channels.
2949 : * Channel 0 is meant to be "native" mode and used only to represent
2950 : * the main root device. We allow writing 0 to reset the device back
2951 : * to normal mode after being used as a subordinate channel.
2952 : */
2953 0 : if (channel > S16_MAX)
2954 : return -EINVAL;
2955 :
2956 0 : dev->num_tc = -channel;
2957 :
2958 0 : return 0;
2959 : }
2960 : EXPORT_SYMBOL(netdev_set_sb_channel);
2961 :
2962 : /*
2963             :  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2964             :  * greater than or equal to real_num_tx_queues, stale skbs on the qdisc must be flushed.
2965 : */
2966 1 : int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2967 : {
2968 1 : bool disabling;
2969 1 : int rc;
2970 :
2971 1 : disabling = txq < dev->real_num_tx_queues;
2972 :
2973 1 : if (txq < 1 || txq > dev->num_tx_queues)
2974 : return -EINVAL;
2975 :
2976 1 : if (dev->reg_state == NETREG_REGISTERED ||
2977 : dev->reg_state == NETREG_UNREGISTERING) {
2978 0 : ASSERT_RTNL();
2979 :
2980 0 : rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2981 : txq);
2982 0 : if (rc)
2983 : return rc;
2984 :
2985 0 : if (dev->num_tc)
2986 0 : netif_setup_tc(dev, txq);
2987 :
2988 0 : dev->real_num_tx_queues = txq;
2989 :
2990 0 : if (disabling) {
2991 0 : synchronize_net();
2992 0 : qdisc_reset_all_tx_gt(dev, txq);
2993 : #ifdef CONFIG_XPS
2994 0 : netif_reset_xps_queues_gt(dev, txq);
2995 : #endif
2996 : }
2997 : } else {
2998 1 : dev->real_num_tx_queues = txq;
2999 : }
3000 :
3001 : return 0;
3002 : }
3003 : EXPORT_SYMBOL(netif_set_real_num_tx_queues);
3004 :
3005 : #ifdef CONFIG_SYSFS
3006 : /**
3007 : * netif_set_real_num_rx_queues - set actual number of RX queues used
3008 : * @dev: Network device
3009 : * @rxq: Actual number of RX queues
3010 : *
3011 : * This must be called either with the rtnl_lock held or before
3012 : * registration of the net device. Returns 0 on success, or a
3013 : * negative error code. If called before registration, it always
3014 : * succeeds.
3015 : */
3016 1 : int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
3017 : {
3018 1 : int rc;
3019 :
3020 1 : if (rxq < 1 || rxq > dev->num_rx_queues)
3021 : return -EINVAL;
3022 :
3023 1 : if (dev->reg_state == NETREG_REGISTERED) {
3024 0 : ASSERT_RTNL();
3025 :
3026 0 : rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
3027 : rxq);
3028 0 : if (rc)
3029 : return rc;
3030 : }
3031 :
3032 1 : dev->real_num_rx_queues = rxq;
3033 1 : return 0;
3034 : }
3035 : EXPORT_SYMBOL(netif_set_real_num_rx_queues);
3036 : #endif
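
/*
 * Sketch of the common driver pattern for the two setters above:
 * allocate the maximum number of queues at alloc_etherdev_mq() time,
 * then trim to what the hardware actually enabled.  Must run under the
 * RTNL once the device is registered; the wrapper name is illustrative.
 */
static int example_trim_queues(struct net_device *dev, unsigned int nr)
{
	int err;

	err = netif_set_real_num_tx_queues(dev, nr);
	if (err)
		return err;
	return netif_set_real_num_rx_queues(dev, nr);
}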
3037 :
3038 : /**
3039 : * netif_get_num_default_rss_queues - default number of RSS queues
3040 : *
3041 : * This routine should set an upper limit on the number of RSS queues
3042 : * used by default by multiqueue devices.
3043 : */
3044 0 : int netif_get_num_default_rss_queues(void)
3045 : {
3046 0 : return is_kdump_kernel() ?
3047 0 : 1 : min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
3048 : }
3049 : EXPORT_SYMBOL(netif_get_num_default_rss_queues);
3050 :
3051 0 : static void __netif_reschedule(struct Qdisc *q)
3052 : {
3053 0 : struct softnet_data *sd;
3054 0 : unsigned long flags;
3055 :
3056 0 : local_irq_save(flags);
3057 0 : sd = this_cpu_ptr(&softnet_data);
3058 0 : q->next_sched = NULL;
3059 0 : *sd->output_queue_tailp = q;
3060 0 : sd->output_queue_tailp = &q->next_sched;
3061 0 : raise_softirq_irqoff(NET_TX_SOFTIRQ);
3062 0 : local_irq_restore(flags);
3063 0 : }
3064 :
3065 0 : void __netif_schedule(struct Qdisc *q)
3066 : {
3067 0 : if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
3068 0 : __netif_reschedule(q);
3069 0 : }
3070 : EXPORT_SYMBOL(__netif_schedule);
3071 :
3072 : struct dev_kfree_skb_cb {
3073 : enum skb_free_reason reason;
3074 : };
3075 :
3076 0 : static struct dev_kfree_skb_cb *get_kfree_skb_cb(const struct sk_buff *skb)
3077 : {
3078 0 : return (struct dev_kfree_skb_cb *)skb->cb;
3079 : }
3080 :
3081 0 : void netif_schedule_queue(struct netdev_queue *txq)
3082 : {
3083 0 : rcu_read_lock();
3084 0 : if (!netif_xmit_stopped(txq)) {
3085 0 : struct Qdisc *q = rcu_dereference(txq->qdisc);
3086 :
3087 0 : __netif_schedule(q);
3088 : }
3089 0 : rcu_read_unlock();
3090 0 : }
3091 : EXPORT_SYMBOL(netif_schedule_queue);
3092 :
3093 856 : void netif_tx_wake_queue(struct netdev_queue *dev_queue)
3094 : {
3095 856 : if (test_and_clear_bit(__QUEUE_STATE_DRV_XOFF, &dev_queue->state)) {
3096 0 : struct Qdisc *q;
3097 :
3098 0 : rcu_read_lock();
3099 0 : q = rcu_dereference(dev_queue->qdisc);
3100 0 : __netif_schedule(q);
3101 0 : rcu_read_unlock();
3102 : }
3103 856 : }
3104 : EXPORT_SYMBOL(netif_tx_wake_queue);
3105 :
3106 0 : void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason)
3107 : {
3108 0 : unsigned long flags;
3109 :
3110 0 : if (unlikely(!skb))
3111 : return;
3112 :
3113 0 : if (likely(refcount_read(&skb->users) == 1)) {
3114 0 : smp_rmb();
3115 0 : refcount_set(&skb->users, 0);
3116 0 : } else if (likely(!refcount_dec_and_test(&skb->users))) {
3117 : return;
3118 : }
3119 0 : get_kfree_skb_cb(skb)->reason = reason;
3120 0 : local_irq_save(flags);
3121 0 : skb->next = __this_cpu_read(softnet_data.completion_queue);
3122 0 : __this_cpu_write(softnet_data.completion_queue, skb);
3123 0 : raise_softirq_irqoff(NET_TX_SOFTIRQ);
3124 0 : local_irq_restore(flags);
3125 : }
3126 : EXPORT_SYMBOL(__dev_kfree_skb_irq);
3127 :
3128 35 : void __dev_kfree_skb_any(struct sk_buff *skb, enum skb_free_reason reason)
3129 : {
3130 35 : if (in_irq() || irqs_disabled())
3131 0 : __dev_kfree_skb_irq(skb, reason);
3132 : else
3133 35 : dev_kfree_skb(skb);
3134 35 : }
3135 : EXPORT_SYMBOL(__dev_kfree_skb_any);
3136 :
3137 :
3138 : /**
3139 : * netif_device_detach - mark device as removed
3140 : * @dev: network device
3141 : *
3142 : * Mark device as removed from system and therefore no longer available.
3143 : */
3144 0 : void netif_device_detach(struct net_device *dev)
3145 : {
3146 0 : if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
3147 0 : netif_running(dev)) {
3148 0 : netif_tx_stop_all_queues(dev);
3149 : }
3150 0 : }
3151 : EXPORT_SYMBOL(netif_device_detach);
3152 :
3153 : /**
3154 : * netif_device_attach - mark device as attached
3155 : * @dev: network device
3156             :  * Mark device as attached to the system and restart its queues if needed.
3157 : * Mark device as attached from system and restart if needed.
3158 : */
3159 0 : void netif_device_attach(struct net_device *dev)
3160 : {
3161 0 : if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
3162 0 : netif_running(dev)) {
3163 0 : netif_tx_wake_all_queues(dev);
3164 0 : __netdev_watchdog_up(dev);
3165 : }
3166 0 : }
3167 : EXPORT_SYMBOL(netif_device_attach);
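
/*
 * Typical suspend/resume pairing in a driver (sketch; the function
 * names are illustrative and hardware handling is elided).
 */
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if running */
	/* ... quiesce DMA and power the hardware down ... */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* ... power the hardware up and restore its state ... */
	netif_device_attach(dev);	/* wakes queues, re-arms watchdog */
	return 0;
}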
3168 :
3169 : /*
3170             :  * Returns a Tx hash based on the given packet descriptor and the number of
3171             :  * Tx queues to be used as a distribution range.
3172 : */
3173 0 : static u16 skb_tx_hash(const struct net_device *dev,
3174 : const struct net_device *sb_dev,
3175 : struct sk_buff *skb)
3176 : {
3177 0 : u32 hash;
3178 0 : u16 qoffset = 0;
3179 0 : u16 qcount = dev->real_num_tx_queues;
3180 :
3181 0 : if (dev->num_tc) {
3182 0 : u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
3183 :
3184 0 : qoffset = sb_dev->tc_to_txq[tc].offset;
3185 0 : qcount = sb_dev->tc_to_txq[tc].count;
3186 : }
3187 :
3188 0 : if (skb_rx_queue_recorded(skb)) {
3189 0 : hash = skb_get_rx_queue(skb);
3190 0 : if (hash >= qoffset)
3191 0 : hash -= qoffset;
3192 0 : while (unlikely(hash >= qcount))
3193 0 : hash -= qcount;
3194 0 : return hash + qoffset;
3195 : }
3196 :
3197 0 : return (u16) reciprocal_scale(skb_get_hash(skb), qcount) + qoffset;
3198 : }
3199 :
3200 0 : static void skb_warn_bad_offload(const struct sk_buff *skb)
3201 : {
3202 0 : static const netdev_features_t null_features;
3203 0 : struct net_device *dev = skb->dev;
3204 0 : const char *name = "";
3205 :
3206 0 : if (!net_ratelimit())
3207 : return;
3208 :
3209 0 : if (dev) {
3210 0 : if (dev->dev.parent)
3211 0 : name = dev_driver_string(dev->dev.parent);
3212 : else
3213 0 : name = netdev_name(dev);
3214 : }
3215 0 : skb_dump(KERN_WARNING, skb, false);
3216 0 : WARN(1, "%s: caps=(%pNF, %pNF)\n",
3217 : name, dev ? &dev->features : &null_features,
3218 : skb->sk ? &skb->sk->sk_route_caps : &null_features);
3219 : }
3220 :
3221 : /*
3222 : * Invalidate hardware checksum when packet is to be mangled, and
3223 : * complete checksum manually on outgoing path.
3224 : */
3225 430 : int skb_checksum_help(struct sk_buff *skb)
3226 : {
3227 430 : __wsum csum;
3228 430 : int ret = 0, offset;
3229 :
3230 430 : if (skb->ip_summed == CHECKSUM_COMPLETE)
3231 0 : goto out_set_summed;
3232 :
3233 430 : if (unlikely(skb_is_gso(skb))) {
3234 0 : skb_warn_bad_offload(skb);
3235 0 : return -EINVAL;
3236 : }
3237 :
3238 : /* Before computing a checksum, we should make sure no frag could
3239 : * be modified by an external entity : checksum could be wrong.
3240 : */
3241 430 : if (skb_has_shared_frag(skb)) {
3242 0 : ret = __skb_linearize(skb);
3243 0 : if (ret)
3244 0 : goto out;
3245 : }
3246 :
3247 430 : offset = skb_checksum_start_offset(skb);
3248 430 : BUG_ON(offset >= skb_headlen(skb));
3249 430 : csum = skb_checksum(skb, offset, skb->len - offset, 0);
3250 :
3251 430 : offset += skb->csum_offset;
3252 430 : BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
3253 :
3254 430 : ret = skb_ensure_writable(skb, offset + sizeof(__sum16));
3255 430 : if (ret)
3256 0 : goto out;
3257 :
3258 430 : *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
3259 430 : out_set_summed:
3260 430 : skb->ip_summed = CHECKSUM_NONE;
3261 : out:
3262 : return ret;
3263 : }
3264 : EXPORT_SYMBOL(skb_checksum_help);
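
/*
 * Sketch of the transmit-path fallback this helper exists for (cf.
 * validate_xmit_skb() later in this file): if the device cannot offload
 * the checksum of this skb, resolve it in software.  The wrapper name
 * is illustrative.
 */
static int example_resolve_tx_csum(struct sk_buff *skb,
				   netdev_features_t features)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL &&
	    !(features & NETIF_F_CSUM_MASK))
		return skb_checksum_help(skb);	/* 0 on success */
	return 0;
}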
3265 :
3266 0 : int skb_crc32c_csum_help(struct sk_buff *skb)
3267 : {
3268 0 : __le32 crc32c_csum;
3269 0 : int ret = 0, offset, start;
3270 :
3271 0 : if (skb->ip_summed != CHECKSUM_PARTIAL)
3272 0 : goto out;
3273 :
3274 0 : if (unlikely(skb_is_gso(skb)))
3275 0 : goto out;
3276 :
3277 : /* Before computing a checksum, we should make sure no frag could
3278 : * be modified by an external entity : checksum could be wrong.
3279 : */
3280 0 : if (unlikely(skb_has_shared_frag(skb))) {
3281 0 : ret = __skb_linearize(skb);
3282 0 : if (ret)
3283 0 : goto out;
3284 : }
3285 0 : start = skb_checksum_start_offset(skb);
3286 0 : offset = start + offsetof(struct sctphdr, checksum);
3287 0 : if (WARN_ON_ONCE(offset >= skb_headlen(skb))) {
3288 0 : ret = -EINVAL;
3289 0 : goto out;
3290 : }
3291 :
3292 0 : ret = skb_ensure_writable(skb, offset + sizeof(__le32));
3293 0 : if (ret)
3294 0 : goto out;
3295 :
3296 0 : crc32c_csum = cpu_to_le32(~__skb_checksum(skb, start,
3297 : skb->len - start, ~(__u32)0,
3298 : crc32c_csum_stub));
3299 0 : *(__le32 *)(skb->data + offset) = crc32c_csum;
3300 0 : skb->ip_summed = CHECKSUM_NONE;
3301 0 : skb->csum_not_inet = 0;
3302 0 : out:
3303 0 : return ret;
3304 : }
3305 :
3306 448 : __be16 skb_network_protocol(struct sk_buff *skb, int *depth)
3307 : {
3308 448 : __be16 type = skb->protocol;
3309 :
3310 : /* Tunnel gso handlers can set protocol to ethernet. */
3311 448 : if (type == htons(ETH_P_TEB)) {
3312 0 : struct ethhdr *eth;
3313 :
3314 0 : if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
3315 : return 0;
3316 :
3317 0 : eth = (struct ethhdr *)skb->data;
3318 0 : type = eth->h_proto;
3319 : }
3320 :
3321 448 : return __vlan_get_protocol(skb, type, depth);
3322 : }
3323 :
3324 : /**
3325 : * skb_mac_gso_segment - mac layer segmentation handler.
3326 : * @skb: buffer to segment
3327 : * @features: features for the output path (see dev->features)
3328 : */
3329 0 : struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
3330 : netdev_features_t features)
3331 : {
3332 0 : struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
3333 0 : struct packet_offload *ptype;
3334 0 : int vlan_depth = skb->mac_len;
3335 0 : __be16 type = skb_network_protocol(skb, &vlan_depth);
3336 :
3337 0 : if (unlikely(!type))
3338 0 : return ERR_PTR(-EINVAL);
3339 :
3340 0 : __skb_pull(skb, vlan_depth);
3341 :
3342 0 : rcu_read_lock();
3343 0 : list_for_each_entry_rcu(ptype, &offload_base, list) {
3344 0 : if (ptype->type == type && ptype->callbacks.gso_segment) {
3345 0 : segs = ptype->callbacks.gso_segment(skb, features);
3346 0 : break;
3347 : }
3348 : }
3349 0 : rcu_read_unlock();
3350 :
3351 0 : __skb_push(skb, skb->data - skb_mac_header(skb));
3352 :
3353 0 : return segs;
3354 : }
3355 : EXPORT_SYMBOL(skb_mac_gso_segment);
3356 :
3357 :
3358 : /* openvswitch calls this on rx path, so we need a different check.
3359 : */
3360 0 : static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
3361 : {
3362 0 : if (tx_path)
3363 0 : return skb->ip_summed != CHECKSUM_PARTIAL &&
3364 : skb->ip_summed != CHECKSUM_UNNECESSARY;
3365 :
3366 0 : return skb->ip_summed == CHECKSUM_NONE;
3367 : }
3368 :
3369 : /**
3370 : * __skb_gso_segment - Perform segmentation on skb.
3371 : * @skb: buffer to segment
3372 : * @features: features for the output path (see dev->features)
3373 : * @tx_path: whether it is called in TX path
3374 : *
3375 : * This function segments the given skb and returns a list of segments.
3376 : *
3377 : * It may return NULL if the skb requires no segmentation. This is
3378 : * only possible when GSO is used for verifying header integrity.
3379 : *
3380 : * Segmentation preserves SKB_GSO_CB_OFFSET bytes of previous skb cb.
3381 : */
3382 0 : struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
3383 : netdev_features_t features, bool tx_path)
3384 : {
3385 0 : struct sk_buff *segs;
3386 :
3387 0 : if (unlikely(skb_needs_check(skb, tx_path))) {
3388 0 : int err;
3389 :
3390 : /* We're going to init ->check field in TCP or UDP header */
3391 0 : err = skb_cow_head(skb, 0);
3392 0 : if (err < 0)
3393 0 : return ERR_PTR(err);
3394 : }
3395 :
3396 : /* Only report GSO partial support if it will enable us to
3397 : * support segmentation on this frame without needing additional
3398 : * work.
3399 : */
3400 0 : if (features & NETIF_F_GSO_PARTIAL) {
3401 0 : netdev_features_t partial_features = NETIF_F_GSO_ROBUST;
3402 0 : struct net_device *dev = skb->dev;
3403 :
3404 0 : partial_features |= dev->features & dev->gso_partial_features;
3405 0 : if (!skb_gso_ok(skb, features | partial_features))
3406 0 : features &= ~NETIF_F_GSO_PARTIAL;
3407 : }
3408 :
3409 0 : BUILD_BUG_ON(SKB_GSO_CB_OFFSET +
3410 : sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
3411 :
3412 0 : SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
3413 0 : SKB_GSO_CB(skb)->encap_level = 0;
3414 :
3415 0 : skb_reset_mac_header(skb);
3416 0 : skb_reset_mac_len(skb);
3417 :
3418 0 : segs = skb_mac_gso_segment(skb, features);
3419 :
3420 0 : if (segs != skb && unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs)))
3421 0 : skb_warn_bad_offload(skb);
3422 :
3423 : return segs;
3424 : }
3425 : EXPORT_SYMBOL(__skb_gso_segment);
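
/*
 * Sketch of the usual caller pattern: software-segment a GSO skb via
 * the tx-path wrapper skb_gso_segment() and switch to the resulting
 * list, as the transmit path does when the device lacks the needed GSO
 * features.  The wrapper name is illustrative.
 */
static struct sk_buff *example_soft_segment(struct sk_buff *skb,
					    netdev_features_t features)
{
	struct sk_buff *segs;

	segs = skb_gso_segment(skb, features);
	if (IS_ERR(segs))
		return NULL;		/* caller drops the original skb */
	if (segs) {
		consume_skb(skb);	/* original replaced by the list */
		skb = segs;
	}
	return skb;
}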
3426 :
3427 : /* Take action when hardware reception checksum errors are detected. */
3428 : #ifdef CONFIG_BUG
3429 0 : void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
3430 : {
3431 0 : if (net_ratelimit()) {
3432 0 : pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
3433 0 : skb_dump(KERN_ERR, skb, true);
3434 0 : dump_stack();
3435 : }
3436 0 : }
3437 : EXPORT_SYMBOL(netdev_rx_csum_fault);
3438 : #endif
3439 :
3440 : /* XXX: check that highmem exists at all on the given machine. */
3441 448 : static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
3442 : {
3443 : #ifdef CONFIG_HIGHMEM
3444 : int i;
3445 :
3446 : if (!(dev->features & NETIF_F_HIGHDMA)) {
3447 : for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
3448 : skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
3449 :
3450 : if (PageHighMem(skb_frag_page(frag)))
3451 : return 1;
3452 : }
3453 : }
3454 : #endif
3455 448 : return 0;
3456 : }
3457 :
3458 : /* If MPLS offload request, verify we are testing hardware MPLS features
3459 : * instead of standard features for the netdev.
3460 : */
3461 : #if IS_ENABLED(CONFIG_NET_MPLS_GSO)
3462 : static netdev_features_t net_mpls_features(struct sk_buff *skb,
3463 : netdev_features_t features,
3464 : __be16 type)
3465 : {
3466 : if (eth_p_mpls(type))
3467 : features &= skb->dev->mpls_features;
3468 :
3469 : return features;
3470 : }
3471 : #else
3472 448 : static netdev_features_t net_mpls_features(struct sk_buff *skb,
3473 : netdev_features_t features,
3474 : __be16 type)
3475 : {
3476 448 : return features;
3477 : }
3478 : #endif
3479 :
3480 448 : static netdev_features_t harmonize_features(struct sk_buff *skb,
3481 : netdev_features_t features)
3482 : {
3483 448 : __be16 type;
3484 :
3485 448 : type = skb_network_protocol(skb, NULL);
3486 448 : features = net_mpls_features(skb, features, type);
3487 :
3488 448 : if (skb->ip_summed != CHECKSUM_NONE &&
3489 430 : !can_checksum_protocol(features, type)) {
3490 430 : features &= ~(NETIF_F_CSUM_MASK | NETIF_F_GSO_MASK);
3491 : }
3492 448 : if (illegal_highdma(skb->dev, skb))
3493 : features &= ~NETIF_F_SG;
3494 :
3495 448 : return features;
3496 : }
3497 :
3498 448 : netdev_features_t passthru_features_check(struct sk_buff *skb,
3499 : struct net_device *dev,
3500 : netdev_features_t features)
3501 : {
3502 448 : return features;
3503 : }
3504 : EXPORT_SYMBOL(passthru_features_check);
3505 :
3506 0 : static netdev_features_t dflt_features_check(struct sk_buff *skb,
3507 : struct net_device *dev,
3508 : netdev_features_t features)
3509 : {
3510 0 : return vlan_features_check(skb, features);
3511 : }
3512 :
3513 0 : static netdev_features_t gso_features_check(const struct sk_buff *skb,
3514 : struct net_device *dev,
3515 : netdev_features_t features)
3516 : {
3517 0 : u16 gso_segs = skb_shinfo(skb)->gso_segs;
3518 :
3519 0 : if (gso_segs > dev->gso_max_segs)
3520 0 : return features & ~NETIF_F_GSO_MASK;
3521 :
3522 0 : if (!skb_shinfo(skb)->gso_type) {
3523 0 : skb_warn_bad_offload(skb);
3524 0 : return features & ~NETIF_F_GSO_MASK;
3525 : }
3526 :
3527 : /* Support for GSO partial features requires software
3528 : * intervention before we can actually process the packets,
3529 : * so strip support for any partial features now; they can be
3530 : * pulled back in after the frame has been partially
3531 : * segmented.
3532 : */
3533 0 : if (!(skb_shinfo(skb)->gso_type & SKB_GSO_PARTIAL))
3534 0 : features &= ~dev->gso_partial_features;
3535 :
3536 : /* Make sure to clear the IPv4 ID mangling feature if the
3537 : * IPv4 header has the potential to be fragmented.
3538 : */
3539 0 : if (skb_shinfo(skb)->gso_type & SKB_GSO_TCPV4) {
3540 0 : struct iphdr *iph = skb->encapsulation ?
3541 0 : inner_ip_hdr(skb) : ip_hdr(skb);
3542 :
3543 0 : if (!(iph->frag_off & htons(IP_DF)))
3544 0 : features &= ~NETIF_F_TSO_MANGLEID;
3545 : }
3546 :
3547 : return features;
3548 : }
3549 :
3550 448 : netdev_features_t netif_skb_features(struct sk_buff *skb)
3551 : {
3552 448 : struct net_device *dev = skb->dev;
3553 448 : netdev_features_t features = dev->features;
3554 :
3555 448 : if (skb_is_gso(skb))
3556 0 : features = gso_features_check(skb, dev, features);
3557 :
3558 : /* If encapsulation offload request, verify we are testing
3559 : * hardware encapsulation features instead of standard
3560 : * features for the netdev
3561 : */
3562 448 : if (skb->encapsulation)
3563 0 : features &= dev->hw_enc_features;
3564 :
3565 448 : if (skb_vlan_tagged(skb))
3566 0 : features = netdev_intersect_features(features,
3567 0 : dev->vlan_features |
3568 : NETIF_F_HW_VLAN_CTAG_TX |
3569 : NETIF_F_HW_VLAN_STAG_TX);
3570 :
3571 448 : if (dev->netdev_ops->ndo_features_check)
3572 448 : features &= dev->netdev_ops->ndo_features_check(skb, dev,
3573 : features);
3574 : else
3575 0 : features &= dflt_features_check(skb, dev, features);
3576 :
3577 448 : return harmonize_features(skb, features);
3578 : }
3579 : EXPORT_SYMBOL(netif_skb_features);
3580 :
3581 448 : static int xmit_one(struct sk_buff *skb, struct net_device *dev,
3582 : struct netdev_queue *txq, bool more)
3583 : {
3584 448 : unsigned int len;
3585 448 : int rc;
3586 :
3587 896 : if (dev_nit_active(dev))
3588 448 : dev_queue_xmit_nit(skb, dev);
3589 :
3590 448 : len = skb->len;
3591 448 : PRANDOM_ADD_NOISE(skb, dev, txq, len + jiffies);
3592 448 : trace_net_dev_start_xmit(skb, dev);
3593 448 : rc = netdev_start_xmit(skb, dev, txq, more);
3594 448 : trace_net_dev_xmit(skb, rc, dev, len);
3595 :
3596 448 : return rc;
3597 : }
3598 :
3599 448 : struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *dev,
3600 : struct netdev_queue *txq, int *ret)
3601 : {
3602 448 : struct sk_buff *skb = first;
3603 448 : int rc = NETDEV_TX_OK;
3604 :
3605 896 : while (skb) {
3606 448 : struct sk_buff *next = skb->next;
3607 :
3608 448 : skb_mark_not_on_list(skb);
3609 448 : rc = xmit_one(skb, dev, txq, next != NULL);
3610 896 : if (unlikely(!dev_xmit_complete(rc))) {
3611 0 : skb->next = next;
3612 0 : goto out;
3613 : }
3614 :
3615 448 : skb = next;
3616 448 : if (netif_tx_queue_stopped(txq) && skb) {
3617 : rc = NETDEV_TX_BUSY;
3618 : break;
3619 : }
3620 : }
3621 :
3622 448 : out:
3623 448 : *ret = rc;
3624 448 : return skb;
3625 : }
3626 :
3627 448 : static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
3628 : netdev_features_t features)
3629 : {
3630 448 : if (skb_vlan_tag_present(skb) &&
3631 0 : !vlan_hw_offload_capable(features, skb->vlan_proto))
3632 0 : skb = __vlan_hwaccel_push_inside(skb);
3633 448 : return skb;
3634 : }
3635 :
3636 430 : int skb_csum_hwoffload_help(struct sk_buff *skb,
3637 : const netdev_features_t features)
3638 : {
3639 430 : if (unlikely(skb_csum_is_sctp(skb)))
3640 0 : return !!(features & NETIF_F_SCTP_CRC) ? 0 :
3641 0 : skb_crc32c_csum_help(skb);
3642 :
3643 430 : if (features & NETIF_F_HW_CSUM)
3644 : return 0;
3645 :
3646 430 : if (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) {
3647 0 : switch (skb->csum_offset) {
3648 : case offsetof(struct tcphdr, check):
3649 : case offsetof(struct udphdr, check):
3650 : return 0;
3651 : }
3652 : }
3653 :
3654 430 : return skb_checksum_help(skb);
3655 : }
3656 : EXPORT_SYMBOL(skb_csum_hwoffload_help);
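
/* Editor's sketch (illustrative only, not part of this file): a NIC that can
 * only checksum TCP/UDP over plain IPv4/IPv6 advertises the protocol-specific
 * flags instead of NETIF_F_HW_CSUM; for any other CHECKSUM_PARTIAL packet,
 * skb_csum_hwoffload_help() above falls back to skb_checksum_help() in
 * software. "foo_dev" is a hypothetical net_device being set up at probe time:
 *
 *	foo_dev->hw_features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM | NETIF_F_SG;
 *	foo_dev->features |= foo_dev->hw_features;
 */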
3657 :
3658 448 : static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again)
3659 : {
3660 448 : netdev_features_t features;
3661 :
3662 448 : features = netif_skb_features(skb);
3663 448 : skb = validate_xmit_vlan(skb, features);
3664 448 : if (unlikely(!skb))
3665 0 : goto out_null;
3666 :
3667 448 : skb = sk_validate_xmit_skb(skb, dev);
3668 448 : if (unlikely(!skb))
3669 : goto out_null;
3670 :
3671 448 : if (netif_needs_gso(skb, features)) {
3672 0 : struct sk_buff *segs;
3673 :
3674 0 : segs = skb_gso_segment(skb, features);
3675 0 : if (IS_ERR(segs)) {
3676 0 : goto out_kfree_skb;
3677 0 : } else if (segs) {
3678 0 : consume_skb(skb);
3679 0 : skb = segs;
3680 : }
3681 : } else {
3682 809 : if (skb_needs_linearize(skb, features) &&
3683 361 : __skb_linearize(skb))
3684 0 : goto out_kfree_skb;
3685 :
3686 : /* If packet is not checksummed and device does not
3687 : * support checksumming for this protocol, complete
3688 : * checksumming here.
3689 : */
3690 448 : if (skb->ip_summed == CHECKSUM_PARTIAL) {
3691 430 : if (skb->encapsulation)
3692 0 : skb_set_inner_transport_header(skb,
3693 : skb_checksum_start_offset(skb));
3694 : else
3695 430 : skb_set_transport_header(skb,
3696 : skb_checksum_start_offset(skb));
3697 430 : if (skb_csum_hwoffload_help(skb, features))
3698 0 : goto out_kfree_skb;
3699 : }
3700 : }
3701 :
3702 448 : skb = validate_xmit_xfrm(skb, features, again);
3703 :
3704 : return skb;
3705 :
3706 0 : out_kfree_skb:
3707 0 : kfree_skb(skb);
3708 0 : out_null:
3709 0 : atomic_long_inc(&dev->tx_dropped);
3710 0 : return NULL;
3711 : }
3712 :
3713 448 : struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again)
3714 : {
3715 448 : struct sk_buff *next, *head = NULL, *tail;
3716 :
3717 896 : for (; skb != NULL; skb = next) {
3718 448 : next = skb->next;
3719 448 : skb_mark_not_on_list(skb);
3720 :
3721 : /* in case skb won't be segmented, point to itself */
3722 448 : skb->prev = skb;
3723 :
3724 448 : skb = validate_xmit_skb(skb, dev, again);
3725 448 : if (!skb)
3726 0 : continue;
3727 :
3728 448 : if (!head)
3729 : head = skb;
3730 : else
3731 0 : tail->next = skb;
3732 : /* If skb was segmented, skb->prev points to
3733 : * the last segment. If not, it still contains skb.
3734 : */
3735 448 : tail = skb->prev;
3736 : }
3737 448 : return head;
3738 : }
3739 : EXPORT_SYMBOL_GPL(validate_xmit_skb_list);
3740 :
3741 448 : static void qdisc_pkt_len_init(struct sk_buff *skb)
3742 : {
3743 448 : const struct skb_shared_info *shinfo = skb_shinfo(skb);
3744 :
3745 448 : qdisc_skb_cb(skb)->pkt_len = skb->len;
3746 :
3747 : /* To get a more precise estimate of bytes sent on the wire,
3748 : * we add to pkt_len the header size of all segments
3749 : */
3750 448 : if (shinfo->gso_size && skb_transport_header_was_set(skb)) {
3751 0 : unsigned int hdr_len;
3752 0 : u16 gso_segs = shinfo->gso_segs;
3753 :
3754 : /* mac layer + network layer */
3755 0 : hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
3756 :
3757 : /* + transport layer */
3758 0 : if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6))) {
3759 0 : const struct tcphdr *th;
3760 0 : struct tcphdr _tcphdr;
3761 :
3762 0 : th = skb_header_pointer(skb, skb_transport_offset(skb),
3763 : sizeof(_tcphdr), &_tcphdr);
3764 0 : if (likely(th))
3765 0 : hdr_len += __tcp_hdrlen(th);
3766 : } else {
3767 0 : struct udphdr _udphdr;
3768 :
3769 0 : if (skb_header_pointer(skb, skb_transport_offset(skb),
3770 : sizeof(_udphdr), &_udphdr))
3771 0 : hdr_len += sizeof(struct udphdr);
3772 : }
3773 :
3774 0 : if (shinfo->gso_type & SKB_GSO_DODGY)
3775 0 : gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
3776 : shinfo->gso_size);
3777 :
3778 0 : qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
3779 : }
3780 448 : }
3781 :
3782 448 : static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
3783 : struct net_device *dev,
3784 : struct netdev_queue *txq)
3785 : {
3786 448 : spinlock_t *root_lock = qdisc_lock(q);
3787 448 : struct sk_buff *to_free = NULL;
3788 448 : bool contended;
3789 448 : int rc;
3790 :
3791 448 : qdisc_calculate_pkt_len(skb, q);
3792 :
3793 448 : if (q->flags & TCQ_F_NOLOCK) {
3794 448 : rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3795 448 : qdisc_run(q);
3796 :
3797 448 : if (unlikely(to_free))
3798 0 : kfree_skb_list(to_free);
3799 448 : return rc;
3800 : }
3801 :
3802 : /*
3803 : * Heuristic to force contended enqueues to serialize on a
3804 : * separate lock before trying to get the qdisc main lock.
3805 : * This permits the qdisc->running owner to get the lock more
3806 : * often and dequeue packets faster.
3807 : */
3808 0 : contended = qdisc_is_running(q);
3809 0 : if (unlikely(contended))
3810 0 : spin_lock(&q->busylock);
3811 :
3812 0 : spin_lock(root_lock);
3813 0 : if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
3814 0 : __qdisc_drop(skb, &to_free);
3815 0 : rc = NET_XMIT_DROP;
3816 0 : } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
3817 0 : qdisc_run_begin(q)) {
3818 : /*
3819 : * This is a work-conserving queue; there are no old skbs
3820 : * waiting to be sent out; and the qdisc is not running -
3821 : * xmit the skb directly.
3822 : */
3823 :
3824 0 : qdisc_bstats_update(q, skb);
3825 :
3826 0 : if (sch_direct_xmit(skb, q, dev, txq, root_lock, true)) {
3827 0 : if (unlikely(contended)) {
3828 0 : spin_unlock(&q->busylock);
3829 0 : contended = false;
3830 : }
3831 0 : __qdisc_run(q);
3832 : }
3833 :
3834 0 : qdisc_run_end(q);
3835 0 : rc = NET_XMIT_SUCCESS;
3836 : } else {
3837 0 : rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK;
3838 0 : if (qdisc_run_begin(q)) {
3839 0 : if (unlikely(contended)) {
3840 0 : spin_unlock(&q->busylock);
3841 0 : contended = false;
3842 : }
3843 0 : __qdisc_run(q);
3844 0 : qdisc_run_end(q);
3845 : }
3846 : }
3847 0 : spin_unlock(root_lock);
3848 0 : if (unlikely(to_free))
3849 0 : kfree_skb_list(to_free);
3850 0 : if (unlikely(contended))
3851 0 : spin_unlock(&q->busylock);
3852 : return rc;
3853 : }
3854 :
3855 : #if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
3856 : static void skb_update_prio(struct sk_buff *skb)
3857 : {
3858 : const struct netprio_map *map;
3859 : const struct sock *sk;
3860 : unsigned int prioidx;
3861 :
3862 : if (skb->priority)
3863 : return;
3864 : map = rcu_dereference_bh(skb->dev->priomap);
3865 : if (!map)
3866 : return;
3867 : sk = skb_to_full_sk(skb);
3868 : if (!sk)
3869 : return;
3870 :
3871 : prioidx = sock_cgroup_prioidx(&sk->sk_cgrp_data);
3872 :
3873 : if (prioidx < map->priomap_len)
3874 : skb->priority = map->priomap[prioidx];
3875 : }
3876 : #else
3877 : #define skb_update_prio(skb)
3878 : #endif
3879 :
3880 : /**
3881 : * dev_loopback_xmit - loop back @skb
3882 : * @net: network namespace this loopback is happening in
3883 : * @sk: sk needed to be a netfilter okfn
3884 : * @skb: buffer to transmit
3885 : */
3886 0 : int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
3887 : {
3888 0 : skb_reset_mac_header(skb);
3889 0 : __skb_pull(skb, skb_network_offset(skb));
3890 0 : skb->pkt_type = PACKET_LOOPBACK;
3891 0 : skb->ip_summed = CHECKSUM_UNNECESSARY;
3892 0 : WARN_ON(!skb_dst(skb));
3893 0 : skb_dst_force(skb);
3894 0 : netif_rx_ni(skb);
3895 0 : return 0;
3896 : }
3897 : EXPORT_SYMBOL(dev_loopback_xmit);
3898 :
3899 : #ifdef CONFIG_NET_EGRESS
3900 : static struct sk_buff *
3901 : sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
3902 : {
3903 : struct mini_Qdisc *miniq = rcu_dereference_bh(dev->miniq_egress);
3904 : struct tcf_result cl_res;
3905 :
3906 : if (!miniq)
3907 : return skb;
3908 :
3909 : /* qdisc_skb_cb(skb)->pkt_len was already set by the caller. */
3910 : qdisc_skb_cb(skb)->mru = 0;
3911 : qdisc_skb_cb(skb)->post_ct = false;
3912 : mini_qdisc_bstats_cpu_update(miniq, skb);
3913 :
3914 : switch (tcf_classify(skb, miniq->filter_list, &cl_res, false)) {
3915 : case TC_ACT_OK:
3916 : case TC_ACT_RECLASSIFY:
3917 : skb->tc_index = TC_H_MIN(cl_res.classid);
3918 : break;
3919 : case TC_ACT_SHOT:
3920 : mini_qdisc_qstats_cpu_drop(miniq);
3921 : *ret = NET_XMIT_DROP;
3922 : kfree_skb(skb);
3923 : return NULL;
3924 : case TC_ACT_STOLEN:
3925 : case TC_ACT_QUEUED:
3926 : case TC_ACT_TRAP:
3927 : *ret = NET_XMIT_SUCCESS;
3928 : consume_skb(skb);
3929 : return NULL;
3930 : case TC_ACT_REDIRECT:
3931 : /* No need to push/pop skb's mac_header here on egress! */
3932 : skb_do_redirect(skb);
3933 : *ret = NET_XMIT_SUCCESS;
3934 : return NULL;
3935 : default:
3936 : break;
3937 : }
3938 :
3939 : return skb;
3940 : }
3941 : #endif /* CONFIG_NET_EGRESS */
3942 :
3943 : #ifdef CONFIG_XPS
3944 0 : static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb,
3945 : struct xps_dev_maps *dev_maps, unsigned int tci)
3946 : {
3947 0 : struct xps_map *map;
3948 0 : int queue_index = -1;
3949 :
3950 0 : if (dev->num_tc) {
3951 0 : tci *= dev->num_tc;
3952 0 : tci += netdev_get_prio_tc_map(dev, skb->priority);
3953 : }
3954 :
3955 0 : map = rcu_dereference(dev_maps->attr_map[tci]);
3956 0 : if (map) {
3957 0 : if (map->len == 1)
3958 0 : queue_index = map->queues[0];
3959 : else
3960 0 : queue_index = map->queues[reciprocal_scale(
3961 : skb_get_hash(skb), map->len)];
3962 0 : if (unlikely(queue_index >= dev->real_num_tx_queues))
3963 0 : queue_index = -1;
3964 : }
3965 0 : return queue_index;
3966 : }
3967 : #endif
3968 :
3969 0 : static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev,
3970 : struct sk_buff *skb)
3971 : {
3972 : #ifdef CONFIG_XPS
3973 0 : struct xps_dev_maps *dev_maps;
3974 0 : struct sock *sk = skb->sk;
3975 0 : int queue_index = -1;
3976 :
3977 0 : if (!static_key_false(&xps_needed))
3978 : return -1;
3979 :
3980 0 : rcu_read_lock();
3981 0 : if (!static_key_false(&xps_rxqs_needed))
3982 0 : goto get_cpus_map;
3983 :
3984 0 : dev_maps = rcu_dereference(sb_dev->xps_rxqs_map);
3985 0 : if (dev_maps) {
3986 0 : int tci = sk_rx_queue_get(sk);
3987 :
3988 0 : if (tci >= 0 && tci < dev->num_rx_queues)
3989 0 : queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
3990 : tci);
3991 : }
3992 :
3993 0 : get_cpus_map:
3994 0 : if (queue_index < 0) {
3995 0 : dev_maps = rcu_dereference(sb_dev->xps_cpus_map);
3996 0 : if (dev_maps) {
3997 0 : unsigned int tci = skb->sender_cpu - 1;
3998 :
3999 0 : queue_index = __get_xps_queue_idx(dev, skb, dev_maps,
4000 : tci);
4001 : }
4002 : }
4003 0 : rcu_read_unlock();
4004 :
4005 0 : return queue_index;
4006 : #else
4007 : return -1;
4008 : #endif
4009 : }
4010 :
4011 0 : u16 dev_pick_tx_zero(struct net_device *dev, struct sk_buff *skb,
4012 : struct net_device *sb_dev)
4013 : {
4014 0 : return 0;
4015 : }
4016 : EXPORT_SYMBOL(dev_pick_tx_zero);
4017 :
4018 0 : u16 dev_pick_tx_cpu_id(struct net_device *dev, struct sk_buff *skb,
4019 : struct net_device *sb_dev)
4020 : {
4021 0 : return (u16)raw_smp_processor_id() % dev->real_num_tx_queues;
4022 : }
4023 : EXPORT_SYMBOL(dev_pick_tx_cpu_id);
4024 :
4025 0 : u16 netdev_pick_tx(struct net_device *dev, struct sk_buff *skb,
4026 : struct net_device *sb_dev)
4027 : {
4028 0 : struct sock *sk = skb->sk;
4029 0 : int queue_index = sk_tx_queue_get(sk);
4030 :
4031 0 : sb_dev = sb_dev ? : dev;
4032 :
4033 0 : if (queue_index < 0 || skb->ooo_okay ||
4034 0 : queue_index >= dev->real_num_tx_queues) {
4035 0 : int new_index = get_xps_queue(dev, sb_dev, skb);
4036 :
4037 0 : if (new_index < 0)
4038 0 : new_index = skb_tx_hash(dev, sb_dev, skb);
4039 :
4040 0 : if (queue_index != new_index && sk &&
4041 0 : sk_fullsock(sk) &&
4042 0 : rcu_access_pointer(sk->sk_dst_cache))
4043 0 : sk_tx_queue_set(sk, new_index);
4044 :
4045 : queue_index = new_index;
4046 : }
4047 :
4048 0 : return queue_index;
4049 : }
4050 : EXPORT_SYMBOL(netdev_pick_tx);
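
/* Editor's sketch (illustrative only, not part of this file): a multiqueue
 * driver without a special queue-selection policy can delegate its
 * .ndo_select_queue callback to netdev_pick_tx(), which applies XPS and the
 * skb hash exactly as netdev_core_pick_tx() does below. The "foo" name is
 * hypothetical:
 *
 *	static u16 foo_select_queue(struct net_device *dev, struct sk_buff *skb,
 *				    struct net_device *sb_dev)
 *	{
 *		return netdev_pick_tx(dev, skb, sb_dev);
 *	}
 */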
4051 :
4052 448 : struct netdev_queue *netdev_core_pick_tx(struct net_device *dev,
4053 : struct sk_buff *skb,
4054 : struct net_device *sb_dev)
4055 : {
4056 448 : int queue_index = 0;
4057 :
4058 : #ifdef CONFIG_XPS
4059 448 : u32 sender_cpu = skb->sender_cpu - 1;
4060 :
4061 448 : if (sender_cpu >= (u32)NR_CPUS)
4062 448 : skb->sender_cpu = raw_smp_processor_id() + 1;
4063 : #endif
4064 :
4065 448 : if (dev->real_num_tx_queues != 1) {
4066 0 : const struct net_device_ops *ops = dev->netdev_ops;
4067 :
4068 0 : if (ops->ndo_select_queue)
4069 0 : queue_index = ops->ndo_select_queue(dev, skb, sb_dev);
4070 : else
4071 0 : queue_index = netdev_pick_tx(dev, skb, sb_dev);
4072 :
4073 0 : queue_index = netdev_cap_txqueue(dev, queue_index);
4074 : }
4075 :
4076 448 : skb_set_queue_mapping(skb, queue_index);
4077 448 : return netdev_get_tx_queue(dev, queue_index);
4078 : }
4079 :
4080 : /**
4081 : * __dev_queue_xmit - transmit a buffer
4082 : * @skb: buffer to transmit
4083 : * @sb_dev: subordinate device used for L2 forwarding offload
4084 : *
4085 : * Queue a buffer for transmission to a network device. The caller must
4086 : * have set the device and priority and built the buffer before calling
4087 : * this function. The function can be called from an interrupt.
4088 : *
4089 : * A negative errno code is returned on a failure. A success does not
4090 : * guarantee the frame will be transmitted as it may be dropped due
4091 : * to congestion or traffic shaping.
4092 : *
4093 : * -----------------------------------------------------------------------------------
4094 : * I notice this method can also return errors from the queue disciplines,
4095 : * including NET_XMIT_DROP, which is a positive value. So, errors can also
4096 : * be positive.
4097 : *
4098 : * Regardless of the return value, the skb is consumed, so it is currently
4099 : * difficult to retry a send to this method. (You can bump the ref count
4100 : * before sending to hold a reference for retry if you are careful.)
4101 : *
4102 : * When calling this method, interrupts MUST be enabled. This is because
4103 : * the BH enable code must have IRQs enabled so that it will not deadlock.
4104 : * --BLG
4105 : */
4106 448 : static int __dev_queue_xmit(struct sk_buff *skb, struct net_device *sb_dev)
4107 : {
4108 448 : struct net_device *dev = skb->dev;
4109 448 : struct netdev_queue *txq;
4110 448 : struct Qdisc *q;
4111 448 : int rc = -ENOMEM;
4112 448 : bool again = false;
4113 :
4114 448 : skb_reset_mac_header(skb);
4115 :
4116 448 : if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_SCHED_TSTAMP))
4117 0 : __skb_tstamp_tx(skb, NULL, NULL, skb->sk, SCM_TSTAMP_SCHED);
4118 :
4119 : /* Disable soft irqs for various locks below. Also
4120 : * stops preemption for RCU.
4121 : */
4122 448 : rcu_read_lock_bh();
4123 :
4124 448 : skb_update_prio(skb);
4125 :
4126 448 : qdisc_pkt_len_init(skb);
4127 : #ifdef CONFIG_NET_CLS_ACT
4128 : skb->tc_at_ingress = 0;
4129 : # ifdef CONFIG_NET_EGRESS
4130 : if (static_branch_unlikely(&egress_needed_key)) {
4131 : skb = sch_handle_egress(skb, &rc, dev);
4132 : if (!skb)
4133 : goto out;
4134 : }
4135 : # endif
4136 : #endif
4137 : /* If device/qdisc don't need skb->dst, release it right now while
4138 : * it's hot in this cpu cache.
4139 : */
4140 448 : if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
4141 448 : skb_dst_drop(skb);
4142 : else
4143 0 : skb_dst_force(skb);
4144 :
4145 448 : txq = netdev_core_pick_tx(dev, skb, sb_dev);
4146 448 : q = rcu_dereference_bh(txq->qdisc);
4147 :
4148 448 : trace_net_dev_queue(skb);
4149 448 : if (q->enqueue) {
4150 448 : rc = __dev_xmit_skb(skb, q, dev, txq);
4151 448 : goto out;
4152 : }
4153 :
4154 : /* The device has no queue. Common case for software devices:
4155 : * loopback, all the sorts of tunnels...
4156 :
4157 : * Really, it is unlikely that netif_tx_lock protection is necessary
4158 : * here. (E.g. loopback and IP tunnels are clean, ignoring statistics
4159 : * counters.)
4160 : * However, it is possible that they rely on the protection
4161 : * provided here.
4162 :
4163 : * Check this and take the lock; it is not prone to deadlocks.
4164 : * Or shoot the noqueue qdisc, which is even simpler 8)
4165 : */
4166 0 : if (dev->flags & IFF_UP) {
4167 0 : int cpu = smp_processor_id(); /* ok because BHs are off */
4168 :
4169 0 : if (txq->xmit_lock_owner != cpu) {
4170 0 : if (dev_xmit_recursion())
4171 0 : goto recursion_alert;
4172 :
4173 0 : skb = validate_xmit_skb(skb, dev, &again);
4174 0 : if (!skb)
4175 0 : goto out;
4176 :
4177 0 : PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
4178 0 : HARD_TX_LOCK(dev, txq, cpu);
4179 :
4180 0 : if (!netif_xmit_stopped(txq)) {
4181 0 : dev_xmit_recursion_inc();
4182 0 : skb = dev_hard_start_xmit(skb, dev, txq, &rc);
4183 0 : dev_xmit_recursion_dec();
4184 0 : if (dev_xmit_complete(rc)) {
4185 0 : HARD_TX_UNLOCK(dev, txq);
4186 0 : goto out;
4187 : }
4188 : }
4189 0 : HARD_TX_UNLOCK(dev, txq);
4190 0 : net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
4191 : dev->name);
4192 : } else {
4193 : /* Recursion is detected! It is possible,
4194 : * unfortunately
4195 : */
4196 0 : recursion_alert:
4197 0 : net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
4198 : dev->name);
4199 : }
4200 : }
4201 :
4202 0 : rc = -ENETDOWN;
4203 0 : rcu_read_unlock_bh();
4204 :
4205 0 : atomic_long_inc(&dev->tx_dropped);
4206 0 : kfree_skb_list(skb);
4207 0 : return rc;
4208 448 : out:
4209 448 : rcu_read_unlock_bh();
4210 448 : return rc;
4211 : }
4212 :
4213 448 : int dev_queue_xmit(struct sk_buff *skb)
4214 : {
4215 448 : return __dev_queue_xmit(skb, NULL);
4216 : }
4217 : EXPORT_SYMBOL(dev_queue_xmit);
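
/* Editor's sketch (illustrative only, not part of this file): the transmit
 * path as seen from a simple protocol or tunnel. The caller sets skb->dev and
 * skb->protocol and must already have built any link-layer header; as the
 * kernel-doc above notes, the skb is consumed whatever the return value. The
 * "foo" names and the ETH_P_FOO ethertype are hypothetical:
 *
 *	static int foo_send(struct net_device *dev, struct sk_buff *skb)
 *	{
 *		skb->dev = dev;
 *		skb->protocol = htons(ETH_P_FOO);
 *		skb_reset_network_header(skb);
 *		return dev_queue_xmit(skb);
 *	}
 */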
4218 :
4219 0 : int dev_queue_xmit_accel(struct sk_buff *skb, struct net_device *sb_dev)
4220 : {
4221 0 : return __dev_queue_xmit(skb, sb_dev);
4222 : }
4223 : EXPORT_SYMBOL(dev_queue_xmit_accel);
4224 :
4225 0 : int __dev_direct_xmit(struct sk_buff *skb, u16 queue_id)
4226 : {
4227 0 : struct net_device *dev = skb->dev;
4228 0 : struct sk_buff *orig_skb = skb;
4229 0 : struct netdev_queue *txq;
4230 0 : int ret = NETDEV_TX_BUSY;
4231 0 : bool again = false;
4232 :
4233 0 : if (unlikely(!netif_running(dev) ||
4234 : !netif_carrier_ok(dev)))
4235 0 : goto drop;
4236 :
4237 0 : skb = validate_xmit_skb_list(skb, dev, &again);
4238 0 : if (skb != orig_skb)
4239 0 : goto drop;
4240 :
4241 0 : skb_set_queue_mapping(skb, queue_id);
4242 0 : txq = skb_get_tx_queue(dev, skb);
4243 0 : PRANDOM_ADD_NOISE(skb, dev, txq, jiffies);
4244 :
4245 0 : local_bh_disable();
4246 :
4247 0 : dev_xmit_recursion_inc();
4248 0 : HARD_TX_LOCK(dev, txq, smp_processor_id());
4249 0 : if (!netif_xmit_frozen_or_drv_stopped(txq))
4250 0 : ret = netdev_start_xmit(skb, dev, txq, false);
4251 0 : HARD_TX_UNLOCK(dev, txq);
4252 0 : dev_xmit_recursion_dec();
4253 :
4254 0 : local_bh_enable();
4255 0 : return ret;
4256 0 : drop:
4257 0 : atomic_long_inc(&dev->tx_dropped);
4258 0 : kfree_skb_list(skb);
4259 0 : return NET_XMIT_DROP;
4260 : }
4261 : EXPORT_SYMBOL(__dev_direct_xmit);
4262 :
4263 : /*************************************************************************
4264 : * Receiver routines
4265 : *************************************************************************/
4266 :
4267 : int netdev_max_backlog __read_mostly = 1000;
4268 : EXPORT_SYMBOL(netdev_max_backlog);
4269 :
4270 : int netdev_tstamp_prequeue __read_mostly = 1;
4271 : int netdev_budget __read_mostly = 300;
4272 : /* Must be at least 2 jiffies to guarantee a 1 jiffy timeout */
4273 : unsigned int __read_mostly netdev_budget_usecs = 2 * USEC_PER_SEC / HZ;
4274 : int weight_p __read_mostly = 64; /* old backlog weight */
4275 : int dev_weight_rx_bias __read_mostly = 1; /* bias for backlog weight */
4276 : int dev_weight_tx_bias __read_mostly = 1; /* bias for output_queue quota */
4277 : int dev_rx_weight __read_mostly = 64;
4278 : int dev_tx_weight __read_mostly = 64;
4279 : /* Maximum number of GRO_NORMAL skbs to batch up for list-RX */
4280 : int gro_normal_batch __read_mostly = 8;
4281 :
4282 : /* Called with irq disabled */
4283 855 : static inline void ____napi_schedule(struct softnet_data *sd,
4284 : struct napi_struct *napi)
4285 : {
4286 855 : struct task_struct *thread;
4287 :
4288 855 : if (test_bit(NAPI_STATE_THREADED, &napi->state)) {
4289 : /* Paired with smp_mb__before_atomic() in
4290 : * napi_enable()/dev_set_threaded().
4291 : * Use READ_ONCE() to guarantee a complete
4292 : * read on napi->thread. Only call
4293 : * wake_up_process() when it's not NULL.
4294 : */
4295 0 : thread = READ_ONCE(napi->thread);
4296 0 : if (thread) {
4297 0 : wake_up_process(thread);
4298 0 : return;
4299 : }
4300 : }
4301 :
4302 855 : list_add_tail(&napi->poll_list, &sd->poll_list);
4303 855 : __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4304 : }
4305 :
4306 : #ifdef CONFIG_RPS
4307 :
4308 : /* One global table that all flow-based protocols share. */
4309 : struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
4310 : EXPORT_SYMBOL(rps_sock_flow_table);
4311 : u32 rps_cpu_mask __read_mostly;
4312 : EXPORT_SYMBOL(rps_cpu_mask);
4313 :
4314 : struct static_key_false rps_needed __read_mostly;
4315 : EXPORT_SYMBOL(rps_needed);
4316 : struct static_key_false rfs_needed __read_mostly;
4317 : EXPORT_SYMBOL(rfs_needed);
4318 :
4319 : static struct rps_dev_flow *
4320 0 : set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4321 : struct rps_dev_flow *rflow, u16 next_cpu)
4322 : {
4323 0 : if (next_cpu < nr_cpu_ids) {
4324 : #ifdef CONFIG_RFS_ACCEL
4325 0 : struct netdev_rx_queue *rxqueue;
4326 0 : struct rps_dev_flow_table *flow_table;
4327 0 : struct rps_dev_flow *old_rflow;
4328 0 : u32 flow_id;
4329 0 : u16 rxq_index;
4330 0 : int rc;
4331 :
4332 : /* Should we steer this flow to a different hardware queue? */
4333 0 : if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
4334 0 : !(dev->features & NETIF_F_NTUPLE))
4335 0 : goto out;
4336 0 : rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
4337 0 : if (rxq_index == skb_get_rx_queue(skb))
4338 0 : goto out;
4339 :
4340 0 : rxqueue = dev->_rx + rxq_index;
4341 0 : flow_table = rcu_dereference(rxqueue->rps_flow_table);
4342 0 : if (!flow_table)
4343 0 : goto out;
4344 0 : flow_id = skb_get_hash(skb) & flow_table->mask;
4345 0 : rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
4346 : rxq_index, flow_id);
4347 0 : if (rc < 0)
4348 0 : goto out;
4349 0 : old_rflow = rflow;
4350 0 : rflow = &flow_table->flows[flow_id];
4351 0 : rflow->filter = rc;
4352 0 : if (old_rflow->filter == rflow->filter)
4353 0 : old_rflow->filter = RPS_NO_FILTER;
4354 0 : out:
4355 : #endif
4356 0 : rflow->last_qtail =
4357 0 : per_cpu(softnet_data, next_cpu).input_queue_head;
4358 : }
4359 :
4360 0 : rflow->cpu = next_cpu;
4361 0 : return rflow;
4362 : }
4363 :
4364 : /*
4365 : * get_rps_cpu is called from netif_receive_skb and returns the target
4366 : * CPU from the RPS map of the receiving queue for a given skb.
4367 : * rcu_read_lock must be held on entry.
4368 : */
4369 0 : static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
4370 : struct rps_dev_flow **rflowp)
4371 : {
4372 0 : const struct rps_sock_flow_table *sock_flow_table;
4373 0 : struct netdev_rx_queue *rxqueue = dev->_rx;
4374 0 : struct rps_dev_flow_table *flow_table;
4375 0 : struct rps_map *map;
4376 0 : int cpu = -1;
4377 0 : u32 tcpu;
4378 0 : u32 hash;
4379 :
4380 0 : if (skb_rx_queue_recorded(skb)) {
4381 0 : u16 index = skb_get_rx_queue(skb);
4382 :
4383 0 : if (unlikely(index >= dev->real_num_rx_queues)) {
4384 0 : WARN_ONCE(dev->real_num_rx_queues > 1,
4385 : "%s received packet on queue %u, but number "
4386 : "of RX queues is %u\n",
4387 : dev->name, index, dev->real_num_rx_queues);
4388 0 : goto done;
4389 : }
4390 0 : rxqueue += index;
4391 : }
4392 :
4393 : /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
4394 :
4395 0 : flow_table = rcu_dereference(rxqueue->rps_flow_table);
4396 0 : map = rcu_dereference(rxqueue->rps_map);
4397 0 : if (!flow_table && !map)
4398 0 : goto done;
4399 :
4400 0 : skb_reset_network_header(skb);
4401 0 : hash = skb_get_hash(skb);
4402 0 : if (!hash)
4403 0 : goto done;
4404 :
4405 0 : sock_flow_table = rcu_dereference(rps_sock_flow_table);
4406 0 : if (flow_table && sock_flow_table) {
4407 0 : struct rps_dev_flow *rflow;
4408 0 : u32 next_cpu;
4409 0 : u32 ident;
4410 :
4411 : /* First check into global flow table if there is a match */
4412 0 : ident = sock_flow_table->ents[hash & sock_flow_table->mask];
4413 0 : if ((ident ^ hash) & ~rps_cpu_mask)
4414 0 : goto try_rps;
4415 :
4416 0 : next_cpu = ident & rps_cpu_mask;
4417 :
4418 : /* OK, now we know there is a match,
4419 : * we can look at the local (per receive queue) flow table
4420 : */
4421 0 : rflow = &flow_table->flows[hash & flow_table->mask];
4422 0 : tcpu = rflow->cpu;
4423 :
4424 : /*
4425 : * If the desired CPU (where last recvmsg was done) is
4426 : * different from current CPU (one in the rx-queue flow
4427 : * table entry), switch if one of the following holds:
4428 : * - Current CPU is unset (>= nr_cpu_ids).
4429 : * - Current CPU is offline.
4430 : * - The current CPU's queue tail has advanced beyond the
4431 : * last packet that was enqueued using this table entry.
4432 : * This guarantees that all previous packets for the flow
4433 : * have been dequeued, thus preserving in order delivery.
4434 : */
4435 0 : if (unlikely(tcpu != next_cpu) &&
4436 0 : (tcpu >= nr_cpu_ids || !cpu_online(tcpu) ||
4437 0 : ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
4438 0 : rflow->last_qtail)) >= 0)) {
4439 0 : tcpu = next_cpu;
4440 0 : rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
4441 : }
4442 :
4443 0 : if (tcpu < nr_cpu_ids && cpu_online(tcpu)) {
4444 0 : *rflowp = rflow;
4445 0 : cpu = tcpu;
4446 0 : goto done;
4447 : }
4448 : }
4449 :
4450 0 : try_rps:
4451 :
4452 0 : if (map) {
4453 0 : tcpu = map->cpus[reciprocal_scale(hash, map->len)];
4454 0 : if (cpu_online(tcpu)) {
4455 0 : cpu = tcpu;
4456 0 : goto done;
4457 : }
4458 : }
4459 :
4460 0 : done:
4461 0 : return cpu;
4462 : }
4463 :
4464 : #ifdef CONFIG_RFS_ACCEL
4465 :
4466 : /**
4467 : * rps_may_expire_flow - check whether an RFS hardware filter may be removed
4468 : * @dev: Device on which the filter was set
4469 : * @rxq_index: RX queue index
4470 : * @flow_id: Flow ID passed to ndo_rx_flow_steer()
4471 : * @filter_id: Filter ID returned by ndo_rx_flow_steer()
4472 : *
4473 : * Drivers that implement ndo_rx_flow_steer() should periodically call
4474 : * this function for each installed filter and remove the filters for
4475 : * which it returns %true.
4476 : */
4477 0 : bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
4478 : u32 flow_id, u16 filter_id)
4479 : {
4480 0 : struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
4481 0 : struct rps_dev_flow_table *flow_table;
4482 0 : struct rps_dev_flow *rflow;
4483 0 : bool expire = true;
4484 0 : unsigned int cpu;
4485 :
4486 0 : rcu_read_lock();
4487 0 : flow_table = rcu_dereference(rxqueue->rps_flow_table);
4488 0 : if (flow_table && flow_id <= flow_table->mask) {
4489 0 : rflow = &flow_table->flows[flow_id];
4490 0 : cpu = READ_ONCE(rflow->cpu);
4491 0 : if (rflow->filter == filter_id && cpu < nr_cpu_ids &&
4492 0 : ((int)(per_cpu(softnet_data, cpu).input_queue_head -
4493 0 : rflow->last_qtail) <
4494 0 : (int)(10 * flow_table->mask)))
4495 0 : expire = false;
4496 : }
4497 0 : rcu_read_unlock();
4498 0 : return expire;
4499 : }
4500 : EXPORT_SYMBOL(rps_may_expire_flow);
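
/* Editor's sketch (illustrative only, not part of this file): the periodic
 * expiry scan suggested by the kernel-doc above. The filter table and the
 * routine that removes a hardware filter ("foo_*") are hypothetical; only
 * rps_may_expire_flow() is real:
 *
 *	static void foo_expire_rfs_filters(struct foo_priv *priv)
 *	{
 *		unsigned int i;
 *
 *		for (i = 0; i < priv->n_filters; i++) {
 *			struct foo_filter *f = &priv->filters[i];
 *
 *			if (f->installed &&
 *			    rps_may_expire_flow(priv->netdev, f->rxq_index,
 *						f->flow_id, f->filter_id)) {
 *				foo_remove_hw_filter(priv, f);
 *				f->installed = false;
 *			}
 *		}
 *	}
 */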
4501 :
4502 : #endif /* CONFIG_RFS_ACCEL */
4503 :
4504 : /* Called from hardirq (IPI) context */
4505 0 : static void rps_trigger_softirq(void *data)
4506 : {
4507 0 : struct softnet_data *sd = data;
4508 :
4509 0 : ____napi_schedule(sd, &sd->backlog);
4510 0 : sd->received_rps++;
4511 0 : }
4512 :
4513 : #endif /* CONFIG_RPS */
4514 :
4515 : /*
4516 : * Check if this softnet_data structure belongs to another CPU.
4517 : * If yes, queue it on our IPI list and return 1;
4518 : * if no, return 0.
4519 : */
4520 0 : static int rps_ipi_queued(struct softnet_data *sd)
4521 : {
4522 : #ifdef CONFIG_RPS
4523 0 : struct softnet_data *mysd = this_cpu_ptr(&softnet_data);
4524 :
4525 0 : if (sd != mysd) {
4526 0 : sd->rps_ipi_next = mysd->rps_ipi_list;
4527 0 : mysd->rps_ipi_list = sd;
4528 :
4529 0 : __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4530 0 : return 1;
4531 : }
4532 : #endif /* CONFIG_RPS */
4533 : return 0;
4534 : }
4535 :
4536 : #ifdef CONFIG_NET_FLOW_LIMIT
4537 : int netdev_flow_limit_table_len __read_mostly = (1 << 12);
4538 : #endif
4539 :
4540 0 : static bool skb_flow_limit(struct sk_buff *skb, unsigned int qlen)
4541 : {
4542 : #ifdef CONFIG_NET_FLOW_LIMIT
4543 0 : struct sd_flow_limit *fl;
4544 0 : struct softnet_data *sd;
4545 0 : unsigned int old_flow, new_flow;
4546 :
4547 0 : if (qlen < (netdev_max_backlog >> 1))
4548 : return false;
4549 :
4550 0 : sd = this_cpu_ptr(&softnet_data);
4551 :
4552 0 : rcu_read_lock();
4553 0 : fl = rcu_dereference(sd->flow_limit);
4554 0 : if (fl) {
4555 0 : new_flow = skb_get_hash(skb) & (fl->num_buckets - 1);
4556 0 : old_flow = fl->history[fl->history_head];
4557 0 : fl->history[fl->history_head] = new_flow;
4558 :
4559 0 : fl->history_head++;
4560 0 : fl->history_head &= FLOW_LIMIT_HISTORY - 1;
4561 :
4562 0 : if (likely(fl->buckets[old_flow]))
4563 0 : fl->buckets[old_flow]--;
4564 :
4565 0 : if (++fl->buckets[new_flow] > (FLOW_LIMIT_HISTORY >> 1)) {
4566 0 : fl->count++;
4567 0 : rcu_read_unlock();
4568 0 : return true;
4569 : }
4570 : }
4571 0 : rcu_read_unlock();
4572 : #endif
4573 0 : return false;
4574 : }
4575 :
4576 : /*
4577 : * enqueue_to_backlog is called to queue an skb to a per CPU backlog
4578 : * queue (may be a remote CPU queue).
4579 : */
4580 0 : static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
4581 : unsigned int *qtail)
4582 : {
4583 0 : struct softnet_data *sd;
4584 0 : unsigned long flags;
4585 0 : unsigned int qlen;
4586 :
4587 0 : sd = &per_cpu(softnet_data, cpu);
4588 :
4589 0 : local_irq_save(flags);
4590 :
4591 0 : rps_lock(sd);
4592 0 : if (!netif_running(skb->dev))
4593 0 : goto drop;
4594 0 : qlen = skb_queue_len(&sd->input_pkt_queue);
4595 0 : if (qlen <= netdev_max_backlog && !skb_flow_limit(skb, qlen)) {
4596 0 : if (qlen) {
4597 0 : enqueue:
4598 0 : __skb_queue_tail(&sd->input_pkt_queue, skb);
4599 0 : input_queue_tail_incr_save(sd, qtail);
4600 0 : rps_unlock(sd);
4601 0 : local_irq_restore(flags);
4602 0 : return NET_RX_SUCCESS;
4603 : }
4604 :
4605 : /* Schedule NAPI for backlog device
4606 : * We can use a non-atomic operation since we own the queue lock
4607 : */
4608 0 : if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
4609 0 : if (!rps_ipi_queued(sd))
4610 0 : ____napi_schedule(sd, &sd->backlog);
4611 : }
4612 0 : goto enqueue;
4613 : }
4614 :
4615 0 : drop:
4616 0 : sd->dropped++;
4617 0 : rps_unlock(sd);
4618 :
4619 0 : local_irq_restore(flags);
4620 :
4621 0 : atomic_long_inc(&skb->dev->rx_dropped);
4622 0 : kfree_skb(skb);
4623 0 : return NET_RX_DROP;
4624 : }
4625 :
4626 0 : static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb)
4627 : {
4628 0 : struct net_device *dev = skb->dev;
4629 0 : struct netdev_rx_queue *rxqueue;
4630 :
4631 0 : rxqueue = dev->_rx;
4632 :
4633 0 : if (skb_rx_queue_recorded(skb)) {
4634 0 : u16 index = skb_get_rx_queue(skb);
4635 :
4636 0 : if (unlikely(index >= dev->real_num_rx_queues)) {
4637 0 : WARN_ONCE(dev->real_num_rx_queues > 1,
4638 : "%s received packet on queue %u, but number "
4639 : "of RX queues is %u\n",
4640 : dev->name, index, dev->real_num_rx_queues);
4641 :
4642 0 : return rxqueue; /* Return first rxqueue */
4643 : }
4644 0 : rxqueue += index;
4645 : }
4646 : return rxqueue;
4647 : }
4648 :
4649 0 : static u32 netif_receive_generic_xdp(struct sk_buff *skb,
4650 : struct xdp_buff *xdp,
4651 : struct bpf_prog *xdp_prog)
4652 : {
4653 0 : void *orig_data, *orig_data_end, *hard_start;
4654 0 : struct netdev_rx_queue *rxqueue;
4655 0 : u32 metalen, act = XDP_DROP;
4656 0 : u32 mac_len, frame_sz;
4657 0 : __be16 orig_eth_type;
4658 0 : struct ethhdr *eth;
4659 0 : bool orig_bcast;
4660 0 : int off;
4661 :
4662 : /* Reinjected packets coming from act_mirred or similar should
4663 : * not get XDP generic processing.
4664 : */
4665 0 : if (skb_is_redirected(skb))
4666 : return XDP_PASS;
4667 :
4668 : /* XDP packets must be linear and must have sufficient headroom
4669 : * of XDP_PACKET_HEADROOM bytes. This is the guarantee that native
4670 : * XDP also provides, so we need to enforce it here as well.
4671 : */
4672 0 : if (skb_cloned(skb) || skb_is_nonlinear(skb) ||
4673 0 : skb_headroom(skb) < XDP_PACKET_HEADROOM) {
4674 0 : int hroom = XDP_PACKET_HEADROOM - skb_headroom(skb);
4675 0 : int troom = skb->tail + skb->data_len - skb->end;
4676 :
4677 : /* In case we have to go down the path and also linearize,
4678 : * then let's do the pskb_expand_head() work just once here.
4679 : */
4680 0 : if (pskb_expand_head(skb,
4681 0 : hroom > 0 ? ALIGN(hroom, NET_SKB_PAD) : 0,
4682 : troom > 0 ? troom + 128 : 0, GFP_ATOMIC))
4683 0 : goto do_drop;
4684 0 : if (skb_linearize(skb))
4685 0 : goto do_drop;
4686 : }
4687 :
4688 : /* The XDP program wants to see the packet starting at the MAC
4689 : * header.
4690 : */
4691 0 : mac_len = skb->data - skb_mac_header(skb);
4692 0 : hard_start = skb->data - skb_headroom(skb);
4693 :
4694 : /* SKB "head" area always has tailroom for skb_shared_info */
4695 0 : frame_sz = (void *)skb_end_pointer(skb) - hard_start;
4696 0 : frame_sz += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
4697 :
4698 0 : rxqueue = netif_get_rxqueue(skb);
4699 0 : xdp_init_buff(xdp, frame_sz, &rxqueue->xdp_rxq);
4700 0 : xdp_prepare_buff(xdp, hard_start, skb_headroom(skb) - mac_len,
4701 0 : skb_headlen(skb) + mac_len, true);
4702 :
4703 0 : orig_data_end = xdp->data_end;
4704 0 : orig_data = xdp->data;
4705 0 : eth = (struct ethhdr *)xdp->data;
4706 0 : orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest);
4707 0 : orig_eth_type = eth->h_proto;
4708 :
4709 0 : act = bpf_prog_run_xdp(xdp_prog, xdp);
4710 :
4711 : /* check if bpf_xdp_adjust_head was used */
4712 0 : off = xdp->data - orig_data;
4713 0 : if (off) {
4714 0 : if (off > 0)
4715 0 : __skb_pull(skb, off);
4716 0 : else if (off < 0)
4717 0 : __skb_push(skb, -off);
4718 :
4719 0 : skb->mac_header += off;
4720 0 : skb_reset_network_header(skb);
4721 : }
4722 :
4723 : /* check if bpf_xdp_adjust_tail was used */
4724 0 : off = xdp->data_end - orig_data_end;
4725 0 : if (off != 0) {
4726 0 : skb_set_tail_pointer(skb, xdp->data_end - xdp->data);
4727 0 : skb->len += off; /* positive on grow, negative on shrink */
4728 : }
4729 :
4730 : /* check if XDP changed the eth hdr such that the SKB needs an update */
4731 0 : eth = (struct ethhdr *)xdp->data;
4732 0 : if ((orig_eth_type != eth->h_proto) ||
4733 0 : (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) {
4734 0 : __skb_push(skb, ETH_HLEN);
4735 0 : skb->protocol = eth_type_trans(skb, skb->dev);
4736 : }
4737 :
4738 0 : switch (act) {
4739 : case XDP_REDIRECT:
4740 : case XDP_TX:
4741 0 : __skb_push(skb, mac_len);
4742 : break;
4743 0 : case XDP_PASS:
4744 0 : metalen = xdp->data - xdp->data_meta;
4745 0 : if (metalen)
4746 0 : skb_metadata_set(skb, metalen);
4747 : break;
4748 0 : default:
4749 0 : bpf_warn_invalid_xdp_action(act);
4750 0 : fallthrough;
4751 0 : case XDP_ABORTED:
4752 0 : trace_xdp_exception(skb->dev, xdp_prog, act);
4753 0 : fallthrough;
4754 : case XDP_DROP:
4755 0 : do_drop:
4756 0 : kfree_skb(skb);
4757 0 : break;
4758 : }
4759 :
4760 0 : return act;
4761 : }
4762 :
4763 : /* When doing generic XDP we have to bypass the qdisc layer and the
4764 : * network taps in order to match in-driver-XDP behavior.
4765 : */
4766 0 : void generic_xdp_tx(struct sk_buff *skb, struct bpf_prog *xdp_prog)
4767 : {
4768 0 : struct net_device *dev = skb->dev;
4769 0 : struct netdev_queue *txq;
4770 0 : bool free_skb = true;
4771 0 : int cpu, rc;
4772 :
4773 0 : txq = netdev_core_pick_tx(dev, skb, NULL);
4774 0 : cpu = smp_processor_id();
4775 0 : HARD_TX_LOCK(dev, txq, cpu);
4776 0 : if (!netif_xmit_stopped(txq)) {
4777 0 : rc = netdev_start_xmit(skb, dev, txq, 0);
4778 0 : if (dev_xmit_complete(rc))
4779 : free_skb = false;
4780 : }
4781 0 : HARD_TX_UNLOCK(dev, txq);
4782 0 : if (free_skb) {
4783 0 : trace_xdp_exception(dev, xdp_prog, XDP_TX);
4784 0 : kfree_skb(skb);
4785 : }
4786 0 : }
4787 :
4788 : static DEFINE_STATIC_KEY_FALSE(generic_xdp_needed_key);
4789 :
4790 0 : int do_xdp_generic(struct bpf_prog *xdp_prog, struct sk_buff *skb)
4791 : {
4792 0 : if (xdp_prog) {
4793 0 : struct xdp_buff xdp;
4794 0 : u32 act;
4795 0 : int err;
4796 :
4797 0 : act = netif_receive_generic_xdp(skb, &xdp, xdp_prog);
4798 0 : if (act != XDP_PASS) {
4799 0 : switch (act) {
4800 0 : case XDP_REDIRECT:
4801 0 : err = xdp_do_generic_redirect(skb->dev, skb,
4802 : &xdp, xdp_prog);
4803 0 : if (err)
4804 0 : goto out_redir;
4805 : break;
4806 0 : case XDP_TX:
4807 0 : generic_xdp_tx(skb, xdp_prog);
4808 0 : break;
4809 : }
4810 0 : return XDP_DROP;
4811 : }
4812 : }
4813 : return XDP_PASS;
4814 0 : out_redir:
4815 0 : kfree_skb(skb);
4816 0 : return XDP_DROP;
4817 : }
4818 : EXPORT_SYMBOL_GPL(do_xdp_generic);
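
/* Editor's sketch (illustrative only, not part of this file): how a software
 * device such as tun can run the generic XDP hook on an skb it is about to
 * inject into the stack. If do_xdp_generic() returns anything other than
 * XDP_PASS, the skb has already been dropped, transmitted or redirected and
 * must not be touched again. "foo" is a hypothetical driver private struct:
 *
 *	local_bh_disable();
 *	rcu_read_lock();
 *	xdp_prog = rcu_dereference(foo->xdp_prog);
 *	if (xdp_prog && do_xdp_generic(xdp_prog, skb) != XDP_PASS)
 *		skb = NULL;
 *	rcu_read_unlock();
 *	local_bh_enable();
 *	if (skb)
 *		netif_rx_ni(skb);
 */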
4819 :
4820 0 : static int netif_rx_internal(struct sk_buff *skb)
4821 : {
4822 0 : int ret;
4823 :
4824 0 : net_timestamp_check(netdev_tstamp_prequeue, skb);
4825 :
4826 0 : trace_netif_rx(skb);
4827 :
4828 : #ifdef CONFIG_RPS
4829 0 : if (static_branch_unlikely(&rps_needed)) {
4830 0 : struct rps_dev_flow voidflow, *rflow = &voidflow;
4831 0 : int cpu;
4832 :
4833 0 : preempt_disable();
4834 0 : rcu_read_lock();
4835 :
4836 0 : cpu = get_rps_cpu(skb->dev, skb, &rflow);
4837 0 : if (cpu < 0)
4838 0 : cpu = smp_processor_id();
4839 :
4840 0 : ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
4841 :
4842 0 : rcu_read_unlock();
4843 0 : preempt_enable();
4844 : } else
4845 : #endif
4846 : {
4847 0 : unsigned int qtail;
4848 :
4849 0 : ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
4850 0 : put_cpu();
4851 : }
4852 0 : return ret;
4853 : }
4854 :
4855 : /**
4856 : * netif_rx - post buffer to the network code
4857 : * @skb: buffer to post
4858 : *
4859 : * This function receives a packet from a device driver and queues it for
4860 : * the upper (protocol) levels to process. It always succeeds. The buffer
4861 : * may be dropped during processing for congestion control or by the
4862 : * protocol layers.
4863 : *
4864 : * return values:
4865 : * NET_RX_SUCCESS (no congestion)
4866 : * NET_RX_DROP (packet was dropped)
4867 : *
4868 : */
4869 :
4870 0 : int netif_rx(struct sk_buff *skb)
4871 : {
4872 0 : int ret;
4873 :
4874 0 : trace_netif_rx_entry(skb);
4875 :
4876 0 : ret = netif_rx_internal(skb);
4877 0 : trace_netif_rx_exit(ret);
4878 :
4879 0 : return ret;
4880 : }
4881 : EXPORT_SYMBOL(netif_rx);
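
/* Editor's sketch (illustrative only, not part of this file): the classic
 * non-NAPI receive path of a driver interrupt handler, which builds an skb
 * and posts it to the backlog with netif_rx(). The "foo" names are
 * hypothetical:
 *
 *	static void foo_rx_packet(struct net_device *dev, void *buf, int len)
 *	{
 *		struct sk_buff *skb = netdev_alloc_skb(dev, len);
 *
 *		if (!skb)
 *			return;
 *		skb_put_data(skb, buf, len);
 *		skb->protocol = eth_type_trans(skb, dev);
 *		netif_rx(skb);
 *	}
 */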
4882 :
4883 0 : int netif_rx_ni(struct sk_buff *skb)
4884 : {
4885 0 : int err;
4886 :
4887 0 : trace_netif_rx_ni_entry(skb);
4888 :
4889 0 : preempt_disable();
4890 0 : err = netif_rx_internal(skb);
4891 0 : if (local_softirq_pending())
4892 0 : do_softirq();
4893 0 : preempt_enable();
4894 0 : trace_netif_rx_ni_exit(err);
4895 :
4896 0 : return err;
4897 : }
4898 : EXPORT_SYMBOL(netif_rx_ni);
4899 :
4900 0 : int netif_rx_any_context(struct sk_buff *skb)
4901 : {
4902 : /*
4903 : * If invoked from contexts which do not invoke bottom half
4904 : * processing either at return from interrupt or when softirqs are
4905 : * re-enabled, use netif_rx_ni(), which invokes bottom half processing
4906 : * directly.
4907 : */
4908 0 : if (in_interrupt())
4909 0 : return netif_rx(skb);
4910 : else
4911 0 : return netif_rx_ni(skb);
4912 : }
4913 : EXPORT_SYMBOL(netif_rx_any_context);
4914 :
4915 0 : static __latent_entropy void net_tx_action(struct softirq_action *h)
4916 : {
4917 0 : struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4918 :
4919 0 : if (sd->completion_queue) {
4920 0 : struct sk_buff *clist;
4921 :
4922 0 : local_irq_disable();
4923 0 : clist = sd->completion_queue;
4924 0 : sd->completion_queue = NULL;
4925 0 : local_irq_enable();
4926 :
4927 0 : while (clist) {
4928 0 : struct sk_buff *skb = clist;
4929 :
4930 0 : clist = clist->next;
4931 :
4932 0 : WARN_ON(refcount_read(&skb->users));
4933 0 : if (likely(get_kfree_skb_cb(skb)->reason == SKB_REASON_CONSUMED))
4934 0 : trace_consume_skb(skb);
4935 : else
4936 0 : trace_kfree_skb(skb, net_tx_action);
4937 :
4938 0 : if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
4939 0 : __kfree_skb(skb);
4940 : else
4941 0 : __kfree_skb_defer(skb);
4942 : }
4943 : }
4944 :
4945 0 : if (sd->output_queue) {
4946 0 : struct Qdisc *head;
4947 :
4948 0 : local_irq_disable();
4949 0 : head = sd->output_queue;
4950 0 : sd->output_queue = NULL;
4951 0 : sd->output_queue_tailp = &sd->output_queue;
4952 0 : local_irq_enable();
4953 :
4954 0 : while (head) {
4955 0 : struct Qdisc *q = head;
4956 0 : spinlock_t *root_lock = NULL;
4957 :
4958 0 : head = head->next_sched;
4959 :
4960 0 : if (!(q->flags & TCQ_F_NOLOCK)) {
4961 0 : root_lock = qdisc_lock(q);
4962 0 : spin_lock(root_lock);
4963 : }
4964 : /* We need to make sure head->next_sched is read
4965 : * before clearing __QDISC_STATE_SCHED
4966 : */
4967 0 : smp_mb__before_atomic();
4968 0 : clear_bit(__QDISC_STATE_SCHED, &q->state);
4969 0 : qdisc_run(q);
4970 0 : if (root_lock)
4971 0 : spin_unlock(root_lock);
4972 : }
4973 : }
4974 :
4975 0 : xfrm_dev_backlog(sd);
4976 0 : }
4977 :
4978 : #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE)
4979 : /* This hook is defined here for ATM LANE */
4980 : int (*br_fdb_test_addr_hook)(struct net_device *dev,
4981 : unsigned char *addr) __read_mostly;
4982 : EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
4983 : #endif
4984 :
4985 : static inline struct sk_buff *
4986 : sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
4987 : struct net_device *orig_dev, bool *another)
4988 : {
4989 : #ifdef CONFIG_NET_CLS_ACT
4990 : struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
4991 : struct tcf_result cl_res;
4992 :
4993 : /* If there's at least one ingress present somewhere (so
4994 : * we get here via enabled static key), remaining devices
4995 : * that are not configured with an ingress qdisc will bail
4996 : * out here.
4997 : */
4998 : if (!miniq)
4999 : return skb;
5000 :
5001 : if (*pt_prev) {
5002 : *ret = deliver_skb(skb, *pt_prev, orig_dev);
5003 : *pt_prev = NULL;
5004 : }
5005 :
5006 : qdisc_skb_cb(skb)->pkt_len = skb->len;
5007 : qdisc_skb_cb(skb)->mru = 0;
5008 : qdisc_skb_cb(skb)->post_ct = false;
5009 : skb->tc_at_ingress = 1;
5010 : mini_qdisc_bstats_cpu_update(miniq, skb);
5011 :
5012 : switch (tcf_classify_ingress(skb, miniq->block, miniq->filter_list,
5013 : &cl_res, false)) {
5014 : case TC_ACT_OK:
5015 : case TC_ACT_RECLASSIFY:
5016 : skb->tc_index = TC_H_MIN(cl_res.classid);
5017 : break;
5018 : case TC_ACT_SHOT:
5019 : mini_qdisc_qstats_cpu_drop(miniq);
5020 : kfree_skb(skb);
5021 : return NULL;
5022 : case TC_ACT_STOLEN:
5023 : case TC_ACT_QUEUED:
5024 : case TC_ACT_TRAP:
5025 : consume_skb(skb);
5026 : return NULL;
5027 : case TC_ACT_REDIRECT:
5028 : /* skb_mac_header check was done by cls/act_bpf, so
5029 : * we can safely push the L2 header back before
5030 : * redirecting to another netdev
5031 : */
5032 : __skb_push(skb, skb->mac_len);
5033 : if (skb_do_redirect(skb) == -EAGAIN) {
5034 : __skb_pull(skb, skb->mac_len);
5035 : *another = true;
5036 : break;
5037 : }
5038 : return NULL;
5039 : case TC_ACT_CONSUMED:
5040 : return NULL;
5041 : default:
5042 : break;
5043 : }
5044 : #endif /* CONFIG_NET_CLS_ACT */
5045 : return skb;
5046 : }
5047 :
5048 : /**
5049 : * netdev_is_rx_handler_busy - check if receive handler is registered
5050 : * @dev: device to check
5051 : *
5052 : * Check if a receive handler is already registered for a given device.
5053 : * Return true if there is one.
5054 : *
5055 : * The caller must hold the rtnl_mutex.
5056 : */
5057 0 : bool netdev_is_rx_handler_busy(struct net_device *dev)
5058 : {
5059 0 : ASSERT_RTNL();
5060 0 : return dev && rtnl_dereference(dev->rx_handler);
5061 : }
5062 : EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
5063 :
5064 : /**
5065 : * netdev_rx_handler_register - register receive handler
5066 : * @dev: device to register a handler for
5067 : * @rx_handler: receive handler to register
5068 : * @rx_handler_data: data pointer that is used by rx handler
5069 : *
5070 : * Register a receive handler for a device. This handler will then be
5071 : * called from __netif_receive_skb. A negative errno code is returned
5072 : * on a failure.
5073 : *
5074 : * The caller must hold the rtnl_mutex.
5075 : *
5076 : * For a general description of rx_handler, see enum rx_handler_result.
5077 : */
5078 0 : int netdev_rx_handler_register(struct net_device *dev,
5079 : rx_handler_func_t *rx_handler,
5080 : void *rx_handler_data)
5081 : {
5082 0 : if (netdev_is_rx_handler_busy(dev))
5083 : return -EBUSY;
5084 :
5085 0 : if (dev->priv_flags & IFF_NO_RX_HANDLER)
5086 : return -EINVAL;
5087 :
5088 : /* Note: rx_handler_data must be set before rx_handler */
5089 0 : rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
5090 0 : rcu_assign_pointer(dev->rx_handler, rx_handler);
5091 :
5092 0 : return 0;
5093 : }
5094 : EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
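
/* Editor's sketch (illustrative only, not part of this file): how an upper
 * device in the style of bridge or bonding attaches a receive handler under
 * RTNL, matching the rules documented above. The "foo" names are hypothetical;
 * the rx_handler_data pointer registered here is what the handler later reads
 * with rcu_dereference():
 *
 *	static rx_handler_result_t foo_handle_frame(struct sk_buff **pskb)
 *	{
 *		struct foo_port *port =
 *			rcu_dereference((*pskb)->dev->rx_handler_data);
 *
 *		foo_process(port, *pskb);
 *		return RX_HANDLER_PASS;
 *	}
 *
 *	static int foo_attach_port(struct net_device *dev, struct foo_port *port)
 *	{
 *		ASSERT_RTNL();
 *		return netdev_rx_handler_register(dev, foo_handle_frame, port);
 *	}
 */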
5095 :
5096 : /**
5097 : * netdev_rx_handler_unregister - unregister receive handler
5098 : * @dev: device to unregister a handler from
5099 : *
5100 : * Unregister a receive handler from a device.
5101 : *
5102 : * The caller must hold the rtnl_mutex.
5103 : */
5104 0 : void netdev_rx_handler_unregister(struct net_device *dev)
5105 : {
5106 :
5107 0 : ASSERT_RTNL();
5108 0 : RCU_INIT_POINTER(dev->rx_handler, NULL);
5109 : /* a reader seeing a non-NULL rx_handler in an rcu_read_lock()
5110 : * section is guaranteed to see a non-NULL rx_handler_data
5111 : * as well.
5112 : */
5113 0 : synchronize_net();
5114 0 : RCU_INIT_POINTER(dev->rx_handler_data, NULL);
5115 0 : }
5116 : EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
5117 :
5118 : /*
5119 : * Limit the use of PFMEMALLOC reserves to those protocols that implement
5120 : * the special handling of PFMEMALLOC skbs.
5121 : */
5122 0 : static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
5123 : {
5124 0 : switch (skb->protocol) {
5125 : case htons(ETH_P_ARP):
5126 : case htons(ETH_P_IP):
5127 : case htons(ETH_P_IPV6):
5128 : case htons(ETH_P_8021Q):
5129 : case htons(ETH_P_8021AD):
5130 : return true;
5131 0 : default:
5132 0 : return false;
5133 : }
5134 : }
5135 :
5136 : static inline int nf_ingress(struct sk_buff *skb, struct packet_type **pt_prev,
5137 : int *ret, struct net_device *orig_dev)
5138 : {
5139 : if (nf_hook_ingress_active(skb)) {
5140 : int ingress_retval;
5141 :
5142 : if (*pt_prev) {
5143 : *ret = deliver_skb(skb, *pt_prev, orig_dev);
5144 : *pt_prev = NULL;
5145 : }
5146 :
5147 : rcu_read_lock();
5148 : ingress_retval = nf_hook_ingress(skb);
5149 : rcu_read_unlock();
5150 : return ingress_retval;
5151 : }
5152 : return 0;
5153 : }
5154 :
5155 456 : static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
5156 : struct packet_type **ppt_prev)
5157 : {
5158 456 : struct packet_type *ptype, *pt_prev;
5159 456 : rx_handler_func_t *rx_handler;
5160 456 : struct sk_buff *skb = *pskb;
5161 456 : struct net_device *orig_dev;
5162 456 : bool deliver_exact = false;
5163 456 : int ret = NET_RX_DROP;
5164 456 : __be16 type;
5165 :
5166 456 : net_timestamp_check(!netdev_tstamp_prequeue, skb);
5167 :
5168 456 : trace_netif_receive_skb(skb);
5169 :
5170 456 : orig_dev = skb->dev;
5171 :
5172 456 : skb_reset_network_header(skb);
5173 456 : if (!skb_transport_header_was_set(skb))
5174 16 : skb_reset_transport_header(skb);
5175 456 : skb_reset_mac_len(skb);
5176 :
5177 456 : pt_prev = NULL;
5178 :
5179 456 : another_round:
5180 456 : skb->skb_iif = skb->dev->ifindex;
5181 :
5182 456 : __this_cpu_inc(softnet_data.processed);
5183 :
5184 456 : if (static_branch_unlikely(&generic_xdp_needed_key)) {
5185 0 : int ret2;
5186 :
5187 0 : preempt_disable();
5188 0 : ret2 = do_xdp_generic(rcu_dereference(skb->dev->xdp_prog), skb);
5189 0 : preempt_enable();
5190 :
5191 0 : if (ret2 != XDP_PASS) {
5192 0 : ret = NET_RX_DROP;
5193 0 : goto out;
5194 : }
5195 0 : skb_reset_mac_len(skb);
5196 : }
5197 :
5198 456 : if (eth_type_vlan(skb->protocol)) {
5199 0 : skb = skb_vlan_untag(skb);
5200 0 : if (unlikely(!skb))
5201 0 : goto out;
5202 : }
5203 :
5204 456 : if (skb_skip_tc_classify(skb))
5205 : goto skip_classify;
5206 :
5207 456 : if (pfmemalloc)
5208 0 : goto skip_taps;
5209 :
5210 456 : list_for_each_entry_rcu(ptype, &ptype_all, list) {
5211 0 : if (pt_prev)
5212 0 : ret = deliver_skb(skb, pt_prev, orig_dev);
5213 0 : pt_prev = ptype;
5214 : }
5215 :
5216 912 : list_for_each_entry_rcu(ptype, &skb->dev->ptype_all, list) {
5217 456 : if (pt_prev)
5218 0 : ret = deliver_skb(skb, pt_prev, orig_dev);
5219 456 : pt_prev = ptype;
5220 : }
5221 :
5222 456 : skip_taps:
5223 : #ifdef CONFIG_NET_INGRESS
5224 : if (static_branch_unlikely(&ingress_needed_key)) {
5225 : bool another = false;
5226 :
5227 : skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
5228 : &another);
5229 : if (another)
5230 : goto another_round;
5231 : if (!skb)
5232 : goto out;
5233 :
5234 : if (nf_ingress(skb, &pt_prev, &ret, orig_dev) < 0)
5235 : goto out;
5236 : }
5237 : #endif
5238 456 : skb_reset_redirect(skb);
5239 456 : skip_classify:
5240 456 : if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
5241 0 : goto drop;
5242 :
5243 456 : if (skb_vlan_tag_present(skb)) {
5244 0 : if (pt_prev) {
5245 0 : ret = deliver_skb(skb, pt_prev, orig_dev);
5246 0 : pt_prev = NULL;
5247 : }
5248 0 : if (vlan_do_receive(&skb))
5249 : goto another_round;
5250 0 : else if (unlikely(!skb))
5251 0 : goto out;
5252 : }
5253 :
5254 456 : rx_handler = rcu_dereference(skb->dev->rx_handler);
5255 456 : if (rx_handler) {
5256 0 : if (pt_prev) {
5257 0 : ret = deliver_skb(skb, pt_prev, orig_dev);
5258 0 : pt_prev = NULL;
5259 : }
5260 0 : switch (rx_handler(&skb)) {
5261 0 : case RX_HANDLER_CONSUMED:
5262 0 : ret = NET_RX_SUCCESS;
5263 0 : goto out;
5264 0 : case RX_HANDLER_ANOTHER:
5265 0 : goto another_round;
5266 : case RX_HANDLER_EXACT:
5267 : deliver_exact = true;
5268 : case RX_HANDLER_PASS:
5269 : break;
5270 0 : default:
5271 0 : BUG();
5272 : }
5273 456 : }
5274 :
5275 456 : if (unlikely(skb_vlan_tag_present(skb)) && !netdev_uses_dsa(skb->dev)) {
5276 0 : check_vlan_id:
5277 0 : if (skb_vlan_tag_get_id(skb)) {
5278 : /* VLAN id is non-zero and vlan_do_receive() above could not
5279 : * find a vlan device.
5280 : */
5281 0 : skb->pkt_type = PACKET_OTHERHOST;
5282 0 : } else if (eth_type_vlan(skb->protocol)) {
5283 : /* Outer header is 802.1P with vlan 0, inner header is
5284 : * 802.1Q or 802.1AD and vlan_do_receive() above could
5285 : * not find vlan dev for vlan id 0.
5286 : */
5287 0 : __vlan_hwaccel_clear_tag(skb);
5288 0 : skb = skb_vlan_untag(skb);
5289 0 : if (unlikely(!skb))
5290 0 : goto out;
5291 0 : if (vlan_do_receive(&skb))
5292 : /* After stripping off the 802.1P header with vlan 0,
5293 : * a vlan dev was found for the inner header.
5294 : */
5295 : goto another_round;
5296 0 : else if (unlikely(!skb))
5297 : goto out;
5298 : else
5299 : /* We have stripped the outer 802.1P vlan 0 header
5300 : * but could not find a vlan dev.
5301 : * Check the vlan id again to set OTHERHOST.
5302 : */
5303 0 : goto check_vlan_id;
5304 : }
5305 : /* Note: we might in the future use prio bits
5306 : * and set skb->priority like in vlan_do_receive()
5307 : * For the time being, just ignore Priority Code Point
5308 : */
5309 0 : __vlan_hwaccel_clear_tag(skb);
5310 : }
5311 :
5312 456 : type = skb->protocol;
5313 :
5314 : /* deliver only exact match when indicated */
5315 456 : if (likely(!deliver_exact)) {
5316 456 : deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5317 456 : &ptype_base[ntohs(type) &
5318 : PTYPE_HASH_MASK]);
5319 : }
5320 :
5321 456 : deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5322 : &orig_dev->ptype_specific);
5323 :
5324 456 : if (unlikely(skb->dev != orig_dev)) {
5325 0 : deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
5326 : &skb->dev->ptype_specific);
5327 : }
5328 :
5329 456 : if (pt_prev) {
5330 456 : if (unlikely(skb_orphan_frags_rx(skb, GFP_ATOMIC)))
5331 0 : goto drop;
5332 456 : *ppt_prev = pt_prev;
5333 : } else {
5334 0 : drop:
5335 0 : if (!deliver_exact)
5336 0 : atomic_long_inc(&skb->dev->rx_dropped);
5337 : else
5338 0 : atomic_long_inc(&skb->dev->rx_nohandler);
5339 0 : kfree_skb(skb);
5340 : /* Jamal, now you will not be able to escape explaining
5341 : * to me how you were going to use this. :-)
5342 : */
5343 0 : ret = NET_RX_DROP;
5344 : }
5345 :
5346 456 : out:
5347 : /* The invariant here is that if *ppt_prev is not NULL
5348 : * then skb should also be non-NULL.
5349 : *
5350 : * The *ppt_prev assignment above preserves this invariant because
5351 : * skb is dereferenced right next to it.
5352 : */
5353 456 : *pskb = skb;
5354 456 : return ret;
5355 : }
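
/*
 * Sketch (not from dev.c): the rx_handler consumed by the switch above is
 * installed with netdev_rx_handler_register(), as bridge/bonding/macvlan do.
 * my_upper_rx(), my_get_upper_dev() and my_port_attach() are invented names;
 * only the registration call and the RX_HANDLER_* return values are real.
 */
static rx_handler_result_t my_upper_rx(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;

	/* Steer the frame to a (hypothetical) upper device and ask
	 * __netif_receive_skb_core() to run another round on it.
	 */
	skb->dev = my_get_upper_dev(skb->dev);
	*pskb = skb;
	return RX_HANDLER_ANOTHER;
}

static int my_port_attach(struct net_device *port, void *port_priv)
{
	/* Needs RTNL; fails with -EBUSY if a handler is already attached. */
	return netdev_rx_handler_register(port, my_upper_rx, port_priv);
}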
5356 :
5357 0 : static int __netif_receive_skb_one_core(struct sk_buff *skb, bool pfmemalloc)
5358 : {
5359 0 : struct net_device *orig_dev = skb->dev;
5360 0 : struct packet_type *pt_prev = NULL;
5361 0 : int ret;
5362 :
5363 0 : ret = __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5364 0 : if (pt_prev)
5365 0 : ret = INDIRECT_CALL_INET(pt_prev->func, ipv6_rcv, ip_rcv, skb,
5366 : skb->dev, pt_prev, orig_dev);
5367 0 : return ret;
5368 : }
5369 :
5370 : /**
5371 : * netif_receive_skb_core - special purpose version of netif_receive_skb
5372 : * @skb: buffer to process
5373 : *
5374 : * More direct receive version of netif_receive_skb(). It should
5375 : * only be used by callers that have a need to skip RPS and Generic XDP.
5376 : * The caller must also take care of handling ``(page_is_)pfmemalloc`` pages.
5377 : *
5378 : * This function may only be called from softirq context and interrupts
5379 : * should be enabled.
5380 : *
5381 : * Return values (usually ignored):
5382 : * NET_RX_SUCCESS: no congestion
5383 : * NET_RX_DROP: packet was dropped
5384 : */
5385 0 : int netif_receive_skb_core(struct sk_buff *skb)
5386 : {
5387 0 : int ret;
5388 :
5389 0 : rcu_read_lock();
5390 0 : ret = __netif_receive_skb_one_core(skb, false);
5391 0 : rcu_read_unlock();
5392 :
5393 0 : return ret;
5394 : }
5395 : EXPORT_SYMBOL(netif_receive_skb_core);
5396 :
5397 814 : static inline void __netif_receive_skb_list_ptype(struct list_head *head,
5398 : struct packet_type *pt_prev,
5399 : struct net_device *orig_dev)
5400 : {
5401 814 : struct sk_buff *skb, *next;
5402 :
5403 814 : if (!pt_prev)
5404 : return;
5405 407 : if (list_empty(head))
5406 : return;
5407 407 : if (pt_prev->list_func != NULL)
5408 404 : INDIRECT_CALL_INET(pt_prev->list_func, ipv6_list_rcv,
5409 : ip_list_rcv, head, pt_prev, orig_dev);
5410 : else
5411 6 : list_for_each_entry_safe(skb, next, head, list) {
5412 3 : skb_list_del_init(skb);
5413 3 : pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
5414 : }
5415 : }
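
/*
 * Sketch (not from dev.c): a protocol that wants the batched delivery above
 * provides both .func and .list_func in its packet_type and registers it
 * with dev_add_pack(), much as IPv4 does with ip_rcv()/ip_list_rcv().
 * my_rcv(), my_list_rcv() and my_packet_type are invented; ETH_P_802_EX1 is
 * just a convenient experimental ethertype for the example.
 */
static int my_rcv(struct sk_buff *skb, struct net_device *dev,
		  struct packet_type *pt, struct net_device *orig_dev)
{
	/* A real handler would process the packet; the sketch drops it. */
	kfree_skb(skb);
	return NET_RX_DROP;
}

static void my_list_rcv(struct list_head *head, struct packet_type *pt,
			struct net_device *orig_dev)
{
	struct sk_buff *skb, *next;

	list_for_each_entry_safe(skb, next, head, list) {
		skb_list_del_init(skb);
		my_rcv(skb, skb->dev, pt, orig_dev);
	}
}

static struct packet_type my_packet_type __read_mostly = {
	.type		= cpu_to_be16(ETH_P_802_EX1),
	.func		= my_rcv,
	.list_func	= my_list_rcv,
};
/* dev_add_pack(&my_packet_type) at init, dev_remove_pack() at exit. */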
5416 :
5417 407 : static void __netif_receive_skb_list_core(struct list_head *head, bool pfmemalloc)
5418 : {
5419 : /* Fast-path assumptions:
5420 : * - There is no RX handler.
5421 : * - Only one packet_type matches.
5422 : * If either of these fails, we will end up doing some per-packet
5423 : * processing in-line, then handling the 'last ptype' for the whole
5424 : * sublist. This can't cause out-of-order delivery to any single ptype,
5425 : * because the 'last ptype' must be constant across the sublist, and all
5426 : * other ptypes are handled per-packet.
5427 : */
5428 : /* Current (common) ptype of sublist */
5429 407 : struct packet_type *pt_curr = NULL;
5430 : /* Current (common) orig_dev of sublist */
5431 407 : struct net_device *od_curr = NULL;
5432 407 : struct list_head sublist;
5433 407 : struct sk_buff *skb, *next;
5434 :
5435 407 : INIT_LIST_HEAD(&sublist);
5436 863 : list_for_each_entry_safe(skb, next, head, list) {
5437 456 : struct net_device *orig_dev = skb->dev;
5438 456 : struct packet_type *pt_prev = NULL;
5439 :
5440 456 : skb_list_del_init(skb);
5441 456 : __netif_receive_skb_core(&skb, pfmemalloc, &pt_prev);
5442 456 : if (!pt_prev)
5443 0 : continue;
5444 456 : if (pt_curr != pt_prev || od_curr != orig_dev) {
5445 : /* dispatch old sublist */
5446 407 : __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5447 : /* start new sublist */
5448 407 : INIT_LIST_HEAD(&sublist);
5449 407 : pt_curr = pt_prev;
5450 407 : od_curr = orig_dev;
5451 : }
5452 456 : list_add_tail(&skb->list, &sublist);
5453 : }
5454 :
5455 : /* dispatch final sublist */
5456 407 : __netif_receive_skb_list_ptype(&sublist, pt_curr, od_curr);
5457 407 : }
5458 :
5459 0 : static int __netif_receive_skb(struct sk_buff *skb)
5460 : {
5461 0 : int ret;
5462 :
5463 0 : if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
5464 0 : unsigned int noreclaim_flag;
5465 :
5466 : /*
5467 : * PFMEMALLOC skbs are special, they should
5468 : * - be delivered to SOCK_MEMALLOC sockets only
5469 : * - stay away from userspace
5470 : * - have bounded memory usage
5471 : *
5472 : * Use PF_MEMALLOC as this saves us from propagating the allocation
5473 : * context down to all allocation sites.
5474 : */
5475 0 : noreclaim_flag = memalloc_noreclaim_save();
5476 0 : ret = __netif_receive_skb_one_core(skb, true);
5477 0 : memalloc_noreclaim_restore(noreclaim_flag);
5478 : } else
5479 0 : ret = __netif_receive_skb_one_core(skb, false);
5480 :
5481 0 : return ret;
5482 : }
5483 :
5484 407 : static void __netif_receive_skb_list(struct list_head *head)
5485 : {
5486 407 : unsigned long noreclaim_flag = 0;
5487 407 : struct sk_buff *skb, *next;
5488 407 : bool pfmemalloc = false; /* Is current sublist PF_MEMALLOC? */
5489 :
5490 863 : list_for_each_entry_safe(skb, next, head, list) {
5491 456 : if ((sk_memalloc_socks() && skb_pfmemalloc(skb)) != pfmemalloc) {
5492 0 : struct list_head sublist;
5493 :
5494 : /* Handle the previous sublist */
5495 0 : list_cut_before(&sublist, head, &skb->list);
5496 0 : if (!list_empty(&sublist))
5497 0 : __netif_receive_skb_list_core(&sublist, pfmemalloc);
5498 0 : pfmemalloc = !pfmemalloc;
5499 : /* See comments in __netif_receive_skb */
5500 0 : if (pfmemalloc)
5501 0 : noreclaim_flag = memalloc_noreclaim_save();
5502 : else
5503 0 : memalloc_noreclaim_restore(noreclaim_flag);
5504 : }
5505 : }
5506 : /* Handle the remaining sublist */
5507 407 : if (!list_empty(head))
5508 407 : __netif_receive_skb_list_core(head, pfmemalloc);
5509 : /* Restore pflags */
5510 407 : if (pfmemalloc)
5511 0 : memalloc_noreclaim_restore(noreclaim_flag);
5512 407 : }
5513 :
5514 0 : static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp)
5515 : {
5516 0 : struct bpf_prog *old = rtnl_dereference(dev->xdp_prog);
5517 0 : struct bpf_prog *new = xdp->prog;
5518 0 : int ret = 0;
5519 :
5520 0 : if (new) {
5521 0 : u32 i;
5522 :
5523 0 : mutex_lock(&new->aux->used_maps_mutex);
5524 :
5525 : /* generic XDP does not work with DEVMAPs that can
5526 : * have a bpf_prog installed on an entry
5527 : */
5528 0 : for (i = 0; i < new->aux->used_map_cnt; i++) {
5529 0 : if (dev_map_can_have_prog(new->aux->used_maps[i]) ||
5530 0 : cpu_map_prog_allowed(new->aux->used_maps[i])) {
5531 : mutex_unlock(&new->aux->used_maps_mutex);
5532 : return -EINVAL;
5533 : }
5534 : }
5535 :
5536 0 : mutex_unlock(&new->aux->used_maps_mutex);
5537 : }
5538 :
5539 0 : switch (xdp->command) {
5540 0 : case XDP_SETUP_PROG:
5541 0 : rcu_assign_pointer(dev->xdp_prog, new);
5542 0 : if (old)
5543 0 : bpf_prog_put(old);
5544 :
5545 0 : if (old && !new) {
5546 0 : static_branch_dec(&generic_xdp_needed_key);
5547 0 : } else if (new && !old) {
5548 0 : static_branch_inc(&generic_xdp_needed_key);
5549 0 : dev_disable_lro(dev);
5550 0 : dev_disable_gro_hw(dev);
5551 : }
5552 : break;
5553 :
5554 : default:
5555 : ret = -EINVAL;
5556 : break;
5557 : }
5558 :
5559 0 : return ret;
5560 : }
5561 :
5562 0 : static int netif_receive_skb_internal(struct sk_buff *skb)
5563 : {
5564 0 : int ret;
5565 :
5566 0 : net_timestamp_check(netdev_tstamp_prequeue, skb);
5567 :
5568 0 : if (skb_defer_rx_timestamp(skb))
5569 : return NET_RX_SUCCESS;
5570 :
5571 0 : rcu_read_lock();
5572 : #ifdef CONFIG_RPS
5573 0 : if (static_branch_unlikely(&rps_needed)) {
5574 0 : struct rps_dev_flow voidflow, *rflow = &voidflow;
5575 0 : int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5576 :
5577 0 : if (cpu >= 0) {
5578 0 : ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5579 0 : rcu_read_unlock();
5580 0 : return ret;
5581 : }
5582 : }
5583 : #endif
5584 0 : ret = __netif_receive_skb(skb);
5585 0 : rcu_read_unlock();
5586 0 : return ret;
5587 : }
5588 :
5589 407 : static void netif_receive_skb_list_internal(struct list_head *head)
5590 : {
5591 407 : struct sk_buff *skb, *next;
5592 407 : struct list_head sublist;
5593 :
5594 407 : INIT_LIST_HEAD(&sublist);
5595 863 : list_for_each_entry_safe(skb, next, head, list) {
5596 456 : net_timestamp_check(netdev_tstamp_prequeue, skb);
5597 456 : skb_list_del_init(skb);
5598 456 : if (!skb_defer_rx_timestamp(skb))
5599 456 : list_add_tail(&skb->list, &sublist);
5600 : }
5601 407 : list_splice_init(&sublist, head);
5602 :
5603 407 : rcu_read_lock();
5604 : #ifdef CONFIG_RPS
5605 407 : if (static_branch_unlikely(&rps_needed)) {
5606 0 : list_for_each_entry_safe(skb, next, head, list) {
5607 0 : struct rps_dev_flow voidflow, *rflow = &voidflow;
5608 0 : int cpu = get_rps_cpu(skb->dev, skb, &rflow);
5609 :
5610 0 : if (cpu >= 0) {
5611 : /* Will be handled, remove from list */
5612 0 : skb_list_del_init(skb);
5613 0 : enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
5614 : }
5615 : }
5616 : }
5617 : #endif
5618 407 : __netif_receive_skb_list(head);
5619 407 : rcu_read_unlock();
5620 407 : }
5621 :
5622 : /**
5623 : * netif_receive_skb - process receive buffer from network
5624 : * @skb: buffer to process
5625 : *
5626 : * netif_receive_skb() is the main receive data processing function.
5627 : * It always succeeds. The buffer may be dropped during processing
5628 : * for congestion control or by the protocol layers.
5629 : *
5630 : * This function may only be called from softirq context and interrupts
5631 : * should be enabled.
5632 : *
5633 : * Return values (usually ignored):
5634 : * NET_RX_SUCCESS: no congestion
5635 : * NET_RX_DROP: packet was dropped
5636 : */
5637 0 : int netif_receive_skb(struct sk_buff *skb)
5638 : {
5639 0 : int ret;
5640 :
5641 0 : trace_netif_receive_skb_entry(skb);
5642 :
5643 0 : ret = netif_receive_skb_internal(skb);
5644 0 : trace_netif_receive_skb_exit(ret);
5645 :
5646 0 : return ret;
5647 : }
5648 : EXPORT_SYMBOL(netif_receive_skb);
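
/*
 * Sketch (not from dev.c): the minimal per-packet receive path of a
 * hypothetical driver, run from its NAPI poll (softirq context, interrupts
 * enabled).  my_rx_one() is an invented name.
 */
static void my_rx_one(struct net_device *dev, struct sk_buff *skb)
{
	/* eth_type_trans() sets skb->protocol and pkt_type for us. */
	skb->protocol = eth_type_trans(skb, dev);

	/* NET_RX_SUCCESS/NET_RX_DROP is returned and usually ignored. */
	netif_receive_skb(skb);
}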
5649 :
5650 : /**
5651 : * netif_receive_skb_list - process many receive buffers from network
5652 : * @head: list of skbs to process.
5653 : *
5654 : * Since the return value of netif_receive_skb() is normally ignored, and
5655 : * wouldn't be meaningful for a list, this function returns void.
5656 : *
5657 : * This function may only be called from softirq context and interrupts
5658 : * should be enabled.
5659 : */
5660 0 : void netif_receive_skb_list(struct list_head *head)
5661 : {
5662 0 : struct sk_buff *skb;
5663 :
5664 0 : if (list_empty(head))
5665 : return;
5666 0 : if (trace_netif_receive_skb_list_entry_enabled()) {
5667 0 : list_for_each_entry(skb, head, list)
5668 0 : trace_netif_receive_skb_list_entry(skb);
5669 : }
5670 0 : netif_receive_skb_list_internal(head);
5671 0 : trace_netif_receive_skb_list_exit(0);
5672 : }
5673 : EXPORT_SYMBOL(netif_receive_skb_list);
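
/*
 * Sketch (not from dev.c): collecting frames on a local list and handing
 * them to netif_receive_skb_list() in one go, which is what lets the
 * sublist batching in __netif_receive_skb_list_core() pay off.
 * my_rx_batch() is an invented name.
 */
static void my_rx_batch(struct net_device *dev, struct sk_buff **skbs, int n)
{
	LIST_HEAD(rx_list);
	int i;

	for (i = 0; i < n; i++) {
		skbs[i]->protocol = eth_type_trans(skbs[i], dev);
		list_add_tail(&skbs[i]->list, &rx_list);
	}

	/* One trip through the stack for the whole batch. */
	netif_receive_skb_list(&rx_list);
}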
5674 :
5675 : static DEFINE_PER_CPU(struct work_struct, flush_works);
5676 :
5677 : /* Network device is going away, flush any packets still pending */
5678 0 : static void flush_backlog(struct work_struct *work)
5679 : {
5680 0 : struct sk_buff *skb, *tmp;
5681 0 : struct softnet_data *sd;
5682 :
5683 0 : local_bh_disable();
5684 0 : sd = this_cpu_ptr(&softnet_data);
5685 :
5686 0 : local_irq_disable();
5687 0 : rps_lock(sd);
5688 0 : skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
5689 0 : if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5690 0 : __skb_unlink(skb, &sd->input_pkt_queue);
5691 0 : dev_kfree_skb_irq(skb);
5692 0 : input_queue_head_incr(sd);
5693 : }
5694 : }
5695 0 : rps_unlock(sd);
5696 0 : local_irq_enable();
5697 :
5698 0 : skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
5699 0 : if (skb->dev->reg_state == NETREG_UNREGISTERING) {
5700 0 : __skb_unlink(skb, &sd->process_queue);
5701 0 : kfree_skb(skb);
5702 0 : input_queue_head_incr(sd);
5703 : }
5704 : }
5705 0 : local_bh_enable();
5706 0 : }
5707 :
5708 0 : static bool flush_required(int cpu)
5709 : {
5710 : #if IS_ENABLED(CONFIG_RPS)
5711 0 : struct softnet_data *sd = &per_cpu(softnet_data, cpu);
5712 0 : bool do_flush;
5713 :
5714 0 : local_irq_disable();
5715 0 : rps_lock(sd);
5716 :
5717 : /* as insertion into process_queue happens with the rps lock held,
5718 : * process_queue access may race only with dequeue
5719 : */
5720 0 : do_flush = !skb_queue_empty(&sd->input_pkt_queue) ||
5721 0 : !skb_queue_empty_lockless(&sd->process_queue);
5722 0 : rps_unlock(sd);
5723 0 : local_irq_enable();
5724 :
5725 0 : return do_flush;
5726 : #endif
5727 : /* Without RPS we can't safely check input_pkt_queue: during a
5728 : * concurrent remote skb_queue_splice() we could see both
5729 : * input_pkt_queue and process_queue as empty even though the latter
5730 : * could end up containing a lot of packets.
5731 : */
5732 : return true;
5733 : }
5734 :
5735 0 : static void flush_all_backlogs(void)
5736 : {
5737 0 : static cpumask_t flush_cpus;
5738 0 : unsigned int cpu;
5739 :
5740 : /* Since we are under rtnl lock protection we can use static data
5741 : * for the cpumask and avoid allocating the possibly large mask
5742 : * on the stack.
5743 : */
5744 0 : ASSERT_RTNL();
5745 :
5746 0 : get_online_cpus();
5747 :
5748 0 : cpumask_clear(&flush_cpus);
5749 0 : for_each_online_cpu(cpu) {
5750 0 : if (flush_required(cpu)) {
5751 0 : queue_work_on(cpu, system_highpri_wq,
5752 0 : per_cpu_ptr(&flush_works, cpu));
5753 0 : cpumask_set_cpu(cpu, &flush_cpus);
5754 : }
5755 : }
5756 :
5757 : /* We can have in-flight packets on the cpus we are not flushing;
5758 : * synchronize_net() in unregister_netdevice_many() will take care
5759 : * of them.
5760 : */
5761 0 : for_each_cpu(cpu, &flush_cpus)
5762 0 : flush_work(per_cpu_ptr(&flush_works, cpu));
5763 :
5764 0 : put_online_cpus();
5765 0 : }
5766 :
5767 : /* Pass the currently batched GRO_NORMAL SKBs up to the stack. */
5768 870 : static void gro_normal_list(struct napi_struct *napi)
5769 : {
5770 870 : if (!napi->rx_count)
5771 : return;
5772 407 : netif_receive_skb_list_internal(&napi->rx_list);
5773 407 : INIT_LIST_HEAD(&napi->rx_list);
5774 407 : napi->rx_count = 0;
5775 : }
5776 :
5777 : /* Queue one GRO_NORMAL SKB up for list processing. If batch size exceeded,
5778 : * pass the whole batch up to the stack.
5779 : */
5780 456 : static void gro_normal_one(struct napi_struct *napi, struct sk_buff *skb, int segs)
5781 : {
5782 456 : list_add_tail(&skb->list, &napi->rx_list);
5783 456 : napi->rx_count += segs;
5784 456 : if (napi->rx_count >= gro_normal_batch)
5785 15 : gro_normal_list(napi);
5786 456 : }
5787 :
5788 16 : static int napi_gro_complete(struct napi_struct *napi, struct sk_buff *skb)
5789 : {
5790 16 : struct packet_offload *ptype;
5791 16 : __be16 type = skb->protocol;
5792 16 : struct list_head *head = &offload_base;
5793 16 : int err = -ENOENT;
5794 :
5795 16 : BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
5796 :
5797 16 : if (NAPI_GRO_CB(skb)->count == 1) {
5798 1 : skb_shinfo(skb)->gso_size = 0;
5799 1 : goto out;
5800 : }
5801 :
5802 15 : rcu_read_lock();
5803 15 : list_for_each_entry_rcu(ptype, head, list) {
5804 15 : if (ptype->type != type || !ptype->callbacks.gro_complete)
5805 0 : continue;
5806 :
5807 15 : err = INDIRECT_CALL_INET(ptype->callbacks.gro_complete,
5808 : ipv6_gro_complete, inet_gro_complete,
5809 : skb, 0);
5810 15 : break;
5811 : }
5812 15 : rcu_read_unlock();
5813 :
5814 15 : if (err) {
5815 0 : WARN_ON(&ptype->list == head);
5816 0 : kfree_skb(skb);
5817 0 : return NET_RX_SUCCESS;
5818 : }
5819 :
5820 15 : out:
5821 16 : gro_normal_one(napi, skb, NAPI_GRO_CB(skb)->count);
5822 16 : return NET_RX_SUCCESS;
5823 : }
5824 :
5825 11 : static void __napi_gro_flush_chain(struct napi_struct *napi, u32 index,
5826 : bool flush_old)
5827 : {
5828 11 : struct list_head *head = &napi->gro_hash[index].list;
5829 11 : struct sk_buff *skb, *p;
5830 :
5831 22 : list_for_each_entry_safe_reverse(skb, p, head, list) {
5832 11 : if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
5833 : return;
5834 11 : skb_list_del_init(skb);
5835 11 : napi_gro_complete(napi, skb);
5836 11 : napi->gro_hash[index].count--;
5837 : }
5838 :
5839 11 : if (!napi->gro_hash[index].count)
5840 11 : __clear_bit(index, &napi->gro_bitmask);
5841 : }
5842 :
5843 : /* napi->gro_hash[].list contains packets ordered by age,
5844 : * with the youngest packets at the head.
5845 : * Complete skbs in reverse order to reduce latencies.
5846 : */
5847 11 : void napi_gro_flush(struct napi_struct *napi, bool flush_old)
5848 : {
5849 11 : unsigned long bitmask = napi->gro_bitmask;
5850 11 : unsigned int i, base = ~0U;
5851 :
5852 22 : while ((i = ffs(bitmask)) != 0) {
5853 11 : bitmask >>= i;
5854 11 : base += i;
5855 11 : __napi_gro_flush_chain(napi, base, flush_old);
5856 : }
5857 11 : }
5858 : EXPORT_SYMBOL(napi_gro_flush);
5859 :
5860 723 : static struct list_head *gro_list_prepare(struct napi_struct *napi,
5861 : struct sk_buff *skb)
5862 : {
5863 723 : unsigned int maclen = skb->dev->hard_header_len;
5864 723 : u32 hash = skb_get_hash_raw(skb);
5865 723 : struct list_head *head;
5866 723 : struct sk_buff *p;
5867 :
5868 723 : head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list;
5869 992 : list_for_each_entry(p, head, list) {
5870 269 : unsigned long diffs;
5871 :
5872 269 : NAPI_GRO_CB(p)->flush = 0;
5873 :
5874 269 : if (hash != skb_get_hash_raw(p)) {
5875 0 : NAPI_GRO_CB(p)->same_flow = 0;
5876 0 : continue;
5877 : }
5878 :
5879 269 : diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
5880 269 : diffs |= skb_vlan_tag_present(p) ^ skb_vlan_tag_present(skb);
5881 269 : if (skb_vlan_tag_present(p))
5882 0 : diffs |= skb_vlan_tag_get(p) ^ skb_vlan_tag_get(skb);
5883 269 : diffs |= skb_metadata_dst_cmp(p, skb);
5884 269 : diffs |= skb_metadata_differs(p, skb);
5885 269 : if (maclen == ETH_HLEN)
5886 269 : diffs |= compare_ether_header(skb_mac_header(p),
5887 269 : skb_mac_header(skb));
5888 0 : else if (!diffs)
5889 0 : diffs = memcmp(skb_mac_header(p),
5890 0 : skb_mac_header(skb),
5891 : maclen);
5892 269 : NAPI_GRO_CB(p)->same_flow = !diffs;
5893 : }
5894 :
5895 723 : return head;
5896 : }
5897 :
5898 723 : static void skb_gro_reset_offset(struct sk_buff *skb)
5899 : {
5900 723 : const struct skb_shared_info *pinfo = skb_shinfo(skb);
5901 723 : const skb_frag_t *frag0 = &pinfo->frags[0];
5902 :
5903 723 : NAPI_GRO_CB(skb)->data_offset = 0;
5904 723 : NAPI_GRO_CB(skb)->frag0 = NULL;
5905 723 : NAPI_GRO_CB(skb)->frag0_len = 0;
5906 :
5907 723 : if (!skb_headlen(skb) && pinfo->nr_frags &&
5908 0 : !PageHighMem(skb_frag_page(frag0))) {
5909 0 : NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
5910 0 : NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
5911 : skb_frag_size(frag0),
5912 : skb->end - skb->tail);
5913 : }
5914 723 : }
5915 :
5916 0 : static void gro_pull_from_frag0(struct sk_buff *skb, int grow)
5917 : {
5918 0 : struct skb_shared_info *pinfo = skb_shinfo(skb);
5919 :
5920 0 : BUG_ON(skb->end - skb->tail < grow);
5921 :
5922 0 : memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
5923 :
5924 0 : skb->data_len -= grow;
5925 0 : skb->tail += grow;
5926 :
5927 0 : skb_frag_off_add(&pinfo->frags[0], grow);
5928 0 : skb_frag_size_sub(&pinfo->frags[0], grow);
5929 :
5930 0 : if (unlikely(!skb_frag_size(&pinfo->frags[0]))) {
5931 0 : skb_frag_unref(skb, 0);
5932 0 : memmove(pinfo->frags, pinfo->frags + 1,
5933 0 : --pinfo->nr_frags * sizeof(pinfo->frags[0]));
5934 : }
5935 0 : }
5936 :
5937 0 : static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head)
5938 : {
5939 0 : struct sk_buff *oldest;
5940 :
5941 0 : oldest = list_last_entry(head, struct sk_buff, list);
5942 :
5943 : /* We are called with head length >= MAX_GRO_SKBS, so this is
5944 : * impossible.
5945 : */
5946 0 : if (WARN_ON_ONCE(!oldest))
5947 : return;
5948 :
5949 : /* Do not adjust napi->gro_hash[].count, caller is adding a new
5950 : * SKB to the chain.
5951 : */
5952 0 : skb_list_del_init(oldest);
5953 0 : napi_gro_complete(napi, oldest);
5954 : }
5955 :
5956 723 : static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5957 : {
5958 723 : u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1);
5959 723 : struct list_head *head = &offload_base;
5960 723 : struct packet_offload *ptype;
5961 723 : __be16 type = skb->protocol;
5962 723 : struct list_head *gro_head;
5963 723 : struct sk_buff *pp = NULL;
5964 723 : enum gro_result ret;
5965 723 : int same_flow;
5966 723 : int grow;
5967 :
5968 723 : if (netif_elide_gro(skb->dev))
5969 0 : goto normal;
5970 :
5971 723 : gro_head = gro_list_prepare(napi, skb);
5972 :
5973 723 : rcu_read_lock();
5974 732 : list_for_each_entry_rcu(ptype, head, list) {
5975 729 : if (ptype->type != type || !ptype->callbacks.gro_receive)
5976 9 : continue;
5977 :
5978 720 : skb_set_network_header(skb, skb_gro_offset(skb));
5979 720 : skb_reset_mac_len(skb);
5980 720 : NAPI_GRO_CB(skb)->same_flow = 0;
5981 720 : NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb);
5982 720 : NAPI_GRO_CB(skb)->free = 0;
5983 720 : NAPI_GRO_CB(skb)->encap_mark = 0;
5984 720 : NAPI_GRO_CB(skb)->recursion_counter = 0;
5985 720 : NAPI_GRO_CB(skb)->is_fou = 0;
5986 720 : NAPI_GRO_CB(skb)->is_atomic = 1;
5987 720 : NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
5988 :
5989 : /* Setup for GRO checksum validation */
5990 720 : switch (skb->ip_summed) {
5991 0 : case CHECKSUM_COMPLETE:
5992 0 : NAPI_GRO_CB(skb)->csum = skb->csum;
5993 0 : NAPI_GRO_CB(skb)->csum_valid = 1;
5994 0 : NAPI_GRO_CB(skb)->csum_cnt = 0;
5995 0 : break;
5996 0 : case CHECKSUM_UNNECESSARY:
5997 0 : NAPI_GRO_CB(skb)->csum_cnt = skb->csum_level + 1;
5998 0 : NAPI_GRO_CB(skb)->csum_valid = 0;
5999 0 : break;
6000 720 : default:
6001 720 : NAPI_GRO_CB(skb)->csum_cnt = 0;
6002 720 : NAPI_GRO_CB(skb)->csum_valid = 0;
6003 : }
6004 :
6005 720 : pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive,
6006 : ipv6_gro_receive, inet_gro_receive,
6007 : gro_head, skb);
6008 720 : break;
6009 : }
6010 723 : rcu_read_unlock();
6011 :
6012 723 : if (&ptype->list == head)
6013 3 : goto normal;
6014 :
6015 720 : if (PTR_ERR(pp) == -EINPROGRESS) {
6016 0 : ret = GRO_CONSUMED;
6017 0 : goto ok;
6018 : }
6019 :
6020 720 : same_flow = NAPI_GRO_CB(skb)->same_flow;
6021 720 : ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
6022 :
6023 720 : if (pp) {
6024 5 : skb_list_del_init(pp);
6025 5 : napi_gro_complete(napi, pp);
6026 5 : napi->gro_hash[hash].count--;
6027 : }
6028 :
6029 720 : if (same_flow)
6030 267 : goto ok;
6031 :
6032 453 : if (NAPI_GRO_CB(skb)->flush)
6033 437 : goto normal;
6034 :
6035 16 : if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) {
6036 0 : gro_flush_oldest(napi, gro_head);
6037 : } else {
6038 16 : napi->gro_hash[hash].count++;
6039 : }
6040 16 : NAPI_GRO_CB(skb)->count = 1;
6041 16 : NAPI_GRO_CB(skb)->age = jiffies;
6042 16 : NAPI_GRO_CB(skb)->last = skb;
6043 16 : skb_shinfo(skb)->gso_size = skb_gro_len(skb);
6044 16 : list_add(&skb->list, gro_head);
6045 16 : ret = GRO_HELD;
6046 :
6047 456 : pull:
6048 456 : grow = skb_gro_offset(skb) - skb_headlen(skb);
6049 456 : if (grow > 0)
6050 0 : gro_pull_from_frag0(skb, grow);
6051 456 : ok:
6052 723 : if (napi->gro_hash[hash].count) {
6053 280 : if (!test_bit(hash, &napi->gro_bitmask))
6054 16 : __set_bit(hash, &napi->gro_bitmask);
6055 443 : } else if (test_bit(hash, &napi->gro_bitmask)) {
6056 5 : __clear_bit(hash, &napi->gro_bitmask);
6057 : }
6058 :
6059 723 : return ret;
6060 :
6061 440 : normal:
6062 440 : ret = GRO_NORMAL;
6063 440 : goto pull;
6064 : }
6065 :
6066 0 : struct packet_offload *gro_find_receive_by_type(__be16 type)
6067 : {
6068 0 : struct list_head *offload_head = &offload_base;
6069 0 : struct packet_offload *ptype;
6070 :
6071 0 : list_for_each_entry_rcu(ptype, offload_head, list) {
6072 0 : if (ptype->type != type || !ptype->callbacks.gro_receive)
6073 0 : continue;
6074 : return ptype;
6075 : }
6076 : return NULL;
6077 : }
6078 : EXPORT_SYMBOL(gro_find_receive_by_type);
6079 :
6080 0 : struct packet_offload *gro_find_complete_by_type(__be16 type)
6081 : {
6082 0 : struct list_head *offload_head = &offload_base;
6083 0 : struct packet_offload *ptype;
6084 :
6085 0 : list_for_each_entry_rcu(ptype, offload_head, list) {
6086 0 : if (ptype->type != type || !ptype->callbacks.gro_complete)
6087 0 : continue;
6088 : return ptype;
6089 : }
6090 : return NULL;
6091 : }
6092 : EXPORT_SYMBOL(gro_find_complete_by_type);
6093 :
6094 723 : static gro_result_t napi_skb_finish(struct napi_struct *napi,
6095 : struct sk_buff *skb,
6096 : gro_result_t ret)
6097 : {
6098 723 : switch (ret) {
6099 440 : case GRO_NORMAL:
6100 440 : gro_normal_one(napi, skb, 1);
6101 440 : break;
6102 :
6103 0 : case GRO_MERGED_FREE:
6104 0 : if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
6105 0 : napi_skb_free_stolen_head(skb);
6106 : else
6107 0 : __kfree_skb_defer(skb);
6108 : break;
6109 :
6110 : case GRO_HELD:
6111 : case GRO_MERGED:
6112 : case GRO_CONSUMED:
6113 : break;
6114 : }
6115 :
6116 723 : return ret;
6117 : }
6118 :
6119 723 : gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
6120 : {
6121 723 : gro_result_t ret;
6122 :
6123 723 : skb_mark_napi_id(skb, napi);
6124 723 : trace_napi_gro_receive_entry(skb);
6125 :
6126 723 : skb_gro_reset_offset(skb);
6127 :
6128 723 : ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb));
6129 723 : trace_napi_gro_receive_exit(ret);
6130 :
6131 723 : return ret;
6132 : }
6133 : EXPORT_SYMBOL(napi_gro_receive);
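
/*
 * Sketch (not from dev.c): the usual shape of a driver poll callback that
 * feeds napi_gro_receive() and finishes with napi_complete_done().
 * my_fetch_rx_skb() and my_enable_rx_irq() stand in for hardware-specific
 * ring handling and are invented names.
 */
static int my_napi_poll(struct napi_struct *napi, int budget)
{
	struct sk_buff *skb;
	int work_done = 0;

	while (work_done < budget && (skb = my_fetch_rx_skb(napi))) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		napi_gro_receive(napi, skb);
		work_done++;
	}

	/* Re-arm device interrupts only if the budget was not exhausted
	 * and napi_complete_done() did not reschedule us.
	 */
	if (work_done < budget && napi_complete_done(napi, work_done))
		my_enable_rx_irq(napi);

	return work_done;
}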
6134 :
6135 0 : static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
6136 : {
6137 0 : if (unlikely(skb->pfmemalloc)) {
6138 0 : consume_skb(skb);
6139 0 : return;
6140 : }
6141 0 : __skb_pull(skb, skb_headlen(skb));
6142 : /* restore the reserve we had after netdev_alloc_skb_ip_align() */
6143 0 : skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
6144 0 : __vlan_hwaccel_clear_tag(skb);
6145 0 : skb->dev = napi->dev;
6146 0 : skb->skb_iif = 0;
6147 :
6148 : /* eth_type_trans() assumes pkt_type is PACKET_HOST */
6149 0 : skb->pkt_type = PACKET_HOST;
6150 :
6151 0 : skb->encapsulation = 0;
6152 0 : skb_shinfo(skb)->gso_type = 0;
6153 0 : skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
6154 0 : skb_ext_reset(skb);
6155 :
6156 0 : napi->skb = skb;
6157 : }
6158 :
6159 0 : struct sk_buff *napi_get_frags(struct napi_struct *napi)
6160 : {
6161 0 : struct sk_buff *skb = napi->skb;
6162 :
6163 0 : if (!skb) {
6164 0 : skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
6165 0 : if (skb) {
6166 0 : napi->skb = skb;
6167 0 : skb_mark_napi_id(skb, napi);
6168 : }
6169 : }
6170 0 : return skb;
6171 : }
6172 : EXPORT_SYMBOL(napi_get_frags);
6173 :
6174 0 : static gro_result_t napi_frags_finish(struct napi_struct *napi,
6175 : struct sk_buff *skb,
6176 : gro_result_t ret)
6177 : {
6178 0 : switch (ret) {
6179 : case GRO_NORMAL:
6180 : case GRO_HELD:
6181 0 : __skb_push(skb, ETH_HLEN);
6182 0 : skb->protocol = eth_type_trans(skb, skb->dev);
6183 0 : if (ret == GRO_NORMAL)
6184 0 : gro_normal_one(napi, skb, 1);
6185 : break;
6186 :
6187 0 : case GRO_MERGED_FREE:
6188 0 : if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
6189 0 : napi_skb_free_stolen_head(skb);
6190 : else
6191 0 : napi_reuse_skb(napi, skb);
6192 : break;
6193 :
6194 : case GRO_MERGED:
6195 : case GRO_CONSUMED:
6196 : break;
6197 : }
6198 :
6199 0 : return ret;
6200 : }
6201 :
6202 : /* The upper GRO stack assumes the network header starts at gro_offset=0.
6203 : * Drivers could call both napi_gro_frags() and napi_gro_receive(), so
6204 : * we copy the ethernet header into skb->data to have a common layout.
6205 : */
6206 0 : static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
6207 : {
6208 0 : struct sk_buff *skb = napi->skb;
6209 0 : const struct ethhdr *eth;
6210 0 : unsigned int hlen = sizeof(*eth);
6211 :
6212 0 : napi->skb = NULL;
6213 :
6214 0 : skb_reset_mac_header(skb);
6215 0 : skb_gro_reset_offset(skb);
6216 :
6217 0 : if (unlikely(skb_gro_header_hard(skb, hlen))) {
6218 0 : eth = skb_gro_header_slow(skb, hlen, 0);
6219 0 : if (unlikely(!eth)) {
6220 0 : net_warn_ratelimited("%s: dropping impossible skb from %s\n",
6221 : __func__, napi->dev->name);
6222 0 : napi_reuse_skb(napi, skb);
6223 0 : return NULL;
6224 : }
6225 : } else {
6226 0 : eth = (const struct ethhdr *)skb->data;
6227 0 : gro_pull_from_frag0(skb, hlen);
6228 0 : NAPI_GRO_CB(skb)->frag0 += hlen;
6229 0 : NAPI_GRO_CB(skb)->frag0_len -= hlen;
6230 : }
6231 0 : __skb_pull(skb, hlen);
6232 :
6233 : /*
6234 : * This works because the only protocols we care about don't require
6235 : * special handling.
6236 : * We'll fix it up properly in napi_frags_finish()
6237 : */
6238 0 : skb->protocol = eth->h_proto;
6239 :
6240 0 : return skb;
6241 : }
6242 :
6243 0 : gro_result_t napi_gro_frags(struct napi_struct *napi)
6244 : {
6245 0 : gro_result_t ret;
6246 0 : struct sk_buff *skb = napi_frags_skb(napi);
6247 :
6248 0 : trace_napi_gro_frags_entry(skb);
6249 :
6250 0 : ret = napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
6251 0 : trace_napi_gro_frags_exit(ret);
6252 :
6253 0 : return ret;
6254 : }
6255 : EXPORT_SYMBOL(napi_gro_frags);
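
/*
 * Sketch (not from dev.c): the page/frag based entry point.  The driver asks
 * the core for a shell skb with napi_get_frags(), attaches its receive page
 * and calls napi_gro_frags(); the ethernet header is pulled by
 * napi_frags_skb() above, so eth_type_trans() is not called here.
 * my_rx_frag() is an invented name.
 */
static void my_rx_frag(struct napi_struct *napi, struct page *page,
		       unsigned int offset, unsigned int len,
		       unsigned int truesize)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb))
		return;	/* allocation failure: caller drops/recycles the page */

	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, offset, len,
			truesize);
	napi_gro_frags(napi);
}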
6256 :
6257 : /* Compute the checksum from gro_offset and return the folded value
6258 : * after adding in any pseudo checksum.
6259 : */
6260 707 : __sum16 __skb_gro_checksum_complete(struct sk_buff *skb)
6261 : {
6262 707 : __wsum wsum;
6263 707 : __sum16 sum;
6264 :
6265 707 : wsum = skb_checksum(skb, skb_gro_offset(skb), skb_gro_len(skb), 0);
6266 :
6267 : /* NAPI_GRO_CB(skb)->csum holds pseudo checksum */
6268 707 : sum = csum_fold(csum_add(NAPI_GRO_CB(skb)->csum, wsum));
6269 : /* See comments in __skb_checksum_complete(). */
6270 707 : if (likely(!sum)) {
6271 707 : if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) &&
6272 0 : !skb->csum_complete_sw)
6273 0 : netdev_rx_csum_fault(skb->dev, skb);
6274 : }
6275 :
6276 707 : NAPI_GRO_CB(skb)->csum = wsum;
6277 707 : NAPI_GRO_CB(skb)->csum_valid = 1;
6278 :
6279 707 : return sum;
6280 : }
6281 : EXPORT_SYMBOL(__skb_gro_checksum_complete);
6282 :
6283 0 : static void net_rps_send_ipi(struct softnet_data *remsd)
6284 : {
6285 : #ifdef CONFIG_RPS
6286 0 : while (remsd) {
6287 0 : struct softnet_data *next = remsd->rps_ipi_next;
6288 :
6289 0 : if (cpu_online(remsd->cpu))
6290 0 : smp_call_function_single_async(remsd->cpu, &remsd->csd);
6291 : remsd = next;
6292 : }
6293 : #endif
6294 0 : }
6295 :
6296 : /*
6297 : * net_rps_action_and_irq_enable sends any pending IPIs for rps.
6298 : * Note: called with local irq disabled, but exits with local irq enabled.
6299 : */
6300 0 : static void net_rps_action_and_irq_enable(struct softnet_data *sd)
6301 : {
6302 : #ifdef CONFIG_RPS
6303 0 : struct softnet_data *remsd = sd->rps_ipi_list;
6304 :
6305 0 : if (remsd) {
6306 0 : sd->rps_ipi_list = NULL;
6307 :
6308 0 : local_irq_enable();
6309 :
6310 : /* Send pending IPIs to kick RPS processing on remote cpus. */
6311 0 : net_rps_send_ipi(remsd);
6312 : } else
6313 : #endif
6314 0 : local_irq_enable();
6315 0 : }
6316 :
6317 478 : static bool sd_has_rps_ipi_waiting(struct softnet_data *sd)
6318 : {
6319 : #ifdef CONFIG_RPS
6320 478 : return sd->rps_ipi_list != NULL;
6321 : #else
6322 : return false;
6323 : #endif
6324 : }
6325 :
6326 0 : static int process_backlog(struct napi_struct *napi, int quota)
6327 : {
6328 0 : struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
6329 0 : bool again = true;
6330 0 : int work = 0;
6331 :
6332 : /* Check if we have pending IPIs; it's better to send them now
6333 : * rather than waiting for net_rx_action() to end.
6334 : */
6335 0 : if (sd_has_rps_ipi_waiting(sd)) {
6336 0 : local_irq_disable();
6337 0 : net_rps_action_and_irq_enable(sd);
6338 : }
6339 :
6340 0 : napi->weight = dev_rx_weight;
6341 0 : while (again) {
6342 : struct sk_buff *skb;
6343 :
6344 0 : while ((skb = __skb_dequeue(&sd->process_queue))) {
6345 0 : rcu_read_lock();
6346 0 : __netif_receive_skb(skb);
6347 0 : rcu_read_unlock();
6348 0 : input_queue_head_incr(sd);
6349 0 : if (++work >= quota)
6350 0 : return work;
6351 :
6352 : }
6353 :
6354 0 : local_irq_disable();
6355 0 : rps_lock(sd);
6356 0 : if (skb_queue_empty(&sd->input_pkt_queue)) {
6357 : /*
6358 : * Inline a custom version of __napi_complete().
6359 : * Only the current cpu owns and manipulates this napi,
6360 : * and NAPI_STATE_SCHED is the only possible flag set
6361 : * on backlog.
6362 : * We can use a plain write instead of clear_bit(),
6363 : * and we don't need an smp_mb() memory barrier.
6364 : */
6365 0 : napi->state = 0;
6366 0 : again = false;
6367 : } else {
6368 0 : skb_queue_splice_tail_init(&sd->input_pkt_queue,
6369 : &sd->process_queue);
6370 : }
6371 0 : rps_unlock(sd);
6372 0 : local_irq_enable();
6373 : }
6374 :
6375 : return work;
6376 : }
6377 :
6378 : /**
6379 : * __napi_schedule - schedule for receive
6380 : * @n: entry to schedule
6381 : *
6382 : * The entry's receive function will be scheduled to run.
6383 : * Consider using __napi_schedule_irqoff() if hard irqs are masked.
6384 : */
6385 855 : void __napi_schedule(struct napi_struct *n)
6386 : {
6387 855 : unsigned long flags;
6388 :
6389 1710 : local_irq_save(flags);
6390 855 : ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6391 855 : local_irq_restore(flags);
6392 855 : }
6393 : EXPORT_SYMBOL(__napi_schedule);
6394 :
6395 : /**
6396 : * napi_schedule_prep - check if napi can be scheduled
6397 : * @n: napi context
6398 : *
6399 : * Test if NAPI routine is already running, and if not mark
6400 : * it as running. This is used as a condition variable to
6401 : * insure only one NAPI poll instance runs. We also make
6402 : * ensure only one NAPI poll instance runs. We also make
6403 : */
6404 882 : bool napi_schedule_prep(struct napi_struct *n)
6405 : {
6406 882 : unsigned long val, new;
6407 :
6408 882 : do {
6409 882 : val = READ_ONCE(n->state);
6410 882 : if (unlikely(val & NAPIF_STATE_DISABLE))
6411 : return false;
6412 882 : new = val | NAPIF_STATE_SCHED;
6413 :
6414 : /* Sets the STATE_MISSED bit if STATE_SCHED was already set.
6415 : * This was suggested by Alexander Duyck, as the compiler
6416 : * emits better code than:
6417 : * if (val & NAPIF_STATE_SCHED)
6418 : * new |= NAPIF_STATE_MISSED;
6419 : */
6420 882 : new |= (val & NAPIF_STATE_SCHED) / NAPIF_STATE_SCHED *
6421 : NAPIF_STATE_MISSED;
6422 882 : } while (cmpxchg(&n->state, val, new) != val);
6423 :
6424 882 : return !(val & NAPIF_STATE_SCHED);
6425 : }
6426 : EXPORT_SYMBOL(napi_schedule_prep);
6427 :
6428 : /**
6429 : * __napi_schedule_irqoff - schedule for receive
6430 : * @n: entry to schedule
6431 : *
6432 : * Variant of __napi_schedule() assuming hard irqs are masked
6433 : */
6434 0 : void __napi_schedule_irqoff(struct napi_struct *n)
6435 : {
6436 0 : ____napi_schedule(this_cpu_ptr(&softnet_data), n);
6437 0 : }
6438 : EXPORT_SYMBOL(__napi_schedule_irqoff);
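
/*
 * Sketch (not from dev.c): the canonical hard-irq handler built on
 * napi_schedule_prep()/__napi_schedule_irqoff().  Hard irqs are masked in
 * the handler, so the _irqoff variant is appropriate.  struct my_priv (a
 * private struct embedding a napi_struct, see the netif_napi_add() sketch
 * further down) and my_disable_rx_irq() are invented names.
 */
static irqreturn_t my_rx_interrupt(int irq, void *data)
{
	struct my_priv *priv = data;

	if (napi_schedule_prep(&priv->napi)) {
		my_disable_rx_irq(priv);	/* mask further RX interrupts */
		__napi_schedule_irqoff(&priv->napi);
	}
	return IRQ_HANDLED;
}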
6439 :
6440 855 : bool napi_complete_done(struct napi_struct *n, int work_done)
6441 : {
6442 855 : unsigned long flags, val, new, timeout = 0;
6443 855 : bool ret = true;
6444 :
6445 : /*
6446 : * 1) Don't let napi dequeue from the cpu poll list
6447 : * just in case it's running on a different cpu.
6448 : * 2) If we are busy polling, do nothing here; we have
6449 : * the guarantee we will be called later.
6450 : */
6451 855 : if (unlikely(n->state & (NAPIF_STATE_NPSVC |
6452 : NAPIF_STATE_IN_BUSY_POLL)))
6453 : return false;
6454 :
6455 855 : if (work_done) {
6456 404 : if (n->gro_bitmask)
6457 11 : timeout = READ_ONCE(n->dev->gro_flush_timeout);
6458 404 : n->defer_hard_irqs_count = READ_ONCE(n->dev->napi_defer_hard_irqs);
6459 : }
6460 855 : if (n->defer_hard_irqs_count > 0) {
6461 0 : n->defer_hard_irqs_count--;
6462 0 : timeout = READ_ONCE(n->dev->gro_flush_timeout);
6463 0 : if (timeout)
6464 0 : ret = false;
6465 : }
6466 855 : if (n->gro_bitmask) {
6467 : /* When the NAPI instance uses a timeout and keeps postponing
6468 : * it, we need to somehow bound the time packets are kept in
6469 : * the GRO layer.
6470 : */
6471 11 : napi_gro_flush(n, !!timeout);
6472 : }
6473 :
6474 855 : gro_normal_list(n);
6475 :
6476 855 : if (unlikely(!list_empty(&n->poll_list))) {
6477 : /* If n->poll_list is not empty, we need to mask irqs */
6478 0 : local_irq_save(flags);
6479 0 : list_del_init(&n->poll_list);
6480 0 : local_irq_restore(flags);
6481 : }
6482 :
6483 855 : do {
6484 855 : val = READ_ONCE(n->state);
6485 :
6486 855 : WARN_ON_ONCE(!(val & NAPIF_STATE_SCHED));
6487 :
6488 855 : new = val & ~(NAPIF_STATE_MISSED | NAPIF_STATE_SCHED |
6489 : NAPIF_STATE_PREFER_BUSY_POLL);
6490 :
6491 : /* If STATE_MISSED was set, leave STATE_SCHED set,
6492 : * because we will call napi->poll() one more time.
6493 : * This C code was suggested by Alexander Duyck to help gcc.
6494 : */
6495 855 : new |= (val & NAPIF_STATE_MISSED) / NAPIF_STATE_MISSED *
6496 : NAPIF_STATE_SCHED;
6497 855 : } while (cmpxchg(&n->state, val, new) != val);
6498 :
6499 855 : if (unlikely(val & NAPIF_STATE_MISSED)) {
6500 77 : __napi_schedule(n);
6501 77 : return false;
6502 : }
6503 :
6504 778 : if (timeout)
6505 0 : hrtimer_start(&n->timer, ns_to_ktime(timeout),
6506 : HRTIMER_MODE_REL_PINNED);
6507 : return ret;
6508 : }
6509 : EXPORT_SYMBOL(napi_complete_done);
6510 :
6511 : /* must be called under rcu_read_lock(), as we don't take a reference */
6512 1 : static struct napi_struct *napi_by_id(unsigned int napi_id)
6513 : {
6514 1 : unsigned int hash = napi_id % HASH_SIZE(napi_hash);
6515 1 : struct napi_struct *napi;
6516 :
6517 2 : hlist_for_each_entry_rcu(napi, &napi_hash[hash], napi_hash_node)
6518 0 : if (napi->napi_id == napi_id)
6519 0 : return napi;
6520 :
6521 : return NULL;
6522 : }
6523 :
6524 : #if defined(CONFIG_NET_RX_BUSY_POLL)
6525 :
6526 0 : static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
6527 : {
6528 0 : if (!skip_schedule) {
6529 0 : gro_normal_list(napi);
6530 0 : __napi_schedule(napi);
6531 0 : return;
6532 : }
6533 :
6534 0 : if (napi->gro_bitmask) {
6535 : /* flush too old packets
6536 : * If HZ < 1000, flush all packets.
6537 : */
6538 0 : napi_gro_flush(napi, HZ >= 1000);
6539 : }
6540 :
6541 0 : gro_normal_list(napi);
6542 0 : clear_bit(NAPI_STATE_SCHED, &napi->state);
6543 : }
6544 :
6545 0 : static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
6546 : u16 budget)
6547 : {
6548 0 : bool skip_schedule = false;
6549 0 : unsigned long timeout;
6550 0 : int rc;
6551 :
6552 : /* Busy polling means there is a high chance the device driver hard irq
6553 : * could not grab NAPI_STATE_SCHED, and that NAPI_STATE_MISSED was
6554 : * set in napi_schedule_prep().
6555 : * Since we are about to call napi->poll() once more, we can safely
6556 : * clear NAPI_STATE_MISSED.
6557 : *
6558 : * Note: x86 could use a single "lock and ..." instruction
6559 : * to perform these two clear_bit()
6560 : */
6561 0 : clear_bit(NAPI_STATE_MISSED, &napi->state);
6562 0 : clear_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state);
6563 :
6564 0 : local_bh_disable();
6565 :
6566 0 : if (prefer_busy_poll) {
6567 0 : napi->defer_hard_irqs_count = READ_ONCE(napi->dev->napi_defer_hard_irqs);
6568 0 : timeout = READ_ONCE(napi->dev->gro_flush_timeout);
6569 0 : if (napi->defer_hard_irqs_count && timeout) {
6570 0 : hrtimer_start(&napi->timer, ns_to_ktime(timeout), HRTIMER_MODE_REL_PINNED);
6571 0 : skip_schedule = true;
6572 : }
6573 : }
6574 :
6575 : /* All we really want here is to re-enable device interrupts.
6576 : * Ideally, a new ndo_busy_poll_stop() could avoid another round.
6577 : */
6578 0 : rc = napi->poll(napi, budget);
6579 : /* We can't gro_normal_list() here, because napi->poll() might have
6580 : * rearmed the napi (napi_complete_done()) in which case it could
6581 : * already be running on another CPU.
6582 : */
6583 0 : trace_napi_poll(napi, rc, budget);
6584 0 : netpoll_poll_unlock(have_poll_lock);
6585 0 : if (rc == budget)
6586 0 : __busy_poll_stop(napi, skip_schedule);
6587 0 : local_bh_enable();
6588 0 : }
6589 :
6590 0 : void napi_busy_loop(unsigned int napi_id,
6591 : bool (*loop_end)(void *, unsigned long),
6592 : void *loop_end_arg, bool prefer_busy_poll, u16 budget)
6593 : {
6594 0 : unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
6595 0 : int (*napi_poll)(struct napi_struct *napi, int budget);
6596 0 : void *have_poll_lock = NULL;
6597 0 : struct napi_struct *napi;
6598 :
6599 0 : restart:
6600 0 : napi_poll = NULL;
6601 :
6602 0 : rcu_read_lock();
6603 :
6604 0 : napi = napi_by_id(napi_id);
6605 0 : if (!napi)
6606 0 : goto out;
6607 :
6608 0 : preempt_disable();
6609 0 : for (;;) {
6610 0 : int work = 0;
6611 :
6612 0 : local_bh_disable();
6613 0 : if (!napi_poll) {
6614 0 : unsigned long val = READ_ONCE(napi->state);
6615 :
6616 : /* If multiple threads are competing for this napi,
6617 : * we avoid dirtying napi->state as much as we can.
6618 : */
6619 0 : if (val & (NAPIF_STATE_DISABLE | NAPIF_STATE_SCHED |
6620 : NAPIF_STATE_IN_BUSY_POLL)) {
6621 0 : if (prefer_busy_poll)
6622 0 : set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6623 0 : goto count;
6624 : }
6625 0 : if (cmpxchg(&napi->state, val,
6626 : val | NAPIF_STATE_IN_BUSY_POLL |
6627 : NAPIF_STATE_SCHED) != val) {
6628 0 : if (prefer_busy_poll)
6629 0 : set_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6630 0 : goto count;
6631 : }
6632 0 : have_poll_lock = netpoll_poll_lock(napi);
6633 0 : napi_poll = napi->poll;
6634 : }
6635 0 : work = napi_poll(napi, budget);
6636 0 : trace_napi_poll(napi, work, budget);
6637 0 : gro_normal_list(napi);
6638 0 : count:
6639 0 : if (work > 0)
6640 0 : __NET_ADD_STATS(dev_net(napi->dev),
6641 : LINUX_MIB_BUSYPOLLRXPACKETS, work);
6642 0 : local_bh_enable();
6643 :
6644 0 : if (!loop_end || loop_end(loop_end_arg, start_time))
6645 : break;
6646 :
6647 0 : if (unlikely(need_resched())) {
6648 0 : if (napi_poll)
6649 0 : busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
6650 0 : preempt_enable();
6651 0 : rcu_read_unlock();
6652 0 : cond_resched();
6653 0 : if (loop_end(loop_end_arg, start_time))
6654 : return;
6655 0 : goto restart;
6656 : }
6657 0 : cpu_relax();
6658 : }
6659 0 : if (napi_poll)
6660 0 : busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
6661 0 : preempt_enable();
6662 0 : out:
6663 0 : rcu_read_unlock();
6664 : }
6665 : EXPORT_SYMBOL(napi_busy_loop);
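
/*
 * Sketch (not from dev.c): busy polling a NAPI instance until a jiffies
 * deadline.  The loop_end callback returns true when polling should stop;
 * this one ignores start_time and compares against a deadline smuggled in
 * through loop_end_arg.  my_loop_end() and my_busy_poll_until() are
 * invented names, and the budget of 64 is an arbitrary choice.
 */
static bool my_loop_end(void *arg, unsigned long start_time)
{
	unsigned long deadline = (unsigned long)arg;

	return time_after(jiffies, deadline);
}

static void my_busy_poll_until(unsigned int napi_id, unsigned long deadline)
{
	napi_busy_loop(napi_id, my_loop_end, (void *)deadline,
		       false /* prefer_busy_poll */, 64 /* budget */);
}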
6666 :
6667 : #endif /* CONFIG_NET_RX_BUSY_POLL */
6668 :
6669 2 : static void napi_hash_add(struct napi_struct *napi)
6670 : {
6671 2 : if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state))
6672 : return;
6673 :
6674 1 : spin_lock(&napi_hash_lock);
6675 :
6676 : /* 0..NR_CPUS range is reserved for sender_cpu use */
6677 1 : do {
6678 1 : if (unlikely(++napi_gen_id < MIN_NAPI_ID))
6679 0 : napi_gen_id = MIN_NAPI_ID;
6680 1 : } while (napi_by_id(napi_gen_id));
6681 1 : napi->napi_id = napi_gen_id;
6682 :
6683 2 : hlist_add_head_rcu(&napi->napi_hash_node,
6684 1 : &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
6685 :
6686 1 : spin_unlock(&napi_hash_lock);
6687 : }
6688 :
6689 : /* Warning : the caller is responsible for making sure the rcu grace
6690 : * period is respected before freeing the memory containing @napi
6691 : */
6692 0 : static void napi_hash_del(struct napi_struct *napi)
6693 : {
6694 0 : spin_lock(&napi_hash_lock);
6695 :
6696 0 : hlist_del_init_rcu(&napi->napi_hash_node);
6697 :
6698 0 : spin_unlock(&napi_hash_lock);
6699 0 : }
6700 :
6701 0 : static enum hrtimer_restart napi_watchdog(struct hrtimer *timer)
6702 : {
6703 0 : struct napi_struct *napi;
6704 :
6705 0 : napi = container_of(timer, struct napi_struct, timer);
6706 :
6707 : /* Note : we use a relaxed variant of napi_schedule_prep() not setting
6708 : * NAPI_STATE_MISSED, since we do not react to a device IRQ.
6709 : */
6710 0 : if (!napi_disable_pending(napi) &&
6711 0 : !test_and_set_bit(NAPI_STATE_SCHED, &napi->state)) {
6712 0 : clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &napi->state);
6713 0 : __napi_schedule_irqoff(napi);
6714 : }
6715 :
6716 0 : return HRTIMER_NORESTART;
6717 : }
6718 :
6719 6 : static void init_gro_hash(struct napi_struct *napi)
6720 : {
6721 : int i;
6722 :
6723 54 : for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6724 48 : INIT_LIST_HEAD(&napi->gro_hash[i].list);
6725 48 : napi->gro_hash[i].count = 0;
6726 : }
6727 6 : napi->gro_bitmask = 0;
6728 : }
6729 :
6730 0 : int dev_set_threaded(struct net_device *dev, bool threaded)
6731 : {
6732 0 : struct napi_struct *napi;
6733 0 : int err = 0;
6734 :
6735 0 : if (dev->threaded == threaded)
6736 : return 0;
6737 :
6738 0 : if (threaded) {
6739 0 : list_for_each_entry(napi, &dev->napi_list, dev_list) {
6740 0 : if (!napi->thread) {
6741 0 : err = napi_kthread_create(napi);
6742 0 : if (err) {
6743 : threaded = false;
6744 : break;
6745 : }
6746 : }
6747 : }
6748 : }
6749 :
6750 0 : dev->threaded = threaded;
6751 :
6752 : /* Make sure kthread is created before THREADED bit
6753 : * is set.
6754 : */
6755 0 : smp_mb__before_atomic();
6756 :
6757 : /* Setting/unsetting threaded mode on a napi might not immediately
6758 : * take effect, if the current napi instance is actively being
6759 : * polled. In this case, the switch between threaded mode and
6760 : * softirq mode will happen in the next round of napi_schedule().
6761 : * This should not cause hiccups/stalls to the live traffic.
6762 : */
6763 0 : list_for_each_entry(napi, &dev->napi_list, dev_list) {
6764 0 : if (threaded)
6765 0 : set_bit(NAPI_STATE_THREADED, &napi->state);
6766 : else
6767 0 : clear_bit(NAPI_STATE_THREADED, &napi->state);
6768 : }
6769 :
6770 : return err;
6771 : }
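
/*
 * Sketch (not from dev.c): flipping a device's NAPI instances to threaded
 * mode.  dev_set_threaded() walks dev->napi_list, so the sketch takes RTNL
 * around it; in-tree the call is typically driven from the "threaded"
 * sysfs attribute.  my_enable_threaded_napi() is an invented name.
 */
static int my_enable_threaded_napi(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_threaded(dev, true);
	rtnl_unlock();

	return err;
}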
6772 :
6773 2 : void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
6774 : int (*poll)(struct napi_struct *, int), int weight)
6775 : {
6776 2 : if (WARN_ON(test_and_set_bit(NAPI_STATE_LISTED, &napi->state)))
6777 : return;
6778 :
6779 2 : INIT_LIST_HEAD(&napi->poll_list);
6780 2 : INIT_HLIST_NODE(&napi->napi_hash_node);
6781 2 : hrtimer_init(&napi->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
6782 2 : napi->timer.function = napi_watchdog;
6783 2 : init_gro_hash(napi);
6784 2 : napi->skb = NULL;
6785 2 : INIT_LIST_HEAD(&napi->rx_list);
6786 2 : napi->rx_count = 0;
6787 2 : napi->poll = poll;
6788 2 : if (weight > NAPI_POLL_WEIGHT)
6789 0 : netdev_err_once(dev, "%s() called with weight %d\n", __func__,
6790 : weight);
6791 2 : napi->weight = weight;
6792 2 : napi->dev = dev;
6793 : #ifdef CONFIG_NETPOLL
6794 : napi->poll_owner = -1;
6795 : #endif
6796 2 : set_bit(NAPI_STATE_SCHED, &napi->state);
6797 2 : set_bit(NAPI_STATE_NPSVC, &napi->state);
6798 2 : list_add_rcu(&napi->dev_list, &dev->napi_list);
6799 2 : napi_hash_add(napi);
6800 : /* Create kthread for this napi if dev->threaded is set.
6801 : * Clear dev->threaded if kthread creation failed so that
6802 : * threaded mode will not be enabled in napi_enable().
6803 : */
6804 2 : if (dev->threaded && napi_kthread_create(napi))
6805 0 : dev->threaded = 0;
6806 : }
6807 : EXPORT_SYMBOL(netif_napi_add);
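
/*
 * Sketch (not from dev.c): registering a NAPI context at probe time.
 * struct my_priv, my_setup_napi() and my_napi_poll() are invented names;
 * NAPI_POLL_WEIGHT (64) is the conventional weight.
 */
struct my_priv {
	struct napi_struct napi;
	/* ... device-specific state ... */
};

static void my_setup_napi(struct net_device *dev, struct my_priv *priv)
{
	netif_napi_add(dev, &priv->napi, my_napi_poll, NAPI_POLL_WEIGHT);
	/* The context starts with SCHED/NPSVC set; see napi_enable() below. */
}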
6808 :
6809 0 : void napi_disable(struct napi_struct *n)
6810 : {
6811 0 : might_sleep();
6812 0 : set_bit(NAPI_STATE_DISABLE, &n->state);
6813 :
6814 0 : while (test_and_set_bit(NAPI_STATE_SCHED, &n->state))
6815 0 : msleep(1);
6816 0 : while (test_and_set_bit(NAPI_STATE_NPSVC, &n->state))
6817 0 : msleep(1);
6818 :
6819 0 : hrtimer_cancel(&n->timer);
6820 :
6821 0 : clear_bit(NAPI_STATE_PREFER_BUSY_POLL, &n->state);
6822 0 : clear_bit(NAPI_STATE_DISABLE, &n->state);
6823 0 : clear_bit(NAPI_STATE_THREADED, &n->state);
6824 0 : }
6825 : EXPORT_SYMBOL(napi_disable);
6826 :
6827 : /**
6828 : * napi_enable - enable NAPI scheduling
6829 : * @n: NAPI context
6830 : *
6831 : * Resume NAPI from being scheduled on this context.
6832 : * Must be paired with napi_disable.
6833 : */
6834 2 : void napi_enable(struct napi_struct *n)
6835 : {
6836 2 : BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
6837 2 : smp_mb__before_atomic();
6838 2 : clear_bit(NAPI_STATE_SCHED, &n->state);
6839 2 : clear_bit(NAPI_STATE_NPSVC, &n->state);
6840 2 : if (n->dev->threaded && n->thread)
6841 0 : set_bit(NAPI_STATE_THREADED, &n->state);
6842 2 : }
6843 : EXPORT_SYMBOL(napi_enable);
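
/*
 * Sketch (not from dev.c): pairing napi_enable()/napi_disable() in a
 * hypothetical driver's ndo_open/ndo_stop, as the kernel-doc above asks.
 * my_start_rx()/my_stop_rx() are invented names for unmasking/masking the
 * device's RX interrupts.
 */
static int my_ndo_open(struct net_device *dev)
{
	struct my_priv *priv = netdev_priv(dev);

	napi_enable(&priv->napi);	/* clears SCHED/NPSVC, may set THREADED */
	my_start_rx(priv);
	return 0;
}

static int my_ndo_stop(struct net_device *dev)
{
	struct my_priv *priv = netdev_priv(dev);

	my_stop_rx(priv);
	napi_disable(&priv->napi);	/* sleeps until any running poll finishes */
	return 0;
}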
6844 :
6845 0 : static void flush_gro_hash(struct napi_struct *napi)
6846 : {
6847 0 : int i;
6848 :
6849 0 : for (i = 0; i < GRO_HASH_BUCKETS; i++) {
6850 0 : struct sk_buff *skb, *n;
6851 :
6852 0 : list_for_each_entry_safe(skb, n, &napi->gro_hash[i].list, list)
6853 0 : kfree_skb(skb);
6854 0 : napi->gro_hash[i].count = 0;
6855 : }
6856 0 : }
6857 :
6858 : /* Must be called in process context */
6859 0 : void __netif_napi_del(struct napi_struct *napi)
6860 : {
6861 0 : if (!test_and_clear_bit(NAPI_STATE_LISTED, &napi->state))
6862 : return;
6863 :
6864 0 : napi_hash_del(napi);
6865 0 : list_del_rcu(&napi->dev_list);
6866 0 : napi_free_frags(napi);
6867 :
6868 0 : flush_gro_hash(napi);
6869 0 : napi->gro_bitmask = 0;
6870 :
6871 0 : if (napi->thread) {
6872 0 : kthread_stop(napi->thread);
6873 0 : napi->thread = NULL;
6874 : }
6875 : }
6876 : EXPORT_SYMBOL(__netif_napi_del);
6877 :
6878 855 : static int __napi_poll(struct napi_struct *n, bool *repoll)
6879 : {
6880 855 : int work, weight;
6881 :
6882 855 : weight = n->weight;
6883 :
6884 : /* This NAPI_STATE_SCHED test is for avoiding a race
6885 : * with netpoll's poll_napi(). Only the entity which
6886 : * obtains the lock and sees NAPI_STATE_SCHED set will
6887 : * actually make the ->poll() call. Therefore we avoid
6888 : * accidentally calling ->poll() when NAPI is not scheduled.
6889 : */
6890 855 : work = 0;
6891 855 : if (test_bit(NAPI_STATE_SCHED, &n->state)) {
6892 855 : work = n->poll(n, weight);
6893 855 : trace_napi_poll(n, work, weight);
6894 : }
6895 :
6896 855 : if (unlikely(work > weight))
6897 0 : pr_err_once("NAPI poll function %pS returned %d, exceeding its budget of %d.\n",
6898 : n->poll, work, weight);
6899 :
6900 855 : if (likely(work < weight))
6901 : return work;
6902 :
6903 : /* Drivers must not modify the NAPI state if they
6904 : * consume the entire weight. In such cases this code
6905 : * still "owns" the NAPI instance and therefore can
6906 : * move the instance around on the list at-will.
6907 : */
6908 0 : if (unlikely(napi_disable_pending(n))) {
6909 0 : napi_complete(n);
6910 0 : return work;
6911 : }
6912 :
6913 : /* The NAPI context has more processing work, but busy-polling
6914 : * is preferred. Exit early.
6915 : */
6916 0 : if (napi_prefer_busy_poll(n)) {
6917 0 : if (napi_complete_done(n, work)) {
6918 : /* If timeout is not set, we need to make sure
6919 : * that the NAPI is re-scheduled.
6920 : */
6921 0 : napi_schedule(n);
6922 : }
6923 0 : return work;
6924 : }
6925 :
6926 0 : if (n->gro_bitmask) {
6927 : /* flush too old packets
6928 : * If HZ < 1000, flush all packets.
6929 : */
6930 0 : napi_gro_flush(n, HZ >= 1000);
6931 : }
6932 :
6933 0 : gro_normal_list(n);
6934 :
6935 : /* Some drivers may have called napi_schedule
6936 : * prior to exhausting their budget.
6937 : */
6938 0 : if (unlikely(!list_empty(&n->poll_list))) {
6939 0 : pr_warn_once("%s: Budget exhausted after napi rescheduled\n",
6940 : n->dev ? n->dev->name : "backlog");
6941 0 : return work;
6942 : }
6943 :
6944 0 : *repoll = true;
6945 :
6946 0 : return work;
6947 : }
6948 :
6949 855 : static int napi_poll(struct napi_struct *n, struct list_head *repoll)
6950 : {
6951 855 : bool do_repoll = false;
6952 855 : void *have;
6953 855 : int work;
6954 :
6955 855 : list_del_init(&n->poll_list);
6956 :
6957 855 : have = netpoll_poll_lock(n);
6958 :
6959 855 : work = __napi_poll(n, &do_repoll);
6960 :
6961 855 : if (do_repoll)
6962 0 : list_add_tail(&n->poll_list, repoll);
6963 :
6964 855 : netpoll_poll_unlock(have);
6965 :
6966 855 : return work;
6967 : }
6968 :
6969 0 : static int napi_thread_wait(struct napi_struct *napi)
6970 : {
6971 0 : set_current_state(TASK_INTERRUPTIBLE);
6972 :
6973 0 : while (!kthread_should_stop() && !napi_disable_pending(napi)) {
6974 0 : if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
6975 0 : WARN_ON(!list_empty(&napi->poll_list));
6976 0 : __set_current_state(TASK_RUNNING);
6977 0 : return 0;
6978 : }
6979 :
6980 0 : schedule();
6981 0 : set_current_state(TASK_INTERRUPTIBLE);
6982 : }
6983 0 : __set_current_state(TASK_RUNNING);
6984 0 : return -1;
6985 : }
6986 :
6987 0 : static int napi_threaded_poll(void *data)
6988 : {
6989 0 : struct napi_struct *napi = data;
6990 0 : void *have;
6991 :
6992 0 : while (!napi_thread_wait(napi)) {
6993 0 : for (;;) {
6994 0 : bool repoll = false;
6995 :
6996 0 : local_bh_disable();
6997 :
6998 0 : have = netpoll_poll_lock(napi);
6999 0 : __napi_poll(napi, &repoll);
7000 0 : netpoll_poll_unlock(have);
7001 :
7002 0 : local_bh_enable();
7003 :
7004 0 : if (!repoll)
7005 : break;
7006 :
7007 0 : cond_resched();
7008 : }
7009 : }
7010 0 : return 0;
7011 : }
7012 :
7013 478 : static __latent_entropy void net_rx_action(struct softirq_action *h)
7014 : {
7015 478 : struct softnet_data *sd = this_cpu_ptr(&softnet_data);
7016 956 : unsigned long time_limit = jiffies +
7017 478 : usecs_to_jiffies(netdev_budget_usecs);
7018 478 : int budget = netdev_budget;
7019 478 : LIST_HEAD(list);
7020 478 : LIST_HEAD(repoll);
7021 :
7022 478 : local_irq_disable();
7023 478 : list_splice_init(&sd->poll_list, &list);
7024 478 : local_irq_enable();
7025 :
7026 1333 : for (;;) {
7027 1333 : struct napi_struct *n;
7028 :
7029 1333 : if (list_empty(&list)) {
7030 478 : if (!sd_has_rps_ipi_waiting(sd) && list_empty(&repoll))
7031 478 : return;
7032 : break;
7033 : }
7034 :
7035 855 : n = list_first_entry(&list, struct napi_struct, poll_list);
7036 855 : budget -= napi_poll(n, &repoll);
7037 :
7038 : /* If softirq window is exhausted then punt.
7039 : * Allow this to run for 2 jiffies, which allows
7040 : * an average latency of 1.5/HZ.
7041 : */
7042 855 : if (unlikely(budget <= 0 ||
7043 : time_after_eq(jiffies, time_limit))) {
7044 0 : sd->time_squeeze++;
7045 0 : break;
7046 : }
7047 : }
7048 :
7049 0 : local_irq_disable();
7050 :
7051 0 : list_splice_tail_init(&sd->poll_list, &list);
7052 0 : list_splice_tail(&repoll, &list);
7053 0 : list_splice(&list, &sd->poll_list);
7054 0 : if (!list_empty(&sd->poll_list))
7055 0 : __raise_softirq_irqoff(NET_RX_SOFTIRQ);
7056 :
7057 0 : net_rps_action_and_irq_enable(sd);
7058 : }
7059 :
7060 : struct netdev_adjacent {
7061 : struct net_device *dev;
7062 :
7063 : /* upper master flag; there can only be one master device per list */
7064 : bool master;
7065 :
7066 : /* lookup ignore flag */
7067 : bool ignore;
7068 :
7069 : /* counter for the number of times this device was added to us */
7070 : u16 ref_nr;
7071 :
7072 : /* private field for the users */
7073 : void *private;
7074 :
7075 : struct list_head list;
7076 : struct rcu_head rcu;
7077 : };
7078 :
7079 0 : static struct netdev_adjacent *__netdev_find_adj(struct net_device *adj_dev,
7080 : struct list_head *adj_list)
7081 : {
7082 0 : struct netdev_adjacent *adj;
7083 :
7084 0 : list_for_each_entry(adj, adj_list, list) {
7085 0 : if (adj->dev == adj_dev)
7086 : return adj;
7087 : }
7088 : return NULL;
7089 : }
7090 :
7091 0 : static int ____netdev_has_upper_dev(struct net_device *upper_dev,
7092 : struct netdev_nested_priv *priv)
7093 : {
7094 0 : struct net_device *dev = (struct net_device *)priv->data;
7095 :
7096 0 : return upper_dev == dev;
7097 : }
7098 :
7099 : /**
7100 : * netdev_has_upper_dev - Check if device is linked to an upper device
7101 : * @dev: device
7102 : * @upper_dev: upper device to check
7103 : *
7104 : * Find out if a device is linked to the specified upper device and return true
7105 : * in case it is. Note that this checks only the immediate upper device,
7106 : * not through a complete stack of devices. The caller must hold the RTNL lock.
7107 : */
7108 0 : bool netdev_has_upper_dev(struct net_device *dev,
7109 : struct net_device *upper_dev)
7110 : {
7111 0 : struct netdev_nested_priv priv = {
7112 : .data = (void *)upper_dev,
7113 : };
7114 :
7115 0 : ASSERT_RTNL();
7116 :
7117 0 : return netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
7118 : &priv);
7119 : }
7120 : EXPORT_SYMBOL(netdev_has_upper_dev);
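
/*
 * Sketch (not from dev.c): checking under RTNL whether a port is directly
 * linked to a given upper device (e.g. a bond or bridge) before acting on
 * it.  my_is_port_of() is an invented name.
 */
static bool my_is_port_of(struct net_device *port, struct net_device *upper)
{
	bool linked;

	rtnl_lock();
	linked = netdev_has_upper_dev(port, upper);
	rtnl_unlock();

	return linked;
}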
7121 :
7122 : /**
7123 : * netdev_has_upper_dev_all_rcu - Check if device is linked to an upper device
7124 : * @dev: device
7125 : * @upper_dev: upper device to check
7126 : *
7127 : * Find out if a device is linked to the specified upper device and return true
7128 : * in case it is. Note that this checks the entire upper device chain.
7129 : * The caller must hold rcu lock.
7130 : */
7131 :
7132 0 : bool netdev_has_upper_dev_all_rcu(struct net_device *dev,
7133 : struct net_device *upper_dev)
7134 : {
7135 0 : struct netdev_nested_priv priv = {
7136 : .data = (void *)upper_dev,
7137 : };
7138 :
7139 0 : return !!netdev_walk_all_upper_dev_rcu(dev, ____netdev_has_upper_dev,
7140 : &priv);
7141 : }
7142 : EXPORT_SYMBOL(netdev_has_upper_dev_all_rcu);
7143 :
7144 : /**
7145 : * netdev_has_any_upper_dev - Check if device is linked to some device
7146 : * @dev: device
7147 : *
7148 : * Find out if a device is linked to an upper device and return true in case
7149 : * it is. The caller must hold the RTNL lock.
7150 : */
7151 0 : bool netdev_has_any_upper_dev(struct net_device *dev)
7152 : {
7153 0 : ASSERT_RTNL();
7154 :
7155 0 : return !list_empty(&dev->adj_list.upper);
7156 : }
7157 : EXPORT_SYMBOL(netdev_has_any_upper_dev);
7158 :
7159 : /**
7160 : * netdev_master_upper_dev_get - Get master upper device
7161 : * @dev: device
7162 : *
7163 : * Find a master upper device and return pointer to it or NULL in case
7164 : * it's not there. The caller must hold the RTNL lock.
7165 : */
7166 2 : struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
7167 : {
7168 2 : struct netdev_adjacent *upper;
7169 :
7170 2 : ASSERT_RTNL();
7171 :
7172 2 : if (list_empty(&dev->adj_list.upper))
7173 : return NULL;
7174 :
7175 0 : upper = list_first_entry(&dev->adj_list.upper,
7176 : struct netdev_adjacent, list);
7177 0 : if (likely(upper->master))
7178 0 : return upper->dev;
7179 : return NULL;
7180 : }
7181 : EXPORT_SYMBOL(netdev_master_upper_dev_get);
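
/* Hedged usage sketch: report a port's master (e.g. its bond or bridge)
 * using netdev_master_upper_dev_get() under RTNL. my_print_master is a
 * hypothetical helper, not an existing kernel function.
 */
static void my_print_master(struct net_device *port_dev)
{
	struct net_device *master;

	ASSERT_RTNL();
	master = netdev_master_upper_dev_get(port_dev);
	if (master)
		netdev_info(port_dev, "enslaved to %s\n", master->name);
	else
		netdev_info(port_dev, "no master upper device\n");
}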
7182 :
7183 0 : static struct net_device *__netdev_master_upper_dev_get(struct net_device *dev)
7184 : {
7185 0 : struct netdev_adjacent *upper;
7186 :
7187 0 : ASSERT_RTNL();
7188 :
7189 0 : if (list_empty(&dev->adj_list.upper))
7190 : return NULL;
7191 :
7192 0 : upper = list_first_entry(&dev->adj_list.upper,
7193 : struct netdev_adjacent, list);
7194 0 : if (likely(upper->master) && !upper->ignore)
7195 0 : return upper->dev;
7196 : return NULL;
7197 : }
7198 :
7199 : /**
7200 : * netdev_has_any_lower_dev - Check if device is linked to some device
7201 : * @dev: device
7202 : *
7203 : * Find out if a device is linked to a lower device and return true in case
7204 : * it is. The caller must hold the RTNL lock.
7205 : */
7206 0 : static bool netdev_has_any_lower_dev(struct net_device *dev)
7207 : {
7208 0 : ASSERT_RTNL();
7209 :
7210 0 : return !list_empty(&dev->adj_list.lower);
7211 : }
7212 :
7213 0 : void *netdev_adjacent_get_private(struct list_head *adj_list)
7214 : {
7215 0 : struct netdev_adjacent *adj;
7216 :
7217 0 : adj = list_entry(adj_list, struct netdev_adjacent, list);
7218 :
7219 0 : return adj->private;
7220 : }
7221 : EXPORT_SYMBOL(netdev_adjacent_get_private);
7222 :
7223 : /**
7224 : * netdev_upper_get_next_dev_rcu - Get the next dev from upper list
7225 : * @dev: device
7226 : * @iter: list_head ** of the current position
7227 : *
7228 : * Gets the next device from the dev's upper list, starting from iter
7229 : * position. The caller must hold RCU read lock.
7230 : */
7231 4 : struct net_device *netdev_upper_get_next_dev_rcu(struct net_device *dev,
7232 : struct list_head **iter)
7233 : {
7234 4 : struct netdev_adjacent *upper;
7235 :
7236 8 : WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7237 :
7238 4 : upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7239 :
7240 4 : if (&upper->list == &dev->adj_list.upper)
7241 : return NULL;
7242 :
7243 0 : *iter = &upper->list;
7244 :
7245 0 : return upper->dev;
7246 : }
7247 : EXPORT_SYMBOL(netdev_upper_get_next_dev_rcu);
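
/* Hedged usage sketch: count the direct upper devices under the RCU read
 * lock. netdev_for_each_upper_dev_rcu() from <linux/netdevice.h> is the
 * usual iterator built on netdev_upper_get_next_dev_rcu(); the helper
 * name my_count_uppers is hypothetical.
 */
static unsigned int my_count_uppers(struct net_device *dev)
{
	struct net_device *upper;
	struct list_head *iter;
	unsigned int n = 0;

	rcu_read_lock();
	netdev_for_each_upper_dev_rcu(dev, upper, iter)
		n++;
	rcu_read_unlock();

	return n;
}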
7248 :
7249 0 : static struct net_device *__netdev_next_upper_dev(struct net_device *dev,
7250 : struct list_head **iter,
7251 : bool *ignore)
7252 : {
7253 0 : struct netdev_adjacent *upper;
7254 :
7255 0 : upper = list_entry((*iter)->next, struct netdev_adjacent, list);
7256 :
7257 0 : if (&upper->list == &dev->adj_list.upper)
7258 : return NULL;
7259 :
7260 0 : *iter = &upper->list;
7261 0 : *ignore = upper->ignore;
7262 :
7263 0 : return upper->dev;
7264 : }
7265 :
7266 0 : static struct net_device *netdev_next_upper_dev_rcu(struct net_device *dev,
7267 : struct list_head **iter)
7268 : {
7269 0 : struct netdev_adjacent *upper;
7270 :
7271 0 : WARN_ON_ONCE(!rcu_read_lock_held() && !lockdep_rtnl_is_held());
7272 :
7273 0 : upper = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7274 :
7275 0 : if (&upper->list == &dev->adj_list.upper)
7276 : return NULL;
7277 :
7278 0 : *iter = &upper->list;
7279 :
7280 0 : return upper->dev;
7281 : }
7282 :
7283 0 : static int __netdev_walk_all_upper_dev(struct net_device *dev,
7284 : int (*fn)(struct net_device *dev,
7285 : struct netdev_nested_priv *priv),
7286 : struct netdev_nested_priv *priv)
7287 : {
7288 0 : struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7289 0 : struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7290 0 : int ret, cur = 0;
7291 0 : bool ignore;
7292 :
7293 0 : now = dev;
7294 0 : iter = &dev->adj_list.upper;
7295 :
7296 0 : while (1) {
7297 0 : if (now != dev) {
7298 0 : ret = fn(now, priv);
7299 0 : if (ret)
7300 0 : return ret;
7301 : }
7302 :
7303 0 : next = NULL;
7304 0 : while (1) {
7305 0 : udev = __netdev_next_upper_dev(now, &iter, &ignore);
7306 0 : if (!udev)
7307 : break;
7308 0 : if (ignore)
7309 0 : continue;
7310 :
7311 0 : next = udev;
7312 0 : niter = &udev->adj_list.upper;
7313 0 : dev_stack[cur] = now;
7314 0 : iter_stack[cur++] = iter;
7315 0 : break;
7316 : }
7317 :
7318 0 : if (!next) {
7319 0 : if (!cur)
7320 : return 0;
7321 0 : next = dev_stack[--cur];
7322 0 : niter = iter_stack[cur];
7323 : }
7324 :
7325 : now = next;
7326 : iter = niter;
7327 : }
7328 :
7329 : return 0;
7330 : }
7331 :
7332 0 : int netdev_walk_all_upper_dev_rcu(struct net_device *dev,
7333 : int (*fn)(struct net_device *dev,
7334 : struct netdev_nested_priv *priv),
7335 : struct netdev_nested_priv *priv)
7336 : {
7337 0 : struct net_device *udev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7338 0 : struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7339 0 : int ret, cur = 0;
7340 :
7341 0 : now = dev;
7342 0 : iter = &dev->adj_list.upper;
7343 :
7344 0 : while (1) {
7345 0 : if (now != dev) {
7346 0 : ret = fn(now, priv);
7347 0 : if (ret)
7348 0 : return ret;
7349 : }
7350 :
7351 0 : next = NULL;
7352 0 : while (1) {
7353 0 : udev = netdev_next_upper_dev_rcu(now, &iter);
7354 0 : if (!udev)
7355 : break;
7356 :
7357 0 : next = udev;
7358 0 : niter = &udev->adj_list.upper;
7359 0 : dev_stack[cur] = now;
7360 0 : iter_stack[cur++] = iter;
7361 0 : break;
7362 : }
7363 :
7364 0 : if (!next) {
7365 0 : if (!cur)
7366 : return 0;
7367 0 : next = dev_stack[--cur];
7368 0 : niter = iter_stack[cur];
7369 : }
7370 :
7371 0 : now = next;
7372 0 : iter = niter;
7373 : }
7374 :
7375 : return 0;
7376 : }
7377 : EXPORT_SYMBOL_GPL(netdev_walk_all_upper_dev_rcu);
7378 :
7379 0 : static bool __netdev_has_upper_dev(struct net_device *dev,
7380 : struct net_device *upper_dev)
7381 : {
7382 0 : struct netdev_nested_priv priv = {
7383 : .flags = 0,
7384 : .data = (void *)upper_dev,
7385 : };
7386 :
7387 0 : ASSERT_RTNL();
7388 :
7389 0 : return __netdev_walk_all_upper_dev(dev, ____netdev_has_upper_dev,
7390 : &priv);
7391 : }
7392 :
7393 : /**
7394 : * netdev_lower_get_next_private - Get the next ->private from the
7395 : * lower neighbour list
7396 : * @dev: device
7397 : * @iter: list_head ** of the current position
7398 : *
7399 : * Gets the next netdev_adjacent->private from the dev's lower neighbour
7400 : * list, starting from iter position. The caller must either hold the
7401 : * RTNL lock or its own locking that guarantees that the neighbour lower
7402 : * list will remain unchanged.
7403 : */
7404 0 : void *netdev_lower_get_next_private(struct net_device *dev,
7405 : struct list_head **iter)
7406 : {
7407 0 : struct netdev_adjacent *lower;
7408 :
7409 0 : lower = list_entry(*iter, struct netdev_adjacent, list);
7410 :
7411 0 : if (&lower->list == &dev->adj_list.lower)
7412 : return NULL;
7413 :
7414 0 : *iter = lower->list.next;
7415 :
7416 0 : return lower->private;
7417 : }
7418 : EXPORT_SYMBOL(netdev_lower_get_next_private);
7419 :
7420 : /**
7421 : * netdev_lower_get_next_private_rcu - Get the next ->private from the
7422 : * lower neighbour list, RCU
7423 : * variant
7424 : * @dev: device
7425 : * @iter: list_head ** of the current position
7426 : *
7427 : * Gets the next netdev_adjacent->private from the dev's lower neighbour
7428 : * list, starting from iter position. The caller must hold RCU read lock.
7429 : */
7430 0 : void *netdev_lower_get_next_private_rcu(struct net_device *dev,
7431 : struct list_head **iter)
7432 : {
7433 0 : struct netdev_adjacent *lower;
7434 :
7435 0 : WARN_ON_ONCE(!rcu_read_lock_held());
7436 :
7437 0 : lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7438 :
7439 0 : if (&lower->list == &dev->adj_list.lower)
7440 : return NULL;
7441 :
7442 0 : *iter = &lower->list;
7443 :
7444 0 : return lower->private;
7445 : }
7446 : EXPORT_SYMBOL(netdev_lower_get_next_private_rcu);
7447 :
7448 : /**
7449 : * netdev_lower_get_next - Get the next device from the lower neighbour
7450 : * list
7451 : * @dev: device
7452 : * @iter: list_head ** of the current position
7453 : *
7454 : * Gets the next netdev_adjacent from the dev's lower neighbour
7455 : * list, starting from iter position. The caller must hold RTNL lock or
7456 : * its own locking that guarantees that the neighbour lower
7457 : * list will remain unchanged.
7458 : */
7459 4 : void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
7460 : {
7461 4 : struct netdev_adjacent *lower;
7462 :
7463 4 : lower = list_entry(*iter, struct netdev_adjacent, list);
7464 :
7465 0 : if (&lower->list == &dev->adj_list.lower)
7466 : return NULL;
7467 :
7468 0 : *iter = lower->list.next;
7469 :
7470 0 : return lower->dev;
7471 : }
7472 : EXPORT_SYMBOL(netdev_lower_get_next);
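
/* Hedged usage sketch: iterate the direct lower devices with
 * netdev_for_each_lower_dev() (the <linux/netdevice.h> wrapper around
 * netdev_lower_get_next()) while holding RTNL. my_log_lowers is a
 * hypothetical helper.
 */
static void my_log_lowers(struct net_device *dev)
{
	struct net_device *lower;
	struct list_head *iter;

	ASSERT_RTNL();
	netdev_for_each_lower_dev(dev, lower, iter)
		netdev_dbg(dev, "lower device: %s\n", lower->name);
}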
7473 :
7474 0 : static struct net_device *netdev_next_lower_dev(struct net_device *dev,
7475 : struct list_head **iter)
7476 : {
7477 0 : struct netdev_adjacent *lower;
7478 :
7479 0 : lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7480 :
7481 0 : if (&lower->list == &dev->adj_list.lower)
7482 : return NULL;
7483 :
7484 0 : *iter = &lower->list;
7485 :
7486 0 : return lower->dev;
7487 : }
7488 :
7489 0 : static struct net_device *__netdev_next_lower_dev(struct net_device *dev,
7490 : struct list_head **iter,
7491 : bool *ignore)
7492 : {
7493 0 : struct netdev_adjacent *lower;
7494 :
7495 0 : lower = list_entry((*iter)->next, struct netdev_adjacent, list);
7496 :
7497 0 : if (&lower->list == &dev->adj_list.lower)
7498 : return NULL;
7499 :
7500 0 : *iter = &lower->list;
7501 0 : *ignore = lower->ignore;
7502 :
7503 0 : return lower->dev;
7504 : }
7505 :
7506 0 : int netdev_walk_all_lower_dev(struct net_device *dev,
7507 : int (*fn)(struct net_device *dev,
7508 : struct netdev_nested_priv *priv),
7509 : struct netdev_nested_priv *priv)
7510 : {
7511 0 : struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7512 0 : struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7513 0 : int ret, cur = 0;
7514 :
7515 0 : now = dev;
7516 0 : iter = &dev->adj_list.lower;
7517 :
7518 0 : while (1) {
7519 0 : if (now != dev) {
7520 0 : ret = fn(now, priv);
7521 0 : if (ret)
7522 0 : return ret;
7523 : }
7524 :
7525 0 : next = NULL;
7526 0 : while (1) {
7527 0 : ldev = netdev_next_lower_dev(now, &iter);
7528 0 : if (!ldev)
7529 : break;
7530 :
7531 0 : next = ldev;
7532 0 : niter = &ldev->adj_list.lower;
7533 0 : dev_stack[cur] = now;
7534 0 : iter_stack[cur++] = iter;
7535 0 : break;
7536 : }
7537 :
7538 0 : if (!next) {
7539 0 : if (!cur)
7540 : return 0;
7541 0 : next = dev_stack[--cur];
7542 0 : niter = iter_stack[cur];
7543 : }
7544 :
7545 : now = next;
7546 : iter = niter;
7547 : }
7548 :
7549 : return 0;
7550 : }
7551 : EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev);
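
/* Hedged usage sketch: count every device in the nested tree below "dev"
 * (not just the direct lowers) by handing a callback and a
 * struct netdev_nested_priv to netdev_walk_all_lower_dev(). Both
 * my_count_one() and my_count_all_lowers() are hypothetical helpers.
 */
static int my_count_one(struct net_device *lower,
			struct netdev_nested_priv *priv)
{
	unsigned int *count = (unsigned int *)priv->data;

	(*count)++;
	return 0;	/* returning non-zero would stop the walk */
}

static unsigned int my_count_all_lowers(struct net_device *dev)
{
	unsigned int count = 0;
	struct netdev_nested_priv priv = {
		.data = (void *)&count,
	};

	ASSERT_RTNL();
	netdev_walk_all_lower_dev(dev, my_count_one, &priv);

	return count;
}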
7552 :
7553 0 : static int __netdev_walk_all_lower_dev(struct net_device *dev,
7554 : int (*fn)(struct net_device *dev,
7555 : struct netdev_nested_priv *priv),
7556 : struct netdev_nested_priv *priv)
7557 : {
7558 0 : struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7559 0 : struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7560 0 : int ret, cur = 0;
7561 0 : bool ignore;
7562 :
7563 0 : now = dev;
7564 0 : iter = &dev->adj_list.lower;
7565 :
7566 0 : while (1) {
7567 0 : if (now != dev) {
7568 0 : ret = fn(now, priv);
7569 0 : if (ret)
7570 0 : return ret;
7571 : }
7572 :
7573 0 : next = NULL;
7574 0 : while (1) {
7575 0 : ldev = __netdev_next_lower_dev(now, &iter, &ignore);
7576 0 : if (!ldev)
7577 : break;
7578 0 : if (ignore)
7579 0 : continue;
7580 :
7581 0 : next = ldev;
7582 0 : niter = &ldev->adj_list.lower;
7583 0 : dev_stack[cur] = now;
7584 0 : iter_stack[cur++] = iter;
7585 0 : break;
7586 : }
7587 :
7588 0 : if (!next) {
7589 0 : if (!cur)
7590 : return 0;
7591 0 : next = dev_stack[--cur];
7592 0 : niter = iter_stack[cur];
7593 : }
7594 :
7595 : now = next;
7596 : iter = niter;
7597 : }
7598 :
7599 : return 0;
7600 : }
7601 :
7602 0 : struct net_device *netdev_next_lower_dev_rcu(struct net_device *dev,
7603 : struct list_head **iter)
7604 : {
7605 0 : struct netdev_adjacent *lower;
7606 :
7607 0 : lower = list_entry_rcu((*iter)->next, struct netdev_adjacent, list);
7608 0 : if (&lower->list == &dev->adj_list.lower)
7609 : return NULL;
7610 :
7611 0 : *iter = &lower->list;
7612 :
7613 0 : return lower->dev;
7614 : }
7615 : EXPORT_SYMBOL(netdev_next_lower_dev_rcu);
7616 :
7617 0 : static u8 __netdev_upper_depth(struct net_device *dev)
7618 : {
7619 0 : struct net_device *udev;
7620 0 : struct list_head *iter;
7621 0 : u8 max_depth = 0;
7622 0 : bool ignore;
7623 :
7624 0 : for (iter = &dev->adj_list.upper,
7625 0 : udev = __netdev_next_upper_dev(dev, &iter, &ignore);
7626 0 : udev;
7627 0 : udev = __netdev_next_upper_dev(dev, &iter, &ignore)) {
7628 0 : if (ignore)
7629 0 : continue;
7630 0 : if (max_depth < udev->upper_level)
7631 : max_depth = udev->upper_level;
7632 : }
7633 :
7634 0 : return max_depth;
7635 : }
7636 :
7637 0 : static u8 __netdev_lower_depth(struct net_device *dev)
7638 : {
7639 0 : struct net_device *ldev;
7640 0 : struct list_head *iter;
7641 0 : u8 max_depth = 0;
7642 0 : bool ignore;
7643 :
7644 0 : for (iter = &dev->adj_list.lower,
7645 0 : ldev = __netdev_next_lower_dev(dev, &iter, &ignore);
7646 0 : ldev;
7647 0 : ldev = __netdev_next_lower_dev(dev, &iter, &ignore)) {
7648 0 : if (ignore)
7649 0 : continue;
7650 0 : if (max_depth < ldev->lower_level)
7651 : max_depth = ldev->lower_level;
7652 : }
7653 :
7654 0 : return max_depth;
7655 : }
7656 :
7657 0 : static int __netdev_update_upper_level(struct net_device *dev,
7658 : struct netdev_nested_priv *__unused)
7659 : {
7660 0 : dev->upper_level = __netdev_upper_depth(dev) + 1;
7661 0 : return 0;
7662 : }
7663 :
7664 0 : static int __netdev_update_lower_level(struct net_device *dev,
7665 : struct netdev_nested_priv *priv)
7666 : {
7667 0 : dev->lower_level = __netdev_lower_depth(dev) + 1;
7668 :
7669 : #ifdef CONFIG_LOCKDEP
7670 0 : if (!priv)
7671 : return 0;
7672 :
7673 0 : if (priv->flags & NESTED_SYNC_IMM)
7674 0 : dev->nested_level = dev->lower_level - 1;
7675 0 : if (priv->flags & NESTED_SYNC_TODO)
7676 0 : net_unlink_todo(dev);
7677 : #endif
7678 : return 0;
7679 : }
7680 :
7681 0 : int netdev_walk_all_lower_dev_rcu(struct net_device *dev,
7682 : int (*fn)(struct net_device *dev,
7683 : struct netdev_nested_priv *priv),
7684 : struct netdev_nested_priv *priv)
7685 : {
7686 0 : struct net_device *ldev, *next, *now, *dev_stack[MAX_NEST_DEV + 1];
7687 0 : struct list_head *niter, *iter, *iter_stack[MAX_NEST_DEV + 1];
7688 0 : int ret, cur = 0;
7689 :
7690 0 : now = dev;
7691 0 : iter = &dev->adj_list.lower;
7692 :
7693 0 : while (1) {
7694 0 : if (now != dev) {
7695 0 : ret = fn(now, priv);
7696 0 : if (ret)
7697 0 : return ret;
7698 : }
7699 :
7700 0 : next = NULL;
7701 0 : while (1) {
7702 0 : ldev = netdev_next_lower_dev_rcu(now, &iter);
7703 0 : if (!ldev)
7704 : break;
7705 :
7706 0 : next = ldev;
7707 0 : niter = &ldev->adj_list.lower;
7708 0 : dev_stack[cur] = now;
7709 0 : iter_stack[cur++] = iter;
7710 0 : break;
7711 : }
7712 :
7713 0 : if (!next) {
7714 0 : if (!cur)
7715 : return 0;
7716 0 : next = dev_stack[--cur];
7717 0 : niter = iter_stack[cur];
7718 : }
7719 :
7720 : now = next;
7721 : iter = niter;
7722 : }
7723 :
7724 : return 0;
7725 : }
7726 : EXPORT_SYMBOL_GPL(netdev_walk_all_lower_dev_rcu);
7727 :
7728 : /**
7729 : * netdev_lower_get_first_private_rcu - Get the first ->private from the
7730 : * lower neighbour list, RCU
7731 : * variant
7732 : * @dev: device
7733 : *
7734 : * Gets the first netdev_adjacent->private from the dev's lower neighbour
7735 : * list. The caller must hold RCU read lock.
7736 : */
7737 0 : void *netdev_lower_get_first_private_rcu(struct net_device *dev)
7738 : {
7739 0 : struct netdev_adjacent *lower;
7740 :
7741 0 : lower = list_first_or_null_rcu(&dev->adj_list.lower,
7742 : struct netdev_adjacent, list);
7743 0 : if (lower)
7744 0 : return lower->private;
7745 : return NULL;
7746 : }
7747 : EXPORT_SYMBOL(netdev_lower_get_first_private_rcu);
7748 :
7749 : /**
7750 : * netdev_master_upper_dev_get_rcu - Get master upper device
7751 : * @dev: device
7752 : *
7753 : * Find a master upper device and return pointer to it or NULL in case
7754 : * it's not there. The caller must hold the RCU read lock.
7755 : */
7756 32 : struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
7757 : {
7758 32 : struct netdev_adjacent *upper;
7759 :
7760 32 : upper = list_first_or_null_rcu(&dev->adj_list.upper,
7761 : struct netdev_adjacent, list);
7762 32 : if (upper && likely(upper->master))
7763 0 : return upper->dev;
7764 : return NULL;
7765 : }
7766 : EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
7767 :
7768 0 : static int netdev_adjacent_sysfs_add(struct net_device *dev,
7769 : struct net_device *adj_dev,
7770 : struct list_head *dev_list)
7771 : {
7772 0 : char linkname[IFNAMSIZ+7];
7773 :
7774 0 : sprintf(linkname, dev_list == &dev->adj_list.upper ?
7775 0 : "upper_%s" : "lower_%s", adj_dev->name);
7776 0 : return sysfs_create_link(&(dev->dev.kobj), &(adj_dev->dev.kobj),
7777 : linkname);
7778 : }
7779 0 : static void netdev_adjacent_sysfs_del(struct net_device *dev,
7780 : char *name,
7781 : struct list_head *dev_list)
7782 : {
7783 0 : char linkname[IFNAMSIZ+7];
7784 :
7785 0 : sprintf(linkname, dev_list == &dev->adj_list.upper ?
7786 : "upper_%s" : "lower_%s", name);
7787 0 : sysfs_remove_link(&(dev->dev.kobj), linkname);
7788 0 : }
7789 :
7790 0 : static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
7791 : struct net_device *adj_dev,
7792 : struct list_head *dev_list)
7793 : {
7794 0 : return (dev_list == &dev->adj_list.upper ||
7795 0 : dev_list == &dev->adj_list.lower) &&
7796 0 : net_eq(dev_net(dev), dev_net(adj_dev));
7797 : }
7798 :
7799 0 : static int __netdev_adjacent_dev_insert(struct net_device *dev,
7800 : struct net_device *adj_dev,
7801 : struct list_head *dev_list,
7802 : void *private, bool master)
7803 : {
7804 0 : struct netdev_adjacent *adj;
7805 0 : int ret;
7806 :
7807 0 : adj = __netdev_find_adj(adj_dev, dev_list);
7808 :
7809 0 : if (adj) {
7810 0 : adj->ref_nr += 1;
7811 0 : pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d\n",
7812 : dev->name, adj_dev->name, adj->ref_nr);
7813 :
7814 0 : return 0;
7815 : }
7816 :
7817 0 : adj = kmalloc(sizeof(*adj), GFP_KERNEL);
7818 0 : if (!adj)
7819 : return -ENOMEM;
7820 :
7821 0 : adj->dev = adj_dev;
7822 0 : adj->master = master;
7823 0 : adj->ref_nr = 1;
7824 0 : adj->private = private;
7825 0 : adj->ignore = false;
7826 0 : dev_hold(adj_dev);
7827 :
7828 0 : pr_debug("Insert adjacency: dev %s adj_dev %s adj->ref_nr %d; dev_hold on %s\n",
7829 : dev->name, adj_dev->name, adj->ref_nr, adj_dev->name);
7830 :
7831 0 : if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
7832 0 : ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
7833 0 : if (ret)
7834 0 : goto free_adj;
7835 : }
7836 :
7837 : /* Ensure that master link is always the first item in list. */
7838 0 : if (master) {
7839 0 : ret = sysfs_create_link(&(dev->dev.kobj),
7840 : &(adj_dev->dev.kobj), "master");
7841 0 : if (ret)
7842 0 : goto remove_symlinks;
7843 :
7844 0 : list_add_rcu(&adj->list, dev_list);
7845 : } else {
7846 0 : list_add_tail_rcu(&adj->list, dev_list);
7847 : }
7848 :
7849 : return 0;
7850 :
7851 0 : remove_symlinks:
7852 0 : if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7853 0 : netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7854 0 : free_adj:
7855 0 : kfree(adj);
7856 0 : dev_put(adj_dev);
7857 :
7858 0 : return ret;
7859 : }
7860 :
7861 0 : static void __netdev_adjacent_dev_remove(struct net_device *dev,
7862 : struct net_device *adj_dev,
7863 : u16 ref_nr,
7864 : struct list_head *dev_list)
7865 : {
7866 0 : struct netdev_adjacent *adj;
7867 :
7868 0 : pr_debug("Remove adjacency: dev %s adj_dev %s ref_nr %d\n",
7869 : dev->name, adj_dev->name, ref_nr);
7870 :
7871 0 : adj = __netdev_find_adj(adj_dev, dev_list);
7872 :
7873 0 : if (!adj) {
7874 0 : pr_err("Adjacency does not exist for device %s from %s\n",
7875 : dev->name, adj_dev->name);
7876 0 : WARN_ON(1);
7877 0 : return;
7878 : }
7879 :
7880 0 : if (adj->ref_nr > ref_nr) {
7881 0 : pr_debug("adjacency: %s to %s ref_nr - %d = %d\n",
7882 : dev->name, adj_dev->name, ref_nr,
7883 : adj->ref_nr - ref_nr);
7884 0 : adj->ref_nr -= ref_nr;
7885 0 : return;
7886 : }
7887 :
7888 0 : if (adj->master)
7889 0 : sysfs_remove_link(&(dev->dev.kobj), "master");
7890 :
7891 0 : if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
7892 0 : netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
7893 :
7894 0 : list_del_rcu(&adj->list);
7895 0 : pr_debug("adjacency: dev_put for %s, because link removed from %s to %s\n",
7896 : adj_dev->name, dev->name, adj_dev->name);
7897 0 : dev_put(adj_dev);
7898 0 : kfree_rcu(adj, rcu);
7899 : }
7900 :
7901 0 : static int __netdev_adjacent_dev_link_lists(struct net_device *dev,
7902 : struct net_device *upper_dev,
7903 : struct list_head *up_list,
7904 : struct list_head *down_list,
7905 : void *private, bool master)
7906 : {
7907 0 : int ret;
7908 :
7909 0 : ret = __netdev_adjacent_dev_insert(dev, upper_dev, up_list,
7910 : private, master);
7911 0 : if (ret)
7912 : return ret;
7913 :
7914 0 : ret = __netdev_adjacent_dev_insert(upper_dev, dev, down_list,
7915 : private, false);
7916 0 : if (ret) {
7917 0 : __netdev_adjacent_dev_remove(dev, upper_dev, 1, up_list);
7918 0 : return ret;
7919 : }
7920 :
7921 : return 0;
7922 : }
7923 :
7924 0 : static void __netdev_adjacent_dev_unlink_lists(struct net_device *dev,
7925 : struct net_device *upper_dev,
7926 : u16 ref_nr,
7927 : struct list_head *up_list,
7928 : struct list_head *down_list)
7929 : {
7930 0 : __netdev_adjacent_dev_remove(dev, upper_dev, ref_nr, up_list);
7931 0 : __netdev_adjacent_dev_remove(upper_dev, dev, ref_nr, down_list);
7932 0 : }
7933 :
7934 0 : static int __netdev_adjacent_dev_link_neighbour(struct net_device *dev,
7935 : struct net_device *upper_dev,
7936 : void *private, bool master)
7937 : {
7938 0 : return __netdev_adjacent_dev_link_lists(dev, upper_dev,
7939 : &dev->adj_list.upper,
7940 : &upper_dev->adj_list.lower,
7941 : private, master);
7942 : }
7943 :
7944 0 : static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
7945 : struct net_device *upper_dev)
7946 : {
7947 0 : __netdev_adjacent_dev_unlink_lists(dev, upper_dev, 1,
7948 : &dev->adj_list.upper,
7949 : &upper_dev->adj_list.lower);
7950 : }
7951 :
7952 0 : static int __netdev_upper_dev_link(struct net_device *dev,
7953 : struct net_device *upper_dev, bool master,
7954 : void *upper_priv, void *upper_info,
7955 : struct netdev_nested_priv *priv,
7956 : struct netlink_ext_ack *extack)
7957 : {
7958 0 : struct netdev_notifier_changeupper_info changeupper_info = {
7959 : .info = {
7960 : .dev = dev,
7961 : .extack = extack,
7962 : },
7963 : .upper_dev = upper_dev,
7964 : .master = master,
7965 : .linking = true,
7966 : .upper_info = upper_info,
7967 : };
7968 0 : struct net_device *master_dev;
7969 0 : int ret = 0;
7970 :
7971 0 : ASSERT_RTNL();
7972 :
7973 0 : if (dev == upper_dev)
7974 : return -EBUSY;
7975 :
7976 : /* To prevent loops, check if dev is not upper device to upper_dev. */
7977 0 : if (__netdev_has_upper_dev(upper_dev, dev))
7978 : return -EBUSY;
7979 :
7980 0 : if ((dev->lower_level + upper_dev->upper_level) > MAX_NEST_DEV)
7981 : return -EMLINK;
7982 :
7983 0 : if (!master) {
7984 0 : if (__netdev_has_upper_dev(dev, upper_dev))
7985 : return -EEXIST;
7986 : } else {
7987 0 : master_dev = __netdev_master_upper_dev_get(dev);
7988 0 : if (master_dev)
7989 0 : return master_dev == upper_dev ? -EEXIST : -EBUSY;
7990 : }
7991 :
7992 0 : ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
7993 : &changeupper_info.info);
7994 0 : ret = notifier_to_errno(ret);
7995 0 : if (ret)
7996 0 : return ret;
7997 :
7998 0 : ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
7999 : master);
8000 0 : if (ret)
8001 : return ret;
8002 :
8003 0 : ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
8004 : &changeupper_info.info);
8005 0 : ret = notifier_to_errno(ret);
8006 0 : if (ret)
8007 0 : goto rollback;
8008 :
8009 0 : __netdev_update_upper_level(dev, NULL);
8010 0 : __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
8011 :
8012 0 : __netdev_update_lower_level(upper_dev, priv);
8013 0 : __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
8014 : priv);
8015 :
8016 0 : return 0;
8017 :
8018 0 : rollback:
8019 0 : __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
8020 :
8021 0 : return ret;
8022 : }
8023 :
8024 : /**
8025 : * netdev_upper_dev_link - Add a link to the upper device
8026 : * @dev: device
8027 : * @upper_dev: new upper device
8028 : * @extack: netlink extended ack
8029 : *
8030 : * Adds a link to a device which is upper to this one. The caller must hold
8031 : * the RTNL lock. On a failure a negative errno code is returned.
8032 : * On success the reference counts are adjusted and the function
8033 : * returns zero.
8034 : */
8035 0 : int netdev_upper_dev_link(struct net_device *dev,
8036 : struct net_device *upper_dev,
8037 : struct netlink_ext_ack *extack)
8038 : {
8039 0 : struct netdev_nested_priv priv = {
8040 : .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8041 : .data = NULL,
8042 : };
8043 :
8044 0 : return __netdev_upper_dev_link(dev, upper_dev, false,
8045 : NULL, NULL, &priv, extack);
8046 : }
8047 : EXPORT_SYMBOL(netdev_upper_dev_link);
8048 :
8049 : /**
8050 : * netdev_master_upper_dev_link - Add a master link to the upper device
8051 : * @dev: device
8052 : * @upper_dev: new upper device
8053 : * @upper_priv: upper device private
8054 : * @upper_info: upper info to be passed down via notifier
8055 : * @extack: netlink extended ack
8056 : *
8057 : * Adds a link to a device which is upper to this one. In this case, only
8058 : * one master upper device can be linked, although other non-master devices
8059 : * might be linked as well. The caller must hold the RTNL lock.
8060 : * On a failure a negative errno code is returned. On success the reference
8061 : * counts are adjusted and the function returns zero.
8062 : */
8063 0 : int netdev_master_upper_dev_link(struct net_device *dev,
8064 : struct net_device *upper_dev,
8065 : void *upper_priv, void *upper_info,
8066 : struct netlink_ext_ack *extack)
8067 : {
8068 0 : struct netdev_nested_priv priv = {
8069 : .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8070 : .data = NULL,
8071 : };
8072 :
8073 0 : return __netdev_upper_dev_link(dev, upper_dev, true,
8074 : upper_priv, upper_info, &priv, extack);
8075 : }
8076 : EXPORT_SYMBOL(netdev_master_upper_dev_link);
8077 :
8078 0 : static void __netdev_upper_dev_unlink(struct net_device *dev,
8079 : struct net_device *upper_dev,
8080 : struct netdev_nested_priv *priv)
8081 : {
8082 0 : struct netdev_notifier_changeupper_info changeupper_info = {
8083 : .info = {
8084 : .dev = dev,
8085 : },
8086 : .upper_dev = upper_dev,
8087 : .linking = false,
8088 : };
8089 :
8090 0 : ASSERT_RTNL();
8091 :
8092 0 : changeupper_info.master = netdev_master_upper_dev_get(dev) == upper_dev;
8093 :
8094 0 : call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER,
8095 : &changeupper_info.info);
8096 :
8097 0 : __netdev_adjacent_dev_unlink_neighbour(dev, upper_dev);
8098 :
8099 0 : call_netdevice_notifiers_info(NETDEV_CHANGEUPPER,
8100 : &changeupper_info.info);
8101 :
8102 0 : __netdev_update_upper_level(dev, NULL);
8103 0 : __netdev_walk_all_lower_dev(dev, __netdev_update_upper_level, NULL);
8104 :
8105 0 : __netdev_update_lower_level(upper_dev, priv);
8106 0 : __netdev_walk_all_upper_dev(upper_dev, __netdev_update_lower_level,
8107 : priv);
8108 0 : }
8109 :
8110 : /**
8111 : * netdev_upper_dev_unlink - Removes a link to upper device
8112 : * @dev: device
8113 : * @upper_dev: upper device to unlink
8114 : *
8115 : * Removes a link to a device which is upper to this one. The caller must hold
8116 : * the RTNL lock.
8117 : */
8118 0 : void netdev_upper_dev_unlink(struct net_device *dev,
8119 : struct net_device *upper_dev)
8120 : {
8121 0 : struct netdev_nested_priv priv = {
8122 : .flags = NESTED_SYNC_TODO,
8123 : .data = NULL,
8124 : };
8125 :
8126 0 : __netdev_upper_dev_unlink(dev, upper_dev, &priv);
8127 0 : }
8128 : EXPORT_SYMBOL(netdev_upper_dev_unlink);
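
/* Hedged usage sketch: the usual link/rollback pattern around
 * netdev_upper_dev_link() and netdev_upper_dev_unlink(), under RTNL.
 * my_attach() is a hypothetical driver helper; the driver-specific setup
 * step is left as a comment.
 */
static int my_attach(struct net_device *lower, struct net_device *upper,
		     struct netlink_ext_ack *extack)
{
	int err;

	ASSERT_RTNL();
	err = netdev_upper_dev_link(lower, upper, extack);
	if (err)
		return err;

	/* Driver-specific setup would go here; if it fails, roll back with
	 * netdev_upper_dev_unlink(lower, upper) before returning the error.
	 */
	return 0;
}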
8129 :
8130 0 : static void __netdev_adjacent_dev_set(struct net_device *upper_dev,
8131 : struct net_device *lower_dev,
8132 : bool val)
8133 : {
8134 0 : struct netdev_adjacent *adj;
8135 :
8136 0 : adj = __netdev_find_adj(lower_dev, &upper_dev->adj_list.lower);
8137 0 : if (adj)
8138 0 : adj->ignore = val;
8139 :
8140 0 : adj = __netdev_find_adj(upper_dev, &lower_dev->adj_list.upper);
8141 0 : if (adj)
8142 0 : adj->ignore = val;
8143 0 : }
8144 :
8145 0 : static void netdev_adjacent_dev_disable(struct net_device *upper_dev,
8146 : struct net_device *lower_dev)
8147 : {
8148 0 : __netdev_adjacent_dev_set(upper_dev, lower_dev, true);
8149 0 : }
8150 :
8151 0 : static void netdev_adjacent_dev_enable(struct net_device *upper_dev,
8152 : struct net_device *lower_dev)
8153 : {
8154 0 : __netdev_adjacent_dev_set(upper_dev, lower_dev, false);
8155 0 : }
8156 :
8157 0 : int netdev_adjacent_change_prepare(struct net_device *old_dev,
8158 : struct net_device *new_dev,
8159 : struct net_device *dev,
8160 : struct netlink_ext_ack *extack)
8161 : {
8162 0 : struct netdev_nested_priv priv = {
8163 : .flags = 0,
8164 : .data = NULL,
8165 : };
8166 0 : int err;
8167 :
8168 0 : if (!new_dev)
8169 : return 0;
8170 :
8171 0 : if (old_dev && new_dev != old_dev)
8172 0 : netdev_adjacent_dev_disable(dev, old_dev);
8173 0 : err = __netdev_upper_dev_link(new_dev, dev, false, NULL, NULL, &priv,
8174 : extack);
8175 0 : if (err) {
8176 0 : if (old_dev && new_dev != old_dev)
8177 0 : netdev_adjacent_dev_enable(dev, old_dev);
8178 0 : return err;
8179 : }
8180 :
8181 : return 0;
8182 : }
8183 : EXPORT_SYMBOL(netdev_adjacent_change_prepare);
8184 :
8185 0 : void netdev_adjacent_change_commit(struct net_device *old_dev,
8186 : struct net_device *new_dev,
8187 : struct net_device *dev)
8188 : {
8189 0 : struct netdev_nested_priv priv = {
8190 : .flags = NESTED_SYNC_IMM | NESTED_SYNC_TODO,
8191 : .data = NULL,
8192 : };
8193 :
8194 0 : if (!new_dev || !old_dev)
8195 0 : return;
8196 :
8197 0 : if (new_dev == old_dev)
8198 : return;
8199 :
8200 0 : netdev_adjacent_dev_enable(dev, old_dev);
8201 0 : __netdev_upper_dev_unlink(old_dev, dev, &priv);
8202 : }
8203 : EXPORT_SYMBOL(netdev_adjacent_change_commit);
8204 :
8205 0 : void netdev_adjacent_change_abort(struct net_device *old_dev,
8206 : struct net_device *new_dev,
8207 : struct net_device *dev)
8208 : {
8209 0 : struct netdev_nested_priv priv = {
8210 : .flags = 0,
8211 : .data = NULL,
8212 : };
8213 :
8214 0 : if (!new_dev)
8215 0 : return;
8216 :
8217 0 : if (old_dev && new_dev != old_dev)
8218 0 : netdev_adjacent_dev_enable(dev, old_dev);
8219 :
8220 0 : __netdev_upper_dev_unlink(new_dev, dev, &priv);
8221 : }
8222 : EXPORT_SYMBOL(netdev_adjacent_change_abort);
8223 :
8224 : /**
8225 : * netdev_bonding_info_change - Dispatch event about slave change
8226 : * @dev: device
8227 : * @bonding_info: info to dispatch
8228 : *
8229 : * Send NETDEV_BONDING_INFO to netdev notifiers with info.
8230 : * The caller must hold the RTNL lock.
8231 : */
8232 0 : void netdev_bonding_info_change(struct net_device *dev,
8233 : struct netdev_bonding_info *bonding_info)
8234 : {
8235 0 : struct netdev_notifier_bonding_info info = {
8236 : .info.dev = dev,
8237 : };
8238 :
8239 0 : memcpy(&info.bonding_info, bonding_info,
8240 : sizeof(struct netdev_bonding_info));
8241 0 : call_netdevice_notifiers_info(NETDEV_BONDING_INFO,
8242 : &info.info);
8243 0 : }
8244 : EXPORT_SYMBOL(netdev_bonding_info_change);
8245 :
8246 : /**
8247 : * netdev_get_xmit_slave - Get the xmit slave of master device
8248 : * @dev: device
8249 : * @skb: The packet
8250 : * @all_slaves: assume all the slaves are active
8251 : *
8252 : * The reference counters are not incremented so the caller must be
8253 : * careful with locks. The caller must hold RCU lock.
8254 : * %NULL is returned if no slave is found.
8255 : */
8256 :
8257 0 : struct net_device *netdev_get_xmit_slave(struct net_device *dev,
8258 : struct sk_buff *skb,
8259 : bool all_slaves)
8260 : {
8261 0 : const struct net_device_ops *ops = dev->netdev_ops;
8262 :
8263 0 : if (!ops->ndo_get_xmit_slave)
8264 : return NULL;
8265 0 : return ops->ndo_get_xmit_slave(dev, skb, all_slaves);
8266 : }
8267 : EXPORT_SYMBOL(netdev_get_xmit_slave);
8268 :
8269 0 : static struct net_device *netdev_sk_get_lower_dev(struct net_device *dev,
8270 : struct sock *sk)
8271 : {
8272 0 : const struct net_device_ops *ops = dev->netdev_ops;
8273 :
8274 0 : if (!ops->ndo_sk_get_lower_dev)
8275 : return NULL;
8276 0 : return ops->ndo_sk_get_lower_dev(dev, sk);
8277 : }
8278 :
8279 : /**
8280 : * netdev_sk_get_lowest_dev - Get the lowest device in chain given device and socket
8281 : * @dev: device
8282 : * @sk: the socket
8283 : *
8284 : * %NULL is returned if no lower device is found.
8285 : */
8286 :
8287 0 : struct net_device *netdev_sk_get_lowest_dev(struct net_device *dev,
8288 : struct sock *sk)
8289 : {
8290 0 : struct net_device *lower;
8291 :
8292 0 : lower = netdev_sk_get_lower_dev(dev, sk);
8293 0 : while (lower) {
8294 0 : dev = lower;
8295 0 : lower = netdev_sk_get_lower_dev(dev, sk);
8296 : }
8297 :
8298 0 : return dev;
8299 : }
8300 : EXPORT_SYMBOL(netdev_sk_get_lowest_dev);
8301 :
8302 : static void netdev_adjacent_add_links(struct net_device *dev)
8303 : {
8304 : struct netdev_adjacent *iter;
8305 :
8306 : struct net *net = dev_net(dev);
8307 :
8308 : list_for_each_entry(iter, &dev->adj_list.upper, list) {
8309 : if (!net_eq(net, dev_net(iter->dev)))
8310 : continue;
8311 : netdev_adjacent_sysfs_add(iter->dev, dev,
8312 : &iter->dev->adj_list.lower);
8313 : netdev_adjacent_sysfs_add(dev, iter->dev,
8314 : &dev->adj_list.upper);
8315 : }
8316 :
8317 : list_for_each_entry(iter, &dev->adj_list.lower, list) {
8318 : if (!net_eq(net, dev_net(iter->dev)))
8319 : continue;
8320 : netdev_adjacent_sysfs_add(iter->dev, dev,
8321 : &iter->dev->adj_list.upper);
8322 : netdev_adjacent_sysfs_add(dev, iter->dev,
8323 : &dev->adj_list.lower);
8324 : }
8325 : }
8326 :
8327 : static void netdev_adjacent_del_links(struct net_device *dev)
8328 : {
8329 : struct netdev_adjacent *iter;
8330 :
8331 : struct net *net = dev_net(dev);
8332 :
8333 : list_for_each_entry(iter, &dev->adj_list.upper, list) {
8334 : if (!net_eq(net, dev_net(iter->dev)))
8335 : continue;
8336 : netdev_adjacent_sysfs_del(iter->dev, dev->name,
8337 : &iter->dev->adj_list.lower);
8338 : netdev_adjacent_sysfs_del(dev, iter->dev->name,
8339 : &dev->adj_list.upper);
8340 : }
8341 :
8342 : list_for_each_entry(iter, &dev->adj_list.lower, list) {
8343 : if (!net_eq(net, dev_net(iter->dev)))
8344 : continue;
8345 : netdev_adjacent_sysfs_del(iter->dev, dev->name,
8346 : &iter->dev->adj_list.upper);
8347 : netdev_adjacent_sysfs_del(dev, iter->dev->name,
8348 : &dev->adj_list.lower);
8349 : }
8350 : }
8351 :
8352 0 : void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
8353 : {
8354 0 : struct netdev_adjacent *iter;
8355 :
8356 0 : struct net *net = dev_net(dev);
8357 :
8358 0 : list_for_each_entry(iter, &dev->adj_list.upper, list) {
8359 0 : if (!net_eq(net, dev_net(iter->dev)))
8360 : continue;
8361 0 : netdev_adjacent_sysfs_del(iter->dev, oldname,
8362 : &iter->dev->adj_list.lower);
8363 0 : netdev_adjacent_sysfs_add(iter->dev, dev,
8364 0 : &iter->dev->adj_list.lower);
8365 : }
8366 :
8367 0 : list_for_each_entry(iter, &dev->adj_list.lower, list) {
8368 0 : if (!net_eq(net, dev_net(iter->dev)))
8369 : continue;
8370 0 : netdev_adjacent_sysfs_del(iter->dev, oldname,
8371 : &iter->dev->adj_list.upper);
8372 0 : netdev_adjacent_sysfs_add(iter->dev, dev,
8373 0 : &iter->dev->adj_list.upper);
8374 : }
8375 0 : }
8376 :
8377 0 : void *netdev_lower_dev_get_private(struct net_device *dev,
8378 : struct net_device *lower_dev)
8379 : {
8380 0 : struct netdev_adjacent *lower;
8381 :
8382 0 : if (!lower_dev)
8383 : return NULL;
8384 0 : lower = __netdev_find_adj(lower_dev, &dev->adj_list.lower);
8385 0 : if (!lower)
8386 : return NULL;
8387 :
8388 0 : return lower->private;
8389 : }
8390 : EXPORT_SYMBOL(netdev_lower_dev_get_private);
8391 :
8392 :
8393 : /**
8394 : * netdev_lower_state_changed - Dispatch event about lower device state change
8395 : * @lower_dev: device
8396 : * @lower_state_info: state to dispatch
8397 : *
8398 : * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
8399 : * The caller must hold the RTNL lock.
8400 : */
8401 0 : void netdev_lower_state_changed(struct net_device *lower_dev,
8402 : void *lower_state_info)
8403 : {
8404 0 : struct netdev_notifier_changelowerstate_info changelowerstate_info = {
8405 : .info.dev = lower_dev,
8406 : };
8407 :
8408 0 : ASSERT_RTNL();
8409 0 : changelowerstate_info.lower_state_info = lower_state_info;
8410 0 : call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE,
8411 : &changelowerstate_info.info);
8412 0 : }
8413 : EXPORT_SYMBOL(netdev_lower_state_changed);
8414 :
8415 0 : static void dev_change_rx_flags(struct net_device *dev, int flags)
8416 : {
8417 0 : const struct net_device_ops *ops = dev->netdev_ops;
8418 :
8419 0 : if (ops->ndo_change_rx_flags)
8420 0 : ops->ndo_change_rx_flags(dev, flags);
8421 : }
8422 :
8423 0 : static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
8424 : {
8425 0 : unsigned int old_flags = dev->flags;
8426 0 : kuid_t uid;
8427 0 : kgid_t gid;
8428 :
8429 0 : ASSERT_RTNL();
8430 :
8431 0 : dev->flags |= IFF_PROMISC;
8432 0 : dev->promiscuity += inc;
8433 0 : if (dev->promiscuity == 0) {
8434 : /*
8435 : * Avoid overflow.
8436 : * If inc causes overflow, leave promisc untouched and return an error.
8437 : */
8438 0 : if (inc < 0)
8439 0 : dev->flags &= ~IFF_PROMISC;
8440 : else {
8441 0 : dev->promiscuity -= inc;
8442 0 : pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
8443 : dev->name);
8444 0 : return -EOVERFLOW;
8445 : }
8446 : }
8447 0 : if (dev->flags != old_flags) {
8448 0 : pr_info("device %s %s promiscuous mode\n",
8449 : dev->name,
8450 : dev->flags & IFF_PROMISC ? "entered" : "left");
8451 0 : if (audit_enabled) {
8452 : current_uid_gid(&uid, &gid);
8453 : audit_log(audit_context(), GFP_ATOMIC,
8454 : AUDIT_ANOM_PROMISCUOUS,
8455 : "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
8456 : dev->name, (dev->flags & IFF_PROMISC),
8457 : (old_flags & IFF_PROMISC),
8458 : from_kuid(&init_user_ns, audit_get_loginuid(current)),
8459 : from_kuid(&init_user_ns, uid),
8460 : from_kgid(&init_user_ns, gid),
8461 : audit_get_sessionid(current));
8462 : }
8463 :
8464 0 : dev_change_rx_flags(dev, IFF_PROMISC);
8465 : }
8466 0 : if (notify)
8467 0 : __dev_notify_flags(dev, old_flags, IFF_PROMISC);
8468 : return 0;
8469 : }
8470 :
8471 : /**
8472 : * dev_set_promiscuity - update promiscuity count on a device
8473 : * @dev: device
8474 : * @inc: modifier
8475 : *
8476 : * Add or remove promiscuity from a device. While the count in the device
8477 : * remains above zero the interface remains promiscuous. Once it hits zero
8478 : * the device reverts back to normal filtering operation. A negative inc
8479 : * value is used to drop promiscuity on the device.
8480 : * Return 0 if successful or a negative errno code on error.
8481 : */
8482 0 : int dev_set_promiscuity(struct net_device *dev, int inc)
8483 : {
8484 0 : unsigned int old_flags = dev->flags;
8485 0 : int err;
8486 :
8487 0 : err = __dev_set_promiscuity(dev, inc, true);
8488 0 : if (err < 0)
8489 : return err;
8490 0 : if (dev->flags != old_flags)
8491 0 : dev_set_rx_mode(dev);
8492 : return err;
8493 : }
8494 : EXPORT_SYMBOL(dev_set_promiscuity);
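
/* Hedged usage sketch: bump the promiscuity count while a hypothetical
 * capture feature is on and drop it again when the feature is turned off,
 * relying on the counted semantics documented above. my_capture_set is
 * not an existing kernel function.
 */
static int my_capture_set(struct net_device *dev, bool enable)
{
	int err;

	ASSERT_RTNL();
	err = dev_set_promiscuity(dev, enable ? 1 : -1);
	if (err < 0)
		netdev_warn(dev, "promiscuity update failed: %d\n", err);

	return err;
}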
8495 :
8496 0 : static int __dev_set_allmulti(struct net_device *dev, int inc, bool notify)
8497 : {
8498 0 : unsigned int old_flags = dev->flags, old_gflags = dev->gflags;
8499 :
8500 0 : ASSERT_RTNL();
8501 :
8502 0 : dev->flags |= IFF_ALLMULTI;
8503 0 : dev->allmulti += inc;
8504 0 : if (dev->allmulti == 0) {
8505 : /*
8506 : * Avoid overflow.
8507 : * If inc causes overflow, leave allmulti untouched and return an error.
8508 : */
8509 0 : if (inc < 0)
8510 0 : dev->flags &= ~IFF_ALLMULTI;
8511 : else {
8512 0 : dev->allmulti -= inc;
8513 0 : pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
8514 : dev->name);
8515 0 : return -EOVERFLOW;
8516 : }
8517 : }
8518 0 : if (dev->flags ^ old_flags) {
8519 0 : dev_change_rx_flags(dev, IFF_ALLMULTI);
8520 0 : dev_set_rx_mode(dev);
8521 0 : if (notify)
8522 0 : __dev_notify_flags(dev, old_flags,
8523 0 : dev->gflags ^ old_gflags);
8524 : }
8525 : return 0;
8526 : }
8527 :
8528 : /**
8529 : * dev_set_allmulti - update allmulti count on a device
8530 : * @dev: device
8531 : * @inc: modifier
8532 : *
8533 : * Add or remove reception of all multicast frames to a device. While the
8534 : * count in the device remains above zero the interface remains listening
8535 : * to all multicast frames. Once it hits zero the device reverts back to normal
8536 : * filtering operation. A negative @inc value is used to drop the counter
8537 : * when releasing a resource needing all multicasts.
8538 : * Return 0 if successful or a negative errno code on error.
8539 : */
8540 :
8541 0 : int dev_set_allmulti(struct net_device *dev, int inc)
8542 : {
8543 0 : return __dev_set_allmulti(dev, inc, true);
8544 : }
8545 : EXPORT_SYMBOL(dev_set_allmulti);
8546 :
8547 : /*
8548 : * Upload unicast and multicast address lists to device and
8549 : * configure RX filtering. When the device doesn't support unicast
8550 : * filtering it is put in promiscuous mode while unicast addresses
8551 : * are present.
8552 : */
8553 6 : void __dev_set_rx_mode(struct net_device *dev)
8554 : {
8555 6 : const struct net_device_ops *ops = dev->netdev_ops;
8556 :
8557 : /* dev_open will call this function so the list will stay sane. */
8558 6 : if (!(dev->flags&IFF_UP))
8559 : return;
8560 :
8561 4 : if (!netif_device_present(dev))
8562 : return;
8563 :
8564 4 : if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
8565 : /* Unicast addresses changes may only happen under the rtnl,
8566 : * therefore calling __dev_set_promiscuity here is safe.
8567 : */
8568 2 : if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
8569 0 : __dev_set_promiscuity(dev, 1, false);
8570 0 : dev->uc_promisc = true;
8571 2 : } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
8572 0 : __dev_set_promiscuity(dev, -1, false);
8573 0 : dev->uc_promisc = false;
8574 : }
8575 : }
8576 :
8577 4 : if (ops->ndo_set_rx_mode)
8578 2 : ops->ndo_set_rx_mode(dev);
8579 : }
8580 :
8581 5 : void dev_set_rx_mode(struct net_device *dev)
8582 : {
8583 5 : netif_addr_lock_bh(dev);
8584 5 : __dev_set_rx_mode(dev);
8585 5 : netif_addr_unlock_bh(dev);
8586 5 : }
8587 :
8588 : /**
8589 : * dev_get_flags - get flags reported to userspace
8590 : * @dev: device
8591 : *
8592 : * Get the combination of flag bits exported through APIs to userspace.
8593 : */
8594 18 : unsigned int dev_get_flags(const struct net_device *dev)
8595 : {
8596 18 : unsigned int flags;
8597 :
8598 18 : flags = (dev->flags & ~(IFF_PROMISC |
8599 : IFF_ALLMULTI |
8600 : IFF_RUNNING |
8601 : IFF_LOWER_UP |
8602 : IFF_DORMANT)) |
8603 18 : (dev->gflags & (IFF_PROMISC |
8604 : IFF_ALLMULTI));
8605 :
8606 18 : if (netif_running(dev)) {
8607 10 : if (netif_oper_up(dev))
8608 10 : flags |= IFF_RUNNING;
8609 10 : if (netif_carrier_ok(dev))
8610 10 : flags |= IFF_LOWER_UP;
8611 10 : if (netif_dormant(dev))
8612 0 : flags |= IFF_DORMANT;
8613 : }
8614 :
8615 18 : return flags;
8616 : }
8617 : EXPORT_SYMBOL(dev_get_flags);
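
/* Hedged usage sketch: test the userspace-visible IFF_RUNNING bit via
 * dev_get_flags(), as a monitoring helper might. my_dev_is_running is
 * hypothetical.
 */
static bool my_dev_is_running(const struct net_device *dev)
{
	return dev_get_flags(dev) & IFF_RUNNING;
}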
8618 :
8619 3 : int __dev_change_flags(struct net_device *dev, unsigned int flags,
8620 : struct netlink_ext_ack *extack)
8621 : {
8622 3 : unsigned int old_flags = dev->flags;
8623 3 : int ret;
8624 :
8625 3 : ASSERT_RTNL();
8626 :
8627 : /*
8628 : * Set the flags on our device.
8629 : */
8630 :
8631 3 : dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
8632 : IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
8633 3 : IFF_AUTOMEDIA)) |
8634 3 : (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
8635 : IFF_ALLMULTI));
8636 :
8637 : /*
8638 : * Load in the correct multicast list now the flags have changed.
8639 : */
8640 :
8641 3 : if ((old_flags ^ flags) & IFF_MULTICAST)
8642 0 : dev_change_rx_flags(dev, IFF_MULTICAST);
8643 :
8644 3 : dev_set_rx_mode(dev);
8645 :
8646 : /*
8647 : * Have we downed the interface? We handle IFF_UP ourselves
8648 : * according to user attempts to set it, rather than blindly
8649 : * setting it.
8650 : */
8651 :
8652 3 : ret = 0;
8653 3 : if ((old_flags ^ flags) & IFF_UP) {
8654 2 : if (old_flags & IFF_UP)
8655 0 : __dev_close(dev);
8656 : else
8657 2 : ret = __dev_open(dev, extack);
8658 : }
8659 :
8660 3 : if ((flags ^ dev->gflags) & IFF_PROMISC) {
8661 0 : int inc = (flags & IFF_PROMISC) ? 1 : -1;
8662 0 : unsigned int old_flags = dev->flags;
8663 :
8664 0 : dev->gflags ^= IFF_PROMISC;
8665 :
8666 0 : if (__dev_set_promiscuity(dev, inc, false) >= 0)
8667 0 : if (dev->flags != old_flags)
8668 0 : dev_set_rx_mode(dev);
8669 : }
8670 :
8671 : /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
8672 : * is important. Some (broken) drivers set IFF_PROMISC when
8673 : * IFF_ALLMULTI is requested, without asking us and without reporting it.
8674 : */
8675 3 : if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
8676 0 : int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
8677 :
8678 0 : dev->gflags ^= IFF_ALLMULTI;
8679 0 : __dev_set_allmulti(dev, inc, false);
8680 : }
8681 :
8682 3 : return ret;
8683 : }
8684 :
8685 3 : void __dev_notify_flags(struct net_device *dev, unsigned int old_flags,
8686 : unsigned int gchanges)
8687 : {
8688 3 : unsigned int changes = dev->flags ^ old_flags;
8689 :
8690 3 : if (gchanges)
8691 2 : rtmsg_ifinfo(RTM_NEWLINK, dev, gchanges, GFP_ATOMIC);
8692 :
8693 3 : if (changes & IFF_UP) {
8694 2 : if (dev->flags & IFF_UP)
8695 2 : call_netdevice_notifiers(NETDEV_UP, dev);
8696 : else
8697 0 : call_netdevice_notifiers(NETDEV_DOWN, dev);
8698 : }
8699 :
8700 3 : if (dev->flags & IFF_UP &&
8701 3 : (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE))) {
8702 0 : struct netdev_notifier_change_info change_info = {
8703 : .info = {
8704 : .dev = dev,
8705 : },
8706 : .flags_changed = changes,
8707 : };
8708 :
8709 0 : call_netdevice_notifiers_info(NETDEV_CHANGE, &change_info.info);
8710 : }
8711 3 : }
8712 :
8713 : /**
8714 : * dev_change_flags - change device settings
8715 : * @dev: device
8716 : * @flags: device state flags
8717 : * @extack: netlink extended ack
8718 : *
8719 : * Change device settings based on the provided state flags. The flags are
8720 : * in the userspace exported format.
8721 : */
8722 3 : int dev_change_flags(struct net_device *dev, unsigned int flags,
8723 : struct netlink_ext_ack *extack)
8724 : {
8725 3 : int ret;
8726 3 : unsigned int changes, old_flags = dev->flags, old_gflags = dev->gflags;
8727 :
8728 3 : ret = __dev_change_flags(dev, flags, extack);
8729 3 : if (ret < 0)
8730 : return ret;
8731 :
8732 3 : changes = (old_flags ^ dev->flags) | (old_gflags ^ dev->gflags);
8733 3 : __dev_notify_flags(dev, old_flags, changes);
8734 3 : return ret;
8735 : }
8736 : EXPORT_SYMBOL(dev_change_flags);
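
/* Hedged usage sketch: administratively bring a device up by OR-ing
 * IFF_UP into its userspace-visible flags and applying the result with
 * dev_change_flags() under RTNL. my_dev_bring_up is hypothetical; a real
 * caller might equally use dev_open() directly.
 */
static int my_dev_bring_up(struct net_device *dev,
			   struct netlink_ext_ack *extack)
{
	ASSERT_RTNL();
	return dev_change_flags(dev, dev_get_flags(dev) | IFF_UP, extack);
}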
8737 :
8738 0 : int __dev_set_mtu(struct net_device *dev, int new_mtu)
8739 : {
8740 0 : const struct net_device_ops *ops = dev->netdev_ops;
8741 :
8742 0 : if (ops->ndo_change_mtu)
8743 0 : return ops->ndo_change_mtu(dev, new_mtu);
8744 :
8745 : /* Pairs with all the lockless reads of dev->mtu in the stack */
8746 0 : WRITE_ONCE(dev->mtu, new_mtu);
8747 0 : return 0;
8748 : }
8749 : EXPORT_SYMBOL(__dev_set_mtu);
8750 :
8751 0 : int dev_validate_mtu(struct net_device *dev, int new_mtu,
8752 : struct netlink_ext_ack *extack)
8753 : {
8754 : /* MTU must be positive, and in range */
8755 0 : if (new_mtu < 0 || new_mtu < dev->min_mtu) {
8756 0 : NL_SET_ERR_MSG(extack, "mtu less than device minimum");
8757 0 : return -EINVAL;
8758 : }
8759 :
8760 0 : if (dev->max_mtu > 0 && new_mtu > dev->max_mtu) {
8761 0 : NL_SET_ERR_MSG(extack, "mtu greater than device maximum");
8762 0 : return -EINVAL;
8763 : }
8764 : return 0;
8765 : }
8766 :
8767 : /**
8768 : * dev_set_mtu_ext - Change maximum transfer unit
8769 : * @dev: device
8770 : * @new_mtu: new transfer unit
8771 : * @extack: netlink extended ack
8772 : *
8773 : * Change the maximum transfer size of the network device.
8774 : */
8775 0 : int dev_set_mtu_ext(struct net_device *dev, int new_mtu,
8776 : struct netlink_ext_ack *extack)
8777 : {
8778 0 : int err, orig_mtu;
8779 :
8780 0 : if (new_mtu == dev->mtu)
8781 : return 0;
8782 :
8783 0 : err = dev_validate_mtu(dev, new_mtu, extack);
8784 0 : if (err)
8785 : return err;
8786 :
8787 0 : if (!netif_device_present(dev))
8788 : return -ENODEV;
8789 :
8790 0 : err = call_netdevice_notifiers(NETDEV_PRECHANGEMTU, dev);
8791 0 : err = notifier_to_errno(err);
8792 0 : if (err)
8793 0 : return err;
8794 :
8795 0 : orig_mtu = dev->mtu;
8796 0 : err = __dev_set_mtu(dev, new_mtu);
8797 :
8798 0 : if (!err) {
8799 0 : err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8800 : orig_mtu);
8801 0 : err = notifier_to_errno(err);
8802 0 : if (err) {
8803 : /* setting mtu back and notifying everyone again,
8804 : * so that they have a chance to revert changes.
8805 : */
8806 0 : __dev_set_mtu(dev, orig_mtu);
8807 0 : call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev,
8808 : new_mtu);
8809 : }
8810 : }
8811 : return err;
8812 : }
8813 :
8814 0 : int dev_set_mtu(struct net_device *dev, int new_mtu)
8815 : {
8816 0 : struct netlink_ext_ack extack;
8817 0 : int err;
8818 :
8819 0 : memset(&extack, 0, sizeof(extack));
8820 0 : err = dev_set_mtu_ext(dev, new_mtu, &extack);
8821 0 : if (err && extack._msg)
8822 0 : net_err_ratelimited("%s: %s\n", dev->name, extack._msg);
8823 0 : return err;
8824 : }
8825 : EXPORT_SYMBOL(dev_set_mtu);
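
/* Hedged usage sketch: clamp a requested MTU to the device's advertised
 * min_mtu/max_mtu (max_mtu == 0 meaning "no upper limit", as in
 * dev_validate_mtu() above) before applying it with dev_set_mtu().
 * my_apply_mtu is a hypothetical helper.
 */
static int my_apply_mtu(struct net_device *dev, int requested)
{
	int mtu = max_t(int, requested, dev->min_mtu);

	if (dev->max_mtu > 0)
		mtu = min_t(int, mtu, dev->max_mtu);

	ASSERT_RTNL();
	return dev_set_mtu(dev, mtu);
}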
8826 :
8827 : /**
8828 : * dev_change_tx_queue_len - Change TX queue length of a netdevice
8829 : * @dev: device
8830 : * @new_len: new tx queue length
8831 : */
8832 0 : int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len)
8833 : {
8834 0 : unsigned int orig_len = dev->tx_queue_len;
8835 0 : int res;
8836 :
8837 0 : if (new_len != (unsigned int)new_len)
8838 : return -ERANGE;
8839 :
8840 0 : if (new_len != orig_len) {
8841 0 : dev->tx_queue_len = new_len;
8842 0 : res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev);
8843 0 : res = notifier_to_errno(res);
8844 0 : if (res)
8845 0 : goto err_rollback;
8846 0 : res = dev_qdisc_change_tx_queue_len(dev);
8847 0 : if (res)
8848 0 : goto err_rollback;
8849 : }
8850 :
8851 : return 0;
8852 :
8853 0 : err_rollback:
8854 0 : netdev_err(dev, "refused to change device tx_queue_len\n");
8855 0 : dev->tx_queue_len = orig_len;
8856 0 : return res;
8857 : }
8858 :
8859 : /**
8860 : * dev_set_group - Change group this device belongs to
8861 : * @dev: device
8862 : * @new_group: group this device should belong to
8863 : */
8864 0 : void dev_set_group(struct net_device *dev, int new_group)
8865 : {
8866 0 : dev->group = new_group;
8867 0 : }
8868 : EXPORT_SYMBOL(dev_set_group);
8869 :
8870 : /**
8871 : * dev_pre_changeaddr_notify - Call NETDEV_PRE_CHANGEADDR.
8872 : * @dev: device
8873 : * @addr: new address
8874 : * @extack: netlink extended ack
8875 : */
8876 0 : int dev_pre_changeaddr_notify(struct net_device *dev, const char *addr,
8877 : struct netlink_ext_ack *extack)
8878 : {
8879 0 : struct netdev_notifier_pre_changeaddr_info info = {
8880 : .info.dev = dev,
8881 : .info.extack = extack,
8882 : .dev_addr = addr,
8883 : };
8884 0 : int rc;
8885 :
8886 0 : rc = call_netdevice_notifiers_info(NETDEV_PRE_CHANGEADDR, &info.info);
8887 0 : return notifier_to_errno(rc);
8888 : }
8889 : EXPORT_SYMBOL(dev_pre_changeaddr_notify);
8890 :
8891 : /**
8892 : * dev_set_mac_address - Change Media Access Control Address
8893 : * @dev: device
8894 : * @sa: new address
8895 : * @extack: netlink extended ack
8896 : *
8897 : * Change the hardware (MAC) address of the device
8898 : */
8899 0 : int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa,
8900 : struct netlink_ext_ack *extack)
8901 : {
8902 0 : const struct net_device_ops *ops = dev->netdev_ops;
8903 0 : int err;
8904 :
8905 0 : if (!ops->ndo_set_mac_address)
8906 : return -EOPNOTSUPP;
8907 0 : if (sa->sa_family != dev->type)
8908 : return -EINVAL;
8909 0 : if (!netif_device_present(dev))
8910 : return -ENODEV;
8911 0 : err = dev_pre_changeaddr_notify(dev, sa->sa_data, extack);
8912 0 : if (err)
8913 : return err;
8914 0 : err = ops->ndo_set_mac_address(dev, sa);
8915 0 : if (err)
8916 : return err;
8917 0 : dev->addr_assign_type = NET_ADDR_SET;
8918 0 : call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
8919 0 : add_device_randomness(dev->dev_addr, dev->addr_len);
8920 0 : return 0;
8921 : }
8922 : EXPORT_SYMBOL(dev_set_mac_address);
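
/* Hedged usage sketch: package a raw hardware address into the
 * struct sockaddr that dev_set_mac_address() expects. my_set_hw_addr is
 * a hypothetical helper; callers hold RTNL just like the .ndo path.
 */
static int my_set_hw_addr(struct net_device *dev, const u8 *addr,
			  struct netlink_ext_ack *extack)
{
	struct sockaddr sa;

	if (dev->addr_len > sizeof(sa.sa_data))
		return -EINVAL;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	ASSERT_RTNL();
	return dev_set_mac_address(dev, &sa, extack);
}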
8923 :
8924 : static DECLARE_RWSEM(dev_addr_sem);
8925 :
8926 0 : int dev_set_mac_address_user(struct net_device *dev, struct sockaddr *sa,
8927 : struct netlink_ext_ack *extack)
8928 : {
8929 0 : int ret;
8930 :
8931 0 : down_write(&dev_addr_sem);
8932 0 : ret = dev_set_mac_address(dev, sa, extack);
8933 0 : up_write(&dev_addr_sem);
8934 0 : return ret;
8935 : }
8936 : EXPORT_SYMBOL(dev_set_mac_address_user);
8937 :
8938 1 : int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name)
8939 : {
8940 1 : size_t size = sizeof(sa->sa_data);
8941 1 : struct net_device *dev;
8942 1 : int ret = 0;
8943 :
8944 1 : down_read(&dev_addr_sem);
8945 1 : rcu_read_lock();
8946 :
8947 1 : dev = dev_get_by_name_rcu(net, dev_name);
8948 1 : if (!dev) {
8949 0 : ret = -ENODEV;
8950 0 : goto unlock;
8951 : }
8952 1 : if (!dev->addr_len)
8953 0 : memset(sa->sa_data, 0, size);
8954 : else
8955 1 : memcpy(sa->sa_data, dev->dev_addr,
8956 1 : min_t(size_t, size, dev->addr_len));
8957 1 : sa->sa_family = dev->type;
8958 :
8959 1 : unlock:
8960 1 : rcu_read_unlock();
8961 1 : up_read(&dev_addr_sem);
8962 1 : return ret;
8963 : }
8964 : EXPORT_SYMBOL(dev_get_mac_address);
8965 :
8966 : /**
8967 : * dev_change_carrier - Change device carrier
8968 : * @dev: device
8969 : * @new_carrier: new value
8970 : *
8971 : * Change device carrier
8972 : */
8973 0 : int dev_change_carrier(struct net_device *dev, bool new_carrier)
8974 : {
8975 0 : const struct net_device_ops *ops = dev->netdev_ops;
8976 :
8977 0 : if (!ops->ndo_change_carrier)
8978 : return -EOPNOTSUPP;
8979 0 : if (!netif_device_present(dev))
8980 : return -ENODEV;
8981 0 : return ops->ndo_change_carrier(dev, new_carrier);
8982 : }
8983 : EXPORT_SYMBOL(dev_change_carrier);
8984 :
8985 : /**
8986 : * dev_get_phys_port_id - Get device physical port ID
8987 : * @dev: device
8988 : * @ppid: port ID
8989 : *
8990 : * Get device physical port ID
8991 : */
8992 16 : int dev_get_phys_port_id(struct net_device *dev,
8993 : struct netdev_phys_item_id *ppid)
8994 : {
8995 16 : const struct net_device_ops *ops = dev->netdev_ops;
8996 :
8997 16 : if (!ops->ndo_get_phys_port_id)
8998 : return -EOPNOTSUPP;
8999 0 : return ops->ndo_get_phys_port_id(dev, ppid);
9000 : }
9001 : EXPORT_SYMBOL(dev_get_phys_port_id);
9002 :
9003 : /**
9004 : * dev_get_phys_port_name - Get device physical port name
9005 : * @dev: device
9006 : * @name: port name
9007 : * @len: limit of bytes to copy to name
9008 : *
9009 : * Get device physical port name
9010 : */
9011 16 : int dev_get_phys_port_name(struct net_device *dev,
9012 : char *name, size_t len)
9013 : {
9014 16 : const struct net_device_ops *ops = dev->netdev_ops;
9015 16 : int err;
9016 :
9017 16 : if (ops->ndo_get_phys_port_name) {
9018 8 : err = ops->ndo_get_phys_port_name(dev, name, len);
9019 8 : if (err != -EOPNOTSUPP)
9020 0 : return err;
9021 : }
9022 16 : return devlink_compat_phys_port_name_get(dev, name, len);
9023 : }
9024 : EXPORT_SYMBOL(dev_get_phys_port_name);
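/*
 * Hedged sketch (illustration only): querying a port's physical identity the
 * way the rtnetlink fill code does. dev_get_phys_port_name() additionally
 * falls back to the devlink compat layer; both helpers return -EOPNOTSUPP
 * when no information is available.
 */
static void example_log_phys_port(struct net_device *dev)
{
	struct netdev_phys_item_id ppid;
	char name[IFNAMSIZ];

	if (!dev_get_phys_port_id(dev, &ppid))
		netdev_info(dev, "phys port id length %u\n", ppid.id_len);

	if (!dev_get_phys_port_name(dev, name, sizeof(name)))
		netdev_info(dev, "phys port name %s\n", name);
}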
9025 :
9026 : /**
9027 : * dev_get_port_parent_id - Get the device's port parent identifier
9028 : * @dev: network device
9029 : * @ppid: pointer to storage for the port's parent identifier
9030 : * @recurse: allow/disallow recursion to lower devices
9031 : *
9032 : * Get the device's port parent identifier
9033 : */
9034 16 : int dev_get_port_parent_id(struct net_device *dev,
9035 : struct netdev_phys_item_id *ppid,
9036 : bool recurse)
9037 : {
9038 16 : const struct net_device_ops *ops = dev->netdev_ops;
9039 16 : struct netdev_phys_item_id first = { };
9040 16 : struct net_device *lower_dev;
9041 16 : struct list_head *iter;
9042 16 : int err;
9043 :
9044 16 : if (ops->ndo_get_port_parent_id) {
9045 0 : err = ops->ndo_get_port_parent_id(dev, ppid);
9046 0 : if (err != -EOPNOTSUPP)
9047 : return err;
9048 : }
9049 :
9050 16 : err = devlink_compat_switch_id_get(dev, ppid);
9051 16 : if (!err || err != -EOPNOTSUPP)
9052 : return err;
9053 :
9054 16 : if (!recurse)
9055 : return -EOPNOTSUPP;
9056 :
9057 0 : netdev_for_each_lower_dev(dev, lower_dev, iter) {
9058 0 : err = dev_get_port_parent_id(lower_dev, ppid, recurse);
9059 0 : if (err)
9060 : break;
9061 0 : if (!first.id_len)
9062 0 : first = *ppid;
9063 0 : else if (memcmp(&first, ppid, sizeof(*ppid)))
9064 : return -EOPNOTSUPP;
9065 : }
9066 :
9067 : return err;
9068 : }
9069 : EXPORT_SYMBOL(dev_get_port_parent_id);
9070 :
9071 : /**
9072 : * netdev_port_same_parent_id - Indicate if two network devices have
9073 : * the same port parent identifier
9074 : * @a: first network device
9075 : * @b: second network device
9076 : */
9077 0 : bool netdev_port_same_parent_id(struct net_device *a, struct net_device *b)
9078 : {
9079 0 : struct netdev_phys_item_id a_id = { };
9080 0 : struct netdev_phys_item_id b_id = { };
9081 :
9082 0 : if (dev_get_port_parent_id(a, &a_id, true) ||
9083 0 : dev_get_port_parent_id(b, &b_id, true))
9084 0 : return false;
9085 :
9086 0 : return netdev_phys_item_id_same(&a_id, &b_id);
9087 : }
9088 : EXPORT_SYMBOL(netdev_port_same_parent_id);
9089 :
9090 : /**
9091 : * dev_change_proto_down - update protocol port state information
9092 : * @dev: device
9093 : * @proto_down: new value
9094 : *
9095 : * This info can be used by switch drivers to set the phys state of the
9096 : * port.
9097 : */
9098 0 : int dev_change_proto_down(struct net_device *dev, bool proto_down)
9099 : {
9100 0 : const struct net_device_ops *ops = dev->netdev_ops;
9101 :
9102 0 : if (!ops->ndo_change_proto_down)
9103 : return -EOPNOTSUPP;
9104 0 : if (!netif_device_present(dev))
9105 : return -ENODEV;
9106 0 : return ops->ndo_change_proto_down(dev, proto_down);
9107 : }
9108 : EXPORT_SYMBOL(dev_change_proto_down);
9109 :
9110 : /**
9111 : * dev_change_proto_down_generic - generic implementation for
9112 : * ndo_change_proto_down that sets carrier according to
9113 : * proto_down.
9114 : *
9115 : * @dev: device
9116 : * @proto_down: new value
9117 : */
9118 0 : int dev_change_proto_down_generic(struct net_device *dev, bool proto_down)
9119 : {
9120 0 : if (proto_down)
9121 0 : netif_carrier_off(dev);
9122 : else
9123 0 : netif_carrier_on(dev);
9124 0 : dev->proto_down = proto_down;
9125 0 : return 0;
9126 : }
9127 : EXPORT_SYMBOL(dev_change_proto_down_generic);
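/*
 * Hedged sketch (illustration only): a driver with no special proto_down
 * handling can plug the generic helper straight into its netdev_ops, so
 * "ip link set dev ... protodown on" simply drops the carrier.
 * example_proto_down_ops is a made-up name.
 */
static const struct net_device_ops example_proto_down_ops = {
	.ndo_change_proto_down	= dev_change_proto_down_generic,
};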
9128 :
9129 : /**
9130 : * dev_change_proto_down_reason - update proto_down reason bits
9131 : *
9132 : * @dev: device
9133 : * @mask: proto down mask
9134 : * @value: proto down value
9135 : */
9136 0 : void dev_change_proto_down_reason(struct net_device *dev, unsigned long mask,
9137 : u32 value)
9138 : {
9139 0 : int b;
9140 :
9141 0 : if (!mask) {
9142 0 : dev->proto_down_reason = value;
9143 : } else {
9144 0 : for_each_set_bit(b, &mask, 32) {
9145 0 : if (value & (1 << b))
9146 0 : dev->proto_down_reason |= BIT(b);
9147 : else
9148 0 : dev->proto_down_reason &= ~BIT(b);
9149 : }
9150 : }
9151 0 : }
9152 : EXPORT_SYMBOL(dev_change_proto_down_reason);
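/*
 * Hedged example of the mask/value semantics above: only bits present in
 * @mask are rewritten, so one subsystem can toggle its own reason bit
 * without clobbering reasons owned by others. EXAMPLE_REASON_BIT is a
 * made-up bit number for illustration.
 */
#define EXAMPLE_REASON_BIT	3

static void example_mark_reason(struct net_device *dev, bool set)
{
	dev_change_proto_down_reason(dev, BIT(EXAMPLE_REASON_BIT),
				     set ? BIT(EXAMPLE_REASON_BIT) : 0);
}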
9153 :
9154 : struct bpf_xdp_link {
9155 : struct bpf_link link;
9156 : struct net_device *dev; /* protected by rtnl_lock, no refcnt held */
9157 : int flags;
9158 : };
9159 :
9160 0 : static enum bpf_xdp_mode dev_xdp_mode(struct net_device *dev, u32 flags)
9161 : {
9162 0 : if (flags & XDP_FLAGS_HW_MODE)
9163 : return XDP_MODE_HW;
9164 0 : if (flags & XDP_FLAGS_DRV_MODE)
9165 : return XDP_MODE_DRV;
9166 0 : if (flags & XDP_FLAGS_SKB_MODE)
9167 : return XDP_MODE_SKB;
9168 0 : return dev->netdev_ops->ndo_bpf ? XDP_MODE_DRV : XDP_MODE_SKB;
9169 : }
9170 :
9171 0 : static bpf_op_t dev_xdp_bpf_op(struct net_device *dev, enum bpf_xdp_mode mode)
9172 : {
9173 0 : switch (mode) {
9174 : case XDP_MODE_SKB:
9175 : return generic_xdp_install;
9176 0 : case XDP_MODE_DRV:
9177 : case XDP_MODE_HW:
9178 0 : return dev->netdev_ops->ndo_bpf;
9179 : default:
9180 : return NULL;
9181 : }
9182 : }
9183 :
9184 32 : static struct bpf_xdp_link *dev_xdp_link(struct net_device *dev,
9185 : enum bpf_xdp_mode mode)
9186 : {
9187 32 : return dev->xdp_state[mode].link;
9188 : }
9189 :
9190 32 : static struct bpf_prog *dev_xdp_prog(struct net_device *dev,
9191 : enum bpf_xdp_mode mode)
9192 : {
9193 32 : struct bpf_xdp_link *link = dev_xdp_link(dev, mode);
9194 :
9195 0 : if (link)
9196 0 : return link->link.prog;
9197 32 : return dev->xdp_state[mode].prog;
9198 : }
9199 :
9200 0 : static u8 dev_xdp_prog_count(struct net_device *dev)
9201 : {
9202 0 : u8 count = 0;
9203 0 : int i;
9204 :
9205 0 : for (i = 0; i < __MAX_XDP_MODE; i++)
9206 0 : if (dev->xdp_state[i].prog || dev->xdp_state[i].link)
9207 0 : count++;
9208 0 : return count;
9209 : }
9210 :
9211 32 : u32 dev_xdp_prog_id(struct net_device *dev, enum bpf_xdp_mode mode)
9212 : {
9213 32 : struct bpf_prog *prog = dev_xdp_prog(dev, mode);
9214 :
9215 32 : return prog ? prog->aux->id : 0;
9216 : }
9217 :
9218 0 : static void dev_xdp_set_link(struct net_device *dev, enum bpf_xdp_mode mode,
9219 : struct bpf_xdp_link *link)
9220 : {
9221 0 : dev->xdp_state[mode].link = link;
9222 0 : dev->xdp_state[mode].prog = NULL;
9223 0 : }
9224 :
9225 0 : static void dev_xdp_set_prog(struct net_device *dev, enum bpf_xdp_mode mode,
9226 : struct bpf_prog *prog)
9227 : {
9228 0 : dev->xdp_state[mode].link = NULL;
9229 0 : dev->xdp_state[mode].prog = prog;
9230 0 : }
9231 :
9232 0 : static int dev_xdp_install(struct net_device *dev, enum bpf_xdp_mode mode,
9233 : bpf_op_t bpf_op, struct netlink_ext_ack *extack,
9234 : u32 flags, struct bpf_prog *prog)
9235 : {
9236 0 : struct netdev_bpf xdp;
9237 0 : int err;
9238 :
9239 0 : memset(&xdp, 0, sizeof(xdp));
9240 0 : xdp.command = mode == XDP_MODE_HW ? XDP_SETUP_PROG_HW : XDP_SETUP_PROG;
9241 0 : xdp.extack = extack;
9242 0 : xdp.flags = flags;
9243 0 : xdp.prog = prog;
9244 :
9245 : /* Drivers assume refcnt is already incremented (i.e., prog pointer is
9246 : * "moved" into driver), so they don't increment it on their own, but
9247 : * they do decrement refcnt when program is detached or replaced.
9248 : * Given net_device also owns link/prog, we need to bump refcnt here
9249 : * to prevent drivers from underflowing it.
9250 : */
9251 0 : if (prog)
9252 0 : bpf_prog_inc(prog);
9253 0 : err = bpf_op(dev, &xdp);
9254 0 : if (err) {
9255 : if (prog)
9256 0 : bpf_prog_put(prog);
9257 : return err;
9258 : }
9259 :
9260 0 : if (mode != XDP_MODE_HW)
9261 0 : bpf_prog_change_xdp(dev_xdp_prog(dev, mode), prog);
9262 :
9263 : return 0;
9264 : }
9265 :
9266 0 : static void dev_xdp_uninstall(struct net_device *dev)
9267 : {
9268 0 : struct bpf_xdp_link *link;
9269 0 : struct bpf_prog *prog;
9270 0 : enum bpf_xdp_mode mode;
9271 0 : bpf_op_t bpf_op;
9272 :
9273 0 : ASSERT_RTNL();
9274 :
9275 0 : for (mode = XDP_MODE_SKB; mode < __MAX_XDP_MODE; mode++) {
9276 0 : prog = dev_xdp_prog(dev, mode);
9277 0 : if (!prog)
9278 0 : continue;
9279 :
9280 0 : bpf_op = dev_xdp_bpf_op(dev, mode);
9281 0 : if (!bpf_op)
9282 0 : continue;
9283 :
9284 0 : WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9285 :
9286 : /* auto-detach link from net device */
9287 0 : link = dev_xdp_link(dev, mode);
9288 0 : if (link)
9289 0 : link->dev = NULL;
9290 : else
9291 0 : bpf_prog_put(prog);
9292 :
9293 0 : dev_xdp_set_link(dev, mode, NULL);
9294 : }
9295 0 : }
9296 :
9297 0 : static int dev_xdp_attach(struct net_device *dev, struct netlink_ext_ack *extack,
9298 : struct bpf_xdp_link *link, struct bpf_prog *new_prog,
9299 : struct bpf_prog *old_prog, u32 flags)
9300 : {
9301 0 : unsigned int num_modes = hweight32(flags & XDP_FLAGS_MODES);
9302 0 : struct bpf_prog *cur_prog;
9303 0 : enum bpf_xdp_mode mode;
9304 0 : bpf_op_t bpf_op;
9305 0 : int err;
9306 :
9307 0 : ASSERT_RTNL();
9308 :
9309 : /* either link or prog attachment, never both */
9310 0 : if (link && (new_prog || old_prog))
9311 : return -EINVAL;
9312 : /* link supports only XDP mode flags */
9313 0 : if (link && (flags & ~XDP_FLAGS_MODES)) {
9314 0 : NL_SET_ERR_MSG(extack, "Invalid XDP flags for BPF link attachment");
9315 0 : return -EINVAL;
9316 : }
9317 : /* just one XDP mode bit should be set, zero defaults to drv/skb mode */
9318 0 : if (num_modes > 1) {
9319 0 : NL_SET_ERR_MSG(extack, "Only one XDP mode flag can be set");
9320 0 : return -EINVAL;
9321 : }
9322 : /* avoid ambiguity if offload + drv/skb mode progs are both loaded */
9323 0 : if (!num_modes && dev_xdp_prog_count(dev) > 1) {
9324 0 : NL_SET_ERR_MSG(extack,
9325 : "More than one program loaded, unset mode is ambiguous");
9326 0 : return -EINVAL;
9327 : }
9328 : /* old_prog != NULL implies XDP_FLAGS_REPLACE is set */
9329 0 : if (old_prog && !(flags & XDP_FLAGS_REPLACE)) {
9330 0 : NL_SET_ERR_MSG(extack, "XDP_FLAGS_REPLACE is not specified");
9331 0 : return -EINVAL;
9332 : }
9333 :
9334 0 : mode = dev_xdp_mode(dev, flags);
9335 : /* can't replace attached link */
9336 0 : if (dev_xdp_link(dev, mode)) {
9337 0 : NL_SET_ERR_MSG(extack, "Can't replace active BPF XDP link");
9338 0 : return -EBUSY;
9339 : }
9340 :
9341 0 : cur_prog = dev_xdp_prog(dev, mode);
9342 : /* can't replace attached prog with link */
9343 0 : if (link && cur_prog) {
9344 0 : NL_SET_ERR_MSG(extack, "Can't replace active XDP program with BPF link");
9345 0 : return -EBUSY;
9346 : }
9347 0 : if ((flags & XDP_FLAGS_REPLACE) && cur_prog != old_prog) {
9348 0 : NL_SET_ERR_MSG(extack, "Active program does not match expected");
9349 0 : return -EEXIST;
9350 : }
9351 :
9352 : /* put effective new program into new_prog */
9353 0 : if (link)
9354 0 : new_prog = link->link.prog;
9355 :
9356 0 : if (new_prog) {
9357 0 : bool offload = mode == XDP_MODE_HW;
9358 0 : enum bpf_xdp_mode other_mode = mode == XDP_MODE_SKB
9359 0 : ? XDP_MODE_DRV : XDP_MODE_SKB;
9360 :
9361 0 : if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && cur_prog) {
9362 0 : NL_SET_ERR_MSG(extack, "XDP program already attached");
9363 0 : return -EBUSY;
9364 : }
9365 0 : if (!offload && dev_xdp_prog(dev, other_mode)) {
9366 0 : NL_SET_ERR_MSG(extack, "Native and generic XDP can't be active at the same time");
9367 0 : return -EEXIST;
9368 : }
9369 0 : if (!offload && bpf_prog_is_dev_bound(new_prog->aux)) {
9370 : NL_SET_ERR_MSG(extack, "Using device-bound program without HW_MODE flag is not supported");
9371 : return -EINVAL;
9372 : }
9373 0 : if (new_prog->expected_attach_type == BPF_XDP_DEVMAP) {
9374 0 : NL_SET_ERR_MSG(extack, "BPF_XDP_DEVMAP programs can not be attached to a device");
9375 0 : return -EINVAL;
9376 : }
9377 0 : if (new_prog->expected_attach_type == BPF_XDP_CPUMAP) {
9378 0 : NL_SET_ERR_MSG(extack, "BPF_XDP_CPUMAP programs can not be attached to a device");
9379 0 : return -EINVAL;
9380 : }
9381 : }
9382 :
9383 : /* don't call drivers if the effective program didn't change */
9384 0 : if (new_prog != cur_prog) {
9385 0 : bpf_op = dev_xdp_bpf_op(dev, mode);
9386 0 : if (!bpf_op) {
9387 0 : NL_SET_ERR_MSG(extack, "Underlying driver does not support XDP in native mode");
9388 0 : return -EOPNOTSUPP;
9389 : }
9390 :
9391 0 : err = dev_xdp_install(dev, mode, bpf_op, extack, flags, new_prog);
9392 0 : if (err)
9393 : return err;
9394 : }
9395 :
9396 0 : if (link)
9397 0 : dev_xdp_set_link(dev, mode, link);
9398 : else
9399 0 : dev_xdp_set_prog(dev, mode, new_prog);
9400 : if (cur_prog)
9401 0 : bpf_prog_put(cur_prog);
9402 :
9403 : return 0;
9404 : }
9405 :
9406 : static int dev_xdp_attach_link(struct net_device *dev,
9407 : struct netlink_ext_ack *extack,
9408 : struct bpf_xdp_link *link)
9409 : {
9410 : return dev_xdp_attach(dev, extack, link, NULL, NULL, link->flags);
9411 : }
9412 :
9413 : static int dev_xdp_detach_link(struct net_device *dev,
9414 : struct netlink_ext_ack *extack,
9415 : struct bpf_xdp_link *link)
9416 : {
9417 : enum bpf_xdp_mode mode;
9418 : bpf_op_t bpf_op;
9419 :
9420 : ASSERT_RTNL();
9421 :
9422 : mode = dev_xdp_mode(dev, link->flags);
9423 : if (dev_xdp_link(dev, mode) != link)
9424 : return -EINVAL;
9425 :
9426 : bpf_op = dev_xdp_bpf_op(dev, mode);
9427 : WARN_ON(dev_xdp_install(dev, mode, bpf_op, NULL, 0, NULL));
9428 : dev_xdp_set_link(dev, mode, NULL);
9429 : return 0;
9430 : }
9431 :
9432 : static void bpf_xdp_link_release(struct bpf_link *link)
9433 : {
9434 : struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9435 :
9436 : rtnl_lock();
9437 :
9438 : /* if racing with net_device's tear down, xdp_link->dev might be
9439 : * already NULL, in which case link was already auto-detached
9440 : */
9441 : if (xdp_link->dev) {
9442 : WARN_ON(dev_xdp_detach_link(xdp_link->dev, NULL, xdp_link));
9443 : xdp_link->dev = NULL;
9444 : }
9445 :
9446 : rtnl_unlock();
9447 : }
9448 :
9449 : static int bpf_xdp_link_detach(struct bpf_link *link)
9450 : {
9451 : bpf_xdp_link_release(link);
9452 : return 0;
9453 : }
9454 :
9455 : static void bpf_xdp_link_dealloc(struct bpf_link *link)
9456 : {
9457 : struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9458 :
9459 : kfree(xdp_link);
9460 : }
9461 :
9462 : static void bpf_xdp_link_show_fdinfo(const struct bpf_link *link,
9463 : struct seq_file *seq)
9464 : {
9465 : struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9466 : u32 ifindex = 0;
9467 :
9468 : rtnl_lock();
9469 : if (xdp_link->dev)
9470 : ifindex = xdp_link->dev->ifindex;
9471 : rtnl_unlock();
9472 :
9473 : seq_printf(seq, "ifindex:\t%u\n", ifindex);
9474 : }
9475 :
9476 : static int bpf_xdp_link_fill_link_info(const struct bpf_link *link,
9477 : struct bpf_link_info *info)
9478 : {
9479 : struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9480 : u32 ifindex = 0;
9481 :
9482 : rtnl_lock();
9483 : if (xdp_link->dev)
9484 : ifindex = xdp_link->dev->ifindex;
9485 : rtnl_unlock();
9486 :
9487 : info->xdp.ifindex = ifindex;
9488 : return 0;
9489 : }
9490 :
9491 : static int bpf_xdp_link_update(struct bpf_link *link, struct bpf_prog *new_prog,
9492 : struct bpf_prog *old_prog)
9493 : {
9494 : struct bpf_xdp_link *xdp_link = container_of(link, struct bpf_xdp_link, link);
9495 : enum bpf_xdp_mode mode;
9496 : bpf_op_t bpf_op;
9497 : int err = 0;
9498 :
9499 : rtnl_lock();
9500 :
9501 : /* link might have been auto-released already, so fail */
9502 : if (!xdp_link->dev) {
9503 : err = -ENOLINK;
9504 : goto out_unlock;
9505 : }
9506 :
9507 : if (old_prog && link->prog != old_prog) {
9508 : err = -EPERM;
9509 : goto out_unlock;
9510 : }
9511 : old_prog = link->prog;
9512 : if (old_prog == new_prog) {
9513 : /* no-op, don't disturb drivers */
9514 : bpf_prog_put(new_prog);
9515 : goto out_unlock;
9516 : }
9517 :
9518 : mode = dev_xdp_mode(xdp_link->dev, xdp_link->flags);
9519 : bpf_op = dev_xdp_bpf_op(xdp_link->dev, mode);
9520 : err = dev_xdp_install(xdp_link->dev, mode, bpf_op, NULL,
9521 : xdp_link->flags, new_prog);
9522 : if (err)
9523 : goto out_unlock;
9524 :
9525 : old_prog = xchg(&link->prog, new_prog);
9526 : bpf_prog_put(old_prog);
9527 :
9528 : out_unlock:
9529 : rtnl_unlock();
9530 : return err;
9531 : }
9532 :
9533 : static const struct bpf_link_ops bpf_xdp_link_lops = {
9534 : .release = bpf_xdp_link_release,
9535 : .dealloc = bpf_xdp_link_dealloc,
9536 : .detach = bpf_xdp_link_detach,
9537 : .show_fdinfo = bpf_xdp_link_show_fdinfo,
9538 : .fill_link_info = bpf_xdp_link_fill_link_info,
9539 : .update_prog = bpf_xdp_link_update,
9540 : };
9541 :
9542 0 : int bpf_xdp_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
9543 : {
9544 0 : struct net *net = current->nsproxy->net_ns;
9545 0 : struct bpf_link_primer link_primer;
9546 0 : struct bpf_xdp_link *link;
9547 0 : struct net_device *dev;
9548 0 : int err, fd;
9549 :
9550 0 : dev = dev_get_by_index(net, attr->link_create.target_ifindex);
9551 0 : if (!dev)
9552 : return -EINVAL;
9553 :
9554 0 : link = kzalloc(sizeof(*link), GFP_USER);
9555 0 : if (!link) {
9556 0 : err = -ENOMEM;
9557 0 : goto out_put_dev;
9558 : }
9559 :
9560 0 : bpf_link_init(&link->link, BPF_LINK_TYPE_XDP, &bpf_xdp_link_lops, prog);
9561 0 : link->dev = dev;
9562 0 : link->flags = attr->link_create.flags;
9563 :
9564 0 : err = bpf_link_prime(&link->link, &link_primer);
9565 0 : if (err) {
9566 0 : kfree(link);
9567 0 : goto out_put_dev;
9568 : }
9569 :
9570 : rtnl_lock();
9571 : err = dev_xdp_attach_link(dev, NULL, link);
9572 : rtnl_unlock();
9573 :
9574 : if (err) {
9575 : bpf_link_cleanup(&link_primer);
9576 : goto out_put_dev;
9577 : }
9578 :
9579 : fd = bpf_link_settle(&link_primer);
9580 : /* link itself doesn't hold dev's refcnt to not complicate shutdown */
9581 : dev_put(dev);
9582 : return fd;
9583 :
9584 0 : out_put_dev:
9585 0 : dev_put(dev);
9586 0 : return err;
9587 : }
9588 :
9589 : /**
9590 : * dev_change_xdp_fd - set or clear a bpf program for a device rx path
9591 : * @dev: device
9592 : * @extack: netlink extended ack
9593 : * @fd: new program fd or negative value to clear
9594 : * @expected_fd: old program fd that userspace expects to replace or clear
9595 : * @flags: xdp-related flags
9596 : *
9597 : * Set or clear a bpf program for a device
9598 : */
9599 0 : int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack,
9600 : int fd, int expected_fd, u32 flags)
9601 : {
9602 0 : enum bpf_xdp_mode mode = dev_xdp_mode(dev, flags);
9603 0 : struct bpf_prog *new_prog = NULL, *old_prog = NULL;
9604 0 : int err;
9605 :
9606 0 : ASSERT_RTNL();
9607 :
9608 0 : if (fd >= 0) {
9609 0 : new_prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP,
9610 : mode != XDP_MODE_SKB);
9611 0 : if (IS_ERR(new_prog))
9612 0 : return PTR_ERR(new_prog);
9613 : }
9614 :
9615 0 : if (expected_fd >= 0) {
9616 0 : old_prog = bpf_prog_get_type_dev(expected_fd, BPF_PROG_TYPE_XDP,
9617 : mode != XDP_MODE_SKB);
9618 0 : if (IS_ERR(old_prog)) {
9619 0 : err = PTR_ERR(old_prog);
9620 0 : old_prog = NULL;
9621 0 : goto err_out;
9622 : }
9623 : }
9624 :
9625 0 : err = dev_xdp_attach(dev, extack, NULL, new_prog, old_prog, flags);
9626 :
9627 0 : err_out:
9628 0 : if (err && new_prog)
9629 0 : bpf_prog_put(new_prog);
9630 0 : if (old_prog)
9631 0 : bpf_prog_put(old_prog);
9632 0 : return err;
9633 : }
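/*
 * Hedged sketch (illustration only): attaching a generic-mode XDP program
 * from a BPF program fd, roughly what the IFLA_XDP netlink handler does.
 * Passing -1 as expected_fd means "replace whatever is attached", since
 * XDP_FLAGS_REPLACE is not set.
 */
static int example_attach_generic_xdp(struct net_device *dev, int prog_fd)
{
	ASSERT_RTNL();	/* dev_change_xdp_fd() must run under RTNL */

	return dev_change_xdp_fd(dev, NULL, prog_fd, -1 /* expected_fd */,
				 XDP_FLAGS_SKB_MODE);
}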
9634 :
9635 : /**
9636 : * dev_new_index - allocate an ifindex
9637 : * @net: the applicable net namespace
9638 : *
9639 : * Returns a suitable unique value for a new device interface
9640 : * number. The caller must hold the rtnl semaphore or the
9641 : * dev_base_lock to be sure it remains unique.
9642 : */
9643 2 : static int dev_new_index(struct net *net)
9644 : {
9645 2 : int ifindex = net->ifindex;
9646 :
9647 2 : for (;;) {
9648 2 : if (++ifindex <= 0)
9649 : ifindex = 1;
9650 2 : if (!__dev_get_by_index(net, ifindex))
9651 2 : return net->ifindex = ifindex;
9652 : }
9653 : }
9654 :
9655 : /* Delayed registration/unregistration */
9656 : static LIST_HEAD(net_todo_list);
9657 : DECLARE_WAIT_QUEUE_HEAD(netdev_unregistering_wq);
9658 :
9659 0 : static void net_set_todo(struct net_device *dev)
9660 : {
9661 0 : list_add_tail(&dev->todo_list, &net_todo_list);
9662 0 : dev_net(dev)->dev_unreg_count++;
9663 : }
9664 :
9665 0 : static netdev_features_t netdev_sync_upper_features(struct net_device *lower,
9666 : struct net_device *upper, netdev_features_t features)
9667 : {
9668 0 : netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9669 0 : netdev_features_t feature;
9670 0 : int feature_bit;
9671 :
9672 0 : for_each_netdev_feature(upper_disables, feature_bit) {
9673 0 : feature = __NETIF_F_BIT(feature_bit);
9674 0 : if (!(upper->wanted_features & feature)
9675 0 : && (features & feature)) {
9676 0 : netdev_dbg(lower, "Dropping feature %pNF, upper dev %s has it off.\n",
9677 : &feature, upper->name);
9678 0 : features &= ~feature;
9679 : }
9680 : }
9681 :
9682 0 : return features;
9683 : }
9684 :
9685 0 : static void netdev_sync_lower_features(struct net_device *upper,
9686 : struct net_device *lower, netdev_features_t features)
9687 : {
9688 0 : netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
9689 0 : netdev_features_t feature;
9690 0 : int feature_bit;
9691 :
9692 0 : for_each_netdev_feature(upper_disables, feature_bit) {
9693 0 : feature = __NETIF_F_BIT(feature_bit);
9694 0 : if (!(features & feature) && (lower->features & feature)) {
9695 0 : netdev_dbg(upper, "Disabling feature %pNF on lower dev %s.\n",
9696 : &feature, lower->name);
9697 0 : lower->wanted_features &= ~feature;
9698 0 : __netdev_update_features(lower);
9699 :
9700 0 : if (unlikely(lower->features & feature))
9701 0 : netdev_WARN(upper, "failed to disable %pNF on %s!\n",
9702 : &feature, lower->name);
9703 : else
9704 0 : netdev_features_change(lower);
9705 : }
9706 : }
9707 0 : }
9708 :
9709 4 : static netdev_features_t netdev_fix_features(struct net_device *dev,
9710 : netdev_features_t features)
9711 : {
9712 : /* Fix illegal checksum combinations */
9713 4 : if ((features & NETIF_F_HW_CSUM) &&
9714 2 : (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
9715 0 : netdev_warn(dev, "mixed HW and IP checksum settings.\n");
9716 0 : features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
9717 : }
9718 :
9719 : /* TSO requires that SG is present as well. */
9720 4 : if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
9721 0 : netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
9722 0 : features &= ~NETIF_F_ALL_TSO;
9723 : }
9724 :
9725 4 : if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
9726 : !(features & NETIF_F_IP_CSUM)) {
9727 0 : netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
9728 0 : features &= ~NETIF_F_TSO;
9729 0 : features &= ~NETIF_F_TSO_ECN;
9730 : }
9731 :
9732 4 : if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
9733 : !(features & NETIF_F_IPV6_CSUM)) {
9734 0 : netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
9735 0 : features &= ~NETIF_F_TSO6;
9736 : }
9737 :
9738 : /* TSO with IPv4 ID mangling requires IPv4 TSO be enabled */
9739 4 : if ((features & NETIF_F_TSO_MANGLEID) && !(features & NETIF_F_TSO))
9740 0 : features &= ~NETIF_F_TSO_MANGLEID;
9741 :
9742 : /* TSO ECN requires that TSO is present as well. */
9743 4 : if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
9744 0 : features &= ~NETIF_F_TSO_ECN;
9745 :
9746 : /* Software GSO depends on SG. */
9747 4 : if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
9748 2 : netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
9749 2 : features &= ~NETIF_F_GSO;
9750 : }
9751 :
9752 : /* GSO partial features require GSO partial be set */
9753 4 : if ((features & dev->gso_partial_features) &&
9754 0 : !(features & NETIF_F_GSO_PARTIAL)) {
9755 0 : netdev_dbg(dev,
9756 : "Dropping partially supported GSO features since no GSO partial.\n");
9757 0 : features &= ~dev->gso_partial_features;
9758 : }
9759 :
9760 4 : if (!(features & NETIF_F_RXCSUM)) {
9761 : /* NETIF_F_GRO_HW implies doing RXCSUM since every packet
9762 : * successfully merged by hardware must also have the
9763 : * checksum verified by hardware. If the user does not
9764 : * want to enable RXCSUM, logically, we should disable GRO_HW.
9765 : */
9766 2 : if (features & NETIF_F_GRO_HW) {
9767 0 : netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n");
9768 0 : features &= ~NETIF_F_GRO_HW;
9769 : }
9770 : }
9771 :
9772 : /* LRO/HW-GRO features cannot be combined with RX-FCS */
9773 4 : if (features & NETIF_F_RXFCS) {
9774 0 : if (features & NETIF_F_LRO) {
9775 0 : netdev_dbg(dev, "Dropping LRO feature since RX-FCS is requested.\n");
9776 0 : features &= ~NETIF_F_LRO;
9777 : }
9778 :
9779 0 : if (features & NETIF_F_GRO_HW) {
9780 0 : netdev_dbg(dev, "Dropping HW-GRO feature since RX-FCS is requested.\n");
9781 0 : features &= ~NETIF_F_GRO_HW;
9782 : }
9783 : }
9784 :
9785 4 : if (features & NETIF_F_HW_TLS_TX) {
9786 0 : bool ip_csum = (features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) ==
9787 : (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
9788 0 : bool hw_csum = features & NETIF_F_HW_CSUM;
9789 :
9790 0 : if (!ip_csum && !hw_csum) {
9791 0 : netdev_dbg(dev, "Dropping TLS TX HW offload feature since no CSUM feature.\n");
9792 0 : features &= ~NETIF_F_HW_TLS_TX;
9793 : }
9794 : }
9795 :
9796 4 : if ((features & NETIF_F_HW_TLS_RX) && !(features & NETIF_F_RXCSUM)) {
9797 0 : netdev_dbg(dev, "Dropping TLS RX HW offload feature since no RXCSUM feature.\n");
9798 0 : features &= ~NETIF_F_HW_TLS_RX;
9799 : }
9800 :
9801 4 : return features;
9802 : }
9803 :
9804 4 : int __netdev_update_features(struct net_device *dev)
9805 : {
9806 4 : struct net_device *upper, *lower;
9807 4 : netdev_features_t features;
9808 4 : struct list_head *iter;
9809 4 : int err = -1;
9810 :
9811 4 : ASSERT_RTNL();
9812 :
9813 4 : features = netdev_get_wanted_features(dev);
9814 :
9815 4 : if (dev->netdev_ops->ndo_fix_features)
9816 0 : features = dev->netdev_ops->ndo_fix_features(dev, features);
9817 :
9818 : /* driver might be less strict about feature dependencies */
9819 4 : features = netdev_fix_features(dev, features);
9820 :
9821 : /* some features can't be enabled if they're off on an upper device */
9822 4 : netdev_for_each_upper_dev_rcu(dev, upper, iter)
9823 0 : features = netdev_sync_upper_features(dev, upper, features);
9824 :
9825 4 : if (dev->features == features)
9826 3 : goto sync_lower;
9827 :
9828 1 : netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
9829 : &dev->features, &features);
9830 :
9831 1 : if (dev->netdev_ops->ndo_set_features)
9832 1 : err = dev->netdev_ops->ndo_set_features(dev, features);
9833 : else
9834 : err = 0;
9835 :
9836 1 : if (unlikely(err < 0)) {
9837 0 : netdev_err(dev,
9838 : "set_features() failed (%d); wanted %pNF, left %pNF\n",
9839 : err, &features, &dev->features);
9840 : /* return non-0 since some features might have changed and
9841 : * it's better to fire a spurious notification than miss it
9842 : */
9843 0 : return -1;
9844 : }
9845 :
9846 1 : sync_lower:
9847 : /* some features must be disabled on lower devices when disabled
9848 : * on an upper device (think: bonding master or bridge)
9849 : */
9850 8 : netdev_for_each_lower_dev(dev, lower, iter)
9851 0 : netdev_sync_lower_features(dev, lower, features);
9852 :
9853 4 : if (!err) {
9854 1 : netdev_features_t diff = features ^ dev->features;
9855 :
9856 1 : if (diff & NETIF_F_RX_UDP_TUNNEL_PORT) {
9857 : /* udp_tunnel_{get,drop}_rx_info both need
9858 : * NETIF_F_RX_UDP_TUNNEL_PORT enabled on the
9859 : * device, or they won't do anything.
9860 : * Thus we need to update dev->features
9861 : * *before* calling udp_tunnel_get_rx_info,
9862 : * but *after* calling udp_tunnel_drop_rx_info.
9863 : */
9864 0 : if (features & NETIF_F_RX_UDP_TUNNEL_PORT) {
9865 0 : dev->features = features;
9866 0 : udp_tunnel_get_rx_info(dev);
9867 : } else {
9868 0 : udp_tunnel_drop_rx_info(dev);
9869 : }
9870 : }
9871 :
9872 1 : if (diff & NETIF_F_HW_VLAN_CTAG_FILTER) {
9873 0 : if (features & NETIF_F_HW_VLAN_CTAG_FILTER) {
9874 0 : dev->features = features;
9875 0 : err |= vlan_get_rx_ctag_filter_info(dev);
9876 : } else {
9877 0 : vlan_drop_rx_ctag_filter_info(dev);
9878 : }
9879 : }
9880 :
9881 1 : if (diff & NETIF_F_HW_VLAN_STAG_FILTER) {
9882 0 : if (features & NETIF_F_HW_VLAN_STAG_FILTER) {
9883 0 : dev->features = features;
9884 0 : err |= vlan_get_rx_stag_filter_info(dev);
9885 : } else {
9886 0 : vlan_drop_rx_stag_filter_info(dev);
9887 : }
9888 : }
9889 :
9890 1 : dev->features = features;
9891 : }
9892 :
9893 4 : return err < 0 ? 0 : 1;
9894 : }
9895 :
9896 : /**
9897 : * netdev_update_features - recalculate device features
9898 : * @dev: the device to check
9899 : *
9900 : * Recalculate dev->features set and send notifications if it
9901 : * has changed. Should be called after driver- or hardware-dependent
9902 : * conditions that influence the feature set might have changed.
9903 : */
9904 0 : void netdev_update_features(struct net_device *dev)
9905 : {
9906 0 : if (__netdev_update_features(dev))
9907 0 : netdev_features_change(dev);
9908 0 : }
9909 : EXPORT_SYMBOL(netdev_update_features);
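/*
 * Hedged example (illustration only): a driver re-running the feature
 * negotiation above after a hardware-dependent condition changed, here an
 * imaginary firmware capability bit. The rtnl lock is taken because
 * __netdev_update_features() asserts it.
 */
static void example_firmware_capability_changed(struct net_device *dev,
						bool tso_capable)
{
	rtnl_lock();
	if (tso_capable)
		dev->hw_features |= NETIF_F_TSO;
	else
		dev->hw_features &= ~NETIF_F_TSO;
	netdev_update_features(dev);
	rtnl_unlock();
}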
9910 :
9911 : /**
9912 : * netdev_change_features - recalculate device features
9913 : * @dev: the device to check
9914 : *
9915 : * Recalculate dev->features set and send notifications even
9916 : * if they have not changed. Should be called instead of
9917 : * netdev_update_features() if dev->vlan_features might also
9918 : * have changed, to allow the changes to be propagated to stacked
9919 : * VLAN devices.
9920 : */
9921 0 : void netdev_change_features(struct net_device *dev)
9922 : {
9923 0 : __netdev_update_features(dev);
9924 0 : netdev_features_change(dev);
9925 0 : }
9926 : EXPORT_SYMBOL(netdev_change_features);
9927 :
9928 : /**
9929 : * netif_stacked_transfer_operstate - transfer operstate
9930 : * @rootdev: the root or lower level device to transfer state from
9931 : * @dev: the device to transfer operstate to
9932 : *
9933 : * Transfer operational state from root to device. This is normally
9934 : * called when a stacking relationship exists between the root
9935 : * device and the device (a leaf device).
9936 : */
9937 0 : void netif_stacked_transfer_operstate(const struct net_device *rootdev,
9938 : struct net_device *dev)
9939 : {
9940 0 : if (rootdev->operstate == IF_OPER_DORMANT)
9941 0 : netif_dormant_on(dev);
9942 : else
9943 0 : netif_dormant_off(dev);
9944 :
9945 0 : if (rootdev->operstate == IF_OPER_TESTING)
9946 0 : netif_testing_on(dev);
9947 : else
9948 0 : netif_testing_off(dev);
9949 :
9950 0 : if (netif_carrier_ok(rootdev))
9951 0 : netif_carrier_on(dev);
9952 : else
9953 0 : netif_carrier_off(dev);
9954 0 : }
9955 : EXPORT_SYMBOL(netif_stacked_transfer_operstate);
9956 :
9957 3 : static int netif_alloc_rx_queues(struct net_device *dev)
9958 : {
9959 3 : unsigned int i, count = dev->num_rx_queues;
9960 3 : struct netdev_rx_queue *rx;
9961 3 : size_t sz = count * sizeof(*rx);
9962 3 : int err = 0;
9963 :
9964 3 : BUG_ON(count < 1);
9965 :
9966 3 : rx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
9967 3 : if (!rx)
9968 : return -ENOMEM;
9969 :
9970 3 : dev->_rx = rx;
9971 :
9972 6 : for (i = 0; i < count; i++) {
9973 3 : rx[i].dev = dev;
9974 :
9975 : /* XDP RX-queue setup */
9976 3 : err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i, 0);
9977 3 : if (err < 0)
9978 0 : goto err_rxq_info;
9979 : }
9980 : return 0;
9981 :
9982 0 : err_rxq_info:
9983 : /* Rollback successful reg's and free other resources */
9984 0 : while (i--)
9985 0 : xdp_rxq_info_unreg(&rx[i].xdp_rxq);
9986 0 : kvfree(dev->_rx);
9987 0 : dev->_rx = NULL;
9988 0 : return err;
9989 : }
9990 :
9991 0 : static void netif_free_rx_queues(struct net_device *dev)
9992 : {
9993 0 : unsigned int i, count = dev->num_rx_queues;
9994 :
9995 : /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */
9996 0 : if (!dev->_rx)
9997 : return;
9998 :
9999 0 : for (i = 0; i < count; i++)
10000 0 : xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq);
10001 :
10002 0 : kvfree(dev->_rx);
10003 : }
10004 :
10005 3 : static void netdev_init_one_queue(struct net_device *dev,
10006 : struct netdev_queue *queue, void *_unused)
10007 : {
10008 : /* Initialize queue lock */
10009 3 : spin_lock_init(&queue->_xmit_lock);
10010 3 : netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
10011 3 : queue->xmit_lock_owner = -1;
10012 3 : netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
10013 3 : queue->dev = dev;
10014 : #ifdef CONFIG_BQL
10015 3 : dql_init(&queue->dql, HZ);
10016 : #endif
10017 3 : }
10018 :
10019 0 : static void netif_free_tx_queues(struct net_device *dev)
10020 : {
10021 0 : kvfree(dev->_tx);
10022 : }
10023 :
10024 3 : static int netif_alloc_netdev_queues(struct net_device *dev)
10025 : {
10026 3 : unsigned int count = dev->num_tx_queues;
10027 3 : struct netdev_queue *tx;
10028 3 : size_t sz = count * sizeof(*tx);
10029 :
10030 3 : if (count < 1 || count > 0xffff)
10031 : return -EINVAL;
10032 :
10033 3 : tx = kvzalloc(sz, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
10034 3 : if (!tx)
10035 : return -ENOMEM;
10036 :
10037 3 : dev->_tx = tx;
10038 :
10039 3 : netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
10040 3 : spin_lock_init(&dev->tx_global_lock);
10041 :
10042 3 : return 0;
10043 : }
10044 :
10045 0 : void netif_tx_stop_all_queues(struct net_device *dev)
10046 : {
10047 0 : unsigned int i;
10048 :
10049 0 : for (i = 0; i < dev->num_tx_queues; i++) {
10050 0 : struct netdev_queue *txq = netdev_get_tx_queue(dev, i);
10051 :
10052 0 : netif_tx_stop_queue(txq);
10053 : }
10054 0 : }
10055 : EXPORT_SYMBOL(netif_tx_stop_all_queues);
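/*
 * Hedged example (illustration only): quiescing transmit before a device
 * reset. Stopping every TX queue prevents ndo_start_xmit() from being
 * invoked while the hardware is being reconfigured.
 */
static void example_begin_reset(struct net_device *dev)
{
	netif_carrier_off(dev);
	netif_tx_stop_all_queues(dev);
}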
10056 :
10057 : /**
10058 : * register_netdevice - register a network device
10059 : * @dev: device to register
10060 : *
10061 : * Take a completed network device structure and add it to the kernel
10062 : * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10063 : * chain. 0 is returned on success. A negative errno code is returned
10064 : * on a failure to set up the device, or if the name is a duplicate.
10065 : *
10066 : * Callers must hold the rtnl semaphore. You may want
10067 : * register_netdev() instead of this.
10068 : *
10069 : * BUGS:
10070 : * The locking appears insufficient to guarantee two parallel registers
10071 : * will not get the same name.
10072 : */
10073 :
10074 2 : int register_netdevice(struct net_device *dev)
10075 : {
10076 2 : int ret;
10077 2 : struct net *net = dev_net(dev);
10078 :
10079 2 : BUILD_BUG_ON(sizeof(netdev_features_t) * BITS_PER_BYTE <
10080 : NETDEV_FEATURE_COUNT);
10081 2 : BUG_ON(dev_boot_phase);
10082 2 : ASSERT_RTNL();
10083 :
10084 2 : might_sleep();
10085 :
10086 : /* When net_device's are persistent, this will be fatal. */
10087 2 : BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
10088 2 : BUG_ON(!net);
10089 :
10090 2 : ret = ethtool_check_ops(dev->ethtool_ops);
10091 2 : if (ret)
10092 : return ret;
10093 :
10094 2 : spin_lock_init(&dev->addr_list_lock);
10095 2 : netdev_set_addr_lockdep_class(dev);
10096 :
10097 2 : ret = dev_get_valid_name(net, dev, dev->name);
10098 2 : if (ret < 0)
10099 0 : goto out;
10100 :
10101 2 : ret = -ENOMEM;
10102 2 : dev->name_node = netdev_name_node_head_alloc(dev);
10103 2 : if (!dev->name_node)
10104 0 : goto out;
10105 :
10106 : /* Init, if this function is available */
10107 2 : if (dev->netdev_ops->ndo_init) {
10108 1 : ret = dev->netdev_ops->ndo_init(dev);
10109 1 : if (ret) {
10110 0 : if (ret > 0)
10111 0 : ret = -EIO;
10112 0 : goto err_free_name;
10113 : }
10114 : }
10115 :
10116 2 : if (((dev->hw_features | dev->features) &
10117 1 : NETIF_F_HW_VLAN_CTAG_FILTER) &&
10118 1 : (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
10119 1 : !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
10120 0 : netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
10121 0 : ret = -EINVAL;
10122 0 : goto err_uninit;
10123 : }
10124 :
10125 2 : ret = -EBUSY;
10126 2 : if (!dev->ifindex)
10127 2 : dev->ifindex = dev_new_index(net);
10128 0 : else if (__dev_get_by_index(net, dev->ifindex))
10129 0 : goto err_uninit;
10130 :
10131 : /* Transfer changeable features to wanted_features and enable
10132 : * software offloads (GSO and GRO).
10133 : */
10134 2 : dev->hw_features |= (NETIF_F_SOFT_FEATURES | NETIF_F_SOFT_FEATURES_OFF);
10135 2 : dev->features |= NETIF_F_SOFT_FEATURES;
10136 :
10137 2 : if (dev->udp_tunnel_nic_info) {
10138 0 : dev->features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10139 0 : dev->hw_features |= NETIF_F_RX_UDP_TUNNEL_PORT;
10140 : }
10141 :
10142 2 : dev->wanted_features = dev->features & dev->hw_features;
10143 :
10144 2 : if (!(dev->flags & IFF_LOOPBACK))
10145 1 : dev->hw_features |= NETIF_F_NOCACHE_COPY;
10146 :
10147 : /* If IPv4 TCP segmentation offload is supported we should also
10148 : * allow the device to enable segmenting the frame with the option
10149 : * of ignoring a static IP ID value. This doesn't enable the
10150 : * feature itself but allows the user to enable it later.
10151 : */
10152 2 : if (dev->hw_features & NETIF_F_TSO)
10153 1 : dev->hw_features |= NETIF_F_TSO_MANGLEID;
10154 2 : if (dev->vlan_features & NETIF_F_TSO)
10155 0 : dev->vlan_features |= NETIF_F_TSO_MANGLEID;
10156 2 : if (dev->mpls_features & NETIF_F_TSO)
10157 0 : dev->mpls_features |= NETIF_F_TSO_MANGLEID;
10158 2 : if (dev->hw_enc_features & NETIF_F_TSO)
10159 0 : dev->hw_enc_features |= NETIF_F_TSO_MANGLEID;
10160 :
10161 : /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
10162 : */
10163 2 : dev->vlan_features |= NETIF_F_HIGHDMA;
10164 :
10165 : /* Make NETIF_F_SG inheritable to tunnel devices.
10166 : */
10167 2 : dev->hw_enc_features |= NETIF_F_SG | NETIF_F_GSO_PARTIAL;
10168 :
10169 : /* Make NETIF_F_SG inheritable to MPLS.
10170 : */
10171 2 : dev->mpls_features |= NETIF_F_SG;
10172 :
10173 2 : ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
10174 2 : ret = notifier_to_errno(ret);
10175 0 : if (ret)
10176 0 : goto err_uninit;
10177 :
10178 2 : ret = netdev_register_kobject(dev);
10179 2 : if (ret) {
10180 0 : dev->reg_state = NETREG_UNREGISTERED;
10181 0 : goto err_uninit;
10182 : }
10183 2 : dev->reg_state = NETREG_REGISTERED;
10184 :
10185 2 : __netdev_update_features(dev);
10186 :
10187 : /*
10188 : * Default initial state at registration is that the
10189 : * device is present.
10190 : */
10191 :
10192 2 : set_bit(__LINK_STATE_PRESENT, &dev->state);
10193 :
10194 2 : linkwatch_init_dev(dev);
10195 :
10196 2 : dev_init_scheduler(dev);
10197 2 : dev_hold(dev);
10198 2 : list_netdevice(dev);
10199 2 : add_device_randomness(dev->dev_addr, dev->addr_len);
10200 :
10201 : /* If the device has a permanent device address, the driver should
10202 : * set dev_addr and addr_assign_type should also be set to
10203 : * NET_ADDR_PERM (default value).
10204 : */
10205 2 : if (dev->addr_assign_type == NET_ADDR_PERM)
10206 2 : memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
10207 :
10208 : /* Notify protocols, that a new device appeared. */
10209 2 : ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
10210 2 : ret = notifier_to_errno(ret);
10211 0 : if (ret) {
10212 : /* Expect explicit free_netdev() on failure */
10213 0 : dev->needs_free_netdev = false;
10214 0 : unregister_netdevice_queue(dev, NULL);
10215 0 : goto out;
10216 : }
10217 : /*
10218 : * Prevent userspace races by waiting until the network
10219 : * device is fully set up before sending notifications.
10220 : */
10221 2 : if (!dev->rtnl_link_ops ||
10222 0 : dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10223 2 : rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
10224 :
10225 0 : out:
10226 : return ret;
10227 :
10228 0 : err_uninit:
10229 0 : if (dev->netdev_ops->ndo_uninit)
10230 0 : dev->netdev_ops->ndo_uninit(dev);
10231 0 : if (dev->priv_destructor)
10232 0 : dev->priv_destructor(dev);
10233 0 : err_free_name:
10234 0 : netdev_name_node_free(dev->name_node);
10235 0 : goto out;
10236 : }
10237 : EXPORT_SYMBOL(register_netdevice);
10238 :
10239 : /**
10240 : * init_dummy_netdev - init a dummy network device for NAPI
10241 : * @dev: device to init
10242 : *
10243 : * This takes a network device structure and initializes the minimum
10244 : * number of fields so it can be used to schedule NAPI polls without
10245 : * registering a full blown interface. This is to be used by drivers
10246 : * that need to tie several hardware interfaces to a single NAPI
10247 : * poll scheduler due to HW limitations.
10248 : */
10249 0 : int init_dummy_netdev(struct net_device *dev)
10250 : {
10251 : /* Clear everything. Note we don't initialize spinlocks
10252 : * as they aren't supposed to be taken by any of the
10253 : * NAPI code and this dummy netdev is supposed to be
10254 : * only ever used for NAPI polls
10255 : */
10256 0 : memset(dev, 0, sizeof(struct net_device));
10257 :
10258 : /* make sure we BUG if trying to hit standard
10259 : * register/unregister code path
10260 : */
10261 0 : dev->reg_state = NETREG_DUMMY;
10262 :
10263 : /* NAPI wants this */
10264 0 : INIT_LIST_HEAD(&dev->napi_list);
10265 :
10266 : /* a dummy interface is started by default */
10267 0 : set_bit(__LINK_STATE_PRESENT, &dev->state);
10268 0 : set_bit(__LINK_STATE_START, &dev->state);
10269 :
10270 : /* napi_busy_loop stats accounting wants this */
10271 0 : dev_net_set(dev, &init_net);
10272 :
10273 : /* Note: We don't allocate pcpu_refcnt for dummy devices,
10274 : * because users of this 'device' don't need to change
10275 : * its refcount.
10276 : */
10277 :
10278 0 : return 0;
10279 : }
10280 : EXPORT_SYMBOL_GPL(init_dummy_netdev);
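/*
 * Hedged sketch of the pattern init_dummy_netdev() exists for: a driver that
 * needs several NAPI contexts for one piece of hardware backs them with a
 * dummy, never-registered netdev. All names here are illustrative.
 */
struct example_hw {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
	/* a real driver would process up to @budget packets here */
	napi_complete_done(napi, 0);
	return 0;
}

static void example_hw_init(struct example_hw *hw)
{
	init_dummy_netdev(&hw->napi_dev);
	netif_napi_add(&hw->napi_dev, &hw->napi, example_poll,
		       NAPI_POLL_WEIGHT);
}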
10281 :
10282 :
10283 : /**
10284 : * register_netdev - register a network device
10285 : * @dev: device to register
10286 : *
10287 : * Take a completed network device structure and add it to the kernel
10288 : * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
10289 : * chain. 0 is returned on success. A negative errno code is returned
10290 : * on a failure to set up the device, or if the name is a duplicate.
10291 : *
10292 : * This is a wrapper around register_netdevice that takes the rtnl semaphore
10293 : * and expands the device name if you passed a format string to
10294 : * alloc_netdev.
10295 : */
10296 2 : int register_netdev(struct net_device *dev)
10297 : {
10298 2 : int err;
10299 :
10300 2 : if (rtnl_lock_killable())
10301 : return -EINTR;
10302 2 : err = register_netdevice(dev);
10303 2 : rtnl_unlock();
10304 2 : return err;
10305 : }
10306 : EXPORT_SYMBOL(register_netdev);
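/*
 * Hedged sketch (illustration only) of the usual probe-time pattern around
 * register_netdev(): allocate with an Ethernet setup, fill in ops, register,
 * and free on failure. struct example_priv and example_probe are made up.
 */
struct example_priv {
	void __iomem *regs;
};

static int example_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct example_priv));
	if (!dev)
		return -ENOMEM;

	/* dev->netdev_ops, dev->ethtool_ops, MAC address etc. go here */

	err = register_netdev(dev);
	if (err) {
		free_netdev(dev);
		return err;
	}
	return 0;
}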
10307 :
10308 0 : int netdev_refcnt_read(const struct net_device *dev)
10309 : {
10310 0 : int i, refcnt = 0;
10311 :
10312 0 : for_each_possible_cpu(i)
10313 0 : refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
10314 0 : return refcnt;
10315 : }
10316 : EXPORT_SYMBOL(netdev_refcnt_read);
10317 :
10318 : #define WAIT_REFS_MIN_MSECS 1
10319 : #define WAIT_REFS_MAX_MSECS 250
10320 : /**
10321 : * netdev_wait_allrefs - wait until all references are gone.
10322 : * @dev: target net_device
10323 : *
10324 : * This is called when unregistering network devices.
10325 : *
10326 : * Any protocol or device that holds a reference should register
10327 : * for netdevice notification, and cleanup and put back the
10328 : * reference if they receive an UNREGISTER event.
10329 : * We can get stuck here if buggy protocols don't correctly
10330 : * call dev_put.
10331 : */
10332 0 : static void netdev_wait_allrefs(struct net_device *dev)
10333 : {
10334 0 : unsigned long rebroadcast_time, warning_time;
10335 0 : int wait = 0, refcnt;
10336 :
10337 0 : linkwatch_forget_dev(dev);
10338 :
10339 0 : rebroadcast_time = warning_time = jiffies;
10340 0 : refcnt = netdev_refcnt_read(dev);
10341 :
10342 0 : while (refcnt != 0) {
10343 0 : if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
10344 0 : rtnl_lock();
10345 :
10346 : /* Rebroadcast unregister notification */
10347 0 : call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10348 :
10349 0 : __rtnl_unlock();
10350 0 : rcu_barrier();
10351 0 : rtnl_lock();
10352 :
10353 0 : if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
10354 0 : &dev->state)) {
10355 : /* We must not have linkwatch events
10356 : * pending on unregister. If this
10357 : * happens, we simply run the queue
10358 : * unscheduled, resulting in a noop
10359 : * for this device.
10360 : */
10361 0 : linkwatch_run_queue();
10362 : }
10363 :
10364 0 : __rtnl_unlock();
10365 :
10366 0 : rebroadcast_time = jiffies;
10367 : }
10368 :
10369 0 : if (!wait) {
10370 0 : rcu_barrier();
10371 0 : wait = WAIT_REFS_MIN_MSECS;
10372 : } else {
10373 0 : msleep(wait);
10374 0 : wait = min(wait << 1, WAIT_REFS_MAX_MSECS);
10375 : }
10376 :
10377 0 : refcnt = netdev_refcnt_read(dev);
10378 :
10379 0 : if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) {
10380 0 : pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
10381 : dev->name, refcnt);
10382 0 : warning_time = jiffies;
10383 : }
10384 : }
10385 0 : }
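/*
 * Hedged example (illustration only) of the reference discipline the loop
 * above is waiting on: code that caches a net_device pointer holds a
 * reference with dev_hold() and must dev_put() it on NETDEV_UNREGISTER,
 * otherwise "waiting for %s to become free" is logged forever.
 */
static struct net_device *example_cached_dev;

static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);

	if (event == NETDEV_UNREGISTER && dev == example_cached_dev) {
		dev_put(example_cached_dev);
		example_cached_dev = NULL;
	}
	return NOTIFY_DONE;
}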
10386 :
10387 : /* The sequence is:
10388 : *
10389 : * rtnl_lock();
10390 : * ...
10391 : * register_netdevice(x1);
10392 : * register_netdevice(x2);
10393 : * ...
10394 : * unregister_netdevice(y1);
10395 : * unregister_netdevice(y2);
10396 : * ...
10397 : * rtnl_unlock();
10398 : * free_netdev(y1);
10399 : * free_netdev(y2);
10400 : *
10401 : * We are invoked by rtnl_unlock().
10402 : * This allows us to deal with problems:
10403 : * 1) We can delete sysfs objects which invoke hotplug
10404 : * without deadlocking with linkwatch via keventd.
10405 : * 2) Since we run with the RTNL semaphore not held, we can sleep
10406 : * safely in order to wait for the netdev refcnt to drop to zero.
10407 : *
10408 : * We must not return until all unregister events added during
10409 : * the interval the lock was held have been completed.
10410 : */
10411 69 : void netdev_run_todo(void)
10412 : {
10413 69 : struct list_head list;
10414 : #ifdef CONFIG_LOCKDEP
10415 69 : struct list_head unlink_list;
10416 :
10417 69 : list_replace_init(&net_unlink_list, &unlink_list);
10418 :
10419 69 : while (!list_empty(&unlink_list)) {
10420 0 : struct net_device *dev = list_first_entry(&unlink_list,
10421 : struct net_device,
10422 : unlink_list);
10423 0 : list_del_init(&dev->unlink_list);
10424 0 : dev->nested_level = dev->lower_level - 1;
10425 : }
10426 : #endif
10427 :
10428 : /* Snapshot list, allow later requests */
10429 69 : list_replace_init(&net_todo_list, &list);
10430 :
10431 69 : __rtnl_unlock();
10432 :
10433 :
10434 : /* Wait for rcu callbacks to finish before next phase */
10435 69 : if (!list_empty(&list))
10436 0 : rcu_barrier();
10437 :
10438 69 : while (!list_empty(&list)) {
10439 0 : struct net_device *dev
10440 0 : = list_first_entry(&list, struct net_device, todo_list);
10441 0 : list_del(&dev->todo_list);
10442 :
10443 0 : if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
10444 0 : pr_err("network todo '%s' but state %d\n",
10445 : dev->name, dev->reg_state);
10446 0 : dump_stack();
10447 0 : continue;
10448 : }
10449 :
10450 0 : dev->reg_state = NETREG_UNREGISTERED;
10451 :
10452 0 : netdev_wait_allrefs(dev);
10453 :
10454 : /* paranoia */
10455 0 : BUG_ON(netdev_refcnt_read(dev));
10456 0 : BUG_ON(!list_empty(&dev->ptype_all));
10457 0 : BUG_ON(!list_empty(&dev->ptype_specific));
10458 0 : WARN_ON(rcu_access_pointer(dev->ip_ptr));
10459 0 : WARN_ON(rcu_access_pointer(dev->ip6_ptr));
10460 : #if IS_ENABLED(CONFIG_DECNET)
10461 : WARN_ON(dev->dn_ptr);
10462 : #endif
10463 0 : if (dev->priv_destructor)
10464 0 : dev->priv_destructor(dev);
10465 0 : if (dev->needs_free_netdev)
10466 0 : free_netdev(dev);
10467 :
10468 : /* Report a network device has been unregistered */
10469 0 : rtnl_lock();
10470 0 : dev_net(dev)->dev_unreg_count--;
10471 0 : __rtnl_unlock();
10472 0 : wake_up(&netdev_unregistering_wq);
10473 :
10474 : /* Free network device */
10475 0 : kobject_put(&dev->dev.kobj);
10476 : }
10477 69 : }
10478 :
10479 : /* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
10480 : * all the same fields in the same order as net_device_stats, with only
10481 : * the type differing, but rtnl_link_stats64 may have additional fields
10482 : * at the end for newer counters.
10483 : */
10484 0 : void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
10485 : const struct net_device_stats *netdev_stats)
10486 : {
10487 : #if BITS_PER_LONG == 64
10488 0 : BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
10489 0 : memcpy(stats64, netdev_stats, sizeof(*netdev_stats));
10490 : /* zero out counters that only exist in rtnl_link_stats64 */
10491 0 : memset((char *)stats64 + sizeof(*netdev_stats), 0,
10492 : sizeof(*stats64) - sizeof(*netdev_stats));
10493 : #else
10494 : size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
10495 : const unsigned long *src = (const unsigned long *)netdev_stats;
10496 : u64 *dst = (u64 *)stats64;
10497 :
10498 : BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
10499 : for (i = 0; i < n; i++)
10500 : dst[i] = src[i];
10501 : /* zero out counters that only exist in rtnl_link_stats64 */
10502 : memset((char *)stats64 + n * sizeof(u64), 0,
10503 : sizeof(*stats64) - n * sizeof(u64));
10504 : #endif
10505 0 : }
10506 : EXPORT_SYMBOL(netdev_stats_to_stats64);
10507 :
10508 : /**
10509 : * dev_get_stats - get network device statistics
10510 : * @dev: device to get statistics from
10511 : * @storage: place to store stats
10512 : *
10513 : * Get network statistics from device. Return @storage.
10514 : * The device driver may provide its own method by setting
10515 : * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
10516 : * otherwise the internal statistics structure is used.
10517 : */
10518 16 : struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
10519 : struct rtnl_link_stats64 *storage)
10520 : {
10521 16 : const struct net_device_ops *ops = dev->netdev_ops;
10522 :
10523 16 : if (ops->ndo_get_stats64) {
10524 16 : memset(storage, 0, sizeof(*storage));
10525 16 : ops->ndo_get_stats64(dev, storage);
10526 0 : } else if (ops->ndo_get_stats) {
10527 0 : netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
10528 : } else {
10529 0 : netdev_stats_to_stats64(storage, &dev->stats);
10530 : }
10531 16 : storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
10532 16 : storage->tx_dropped += (unsigned long)atomic_long_read(&dev->tx_dropped);
10533 16 : storage->rx_nohandler += (unsigned long)atomic_long_read(&dev->rx_nohandler);
10534 16 : return storage;
10535 : }
10536 : EXPORT_SYMBOL(dev_get_stats);
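/*
 * Hedged example (illustration only): reading aggregate counters from
 * outside a driver. dev_get_stats() always fills the caller-provided
 * structure, picking ndo_get_stats64, ndo_get_stats or dev->stats as
 * described above, and then folds in the core drop counters.
 */
static u64 example_total_rx_packets(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	return stats.rx_packets;
}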
10537 :
10538 : /**
10539 : * dev_fetch_sw_netstats - get per-cpu network device statistics
10540 : * @s: place to store stats
10541 : * @netstats: per-cpu network stats to read from
10542 : *
10543 : * Read per-cpu network statistics and populate the related fields in @s.
10544 : */
10545 0 : void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s,
10546 : const struct pcpu_sw_netstats __percpu *netstats)
10547 : {
10548 0 : int cpu;
10549 :
10550 0 : for_each_possible_cpu(cpu) {
10551 0 : const struct pcpu_sw_netstats *stats;
10552 0 : struct pcpu_sw_netstats tmp;
10553 0 : unsigned int start;
10554 :
10555 0 : stats = per_cpu_ptr(netstats, cpu);
10556 0 : do {
10557 0 : start = u64_stats_fetch_begin_irq(&stats->syncp);
10558 0 : tmp.rx_packets = stats->rx_packets;
10559 0 : tmp.rx_bytes = stats->rx_bytes;
10560 0 : tmp.tx_packets = stats->tx_packets;
10561 0 : tmp.tx_bytes = stats->tx_bytes;
10562 0 : } while (u64_stats_fetch_retry_irq(&stats->syncp, start));
10563 :
10564 0 : s->rx_packets += tmp.rx_packets;
10565 0 : s->rx_bytes += tmp.rx_bytes;
10566 0 : s->tx_packets += tmp.tx_packets;
10567 0 : s->tx_bytes += tmp.tx_bytes;
10568 : }
10569 0 : }
10570 : EXPORT_SYMBOL_GPL(dev_fetch_sw_netstats);
10571 :
10572 : /**
10573 : * dev_get_tstats64 - ndo_get_stats64 implementation
10574 : * @dev: device to get statistics from
10575 : * @s: place to store stats
10576 : *
10577 : * Populate @s from dev->stats and dev->tstats. Can be used as
10578 : * the ndo_get_stats64() callback.
10579 : */
10580 0 : void dev_get_tstats64(struct net_device *dev, struct rtnl_link_stats64 *s)
10581 : {
10582 0 : netdev_stats_to_stats64(s, &dev->stats);
10583 0 : dev_fetch_sw_netstats(s, dev->tstats);
10584 0 : }
10585 : EXPORT_SYMBOL_GPL(dev_get_tstats64);
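/*
 * Hedged sketch (illustration only): a driver that accounts RX/TX in per-cpu
 * struct pcpu_sw_netstats can point its stats hook at dev_get_tstats64
 * directly, provided it allocated dev->tstats earlier, e.g. with
 * netdev_alloc_pcpu_stats(struct pcpu_sw_netstats), in its setup path.
 */
static const struct net_device_ops example_tstats_ops = {
	.ndo_get_stats64	= dev_get_tstats64,
};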
10586 :
10587 0 : struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
10588 : {
10589 0 : struct netdev_queue *queue = dev_ingress_queue(dev);
10590 :
10591 : #ifdef CONFIG_NET_CLS_ACT
10592 : if (queue)
10593 : return queue;
10594 : queue = kzalloc(sizeof(*queue), GFP_KERNEL);
10595 : if (!queue)
10596 : return NULL;
10597 : netdev_init_one_queue(dev, queue, NULL);
10598 : RCU_INIT_POINTER(queue->qdisc, &noop_qdisc);
10599 : queue->qdisc_sleeping = &noop_qdisc;
10600 : rcu_assign_pointer(dev->ingress_queue, queue);
10601 : #endif
10602 0 : return queue;
10603 : }
10604 :
10605 : static const struct ethtool_ops default_ethtool_ops;
10606 :
10607 0 : void netdev_set_default_ethtool_ops(struct net_device *dev,
10608 : const struct ethtool_ops *ops)
10609 : {
10610 0 : if (dev->ethtool_ops == &default_ethtool_ops)
10611 0 : dev->ethtool_ops = ops;
10612 0 : }
10613 : EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
10614 :
10615 0 : void netdev_freemem(struct net_device *dev)
10616 : {
10617 0 : char *addr = (char *)dev - dev->padded;
10618 :
10619 0 : kvfree(addr);
10620 0 : }
10621 :
10622 : /**
10623 : * alloc_netdev_mqs - allocate network device
10624 : * @sizeof_priv: size of private data to allocate space for
10625 : * @name: device name format string
10626 : * @name_assign_type: origin of device name
10627 : * @setup: callback to initialize device
10628 : * @txqs: the number of TX subqueues to allocate
10629 : * @rxqs: the number of RX subqueues to allocate
10630 : *
10631 : * Allocates a struct net_device with private data area for driver use
10632 : * and performs basic initialization. Also allocates subqueue structs
10633 : * for each queue on the device.
10634 : */
10635 3 : struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
10636 : unsigned char name_assign_type,
10637 : void (*setup)(struct net_device *),
10638 : unsigned int txqs, unsigned int rxqs)
10639 : {
10640 3 : struct net_device *dev;
10641 3 : unsigned int alloc_size;
10642 3 : struct net_device *p;
10643 :
10644 3 : BUG_ON(strlen(name) >= sizeof(dev->name));
10645 :
10646 3 : if (txqs < 1) {
10647 0 : pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
10648 0 : return NULL;
10649 : }
10650 :
10651 3 : if (rxqs < 1) {
10652 0 : pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
10653 0 : return NULL;
10654 : }
10655 :
10656 3 : alloc_size = sizeof(struct net_device);
10657 3 : if (sizeof_priv) {
10658 : /* ensure 32-byte alignment of private area */
10659 1 : alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
10660 1 : alloc_size += sizeof_priv;
10661 : }
10662 : /* ensure 32-byte alignment of whole construct */
10663 3 : alloc_size += NETDEV_ALIGN - 1;
10664 :
10665 3 : p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
10666 3 : if (!p)
10667 : return NULL;
10668 :
10669 3 : dev = PTR_ALIGN(p, NETDEV_ALIGN);
10670 3 : dev->padded = (char *)dev - (char *)p;
10671 :
10672 3 : dev->pcpu_refcnt = alloc_percpu(int);
10673 3 : if (!dev->pcpu_refcnt)
10674 0 : goto free_dev;
10675 :
10676 3 : if (dev_addr_init(dev))
10677 0 : goto free_pcpu;
10678 :
10679 3 : dev_mc_init(dev);
10680 3 : dev_uc_init(dev);
10681 :
10682 3 : dev_net_set(dev, &init_net);
10683 :
10684 3 : dev->gso_max_size = GSO_MAX_SIZE;
10685 3 : dev->gso_max_segs = GSO_MAX_SEGS;
10686 3 : dev->upper_level = 1;
10687 3 : dev->lower_level = 1;
10688 : #ifdef CONFIG_LOCKDEP
10689 3 : dev->nested_level = 0;
10690 3 : INIT_LIST_HEAD(&dev->unlink_list);
10691 : #endif
10692 :
10693 3 : INIT_LIST_HEAD(&dev->napi_list);
10694 3 : INIT_LIST_HEAD(&dev->unreg_list);
10695 3 : INIT_LIST_HEAD(&dev->close_list);
10696 3 : INIT_LIST_HEAD(&dev->link_watch_list);
10697 3 : INIT_LIST_HEAD(&dev->adj_list.upper);
10698 3 : INIT_LIST_HEAD(&dev->adj_list.lower);
10699 3 : INIT_LIST_HEAD(&dev->ptype_all);
10700 3 : INIT_LIST_HEAD(&dev->ptype_specific);
10701 3 : INIT_LIST_HEAD(&dev->net_notifier_list);
10702 : #ifdef CONFIG_NET_SCHED
10703 : hash_init(dev->qdisc_hash);
10704 : #endif
10705 3 : dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
10706 3 : setup(dev);
10707 :
10708 3 : if (!dev->tx_queue_len) {
10709 2 : dev->priv_flags |= IFF_NO_QUEUE;
10710 2 : dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
10711 : }
10712 :
10713 3 : dev->num_tx_queues = txqs;
10714 3 : dev->real_num_tx_queues = txqs;
10715 3 : if (netif_alloc_netdev_queues(dev))
10716 0 : goto free_all;
10717 :
10718 3 : dev->num_rx_queues = rxqs;
10719 3 : dev->real_num_rx_queues = rxqs;
10720 3 : if (netif_alloc_rx_queues(dev))
10721 0 : goto free_all;
10722 :
10723 3 : strcpy(dev->name, name);
10724 3 : dev->name_assign_type = name_assign_type;
10725 3 : dev->group = INIT_NETDEV_GROUP;
10726 3 : if (!dev->ethtool_ops)
10727 2 : dev->ethtool_ops = &default_ethtool_ops;
10728 :
10729 3 : nf_hook_ingress_init(dev);
10730 :
10731 : return dev;
10732 :
10733 0 : free_all:
10734 0 : free_netdev(dev);
10735 0 : return NULL;
10736 :
10737 0 : free_pcpu:
10738 0 : free_percpu(dev->pcpu_refcnt);
10739 0 : free_dev:
10740 0 : netdev_freemem(dev);
10741 0 : return NULL;
10742 : }
10743 : EXPORT_SYMBOL(alloc_netdev_mqs);
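
/*
 * Illustrative sketch (editor's example, not part of dev.c): how a
 * hypothetical driver might allocate a 4x4-queue device with
 * alloc_netdev_mqs() and reach its 32-byte-aligned private area via
 * netdev_priv(). "struct my_priv", "my_setup" and "my_alloc" are invented
 * names for this example only.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

struct my_priv {
	int example_state;
};

static void my_setup(struct net_device *dev)
{
	ether_setup(dev);		/* sane Ethernet defaults */
}

static struct net_device *my_alloc(void)
{
	struct net_device *dev;
	struct my_priv *priv;

	dev = alloc_netdev_mqs(sizeof(struct my_priv), "example%d",
			       NET_NAME_UNKNOWN, my_setup, 4, 4);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);	/* private area allocated above */
	priv->example_state = 0;
	return dev;
}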
10744 :
10745 : /**
10746 : * free_netdev - free network device
10747 : * @dev: device
10748 : *
10749 : * This function does the last stage of destroying an allocated device
10750 : * interface. The reference to the device object is released. If this
10751 : * is the last reference then it will be freed.Must be called in process
10752 : * context.
10753 : */
10754 0 : void free_netdev(struct net_device *dev)
10755 : {
10756 0 : struct napi_struct *p, *n;
10757 :
10758 0 : might_sleep();
10759 :
10760 : /* When called immediately after register_netdevice() has failed, the unwind
10761 : * handling may still be dismantling the device. Handle that case by
10762 : * deferring the free.
10763 : */
10764 0 : if (dev->reg_state == NETREG_UNREGISTERING) {
10765 0 : ASSERT_RTNL();
10766 0 : dev->needs_free_netdev = true;
10767 0 : return;
10768 : }
10769 :
10770 0 : netif_free_tx_queues(dev);
10771 0 : netif_free_rx_queues(dev);
10772 :
10773 0 : kfree(rcu_dereference_protected(dev->ingress_queue, 1));
10774 :
10775 : /* Flush device addresses */
10776 0 : dev_addr_flush(dev);
10777 :
10778 0 : list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
10779 0 : netif_napi_del(p);
10780 :
10781 0 : free_percpu(dev->pcpu_refcnt);
10782 0 : dev->pcpu_refcnt = NULL;
10783 0 : free_percpu(dev->xdp_bulkq);
10784 0 : dev->xdp_bulkq = NULL;
10785 :
10786 : /* Compatibility with error handling in drivers */
10787 0 : if (dev->reg_state == NETREG_UNINITIALIZED) {
10788 0 : netdev_freemem(dev);
10789 0 : return;
10790 : }
10791 :
10792 0 : BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
10793 0 : dev->reg_state = NETREG_RELEASED;
10794 :
10795 : /* will free via device release */
10796 0 : put_device(&dev->dev);
10797 : }
10798 : EXPORT_SYMBOL(free_netdev);
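
/*
 * Illustrative sketch (editor's example, not part of dev.c): the usual
 * error-unwind pattern in a driver probe path; if registration fails, the
 * device is released with free_netdev(). "my_probe_register" is an invented
 * name for this example.
 */
#include <linux/netdevice.h>

static int my_probe_register(struct net_device *dev)
{
	int err;

	err = register_netdev(dev);	/* takes and drops the rtnl lock */
	if (err) {
		free_netdev(dev);	/* never registered, memory is released */
		return err;
	}
	return 0;
}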
10799 :
10800 : /**
10801 : * synchronize_net - Synchronize with packet receive processing
10802 : *
10803 : * Wait for packets currently being received to be done.
10804 : * Does not block later packets from starting.
10805 : */
10806 1 : void synchronize_net(void)
10807 : {
10808 1 : might_sleep();
10809 1 : if (rtnl_is_locked())
10810 0 : synchronize_rcu_expedited();
10811 : else
10812 1 : synchronize_rcu();
10813 1 : }
10814 : EXPORT_SYMBOL(synchronize_net);
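
/*
 * Illustrative sketch (editor's example, not part of dev.c): using
 * synchronize_net() as the receive-path RCU grace period before freeing an
 * object the RX path may still be reading. "struct my_hook", "rx_hook" and
 * "my_hook_detach" are invented names; the caller is assumed to hold the
 * rtnl lock.
 */
#include <linux/netdevice.h>
#include <linux/rcupdate.h>
#include <linux/rtnetlink.h>
#include <linux/skbuff.h>
#include <linux/slab.h>

struct my_hook {
	void (*fn)(struct sk_buff *skb);
};

static struct my_hook __rcu *rx_hook;

static void my_hook_detach(void)		/* caller holds RTNL */
{
	struct my_hook *old = rtnl_dereference(rx_hook);

	RCU_INIT_POINTER(rx_hook, NULL);
	synchronize_net();	/* wait out readers still in the receive path */
	kfree(old);
}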
10815 :
10816 : /**
10817 : * unregister_netdevice_queue - remove device from the kernel
10818 : * @dev: device
10819 : * @head: list
10820 : *
10821 : * This function shuts down a device interface and removes it
10822 : * from the kernel tables.
10823 : * If head is not NULL, the device is queued to be unregistered later.
10824 : *
10825 : * Callers must hold the rtnl semaphore. You may want
10826 : * unregister_netdev() instead of this.
10827 : */
10828 :
10829 0 : void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
10830 : {
10831 0 : ASSERT_RTNL();
10832 :
10833 0 : if (head) {
10834 0 : list_move_tail(&dev->unreg_list, head);
10835 : } else {
10836 0 : LIST_HEAD(single);
10837 :
10838 0 : list_add(&dev->unreg_list, &single);
10839 0 : unregister_netdevice_many(&single);
10840 : }
10841 0 : }
10842 : EXPORT_SYMBOL(unregister_netdevice_queue);
10843 :
10844 : /**
10845 : * unregister_netdevice_many - unregister many devices
10846 : * @head: list of devices
10847 : *
10848 : * Note: As most callers use a stack-allocated list_head,
10849 : * we force a list_del() to make sure the stack won't be corrupted later.
10850 : */
10851 0 : void unregister_netdevice_many(struct list_head *head)
10852 : {
10853 0 : struct net_device *dev, *tmp;
10854 0 : LIST_HEAD(close_head);
10855 :
10856 0 : BUG_ON(dev_boot_phase);
10857 0 : ASSERT_RTNL();
10858 :
10859 0 : if (list_empty(head))
10860 0 : return;
10861 :
10862 0 : list_for_each_entry_safe(dev, tmp, head, unreg_list) {
10863 : /* Some devices call this without ever having been
10864 : * registered, as part of their initialization unwind.
10865 : * Remove those devices and proceed with the remaining ones.
10866 : */
10867 0 : if (dev->reg_state == NETREG_UNINITIALIZED) {
10868 0 : pr_debug("unregister_netdevice: device %s/%p never was registered\n",
10869 : dev->name, dev);
10870 :
10871 0 : WARN_ON(1);
10872 0 : list_del(&dev->unreg_list);
10873 0 : continue;
10874 : }
10875 0 : dev->dismantle = true;
10876 0 : BUG_ON(dev->reg_state != NETREG_REGISTERED);
10877 : }
10878 :
10879 : /* If device is running, close it first. */
10880 0 : list_for_each_entry(dev, head, unreg_list)
10881 0 : list_add_tail(&dev->close_list, &close_head);
10882 0 : dev_close_many(&close_head, true);
10883 :
10884 0 : list_for_each_entry(dev, head, unreg_list) {
10885 : /* And unlink it from device chain. */
10886 0 : unlist_netdevice(dev);
10887 :
10888 0 : dev->reg_state = NETREG_UNREGISTERING;
10889 : }
10890 0 : flush_all_backlogs();
10891 :
10892 0 : synchronize_net();
10893 :
10894 0 : list_for_each_entry(dev, head, unreg_list) {
10895 0 : struct sk_buff *skb = NULL;
10896 :
10897 : /* Shut down the queueing discipline. */
10898 0 : dev_shutdown(dev);
10899 :
10900 0 : dev_xdp_uninstall(dev);
10901 :
10902 : /* Notify protocols that we are about to destroy
10903 : * this device. They should clean up all of their state.
10904 : */
10905 0 : call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
10906 :
10907 0 : if (!dev->rtnl_link_ops ||
10908 0 : dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
10909 0 : skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0,
10910 : GFP_KERNEL, NULL, 0);
10911 :
10912 : /*
10913 : * Flush the unicast and multicast chains
10914 : */
10915 0 : dev_uc_flush(dev);
10916 0 : dev_mc_flush(dev);
10917 :
10918 0 : netdev_name_node_alt_flush(dev);
10919 0 : netdev_name_node_free(dev->name_node);
10920 :
10921 0 : if (dev->netdev_ops->ndo_uninit)
10922 0 : dev->netdev_ops->ndo_uninit(dev);
10923 :
10924 0 : if (skb)
10925 0 : rtmsg_ifinfo_send(skb, dev, GFP_KERNEL);
10926 :
10927 : /* The notifier chain MUST have detached all upper and lower devices from us. */
10928 0 : WARN_ON(netdev_has_any_upper_dev(dev));
10929 0 : WARN_ON(netdev_has_any_lower_dev(dev));
10930 :
10931 : /* Remove entries from kobject tree */
10932 0 : netdev_unregister_kobject(dev);
10933 : #ifdef CONFIG_XPS
10934 : /* Remove XPS queueing entries */
10935 0 : netif_reset_xps_queues_gt(dev, 0);
10936 : #endif
10937 : }
10938 :
10939 0 : synchronize_net();
10940 :
10941 0 : list_for_each_entry(dev, head, unreg_list) {
10942 0 : dev_put(dev);
10943 0 : net_set_todo(dev);
10944 : }
10945 :
10946 0 : list_del(head);
10947 : }
10948 : EXPORT_SYMBOL(unregister_netdevice_many);
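
/*
 * Illustrative sketch (editor's example, not part of dev.c): batching two
 * unregistrations under a single rtnl_lock()/rtnl_unlock() pair.
 * "my_destroy_pair" is an invented name; the stack list head is safe
 * because unregister_netdevice_many() list_del()s it before returning.
 */
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void my_destroy_pair(struct net_device *a, struct net_device *b)
{
	LIST_HEAD(kill_list);

	rtnl_lock();
	unregister_netdevice_queue(a, &kill_list);
	unregister_netdevice_queue(b, &kill_list);
	unregister_netdevice_many(&kill_list);
	rtnl_unlock();
}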
10949 :
10950 : /**
10951 : * unregister_netdev - remove device from the kernel
10952 : * @dev: device
10953 : *
10954 : * This function shuts down a device interface and removes it
10955 : * from the kernel tables.
10956 : *
10957 : * This is just a wrapper for unregister_netdevice that takes
10958 : * the rtnl semaphore. In general you want to use this and not
10959 : * unregister_netdevice.
10960 : */
10961 0 : void unregister_netdev(struct net_device *dev)
10962 : {
10963 0 : rtnl_lock();
10964 0 : unregister_netdevice(dev);
10965 0 : rtnl_unlock();
10966 0 : }
10967 : EXPORT_SYMBOL(unregister_netdev);
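
/*
 * Illustrative sketch (editor's example, not part of dev.c): the usual
 * driver teardown order; unregister first (unregister_netdev() takes the
 * rtnl lock itself), then release the memory with free_netdev().
 * "my_remove" is an invented name for this example.
 */
#include <linux/netdevice.h>

static void my_remove(struct net_device *dev)
{
	unregister_netdev(dev);
	free_netdev(dev);
}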
10968 :
10969 : /**
10970 : * dev_change_net_namespace - move device to a different network namespace
10971 : * @dev: device
10972 : * @net: network namespace
10973 : * @pat: If not NULL, name pattern to try if the current device name
10974 : * is already taken in the destination network namespace.
10975 : *
10976 : * This function shuts down a device interface and moves it
10977 : * to a new network namespace. On success 0 is returned, on
10978 : * a failure a netagive errno code is returned.
10979 : *
10980 : * Callers must hold the rtnl semaphore.
10981 : */
10982 :
10983 0 : int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
10984 : {
10985 0 : struct net *net_old = dev_net(dev);
10986 0 : int err, new_nsid, new_ifindex;
10987 :
10988 0 : ASSERT_RTNL();
10989 :
10990 : /* Don't allow namespace local devices to be moved. */
10991 0 : err = -EINVAL;
10992 0 : if (dev->features & NETIF_F_NETNS_LOCAL)
10993 0 : goto out;
10994 :
10995 : /* Ensure the device has been registered */
10996 0 : if (dev->reg_state != NETREG_REGISTERED)
10997 0 : goto out;
10998 :
10999 : /* Get out if there is nothing to do */
11000 0 : err = 0;
11001 0 : if (net_eq(net_old, net))
11002 0 : goto out;
11003 :
11004 : /* Pick the destination device name, and ensure
11005 : * we can use it in the destination network namespace.
11006 : */
11007 : err = -EEXIST;
11008 : if (__dev_get_by_name(net, dev->name)) {
11009 : /* We get here if we can't use the current device name */
11010 : if (!pat)
11011 : goto out;
11012 : err = dev_get_valid_name(net, dev, pat);
11013 : if (err < 0)
11014 : goto out;
11015 : }
11016 :
11017 : /*
11018 : * And now a mini version of register_netdevice() and unregister_netdevice().
11019 : */
11020 :
11021 : /* If the device is running, close it first. */
11022 : dev_close(dev);
11023 :
11024 : /* And unlink it from device chain */
11025 : unlist_netdevice(dev);
11026 :
11027 : synchronize_net();
11028 :
11029 : /* Shutdown queueing discipline. */
11030 : dev_shutdown(dev);
11031 :
11032 : /* Notify protocols that we are about to destroy
11033 : * this device. They should clean up all of their state.
11034 : *
11035 : * Note that dev->reg_state stays at NETREG_REGISTERED.
11036 : * This is wanted so that 8021q and macvlan know
11037 : * the device is just moving and can keep their slaves up.
11038 : */
11039 : call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
11040 : rcu_barrier();
11041 :
11042 : new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL);
11043 : /* If there is an ifindex conflict, assign a new one */
11044 : if (__dev_get_by_index(net, dev->ifindex))
11045 : new_ifindex = dev_new_index(net);
11046 : else
11047 : new_ifindex = dev->ifindex;
11048 :
11049 : rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid,
11050 : new_ifindex);
11051 :
11052 : /*
11053 : * Flush the unicast and multicast chains
11054 : */
11055 : dev_uc_flush(dev);
11056 : dev_mc_flush(dev);
11057 :
11058 : /* Send a netdev-removed uevent to the old namespace */
11059 : kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
11060 : netdev_adjacent_del_links(dev);
11061 :
11062 : /* Move per-net netdevice notifiers that are following the netdevice */
11063 : move_netdevice_notifiers_dev_net(dev, net);
11064 :
11065 : /* Actually switch the network namespace */
11066 : dev_net_set(dev, net);
11067 : dev->ifindex = new_ifindex;
11068 :
11069 : /* Send a netdev-add uevent to the new namespace */
11070 : kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
11071 : netdev_adjacent_add_links(dev);
11072 :
11073 : /* Fixup kobjects */
11074 : err = device_rename(&dev->dev, dev->name);
11075 : WARN_ON(err);
11076 :
11077 : /* Adapt the owner in case the owning user namespace of the target
11078 : * network namespace is different from the original one.
11079 : */
11080 : err = netdev_change_owner(dev, net_old, net);
11081 : WARN_ON(err);
11082 :
11083 : /* Add the device back in the hashes */
11084 : list_netdevice(dev);
11085 :
11086 : /* Notify protocols that a new device has appeared. */
11087 : call_netdevice_notifiers(NETDEV_REGISTER, dev);
11088 :
11089 : /*
11090 : * Prevent userspace races by waiting until the network
11091 : * device is fully set up before sending notifications.
11092 : */
11093 : rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U, GFP_KERNEL);
11094 :
11095 : synchronize_net();
11096 : err = 0;
11097 0 : out:
11098 0 : return err;
11099 : }
11100 : EXPORT_SYMBOL_GPL(dev_change_net_namespace);
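
/*
 * Illustrative sketch (editor's example, not part of dev.c): moving a
 * device into another namespace under the rtnl lock. "my_move_dev" is an
 * invented name; "eth%d" is only the fallback pattern used when the current
 * name is already taken in the target namespace.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <net/net_namespace.h>

static int my_move_dev(struct net_device *dev, struct net *target)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target, "eth%d");
	rtnl_unlock();
	return err;
}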
11101 :
11102 0 : static int dev_cpu_dead(unsigned int oldcpu)
11103 : {
11104 0 : struct sk_buff **list_skb;
11105 0 : struct sk_buff *skb;
11106 0 : unsigned int cpu;
11107 0 : struct softnet_data *sd, *oldsd, *remsd = NULL;
11108 :
11109 0 : local_irq_disable();
11110 0 : cpu = smp_processor_id();
11111 0 : sd = &per_cpu(softnet_data, cpu);
11112 0 : oldsd = &per_cpu(softnet_data, oldcpu);
11113 :
11114 : /* Find end of our completion_queue. */
11115 0 : list_skb = &sd->completion_queue;
11116 0 : while (*list_skb)
11117 0 : list_skb = &(*list_skb)->next;
11118 : /* Append completion queue from offline CPU. */
11119 0 : *list_skb = oldsd->completion_queue;
11120 0 : oldsd->completion_queue = NULL;
11121 :
11122 : /* Append output queue from offline CPU. */
11123 0 : if (oldsd->output_queue) {
11124 0 : *sd->output_queue_tailp = oldsd->output_queue;
11125 0 : sd->output_queue_tailp = oldsd->output_queue_tailp;
11126 0 : oldsd->output_queue = NULL;
11127 0 : oldsd->output_queue_tailp = &oldsd->output_queue;
11128 : }
11129 : /* Append NAPI poll list from offline CPU, with one exception:
11130 : * process_backlog() must be called by the CPU owning the percpu backlog.
11131 : * We properly handle process_queue & input_pkt_queue later.
11132 : */
11133 0 : while (!list_empty(&oldsd->poll_list)) {
11134 0 : struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
11135 : struct napi_struct,
11136 : poll_list);
11137 :
11138 0 : list_del_init(&napi->poll_list);
11139 0 : if (napi->poll == process_backlog)
11140 0 : napi->state = 0;
11141 : else
11142 0 : ____napi_schedule(sd, napi);
11143 : }
11144 :
11145 0 : raise_softirq_irqoff(NET_TX_SOFTIRQ);
11146 0 : local_irq_enable();
11147 :
11148 : #ifdef CONFIG_RPS
11149 0 : remsd = oldsd->rps_ipi_list;
11150 0 : oldsd->rps_ipi_list = NULL;
11151 : #endif
11152 : /* Send out pending IPIs on the offline CPU */
11153 0 : net_rps_send_ipi(remsd);
11154 :
11155 : /* Process offline CPU's input_pkt_queue */
11156 0 : while ((skb = __skb_dequeue(&oldsd->process_queue))) {
11157 0 : netif_rx_ni(skb);
11158 0 : input_queue_head_incr(oldsd);
11159 : }
11160 0 : while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
11161 0 : netif_rx_ni(skb);
11162 0 : input_queue_head_incr(oldsd);
11163 : }
11164 :
11165 0 : return 0;
11166 : }
11167 :
11168 : /**
11169 : * netdev_increment_features - increment feature set by one
11170 : * @all: current feature set
11171 : * @one: new feature set
11172 : * @mask: mask feature set
11173 : *
11174 : * Computes a new feature set after adding a device with feature set
11175 : * @one to the master device with current feature set @all. Will not
11176 : * enable anything that is off in @mask. Returns the new feature set.
11177 : */
11178 0 : netdev_features_t netdev_increment_features(netdev_features_t all,
11179 : netdev_features_t one, netdev_features_t mask)
11180 : {
11181 0 : if (mask & NETIF_F_HW_CSUM)
11182 0 : mask |= NETIF_F_CSUM_MASK;
11183 0 : mask |= NETIF_F_VLAN_CHALLENGED;
11184 :
11185 0 : all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
11186 0 : all &= one | ~NETIF_F_ALL_FOR_ALL;
11187 :
11188 : /* If one device supports hw checksumming, set for all. */
11189 0 : if (all & NETIF_F_HW_CSUM)
11190 0 : all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
11191 :
11192 0 : return all;
11193 : }
11194 : EXPORT_SYMBOL(netdev_increment_features);
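
/*
 * Illustrative sketch (editor's example, not part of dev.c): how an
 * aggregating device might fold two slaves' feature sets together. The
 * start value and mask below are illustrative only; real drivers (bonding,
 * team, bridge) apply their own feature masks. "my_combine_features" is an
 * invented name.
 */
#include <linux/netdev_features.h>
#include <linux/netdevice.h>

static netdev_features_t my_combine_features(const struct net_device *a,
					      const struct net_device *b)
{
	netdev_features_t all = NETIF_F_ALL_FOR_ALL;

	all = netdev_increment_features(all, a->features, NETIF_F_ONE_FOR_ALL);
	all = netdev_increment_features(all, b->features, NETIF_F_ONE_FOR_ALL);
	return all;
}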
11195 :
11196 2 : static struct hlist_head * __net_init netdev_create_hash(void)
11197 : {
11198 2 : int i;
11199 2 : struct hlist_head *hash;
11200 :
11201 2 : hash = kmalloc_array(NETDEV_HASHENTRIES, sizeof(*hash), GFP_KERNEL);
11202 2 : if (hash != NULL)
11203 514 : for (i = 0; i < NETDEV_HASHENTRIES; i++)
11204 512 : INIT_HLIST_HEAD(&hash[i]);
11205 :
11206 2 : return hash;
11207 : }
11208 :
11209 : /* Initialize per network namespace state */
11210 1 : static int __net_init netdev_init(struct net *net)
11211 : {
11212 1 : BUILD_BUG_ON(GRO_HASH_BUCKETS >
11213 : 8 * sizeof_field(struct napi_struct, gro_bitmask));
11214 :
11215 1 : if (net != &init_net)
11216 0 : INIT_LIST_HEAD(&net->dev_base_head);
11217 :
11218 1 : net->dev_name_head = netdev_create_hash();
11219 1 : if (net->dev_name_head == NULL)
11220 0 : goto err_name;
11221 :
11222 1 : net->dev_index_head = netdev_create_hash();
11223 1 : if (net->dev_index_head == NULL)
11224 0 : goto err_idx;
11225 :
11226 1 : RAW_INIT_NOTIFIER_HEAD(&net->netdev_chain);
11227 :
11228 1 : return 0;
11229 :
11230 0 : err_idx:
11231 0 : kfree(net->dev_name_head);
11232 : err_name:
11233 : return -ENOMEM;
11234 : }
11235 :
11236 : /**
11237 : * netdev_drivername - network driver for the device
11238 : * @dev: network device
11239 : *
11240 : * Determine network driver for device.
11241 : */
11242 0 : const char *netdev_drivername(const struct net_device *dev)
11243 : {
11244 0 : const struct device_driver *driver;
11245 0 : const struct device *parent;
11246 0 : const char *empty = "";
11247 :
11248 0 : parent = dev->dev.parent;
11249 0 : if (!parent)
11250 : return empty;
11251 :
11252 0 : driver = parent->driver;
11253 0 : if (driver && driver->name)
11254 0 : return driver->name;
11255 : return empty;
11256 : }
11257 :
11258 0 : static void __netdev_printk(const char *level, const struct net_device *dev,
11259 : struct va_format *vaf)
11260 : {
11261 0 : if (dev && dev->dev.parent) {
11262 0 : dev_printk_emit(level[1] - '0',
11263 0 : dev->dev.parent,
11264 : "%s %s %s%s: %pV",
11265 : dev_driver_string(dev->dev.parent),
11266 0 : dev_name(dev->dev.parent),
11267 : netdev_name(dev), netdev_reg_state(dev),
11268 : vaf);
11269 0 : } else if (dev) {
11270 0 : printk("%s%s%s: %pV",
11271 : level, netdev_name(dev), netdev_reg_state(dev), vaf);
11272 : } else {
11273 0 : printk("%s(NULL net_device): %pV", level, vaf);
11274 : }
11275 0 : }
11276 :
11277 0 : void netdev_printk(const char *level, const struct net_device *dev,
11278 : const char *format, ...)
11279 : {
11280 0 : struct va_format vaf;
11281 0 : va_list args;
11282 :
11283 0 : va_start(args, format);
11284 :
11285 0 : vaf.fmt = format;
11286 0 : vaf.va = &args;
11287 :
11288 0 : __netdev_printk(level, dev, &vaf);
11289 :
11290 0 : va_end(args);
11291 0 : }
11292 : EXPORT_SYMBOL(netdev_printk);
11293 :
11294 : #define define_netdev_printk_level(func, level) \
11295 : void func(const struct net_device *dev, const char *fmt, ...) \
11296 : { \
11297 : struct va_format vaf; \
11298 : va_list args; \
11299 : \
11300 : va_start(args, fmt); \
11301 : \
11302 : vaf.fmt = fmt; \
11303 : vaf.va = &args; \
11304 : \
11305 : __netdev_printk(level, dev, &vaf); \
11306 : \
11307 : va_end(args); \
11308 : } \
11309 : EXPORT_SYMBOL(func);
11310 :
11311 0 : define_netdev_printk_level(netdev_emerg, KERN_EMERG);
11312 0 : define_netdev_printk_level(netdev_alert, KERN_ALERT);
11313 0 : define_netdev_printk_level(netdev_crit, KERN_CRIT);
11314 0 : define_netdev_printk_level(netdev_err, KERN_ERR);
11315 0 : define_netdev_printk_level(netdev_warn, KERN_WARNING);
11316 0 : define_netdev_printk_level(netdev_notice, KERN_NOTICE);
11317 0 : define_netdev_printk_level(netdev_info, KERN_INFO);
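
/*
 * Illustrative sketch (editor's example, not part of dev.c): drivers
 * normally use the per-level wrappers generated above; __netdev_printk()
 * prefixes the message with the driver name, bus device name and interface
 * name. "my_report_link" is an invented name for this example.
 */
#include <linux/netdevice.h>

static void my_report_link(struct net_device *dev, bool up)
{
	if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}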
11318 :
11319 0 : static void __net_exit netdev_exit(struct net *net)
11320 : {
11321 0 : kfree(net->dev_name_head);
11322 0 : kfree(net->dev_index_head);
11323 0 : if (net != &init_net)
11324 0 : WARN_ON_ONCE(!list_empty(&net->dev_base_head));
11325 0 : }
11326 :
11327 : static struct pernet_operations __net_initdata netdev_net_ops = {
11328 : .init = netdev_init,
11329 : .exit = netdev_exit,
11330 : };
11331 :
11332 0 : static void __net_exit default_device_exit(struct net *net)
11333 : {
11334 0 : struct net_device *dev, *aux;
11335 : /*
11336 : * Push all migratable network devices back to the
11337 : * initial network namespace
11338 : */
11339 0 : rtnl_lock();
11340 0 : for_each_netdev_safe(net, dev, aux) {
11341 0 : int err;
11342 0 : char fb_name[IFNAMSIZ];
11343 :
11344 : /* Ignore unmovable devices (e.g. loopback) */
11345 0 : if (dev->features & NETIF_F_NETNS_LOCAL)
11346 0 : continue;
11347 :
11348 : /* Leave virtual devices for the generic cleanup */
11349 0 : if (dev->rtnl_link_ops)
11350 0 : continue;
11351 :
11352 : /* Push remaining network devices to init_net */
11353 0 : snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
11354 0 : if (__dev_get_by_name(&init_net, fb_name))
11355 0 : snprintf(fb_name, IFNAMSIZ, "dev%%d");
11356 0 : err = dev_change_net_namespace(dev, &init_net, fb_name);
11357 0 : if (err) {
11358 0 : pr_emerg("%s: failed to move %s to init_net: %d\n",
11359 : __func__, dev->name, err);
11360 0 : BUG();
11361 : }
11362 : }
11363 0 : rtnl_unlock();
11364 0 : }
11365 :
11366 0 : static void __net_exit rtnl_lock_unregistering(struct list_head *net_list)
11367 : {
11368 : /* Return with the rtnl_lock held when there are no network
11369 : * devices unregistering in any network namespace in net_list.
11370 : */
11371 0 : struct net *net;
11372 0 : bool unregistering;
11373 0 : DEFINE_WAIT_FUNC(wait, woken_wake_function);
11374 :
11375 0 : add_wait_queue(&netdev_unregistering_wq, &wait);
11376 0 : for (;;) {
11377 0 : unregistering = false;
11378 0 : rtnl_lock();
11379 0 : list_for_each_entry(net, net_list, exit_list) {
11380 0 : if (net->dev_unreg_count > 0) {
11381 : unregistering = true;
11382 : break;
11383 : }
11384 : }
11385 0 : if (!unregistering)
11386 : break;
11387 0 : __rtnl_unlock();
11388 :
11389 0 : wait_woken(&wait, TASK_UNINTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
11390 : }
11391 0 : remove_wait_queue(&netdev_unregistering_wq, &wait);
11392 0 : }
11393 :
11394 0 : static void __net_exit default_device_exit_batch(struct list_head *net_list)
11395 : {
11396 : /* At exit all network devices must be removed from a network
11397 : * namespace. Do this in the reverse order of registration.
11398 : * Do this across as many network namespaces as possible to
11399 : * improve batching efficiency.
11400 : */
11401 0 : struct net_device *dev;
11402 0 : struct net *net;
11403 0 : LIST_HEAD(dev_kill_list);
11404 :
11405 : /* To prevent network device cleanup code from dereferencing
11406 : * loopback devices or network devices that have been freed,
11407 : * wait here for all pending unregistrations to complete
11408 : * before unregistering the loopback device and allowing the
11409 : * network namespace to be freed.
11410 : *
11411 : * The netdev todo list containing all network device
11412 : * unregistrations that happen in default_device_exit_batch
11413 : * will run in the rtnl_unlock() at the end of
11414 : * default_device_exit_batch.
11415 : */
11416 0 : rtnl_lock_unregistering(net_list);
11417 0 : list_for_each_entry(net, net_list, exit_list) {
11418 0 : for_each_netdev_reverse(net, dev) {
11419 0 : if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink)
11420 0 : dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
11421 : else
11422 0 : unregister_netdevice_queue(dev, &dev_kill_list);
11423 : }
11424 : }
11425 0 : unregister_netdevice_many(&dev_kill_list);
11426 0 : rtnl_unlock();
11427 0 : }
11428 :
11429 : static struct pernet_operations __net_initdata default_device_ops = {
11430 : .exit = default_device_exit,
11431 : .exit_batch = default_device_exit_batch,
11432 : };
11433 :
11434 : /*
11435 : * Initialize the DEV module. At boot time this walks the device list and
11436 : * unhooks any devices that fail to initialise (normally hardware not
11437 : * present) and leaves us with a valid list of present and active devices.
11438 : *
11439 : */
11440 :
11441 : /*
11442 : * This is called single threaded during boot, so no need
11443 : * to take the rtnl semaphore.
11444 : */
11445 1 : static int __init net_dev_init(void)
11446 : {
11447 1 : int i, rc = -ENOMEM;
11448 :
11449 1 : BUG_ON(!dev_boot_phase);
11450 :
11451 1 : if (dev_proc_init())
11452 0 : goto out;
11453 :
11454 1 : if (netdev_kobject_init())
11455 0 : goto out;
11456 :
11457 1 : INIT_LIST_HEAD(&ptype_all);
11458 17 : for (i = 0; i < PTYPE_HASH_SIZE; i++)
11459 16 : INIT_LIST_HEAD(&ptype_base[i]);
11460 :
11461 1 : INIT_LIST_HEAD(&offload_base);
11462 :
11463 1 : if (register_pernet_subsys(&netdev_net_ops))
11464 0 : goto out;
11465 :
11466 : /*
11467 : * Initialise the packet receive queues.
11468 : */
11469 :
11470 5 : for_each_possible_cpu(i) {
11471 4 : struct work_struct *flush = per_cpu_ptr(&flush_works, i);
11472 4 : struct softnet_data *sd = &per_cpu(softnet_data, i);
11473 :
11474 4 : INIT_WORK(flush, flush_backlog);
11475 :
11476 4 : skb_queue_head_init(&sd->input_pkt_queue);
11477 4 : skb_queue_head_init(&sd->process_queue);
11478 : #ifdef CONFIG_XFRM_OFFLOAD
11479 : skb_queue_head_init(&sd->xfrm_backlog);
11480 : #endif
11481 4 : INIT_LIST_HEAD(&sd->poll_list);
11482 4 : sd->output_queue_tailp = &sd->output_queue;
11483 : #ifdef CONFIG_RPS
11484 4 : INIT_CSD(&sd->csd, rps_trigger_softirq, sd);
11485 4 : sd->cpu = i;
11486 : #endif
11487 :
11488 4 : init_gro_hash(&sd->backlog);
11489 4 : sd->backlog.poll = process_backlog;
11490 4 : sd->backlog.weight = weight_p;
11491 : }
11492 :
11493 1 : dev_boot_phase = 0;
11494 :
11495 : /* The loopback device is special: if any other network device
11496 : * is present in a network namespace, the loopback device must
11497 : * be present. Since we now dynamically allocate and free the
11498 : * loopback device, ensure this invariant is maintained by
11499 : * keeping the loopback device as the first device on the
11500 : * list of network devices, ensuring the loopback device
11501 : * is the first device that appears and the last network device
11502 : * that disappears.
11503 : */
11504 1 : if (register_pernet_device(&loopback_net_ops))
11505 0 : goto out;
11506 :
11507 1 : if (register_pernet_device(&default_device_ops))
11508 0 : goto out;
11509 :
11510 1 : open_softirq(NET_TX_SOFTIRQ, net_tx_action);
11511 1 : open_softirq(NET_RX_SOFTIRQ, net_rx_action);
11512 :
11513 1 : rc = cpuhp_setup_state_nocalls(CPUHP_NET_DEV_DEAD, "net/dev:dead",
11514 : NULL, dev_cpu_dead);
11515 1 : WARN_ON(rc < 0);
11516 : rc = 0;
11517 1 : out:
11518 1 : return rc;
11519 : }
11520 :
11521 : subsys_initcall(net_dev_init);
|