1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <linux/ipv6.h>
122 #include <linux/in.h>
123 #include <linux/jhash.h>
124 #include <linux/random.h>
125 #include <trace/events/napi.h>
126 #include <trace/events/net.h>
127 #include <trace/events/skb.h>
128 #include <linux/pci.h>
129 #include <linux/inetdevice.h>
130 #include <linux/cpu_rmap.h>
131 #include <linux/static_key.h>
132 #include <net/udp.h>
133 #include "net-sysfs.h"
134
135 #ifdef UDP_SKT_WIFI
136 #include <linux/ftrace_event.h>
137 #endif
138
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144
145 static DEFINE_SPINLOCK(ptype_lock);
146 static DEFINE_SPINLOCK(offload_lock);
147 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
148 struct list_head ptype_all __read_mostly; /* Taps */
149 static struct list_head offload_base __read_mostly;
150
151 /*
152 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
153 * semaphore.
154 *
155 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
156 *
157 * Writers must hold the rtnl semaphore while they loop through the
158 * dev_base_head list, and hold dev_base_lock for writing when they do the
159 * actual updates. This allows pure readers to access the list even
160 * while a writer is preparing to update it.
161 *
162 * To put it another way, dev_base_lock is held for writing only to
163 * protect against pure readers; the rtnl semaphore provides the
164 * protection against other writers.
165 *
166 * See, for example usages, register_netdevice() and
167 * unregister_netdevice(), which must be called with the rtnl
168 * semaphore held.
169 */
170 DEFINE_RWLOCK(dev_base_lock);
171 EXPORT_SYMBOL(dev_base_lock);
172
173 seqcount_t devnet_rename_seq;
174
175 static inline void dev_base_seq_inc(struct net *net)
176 {
177 while (++net->dev_base_seq == 0);
178 }
179
180 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
181 {
182 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
183
184 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
185 }
186
187 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
188 {
189 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
190 }
191
192 static inline void rps_lock(struct softnet_data *sd)
193 {
194 #ifdef CONFIG_RPS
195 spin_lock(&sd->input_pkt_queue.lock);
196 #endif
197 }
198
199 static inline void rps_unlock(struct softnet_data *sd)
200 {
201 #ifdef CONFIG_RPS
202 spin_unlock(&sd->input_pkt_queue.lock);
203 #endif
204 }
205
206 /* Device list insertion */
207 static void list_netdevice(struct net_device *dev)
208 {
209 struct net *net = dev_net(dev);
210
211 ASSERT_RTNL();
212
213 write_lock_bh(&dev_base_lock);
214 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
215 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
216 hlist_add_head_rcu(&dev->index_hlist,
217 dev_index_hash(net, dev->ifindex));
218 write_unlock_bh(&dev_base_lock);
219
220 dev_base_seq_inc(net);
221 }
222
223 /* Device list removal
224  * caller must respect an RCU grace period before freeing/reusing dev
225 */
226 static void unlist_netdevice(struct net_device *dev)
227 {
228 ASSERT_RTNL();
229
230 /* Unlink dev from the device chain */
231 write_lock_bh(&dev_base_lock);
232 list_del_rcu(&dev->dev_list);
233 hlist_del_rcu(&dev->name_hlist);
234 hlist_del_rcu(&dev->index_hlist);
235 write_unlock_bh(&dev_base_lock);
236
237 dev_base_seq_inc(dev_net(dev));
238 }
239
240 /*
241 * Our notifier list
242 */
243
244 static RAW_NOTIFIER_HEAD(netdev_chain);
245
246 /*
247 * Device drivers call our routines to queue packets here. We empty the
248 * queue in the local softnet handler.
249 */
250
251 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
252 EXPORT_PER_CPU_SYMBOL(softnet_data);
253
254 #ifdef CONFIG_LOCKDEP
255 /*
256 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
257 * according to dev->type
258 */
259 static const unsigned short netdev_lock_type[] =
260 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
261 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
262 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
263 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
264 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
265 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
266 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
267 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
268 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
269 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
270 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
271 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
272 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
273 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
274 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
275
276 static const char *const netdev_lock_name[] =
277 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
278 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
279 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
280 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
281 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
282 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
283 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
284 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
285 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
286 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
287 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
288 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
289 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
290 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
291 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
292
293 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
294 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
295
296 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
297 {
298 int i;
299
300 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
301 if (netdev_lock_type[i] == dev_type)
302 return i;
303 /* the last key is used by default */
304 return ARRAY_SIZE(netdev_lock_type) - 1;
305 }
306
307 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
308 unsigned short dev_type)
309 {
310 int i;
311
312 i = netdev_lock_pos(dev_type);
313 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
314 netdev_lock_name[i]);
315 }
316
317 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
318 {
319 int i;
320
321 i = netdev_lock_pos(dev->type);
322 lockdep_set_class_and_name(&dev->addr_list_lock,
323 &netdev_addr_lock_key[i],
324 netdev_lock_name[i]);
325 }
326 #else
327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328 unsigned short dev_type)
329 {
330 }
331 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
332 {
333 }
334 #endif
335
336 /*******************************************************************************
337
338 Protocol management and registration routines
339
340 *******************************************************************************/
341
342 /*
343 * Add a protocol ID to the list. Now that the input handler is
344 * smarter we can dispense with all the messy stuff that used to be
345 * here.
346 *
347 * BEWARE!!! Protocol handlers, mangling input packets,
348 * MUST BE last in hash buckets and checking protocol handlers
349 * MUST start from promiscuous ptype_all chain in net_bh.
350 * It is true now, do not change it.
351  * Explanation follows: if a protocol handler that mangles packets were
352  * first on the list, it would not be able to sense that the packet
353  * is cloned and should be copied-on-write, so it would
354  * change it and subsequent readers would get a broken packet.
355 * --ANK (980803)
356 */
357
358 static inline struct list_head *ptype_head(const struct packet_type *pt)
359 {
360 if (pt->type == htons(ETH_P_ALL))
361 return &ptype_all;
362 else
363 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
364 }
365
366 /**
367 * dev_add_pack - add packet handler
368 * @pt: packet type declaration
369 *
370 * Add a protocol handler to the networking stack. The passed &packet_type
371 * is linked into kernel lists and may not be freed until it has been
372 * removed from the kernel lists.
373 *
374  * This call does not sleep, therefore it cannot
375  * guarantee that all CPUs that are in the middle of receiving packets
376 * will see the new packet type (until the next received packet).
377 */
378
379 void dev_add_pack(struct packet_type *pt)
380 {
381 struct list_head *head = ptype_head(pt);
382
383 spin_lock(&ptype_lock);
384 list_add_rcu(&pt->list, head);
385 spin_unlock(&ptype_lock);
386 }
387 EXPORT_SYMBOL(dev_add_pack);
388
389 /**
390 * __dev_remove_pack - remove packet handler
391 * @pt: packet type declaration
392 *
393 * Remove a protocol handler that was previously added to the kernel
394 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
395 * from the kernel lists and can be freed or reused once this function
396 * returns.
397 *
398 * The packet type might still be in use by receivers
399  * and must not be freed until after all the CPUs have gone
400 * through a quiescent state.
401 */
402 void __dev_remove_pack(struct packet_type *pt)
403 {
404 struct list_head *head = ptype_head(pt);
405 struct packet_type *pt1;
406
407 spin_lock(&ptype_lock);
408
409 list_for_each_entry(pt1, head, list) {
410 if (pt == pt1) {
411 list_del_rcu(&pt->list);
412 goto out;
413 }
414 }
415
416 pr_warn("dev_remove_pack: %p not found\n", pt);
417 out:
418 spin_unlock(&ptype_lock);
419 }
420 EXPORT_SYMBOL(__dev_remove_pack);
421
422 /**
423 * dev_remove_pack - remove packet handler
424 * @pt: packet type declaration
425 *
426 * Remove a protocol handler that was previously added to the kernel
427 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
428 * from the kernel lists and can be freed or reused once this function
429 * returns.
430 *
431 * This call sleeps to guarantee that no CPU is looking at the packet
432 * type after return.
433 */
434 void dev_remove_pack(struct packet_type *pt)
435 {
436 __dev_remove_pack(pt);
437
438 synchronize_net();
439 }
440 EXPORT_SYMBOL(dev_remove_pack);
441
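/*
 * Illustrative sketch (not part of this file): a module tapping a
 * hypothetical EtherType 0x88b5 could register a handler roughly like
 * this; my_rcv() and the type value are assumptions for illustration only.
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type	= cpu_to_be16(0x88b5),
 *		.func	= my_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);	(in module init)
 *	dev_remove_pack(&my_ptype);	(in module exit; may sleep)
 */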
442
443 /**
444 * dev_add_offload - register offload handlers
445 * @po: protocol offload declaration
446 *
447 * Add protocol offload handlers to the networking stack. The passed
448 * &proto_offload is linked into kernel lists and may not be freed until
449 * it has been removed from the kernel lists.
450 *
451  * This call does not sleep, therefore it cannot
452  * guarantee that all CPUs that are in the middle of receiving packets
453 * will see the new offload handlers (until the next received packet).
454 */
455 void dev_add_offload(struct packet_offload *po)
456 {
457 struct list_head *head = &offload_base;
458
459 spin_lock(&offload_lock);
460 list_add_rcu(&po->list, head);
461 spin_unlock(&offload_lock);
462 }
463 EXPORT_SYMBOL(dev_add_offload);
464
465 /**
466 * __dev_remove_offload - remove offload handler
467 * @po: packet offload declaration
468 *
469 * Remove a protocol offload handler that was previously added to the
470 * kernel offload handlers by dev_add_offload(). The passed &offload_type
471 * is removed from the kernel lists and can be freed or reused once this
472 * function returns.
473 *
474 * The packet type might still be in use by receivers
475  * and must not be freed until after all the CPUs have gone
476 * through a quiescent state.
477 */
478 void __dev_remove_offload(struct packet_offload *po)
479 {
480 struct list_head *head = &offload_base;
481 struct packet_offload *po1;
482
483 spin_lock(&offload_lock);
484
485 list_for_each_entry(po1, head, list) {
486 if (po == po1) {
487 list_del_rcu(&po->list);
488 goto out;
489 }
490 }
491
492 pr_warn("dev_remove_offload: %p not found\n", po);
493 out:
494 spin_unlock(&offload_lock);
495 }
496 EXPORT_SYMBOL(__dev_remove_offload);
497
498 /**
499 * dev_remove_offload - remove packet offload handler
500 * @po: packet offload declaration
501 *
502 * Remove a packet offload handler that was previously added to the kernel
503 * offload handlers by dev_add_offload(). The passed &offload_type is
504 * removed from the kernel lists and can be freed or reused once this
505 * function returns.
506 *
507 * This call sleeps to guarantee that no CPU is looking at the packet
508 * type after return.
509 */
510 void dev_remove_offload(struct packet_offload *po)
511 {
512 __dev_remove_offload(po);
513
514 synchronize_net();
515 }
516 EXPORT_SYMBOL(dev_remove_offload);
517
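/*
 * Illustrative sketch (not part of this file): an L3 protocol typically
 * registers its GSO/GRO callbacks once at init time; the my_* callback
 * names below are assumptions for illustration only.
 *
 *	static struct packet_offload my_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.callbacks = {
 *			.gso_segment  = my_gso_segment,
 *			.gro_receive  = my_gro_receive,
 *			.gro_complete = my_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&my_offload);
 */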
518 /******************************************************************************
519
520 Device Boot-time Settings Routines
521
522 *******************************************************************************/
523
524 /* Boot time configuration table */
525 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
526
527 /**
528 * netdev_boot_setup_add - add new setup entry
529 * @name: name of the device
530 * @map: configured settings for the device
531 *
532 * Adds new setup entry to the dev_boot_setup list. The function
533  * returns 0 on error and 1 on success. This is a generic routine for
534  * all netdevices.
535 */
536 static int netdev_boot_setup_add(char *name, struct ifmap *map)
537 {
538 struct netdev_boot_setup *s;
539 int i;
540
541 s = dev_boot_setup;
542 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
543 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
544 memset(s[i].name, 0, sizeof(s[i].name));
545 strlcpy(s[i].name, name, IFNAMSIZ);
546 memcpy(&s[i].map, map, sizeof(s[i].map));
547 break;
548 }
549 }
550
551 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
552 }
553
554 /**
555 * netdev_boot_setup_check - check boot time settings
556 * @dev: the netdevice
557 *
558 * Check boot time settings for the device.
559 * The found settings are set for the device to be used
560 * later in the device probing.
561  * Returns 0 if no settings are found, 1 if they are.
562 */
563 int netdev_boot_setup_check(struct net_device *dev)
564 {
565 struct netdev_boot_setup *s = dev_boot_setup;
566 int i;
567
568 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
569 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
570 !strcmp(dev->name, s[i].name)) {
571 dev->irq = s[i].map.irq;
572 dev->base_addr = s[i].map.base_addr;
573 dev->mem_start = s[i].map.mem_start;
574 dev->mem_end = s[i].map.mem_end;
575 return 1;
576 }
577 }
578 return 0;
579 }
580 EXPORT_SYMBOL(netdev_boot_setup_check);
581
582
583 /**
584 * netdev_boot_base - get address from boot time settings
585 * @prefix: prefix for network device
586 * @unit: id for network device
587 *
588 * Check boot time settings for the base address of device.
589 * The found settings are set for the device to be used
590 * later in the device probing.
591 * Returns 0 if no settings found.
592 */
593 unsigned long netdev_boot_base(const char *prefix, int unit)
594 {
595 const struct netdev_boot_setup *s = dev_boot_setup;
596 char name[IFNAMSIZ];
597 int i;
598
599 sprintf(name, "%s%d", prefix, unit);
600
601 /*
602 * If device already registered then return base of 1
603 * to indicate not to probe for this interface
604 */
605 if (__dev_get_by_name(&init_net, name))
606 return 1;
607
608 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
609 if (!strcmp(name, s[i].name))
610 return s[i].map.base_addr;
611 return 0;
612 }
613
614 /*
615 * Saves at boot time configured settings for any netdevice.
616 */
617 int __init netdev_boot_setup(char *str)
618 {
619 int ints[5];
620 struct ifmap map;
621
622 str = get_options(str, ARRAY_SIZE(ints), ints);
623 if (!str || !*str)
624 return 0;
625
626 /* Save settings */
627 memset(&map, 0, sizeof(map));
628 if (ints[0] > 0)
629 map.irq = ints[1];
630 if (ints[0] > 1)
631 map.base_addr = ints[2];
632 if (ints[0] > 2)
633 map.mem_start = ints[3];
634 if (ints[0] > 3)
635 map.mem_end = ints[4];
636
637 /* Add new entry to the list */
638 return netdev_boot_setup_add(str, &map);
639 }
640
641 __setup("netdev=", netdev_boot_setup);
642
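/*
 * Illustrative sketch (not part of this file): the option parsed above
 * takes up to four integers (irq, base I/O address, memory start, memory
 * end) followed by the interface name, e.g. on the kernel command line
 * (the values are made up):
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * The entry is stored via netdev_boot_setup_add() and later applied by
 * netdev_boot_setup_check() during device probing.
 */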
643 /*******************************************************************************
644
645 Device Interface Subroutines
646
647 *******************************************************************************/
648
649 /**
650 * __dev_get_by_name - find a device by its name
651 * @net: the applicable net namespace
652 * @name: name to find
653 *
654 * Find an interface by name. Must be called under RTNL semaphore
655 * or @dev_base_lock. If the name is found a pointer to the device
656 * is returned. If the name is not found then %NULL is returned. The
657 * reference counters are not incremented so the caller must be
658 * careful with locks.
659 */
660
661 struct net_device *__dev_get_by_name(struct net *net, const char *name)
662 {
663 struct net_device *dev;
664 struct hlist_head *head = dev_name_hash(net, name);
665
666 hlist_for_each_entry(dev, head, name_hlist)
667 if (!strncmp(dev->name, name, IFNAMSIZ))
668 return dev;
669
670 return NULL;
671 }
672 EXPORT_SYMBOL(__dev_get_by_name);
673
674 /**
675 * dev_get_by_name_rcu - find a device by its name
676 * @net: the applicable net namespace
677 * @name: name to find
678 *
679 * Find an interface by name.
680 * If the name is found a pointer to the device is returned.
681 * If the name is not found then %NULL is returned.
682 * The reference counters are not incremented so the caller must be
683 * careful with locks. The caller must hold RCU lock.
684 */
685
686 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
687 {
688 struct net_device *dev;
689 struct hlist_head *head = dev_name_hash(net, name);
690
691 hlist_for_each_entry_rcu(dev, head, name_hlist)
692 if (!strncmp(dev->name, name, IFNAMSIZ))
693 return dev;
694
695 return NULL;
696 }
697 EXPORT_SYMBOL(dev_get_by_name_rcu);
698
699 /**
700 * dev_get_by_name - find a device by its name
701 * @net: the applicable net namespace
702 * @name: name to find
703 *
704 * Find an interface by name. This can be called from any
705 * context and does its own locking. The returned handle has
706 * the usage count incremented and the caller must use dev_put() to
707 * release it when it is no longer needed. %NULL is returned if no
708 * matching device is found.
709 */
710
711 struct net_device *dev_get_by_name(struct net *net, const char *name)
712 {
713 struct net_device *dev;
714
715 rcu_read_lock();
716 dev = dev_get_by_name_rcu(net, name);
717 if (dev)
718 dev_hold(dev);
719 rcu_read_unlock();
720 return dev;
721 }
722 EXPORT_SYMBOL(dev_get_by_name);
723
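/*
 * Illustrative sketch (not part of this file): typical lookup patterns,
 * assuming process context; "eth0" is just an example name.
 *
 *	struct net_device *dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);
 *	}
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev) {
 *		... dev is only valid inside this RCU section,
 *		    no reference has been taken ...
 *	}
 *	rcu_read_unlock();
 */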
724 /**
725 * __dev_get_by_index - find a device by its ifindex
726 * @net: the applicable net namespace
727 * @ifindex: index of device
728 *
729 * Search for an interface by index. Returns %NULL if the device
730 * is not found or a pointer to the device. The device has not
731 * had its reference counter increased so the caller must be careful
732 * about locking. The caller must hold either the RTNL semaphore
733 * or @dev_base_lock.
734 */
735
736 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
737 {
738 struct net_device *dev;
739 struct hlist_head *head = dev_index_hash(net, ifindex);
740
741 hlist_for_each_entry(dev, head, index_hlist)
742 if (dev->ifindex == ifindex)
743 return dev;
744
745 return NULL;
746 }
747 EXPORT_SYMBOL(__dev_get_by_index);
748
749 /**
750 * dev_get_by_index_rcu - find a device by its ifindex
751 * @net: the applicable net namespace
752 * @ifindex: index of device
753 *
754 * Search for an interface by index. Returns %NULL if the device
755 * is not found or a pointer to the device. The device has not
756 * had its reference counter increased so the caller must be careful
757 * about locking. The caller must hold RCU lock.
758 */
759
760 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
761 {
762 struct net_device *dev;
763 struct hlist_head *head = dev_index_hash(net, ifindex);
764
765 hlist_for_each_entry_rcu(dev, head, index_hlist)
766 if (dev->ifindex == ifindex)
767 return dev;
768
769 return NULL;
770 }
771 EXPORT_SYMBOL(dev_get_by_index_rcu);
772
773
774 /**
775 * dev_get_by_index - find a device by its ifindex
776 * @net: the applicable net namespace
777 * @ifindex: index of device
778 *
779 * Search for an interface by index. Returns NULL if the device
780 * is not found or a pointer to the device. The device returned has
781 * had a reference added and the pointer is safe until the user calls
782 * dev_put to indicate they have finished with it.
783 */
784
785 struct net_device *dev_get_by_index(struct net *net, int ifindex)
786 {
787 struct net_device *dev;
788
789 rcu_read_lock();
790 dev = dev_get_by_index_rcu(net, ifindex);
791 if (dev)
792 dev_hold(dev);
793 rcu_read_unlock();
794 return dev;
795 }
796 EXPORT_SYMBOL(dev_get_by_index);
797
798 /**
799 * netdev_get_name - get a netdevice name, knowing its ifindex.
800 * @net: network namespace
801 * @name: a pointer to the buffer where the name will be stored.
802 * @ifindex: the ifindex of the interface to get the name from.
803 *
804 * The use of raw_seqcount_begin() and cond_resched() before
805 * retrying is required as we want to give the writers a chance
806 * to complete when CONFIG_PREEMPT is not set.
807 */
808 int netdev_get_name(struct net *net, char *name, int ifindex)
809 {
810 struct net_device *dev;
811 unsigned int seq;
812
813 retry:
814 seq = raw_seqcount_begin(&devnet_rename_seq);
815 rcu_read_lock();
816 dev = dev_get_by_index_rcu(net, ifindex);
817 if (!dev) {
818 rcu_read_unlock();
819 return -ENODEV;
820 }
821
822 strcpy(name, dev->name);
823 rcu_read_unlock();
824 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
825 cond_resched();
826 goto retry;
827 }
828
829 return 0;
830 }
831
832 /**
833 * dev_getbyhwaddr_rcu - find a device by its hardware address
834 * @net: the applicable net namespace
835 * @type: media type of device
836 * @ha: hardware address
837 *
838 * Search for an interface by MAC address. Returns NULL if the device
839 * is not found or a pointer to the device.
840 * The caller must hold RCU or RTNL.
841 * The returned device has not had its ref count increased
842  * and the caller must therefore be careful about locking.
843 *
844 */
845
846 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
847 const char *ha)
848 {
849 struct net_device *dev;
850
851 for_each_netdev_rcu(net, dev)
852 if (dev->type == type &&
853 !memcmp(dev->dev_addr, ha, dev->addr_len))
854 return dev;
855
856 return NULL;
857 }
858 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
859
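/*
 * Illustrative sketch (not part of this file): looking up an Ethernet
 * device by MAC address; "addr" is an assumed ETH_ALEN byte array.
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, addr);
 *	if (dev)
 *		pr_info("%pM belongs to %s\n", addr, dev->name);
 *	rcu_read_unlock();
 */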
860 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
861 {
862 struct net_device *dev;
863
864 ASSERT_RTNL();
865 for_each_netdev(net, dev)
866 if (dev->type == type)
867 return dev;
868
869 return NULL;
870 }
871 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
872
873 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
874 {
875 struct net_device *dev, *ret = NULL;
876
877 rcu_read_lock();
878 for_each_netdev_rcu(net, dev)
879 if (dev->type == type) {
880 dev_hold(dev);
881 ret = dev;
882 break;
883 }
884 rcu_read_unlock();
885 return ret;
886 }
887 EXPORT_SYMBOL(dev_getfirstbyhwtype);
888
889 /**
890 * dev_get_by_flags_rcu - find any device with given flags
891 * @net: the applicable net namespace
892 * @if_flags: IFF_* values
893 * @mask: bitmask of bits in if_flags to check
894 *
895 * Search for any interface with the given flags. Returns NULL if a device
896 * is not found or a pointer to the device. Must be called inside
897 * rcu_read_lock(), and result refcount is unchanged.
898 */
899
900 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
901 unsigned short mask)
902 {
903 struct net_device *dev, *ret;
904
905 ret = NULL;
906 for_each_netdev_rcu(net, dev) {
907 if (((dev->flags ^ if_flags) & mask) == 0) {
908 ret = dev;
909 break;
910 }
911 }
912 return ret;
913 }
914 EXPORT_SYMBOL(dev_get_by_flags_rcu);
915
916 /**
917 * dev_valid_name - check if name is okay for network device
918 * @name: name string
919 *
920  * Network device names need to be valid file names
921  * to allow sysfs to work. We also disallow any kind of
922 * whitespace.
923 */
924 bool dev_valid_name(const char *name)
925 {
926 if (*name == '\0')
927 return false;
928 if (strlen(name) >= IFNAMSIZ)
929 return false;
930 if (!strcmp(name, ".") || !strcmp(name, ".."))
931 return false;
932
933 while (*name) {
934 if (*name == '/' || *name == ':' || isspace(*name))
935 return false;
936 name++;
937 }
938 return true;
939 }
940 EXPORT_SYMBOL(dev_valid_name);
941
942 /**
943 * __dev_alloc_name - allocate a name for a device
944 * @net: network namespace to allocate the device name in
945 * @name: name format string
946 * @buf: scratch buffer and result name string
947 *
948 * Passed a format string - eg "lt%d" it will try and find a suitable
949 * id. It scans list of devices to build up a free map, then chooses
950 * the first empty slot. The caller must hold the dev_base or rtnl lock
951 * while allocating the name and adding the device in order to avoid
952 * duplicates.
953 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
954 * Returns the number of the unit assigned or a negative errno code.
955 */
956
957 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
958 {
959 int i = 0;
960 const char *p;
961 const int max_netdevices = 8*PAGE_SIZE;
962 unsigned long *inuse;
963 struct net_device *d;
964
965 p = strnchr(name, IFNAMSIZ-1, '%');
966 if (p) {
967 /*
968 * Verify the string as this thing may have come from
969 * the user. There must be either one "%d" and no other "%"
970 * characters.
971 */
972 if (p[1] != 'd' || strchr(p + 2, '%'))
973 return -EINVAL;
974
975 /* Use one page as a bit array of possible slots */
976 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
977 if (!inuse)
978 return -ENOMEM;
979
980 for_each_netdev(net, d) {
981 if (!sscanf(d->name, name, &i))
982 continue;
983 if (i < 0 || i >= max_netdevices)
984 continue;
985
986 /* avoid cases where sscanf is not exact inverse of printf */
987 snprintf(buf, IFNAMSIZ, name, i);
988 if (!strncmp(buf, d->name, IFNAMSIZ))
989 set_bit(i, inuse);
990 }
991
992 i = find_first_zero_bit(inuse, max_netdevices);
993 free_page((unsigned long) inuse);
994 }
995
996 if (buf != name)
997 snprintf(buf, IFNAMSIZ, name, i);
998 if (!__dev_get_by_name(net, buf))
999 return i;
1000
1001 /* It is possible to run out of possible slots
1002 * when the name is long and there isn't enough space left
1003 * for the digits, or if all bits are used.
1004 */
1005 return -ENFILE;
1006 }
1007
1008 /**
1009 * dev_alloc_name - allocate a name for a device
1010 * @dev: device
1011 * @name: name format string
1012 *
1013 * Passed a format string - eg "lt%d" it will try and find a suitable
1014 * id. It scans list of devices to build up a free map, then chooses
1015 * the first empty slot. The caller must hold the dev_base or rtnl lock
1016 * while allocating the name and adding the device in order to avoid
1017 * duplicates.
1018 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1019 * Returns the number of the unit assigned or a negative errno code.
1020 */
1021
1022 int dev_alloc_name(struct net_device *dev, const char *name)
1023 {
1024 char buf[IFNAMSIZ];
1025 struct net *net;
1026 int ret;
1027
1028 BUG_ON(!dev_net(dev));
1029 net = dev_net(dev);
1030 ret = __dev_alloc_name(net, name, buf);
1031 if (ret >= 0)
1032 strlcpy(dev->name, buf, IFNAMSIZ);
1033 return ret;
1034 }
1035 EXPORT_SYMBOL(dev_alloc_name);
1036
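/*
 * Illustrative sketch (not part of this file): a driver wanting an
 * automatically numbered name would typically do, under RTNL:
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto fail;
 *	(dev->name is now e.g. "eth2" and err holds the unit number)
 */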
1037 static int dev_alloc_name_ns(struct net *net,
1038 struct net_device *dev,
1039 const char *name)
1040 {
1041 char buf[IFNAMSIZ];
1042 int ret;
1043
1044 ret = __dev_alloc_name(net, name, buf);
1045 if (ret >= 0)
1046 strlcpy(dev->name, buf, IFNAMSIZ);
1047 return ret;
1048 }
1049
1050 static int dev_get_valid_name(struct net *net,
1051 struct net_device *dev,
1052 const char *name)
1053 {
1054 BUG_ON(!net);
1055
1056 if (!dev_valid_name(name))
1057 return -EINVAL;
1058
1059 if (strchr(name, '%'))
1060 return dev_alloc_name_ns(net, dev, name);
1061 else if (__dev_get_by_name(net, name))
1062 return -EEXIST;
1063 else if (dev->name != name)
1064 strlcpy(dev->name, name, IFNAMSIZ);
1065
1066 return 0;
1067 }
1068
1069 /**
1070 * dev_change_name - change name of a device
1071 * @dev: device
1072 * @newname: name (or format string) must be at least IFNAMSIZ
1073 *
1074  * Change the name of a device; a format string such as "eth%d"
1075  * can be passed for wildcarding.
1076 */
1077 int dev_change_name(struct net_device *dev, const char *newname)
1078 {
1079 char oldname[IFNAMSIZ];
1080 int err = 0;
1081 int ret;
1082 struct net *net;
1083
1084 ASSERT_RTNL();
1085 BUG_ON(!dev_net(dev));
1086
1087 net = dev_net(dev);
1088 if (dev->flags & IFF_UP)
1089 return -EBUSY;
1090
1091 write_seqcount_begin(&devnet_rename_seq);
1092
1093 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1094 write_seqcount_end(&devnet_rename_seq);
1095 return 0;
1096 }
1097
1098 memcpy(oldname, dev->name, IFNAMSIZ);
1099
1100 err = dev_get_valid_name(net, dev, newname);
1101 if (err < 0) {
1102 write_seqcount_end(&devnet_rename_seq);
1103 return err;
1104 }
1105
1106 rollback:
1107 ret = device_rename(&dev->dev, dev->name);
1108 if (ret) {
1109 memcpy(dev->name, oldname, IFNAMSIZ);
1110 write_seqcount_end(&devnet_rename_seq);
1111 return ret;
1112 }
1113
1114 write_seqcount_end(&devnet_rename_seq);
1115
1116 write_lock_bh(&dev_base_lock);
1117 hlist_del_rcu(&dev->name_hlist);
1118 write_unlock_bh(&dev_base_lock);
1119
1120 synchronize_rcu();
1121
1122 write_lock_bh(&dev_base_lock);
1123 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1124 write_unlock_bh(&dev_base_lock);
1125
1126 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1127 ret = notifier_to_errno(ret);
1128
1129 if (ret) {
1130 /* err >= 0 after dev_alloc_name() or stores the first errno */
1131 if (err >= 0) {
1132 err = ret;
1133 write_seqcount_begin(&devnet_rename_seq);
1134 memcpy(dev->name, oldname, IFNAMSIZ);
1135 goto rollback;
1136 } else {
1137 pr_err("%s: name change rollback failed: %d\n",
1138 dev->name, ret);
1139 }
1140 }
1141
1142 return err;
1143 }
1144
1145 /**
1146 * dev_set_alias - change ifalias of a device
1147 * @dev: device
1148 * @alias: name up to IFALIASZ
1149 * @len: limit of bytes to copy from info
1150 *
1151 * Set ifalias for a device,
1152 */
1153 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1154 {
1155 char *new_ifalias;
1156
1157 ASSERT_RTNL();
1158
1159 if (len >= IFALIASZ)
1160 return -EINVAL;
1161
1162 if (!len) {
1163 kfree(dev->ifalias);
1164 dev->ifalias = NULL;
1165 return 0;
1166 }
1167
1168 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1169 if (!new_ifalias)
1170 return -ENOMEM;
1171 dev->ifalias = new_ifalias;
1172
1173 strlcpy(dev->ifalias, alias, len+1);
1174 return len;
1175 }
1176
1177
1178 /**
1179 * netdev_features_change - device changes features
1180 * @dev: device to cause notification
1181 *
1182 * Called to indicate a device has changed features.
1183 */
1184 void netdev_features_change(struct net_device *dev)
1185 {
1186 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1187 }
1188 EXPORT_SYMBOL(netdev_features_change);
1189
1190 /**
1191 * netdev_state_change - device changes state
1192 * @dev: device to cause notification
1193 *
1194 * Called to indicate a device has changed state. This function calls
1195 * the notifier chains for netdev_chain and sends a NEWLINK message
1196 * to the routing socket.
1197 */
1198 void netdev_state_change(struct net_device *dev)
1199 {
1200 if (dev->flags & IFF_UP) {
1201 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1202 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1203 }
1204 }
1205 EXPORT_SYMBOL(netdev_state_change);
1206
1207 /**
1208 * netdev_notify_peers - notify network peers about existence of @dev
1209 * @dev: network device
1210 *
1211 * Generate traffic such that interested network peers are aware of
1212 * @dev, such as by generating a gratuitous ARP. This may be used when
1213 * a device wants to inform the rest of the network about some sort of
1214 * reconfiguration such as a failover event or virtual machine
1215 * migration.
1216 */
1217 void netdev_notify_peers(struct net_device *dev)
1218 {
1219 rtnl_lock();
1220 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1221 rtnl_unlock();
1222 }
1223 EXPORT_SYMBOL(netdev_notify_peers);
1224
1225 static int __dev_open(struct net_device *dev)
1226 {
1227 const struct net_device_ops *ops = dev->netdev_ops;
1228 int ret;
1229
1230 ASSERT_RTNL();
1231
1232 if (!netif_device_present(dev))
1233 return -ENODEV;
1234
1235 /* Block netpoll from trying to do any rx path servicing.
1236 * If we don't do this there is a chance ndo_poll_controller
1237 * or ndo_poll may be running while we open the device
1238 */
1239 ret = netpoll_rx_disable(dev);
1240 if (ret)
1241 return ret;
1242
1243 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1244 ret = notifier_to_errno(ret);
1245 if (ret)
1246 return ret;
1247
1248 set_bit(__LINK_STATE_START, &dev->state);
1249
1250 if (ops->ndo_validate_addr)
1251 ret = ops->ndo_validate_addr(dev);
1252
1253 if (!ret && ops->ndo_open)
1254 ret = ops->ndo_open(dev);
1255
1256 netpoll_rx_enable(dev);
1257
1258 if (ret)
1259 clear_bit(__LINK_STATE_START, &dev->state);
1260 else {
1261 dev->flags |= IFF_UP;
1262 net_dmaengine_get();
1263 dev_set_rx_mode(dev);
1264 dev_activate(dev);
1265 add_device_randomness(dev->dev_addr, dev->addr_len);
1266 }
1267
1268 return ret;
1269 }
1270
1271 /**
1272 * dev_open - prepare an interface for use.
1273 * @dev: device to open
1274 *
1275 * Takes a device from down to up state. The device's private open
1276 * function is invoked and then the multicast lists are loaded. Finally
1277 * the device is moved into the up state and a %NETDEV_UP message is
1278 * sent to the netdev notifier chain.
1279 *
1280 * Calling this function on an active interface is a nop. On a failure
1281 * a negative errno code is returned.
1282 */
1283 int dev_open(struct net_device *dev)
1284 {
1285 int ret;
1286
1287 if (dev->flags & IFF_UP)
1288 return 0;
1289
1290 ret = __dev_open(dev);
1291 if (ret < 0)
1292 return ret;
1293
1294 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1295 call_netdevice_notifiers(NETDEV_UP, dev);
1296
1297 return ret;
1298 }
1299 EXPORT_SYMBOL(dev_open);
1300
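/*
 * Illustrative sketch (not part of this file): bringing an interface up
 * from kernel code; the RTNL lock must be held around the call.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */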
1301 static int __dev_close_many(struct list_head *head)
1302 {
1303 struct net_device *dev;
1304
1305 ASSERT_RTNL();
1306 might_sleep();
1307
1308 list_for_each_entry(dev, head, unreg_list) {
1309 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1310
1311 clear_bit(__LINK_STATE_START, &dev->state);
1312
1313 /* Synchronize to scheduled poll. We cannot touch poll list, it
1314 * can be even on different cpu. So just clear netif_running().
1315 *
1316  * dev->stop() will invoke napi_disable() on all of its
1317 * napi_struct instances on this device.
1318 */
1319 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1320 }
1321
1322 dev_deactivate_many(head);
1323
1324 list_for_each_entry(dev, head, unreg_list) {
1325 const struct net_device_ops *ops = dev->netdev_ops;
1326
1327 /*
1328 * Call the device specific close. This cannot fail.
1329 * Only if device is UP
1330 *
1331 * We allow it to be called even after a DETACH hot-plug
1332 * event.
1333 */
1334 if (ops->ndo_stop)
1335 ops->ndo_stop(dev);
1336
1337 dev->flags &= ~IFF_UP;
1338 net_dmaengine_put();
1339 }
1340
1341 return 0;
1342 }
1343
1344 static int __dev_close(struct net_device *dev)
1345 {
1346 int retval;
1347 LIST_HEAD(single);
1348
1349 /* Temporarily disable netpoll until the interface is down */
1350 retval = netpoll_rx_disable(dev);
1351 if (retval)
1352 return retval;
1353
1354 list_add(&dev->unreg_list, &single);
1355 retval = __dev_close_many(&single);
1356 list_del(&single);
1357
1358 netpoll_rx_enable(dev);
1359 return retval;
1360 }
1361
1362 static int dev_close_many(struct list_head *head)
1363 {
1364 struct net_device *dev, *tmp;
1365 LIST_HEAD(tmp_list);
1366
1367 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1368 if (!(dev->flags & IFF_UP))
1369 list_move(&dev->unreg_list, &tmp_list);
1370
1371 __dev_close_many(head);
1372
1373 list_for_each_entry(dev, head, unreg_list) {
1374 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1375 call_netdevice_notifiers(NETDEV_DOWN, dev);
1376 }
1377
1378 /* rollback_registered_many needs the complete original list */
1379 list_splice(&tmp_list, head);
1380 return 0;
1381 }
1382
1383 /**
1384 * dev_close - shutdown an interface.
1385 * @dev: device to shutdown
1386 *
1387 * This function moves an active device into down state. A
1388 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1389 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1390 * chain.
1391 */
1392 int dev_close(struct net_device *dev)
1393 {
1394 int ret = 0;
1395 if (dev->flags & IFF_UP) {
1396 LIST_HEAD(single);
1397
1398 /* Block netpoll rx while the interface is going down */
1399 ret = netpoll_rx_disable(dev);
1400 if (ret)
1401 return ret;
1402
1403 list_add(&dev->unreg_list, &single);
1404 dev_close_many(&single);
1405 list_del(&single);
1406
1407 netpoll_rx_enable(dev);
1408 }
1409 return ret;
1410 }
1411 EXPORT_SYMBOL(dev_close);
1412
1413
1414 /**
1415 * dev_disable_lro - disable Large Receive Offload on a device
1416 * @dev: device
1417 *
1418 * Disable Large Receive Offload (LRO) on a net device. Must be
1419 * called under RTNL. This is needed if received packets may be
1420 * forwarded to another interface.
1421 */
1422 void dev_disable_lro(struct net_device *dev)
1423 {
1424 /*
1425 * If we're trying to disable lro on a vlan device
1426 * use the underlying physical device instead
1427 */
1428 if (is_vlan_dev(dev))
1429 dev = vlan_dev_real_dev(dev);
1430
1431 dev->wanted_features &= ~NETIF_F_LRO;
1432 netdev_update_features(dev);
1433
1434 if (unlikely(dev->features & NETIF_F_LRO))
1435 netdev_WARN(dev, "failed to disable LRO!\n");
1436 }
1437 EXPORT_SYMBOL(dev_disable_lro);
1438
1439
1440 static int dev_boot_phase = 1;
1441
1442 /**
1443 * register_netdevice_notifier - register a network notifier block
1444 * @nb: notifier
1445 *
1446 * Register a notifier to be called when network device events occur.
1447 * The notifier passed is linked into the kernel structures and must
1448 * not be reused until it has been unregistered. A negative errno code
1449 * is returned on a failure.
1450 *
1451 * When registered all registration and up events are replayed
1452  * to the new notifier so that it has a race-free
1453 * view of the network device list.
1454 */
1455
1456 int register_netdevice_notifier(struct notifier_block *nb)
1457 {
1458 struct net_device *dev;
1459 struct net_device *last;
1460 struct net *net;
1461 int err;
1462
1463 rtnl_lock();
1464 err = raw_notifier_chain_register(&netdev_chain, nb);
1465 if (err)
1466 goto unlock;
1467 if (dev_boot_phase)
1468 goto unlock;
1469 for_each_net(net) {
1470 for_each_netdev(net, dev) {
1471 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1472 err = notifier_to_errno(err);
1473 if (err)
1474 goto rollback;
1475
1476 if (!(dev->flags & IFF_UP))
1477 continue;
1478
1479 nb->notifier_call(nb, NETDEV_UP, dev);
1480 }
1481 }
1482
1483 unlock:
1484 rtnl_unlock();
1485 return err;
1486
1487 rollback:
1488 last = dev;
1489 for_each_net(net) {
1490 for_each_netdev(net, dev) {
1491 if (dev == last)
1492 goto outroll;
1493
1494 if (dev->flags & IFF_UP) {
1495 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1496 nb->notifier_call(nb, NETDEV_DOWN, dev);
1497 }
1498 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1499 }
1500 }
1501
1502 outroll:
1503 raw_notifier_chain_unregister(&netdev_chain, nb);
1504 goto unlock;
1505 }
1506 EXPORT_SYMBOL(register_netdevice_notifier);
1507
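/*
 * Illustrative sketch (not part of this file): a minimal notifier;
 * my_netdev_event() is an assumed name. In this kernel the notifier is
 * handed the struct net_device pointer directly.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);
 *	...
 *	unregister_netdevice_notifier(&my_nb);
 */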
1508 /**
1509 * unregister_netdevice_notifier - unregister a network notifier block
1510 * @nb: notifier
1511 *
1512 * Unregister a notifier previously registered by
1513  * register_netdevice_notifier(). The notifier is unlinked from the
1514 * kernel structures and may then be reused. A negative errno code
1515 * is returned on a failure.
1516 *
1517  * After unregistering, unregister and down device events are synthesized
1518 * for all devices on the device list to the removed notifier to remove
1519 * the need for special case cleanup code.
1520 */
1521
1522 int unregister_netdevice_notifier(struct notifier_block *nb)
1523 {
1524 struct net_device *dev;
1525 struct net *net;
1526 int err;
1527
1528 rtnl_lock();
1529 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1530 if (err)
1531 goto unlock;
1532
1533 for_each_net(net) {
1534 for_each_netdev(net, dev) {
1535 if (dev->flags & IFF_UP) {
1536 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1537 nb->notifier_call(nb, NETDEV_DOWN, dev);
1538 }
1539 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1540 }
1541 }
1542 unlock:
1543 rtnl_unlock();
1544 return err;
1545 }
1546 EXPORT_SYMBOL(unregister_netdevice_notifier);
1547
1548 /**
1549 * call_netdevice_notifiers - call all network notifier blocks
1550 * @val: value passed unmodified to notifier function
1551 * @dev: net_device pointer passed unmodified to notifier function
1552 *
1553 * Call all network notifier blocks. Parameters and return value
1554 * are as for raw_notifier_call_chain().
1555 */
1556
1557 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1558 {
1559 ASSERT_RTNL();
1560 return raw_notifier_call_chain(&netdev_chain, val, dev);
1561 }
1562 EXPORT_SYMBOL(call_netdevice_notifiers);
1563
1564 static struct static_key netstamp_needed __read_mostly;
1565 #ifdef HAVE_JUMP_LABEL
1566 static atomic_t netstamp_needed_deferred;
1567 static atomic_t netstamp_wanted;
1568 static void netstamp_clear(struct work_struct *work)
1569 {
1570 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1571 int wanted;
1572
1573 wanted = atomic_add_return(deferred, &netstamp_wanted);
1574 if (wanted > 0)
1575 static_key_enable(&netstamp_needed);
1576 else
1577 static_key_disable(&netstamp_needed);
1578 }
1579 static DECLARE_WORK(netstamp_work, netstamp_clear);
1580 #endif
1581
1582 void net_enable_timestamp(void)
1583 {
1584 #ifdef HAVE_JUMP_LABEL
1585 int wanted;
1586
1587 while (1) {
1588 wanted = atomic_read(&netstamp_wanted);
1589 if (wanted <= 0)
1590 break;
1591 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1592 return;
1593 }
1594 atomic_inc(&netstamp_needed_deferred);
1595 schedule_work(&netstamp_work);
1596 #else
1597 static_key_slow_inc(&netstamp_needed);
1598 #endif
1599 }
1600 EXPORT_SYMBOL(net_enable_timestamp);
1601
1602 void net_disable_timestamp(void)
1603 {
1604 #ifdef HAVE_JUMP_LABEL
1605 int wanted;
1606
1607 while (1) {
1608 wanted = atomic_read(&netstamp_wanted);
1609 if (wanted <= 1)
1610 break;
1611 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1612 return;
1613 }
1614 atomic_dec(&netstamp_needed_deferred);
1615 schedule_work(&netstamp_work);
1616 #else
1617 static_key_slow_dec(&netstamp_needed);
1618 #endif
1619 }
1620 EXPORT_SYMBOL(net_disable_timestamp);
1621
1622 static inline void net_timestamp_set(struct sk_buff *skb)
1623 {
1624 skb->tstamp.tv64 = 0;
1625 if (static_key_false(&netstamp_needed))
1626 __net_timestamp(skb);
1627 }
1628
1629 #define net_timestamp_check(COND, SKB) \
1630 if (static_key_false(&netstamp_needed)) { \
1631 if ((COND) && !(SKB)->tstamp.tv64) \
1632 __net_timestamp(SKB); \
1633 } \
1634
1635 static inline bool is_skb_forwardable(struct net_device *dev,
1636 struct sk_buff *skb)
1637 {
1638 unsigned int len;
1639
1640 if (!(dev->flags & IFF_UP))
1641 return false;
1642
1643 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1644 if (skb->len <= len)
1645 return true;
1646
1647 /* if TSO is enabled, we don't care about the length as the packet
1648 * could be forwarded without being segmented before
1649 */
1650 if (skb_is_gso(skb))
1651 return true;
1652
1653 return false;
1654 }
1655
1656 /**
1657 * dev_forward_skb - loopback an skb to another netif
1658 *
1659 * @dev: destination network device
1660 * @skb: buffer to forward
1661 *
1662 * return values:
1663 * NET_RX_SUCCESS (no congestion)
1664 * NET_RX_DROP (packet was dropped, but freed)
1665 *
1666 * dev_forward_skb can be used for injecting an skb from the
1667 * start_xmit function of one device into the receive queue
1668 * of another device.
1669 *
1670 * The receiving device may be in another namespace, so
1671 * we have to clear all information in the skb that could
1672 * impact namespace isolation.
1673 */
1674 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1675 {
1676 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1677 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1678 atomic_long_inc(&dev->rx_dropped);
1679 kfree_skb(skb);
1680 return NET_RX_DROP;
1681 }
1682 }
1683
1684 skb_orphan(skb);
1685
1686 if (unlikely(!is_skb_forwardable(dev, skb))) {
1687 atomic_long_inc(&dev->rx_dropped);
1688 kfree_skb(skb);
1689 return NET_RX_DROP;
1690 }
1691 skb->skb_iif = 0;
1692 skb->dev = dev;
1693 skb_dst_drop(skb);
1694 skb->tstamp.tv64 = 0;
1695 skb->pkt_type = PACKET_HOST;
1696 skb->protocol = eth_type_trans(skb, dev);
1697 skb->mark = 0;
1698 secpath_reset(skb);
1699 nf_reset(skb);
1700 nf_reset_trace(skb);
1701 return netif_rx(skb);
1702 }
1703 EXPORT_SYMBOL_GPL(dev_forward_skb);
1704
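/*
 * Illustrative sketch (not part of this file): how a veth-style virtual
 * device might hand a frame to its peer from ndo_start_xmit(); "peer"
 * and my_get_peer() are assumptions kept by such a driver.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
 *			dev->stats.tx_packets++;
 *		else
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */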
1705 static inline int deliver_skb(struct sk_buff *skb,
1706 struct packet_type *pt_prev,
1707 struct net_device *orig_dev)
1708 {
1709 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1710 return -ENOMEM;
1711 atomic_inc(&skb->users);
1712 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1713 }
1714
1715 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1716 {
1717 if (!ptype->af_packet_priv || !skb->sk)
1718 return false;
1719
1720 if (ptype->id_match)
1721 return ptype->id_match(ptype, skb->sk);
1722 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1723 return true;
1724
1725 return false;
1726 }
1727
1728 /*
1729 * Support routine. Sends outgoing frames to any network
1730 * taps currently in use.
1731 */
1732
1733 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1734 {
1735 struct packet_type *ptype;
1736 struct sk_buff *skb2 = NULL;
1737 struct packet_type *pt_prev = NULL;
1738
1739 rcu_read_lock();
1740 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1741 /* Never send packets back to the socket
1742 * they originated from - MvS (miquels@drinkel.ow.org)
1743 */
1744 if ((ptype->dev == dev || !ptype->dev) &&
1745 (!skb_loop_sk(ptype, skb))) {
1746 if (pt_prev) {
1747 deliver_skb(skb2, pt_prev, skb->dev);
1748 pt_prev = ptype;
1749 continue;
1750 }
1751
1752 skb2 = skb_clone(skb, GFP_ATOMIC);
1753 if (!skb2)
1754 break;
1755
1756 net_timestamp_set(skb2);
1757
1758 /* skb->nh should be correctly
1759 set by sender, so that the second statement is
1760 just protection against buggy protocols.
1761 */
1762 skb_reset_mac_header(skb2);
1763
1764 if (skb_network_header(skb2) < skb2->data ||
1765 skb2->network_header > skb2->tail) {
1766 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1767 ntohs(skb2->protocol),
1768 dev->name);
1769 skb_reset_network_header(skb2);
1770 }
1771
1772 skb2->transport_header = skb2->network_header;
1773 skb2->pkt_type = PACKET_OUTGOING;
1774 pt_prev = ptype;
1775 }
1776 }
1777 if (pt_prev)
1778 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1779 rcu_read_unlock();
1780 }
1781
1782 /**
1783 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1784 * @dev: Network device
1785 * @txq: number of queues available
1786 *
1787 * If real_num_tx_queues is changed the tc mappings may no longer be
1788  * valid. To resolve this verify the tc mapping remains valid and if
1789  * not, NULL the mapping. With no priorities mapping to this
1790  * offset/count pair it will no longer be used. In the worst case, if TC0
1791  * is invalid, nothing can be done, so disable priority mappings. It is
1792  * expected that drivers will fix this mapping if they can before
1793 * calling netif_set_real_num_tx_queues.
1794 */
1795 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1796 {
1797 int i;
1798 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1799
1800 /* If TC0 is invalidated disable TC mapping */
1801 if (tc->offset + tc->count > txq) {
1802 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1803 dev->num_tc = 0;
1804 return;
1805 }
1806
1807 /* Invalidated prio to tc mappings set to TC0 */
1808 for (i = 1; i < TC_BITMASK + 1; i++) {
1809 int q = netdev_get_prio_tc_map(dev, i);
1810
1811 tc = &dev->tc_to_txq[q];
1812 if (tc->offset + tc->count > txq) {
1813 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1814 i, q);
1815 netdev_set_prio_tc_map(dev, i, 0);
1816 }
1817 }
1818 }
1819
1820 #ifdef CONFIG_XPS
1821 static DEFINE_MUTEX(xps_map_mutex);
1822 #define xmap_dereference(P) \
1823 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1824
1825 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1826 int cpu, u16 index)
1827 {
1828 struct xps_map *map = NULL;
1829 int pos;
1830
1831 if (dev_maps)
1832 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1833
1834 for (pos = 0; map && pos < map->len; pos++) {
1835 if (map->queues[pos] == index) {
1836 if (map->len > 1) {
1837 map->queues[pos] = map->queues[--map->len];
1838 } else {
1839 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1840 kfree_rcu(map, rcu);
1841 map = NULL;
1842 }
1843 break;
1844 }
1845 }
1846
1847 return map;
1848 }
1849
1850 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1851 {
1852 struct xps_dev_maps *dev_maps;
1853 int cpu, i;
1854 bool active = false;
1855
1856 mutex_lock(&xps_map_mutex);
1857 dev_maps = xmap_dereference(dev->xps_maps);
1858
1859 if (!dev_maps)
1860 goto out_no_maps;
1861
1862 for_each_possible_cpu(cpu) {
1863 for (i = index; i < dev->num_tx_queues; i++) {
1864 if (!remove_xps_queue(dev_maps, cpu, i))
1865 break;
1866 }
1867 if (i == dev->num_tx_queues)
1868 active = true;
1869 }
1870
1871 if (!active) {
1872 RCU_INIT_POINTER(dev->xps_maps, NULL);
1873 kfree_rcu(dev_maps, rcu);
1874 }
1875
1876 for (i = index; i < dev->num_tx_queues; i++)
1877 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1878 NUMA_NO_NODE);
1879
1880 out_no_maps:
1881 mutex_unlock(&xps_map_mutex);
1882 }
1883
1884 static struct xps_map *expand_xps_map(struct xps_map *map,
1885 int cpu, u16 index)
1886 {
1887 struct xps_map *new_map;
1888 int alloc_len = XPS_MIN_MAP_ALLOC;
1889 int i, pos;
1890
1891 for (pos = 0; map && pos < map->len; pos++) {
1892 if (map->queues[pos] != index)
1893 continue;
1894 return map;
1895 }
1896
1897 /* Need to add queue to this CPU's existing map */
1898 if (map) {
1899 if (pos < map->alloc_len)
1900 return map;
1901
1902 alloc_len = map->alloc_len * 2;
1903 }
1904
1905 /* Need to allocate new map to store queue on this CPU's map */
1906 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1907 cpu_to_node(cpu));
1908 if (!new_map)
1909 return NULL;
1910
1911 for (i = 0; i < pos; i++)
1912 new_map->queues[i] = map->queues[i];
1913 new_map->alloc_len = alloc_len;
1914 new_map->len = pos;
1915
1916 return new_map;
1917 }
1918
1919 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1920 {
1921 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1922 struct xps_map *map, *new_map;
1923 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1924 int cpu, numa_node_id = -2;
1925 bool active = false;
1926
1927 mutex_lock(&xps_map_mutex);
1928
1929 dev_maps = xmap_dereference(dev->xps_maps);
1930
1931 /* allocate memory for queue storage */
1932 for_each_online_cpu(cpu) {
1933 if (!cpumask_test_cpu(cpu, mask))
1934 continue;
1935
1936 if (!new_dev_maps)
1937 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1938 if (!new_dev_maps) {
1939 mutex_unlock(&xps_map_mutex);
1940 return -ENOMEM;
1941 }
1942
1943 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1944 NULL;
1945
1946 map = expand_xps_map(map, cpu, index);
1947 if (!map)
1948 goto error;
1949
1950 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1951 }
1952
1953 if (!new_dev_maps)
1954 goto out_no_new_maps;
1955
1956 for_each_possible_cpu(cpu) {
1957 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1958 /* add queue to CPU maps */
1959 int pos = 0;
1960
1961 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1962 while ((pos < map->len) && (map->queues[pos] != index))
1963 pos++;
1964
1965 if (pos == map->len)
1966 map->queues[map->len++] = index;
1967 #ifdef CONFIG_NUMA
1968 if (numa_node_id == -2)
1969 numa_node_id = cpu_to_node(cpu);
1970 else if (numa_node_id != cpu_to_node(cpu))
1971 numa_node_id = -1;
1972 #endif
1973 } else if (dev_maps) {
1974 /* fill in the new device map from the old device map */
1975 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1976 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1977 }
1978
1979 }
1980
1981 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1982
1983 /* Cleanup old maps */
1984 if (dev_maps) {
1985 for_each_possible_cpu(cpu) {
1986 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1987 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1988 if (map && map != new_map)
1989 kfree_rcu(map, rcu);
1990 }
1991
1992 kfree_rcu(dev_maps, rcu);
1993 }
1994
1995 dev_maps = new_dev_maps;
1996 active = true;
1997
1998 out_no_new_maps:
1999 /* update Tx queue numa node */
2000 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2001 (numa_node_id >= 0) ? numa_node_id :
2002 NUMA_NO_NODE);
2003
2004 if (!dev_maps)
2005 goto out_no_maps;
2006
2007 	/* remove the queue from any CPUs not in the new mask */
2008 for_each_possible_cpu(cpu) {
2009 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2010 continue;
2011
2012 if (remove_xps_queue(dev_maps, cpu, index))
2013 active = true;
2014 }
2015
2016 /* free map if not active */
2017 if (!active) {
2018 RCU_INIT_POINTER(dev->xps_maps, NULL);
2019 kfree_rcu(dev_maps, rcu);
2020 }
2021
2022 out_no_maps:
2023 mutex_unlock(&xps_map_mutex);
2024
2025 return 0;
2026 error:
2027 /* remove any maps that we added */
2028 for_each_possible_cpu(cpu) {
2029 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2030 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2031 NULL;
2032 if (new_map && new_map != map)
2033 kfree(new_map);
2034 }
2035
2036 mutex_unlock(&xps_map_mutex);
2037
2038 kfree(new_dev_maps);
2039 return -ENOMEM;
2040 }
2041 EXPORT_SYMBOL(netif_set_xps_queue);
2042
2043 #endif
2044 /*
2045 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2046  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2047 */
2048 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2049 {
2050 int rc;
2051
2052 if (txq < 1 || txq > dev->num_tx_queues)
2053 return -EINVAL;
2054
2055 if (dev->reg_state == NETREG_REGISTERED ||
2056 dev->reg_state == NETREG_UNREGISTERING) {
2057 ASSERT_RTNL();
2058
2059 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2060 txq);
2061 if (rc)
2062 return rc;
2063
2064 if (dev->num_tc)
2065 netif_setup_tc(dev, txq);
2066
2067 if (txq < dev->real_num_tx_queues) {
2068 qdisc_reset_all_tx_gt(dev, txq);
2069 #ifdef CONFIG_XPS
2070 netif_reset_xps_queues_gt(dev, txq);
2071 #endif
2072 }
2073 }
2074
2075 dev->real_num_tx_queues = txq;
2076 return 0;
2077 }
2078 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2079
2080 #ifdef CONFIG_RPS
2081 /**
2082 * netif_set_real_num_rx_queues - set actual number of RX queues used
2083 * @dev: Network device
2084 * @rxq: Actual number of RX queues
2085 *
2086 * This must be called either with the rtnl_lock held or before
2087 * registration of the net device. Returns 0 on success, or a
2088 * negative error code. If called before registration, it always
2089 * succeeds.
2090 */
2091 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2092 {
2093 int rc;
2094
2095 if (rxq < 1 || rxq > dev->num_rx_queues)
2096 return -EINVAL;
2097
2098 if (dev->reg_state == NETREG_REGISTERED) {
2099 ASSERT_RTNL();
2100
2101 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2102 rxq);
2103 if (rc)
2104 return rc;
2105 }
2106
2107 dev->real_num_rx_queues = rxq;
2108 return 0;
2109 }
2110 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2111 #endif
2112
2113 /**
2114 * netif_get_num_default_rss_queues - default number of RSS queues
2115 *
2116 * This routine should set an upper limit on the number of RSS queues
2117 * used by default by multiqueue devices.
2118 */
2119 int netif_get_num_default_rss_queues(void)
2120 {
2121 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2122 }
2123 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
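
/*
 * Illustrative (hypothetical) driver usage: a multiqueue driver would
 * typically clamp its queue count with this helper, e.g.
 *
 *	queues = min_t(int, hw_max_queues,
 *		       netif_get_num_default_rss_queues());
 *	netif_set_real_num_tx_queues(dev, queues);
 *
 * where hw_max_queues stands for whatever the hardware supports.
 */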
2124
2125 static inline void __netif_reschedule(struct Qdisc *q)
2126 {
2127 struct softnet_data *sd;
2128 unsigned long flags;
2129
2130 local_irq_save(flags);
2131 sd = &__get_cpu_var(softnet_data);
2132 q->next_sched = NULL;
2133 *sd->output_queue_tailp = q;
2134 sd->output_queue_tailp = &q->next_sched;
2135 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2136 local_irq_restore(flags);
2137 }
2138
2139 void __netif_schedule(struct Qdisc *q)
2140 {
2141 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2142 __netif_reschedule(q);
2143 }
2144 EXPORT_SYMBOL(__netif_schedule);
2145
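/*
 * dev_kfree_skb_irq - free an skb from hardirq context.  The skb is placed
 * on the per-CPU completion queue and actually freed later by
 * net_tx_action() in softirq context.
 */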
2146 void dev_kfree_skb_irq(struct sk_buff *skb)
2147 {
2148 if (atomic_dec_and_test(&skb->users)) {
2149 struct softnet_data *sd;
2150 unsigned long flags;
2151
2152 local_irq_save(flags);
2153 sd = &__get_cpu_var(softnet_data);
2154 skb->next = sd->completion_queue;
2155 sd->completion_queue = skb;
2156 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2157 local_irq_restore(flags);
2158 }
2159 }
2160 EXPORT_SYMBOL(dev_kfree_skb_irq);
2161
2162 void dev_kfree_skb_any(struct sk_buff *skb)
2163 {
2164 if (in_irq() || irqs_disabled())
2165 dev_kfree_skb_irq(skb);
2166 else
2167 dev_kfree_skb(skb);
2168 }
2169 EXPORT_SYMBOL(dev_kfree_skb_any);
2170
2171
2172 /**
2173 * netif_device_detach - mark device as removed
2174 * @dev: network device
2175 *
2176 * Mark device as removed from system and therefore no longer available.
2177 */
2178 void netif_device_detach(struct net_device *dev)
2179 {
2180 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2181 netif_running(dev)) {
2182 netif_tx_stop_all_queues(dev);
2183 }
2184 }
2185 EXPORT_SYMBOL(netif_device_detach);
2186
2187 /**
2188 * netif_device_attach - mark device as attached
2189 * @dev: network device
2190 *
2191  * Mark device as attached to the system and restart it if needed.
2192 */
2193 void netif_device_attach(struct net_device *dev)
2194 {
2195 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2196 netif_running(dev)) {
2197 netif_tx_wake_all_queues(dev);
2198 __netdev_watchdog_up(dev);
2199 }
2200 }
2201 EXPORT_SYMBOL(netif_device_attach);
2202
2203 static void skb_warn_bad_offload(const struct sk_buff *skb)
2204 {
2205 static const netdev_features_t null_features = 0;
2206 struct net_device *dev = skb->dev;
2207 const char *driver = "";
2208
2209 if (!net_ratelimit())
2210 return;
2211
2212 if (dev && dev->dev.parent)
2213 driver = dev_driver_string(dev->dev.parent);
2214
2215 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2216 "gso_type=%d ip_summed=%d\n",
2217 driver, dev ? &dev->features : &null_features,
2218 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2219 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2220 skb_shinfo(skb)->gso_type, skb->ip_summed);
2221 }
2222
2223 /*
2224 * Invalidate hardware checksum when packet is to be mangled, and
2225 * complete checksum manually on outgoing path.
2226 */
2227 int skb_checksum_help(struct sk_buff *skb)
2228 {
2229 __wsum csum;
2230 int ret = 0, offset;
2231
2232 if (skb->ip_summed == CHECKSUM_COMPLETE)
2233 goto out_set_summed;
2234
2235 if (unlikely(skb_shinfo(skb)->gso_size)) {
2236 skb_warn_bad_offload(skb);
2237 return -EINVAL;
2238 }
2239
2240 /* Before computing a checksum, we should make sure no frag could
2241 	 * be modified by an external entity: the checksum could be wrong.
2242 */
2243 if (skb_has_shared_frag(skb)) {
2244 ret = __skb_linearize(skb);
2245 if (ret)
2246 goto out;
2247 }
2248
2249 offset = skb_checksum_start_offset(skb);
2250 BUG_ON(offset >= skb_headlen(skb));
2251 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2252
2253 offset += skb->csum_offset;
2254 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2255
2256 if (skb_cloned(skb) &&
2257 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2258 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2259 if (ret)
2260 goto out;
2261 }
2262
2263 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2264 out_set_summed:
2265 skb->ip_summed = CHECKSUM_NONE;
2266 out:
2267 return ret;
2268 }
2269 EXPORT_SYMBOL(skb_checksum_help);
2270
2271 __be16 skb_network_protocol(struct sk_buff *skb)
2272 {
2273 __be16 type = skb->protocol;
2274 int vlan_depth = ETH_HLEN;
2275
2276 /* Tunnel gso handlers can set protocol to ethernet. */
2277 if (type == htons(ETH_P_TEB)) {
2278 struct ethhdr *eth;
2279
2280 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2281 return 0;
2282
2283 eth = (struct ethhdr *)skb_mac_header(skb);
2284 type = eth->h_proto;
2285 }
2286
2287 while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2288 struct vlan_hdr *vh;
2289
2290 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2291 return 0;
2292
2293 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2294 type = vh->h_vlan_encapsulated_proto;
2295 vlan_depth += VLAN_HLEN;
2296 }
2297
2298 return type;
2299 }
2300
2301 /**
2302 * skb_mac_gso_segment - mac layer segmentation handler.
2303 * @skb: buffer to segment
2304 * @features: features for the output path (see dev->features)
2305 */
2306 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2307 netdev_features_t features)
2308 {
2309 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2310 struct packet_offload *ptype;
2311 __be16 type = skb_network_protocol(skb);
2312
2313 if (unlikely(!type))
2314 return ERR_PTR(-EINVAL);
2315
2316 __skb_pull(skb, skb->mac_len);
2317
2318 rcu_read_lock();
2319 list_for_each_entry_rcu(ptype, &offload_base, list) {
2320 if (ptype->type == type && ptype->callbacks.gso_segment) {
2321 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2322 int err;
2323
2324 err = ptype->callbacks.gso_send_check(skb);
2325 segs = ERR_PTR(err);
2326 if (err || skb_gso_ok(skb, features))
2327 break;
2328 __skb_push(skb, (skb->data -
2329 skb_network_header(skb)));
2330 }
2331 segs = ptype->callbacks.gso_segment(skb, features);
2332 break;
2333 }
2334 }
2335 rcu_read_unlock();
2336
2337 __skb_push(skb, skb->data - skb_mac_header(skb));
2338
2339 return segs;
2340 }
2341 EXPORT_SYMBOL(skb_mac_gso_segment);
2342
2343
2344 /* openvswitch calls this on rx path, so we need a different check.
2345 */
2346 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2347 {
2348 if (tx_path)
2349 return skb->ip_summed != CHECKSUM_PARTIAL &&
2350 skb->ip_summed != CHECKSUM_NONE;
2351
2352 return skb->ip_summed == CHECKSUM_NONE;
2353 }
2354
2355 /**
2356 * __skb_gso_segment - Perform segmentation on skb.
2357 * @skb: buffer to segment
2358 * @features: features for the output path (see dev->features)
2359 * @tx_path: whether it is called in TX path
2360 *
2361 * This function segments the given skb and returns a list of segments.
2362 *
2363 * It may return NULL if the skb requires no segmentation. This is
2364 * only possible when GSO is used for verifying header integrity.
2365 */
2366 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2367 netdev_features_t features, bool tx_path)
2368 {
2369 struct sk_buff *segs;
2370
2371 if (unlikely(skb_needs_check(skb, tx_path))) {
2372 int err;
2373
2374 /* We're going to init ->check field in TCP or UDP header */
2375 if (skb_header_cloned(skb) &&
2376 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2377 return ERR_PTR(err);
2378 }
2379
2380 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2381 skb_reset_mac_header(skb);
2382 skb_reset_mac_len(skb);
2383
2384 segs = skb_mac_gso_segment(skb, features);
2385
2386 if (unlikely(skb_needs_check(skb, tx_path)))
2387 skb_warn_bad_offload(skb);
2388
2389 return segs;
2390 }
2391 EXPORT_SYMBOL(__skb_gso_segment);
2392
2393 /* Take action when hardware reception checksum errors are detected. */
2394 #ifdef CONFIG_BUG
2395 void netdev_rx_csum_fault(struct net_device *dev)
2396 {
2397 if (net_ratelimit()) {
2398 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2399 dump_stack();
2400 }
2401 }
2402 EXPORT_SYMBOL(netdev_rx_csum_fault);
2403 #endif
2404
2405 /* Actually, we should eliminate this check as soon as we know that:
2406  * 1. An IOMMU is present and can map all of the memory.
2407 * 2. No high memory really exists on this machine.
2408 */
2409
2410 static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb)
2411 {
2412 #ifdef CONFIG_HIGHMEM
2413 int i;
2414 if (!(dev->features & NETIF_F_HIGHDMA)) {
2415 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2416 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2417 if (PageHighMem(skb_frag_page(frag)))
2418 return 1;
2419 }
2420 }
2421
2422 if (PCI_DMA_BUS_IS_PHYS) {
2423 struct device *pdev = dev->dev.parent;
2424
2425 if (!pdev)
2426 return 0;
2427 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2428 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2429 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2430 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2431 return 1;
2432 }
2433 }
2434 #endif
2435 return 0;
2436 }
2437
2438 struct dev_gso_cb {
2439 void (*destructor)(struct sk_buff *skb);
2440 };
2441
2442 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2443
2444 static void dev_gso_skb_destructor(struct sk_buff *skb)
2445 {
2446 struct dev_gso_cb *cb;
2447
2448 do {
2449 struct sk_buff *nskb = skb->next;
2450
2451 skb->next = nskb->next;
2452 nskb->next = NULL;
2453 kfree_skb(nskb);
2454 } while (skb->next);
2455
2456 cb = DEV_GSO_CB(skb);
2457 if (cb->destructor)
2458 cb->destructor(skb);
2459 }
2460
2461 /**
2462 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2463 * @skb: buffer to segment
2464 * @features: device features as applicable to this skb
2465 *
2466 * This function segments the given skb and stores the list of segments
2467 * in skb->next.
2468 */
2469 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2470 {
2471 struct sk_buff *segs;
2472
2473 segs = skb_gso_segment(skb, features);
2474
2475 /* Verifying header integrity only. */
2476 if (!segs)
2477 return 0;
2478
2479 if (IS_ERR(segs))
2480 return PTR_ERR(segs);
2481
2482 skb->next = segs;
2483 DEV_GSO_CB(skb)->destructor = skb->destructor;
2484 skb->destructor = dev_gso_skb_destructor;
2485
2486 return 0;
2487 }
2488
2489 static netdev_features_t harmonize_features(struct sk_buff *skb,
2490 __be16 protocol,
2491 const struct net_device *dev,
2492 netdev_features_t features)
2493 {
2494 if (skb->ip_summed != CHECKSUM_NONE &&
2495 !can_checksum_protocol(features, protocol)) {
2496 features &= ~NETIF_F_ALL_CSUM;
2497 }
2498 if (illegal_highdma(dev, skb))
2499 features &= ~NETIF_F_SG;
2500
2501 return features;
2502 }
2503
2504 netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
2505 const struct net_device *dev)
2506 {
2507 __be16 protocol = skb->protocol;
2508 netdev_features_t features = dev->features;
2509
2510 if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs)
2511 features &= ~NETIF_F_GSO_MASK;
2512
2513 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2514 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2515 protocol = veh->h_vlan_encapsulated_proto;
2516 } else if (!vlan_tx_tag_present(skb)) {
2517 return harmonize_features(skb, protocol, dev, features);
2518 }
2519
2520 features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2521 NETIF_F_HW_VLAN_STAG_TX);
2522
2523 if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
2524 return harmonize_features(skb, protocol, dev, features);
2525 } else {
2526 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2527 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2528 NETIF_F_HW_VLAN_STAG_TX;
2529 return harmonize_features(skb, protocol, dev, features);
2530 }
2533 }
2534 EXPORT_SYMBOL(netif_skb_dev_features);
2535
2536 /*
2537 * Returns true if either:
2538 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2539 * 2. skb is fragmented and the device does not support SG.
2540 */
2541 static inline int skb_needs_linearize(struct sk_buff *skb,
2542 netdev_features_t features)
2543 {
2544 return skb_is_nonlinear(skb) &&
2545 ((skb_has_frag_list(skb) &&
2546 !(features & NETIF_F_FRAGLIST)) ||
2547 (skb_shinfo(skb)->nr_frags &&
2548 !(features & NETIF_F_SG)));
2549 }
2550
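/*
 * dev_hard_start_xmit - hand an skb (or a GSO segment list) to the driver's
 * ndo_start_xmit().  Performs the last software fixups the device cannot do
 * itself: VLAN tag insertion, GSO segmentation, linearization and checksum
 * completion, and delivers a copy to any taps registered in ptype_all.
 */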
2551 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2552 struct netdev_queue *txq)
2553 {
2554 const struct net_device_ops *ops = dev->netdev_ops;
2555 int rc = NETDEV_TX_OK;
2556 unsigned int skb_len;
2557
2558 if (likely(!skb->next)) {
2559 netdev_features_t features;
2560
2561 /*
2562 * If device doesn't need skb->dst, release it right now while
2563 		 * it's hot in this CPU's cache.
2564 */
2565 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2566 skb_dst_drop(skb);
2567
2568 features = netif_skb_features(skb);
2569
2570 if (vlan_tx_tag_present(skb) &&
2571 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2572 skb = __vlan_put_tag(skb, skb->vlan_proto,
2573 vlan_tx_tag_get(skb));
2574 if (unlikely(!skb))
2575 goto out;
2576
2577 skb->vlan_tci = 0;
2578 }
2579
2580 		/* If this is an encapsulation offload request, verify we are testing
2581 * hardware encapsulation features instead of standard
2582 * features for the netdev
2583 */
2584 if (skb->encapsulation)
2585 features &= dev->hw_enc_features;
2586
2587 if (netif_needs_gso(skb, features)) {
2588 if (unlikely(dev_gso_segment(skb, features)))
2589 goto out_kfree_skb;
2590 if (skb->next)
2591 goto gso;
2592 } else {
2593 if (skb_needs_linearize(skb, features) &&
2594 __skb_linearize(skb))
2595 goto out_kfree_skb;
2596
2597 /* If packet is not checksummed and device does not
2598 * support checksumming for this protocol, complete
2599 * checksumming here.
2600 */
2601 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2602 if (skb->encapsulation)
2603 skb_set_inner_transport_header(skb,
2604 skb_checksum_start_offset(skb));
2605 else
2606 skb_set_transport_header(skb,
2607 skb_checksum_start_offset(skb));
2608 if (!(features & NETIF_F_ALL_CSUM) &&
2609 skb_checksum_help(skb))
2610 goto out_kfree_skb;
2611 }
2612 }
2613
2614 if (!list_empty(&ptype_all))
2615 dev_queue_xmit_nit(skb, dev);
2616
2617 skb_len = skb->len;
2618 rc = ops->ndo_start_xmit(skb, dev);
2619 trace_net_dev_xmit(skb, rc, dev, skb_len);
2620 if (rc == NETDEV_TX_OK)
2621 txq_trans_update(txq);
2622 return rc;
2623 }
2624
2625 gso:
2626 do {
2627 struct sk_buff *nskb = skb->next;
2628
2629 skb->next = nskb->next;
2630 nskb->next = NULL;
2631
2632 if (!list_empty(&ptype_all))
2633 dev_queue_xmit_nit(nskb, dev);
2634
2635 skb_len = nskb->len;
2636 rc = ops->ndo_start_xmit(nskb, dev);
2637 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2638 if (unlikely(rc != NETDEV_TX_OK)) {
2639 if (rc & ~NETDEV_TX_MASK)
2640 goto out_kfree_gso_skb;
2641 nskb->next = skb->next;
2642 skb->next = nskb;
2643 return rc;
2644 }
2645 txq_trans_update(txq);
2646 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2647 return NETDEV_TX_BUSY;
2648 } while (skb->next);
2649
2650 out_kfree_gso_skb:
2651 if (likely(skb->next == NULL)) {
2652 skb->destructor = DEV_GSO_CB(skb)->destructor;
2653 consume_skb(skb);
2654 return rc;
2655 }
2656 out_kfree_skb:
2657 kfree_skb(skb);
2658 out:
2659 return rc;
2660 }
2661
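/*
 * Initialize qdisc_skb_cb(skb)->pkt_len.  For GSO skbs the header size of
 * every segment is added so that byte accounting reflects what will actually
 * go out on the wire.
 */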
2662 static void qdisc_pkt_len_init(struct sk_buff *skb)
2663 {
2664 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2665
2666 qdisc_skb_cb(skb)->pkt_len = skb->len;
2667
2668 	/* To get a more precise estimate of the bytes sent on the wire,
2669 	 * we add the header size of every segment to pkt_len
2670 */
2671 if (shinfo->gso_size) {
2672 unsigned int hdr_len;
2673 u16 gso_segs = shinfo->gso_segs;
2674
2675 /* mac layer + network layer */
2676 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2677
2678 /* + transport layer */
2679 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2680 hdr_len += tcp_hdrlen(skb);
2681 else
2682 hdr_len += sizeof(struct udphdr);
2683
2684 if (shinfo->gso_type & SKB_GSO_DODGY)
2685 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2686 shinfo->gso_size);
2687
2688 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2689 }
2690 }
2691
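/*
 * Enqueue an skb to a qdisc and run it.  A work-conserving qdisc that is
 * empty and not already running is bypassed and the skb transmitted
 * directly (TCQ_F_CAN_BYPASS); otherwise the skb is enqueued under the
 * qdisc root lock, with a secondary "busylock" used to reduce contention
 * on the root lock.
 */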
2692 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2693 struct net_device *dev,
2694 struct netdev_queue *txq)
2695 {
2696 spinlock_t *root_lock = qdisc_lock(q);
2697 bool contended;
2698 int rc;
2699
2700 qdisc_pkt_len_init(skb);
2701 qdisc_calculate_pkt_len(skb, q);
2702 /*
2703 * Heuristic to force contended enqueues to serialize on a
2704 * separate lock before trying to get qdisc main lock.
2705 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2706 * and dequeue packets faster.
2707 */
2708 contended = qdisc_is_running(q);
2709 if (unlikely(contended))
2710 spin_lock(&q->busylock);
2711
2712 spin_lock(root_lock);
2713 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2714 		printk(KERN_WARNING "[mtk_net]__dev_xmit_skb drop skb_len = %d\n", skb->len);
2715 kfree_skb(skb);
2716 rc = NET_XMIT_DROP;
2717 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2718 qdisc_run_begin(q)) {
2719 /*
2720 * This is a work-conserving queue; there are no old skbs
2721 * waiting to be sent out; and the qdisc is not running -
2722 * xmit the skb directly.
2723 */
2724 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2725 skb_dst_force(skb);
2726
2727 qdisc_bstats_update(q, skb);
2728
2729 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2730 if (unlikely(contended)) {
2731 spin_unlock(&q->busylock);
2732 contended = false;
2733 }
2734 __qdisc_run(q);
2735 } else
2736 qdisc_run_end(q);
2737
2738 rc = NET_XMIT_SUCCESS;
2739 } else {
2740 skb_dst_force(skb);
2741 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2742 if (qdisc_run_begin(q)) {
2743 if (unlikely(contended)) {
2744 spin_unlock(&q->busylock);
2745 contended = false;
2746 }
2747 __qdisc_run(q);
2748 }
2749 }
2750 spin_unlock(root_lock);
2751 if (unlikely(contended))
2752 spin_unlock(&q->busylock);
2753 return rc;
2754 }
2755
2756 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2757 static void skb_update_prio(struct sk_buff *skb)
2758 {
2759 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2760
2761 if (!skb->priority && skb->sk && map) {
2762 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2763
2764 if (prioidx < map->priomap_len)
2765 skb->priority = map->priomap[prioidx];
2766 }
2767 }
2768 #else
2769 #define skb_update_prio(skb)
2770 #endif
2771
2772 static DEFINE_PER_CPU(int, xmit_recursion);
2773 #define RECURSION_LIMIT 10
2774
2775 /**
2776 * dev_loopback_xmit - loop back @skb
2777 * @skb: buffer to transmit
2778 */
2779 int dev_loopback_xmit(struct sk_buff *skb)
2780 {
2781 skb_reset_mac_header(skb);
2782 __skb_pull(skb, skb_network_offset(skb));
2783 skb->pkt_type = PACKET_LOOPBACK;
2784 skb->ip_summed = CHECKSUM_UNNECESSARY;
2785 WARN_ON(!skb_dst(skb));
2786 skb_dst_force(skb);
2787 netif_rx_ni(skb);
2788 return 0;
2789 }
2790 EXPORT_SYMBOL(dev_loopback_xmit);
2791
2792 /**
2793 * dev_queue_xmit - transmit a buffer
2794 * @skb: buffer to transmit
2795 *
2796 * Queue a buffer for transmission to a network device. The caller must
2797 * have set the device and priority and built the buffer before calling
2798 * this function. The function can be called from an interrupt.
2799 *
2800 * A negative errno code is returned on a failure. A success does not
2801 * guarantee the frame will be transmitted as it may be dropped due
2802 * to congestion or traffic shaping.
2803 *
2804 * -----------------------------------------------------------------------------------
2805 * I notice this method can also return errors from the queue disciplines,
2806 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2807 * be positive.
2808 *
2809 * Regardless of the return value, the skb is consumed, so it is currently
2810 * difficult to retry a send to this method. (You can bump the ref count
2811 * before sending to hold a reference for retry if you are careful.)
2812 *
2813 * When calling this method, interrupts MUST be enabled. This is because
2814 * the BH enable code must have IRQs enabled so that it will not deadlock.
2815 * --BLG
2816 */
2817 int dev_queue_xmit(struct sk_buff *skb)
2818 {
2819 struct net_device *dev = skb->dev;
2820 struct netdev_queue *txq;
2821 struct Qdisc *q;
2822 int rc = -ENOMEM;
2823
2824 skb_reset_mac_header(skb);
2825
2826 #ifdef UDP_SKT_WIFI
2827
2828 if (unlikely((sysctl_met_is_enable == 1) && (sysctl_udp_met_port > 0)
2829 && (ip_hdr(skb)->protocol == IPPROTO_UDP) && skb->sk)) {
2830
2831 if (sysctl_udp_met_port == ntohs((inet_sk(skb->sk))->inet_sport)) {
2832 struct udphdr * udp_iphdr = udp_hdr(skb);
2833 if (udp_iphdr && (ntohs(udp_iphdr->len) >= 12)) {
2834 __u16 * seq_id = (__u16 *)((char *)udp_iphdr + 10);
2835 udp_event_trace_printk("F|%d|%s|%d\n", current->pid, *seq_id);
2836
2837 }
2838 }
2839 }
2840 #endif
2841
2842 /* Disable soft irqs for various locks below. Also
2843 * stops preemption for RCU.
2844 */
2845 rcu_read_lock_bh();
2846
2847 skb_update_prio(skb);
2848
2849 txq = netdev_pick_tx(dev, skb);
2850 q = rcu_dereference_bh(txq->qdisc);
2851
2852 #ifdef CONFIG_NET_CLS_ACT
2853 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2854 #endif
2855 trace_net_dev_queue(skb);
2856 if (q->enqueue) {
2857 rc = __dev_xmit_skb(skb, q, dev, txq);
2858 goto out;
2859 }
2860
2861 /* The device has no queue. Common case for software devices:
2862 loopback, all the sorts of tunnels...
2863
2864 	   Really, it is unlikely that netif_tx_lock protection is necessary
2865 	   here. (f.e. loopback and IP tunnels are clean, ignoring statistics
2866 	   counters.)
2867 	   However, it is possible that they rely on the protection
2868 	   made by us here.
2869 
2870 	   Check this and take the lock. It is not prone to deadlocks.
2871 	   Or take the noqueue qdisc path, which is even simpler 8)
2872 */
2873 if (dev->flags & IFF_UP) {
2874 int cpu = smp_processor_id(); /* ok because BHs are off */
2875
2876 if (txq->xmit_lock_owner != cpu) {
2877
2878 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2879 goto recursion_alert;
2880
2881 HARD_TX_LOCK(dev, txq, cpu);
2882
2883 if (!netif_xmit_stopped(txq)) {
2884 __this_cpu_inc(xmit_recursion);
2885 rc = dev_hard_start_xmit(skb, dev, txq);
2886 __this_cpu_dec(xmit_recursion);
2887 if (dev_xmit_complete(rc)) {
2888 HARD_TX_UNLOCK(dev, txq);
2889 goto out;
2890 }
2891 }
2892 HARD_TX_UNLOCK(dev, txq);
2893 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2894 dev->name);
2895 } else {
2896 /* Recursion is detected! It is possible,
2897 * unfortunately
2898 */
2899 recursion_alert:
2900 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2901 dev->name);
2902 }
2903 }
2904
2905 rc = -ENETDOWN;
2906 rcu_read_unlock_bh();
2907
2908 kfree_skb(skb);
2909 return rc;
2910 out:
2911 rcu_read_unlock_bh();
2912 return rc;
2913 }
2914 EXPORT_SYMBOL(dev_queue_xmit);
2915
2916
2917 /*=======================================================================
2918 Receiver routines
2919 =======================================================================*/
2920
2921 int netdev_max_backlog __read_mostly = 1000;
2922 EXPORT_SYMBOL(netdev_max_backlog);
2923
2924 int netdev_tstamp_prequeue __read_mostly = 1;
2925 int netdev_budget __read_mostly = 300;
2926 int weight_p __read_mostly = 64; /* old backlog weight */
2927
2928 /* Called with irq disabled */
2929 static inline void ____napi_schedule(struct softnet_data *sd,
2930 struct napi_struct *napi)
2931 {
2932 list_add_tail(&napi->poll_list, &sd->poll_list);
2933 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2934 }
2935
2936 #ifdef CONFIG_RPS
2937
2938 /* One global table that all flow-based protocols share. */
2939 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2940 EXPORT_SYMBOL(rps_sock_flow_table);
2941
2942 struct static_key rps_needed __read_mostly;
2943
2944 static struct rps_dev_flow *
2945 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2946 struct rps_dev_flow *rflow, u16 next_cpu)
2947 {
2948 if (next_cpu != RPS_NO_CPU) {
2949 #ifdef CONFIG_RFS_ACCEL
2950 struct netdev_rx_queue *rxqueue;
2951 struct rps_dev_flow_table *flow_table;
2952 struct rps_dev_flow *old_rflow;
2953 u32 flow_id;
2954 u16 rxq_index;
2955 int rc;
2956
2957 /* Should we steer this flow to a different hardware queue? */
2958 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2959 !(dev->features & NETIF_F_NTUPLE))
2960 goto out;
2961 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2962 if (rxq_index == skb_get_rx_queue(skb))
2963 goto out;
2964
2965 rxqueue = dev->_rx + rxq_index;
2966 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2967 if (!flow_table)
2968 goto out;
2969 flow_id = skb->rxhash & flow_table->mask;
2970 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2971 rxq_index, flow_id);
2972 if (rc < 0)
2973 goto out;
2974 old_rflow = rflow;
2975 rflow = &flow_table->flows[flow_id];
2976 rflow->filter = rc;
2977 if (old_rflow->filter == rflow->filter)
2978 old_rflow->filter = RPS_NO_FILTER;
2979 out:
2980 #endif
2981 rflow->last_qtail =
2982 per_cpu(softnet_data, next_cpu).input_queue_head;
2983 }
2984
2985 rflow->cpu = next_cpu;
2986 return rflow;
2987 }
2988
2989 /*
2990 * get_rps_cpu is called from netif_receive_skb and returns the target
2991 * CPU from the RPS map of the receiving queue for a given skb.
2992 * rcu_read_lock must be held on entry.
2993 */
2994 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2995 struct rps_dev_flow **rflowp)
2996 {
2997 struct netdev_rx_queue *rxqueue;
2998 struct rps_map *map;
2999 struct rps_dev_flow_table *flow_table;
3000 struct rps_sock_flow_table *sock_flow_table;
3001 int cpu = -1;
3002 u16 tcpu;
3003
3004 if (skb_rx_queue_recorded(skb)) {
3005 u16 index = skb_get_rx_queue(skb);
3006 if (unlikely(index >= dev->real_num_rx_queues)) {
3007 WARN_ONCE(dev->real_num_rx_queues > 1,
3008 "%s received packet on queue %u, but number "
3009 "of RX queues is %u\n",
3010 dev->name, index, dev->real_num_rx_queues);
3011 goto done;
3012 }
3013 rxqueue = dev->_rx + index;
3014 } else
3015 rxqueue = dev->_rx;
3016
3017 map = rcu_dereference(rxqueue->rps_map);
3018 if (map) {
3019 if (map->len == 1 &&
3020 !rcu_access_pointer(rxqueue->rps_flow_table)) {
3021 tcpu = map->cpus[0];
3022 if (cpu_online(tcpu))
3023 cpu = tcpu;
3024 goto done;
3025 }
3026 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3027 goto done;
3028 }
3029
3030 skb_reset_network_header(skb);
3031 if (!skb_get_rxhash(skb))
3032 goto done;
3033
3034 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3035 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3036 if (flow_table && sock_flow_table) {
3037 u16 next_cpu;
3038 struct rps_dev_flow *rflow;
3039
3040 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3041 tcpu = rflow->cpu;
3042
3043 next_cpu = sock_flow_table->ents[skb->rxhash &
3044 sock_flow_table->mask];
3045
3046 /*
3047 * If the desired CPU (where last recvmsg was done) is
3048 * different from current CPU (one in the rx-queue flow
3049 * table entry), switch if one of the following holds:
3050 * - Current CPU is unset (equal to RPS_NO_CPU).
3051 * - Current CPU is offline.
3052 * - The current CPU's queue tail has advanced beyond the
3053 * last packet that was enqueued using this table entry.
3054 * This guarantees that all previous packets for the flow
3055 * have been dequeued, thus preserving in order delivery.
3056 */
3057 if (unlikely(tcpu != next_cpu) &&
3058 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3059 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3060 rflow->last_qtail)) >= 0)) {
3061 tcpu = next_cpu;
3062 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3063 }
3064
3065 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3066 *rflowp = rflow;
3067 cpu = tcpu;
3068 goto done;
3069 }
3070 }
3071
3072 if (map) {
3073 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3074
3075 if (cpu_online(tcpu)) {
3076 cpu = tcpu;
3077 goto done;
3078 }
3079 }
3080
3081 done:
3082 return cpu;
3083 }
3084
3085 #ifdef CONFIG_RFS_ACCEL
3086
3087 /**
3088 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3089 * @dev: Device on which the filter was set
3090 * @rxq_index: RX queue index
3091 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3092 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3093 *
3094 * Drivers that implement ndo_rx_flow_steer() should periodically call
3095 * this function for each installed filter and remove the filters for
3096 * which it returns %true.
3097 */
3098 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3099 u32 flow_id, u16 filter_id)
3100 {
3101 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3102 struct rps_dev_flow_table *flow_table;
3103 struct rps_dev_flow *rflow;
3104 bool expire = true;
3105 int cpu;
3106
3107 rcu_read_lock();
3108 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3109 if (flow_table && flow_id <= flow_table->mask) {
3110 rflow = &flow_table->flows[flow_id];
3111 cpu = ACCESS_ONCE(rflow->cpu);
3112 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3113 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3114 rflow->last_qtail) <
3115 (int)(10 * flow_table->mask)))
3116 expire = false;
3117 }
3118 rcu_read_unlock();
3119 return expire;
3120 }
3121 EXPORT_SYMBOL(rps_may_expire_flow);
3122
3123 #endif /* CONFIG_RFS_ACCEL */
3124
3125 /* Called from hardirq (IPI) context */
3126 static void rps_trigger_softirq(void *data)
3127 {
3128 struct softnet_data *sd = data;
3129
3130 ____napi_schedule(sd, &sd->backlog);
3131 sd->received_rps++;
3132 }
3133
3134 #endif /* CONFIG_RPS */
3135
3136 /*
3137  * Check if this softnet_data structure belongs to another CPU.
3138  * If yes, queue it to our IPI list and return 1.
3139  * If no, return 0.
3140 */
3141 static int rps_ipi_queued(struct softnet_data *sd)
3142 {
3143 #ifdef CONFIG_RPS
3144 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3145
3146 if (sd != mysd) {
3147 sd->rps_ipi_next = mysd->rps_ipi_list;
3148 mysd->rps_ipi_list = sd;
3149
3150 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3151 return 1;
3152 }
3153 #endif /* CONFIG_RPS */
3154 return 0;
3155 }
3156
3157 /*
3158 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3159 * queue (may be a remote CPU queue).
3160 */
3161 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3162 unsigned int *qtail)
3163 {
3164 struct softnet_data *sd;
3165 unsigned long flags;
3166
3167 sd = &per_cpu(softnet_data, cpu);
3168
3169 local_irq_save(flags);
3170
3171 rps_lock(sd);
3172 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3173 if (skb_queue_len(&sd->input_pkt_queue)) {
3174 enqueue:
3175 __skb_queue_tail(&sd->input_pkt_queue, skb);
3176 input_queue_tail_incr_save(sd, qtail);
3177 rps_unlock(sd);
3178 local_irq_restore(flags);
3179 return NET_RX_SUCCESS;
3180 }
3181
3182 		/* Schedule NAPI for the backlog device.
3183 		 * We can use a non-atomic operation since we own the queue lock.
3184 */
3185 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3186 if (!rps_ipi_queued(sd))
3187 ____napi_schedule(sd, &sd->backlog);
3188 }
3189 goto enqueue;
3190 }
3191
3192 sd->dropped++;
3193 rps_unlock(sd);
3194
3195 local_irq_restore(flags);
3196
3197 atomic_long_inc(&skb->dev->rx_dropped);
3198 kfree_skb(skb);
3199 return NET_RX_DROP;
3200 }
3201
3202 /**
3203 * netif_rx - post buffer to the network code
3204 * @skb: buffer to post
3205 *
3206 * This function receives a packet from a device driver and queues it for
3207 * the upper (protocol) levels to process. It always succeeds. The buffer
3208 * may be dropped during processing for congestion control or by the
3209 * protocol layers.
3210 *
3211 * return values:
3212 * NET_RX_SUCCESS (no congestion)
3213 * NET_RX_DROP (packet was dropped)
3214 *
3215 */
3216
3217 int netif_rx(struct sk_buff *skb)
3218 {
3219 int ret;
3220
3221 /* if netpoll wants it, pretend we never saw it */
3222 if (netpoll_rx(skb))
3223 return NET_RX_DROP;
3224
3225 net_timestamp_check(netdev_tstamp_prequeue, skb);
3226
3227 trace_netif_rx(skb);
3228 #ifdef CONFIG_RPS
3229 if (static_key_false(&rps_needed)) {
3230 struct rps_dev_flow voidflow, *rflow = &voidflow;
3231 int cpu;
3232
3233 preempt_disable();
3234 rcu_read_lock();
3235
3236 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3237 if (cpu < 0)
3238 cpu = smp_processor_id();
3239
3240 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3241
3242 rcu_read_unlock();
3243 preempt_enable();
3244 } else
3245 #endif
3246 {
3247 unsigned int qtail;
3248 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3249 put_cpu();
3250 }
3251 return ret;
3252 }
3253 EXPORT_SYMBOL(netif_rx);
3254
3255 int netif_rx_ni(struct sk_buff *skb)
3256 {
3257 int err;
3258
3259 preempt_disable();
3260 err = netif_rx(skb);
3261 if (local_softirq_pending())
3262 do_softirq();
3263 preempt_enable();
3264
3265 return err;
3266 }
3267 EXPORT_SYMBOL(netif_rx_ni);
3268
3269 static void net_tx_action(struct softirq_action *h)
3270 {
3271 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3272
3273 if (sd->completion_queue) {
3274 struct sk_buff *clist;
3275
3276 local_irq_disable();
3277 clist = sd->completion_queue;
3278 sd->completion_queue = NULL;
3279 local_irq_enable();
3280
3281 while (clist) {
3282 struct sk_buff *skb = clist;
3283 clist = clist->next;
3284
3285 WARN_ON(atomic_read(&skb->users));
3286 trace_kfree_skb(skb, net_tx_action);
3287 __kfree_skb(skb);
3288 }
3289 }
3290
3291 if (sd->output_queue) {
3292 struct Qdisc *head;
3293
3294 local_irq_disable();
3295 head = sd->output_queue;
3296 sd->output_queue = NULL;
3297 sd->output_queue_tailp = &sd->output_queue;
3298 local_irq_enable();
3299
3300 while (head) {
3301 struct Qdisc *q = head;
3302 spinlock_t *root_lock;
3303
3304 head = head->next_sched;
3305
3306 root_lock = qdisc_lock(q);
3307 if (spin_trylock(root_lock)) {
3308 smp_mb__before_clear_bit();
3309 clear_bit(__QDISC_STATE_SCHED,
3310 &q->state);
3311 qdisc_run(q);
3312 spin_unlock(root_lock);
3313 } else {
3314 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3315 &q->state)) {
3316 __netif_reschedule(q);
3317 } else {
3318 smp_mb__before_clear_bit();
3319 clear_bit(__QDISC_STATE_SCHED,
3320 &q->state);
3321 }
3322 }
3323 }
3324 }
3325 }
3326
3327 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3328 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3329 /* This hook is defined here for ATM LANE */
3330 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3331 unsigned char *addr) __read_mostly;
3332 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3333 #endif
3334
3335 #ifdef CONFIG_NET_CLS_ACT
3336 /* TODO: Maybe we should just force sch_ingress to be compiled in
3337  * when CONFIG_NET_CLS_ACT is? Otherwise we execute some useless
3338  * instructions (a compare and two extra stores) when it is not built in
3339  * but CONFIG_NET_CLS_ACT is enabled.
3340  * NOTE: This doesn't stop any functionality; if you don't have
3341  * the ingress scheduler, you just can't add policies on ingress.
3342 *
3343 */
3344 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3345 {
3346 struct net_device *dev = skb->dev;
3347 u32 ttl = G_TC_RTTL(skb->tc_verd);
3348 int result = TC_ACT_OK;
3349 struct Qdisc *q;
3350
3351 if (unlikely(MAX_RED_LOOP < ttl++)) {
3352 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3353 skb->skb_iif, dev->ifindex);
3354 return TC_ACT_SHOT;
3355 }
3356
3357 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3358 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3359
3360 q = rxq->qdisc;
3361 if (q != &noop_qdisc) {
3362 spin_lock(qdisc_lock(q));
3363 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3364 result = qdisc_enqueue_root(skb, q);
3365 spin_unlock(qdisc_lock(q));
3366 }
3367
3368 return result;
3369 }
3370
3371 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3372 struct packet_type **pt_prev,
3373 int *ret, struct net_device *orig_dev)
3374 {
3375 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3376
3377 if (!rxq || rxq->qdisc == &noop_qdisc)
3378 goto out;
3379
3380 if (*pt_prev) {
3381 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3382 *pt_prev = NULL;
3383 }
3384
3385 switch (ing_filter(skb, rxq)) {
3386 case TC_ACT_SHOT:
3387 case TC_ACT_STOLEN:
3388 kfree_skb(skb);
3389 return NULL;
3390 }
3391
3392 out:
3393 skb->tc_verd = 0;
3394 return skb;
3395 }
3396 #endif
3397
3398 /**
3399 * netdev_is_rx_handler_busy - check if receive handler is registered
3400 * @dev: device to check
3401 *
3402 * Check if a receive handler is already registered for a given device.
3403  * Return true if there is one.
3404 *
3405 * The caller must hold the rtnl_mutex.
3406 */
3407 bool netdev_is_rx_handler_busy(struct net_device *dev)
3408 {
3409 ASSERT_RTNL();
3410 return dev && rtnl_dereference(dev->rx_handler);
3411 }
3412 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3413
3414 /**
3415 * netdev_rx_handler_register - register receive handler
3416 * @dev: device to register a handler for
3417 * @rx_handler: receive handler to register
3418 * @rx_handler_data: data pointer that is used by rx handler
3419 *
3420  * Register a receive handler for a device. This handler will then be
3421 * called from __netif_receive_skb. A negative errno code is returned
3422 * on a failure.
3423 *
3424 * The caller must hold the rtnl_mutex.
3425 *
3426 * For a general description of rx_handler, see enum rx_handler_result.
3427 */
3428 int netdev_rx_handler_register(struct net_device *dev,
3429 rx_handler_func_t *rx_handler,
3430 void *rx_handler_data)
3431 {
3432 ASSERT_RTNL();
3433
3434 if (dev->rx_handler)
3435 return -EBUSY;
3436
3437 /* Note: rx_handler_data must be set before rx_handler */
3438 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3439 rcu_assign_pointer(dev->rx_handler, rx_handler);
3440
3441 return 0;
3442 }
3443 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3444
3445 /**
3446 * netdev_rx_handler_unregister - unregister receive handler
3447 * @dev: device to unregister a handler from
3448 *
3449 * Unregister a receive handler from a device.
3450 *
3451 * The caller must hold the rtnl_mutex.
3452 */
3453 void netdev_rx_handler_unregister(struct net_device *dev)
3454 {
3456 ASSERT_RTNL();
3457 RCU_INIT_POINTER(dev->rx_handler, NULL);
3458 	/* a reader seeing a non-NULL rx_handler in a rcu_read_lock()
3459 	 * section is guaranteed to see a non-NULL rx_handler_data
3460 	 * as well.
3461 */
3462 synchronize_net();
3463 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3464 }
3465 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3466
3467 /*
3468 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3469 * the special handling of PFMEMALLOC skbs.
3470 */
3471 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3472 {
3473 switch (skb->protocol) {
3474 case __constant_htons(ETH_P_ARP):
3475 case __constant_htons(ETH_P_IP):
3476 case __constant_htons(ETH_P_IPV6):
3477 case __constant_htons(ETH_P_8021Q):
3478 case __constant_htons(ETH_P_8021AD):
3479 return true;
3480 default:
3481 return false;
3482 }
3483 }
3484
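/*
 * __netif_receive_skb_core - core of the receive path.  Delivers the skb to
 * taps (ptype_all), the ingress qdisc, VLAN and rx_handler hooks (bridge,
 * bonding, ...), and finally to the matching protocol handlers in
 * ptype_base.  When @pfmemalloc is true, only protocols marked as
 * PFMEMALLOC-safe are allowed to see the packet.
 */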
3485 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3486 {
3487 struct packet_type *ptype, *pt_prev;
3488 rx_handler_func_t *rx_handler;
3489 struct net_device *orig_dev;
3490 struct net_device *null_or_dev;
3491 bool deliver_exact = false;
3492 int ret = NET_RX_DROP;
3493 __be16 type;
3494
3495 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3496
3497 trace_netif_receive_skb(skb);
3498
3499 /* if we've gotten here through NAPI, check netpoll */
3500 if (netpoll_receive_skb(skb))
3501 goto out;
3502
3503 orig_dev = skb->dev;
3504
3505 skb_reset_network_header(skb);
3506 if (!skb_transport_header_was_set(skb))
3507 skb_reset_transport_header(skb);
3508 skb_reset_mac_len(skb);
3509
3510 pt_prev = NULL;
3511
3512 another_round:
3513 skb->skb_iif = skb->dev->ifindex;
3514
3515 __this_cpu_inc(softnet_data.processed);
3516
3517 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3518 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3519 skb = vlan_untag(skb);
3520 if (unlikely(!skb))
3521 goto out;
3522 }
3523
3524 #ifdef CONFIG_NET_CLS_ACT
3525 if (skb->tc_verd & TC_NCLS) {
3526 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3527 goto ncls;
3528 }
3529 #endif
3530
3531 if (pfmemalloc)
3532 goto skip_taps;
3533
3534 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3535 if (!ptype->dev || ptype->dev == skb->dev) {
3536 if (pt_prev)
3537 ret = deliver_skb(skb, pt_prev, orig_dev);
3538 pt_prev = ptype;
3539 }
3540 }
3541
3542 skip_taps:
3543 #ifdef CONFIG_NET_CLS_ACT
3544 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3545 if (!skb)
3546 goto out;
3547 ncls:
3548 #endif
3549
3550 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3551 goto drop;
3552
3553 if (vlan_tx_tag_present(skb)) {
3554 if (pt_prev) {
3555 ret = deliver_skb(skb, pt_prev, orig_dev);
3556 pt_prev = NULL;
3557 }
3558 if (vlan_do_receive(&skb))
3559 goto another_round;
3560 else if (unlikely(!skb))
3561 goto out;
3562 }
3563
3564 rx_handler = rcu_dereference(skb->dev->rx_handler);
3565 if (rx_handler) {
3566 if (pt_prev) {
3567 ret = deliver_skb(skb, pt_prev, orig_dev);
3568 pt_prev = NULL;
3569 }
3570 switch (rx_handler(&skb)) {
3571 case RX_HANDLER_CONSUMED:
3572 ret = NET_RX_SUCCESS;
3573 goto out;
3574 case RX_HANDLER_ANOTHER:
3575 goto another_round;
3576 case RX_HANDLER_EXACT:
3577 deliver_exact = true;
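			/* fall through */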
3578 case RX_HANDLER_PASS:
3579 break;
3580 default:
3581 BUG();
3582 }
3583 }
3584
3585 if (unlikely(vlan_tx_tag_present(skb))) {
3586 if (vlan_tx_tag_get_id(skb))
3587 skb->pkt_type = PACKET_OTHERHOST;
3588 /* Note: we might in the future use prio bits
3589 * and set skb->priority like in vlan_do_receive()
3590 * For the time being, just ignore Priority Code Point
3591 */
3592 skb->vlan_tci = 0;
3593 }
3594
3595 /* deliver only exact match when indicated */
3596 null_or_dev = deliver_exact ? skb->dev : NULL;
3597
3598 type = skb->protocol;
3599 list_for_each_entry_rcu(ptype,
3600 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3601 if (ptype->type == type &&
3602 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3603 ptype->dev == orig_dev)) {
3604 if (pt_prev)
3605 ret = deliver_skb(skb, pt_prev, orig_dev);
3606 pt_prev = ptype;
3607 }
3608 }
3609
3610 if (pt_prev) {
3611 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3612 goto drop;
3613 else
3614 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3615 } else {
3616 drop:
3617 atomic_long_inc(&skb->dev->rx_dropped);
3618 kfree_skb(skb);
3619 		/* Jamal, now you will not be able to escape explaining
3620 		 * to me how you were going to use this. :-)
3621 */
3622 ret = NET_RX_DROP;
3623 }
3624
3625 out:
3626 return ret;
3627 }
3628
3629 static int __netif_receive_skb(struct sk_buff *skb)
3630 {
3631 int ret;
3632
3633 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3634 unsigned long pflags = current->flags;
3635
3636 /*
3637 * PFMEMALLOC skbs are special, they should
3638 * - be delivered to SOCK_MEMALLOC sockets only
3639 * - stay away from userspace
3640 * - have bounded memory usage
3641 *
3642 * Use PF_MEMALLOC as this saves us from propagating the allocation
3643 * context down to all allocation sites.
3644 */
3645 current->flags |= PF_MEMALLOC;
3646 ret = __netif_receive_skb_core(skb, true);
3647 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3648 } else
3649 ret = __netif_receive_skb_core(skb, false);
3650
3651 return ret;
3652 }
3653
3654 /**
3655 * netif_receive_skb - process receive buffer from network
3656 * @skb: buffer to process
3657 *
3658 * netif_receive_skb() is the main receive data processing function.
3659 * It always succeeds. The buffer may be dropped during processing
3660 * for congestion control or by the protocol layers.
3661 *
3662 * This function may only be called from softirq context and interrupts
3663 * should be enabled.
3664 *
3665 * Return values (usually ignored):
3666 * NET_RX_SUCCESS: no congestion
3667 * NET_RX_DROP: packet was dropped
3668 */
3669 int netif_receive_skb(struct sk_buff *skb)
3670 {
3671 int ret;
3672
3673 net_timestamp_check(netdev_tstamp_prequeue, skb);
3674
3675 if (skb_defer_rx_timestamp(skb))
3676 return NET_RX_SUCCESS;
3677
3678 rcu_read_lock();
3679
3680 #ifdef CONFIG_RPS
3681 if (static_key_false(&rps_needed)) {
3682 struct rps_dev_flow voidflow, *rflow = &voidflow;
3683 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
3684
3685 if (cpu >= 0) {
3686 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3687 rcu_read_unlock();
3688 return ret;
3689 }
3690 }
3691 #endif
3692 ret = __netif_receive_skb(skb);
3693 rcu_read_unlock();
3694 return ret;
3695 }
3696 EXPORT_SYMBOL(netif_receive_skb);
3697
3698 /* Network device is going away; flush any packets still pending.
3699 * Called with irqs disabled.
3700 */
3701 static void flush_backlog(void *arg)
3702 {
3703 struct net_device *dev = arg;
3704 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3705 struct sk_buff *skb, *tmp;
3706
3707 rps_lock(sd);
3708 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3709 if (skb->dev == dev) {
3710 __skb_unlink(skb, &sd->input_pkt_queue);
3711 kfree_skb(skb);
3712 input_queue_head_incr(sd);
3713 }
3714 }
3715 rps_unlock(sd);
3716
3717 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3718 if (skb->dev == dev) {
3719 __skb_unlink(skb, &sd->process_queue);
3720 kfree_skb(skb);
3721 input_queue_head_incr(sd);
3722 }
3723 }
3724 }
3725
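/*
 * Hand a held GRO skb to the protocol's gro_complete() callback (to fix up
 * the headers of the merged packet) and then pass it up the stack via
 * netif_receive_skb().
 */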
3726 static int napi_gro_complete(struct sk_buff *skb)
3727 {
3728 struct packet_offload *ptype;
3729 __be16 type = skb->protocol;
3730 struct list_head *head = &offload_base;
3731 int err = -ENOENT;
3732
3733 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3734
3735 if (NAPI_GRO_CB(skb)->count == 1) {
3736 skb_shinfo(skb)->gso_size = 0;
3737 goto out;
3738 }
3739
3740 rcu_read_lock();
3741 list_for_each_entry_rcu(ptype, head, list) {
3742 if (ptype->type != type || !ptype->callbacks.gro_complete)
3743 continue;
3744
3745 err = ptype->callbacks.gro_complete(skb);
3746 break;
3747 }
3748 rcu_read_unlock();
3749
3750 if (err) {
3751 WARN_ON(&ptype->list == head);
3752 kfree_skb(skb);
3753 return NET_RX_SUCCESS;
3754 }
3755
3756 out:
3757 return netif_receive_skb(skb);
3758 }
3759
3760 /* napi->gro_list contains packets ordered by age.
3761 * youngest packets at the head of it.
3762 * Complete skbs in reverse order to reduce latencies.
3763 */
3764 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3765 {
3766 struct sk_buff *skb, *prev = NULL;
3767
3768 /* scan list and build reverse chain */
3769 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3770 skb->prev = prev;
3771 prev = skb;
3772 }
3773
3774 for (skb = prev; skb; skb = prev) {
3775 skb->next = NULL;
3776
3777 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3778 return;
3779
3780 prev = skb->prev;
3781 napi_gro_complete(skb);
3782 napi->gro_count--;
3783 }
3784
3785 napi->gro_list = NULL;
3786 }
3787 EXPORT_SYMBOL(napi_gro_flush);
3788
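/*
 * Pre-compute, for every skb already held on napi->gro_list, whether it
 * could belong to the same flow as @skb by comparing the device, VLAN tag
 * and MAC header.  The per-protocol gro_receive() callbacks refine this.
 */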
3789 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3790 {
3791 struct sk_buff *p;
3792 unsigned int maclen = skb->dev->hard_header_len;
3793
3794 for (p = napi->gro_list; p; p = p->next) {
3795 unsigned long diffs;
3796
3797 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3798 diffs |= p->vlan_tci ^ skb->vlan_tci;
3799 if (maclen == ETH_HLEN)
3800 diffs |= compare_ether_header(skb_mac_header(p),
3801 skb_gro_mac_header(skb));
3802 else if (!diffs)
3803 diffs = memcmp(skb_mac_header(p),
3804 skb_gro_mac_header(skb),
3805 maclen);
3806 NAPI_GRO_CB(p)->same_flow = !diffs;
3807 NAPI_GRO_CB(p)->flush = 0;
3808 }
3809 }
3810
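/*
 * Core of the GRO engine: try to merge @skb into an skb already held on
 * napi->gro_list via the matching protocol's gro_receive() callback.
 * Returns GRO_MERGED/GRO_MERGED_FREE when merged, GRO_HELD when the skb is
 * kept on the list, or GRO_NORMAL when it should go up the stack directly.
 */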
3811 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3812 {
3813 struct sk_buff **pp = NULL;
3814 struct packet_offload *ptype;
3815 __be16 type = skb->protocol;
3816 struct list_head *head = &offload_base;
3817 int same_flow;
3818 enum gro_result ret;
3819
3820 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3821 goto normal;
3822
3823 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3824 goto normal;
3825
3826 gro_list_prepare(napi, skb);
3827
3828 rcu_read_lock();
3829 list_for_each_entry_rcu(ptype, head, list) {
3830 if (ptype->type != type || !ptype->callbacks.gro_receive)
3831 continue;
3832
3833 skb_set_network_header(skb, skb_gro_offset(skb));
3834 skb_reset_mac_len(skb);
3835 NAPI_GRO_CB(skb)->same_flow = 0;
3836 NAPI_GRO_CB(skb)->flush = 0;
3837 NAPI_GRO_CB(skb)->free = 0;
3838
3839 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3840 break;
3841 }
3842 rcu_read_unlock();
3843
3844 if (&ptype->list == head)
3845 goto normal;
3846
3847 same_flow = NAPI_GRO_CB(skb)->same_flow;
3848 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3849
3850 if (pp) {
3851 struct sk_buff *nskb = *pp;
3852
3853 *pp = nskb->next;
3854 nskb->next = NULL;
3855 napi_gro_complete(nskb);
3856 napi->gro_count--;
3857 }
3858
3859 if (same_flow)
3860 goto ok;
3861
3862 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3863 goto normal;
3864
3865 napi->gro_count++;
3866 NAPI_GRO_CB(skb)->count = 1;
3867 NAPI_GRO_CB(skb)->age = jiffies;
3868 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3869 skb->next = napi->gro_list;
3870 napi->gro_list = skb;
3871 ret = GRO_HELD;
3872
3873 pull:
3874 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3875 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3876
3877 BUG_ON(skb->end - skb->tail < grow);
3878
3879 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3880
3881 skb->tail += grow;
3882 skb->data_len -= grow;
3883
3884 skb_shinfo(skb)->frags[0].page_offset += grow;
3885 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3886
3887 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3888 skb_frag_unref(skb, 0);
3889 memmove(skb_shinfo(skb)->frags,
3890 skb_shinfo(skb)->frags + 1,
3891 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3892 }
3893 }
3894
3895 ok:
3896 return ret;
3897
3898 normal:
3899 ret = GRO_NORMAL;
3900 goto pull;
3901 }
3902
3903
3904 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3905 {
3906 switch (ret) {
3907 case GRO_NORMAL:
3908 if (netif_receive_skb(skb))
3909 ret = GRO_DROP;
3910 break;
3911
3912 case GRO_DROP:
3913 kfree_skb(skb);
3914 break;
3915
3916 case GRO_MERGED_FREE:
3917 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3918 kmem_cache_free(skbuff_head_cache, skb);
3919 else
3920 __kfree_skb(skb);
3921 break;
3922
3923 case GRO_HELD:
3924 case GRO_MERGED:
3925 break;
3926 }
3927
3928 return ret;
3929 }
3930
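/*
 * Reset the GRO offsets for a freshly received skb and, when the linear
 * area is empty and the data starts in the first page fragment, record a
 * direct pointer to it in frag0 so GRO header access can avoid pulling.
 */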
3931 static void skb_gro_reset_offset(struct sk_buff *skb)
3932 {
3933 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3934 const skb_frag_t *frag0 = &pinfo->frags[0];
3935
3936 NAPI_GRO_CB(skb)->data_offset = 0;
3937 NAPI_GRO_CB(skb)->frag0 = NULL;
3938 NAPI_GRO_CB(skb)->frag0_len = 0;
3939
3940 if (skb->mac_header == skb->tail &&
3941 pinfo->nr_frags &&
3942 !PageHighMem(skb_frag_page(frag0))) {
3943 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3944 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
3945 skb_frag_size(frag0),
3946 skb->end - skb->tail);
3947 }
3948 }
3949
3950 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3951 {
3952 skb_gro_reset_offset(skb);
3953
3954 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3955 }
3956 EXPORT_SYMBOL(napi_gro_receive);
3957
3958 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3959 {
3960 __skb_pull(skb, skb_headlen(skb));
3961 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3962 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3963 skb->vlan_tci = 0;
3964 skb->dev = napi->dev;
3965 skb->skb_iif = 0;
3966 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
3967
3968 napi->skb = skb;
3969 }
3970
3971 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3972 {
3973 struct sk_buff *skb = napi->skb;
3974
3975 if (!skb) {
3976 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3977 if (skb)
3978 napi->skb = skb;
3979 }
3980 return skb;
3981 }
3982 EXPORT_SYMBOL(napi_get_frags);
3983
3984 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3985 gro_result_t ret)
3986 {
3987 switch (ret) {
3988 case GRO_NORMAL:
3989 case GRO_HELD:
3990 skb->protocol = eth_type_trans(skb, skb->dev);
3991
3992 if (ret == GRO_HELD)
3993 skb_gro_pull(skb, -ETH_HLEN);
3994 else if (netif_receive_skb(skb))
3995 ret = GRO_DROP;
3996 break;
3997
3998 case GRO_DROP:
3999 case GRO_MERGED_FREE:
4000 napi_reuse_skb(napi, skb);
4001 break;
4002
4003 case GRO_MERGED:
4004 break;
4005 }
4006
4007 return ret;
4008 }
4009
4010 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4011 {
4012 struct sk_buff *skb = napi->skb;
4013 struct ethhdr *eth;
4014 unsigned int hlen;
4015 unsigned int off;
4016
4017 napi->skb = NULL;
4018
4019 skb_reset_mac_header(skb);
4020 skb_gro_reset_offset(skb);
4021
4022 off = skb_gro_offset(skb);
4023 hlen = off + sizeof(*eth);
4024 eth = skb_gro_header_fast(skb, off);
4025 if (skb_gro_header_hard(skb, hlen)) {
4026 eth = skb_gro_header_slow(skb, hlen, off);
4027 if (unlikely(!eth)) {
4028 napi_reuse_skb(napi, skb);
4029 skb = NULL;
4030 goto out;
4031 }
4032 }
4033
4034 skb_gro_pull(skb, sizeof(*eth));
4035
4036 /*
4037 * This works because the only protocols we care about don't require
4038 * special handling. We'll fix it up properly at the end.
4039 */
4040 skb->protocol = eth->h_proto;
4041
4042 out:
4043 return skb;
4044 }
4045
4046 gro_result_t napi_gro_frags(struct napi_struct *napi)
4047 {
4048 struct sk_buff *skb = napi_frags_skb(napi);
4049
4050 if (!skb)
4051 return GRO_DROP;
4052
4053 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4054 }
4055 EXPORT_SYMBOL(napi_gro_frags);
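/*
 * Illustrative sketch (hypothetical driver code, not from this file): drivers
 * that receive into pages rather than linear buffers can use napi_get_frags()
 * together with napi_gro_frags(). The core supplies (and recycles) the skb
 * head; the driver only attaches the page fragments. skb_fill_page_desc() is
 * the regular skbuff helper; page/offset/len come from the driver's RX ring.
 *
 *	struct sk_buff *skb = napi_get_frags(napi);
 *
 *	if (!skb)
 *		return;			// allocation failure, retry later
 *
 *	skb_fill_page_desc(skb, 0, page, offset, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += PAGE_SIZE;	// account for the attached page
 *
 *	napi_gro_frags(napi);		// parses the Ethernet header itself
 */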
4056
4057 /*
4058 * net_rps_action sends any pending IPIs for RPS.
4059 * Note: called with local irq disabled, but exits with local irq enabled.
4060 */
4061 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4062 {
4063 #ifdef CONFIG_RPS
4064 struct softnet_data *remsd = sd->rps_ipi_list;
4065
4066 if (remsd) {
4067 sd->rps_ipi_list = NULL;
4068
4069 local_irq_enable();
4070
4071 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4072 while (remsd) {
4073 struct softnet_data *next = remsd->rps_ipi_next;
4074
4075 if (cpu_online(remsd->cpu))
4076 __smp_call_function_single(remsd->cpu,
4077 &remsd->csd, 0);
4078 remsd = next;
4079 }
4080 } else
4081 #endif
4082 local_irq_enable();
4083 }
4084
4085 static int process_backlog(struct napi_struct *napi, int quota)
4086 {
4087 int work = 0;
4088 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4089
4090 #ifdef CONFIG_RPS
4091 /* If we have pending IPIs, it is better to send them now
4092 * rather than waiting for net_rx_action() to end.
4093 */
4094 if (sd->rps_ipi_list) {
4095 local_irq_disable();
4096 net_rps_action_and_irq_enable(sd);
4097 }
4098 #endif
4099 napi->weight = weight_p;
4100 local_irq_disable();
4101 while (work < quota) {
4102 struct sk_buff *skb;
4103 unsigned int qlen;
4104
4105 while ((skb = __skb_dequeue(&sd->process_queue))) {
4106 rcu_read_lock();
4107 local_irq_enable();
4108 __netif_receive_skb(skb);
4109 rcu_read_unlock();
4110 local_irq_disable();
4111 input_queue_head_incr(sd);
4112 if (++work >= quota) {
4113 local_irq_enable();
4114 return work;
4115 }
4116 }
4117
4118 rps_lock(sd);
4119 qlen = skb_queue_len(&sd->input_pkt_queue);
4120 if (qlen)
4121 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4122 &sd->process_queue);
4123
4124 if (qlen < quota - work) {
4125 /*
4126 * Inline a custom version of __napi_complete().
4127 * Only the current CPU owns and manipulates this napi,
4128 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4129 * We can use a plain write instead of clear_bit(),
4130 * and we don't need an smp_mb() memory barrier.
4131 */
4132 list_del(&napi->poll_list);
4133 napi->state = 0;
4134
4135 quota = work + qlen;
4136 }
4137 rps_unlock(sd);
4138 }
4139 local_irq_enable();
4140
4141 return work;
4142 }
4143
4144 /**
4145 * __napi_schedule - schedule for receive
4146 * @n: entry to schedule
4147 *
4148 * The entry's receive function will be scheduled to run
4149 */
4150 void __napi_schedule(struct napi_struct *n)
4151 {
4152 unsigned long flags;
4153
4154 local_irq_save(flags);
4155 ____napi_schedule(&__get_cpu_var(softnet_data), n);
4156 local_irq_restore(flags);
4157 }
4158 EXPORT_SYMBOL(__napi_schedule);
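/*
 * Illustrative sketch (hypothetical driver code, not part of this file): a
 * typical RX interrupt handler masks further device interrupts and schedules
 * NAPI. Most drivers use the napi_schedule() wrapper, which combines the
 * napi_schedule_prep() test with __napi_schedule():
 *
 *	static irqreturn_t mydrv_intr(int irq, void *data)
 *	{
 *		struct mydrv_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			mydrv_disable_rx_irq(priv);	// hypothetical helper
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */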
4159
4160 void __napi_complete(struct napi_struct *n)
4161 {
4162 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4163 BUG_ON(n->gro_list);
4164
4165 list_del(&n->poll_list);
4166 smp_mb__before_clear_bit();
4167 clear_bit(NAPI_STATE_SCHED, &n->state);
4168 }
4169 EXPORT_SYMBOL(__napi_complete);
4170
4171 void napi_complete(struct napi_struct *n)
4172 {
4173 unsigned long flags;
4174
4175 /*
4176 * Don't let napi dequeue from the CPU poll list
4177 * just in case it's running on a different CPU.
4178 */
4179 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4180 return;
4181
4182 napi_gro_flush(n, false);
4183 local_irq_save(flags);
4184 __napi_complete(n);
4185 local_irq_restore(flags);
4186 }
4187 EXPORT_SYMBOL(napi_complete);
4188
4189 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4190 int (*poll)(struct napi_struct *, int), int weight)
4191 {
4192 INIT_LIST_HEAD(&napi->poll_list);
4193 napi->gro_count = 0;
4194 napi->gro_list = NULL;
4195 napi->skb = NULL;
4196 napi->poll = poll;
4197 if (weight > NAPI_POLL_WEIGHT)
4198 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4199 weight, dev->name);
4200 napi->weight = weight;
4201 list_add(&napi->dev_list, &dev->napi_list);
4202 napi->dev = dev;
4203 #ifdef CONFIG_NETPOLL
4204 spin_lock_init(&napi->poll_lock);
4205 napi->poll_owner = -1;
4206 #endif
4207 set_bit(NAPI_STATE_SCHED, &napi->state);
4208 }
4209 EXPORT_SYMBOL(netif_napi_add);
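/*
 * Illustrative sketch (not part of this file): a driver registers its NAPI
 * context once at probe time and completes it from the poll routine when it
 * has consumed less than the budget. A weight of 64 matches the common
 * NAPI_POLL_WEIGHT default; the mydrv_* names are hypothetical.
 *
 *	// probe path
 *	netif_napi_add(netdev, &priv->napi, mydrv_poll, 64);
 *
 *	// in mydrv_poll(), after processing 'done' packets
 *	if (done < budget) {
 *		napi_complete(napi);
 *		mydrv_enable_rx_irq(priv);	// hypothetical helper
 *	}
 */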
4210
4211 void netif_napi_del(struct napi_struct *napi)
4212 {
4213 struct sk_buff *skb, *next;
4214
4215 list_del_init(&napi->dev_list);
4216 napi_free_frags(napi);
4217
4218 for (skb = napi->gro_list; skb; skb = next) {
4219 next = skb->next;
4220 skb->next = NULL;
4221 kfree_skb(skb);
4222 }
4223
4224 napi->gro_list = NULL;
4225 napi->gro_count = 0;
4226 }
4227 EXPORT_SYMBOL(netif_napi_del);
4228
4229 static void net_rx_action(struct softirq_action *h)
4230 {
4231 struct softnet_data *sd = &__get_cpu_var(softnet_data);
4232 unsigned long time_limit = jiffies + 2;
4233 int budget = netdev_budget;
4234 void *have;
4235
4236 local_irq_disable();
4237
4238 while (!list_empty(&sd->poll_list)) {
4239 struct napi_struct *n;
4240 int work, weight;
4241
4242 /* If the softirq window is exhausted then punt.
4243 * Allow this to run for 2 jiffies, which allows
4244 * an average latency of 1.5/HZ.
4245 */
4246 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4247 goto softnet_break;
4248
4249 local_irq_enable();
4250
4251 /* Even though interrupts have been re-enabled, this
4252 * access is safe because interrupts can only add new
4253 * entries to the tail of this list, and only ->poll()
4254 * calls can remove this head entry from the list.
4255 */
4256 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4257
4258 have = netpoll_poll_lock(n);
4259
4260 weight = n->weight;
4261
4262 /* This NAPI_STATE_SCHED test is for avoiding a race
4263 * with netpoll's poll_napi(). Only the entity which
4264 * obtains the lock and sees NAPI_STATE_SCHED set will
4265 * actually make the ->poll() call. Therefore we avoid
4266 * accidentally calling ->poll() when NAPI is not scheduled.
4267 */
4268 work = 0;
4269 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4270 work = n->poll(n, weight);
4271 trace_napi_poll(n);
4272 }
4273
4274 WARN_ON_ONCE(work > weight);
4275
4276 budget -= work;
4277
4278 local_irq_disable();
4279
4280 /* Drivers must not modify the NAPI state if they
4281 * consume the entire weight. In such cases this code
4282 * still "owns" the NAPI instance and therefore can
4283 * move the instance around on the list at-will.
4284 */
4285 if (unlikely(work == weight)) {
4286 if (unlikely(napi_disable_pending(n))) {
4287 local_irq_enable();
4288 napi_complete(n);
4289 local_irq_disable();
4290 } else {
4291 if (n->gro_list) {
4292 /* flush too old packets
4293 * If HZ < 1000, flush all packets.
4294 */
4295 local_irq_enable();
4296 napi_gro_flush(n, HZ >= 1000);
4297 local_irq_disable();
4298 }
4299 list_move_tail(&n->poll_list, &sd->poll_list);
4300 }
4301 }
4302
4303 netpoll_poll_unlock(have);
4304 }
4305 out:
4306 net_rps_action_and_irq_enable(sd);
4307
4308 #ifdef CONFIG_NET_DMA
4309 /*
4310 * There may not be any more sk_buffs coming right now, so push
4311 * any pending DMA copies to hardware
4312 */
4313 dma_issue_pending_all();
4314 #endif
4315
4316 return;
4317
4318 softnet_break:
4319 sd->time_squeeze++;
4320 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4321 goto out;
4322 }
4323
4324 struct netdev_upper {
4325 struct net_device *dev;
4326 bool master;
4327 struct list_head list;
4328 struct rcu_head rcu;
4329 struct list_head search_list;
4330 };
4331
4332 static void __append_search_uppers(struct list_head *search_list,
4333 struct net_device *dev)
4334 {
4335 struct netdev_upper *upper;
4336
4337 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4338 /* check that this upper is not already in the search list */
4339 if (list_empty(&upper->search_list))
4340 list_add_tail(&upper->search_list, search_list);
4341 }
4342 }
4343
4344 static bool __netdev_search_upper_dev(struct net_device *dev,
4345 struct net_device *upper_dev)
4346 {
4347 LIST_HEAD(search_list);
4348 struct netdev_upper *upper;
4349 struct netdev_upper *tmp;
4350 bool ret = false;
4351
4352 __append_search_uppers(&search_list, dev);
4353 list_for_each_entry(upper, &search_list, search_list) {
4354 if (upper->dev == upper_dev) {
4355 ret = true;
4356 break;
4357 }
4358 __append_search_uppers(&search_list, upper->dev);
4359 }
4360 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4361 INIT_LIST_HEAD(&upper->search_list);
4362 return ret;
4363 }
4364
4365 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4366 struct net_device *upper_dev)
4367 {
4368 struct netdev_upper *upper;
4369
4370 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4371 if (upper->dev == upper_dev)
4372 return upper;
4373 }
4374 return NULL;
4375 }
4376
4377 /**
4378 * netdev_has_upper_dev - Check if device is linked to an upper device
4379 * @dev: device
4380 * @upper_dev: upper device to check
4381 *
4382 * Find out if a device is linked to the specified upper device and return true
4383 * if it is. Note that this checks only the immediate upper device,
4384 * not through a complete stack of devices. The caller must hold the RTNL lock.
4385 */
4386 bool netdev_has_upper_dev(struct net_device *dev,
4387 struct net_device *upper_dev)
4388 {
4389 ASSERT_RTNL();
4390
4391 return __netdev_find_upper(dev, upper_dev);
4392 }
4393 EXPORT_SYMBOL(netdev_has_upper_dev);
4394
4395 /**
4396 * netdev_has_any_upper_dev - Check if device is linked to some device
4397 * @dev: device
4398 *
4399 * Find out if a device is linked to an upper device and return true in case
4400 * it is. The caller must hold the RTNL lock.
4401 */
4402 bool netdev_has_any_upper_dev(struct net_device *dev)
4403 {
4404 ASSERT_RTNL();
4405
4406 return !list_empty(&dev->upper_dev_list);
4407 }
4408 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4409
4410 /**
4411 * netdev_master_upper_dev_get - Get master upper device
4412 * @dev: device
4413 *
4414 * Find a master upper device and return pointer to it or NULL in case
4415 * it's not there. The caller must hold the RTNL lock.
4416 */
4417 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4418 {
4419 struct netdev_upper *upper;
4420
4421 ASSERT_RTNL();
4422
4423 if (list_empty(&dev->upper_dev_list))
4424 return NULL;
4425
4426 upper = list_first_entry(&dev->upper_dev_list,
4427 struct netdev_upper, list);
4428 if (likely(upper->master))
4429 return upper->dev;
4430 return NULL;
4431 }
4432 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4433
4434 /**
4435 * netdev_master_upper_dev_get_rcu - Get master upper device
4436 * @dev: device
4437 *
4438 * Find a master upper device and return pointer to it or NULL in case
4439 * it's not there. The caller must hold the RCU read lock.
4440 */
4441 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4442 {
4443 struct netdev_upper *upper;
4444
4445 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4446 struct netdev_upper, list);
4447 if (upper && likely(upper->master))
4448 return upper->dev;
4449 return NULL;
4450 }
4451 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4452
4453 static int __netdev_upper_dev_link(struct net_device *dev,
4454 struct net_device *upper_dev, bool master)
4455 {
4456 struct netdev_upper *upper;
4457
4458 ASSERT_RTNL();
4459
4460 if (dev == upper_dev)
4461 return -EBUSY;
4462
4463 /* To prevent loops, check that dev is not already an upper device of upper_dev. */
4464 if (__netdev_search_upper_dev(upper_dev, dev))
4465 return -EBUSY;
4466
4467 if (__netdev_find_upper(dev, upper_dev))
4468 return -EEXIST;
4469
4470 if (master && netdev_master_upper_dev_get(dev))
4471 return -EBUSY;
4472
4473 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4474 if (!upper)
4475 return -ENOMEM;
4476
4477 upper->dev = upper_dev;
4478 upper->master = master;
4479 INIT_LIST_HEAD(&upper->search_list);
4480
4481 /* Ensure that master upper link is always the first item in list. */
4482 if (master)
4483 list_add_rcu(&upper->list, &dev->upper_dev_list);
4484 else
4485 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4486 dev_hold(upper_dev);
4487
4488 return 0;
4489 }
4490
4491 /**
4492 * netdev_upper_dev_link - Add a link to the upper device
4493 * @dev: device
4494 * @upper_dev: new upper device
4495 *
4496 * Adds a link to a device which is upper to this one. The caller must hold
4497 * the RTNL lock. On a failure a negative errno code is returned.
4498 * On success the reference counts are adjusted and the function
4499 * returns zero.
4500 */
4501 int netdev_upper_dev_link(struct net_device *dev,
4502 struct net_device *upper_dev)
4503 {
4504 return __netdev_upper_dev_link(dev, upper_dev, false);
4505 }
4506 EXPORT_SYMBOL(netdev_upper_dev_link);
4507
4508 /**
4509 * netdev_master_upper_dev_link - Add a master link to the upper device
4510 * @dev: device
4511 * @upper_dev: new upper device
4512 *
4513 * Adds a link to a device which is upper to this one. In this case, only
4514 * one master upper device can be linked, although other non-master devices
4515 * might be linked as well. The caller must hold the RTNL lock.
4516 * On a failure a negative errno code is returned. On success the reference
4517 * counts are adjusted and the function returns zero.
4518 */
4519 int netdev_master_upper_dev_link(struct net_device *dev,
4520 struct net_device *upper_dev)
4521 {
4522 return __netdev_upper_dev_link(dev, upper_dev, true);
4523 }
4524 EXPORT_SYMBOL(netdev_master_upper_dev_link);
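/*
 * Illustrative sketch (hypothetical, not from this file): a bonding/bridge
 * style master links a newly enslaved device as that device's master upper
 * under RTNL, and undoes the link on release:
 *
 *	ASSERT_RTNL();
 *
 *	err = netdev_master_upper_dev_link(slave_dev, master_dev);
 *	if (err)
 *		goto err_unwind;	// -EBUSY, -EEXIST or -ENOMEM
 *	...
 *	netdev_upper_dev_unlink(slave_dev, master_dev);
 */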
4525
4526 /**
4527 * netdev_upper_dev_unlink - Removes a link to upper device
4528 * @dev: device
4529 * @upper_dev: upper device to remove the link to
4530 *
4531 * Removes the link to a device which is upper to this one. The caller must hold
4532 * the RTNL lock.
4533 */
4534 void netdev_upper_dev_unlink(struct net_device *dev,
4535 struct net_device *upper_dev)
4536 {
4537 struct netdev_upper *upper;
4538
4539 ASSERT_RTNL();
4540
4541 upper = __netdev_find_upper(dev, upper_dev);
4542 if (!upper)
4543 return;
4544 list_del_rcu(&upper->list);
4545 dev_put(upper_dev);
4546 kfree_rcu(upper, rcu);
4547 }
4548 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4549
4550 static void dev_change_rx_flags(struct net_device *dev, int flags)
4551 {
4552 const struct net_device_ops *ops = dev->netdev_ops;
4553
4554 if (ops->ndo_change_rx_flags)
4555 ops->ndo_change_rx_flags(dev, flags);
4556 }
4557
4558 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4559 {
4560 unsigned int old_flags = dev->flags;
4561 kuid_t uid;
4562 kgid_t gid;
4563
4564 ASSERT_RTNL();
4565
4566 dev->flags |= IFF_PROMISC;
4567 dev->promiscuity += inc;
4568 if (dev->promiscuity == 0) {
4569 /*
4570 * Avoid overflow.
4571 * If inc causes overflow, leave promiscuity unchanged and return an error.
4572 */
4573 if (inc < 0)
4574 dev->flags &= ~IFF_PROMISC;
4575 else {
4576 dev->promiscuity -= inc;
4577 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4578 dev->name);
4579 return -EOVERFLOW;
4580 }
4581 }
4582 if (dev->flags != old_flags) {
4583 pr_info("device %s %s promiscuous mode\n",
4584 dev->name,
4585 dev->flags & IFF_PROMISC ? "entered" : "left");
4586 if (audit_enabled) {
4587 current_uid_gid(&uid, &gid);
4588 audit_log(current->audit_context, GFP_ATOMIC,
4589 AUDIT_ANOM_PROMISCUOUS,
4590 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4591 dev->name, (dev->flags & IFF_PROMISC),
4592 (old_flags & IFF_PROMISC),
4593 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4594 from_kuid(&init_user_ns, uid),
4595 from_kgid(&init_user_ns, gid),
4596 audit_get_sessionid(current));
4597 }
4598
4599 dev_change_rx_flags(dev, IFF_PROMISC);
4600 }
4601 return 0;
4602 }
4603
4604 /**
4605 * dev_set_promiscuity - update promiscuity count on a device
4606 * @dev: device
4607 * @inc: modifier
4608 *
4609 * Add or remove promiscuity from a device. While the count in the device
4610 * remains above zero the interface remains promiscuous. Once it hits zero
4611 * the device reverts to normal filtering operation. A negative inc
4612 * value is used to drop promiscuity on the device.
4613 * Return 0 if successful or a negative errno code on error.
4614 */
4615 int dev_set_promiscuity(struct net_device *dev, int inc)
4616 {
4617 unsigned int old_flags = dev->flags;
4618 int err;
4619
4620 err = __dev_set_promiscuity(dev, inc);
4621 if (err < 0)
4622 return err;
4623 if (dev->flags != old_flags)
4624 dev_set_rx_mode(dev);
4625 return err;
4626 }
4627 EXPORT_SYMBOL(dev_set_promiscuity);
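/*
 * Illustrative sketch (not part of this file): a packet-capture style user
 * takes one promiscuity reference while capturing and drops it afterwards,
 * holding RTNL around both calls:
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// enter promiscuous mode
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		// drop our reference
 *	rtnl_unlock();
 */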
4628
4629 /**
4630 * dev_set_allmulti - update allmulti count on a device
4631 * @dev: device
4632 * @inc: modifier
4633 *
4634 * Add or remove reception of all multicast frames to a device. While the
4635 * count in the device remains above zero the interface remains listening
4636 * to all interfaces. Once it hits zero the device reverts back to normal
4637 * filtering operation. A negative @inc value is used to drop the counter
4638 * when releasing a resource needing all multicasts.
4639 * Return 0 if successful or a negative errno code on error.
4640 */
4641
4642 int dev_set_allmulti(struct net_device *dev, int inc)
4643 {
4644 unsigned int old_flags = dev->flags;
4645
4646 ASSERT_RTNL();
4647
4648 dev->flags |= IFF_ALLMULTI;
4649 dev->allmulti += inc;
4650 if (dev->allmulti == 0) {
4651 /*
4652 * Avoid overflow.
4653 * If inc causes overflow, leave allmulti unchanged and return an error.
4654 */
4655 if (inc < 0)
4656 dev->flags &= ~IFF_ALLMULTI;
4657 else {
4658 dev->allmulti -= inc;
4659 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4660 dev->name);
4661 return -EOVERFLOW;
4662 }
4663 }
4664 if (dev->flags ^ old_flags) {
4665 dev_change_rx_flags(dev, IFF_ALLMULTI);
4666 dev_set_rx_mode(dev);
4667 }
4668 return 0;
4669 }
4670 EXPORT_SYMBOL(dev_set_allmulti);
4671
4672 /*
4673 * Upload unicast and multicast address lists to device and
4674 * configure RX filtering. When the device doesn't support unicast
4675 * filtering it is put in promiscuous mode while unicast addresses
4676 * are present.
4677 */
4678 void __dev_set_rx_mode(struct net_device *dev)
4679 {
4680 const struct net_device_ops *ops = dev->netdev_ops;
4681
4682 /* dev_open will call this function so the list will stay sane. */
4683 if (!(dev->flags&IFF_UP))
4684 return;
4685
4686 if (!netif_device_present(dev))
4687 return;
4688
4689 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4690 /* Unicast address changes may only happen under the rtnl,
4691 * therefore calling __dev_set_promiscuity here is safe.
4692 */
4693 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4694 __dev_set_promiscuity(dev, 1);
4695 dev->uc_promisc = true;
4696 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4697 __dev_set_promiscuity(dev, -1);
4698 dev->uc_promisc = false;
4699 }
4700 }
4701
4702 if (ops->ndo_set_rx_mode)
4703 ops->ndo_set_rx_mode(dev);
4704 }
4705 EXPORT_SYMBOL(__dev_set_rx_mode);
4706
4707 void dev_set_rx_mode(struct net_device *dev)
4708 {
4709 netif_addr_lock_bh(dev);
4710 __dev_set_rx_mode(dev);
4711 netif_addr_unlock_bh(dev);
4712 }
4713
4714 /**
4715 * dev_get_flags - get flags reported to userspace
4716 * @dev: device
4717 *
4718 * Get the combination of flag bits exported through APIs to userspace.
4719 */
4720 unsigned int dev_get_flags(const struct net_device *dev)
4721 {
4722 unsigned int flags;
4723
4724 flags = (dev->flags & ~(IFF_PROMISC |
4725 IFF_ALLMULTI |
4726 IFF_RUNNING |
4727 IFF_LOWER_UP |
4728 IFF_DORMANT)) |
4729 (dev->gflags & (IFF_PROMISC |
4730 IFF_ALLMULTI));
4731
4732 if (netif_running(dev)) {
4733 if (netif_oper_up(dev))
4734 flags |= IFF_RUNNING;
4735 if (netif_carrier_ok(dev))
4736 flags |= IFF_LOWER_UP;
4737 if (netif_dormant(dev))
4738 flags |= IFF_DORMANT;
4739 }
4740
4741 return flags;
4742 }
4743 EXPORT_SYMBOL(dev_get_flags);
4744
4745 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4746 {
4747 unsigned int old_flags = dev->flags;
4748 int ret;
4749
4750 ASSERT_RTNL();
4751
4752 /*
4753 * Set the flags on our device.
4754 */
4755
4756 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4757 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4758 IFF_AUTOMEDIA)) |
4759 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4760 IFF_ALLMULTI));
4761
4762 /*
4763 * Load in the correct multicast list now the flags have changed.
4764 */
4765
4766 if ((old_flags ^ flags) & IFF_MULTICAST)
4767 dev_change_rx_flags(dev, IFF_MULTICAST);
4768
4769 dev_set_rx_mode(dev);
4770
4771 /*
4772 * Have we downed the interface? We handle IFF_UP ourselves
4773 * according to user attempts to set it, rather than blindly
4774 * setting it.
4775 */
4776
4777 ret = 0;
4778 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4779 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4780
4781 if (!ret)
4782 dev_set_rx_mode(dev);
4783 }
4784
4785 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4786 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4787
4788 dev->gflags ^= IFF_PROMISC;
4789 dev_set_promiscuity(dev, inc);
4790 }
4791
4792 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4793 is important. Some (broken) drivers set IFF_PROMISC when
4794 IFF_ALLMULTI is requested, without asking us and without reporting it.
4795 */
4796 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4797 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4798
4799 dev->gflags ^= IFF_ALLMULTI;
4800 dev_set_allmulti(dev, inc);
4801 }
4802
4803 return ret;
4804 }
4805
4806 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4807 {
4808 unsigned int changes = dev->flags ^ old_flags;
4809
4810 if (changes & IFF_UP) {
4811 if (dev->flags & IFF_UP)
4812 call_netdevice_notifiers(NETDEV_UP, dev);
4813 else
4814 call_netdevice_notifiers(NETDEV_DOWN, dev);
4815 }
4816
4817 if (dev->flags & IFF_UP &&
4818 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4819 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4820 }
4821
4822 /**
4823 * dev_change_flags - change device settings
4824 * @dev: device
4825 * @flags: device state flags
4826 *
4827 * Change settings on a device based on the state flags. The flags are
4828 * in the userspace exported format.
4829 */
4830 int dev_change_flags(struct net_device *dev, unsigned int flags)
4831 {
4832 int ret;
4833 unsigned int changes, old_flags = dev->flags;
4834
4835 ret = __dev_change_flags(dev, flags);
4836 if (ret < 0)
4837 return ret;
4838
4839 changes = old_flags ^ dev->flags;
4840 if (changes)
4841 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4842
4843 __dev_notify_flags(dev, old_flags);
4844 return ret;
4845 }
4846 EXPORT_SYMBOL(dev_change_flags);
4847
4848 /**
4849 * dev_set_mtu - Change maximum transfer unit
4850 * @dev: device
4851 * @new_mtu: new transfer unit
4852 *
4853 * Change the maximum transfer size of the network device.
4854 */
4855 int dev_set_mtu(struct net_device *dev, int new_mtu)
4856 {
4857 const struct net_device_ops *ops = dev->netdev_ops;
4858 int err;
4859
4860 if (new_mtu == dev->mtu)
4861 return 0;
4862
4863 /* MTU must be positive. */
4864 if (new_mtu < 0)
4865 return -EINVAL;
4866
4867 if (!netif_device_present(dev))
4868 return -ENODEV;
4869
4870 err = 0;
4871 if (ops->ndo_change_mtu)
4872 err = ops->ndo_change_mtu(dev, new_mtu);
4873 else
4874 dev->mtu = new_mtu;
4875
4876 if (!err)
4877 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4878 return err;
4879 }
4880 EXPORT_SYMBOL(dev_set_mtu);
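/*
 * Illustrative sketch (hypothetical caller, not from this file): changing the
 * MTU from kernel code mirrors the SIOCSIFMTU ioctl path, i.e. the call is
 * made under RTNL and a negative errno is propagated:
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);	// e.g. enable jumbo frames
 *	rtnl_unlock();
 *	if (err)
 *		pr_warn("failed to set MTU: %d\n", err);
 */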
4881
4882 /**
4883 * dev_set_group - Change group this device belongs to
4884 * @dev: device
4885 * @new_group: group this device should belong to
4886 */
4887 void dev_set_group(struct net_device *dev, int new_group)
4888 {
4889 dev->group = new_group;
4890 }
4891 EXPORT_SYMBOL(dev_set_group);
4892
4893 /**
4894 * dev_set_mac_address - Change Media Access Control Address
4895 * @dev: device
4896 * @sa: new address
4897 *
4898 * Change the hardware (MAC) address of the device
4899 */
4900 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4901 {
4902 const struct net_device_ops *ops = dev->netdev_ops;
4903 int err;
4904
4905 if (!ops->ndo_set_mac_address)
4906 return -EOPNOTSUPP;
4907 if (sa->sa_family != dev->type)
4908 return -EINVAL;
4909 if (!netif_device_present(dev))
4910 return -ENODEV;
4911 err = ops->ndo_set_mac_address(dev, sa);
4912 if (err)
4913 return err;
4914 dev->addr_assign_type = NET_ADDR_SET;
4915 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4916 add_device_randomness(dev->dev_addr, dev->addr_len);
4917 return 0;
4918 }
4919 EXPORT_SYMBOL(dev_set_mac_address);
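/*
 * Illustrative sketch (not part of this file): setting a MAC address from
 * kernel code fills in a struct sockaddr whose sa_family matches dev->type
 * (ARPHRD_ETHER for Ethernet) and calls dev_set_mac_address() under RTNL.
 * new_mac is assumed to be a u8[ETH_ALEN] supplied by the caller.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, ETH_ALEN);
 *
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */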
4920
4921 /**
4922 * dev_change_carrier - Change device carrier
4923 * @dev: device
4924 * @new_carrier: new value
4925 *
4926 * Change device carrier
4927 */
4928 int dev_change_carrier(struct net_device *dev, bool new_carrier)
4929 {
4930 const struct net_device_ops *ops = dev->netdev_ops;
4931
4932 if (!ops->ndo_change_carrier)
4933 return -EOPNOTSUPP;
4934 if (!netif_device_present(dev))
4935 return -ENODEV;
4936 return ops->ndo_change_carrier(dev, new_carrier);
4937 }
4938 EXPORT_SYMBOL(dev_change_carrier);
4939
4940 /**
4941 * dev_new_index - allocate an ifindex
4942 * @net: the applicable net namespace
4943 *
4944 * Returns a suitable unique value for a new device interface
4945 * number. The caller must hold the rtnl semaphore or the
4946 * dev_base_lock to be sure it remains unique.
4947 */
4948 static int dev_new_index(struct net *net)
4949 {
4950 int ifindex = net->ifindex;
4951 for (;;) {
4952 if (++ifindex <= 0)
4953 ifindex = 1;
4954 if (!__dev_get_by_index(net, ifindex))
4955 return net->ifindex = ifindex;
4956 }
4957 }
4958
4959 /* Delayed registration/unregistration */
4960 static LIST_HEAD(net_todo_list);
4961
4962 static void net_set_todo(struct net_device *dev)
4963 {
4964 list_add_tail(&dev->todo_list, &net_todo_list);
4965 }
4966
4967 static void rollback_registered_many(struct list_head *head)
4968 {
4969 struct net_device *dev, *tmp;
4970
4971 BUG_ON(dev_boot_phase);
4972 ASSERT_RTNL();
4973
4974 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4975 /* Some devices reach here without ever having been
4976 * registered, as part of initialization unwind. Remove
4977 * those devices and proceed with the remaining.
4978 */
4979 if (dev->reg_state == NETREG_UNINITIALIZED) {
4980 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4981 dev->name, dev);
4982
4983 WARN_ON(1);
4984 list_del(&dev->unreg_list);
4985 continue;
4986 }
4987 dev->dismantle = true;
4988 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4989 }
4990
4991 /* If device is running, close it first. */
4992 dev_close_many(head);
4993
4994 list_for_each_entry(dev, head, unreg_list) {
4995 /* And unlink it from device chain. */
4996 unlist_netdevice(dev);
4997
4998 dev->reg_state = NETREG_UNREGISTERING;
4999 }
5000
5001 synchronize_net();
5002
5003 list_for_each_entry(dev, head, unreg_list) {
5004 /* Shutdown queueing discipline. */
5005 dev_shutdown(dev);
5006
5007
5008 /* Notify protocols that we are about to destroy
5009 this device. They should clean up all their state.
5010 */
5011 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5012
5013 if (!dev->rtnl_link_ops ||
5014 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5015 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5016
5017 /*
5018 * Flush the unicast and multicast chains
5019 */
5020 dev_uc_flush(dev);
5021 dev_mc_flush(dev);
5022
5023 if (dev->netdev_ops->ndo_uninit)
5024 dev->netdev_ops->ndo_uninit(dev);
5025
5026 /* The notifier chain MUST detach all our upper devices. */
5027 WARN_ON(netdev_has_any_upper_dev(dev));
5028
5029 /* Remove entries from kobject tree */
5030 netdev_unregister_kobject(dev);
5031 #ifdef CONFIG_XPS
5032 /* Remove XPS queueing entries */
5033 netif_reset_xps_queues_gt(dev, 0);
5034 #endif
5035 }
5036
5037 synchronize_net();
5038
5039 list_for_each_entry(dev, head, unreg_list)
5040 dev_put(dev);
5041 }
5042
5043 static void rollback_registered(struct net_device *dev)
5044 {
5045 LIST_HEAD(single);
5046
5047 list_add(&dev->unreg_list, &single);
5048 rollback_registered_many(&single);
5049 list_del(&single);
5050 }
5051
5052 static netdev_features_t netdev_fix_features(struct net_device *dev,
5053 netdev_features_t features)
5054 {
5055 /* Fix illegal checksum combinations */
5056 if ((features & NETIF_F_HW_CSUM) &&
5057 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5058 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5059 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5060 }
5061
5062 /* TSO requires that SG is present as well. */
5063 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5064 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5065 features &= ~NETIF_F_ALL_TSO;
5066 }
5067
5068 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5069 !(features & NETIF_F_IP_CSUM)) {
5070 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5071 features &= ~NETIF_F_TSO;
5072 features &= ~NETIF_F_TSO_ECN;
5073 }
5074
5075 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5076 !(features & NETIF_F_IPV6_CSUM)) {
5077 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5078 features &= ~NETIF_F_TSO6;
5079 }
5080
5081 /* TSO ECN requires that TSO is present as well. */
5082 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5083 features &= ~NETIF_F_TSO_ECN;
5084
5085 /* Software GSO depends on SG. */
5086 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5087 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5088 features &= ~NETIF_F_GSO;
5089 }
5090
5091 /* UFO needs SG and checksumming */
5092 if (features & NETIF_F_UFO) {
5093 /* maybe split UFO into V4 and V6? */
5094 if (!((features & NETIF_F_GEN_CSUM) ||
5095 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5096 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5097 netdev_dbg(dev,
5098 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5099 features &= ~NETIF_F_UFO;
5100 }
5101
5102 if (!(features & NETIF_F_SG)) {
5103 netdev_dbg(dev,
5104 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5105 features &= ~NETIF_F_UFO;
5106 }
5107 }
5108
5109 return features;
5110 }
5111
5112 int __netdev_update_features(struct net_device *dev)
5113 {
5114 netdev_features_t features;
5115 int err = 0;
5116
5117 ASSERT_RTNL();
5118
5119 features = netdev_get_wanted_features(dev);
5120
5121 if (dev->netdev_ops->ndo_fix_features)
5122 features = dev->netdev_ops->ndo_fix_features(dev, features);
5123
5124 /* driver might be less strict about feature dependencies */
5125 features = netdev_fix_features(dev, features);
5126
5127 if (dev->features == features)
5128 return 0;
5129
5130 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5131 &dev->features, &features);
5132
5133 if (dev->netdev_ops->ndo_set_features)
5134 err = dev->netdev_ops->ndo_set_features(dev, features);
5135
5136 if (unlikely(err < 0)) {
5137 netdev_err(dev,
5138 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5139 err, &features, &dev->features);
5140 return -1;
5141 }
5142
5143 if (!err)
5144 dev->features = features;
5145
5146 return 1;
5147 }
5148
5149 /**
5150 * netdev_update_features - recalculate device features
5151 * @dev: the device to check
5152 *
5153 * Recalculate dev->features set and send notifications if it
5154 * has changed. Should be called after driver or hardware dependent
5155 * conditions might have changed that influence the features.
5156 */
5157 void netdev_update_features(struct net_device *dev)
5158 {
5159 if (__netdev_update_features(dev))
5160 netdev_features_change(dev);
5161 }
5162 EXPORT_SYMBOL(netdev_update_features);
5163
5164 /**
5165 * netdev_change_features - recalculate device features
5166 * @dev: the device to check
5167 *
5168 * Recalculate dev->features set and send notifications even
5169 * if they have not changed. Should be called instead of
5170 * netdev_update_features() if also dev->vlan_features might
5171 * have changed to allow the changes to be propagated to stacked
5172 * VLAN devices.
5173 */
5174 void netdev_change_features(struct net_device *dev)
5175 {
5176 __netdev_update_features(dev);
5177 netdev_features_change(dev);
5178 }
5179 EXPORT_SYMBOL(netdev_change_features);
5180
5181 /**
5182 * netif_stacked_transfer_operstate - transfer operstate
5183 * @rootdev: the root or lower level device to transfer state from
5184 * @dev: the device to transfer operstate to
5185 *
5186 * Transfer operational state from root to device. This is normally
5187 * called when a stacking relationship exists between the root
5188 * device and the device(a leaf device).
5189 */
5190 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5191 struct net_device *dev)
5192 {
5193 if (rootdev->operstate == IF_OPER_DORMANT)
5194 netif_dormant_on(dev);
5195 else
5196 netif_dormant_off(dev);
5197
5198 if (netif_carrier_ok(rootdev)) {
5199 if (!netif_carrier_ok(dev))
5200 netif_carrier_on(dev);
5201 } else {
5202 if (netif_carrier_ok(dev))
5203 netif_carrier_off(dev);
5204 }
5205 }
5206 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5207
5208 #ifdef CONFIG_RPS
5209 static int netif_alloc_rx_queues(struct net_device *dev)
5210 {
5211 unsigned int i, count = dev->num_rx_queues;
5212 struct netdev_rx_queue *rx;
5213
5214 BUG_ON(count < 1);
5215
5216 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5217 if (!rx)
5218 return -ENOMEM;
5219
5220 dev->_rx = rx;
5221
5222 for (i = 0; i < count; i++)
5223 rx[i].dev = dev;
5224 return 0;
5225 }
5226 #endif
5227
5228 static void netdev_init_one_queue(struct net_device *dev,
5229 struct netdev_queue *queue, void *_unused)
5230 {
5231 /* Initialize queue lock */
5232 spin_lock_init(&queue->_xmit_lock);
5233 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5234 queue->xmit_lock_owner = -1;
5235 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5236 queue->dev = dev;
5237 #ifdef CONFIG_BQL
5238 dql_init(&queue->dql, HZ);
5239 #endif
5240 }
5241
5242 static int netif_alloc_netdev_queues(struct net_device *dev)
5243 {
5244 unsigned int count = dev->num_tx_queues;
5245 struct netdev_queue *tx;
5246
5247 BUG_ON(count < 1);
5248
5249 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5250 if (!tx)
5251 return -ENOMEM;
5252
5253 dev->_tx = tx;
5254
5255 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5256 spin_lock_init(&dev->tx_global_lock);
5257
5258 return 0;
5259 }
5260
5261 /**
5262 * register_netdevice - register a network device
5263 * @dev: device to register
5264 *
5265 * Take a completed network device structure and add it to the kernel
5266 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5267 * chain. 0 is returned on success. A negative errno code is returned
5268 * on a failure to set up the device, or if the name is a duplicate.
5269 *
5270 * Callers must hold the rtnl semaphore. You may want
5271 * register_netdev() instead of this.
5272 *
5273 * BUGS:
5274 * The locking appears insufficient to guarantee two parallel registers
5275 * will not get the same name.
5276 */
5277
5278 int register_netdevice(struct net_device *dev)
5279 {
5280 int ret;
5281 struct net *net = dev_net(dev);
5282
5283 BUG_ON(dev_boot_phase);
5284 ASSERT_RTNL();
5285
5286 might_sleep();
5287
5288 /* When net_device's are persistent, this will be fatal. */
5289 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5290 BUG_ON(!net);
5291
5292 spin_lock_init(&dev->addr_list_lock);
5293 netdev_set_addr_lockdep_class(dev);
5294
5295 dev->iflink = -1;
5296
5297 ret = dev_get_valid_name(net, dev, dev->name);
5298 if (ret < 0)
5299 goto out;
5300
5301 /* Init, if this function is available */
5302 if (dev->netdev_ops->ndo_init) {
5303 ret = dev->netdev_ops->ndo_init(dev);
5304 if (ret) {
5305 if (ret > 0)
5306 ret = -EIO;
5307 goto out;
5308 }
5309 }
5310
5311 if (((dev->hw_features | dev->features) &
5312 NETIF_F_HW_VLAN_CTAG_FILTER) &&
5313 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5314 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5315 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5316 ret = -EINVAL;
5317 goto err_uninit;
5318 }
5319
5320 ret = -EBUSY;
5321 if (!dev->ifindex)
5322 dev->ifindex = dev_new_index(net);
5323 else if (__dev_get_by_index(net, dev->ifindex))
5324 goto err_uninit;
5325
5326 if (dev->iflink == -1)
5327 dev->iflink = dev->ifindex;
5328
5329 /* Transfer changeable features to wanted_features and enable
5330 * software offloads (GSO and GRO).
5331 */
5332 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5333 dev->features |= NETIF_F_SOFT_FEATURES;
5334 dev->wanted_features = dev->features & dev->hw_features;
5335
5336 /* Turn on no cache copy if HW is doing checksum */
5337 if (!(dev->flags & IFF_LOOPBACK)) {
5338 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5339 if (dev->features & NETIF_F_ALL_CSUM) {
5340 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5341 dev->features |= NETIF_F_NOCACHE_COPY;
5342 }
5343 }
5344
5345 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5346 */
5347 dev->vlan_features |= NETIF_F_HIGHDMA;
5348
5349 /* Make NETIF_F_SG inheritable to tunnel devices.
5350 */
5351 dev->hw_enc_features |= NETIF_F_SG;
5352
5353 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5354 ret = notifier_to_errno(ret);
5355 if (ret)
5356 goto err_uninit;
5357
5358 ret = netdev_register_kobject(dev);
5359 if (ret)
5360 goto err_uninit;
5361 dev->reg_state = NETREG_REGISTERED;
5362
5363 __netdev_update_features(dev);
5364
5365 /*
5366 * Default initial state at registry is that the
5367 * device is present.
5368 */
5369
5370 set_bit(__LINK_STATE_PRESENT, &dev->state);
5371
5372 linkwatch_init_dev(dev);
5373
5374 dev_init_scheduler(dev);
5375 dev_hold(dev);
5376 list_netdevice(dev);
5377 add_device_randomness(dev->dev_addr, dev->addr_len);
5378
5379 /* If the device has a permanent device address, the driver should
5380 * set dev_addr, and addr_assign_type should be set to
5381 * NET_ADDR_PERM (the default value).
5382 */
5383 if (dev->addr_assign_type == NET_ADDR_PERM)
5384 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5385
5386 /* Notify protocols, that a new device appeared. */
5387 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5388 ret = notifier_to_errno(ret);
5389 if (ret) {
5390 rollback_registered(dev);
5391 dev->reg_state = NETREG_UNREGISTERED;
5392 }
5393 /*
5394 * Prevent userspace races by waiting until the network
5395 * device is fully setup before sending notifications.
5396 */
5397 if (!dev->rtnl_link_ops ||
5398 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5399 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5400
5401 out:
5402 return ret;
5403
5404 err_uninit:
5405 if (dev->netdev_ops->ndo_uninit)
5406 dev->netdev_ops->ndo_uninit(dev);
5407 goto out;
5408 }
5409 EXPORT_SYMBOL(register_netdevice);
5410
5411 /**
5412 * init_dummy_netdev - init a dummy network device for NAPI
5413 * @dev: device to init
5414 *
5415 * This takes a network device structure and initializes the minimum
5416 * amount of fields so it can be used to schedule NAPI polls without
5417 * registering a full blown interface. This is to be used by drivers
5418 * that need to tie several hardware interfaces to a single NAPI
5419 * poll scheduler due to HW limitations.
5420 */
5421 int init_dummy_netdev(struct net_device *dev)
5422 {
5423 /* Clear everything. Note we don't initialize spinlocks
5424 * as they aren't supposed to be taken by any of the
5425 * NAPI code, and this dummy netdev is supposed to be
5426 * used only for NAPI polls.
5427 */
5428 memset(dev, 0, sizeof(struct net_device));
5429
5430 /* make sure we BUG if trying to hit standard
5431 * register/unregister code path
5432 */
5433 dev->reg_state = NETREG_DUMMY;
5434
5435 /* NAPI wants this */
5436 INIT_LIST_HEAD(&dev->napi_list);
5437
5438 /* a dummy interface is started by default */
5439 set_bit(__LINK_STATE_PRESENT, &dev->state);
5440 set_bit(__LINK_STATE_START, &dev->state);
5441
5442 /* Note: We don't allocate pcpu_refcnt for dummy devices,
5443 * because users of this 'device' don't need to change
5444 * its refcount.
5445 */
5446
5447 return 0;
5448 }
5449 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5450
5451
5452 /**
5453 * register_netdev - register a network device
5454 * @dev: device to register
5455 *
5456 * Take a completed network device structure and add it to the kernel
5457 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5458 * chain. 0 is returned on success. A negative errno code is returned
5459 * on a failure to set up the device, or if the name is a duplicate.
5460 *
5461 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5462 * and expands the device name if you passed a format string to
5463 * alloc_netdev.
5464 */
5465 int register_netdev(struct net_device *dev)
5466 {
5467 int err;
5468
5469 rtnl_lock();
5470 err = register_netdevice(dev);
5471 rtnl_unlock();
5472 return err;
5473 }
5474 EXPORT_SYMBOL(register_netdev);
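/*
 * Illustrative sketch (hypothetical driver probe, not part of this file): the
 * usual pattern is to allocate the net_device, fill in the ops and features,
 * and only then call register_netdev(), unwinding with free_netdev() on
 * failure. mydrv_priv and mydrv_netdev_ops are made-up names.
 *
 *	struct net_device *netdev;
 *	int err;
 *
 *	netdev = alloc_etherdev(sizeof(struct mydrv_priv));
 *	if (!netdev)
 *		return -ENOMEM;
 *
 *	netdev->netdev_ops = &mydrv_netdev_ops;
 *
 *	err = register_netdev(netdev);
 *	if (err) {
 *		free_netdev(netdev);
 *		return err;
 *	}
 */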
5475
5476 int netdev_refcnt_read(const struct net_device *dev)
5477 {
5478 int i, refcnt = 0;
5479
5480 for_each_possible_cpu(i)
5481 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5482 return refcnt;
5483 }
5484 EXPORT_SYMBOL(netdev_refcnt_read);
5485
5486 /**
5487 * netdev_wait_allrefs - wait until all references are gone.
5488 * @dev: target net_device
5489 *
5490 * This is called when unregistering network devices.
5491 *
5492 * Any protocol or device that holds a reference should register
5493 * for netdevice notification, and cleanup and put back the
5494 * reference if they receive an UNREGISTER event.
5495 * We can get stuck here if buggy protocols don't correctly
5496 * call dev_put.
5497 */
5498 static void netdev_wait_allrefs(struct net_device *dev)
5499 {
5500 unsigned long rebroadcast_time, warning_time;
5501 int refcnt;
5502
5503 linkwatch_forget_dev(dev);
5504
5505 rebroadcast_time = warning_time = jiffies;
5506 refcnt = netdev_refcnt_read(dev);
5507
5508 while (refcnt != 0) {
5509 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5510 rtnl_lock();
5511
5512 /* Rebroadcast unregister notification */
5513 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5514
5515 __rtnl_unlock();
5516 rcu_barrier();
5517 rtnl_lock();
5518
5519 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5520 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5521 &dev->state)) {
5522 /* We must not have linkwatch events
5523 * pending on unregister. If this
5524 * happens, we simply run the queue
5525 * unscheduled, resulting in a noop
5526 * for this device.
5527 */
5528 linkwatch_run_queue();
5529 }
5530
5531 __rtnl_unlock();
5532
5533 rebroadcast_time = jiffies;
5534 }
5535
5536 msleep(250);
5537
5538 refcnt = netdev_refcnt_read(dev);
5539
5540 if (time_after(jiffies, warning_time + 10 * HZ)) {
5541 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5542 dev->name, refcnt);
5543 warning_time = jiffies;
5544 }
5545 }
5546 }
5547
5548 /* The sequence is:
5549 *
5550 * rtnl_lock();
5551 * ...
5552 * register_netdevice(x1);
5553 * register_netdevice(x2);
5554 * ...
5555 * unregister_netdevice(y1);
5556 * unregister_netdevice(y2);
5557 * ...
5558 * rtnl_unlock();
5559 * free_netdev(y1);
5560 * free_netdev(y2);
5561 *
5562 * We are invoked by rtnl_unlock().
5563 * This allows us to deal with problems:
5564 * 1) We can delete sysfs objects which invoke hotplug
5565 * without deadlocking with linkwatch via keventd.
5566 * 2) Since we run with the RTNL semaphore not held, we can sleep
5567 * safely in order to wait for the netdev refcnt to drop to zero.
5568 *
5569 * We must not return until all unregister events added during
5570 * the interval the lock was held have been completed.
5571 */
5572 void netdev_run_todo(void)
5573 {
5574 struct list_head list;
5575
5576 /* Snapshot list, allow later requests */
5577 list_replace_init(&net_todo_list, &list);
5578
5579 __rtnl_unlock();
5580
5581
5582 /* Wait for rcu callbacks to finish before next phase */
5583 if (!list_empty(&list))
5584 rcu_barrier();
5585
5586 while (!list_empty(&list)) {
5587 struct net_device *dev
5588 = list_first_entry(&list, struct net_device, todo_list);
5589 list_del(&dev->todo_list);
5590
5591 rtnl_lock();
5592 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5593 __rtnl_unlock();
5594
5595 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5596 pr_err("network todo '%s' but state %d\n",
5597 dev->name, dev->reg_state);
5598 dump_stack();
5599 continue;
5600 }
5601
5602 dev->reg_state = NETREG_UNREGISTERED;
5603
5604 on_each_cpu(flush_backlog, dev, 1);
5605
5606 netdev_wait_allrefs(dev);
5607
5608 /* paranoia */
5609 BUG_ON(netdev_refcnt_read(dev));
5610 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5611 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5612 WARN_ON(dev->dn_ptr);
5613
5614 if (dev->destructor)
5615 dev->destructor(dev);
5616
5617 /* Free network device */
5618 kobject_put(&dev->dev.kobj);
5619 }
5620 }
5621
5622 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5623 * fields in the same order, with only the type differing.
5624 */
5625 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5626 const struct net_device_stats *netdev_stats)
5627 {
5628 #if BITS_PER_LONG == 64
5629 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5630 memcpy(stats64, netdev_stats, sizeof(*stats64));
5631 #else
5632 size_t i, n = sizeof(*stats64) / sizeof(u64);
5633 const unsigned long *src = (const unsigned long *)netdev_stats;
5634 u64 *dst = (u64 *)stats64;
5635
5636 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5637 sizeof(*stats64) / sizeof(u64));
5638 for (i = 0; i < n; i++)
5639 dst[i] = src[i];
5640 #endif
5641 }
5642 EXPORT_SYMBOL(netdev_stats_to_stats64);
5643
5644 /**
5645 * dev_get_stats - get network device statistics
5646 * @dev: device to get statistics from
5647 * @storage: place to store stats
5648 *
5649 * Get network statistics from device. Return @storage.
5650 * The device driver may provide its own method by setting
5651 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5652 * otherwise the internal statistics structure is used.
5653 */
5654 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5655 struct rtnl_link_stats64 *storage)
5656 {
5657 const struct net_device_ops *ops = dev->netdev_ops;
5658
5659 if (ops->ndo_get_stats64) {
5660 memset(storage, 0, sizeof(*storage));
5661 ops->ndo_get_stats64(dev, storage);
5662 } else if (ops->ndo_get_stats) {
5663 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5664 } else {
5665 netdev_stats_to_stats64(storage, &dev->stats);
5666 }
5667 storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
5668 return storage;
5669 }
5670 EXPORT_SYMBOL(dev_get_stats);
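/*
 * Illustrative sketch (not part of this file): callers supply the
 * rtnl_link_stats64 storage themselves, typically on the stack, and get the
 * same pointer back:
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: %llu rx packets, %llu tx packets\n",
 *		dev->name, stats.rx_packets, stats.tx_packets);
 */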
5671
5672 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5673 {
5674 struct netdev_queue *queue = dev_ingress_queue(dev);
5675
5676 #ifdef CONFIG_NET_CLS_ACT
5677 if (queue)
5678 return queue;
5679 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5680 if (!queue)
5681 return NULL;
5682 netdev_init_one_queue(dev, queue, NULL);
5683 queue->qdisc = &noop_qdisc;
5684 queue->qdisc_sleeping = &noop_qdisc;
5685 rcu_assign_pointer(dev->ingress_queue, queue);
5686 #endif
5687 return queue;
5688 }
5689
5690 static const struct ethtool_ops default_ethtool_ops;
5691
5692 void netdev_set_default_ethtool_ops(struct net_device *dev,
5693 const struct ethtool_ops *ops)
5694 {
5695 if (dev->ethtool_ops == &default_ethtool_ops)
5696 dev->ethtool_ops = ops;
5697 }
5698 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5699
5700 /**
5701 * alloc_netdev_mqs - allocate network device
5702 * @sizeof_priv: size of private data to allocate space for
5703 * @name: device name format string
5704 * @setup: callback to initialize device
5705 * @txqs: the number of TX subqueues to allocate
5706 * @rxqs: the number of RX subqueues to allocate
5707 *
5708 * Allocates a struct net_device with private data area for driver use
5709 * and performs basic initialization. Also allocates subquue structs
5710 * for each queue on the device.
5711 */
5712 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5713 void (*setup)(struct net_device *),
5714 unsigned int txqs, unsigned int rxqs)
5715 {
5716 struct net_device *dev;
5717 size_t alloc_size;
5718 struct net_device *p;
5719
5720 BUG_ON(strlen(name) >= sizeof(dev->name));
5721
5722 if (txqs < 1) {
5723 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5724 return NULL;
5725 }
5726
5727 #ifdef CONFIG_RPS
5728 if (rxqs < 1) {
5729 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5730 return NULL;
5731 }
5732 #endif
5733
5734 alloc_size = sizeof(struct net_device);
5735 if (sizeof_priv) {
5736 /* ensure 32-byte alignment of private area */
5737 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5738 alloc_size += sizeof_priv;
5739 }
5740 /* ensure 32-byte alignment of whole construct */
5741 alloc_size += NETDEV_ALIGN - 1;
5742
5743 p = kzalloc(alloc_size, GFP_KERNEL);
5744 if (!p)
5745 return NULL;
5746
5747 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5748 dev->padded = (char *)dev - (char *)p;
5749
5750 dev->pcpu_refcnt = alloc_percpu(int);
5751 if (!dev->pcpu_refcnt)
5752 goto free_p;
5753
5754 if (dev_addr_init(dev))
5755 goto free_pcpu;
5756
5757 dev_mc_init(dev);
5758 dev_uc_init(dev);
5759
5760 dev_net_set(dev, &init_net);
5761
5762 dev->gso_max_size = GSO_MAX_SIZE;
5763 dev->gso_max_segs = GSO_MAX_SEGS;
5764
5765 INIT_LIST_HEAD(&dev->napi_list);
5766 INIT_LIST_HEAD(&dev->unreg_list);
5767 INIT_LIST_HEAD(&dev->link_watch_list);
5768 INIT_LIST_HEAD(&dev->upper_dev_list);
5769 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5770 setup(dev);
5771
5772 dev->num_tx_queues = txqs;
5773 dev->real_num_tx_queues = txqs;
5774 if (netif_alloc_netdev_queues(dev))
5775 goto free_all;
5776
5777 #ifdef CONFIG_RPS
5778 dev->num_rx_queues = rxqs;
5779 dev->real_num_rx_queues = rxqs;
5780 if (netif_alloc_rx_queues(dev))
5781 goto free_all;
5782 #endif
5783
5784 strcpy(dev->name, name);
5785 dev->group = INIT_NETDEV_GROUP;
5786 if (!dev->ethtool_ops)
5787 dev->ethtool_ops = &default_ethtool_ops;
5788 return dev;
5789
5790 free_all:
5791 free_netdev(dev);
5792 return NULL;
5793
5794 free_pcpu:
5795 free_percpu(dev->pcpu_refcnt);
5796 kfree(dev->_tx);
5797 #ifdef CONFIG_RPS
5798 kfree(dev->_rx);
5799 #endif
5800
5801 free_p:
5802 kfree(p);
5803 return NULL;
5804 }
5805 EXPORT_SYMBOL(alloc_netdev_mqs);
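/*
 * Illustrative sketch (hypothetical, not from this file): a multiqueue
 * Ethernet driver would normally allocate its device through the
 * alloc_etherdev_mq()/alloc_etherdev_mqs() wrappers, which boil down to this
 * call with ether_setup() as the setup callback:
 *
 *	// 8 TX queues and 8 RX queues, "eth%d" name template
 *	dev = alloc_netdev_mqs(sizeof(struct mydrv_priv), "eth%d",
 *			       ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */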
5806
5807 /**
5808 * free_netdev - free network device
5809 * @dev: device
5810 *
5811 * This function does the last stage of destroying an allocated device
5812 * interface. The reference to the device object is released.
5813 * If this is the last reference then it will be freed.
5814 */
5815 void free_netdev(struct net_device *dev)
5816 {
5817 struct napi_struct *p, *n;
5818
5819 release_net(dev_net(dev));
5820
5821 kfree(dev->_tx);
5822 #ifdef CONFIG_RPS
5823 kfree(dev->_rx);
5824 #endif
5825
5826 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5827
5828 /* Flush device addresses */
5829 dev_addr_flush(dev);
5830
5831 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5832 netif_napi_del(p);
5833
5834 free_percpu(dev->pcpu_refcnt);
5835 dev->pcpu_refcnt = NULL;
5836
5837 /* Compatibility with error handling in drivers */
5838 if (dev->reg_state == NETREG_UNINITIALIZED) {
5839 kfree((char *)dev - dev->padded);
5840 return;
5841 }
5842
5843 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5844 dev->reg_state = NETREG_RELEASED;
5845
5846 /* will free via device release */
5847 put_device(&dev->dev);
5848 }
5849 EXPORT_SYMBOL(free_netdev);
5850
5851 /**
5852 * synchronize_net - Synchronize with packet receive processing
5853 *
5854 * Wait for packets currently being received to be done.
5855 * Does not block later packets from starting.
5856 */
5857 void synchronize_net(void)
5858 {
5859 might_sleep();
5860 if (rtnl_is_locked())
5861 synchronize_rcu_expedited();
5862 else
5863 synchronize_rcu();
5864 }
5865 EXPORT_SYMBOL(synchronize_net);
5866
5867 /**
5868 * unregister_netdevice_queue - remove device from the kernel
5869 * @dev: device
5870 * @head: list
5871 *
5872 * This function shuts down a device interface and removes it
5873 * from the kernel tables.
5874 * If head not NULL, device is queued to be unregistered later.
5875 *
5876 * Callers must hold the rtnl semaphore. You may want
5877 * unregister_netdev() instead of this.
5878 */
5879
5880 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5881 {
5882 ASSERT_RTNL();
5883
5884 if (head) {
5885 list_move_tail(&dev->unreg_list, head);
5886 } else {
5887 rollback_registered(dev);
5888 /* Finish processing unregister after unlock */
5889 net_set_todo(dev);
5890 }
5891 }
5892 EXPORT_SYMBOL(unregister_netdevice_queue);
5893
5894 /**
5895 * unregister_netdevice_many - unregister many devices
5896 * @head: list of devices
5897 *
5898 * Note: As most callers use a stack-allocated list_head,
5899 * we force a list_del() to make sure the stack won't be corrupted later.
5900 */
5901 void unregister_netdevice_many(struct list_head *head)
5902 {
5903 struct net_device *dev;
5904
5905 if (!list_empty(head)) {
5906 rollback_registered_many(head);
5907 list_for_each_entry(dev, head, unreg_list)
5908 net_set_todo(dev);
5909 list_del(head);
5910 }
5911 }
5912 EXPORT_SYMBOL(unregister_netdevice_many);
5913
5914 /**
5915 * unregister_netdev - remove device from the kernel
5916 * @dev: device
5917 *
5918 * This function shuts down a device interface and removes it
5919 * from the kernel tables.
5920 *
5921 * This is just a wrapper for unregister_netdevice that takes
5922 * the rtnl semaphore. In general you want to use this and not
5923 * unregister_netdevice.
5924 */
5925 void unregister_netdev(struct net_device *dev)
5926 {
5927 rtnl_lock();
5928 unregister_netdevice(dev);
5929 rtnl_unlock();
5930 }
5931 EXPORT_SYMBOL(unregister_netdev);
5932
5933 /**
5934 * dev_change_net_namespace - move device to a different network namespace
5935 * @dev: device
5936 * @net: network namespace
5937 * @pat: If not NULL name pattern to try if the current device name
5938 * is already taken in the destination network namespace.
5939 *
5940 * This function shuts down a device interface and moves it
5941 * to a new network namespace. On success 0 is returned, on
5942 * a failure a netagive errno code is returned.
5943 *
5944 * Callers must hold the rtnl semaphore.
5945 */
5946
5947 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5948 {
5949 int err;
5950
5951 ASSERT_RTNL();
5952
5953 /* Don't allow namespace local devices to be moved. */
5954 err = -EINVAL;
5955 if (dev->features & NETIF_F_NETNS_LOCAL)
5956 goto out;
5957
5958 /* Ensure the device has been registered */
5959 if (dev->reg_state != NETREG_REGISTERED)
5960 goto out;
5961
5962 /* Get out if there is nothing to do */
5963 err = 0;
5964 if (net_eq(dev_net(dev), net))
5965 goto out;
5966
5967 /* Pick the destination device name, and ensure
5968 * we can use it in the destination network namespace.
5969 */
5970 err = -EEXIST;
5971 if (__dev_get_by_name(net, dev->name)) {
5972 /* We get here if we can't use the current device name */
5973 if (!pat)
5974 goto out;
5975 if (dev_get_valid_name(net, dev, pat) < 0)
5976 goto out;
5977 }
5978
5979 /*
5980 * And now a mini version of register_netdevice and unregister_netdevice.
5981 */
5982
5983 /* If the device is running, close it first. */
5984 dev_close(dev);
5985
5986 /* And unlink it from device chain */
5987 err = -ENODEV;
5988 unlist_netdevice(dev);
5989
5990 synchronize_net();
5991
5992 /* Shutdown queueing discipline. */
5993 dev_shutdown(dev);
5994
5995 /* Notify protocols that we are about to destroy
5996 this device. They should clean up all their state.
5997
5998 Note that dev->reg_state stays at NETREG_REGISTERED.
5999 This is intentional: this way 8021q and macvlan know
6000 the device is just moving and can keep their slaves up.
6001 */
6002 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6003 rcu_barrier();
6004 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6005 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6006
6007 /*
6008 * Flush the unicast and multicast chains
6009 */
6010 dev_uc_flush(dev);
6011 dev_mc_flush(dev);
6012
6013 /* Send a netdev-removed uevent to the old namespace */
6014 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6015
6016 /* Actually switch the network namespace */
6017 dev_net_set(dev, net);
6018
6019 /* If there is an ifindex conflict assign a new one */
6020 if (__dev_get_by_index(net, dev->ifindex)) {
6021 int iflink = (dev->iflink == dev->ifindex);
6022 dev->ifindex = dev_new_index(net);
6023 if (iflink)
6024 dev->iflink = dev->ifindex;
6025 }
6026
6027 /* Send a netdev-add uevent to the new namespace */
6028 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6029
6030 /* Fixup kobjects */
6031 err = device_rename(&dev->dev, dev->name);
6032 WARN_ON(err);
6033
6034 /* Add the device back in the hashes */
6035 list_netdevice(dev);
6036
6037 /* Notify protocols that a new device appeared. */
6038 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6039
6040 /*
6041 * Prevent userspace races by waiting until the network
6042 * device is fully set up before sending notifications.
6043 */
6044 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6045
6046 synchronize_net();
6047 err = 0;
6048 out:
6049 return err;
6050 }
6051 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
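/*
 * A minimal sketch of moving a device into another namespace, assuming
 * the caller already holds a reference on "target_net": the "eth%d"
 * pattern is only used when the current name is taken in the target,
 * which is exactly the __dev_get_by_name() case handled above.
 */
#if 0
static int foo_move_dev_sketch(struct net_device *dev, struct net *target_net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target_net, "eth%d");
	rtnl_unlock();

	return err;	/* 0 on success, negative errno on failure */
}
#endif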
6052
6053 static int dev_cpu_callback(struct notifier_block *nfb,
6054 unsigned long action,
6055 void *ocpu)
6056 {
6057 struct sk_buff **list_skb;
6058 struct sk_buff *skb;
6059 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6060 struct softnet_data *sd, *oldsd;
6061
6062 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6063 return NOTIFY_OK;
6064
6065 local_irq_disable();
6066 cpu = smp_processor_id();
6067 sd = &per_cpu(softnet_data, cpu);
6068 oldsd = &per_cpu(softnet_data, oldcpu);
6069
6070 /* Find end of our completion_queue. */
6071 list_skb = &sd->completion_queue;
6072 while (*list_skb)
6073 list_skb = &(*list_skb)->next;
6074 /* Append completion queue from offline CPU. */
6075 *list_skb = oldsd->completion_queue;
6076 oldsd->completion_queue = NULL;
6077
6078 /* Append output queue from offline CPU. */
6079 if (oldsd->output_queue) {
6080 *sd->output_queue_tailp = oldsd->output_queue;
6081 sd->output_queue_tailp = oldsd->output_queue_tailp;
6082 oldsd->output_queue = NULL;
6083 oldsd->output_queue_tailp = &oldsd->output_queue;
6084 }
6085 /* Append NAPI poll list from offline CPU, with one exception:
6086 * process_backlog() must be called by the cpu owning the percpu backlog.
6087 * We properly handle process_queue & input_pkt_queue later.
6088 */
6089 while (!list_empty(&oldsd->poll_list)) {
6090 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
6091 struct napi_struct,
6092 poll_list);
6093
6094 list_del_init(&napi->poll_list);
6095 if (napi->poll == process_backlog)
6096 napi->state = 0;
6097 else
6098 ____napi_schedule(sd, napi);
6099 }
6100
6101 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6102 local_irq_enable();
6103
6104 /* Process offline CPU's input_pkt_queue */
6105 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6106 netif_rx(skb);
6107 input_queue_head_incr(oldsd);
6108 }
6109 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
6110 netif_rx(skb);
6111 input_queue_head_incr(oldsd);
6112 }
6113
6114 return NOTIFY_OK;
6115 }
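/*
 * A minimal sketch of how such a CPU-hotplug hook is wired up, mirroring
 * the hotcpu_notifier(dev_cpu_callback, 0) call in net_dev_init() below;
 * the "foo" callback is hypothetical.
 */
#if 0
static int foo_cpu_callback_sketch(struct notifier_block *nfb,
				   unsigned long action, void *ocpu)
{
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	/* migrate per-cpu state away from CPU (unsigned long)ocpu here */
	return NOTIFY_OK;
}

/* registered at init time, e.g.:
 *	hotcpu_notifier(foo_cpu_callback_sketch, 0);
 */
#endif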
6116
6117
6118 /**
6119 * netdev_increment_features - increment feature set by one
6120 * @all: current feature set
6121 * @one: new feature set
6122 * @mask: mask feature set
6123 *
6124 * Computes a new feature set after adding a device with feature set
6125 * @one to the master device with current feature set @all. Will not
6126 * enable anything that is off in @mask. Returns the new feature set.
6127 */
6128 netdev_features_t netdev_increment_features(netdev_features_t all,
6129 netdev_features_t one, netdev_features_t mask)
6130 {
6131 if (mask & NETIF_F_GEN_CSUM)
6132 mask |= NETIF_F_ALL_CSUM;
6133 mask |= NETIF_F_VLAN_CHALLENGED;
6134
6135 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6136 all &= one | ~NETIF_F_ALL_FOR_ALL;
6137
6138 /* If one device supports hw checksumming, set for all. */
6139 if (all & NETIF_F_GEN_CSUM)
6140 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6141
6142 return all;
6143 }
6144 EXPORT_SYMBOL(netdev_increment_features);
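/*
 * A minimal sketch of how an aggregating master (bond/bridge/team style)
 * uses netdev_increment_features(), assuming a hypothetical slave list
 * and feature mask: each slave's features are folded into a running
 * value, and the mask bounds what the master is willing to offload.
 */
#if 0
#define FOO_MASTER_FEATURES (NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_HIGHDMA)

struct foo_slave {			/* hypothetical per-slave bookkeeping */
	struct list_head list;
	struct net_device *dev;
};

static netdev_features_t foo_compute_features_sketch(struct list_head *slaves)
{
	struct foo_slave *s;
	netdev_features_t features = FOO_MASTER_FEATURES;

	list_for_each_entry(s, slaves, list)
		features = netdev_increment_features(features,
						     s->dev->features,
						     FOO_MASTER_FEATURES);
	return features;
}
#endif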
6145
6146 static struct hlist_head *netdev_create_hash(void)
6147 {
6148 int i;
6149 struct hlist_head *hash;
6150
6151 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6152 if (hash != NULL)
6153 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6154 INIT_HLIST_HEAD(&hash[i]);
6155
6156 return hash;
6157 }
6158
6159 /* Initialize per network namespace state */
6160 static int __net_init netdev_init(struct net *net)
6161 {
6162 if (net != &init_net)
6163 INIT_LIST_HEAD(&net->dev_base_head);
6164
6165 net->dev_name_head = netdev_create_hash();
6166 if (net->dev_name_head == NULL)
6167 goto err_name;
6168
6169 net->dev_index_head = netdev_create_hash();
6170 if (net->dev_index_head == NULL)
6171 goto err_idx;
6172
6173 return 0;
6174
6175 err_idx:
6176 kfree(net->dev_name_head);
6177 err_name:
6178 return -ENOMEM;
6179 }
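/*
 * A minimal sketch of a lookup backed by the hash tables initialised
 * above, assuming the caller holds neither RCU nor the RTNL and so uses
 * the reference-taking dev_get_by_name() variant.
 */
#if 0
static bool foo_dev_exists_sketch(struct net *net, const char *name)
{
	struct net_device *dev = dev_get_by_name(net, name);

	if (!dev)
		return false;

	dev_put(dev);		/* dev_get_by_name() took a reference */
	return true;
}
#endif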
6180
6181 /**
6182 * netdev_drivername - network driver for the device
6183 * @dev: network device
6184 *
6185 * Determine network driver for device.
6186 */
6187 const char *netdev_drivername(const struct net_device *dev)
6188 {
6189 const struct device_driver *driver;
6190 const struct device *parent;
6191 const char *empty = "";
6192
6193 parent = dev->dev.parent;
6194 if (!parent)
6195 return empty;
6196
6197 driver = parent->driver;
6198 if (driver && driver->name)
6199 return driver->name;
6200 return empty;
6201 }
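/*
 * A minimal sketch of the intended use of netdev_drivername(): purely
 * diagnostic output, as in the qdisc watchdog's transmit-timeout
 * message.  It never returns NULL, so it can be fed straight to printk.
 */
#if 0
static void foo_report_timeout_sketch(struct net_device *dev)
{
	pr_warn("transmit queue timed out on %s (driver %s)\n",
		dev->name, netdev_drivername(dev));
}
#endif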
6202
6203 static int __netdev_printk(const char *level, const struct net_device *dev,
6204 struct va_format *vaf)
6205 {
6206 int r;
6207
6208 if (dev && dev->dev.parent) {
6209 r = dev_printk_emit(level[1] - '0',
6210 dev->dev.parent,
6211 "%s %s %s: %pV",
6212 dev_driver_string(dev->dev.parent),
6213 dev_name(dev->dev.parent),
6214 netdev_name(dev), vaf);
6215 } else if (dev) {
6216 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6217 } else {
6218 r = printk("%s(NULL net_device): %pV", level, vaf);
6219 }
6220
6221 return r;
6222 }
6223
6224 int netdev_printk(const char *level, const struct net_device *dev,
6225 const char *format, ...)
6226 {
6227 struct va_format vaf;
6228 va_list args;
6229 int r;
6230
6231 va_start(args, format);
6232
6233 vaf.fmt = format;
6234 vaf.va = &args;
6235
6236 r = __netdev_printk(level, dev, &vaf);
6237
6238 va_end(args);
6239
6240 return r;
6241 }
6242 EXPORT_SYMBOL(netdev_printk);
6243
6244 #define define_netdev_printk_level(func, level) \
6245 int func(const struct net_device *dev, const char *fmt, ...) \
6246 { \
6247 int r; \
6248 struct va_format vaf; \
6249 va_list args; \
6250 \
6251 va_start(args, fmt); \
6252 \
6253 vaf.fmt = fmt; \
6254 vaf.va = &args; \
6255 \
6256 r = __netdev_printk(level, dev, &vaf); \
6257 \
6258 va_end(args); \
6259 \
6260 return r; \
6261 } \
6262 EXPORT_SYMBOL(func);
6263
6264 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6265 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6266 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6267 define_netdev_printk_level(netdev_err, KERN_ERR);
6268 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6269 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6270 define_netdev_printk_level(netdev_info, KERN_INFO);
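/*
 * A minimal sketch of driver-side use of the wrappers generated above,
 * assuming a hypothetical "foo" ndo_open implementation: the messages
 * come out prefixed with driver, bus device and interface name, as
 * assembled by __netdev_printk().
 */
#if 0
static int foo_open_sketch(struct net_device *dev)
{
	if (!netif_carrier_ok(dev))
		netdev_warn(dev, "link is down at open time\n");

	netdev_info(dev, "interface up, mtu %u\n", dev->mtu);
	return 0;
}
#endif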
6271
6272 static void __net_exit netdev_exit(struct net *net)
6273 {
6274 kfree(net->dev_name_head);
6275 kfree(net->dev_index_head);
6276 }
6277
6278 static struct pernet_operations __net_initdata netdev_net_ops = {
6279 .init = netdev_init,
6280 .exit = netdev_exit,
6281 };
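/*
 * A minimal sketch of the same per-namespace pattern for a hypothetical
 * "foo" subsystem: .init/.exit callbacks in a pernet_operations struct,
 * registered with register_pernet_subsys() at module init.
 */
#if 0
static int __net_init foo_net_init_sketch(struct net *net)
{
	/* allocate per-namespace state here (or let the core do it by
	 * setting .id/.size and retrieving it later via net_generic())
	 */
	return 0;
}

static void __net_exit foo_net_exit_sketch(struct net *net)
{
	/* free whatever foo_net_init_sketch() allocated */
}

static struct pernet_operations foo_net_ops_sketch = {
	.init = foo_net_init_sketch,
	.exit = foo_net_exit_sketch,
};

/* from the subsystem's init code:
 *	register_pernet_subsys(&foo_net_ops_sketch);
 */
#endif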
6282
6283 static void __net_exit default_device_exit(struct net *net)
6284 {
6285 struct net_device *dev, *aux;
6286 /*
6287 * Push all migratable network devices back to the
6288 * initial network namespace
6289 */
6290 rtnl_lock();
6291 for_each_netdev_safe(net, dev, aux) {
6292 int err;
6293 char fb_name[IFNAMSIZ];
6294
6295 /* Ignore unmovable devices (e.g. loopback) */
6296 if (dev->features & NETIF_F_NETNS_LOCAL)
6297 continue;
6298
6299 /* Leave virtual devices for the generic cleanup */
6300 if (dev->rtnl_link_ops)
6301 continue;
6302
6303 /* Push remaining network devices to init_net */
6304 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6305 err = dev_change_net_namespace(dev, &init_net, fb_name);
6306 if (err) {
6307 pr_emerg("%s: failed to move %s to init_net: %d\n",
6308 __func__, dev->name, err);
6309 BUG();
6310 }
6311 }
6312 rtnl_unlock();
6313 }
6314
6315 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6316 {
6317 /* At exit all network devices must be removed from a network
6318 * namespace. Do this in the reverse order of registration.
6319 * Do this across as many network namespaces as possible to
6320 * improve batching efficiency.
6321 */
6322 struct net_device *dev;
6323 struct net *net;
6324 LIST_HEAD(dev_kill_list);
6325
6326 rtnl_lock();
6327 list_for_each_entry(net, net_list, exit_list) {
6328 for_each_netdev_reverse(net, dev) {
6329 if (dev->rtnl_link_ops)
6330 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6331 else
6332 unregister_netdevice_queue(dev, &dev_kill_list);
6333 }
6334 }
6335 unregister_netdevice_many(&dev_kill_list);
6336 rtnl_unlock();
6337 }
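/*
 * A minimal sketch of why the ->dellink() branch above batches well,
 * assuming a hypothetical virtual "foo" device (struct rtnl_link_ops
 * comes from <net/rtnetlink.h>): dellink implementations queue onto the
 * list they are handed instead of unregistering immediately, so the
 * whole namespace is torn down by one unregister_netdevice_many() call.
 */
#if 0
static void foo_dellink_sketch(struct net_device *dev, struct list_head *head)
{
	/* driver-specific teardown would go here */
	unregister_netdevice_queue(dev, head);
}

static struct rtnl_link_ops foo_link_ops_sketch = {
	.kind    = "foo",
	.dellink = foo_dellink_sketch,
};
#endif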
6338
6339 static struct pernet_operations __net_initdata default_device_ops = {
6340 .exit = default_device_exit,
6341 .exit_batch = default_device_exit_batch,
6342 };
6343
6344 /*
6345 * Initialize the DEV module. At boot time this walks the device list and
6346 * unhooks any devices that fail to initialise (normally hardware not
6347 * present) and leaves us with a valid list of present and active devices.
6348 *
6349 */
6350
6351 /*
6352 * This is called single threaded during boot, so no need
6353 * to take the rtnl semaphore.
6354 */
6355 static int __init net_dev_init(void)
6356 {
6357 int i, rc = -ENOMEM;
6358
6359 BUG_ON(!dev_boot_phase);
6360
6361 if (dev_proc_init())
6362 goto out;
6363
6364 if (netdev_kobject_init())
6365 goto out;
6366
6367 INIT_LIST_HEAD(&ptype_all);
6368 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6369 INIT_LIST_HEAD(&ptype_base[i]);
6370
6371 INIT_LIST_HEAD(&offload_base);
6372
6373 if (register_pernet_subsys(&netdev_net_ops))
6374 goto out;
6375
6376 /*
6377 * Initialise the packet receive queues.
6378 */
6379
6380 for_each_possible_cpu(i) {
6381 struct softnet_data *sd = &per_cpu(softnet_data, i);
6382
6383 memset(sd, 0, sizeof(*sd));
6384 skb_queue_head_init(&sd->input_pkt_queue);
6385 skb_queue_head_init(&sd->process_queue);
6386 sd->completion_queue = NULL;
6387 INIT_LIST_HEAD(&sd->poll_list);
6388 sd->output_queue = NULL;
6389 sd->output_queue_tailp = &sd->output_queue;
6390 #ifdef CONFIG_RPS
6391 sd->csd.func = rps_trigger_softirq;
6392 sd->csd.info = sd;
6393 sd->csd.flags = 0;
6394 sd->cpu = i;
6395 #endif
6396
6397 sd->backlog.poll = process_backlog;
6398 sd->backlog.weight = weight_p;
6399 sd->backlog.gro_list = NULL;
6400 sd->backlog.gro_count = 0;
6401 }
6402
6403 dev_boot_phase = 0;
6404
6405 /* The loopback device is special: if any other network device
6406 * is present in a network namespace, the loopback device must
6407 * be present. Since we now dynamically allocate and free the
6408 * loopback device, ensure this invariant is maintained by
6409 * keeping the loopback device as the first device on the
6410 * list of network devices, ensuring the loopback device
6411 * is the first device that appears and the last network device
6412 * that disappears.
6413 */
6414 if (register_pernet_device(&loopback_net_ops))
6415 goto out;
6416
6417 if (register_pernet_device(&default_device_ops))
6418 goto out;
6419
6420 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6421 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6422
6423 hotcpu_notifier(dev_cpu_callback, 0);
6424 dst_init();
6425 rc = 0;
6426 out:
6427 return rc;
6428 }
6429
6430 subsys_initcall(net_dev_init);