/*
 *	NET3	Protocol independent device support routines.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 *
 *	Derived from the non IP parts of dev.c 1.0.19
 *		Authors:	Ross Biro
 *				Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *				Mark Evans, <evansmp@uhura.aston.ac.uk>
 *
 *	Additional Authors:
 *		Florian la Roche <rzsfl@rz.uni-sb.de>
 *		Alan Cox <gw4pts@gw4pts.ampr.org>
 *		David Hinds <dahinds@users.sourceforge.net>
 *		Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
 *		Adam Sulmicki <adam@cfar.umd.edu>
 *		Pekka Riikonen <priikone@poesidon.pspt.fi>
 *
 *	Changes:
 *		D.J. Barrow	:	Fixed bug where dev->refcnt gets set
 *					to 2 if register_netdev gets called
 *					before net_dev_init & also removed a
 *					few lines of code in the process.
 *		Alan Cox	:	device private ioctl copies fields back.
 *		Alan Cox	:	Transmit queue code does relevant
 *					stunts to keep the queue safe.
 *		Alan Cox	:	Fixed double lock.
 *		Alan Cox	:	Fixed promisc NULL pointer trap
 *		????????	:	Support the full private ioctl range
 *		Alan Cox	:	Moved ioctl permission check into
 *					drivers
 *		Tim Kordas	:	SIOCADDMULTI/SIOCDELMULTI
 *		Alan Cox	:	100 backlog just doesn't cut it when
 *					you start doing multicast video 8)
 *		Alan Cox	:	Rewrote net_bh and list manager.
 *		Alan Cox	:	Fix ETH_P_ALL echoback lengths.
 *		Alan Cox	:	Took out transmit every packet pass
 *					Saved a few bytes in the ioctl handler
 *		Alan Cox	:	Network driver sets packet type before
 *					calling netif_rx. Saves a function
 *					call a packet.
 *		Alan Cox	:	Hashed net_bh()
 *		Richard Kooijman:	Timestamp fixes.
 *		Alan Cox	:	Wrong field in SIOCGIFDSTADDR
 *		Alan Cox	:	Device lock protection.
 *		Alan Cox	:	Fixed nasty side effect of device close
 *					changes.
 *		Rudi Cilibrasi	:	Pass the right thing to
 *					set_mac_address()
 *		Dave Miller	:	32bit quantity for the device lock to
 *					make it work out on a Sparc.
 *		Bjorn Ekwall	:	Added KERNELD hack.
 *		Alan Cox	:	Cleaned up the backlog initialise.
 *		Craig Metz	:	SIOCGIFCONF fix if space for under
 *					1 device.
 *		Thomas Bogendoerfer :	Return ENODEV for dev_open, if there
 *					is no device open function.
 *		Andi Kleen	:	Fix error reporting for SIOCGIFCONF
 *		Michael Chastain:	Fix signed/unsigned for SIOCGIFCONF
 *		Cyrus Durgin	:	Cleaned for KMOD
 *		Adam Sulmicki	:	Bug Fix : Network Device Unload
 *					A network device unload needs to purge
 *					the backlog queue.
 *		Paul Rusty Russell :	SIOCSIFNAME
 *		Pekka Riikonen	:	Netdev boot-time settings code
 *		Andrew Morton	:	Make unregister_netdevice wait
 *					indefinitely on dev->refcnt
 *		J Hadi Salim	:	- Backlog queue sampling
 *					- netif_rx() feedback
 */

#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/capability.h>
#include <linux/cpu.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/hash.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/mutex.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/interrupt.h>
#include <linux/if_ether.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/notifier.h>
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <linux/rtnetlink.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/stat.h>
#include <net/dst.h>
#include <net/pkt_sched.h>
#include <net/checksum.h>
#include <net/xfrm.h>
#include <linux/highmem.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/netpoll.h>
#include <linux/rcupdate.h>
#include <linux/delay.h>
#include <net/wext.h>
#include <net/iw_handler.h>
#include <asm/current.h>
#include <linux/audit.h>
#include <linux/dmaengine.h>
#include <linux/err.h>
#include <linux/ctype.h>
#include <linux/if_arp.h>
#include <linux/if_vlan.h>
#include <linux/ip.h>
#include <net/ip.h>
#include <linux/ipv6.h>
#include <linux/in.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <trace/events/napi.h>
#include <trace/events/net.h>
#include <trace/events/skb.h>
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
#include <linux/static_key.h>

#include "net-sysfs.h"

/* Instead of increasing this, you should create a hash table. */
#define MAX_GRO_SKBS 8

/* This should be increased if a protocol with a bigger head is added. */
#define GRO_MAX_HEAD (MAX_HEADER + 128)

/*
 *	The list of packet types we will receive (as opposed to discard)
 *	and the routines to invoke.
 *
 *	Why 16. Because with 16 the only overlap we get on a hash of the
 *	low nibble of the protocol value is RARP/SNAP/X.25.
 *
 *	NOTE:  That is no longer true with the addition of VLAN tags.  Not
 *	       sure which should go first, but I bet it won't make much
 *	       difference if we are running VLANs.  The good news is that
 *	       this protocol won't be in the list unless compiled in, so
 *	       the average user (w/out VLANs) will not be adversely affected.
 *	       --BLG
 *
 *		0800	IP
 *		8100	802.1Q VLAN
 *		0001	802.3
 *		0002	AX.25
 *		0004	802.2
 *		8035	RARP
 *		0005	SNAP
 *		0805	X.25
 *		0806	ARP
 *		8137	IPX
 *		0009	Localtalk
 *		86DD	IPv6
 */

#define PTYPE_HASH_SIZE	(16)
#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)

static DEFINE_SPINLOCK(ptype_lock);
static DEFINE_SPINLOCK(offload_lock);
static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
static struct list_head ptype_all __read_mostly;	/* Taps */
static struct list_head offload_base __read_mostly;

/*
 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 * semaphore.
 *
 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 *
 * Writers must hold the rtnl semaphore while they loop through the
 * dev_base_head list, and hold dev_base_lock for writing when they do the
 * actual updates.  This allows pure readers to access the list even
 * while a writer is preparing to update it.
 *
 * To put it another way, dev_base_lock is held for writing only to
 * protect against pure readers; the rtnl semaphore provides the
 * protection against other writers.
 *
 * See, for example usages, register_netdevice() and
 * unregister_netdevice(), which must be called with the rtnl
 * semaphore held.
 */
DEFINE_RWLOCK(dev_base_lock);
EXPORT_SYMBOL(dev_base_lock);

seqcount_t devnet_rename_seq;

static inline void dev_base_seq_inc(struct net *net)
{
	while (++net->dev_base_seq == 0);
}

static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
{
	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));

	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
}

static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
{
	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
}

static inline void rps_lock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_lock(&sd->input_pkt_queue.lock);
#endif
}

static inline void rps_unlock(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
	spin_unlock(&sd->input_pkt_queue.lock);
#endif
}

/* Device list insertion */
static int list_netdevice(struct net_device *dev)
{
	struct net *net = dev_net(dev);

	ASSERT_RTNL();

	write_lock_bh(&dev_base_lock);
	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	hlist_add_head_rcu(&dev->index_hlist,
			   dev_index_hash(net, dev->ifindex));
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(net);

	return 0;
}

/* Device list removal
 * caller must respect a RCU grace period before freeing/reusing dev
 */
static void unlist_netdevice(struct net_device *dev)
{
	ASSERT_RTNL();

	/* Unlink dev from the device chain */
	write_lock_bh(&dev_base_lock);
	list_del_rcu(&dev->dev_list);
	hlist_del_rcu(&dev->name_hlist);
	hlist_del_rcu(&dev->index_hlist);
	write_unlock_bh(&dev_base_lock);

	dev_base_seq_inc(dev_net(dev));
}

/*
 *	Our notifier list
 */

static RAW_NOTIFIER_HEAD(netdev_chain);

/*
 *	Device drivers call our routines to queue packets here. We empty the
 *	queue in the local softnet handler.
 */

DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
EXPORT_PER_CPU_SYMBOL(softnet_data);

#ifdef CONFIG_LOCKDEP
/*
 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 * according to dev->type
 */
static const unsigned short netdev_lock_type[] =
	{ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
	 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
	 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
	 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
	 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
	 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
	 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
	 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
	 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
	 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
	 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
	 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
	 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
	 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
	 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};

static const char *const netdev_lock_name[] =
	{"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
	 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
	 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
	 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
	 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
	 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
	 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
	 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
	 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
	 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
	 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
	 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
	 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
	 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
	 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};

static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];

static inline unsigned short netdev_lock_pos(unsigned short dev_type)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
		if (netdev_lock_type[i] == dev_type)
			return i;
	/* the last key is used by default */
	return ARRAY_SIZE(netdev_lock_type) - 1;
}

static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
	int i;

	i = netdev_lock_pos(dev_type);
	lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
				   netdev_lock_name[i]);
}

static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
	int i;

	i = netdev_lock_pos(dev->type);
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &netdev_addr_lock_key[i],
				   netdev_lock_name[i]);
}
#else
static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
						 unsigned short dev_type)
{
}
static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
{
}
#endif

/*******************************************************************************

		Protocol management and registration routines

*******************************************************************************/

/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers, mangling input packets,
 *	MUST BE last in hash buckets and checking protocol handlers
 *	MUST start from promiscuous ptype_all chain in net_bh.
 *	It is true now, do not change it.
 *	Explanation follows: if protocol handler, mangling packet, will
 *	be the first on list, it is not able to sense, that packet
 *	is cloned and should be copied-on-write, so that it will
 *	change it and subsequent readers will get broken packet.
 *							--ANK (980803)
 */

static inline struct list_head *ptype_head(const struct packet_type *pt)
{
	if (pt->type == htons(ETH_P_ALL))
		return &ptype_all;
	else
		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
}

/**
 *	dev_add_pack - add packet handler
 *	@pt: packet type declaration
 *
 *	Add a protocol handler to the networking stack. The passed &packet_type
 *	is linked into kernel lists and may not be freed until it has been
 *	removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new packet type (until the next received packet).
 */

void dev_add_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);

	spin_lock(&ptype_lock);
	list_add_rcu(&pt->list, head);
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(dev_add_pack);

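/*
 * Usage sketch (illustrative only; the handler and EtherType names below
 * are hypothetical, not part of this file): a protocol module registers a
 * packet_type from its init path and removes it on exit.
 *
 *	static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
 *				struct packet_type *pt,
 *				struct net_device *orig_dev)
 *	{
 *		// examine skb, then consume it
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_proto_pt __read_mostly = {
 *		.type = htons(ETH_P_MYPROTO),	// hypothetical EtherType
 *		.func = my_proto_rcv,
 *	};
 *
 *	dev_add_pack(&my_proto_pt);	// module init
 *	dev_remove_pack(&my_proto_pt);	// module exit
 */
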
/**
 *	__dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_pack(struct packet_type *pt)
{
	struct list_head *head = ptype_head(pt);
	struct packet_type *pt1;

	spin_lock(&ptype_lock);

	list_for_each_entry(pt1, head, list) {
		if (pt == pt1) {
			list_del_rcu(&pt->list);
			goto out;
		}
	}

	pr_warn("dev_remove_pack: %p not found\n", pt);
out:
	spin_unlock(&ptype_lock);
}
EXPORT_SYMBOL(__dev_remove_pack);

/**
 *	dev_remove_pack	 - remove packet handler
 *	@pt: packet type declaration
 *
 *	Remove a protocol handler that was previously added to the kernel
 *	protocol handlers by dev_add_pack(). The passed &packet_type is removed
 *	from the kernel lists and can be freed or reused once this function
 *	returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_pack(struct packet_type *pt)
{
	__dev_remove_pack(pt);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_pack);


/**
 *	dev_add_offload - register offload handlers
 *	@po: protocol offload declaration
 *
 *	Add protocol offload handlers to the networking stack. The passed
 *	&proto_offload is linked into kernel lists and may not be freed until
 *	it has been removed from the kernel lists.
 *
 *	This call does not sleep therefore it can not
 *	guarantee all CPU's that are in middle of receiving packets
 *	will see the new offload handlers (until the next received packet).
 */
void dev_add_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;

	spin_lock(&offload_lock);
	list_add_rcu(&po->list, head);
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(dev_add_offload);

/**
 *	__dev_remove_offload	 - remove offload handler
 *	@po: packet offload declaration
 *
 *	Remove a protocol offload handler that was previously added to the
 *	kernel offload handlers by dev_add_offload(). The passed &offload_type
 *	is removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	The packet type might still be in use by receivers
 *	and must not be freed until after all the CPU's have gone
 *	through a quiescent state.
 */
void __dev_remove_offload(struct packet_offload *po)
{
	struct list_head *head = &offload_base;
	struct packet_offload *po1;

	spin_lock(&offload_lock);

	list_for_each_entry(po1, head, list) {
		if (po == po1) {
			list_del_rcu(&po->list);
			goto out;
		}
	}

	pr_warn("dev_remove_offload: %p not found\n", po);
out:
	spin_unlock(&offload_lock);
}
EXPORT_SYMBOL(__dev_remove_offload);

/**
 *	dev_remove_offload	 - remove packet offload handler
 *	@po: packet offload declaration
 *
 *	Remove a packet offload handler that was previously added to the kernel
 *	offload handlers by dev_add_offload(). The passed &offload_type is
 *	removed from the kernel lists and can be freed or reused once this
 *	function returns.
 *
 *	This call sleeps to guarantee that no CPU is looking at the packet
 *	type after return.
 */
void dev_remove_offload(struct packet_offload *po)
{
	__dev_remove_offload(po);

	synchronize_net();
}
EXPORT_SYMBOL(dev_remove_offload);

/******************************************************************************

		      Device Boot-time Settings Routines

*******************************************************************************/

/* Boot time configuration table */
static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];

/**
 *	netdev_boot_setup_add	- add new setup entry
 *	@name: name of the device
 *	@map: configured settings for the device
 *
 *	Adds new setup entry to the dev_boot_setup list.  The function
 *	returns 0 on error and 1 on success.  This is a generic routine to
 *	all netdevices.
 */
static int netdev_boot_setup_add(char *name, struct ifmap *map)
{
	struct netdev_boot_setup *s;
	int i;

	s = dev_boot_setup;
	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
			memset(s[i].name, 0, sizeof(s[i].name));
			strlcpy(s[i].name, name, IFNAMSIZ);
			memcpy(&s[i].map, map, sizeof(s[i].map));
			break;
		}
	}

	return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
}

/**
 *	netdev_boot_setup_check	- check boot time settings
 *	@dev: the netdevice
 *
 *	Check boot time settings for the device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found, 1 if they are.
 */
int netdev_boot_setup_check(struct net_device *dev)
{
	struct netdev_boot_setup *s = dev_boot_setup;
	int i;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
		if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
		    !strcmp(dev->name, s[i].name)) {
			dev->irq = s[i].map.irq;
			dev->base_addr = s[i].map.base_addr;
			dev->mem_start = s[i].map.mem_start;
			dev->mem_end = s[i].map.mem_end;
			return 1;
		}
	}
	return 0;
}
EXPORT_SYMBOL(netdev_boot_setup_check);


/**
 *	netdev_boot_base	- get address from boot time settings
 *	@prefix: prefix for network device
 *	@unit: id for network device
 *
 *	Check boot time settings for the base address of device.
 *	The found settings are set for the device to be used
 *	later in the device probing.
 *	Returns 0 if no settings found.
 */
unsigned long netdev_boot_base(const char *prefix, int unit)
{
	const struct netdev_boot_setup *s = dev_boot_setup;
	char name[IFNAMSIZ];
	int i;

	sprintf(name, "%s%d", prefix, unit);

	/*
	 * If device already registered then return base of 1
	 * to indicate not to probe for this interface
	 */
	if (__dev_get_by_name(&init_net, name))
		return 1;

	for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
		if (!strcmp(name, s[i].name))
			return s[i].map.base_addr;
	return 0;
}

/*
 * Saves at boot time configured settings for any netdevice.
 */
int __init netdev_boot_setup(char *str)
{
	int ints[5];
	struct ifmap map;

	str = get_options(str, ARRAY_SIZE(ints), ints);
	if (!str || !*str)
		return 0;

	/* Save settings */
	memset(&map, 0, sizeof(map));
	if (ints[0] > 0)
		map.irq = ints[1];
	if (ints[0] > 1)
		map.base_addr = ints[2];
	if (ints[0] > 2)
		map.mem_start = ints[3];
	if (ints[0] > 3)
		map.mem_end = ints[4];

	/* Add new entry to the list */
	return netdev_boot_setup_add(str, &map);
}

__setup("netdev=", netdev_boot_setup);

/*******************************************************************************

			    Device Interface Subroutines

*******************************************************************************/

/**
 *	__dev_get_by_name	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. Must be called under RTNL semaphore
 *	or @dev_base_lock. If the name is found a pointer to the device
 *	is returned. If the name is not found then %NULL is returned. The
 *	reference counters are not incremented so the caller must be
 *	careful with locks.
 */

struct net_device *__dev_get_by_name(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_name);

/**
 *	dev_get_by_name_rcu	- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name.
 *	If the name is found a pointer to the device is returned.
 *	If the name is not found then %NULL is returned.
 *	The reference counters are not incremented so the caller must be
 *	careful with locks. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_name_hash(net, name);

	hlist_for_each_entry_rcu(dev, p, head, name_hlist)
		if (!strncmp(dev->name, name, IFNAMSIZ))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_name_rcu);

/**
 *	dev_get_by_name		- find a device by its name
 *	@net: the applicable net namespace
 *	@name: name to find
 *
 *	Find an interface by name. This can be called from any
 *	context and does its own locking. The returned handle has
 *	the usage count incremented and the caller must use dev_put() to
 *	release it when it is no longer needed. %NULL is returned if no
 *	matching device is found.
 */

struct net_device *dev_get_by_name(struct net *net, const char *name)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, name);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_name);
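
/*
 * Lookup sketch (illustrative): within an rcu_read_lock() section
 * dev_get_by_name_rcu() avoids the refcount round-trip, but the pointer
 * is only valid until rcu_read_unlock(); outside RCU, dev_get_by_name()
 * takes a reference that must be released with dev_put().
 *
 *	struct net_device *d;
 *
 *	rcu_read_lock();
 *	d = dev_get_by_name_rcu(net, "eth0");
 *	if (d)
 *		;	// use d only inside this RCU section
 *	rcu_read_unlock();
 *
 *	d = dev_get_by_name(net, "eth0");
 *	if (d) {
 *		;	// use d
 *		dev_put(d);
 *	}
 */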

/**
 *	__dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold either the RTNL semaphore
 *	or @dev_base_lock.
 */

struct net_device *__dev_get_by_index(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_get_by_index);

/**
 *	dev_get_by_index_rcu - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns %NULL if the device
 *	is not found or a pointer to the device. The device has not
 *	had its reference counter increased so the caller must be careful
 *	about locking. The caller must hold RCU lock.
 */

struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
{
	struct hlist_node *p;
	struct net_device *dev;
	struct hlist_head *head = dev_index_hash(net, ifindex);

	hlist_for_each_entry_rcu(dev, p, head, index_hlist)
		if (dev->ifindex == ifindex)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_get_by_index_rcu);


/**
 *	dev_get_by_index - find a device by its ifindex
 *	@net: the applicable net namespace
 *	@ifindex: index of device
 *
 *	Search for an interface by index. Returns NULL if the device
 *	is not found or a pointer to the device. The device returned has
 *	had a reference added and the pointer is safe until the user calls
 *	dev_put to indicate they have finished with it.
 */

struct net_device *dev_get_by_index(struct net *net, int ifindex)
{
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_get_by_index_rcu(net, ifindex);
	if (dev)
		dev_hold(dev);
	rcu_read_unlock();
	return dev;
}
EXPORT_SYMBOL(dev_get_by_index);

/**
 *	dev_getbyhwaddr_rcu - find a device by its hardware address
 *	@net: the applicable net namespace
 *	@type: media type of device
 *	@ha: hardware address
 *
 *	Search for an interface by MAC address. Returns NULL if the device
 *	is not found or a pointer to the device.
 *	The caller must hold RCU or RTNL.
 *	The returned device has not had its ref count increased
 *	and the caller must therefore be careful about locking
 *
 */

struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
				       const char *ha)
{
	struct net_device *dev;

	for_each_netdev_rcu(net, dev)
		if (dev->type == type &&
		    !memcmp(dev->dev_addr, ha, dev->addr_len))
			return dev;

	return NULL;
}
EXPORT_SYMBOL(dev_getbyhwaddr_rcu);

struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev;

	ASSERT_RTNL();
	for_each_netdev(net, dev)
		if (dev->type == type)
			return dev;

	return NULL;
}
EXPORT_SYMBOL(__dev_getfirstbyhwtype);

struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
{
	struct net_device *dev, *ret = NULL;

	rcu_read_lock();
	for_each_netdev_rcu(net, dev)
		if (dev->type == type) {
			dev_hold(dev);
			ret = dev;
			break;
		}
	rcu_read_unlock();
	return ret;
}
EXPORT_SYMBOL(dev_getfirstbyhwtype);

/**
 *	dev_get_by_flags_rcu - find any device with given flags
 *	@net: the applicable net namespace
 *	@if_flags: IFF_* values
 *	@mask: bitmask of bits in if_flags to check
 *
 *	Search for any interface with the given flags. Returns NULL if a device
 *	is not found or a pointer to the device. Must be called inside
 *	rcu_read_lock(), and result refcount is unchanged.
 */

struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
					unsigned short mask)
{
	struct net_device *dev, *ret;

	ret = NULL;
	for_each_netdev_rcu(net, dev) {
		if (((dev->flags ^ if_flags) & mask) == 0) {
			ret = dev;
			break;
		}
	}
	return ret;
}
EXPORT_SYMBOL(dev_get_by_flags_rcu);

/**
 *	dev_valid_name - check if name is okay for network device
 *	@name: name string
 *
 *	Network device names need to be valid file names to
 *	to allow sysfs to work.  We also disallow any kind of
 *	whitespace.
 */
bool dev_valid_name(const char *name)
{
	if (*name == '\0')
		return false;
	if (strlen(name) >= IFNAMSIZ)
		return false;
	if (!strcmp(name, ".") || !strcmp(name, ".."))
		return false;

	while (*name) {
		if (*name == '/' || isspace(*name))
			return false;
		name++;
	}
	return true;
}
EXPORT_SYMBOL(dev_valid_name);
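
/*
 * Examples (illustrative) of what dev_valid_name() accepts and rejects:
 *
 *	dev_valid_name("eth0")		-> true
 *	dev_valid_name("br0.100")	-> true
 *	dev_valid_name("")		-> false	(empty)
 *	dev_valid_name("..")		-> false	(reserved)
 *	dev_valid_name("my dev")	-> false	(whitespace)
 *	dev_valid_name("a/b")		-> false	('/')
 *
 * Names of IFNAMSIZ characters or more are also rejected.
 */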

/**
 *	__dev_alloc_name - allocate a name for a device
 *	@net: network namespace to allocate the device name in
 *	@name: name format string
 *	@buf:  scratch buffer and result name string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

static int __dev_alloc_name(struct net *net, const char *name, char *buf)
{
	int i = 0;
	const char *p;
	const int max_netdevices = 8*PAGE_SIZE;
	unsigned long *inuse;
	struct net_device *d;

	p = strnchr(name, IFNAMSIZ-1, '%');
	if (p) {
		/*
		 * Verify the string as this thing may have come from
		 * the user.  There must be either one "%d" and no other "%"
		 * characters.
		 */
		if (p[1] != 'd' || strchr(p + 2, '%'))
			return -EINVAL;

		/* Use one page as a bit array of possible slots */
		inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
		if (!inuse)
			return -ENOMEM;

		for_each_netdev(net, d) {
			if (!sscanf(d->name, name, &i))
				continue;
			if (i < 0 || i >= max_netdevices)
				continue;

			/*  avoid cases where sscanf is not exact inverse of printf */
			snprintf(buf, IFNAMSIZ, name, i);
			if (!strncmp(buf, d->name, IFNAMSIZ))
				set_bit(i, inuse);
		}

		i = find_first_zero_bit(inuse, max_netdevices);
		free_page((unsigned long) inuse);
	}

	if (buf != name)
		snprintf(buf, IFNAMSIZ, name, i);
	if (!__dev_get_by_name(net, buf))
		return i;

	/* It is possible to run out of possible slots
	 * when the name is long and there isn't enough space left
	 * for the digits, or if all bits are used.
	 */
	return -ENFILE;
}

/**
 *	dev_alloc_name - allocate a name for a device
 *	@dev: device
 *	@name: name format string
 *
 *	Passed a format string - eg "lt%d" it will try and find a suitable
 *	id. It scans list of devices to build up a free map, then chooses
 *	the first empty slot. The caller must hold the dev_base or rtnl lock
 *	while allocating the name and adding the device in order to avoid
 *	duplicates.
 *	Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 *	Returns the number of the unit assigned or a negative errno code.
 */

int dev_alloc_name(struct net_device *dev, const char *name)
{
	char buf[IFNAMSIZ];
	struct net *net;
	int ret;

	BUG_ON(!dev_net(dev));
	net = dev_net(dev);
	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}
EXPORT_SYMBOL(dev_alloc_name);
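
/*
 * Naming sketch (illustrative): a driver normally lets the core pick the
 * unit number by passing a "%d" format string, e.g.
 *
 *	err = dev_alloc_name(dev, "eth%d");	// may yield "eth0", "eth1", ...
 *	if (err < 0)
 *		goto fail;			// hypothetical error path
 *
 * A negative return means no free unit was found (-ENFILE), the format
 * string was invalid (-EINVAL), or memory pressure hit (-ENOMEM).
 */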

static int dev_alloc_name_ns(struct net *net,
			     struct net_device *dev,
			     const char *name)
{
	char buf[IFNAMSIZ];
	int ret;

	ret = __dev_alloc_name(net, name, buf);
	if (ret >= 0)
		strlcpy(dev->name, buf, IFNAMSIZ);
	return ret;
}

static int dev_get_valid_name(struct net *net,
			      struct net_device *dev,
			      const char *name)
{
	BUG_ON(!net);

	if (!dev_valid_name(name))
		return -EINVAL;

	if (strchr(name, '%'))
		return dev_alloc_name_ns(net, dev, name);
	else if (__dev_get_by_name(net, name))
		return -EEXIST;
	else if (dev->name != name)
		strlcpy(dev->name, name, IFNAMSIZ);

	return 0;
}

/**
 *	dev_change_name - change name of a device
 *	@dev: device
 *	@newname: name (or format string) must be at least IFNAMSIZ
 *
 *	Change name of a device, can pass format strings "eth%d".
 *	for wildcarding.
 */
int dev_change_name(struct net_device *dev, const char *newname)
{
	char oldname[IFNAMSIZ];
	int err = 0;
	int ret;
	struct net *net;

	ASSERT_RTNL();
	BUG_ON(!dev_net(dev));

	net = dev_net(dev);
	if (dev->flags & IFF_UP)
		return -EBUSY;

	write_seqcount_begin(&devnet_rename_seq);

	if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
		write_seqcount_end(&devnet_rename_seq);
		return 0;
	}

	memcpy(oldname, dev->name, IFNAMSIZ);

	err = dev_get_valid_name(net, dev, newname);
	if (err < 0) {
		write_seqcount_end(&devnet_rename_seq);
		return err;
	}

rollback:
	ret = device_rename(&dev->dev, dev->name);
	if (ret) {
		memcpy(dev->name, oldname, IFNAMSIZ);
		write_seqcount_end(&devnet_rename_seq);
		return ret;
	}

	write_seqcount_end(&devnet_rename_seq);

	write_lock_bh(&dev_base_lock);
	hlist_del_rcu(&dev->name_hlist);
	write_unlock_bh(&dev_base_lock);

	synchronize_rcu();

	write_lock_bh(&dev_base_lock);
	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
	write_unlock_bh(&dev_base_lock);

	ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
	ret = notifier_to_errno(ret);

	if (ret) {
		/* err >= 0 after dev_alloc_name() or stores the first errno */
		if (err >= 0) {
			err = ret;
			write_seqcount_begin(&devnet_rename_seq);
			memcpy(dev->name, oldname, IFNAMSIZ);
			goto rollback;
		} else {
			pr_err("%s: name change rollback failed: %d\n",
			       dev->name, ret);
		}
	}

	return err;
}

/**
 *	dev_set_alias - change ifalias of a device
 *	@dev: device
 *	@alias: name up to IFALIASZ
 *	@len: limit of bytes to copy from info
 *
 *	Set ifalias for a device,
 */
int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
{
	char *new_ifalias;

	ASSERT_RTNL();

	if (len >= IFALIASZ)
		return -EINVAL;

	if (!len) {
		kfree(dev->ifalias);
		dev->ifalias = NULL;
		return 0;
	}

	new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
	if (!new_ifalias)
		return -ENOMEM;
	dev->ifalias = new_ifalias;

	strlcpy(dev->ifalias, alias, len+1);
	return len;
}


/**
 *	netdev_features_change - device changes features
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed features.
 */
void netdev_features_change(struct net_device *dev)
{
	call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
}
EXPORT_SYMBOL(netdev_features_change);

/**
 *	netdev_state_change - device changes state
 *	@dev: device to cause notification
 *
 *	Called to indicate a device has changed state. This function calls
 *	the notifier chains for netdev_chain and sends a NEWLINK message
 *	to the routing socket.
 */
void netdev_state_change(struct net_device *dev)
{
	if (dev->flags & IFF_UP) {
		call_netdevice_notifiers(NETDEV_CHANGE, dev);
		rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
	}
}
EXPORT_SYMBOL(netdev_state_change);

/**
 *	netdev_notify_peers - notify network peers about existence of @dev
 *	@dev: network device
 *
 * Generate traffic such that interested network peers are aware of
 * @dev, such as by generating a gratuitous ARP. This may be used when
 * a device wants to inform the rest of the network about some sort of
 * reconfiguration such as a failover event or virtual machine
 * migration.
 */
void netdev_notify_peers(struct net_device *dev)
{
	rtnl_lock();
	call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
	rtnl_unlock();
}
EXPORT_SYMBOL(netdev_notify_peers);

static int __dev_open(struct net_device *dev)
{
	const struct net_device_ops *ops = dev->netdev_ops;
	int ret;

	ASSERT_RTNL();

	if (!netif_device_present(dev))
		return -ENODEV;

	/* Block netpoll from trying to do any rx path servicing.
	 * If we don't do this there is a chance ndo_poll_controller
	 * or ndo_poll may be running while we open the device
	 */
	ret = netpoll_rx_disable(dev);
	if (ret)
		return ret;

	ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
	ret = notifier_to_errno(ret);
	if (ret)
		return ret;

	set_bit(__LINK_STATE_START, &dev->state);

	if (ops->ndo_validate_addr)
		ret = ops->ndo_validate_addr(dev);

	if (!ret && ops->ndo_open)
		ret = ops->ndo_open(dev);

	netpoll_rx_enable(dev);

	if (ret)
		clear_bit(__LINK_STATE_START, &dev->state);
	else {
		dev->flags |= IFF_UP;
		net_dmaengine_get();
		dev_set_rx_mode(dev);
		dev_activate(dev);
		add_device_randomness(dev->dev_addr, dev->addr_len);
	}

	return ret;
}

/**
 *	dev_open	- prepare an interface for use.
 *	@dev:	device to open
 *
 *	Takes a device from down to up state. The device's private open
 *	function is invoked and then the multicast lists are loaded. Finally
 *	the device is moved into the up state and a %NETDEV_UP message is
 *	sent to the netdev notifier chain.
 *
 *	Calling this function on an active interface is a nop. On a failure
 *	a negative errno code is returned.
 */
int dev_open(struct net_device *dev)
{
	int ret;

	if (dev->flags & IFF_UP)
		return 0;

	ret = __dev_open(dev);
	if (ret < 0)
		return ret;

	rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
	call_netdevice_notifiers(NETDEV_UP, dev);

	return ret;
}
EXPORT_SYMBOL(dev_open);

static int __dev_close_many(struct list_head *head)
{
	struct net_device *dev;

	ASSERT_RTNL();
	might_sleep();

	list_for_each_entry(dev, head, unreg_list) {
		call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);

		clear_bit(__LINK_STATE_START, &dev->state);

		/* Synchronize to scheduled poll. We cannot touch poll list, it
		 * can be even on different cpu. So just clear netif_running().
		 *
		 * dev->stop() will invoke napi_disable() on all of it's
		 * napi_struct instances on this device.
		 */
		smp_mb__after_clear_bit(); /* Commit netif_running(). */
	}

	dev_deactivate_many(head);

	list_for_each_entry(dev, head, unreg_list) {
		const struct net_device_ops *ops = dev->netdev_ops;

		/*
		 *	Call the device specific close. This cannot fail.
		 *	Only if device is UP
		 *
		 *	We allow it to be called even after a DETACH hot-plug
		 *	event.
		 */
		if (ops->ndo_stop)
			ops->ndo_stop(dev);

		dev->flags &= ~IFF_UP;
		net_dmaengine_put();
	}

	return 0;
}

static int __dev_close(struct net_device *dev)
{
	int retval;
	LIST_HEAD(single);

	/* Temporarily disable netpoll until the interface is down */
	retval = netpoll_rx_disable(dev);
	if (retval)
		return retval;

	list_add(&dev->unreg_list, &single);
	retval = __dev_close_many(&single);
	list_del(&single);

	netpoll_rx_enable(dev);
	return retval;
}

static int dev_close_many(struct list_head *head)
{
	struct net_device *dev, *tmp;
	LIST_HEAD(tmp_list);

	list_for_each_entry_safe(dev, tmp, head, unreg_list)
		if (!(dev->flags & IFF_UP))
			list_move(&dev->unreg_list, &tmp_list);

	__dev_close_many(head);

	list_for_each_entry(dev, head, unreg_list) {
		rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
		call_netdevice_notifiers(NETDEV_DOWN, dev);
	}

	/* rollback_registered_many needs the complete original list */
	list_splice(&tmp_list, head);
	return 0;
}

/**
 *	dev_close - shutdown an interface.
 *	@dev: device to shutdown
 *
 *	This function moves an active device into down state. A
 *	%NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 *	is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 *	chain.
 */
int dev_close(struct net_device *dev)
{
	int ret = 0;
	if (dev->flags & IFF_UP) {
		LIST_HEAD(single);

		/* Block netpoll rx while the interface is going down */
		ret = netpoll_rx_disable(dev);
		if (ret)
			return ret;

		list_add(&dev->unreg_list, &single);
		dev_close_many(&single);
		list_del(&single);

		netpoll_rx_enable(dev);
	}
	return ret;
}
EXPORT_SYMBOL(dev_close);


/**
 *	dev_disable_lro - disable Large Receive Offload on a device
 *	@dev: device
 *
 *	Disable Large Receive Offload (LRO) on a net device.  Must be
 *	called under RTNL.  This is needed if received packets may be
 *	forwarded to another interface.
 */
void dev_disable_lro(struct net_device *dev)
{
	/*
	 * If we're trying to disable lro on a vlan device
	 * use the underlying physical device instead
	 */
	if (is_vlan_dev(dev))
		dev = vlan_dev_real_dev(dev);

	dev->wanted_features &= ~NETIF_F_LRO;
	netdev_update_features(dev);

	if (unlikely(dev->features & NETIF_F_LRO))
		netdev_WARN(dev, "failed to disable LRO!\n");
}
EXPORT_SYMBOL(dev_disable_lro);


static int dev_boot_phase = 1;

/**
 *	register_netdevice_notifier - register a network notifier block
 *	@nb: notifier
 *
 *	Register a notifier to be called when network device events occur.
 *	The notifier passed is linked into the kernel structures and must
 *	not be reused until it has been unregistered. A negative errno code
 *	is returned on a failure.
 *
 *	When registered all registration and up events are replayed
 *	to the new notifier to allow device to have a race free
 *	view of the network device list.
 */

int register_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net_device *last;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_register(&netdev_chain, nb);
	if (err)
		goto unlock;
	if (dev_boot_phase)
		goto unlock;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
			err = notifier_to_errno(err);
			if (err)
				goto rollback;

			if (!(dev->flags & IFF_UP))
				continue;

			nb->notifier_call(nb, NETDEV_UP, dev);
		}
	}

unlock:
	rtnl_unlock();
	return err;

rollback:
	last = dev;
	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev == last)
				goto outroll;

			if (dev->flags & IFF_UP) {
				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
				nb->notifier_call(nb, NETDEV_DOWN, dev);
			}
			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
		}
	}

outroll:
	raw_notifier_chain_unregister(&netdev_chain, nb);
	goto unlock;
}
EXPORT_SYMBOL(register_netdevice_notifier);
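
/*
 * Notifier usage sketch (illustrative; the callback and block names are
 * hypothetical). In this kernel the notifier receives the struct
 * net_device pointer directly as @ptr, since call_netdevice_notifiers()
 * below passes dev through raw_notifier_call_chain().
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		switch (event) {
 *		case NETDEV_UP:
 *		case NETDEV_DOWN:
 *			// react to dev coming up or going down
 *			break;
 *		}
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_netdev_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_netdev_nb);
 *	...
 *	unregister_netdevice_notifier(&my_netdev_nb);
 */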

/**
 *	unregister_netdevice_notifier - unregister a network notifier block
 *	@nb: notifier
 *
 *	Unregister a notifier previously registered by
 *	register_netdevice_notifier(). The notifier is unlinked into the
 *	kernel structures and may then be reused. A negative errno code
 *	is returned on a failure.
 *
 *	After unregistering unregister and down device events are synthesized
 *	for all devices on the device list to the removed notifier to remove
 *	the need for special case cleanup code.
 */

int unregister_netdevice_notifier(struct notifier_block *nb)
{
	struct net_device *dev;
	struct net *net;
	int err;

	rtnl_lock();
	err = raw_notifier_chain_unregister(&netdev_chain, nb);
	if (err)
		goto unlock;

	for_each_net(net) {
		for_each_netdev(net, dev) {
			if (dev->flags & IFF_UP) {
				nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
				nb->notifier_call(nb, NETDEV_DOWN, dev);
			}
			nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
		}
	}
unlock:
	rtnl_unlock();
	return err;
}
EXPORT_SYMBOL(unregister_netdevice_notifier);

/**
 *	call_netdevice_notifiers - call all network notifier blocks
 *	@val: value passed unmodified to notifier function
 *	@dev: net_device pointer passed unmodified to notifier function
 *
 *	Call all network notifier blocks.  Parameters and return value
 *	are as for raw_notifier_call_chain().
 */

int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
{
	ASSERT_RTNL();
	return raw_notifier_call_chain(&netdev_chain, val, dev);
}
EXPORT_SYMBOL(call_netdevice_notifiers);

static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 */
static atomic_t netstamp_needed_deferred;
#endif

void net_enable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	int deferred = atomic_xchg(&netstamp_needed_deferred, 0);

	if (deferred) {
		while (--deferred)
			static_key_slow_dec(&netstamp_needed);
		return;
	}
#endif
	WARN_ON(in_interrupt());
	static_key_slow_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);

void net_disable_timestamp(void)
{
#ifdef HAVE_JUMP_LABEL
	if (in_interrupt()) {
		atomic_inc(&netstamp_needed_deferred);
		return;
	}
#endif
	static_key_slow_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);

static inline void net_timestamp_set(struct sk_buff *skb)
{
	skb->tstamp.tv64 = 0;
	if (static_key_false(&netstamp_needed))
		__net_timestamp(skb);
}

#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {	\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}						\

static inline bool is_skb_forwardable(struct net_device *dev,
				      struct sk_buff *skb)
{
	unsigned int len;

	if (!(dev->flags & IFF_UP))
		return false;

	len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
	if (skb->len <= len)
		return true;

	/* if TSO is enabled, we don't care about the length as the packet
	 * could be forwarded without being segmented before
	 */
	if (skb_is_gso(skb))
		return true;

	return false;
}

/**
 * dev_forward_skb - loopback an skb to another netif
 *
 * @dev: destination network device
 * @skb: buffer to forward
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped, but freed)
 *
 * dev_forward_skb can be used for injecting an skb from the
 * start_xmit function of one device into the receive queue
 * of another device.
 *
 * The receiving device may be in another namespace, so
 * we have to clear all information in the skb that could
 * impact namespace isolation.
 */
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
{
	if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
		if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
			atomic_long_inc(&dev->rx_dropped);
			kfree_skb(skb);
			return NET_RX_DROP;
		}
	}

	skb_orphan(skb);
	nf_reset(skb);

	if (unlikely(!is_skb_forwardable(dev, skb))) {
		atomic_long_inc(&dev->rx_dropped);
		kfree_skb(skb);
		return NET_RX_DROP;
	}
	skb->skb_iif = 0;
	skb->dev = dev;
	skb_dst_drop(skb);
	skb->tstamp.tv64 = 0;
	skb->pkt_type = PACKET_HOST;
	skb->protocol = eth_type_trans(skb, dev);
	skb->mark = 0;
	secpath_reset(skb);
	nf_reset(skb);
	return netif_rx(skb);
}
EXPORT_SYMBOL_GPL(dev_forward_skb);
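
/*
 * Forwarding sketch (illustrative): a virtual device's ndo_start_xmit()
 * can hand a frame to its peer with dev_forward_skb(), in the style of
 * veth. The peer lookup below is hypothetical driver-private code.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);	// hypothetical
 *
 *		if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 *
 * dev_forward_skb() always consumes the skb (it is either queued via
 * netif_rx() or freed), so the caller must not touch it afterwards.
 */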
1685
71d9dec2
CG
1686static inline int deliver_skb(struct sk_buff *skb,
1687 struct packet_type *pt_prev,
1688 struct net_device *orig_dev)
1689{
1080e512
MT
1690 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1691 return -ENOMEM;
71d9dec2
CG
1692 atomic_inc(&skb->users);
1693 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1694}
1695
c0de08d0
EL
1696static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1697{
a3d744e9 1698 if (!ptype->af_packet_priv || !skb->sk)
c0de08d0
EL
1699 return false;
1700
1701 if (ptype->id_match)
1702 return ptype->id_match(ptype, skb->sk);
1703 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1704 return true;
1705
1706 return false;
1707}
1708
1da177e4
LT
1709/*
1710 * Support routine. Sends outgoing frames to any network
1711 * taps currently in use.
1712 */
1713
f6a78bfc 1714static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1715{
1716 struct packet_type *ptype;
71d9dec2
CG
1717 struct sk_buff *skb2 = NULL;
1718 struct packet_type *pt_prev = NULL;
a61bbcf2 1719
1da177e4
LT
1720 rcu_read_lock();
1721 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1722 /* Never send packets back to the socket
1723 * they originated from - MvS (miquels@drinkel.ow.org)
1724 */
1725 if ((ptype->dev == dev || !ptype->dev) &&
c0de08d0 1726 (!skb_loop_sk(ptype, skb))) {
71d9dec2
CG
1727 if (pt_prev) {
1728 deliver_skb(skb2, pt_prev, skb->dev);
1729 pt_prev = ptype;
1730 continue;
1731 }
1732
1733 skb2 = skb_clone(skb, GFP_ATOMIC);
1da177e4
LT
1734 if (!skb2)
1735 break;
1736
70978182
ED
1737 net_timestamp_set(skb2);
1738
1da177e4
LT
1739 /* skb->nh should be correctly
1740 set by sender, so that the second statement is
1741 just protection against buggy protocols.
1742 */
459a98ed 1743 skb_reset_mac_header(skb2);
1da177e4 1744
d56f90a7 1745 if (skb_network_header(skb2) < skb2->data ||
27a884dc 1746 skb2->network_header > skb2->tail) {
e87cc472
JP
1747 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1748 ntohs(skb2->protocol),
1749 dev->name);
c1d2bbe1 1750 skb_reset_network_header(skb2);
1da177e4
LT
1751 }
1752
b0e380b1 1753 skb2->transport_header = skb2->network_header;
1da177e4 1754 skb2->pkt_type = PACKET_OUTGOING;
71d9dec2 1755 pt_prev = ptype;
1da177e4
LT
1756 }
1757 }
71d9dec2
CG
1758 if (pt_prev)
1759 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1da177e4
LT
1760 rcu_read_unlock();
1761}
1762
/**
 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
 * @dev: Network device
 * @txq: number of queues available
 *
 * If real_num_tx_queues is changed the tc mappings may no longer be
 * valid. To resolve this verify the tc mapping remains valid and if
 * not, NULL the mapping. With no priorities mapping to this
 * offset/count pair it will no longer be used. In the worst case TC0
 * is invalid and nothing can be done, so disable priority mappings.
 * It is expected that drivers will fix this mapping if they can before
 * calling netif_set_real_num_tx_queues.
 */
static void netif_setup_tc(struct net_device *dev, unsigned int txq)
{
	int i;
	struct netdev_tc_txq *tc = &dev->tc_to_txq[0];

	/* If TC0 is invalidated disable TC mapping */
	if (tc->offset + tc->count > txq) {
		pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
		dev->num_tc = 0;
		return;
	}

	/* Invalidated prio to tc mappings set to TC0 */
	for (i = 1; i < TC_BITMASK + 1; i++) {
		int q = netdev_get_prio_tc_map(dev, i);

		tc = &dev->tc_to_txq[q];
		if (tc->offset + tc->count > txq) {
			pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
				i, q);
			netdev_set_prio_tc_map(dev, i, 0);
		}
	}
}
1800
537c00de
AD
1801#ifdef CONFIG_XPS
1802static DEFINE_MUTEX(xps_map_mutex);
1803#define xmap_dereference(P) \
1804 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1805
10cdc3f3
AD
1806static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1807 int cpu, u16 index)
537c00de 1808{
10cdc3f3
AD
1809 struct xps_map *map = NULL;
1810 int pos;
537c00de 1811
10cdc3f3
AD
1812 if (dev_maps)
1813 map = xmap_dereference(dev_maps->cpu_map[cpu]);
537c00de 1814
10cdc3f3
AD
1815 for (pos = 0; map && pos < map->len; pos++) {
1816 if (map->queues[pos] == index) {
537c00de
AD
1817 if (map->len > 1) {
1818 map->queues[pos] = map->queues[--map->len];
1819 } else {
10cdc3f3 1820 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
537c00de
AD
1821 kfree_rcu(map, rcu);
1822 map = NULL;
1823 }
10cdc3f3 1824 break;
537c00de 1825 }
537c00de
AD
1826 }
1827
10cdc3f3
AD
1828 return map;
1829}
1830
024e9679 1831static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
10cdc3f3
AD
1832{
1833 struct xps_dev_maps *dev_maps;
024e9679 1834 int cpu, i;
10cdc3f3
AD
1835 bool active = false;
1836
1837 mutex_lock(&xps_map_mutex);
1838 dev_maps = xmap_dereference(dev->xps_maps);
1839
1840 if (!dev_maps)
1841 goto out_no_maps;
1842
1843 for_each_possible_cpu(cpu) {
024e9679
AD
1844 for (i = index; i < dev->num_tx_queues; i++) {
1845 if (!remove_xps_queue(dev_maps, cpu, i))
1846 break;
1847 }
1848 if (i == dev->num_tx_queues)
10cdc3f3
AD
1849 active = true;
1850 }
1851
1852 if (!active) {
537c00de
AD
1853 RCU_INIT_POINTER(dev->xps_maps, NULL);
1854 kfree_rcu(dev_maps, rcu);
1855 }
1856
024e9679
AD
1857 for (i = index; i < dev->num_tx_queues; i++)
1858 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1859 NUMA_NO_NODE);
1860
537c00de
AD
1861out_no_maps:
1862 mutex_unlock(&xps_map_mutex);
1863}
1864
01c5f864
AD
1865static struct xps_map *expand_xps_map(struct xps_map *map,
1866 int cpu, u16 index)
1867{
1868 struct xps_map *new_map;
1869 int alloc_len = XPS_MIN_MAP_ALLOC;
1870 int i, pos;
1871
1872 for (pos = 0; map && pos < map->len; pos++) {
1873 if (map->queues[pos] != index)
1874 continue;
1875 return map;
1876 }
1877
1878 /* Need to add queue to this CPU's existing map */
1879 if (map) {
1880 if (pos < map->alloc_len)
1881 return map;
1882
1883 alloc_len = map->alloc_len * 2;
1884 }
1885
1886 /* Need to allocate new map to store queue on this CPU's map */
1887 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1888 cpu_to_node(cpu));
1889 if (!new_map)
1890 return NULL;
1891
1892 for (i = 0; i < pos; i++)
1893 new_map->queues[i] = map->queues[i];
1894 new_map->alloc_len = alloc_len;
1895 new_map->len = pos;
1896
1897 return new_map;
1898}
1899
537c00de
AD
1900int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1901{
01c5f864 1902 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
537c00de 1903 struct xps_map *map, *new_map;
537c00de 1904 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
01c5f864
AD
1905 int cpu, numa_node_id = -2;
1906 bool active = false;
537c00de
AD
1907
1908 mutex_lock(&xps_map_mutex);
1909
1910 dev_maps = xmap_dereference(dev->xps_maps);
1911
01c5f864
AD
1912 /* allocate memory for queue storage */
1913 for_each_online_cpu(cpu) {
1914 if (!cpumask_test_cpu(cpu, mask))
1915 continue;
1916
1917 if (!new_dev_maps)
1918 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1919 if (!new_dev_maps)
1920 return -ENOMEM;
1921
1922 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1923 NULL;
1924
1925 map = expand_xps_map(map, cpu, index);
1926 if (!map)
1927 goto error;
1928
1929 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1930 }
1931
1932 if (!new_dev_maps)
1933 goto out_no_new_maps;
1934
537c00de 1935 for_each_possible_cpu(cpu) {
01c5f864
AD
1936 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1937 /* add queue to CPU maps */
1938 int pos = 0;
1939
1940 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1941 while ((pos < map->len) && (map->queues[pos] != index))
1942 pos++;
1943
1944 if (pos == map->len)
1945 map->queues[map->len++] = index;
537c00de 1946#ifdef CONFIG_NUMA
537c00de
AD
1947 if (numa_node_id == -2)
1948 numa_node_id = cpu_to_node(cpu);
1949 else if (numa_node_id != cpu_to_node(cpu))
1950 numa_node_id = -1;
537c00de 1951#endif
01c5f864
AD
1952 } else if (dev_maps) {
1953 /* fill in the new device map from the old device map */
1954 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1955 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
537c00de 1956 }
01c5f864 1957
537c00de
AD
1958 }
1959
01c5f864
AD
1960 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1961
537c00de 1962 /* Cleanup old maps */
01c5f864
AD
1963 if (dev_maps) {
1964 for_each_possible_cpu(cpu) {
1965 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1966 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1967 if (map && map != new_map)
1968 kfree_rcu(map, rcu);
1969 }
537c00de 1970
01c5f864 1971 kfree_rcu(dev_maps, rcu);
537c00de
AD
1972 }
1973
01c5f864
AD
1974 dev_maps = new_dev_maps;
1975 active = true;
537c00de 1976
01c5f864
AD
1977out_no_new_maps:
1978 /* update Tx queue numa node */
537c00de
AD
1979 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1980 (numa_node_id >= 0) ? numa_node_id :
1981 NUMA_NO_NODE);
1982
01c5f864
AD
1983 if (!dev_maps)
1984 goto out_no_maps;
1985
1986 /* removes queue from unused CPUs */
1987 for_each_possible_cpu(cpu) {
1988 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1989 continue;
1990
1991 if (remove_xps_queue(dev_maps, cpu, index))
1992 active = true;
1993 }
1994
1995 /* free map if not active */
1996 if (!active) {
1997 RCU_INIT_POINTER(dev->xps_maps, NULL);
1998 kfree_rcu(dev_maps, rcu);
1999 }
2000
2001out_no_maps:
537c00de
AD
2002 mutex_unlock(&xps_map_mutex);
2003
2004 return 0;
2005error:
01c5f864
AD
2006 /* remove any maps that we added */
2007 for_each_possible_cpu(cpu) {
2008 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2009 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2010 NULL;
2011 if (new_map && new_map != map)
2012 kfree(new_map);
2013 }
2014
537c00de
AD
2015 mutex_unlock(&xps_map_mutex);
2016
537c00de
AD
2017 kfree(new_dev_maps);
2018 return -ENOMEM;
2019}
2020EXPORT_SYMBOL(netif_set_xps_queue);
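
/*
 * Editorial example (not part of the original file): pinning TX queue 0 of a
 * device to CPUs 0-3 from kernel code, equivalent to writing a mask into
 * /sys/class/net/<dev>/queues/tx-0/xps_cpus.  A sketch only; error handling
 * is reduced to the minimum.
 */
static int example_pin_tx_queue0(struct net_device *dev)
{
	cpumask_var_t mask;
	int cpu, err;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_clear(mask);
	for (cpu = 0; cpu < 4 && cpu < num_possible_cpus(); cpu++)
		cpumask_set_cpu(cpu, mask);

	err = netif_set_xps_queue(dev, mask, 0);
	free_cpumask_var(mask);
	return err;
}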
2021
2022#endif
/*
 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
 */
int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
{
	int rc;

	if (txq < 1 || txq > dev->num_tx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED ||
	    dev->reg_state == NETREG_UNREGISTERING) {
		ASSERT_RTNL();

		rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
						  txq);
		if (rc)
			return rc;

		if (dev->num_tc)
			netif_setup_tc(dev, txq);

		if (txq < dev->real_num_tx_queues) {
			qdisc_reset_all_tx_gt(dev, txq);
#ifdef CONFIG_XPS
			netif_reset_xps_queues_gt(dev, txq);
#endif
		}
	}

	dev->real_num_tx_queues = txq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_tx_queues);
56079431 2058
#ifdef CONFIG_RPS
/**
 * netif_set_real_num_rx_queues - set actual number of RX queues used
 * @dev: Network device
 * @rxq: Actual number of RX queues
 *
 * This must be called either with the rtnl_lock held or before
 * registration of the net device.  Returns 0 on success, or a
 * negative error code.  If called before registration, it always
 * succeeds.
 */
int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
{
	int rc;

	if (rxq < 1 || rxq > dev->num_rx_queues)
		return -EINVAL;

	if (dev->reg_state == NETREG_REGISTERED) {
		ASSERT_RTNL();

		rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
						  rxq);
		if (rc)
			return rc;
	}

	dev->real_num_rx_queues = rxq;
	return 0;
}
EXPORT_SYMBOL(netif_set_real_num_rx_queues);
#endif
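
/*
 * Editorial example (not part of the original file): how a hypothetical
 * multiqueue driver would shrink or grow its active queue counts at runtime,
 * e.g. from an ethtool set_channels handler.  Once the device is registered,
 * both calls below require the rtnl lock.
 */
static int example_set_channel_count(struct net_device *dev, unsigned int count)
{
	int err;

	ASSERT_RTNL();

	err = netif_set_real_num_tx_queues(dev, count);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, count);
}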
2091
/**
 * netif_get_num_default_rss_queues - default number of RSS queues
 *
 * This routine should set an upper limit on the number of RSS queues
 * used by default by multiqueue devices.
 */
int netif_get_num_default_rss_queues(void)
{
	return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
}
EXPORT_SYMBOL(netif_get_num_default_rss_queues);
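
/*
 * Editorial example (not part of the original file): using the default RSS
 * upper bound when sizing queue vectors at probe time.  EXAMPLE_MAX_QUEUES
 * stands in for a driver-specific hardware limit.
 */
#define EXAMPLE_MAX_QUEUES 16

static unsigned int example_pick_queue_count(void)
{
	return min_t(unsigned int, EXAMPLE_MAX_QUEUES,
		     netif_get_num_default_rss_queues());
}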
2103
def82a1d 2104static inline void __netif_reschedule(struct Qdisc *q)
56079431 2105{
def82a1d
JP
2106 struct softnet_data *sd;
2107 unsigned long flags;
56079431 2108
def82a1d
JP
2109 local_irq_save(flags);
2110 sd = &__get_cpu_var(softnet_data);
a9cbd588
CG
2111 q->next_sched = NULL;
2112 *sd->output_queue_tailp = q;
2113 sd->output_queue_tailp = &q->next_sched;
def82a1d
JP
2114 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2115 local_irq_restore(flags);
2116}
2117
2118void __netif_schedule(struct Qdisc *q)
2119{
2120 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2121 __netif_reschedule(q);
56079431
DV
2122}
2123EXPORT_SYMBOL(__netif_schedule);
2124
void dev_kfree_skb_irq(struct sk_buff *skb)
{
	if (atomic_dec_and_test(&skb->users)) {
		struct softnet_data *sd;
		unsigned long flags;

		local_irq_save(flags);
		sd = &__get_cpu_var(softnet_data);
		skb->next = sd->completion_queue;
		sd->completion_queue = skb;
		raise_softirq_irqoff(NET_TX_SOFTIRQ);
		local_irq_restore(flags);
	}
}
EXPORT_SYMBOL(dev_kfree_skb_irq);

void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (in_irq() || irqs_disabled())
		dev_kfree_skb_irq(skb);
	else
		dev_kfree_skb(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
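
/*
 * Editorial example (not part of the original file): freeing a transmitted
 * skb from a TX-completion interrupt handler.  Plain dev_kfree_skb() may not
 * be used there; dev_kfree_skb_irq() (or the context-agnostic
 * dev_kfree_skb_any()) defers the actual free to the NET_TX softirq.
 */
static void example_tx_complete(struct net_device *dev, struct sk_buff *skb)
{
	dev->stats.tx_packets++;
	dev->stats.tx_bytes += skb->len;
	dev_kfree_skb_irq(skb);		/* safe with IRQs disabled */
}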
2149
2150
/**
 * netif_device_detach - mark device as removed
 * @dev: network device
 *
 * Mark device as removed from system and therefore no longer available.
 */
void netif_device_detach(struct net_device *dev)
{
	if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_stop_all_queues(dev);
	}
}
EXPORT_SYMBOL(netif_device_detach);

/**
 * netif_device_attach - mark device as attached
 * @dev: network device
 *
 * Mark device as attached from system and restart if needed.
 */
void netif_device_attach(struct net_device *dev)
{
	if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
	    netif_running(dev)) {
		netif_tx_wake_all_queues(dev);
		__netdev_watchdog_up(dev);
	}
}
EXPORT_SYMBOL(netif_device_attach);
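
/*
 * Editorial example (not part of the original file): the usual pairing of
 * these helpers in a hypothetical driver's suspend/resume path.  Only the
 * detach/attach calls are the point; the rest is placeholder.
 */
static int example_suspend(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	netif_device_detach(dev);	/* stops all TX queues if running */
	/* ... quiesce DMA, save hardware state, enter low power ... */
	return 0;
}

static int example_resume(struct device *d)
{
	struct net_device *dev = dev_get_drvdata(d);

	/* ... restore hardware state ... */
	netif_device_attach(dev);	/* wakes queues and the watchdog */
	return 0;
}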
2181
static void skb_warn_bad_offload(const struct sk_buff *skb)
{
	static const netdev_features_t null_features = 0;
	struct net_device *dev = skb->dev;
	const char *driver = "";

	if (dev && dev->dev.parent)
		driver = dev_driver_string(dev->dev.parent);

	WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
	     "gso_type=%d ip_summed=%d\n",
	     driver, dev ? &dev->features : &null_features,
	     skb->sk ? &skb->sk->sk_route_caps : &null_features,
	     skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
	     skb_shinfo(skb)->gso_type, skb->ip_summed);
}
2198
/*
 * Invalidate hardware checksum when packet is to be mangled, and
 * complete checksum manually on outgoing path.
 */
int skb_checksum_help(struct sk_buff *skb)
{
	__wsum csum;
	int ret = 0, offset;

	if (skb->ip_summed == CHECKSUM_COMPLETE)
		goto out_set_summed;

	if (unlikely(skb_shinfo(skb)->gso_size)) {
		skb_warn_bad_offload(skb);
		return -EINVAL;
	}

	/* Before computing a checksum, we should make sure no frag could
	 * be modified by an external entity : checksum could be wrong.
	 */
	if (skb_has_shared_frag(skb)) {
		ret = __skb_linearize(skb);
		if (ret)
			goto out;
	}

	offset = skb_checksum_start_offset(skb);
	BUG_ON(offset >= skb_headlen(skb));
	csum = skb_checksum(skb, offset, skb->len - offset, 0);

	offset += skb->csum_offset;
	BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

	if (skb_cloned(skb) &&
	    !skb_clone_writable(skb, offset + sizeof(__sum16))) {
		ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
		if (ret)
			goto out;
	}

	*(__sum16 *)(skb->data + offset) = csum_fold(csum);
out_set_summed:
	skb->ip_summed = CHECKSUM_NONE;
out:
	return ret;
}
EXPORT_SYMBOL(skb_checksum_help);
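
/*
 * Editorial example (not part of the original file): a driver whose hardware
 * cannot checksum a given frame falling back to a software checksum before
 * queueing it for DMA.  "hw_can_csum" is an assumed capability test.
 */
static int example_tx_checksum(struct sk_buff *skb, bool hw_can_csum)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && !hw_can_csum)
		return skb_checksum_help(skb);	/* fills in the checksum field */
	return 0;
}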
1da177e4 2246
f6a78bfc 2247/**
05e8ef4a 2248 * skb_mac_gso_segment - mac layer segmentation handler.
f6a78bfc 2249 * @skb: buffer to segment
576a30eb 2250 * @features: features for the output path (see dev->features)
f6a78bfc 2251 */
05e8ef4a
PS
2252struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2253 netdev_features_t features)
f6a78bfc
HX
2254{
2255 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
22061d80 2256 struct packet_offload *ptype;
252e3346 2257 __be16 type = skb->protocol;
f6a78bfc 2258
c8d5bcd1 2259 while (type == htons(ETH_P_8021Q)) {
05e8ef4a 2260 int vlan_depth = ETH_HLEN;
c8d5bcd1 2261 struct vlan_hdr *vh;
7b9c6090 2262
c8d5bcd1 2263 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
7b9c6090
JG
2264 return ERR_PTR(-EINVAL);
2265
c8d5bcd1
JG
2266 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2267 type = vh->h_vlan_encapsulated_proto;
2268 vlan_depth += VLAN_HLEN;
7b9c6090
JG
2269 }
2270
f6a78bfc
HX
2271 __skb_pull(skb, skb->mac_len);
2272
2273 rcu_read_lock();
22061d80 2274 list_for_each_entry_rcu(ptype, &offload_base, list) {
f191a1d1 2275 if (ptype->type == type && ptype->callbacks.gso_segment) {
84fa7933 2276 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
05e8ef4a
PS
2277 int err;
2278
f191a1d1 2279 err = ptype->callbacks.gso_send_check(skb);
a430a43d
HX
2280 segs = ERR_PTR(err);
2281 if (err || skb_gso_ok(skb, features))
2282 break;
d56f90a7
ACM
2283 __skb_push(skb, (skb->data -
2284 skb_network_header(skb)));
a430a43d 2285 }
f191a1d1 2286 segs = ptype->callbacks.gso_segment(skb, features);
f6a78bfc
HX
2287 break;
2288 }
2289 }
2290 rcu_read_unlock();
2291
98e399f8 2292 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 2293
f6a78bfc
HX
2294 return segs;
2295}
05e8ef4a
PS
2296EXPORT_SYMBOL(skb_mac_gso_segment);
2297
2298
2299/* openvswitch calls this on rx path, so we need a different check.
2300 */
2301static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2302{
2303 if (tx_path)
2304 return skb->ip_summed != CHECKSUM_PARTIAL;
2305 else
2306 return skb->ip_summed == CHECKSUM_NONE;
2307}
2308
2309/**
2310 * __skb_gso_segment - Perform segmentation on skb.
2311 * @skb: buffer to segment
2312 * @features: features for the output path (see dev->features)
2313 * @tx_path: whether it is called in TX path
2314 *
2315 * This function segments the given skb and returns a list of segments.
2316 *
2317 * It may return NULL if the skb requires no segmentation. This is
2318 * only possible when GSO is used for verifying header integrity.
2319 */
2320struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2321 netdev_features_t features, bool tx_path)
2322{
2323 if (unlikely(skb_needs_check(skb, tx_path))) {
2324 int err;
2325
2326 skb_warn_bad_offload(skb);
2327
2328 if (skb_header_cloned(skb) &&
2329 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2330 return ERR_PTR(err);
2331 }
2332
68c33163 2333 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
05e8ef4a
PS
2334 skb_reset_mac_header(skb);
2335 skb_reset_mac_len(skb);
2336
2337 return skb_mac_gso_segment(skb, features);
2338}
12b0004d 2339EXPORT_SYMBOL(__skb_gso_segment);
f6a78bfc 2340
fb286bb2
HX
2341/* Take action when hardware reception checksum errors are detected. */
2342#ifdef CONFIG_BUG
2343void netdev_rx_csum_fault(struct net_device *dev)
2344{
2345 if (net_ratelimit()) {
7b6cd1ce 2346 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
fb286bb2
HX
2347 dump_stack();
2348 }
2349}
2350EXPORT_SYMBOL(netdev_rx_csum_fault);
2351#endif
2352
1da177e4
LT
2353/* Actually, we should eliminate this check as soon as we know, that:
2354 * 1. IOMMU is present and allows to map all the memory.
2355 * 2. No high memory really exists on this machine.
2356 */
2357
9092c658 2358static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 2359{
3d3a8533 2360#ifdef CONFIG_HIGHMEM
1da177e4 2361 int i;
5acbbd42 2362 if (!(dev->features & NETIF_F_HIGHDMA)) {
ea2ab693
IC
2363 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2364 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2365 if (PageHighMem(skb_frag_page(frag)))
5acbbd42 2366 return 1;
ea2ab693 2367 }
5acbbd42 2368 }
1da177e4 2369
5acbbd42
FT
2370 if (PCI_DMA_BUS_IS_PHYS) {
2371 struct device *pdev = dev->dev.parent;
1da177e4 2372
9092c658
ED
2373 if (!pdev)
2374 return 0;
5acbbd42 2375 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
ea2ab693
IC
2376 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2377 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
5acbbd42
FT
2378 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2379 return 1;
2380 }
2381 }
3d3a8533 2382#endif
1da177e4
LT
2383 return 0;
2384}
1da177e4 2385
f6a78bfc
HX
2386struct dev_gso_cb {
2387 void (*destructor)(struct sk_buff *skb);
2388};
2389
2390#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2391
2392static void dev_gso_skb_destructor(struct sk_buff *skb)
2393{
2394 struct dev_gso_cb *cb;
2395
2396 do {
2397 struct sk_buff *nskb = skb->next;
2398
2399 skb->next = nskb->next;
2400 nskb->next = NULL;
2401 kfree_skb(nskb);
2402 } while (skb->next);
2403
2404 cb = DEV_GSO_CB(skb);
2405 if (cb->destructor)
2406 cb->destructor(skb);
2407}
2408
2409/**
2410 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2411 * @skb: buffer to segment
91ecb63c 2412 * @features: device features as applicable to this skb
f6a78bfc
HX
2413 *
2414 * This function segments the given skb and stores the list of segments
2415 * in skb->next.
2416 */
c8f44aff 2417static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
f6a78bfc 2418{
f6a78bfc 2419 struct sk_buff *segs;
576a30eb
HX
2420
2421 segs = skb_gso_segment(skb, features);
2422
2423 /* Verifying header integrity only. */
2424 if (!segs)
2425 return 0;
f6a78bfc 2426
801678c5 2427 if (IS_ERR(segs))
f6a78bfc
HX
2428 return PTR_ERR(segs);
2429
2430 skb->next = segs;
2431 DEV_GSO_CB(skb)->destructor = skb->destructor;
2432 skb->destructor = dev_gso_skb_destructor;
2433
2434 return 0;
2435}
2436
c8f44aff 2437static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
03634668
JG
2438{
2439 return ((features & NETIF_F_GEN_CSUM) ||
2440 ((features & NETIF_F_V4_CSUM) &&
2441 protocol == htons(ETH_P_IP)) ||
2442 ((features & NETIF_F_V6_CSUM) &&
2443 protocol == htons(ETH_P_IPV6)) ||
2444 ((features & NETIF_F_FCOE_CRC) &&
2445 protocol == htons(ETH_P_FCOE)));
2446}
2447
c8f44aff
MM
2448static netdev_features_t harmonize_features(struct sk_buff *skb,
2449 __be16 protocol, netdev_features_t features)
f01a5236 2450{
c0d680e5
EC
2451 if (skb->ip_summed != CHECKSUM_NONE &&
2452 !can_checksum_protocol(features, protocol)) {
f01a5236
JG
2453 features &= ~NETIF_F_ALL_CSUM;
2454 features &= ~NETIF_F_SG;
2455 } else if (illegal_highdma(skb->dev, skb)) {
2456 features &= ~NETIF_F_SG;
2457 }
2458
2459 return features;
2460}
2461
c8f44aff 2462netdev_features_t netif_skb_features(struct sk_buff *skb)
58e998c6
JG
2463{
2464 __be16 protocol = skb->protocol;
c8f44aff 2465 netdev_features_t features = skb->dev->features;
58e998c6 2466
30b678d8
BH
2467 if (skb_shinfo(skb)->gso_segs > skb->dev->gso_max_segs)
2468 features &= ~NETIF_F_GSO_MASK;
2469
58e998c6
JG
2470 if (protocol == htons(ETH_P_8021Q)) {
2471 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2472 protocol = veh->h_vlan_encapsulated_proto;
f01a5236
JG
2473 } else if (!vlan_tx_tag_present(skb)) {
2474 return harmonize_features(skb, protocol, features);
2475 }
58e998c6 2476
6ee400aa 2477 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
f01a5236
JG
2478
2479 if (protocol != htons(ETH_P_8021Q)) {
2480 return harmonize_features(skb, protocol, features);
2481 } else {
2482 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
6ee400aa 2483 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
f01a5236
JG
2484 return harmonize_features(skb, protocol, features);
2485 }
58e998c6 2486}
f01a5236 2487EXPORT_SYMBOL(netif_skb_features);
58e998c6 2488
6afff0ca
JF
2489/*
2490 * Returns true if either:
2491 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
d1a53dfd 2492 * 2. skb is fragmented and the device does not support SG.
6afff0ca
JF
2493 */
2494static inline int skb_needs_linearize(struct sk_buff *skb,
02932ce9 2495 int features)
6afff0ca 2496{
02932ce9
JG
2497 return skb_is_nonlinear(skb) &&
2498 ((skb_has_frag_list(skb) &&
2499 !(features & NETIF_F_FRAGLIST)) ||
e1e78db6 2500 (skb_shinfo(skb)->nr_frags &&
02932ce9 2501 !(features & NETIF_F_SG)));
6afff0ca
JF
2502}
2503
fd2ea0a7
DM
2504int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2505 struct netdev_queue *txq)
f6a78bfc 2506{
00829823 2507 const struct net_device_ops *ops = dev->netdev_ops;
572a9d7b 2508 int rc = NETDEV_TX_OK;
ec764bf0 2509 unsigned int skb_len;
00829823 2510
f6a78bfc 2511 if (likely(!skb->next)) {
c8f44aff 2512 netdev_features_t features;
fc741216 2513
93f154b5 2514 /*
25985edc 2515 * If device doesn't need skb->dst, release it right now while
93f154b5
ED
2516 * its hot in this cpu cache
2517 */
adf30907
ED
2518 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2519 skb_dst_drop(skb);
2520
fc741216
JG
2521 features = netif_skb_features(skb);
2522
7b9c6090 2523 if (vlan_tx_tag_present(skb) &&
fc741216 2524 !(features & NETIF_F_HW_VLAN_TX)) {
7b9c6090
JG
2525 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2526 if (unlikely(!skb))
2527 goto out;
2528
2529 skb->vlan_tci = 0;
2530 }
2531
fc70fb64
AD
2532 /* If encapsulation offload request, verify we are testing
2533 * hardware encapsulation features instead of standard
2534 * features for the netdev
2535 */
2536 if (skb->encapsulation)
2537 features &= dev->hw_enc_features;
2538
fc741216 2539 if (netif_needs_gso(skb, features)) {
91ecb63c 2540 if (unlikely(dev_gso_segment(skb, features)))
9ccb8975
DM
2541 goto out_kfree_skb;
2542 if (skb->next)
2543 goto gso;
6afff0ca 2544 } else {
02932ce9 2545 if (skb_needs_linearize(skb, features) &&
6afff0ca
JF
2546 __skb_linearize(skb))
2547 goto out_kfree_skb;
2548
2549 /* If packet is not checksummed and device does not
2550 * support checksumming for this protocol, complete
2551 * checksumming here.
2552 */
2553 if (skb->ip_summed == CHECKSUM_PARTIAL) {
fc70fb64
AD
2554 if (skb->encapsulation)
2555 skb_set_inner_transport_header(skb,
2556 skb_checksum_start_offset(skb));
2557 else
2558 skb_set_transport_header(skb,
2559 skb_checksum_start_offset(skb));
03634668 2560 if (!(features & NETIF_F_ALL_CSUM) &&
6afff0ca
JF
2561 skb_checksum_help(skb))
2562 goto out_kfree_skb;
2563 }
9ccb8975
DM
2564 }
2565
b40863c6
ED
2566 if (!list_empty(&ptype_all))
2567 dev_queue_xmit_nit(skb, dev);
2568
ec764bf0 2569 skb_len = skb->len;
ac45f602 2570 rc = ops->ndo_start_xmit(skb, dev);
ec764bf0 2571 trace_net_dev_xmit(skb, rc, dev, skb_len);
ec634fe3 2572 if (rc == NETDEV_TX_OK)
08baf561 2573 txq_trans_update(txq);
ac45f602 2574 return rc;
f6a78bfc
HX
2575 }
2576
576a30eb 2577gso:
f6a78bfc
HX
2578 do {
2579 struct sk_buff *nskb = skb->next;
f6a78bfc
HX
2580
2581 skb->next = nskb->next;
2582 nskb->next = NULL;
068a2de5
KK
2583
2584 /*
25985edc 2585 * If device doesn't need nskb->dst, release it right now while
068a2de5
KK
2586 * its hot in this cpu cache
2587 */
2588 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2589 skb_dst_drop(nskb);
2590
b40863c6
ED
2591 if (!list_empty(&ptype_all))
2592 dev_queue_xmit_nit(nskb, dev);
2593
ec764bf0 2594 skb_len = nskb->len;
00829823 2595 rc = ops->ndo_start_xmit(nskb, dev);
ec764bf0 2596 trace_net_dev_xmit(nskb, rc, dev, skb_len);
ec634fe3 2597 if (unlikely(rc != NETDEV_TX_OK)) {
572a9d7b
PM
2598 if (rc & ~NETDEV_TX_MASK)
2599 goto out_kfree_gso_skb;
f54d9e8d 2600 nskb->next = skb->next;
f6a78bfc
HX
2601 skb->next = nskb;
2602 return rc;
2603 }
08baf561 2604 txq_trans_update(txq);
73466498 2605 if (unlikely(netif_xmit_stopped(txq) && skb->next))
f54d9e8d 2606 return NETDEV_TX_BUSY;
f6a78bfc 2607 } while (skb->next);
4ec93edb 2608
572a9d7b
PM
2609out_kfree_gso_skb:
2610 if (likely(skb->next == NULL))
2611 skb->destructor = DEV_GSO_CB(skb)->destructor;
f6a78bfc
HX
2612out_kfree_skb:
2613 kfree_skb(skb);
7b9c6090 2614out:
572a9d7b 2615 return rc;
f6a78bfc
HX
2616}
2617
1def9238
ED
2618static void qdisc_pkt_len_init(struct sk_buff *skb)
2619{
2620 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2621
2622 qdisc_skb_cb(skb)->pkt_len = skb->len;
2623
2624 /* To get more precise estimation of bytes sent on wire,
2625 * we add to pkt_len the headers size of all segments
2626 */
2627 if (shinfo->gso_size) {
757b8b1d 2628 unsigned int hdr_len;
1def9238 2629
757b8b1d
ED
2630 /* mac layer + network layer */
2631 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2632
2633 /* + transport layer */
1def9238
ED
2634 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2635 hdr_len += tcp_hdrlen(skb);
2636 else
2637 hdr_len += sizeof(struct udphdr);
2638 qdisc_skb_cb(skb)->pkt_len += (shinfo->gso_segs - 1) * hdr_len;
2639 }
2640}
2641
bbd8a0d3
KK
2642static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2643 struct net_device *dev,
2644 struct netdev_queue *txq)
2645{
2646 spinlock_t *root_lock = qdisc_lock(q);
a2da570d 2647 bool contended;
bbd8a0d3
KK
2648 int rc;
2649
1def9238 2650 qdisc_pkt_len_init(skb);
a2da570d 2651 qdisc_calculate_pkt_len(skb, q);
79640a4c
ED
2652 /*
2653 * Heuristic to force contended enqueues to serialize on a
2654 * separate lock before trying to get qdisc main lock.
2655 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2656 * and dequeue packets faster.
2657 */
a2da570d 2658 contended = qdisc_is_running(q);
79640a4c
ED
2659 if (unlikely(contended))
2660 spin_lock(&q->busylock);
2661
bbd8a0d3
KK
2662 spin_lock(root_lock);
2663 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2664 kfree_skb(skb);
2665 rc = NET_XMIT_DROP;
2666 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2667 qdisc_run_begin(q)) {
bbd8a0d3
KK
2668 /*
2669 * This is a work-conserving queue; there are no old skbs
2670 * waiting to be sent out; and the qdisc is not running -
2671 * xmit the skb directly.
2672 */
7fee226a
ED
2673 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2674 skb_dst_force(skb);
bfe0d029 2675
bfe0d029
ED
2676 qdisc_bstats_update(q, skb);
2677
79640a4c
ED
2678 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2679 if (unlikely(contended)) {
2680 spin_unlock(&q->busylock);
2681 contended = false;
2682 }
bbd8a0d3 2683 __qdisc_run(q);
79640a4c 2684 } else
bc135b23 2685 qdisc_run_end(q);
bbd8a0d3
KK
2686
2687 rc = NET_XMIT_SUCCESS;
2688 } else {
7fee226a 2689 skb_dst_force(skb);
a2da570d 2690 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
79640a4c
ED
2691 if (qdisc_run_begin(q)) {
2692 if (unlikely(contended)) {
2693 spin_unlock(&q->busylock);
2694 contended = false;
2695 }
2696 __qdisc_run(q);
2697 }
bbd8a0d3
KK
2698 }
2699 spin_unlock(root_lock);
79640a4c
ED
2700 if (unlikely(contended))
2701 spin_unlock(&q->busylock);
bbd8a0d3
KK
2702 return rc;
2703}
2704
5bc1421e
NH
2705#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2706static void skb_update_prio(struct sk_buff *skb)
2707{
6977a79d 2708 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
5bc1421e 2709
91c68ce2
ED
2710 if (!skb->priority && skb->sk && map) {
2711 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2712
2713 if (prioidx < map->priomap_len)
2714 skb->priority = map->priomap[prioidx];
2715 }
5bc1421e
NH
2716}
2717#else
2718#define skb_update_prio(skb)
2719#endif
2720
745e20f1 2721static DEFINE_PER_CPU(int, xmit_recursion);
11a766ce 2722#define RECURSION_LIMIT 10
745e20f1 2723
95603e22
MM
2724/**
2725 * dev_loopback_xmit - loop back @skb
2726 * @skb: buffer to transmit
2727 */
2728int dev_loopback_xmit(struct sk_buff *skb)
2729{
2730 skb_reset_mac_header(skb);
2731 __skb_pull(skb, skb_network_offset(skb));
2732 skb->pkt_type = PACKET_LOOPBACK;
2733 skb->ip_summed = CHECKSUM_UNNECESSARY;
2734 WARN_ON(!skb_dst(skb));
2735 skb_dst_force(skb);
2736 netif_rx_ni(skb);
2737 return 0;
2738}
2739EXPORT_SYMBOL(dev_loopback_xmit);
2740
d29f749e
DJ
2741/**
2742 * dev_queue_xmit - transmit a buffer
2743 * @skb: buffer to transmit
2744 *
2745 * Queue a buffer for transmission to a network device. The caller must
2746 * have set the device and priority and built the buffer before calling
2747 * this function. The function can be called from an interrupt.
2748 *
2749 * A negative errno code is returned on a failure. A success does not
2750 * guarantee the frame will be transmitted as it may be dropped due
2751 * to congestion or traffic shaping.
2752 *
2753 * -----------------------------------------------------------------------------------
2754 * I notice this method can also return errors from the queue disciplines,
2755 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2756 * be positive.
2757 *
2758 * Regardless of the return value, the skb is consumed, so it is currently
2759 * difficult to retry a send to this method. (You can bump the ref count
2760 * before sending to hold a reference for retry if you are careful.)
2761 *
2762 * When calling this method, interrupts MUST be enabled. This is because
2763 * the BH enable code must have IRQs enabled so that it will not deadlock.
2764 * --BLG
2765 */
1da177e4
LT
2766int dev_queue_xmit(struct sk_buff *skb)
2767{
2768 struct net_device *dev = skb->dev;
dc2b4847 2769 struct netdev_queue *txq;
1da177e4
LT
2770 struct Qdisc *q;
2771 int rc = -ENOMEM;
2772
6d1ccff6
ED
2773 skb_reset_mac_header(skb);
2774
4ec93edb
YH
2775 /* Disable soft irqs for various locks below. Also
2776 * stops preemption for RCU.
1da177e4 2777 */
4ec93edb 2778 rcu_read_lock_bh();
1da177e4 2779
5bc1421e
NH
2780 skb_update_prio(skb);
2781
8c4c49df 2782 txq = netdev_pick_tx(dev, skb);
a898def2 2783 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2784
1da177e4 2785#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2786 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4 2787#endif
cf66ba58 2788 trace_net_dev_queue(skb);
1da177e4 2789 if (q->enqueue) {
bbd8a0d3 2790 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2791 goto out;
1da177e4
LT
2792 }
2793
2794 /* The device has no queue. Common case for software devices:
2795 loopback, all the sorts of tunnels...
2796
932ff279
HX
2797 Really, it is unlikely that netif_tx_lock protection is necessary
2798 here. (f.e. loopback and IP tunnels are clean ignoring statistics
1da177e4
LT
2799 counters.)
2800 However, it is possible, that they rely on protection
2801 made by us here.
2802
2803 Check this and shot the lock. It is not prone from deadlocks.
2804 Either shot noqueue qdisc, it is even simpler 8)
2805 */
2806 if (dev->flags & IFF_UP) {
2807 int cpu = smp_processor_id(); /* ok because BHs are off */
2808
c773e847 2809 if (txq->xmit_lock_owner != cpu) {
1da177e4 2810
745e20f1
ED
2811 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2812 goto recursion_alert;
2813
c773e847 2814 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2815
73466498 2816 if (!netif_xmit_stopped(txq)) {
745e20f1 2817 __this_cpu_inc(xmit_recursion);
572a9d7b 2818 rc = dev_hard_start_xmit(skb, dev, txq);
745e20f1 2819 __this_cpu_dec(xmit_recursion);
572a9d7b 2820 if (dev_xmit_complete(rc)) {
c773e847 2821 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2822 goto out;
2823 }
2824 }
c773e847 2825 HARD_TX_UNLOCK(dev, txq);
e87cc472
JP
2826 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2827 dev->name);
1da177e4
LT
2828 } else {
2829 /* Recursion is detected! It is possible,
745e20f1
ED
2830 * unfortunately
2831 */
2832recursion_alert:
e87cc472
JP
2833 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2834 dev->name);
1da177e4
LT
2835 }
2836 }
2837
2838 rc = -ENETDOWN;
d4828d85 2839 rcu_read_unlock_bh();
1da177e4 2840
1da177e4
LT
2841 kfree_skb(skb);
2842 return rc;
2843out:
d4828d85 2844 rcu_read_unlock_bh();
1da177e4
LT
2845 return rc;
2846}
d1b19dff 2847EXPORT_SYMBOL(dev_queue_xmit);
1da177e4
LT
2848
2849
2850/*=======================================================================
2851 Receiver routines
2852 =======================================================================*/
2853
6b2bedc3 2854int netdev_max_backlog __read_mostly = 1000;
c9e6bc64
ED
2855EXPORT_SYMBOL(netdev_max_backlog);
2856
3b098e2d 2857int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
2858int netdev_budget __read_mostly = 300;
2859int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 2860
eecfd7c4
ED
2861/* Called with irq disabled */
2862static inline void ____napi_schedule(struct softnet_data *sd,
2863 struct napi_struct *napi)
2864{
2865 list_add_tail(&napi->poll_list, &sd->poll_list);
2866 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2867}
2868
bfb564e7
KK
2869#ifdef CONFIG_RPS
2870
2871/* One global table that all flow-based protocols share. */
6e3f7faf 2872struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
bfb564e7
KK
2873EXPORT_SYMBOL(rps_sock_flow_table);
2874
c5905afb 2875struct static_key rps_needed __read_mostly;
adc9300e 2876
c445477d
BH
2877static struct rps_dev_flow *
2878set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2879 struct rps_dev_flow *rflow, u16 next_cpu)
2880{
09994d1b 2881 if (next_cpu != RPS_NO_CPU) {
c445477d
BH
2882#ifdef CONFIG_RFS_ACCEL
2883 struct netdev_rx_queue *rxqueue;
2884 struct rps_dev_flow_table *flow_table;
2885 struct rps_dev_flow *old_rflow;
2886 u32 flow_id;
2887 u16 rxq_index;
2888 int rc;
2889
2890 /* Should we steer this flow to a different hardware queue? */
69a19ee6
BH
2891 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2892 !(dev->features & NETIF_F_NTUPLE))
c445477d
BH
2893 goto out;
2894 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2895 if (rxq_index == skb_get_rx_queue(skb))
2896 goto out;
2897
2898 rxqueue = dev->_rx + rxq_index;
2899 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2900 if (!flow_table)
2901 goto out;
2902 flow_id = skb->rxhash & flow_table->mask;
2903 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2904 rxq_index, flow_id);
2905 if (rc < 0)
2906 goto out;
2907 old_rflow = rflow;
2908 rflow = &flow_table->flows[flow_id];
c445477d
BH
2909 rflow->filter = rc;
2910 if (old_rflow->filter == rflow->filter)
2911 old_rflow->filter = RPS_NO_FILTER;
2912 out:
2913#endif
2914 rflow->last_qtail =
09994d1b 2915 per_cpu(softnet_data, next_cpu).input_queue_head;
c445477d
BH
2916 }
2917
09994d1b 2918 rflow->cpu = next_cpu;
c445477d
BH
2919 return rflow;
2920}
2921
bfb564e7
KK
2922/*
2923 * get_rps_cpu is called from netif_receive_skb and returns the target
2924 * CPU from the RPS map of the receiving queue for a given skb.
2925 * rcu_read_lock must be held on entry.
2926 */
2927static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2928 struct rps_dev_flow **rflowp)
2929{
2930 struct netdev_rx_queue *rxqueue;
6e3f7faf 2931 struct rps_map *map;
bfb564e7
KK
2932 struct rps_dev_flow_table *flow_table;
2933 struct rps_sock_flow_table *sock_flow_table;
2934 int cpu = -1;
2935 u16 tcpu;
2936
2937 if (skb_rx_queue_recorded(skb)) {
2938 u16 index = skb_get_rx_queue(skb);
62fe0b40
BH
2939 if (unlikely(index >= dev->real_num_rx_queues)) {
2940 WARN_ONCE(dev->real_num_rx_queues > 1,
2941 "%s received packet on queue %u, but number "
2942 "of RX queues is %u\n",
2943 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
2944 goto done;
2945 }
2946 rxqueue = dev->_rx + index;
2947 } else
2948 rxqueue = dev->_rx;
2949
6e3f7faf
ED
2950 map = rcu_dereference(rxqueue->rps_map);
2951 if (map) {
85875236 2952 if (map->len == 1 &&
33d480ce 2953 !rcu_access_pointer(rxqueue->rps_flow_table)) {
6febfca9
CG
2954 tcpu = map->cpus[0];
2955 if (cpu_online(tcpu))
2956 cpu = tcpu;
2957 goto done;
2958 }
33d480ce 2959 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
bfb564e7 2960 goto done;
6febfca9 2961 }
bfb564e7 2962
2d47b459 2963 skb_reset_network_header(skb);
bfb564e7
KK
2964 if (!skb_get_rxhash(skb))
2965 goto done;
2966
fec5e652
TH
2967 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2968 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2969 if (flow_table && sock_flow_table) {
2970 u16 next_cpu;
2971 struct rps_dev_flow *rflow;
2972
2973 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2974 tcpu = rflow->cpu;
2975
2976 next_cpu = sock_flow_table->ents[skb->rxhash &
2977 sock_flow_table->mask];
2978
2979 /*
2980 * If the desired CPU (where last recvmsg was done) is
2981 * different from current CPU (one in the rx-queue flow
2982 * table entry), switch if one of the following holds:
2983 * - Current CPU is unset (equal to RPS_NO_CPU).
2984 * - Current CPU is offline.
2985 * - The current CPU's queue tail has advanced beyond the
2986 * last packet that was enqueued using this table entry.
2987 * This guarantees that all previous packets for the flow
2988 * have been dequeued, thus preserving in order delivery.
2989 */
2990 if (unlikely(tcpu != next_cpu) &&
2991 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2992 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
baefa31d
TH
2993 rflow->last_qtail)) >= 0)) {
2994 tcpu = next_cpu;
c445477d 2995 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
baefa31d 2996 }
c445477d 2997
fec5e652
TH
2998 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2999 *rflowp = rflow;
3000 cpu = tcpu;
3001 goto done;
3002 }
3003 }
3004
0a9627f2 3005 if (map) {
fec5e652 3006 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
0a9627f2
TH
3007
3008 if (cpu_online(tcpu)) {
3009 cpu = tcpu;
3010 goto done;
3011 }
3012 }
3013
3014done:
0a9627f2
TH
3015 return cpu;
3016}
3017
c445477d
BH
3018#ifdef CONFIG_RFS_ACCEL
3019
3020/**
3021 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3022 * @dev: Device on which the filter was set
3023 * @rxq_index: RX queue index
3024 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3025 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3026 *
3027 * Drivers that implement ndo_rx_flow_steer() should periodically call
3028 * this function for each installed filter and remove the filters for
3029 * which it returns %true.
3030 */
3031bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3032 u32 flow_id, u16 filter_id)
3033{
3034 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3035 struct rps_dev_flow_table *flow_table;
3036 struct rps_dev_flow *rflow;
3037 bool expire = true;
3038 int cpu;
3039
3040 rcu_read_lock();
3041 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3042 if (flow_table && flow_id <= flow_table->mask) {
3043 rflow = &flow_table->flows[flow_id];
3044 cpu = ACCESS_ONCE(rflow->cpu);
3045 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3046 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3047 rflow->last_qtail) <
3048 (int)(10 * flow_table->mask)))
3049 expire = false;
3050 }
3051 rcu_read_unlock();
3052 return expire;
3053}
3054EXPORT_SYMBOL(rps_may_expire_flow);
3055
3056#endif /* CONFIG_RFS_ACCEL */
3057
0a9627f2 3058/* Called from hardirq (IPI) context */
e36fa2f7 3059static void rps_trigger_softirq(void *data)
0a9627f2 3060{
e36fa2f7
ED
3061 struct softnet_data *sd = data;
3062
eecfd7c4 3063 ____napi_schedule(sd, &sd->backlog);
dee42870 3064 sd->received_rps++;
0a9627f2 3065}
e36fa2f7 3066
fec5e652 3067#endif /* CONFIG_RPS */
0a9627f2 3068
e36fa2f7
ED
3069/*
3070 * Check if this softnet_data structure is another cpu one
3071 * If yes, queue it to our IPI list and return 1
3072 * If no, return 0
3073 */
3074static int rps_ipi_queued(struct softnet_data *sd)
3075{
3076#ifdef CONFIG_RPS
3077 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3078
3079 if (sd != mysd) {
3080 sd->rps_ipi_next = mysd->rps_ipi_list;
3081 mysd->rps_ipi_list = sd;
3082
3083 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3084 return 1;
3085 }
3086#endif /* CONFIG_RPS */
3087 return 0;
3088}
3089
0a9627f2
TH
3090/*
3091 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3092 * queue (may be a remote CPU queue).
3093 */
fec5e652
TH
3094static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3095 unsigned int *qtail)
0a9627f2 3096{
e36fa2f7 3097 struct softnet_data *sd;
0a9627f2
TH
3098 unsigned long flags;
3099
e36fa2f7 3100 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
3101
3102 local_irq_save(flags);
0a9627f2 3103
e36fa2f7 3104 rps_lock(sd);
6e7676c1
CG
3105 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3106 if (skb_queue_len(&sd->input_pkt_queue)) {
0a9627f2 3107enqueue:
e36fa2f7 3108 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 3109 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 3110 rps_unlock(sd);
152102c7 3111 local_irq_restore(flags);
0a9627f2
TH
3112 return NET_RX_SUCCESS;
3113 }
3114
ebda37c2
ED
3115 /* Schedule NAPI for backlog device
3116 * We can use non atomic operation since we own the queue lock
3117 */
3118 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 3119 if (!rps_ipi_queued(sd))
eecfd7c4 3120 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
3121 }
3122 goto enqueue;
3123 }
3124
dee42870 3125 sd->dropped++;
e36fa2f7 3126 rps_unlock(sd);
0a9627f2 3127
0a9627f2
TH
3128 local_irq_restore(flags);
3129
caf586e5 3130 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
3131 kfree_skb(skb);
3132 return NET_RX_DROP;
3133}
1da177e4 3134
/**
 * netif_rx - post buffer to the network code
 * @skb: buffer to post
 *
 * This function receives a packet from a device driver and queues it for
 * the upper (protocol) levels to process.  It always succeeds. The buffer
 * may be dropped during processing for congestion control or by the
 * protocol layers.
 *
 * return values:
 *	NET_RX_SUCCESS	(no congestion)
 *	NET_RX_DROP	(packet was dropped)
 *
 */

int netif_rx(struct sk_buff *skb)
{
	int ret;

	/* if netpoll wants it, pretend we never saw it */
	if (netpoll_rx(skb))
		return NET_RX_DROP;

	net_timestamp_check(netdev_tstamp_prequeue, skb);

	trace_netif_rx(skb);
#ifdef CONFIG_RPS
	if (static_key_false(&rps_needed)) {
		struct rps_dev_flow voidflow, *rflow = &voidflow;
		int cpu;

		preempt_disable();
		rcu_read_lock();

		cpu = get_rps_cpu(skb->dev, skb, &rflow);
		if (cpu < 0)
			cpu = smp_processor_id();

		ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);

		rcu_read_unlock();
		preempt_enable();
	} else
#endif
	{
		unsigned int qtail;
		ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
		put_cpu();
	}
	return ret;
}
EXPORT_SYMBOL(netif_rx);
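
/*
 * Editorial example (not part of the original file): how a hypothetical
 * non-NAPI driver hands a received frame to the stack from its interrupt
 * handler, as the comment above describes.  The raw buffer and its length
 * are assumed to come from the device.
 */
static void example_rx_frame(struct net_device *dev, const void *data,
			     unsigned int len)
{
	struct sk_buff *skb = netdev_alloc_skb_ip_align(dev, len);

	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}
	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev */
	netif_rx(skb);		/* may still drop under congestion */
}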
1da177e4
LT
3187
3188int netif_rx_ni(struct sk_buff *skb)
3189{
3190 int err;
3191
3192 preempt_disable();
3193 err = netif_rx(skb);
3194 if (local_softirq_pending())
3195 do_softirq();
3196 preempt_enable();
3197
3198 return err;
3199}
1da177e4
LT
3200EXPORT_SYMBOL(netif_rx_ni);
3201
1da177e4
LT
3202static void net_tx_action(struct softirq_action *h)
3203{
3204 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3205
3206 if (sd->completion_queue) {
3207 struct sk_buff *clist;
3208
3209 local_irq_disable();
3210 clist = sd->completion_queue;
3211 sd->completion_queue = NULL;
3212 local_irq_enable();
3213
3214 while (clist) {
3215 struct sk_buff *skb = clist;
3216 clist = clist->next;
3217
547b792c 3218 WARN_ON(atomic_read(&skb->users));
07dc22e7 3219 trace_kfree_skb(skb, net_tx_action);
1da177e4
LT
3220 __kfree_skb(skb);
3221 }
3222 }
3223
3224 if (sd->output_queue) {
37437bb2 3225 struct Qdisc *head;
1da177e4
LT
3226
3227 local_irq_disable();
3228 head = sd->output_queue;
3229 sd->output_queue = NULL;
a9cbd588 3230 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
3231 local_irq_enable();
3232
3233 while (head) {
37437bb2
DM
3234 struct Qdisc *q = head;
3235 spinlock_t *root_lock;
3236
1da177e4
LT
3237 head = head->next_sched;
3238
5fb66229 3239 root_lock = qdisc_lock(q);
37437bb2 3240 if (spin_trylock(root_lock)) {
def82a1d
JP
3241 smp_mb__before_clear_bit();
3242 clear_bit(__QDISC_STATE_SCHED,
3243 &q->state);
37437bb2
DM
3244 qdisc_run(q);
3245 spin_unlock(root_lock);
1da177e4 3246 } else {
195648bb 3247 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 3248 &q->state)) {
195648bb 3249 __netif_reschedule(q);
e8a83e10
JP
3250 } else {
3251 smp_mb__before_clear_bit();
3252 clear_bit(__QDISC_STATE_SCHED,
3253 &q->state);
3254 }
1da177e4
LT
3255 }
3256 }
3257 }
3258}
3259
ab95bfe0
JP
3260#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3261 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
3262/* This hook is defined here for ATM LANE */
3263int (*br_fdb_test_addr_hook)(struct net_device *dev,
3264 unsigned char *addr) __read_mostly;
4fb019a0 3265EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 3266#endif
1da177e4 3267
1da177e4
LT
3268#ifdef CONFIG_NET_CLS_ACT
3269/* TODO: Maybe we should just force sch_ingress to be compiled in
3270 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3271 * a compare and 2 stores extra right now if we dont have it on
3272 * but have CONFIG_NET_CLS_ACT
25985edc
LDM
3273 * NOTE: This doesn't stop any functionality; if you dont have
3274 * the ingress scheduler, you just can't add policies on ingress.
1da177e4
LT
3275 *
3276 */
24824a09 3277static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
1da177e4 3278{
1da177e4 3279 struct net_device *dev = skb->dev;
f697c3e8 3280 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
3281 int result = TC_ACT_OK;
3282 struct Qdisc *q;
4ec93edb 3283
de384830 3284 if (unlikely(MAX_RED_LOOP < ttl++)) {
e87cc472
JP
3285 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3286 skb->skb_iif, dev->ifindex);
f697c3e8
HX
3287 return TC_ACT_SHOT;
3288 }
1da177e4 3289
f697c3e8
HX
3290 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3291 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 3292
83874000 3293 q = rxq->qdisc;
8d50b53d 3294 if (q != &noop_qdisc) {
83874000 3295 spin_lock(qdisc_lock(q));
a9312ae8
DM
3296 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3297 result = qdisc_enqueue_root(skb, q);
83874000
DM
3298 spin_unlock(qdisc_lock(q));
3299 }
f697c3e8
HX
3300
3301 return result;
3302}
86e65da9 3303
f697c3e8
HX
3304static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3305 struct packet_type **pt_prev,
3306 int *ret, struct net_device *orig_dev)
3307{
24824a09
ED
3308 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3309
3310 if (!rxq || rxq->qdisc == &noop_qdisc)
f697c3e8 3311 goto out;
1da177e4 3312
f697c3e8
HX
3313 if (*pt_prev) {
3314 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3315 *pt_prev = NULL;
1da177e4
LT
3316 }
3317
24824a09 3318 switch (ing_filter(skb, rxq)) {
f697c3e8
HX
3319 case TC_ACT_SHOT:
3320 case TC_ACT_STOLEN:
3321 kfree_skb(skb);
3322 return NULL;
3323 }
3324
3325out:
3326 skb->tc_verd = 0;
3327 return skb;
1da177e4
LT
3328}
3329#endif
3330
/**
 * netdev_rx_handler_register - register receive handler
 * @dev: device to register a handler for
 * @rx_handler: receive handler to register
 * @rx_handler_data: data pointer that is used by rx handler
 *
 * Register a receive handler for a device. This handler will then be
 * called from __netif_receive_skb. A negative errno code is returned
 * on a failure.
 *
 * The caller must hold the rtnl_mutex.
 *
 * For a general description of rx_handler, see enum rx_handler_result.
 */
int netdev_rx_handler_register(struct net_device *dev,
			       rx_handler_func_t *rx_handler,
			       void *rx_handler_data)
{
	ASSERT_RTNL();

	if (dev->rx_handler)
		return -EBUSY;

	rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
	rcu_assign_pointer(dev->rx_handler, rx_handler);

	return 0;
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_register);

/**
 * netdev_rx_handler_unregister - unregister receive handler
 * @dev: device to unregister a handler from
 *
 * Unregister a receive handler from a device.
 *
 * The caller must hold the rtnl_mutex.
 */
void netdev_rx_handler_unregister(struct net_device *dev)
{
	ASSERT_RTNL();
	RCU_INIT_POINTER(dev->rx_handler, NULL);
	RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
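
/*
 * Editorial example (not part of the original file): how a bridge- or
 * bonding-style module claims a port's receive path with the API above.
 * "struct example_port" and its master_dev field are assumptions for
 * illustration only.
 */
struct example_port {
	struct net_device *master_dev;
};

static rx_handler_result_t example_rx_handler(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct example_port *port = rcu_dereference(skb->dev->rx_handler_data);

	skb->dev = port->master_dev;
	return RX_HANDLER_ANOTHER;	/* rerun __netif_receive_skb on the master */
}

static int example_enslave(struct net_device *port_dev, struct example_port *port)
{
	ASSERT_RTNL();
	return netdev_rx_handler_register(port_dev, example_rx_handler, port);
}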
3377
b4b9e355
MG
3378/*
3379 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3380 * the special handling of PFMEMALLOC skbs.
3381 */
3382static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3383{
3384 switch (skb->protocol) {
3385 case __constant_htons(ETH_P_ARP):
3386 case __constant_htons(ETH_P_IP):
3387 case __constant_htons(ETH_P_IPV6):
3388 case __constant_htons(ETH_P_8021Q):
3389 return true;
3390 default:
3391 return false;
3392 }
3393}
3394
9754e293 3395static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
1da177e4
LT
3396{
3397 struct packet_type *ptype, *pt_prev;
ab95bfe0 3398 rx_handler_func_t *rx_handler;
f2ccd8fa 3399 struct net_device *orig_dev;
63d8ea7f 3400 struct net_device *null_or_dev;
8a4eb573 3401 bool deliver_exact = false;
1da177e4 3402 int ret = NET_RX_DROP;
252e3346 3403 __be16 type;
1da177e4 3404
588f0330 3405 net_timestamp_check(!netdev_tstamp_prequeue, skb);
81bbb3d4 3406
cf66ba58 3407 trace_netif_receive_skb(skb);
9b22ea56 3408
1da177e4 3409 /* if we've gotten here through NAPI, check netpoll */
bea3348e 3410 if (netpoll_receive_skb(skb))
b4b9e355 3411 goto out;
1da177e4 3412
cc9bd5ce 3413 orig_dev = skb->dev;
8f903c70 3414
c1d2bbe1 3415 skb_reset_network_header(skb);
fda55eca
ED
3416 if (!skb_transport_header_was_set(skb))
3417 skb_reset_transport_header(skb);
0b5c9db1 3418 skb_reset_mac_len(skb);
1da177e4
LT
3419
3420 pt_prev = NULL;
3421
3422 rcu_read_lock();
3423
63d8ea7f 3424another_round:
b6858177 3425 skb->skb_iif = skb->dev->ifindex;
63d8ea7f
DM
3426
3427 __this_cpu_inc(softnet_data.processed);
3428
bcc6d479
JP
3429 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3430 skb = vlan_untag(skb);
3431 if (unlikely(!skb))
b4b9e355 3432 goto unlock;
bcc6d479
JP
3433 }
3434
1da177e4
LT
3435#ifdef CONFIG_NET_CLS_ACT
3436 if (skb->tc_verd & TC_NCLS) {
3437 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3438 goto ncls;
3439 }
3440#endif
3441
9754e293 3442 if (pfmemalloc)
b4b9e355
MG
3443 goto skip_taps;
3444
1da177e4 3445 list_for_each_entry_rcu(ptype, &ptype_all, list) {
63d8ea7f 3446 if (!ptype->dev || ptype->dev == skb->dev) {
4ec93edb 3447 if (pt_prev)
f2ccd8fa 3448 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3449 pt_prev = ptype;
3450 }
3451 }
3452
b4b9e355 3453skip_taps:
1da177e4 3454#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
3455 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3456 if (!skb)
b4b9e355 3457 goto unlock;
1da177e4
LT
3458ncls:
3459#endif
3460
9754e293 3461 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
b4b9e355
MG
3462 goto drop;
3463
2425717b
JF
3464 if (vlan_tx_tag_present(skb)) {
3465 if (pt_prev) {
3466 ret = deliver_skb(skb, pt_prev, orig_dev);
3467 pt_prev = NULL;
3468 }
48cc32d3 3469 if (vlan_do_receive(&skb))
2425717b
JF
3470 goto another_round;
3471 else if (unlikely(!skb))
b4b9e355 3472 goto unlock;
2425717b
JF
3473 }
3474
48cc32d3 3475 rx_handler = rcu_dereference(skb->dev->rx_handler);
ab95bfe0
JP
3476 if (rx_handler) {
3477 if (pt_prev) {
3478 ret = deliver_skb(skb, pt_prev, orig_dev);
3479 pt_prev = NULL;
3480 }
8a4eb573
JP
3481 switch (rx_handler(&skb)) {
3482 case RX_HANDLER_CONSUMED:
b4b9e355 3483 goto unlock;
8a4eb573 3484 case RX_HANDLER_ANOTHER:
63d8ea7f 3485 goto another_round;
8a4eb573
JP
3486 case RX_HANDLER_EXACT:
3487 deliver_exact = true;
3488 case RX_HANDLER_PASS:
3489 break;
3490 default:
3491 BUG();
3492 }
ab95bfe0 3493 }
1da177e4 3494
48cc32d3
FZ
3495 if (vlan_tx_nonzero_tag_present(skb))
3496 skb->pkt_type = PACKET_OTHERHOST;
3497
63d8ea7f 3498 /* deliver only exact match when indicated */
8a4eb573 3499 null_or_dev = deliver_exact ? skb->dev : NULL;
1f3c8804 3500
1da177e4 3501 type = skb->protocol;
82d8a867
PE
3502 list_for_each_entry_rcu(ptype,
3503 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
63d8ea7f 3504 if (ptype->type == type &&
e3f48d37
JP
3505 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3506 ptype->dev == orig_dev)) {
4ec93edb 3507 if (pt_prev)
f2ccd8fa 3508 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3509 pt_prev = ptype;
3510 }
3511 }
3512
3513 if (pt_prev) {
1080e512 3514 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
0e698bf6 3515 goto drop;
1080e512
MT
3516 else
3517 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 3518 } else {
b4b9e355 3519drop:
caf586e5 3520 atomic_long_inc(&skb->dev->rx_dropped);
1da177e4
LT
3521 kfree_skb(skb);
3522 /* Jamal, now you will not able to escape explaining
3523 * me how you were going to use this. :-)
3524 */
3525 ret = NET_RX_DROP;
3526 }
3527
b4b9e355 3528unlock:
1da177e4 3529 rcu_read_unlock();
b4b9e355 3530out:
9754e293
DM
3531 return ret;
3532}
3533
3534static int __netif_receive_skb(struct sk_buff *skb)
3535{
3536 int ret;
3537
3538 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3539 unsigned long pflags = current->flags;
3540
3541 /*
3542 * PFMEMALLOC skbs are special, they should
3543 * - be delivered to SOCK_MEMALLOC sockets only
3544 * - stay away from userspace
3545 * - have bounded memory usage
3546 *
3547 * Use PF_MEMALLOC as this saves us from propagating the allocation
3548 * context down to all allocation sites.
3549 */
3550 current->flags |= PF_MEMALLOC;
3551 ret = __netif_receive_skb_core(skb, true);
3552 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3553 } else
3554 ret = __netif_receive_skb_core(skb, false);
3555
1da177e4
LT
3556 return ret;
3557}
0a9627f2
TH
3558
3559/**
3560 * netif_receive_skb - process receive buffer from network
3561 * @skb: buffer to process
3562 *
3563 * netif_receive_skb() is the main receive data processing function.
3564 * It always succeeds. The buffer may be dropped during processing
3565 * for congestion control or by the protocol layers.
3566 *
3567 * This function may only be called from softirq context and interrupts
3568 * should be enabled.
3569 *
3570 * Return values (usually ignored):
3571 * NET_RX_SUCCESS: no congestion
3572 * NET_RX_DROP: packet was dropped
3573 */
3574int netif_receive_skb(struct sk_buff *skb)
3575{
588f0330 3576 net_timestamp_check(netdev_tstamp_prequeue, skb);
3b098e2d 3577
c1f19b51
RC
3578 if (skb_defer_rx_timestamp(skb))
3579 return NET_RX_SUCCESS;
3580
df334545 3581#ifdef CONFIG_RPS
c5905afb 3582 if (static_key_false(&rps_needed)) {
3b098e2d
ED
3583 struct rps_dev_flow voidflow, *rflow = &voidflow;
3584 int cpu, ret;
fec5e652 3585
3b098e2d
ED
3586 rcu_read_lock();
3587
3588 cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 3589
3b098e2d
ED
3590 if (cpu >= 0) {
3591 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3592 rcu_read_unlock();
adc9300e 3593 return ret;
3b098e2d 3594 }
adc9300e 3595 rcu_read_unlock();
fec5e652 3596 }
1e94d72f 3597#endif
adc9300e 3598 return __netif_receive_skb(skb);
0a9627f2 3599}
d1b19dff 3600EXPORT_SYMBOL(netif_receive_skb);
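/*
 * Illustrative sketch, not part of dev.c: how a hypothetical driver's NAPI
 * poll callback might hand a completed frame to the stack with
 * netif_receive_skb(). The hw_buf pointer and length come from an assumed
 * RX descriptor and are placeholders for the example.
 */
static void example_rx_one_frame(struct net_device *dev, const void *hw_buf,
				 unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb_ip_align(dev, len);
	if (!skb) {
		dev->stats.rx_dropped++;	/* out of memory, drop the frame */
		return;
	}
	memcpy(skb_put(skb, len), hw_buf, len);		/* copy payload from hw buffer */
	skb->protocol = eth_type_trans(skb, dev);	/* also sets skb->dev and pkt_type */

	/* Return value is usually ignored; NET_RX_DROP only signals congestion. */
	netif_receive_skb(skb);
}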
1da177e4 3601
88751275
ED
3602/* Network device is going away, flush any packets still pending
3603 * Called with irqs disabled.
3604 */
152102c7 3605static void flush_backlog(void *arg)
6e583ce5 3606{
152102c7 3607 struct net_device *dev = arg;
e36fa2f7 3608 struct softnet_data *sd = &__get_cpu_var(softnet_data);
6e583ce5
SH
3609 struct sk_buff *skb, *tmp;
3610
e36fa2f7 3611 rps_lock(sd);
6e7676c1 3612 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3613 if (skb->dev == dev) {
e36fa2f7 3614 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3615 kfree_skb(skb);
76cc8b13 3616 input_queue_head_incr(sd);
6e583ce5 3617 }
6e7676c1 3618 }
e36fa2f7 3619 rps_unlock(sd);
6e7676c1
CG
3620
3621 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3622 if (skb->dev == dev) {
3623 __skb_unlink(skb, &sd->process_queue);
3624 kfree_skb(skb);
76cc8b13 3625 input_queue_head_incr(sd);
6e7676c1
CG
3626 }
3627 }
6e583ce5
SH
3628}
3629
d565b0a1
HX
3630static int napi_gro_complete(struct sk_buff *skb)
3631{
22061d80 3632 struct packet_offload *ptype;
d565b0a1 3633 __be16 type = skb->protocol;
22061d80 3634 struct list_head *head = &offload_base;
d565b0a1
HX
3635 int err = -ENOENT;
3636
c3c7c254
ED
3637 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3638
fc59f9a3
HX
3639 if (NAPI_GRO_CB(skb)->count == 1) {
3640 skb_shinfo(skb)->gso_size = 0;
d565b0a1 3641 goto out;
fc59f9a3 3642 }
d565b0a1
HX
3643
3644 rcu_read_lock();
3645 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 3646 if (ptype->type != type || !ptype->callbacks.gro_complete)
d565b0a1
HX
3647 continue;
3648
f191a1d1 3649 err = ptype->callbacks.gro_complete(skb);
d565b0a1
HX
3650 break;
3651 }
3652 rcu_read_unlock();
3653
3654 if (err) {
3655 WARN_ON(&ptype->list == head);
3656 kfree_skb(skb);
3657 return NET_RX_SUCCESS;
3658 }
3659
3660out:
d565b0a1
HX
3661 return netif_receive_skb(skb);
3662}
3663
2e71a6f8
ED
3664/* napi->gro_list contains packets ordered by age.
 3665 * The youngest packets are at the head of the list.
3666 * Complete skbs in reverse order to reduce latencies.
3667 */
3668void napi_gro_flush(struct napi_struct *napi, bool flush_old)
d565b0a1 3669{
2e71a6f8 3670 struct sk_buff *skb, *prev = NULL;
d565b0a1 3671
2e71a6f8
ED
3672 /* scan list and build reverse chain */
3673 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3674 skb->prev = prev;
3675 prev = skb;
3676 }
3677
3678 for (skb = prev; skb; skb = prev) {
d565b0a1 3679 skb->next = NULL;
2e71a6f8
ED
3680
3681 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3682 return;
3683
3684 prev = skb->prev;
d565b0a1 3685 napi_gro_complete(skb);
2e71a6f8 3686 napi->gro_count--;
d565b0a1
HX
3687 }
3688
3689 napi->gro_list = NULL;
3690}
86cac58b 3691EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 3692
89c5fa33
ED
3693static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3694{
3695 struct sk_buff *p;
3696 unsigned int maclen = skb->dev->hard_header_len;
3697
3698 for (p = napi->gro_list; p; p = p->next) {
3699 unsigned long diffs;
3700
3701 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3702 diffs |= p->vlan_tci ^ skb->vlan_tci;
3703 if (maclen == ETH_HLEN)
3704 diffs |= compare_ether_header(skb_mac_header(p),
3705 skb_gro_mac_header(skb));
3706 else if (!diffs)
3707 diffs = memcmp(skb_mac_header(p),
3708 skb_gro_mac_header(skb),
3709 maclen);
3710 NAPI_GRO_CB(p)->same_flow = !diffs;
3711 NAPI_GRO_CB(p)->flush = 0;
3712 }
3713}
3714
bb728820 3715static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
3716{
3717 struct sk_buff **pp = NULL;
22061d80 3718 struct packet_offload *ptype;
d565b0a1 3719 __be16 type = skb->protocol;
22061d80 3720 struct list_head *head = &offload_base;
0da2afd5 3721 int same_flow;
5b252f0c 3722 enum gro_result ret;
d565b0a1 3723
ce9e76c8 3724 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
d565b0a1
HX
3725 goto normal;
3726
21dc3301 3727 if (skb_is_gso(skb) || skb_has_frag_list(skb))
f17f5c91
HX
3728 goto normal;
3729
89c5fa33
ED
3730 gro_list_prepare(napi, skb);
3731
d565b0a1
HX
3732 rcu_read_lock();
3733 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 3734 if (ptype->type != type || !ptype->callbacks.gro_receive)
d565b0a1
HX
3735 continue;
3736
86911732 3737 skb_set_network_header(skb, skb_gro_offset(skb));
efd9450e 3738 skb_reset_mac_len(skb);
d565b0a1
HX
3739 NAPI_GRO_CB(skb)->same_flow = 0;
3740 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 3741 NAPI_GRO_CB(skb)->free = 0;
d565b0a1 3742
f191a1d1 3743 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
d565b0a1
HX
3744 break;
3745 }
3746 rcu_read_unlock();
3747
3748 if (&ptype->list == head)
3749 goto normal;
3750
0da2afd5 3751 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 3752 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 3753
d565b0a1
HX
3754 if (pp) {
3755 struct sk_buff *nskb = *pp;
3756
3757 *pp = nskb->next;
3758 nskb->next = NULL;
3759 napi_gro_complete(nskb);
4ae5544f 3760 napi->gro_count--;
d565b0a1
HX
3761 }
3762
0da2afd5 3763 if (same_flow)
d565b0a1
HX
3764 goto ok;
3765
4ae5544f 3766 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
d565b0a1 3767 goto normal;
d565b0a1 3768
4ae5544f 3769 napi->gro_count++;
d565b0a1 3770 NAPI_GRO_CB(skb)->count = 1;
2e71a6f8 3771 NAPI_GRO_CB(skb)->age = jiffies;
86911732 3772 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
3773 skb->next = napi->gro_list;
3774 napi->gro_list = skb;
5d0d9be8 3775 ret = GRO_HELD;
d565b0a1 3776
ad0f9904 3777pull:
cb18978c
HX
3778 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3779 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3780
3781 BUG_ON(skb->end - skb->tail < grow);
3782
3783 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3784
3785 skb->tail += grow;
3786 skb->data_len -= grow;
3787
3788 skb_shinfo(skb)->frags[0].page_offset += grow;
9e903e08 3789 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
cb18978c 3790
9e903e08 3791 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
ea2ab693 3792 skb_frag_unref(skb, 0);
cb18978c
HX
3793 memmove(skb_shinfo(skb)->frags,
3794 skb_shinfo(skb)->frags + 1,
e5093aec 3795 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
cb18978c 3796 }
ad0f9904
HX
3797 }
3798
d565b0a1 3799ok:
5d0d9be8 3800 return ret;
d565b0a1
HX
3801
3802normal:
ad0f9904
HX
3803 ret = GRO_NORMAL;
3804 goto pull;
5d38a079 3805}
96e93eab 3806
5d38a079 3807
bb728820 3808static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 3809{
5d0d9be8
HX
3810 switch (ret) {
3811 case GRO_NORMAL:
c7c4b3b6
BH
3812 if (netif_receive_skb(skb))
3813 ret = GRO_DROP;
3814 break;
5d38a079 3815
5d0d9be8 3816 case GRO_DROP:
5d38a079
HX
3817 kfree_skb(skb);
3818 break;
5b252f0c 3819
daa86548 3820 case GRO_MERGED_FREE:
d7e8883c
ED
3821 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3822 kmem_cache_free(skbuff_head_cache, skb);
3823 else
3824 __kfree_skb(skb);
daa86548
ED
3825 break;
3826
5b252f0c
BH
3827 case GRO_HELD:
3828 case GRO_MERGED:
3829 break;
5d38a079
HX
3830 }
3831
c7c4b3b6 3832 return ret;
5d0d9be8 3833}
5d0d9be8 3834
ca07e43e 3835static void skb_gro_reset_offset(struct sk_buff *skb)
78a478d0 3836{
ca07e43e
ED
3837 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3838 const skb_frag_t *frag0 = &pinfo->frags[0];
3839
78a478d0
HX
3840 NAPI_GRO_CB(skb)->data_offset = 0;
3841 NAPI_GRO_CB(skb)->frag0 = NULL;
7489594c 3842 NAPI_GRO_CB(skb)->frag0_len = 0;
78a478d0 3843
78d3fd0b 3844 if (skb->mac_header == skb->tail &&
ca07e43e
ED
3845 pinfo->nr_frags &&
3846 !PageHighMem(skb_frag_page(frag0))) {
3847 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3848 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
7489594c 3849 }
78a478d0 3850}
78a478d0 3851
c7c4b3b6 3852gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 3853{
86911732
HX
3854 skb_gro_reset_offset(skb);
3855
89c5fa33 3856 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
d565b0a1
HX
3857}
3858EXPORT_SYMBOL(napi_gro_receive);
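/*
 * Illustrative sketch, not part of dev.c: feeding received frames through
 * GRO from a hypothetical NAPI poll callback. example_fetch_frame() is an
 * assumed driver helper that builds an skb from the next RX descriptor and
 * returns NULL when the ring is empty.
 */
static struct sk_buff *example_fetch_frame(struct net_device *dev);	/* assumed helper */

static int example_poll_gro(struct napi_struct *napi, int budget)
{
	int work_done = 0;
	struct sk_buff *skb;

	while (work_done < budget &&
	       (skb = example_fetch_frame(napi->dev)) != NULL) {
		skb->protocol = eth_type_trans(skb, napi->dev);
		/* GRO either merges the skb, holds it on napi->gro_list,
		 * or falls back to netif_receive_skb() internally. */
		napi_gro_receive(napi, skb);
		work_done++;
	}

	if (work_done < budget)
		napi_complete(napi);	/* also flushes napi->gro_list */
	return work_done;
}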
3859
d0c2b0d2 3860static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 3861{
96e93eab 3862 __skb_pull(skb, skb_headlen(skb));
2a2a459e
ED
3863 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3864 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3701e513 3865 skb->vlan_tci = 0;
66c46d74 3866 skb->dev = napi->dev;
6d152e23 3867 skb->skb_iif = 0;
96e93eab
HX
3868
3869 napi->skb = skb;
3870}
96e93eab 3871
76620aaf 3872struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 3873{
5d38a079 3874 struct sk_buff *skb = napi->skb;
5d38a079
HX
3875
3876 if (!skb) {
89d71a66
ED
3877 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3878 if (skb)
3879 napi->skb = skb;
80595d59 3880 }
96e93eab
HX
3881 return skb;
3882}
76620aaf 3883EXPORT_SYMBOL(napi_get_frags);
96e93eab 3884
bb728820 3885static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
c7c4b3b6 3886 gro_result_t ret)
96e93eab 3887{
5d0d9be8
HX
3888 switch (ret) {
3889 case GRO_NORMAL:
86911732 3890 case GRO_HELD:
e76b69cc 3891 skb->protocol = eth_type_trans(skb, skb->dev);
86911732 3892
c7c4b3b6
BH
3893 if (ret == GRO_HELD)
3894 skb_gro_pull(skb, -ETH_HLEN);
3895 else if (netif_receive_skb(skb))
3896 ret = GRO_DROP;
86911732 3897 break;
5d38a079 3898
5d0d9be8 3899 case GRO_DROP:
5d0d9be8
HX
3900 case GRO_MERGED_FREE:
3901 napi_reuse_skb(napi, skb);
3902 break;
5b252f0c
BH
3903
3904 case GRO_MERGED:
3905 break;
5d0d9be8 3906 }
5d38a079 3907
c7c4b3b6 3908 return ret;
5d38a079 3909}
5d0d9be8 3910
4adb9c4a 3911static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
76620aaf
HX
3912{
3913 struct sk_buff *skb = napi->skb;
3914 struct ethhdr *eth;
a5b1cf28
HX
3915 unsigned int hlen;
3916 unsigned int off;
76620aaf
HX
3917
3918 napi->skb = NULL;
3919
3920 skb_reset_mac_header(skb);
3921 skb_gro_reset_offset(skb);
3922
a5b1cf28
HX
3923 off = skb_gro_offset(skb);
3924 hlen = off + sizeof(*eth);
3925 eth = skb_gro_header_fast(skb, off);
3926 if (skb_gro_header_hard(skb, hlen)) {
3927 eth = skb_gro_header_slow(skb, hlen, off);
3928 if (unlikely(!eth)) {
3929 napi_reuse_skb(napi, skb);
3930 skb = NULL;
3931 goto out;
3932 }
76620aaf
HX
3933 }
3934
3935 skb_gro_pull(skb, sizeof(*eth));
3936
3937 /*
3938 * This works because the only protocols we care about don't require
3939 * special handling. We'll fix it up properly at the end.
3940 */
3941 skb->protocol = eth->h_proto;
3942
3943out:
3944 return skb;
3945}
76620aaf 3946
c7c4b3b6 3947gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 3948{
76620aaf 3949 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
3950
3951 if (!skb)
c7c4b3b6 3952 return GRO_DROP;
5d0d9be8 3953
89c5fa33 3954 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5d0d9be8 3955}
5d38a079
HX
3956EXPORT_SYMBOL(napi_gro_frags);
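/*
 * Illustrative sketch, not part of dev.c: the napi_get_frags()/napi_gro_frags()
 * pattern used by drivers that receive directly into pages. The page, offset
 * and length would come from an RX descriptor; they are assumptions for the
 * example, and the truesize accounting is approximate.
 */
static void example_rx_page(struct napi_struct *napi, struct page *page,
			    unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);	/* reusable skb head */

	if (!skb) {
		put_page(page);		/* allocation failed, drop the buffer */
		return;
	}

	/* Attach the data as a page fragment; the payload is not copied. */
	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += PAGE_SIZE;

	/* napi_frags_skb() will pull the Ethernet header and set skb->protocol. */
	napi_gro_frags(napi);
}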
3957
e326bed2
ED
3958/*
 3959 * net_rps_action sends any pending IPIs for RPS.
3960 * Note: called with local irq disabled, but exits with local irq enabled.
3961 */
3962static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3963{
3964#ifdef CONFIG_RPS
3965 struct softnet_data *remsd = sd->rps_ipi_list;
3966
3967 if (remsd) {
3968 sd->rps_ipi_list = NULL;
3969
3970 local_irq_enable();
3971
 3972 /* Send pending IPIs to kick RPS processing on remote cpus. */
3973 while (remsd) {
3974 struct softnet_data *next = remsd->rps_ipi_next;
3975
3976 if (cpu_online(remsd->cpu))
3977 __smp_call_function_single(remsd->cpu,
3978 &remsd->csd, 0);
3979 remsd = next;
3980 }
3981 } else
3982#endif
3983 local_irq_enable();
3984}
3985
bea3348e 3986static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
3987{
3988 int work = 0;
eecfd7c4 3989 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 3990
e326bed2
ED
3991#ifdef CONFIG_RPS
 3992 /* Check if we have pending IPIs; it is better to send them now,
 3993 * rather than waiting for net_rx_action() to end.
3994 */
3995 if (sd->rps_ipi_list) {
3996 local_irq_disable();
3997 net_rps_action_and_irq_enable(sd);
3998 }
3999#endif
bea3348e 4000 napi->weight = weight_p;
6e7676c1
CG
4001 local_irq_disable();
4002 while (work < quota) {
1da177e4 4003 struct sk_buff *skb;
6e7676c1
CG
4004 unsigned int qlen;
4005
4006 while ((skb = __skb_dequeue(&sd->process_queue))) {
4007 local_irq_enable();
4008 __netif_receive_skb(skb);
6e7676c1 4009 local_irq_disable();
76cc8b13
TH
4010 input_queue_head_incr(sd);
4011 if (++work >= quota) {
4012 local_irq_enable();
4013 return work;
4014 }
6e7676c1 4015 }
1da177e4 4016
e36fa2f7 4017 rps_lock(sd);
6e7676c1 4018 qlen = skb_queue_len(&sd->input_pkt_queue);
76cc8b13 4019 if (qlen)
6e7676c1
CG
4020 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4021 &sd->process_queue);
76cc8b13 4022
6e7676c1 4023 if (qlen < quota - work) {
eecfd7c4
ED
4024 /*
4025 * Inline a custom version of __napi_complete().
 4026 * Only the current cpu owns and manipulates this napi,
 4027 * and NAPI_STATE_SCHED is the only possible flag set on backlog,
 4028 * so we can use a plain write instead of clear_bit()
 4029 * and we don't need an smp_mb() memory barrier.
4030 */
4031 list_del(&napi->poll_list);
4032 napi->state = 0;
4033
6e7676c1 4034 quota = work + qlen;
bea3348e 4035 }
e36fa2f7 4036 rps_unlock(sd);
6e7676c1
CG
4037 }
4038 local_irq_enable();
1da177e4 4039
bea3348e
SH
4040 return work;
4041}
1da177e4 4042
bea3348e
SH
4043/**
4044 * __napi_schedule - schedule for receive
c4ea43c5 4045 * @n: entry to schedule
bea3348e
SH
4046 *
4047 * The entry's receive function will be scheduled to run
4048 */
b5606c2d 4049void __napi_schedule(struct napi_struct *n)
bea3348e
SH
4050{
4051 unsigned long flags;
1da177e4 4052
bea3348e 4053 local_irq_save(flags);
eecfd7c4 4054 ____napi_schedule(&__get_cpu_var(softnet_data), n);
bea3348e 4055 local_irq_restore(flags);
1da177e4 4056}
bea3348e
SH
4057EXPORT_SYMBOL(__napi_schedule);
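/*
 * Illustrative sketch, not part of dev.c: the usual interrupt-handler side of
 * NAPI. napi_schedule_prep()/__napi_schedule() is the open-coded form of
 * napi_schedule(). The driver-private structure is an assumption for the
 * example, and <linux/interrupt.h> is assumed to be available.
 */
struct example_priv {			/* assumed driver-private state */
	struct napi_struct napi;
};

static irqreturn_t example_rx_interrupt(int irq, void *dev_id)
{
	struct example_priv *priv = dev_id;

	if (napi_schedule_prep(&priv->napi)) {
		/* A real driver would mask further RX interrupts here. */
		__napi_schedule(&priv->napi);
	}
	return IRQ_HANDLED;
}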
4058
d565b0a1
HX
4059void __napi_complete(struct napi_struct *n)
4060{
4061 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4062 BUG_ON(n->gro_list);
4063
4064 list_del(&n->poll_list);
4065 smp_mb__before_clear_bit();
4066 clear_bit(NAPI_STATE_SCHED, &n->state);
4067}
4068EXPORT_SYMBOL(__napi_complete);
4069
4070void napi_complete(struct napi_struct *n)
4071{
4072 unsigned long flags;
4073
4074 /*
4075 * don't let napi dequeue from the cpu poll list
4076 * just in case its running on a different cpu
4077 */
4078 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4079 return;
4080
2e71a6f8 4081 napi_gro_flush(n, false);
d565b0a1
HX
4082 local_irq_save(flags);
4083 __napi_complete(n);
4084 local_irq_restore(flags);
4085}
4086EXPORT_SYMBOL(napi_complete);
4087
4088void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4089 int (*poll)(struct napi_struct *, int), int weight)
4090{
4091 INIT_LIST_HEAD(&napi->poll_list);
4ae5544f 4092 napi->gro_count = 0;
d565b0a1 4093 napi->gro_list = NULL;
5d38a079 4094 napi->skb = NULL;
d565b0a1
HX
4095 napi->poll = poll;
4096 napi->weight = weight;
4097 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 4098 napi->dev = dev;
5d38a079 4099#ifdef CONFIG_NETPOLL
d565b0a1
HX
4100 spin_lock_init(&napi->poll_lock);
4101 napi->poll_owner = -1;
4102#endif
4103 set_bit(NAPI_STATE_SCHED, &napi->state);
4104}
4105EXPORT_SYMBOL(netif_napi_add);
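/*
 * Illustrative sketch, not part of dev.c: wiring a NAPI context up at probe
 * time and completing it from the poll callback. EXAMPLE_NAPI_WEIGHT and
 * example_clean_rx_ring() are assumptions for the example; 64 is the weight
 * most Ethernet drivers pass here.
 */
#define EXAMPLE_NAPI_WEIGHT 64

static int example_clean_rx_ring(struct net_device *dev, int budget);	/* assumed helper */

static int example_poll(struct napi_struct *napi, int budget)
{
	int work_done = example_clean_rx_ring(napi->dev, budget);

	if (work_done < budget) {
		napi_complete(napi);	/* clears NAPI_STATE_SCHED, flushes GRO */
		/* A real driver re-enables its RX interrupt here. */
	}
	return work_done;
}

static void example_setup_napi(struct net_device *dev, struct napi_struct *napi)
{
	netif_napi_add(dev, napi, example_poll, EXAMPLE_NAPI_WEIGHT);
	napi_enable(napi);	/* typically done from ndo_open */
}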
4106
4107void netif_napi_del(struct napi_struct *napi)
4108{
4109 struct sk_buff *skb, *next;
4110
d7b06636 4111 list_del_init(&napi->dev_list);
76620aaf 4112 napi_free_frags(napi);
d565b0a1
HX
4113
4114 for (skb = napi->gro_list; skb; skb = next) {
4115 next = skb->next;
4116 skb->next = NULL;
4117 kfree_skb(skb);
4118 }
4119
4120 napi->gro_list = NULL;
4ae5544f 4121 napi->gro_count = 0;
d565b0a1
HX
4122}
4123EXPORT_SYMBOL(netif_napi_del);
4124
1da177e4
LT
4125static void net_rx_action(struct softirq_action *h)
4126{
e326bed2 4127 struct softnet_data *sd = &__get_cpu_var(softnet_data);
24f8b238 4128 unsigned long time_limit = jiffies + 2;
51b0bded 4129 int budget = netdev_budget;
53fb95d3
MM
4130 void *have;
4131
1da177e4
LT
4132 local_irq_disable();
4133
e326bed2 4134 while (!list_empty(&sd->poll_list)) {
bea3348e
SH
4135 struct napi_struct *n;
4136 int work, weight;
1da177e4 4137
bea3348e 4138 /* If softirq window is exhausted then punt.
24f8b238
SH
 4139 * Allow this to run for 2 jiffies, which allows
4140 * an average latency of 1.5/HZ.
bea3348e 4141 */
24f8b238 4142 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
1da177e4
LT
4143 goto softnet_break;
4144
4145 local_irq_enable();
4146
bea3348e
SH
4147 /* Even though interrupts have been re-enabled, this
4148 * access is safe because interrupts can only add new
4149 * entries to the tail of this list, and only ->poll()
4150 * calls can remove this head entry from the list.
4151 */
e326bed2 4152 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
1da177e4 4153
bea3348e
SH
4154 have = netpoll_poll_lock(n);
4155
4156 weight = n->weight;
4157
0a7606c1
DM
4158 /* This NAPI_STATE_SCHED test is for avoiding a race
4159 * with netpoll's poll_napi(). Only the entity which
4160 * obtains the lock and sees NAPI_STATE_SCHED set will
4161 * actually make the ->poll() call. Therefore we avoid
25985edc 4162 * accidentally calling ->poll() when NAPI is not scheduled.
0a7606c1
DM
4163 */
4164 work = 0;
4ea7e386 4165 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
0a7606c1 4166 work = n->poll(n, weight);
4ea7e386
NH
4167 trace_napi_poll(n);
4168 }
bea3348e
SH
4169
4170 WARN_ON_ONCE(work > weight);
4171
4172 budget -= work;
4173
4174 local_irq_disable();
4175
4176 /* Drivers must not modify the NAPI state if they
4177 * consume the entire weight. In such cases this code
4178 * still "owns" the NAPI instance and therefore can
4179 * move the instance around on the list at-will.
4180 */
fed17f30 4181 if (unlikely(work == weight)) {
ff780cd8
HX
4182 if (unlikely(napi_disable_pending(n))) {
4183 local_irq_enable();
4184 napi_complete(n);
4185 local_irq_disable();
2e71a6f8
ED
4186 } else {
4187 if (n->gro_list) {
 4188 /* Flush packets that are too old.
4189 * If HZ < 1000, flush all packets.
4190 */
4191 local_irq_enable();
4192 napi_gro_flush(n, HZ >= 1000);
4193 local_irq_disable();
4194 }
e326bed2 4195 list_move_tail(&n->poll_list, &sd->poll_list);
2e71a6f8 4196 }
fed17f30 4197 }
bea3348e
SH
4198
4199 netpoll_poll_unlock(have);
1da177e4
LT
4200 }
4201out:
e326bed2 4202 net_rps_action_and_irq_enable(sd);
0a9627f2 4203
db217334
CL
4204#ifdef CONFIG_NET_DMA
4205 /*
4206 * There may not be any more sk_buffs coming right now, so push
4207 * any pending DMA copies to hardware
4208 */
2ba05622 4209 dma_issue_pending_all();
db217334 4210#endif
bea3348e 4211
1da177e4
LT
4212 return;
4213
4214softnet_break:
dee42870 4215 sd->time_squeeze++;
1da177e4
LT
4216 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4217 goto out;
4218}
4219
1da177e4 4220#ifdef CONFIG_PROC_FS
f04565dd 4221
2def16ae 4222#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
f04565dd
MM
4223
4224#define get_bucket(x) ((x) >> BUCKET_SPACE)
4225#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4226#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4227
2def16ae 4228static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
f04565dd 4229{
f04565dd
MM
4230 struct net *net = seq_file_net(seq);
4231 struct net_device *dev;
4232 struct hlist_node *p;
4233 struct hlist_head *h;
2def16ae 4234 unsigned int count = 0, offset = get_offset(*pos);
f04565dd 4235
2def16ae 4236 h = &net->dev_name_head[get_bucket(*pos)];
f04565dd 4237 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
2def16ae 4238 if (++count == offset)
f04565dd 4239 return dev;
f04565dd
MM
4240 }
4241
4242 return NULL;
4243}
4244
2def16ae 4245static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
f04565dd 4246{
f04565dd
MM
4247 struct net_device *dev;
4248 unsigned int bucket;
4249
f04565dd 4250 do {
2def16ae 4251 dev = dev_from_same_bucket(seq, pos);
f04565dd
MM
4252 if (dev)
4253 return dev;
4254
2def16ae
ED
4255 bucket = get_bucket(*pos) + 1;
4256 *pos = set_bucket_offset(bucket, 1);
f04565dd
MM
4257 } while (bucket < NETDEV_HASHENTRIES);
4258
4259 return NULL;
4260}
4261
1da177e4
LT
4262/*
4263 * This is invoked by the /proc filesystem handler to display a device
4264 * in detail.
4265 */
7562f876 4266void *dev_seq_start(struct seq_file *seq, loff_t *pos)
c6d14c84 4267 __acquires(RCU)
1da177e4 4268{
c6d14c84 4269 rcu_read_lock();
7562f876
PE
4270 if (!*pos)
4271 return SEQ_START_TOKEN;
1da177e4 4272
2def16ae 4273 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
f04565dd 4274 return NULL;
1da177e4 4275
2def16ae 4276 return dev_from_bucket(seq, pos);
1da177e4
LT
4277}
4278
4279void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4280{
f04565dd 4281 ++*pos;
2def16ae 4282 return dev_from_bucket(seq, pos);
1da177e4
LT
4283}
4284
4285void dev_seq_stop(struct seq_file *seq, void *v)
c6d14c84 4286 __releases(RCU)
1da177e4 4287{
c6d14c84 4288 rcu_read_unlock();
1da177e4
LT
4289}
4290
4291static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4292{
28172739
ED
4293 struct rtnl_link_stats64 temp;
4294 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
1da177e4 4295
be1f3c2c
BH
4296 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4297 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
5a1b5898
RR
4298 dev->name, stats->rx_bytes, stats->rx_packets,
4299 stats->rx_errors,
4300 stats->rx_dropped + stats->rx_missed_errors,
4301 stats->rx_fifo_errors,
4302 stats->rx_length_errors + stats->rx_over_errors +
4303 stats->rx_crc_errors + stats->rx_frame_errors,
4304 stats->rx_compressed, stats->multicast,
4305 stats->tx_bytes, stats->tx_packets,
4306 stats->tx_errors, stats->tx_dropped,
4307 stats->tx_fifo_errors, stats->collisions,
4308 stats->tx_carrier_errors +
4309 stats->tx_aborted_errors +
4310 stats->tx_window_errors +
4311 stats->tx_heartbeat_errors,
4312 stats->tx_compressed);
1da177e4
LT
4313}
4314
4315/*
4316 * Called from the PROCfs module. This now uses the new arbitrary sized
4317 * /proc/net interface to create /proc/net/dev
4318 */
4319static int dev_seq_show(struct seq_file *seq, void *v)
4320{
4321 if (v == SEQ_START_TOKEN)
4322 seq_puts(seq, "Inter-| Receive "
4323 " | Transmit\n"
4324 " face |bytes packets errs drop fifo frame "
4325 "compressed multicast|bytes packets errs "
4326 "drop fifo colls carrier compressed\n");
4327 else
4328 dev_seq_printf_stats(seq, v);
4329 return 0;
4330}
4331
dee42870 4332static struct softnet_data *softnet_get_online(loff_t *pos)
1da177e4 4333{
dee42870 4334 struct softnet_data *sd = NULL;
1da177e4 4335
0c0b0aca 4336 while (*pos < nr_cpu_ids)
4ec93edb 4337 if (cpu_online(*pos)) {
dee42870 4338 sd = &per_cpu(softnet_data, *pos);
1da177e4
LT
4339 break;
4340 } else
4341 ++*pos;
dee42870 4342 return sd;
1da177e4
LT
4343}
4344
4345static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4346{
4347 return softnet_get_online(pos);
4348}
4349
4350static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4351{
4352 ++*pos;
4353 return softnet_get_online(pos);
4354}
4355
4356static void softnet_seq_stop(struct seq_file *seq, void *v)
4357{
4358}
4359
4360static int softnet_seq_show(struct seq_file *seq, void *v)
4361{
dee42870 4362 struct softnet_data *sd = v;
1da177e4 4363
0a9627f2 4364 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
dee42870 4365 sd->processed, sd->dropped, sd->time_squeeze, 0,
c1ebcdb8 4366 0, 0, 0, 0, /* was fastroute */
dee42870 4367 sd->cpu_collision, sd->received_rps);
1da177e4
LT
4368 return 0;
4369}
4370
f690808e 4371static const struct seq_operations dev_seq_ops = {
1da177e4
LT
4372 .start = dev_seq_start,
4373 .next = dev_seq_next,
4374 .stop = dev_seq_stop,
4375 .show = dev_seq_show,
4376};
4377
4378static int dev_seq_open(struct inode *inode, struct file *file)
4379{
e372c414 4380 return seq_open_net(inode, file, &dev_seq_ops,
2def16ae 4381 sizeof(struct seq_net_private));
5cac98dd
AB
4382}
4383
9a32144e 4384static const struct file_operations dev_seq_fops = {
1da177e4
LT
4385 .owner = THIS_MODULE,
4386 .open = dev_seq_open,
4387 .read = seq_read,
4388 .llseek = seq_lseek,
e372c414 4389 .release = seq_release_net,
1da177e4
LT
4390};
4391
f690808e 4392static const struct seq_operations softnet_seq_ops = {
1da177e4
LT
4393 .start = softnet_seq_start,
4394 .next = softnet_seq_next,
4395 .stop = softnet_seq_stop,
4396 .show = softnet_seq_show,
4397};
4398
4399static int softnet_seq_open(struct inode *inode, struct file *file)
4400{
4401 return seq_open(file, &softnet_seq_ops);
4402}
4403
9a32144e 4404static const struct file_operations softnet_seq_fops = {
1da177e4
LT
4405 .owner = THIS_MODULE,
4406 .open = softnet_seq_open,
4407 .read = seq_read,
4408 .llseek = seq_lseek,
4409 .release = seq_release,
4410};
4411
0e1256ff
SH
4412static void *ptype_get_idx(loff_t pos)
4413{
4414 struct packet_type *pt = NULL;
4415 loff_t i = 0;
4416 int t;
4417
4418 list_for_each_entry_rcu(pt, &ptype_all, list) {
4419 if (i == pos)
4420 return pt;
4421 ++i;
4422 }
4423
82d8a867 4424 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
0e1256ff
SH
4425 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4426 if (i == pos)
4427 return pt;
4428 ++i;
4429 }
4430 }
4431 return NULL;
4432}
4433
4434static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
72348a42 4435 __acquires(RCU)
0e1256ff
SH
4436{
4437 rcu_read_lock();
4438 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4439}
4440
4441static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4442{
4443 struct packet_type *pt;
4444 struct list_head *nxt;
4445 int hash;
4446
4447 ++*pos;
4448 if (v == SEQ_START_TOKEN)
4449 return ptype_get_idx(0);
4450
4451 pt = v;
4452 nxt = pt->list.next;
4453 if (pt->type == htons(ETH_P_ALL)) {
4454 if (nxt != &ptype_all)
4455 goto found;
4456 hash = 0;
4457 nxt = ptype_base[0].next;
4458 } else
82d8a867 4459 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
0e1256ff
SH
4460
4461 while (nxt == &ptype_base[hash]) {
82d8a867 4462 if (++hash >= PTYPE_HASH_SIZE)
0e1256ff
SH
4463 return NULL;
4464 nxt = ptype_base[hash].next;
4465 }
4466found:
4467 return list_entry(nxt, struct packet_type, list);
4468}
4469
4470static void ptype_seq_stop(struct seq_file *seq, void *v)
72348a42 4471 __releases(RCU)
0e1256ff
SH
4472{
4473 rcu_read_unlock();
4474}
4475
0e1256ff
SH
4476static int ptype_seq_show(struct seq_file *seq, void *v)
4477{
4478 struct packet_type *pt = v;
4479
4480 if (v == SEQ_START_TOKEN)
4481 seq_puts(seq, "Type Device Function\n");
c346dca1 4482 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
0e1256ff
SH
4483 if (pt->type == htons(ETH_P_ALL))
4484 seq_puts(seq, "ALL ");
4485 else
4486 seq_printf(seq, "%04x", ntohs(pt->type));
4487
908cd2da
AD
4488 seq_printf(seq, " %-8s %pF\n",
4489 pt->dev ? pt->dev->name : "", pt->func);
0e1256ff
SH
4490 }
4491
4492 return 0;
4493}
4494
4495static const struct seq_operations ptype_seq_ops = {
4496 .start = ptype_seq_start,
4497 .next = ptype_seq_next,
4498 .stop = ptype_seq_stop,
4499 .show = ptype_seq_show,
4500};
4501
4502static int ptype_seq_open(struct inode *inode, struct file *file)
4503{
2feb27db
PE
4504 return seq_open_net(inode, file, &ptype_seq_ops,
4505 sizeof(struct seq_net_private));
0e1256ff
SH
4506}
4507
4508static const struct file_operations ptype_seq_fops = {
4509 .owner = THIS_MODULE,
4510 .open = ptype_seq_open,
4511 .read = seq_read,
4512 .llseek = seq_lseek,
2feb27db 4513 .release = seq_release_net,
0e1256ff
SH
4514};
4515
4516
4665079c 4517static int __net_init dev_proc_net_init(struct net *net)
1da177e4
LT
4518{
4519 int rc = -ENOMEM;
4520
d4beaa66 4521 if (!proc_create("dev", S_IRUGO, net->proc_net, &dev_seq_fops))
1da177e4 4522 goto out;
d4beaa66
G
4523 if (!proc_create("softnet_stat", S_IRUGO, net->proc_net,
4524 &softnet_seq_fops))
1da177e4 4525 goto out_dev;
d4beaa66 4526 if (!proc_create("ptype", S_IRUGO, net->proc_net, &ptype_seq_fops))
457c4cbc 4527 goto out_softnet;
0e1256ff 4528
881d966b 4529 if (wext_proc_init(net))
457c4cbc 4530 goto out_ptype;
1da177e4
LT
4531 rc = 0;
4532out:
4533 return rc;
457c4cbc 4534out_ptype:
ece31ffd 4535 remove_proc_entry("ptype", net->proc_net);
1da177e4 4536out_softnet:
ece31ffd 4537 remove_proc_entry("softnet_stat", net->proc_net);
1da177e4 4538out_dev:
ece31ffd 4539 remove_proc_entry("dev", net->proc_net);
1da177e4
LT
4540 goto out;
4541}
881d966b 4542
4665079c 4543static void __net_exit dev_proc_net_exit(struct net *net)
881d966b
EB
4544{
4545 wext_proc_exit(net);
4546
ece31ffd
G
4547 remove_proc_entry("ptype", net->proc_net);
4548 remove_proc_entry("softnet_stat", net->proc_net);
4549 remove_proc_entry("dev", net->proc_net);
881d966b
EB
4550}
4551
022cbae6 4552static struct pernet_operations __net_initdata dev_proc_ops = {
881d966b
EB
4553 .init = dev_proc_net_init,
4554 .exit = dev_proc_net_exit,
4555};
4556
4557static int __init dev_proc_init(void)
4558{
4559 return register_pernet_subsys(&dev_proc_ops);
4560}
1da177e4
LT
4561#else
4562#define dev_proc_init() 0
4563#endif /* CONFIG_PROC_FS */
4564
4565
9ff162a8
JP
4566struct netdev_upper {
4567 struct net_device *dev;
4568 bool master;
4569 struct list_head list;
4570 struct rcu_head rcu;
4571 struct list_head search_list;
4572};
4573
4574static void __append_search_uppers(struct list_head *search_list,
4575 struct net_device *dev)
4576{
4577 struct netdev_upper *upper;
4578
4579 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4580 /* check if this upper is not already in search list */
4581 if (list_empty(&upper->search_list))
4582 list_add_tail(&upper->search_list, search_list);
4583 }
4584}
4585
4586static bool __netdev_search_upper_dev(struct net_device *dev,
4587 struct net_device *upper_dev)
4588{
4589 LIST_HEAD(search_list);
4590 struct netdev_upper *upper;
4591 struct netdev_upper *tmp;
4592 bool ret = false;
4593
4594 __append_search_uppers(&search_list, dev);
4595 list_for_each_entry(upper, &search_list, search_list) {
4596 if (upper->dev == upper_dev) {
4597 ret = true;
4598 break;
4599 }
4600 __append_search_uppers(&search_list, upper->dev);
4601 }
4602 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4603 INIT_LIST_HEAD(&upper->search_list);
4604 return ret;
4605}
4606
4607static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4608 struct net_device *upper_dev)
4609{
4610 struct netdev_upper *upper;
4611
4612 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4613 if (upper->dev == upper_dev)
4614 return upper;
4615 }
4616 return NULL;
4617}
4618
4619/**
4620 * netdev_has_upper_dev - Check if device is linked to an upper device
4621 * @dev: device
4622 * @upper_dev: upper device to check
4623 *
 4624 * Find out if a device is linked to the specified upper device and return true
 4625 * in case it is. Note that this checks only the immediate upper device,
4626 * not through a complete stack of devices. The caller must hold the RTNL lock.
4627 */
4628bool netdev_has_upper_dev(struct net_device *dev,
4629 struct net_device *upper_dev)
4630{
4631 ASSERT_RTNL();
4632
4633 return __netdev_find_upper(dev, upper_dev);
4634}
4635EXPORT_SYMBOL(netdev_has_upper_dev);
4636
4637/**
4638 * netdev_has_any_upper_dev - Check if device is linked to some device
4639 * @dev: device
4640 *
4641 * Find out if a device is linked to an upper device and return true in case
4642 * it is. The caller must hold the RTNL lock.
4643 */
4644bool netdev_has_any_upper_dev(struct net_device *dev)
4645{
4646 ASSERT_RTNL();
4647
4648 return !list_empty(&dev->upper_dev_list);
4649}
4650EXPORT_SYMBOL(netdev_has_any_upper_dev);
4651
4652/**
4653 * netdev_master_upper_dev_get - Get master upper device
4654 * @dev: device
4655 *
4656 * Find a master upper device and return pointer to it or NULL in case
4657 * it's not there. The caller must hold the RTNL lock.
4658 */
4659struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4660{
4661 struct netdev_upper *upper;
4662
4663 ASSERT_RTNL();
4664
4665 if (list_empty(&dev->upper_dev_list))
4666 return NULL;
4667
4668 upper = list_first_entry(&dev->upper_dev_list,
4669 struct netdev_upper, list);
4670 if (likely(upper->master))
4671 return upper->dev;
4672 return NULL;
4673}
4674EXPORT_SYMBOL(netdev_master_upper_dev_get);
4675
4676/**
4677 * netdev_master_upper_dev_get_rcu - Get master upper device
4678 * @dev: device
4679 *
4680 * Find a master upper device and return pointer to it or NULL in case
4681 * it's not there. The caller must hold the RCU read lock.
4682 */
4683struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4684{
4685 struct netdev_upper *upper;
4686
4687 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4688 struct netdev_upper, list);
4689 if (upper && likely(upper->master))
4690 return upper->dev;
4691 return NULL;
4692}
4693EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4694
4695static int __netdev_upper_dev_link(struct net_device *dev,
4696 struct net_device *upper_dev, bool master)
4697{
4698 struct netdev_upper *upper;
4699
4700 ASSERT_RTNL();
4701
4702 if (dev == upper_dev)
4703 return -EBUSY;
4704
 4705 /* To prevent loops, check that dev is not an upper device of upper_dev. */
4706 if (__netdev_search_upper_dev(upper_dev, dev))
4707 return -EBUSY;
4708
4709 if (__netdev_find_upper(dev, upper_dev))
4710 return -EEXIST;
4711
4712 if (master && netdev_master_upper_dev_get(dev))
4713 return -EBUSY;
4714
4715 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4716 if (!upper)
4717 return -ENOMEM;
4718
4719 upper->dev = upper_dev;
4720 upper->master = master;
4721 INIT_LIST_HEAD(&upper->search_list);
4722
4723 /* Ensure that master upper link is always the first item in list. */
4724 if (master)
4725 list_add_rcu(&upper->list, &dev->upper_dev_list);
4726 else
4727 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4728 dev_hold(upper_dev);
4729
4730 return 0;
4731}
4732
4733/**
4734 * netdev_upper_dev_link - Add a link to the upper device
4735 * @dev: device
4736 * @upper_dev: new upper device
4737 *
4738 * Adds a link to device which is upper to this one. The caller must hold
4739 * the RTNL lock. On a failure a negative errno code is returned.
4740 * On success the reference counts are adjusted and the function
4741 * returns zero.
4742 */
4743int netdev_upper_dev_link(struct net_device *dev,
4744 struct net_device *upper_dev)
4745{
4746 return __netdev_upper_dev_link(dev, upper_dev, false);
4747}
4748EXPORT_SYMBOL(netdev_upper_dev_link);
4749
4750/**
4751 * netdev_master_upper_dev_link - Add a master link to the upper device
4752 * @dev: device
4753 * @upper_dev: new upper device
4754 *
4755 * Adds a link to device which is upper to this one. In this case, only
4756 * one master upper device can be linked, although other non-master devices
4757 * might be linked as well. The caller must hold the RTNL lock.
4758 * On a failure a negative errno code is returned. On success the reference
4759 * counts are adjusted and the function returns zero.
4760 */
4761int netdev_master_upper_dev_link(struct net_device *dev,
4762 struct net_device *upper_dev)
4763{
4764 return __netdev_upper_dev_link(dev, upper_dev, true);
4765}
4766EXPORT_SYMBOL(netdev_master_upper_dev_link);
4767
4768/**
4769 * netdev_upper_dev_unlink - Removes a link to upper device
4770 * @dev: device
4771 * @upper_dev: new upper device
4772 *
4773 * Removes a link to device which is upper to this one. The caller must hold
4774 * the RTNL lock.
4775 */
4776void netdev_upper_dev_unlink(struct net_device *dev,
4777 struct net_device *upper_dev)
4778{
4779 struct netdev_upper *upper;
4780
4781 ASSERT_RTNL();
4782
4783 upper = __netdev_find_upper(dev, upper_dev);
4784 if (!upper)
4785 return;
4786 list_del_rcu(&upper->list);
4787 dev_put(upper_dev);
4788 kfree_rcu(upper, rcu);
4789}
4790EXPORT_SYMBOL(netdev_upper_dev_unlink);
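/*
 * Illustrative sketch, not part of dev.c: how a bonding/bridge-like driver
 * might attach and detach a slave using the master-upper-link API above.
 * Both helpers assume the caller already holds RTNL, as the API requires.
 */
static int example_enslave(struct net_device *master, struct net_device *slave)
{
	int err;

	ASSERT_RTNL();

	/* Fails with -EBUSY if slave already has a master upper device. */
	err = netdev_master_upper_dev_link(slave, master);
	if (err)
		return err;

	/* The master link is always the first entry in slave->upper_dev_list. */
	WARN_ON(netdev_master_upper_dev_get(slave) != master);
	return 0;
}

static void example_release(struct net_device *master, struct net_device *slave)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(slave, master);
}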
4791
b6c40d68
PM
4792static void dev_change_rx_flags(struct net_device *dev, int flags)
4793{
d314774c
SH
4794 const struct net_device_ops *ops = dev->netdev_ops;
4795
4796 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4797 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
4798}
4799
dad9b335 4800static int __dev_set_promiscuity(struct net_device *dev, int inc)
1da177e4 4801{
b536db93 4802 unsigned int old_flags = dev->flags;
d04a48b0
EB
4803 kuid_t uid;
4804 kgid_t gid;
1da177e4 4805
24023451
PM
4806 ASSERT_RTNL();
4807
dad9b335
WC
4808 dev->flags |= IFF_PROMISC;
4809 dev->promiscuity += inc;
4810 if (dev->promiscuity == 0) {
4811 /*
4812 * Avoid overflow.
 4813 * If inc causes overflow, leave promiscuity untouched and return an error.
4814 */
4815 if (inc < 0)
4816 dev->flags &= ~IFF_PROMISC;
4817 else {
4818 dev->promiscuity -= inc;
7b6cd1ce
JP
4819 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4820 dev->name);
dad9b335
WC
4821 return -EOVERFLOW;
4822 }
4823 }
52609c0b 4824 if (dev->flags != old_flags) {
7b6cd1ce
JP
4825 pr_info("device %s %s promiscuous mode\n",
4826 dev->name,
4827 dev->flags & IFF_PROMISC ? "entered" : "left");
8192b0c4
DH
4828 if (audit_enabled) {
4829 current_uid_gid(&uid, &gid);
7759db82
KHK
4830 audit_log(current->audit_context, GFP_ATOMIC,
4831 AUDIT_ANOM_PROMISCUOUS,
4832 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4833 dev->name, (dev->flags & IFF_PROMISC),
4834 (old_flags & IFF_PROMISC),
e1760bd5 4835 from_kuid(&init_user_ns, audit_get_loginuid(current)),
d04a48b0
EB
4836 from_kuid(&init_user_ns, uid),
4837 from_kgid(&init_user_ns, gid),
7759db82 4838 audit_get_sessionid(current));
8192b0c4 4839 }
24023451 4840
b6c40d68 4841 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 4842 }
dad9b335 4843 return 0;
1da177e4
LT
4844}
4845
4417da66
PM
4846/**
4847 * dev_set_promiscuity - update promiscuity count on a device
4848 * @dev: device
4849 * @inc: modifier
4850 *
4851 * Add or remove promiscuity from a device. While the count in the device
4852 * remains above zero the interface remains promiscuous. Once it hits zero
4853 * the device reverts back to normal filtering operation. A negative inc
4854 * value is used to drop promiscuity on the device.
dad9b335 4855 * Return 0 if successful or a negative errno code on error.
4417da66 4856 */
dad9b335 4857int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66 4858{
b536db93 4859 unsigned int old_flags = dev->flags;
dad9b335 4860 int err;
4417da66 4861
dad9b335 4862 err = __dev_set_promiscuity(dev, inc);
4b5a698e 4863 if (err < 0)
dad9b335 4864 return err;
4417da66
PM
4865 if (dev->flags != old_flags)
4866 dev_set_rx_mode(dev);
dad9b335 4867 return err;
4417da66 4868}
d1b19dff 4869EXPORT_SYMBOL(dev_set_promiscuity);
4417da66 4870
1da177e4
LT
4871/**
4872 * dev_set_allmulti - update allmulti count on a device
4873 * @dev: device
4874 * @inc: modifier
4875 *
4876 * Add or remove reception of all multicast frames to a device. While the
 4877 * count in the device remains above zero the interface remains listening
 4878 * to all multicast frames. Once it hits zero the device reverts back to normal
4879 * filtering operation. A negative @inc value is used to drop the counter
4880 * when releasing a resource needing all multicasts.
dad9b335 4881 * Return 0 if successful or a negative errno code on error.
1da177e4
LT
4882 */
4883
dad9b335 4884int dev_set_allmulti(struct net_device *dev, int inc)
1da177e4 4885{
b536db93 4886 unsigned int old_flags = dev->flags;
1da177e4 4887
24023451
PM
4888 ASSERT_RTNL();
4889
1da177e4 4890 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
4891 dev->allmulti += inc;
4892 if (dev->allmulti == 0) {
4893 /*
4894 * Avoid overflow.
 4895 * If inc causes overflow, leave allmulti untouched and return an error.
4896 */
4897 if (inc < 0)
4898 dev->flags &= ~IFF_ALLMULTI;
4899 else {
4900 dev->allmulti -= inc;
7b6cd1ce
JP
4901 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4902 dev->name);
dad9b335
WC
4903 return -EOVERFLOW;
4904 }
4905 }
24023451 4906 if (dev->flags ^ old_flags) {
b6c40d68 4907 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 4908 dev_set_rx_mode(dev);
24023451 4909 }
dad9b335 4910 return 0;
4417da66 4911}
d1b19dff 4912EXPORT_SYMBOL(dev_set_allmulti);
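/*
 * Illustrative sketch, not part of dev.c: the counted nature of the
 * promiscuity/allmulti helpers above. Every +1 must eventually be balanced
 * by a -1; the interface leaves the mode only when the count drops back to
 * zero. Callers must hold RTNL.
 */
static int example_start_capture(struct net_device *dev)
{
	int err;

	ASSERT_RTNL();
	err = dev_set_promiscuity(dev, 1);	/* may return -EOVERFLOW */
	if (err)
		return err;

	err = dev_set_allmulti(dev, 1);
	if (err)
		dev_set_promiscuity(dev, -1);	/* roll back on failure */
	return err;
}

static void example_stop_capture(struct net_device *dev)
{
	ASSERT_RTNL();
	dev_set_allmulti(dev, -1);
	dev_set_promiscuity(dev, -1);
}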
4417da66
PM
4913
4914/*
4915 * Upload unicast and multicast address lists to device and
4916 * configure RX filtering. When the device doesn't support unicast
53ccaae1 4917 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
4918 * are present.
4919 */
4920void __dev_set_rx_mode(struct net_device *dev)
4921{
d314774c
SH
4922 const struct net_device_ops *ops = dev->netdev_ops;
4923
4417da66
PM
4924 /* dev_open will call this function so the list will stay sane. */
4925 if (!(dev->flags&IFF_UP))
4926 return;
4927
4928 if (!netif_device_present(dev))
40b77c94 4929 return;
4417da66 4930
01789349 4931 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4417da66
PM
4932 /* Unicast addresses changes may only happen under the rtnl,
4933 * therefore calling __dev_set_promiscuity here is safe.
4934 */
32e7bfc4 4935 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4417da66 4936 __dev_set_promiscuity(dev, 1);
2d348d1f 4937 dev->uc_promisc = true;
32e7bfc4 4938 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4417da66 4939 __dev_set_promiscuity(dev, -1);
2d348d1f 4940 dev->uc_promisc = false;
4417da66 4941 }
4417da66 4942 }
01789349
JP
4943
4944 if (ops->ndo_set_rx_mode)
4945 ops->ndo_set_rx_mode(dev);
4417da66
PM
4946}
4947
4948void dev_set_rx_mode(struct net_device *dev)
4949{
b9e40857 4950 netif_addr_lock_bh(dev);
4417da66 4951 __dev_set_rx_mode(dev);
b9e40857 4952 netif_addr_unlock_bh(dev);
1da177e4
LT
4953}
4954
f0db275a
SH
4955/**
4956 * dev_get_flags - get flags reported to userspace
4957 * @dev: device
4958 *
4959 * Get the combination of flag bits exported through APIs to userspace.
4960 */
95c96174 4961unsigned int dev_get_flags(const struct net_device *dev)
1da177e4 4962{
95c96174 4963 unsigned int flags;
1da177e4
LT
4964
4965 flags = (dev->flags & ~(IFF_PROMISC |
4966 IFF_ALLMULTI |
b00055aa
SR
4967 IFF_RUNNING |
4968 IFF_LOWER_UP |
4969 IFF_DORMANT)) |
1da177e4
LT
4970 (dev->gflags & (IFF_PROMISC |
4971 IFF_ALLMULTI));
4972
b00055aa
SR
4973 if (netif_running(dev)) {
4974 if (netif_oper_up(dev))
4975 flags |= IFF_RUNNING;
4976 if (netif_carrier_ok(dev))
4977 flags |= IFF_LOWER_UP;
4978 if (netif_dormant(dev))
4979 flags |= IFF_DORMANT;
4980 }
1da177e4
LT
4981
4982 return flags;
4983}
d1b19dff 4984EXPORT_SYMBOL(dev_get_flags);
1da177e4 4985
bd380811 4986int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 4987{
b536db93 4988 unsigned int old_flags = dev->flags;
bd380811 4989 int ret;
1da177e4 4990
24023451
PM
4991 ASSERT_RTNL();
4992
1da177e4
LT
4993 /*
4994 * Set the flags on our device.
4995 */
4996
4997 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4998 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4999 IFF_AUTOMEDIA)) |
5000 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
5001 IFF_ALLMULTI));
5002
5003 /*
5004 * Load in the correct multicast list now the flags have changed.
5005 */
5006
b6c40d68
PM
5007 if ((old_flags ^ flags) & IFF_MULTICAST)
5008 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 5009
4417da66 5010 dev_set_rx_mode(dev);
1da177e4
LT
5011
5012 /*
 5013 * Have we downed the interface? We handle IFF_UP ourselves
5014 * according to user attempts to set it, rather than blindly
5015 * setting it.
5016 */
5017
5018 ret = 0;
5019 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
bd380811 5020 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4
LT
5021
5022 if (!ret)
4417da66 5023 dev_set_rx_mode(dev);
1da177e4
LT
5024 }
5025
1da177e4 5026 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff
ED
5027 int inc = (flags & IFF_PROMISC) ? 1 : -1;
5028
1da177e4
LT
5029 dev->gflags ^= IFF_PROMISC;
5030 dev_set_promiscuity(dev, inc);
5031 }
5032
5033 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 5034 is important. Some (broken) drivers set IFF_PROMISC when
 5035 IFF_ALLMULTI is requested, without asking us and without reporting it.
5036 */
5037 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
5038 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
5039
1da177e4
LT
5040 dev->gflags ^= IFF_ALLMULTI;
5041 dev_set_allmulti(dev, inc);
5042 }
5043
bd380811
PM
5044 return ret;
5045}
5046
5047void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
5048{
5049 unsigned int changes = dev->flags ^ old_flags;
5050
5051 if (changes & IFF_UP) {
5052 if (dev->flags & IFF_UP)
5053 call_netdevice_notifiers(NETDEV_UP, dev);
5054 else
5055 call_netdevice_notifiers(NETDEV_DOWN, dev);
5056 }
5057
5058 if (dev->flags & IFF_UP &&
5059 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
5060 call_netdevice_notifiers(NETDEV_CHANGE, dev);
5061}
5062
5063/**
5064 * dev_change_flags - change device settings
5065 * @dev: device
5066 * @flags: device state flags
5067 *
5068 * Change settings on device based state flags. The flags are
5069 * in the userspace exported format.
5070 */
b536db93 5071int dev_change_flags(struct net_device *dev, unsigned int flags)
bd380811 5072{
b536db93
ED
5073 int ret;
5074 unsigned int changes, old_flags = dev->flags;
bd380811
PM
5075
5076 ret = __dev_change_flags(dev, flags);
5077 if (ret < 0)
5078 return ret;
5079
5080 changes = old_flags ^ dev->flags;
7c355f53
TG
5081 if (changes)
5082 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
1da177e4 5083
bd380811 5084 __dev_notify_flags(dev, old_flags);
1da177e4
LT
5085 return ret;
5086}
d1b19dff 5087EXPORT_SYMBOL(dev_change_flags);
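/*
 * Illustrative sketch, not part of dev.c: bringing an interface up the way an
 * SIOCSIFFLAGS handler would, by reading the userspace-visible flags with
 * dev_get_flags() and writing them back through dev_change_flags() under
 * RTNL. The interface name is a placeholder.
 */
static int example_bring_up(struct net *net, const char *name)
{
	struct net_device *dev;
	int err;

	rtnl_lock();
	dev = __dev_get_by_name(net, name);
	if (!dev) {
		err = -ENODEV;
	} else {
		unsigned int flags = dev_get_flags(dev) | IFF_UP;

		err = dev_change_flags(dev, flags);	/* opens the device, sends NETDEV_UP */
	}
	rtnl_unlock();
	return err;
}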
1da177e4 5088
f0db275a
SH
5089/**
5090 * dev_set_mtu - Change maximum transfer unit
5091 * @dev: device
5092 * @new_mtu: new transfer unit
5093 *
5094 * Change the maximum transfer size of the network device.
5095 */
1da177e4
LT
5096int dev_set_mtu(struct net_device *dev, int new_mtu)
5097{
d314774c 5098 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
5099 int err;
5100
5101 if (new_mtu == dev->mtu)
5102 return 0;
5103
5104 /* MTU must be positive. */
5105 if (new_mtu < 0)
5106 return -EINVAL;
5107
5108 if (!netif_device_present(dev))
5109 return -ENODEV;
5110
5111 err = 0;
d314774c
SH
5112 if (ops->ndo_change_mtu)
5113 err = ops->ndo_change_mtu(dev, new_mtu);
1da177e4
LT
5114 else
5115 dev->mtu = new_mtu;
d314774c 5116
e3d8fabe 5117 if (!err)
056925ab 5118 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
1da177e4
LT
5119 return err;
5120}
d1b19dff 5121EXPORT_SYMBOL(dev_set_mtu);
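/*
 * Illustrative sketch, not part of dev.c: changing the MTU of a device looked
 * up by name. dev_set_mtu() is called under RTNL because ndo_change_mtu() and
 * the NETDEV_CHANGEMTU notifier expect it; 9000 is just an example value.
 */
static int example_set_jumbo_mtu(struct net *net, const char *name)
{
	struct net_device *dev;
	int err = -ENODEV;

	rtnl_lock();
	dev = __dev_get_by_name(net, name);
	if (dev)
		err = dev_set_mtu(dev, 9000);	/* -EINVAL for negative MTUs */
	rtnl_unlock();
	return err;
}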
1da177e4 5122
cbda10fa
VD
5123/**
5124 * dev_set_group - Change group this device belongs to
5125 * @dev: device
5126 * @new_group: group this device should belong to
5127 */
5128void dev_set_group(struct net_device *dev, int new_group)
5129{
5130 dev->group = new_group;
5131}
5132EXPORT_SYMBOL(dev_set_group);
5133
f0db275a
SH
5134/**
5135 * dev_set_mac_address - Change Media Access Control Address
5136 * @dev: device
5137 * @sa: new address
5138 *
5139 * Change the hardware (MAC) address of the device
5140 */
1da177e4
LT
5141int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
5142{
d314774c 5143 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
5144 int err;
5145
d314774c 5146 if (!ops->ndo_set_mac_address)
1da177e4
LT
5147 return -EOPNOTSUPP;
5148 if (sa->sa_family != dev->type)
5149 return -EINVAL;
5150 if (!netif_device_present(dev))
5151 return -ENODEV;
d314774c 5152 err = ops->ndo_set_mac_address(dev, sa);
f6521516
JP
5153 if (err)
5154 return err;
fbdeca2d 5155 dev->addr_assign_type = NET_ADDR_SET;
f6521516 5156 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7bf23575 5157 add_device_randomness(dev->dev_addr, dev->addr_len);
f6521516 5158 return 0;
1da177e4 5159}
d1b19dff 5160EXPORT_SYMBOL(dev_set_mac_address);
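/*
 * Illustrative sketch, not part of dev.c: setting a hardware address the way
 * SIOCSIFHWADDR does, by filling a struct sockaddr whose sa_family matches
 * dev->type. The address bytes are placeholders supplied by the caller.
 */
static int example_set_mac(struct net_device *dev, const u8 addr[ETH_ALEN])
{
	struct sockaddr sa;

	ASSERT_RTNL();			/* NETDEV_CHANGEADDR is sent under RTNL */
	sa.sa_family = dev->type;	/* must match, otherwise -EINVAL */
	memcpy(sa.sa_data, addr, ETH_ALEN);
	return dev_set_mac_address(dev, &sa);
}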
1da177e4 5161
4bf84c35
JP
5162/**
5163 * dev_change_carrier - Change device carrier
5164 * @dev: device
 5165 * @new_carrier: new value
5166 *
5167 * Change device carrier
5168 */
5169int dev_change_carrier(struct net_device *dev, bool new_carrier)
5170{
5171 const struct net_device_ops *ops = dev->netdev_ops;
5172
5173 if (!ops->ndo_change_carrier)
5174 return -EOPNOTSUPP;
5175 if (!netif_device_present(dev))
5176 return -ENODEV;
5177 return ops->ndo_change_carrier(dev, new_carrier);
5178}
5179EXPORT_SYMBOL(dev_change_carrier);
5180
1da177e4
LT
5181/**
5182 * dev_new_index - allocate an ifindex
c4ea43c5 5183 * @net: the applicable net namespace
1da177e4
LT
5184 *
5185 * Returns a suitable unique value for a new device interface
5186 * number. The caller must hold the rtnl semaphore or the
5187 * dev_base_lock to be sure it remains unique.
5188 */
881d966b 5189static int dev_new_index(struct net *net)
1da177e4 5190{
aa79e66e 5191 int ifindex = net->ifindex;
1da177e4
LT
5192 for (;;) {
5193 if (++ifindex <= 0)
5194 ifindex = 1;
881d966b 5195 if (!__dev_get_by_index(net, ifindex))
aa79e66e 5196 return net->ifindex = ifindex;
1da177e4
LT
5197 }
5198}
5199
1da177e4 5200/* Delayed registration/unregistration */
3b5b34fd 5201static LIST_HEAD(net_todo_list);
1da177e4 5202
6f05f629 5203static void net_set_todo(struct net_device *dev)
1da177e4 5204{
1da177e4 5205 list_add_tail(&dev->todo_list, &net_todo_list);
1da177e4
LT
5206}
5207
9b5e383c 5208static void rollback_registered_many(struct list_head *head)
93ee31f1 5209{
e93737b0 5210 struct net_device *dev, *tmp;
9b5e383c 5211
93ee31f1
DL
5212 BUG_ON(dev_boot_phase);
5213 ASSERT_RTNL();
5214
e93737b0 5215 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 5216 /* Some devices call without registering
e93737b0
KK
5217 * for initialization unwind. Remove those
5218 * devices and proceed with the remaining.
9b5e383c
ED
5219 */
5220 if (dev->reg_state == NETREG_UNINITIALIZED) {
7b6cd1ce
JP
5221 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5222 dev->name, dev);
93ee31f1 5223
9b5e383c 5224 WARN_ON(1);
e93737b0
KK
5225 list_del(&dev->unreg_list);
5226 continue;
9b5e383c 5227 }
449f4544 5228 dev->dismantle = true;
9b5e383c 5229 BUG_ON(dev->reg_state != NETREG_REGISTERED);
44345724 5230 }
93ee31f1 5231
44345724
OP
5232 /* If device is running, close it first. */
5233 dev_close_many(head);
93ee31f1 5234
44345724 5235 list_for_each_entry(dev, head, unreg_list) {
9b5e383c
ED
5236 /* And unlink it from device chain. */
5237 unlist_netdevice(dev);
93ee31f1 5238
9b5e383c
ED
5239 dev->reg_state = NETREG_UNREGISTERING;
5240 }
93ee31f1
DL
5241
5242 synchronize_net();
5243
9b5e383c
ED
5244 list_for_each_entry(dev, head, unreg_list) {
5245 /* Shutdown queueing discipline. */
5246 dev_shutdown(dev);
93ee31f1
DL
5247
5248
9b5e383c
ED
5249 /* Notify protocols, that we are about to destroy
5250 this device. They should clean all the things.
5251 */
5252 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 5253
a2835763
PM
5254 if (!dev->rtnl_link_ops ||
5255 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5256 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5257
9b5e383c
ED
5258 /*
5259 * Flush the unicast and multicast chains
5260 */
a748ee24 5261 dev_uc_flush(dev);
22bedad3 5262 dev_mc_flush(dev);
93ee31f1 5263
9b5e383c
ED
5264 if (dev->netdev_ops->ndo_uninit)
5265 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 5266
9ff162a8
JP
5267 /* Notifier chain MUST detach us all upper devices. */
5268 WARN_ON(netdev_has_any_upper_dev(dev));
93ee31f1 5269
9b5e383c
ED
5270 /* Remove entries from kobject tree */
5271 netdev_unregister_kobject(dev);
024e9679
AD
5272#ifdef CONFIG_XPS
5273 /* Remove XPS queueing entries */
5274 netif_reset_xps_queues_gt(dev, 0);
5275#endif
9b5e383c 5276 }
93ee31f1 5277
850a545b 5278 synchronize_net();
395264d5 5279
a5ee1551 5280 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
5281 dev_put(dev);
5282}
5283
5284static void rollback_registered(struct net_device *dev)
5285{
5286 LIST_HEAD(single);
5287
5288 list_add(&dev->unreg_list, &single);
5289 rollback_registered_many(&single);
ceaaec98 5290 list_del(&single);
93ee31f1
DL
5291}
5292
c8f44aff
MM
5293static netdev_features_t netdev_fix_features(struct net_device *dev,
5294 netdev_features_t features)
b63365a2 5295{
57422dc5
MM
5296 /* Fix illegal checksum combinations */
5297 if ((features & NETIF_F_HW_CSUM) &&
5298 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5299 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
57422dc5
MM
5300 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5301 }
5302
b63365a2
HX
5303 /* Fix illegal SG+CSUM combinations. */
5304 if ((features & NETIF_F_SG) &&
5305 !(features & NETIF_F_ALL_CSUM)) {
6f404e44
MM
5306 netdev_dbg(dev,
5307 "Dropping NETIF_F_SG since no checksum feature.\n");
b63365a2
HX
5308 features &= ~NETIF_F_SG;
5309 }
5310
5311 /* TSO requires that SG is present as well. */
ea2d3688 5312 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6f404e44 5313 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
ea2d3688 5314 features &= ~NETIF_F_ALL_TSO;
b63365a2
HX
5315 }
5316
31d8b9e0
BH
5317 /* TSO ECN requires that TSO is present as well. */
5318 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5319 features &= ~NETIF_F_TSO_ECN;
5320
212b573f
MM
5321 /* Software GSO depends on SG. */
5322 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6f404e44 5323 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
212b573f
MM
5324 features &= ~NETIF_F_GSO;
5325 }
5326
acd1130e 5327 /* UFO needs SG and checksumming */
b63365a2 5328 if (features & NETIF_F_UFO) {
79032644
MM
5329 /* maybe split UFO into V4 and V6? */
5330 if (!((features & NETIF_F_GEN_CSUM) ||
5331 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5332 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5333 netdev_dbg(dev,
acd1130e 5334 "Dropping NETIF_F_UFO since no checksum offload features.\n");
b63365a2
HX
5335 features &= ~NETIF_F_UFO;
5336 }
5337
5338 if (!(features & NETIF_F_SG)) {
6f404e44 5339 netdev_dbg(dev,
acd1130e 5340 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
b63365a2
HX
5341 features &= ~NETIF_F_UFO;
5342 }
5343 }
5344
5345 return features;
5346}
b63365a2 5347
6cb6a27c 5348int __netdev_update_features(struct net_device *dev)
5455c699 5349{
c8f44aff 5350 netdev_features_t features;
5455c699
MM
5351 int err = 0;
5352
87267485
MM
5353 ASSERT_RTNL();
5354
5455c699
MM
5355 features = netdev_get_wanted_features(dev);
5356
5357 if (dev->netdev_ops->ndo_fix_features)
5358 features = dev->netdev_ops->ndo_fix_features(dev, features);
5359
5360 /* driver might be less strict about feature dependencies */
5361 features = netdev_fix_features(dev, features);
5362
5363 if (dev->features == features)
6cb6a27c 5364 return 0;
5455c699 5365
c8f44aff
MM
5366 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5367 &dev->features, &features);
5455c699
MM
5368
5369 if (dev->netdev_ops->ndo_set_features)
5370 err = dev->netdev_ops->ndo_set_features(dev, features);
5371
6cb6a27c 5372 if (unlikely(err < 0)) {
5455c699 5373 netdev_err(dev,
c8f44aff
MM
5374 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5375 err, &features, &dev->features);
6cb6a27c
MM
5376 return -1;
5377 }
5378
5379 if (!err)
5380 dev->features = features;
5381
5382 return 1;
5383}
5384
afe12cc8
MM
5385/**
5386 * netdev_update_features - recalculate device features
5387 * @dev: the device to check
5388 *
5389 * Recalculate the dev->features set and send notifications if it
5390 * has changed. Should be called after driver- or hardware-dependent
5391 * conditions that influence the feature set might have changed.
5392 */
6cb6a27c
MM
5393void netdev_update_features(struct net_device *dev)
5394{
5395 if (__netdev_update_features(dev))
5396 netdev_features_change(dev);
5455c699
MM
5397}
5398EXPORT_SYMBOL(netdev_update_features);
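/*
 * Illustrative sketch, not part of dev.c: one hedged example of how a
 * driver might plug into the feature machinery above. The names
 * example_fix_features()/example_change_mtu() and the "drop TSO for
 * jumbo MTUs" policy are hypothetical; the only facts relied on are that
 * ndo_fix_features() may clear bits and that netdev_update_features()
 * must be called with RTNL held (ndo_change_mtu() already is).
 */
static netdev_features_t example_fix_features(struct net_device *dev,
					      netdev_features_t features)
{
	/* Hypothetical hardware limit: no TSO above a 1500 byte MTU. */
	if (dev->mtu > 1500)
		features &= ~NETIF_F_ALL_TSO;
	return features;
}

static int example_change_mtu(struct net_device *dev, int new_mtu)
{
	dev->mtu = new_mtu;
	/* Re-run ndo_fix_features()/netdev_fix_features() and notify. */
	netdev_update_features(dev);
	return 0;
}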
5399
afe12cc8
MM
5400/**
5401 * netdev_change_features - recalculate device features
5402 * @dev: the device to check
5403 *
5404 * Recalculate the dev->features set and send notifications even
5405 * if the features have not changed. Should be called instead of
5406 * netdev_update_features() if dev->vlan_features might also have
5407 * changed, so that the changes are propagated to stacked VLAN
5408 * devices.
5409 */
5410void netdev_change_features(struct net_device *dev)
5411{
5412 __netdev_update_features(dev);
5413 netdev_features_change(dev);
5414}
5415EXPORT_SYMBOL(netdev_change_features);
5416
fc4a7489
PM
5417/**
5418 * netif_stacked_transfer_operstate - transfer operstate
5419 * @rootdev: the root or lower level device to transfer state from
5420 * @dev: the device to transfer operstate to
5421 *
5422 * Transfer operational state from root to device. This is normally
5423 * called when a stacking relationship exists between the root
5424 * device and the device (a leaf device).
5425 */
5426void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5427 struct net_device *dev)
5428{
5429 if (rootdev->operstate == IF_OPER_DORMANT)
5430 netif_dormant_on(dev);
5431 else
5432 netif_dormant_off(dev);
5433
5434 if (netif_carrier_ok(rootdev)) {
5435 if (!netif_carrier_ok(dev))
5436 netif_carrier_on(dev);
5437 } else {
5438 if (netif_carrier_ok(dev))
5439 netif_carrier_off(dev);
5440 }
5441}
5442EXPORT_SYMBOL(netif_stacked_transfer_operstate);
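/*
 * Illustrative sketch, not part of dev.c: a stacked driver (VLAN, macvlan,
 * ...) would typically mirror its lower device's state when that state
 * changes. example_mirror_lower_state() is a hypothetical helper; real
 * drivers call netif_stacked_transfer_operstate() from their netdev
 * notifier handlers.
 */
static void example_mirror_lower_state(const struct net_device *lower,
				       struct net_device *upper)
{
	/* Copies dormant and carrier state from lower onto upper. */
	netif_stacked_transfer_operstate(lower, upper);
}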
5443
bf264145 5444#ifdef CONFIG_RPS
1b4bf461
ED
5445static int netif_alloc_rx_queues(struct net_device *dev)
5446{
1b4bf461 5447 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 5448 struct netdev_rx_queue *rx;
1b4bf461 5449
bd25fa7b 5450 BUG_ON(count < 1);
1b4bf461 5451
bd25fa7b 5452 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
62b5942a 5453 if (!rx)
bd25fa7b 5454 return -ENOMEM;
62b5942a 5455
bd25fa7b
TH
5456 dev->_rx = rx;
5457
bd25fa7b 5458 for (i = 0; i < count; i++)
fe822240 5459 rx[i].dev = dev;
1b4bf461
ED
5460 return 0;
5461}
bf264145 5462#endif
1b4bf461 5463
aa942104
CG
5464static void netdev_init_one_queue(struct net_device *dev,
5465 struct netdev_queue *queue, void *_unused)
5466{
5467 /* Initialize queue lock */
5468 spin_lock_init(&queue->_xmit_lock);
5469 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5470 queue->xmit_lock_owner = -1;
b236da69 5471 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
aa942104 5472 queue->dev = dev;
114cf580
TH
5473#ifdef CONFIG_BQL
5474 dql_init(&queue->dql, HZ);
5475#endif
aa942104
CG
5476}
5477
e6484930
TH
5478static int netif_alloc_netdev_queues(struct net_device *dev)
5479{
5480 unsigned int count = dev->num_tx_queues;
5481 struct netdev_queue *tx;
5482
5483 BUG_ON(count < 1);
5484
5485 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
62b5942a 5486 if (!tx)
e6484930 5487 return -ENOMEM;
62b5942a 5488
e6484930 5489 dev->_tx = tx;
1d24eb48 5490
e6484930
TH
5491 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5492 spin_lock_init(&dev->tx_global_lock);
aa942104
CG
5493
5494 return 0;
e6484930
TH
5495}
5496
1da177e4
LT
5497/**
5498 * register_netdevice - register a network device
5499 * @dev: device to register
5500 *
5501 * Take a completed network device structure and add it to the kernel
5502 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5503 * chain. 0 is returned on success. A negative errno code is returned
5504 * on a failure to set up the device, or if the name is a duplicate.
5505 *
5506 * Callers must hold the rtnl semaphore. You may want
5507 * register_netdev() instead of this.
5508 *
5509 * BUGS:
5510 * The locking appears insufficient to guarantee two parallel registers
5511 * will not get the same name.
5512 */
5513
5514int register_netdevice(struct net_device *dev)
5515{
1da177e4 5516 int ret;
d314774c 5517 struct net *net = dev_net(dev);
1da177e4
LT
5518
5519 BUG_ON(dev_boot_phase);
5520 ASSERT_RTNL();
5521
b17a7c17
SH
5522 might_sleep();
5523
1da177e4
LT
5524 /* When net_device's are persistent, this will be fatal. */
5525 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 5526 BUG_ON(!net);
1da177e4 5527
f1f28aa3 5528 spin_lock_init(&dev->addr_list_lock);
cf508b12 5529 netdev_set_addr_lockdep_class(dev);
1da177e4 5530
1da177e4
LT
5531 dev->iflink = -1;
5532
828de4f6 5533 ret = dev_get_valid_name(net, dev, dev->name);
0696c3a8
PP
5534 if (ret < 0)
5535 goto out;
5536
1da177e4 5537 /* Init, if this function is available */
d314774c
SH
5538 if (dev->netdev_ops->ndo_init) {
5539 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
5540 if (ret) {
5541 if (ret > 0)
5542 ret = -EIO;
90833aa4 5543 goto out;
1da177e4
LT
5544 }
5545 }
4ec93edb 5546
d2ed273d
MM
5547 if (((dev->hw_features | dev->features) & NETIF_F_HW_VLAN_FILTER) &&
5548 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5549 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5550 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5551 ret = -EINVAL;
5552 goto err_uninit;
5553 }
5554
9c7dafbf
PE
5555 ret = -EBUSY;
5556 if (!dev->ifindex)
5557 dev->ifindex = dev_new_index(net);
5558 else if (__dev_get_by_index(net, dev->ifindex))
5559 goto err_uninit;
5560
1da177e4
LT
5561 if (dev->iflink == -1)
5562 dev->iflink = dev->ifindex;
5563
5455c699
MM
5564 /* Transfer changeable features to wanted_features and enable
5565 * software offloads (GSO and GRO).
5566 */
5567 dev->hw_features |= NETIF_F_SOFT_FEATURES;
14d1232f
MM
5568 dev->features |= NETIF_F_SOFT_FEATURES;
5569 dev->wanted_features = dev->features & dev->hw_features;
1da177e4 5570
c6e1a0d1 5571 /* Turn on no cache copy if HW is doing checksum */
34324dc2
MM
5572 if (!(dev->flags & IFF_LOOPBACK)) {
5573 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5574 if (dev->features & NETIF_F_ALL_CSUM) {
5575 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5576 dev->features |= NETIF_F_NOCACHE_COPY;
5577 }
c6e1a0d1
TH
5578 }
5579
1180e7d6 5580 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
16c3ea78 5581 */
1180e7d6 5582 dev->vlan_features |= NETIF_F_HIGHDMA;
16c3ea78 5583
7ffbe3fd
JB
5584 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5585 ret = notifier_to_errno(ret);
5586 if (ret)
5587 goto err_uninit;
5588
8b41d188 5589 ret = netdev_register_kobject(dev);
b17a7c17 5590 if (ret)
7ce1b0ed 5591 goto err_uninit;
b17a7c17
SH
5592 dev->reg_state = NETREG_REGISTERED;
5593
6cb6a27c 5594 __netdev_update_features(dev);
8e9b59b2 5595
1da177e4
LT
5596 /*
5597 * Default initial state at registration is that the
5598 * device is present.
5599 */
5600
5601 set_bit(__LINK_STATE_PRESENT, &dev->state);
5602
8f4cccbb
BH
5603 linkwatch_init_dev(dev);
5604
1da177e4 5605 dev_init_scheduler(dev);
1da177e4 5606 dev_hold(dev);
ce286d32 5607 list_netdevice(dev);
7bf23575 5608 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 5609
948b337e
JP
5610 /* If the device has a permanent device address, the driver should
5611 * set dev_addr and addr_assign_type should be set to
5612 * NET_ADDR_PERM (the default value).
5613 */
5614 if (dev->addr_assign_type == NET_ADDR_PERM)
5615 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5616
1da177e4 5617 /* Notify protocols, that a new device appeared. */
056925ab 5618 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 5619 ret = notifier_to_errno(ret);
93ee31f1
DL
5620 if (ret) {
5621 rollback_registered(dev);
5622 dev->reg_state = NETREG_UNREGISTERED;
5623 }
d90a909e
EB
5624 /*
5625 * Prevent userspace races by waiting until the network
5626 * device is fully set up before sending notifications.
5627 */
a2835763
PM
5628 if (!dev->rtnl_link_ops ||
5629 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5630 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
1da177e4
LT
5631
5632out:
5633 return ret;
7ce1b0ed
HX
5634
5635err_uninit:
d314774c
SH
5636 if (dev->netdev_ops->ndo_uninit)
5637 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 5638 goto out;
1da177e4 5639}
d1b19dff 5640EXPORT_SYMBOL(register_netdevice);
1da177e4 5641
937f1ba5
BH
5642/**
5643 * init_dummy_netdev - init a dummy network device for NAPI
5644 * @dev: device to init
5645 *
5646 * This takes a network device structure and initializes the minimum
5647 * number of fields so it can be used to schedule NAPI polls without
5648 * registering a full-blown interface. This is to be used by drivers
5649 * that need to tie several hardware interfaces to a single NAPI
5650 * poll scheduler due to HW limitations.
5651 */
5652int init_dummy_netdev(struct net_device *dev)
5653{
5654 /* Clear everything. Note we don't initialize spinlocks
5655 * as they aren't supposed to be taken by any of the
5656 * NAPI code and this dummy netdev is supposed to be
5657 * used only for NAPI polls.
5658 */
5659 memset(dev, 0, sizeof(struct net_device));
5660
5661 /* make sure we BUG if trying to hit the standard
5662 * register/unregister code path
5663 */
5664 dev->reg_state = NETREG_DUMMY;
5665
937f1ba5
BH
5666 /* NAPI wants this */
5667 INIT_LIST_HEAD(&dev->napi_list);
5668
5669 /* a dummy interface is started by default */
5670 set_bit(__LINK_STATE_PRESENT, &dev->state);
5671 set_bit(__LINK_STATE_START, &dev->state);
5672
29b4433d
ED
5673 /* Note: We don't allocate pcpu_refcnt for dummy devices,
5674 * because users of this 'device' don't need to change
5675 * its refcount.
5676 */
5677
937f1ba5
BH
5678 return 0;
5679}
5680EXPORT_SYMBOL_GPL(init_dummy_netdev);
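/*
 * Illustrative sketch, not part of dev.c: a driver that multiplexes several
 * hardware channels behind a single registered net_device can still give
 * each channel its own NAPI context by hanging it off a dummy device.
 * struct example_channel and example_poll() are hypothetical.
 */
struct example_channel {
	struct net_device napi_dev;	/* dummy, never registered */
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget);

static void example_channel_init(struct example_channel *ch)
{
	init_dummy_netdev(&ch->napi_dev);
	netif_napi_add(&ch->napi_dev, &ch->napi, example_poll, 64);
}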
5681
5682
1da177e4
LT
5683/**
5684 * register_netdev - register a network device
5685 * @dev: device to register
5686 *
5687 * Take a completed network device structure and add it to the kernel
5688 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5689 * chain. 0 is returned on success. A negative errno code is returned
5690 * on a failure to set up the device, or if the name is a duplicate.
5691 *
38b4da38 5692 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
5693 * and expands the device name if you passed a format string to
5694 * alloc_netdev.
5695 */
5696int register_netdev(struct net_device *dev)
5697{
5698 int err;
5699
5700 rtnl_lock();
1da177e4 5701 err = register_netdevice(dev);
1da177e4
LT
5702 rtnl_unlock();
5703 return err;
5704}
5705EXPORT_SYMBOL(register_netdev);
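/*
 * Illustrative sketch, not part of dev.c: the usual probe-time pairing of
 * allocation, registration and error unwinding. alloc_etherdev() comes
 * from <linux/etherdevice.h>; example_probe_one() is hypothetical.
 */
static struct net_device *example_probe_one(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(0);	/* no private area in this sketch */
	if (!dev)
		return NULL;

	/* register_netdev() takes and releases RTNL internally. */
	err = register_netdev(dev);
	if (err) {
		/* standard error unwinding: give the device back */
		free_netdev(dev);
		return NULL;
	}
	return dev;
}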
5706
29b4433d
ED
5707int netdev_refcnt_read(const struct net_device *dev)
5708{
5709 int i, refcnt = 0;
5710
5711 for_each_possible_cpu(i)
5712 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5713 return refcnt;
5714}
5715EXPORT_SYMBOL(netdev_refcnt_read);
5716
2c53040f 5717/**
1da177e4 5718 * netdev_wait_allrefs - wait until all references are gone.
3de7a37b 5719 * @dev: target net_device
1da177e4
LT
5720 *
5721 * This is called when unregistering network devices.
5722 *
5723 * Any protocol or device that holds a reference should register
5724 * for netdevice notification, and clean up and put back the
5725 * reference if it receives an UNREGISTER event.
5726 * We can get stuck here if buggy protocols don't correctly
4ec93edb 5727 * call dev_put.
1da177e4
LT
5728 */
5729static void netdev_wait_allrefs(struct net_device *dev)
5730{
5731 unsigned long rebroadcast_time, warning_time;
29b4433d 5732 int refcnt;
1da177e4 5733
e014debe
ED
5734 linkwatch_forget_dev(dev);
5735
1da177e4 5736 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
5737 refcnt = netdev_refcnt_read(dev);
5738
5739 while (refcnt != 0) {
1da177e4 5740 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 5741 rtnl_lock();
1da177e4
LT
5742
5743 /* Rebroadcast unregister notification */
056925ab 5744 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
1da177e4 5745
748e2d93 5746 __rtnl_unlock();
0115e8e3 5747 rcu_barrier();
748e2d93
ED
5748 rtnl_lock();
5749
0115e8e3 5750 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
1da177e4
LT
5751 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5752 &dev->state)) {
5753 /* We must not have linkwatch events
5754 * pending on unregister. If this
5755 * happens, we simply run the queue
5756 * unscheduled, resulting in a noop
5757 * for this device.
5758 */
5759 linkwatch_run_queue();
5760 }
5761
6756ae4b 5762 __rtnl_unlock();
1da177e4
LT
5763
5764 rebroadcast_time = jiffies;
5765 }
5766
5767 msleep(250);
5768
29b4433d
ED
5769 refcnt = netdev_refcnt_read(dev);
5770
1da177e4 5771 if (time_after(jiffies, warning_time + 10 * HZ)) {
7b6cd1ce
JP
5772 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5773 dev->name, refcnt);
1da177e4
LT
5774 warning_time = jiffies;
5775 }
5776 }
5777}
5778
5779/* The sequence is:
5780 *
5781 * rtnl_lock();
5782 * ...
5783 * register_netdevice(x1);
5784 * register_netdevice(x2);
5785 * ...
5786 * unregister_netdevice(y1);
5787 * unregister_netdevice(y2);
5788 * ...
5789 * rtnl_unlock();
5790 * free_netdev(y1);
5791 * free_netdev(y2);
5792 *
58ec3b4d 5793 * We are invoked by rtnl_unlock().
1da177e4 5794 * This allows us to deal with problems:
b17a7c17 5795 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
5796 * without deadlocking with linkwatch via keventd.
5797 * 2) Since we run with the RTNL semaphore not held, we can sleep
5798 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
5799 *
5800 * We must not return until all unregister events added during
5801 * the interval the lock was held have been completed.
1da177e4 5802 */
1da177e4
LT
5803void netdev_run_todo(void)
5804{
626ab0e6 5805 struct list_head list;
1da177e4 5806
1da177e4 5807 /* Snapshot list, allow later requests */
626ab0e6 5808 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
5809
5810 __rtnl_unlock();
626ab0e6 5811
0115e8e3
ED
5812
5813 /* Wait for rcu callbacks to finish before next phase */
850a545b
EB
5814 if (!list_empty(&list))
5815 rcu_barrier();
5816
1da177e4
LT
5817 while (!list_empty(&list)) {
5818 struct net_device *dev
e5e26d75 5819 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
5820 list_del(&dev->todo_list);
5821
748e2d93 5822 rtnl_lock();
0115e8e3 5823 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
748e2d93 5824 __rtnl_unlock();
0115e8e3 5825
b17a7c17 5826 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7b6cd1ce 5827 pr_err("network todo '%s' but state %d\n",
b17a7c17
SH
5828 dev->name, dev->reg_state);
5829 dump_stack();
5830 continue;
5831 }
1da177e4 5832
b17a7c17 5833 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 5834
152102c7 5835 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 5836
b17a7c17 5837 netdev_wait_allrefs(dev);
1da177e4 5838
b17a7c17 5839 /* paranoia */
29b4433d 5840 BUG_ON(netdev_refcnt_read(dev));
33d480ce
ED
5841 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5842 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
547b792c 5843 WARN_ON(dev->dn_ptr);
1da177e4 5844
b17a7c17
SH
5845 if (dev->destructor)
5846 dev->destructor(dev);
9093bbb2
SH
5847
5848 /* Free network device */
5849 kobject_put(&dev->dev.kobj);
1da177e4 5850 }
1da177e4
LT
5851}
5852
3cfde79c
BH
5853/* Convert net_device_stats to rtnl_link_stats64. They have the same
5854 * fields in the same order, with only the type differing.
5855 */
77a1abf5
ED
5856void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5857 const struct net_device_stats *netdev_stats)
3cfde79c
BH
5858{
5859#if BITS_PER_LONG == 64
77a1abf5
ED
5860 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5861 memcpy(stats64, netdev_stats, sizeof(*stats64));
3cfde79c
BH
5862#else
5863 size_t i, n = sizeof(*stats64) / sizeof(u64);
5864 const unsigned long *src = (const unsigned long *)netdev_stats;
5865 u64 *dst = (u64 *)stats64;
5866
5867 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5868 sizeof(*stats64) / sizeof(u64));
5869 for (i = 0; i < n; i++)
5870 dst[i] = src[i];
5871#endif
5872}
77a1abf5 5873EXPORT_SYMBOL(netdev_stats_to_stats64);
3cfde79c 5874
eeda3fd6
SH
5875/**
5876 * dev_get_stats - get network device statistics
5877 * @dev: device to get statistics from
28172739 5878 * @storage: place to store stats
eeda3fd6 5879 *
d7753516
BH
5880 * Get network statistics from device. Return @storage.
5881 * The device driver may provide its own method by setting
5882 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5883 * otherwise the internal statistics structure is used.
eeda3fd6 5884 */
d7753516
BH
5885struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5886 struct rtnl_link_stats64 *storage)
7004bf25 5887{
eeda3fd6
SH
5888 const struct net_device_ops *ops = dev->netdev_ops;
5889
28172739
ED
5890 if (ops->ndo_get_stats64) {
5891 memset(storage, 0, sizeof(*storage));
caf586e5
ED
5892 ops->ndo_get_stats64(dev, storage);
5893 } else if (ops->ndo_get_stats) {
3cfde79c 5894 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
caf586e5
ED
5895 } else {
5896 netdev_stats_to_stats64(storage, &dev->stats);
28172739 5897 }
caf586e5 5898 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
28172739 5899 return storage;
c45d286e 5900}
eeda3fd6 5901EXPORT_SYMBOL(dev_get_stats);
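/*
 * Illustrative sketch, not part of dev.c: reading a device's counters via
 * dev_get_stats(). The caller supplies the storage; example_dump_stats()
 * and the two fields printed are only an example.
 */
static void example_dump_stats(struct net_device *dev)
{
	struct rtnl_link_stats64 storage;
	const struct rtnl_link_stats64 *stats;

	stats = dev_get_stats(dev, &storage);
	netdev_info(dev, "rx_packets %llu rx_dropped %llu\n",
		    (unsigned long long)stats->rx_packets,
		    (unsigned long long)stats->rx_dropped);
}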
c45d286e 5902
24824a09 5903struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
dc2b4847 5904{
24824a09 5905 struct netdev_queue *queue = dev_ingress_queue(dev);
dc2b4847 5906
24824a09
ED
5907#ifdef CONFIG_NET_CLS_ACT
5908 if (queue)
5909 return queue;
5910 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5911 if (!queue)
5912 return NULL;
5913 netdev_init_one_queue(dev, queue, NULL);
24824a09
ED
5914 queue->qdisc = &noop_qdisc;
5915 queue->qdisc_sleeping = &noop_qdisc;
5916 rcu_assign_pointer(dev->ingress_queue, queue);
5917#endif
5918 return queue;
bb949fbd
DM
5919}
5920
2c60db03
ED
5921static const struct ethtool_ops default_ethtool_ops;
5922
d07d7507
SG
5923void netdev_set_default_ethtool_ops(struct net_device *dev,
5924 const struct ethtool_ops *ops)
5925{
5926 if (dev->ethtool_ops == &default_ethtool_ops)
5927 dev->ethtool_ops = ops;
5928}
5929EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
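/*
 * Illustrative sketch, not part of dev.c: a mid-layer that wraps many
 * mini-drivers (in the spirit of usbnet) can install fallback ethtool_ops
 * without clobbering a driver that already provided its own. The
 * example_* names are hypothetical; ethtool_op_get_link() is the generic
 * helper from the ethtool core.
 */
static const struct ethtool_ops example_default_ethtool_ops = {
	.get_link = ethtool_op_get_link,
};

static void example_finish_setup(struct net_device *dev)
{
	/* Only applies if dev->ethtool_ops is still the core default. */
	netdev_set_default_ethtool_ops(dev, &example_default_ethtool_ops);
}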
5930
1da177e4 5931/**
36909ea4 5932 * alloc_netdev_mqs - allocate network device
1da177e4
LT
5933 * @sizeof_priv: size of private data to allocate space for
5934 * @name: device name format string
5935 * @setup: callback to initialize device
36909ea4
TH
5936 * @txqs: the number of TX subqueues to allocate
5937 * @rxqs: the number of RX subqueues to allocate
1da177e4
LT
5938 *
5939 * Allocates a struct net_device with private data area for driver use
f25f4e44 5940 * and performs basic initialization. Also allocates subqueue structs
36909ea4 5941 * for each queue on the device.
1da177e4 5942 */
36909ea4
TH
5943struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5944 void (*setup)(struct net_device *),
5945 unsigned int txqs, unsigned int rxqs)
1da177e4 5946{
1da177e4 5947 struct net_device *dev;
7943986c 5948 size_t alloc_size;
1ce8e7b5 5949 struct net_device *p;
1da177e4 5950
b6fe17d6
SH
5951 BUG_ON(strlen(name) >= sizeof(dev->name));
5952
36909ea4 5953 if (txqs < 1) {
7b6cd1ce 5954 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
55513fb4
TH
5955 return NULL;
5956 }
5957
36909ea4
TH
5958#ifdef CONFIG_RPS
5959 if (rxqs < 1) {
7b6cd1ce 5960 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
36909ea4
TH
5961 return NULL;
5962 }
5963#endif
5964
fd2ea0a7 5965 alloc_size = sizeof(struct net_device);
d1643d24
AD
5966 if (sizeof_priv) {
5967 /* ensure 32-byte alignment of private area */
1ce8e7b5 5968 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
5969 alloc_size += sizeof_priv;
5970 }
5971 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 5972 alloc_size += NETDEV_ALIGN - 1;
1da177e4 5973
31380de9 5974 p = kzalloc(alloc_size, GFP_KERNEL);
62b5942a 5975 if (!p)
1da177e4 5976 return NULL;
1da177e4 5977
1ce8e7b5 5978 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 5979 dev->padded = (char *)dev - (char *)p;
ab9c73cc 5980
29b4433d
ED
5981 dev->pcpu_refcnt = alloc_percpu(int);
5982 if (!dev->pcpu_refcnt)
e6484930 5983 goto free_p;
ab9c73cc 5984
ab9c73cc 5985 if (dev_addr_init(dev))
29b4433d 5986 goto free_pcpu;
ab9c73cc 5987
22bedad3 5988 dev_mc_init(dev);
a748ee24 5989 dev_uc_init(dev);
ccffad25 5990
c346dca1 5991 dev_net_set(dev, &init_net);
1da177e4 5992
8d3bdbd5 5993 dev->gso_max_size = GSO_MAX_SIZE;
30b678d8 5994 dev->gso_max_segs = GSO_MAX_SEGS;
8d3bdbd5 5995
8d3bdbd5
DM
5996 INIT_LIST_HEAD(&dev->napi_list);
5997 INIT_LIST_HEAD(&dev->unreg_list);
5998 INIT_LIST_HEAD(&dev->link_watch_list);
9ff162a8 5999 INIT_LIST_HEAD(&dev->upper_dev_list);
8d3bdbd5
DM
6000 dev->priv_flags = IFF_XMIT_DST_RELEASE;
6001 setup(dev);
6002
36909ea4
TH
6003 dev->num_tx_queues = txqs;
6004 dev->real_num_tx_queues = txqs;
ed9af2e8 6005 if (netif_alloc_netdev_queues(dev))
8d3bdbd5 6006 goto free_all;
e8a0464c 6007
df334545 6008#ifdef CONFIG_RPS
36909ea4
TH
6009 dev->num_rx_queues = rxqs;
6010 dev->real_num_rx_queues = rxqs;
fe822240 6011 if (netif_alloc_rx_queues(dev))
8d3bdbd5 6012 goto free_all;
df334545 6013#endif
0a9627f2 6014
1da177e4 6015 strcpy(dev->name, name);
cbda10fa 6016 dev->group = INIT_NETDEV_GROUP;
2c60db03
ED
6017 if (!dev->ethtool_ops)
6018 dev->ethtool_ops = &default_ethtool_ops;
1da177e4 6019 return dev;
ab9c73cc 6020
8d3bdbd5
DM
6021free_all:
6022 free_netdev(dev);
6023 return NULL;
6024
29b4433d
ED
6025free_pcpu:
6026 free_percpu(dev->pcpu_refcnt);
ed9af2e8 6027 kfree(dev->_tx);
fe822240
TH
6028#ifdef CONFIG_RPS
6029 kfree(dev->_rx);
6030#endif
6031
ab9c73cc
JP
6032free_p:
6033 kfree(p);
6034 return NULL;
1da177e4 6035}
36909ea4 6036EXPORT_SYMBOL(alloc_netdev_mqs);
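/*
 * Illustrative sketch, not part of dev.c: allocating a 4x4 multiqueue
 * device directly with alloc_netdev_mqs(). struct example_priv,
 * example_setup() and the queue counts are hypothetical; ether_setup()
 * comes from <linux/etherdevice.h> and the "%d" in the name is expanded
 * at registration time.
 */
struct example_priv {
	int example_field;
};

static void example_setup(struct net_device *dev)
{
	ether_setup(dev);
}

static struct net_device *example_alloc(void)
{
	return alloc_netdev_mqs(sizeof(struct example_priv), "example%d",
				example_setup, 4, 4);
}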
1da177e4
LT
6037
6038/**
6039 * free_netdev - free network device
6040 * @dev: device
6041 *
4ec93edb
YH
6042 * This function does the last stage of destroying an allocated device
6043 * interface. The reference to the device object is released.
1da177e4
LT
6044 * If this is the last reference then it will be freed.
6045 */
6046void free_netdev(struct net_device *dev)
6047{
d565b0a1
HX
6048 struct napi_struct *p, *n;
6049
f3005d7f
DL
6050 release_net(dev_net(dev));
6051
e8a0464c 6052 kfree(dev->_tx);
fe822240
TH
6053#ifdef CONFIG_RPS
6054 kfree(dev->_rx);
6055#endif
e8a0464c 6056
33d480ce 6057 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
24824a09 6058
f001fde5
JP
6059 /* Flush device addresses */
6060 dev_addr_flush(dev);
6061
d565b0a1
HX
6062 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6063 netif_napi_del(p);
6064
29b4433d
ED
6065 free_percpu(dev->pcpu_refcnt);
6066 dev->pcpu_refcnt = NULL;
6067
3041a069 6068 /* Compatibility with error handling in drivers */
1da177e4
LT
6069 if (dev->reg_state == NETREG_UNINITIALIZED) {
6070 kfree((char *)dev - dev->padded);
6071 return;
6072 }
6073
6074 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6075 dev->reg_state = NETREG_RELEASED;
6076
43cb76d9
GKH
6077 /* will free via device release */
6078 put_device(&dev->dev);
1da177e4 6079}
d1b19dff 6080EXPORT_SYMBOL(free_netdev);
4ec93edb 6081
f0db275a
SH
6082/**
6083 * synchronize_net - Synchronize with packet receive processing
6084 *
6085 * Wait for packets currently being received to be done.
6086 * Does not block later packets from starting.
6087 */
4ec93edb 6088void synchronize_net(void)
1da177e4
LT
6089{
6090 might_sleep();
be3fc413
ED
6091 if (rtnl_is_locked())
6092 synchronize_rcu_expedited();
6093 else
6094 synchronize_rcu();
1da177e4 6095}
d1b19dff 6096EXPORT_SYMBOL(synchronize_net);
1da177e4
LT
6097
6098/**
44a0873d 6099 * unregister_netdevice_queue - remove device from the kernel
1da177e4 6100 * @dev: device
44a0873d 6101 * @head: list
6ebfbc06 6102 *
1da177e4 6103 * This function shuts down a device interface and removes it
d59b54b1 6104 * from the kernel tables.
44a0873d 6105 * If head is not NULL, the device is queued to be unregistered later.
1da177e4
LT
6106 *
6107 * Callers must hold the rtnl semaphore. You may want
6108 * unregister_netdev() instead of this.
6109 */
6110
44a0873d 6111void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 6112{
a6620712
HX
6113 ASSERT_RTNL();
6114
44a0873d 6115 if (head) {
9fdce099 6116 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
6117 } else {
6118 rollback_registered(dev);
6119 /* Finish processing unregister after unlock */
6120 net_set_todo(dev);
6121 }
1da177e4 6122}
44a0873d 6123EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 6124
9b5e383c
ED
6125/**
6126 * unregister_netdevice_many - unregister many devices
6127 * @head: list of devices
9b5e383c
ED
6128 */
6129void unregister_netdevice_many(struct list_head *head)
6130{
6131 struct net_device *dev;
6132
6133 if (!list_empty(head)) {
6134 rollback_registered_many(head);
6135 list_for_each_entry(dev, head, unreg_list)
6136 net_set_todo(dev);
6137 }
6138}
63c8099d 6139EXPORT_SYMBOL(unregister_netdevice_many);
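/*
 * Illustrative sketch, not part of dev.c: tearing several devices down in
 * one batch under a single RTNL hold, mirroring the pattern used by
 * default_device_exit_batch() later in this file. example_destroy_all()
 * and the devs[] array are hypothetical.
 */
static void example_destroy_all(struct net_device *devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	list_del(&kill_list);
	rtnl_unlock();
}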
9b5e383c 6140
1da177e4
LT
6141/**
6142 * unregister_netdev - remove device from the kernel
6143 * @dev: device
6144 *
6145 * This function shuts down a device interface and removes it
d59b54b1 6146 * from the kernel tables.
1da177e4
LT
6147 *
6148 * This is just a wrapper for unregister_netdevice that takes
6149 * the rtnl semaphore. In general you want to use this and not
6150 * unregister_netdevice.
6151 */
6152void unregister_netdev(struct net_device *dev)
6153{
6154 rtnl_lock();
6155 unregister_netdevice(dev);
6156 rtnl_unlock();
6157}
1da177e4
LT
6158EXPORT_SYMBOL(unregister_netdev);
6159
ce286d32
EB
6160/**
6161 * dev_change_net_namespace - move device to a different network namespace
6162 * @dev: device
6163 * @net: network namespace
6164 * @pat: If not NULL name pattern to try if the current device name
6165 * is already taken in the destination network namespace.
6166 *
6167 * This function shuts down a device interface and moves it
6168 * to a new network namespace. On success 0 is returned, on
6169 * a failure a negative errno code is returned.
6170 *
6171 * Callers must hold the rtnl semaphore.
6172 */
6173
6174int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6175{
ce286d32
EB
6176 int err;
6177
6178 ASSERT_RTNL();
6179
6180 /* Don't allow namespace local devices to be moved. */
6181 err = -EINVAL;
6182 if (dev->features & NETIF_F_NETNS_LOCAL)
6183 goto out;
6184
6185 /* Ensure the device has been registered */
ce286d32
EB
6186 if (dev->reg_state != NETREG_REGISTERED)
6187 goto out;
6188
6189 /* Get out if there is nothing to do */
6190 err = 0;
878628fb 6191 if (net_eq(dev_net(dev), net))
ce286d32
EB
6192 goto out;
6193
6194 /* Pick the destination device name, and ensure
6195 * we can use it in the destination network namespace.
6196 */
6197 err = -EEXIST;
d9031024 6198 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
6199 /* We get here if we can't use the current device name */
6200 if (!pat)
6201 goto out;
828de4f6 6202 if (dev_get_valid_name(net, dev, pat) < 0)
ce286d32
EB
6203 goto out;
6204 }
6205
6206 /*
6207 * And now a mini version of register_netdevice and unregister_netdevice.
6208 */
6209
6210 /* If device is running close it first. */
9b772652 6211 dev_close(dev);
ce286d32
EB
6212
6213 /* And unlink it from device chain */
6214 err = -ENODEV;
6215 unlist_netdevice(dev);
6216
6217 synchronize_net();
6218
6219 /* Shutdown queueing discipline. */
6220 dev_shutdown(dev);
6221
6222 /* Notify protocols that we are about to destroy
6223 this device. They should clean up all their state.
3b27e105
DL
6224
6225 Note that dev->reg_state stays at NETREG_REGISTERED.
6226 This is wanted so that 8021q and macvlan know that
6227 the device is just moving and can keep their slaves up.
ce286d32
EB
6228 */
6229 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6549dd43
G
6230 rcu_barrier();
6231 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
d2237d35 6232 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
ce286d32
EB
6233
6234 /*
6235 * Flush the unicast and multicast chains
6236 */
a748ee24 6237 dev_uc_flush(dev);
22bedad3 6238 dev_mc_flush(dev);
ce286d32 6239
4e66ae2e
SH
6240 /* Send a netdev-removed uevent to the old namespace */
6241 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6242
ce286d32 6243 /* Actually switch the network namespace */
c346dca1 6244 dev_net_set(dev, net);
ce286d32 6245
ce286d32
EB
6246 /* If there is an ifindex conflict assign a new one */
6247 if (__dev_get_by_index(net, dev->ifindex)) {
6248 int iflink = (dev->iflink == dev->ifindex);
6249 dev->ifindex = dev_new_index(net);
6250 if (iflink)
6251 dev->iflink = dev->ifindex;
6252 }
6253
4e66ae2e
SH
6254 /* Send a netdev-add uevent to the new namespace */
6255 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6256
8b41d188 6257 /* Fixup kobjects */
a1b3f594 6258 err = device_rename(&dev->dev, dev->name);
8b41d188 6259 WARN_ON(err);
ce286d32
EB
6260
6261 /* Add the device back in the hashes */
6262 list_netdevice(dev);
6263
6264 /* Notify protocols, that a new device appeared. */
6265 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6266
d90a909e
EB
6267 /*
6268 * Prevent userspace races by waiting until the network
6269 * device is fully set up before sending notifications.
6270 */
6271 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6272
ce286d32
EB
6273 synchronize_net();
6274 err = 0;
6275out:
6276 return err;
6277}
463d0183 6278EXPORT_SYMBOL_GPL(dev_change_net_namespace);
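/*
 * Illustrative sketch, not part of dev.c: moving a device into another
 * network namespace under RTNL, using the "dev%d" pattern as a fallback if
 * the current name is already taken there. example_move_to_ns() is
 * hypothetical.
 */
static int example_move_to_ns(struct net_device *dev, struct net *target)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target, "dev%d");
	rtnl_unlock();
	return err;
}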
ce286d32 6279
1da177e4
LT
6280static int dev_cpu_callback(struct notifier_block *nfb,
6281 unsigned long action,
6282 void *ocpu)
6283{
6284 struct sk_buff **list_skb;
1da177e4
LT
6285 struct sk_buff *skb;
6286 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6287 struct softnet_data *sd, *oldsd;
6288
8bb78442 6289 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
6290 return NOTIFY_OK;
6291
6292 local_irq_disable();
6293 cpu = smp_processor_id();
6294 sd = &per_cpu(softnet_data, cpu);
6295 oldsd = &per_cpu(softnet_data, oldcpu);
6296
6297 /* Find end of our completion_queue. */
6298 list_skb = &sd->completion_queue;
6299 while (*list_skb)
6300 list_skb = &(*list_skb)->next;
6301 /* Append completion queue from offline CPU. */
6302 *list_skb = oldsd->completion_queue;
6303 oldsd->completion_queue = NULL;
6304
1da177e4 6305 /* Append output queue from offline CPU. */
a9cbd588
CG
6306 if (oldsd->output_queue) {
6307 *sd->output_queue_tailp = oldsd->output_queue;
6308 sd->output_queue_tailp = oldsd->output_queue_tailp;
6309 oldsd->output_queue = NULL;
6310 oldsd->output_queue_tailp = &oldsd->output_queue;
6311 }
264524d5
HC
6312 /* Append NAPI poll list from offline CPU. */
6313 if (!list_empty(&oldsd->poll_list)) {
6314 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6315 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6316 }
1da177e4
LT
6317
6318 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6319 local_irq_enable();
6320
6321 /* Process offline CPU's input_pkt_queue */
76cc8b13 6322 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
1da177e4 6323 netif_rx(skb);
76cc8b13 6324 input_queue_head_incr(oldsd);
fec5e652 6325 }
76cc8b13 6326 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6e7676c1 6327 netif_rx(skb);
76cc8b13
TH
6328 input_queue_head_incr(oldsd);
6329 }
1da177e4
LT
6330
6331 return NOTIFY_OK;
6332}
1da177e4
LT
6333
6334
7f353bf2 6335/**
b63365a2
HX
6336 * netdev_increment_features - increment feature set by one
6337 * @all: current feature set
6338 * @one: new feature set
6339 * @mask: mask feature set
7f353bf2
HX
6340 *
6341 * Computes a new feature set after adding a device with feature set
b63365a2
HX
6342 * @one to the master device with current feature set @all. Will not
6343 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 6344 */
c8f44aff
MM
6345netdev_features_t netdev_increment_features(netdev_features_t all,
6346 netdev_features_t one, netdev_features_t mask)
b63365a2 6347{
1742f183
MM
6348 if (mask & NETIF_F_GEN_CSUM)
6349 mask |= NETIF_F_ALL_CSUM;
6350 mask |= NETIF_F_VLAN_CHALLENGED;
7f353bf2 6351
1742f183
MM
6352 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6353 all &= one | ~NETIF_F_ALL_FOR_ALL;
c6e1a0d1 6354
1742f183
MM
6355 /* If one device supports hw checksumming, set for all. */
6356 if (all & NETIF_F_GEN_CSUM)
6357 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7f353bf2
HX
6358
6359 return all;
6360}
b63365a2 6361EXPORT_SYMBOL(netdev_increment_features);
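/*
 * Illustrative sketch, not part of dev.c: how a master device (bridge,
 * bond, team, ...) might fold the feature sets of its lower devices into
 * one, in the spirit of the bridge's feature recomputation.
 * example_master_features() and the slaves[] array are hypothetical.
 */
static netdev_features_t example_master_features(struct net_device *master,
						 struct net_device *slaves[],
						 int n)
{
	netdev_features_t mask = master->features;
	netdev_features_t all = master->features & ~NETIF_F_ONE_FOR_ALL;
	int i;

	for (i = 0; i < n; i++)
		all = netdev_increment_features(all, slaves[i]->features,
						mask);
	return all;
}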
7f353bf2 6362
30d97d35
PE
6363static struct hlist_head *netdev_create_hash(void)
6364{
6365 int i;
6366 struct hlist_head *hash;
6367
6368 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6369 if (hash != NULL)
6370 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6371 INIT_HLIST_HEAD(&hash[i]);
6372
6373 return hash;
6374}
6375
881d966b 6376/* Initialize per network namespace state */
4665079c 6377static int __net_init netdev_init(struct net *net)
881d966b 6378{
734b6541
RM
6379 if (net != &init_net)
6380 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 6381
30d97d35
PE
6382 net->dev_name_head = netdev_create_hash();
6383 if (net->dev_name_head == NULL)
6384 goto err_name;
881d966b 6385
30d97d35
PE
6386 net->dev_index_head = netdev_create_hash();
6387 if (net->dev_index_head == NULL)
6388 goto err_idx;
881d966b
EB
6389
6390 return 0;
30d97d35
PE
6391
6392err_idx:
6393 kfree(net->dev_name_head);
6394err_name:
6395 return -ENOMEM;
881d966b
EB
6396}
6397
f0db275a
SH
6398/**
6399 * netdev_drivername - network driver for the device
6400 * @dev: network device
f0db275a
SH
6401 *
6402 * Determine network driver for device.
6403 */
3019de12 6404const char *netdev_drivername(const struct net_device *dev)
6579e57b 6405{
cf04a4c7
SH
6406 const struct device_driver *driver;
6407 const struct device *parent;
3019de12 6408 const char *empty = "";
6579e57b
AV
6409
6410 parent = dev->dev.parent;
6579e57b 6411 if (!parent)
3019de12 6412 return empty;
6579e57b
AV
6413
6414 driver = parent->driver;
6415 if (driver && driver->name)
3019de12
DM
6416 return driver->name;
6417 return empty;
6579e57b
AV
6418}
6419
b004ff49 6420static int __netdev_printk(const char *level, const struct net_device *dev,
256df2f3
JP
6421 struct va_format *vaf)
6422{
6423 int r;
6424
b004ff49 6425 if (dev && dev->dev.parent) {
666f355f
JP
6426 r = dev_printk_emit(level[1] - '0',
6427 dev->dev.parent,
6428 "%s %s %s: %pV",
6429 dev_driver_string(dev->dev.parent),
6430 dev_name(dev->dev.parent),
6431 netdev_name(dev), vaf);
b004ff49 6432 } else if (dev) {
256df2f3 6433 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
b004ff49 6434 } else {
256df2f3 6435 r = printk("%s(NULL net_device): %pV", level, vaf);
b004ff49 6436 }
256df2f3
JP
6437
6438 return r;
6439}
6440
6441int netdev_printk(const char *level, const struct net_device *dev,
6442 const char *format, ...)
6443{
6444 struct va_format vaf;
6445 va_list args;
6446 int r;
6447
6448 va_start(args, format);
6449
6450 vaf.fmt = format;
6451 vaf.va = &args;
6452
6453 r = __netdev_printk(level, dev, &vaf);
b004ff49 6454
256df2f3
JP
6455 va_end(args);
6456
6457 return r;
6458}
6459EXPORT_SYMBOL(netdev_printk);
6460
6461#define define_netdev_printk_level(func, level) \
6462int func(const struct net_device *dev, const char *fmt, ...) \
6463{ \
6464 int r; \
6465 struct va_format vaf; \
6466 va_list args; \
6467 \
6468 va_start(args, fmt); \
6469 \
6470 vaf.fmt = fmt; \
6471 vaf.va = &args; \
6472 \
6473 r = __netdev_printk(level, dev, &vaf); \
b004ff49 6474 \
256df2f3
JP
6475 va_end(args); \
6476 \
6477 return r; \
6478} \
6479EXPORT_SYMBOL(func);
6480
6481define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6482define_netdev_printk_level(netdev_alert, KERN_ALERT);
6483define_netdev_printk_level(netdev_crit, KERN_CRIT);
6484define_netdev_printk_level(netdev_err, KERN_ERR);
6485define_netdev_printk_level(netdev_warn, KERN_WARNING);
6486define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6487define_netdev_printk_level(netdev_info, KERN_INFO);
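/*
 * Illustrative sketch, not part of dev.c: drivers use the helpers defined
 * above instead of bare printk() so every message is prefixed with the
 * driver, bus and interface names. example_tx_timeout() is hypothetical.
 */
static void example_tx_timeout(struct net_device *dev)
{
	netdev_warn(dev, "TX watchdog timeout, resetting\n");
	netdev_dbg(dev, "reset scheduled from %s\n", __func__);
}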
6488
4665079c 6489static void __net_exit netdev_exit(struct net *net)
881d966b
EB
6490{
6491 kfree(net->dev_name_head);
6492 kfree(net->dev_index_head);
6493}
6494
022cbae6 6495static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
6496 .init = netdev_init,
6497 .exit = netdev_exit,
6498};
6499
4665079c 6500static void __net_exit default_device_exit(struct net *net)
ce286d32 6501{
e008b5fc 6502 struct net_device *dev, *aux;
ce286d32 6503 /*
e008b5fc 6504 * Push all migratable network devices back to the
ce286d32
EB
6505 * initial network namespace
6506 */
6507 rtnl_lock();
e008b5fc 6508 for_each_netdev_safe(net, dev, aux) {
ce286d32 6509 int err;
aca51397 6510 char fb_name[IFNAMSIZ];
ce286d32
EB
6511
6512 /* Ignore unmoveable devices (i.e. loopback) */
6513 if (dev->features & NETIF_F_NETNS_LOCAL)
6514 continue;
6515
e008b5fc
EB
6516 /* Leave virtual devices for the generic cleanup */
6517 if (dev->rtnl_link_ops)
6518 continue;
d0c082ce 6519
25985edc 6520 /* Push remaining network devices to init_net */
aca51397
PE
6521 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6522 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 6523 if (err) {
7b6cd1ce
JP
6524 pr_emerg("%s: failed to move %s to init_net: %d\n",
6525 __func__, dev->name, err);
aca51397 6526 BUG();
ce286d32
EB
6527 }
6528 }
6529 rtnl_unlock();
6530}
6531
04dc7f6b
EB
6532static void __net_exit default_device_exit_batch(struct list_head *net_list)
6533{
6534 /* At exit all network devices must be removed from a network
b595076a 6535 * namespace. Do this in the reverse order of registration.
04dc7f6b
EB
6536 * Do this across as many network namespaces as possible to
6537 * improve batching efficiency.
6538 */
6539 struct net_device *dev;
6540 struct net *net;
6541 LIST_HEAD(dev_kill_list);
6542
6543 rtnl_lock();
6544 list_for_each_entry(net, net_list, exit_list) {
6545 for_each_netdev_reverse(net, dev) {
6546 if (dev->rtnl_link_ops)
6547 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6548 else
6549 unregister_netdevice_queue(dev, &dev_kill_list);
6550 }
6551 }
6552 unregister_netdevice_many(&dev_kill_list);
ceaaec98 6553 list_del(&dev_kill_list);
04dc7f6b
EB
6554 rtnl_unlock();
6555}
6556
022cbae6 6557static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 6558 .exit = default_device_exit,
04dc7f6b 6559 .exit_batch = default_device_exit_batch,
ce286d32
EB
6560};
6561
1da177e4
LT
6562/*
6563 * Initialize the DEV module. At boot time this walks the device list and
6564 * unhooks any devices that fail to initialise (normally hardware not
6565 * present) and leaves us with a valid list of present and active devices.
6566 *
6567 */
6568
6569/*
6570 * This is called single threaded during boot, so no need
6571 * to take the rtnl semaphore.
6572 */
6573static int __init net_dev_init(void)
6574{
6575 int i, rc = -ENOMEM;
6576
6577 BUG_ON(!dev_boot_phase);
6578
1da177e4
LT
6579 if (dev_proc_init())
6580 goto out;
6581
8b41d188 6582 if (netdev_kobject_init())
1da177e4
LT
6583 goto out;
6584
6585 INIT_LIST_HEAD(&ptype_all);
82d8a867 6586 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
6587 INIT_LIST_HEAD(&ptype_base[i]);
6588
62532da9
VY
6589 INIT_LIST_HEAD(&offload_base);
6590
881d966b
EB
6591 if (register_pernet_subsys(&netdev_net_ops))
6592 goto out;
1da177e4
LT
6593
6594 /*
6595 * Initialise the packet receive queues.
6596 */
6597
6f912042 6598 for_each_possible_cpu(i) {
e36fa2f7 6599 struct softnet_data *sd = &per_cpu(softnet_data, i);
1da177e4 6600
dee42870 6601 memset(sd, 0, sizeof(*sd));
e36fa2f7 6602 skb_queue_head_init(&sd->input_pkt_queue);
6e7676c1 6603 skb_queue_head_init(&sd->process_queue);
e36fa2f7
ED
6604 sd->completion_queue = NULL;
6605 INIT_LIST_HEAD(&sd->poll_list);
a9cbd588
CG
6606 sd->output_queue = NULL;
6607 sd->output_queue_tailp = &sd->output_queue;
df334545 6608#ifdef CONFIG_RPS
e36fa2f7
ED
6609 sd->csd.func = rps_trigger_softirq;
6610 sd->csd.info = sd;
6611 sd->csd.flags = 0;
6612 sd->cpu = i;
1e94d72f 6613#endif
0a9627f2 6614
e36fa2f7
ED
6615 sd->backlog.poll = process_backlog;
6616 sd->backlog.weight = weight_p;
6617 sd->backlog.gro_list = NULL;
6618 sd->backlog.gro_count = 0;
1da177e4
LT
6619 }
6620
1da177e4
LT
6621 dev_boot_phase = 0;
6622
505d4f73
EB
6623 /* The loopback device is special: if any other network device
6624 * is present in a network namespace, the loopback device must
6625 * be present too. Since we now dynamically allocate and free the
6626 * loopback device, ensure this invariant is maintained by
6627 * keeping the loopback device the first device on the
6628 * list of network devices, ensuring the loopback device
6629 * is the first device that appears and the last network device
6630 * that disappears.
6631 */
6632 if (register_pernet_device(&loopback_net_ops))
6633 goto out;
6634
6635 if (register_pernet_device(&default_device_ops))
6636 goto out;
6637
962cf36c
CM
6638 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6639 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
6640
6641 hotcpu_notifier(dev_cpu_callback, 0);
6642 dst_init();
6643 dev_mcast_init();
6644 rc = 0;
6645out:
6646 return rc;
6647}
6648
6649subsys_initcall(net_dev_init);