tcp: take care of truncations done by sk_filter()
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
76#include <linux/bitops.h>
77#include <linux/capability.h>
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
81#include <linux/hash.h>
82#include <linux/slab.h>
83#include <linux/sched.h>
84#include <linux/mutex.h>
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
94#include <linux/ethtool.h>
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
97#include <net/net_namespace.h>
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
100#include <linux/stat.h>
101#include <net/dst.h>
102#include <net/pkt_sched.h>
103#include <net/checksum.h>
104#include <net/xfrm.h>
105#include <linux/highmem.h>
106#include <linux/init.h>
107#include <linux/module.h>
108#include <linux/netpoll.h>
109#include <linux/rcupdate.h>
110#include <linux/delay.h>
111#include <net/iw_handler.h>
112#include <asm/current.h>
113#include <linux/audit.h>
114#include <linux/dmaengine.h>
115#include <linux/err.h>
116#include <linux/ctype.h>
117#include <linux/if_arp.h>
118#include <linux/if_vlan.h>
119#include <linux/ip.h>
120#include <net/ip.h>
121#include <linux/ipv6.h>
122#include <linux/in.h>
123#include <linux/jhash.h>
124#include <linux/random.h>
125#include <trace/events/napi.h>
126#include <trace/events/net.h>
127#include <trace/events/skb.h>
128#include <linux/pci.h>
129#include <linux/inetdevice.h>
130#include <linux/cpu_rmap.h>
131#include <linux/static_key.h>
132
133#include "net-sysfs.h"
134
135/* Instead of increasing this, you should create a hash table. */
136#define MAX_GRO_SKBS 8
137
138/* This should be increased if a protocol with a bigger head is added. */
139#define GRO_MAX_HEAD (MAX_HEADER + 128)
140
141static DEFINE_SPINLOCK(ptype_lock);
142static DEFINE_SPINLOCK(offload_lock);
143struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
144struct list_head ptype_all __read_mostly; /* Taps */
145static struct list_head offload_base __read_mostly;
146
147/*
148 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
149 * semaphore.
150 *
151 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
152 *
153 * Writers must hold the rtnl semaphore while they loop through the
154 * dev_base_head list, and hold dev_base_lock for writing when they do the
155 * actual updates. This allows pure readers to access the list even
156 * while a writer is preparing to update it.
157 *
158 * To put it another way, dev_base_lock is held for writing only to
159 * protect against pure readers; the rtnl semaphore provides the
160 * protection against other writers.
161 *
162 * See, for example usages, register_netdevice() and
163 * unregister_netdevice(), which must be called with the rtnl
164 * semaphore held.
165 */
166DEFINE_RWLOCK(dev_base_lock);
167EXPORT_SYMBOL(dev_base_lock);
168
169seqcount_t devnet_rename_seq;
170
171static inline void dev_base_seq_inc(struct net *net)
172{
173 while (++net->dev_base_seq == 0);
174}
175
881d966b 176static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
1da177e4 177{
95c96174
ED
178 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
179
08e9897d 180 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
1da177e4
LT
181}
182
881d966b 183static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
1da177e4 184{
7c28bd0b 185 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
1da177e4
LT
186}
187
e36fa2f7 188static inline void rps_lock(struct softnet_data *sd)
152102c7
CG
189{
190#ifdef CONFIG_RPS
e36fa2f7 191 spin_lock(&sd->input_pkt_queue.lock);
152102c7
CG
192#endif
193}
194
e36fa2f7 195static inline void rps_unlock(struct softnet_data *sd)
152102c7
CG
196{
197#ifdef CONFIG_RPS
e36fa2f7 198 spin_unlock(&sd->input_pkt_queue.lock);
152102c7
CG
199#endif
200}
201
ce286d32 202/* Device list insertion */
53759be9 203static void list_netdevice(struct net_device *dev)
ce286d32 204{
c346dca1 205 struct net *net = dev_net(dev);
ce286d32
EB
206
207 ASSERT_RTNL();
208
209 write_lock_bh(&dev_base_lock);
c6d14c84 210 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
72c9528b 211 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
fb699dfd
ED
212 hlist_add_head_rcu(&dev->index_hlist,
213 dev_index_hash(net, dev->ifindex));
ce286d32 214 write_unlock_bh(&dev_base_lock);
4e985ada
TG
215
216 dev_base_seq_inc(net);
ce286d32
EB
217}
218
fb699dfd
ED
219/* Device list removal
220 * caller must respect a RCU grace period before freeing/reusing dev
221 */
ce286d32
EB
222static void unlist_netdevice(struct net_device *dev)
223{
224 ASSERT_RTNL();
225
226 /* Unlink dev from the device chain */
227 write_lock_bh(&dev_base_lock);
c6d14c84 228 list_del_rcu(&dev->dev_list);
72c9528b 229 hlist_del_rcu(&dev->name_hlist);
fb699dfd 230 hlist_del_rcu(&dev->index_hlist);
ce286d32 231 write_unlock_bh(&dev_base_lock);
4e985ada
TG
232
233 dev_base_seq_inc(dev_net(dev));
ce286d32
EB
234}
235
1da177e4
LT
236/*
237 * Our notifier list
238 */
239
f07d5b94 240static RAW_NOTIFIER_HEAD(netdev_chain);
1da177e4
LT
241
242/*
243 * Device drivers call our routines to queue packets here. We empty the
244 * queue in the local softnet handler.
245 */
bea3348e 246
9958da05 247DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
d1b19dff 248EXPORT_PER_CPU_SYMBOL(softnet_data);
1da177e4 249
cf508b12 250#ifdef CONFIG_LOCKDEP
723e98b7 251/*
c773e847 252 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
723e98b7
JP
253 * according to dev->type
254 */
255static const unsigned short netdev_lock_type[] =
256 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
257 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
258 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
259 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
260 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
261 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
262 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
263 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
264 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
265 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
266 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
267 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
211ed865
PG
268 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
269 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
270 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
723e98b7 271
36cbd3dc 272static const char *const netdev_lock_name[] =
723e98b7
JP
273 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
274 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
275 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
276 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
277 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
278 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
279 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
280 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
281 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
282 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
283 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
284 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
211ed865
PG
285 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
286 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
287 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
723e98b7
JP
288
289static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
cf508b12 290static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
723e98b7
JP
291
292static inline unsigned short netdev_lock_pos(unsigned short dev_type)
293{
294 int i;
295
296 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
297 if (netdev_lock_type[i] == dev_type)
298 return i;
299 /* the last key is used by default */
300 return ARRAY_SIZE(netdev_lock_type) - 1;
301}
302
cf508b12
DM
303static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
304 unsigned short dev_type)
723e98b7
JP
305{
306 int i;
307
308 i = netdev_lock_pos(dev_type);
309 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
310 netdev_lock_name[i]);
311}
cf508b12
DM
312
313static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
314{
315 int i;
316
317 i = netdev_lock_pos(dev->type);
318 lockdep_set_class_and_name(&dev->addr_list_lock,
319 &netdev_addr_lock_key[i],
320 netdev_lock_name[i]);
321}
723e98b7 322#else
cf508b12
DM
323static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
324 unsigned short dev_type)
325{
326}
327static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
723e98b7
JP
328{
329}
330#endif
1da177e4
LT
331
332/*******************************************************************************
333
334 Protocol management and registration routines
335
336*******************************************************************************/
337
1da177e4
LT
338/*
339 * Add a protocol ID to the list. Now that the input handler is
340 * smarter we can dispense with all the messy stuff that used to be
341 * here.
342 *
343 * BEWARE!!! Protocol handlers, mangling input packets,
344 * MUST BE last in hash buckets and checking protocol handlers
345 * MUST start from promiscuous ptype_all chain in net_bh.
346 * It is true now, do not change it.
347 * Explanation follows: if a protocol handler that mangles packets were
348 * first on the list, it would not be able to sense that the packet
349 * is cloned and should be copied-on-write, so it would
350 * change it and subsequent readers would get a broken packet.
351 * --ANK (980803)
352 */
353
c07b68e8
ED
354static inline struct list_head *ptype_head(const struct packet_type *pt)
355{
356 if (pt->type == htons(ETH_P_ALL))
357 return &ptype_all;
358 else
359 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
360}
361
1da177e4
LT
362/**
363 * dev_add_pack - add packet handler
364 * @pt: packet type declaration
365 *
366 * Add a protocol handler to the networking stack. The passed &packet_type
367 * is linked into kernel lists and may not be freed until it has been
368 * removed from the kernel lists.
369 *
370 * This call does not sleep, therefore it cannot
371 * guarantee that all CPUs that are in the middle of receiving packets
372 * will see the new packet type (until the next received packet).
373 */
374
375void dev_add_pack(struct packet_type *pt)
376{
c07b68e8 377 struct list_head *head = ptype_head(pt);
1da177e4 378
c07b68e8
ED
379 spin_lock(&ptype_lock);
380 list_add_rcu(&pt->list, head);
381 spin_unlock(&ptype_lock);
1da177e4 382}
d1b19dff 383EXPORT_SYMBOL(dev_add_pack);
1da177e4 384
1da177e4
LT
385/**
386 * __dev_remove_pack - remove packet handler
387 * @pt: packet type declaration
388 *
389 * Remove a protocol handler that was previously added to the kernel
390 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
391 * from the kernel lists and can be freed or reused once this function
4ec93edb 392 * returns.
1da177e4
LT
393 *
394 * The packet type might still be in use by receivers
395 * and must not be freed until after all the CPUs have gone
396 * through a quiescent state.
397 */
398void __dev_remove_pack(struct packet_type *pt)
399{
c07b68e8 400 struct list_head *head = ptype_head(pt);
1da177e4
LT
401 struct packet_type *pt1;
402
c07b68e8 403 spin_lock(&ptype_lock);
1da177e4
LT
404
405 list_for_each_entry(pt1, head, list) {
406 if (pt == pt1) {
407 list_del_rcu(&pt->list);
408 goto out;
409 }
410 }
411
7b6cd1ce 412 pr_warn("dev_remove_pack: %p not found\n", pt);
1da177e4 413out:
c07b68e8 414 spin_unlock(&ptype_lock);
1da177e4 415}
d1b19dff
ED
416EXPORT_SYMBOL(__dev_remove_pack);
417
1da177e4
LT
418/**
419 * dev_remove_pack - remove packet handler
420 * @pt: packet type declaration
421 *
422 * Remove a protocol handler that was previously added to the kernel
423 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
424 * from the kernel lists and can be freed or reused once this function
425 * returns.
426 *
427 * This call sleeps to guarantee that no CPU is looking at the packet
428 * type after return.
429 */
430void dev_remove_pack(struct packet_type *pt)
431{
432 __dev_remove_pack(pt);
4ec93edb 433
1da177e4
LT
434 synchronize_net();
435}
d1b19dff 436EXPORT_SYMBOL(dev_remove_pack);
1da177e4 437
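/*
 * Illustrative sketch (not part of dev.c): how a protocol module would
 * typically use dev_add_pack()/dev_remove_pack(). The names my_ptype,
 * my_proto_rcv, my_proto_init and my_proto_exit are hypothetical
 * placeholders; only the dev_add_pack()/dev_remove_pack() calls and the
 * packet_type layout come from this file and its headers.
 */
#if 0
static int my_proto_rcv(struct sk_buff *skb, struct net_device *dev,
			struct packet_type *pt, struct net_device *orig_dev)
{
	/* ... inspect the packet, then consume or drop it ... */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type my_ptype __read_mostly = {
	.type	= cpu_to_be16(ETH_P_ALL),	/* tap every protocol, or a specific ETH_P_* */
	.func	= my_proto_rcv,
};

static int __init my_proto_init(void)
{
	dev_add_pack(&my_ptype);	/* does not sleep */
	return 0;
}

static void __exit my_proto_exit(void)
{
	dev_remove_pack(&my_ptype);	/* sleeps: waits for a quiescent state */
}
#endif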
62532da9
VY
438
439/**
440 * dev_add_offload - register offload handlers
441 * @po: protocol offload declaration
442 *
443 * Add protocol offload handlers to the networking stack. The passed
444 * &proto_offload is linked into kernel lists and may not be freed until
445 * it has been removed from the kernel lists.
446 *
447 * This call does not sleep, therefore it cannot
448 * guarantee that all CPUs that are in the middle of receiving packets
449 * will see the new offload handlers (until the next received packet).
450 */
451void dev_add_offload(struct packet_offload *po)
452{
453 struct list_head *head = &offload_base;
454
455 spin_lock(&offload_lock);
456 list_add_rcu(&po->list, head);
457 spin_unlock(&offload_lock);
458}
459EXPORT_SYMBOL(dev_add_offload);
460
461/**
462 * __dev_remove_offload - remove offload handler
463 * @po: packet offload declaration
464 *
465 * Remove a protocol offload handler that was previously added to the
466 * kernel offload handlers by dev_add_offload(). The passed &offload_type
467 * is removed from the kernel lists and can be freed or reused once this
468 * function returns.
469 *
470 * The packet type might still be in use by receivers
471 * and must not be freed until after all the CPUs have gone
472 * through a quiescent state.
473 */
474void __dev_remove_offload(struct packet_offload *po)
475{
476 struct list_head *head = &offload_base;
477 struct packet_offload *po1;
478
c53aa505 479 spin_lock(&offload_lock);
62532da9
VY
480
481 list_for_each_entry(po1, head, list) {
482 if (po == po1) {
483 list_del_rcu(&po->list);
484 goto out;
485 }
486 }
487
488 pr_warn("dev_remove_offload: %p not found\n", po);
489out:
c53aa505 490 spin_unlock(&offload_lock);
62532da9
VY
491}
492EXPORT_SYMBOL(__dev_remove_offload);
493
494/**
495 * dev_remove_offload - remove packet offload handler
496 * @po: packet offload declaration
497 *
498 * Remove a packet offload handler that was previously added to the kernel
499 * offload handlers by dev_add_offload(). The passed &offload_type is
500 * removed from the kernel lists and can be freed or reused once this
501 * function returns.
502 *
503 * This call sleeps to guarantee that no CPU is looking at the packet
504 * type after return.
505 */
506void dev_remove_offload(struct packet_offload *po)
507{
508 __dev_remove_offload(po);
509
510 synchronize_net();
511}
512EXPORT_SYMBOL(dev_remove_offload);
513
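/*
 * Illustrative sketch (not part of dev.c): registering GRO/GSO offload
 * callbacks for a protocol, in the spirit of what af_inet.c does for
 * ETH_P_IP. my_gro_receive/my_gro_complete are hypothetical callbacks
 * whose prototypes must match struct offload_callbacks for this kernel
 * version; my_offload_init/my_offload_exit are placeholders as well.
 */
#if 0
static struct packet_offload my_offload __read_mostly = {
	.type = cpu_to_be16(ETH_P_IP),		/* example protocol */
	.callbacks = {
		.gro_receive  = my_gro_receive,		/* hypothetical */
		.gro_complete = my_gro_complete,	/* hypothetical */
	},
};

static int __init my_offload_init(void)
{
	dev_add_offload(&my_offload);		/* does not sleep */
	return 0;
}

static void __exit my_offload_exit(void)
{
	dev_remove_offload(&my_offload);	/* sleeps via synchronize_net() */
}
#endif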
1da177e4
LT
514/******************************************************************************
515
516 Device Boot-time Settings Routines
517
518*******************************************************************************/
519
520/* Boot time configuration table */
521static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
522
523/**
524 * netdev_boot_setup_add - add new setup entry
525 * @name: name of the device
526 * @map: configured settings for the device
527 *
528 * Adds a new setup entry to the dev_boot_setup list. The function
529 * returns 0 on error and 1 on success. This is a generic routine for
530 * all netdevices.
531 */
532static int netdev_boot_setup_add(char *name, struct ifmap *map)
533{
534 struct netdev_boot_setup *s;
535 int i;
536
537 s = dev_boot_setup;
538 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
539 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
540 memset(s[i].name, 0, sizeof(s[i].name));
93b3cff9 541 strlcpy(s[i].name, name, IFNAMSIZ);
1da177e4
LT
542 memcpy(&s[i].map, map, sizeof(s[i].map));
543 break;
544 }
545 }
546
547 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
548}
549
550/**
551 * netdev_boot_setup_check - check boot time settings
552 * @dev: the netdevice
553 *
554 * Check boot time settings for the device.
555 * The found settings are set for the device to be used
556 * later in the device probing.
557 * Returns 0 if no settings found, 1 if they are.
558 */
559int netdev_boot_setup_check(struct net_device *dev)
560{
561 struct netdev_boot_setup *s = dev_boot_setup;
562 int i;
563
564 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
565 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
93b3cff9 566 !strcmp(dev->name, s[i].name)) {
1da177e4
LT
567 dev->irq = s[i].map.irq;
568 dev->base_addr = s[i].map.base_addr;
569 dev->mem_start = s[i].map.mem_start;
570 dev->mem_end = s[i].map.mem_end;
571 return 1;
572 }
573 }
574 return 0;
575}
d1b19dff 576EXPORT_SYMBOL(netdev_boot_setup_check);
1da177e4
LT
577
578
579/**
580 * netdev_boot_base - get address from boot time settings
581 * @prefix: prefix for network device
582 * @unit: id for network device
583 *
584 * Check boot time settings for the base address of device.
585 * The found settings are set for the device to be used
586 * later in the device probing.
587 * Returns 0 if no settings found.
588 */
589unsigned long netdev_boot_base(const char *prefix, int unit)
590{
591 const struct netdev_boot_setup *s = dev_boot_setup;
592 char name[IFNAMSIZ];
593 int i;
594
595 sprintf(name, "%s%d", prefix, unit);
596
597 /*
598 * If device already registered then return base of 1
599 * to indicate not to probe for this interface
600 */
881d966b 601 if (__dev_get_by_name(&init_net, name))
1da177e4
LT
602 return 1;
603
604 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
605 if (!strcmp(name, s[i].name))
606 return s[i].map.base_addr;
607 return 0;
608}
609
610/*
611 * Saves at boot time configured settings for any netdevice.
612 */
613int __init netdev_boot_setup(char *str)
614{
615 int ints[5];
616 struct ifmap map;
617
618 str = get_options(str, ARRAY_SIZE(ints), ints);
619 if (!str || !*str)
620 return 0;
621
622 /* Save settings */
623 memset(&map, 0, sizeof(map));
624 if (ints[0] > 0)
625 map.irq = ints[1];
626 if (ints[0] > 1)
627 map.base_addr = ints[2];
628 if (ints[0] > 2)
629 map.mem_start = ints[3];
630 if (ints[0] > 3)
631 map.mem_end = ints[4];
632
633 /* Add new entry to the list */
634 return netdev_boot_setup_add(str, &map);
635}
636
637__setup("netdev=", netdev_boot_setup);
638
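/*
 * Example (illustrative, not part of dev.c): with the parsing above,
 * booting with "netdev=9,0x300,0,0,eth1" stores irq=9 and base_addr=0x300
 * for the name "eth1" in dev_boot_setup; a driver that later probes "eth1"
 * picks these values up via netdev_boot_setup_check(). The specific values
 * are arbitrary.
 */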
639/*******************************************************************************
640
641 Device Interface Subroutines
642
643*******************************************************************************/
644
645/**
646 * __dev_get_by_name - find a device by its name
c4ea43c5 647 * @net: the applicable net namespace
1da177e4
LT
648 * @name: name to find
649 *
650 * Find an interface by name. Must be called under RTNL semaphore
651 * or @dev_base_lock. If the name is found a pointer to the device
652 * is returned. If the name is not found then %NULL is returned. The
653 * reference counters are not incremented so the caller must be
654 * careful with locks.
655 */
656
881d966b 657struct net_device *__dev_get_by_name(struct net *net, const char *name)
1da177e4 658{
0bd8d536
ED
659 struct net_device *dev;
660 struct hlist_head *head = dev_name_hash(net, name);
1da177e4 661
b67bfe0d 662 hlist_for_each_entry(dev, head, name_hlist)
1da177e4
LT
663 if (!strncmp(dev->name, name, IFNAMSIZ))
664 return dev;
0bd8d536 665
1da177e4
LT
666 return NULL;
667}
d1b19dff 668EXPORT_SYMBOL(__dev_get_by_name);
1da177e4 669
72c9528b
ED
670/**
671 * dev_get_by_name_rcu - find a device by its name
672 * @net: the applicable net namespace
673 * @name: name to find
674 *
675 * Find an interface by name.
676 * If the name is found a pointer to the device is returned.
677 * If the name is not found then %NULL is returned.
678 * The reference counters are not incremented so the caller must be
679 * careful with locks. The caller must hold RCU lock.
680 */
681
682struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
683{
72c9528b
ED
684 struct net_device *dev;
685 struct hlist_head *head = dev_name_hash(net, name);
686
b67bfe0d 687 hlist_for_each_entry_rcu(dev, head, name_hlist)
72c9528b
ED
688 if (!strncmp(dev->name, name, IFNAMSIZ))
689 return dev;
690
691 return NULL;
692}
693EXPORT_SYMBOL(dev_get_by_name_rcu);
694
1da177e4
LT
695/**
696 * dev_get_by_name - find a device by its name
c4ea43c5 697 * @net: the applicable net namespace
1da177e4
LT
698 * @name: name to find
699 *
700 * Find an interface by name. This can be called from any
701 * context and does its own locking. The returned handle has
702 * the usage count incremented and the caller must use dev_put() to
703 * release it when it is no longer needed. %NULL is returned if no
704 * matching device is found.
705 */
706
881d966b 707struct net_device *dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
708{
709 struct net_device *dev;
710
72c9528b
ED
711 rcu_read_lock();
712 dev = dev_get_by_name_rcu(net, name);
1da177e4
LT
713 if (dev)
714 dev_hold(dev);
72c9528b 715 rcu_read_unlock();
1da177e4
LT
716 return dev;
717}
d1b19dff 718EXPORT_SYMBOL(dev_get_by_name);
1da177e4
LT
719
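/*
 * Illustrative sketch (not part of dev.c): the two name-lookup patterns
 * documented above. my_lookup_example is a hypothetical caller and "eth0"
 * is just an example name.
 */
#if 0
static void my_lookup_example(struct net *net)
{
	struct net_device *dev;

	/* Sleepable context: takes a reference the caller must drop. */
	dev = dev_get_by_name(net, "eth0");
	if (dev) {
		/* ... use dev ... */
		dev_put(dev);
	}

	/* Fast path: no reference taken, pointer only valid inside the RCU section. */
	rcu_read_lock();
	dev = dev_get_by_name_rcu(net, "eth0");
	if (dev)
		pr_info("%s has ifindex %d\n", dev->name, dev->ifindex);
	rcu_read_unlock();
}
#endif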
720/**
721 * __dev_get_by_index - find a device by its ifindex
c4ea43c5 722 * @net: the applicable net namespace
1da177e4
LT
723 * @ifindex: index of device
724 *
725 * Search for an interface by index. Returns %NULL if the device
726 * is not found or a pointer to the device. The device has not
727 * had its reference counter increased so the caller must be careful
728 * about locking. The caller must hold either the RTNL semaphore
729 * or @dev_base_lock.
730 */
731
881d966b 732struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1da177e4 733{
0bd8d536
ED
734 struct net_device *dev;
735 struct hlist_head *head = dev_index_hash(net, ifindex);
1da177e4 736
b67bfe0d 737 hlist_for_each_entry(dev, head, index_hlist)
1da177e4
LT
738 if (dev->ifindex == ifindex)
739 return dev;
0bd8d536 740
1da177e4
LT
741 return NULL;
742}
d1b19dff 743EXPORT_SYMBOL(__dev_get_by_index);
1da177e4 744
fb699dfd
ED
745/**
746 * dev_get_by_index_rcu - find a device by its ifindex
747 * @net: the applicable net namespace
748 * @ifindex: index of device
749 *
750 * Search for an interface by index. Returns %NULL if the device
751 * is not found or a pointer to the device. The device has not
752 * had its reference counter increased so the caller must be careful
753 * about locking. The caller must hold RCU lock.
754 */
755
756struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
757{
fb699dfd
ED
758 struct net_device *dev;
759 struct hlist_head *head = dev_index_hash(net, ifindex);
760
b67bfe0d 761 hlist_for_each_entry_rcu(dev, head, index_hlist)
fb699dfd
ED
762 if (dev->ifindex == ifindex)
763 return dev;
764
765 return NULL;
766}
767EXPORT_SYMBOL(dev_get_by_index_rcu);
768
1da177e4
LT
769
770/**
771 * dev_get_by_index - find a device by its ifindex
c4ea43c5 772 * @net: the applicable net namespace
1da177e4
LT
773 * @ifindex: index of device
774 *
775 * Search for an interface by index. Returns NULL if the device
776 * is not found or a pointer to the device. The device returned has
777 * had a reference added and the pointer is safe until the user calls
778 * dev_put to indicate they have finished with it.
779 */
780
881d966b 781struct net_device *dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
782{
783 struct net_device *dev;
784
fb699dfd
ED
785 rcu_read_lock();
786 dev = dev_get_by_index_rcu(net, ifindex);
1da177e4
LT
787 if (dev)
788 dev_hold(dev);
fb699dfd 789 rcu_read_unlock();
1da177e4
LT
790 return dev;
791}
d1b19dff 792EXPORT_SYMBOL(dev_get_by_index);
1da177e4 793
5dbe7c17
NS
794/**
795 * netdev_get_name - get a netdevice name, knowing its ifindex.
796 * @net: network namespace
797 * @name: a pointer to the buffer where the name will be stored.
798 * @ifindex: the ifindex of the interface to get the name from.
799 *
800 * The use of raw_seqcount_begin() and cond_resched() before
801 * retrying is required as we want to give the writers a chance
802 * to complete when CONFIG_PREEMPT is not set.
803 */
804int netdev_get_name(struct net *net, char *name, int ifindex)
805{
806 struct net_device *dev;
807 unsigned int seq;
808
809retry:
810 seq = raw_seqcount_begin(&devnet_rename_seq);
811 rcu_read_lock();
812 dev = dev_get_by_index_rcu(net, ifindex);
813 if (!dev) {
814 rcu_read_unlock();
815 return -ENODEV;
816 }
817
818 strcpy(name, dev->name);
819 rcu_read_unlock();
820 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
821 cond_resched();
822 goto retry;
823 }
824
825 return 0;
826}
827
1da177e4 828/**
941666c2 829 * dev_getbyhwaddr_rcu - find a device by its hardware address
c4ea43c5 830 * @net: the applicable net namespace
1da177e4
LT
831 * @type: media type of device
832 * @ha: hardware address
833 *
834 * Search for an interface by MAC address. Returns NULL if the device
c506653d
ED
835 * is not found or a pointer to the device.
836 * The caller must hold RCU or RTNL.
941666c2 837 * The returned device has not had its ref count increased
1da177e4
LT
838 * and the caller must therefore be careful about locking
839 *
1da177e4
LT
840 */
841
941666c2
ED
842struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
843 const char *ha)
1da177e4
LT
844{
845 struct net_device *dev;
846
941666c2 847 for_each_netdev_rcu(net, dev)
1da177e4
LT
848 if (dev->type == type &&
849 !memcmp(dev->dev_addr, ha, dev->addr_len))
7562f876
PE
850 return dev;
851
852 return NULL;
1da177e4 853}
941666c2 854EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
cf309e3f 855
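/*
 * Illustrative sketch (not part of dev.c): dev_getbyhwaddr_rcu() must run
 * under RCU (or RTNL) and does not take a reference, so the caller grabs
 * one explicitly if the device must outlive the RCU section.
 * my_find_by_mac and the address bytes are hypothetical.
 */
#if 0
static struct net_device *my_find_by_mac(struct net *net)
{
	static const char mac[ETH_ALEN] = { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55 };
	struct net_device *dev;

	rcu_read_lock();
	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, mac);
	if (dev)
		dev_hold(dev);	/* keep dev alive past the RCU section */
	rcu_read_unlock();

	return dev;		/* caller must dev_put() if non-NULL */
}
#endif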
881d966b 856struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1da177e4
LT
857{
858 struct net_device *dev;
859
4e9cac2b 860 ASSERT_RTNL();
881d966b 861 for_each_netdev(net, dev)
4e9cac2b 862 if (dev->type == type)
7562f876
PE
863 return dev;
864
865 return NULL;
4e9cac2b 866}
4e9cac2b
PM
867EXPORT_SYMBOL(__dev_getfirstbyhwtype);
868
881d966b 869struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
4e9cac2b 870{
99fe3c39 871 struct net_device *dev, *ret = NULL;
4e9cac2b 872
99fe3c39
ED
873 rcu_read_lock();
874 for_each_netdev_rcu(net, dev)
875 if (dev->type == type) {
876 dev_hold(dev);
877 ret = dev;
878 break;
879 }
880 rcu_read_unlock();
881 return ret;
1da177e4 882}
1da177e4
LT
883EXPORT_SYMBOL(dev_getfirstbyhwtype);
884
885/**
bb69ae04 886 * dev_get_by_flags_rcu - find any device with given flags
c4ea43c5 887 * @net: the applicable net namespace
1da177e4
LT
888 * @if_flags: IFF_* values
889 * @mask: bitmask of bits in if_flags to check
890 *
891 * Search for any interface with the given flags. Returns NULL if a device
bb69ae04
ED
892 * is not found or a pointer to the device. Must be called inside
893 * rcu_read_lock(), and result refcount is unchanged.
1da177e4
LT
894 */
895
bb69ae04 896struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
d1b19dff 897 unsigned short mask)
1da177e4 898{
7562f876 899 struct net_device *dev, *ret;
1da177e4 900
7562f876 901 ret = NULL;
c6d14c84 902 for_each_netdev_rcu(net, dev) {
1da177e4 903 if (((dev->flags ^ if_flags) & mask) == 0) {
7562f876 904 ret = dev;
1da177e4
LT
905 break;
906 }
907 }
7562f876 908 return ret;
1da177e4 909}
bb69ae04 910EXPORT_SYMBOL(dev_get_by_flags_rcu);
1da177e4
LT
911
912/**
913 * dev_valid_name - check if name is okay for network device
914 * @name: name string
915 *
916 * Network device names need to be valid file names
917 * to allow sysfs to work. We also disallow any kind of
918 * whitespace.
919 */
95f050bf 920bool dev_valid_name(const char *name)
1da177e4 921{
c7fa9d18 922 if (*name == '\0')
95f050bf 923 return false;
b6fe17d6 924 if (strlen(name) >= IFNAMSIZ)
95f050bf 925 return false;
c7fa9d18 926 if (!strcmp(name, ".") || !strcmp(name, ".."))
95f050bf 927 return false;
c7fa9d18
DM
928
929 while (*name) {
cde81ed7 930 if (*name == '/' || *name == ':' || isspace(*name))
95f050bf 931 return false;
c7fa9d18
DM
932 name++;
933 }
95f050bf 934 return true;
1da177e4 935}
d1b19dff 936EXPORT_SYMBOL(dev_valid_name);
1da177e4
LT
937
938/**
b267b179
EB
939 * __dev_alloc_name - allocate a name for a device
940 * @net: network namespace to allocate the device name in
1da177e4 941 * @name: name format string
b267b179 942 * @buf: scratch buffer and result name string
1da177e4
LT
943 *
944 * Passed a format string - eg "lt%d" - it will try and find a suitable
945 * id. It scans the list of devices to build up a free map, then chooses
946 * the first empty slot. The caller must hold the dev_base or rtnl lock
947 * while allocating the name and adding the device in order to avoid
948 * duplicates.
949 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
950 * Returns the number of the unit assigned or a negative errno code.
1da177e4
LT
951 */
952
b267b179 953static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1da177e4
LT
954{
955 int i = 0;
1da177e4
LT
956 const char *p;
957 const int max_netdevices = 8*PAGE_SIZE;
cfcabdcc 958 unsigned long *inuse;
1da177e4
LT
959 struct net_device *d;
960
961 p = strnchr(name, IFNAMSIZ-1, '%');
962 if (p) {
963 /*
964 * Verify the string as this thing may have come from
965 * the user. There must be either one "%d" and no other "%"
966 * characters.
967 */
968 if (p[1] != 'd' || strchr(p + 2, '%'))
969 return -EINVAL;
970
971 /* Use one page as a bit array of possible slots */
cfcabdcc 972 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1da177e4
LT
973 if (!inuse)
974 return -ENOMEM;
975
881d966b 976 for_each_netdev(net, d) {
1da177e4
LT
977 if (!sscanf(d->name, name, &i))
978 continue;
979 if (i < 0 || i >= max_netdevices)
980 continue;
981
982 /* avoid cases where sscanf is not exact inverse of printf */
b267b179 983 snprintf(buf, IFNAMSIZ, name, i);
1da177e4
LT
984 if (!strncmp(buf, d->name, IFNAMSIZ))
985 set_bit(i, inuse);
986 }
987
988 i = find_first_zero_bit(inuse, max_netdevices);
989 free_page((unsigned long) inuse);
990 }
991
d9031024
OP
992 if (buf != name)
993 snprintf(buf, IFNAMSIZ, name, i);
b267b179 994 if (!__dev_get_by_name(net, buf))
1da177e4 995 return i;
1da177e4
LT
996
997 /* It is possible to run out of possible slots
998 * when the name is long and there isn't enough space left
999 * for the digits, or if all bits are used.
1000 */
1001 return -ENFILE;
1002}
1003
b267b179
EB
1004/**
1005 * dev_alloc_name - allocate a name for a device
1006 * @dev: device
1007 * @name: name format string
1008 *
1009 * Passed a format string - eg "lt%d" - it will try and find a suitable
1010 * id. It scans the list of devices to build up a free map, then chooses
1011 * the first empty slot. The caller must hold the dev_base or rtnl lock
1012 * while allocating the name and adding the device in order to avoid
1013 * duplicates.
1014 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1015 * Returns the number of the unit assigned or a negative errno code.
1016 */
1017
1018int dev_alloc_name(struct net_device *dev, const char *name)
1019{
1020 char buf[IFNAMSIZ];
1021 struct net *net;
1022 int ret;
1023
c346dca1
YH
1024 BUG_ON(!dev_net(dev));
1025 net = dev_net(dev);
b267b179
EB
1026 ret = __dev_alloc_name(net, name, buf);
1027 if (ret >= 0)
1028 strlcpy(dev->name, buf, IFNAMSIZ);
1029 return ret;
1030}
d1b19dff 1031EXPORT_SYMBOL(dev_alloc_name);
b267b179 1032
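/*
 * Illustrative sketch (not part of dev.c): a driver asking for an
 * "eth%d"-style name before registering the device, with RTNL held as
 * described above. The err variable and out_free label are hypothetical.
 */
#if 0
	err = dev_alloc_name(dev, "eth%d");	/* fills dev->name, e.g. "eth2" */
	if (err < 0)
		goto out_free;			/* hypothetical error path */
#endif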
828de4f6
G
1033static int dev_alloc_name_ns(struct net *net,
1034 struct net_device *dev,
1035 const char *name)
d9031024 1036{
828de4f6
G
1037 char buf[IFNAMSIZ];
1038 int ret;
8ce6cebc 1039
828de4f6
G
1040 ret = __dev_alloc_name(net, name, buf);
1041 if (ret >= 0)
1042 strlcpy(dev->name, buf, IFNAMSIZ);
1043 return ret;
1044}
1045
1046static int dev_get_valid_name(struct net *net,
1047 struct net_device *dev,
1048 const char *name)
1049{
1050 BUG_ON(!net);
8ce6cebc 1051
d9031024
OP
1052 if (!dev_valid_name(name))
1053 return -EINVAL;
1054
1c5cae81 1055 if (strchr(name, '%'))
828de4f6 1056 return dev_alloc_name_ns(net, dev, name);
d9031024
OP
1057 else if (__dev_get_by_name(net, name))
1058 return -EEXIST;
8ce6cebc
DL
1059 else if (dev->name != name)
1060 strlcpy(dev->name, name, IFNAMSIZ);
d9031024
OP
1061
1062 return 0;
1063}
1da177e4
LT
1064
1065/**
1066 * dev_change_name - change name of a device
1067 * @dev: device
1068 * @newname: name (or format string) must be at least IFNAMSIZ
1069 *
1070 * Change the name of a device. A format string such as "eth%d"
1071 * can be passed for wildcarding.
1072 */
cf04a4c7 1073int dev_change_name(struct net_device *dev, const char *newname)
1da177e4 1074{
fcc5a03a 1075 char oldname[IFNAMSIZ];
1da177e4 1076 int err = 0;
fcc5a03a 1077 int ret;
881d966b 1078 struct net *net;
1da177e4
LT
1079
1080 ASSERT_RTNL();
c346dca1 1081 BUG_ON(!dev_net(dev));
1da177e4 1082
c346dca1 1083 net = dev_net(dev);
1da177e4
LT
1084 if (dev->flags & IFF_UP)
1085 return -EBUSY;
1086
30e6c9fa 1087 write_seqcount_begin(&devnet_rename_seq);
c91f6df2
BH
1088
1089 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
30e6c9fa 1090 write_seqcount_end(&devnet_rename_seq);
c8d90dca 1091 return 0;
c91f6df2 1092 }
c8d90dca 1093
fcc5a03a
HX
1094 memcpy(oldname, dev->name, IFNAMSIZ);
1095
828de4f6 1096 err = dev_get_valid_name(net, dev, newname);
c91f6df2 1097 if (err < 0) {
30e6c9fa 1098 write_seqcount_end(&devnet_rename_seq);
d9031024 1099 return err;
c91f6df2 1100 }
1da177e4 1101
fcc5a03a 1102rollback:
a1b3f594
EB
1103 ret = device_rename(&dev->dev, dev->name);
1104 if (ret) {
1105 memcpy(dev->name, oldname, IFNAMSIZ);
30e6c9fa 1106 write_seqcount_end(&devnet_rename_seq);
a1b3f594 1107 return ret;
dcc99773 1108 }
7f988eab 1109
30e6c9fa 1110 write_seqcount_end(&devnet_rename_seq);
c91f6df2 1111
7f988eab 1112 write_lock_bh(&dev_base_lock);
372b2312 1113 hlist_del_rcu(&dev->name_hlist);
72c9528b
ED
1114 write_unlock_bh(&dev_base_lock);
1115
1116 synchronize_rcu();
1117
1118 write_lock_bh(&dev_base_lock);
1119 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
7f988eab
HX
1120 write_unlock_bh(&dev_base_lock);
1121
056925ab 1122 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
fcc5a03a
HX
1123 ret = notifier_to_errno(ret);
1124
1125 if (ret) {
91e9c07b
ED
1126 /* err >= 0 after dev_alloc_name() or stores the first errno */
1127 if (err >= 0) {
fcc5a03a 1128 err = ret;
30e6c9fa 1129 write_seqcount_begin(&devnet_rename_seq);
fcc5a03a
HX
1130 memcpy(dev->name, oldname, IFNAMSIZ);
1131 goto rollback;
91e9c07b 1132 } else {
7b6cd1ce 1133 pr_err("%s: name change rollback failed: %d\n",
91e9c07b 1134 dev->name, ret);
fcc5a03a
HX
1135 }
1136 }
1da177e4
LT
1137
1138 return err;
1139}
1140
0b815a1a
SH
1141/**
1142 * dev_set_alias - change ifalias of a device
1143 * @dev: device
1144 * @alias: name up to IFALIASZ
1145 * @len: limit of bytes to copy from @alias
1146 *
1147 * Set the ifalias for a device.
1148 */
1149int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1150{
7364e445
AK
1151 char *new_ifalias;
1152
0b815a1a
SH
1153 ASSERT_RTNL();
1154
1155 if (len >= IFALIASZ)
1156 return -EINVAL;
1157
96ca4a2c 1158 if (!len) {
388dfc2d
SK
1159 kfree(dev->ifalias);
1160 dev->ifalias = NULL;
96ca4a2c
OH
1161 return 0;
1162 }
1163
7364e445
AK
1164 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1165 if (!new_ifalias)
0b815a1a 1166 return -ENOMEM;
7364e445 1167 dev->ifalias = new_ifalias;
0b815a1a
SH
1168
1169 strlcpy(dev->ifalias, alias, len+1);
1170 return len;
1171}
1172
1173
d8a33ac4 1174/**
3041a069 1175 * netdev_features_change - device changes features
d8a33ac4
SH
1176 * @dev: device to cause notification
1177 *
1178 * Called to indicate a device has changed features.
1179 */
1180void netdev_features_change(struct net_device *dev)
1181{
056925ab 1182 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
d8a33ac4
SH
1183}
1184EXPORT_SYMBOL(netdev_features_change);
1185
1da177e4
LT
1186/**
1187 * netdev_state_change - device changes state
1188 * @dev: device to cause notification
1189 *
1190 * Called to indicate a device has changed state. This function calls
1191 * the notifier chains for netdev_chain and sends a NEWLINK message
1192 * to the routing socket.
1193 */
1194void netdev_state_change(struct net_device *dev)
1195{
1196 if (dev->flags & IFF_UP) {
056925ab 1197 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1da177e4
LT
1198 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1199 }
1200}
d1b19dff 1201EXPORT_SYMBOL(netdev_state_change);
1da177e4 1202
ee89bab1
AW
1203/**
1204 * netdev_notify_peers - notify network peers about existence of @dev
1205 * @dev: network device
1206 *
1207 * Generate traffic such that interested network peers are aware of
1208 * @dev, such as by generating a gratuitous ARP. This may be used when
1209 * a device wants to inform the rest of the network about some sort of
1210 * reconfiguration such as a failover event or virtual machine
1211 * migration.
1212 */
1213void netdev_notify_peers(struct net_device *dev)
c1da4ac7 1214{
ee89bab1
AW
1215 rtnl_lock();
1216 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1217 rtnl_unlock();
c1da4ac7 1218}
ee89bab1 1219EXPORT_SYMBOL(netdev_notify_peers);
c1da4ac7 1220
bd380811 1221static int __dev_open(struct net_device *dev)
1da177e4 1222{
d314774c 1223 const struct net_device_ops *ops = dev->netdev_ops;
3b8bcfd5 1224 int ret;
1da177e4 1225
e46b66bc
BH
1226 ASSERT_RTNL();
1227
1da177e4
LT
1228 if (!netif_device_present(dev))
1229 return -ENODEV;
1230
ca99ca14
NH
1231 /* Block netpoll from trying to do any rx path servicing.
1232 * If we don't do this there is a chance ndo_poll_controller
1233 * or ndo_poll may be running while we open the device
1234 */
1235 ret = netpoll_rx_disable(dev);
1236 if (ret)
1237 return ret;
1238
3b8bcfd5
JB
1239 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1240 ret = notifier_to_errno(ret);
1241 if (ret)
1242 return ret;
1243
1da177e4 1244 set_bit(__LINK_STATE_START, &dev->state);
bada339b 1245
d314774c
SH
1246 if (ops->ndo_validate_addr)
1247 ret = ops->ndo_validate_addr(dev);
bada339b 1248
d314774c
SH
1249 if (!ret && ops->ndo_open)
1250 ret = ops->ndo_open(dev);
1da177e4 1251
ca99ca14
NH
1252 netpoll_rx_enable(dev);
1253
bada339b
JG
1254 if (ret)
1255 clear_bit(__LINK_STATE_START, &dev->state);
1256 else {
1da177e4 1257 dev->flags |= IFF_UP;
b4bd07c2 1258 net_dmaengine_get();
4417da66 1259 dev_set_rx_mode(dev);
1da177e4 1260 dev_activate(dev);
7bf23575 1261 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 1262 }
bada339b 1263
1da177e4
LT
1264 return ret;
1265}
1266
1267/**
bd380811
PM
1268 * dev_open - prepare an interface for use.
1269 * @dev: device to open
1da177e4 1270 *
bd380811
PM
1271 * Takes a device from down to up state. The device's private open
1272 * function is invoked and then the multicast lists are loaded. Finally
1273 * the device is moved into the up state and a %NETDEV_UP message is
1274 * sent to the netdev notifier chain.
1275 *
1276 * Calling this function on an active interface is a nop. On a failure
1277 * a negative errno code is returned.
1da177e4 1278 */
bd380811
PM
1279int dev_open(struct net_device *dev)
1280{
1281 int ret;
1282
bd380811
PM
1283 if (dev->flags & IFF_UP)
1284 return 0;
1285
bd380811
PM
1286 ret = __dev_open(dev);
1287 if (ret < 0)
1288 return ret;
1289
bd380811
PM
1290 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1291 call_netdevice_notifiers(NETDEV_UP, dev);
1292
1293 return ret;
1294}
1295EXPORT_SYMBOL(dev_open);
1296
44345724 1297static int __dev_close_many(struct list_head *head)
1da177e4 1298{
44345724 1299 struct net_device *dev;
e46b66bc 1300
bd380811 1301 ASSERT_RTNL();
9d5010db
DM
1302 might_sleep();
1303
44345724 1304 list_for_each_entry(dev, head, unreg_list) {
44345724 1305 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1da177e4 1306
44345724 1307 clear_bit(__LINK_STATE_START, &dev->state);
1da177e4 1308
44345724
OP
1309 /* Synchronize to scheduled poll. We cannot touch the poll list; it
1310 * can even be on a different cpu. So just clear netif_running().
1311 *
1312 * dev->stop() will invoke napi_disable() on all of its
1313 * napi_struct instances on this device.
1314 */
1315 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1316 }
1da177e4 1317
44345724 1318 dev_deactivate_many(head);
d8b2a4d2 1319
44345724
OP
1320 list_for_each_entry(dev, head, unreg_list) {
1321 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4 1322
44345724
OP
1323 /*
1324 * Call the device specific close. This cannot fail.
1325 * Only if device is UP
1326 *
1327 * We allow it to be called even after a DETACH hot-plug
1328 * event.
1329 */
1330 if (ops->ndo_stop)
1331 ops->ndo_stop(dev);
1332
44345724 1333 dev->flags &= ~IFF_UP;
44345724
OP
1334 net_dmaengine_put();
1335 }
1336
1337 return 0;
1338}
1339
1340static int __dev_close(struct net_device *dev)
1341{
f87e6f47 1342 int retval;
44345724
OP
1343 LIST_HEAD(single);
1344
ca99ca14
NH
1345 /* Temporarily disable netpoll until the interface is down */
1346 retval = netpoll_rx_disable(dev);
1347 if (retval)
1348 return retval;
1349
44345724 1350 list_add(&dev->unreg_list, &single);
f87e6f47
LT
1351 retval = __dev_close_many(&single);
1352 list_del(&single);
ca99ca14
NH
1353
1354 netpoll_rx_enable(dev);
f87e6f47 1355 return retval;
44345724
OP
1356}
1357
3fbd8758 1358static int dev_close_many(struct list_head *head)
44345724
OP
1359{
1360 struct net_device *dev, *tmp;
1361 LIST_HEAD(tmp_list);
1da177e4 1362
44345724
OP
1363 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1364 if (!(dev->flags & IFF_UP))
1365 list_move(&dev->unreg_list, &tmp_list);
1366
1367 __dev_close_many(head);
1da177e4 1368
44345724
OP
1369 list_for_each_entry(dev, head, unreg_list) {
1370 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1371 call_netdevice_notifiers(NETDEV_DOWN, dev);
1372 }
bd380811 1373
44345724
OP
1374 /* rollback_registered_many needs the complete original list */
1375 list_splice(&tmp_list, head);
bd380811
PM
1376 return 0;
1377}
1378
1379/**
1380 * dev_close - shutdown an interface.
1381 * @dev: device to shutdown
1382 *
1383 * This function moves an active device into down state. A
1384 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1385 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1386 * chain.
1387 */
1388int dev_close(struct net_device *dev)
1389{
ca99ca14 1390 int ret = 0;
e14a5993
ED
1391 if (dev->flags & IFF_UP) {
1392 LIST_HEAD(single);
1da177e4 1393
ca99ca14
NH
1394 /* Block netpoll rx while the interface is going down */
1395 ret = netpoll_rx_disable(dev);
1396 if (ret)
1397 return ret;
1398
e14a5993
ED
1399 list_add(&dev->unreg_list, &single);
1400 dev_close_many(&single);
1401 list_del(&single);
ca99ca14
NH
1402
1403 netpoll_rx_enable(dev);
e14a5993 1404 }
ca99ca14 1405 return ret;
1da177e4 1406}
d1b19dff 1407EXPORT_SYMBOL(dev_close);
1da177e4
LT
1408
1409
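/*
 * Illustrative sketch (not part of dev.c): administratively bringing an
 * interface up and down from kernel code. Both dev_open() and dev_close()
 * require RTNL. my_bring_up/my_bring_down are hypothetical helpers.
 */
#if 0
static int my_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);	/* nop if the device is already IFF_UP */
	rtnl_unlock();
	return err;
}

static void my_bring_down(struct net_device *dev)
{
	rtnl_lock();
	dev_close(dev);		/* nop if the device is already down */
	rtnl_unlock();
}
#endif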
0187bdfb
BH
1410/**
1411 * dev_disable_lro - disable Large Receive Offload on a device
1412 * @dev: device
1413 *
1414 * Disable Large Receive Offload (LRO) on a net device. Must be
1415 * called under RTNL. This is needed if received packets may be
1416 * forwarded to another interface.
1417 */
1418void dev_disable_lro(struct net_device *dev)
1419{
f11970e3
NH
1420 /*
1421 * If we're trying to disable lro on a vlan device
1422 * use the underlying physical device instead
1423 */
1424 if (is_vlan_dev(dev))
1425 dev = vlan_dev_real_dev(dev);
1426
bc5787c6
MM
1427 dev->wanted_features &= ~NETIF_F_LRO;
1428 netdev_update_features(dev);
27660515 1429
22d5969f
MM
1430 if (unlikely(dev->features & NETIF_F_LRO))
1431 netdev_WARN(dev, "failed to disable LRO!\n");
0187bdfb
BH
1432}
1433EXPORT_SYMBOL(dev_disable_lro);
1434
1435
881d966b
EB
1436static int dev_boot_phase = 1;
1437
1da177e4
LT
1438/**
1439 * register_netdevice_notifier - register a network notifier block
1440 * @nb: notifier
1441 *
1442 * Register a notifier to be called when network device events occur.
1443 * The notifier passed is linked into the kernel structures and must
1444 * not be reused until it has been unregistered. A negative errno code
1445 * is returned on a failure.
1446 *
1447 * When registered, all registration and up events are replayed
1448 * to the new notifier to allow the device to have a race-free
1449 * view of the network device list.
1450 */
1451
1452int register_netdevice_notifier(struct notifier_block *nb)
1453{
1454 struct net_device *dev;
fcc5a03a 1455 struct net_device *last;
881d966b 1456 struct net *net;
1da177e4
LT
1457 int err;
1458
1459 rtnl_lock();
f07d5b94 1460 err = raw_notifier_chain_register(&netdev_chain, nb);
fcc5a03a
HX
1461 if (err)
1462 goto unlock;
881d966b
EB
1463 if (dev_boot_phase)
1464 goto unlock;
1465 for_each_net(net) {
1466 for_each_netdev(net, dev) {
1467 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1468 err = notifier_to_errno(err);
1469 if (err)
1470 goto rollback;
1471
1472 if (!(dev->flags & IFF_UP))
1473 continue;
1da177e4 1474
881d966b
EB
1475 nb->notifier_call(nb, NETDEV_UP, dev);
1476 }
1da177e4 1477 }
fcc5a03a
HX
1478
1479unlock:
1da177e4
LT
1480 rtnl_unlock();
1481 return err;
fcc5a03a
HX
1482
1483rollback:
1484 last = dev;
881d966b
EB
1485 for_each_net(net) {
1486 for_each_netdev(net, dev) {
1487 if (dev == last)
8f891489 1488 goto outroll;
fcc5a03a 1489
881d966b
EB
1490 if (dev->flags & IFF_UP) {
1491 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1492 nb->notifier_call(nb, NETDEV_DOWN, dev);
1493 }
1494 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
fcc5a03a 1495 }
fcc5a03a 1496 }
c67625a1 1497
8f891489 1498outroll:
c67625a1 1499 raw_notifier_chain_unregister(&netdev_chain, nb);
fcc5a03a 1500 goto unlock;
1da177e4 1501}
d1b19dff 1502EXPORT_SYMBOL(register_netdevice_notifier);
1da177e4
LT
1503
1504/**
1505 * unregister_netdevice_notifier - unregister a network notifier block
1506 * @nb: notifier
1507 *
1508 * Unregister a notifier previously registered by
1509 * register_netdevice_notifier(). The notifier is unlinked into the
1510 * kernel structures and may then be reused. A negative errno code
1511 * is returned on a failure.
7d3d43da
EB
1512 *
1513 * After unregistering, unregister and down device events are synthesized
1514 * for all devices on the device list to the removed notifier to remove
1515 * the need for special case cleanup code.
1da177e4
LT
1516 */
1517
1518int unregister_netdevice_notifier(struct notifier_block *nb)
1519{
7d3d43da
EB
1520 struct net_device *dev;
1521 struct net *net;
9f514950
HX
1522 int err;
1523
1524 rtnl_lock();
f07d5b94 1525 err = raw_notifier_chain_unregister(&netdev_chain, nb);
7d3d43da
EB
1526 if (err)
1527 goto unlock;
1528
1529 for_each_net(net) {
1530 for_each_netdev(net, dev) {
1531 if (dev->flags & IFF_UP) {
1532 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1533 nb->notifier_call(nb, NETDEV_DOWN, dev);
1534 }
1535 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
7d3d43da
EB
1536 }
1537 }
1538unlock:
9f514950
HX
1539 rtnl_unlock();
1540 return err;
1da177e4 1541}
d1b19dff 1542EXPORT_SYMBOL(unregister_netdevice_notifier);
1da177e4
LT
1543
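/*
 * Illustrative sketch (not part of dev.c): a minimal netdevice notifier.
 * On registration, existing devices are replayed as NETDEV_REGISTER and
 * NETDEV_UP events; on unregistration, NETDEV_DOWN/NETDEV_UNREGISTER are
 * synthesized, as documented above. In this kernel the notifier's data
 * pointer is the struct net_device itself. The names my_netdev_event,
 * my_netdev_notifier, my_notifier_init and my_notifier_exit are
 * hypothetical.
 */
#if 0
static int my_netdev_event(struct notifier_block *nb,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		pr_info("%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block my_netdev_notifier = {
	.notifier_call = my_netdev_event,
};

static int __init my_notifier_init(void)
{
	return register_netdevice_notifier(&my_netdev_notifier);
}

static void __exit my_notifier_exit(void)
{
	unregister_netdevice_notifier(&my_netdev_notifier);
}
#endif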
1544/**
1545 * call_netdevice_notifiers - call all network notifier blocks
1546 * @val: value passed unmodified to notifier function
c4ea43c5 1547 * @dev: net_device pointer passed unmodified to notifier function
1da177e4
LT
1548 *
1549 * Call all network notifier blocks. Parameters and return value
f07d5b94 1550 * are as for raw_notifier_call_chain().
1da177e4
LT
1551 */
1552
ad7379d4 1553int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1da177e4 1554{
ab930471 1555 ASSERT_RTNL();
ad7379d4 1556 return raw_notifier_call_chain(&netdev_chain, val, dev);
1da177e4 1557}
edf947f1 1558EXPORT_SYMBOL(call_netdevice_notifiers);
1da177e4 1559
c5905afb 1560static struct static_key netstamp_needed __read_mostly;
b90e5794 1561#ifdef HAVE_JUMP_LABEL
c5905afb 1562/* We are not allowed to call static_key_slow_dec() from irq context
b90e5794 1563 * If net_disable_timestamp() is called from irq context, defer the
c5905afb 1564 * static_key_slow_dec() calls.
b90e5794
ED
1565 */
1566static atomic_t netstamp_needed_deferred;
1567#endif
1da177e4
LT
1568
1569void net_enable_timestamp(void)
1570{
b90e5794
ED
1571#ifdef HAVE_JUMP_LABEL
1572 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1573
1574 if (deferred) {
1575 while (--deferred)
c5905afb 1576 static_key_slow_dec(&netstamp_needed);
b90e5794
ED
1577 return;
1578 }
1579#endif
c5905afb 1580 static_key_slow_inc(&netstamp_needed);
1da177e4 1581}
d1b19dff 1582EXPORT_SYMBOL(net_enable_timestamp);
1da177e4
LT
1583
1584void net_disable_timestamp(void)
1585{
b90e5794
ED
1586#ifdef HAVE_JUMP_LABEL
1587 if (in_interrupt()) {
1588 atomic_inc(&netstamp_needed_deferred);
1589 return;
1590 }
1591#endif
c5905afb 1592 static_key_slow_dec(&netstamp_needed);
1da177e4 1593}
d1b19dff 1594EXPORT_SYMBOL(net_disable_timestamp);
1da177e4 1595
3b098e2d 1596static inline void net_timestamp_set(struct sk_buff *skb)
1da177e4 1597{
588f0330 1598 skb->tstamp.tv64 = 0;
c5905afb 1599 if (static_key_false(&netstamp_needed))
a61bbcf2 1600 __net_timestamp(skb);
1da177e4
LT
1601}
1602
588f0330 1603#define net_timestamp_check(COND, SKB) \
c5905afb 1604 if (static_key_false(&netstamp_needed)) { \
588f0330
ED
1605 if ((COND) && !(SKB)->tstamp.tv64) \
1606 __net_timestamp(SKB); \
1607 } \
3b098e2d 1608
79b569f0
DL
1609static inline bool is_skb_forwardable(struct net_device *dev,
1610 struct sk_buff *skb)
1611{
1612 unsigned int len;
1613
1614 if (!(dev->flags & IFF_UP))
1615 return false;
1616
1617 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1618 if (skb->len <= len)
1619 return true;
1620
1621 /* if TSO is enabled, we don't care about the length as the packet
1622 * could be forwarded without being segmented before
1623 */
1624 if (skb_is_gso(skb))
1625 return true;
1626
1627 return false;
1628}
1629
44540960
AB
1630/**
1631 * dev_forward_skb - loopback an skb to another netif
1632 *
1633 * @dev: destination network device
1634 * @skb: buffer to forward
1635 *
1636 * return values:
1637 * NET_RX_SUCCESS (no congestion)
6ec82562 1638 * NET_RX_DROP (packet was dropped, but freed)
44540960
AB
1639 *
1640 * dev_forward_skb can be used for injecting an skb from the
1641 * start_xmit function of one device into the receive queue
1642 * of another device.
1643 *
1644 * The receiving device may be in another namespace, so
1645 * we have to clear all information in the skb that could
1646 * impact namespace isolation.
1647 */
1648int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1649{
48c83012
MT
1650 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1651 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1652 atomic_long_inc(&dev->rx_dropped);
1653 kfree_skb(skb);
1654 return NET_RX_DROP;
1655 }
1656 }
1657
44540960
AB
1658 skb_orphan(skb);
1659
79b569f0 1660 if (unlikely(!is_skb_forwardable(dev, skb))) {
caf586e5 1661 atomic_long_inc(&dev->rx_dropped);
6ec82562 1662 kfree_skb(skb);
44540960 1663 return NET_RX_DROP;
6ec82562 1664 }
3b9785c6 1665 skb->skb_iif = 0;
59b9997b
DM
1666 skb->dev = dev;
1667 skb_dst_drop(skb);
44540960
AB
1668 skb->tstamp.tv64 = 0;
1669 skb->pkt_type = PACKET_HOST;
1670 skb->protocol = eth_type_trans(skb, dev);
59b9997b
DM
1671 skb->mark = 0;
1672 secpath_reset(skb);
1673 nf_reset(skb);
124dff01 1674 nf_reset_trace(skb);
44540960
AB
1675 return netif_rx(skb);
1676}
1677EXPORT_SYMBOL_GPL(dev_forward_skb);
1678
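/*
 * Illustrative sketch (not part of dev.c): a veth-style pairing in which
 * one device's transmit path injects the skb into its peer's receive path
 * via dev_forward_skb(), as the comment above describes. struct my_priv,
 * its peer field and my_xmit are hypothetical driver pieces.
 */
#if 0
struct my_priv {
	struct net_device *peer;	/* hypothetical paired device */
};

static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct my_priv *priv = netdev_priv(dev);

	/* dev_forward_skb() consumes the skb in both the success and drop cases. */
	if (dev_forward_skb(priv->peer, skb) != NET_RX_SUCCESS)
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}
#endif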
71d9dec2
CG
1679static inline int deliver_skb(struct sk_buff *skb,
1680 struct packet_type *pt_prev,
1681 struct net_device *orig_dev)
1682{
1080e512
MT
1683 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1684 return -ENOMEM;
71d9dec2
CG
1685 atomic_inc(&skb->users);
1686 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1687}
1688
c0de08d0
EL
1689static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1690{
a3d744e9 1691 if (!ptype->af_packet_priv || !skb->sk)
c0de08d0
EL
1692 return false;
1693
1694 if (ptype->id_match)
1695 return ptype->id_match(ptype, skb->sk);
1696 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1697 return true;
1698
1699 return false;
1700}
1701
1da177e4
LT
1702/*
1703 * Support routine. Sends outgoing frames to any network
1704 * taps currently in use.
1705 */
1706
f6a78bfc 1707static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1708{
1709 struct packet_type *ptype;
71d9dec2
CG
1710 struct sk_buff *skb2 = NULL;
1711 struct packet_type *pt_prev = NULL;
a61bbcf2 1712
1da177e4
LT
1713 rcu_read_lock();
1714 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1715 /* Never send packets back to the socket
1716 * they originated from - MvS (miquels@drinkel.ow.org)
1717 */
1718 if ((ptype->dev == dev || !ptype->dev) &&
c0de08d0 1719 (!skb_loop_sk(ptype, skb))) {
71d9dec2
CG
1720 if (pt_prev) {
1721 deliver_skb(skb2, pt_prev, skb->dev);
1722 pt_prev = ptype;
1723 continue;
1724 }
1725
1726 skb2 = skb_clone(skb, GFP_ATOMIC);
1da177e4
LT
1727 if (!skb2)
1728 break;
1729
70978182
ED
1730 net_timestamp_set(skb2);
1731
1da177e4
LT
1732 /* skb->nh should be correctly
1733 set by sender, so that the second statement is
1734 just protection against buggy protocols.
1735 */
459a98ed 1736 skb_reset_mac_header(skb2);
1da177e4 1737
d56f90a7 1738 if (skb_network_header(skb2) < skb2->data ||
27a884dc 1739 skb2->network_header > skb2->tail) {
e87cc472
JP
1740 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1741 ntohs(skb2->protocol),
1742 dev->name);
c1d2bbe1 1743 skb_reset_network_header(skb2);
1da177e4
LT
1744 }
1745
b0e380b1 1746 skb2->transport_header = skb2->network_header;
1da177e4 1747 skb2->pkt_type = PACKET_OUTGOING;
71d9dec2 1748 pt_prev = ptype;
1da177e4
LT
1749 }
1750 }
71d9dec2
CG
1751 if (pt_prev)
1752 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1da177e4
LT
1753 rcu_read_unlock();
1754}
1755
2c53040f
BH
1756/**
1757 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
4f57c087
JF
1758 * @dev: Network device
1759 * @txq: number of queues available
1760 *
1761 * If real_num_tx_queues is changed the tc mappings may no longer be
1762 * valid. To resolve this, verify that each tc mapping remains valid and,
1763 * if not, zero the mapping. With no priorities mapping to an
1764 * offset/count pair it will no longer be used. In the worst case, if TC0
1765 * is invalid nothing can be done, so priority mappings are disabled. It is
1766 * expected that drivers will fix this mapping if they can before
1767 * calling netif_set_real_num_tx_queues.
1768 */
bb134d22 1769static void netif_setup_tc(struct net_device *dev, unsigned int txq)
4f57c087
JF
1770{
1771 int i;
1772 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1773
1774 /* If TC0 is invalidated disable TC mapping */
1775 if (tc->offset + tc->count > txq) {
7b6cd1ce 1776 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
4f57c087
JF
1777 dev->num_tc = 0;
1778 return;
1779 }
1780
1781 /* Invalidated prio to tc mappings set to TC0 */
1782 for (i = 1; i < TC_BITMASK + 1; i++) {
1783 int q = netdev_get_prio_tc_map(dev, i);
1784
1785 tc = &dev->tc_to_txq[q];
1786 if (tc->offset + tc->count > txq) {
7b6cd1ce
JP
1787 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1788 i, q);
4f57c087
JF
1789 netdev_set_prio_tc_map(dev, i, 0);
1790 }
1791 }
1792}
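/* Illustrative sketch: how a multiqueue driver might program the
 * prio->tc and tc->queue mappings that netif_setup_tc() above later
 * re-validates when real_num_tx_queues shrinks. The helpers
 * netdev_set_num_tc(), netdev_set_tc_queue() and netdev_set_prio_tc_map()
 * are assumed from <linux/netdevice.h>; the even queue split below is an
 * arbitrary illustration.
 */
static int example_setup_tc(struct net_device *dev, u8 num_tc)
{
	u16 count;
	u8 tc, prio;
	int err;

	if (!num_tc || num_tc > dev->real_num_tx_queues)
		return -EINVAL;

	err = netdev_set_num_tc(dev, num_tc);
	if (err)
		return err;

	count = dev->real_num_tx_queues / num_tc;
	for (tc = 0; tc < num_tc; tc++)
		netdev_set_tc_queue(dev, tc, count, tc * count);

	/* Spread the 16 skb priorities across the traffic classes */
	for (prio = 0; prio <= TC_BITMASK; prio++)
		netdev_set_prio_tc_map(dev, prio, prio % num_tc);

	return 0;
}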
1793
537c00de
AD
1794#ifdef CONFIG_XPS
1795static DEFINE_MUTEX(xps_map_mutex);
1796#define xmap_dereference(P) \
1797 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1798
10cdc3f3
AD
1799static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1800 int cpu, u16 index)
537c00de 1801{
10cdc3f3
AD
1802 struct xps_map *map = NULL;
1803 int pos;
537c00de 1804
10cdc3f3
AD
1805 if (dev_maps)
1806 map = xmap_dereference(dev_maps->cpu_map[cpu]);
537c00de 1807
10cdc3f3
AD
1808 for (pos = 0; map && pos < map->len; pos++) {
1809 if (map->queues[pos] == index) {
537c00de
AD
1810 if (map->len > 1) {
1811 map->queues[pos] = map->queues[--map->len];
1812 } else {
10cdc3f3 1813 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
537c00de
AD
1814 kfree_rcu(map, rcu);
1815 map = NULL;
1816 }
10cdc3f3 1817 break;
537c00de 1818 }
537c00de
AD
1819 }
1820
10cdc3f3
AD
1821 return map;
1822}
1823
024e9679 1824static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
10cdc3f3
AD
1825{
1826 struct xps_dev_maps *dev_maps;
024e9679 1827 int cpu, i;
10cdc3f3
AD
1828 bool active = false;
1829
1830 mutex_lock(&xps_map_mutex);
1831 dev_maps = xmap_dereference(dev->xps_maps);
1832
1833 if (!dev_maps)
1834 goto out_no_maps;
1835
1836 for_each_possible_cpu(cpu) {
024e9679
AD
1837 for (i = index; i < dev->num_tx_queues; i++) {
1838 if (!remove_xps_queue(dev_maps, cpu, i))
1839 break;
1840 }
1841 if (i == dev->num_tx_queues)
10cdc3f3
AD
1842 active = true;
1843 }
1844
1845 if (!active) {
537c00de
AD
1846 RCU_INIT_POINTER(dev->xps_maps, NULL);
1847 kfree_rcu(dev_maps, rcu);
1848 }
1849
024e9679
AD
1850 for (i = index; i < dev->num_tx_queues; i++)
1851 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1852 NUMA_NO_NODE);
1853
537c00de
AD
1854out_no_maps:
1855 mutex_unlock(&xps_map_mutex);
1856}
1857
01c5f864
AD
1858static struct xps_map *expand_xps_map(struct xps_map *map,
1859 int cpu, u16 index)
1860{
1861 struct xps_map *new_map;
1862 int alloc_len = XPS_MIN_MAP_ALLOC;
1863 int i, pos;
1864
1865 for (pos = 0; map && pos < map->len; pos++) {
1866 if (map->queues[pos] != index)
1867 continue;
1868 return map;
1869 }
1870
1871 /* Need to add queue to this CPU's existing map */
1872 if (map) {
1873 if (pos < map->alloc_len)
1874 return map;
1875
1876 alloc_len = map->alloc_len * 2;
1877 }
1878
1879 /* Need to allocate new map to store queue on this CPU's map */
1880 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1881 cpu_to_node(cpu));
1882 if (!new_map)
1883 return NULL;
1884
1885 for (i = 0; i < pos; i++)
1886 new_map->queues[i] = map->queues[i];
1887 new_map->alloc_len = alloc_len;
1888 new_map->len = pos;
1889
1890 return new_map;
1891}
1892
537c00de
AD
1893int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1894{
01c5f864 1895 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
537c00de 1896 struct xps_map *map, *new_map;
537c00de 1897 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
01c5f864
AD
1898 int cpu, numa_node_id = -2;
1899 bool active = false;
537c00de
AD
1900
1901 mutex_lock(&xps_map_mutex);
1902
1903 dev_maps = xmap_dereference(dev->xps_maps);
1904
01c5f864
AD
1905 /* allocate memory for queue storage */
1906 for_each_online_cpu(cpu) {
1907 if (!cpumask_test_cpu(cpu, mask))
1908 continue;
1909
1910 if (!new_dev_maps)
1911 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
2bb60cb9
AD
1912 if (!new_dev_maps) {
1913 mutex_unlock(&xps_map_mutex);
01c5f864 1914 return -ENOMEM;
2bb60cb9 1915 }
01c5f864
AD
1916
1917 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1918 NULL;
1919
1920 map = expand_xps_map(map, cpu, index);
1921 if (!map)
1922 goto error;
1923
1924 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1925 }
1926
1927 if (!new_dev_maps)
1928 goto out_no_new_maps;
1929
537c00de 1930 for_each_possible_cpu(cpu) {
01c5f864
AD
1931 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1932 /* add queue to CPU maps */
1933 int pos = 0;
1934
1935 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1936 while ((pos < map->len) && (map->queues[pos] != index))
1937 pos++;
1938
1939 if (pos == map->len)
1940 map->queues[map->len++] = index;
537c00de 1941#ifdef CONFIG_NUMA
537c00de
AD
1942 if (numa_node_id == -2)
1943 numa_node_id = cpu_to_node(cpu);
1944 else if (numa_node_id != cpu_to_node(cpu))
1945 numa_node_id = -1;
537c00de 1946#endif
01c5f864
AD
1947 } else if (dev_maps) {
1948 /* fill in the new device map from the old device map */
1949 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1950 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
537c00de 1951 }
01c5f864 1952
537c00de
AD
1953 }
1954
01c5f864
AD
1955 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1956
537c00de 1957 /* Cleanup old maps */
01c5f864
AD
1958 if (dev_maps) {
1959 for_each_possible_cpu(cpu) {
1960 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1961 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1962 if (map && map != new_map)
1963 kfree_rcu(map, rcu);
1964 }
537c00de 1965
01c5f864 1966 kfree_rcu(dev_maps, rcu);
537c00de
AD
1967 }
1968
01c5f864
AD
1969 dev_maps = new_dev_maps;
1970 active = true;
537c00de 1971
01c5f864
AD
1972out_no_new_maps:
1973 /* update Tx queue numa node */
537c00de
AD
1974 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1975 (numa_node_id >= 0) ? numa_node_id :
1976 NUMA_NO_NODE);
1977
01c5f864
AD
1978 if (!dev_maps)
1979 goto out_no_maps;
1980
1981 /* removes queue from unused CPUs */
1982 for_each_possible_cpu(cpu) {
1983 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1984 continue;
1985
1986 if (remove_xps_queue(dev_maps, cpu, index))
1987 active = true;
1988 }
1989
1990 /* free map if not active */
1991 if (!active) {
1992 RCU_INIT_POINTER(dev->xps_maps, NULL);
1993 kfree_rcu(dev_maps, rcu);
1994 }
1995
1996out_no_maps:
537c00de
AD
1997 mutex_unlock(&xps_map_mutex);
1998
1999 return 0;
2000error:
01c5f864
AD
2001 /* remove any maps that we added */
2002 for_each_possible_cpu(cpu) {
2003 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2004 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2005 NULL;
2006 if (new_map && new_map != map)
2007 kfree(new_map);
2008 }
2009
537c00de
AD
2010 mutex_unlock(&xps_map_mutex);
2011
537c00de
AD
2012 kfree(new_dev_maps);
2013 return -ENOMEM;
2014}
2015EXPORT_SYMBOL(netif_set_xps_queue);
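/* Illustrative sketch: pinning each TX queue to one CPU with
 * netif_set_xps_queue(). A real driver would derive the mapping from its
 * IRQ affinity; the round-robin choice here is only an example.
 */
static int example_setup_xps(struct net_device *dev)
{
	cpumask_var_t mask;
	unsigned int i;
	int err = 0;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	for (i = 0; i < dev->real_num_tx_queues; i++) {
		cpumask_clear(mask);
		cpumask_set_cpu(i % num_online_cpus(), mask);
		err = netif_set_xps_queue(dev, mask, i);
		if (err)
			break;
	}

	free_cpumask_var(mask);
	return err;
}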
2016
2017#endif
f0796d5c
JF
2018/*
2019 * Routine to help set real_num_tx_queues. To avoid skbs being mapped to
2020 * queues greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2021 */
e6484930 2022int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
f0796d5c 2023{
1d24eb48
TH
2024 int rc;
2025
e6484930
TH
2026 if (txq < 1 || txq > dev->num_tx_queues)
2027 return -EINVAL;
f0796d5c 2028
5c56580b
BH
2029 if (dev->reg_state == NETREG_REGISTERED ||
2030 dev->reg_state == NETREG_UNREGISTERING) {
e6484930
TH
2031 ASSERT_RTNL();
2032
1d24eb48
TH
2033 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2034 txq);
bf264145
TH
2035 if (rc)
2036 return rc;
2037
4f57c087
JF
2038 if (dev->num_tc)
2039 netif_setup_tc(dev, txq);
2040
024e9679 2041 if (txq < dev->real_num_tx_queues) {
e6484930 2042 qdisc_reset_all_tx_gt(dev, txq);
024e9679
AD
2043#ifdef CONFIG_XPS
2044 netif_reset_xps_queues_gt(dev, txq);
2045#endif
2046 }
f0796d5c 2047 }
e6484930
TH
2048
2049 dev->real_num_tx_queues = txq;
2050 return 0;
f0796d5c
JF
2051}
2052EXPORT_SYMBOL(netif_set_real_num_tx_queues);
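/* Illustrative sketch: shrinking the number of active TX queues at
 * runtime, in the spirit of an ethtool "set channels" operation. The
 * rtnl lock is taken explicitly here; in a real ndo/ethtool callback it
 * is typically already held by the caller.
 */
static int example_set_tx_channels(struct net_device *dev, unsigned int txq)
{
	int err;

	if (txq < 1 || txq > dev->num_tx_queues)
		return -ERANGE;

	rtnl_lock();
	err = netif_set_real_num_tx_queues(dev, txq);
	rtnl_unlock();

	return err;
}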
56079431 2053
62fe0b40
BH
2054#ifdef CONFIG_RPS
2055/**
2056 * netif_set_real_num_rx_queues - set actual number of RX queues used
2057 * @dev: Network device
2058 * @rxq: Actual number of RX queues
2059 *
2060 * This must be called either with the rtnl_lock held or before
2061 * registration of the net device. Returns 0 on success, or a
4e7f7951
BH
2062 * negative error code. If called before registration, it always
2063 * succeeds.
62fe0b40
BH
2064 */
2065int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2066{
2067 int rc;
2068
bd25fa7b
TH
2069 if (rxq < 1 || rxq > dev->num_rx_queues)
2070 return -EINVAL;
2071
62fe0b40
BH
2072 if (dev->reg_state == NETREG_REGISTERED) {
2073 ASSERT_RTNL();
2074
62fe0b40
BH
2075 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2076 rxq);
2077 if (rc)
2078 return rc;
62fe0b40
BH
2079 }
2080
2081 dev->real_num_rx_queues = rxq;
2082 return 0;
2083}
2084EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2085#endif
2086
2c53040f
BH
2087/**
2088 * netif_get_num_default_rss_queues - default number of RSS queues
16917b87
YM
2089 *
2090 * This routine should set an upper limit on the number of RSS queues
2091 * used by default by multiqueue devices.
2092 */
a55b138b 2093int netif_get_num_default_rss_queues(void)
16917b87
YM
2094{
2095 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2096}
2097EXPORT_SYMBOL(netif_get_num_default_rss_queues);
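/* Illustrative sketch: probe-time queue sizing for a hypothetical
 * multiqueue driver. netif_get_num_default_rss_queues() caps the default
 * and the netif_set_real_num_*_queues() calls trim the active counts
 * before register_netdev(), so no rtnl locking is needed yet. The device
 * is assumed to have been allocated with at least this many TX and RX
 * queues.
 */
static int example_init_queue_counts(struct net_device *dev)
{
	unsigned int n;
	int err;

	n = min_t(unsigned int, netif_get_num_default_rss_queues(),
		  dev->num_tx_queues);

	err = netif_set_real_num_tx_queues(dev, n);
	if (err)
		return err;

	return netif_set_real_num_rx_queues(dev, n);
}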
2098
def82a1d 2099static inline void __netif_reschedule(struct Qdisc *q)
56079431 2100{
def82a1d
JP
2101 struct softnet_data *sd;
2102 unsigned long flags;
56079431 2103
def82a1d
JP
2104 local_irq_save(flags);
2105 sd = &__get_cpu_var(softnet_data);
a9cbd588
CG
2106 q->next_sched = NULL;
2107 *sd->output_queue_tailp = q;
2108 sd->output_queue_tailp = &q->next_sched;
def82a1d
JP
2109 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2110 local_irq_restore(flags);
2111}
2112
2113void __netif_schedule(struct Qdisc *q)
2114{
2115 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2116 __netif_reschedule(q);
56079431
DV
2117}
2118EXPORT_SYMBOL(__netif_schedule);
2119
bea3348e 2120void dev_kfree_skb_irq(struct sk_buff *skb)
56079431 2121{
3578b0c8 2122 if (atomic_dec_and_test(&skb->users)) {
bea3348e
SH
2123 struct softnet_data *sd;
2124 unsigned long flags;
56079431 2125
bea3348e
SH
2126 local_irq_save(flags);
2127 sd = &__get_cpu_var(softnet_data);
2128 skb->next = sd->completion_queue;
2129 sd->completion_queue = skb;
2130 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2131 local_irq_restore(flags);
2132 }
56079431 2133}
bea3348e 2134EXPORT_SYMBOL(dev_kfree_skb_irq);
56079431
DV
2135
2136void dev_kfree_skb_any(struct sk_buff *skb)
2137{
2138 if (in_irq() || irqs_disabled())
2139 dev_kfree_skb_irq(skb);
2140 else
2141 dev_kfree_skb(skb);
2142}
2143EXPORT_SYMBOL(dev_kfree_skb_any);
2144
2145
bea3348e
SH
2146/**
2147 * netif_device_detach - mark device as removed
2148 * @dev: network device
2149 *
2150 * Mark device as removed from the system and therefore no longer available.
2151 */
56079431
DV
2152void netif_device_detach(struct net_device *dev)
2153{
2154 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2155 netif_running(dev)) {
d543103a 2156 netif_tx_stop_all_queues(dev);
56079431
DV
2157 }
2158}
2159EXPORT_SYMBOL(netif_device_detach);
2160
bea3348e
SH
2161/**
2162 * netif_device_attach - mark device as attached
2163 * @dev: network device
2164 *
2165 * Mark device as attached to the system and restart it if needed.
2166 */
56079431
DV
2167void netif_device_attach(struct net_device *dev)
2168{
2169 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2170 netif_running(dev)) {
d543103a 2171 netif_tx_wake_all_queues(dev);
4ec93edb 2172 __netdev_watchdog_up(dev);
56079431
DV
2173 }
2174}
2175EXPORT_SYMBOL(netif_device_attach);
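/* Illustrative sketch: the usual suspend/resume pairing in a network
 * driver. The device-specific work is reduced to comments; only the
 * netif_device_detach()/netif_device_attach() calls are the generic part
 * being shown.
 */
static int example_suspend(struct net_device *dev)
{
	netif_device_detach(dev);	/* stops all TX queues if running */
	/* ... quiesce DMA, save hardware state, power the device down ... */
	return 0;
}

static int example_resume(struct net_device *dev)
{
	/* ... power the device up, restore state, re-enable interrupts ... */
	netif_device_attach(dev);	/* wakes queues and watchdog if running */
	return 0;
}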
2176
36c92474
BH
2177static void skb_warn_bad_offload(const struct sk_buff *skb)
2178{
65e9d2fa 2179 static const netdev_features_t null_features = 0;
36c92474
BH
2180 struct net_device *dev = skb->dev;
2181 const char *driver = "";
2182
c846ad9b
BG
2183 if (!net_ratelimit())
2184 return;
2185
36c92474
BH
2186 if (dev && dev->dev.parent)
2187 driver = dev_driver_string(dev->dev.parent);
2188
2189 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2190 "gso_type=%d ip_summed=%d\n",
65e9d2fa
MM
2191 driver, dev ? &dev->features : &null_features,
2192 skb->sk ? &skb->sk->sk_route_caps : &null_features,
36c92474
BH
2193 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2194 skb_shinfo(skb)->gso_type, skb->ip_summed);
2195}
2196
1da177e4
LT
2197/*
2198 * Invalidate hardware checksum when packet is to be mangled, and
2199 * complete checksum manually on outgoing path.
2200 */
84fa7933 2201int skb_checksum_help(struct sk_buff *skb)
1da177e4 2202{
d3bc23e7 2203 __wsum csum;
663ead3b 2204 int ret = 0, offset;
1da177e4 2205
84fa7933 2206 if (skb->ip_summed == CHECKSUM_COMPLETE)
a430a43d
HX
2207 goto out_set_summed;
2208
2209 if (unlikely(skb_shinfo(skb)->gso_size)) {
36c92474
BH
2210 skb_warn_bad_offload(skb);
2211 return -EINVAL;
1da177e4
LT
2212 }
2213
cef401de
ED
2214 /* Before computing a checksum, we should make sure no frag could
2215 * be modified by an external entity: the checksum could otherwise be wrong.
2216 */
2217 if (skb_has_shared_frag(skb)) {
2218 ret = __skb_linearize(skb);
2219 if (ret)
2220 goto out;
2221 }
2222
55508d60 2223 offset = skb_checksum_start_offset(skb);
a030847e
HX
2224 BUG_ON(offset >= skb_headlen(skb));
2225 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2226
2227 offset += skb->csum_offset;
2228 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2229
2230 if (skb_cloned(skb) &&
2231 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1da177e4
LT
2232 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2233 if (ret)
2234 goto out;
2235 }
2236
a030847e 2237 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
a430a43d 2238out_set_summed:
1da177e4 2239 skb->ip_summed = CHECKSUM_NONE;
4ec93edb 2240out:
1da177e4
LT
2241 return ret;
2242}
d1b19dff 2243EXPORT_SYMBOL(skb_checksum_help);
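/* Illustrative sketch of the common pattern, not code from dev.c: a path
 * that needs to rewrite payload bytes of an skb that may still rely on
 * hardware checksum offload resolves the checksum in software first, so
 * the CHECKSUM_PARTIAL assumption is not silently violated.
 */
static int example_mangle_packet(struct sk_buff *skb)
{
	if (skb->ip_summed == CHECKSUM_PARTIAL && skb_checksum_help(skb))
		return -EINVAL;

	/* now safe to modify the data and recompute checksums by hand */
	return 0;
}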
1da177e4 2244
ec5f0615 2245__be16 skb_network_protocol(struct sk_buff *skb)
f6a78bfc 2246{
252e3346 2247 __be16 type = skb->protocol;
c80a8512 2248 int vlan_depth = ETH_HLEN;
f6a78bfc 2249
19acc327
PS
2250 /* Tunnel gso handlers can set protocol to ethernet. */
2251 if (type == htons(ETH_P_TEB)) {
2252 struct ethhdr *eth;
2253
2254 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2255 return 0;
2256
2257 eth = (struct ethhdr *)skb_mac_header(skb);
2258 type = eth->h_proto;
2259 }
2260
8ad227ff 2261 while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
c8d5bcd1 2262 struct vlan_hdr *vh;
7b9c6090 2263
c8d5bcd1 2264 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
ec5f0615 2265 return 0;
7b9c6090 2266
c8d5bcd1
JG
2267 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2268 type = vh->h_vlan_encapsulated_proto;
2269 vlan_depth += VLAN_HLEN;
7b9c6090
JG
2270 }
2271
ec5f0615
PS
2272 return type;
2273}
2274
2275/**
2276 * skb_mac_gso_segment - mac layer segmentation handler.
2277 * @skb: buffer to segment
2278 * @features: features for the output path (see dev->features)
2279 */
2280struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2281 netdev_features_t features)
2282{
2283 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2284 struct packet_offload *ptype;
2285 __be16 type = skb_network_protocol(skb);
2286
2287 if (unlikely(!type))
2288 return ERR_PTR(-EINVAL);
2289
f6a78bfc
HX
2290 __skb_pull(skb, skb->mac_len);
2291
2292 rcu_read_lock();
22061d80 2293 list_for_each_entry_rcu(ptype, &offload_base, list) {
f191a1d1 2294 if (ptype->type == type && ptype->callbacks.gso_segment) {
84fa7933 2295 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
05e8ef4a
PS
2296 int err;
2297
f191a1d1 2298 err = ptype->callbacks.gso_send_check(skb);
a430a43d
HX
2299 segs = ERR_PTR(err);
2300 if (err || skb_gso_ok(skb, features))
2301 break;
d56f90a7
ACM
2302 __skb_push(skb, (skb->data -
2303 skb_network_header(skb)));
a430a43d 2304 }
f191a1d1 2305 segs = ptype->callbacks.gso_segment(skb, features);
f6a78bfc
HX
2306 break;
2307 }
2308 }
2309 rcu_read_unlock();
2310
98e399f8 2311 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 2312
f6a78bfc
HX
2313 return segs;
2314}
05e8ef4a
PS
2315EXPORT_SYMBOL(skb_mac_gso_segment);
2316
2317
2318/* openvswitch calls this on rx path, so we need a different check.
2319 */
2320static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2321{
2322 if (tx_path)
2323 return skb->ip_summed != CHECKSUM_PARTIAL;
2324 else
2325 return skb->ip_summed == CHECKSUM_NONE;
2326}
2327
2328/**
2329 * __skb_gso_segment - Perform segmentation on skb.
2330 * @skb: buffer to segment
2331 * @features: features for the output path (see dev->features)
2332 * @tx_path: whether it is called in TX path
2333 *
2334 * This function segments the given skb and returns a list of segments.
2335 *
2336 * It may return NULL if the skb requires no segmentation. This is
2337 * only possible when GSO is used for verifying header integrity.
2338 */
2339struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2340 netdev_features_t features, bool tx_path)
2341{
2342 if (unlikely(skb_needs_check(skb, tx_path))) {
2343 int err;
2344
2345 skb_warn_bad_offload(skb);
2346
2347 if (skb_header_cloned(skb) &&
2348 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2349 return ERR_PTR(err);
2350 }
2351
68c33163 2352 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
05e8ef4a
PS
2353 skb_reset_mac_header(skb);
2354 skb_reset_mac_len(skb);
2355
2356 return skb_mac_gso_segment(skb, features);
2357}
12b0004d 2358EXPORT_SYMBOL(__skb_gso_segment);
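/* Illustrative sketch: segmenting a GSO skb by hand and transmitting each
 * resulting segment, roughly what a path without hardware TSO support
 * might do. Here tx_path=true selects the transmit-side checks, a NULL
 * return means no segmentation was required, and skb->dev is assumed to
 * have been set by the caller.
 */
static int example_segment_and_xmit(struct sk_buff *skb,
				    netdev_features_t features)
{
	struct sk_buff *segs, *nskb;

	segs = __skb_gso_segment(skb, features, true);
	if (IS_ERR(segs))
		return PTR_ERR(segs);
	if (!segs)
		return dev_queue_xmit(skb);

	consume_skb(skb);
	while (segs) {
		nskb = segs;
		segs = segs->next;
		nskb->next = NULL;
		dev_queue_xmit(nskb);
	}

	return 0;
}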
f6a78bfc 2359
fb286bb2
HX
2360/* Take action when hardware reception checksum errors are detected. */
2361#ifdef CONFIG_BUG
2362void netdev_rx_csum_fault(struct net_device *dev)
2363{
2364 if (net_ratelimit()) {
7b6cd1ce 2365 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
fb286bb2
HX
2366 dump_stack();
2367 }
2368}
2369EXPORT_SYMBOL(netdev_rx_csum_fault);
2370#endif
2371
1da177e4
LT
2372/* Actually, we should eliminate this check as soon as we know that:
2373 * 1. An IOMMU is present and can map all the memory.
2374 * 2. No high memory really exists on this machine.
2375 */
2376
a999dd5c 2377static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb)
1da177e4 2378{
3d3a8533 2379#ifdef CONFIG_HIGHMEM
1da177e4 2380 int i;
5acbbd42 2381 if (!(dev->features & NETIF_F_HIGHDMA)) {
ea2ab693
IC
2382 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2383 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2384 if (PageHighMem(skb_frag_page(frag)))
5acbbd42 2385 return 1;
ea2ab693 2386 }
5acbbd42 2387 }
1da177e4 2388
5acbbd42
FT
2389 if (PCI_DMA_BUS_IS_PHYS) {
2390 struct device *pdev = dev->dev.parent;
1da177e4 2391
9092c658
ED
2392 if (!pdev)
2393 return 0;
5acbbd42 2394 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
ea2ab693
IC
2395 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2396 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
5acbbd42
FT
2397 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2398 return 1;
2399 }
2400 }
3d3a8533 2401#endif
1da177e4
LT
2402 return 0;
2403}
1da177e4 2404
f6a78bfc
HX
2405struct dev_gso_cb {
2406 void (*destructor)(struct sk_buff *skb);
2407};
2408
2409#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2410
2411static void dev_gso_skb_destructor(struct sk_buff *skb)
2412{
2413 struct dev_gso_cb *cb;
2414
2415 do {
2416 struct sk_buff *nskb = skb->next;
2417
2418 skb->next = nskb->next;
2419 nskb->next = NULL;
2420 kfree_skb(nskb);
2421 } while (skb->next);
2422
2423 cb = DEV_GSO_CB(skb);
2424 if (cb->destructor)
2425 cb->destructor(skb);
2426}
2427
2428/**
2429 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2430 * @skb: buffer to segment
91ecb63c 2431 * @features: device features as applicable to this skb
f6a78bfc
HX
2432 *
2433 * This function segments the given skb and stores the list of segments
2434 * in skb->next.
2435 */
c8f44aff 2436static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
f6a78bfc 2437{
f6a78bfc 2438 struct sk_buff *segs;
576a30eb
HX
2439
2440 segs = skb_gso_segment(skb, features);
2441
2442 /* Verifying header integrity only. */
2443 if (!segs)
2444 return 0;
f6a78bfc 2445
801678c5 2446 if (IS_ERR(segs))
f6a78bfc
HX
2447 return PTR_ERR(segs);
2448
2449 skb->next = segs;
2450 DEV_GSO_CB(skb)->destructor = skb->destructor;
2451 skb->destructor = dev_gso_skb_destructor;
2452
2453 return 0;
2454}
2455
c8f44aff 2456static netdev_features_t harmonize_features(struct sk_buff *skb,
a999dd5c
FW
2457 __be16 protocol,
2458 const struct net_device *dev,
2459 netdev_features_t features)
f01a5236 2460{
c0d680e5
EC
2461 if (skb->ip_summed != CHECKSUM_NONE &&
2462 !can_checksum_protocol(features, protocol)) {
f01a5236 2463 features &= ~NETIF_F_ALL_CSUM;
a999dd5c 2464 } else if (illegal_highdma(dev, skb)) {
f01a5236
JG
2465 features &= ~NETIF_F_SG;
2466 }
2467
2468 return features;
2469}
2470
a999dd5c
FW
2471netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
2472 const struct net_device *dev)
58e998c6
JG
2473{
2474 __be16 protocol = skb->protocol;
a999dd5c 2475 netdev_features_t features = dev->features;
58e998c6 2476
a999dd5c 2477 if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs)
30b678d8
BH
2478 features &= ~NETIF_F_GSO_MASK;
2479
8ad227ff 2480 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
58e998c6
JG
2481 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2482 protocol = veh->h_vlan_encapsulated_proto;
f01a5236 2483 } else if (!vlan_tx_tag_present(skb)) {
a999dd5c 2484 return harmonize_features(skb, protocol, dev, features);
f01a5236 2485 }
58e998c6 2486
a999dd5c 2487 features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
8ad227ff 2488 NETIF_F_HW_VLAN_STAG_TX);
f01a5236 2489
8ad227ff 2490 if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
a999dd5c 2491 return harmonize_features(skb, protocol, dev, features);
f01a5236
JG
2492 } else {
2493 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
8ad227ff
PM
2494 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2495 NETIF_F_HW_VLAN_STAG_TX;
a999dd5c 2496 return harmonize_features(skb, protocol, dev, features);
f01a5236 2497 }
a999dd5c
FW
2498
2499 return harmonize_features(skb, protocol, dev, features);
58e998c6 2500}
a999dd5c 2501EXPORT_SYMBOL(netif_skb_dev_features);
58e998c6 2502
6afff0ca
JF
2503/*
2504 * Returns true if either:
2505 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
d1a53dfd 2506 * 2. skb is fragmented and the device does not support SG.
6afff0ca
JF
2507 */
2508static inline int skb_needs_linearize(struct sk_buff *skb,
6708c9e5 2509 netdev_features_t features)
6afff0ca 2510{
02932ce9
JG
2511 return skb_is_nonlinear(skb) &&
2512 ((skb_has_frag_list(skb) &&
2513 !(features & NETIF_F_FRAGLIST)) ||
e1e78db6 2514 (skb_shinfo(skb)->nr_frags &&
02932ce9 2515 !(features & NETIF_F_SG)));
6afff0ca
JF
2516}
2517
fd2ea0a7
DM
2518int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2519 struct netdev_queue *txq)
f6a78bfc 2520{
00829823 2521 const struct net_device_ops *ops = dev->netdev_ops;
572a9d7b 2522 int rc = NETDEV_TX_OK;
ec764bf0 2523 unsigned int skb_len;
00829823 2524
f6a78bfc 2525 if (likely(!skb->next)) {
c8f44aff 2526 netdev_features_t features;
fc741216 2527
93f154b5 2528 /*
25985edc 2529 * If device doesn't need skb->dst, release it right now while
93f154b5
ED
2530 * its hot in this cpu cache
2531 */
adf30907
ED
2532 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2533 skb_dst_drop(skb);
2534
fc741216
JG
2535 features = netif_skb_features(skb);
2536
7b9c6090 2537 if (vlan_tx_tag_present(skb) &&
86a9bad3
PM
2538 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2539 skb = __vlan_put_tag(skb, skb->vlan_proto,
2540 vlan_tx_tag_get(skb));
7b9c6090
JG
2541 if (unlikely(!skb))
2542 goto out;
2543
2544 skb->vlan_tci = 0;
2545 }
2546
fc70fb64
AD
2547 /* If encapsulation offload request, verify we are testing
2548 * hardware encapsulation features instead of standard
2549 * features for the netdev
2550 */
2551 if (skb->encapsulation)
2552 features &= dev->hw_enc_features;
2553
fc741216 2554 if (netif_needs_gso(skb, features)) {
91ecb63c 2555 if (unlikely(dev_gso_segment(skb, features)))
9ccb8975
DM
2556 goto out_kfree_skb;
2557 if (skb->next)
2558 goto gso;
6afff0ca 2559 } else {
02932ce9 2560 if (skb_needs_linearize(skb, features) &&
6afff0ca
JF
2561 __skb_linearize(skb))
2562 goto out_kfree_skb;
2563
2564 /* If packet is not checksummed and device does not
2565 * support checksumming for this protocol, complete
2566 * checksumming here.
2567 */
2568 if (skb->ip_summed == CHECKSUM_PARTIAL) {
fc70fb64
AD
2569 if (skb->encapsulation)
2570 skb_set_inner_transport_header(skb,
2571 skb_checksum_start_offset(skb));
2572 else
2573 skb_set_transport_header(skb,
2574 skb_checksum_start_offset(skb));
03634668 2575 if (!(features & NETIF_F_ALL_CSUM) &&
6afff0ca
JF
2576 skb_checksum_help(skb))
2577 goto out_kfree_skb;
2578 }
9ccb8975
DM
2579 }
2580
b40863c6
ED
2581 if (!list_empty(&ptype_all))
2582 dev_queue_xmit_nit(skb, dev);
2583
ec764bf0 2584 skb_len = skb->len;
ac45f602 2585 rc = ops->ndo_start_xmit(skb, dev);
ec764bf0 2586 trace_net_dev_xmit(skb, rc, dev, skb_len);
ec634fe3 2587 if (rc == NETDEV_TX_OK)
08baf561 2588 txq_trans_update(txq);
ac45f602 2589 return rc;
f6a78bfc
HX
2590 }
2591
576a30eb 2592gso:
f6a78bfc
HX
2593 do {
2594 struct sk_buff *nskb = skb->next;
f6a78bfc
HX
2595
2596 skb->next = nskb->next;
2597 nskb->next = NULL;
068a2de5 2598
b40863c6
ED
2599 if (!list_empty(&ptype_all))
2600 dev_queue_xmit_nit(nskb, dev);
2601
ec764bf0 2602 skb_len = nskb->len;
00829823 2603 rc = ops->ndo_start_xmit(nskb, dev);
ec764bf0 2604 trace_net_dev_xmit(nskb, rc, dev, skb_len);
ec634fe3 2605 if (unlikely(rc != NETDEV_TX_OK)) {
572a9d7b
PM
2606 if (rc & ~NETDEV_TX_MASK)
2607 goto out_kfree_gso_skb;
f54d9e8d 2608 nskb->next = skb->next;
f6a78bfc
HX
2609 skb->next = nskb;
2610 return rc;
2611 }
08baf561 2612 txq_trans_update(txq);
73466498 2613 if (unlikely(netif_xmit_stopped(txq) && skb->next))
f54d9e8d 2614 return NETDEV_TX_BUSY;
f6a78bfc 2615 } while (skb->next);
4ec93edb 2616
572a9d7b 2617out_kfree_gso_skb:
0c772159 2618 if (likely(skb->next == NULL)) {
572a9d7b 2619 skb->destructor = DEV_GSO_CB(skb)->destructor;
0c772159
SS
2620 consume_skb(skb);
2621 return rc;
2622 }
f6a78bfc
HX
2623out_kfree_skb:
2624 kfree_skb(skb);
7b9c6090 2625out:
572a9d7b 2626 return rc;
f6a78bfc
HX
2627}
2628
1def9238
ED
2629static void qdisc_pkt_len_init(struct sk_buff *skb)
2630{
2631 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2632
2633 qdisc_skb_cb(skb)->pkt_len = skb->len;
2634
2635	/* To get a more precise estimate of the bytes sent on the wire,
2636	 * we add the header size of every segment to pkt_len
2637 */
2638 if (shinfo->gso_size) {
757b8b1d 2639 unsigned int hdr_len;
15e5a030 2640 u16 gso_segs = shinfo->gso_segs;
1def9238 2641
757b8b1d
ED
2642 /* mac layer + network layer */
2643 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2644
2645 /* + transport layer */
1def9238
ED
2646 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2647 hdr_len += tcp_hdrlen(skb);
2648 else
2649 hdr_len += sizeof(struct udphdr);
15e5a030
JW
2650
2651 if (shinfo->gso_type & SKB_GSO_DODGY)
2652 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2653 shinfo->gso_size);
2654
2655 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
1def9238
ED
2656 }
2657}
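/* Worked example for qdisc_pkt_len_init() above (illustrative numbers):
 * a TSO skb with gso_size = 1448, gso_segs = 45 and 66 bytes of
 * mac+ip+tcp headers has skb->len = 45 * 1448 + 66 = 65226, and the code
 * adds (45 - 1) * 66 = 2904, giving pkt_len = 68130, i.e. exactly
 * 45 wire segments of 1448 + 66 = 1514 bytes each.
 */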
2658
bbd8a0d3
KK
2659static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2660 struct net_device *dev,
2661 struct netdev_queue *txq)
2662{
2663 spinlock_t *root_lock = qdisc_lock(q);
a2da570d 2664 bool contended;
bbd8a0d3
KK
2665 int rc;
2666
1def9238 2667 qdisc_pkt_len_init(skb);
a2da570d 2668 qdisc_calculate_pkt_len(skb, q);
79640a4c
ED
2669 /*
2670 * Heuristic to force contended enqueues to serialize on a
2671 * separate lock before trying to get qdisc main lock.
2672 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2673 * and dequeue packets faster.
2674 */
a2da570d 2675 contended = qdisc_is_running(q);
79640a4c
ED
2676 if (unlikely(contended))
2677 spin_lock(&q->busylock);
2678
bbd8a0d3
KK
2679 spin_lock(root_lock);
2680 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2681 kfree_skb(skb);
2682 rc = NET_XMIT_DROP;
2683 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2684 qdisc_run_begin(q)) {
bbd8a0d3
KK
2685 /*
2686 * This is a work-conserving queue; there are no old skbs
2687 * waiting to be sent out; and the qdisc is not running -
2688 * xmit the skb directly.
2689 */
7fee226a
ED
2690 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2691 skb_dst_force(skb);
bfe0d029 2692
bfe0d029
ED
2693 qdisc_bstats_update(q, skb);
2694
79640a4c
ED
2695 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2696 if (unlikely(contended)) {
2697 spin_unlock(&q->busylock);
2698 contended = false;
2699 }
bbd8a0d3 2700 __qdisc_run(q);
79640a4c 2701 } else
bc135b23 2702 qdisc_run_end(q);
bbd8a0d3
KK
2703
2704 rc = NET_XMIT_SUCCESS;
2705 } else {
7fee226a 2706 skb_dst_force(skb);
a2da570d 2707 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
79640a4c
ED
2708 if (qdisc_run_begin(q)) {
2709 if (unlikely(contended)) {
2710 spin_unlock(&q->busylock);
2711 contended = false;
2712 }
2713 __qdisc_run(q);
2714 }
bbd8a0d3
KK
2715 }
2716 spin_unlock(root_lock);
79640a4c
ED
2717 if (unlikely(contended))
2718 spin_unlock(&q->busylock);
bbd8a0d3
KK
2719 return rc;
2720}
2721
5bc1421e
NH
2722#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2723static void skb_update_prio(struct sk_buff *skb)
2724{
6977a79d 2725 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
5bc1421e 2726
91c68ce2
ED
2727 if (!skb->priority && skb->sk && map) {
2728 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2729
2730 if (prioidx < map->priomap_len)
2731 skb->priority = map->priomap[prioidx];
2732 }
5bc1421e
NH
2733}
2734#else
2735#define skb_update_prio(skb)
2736#endif
2737
745e20f1 2738static DEFINE_PER_CPU(int, xmit_recursion);
11a766ce 2739#define RECURSION_LIMIT 10
745e20f1 2740
95603e22
MM
2741/**
2742 * dev_loopback_xmit - loop back @skb
2743 * @skb: buffer to transmit
2744 */
2745int dev_loopback_xmit(struct sk_buff *skb)
2746{
2747 skb_reset_mac_header(skb);
2748 __skb_pull(skb, skb_network_offset(skb));
2749 skb->pkt_type = PACKET_LOOPBACK;
2750 skb->ip_summed = CHECKSUM_UNNECESSARY;
2751 WARN_ON(!skb_dst(skb));
2752 skb_dst_force(skb);
2753 netif_rx_ni(skb);
2754 return 0;
2755}
2756EXPORT_SYMBOL(dev_loopback_xmit);
2757
d29f749e
DJ
2758/**
2759 * dev_queue_xmit - transmit a buffer
2760 * @skb: buffer to transmit
2761 *
2762 * Queue a buffer for transmission to a network device. The caller must
2763 * have set the device and priority and built the buffer before calling
2764 * this function. The function can be called from an interrupt.
2765 *
2766 * A negative errno code is returned on a failure. A success does not
2767 * guarantee the frame will be transmitted as it may be dropped due
2768 * to congestion or traffic shaping.
2769 *
2770 * -----------------------------------------------------------------------------------
2771 * I notice this method can also return errors from the queue disciplines,
2772 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2773 * be positive.
2774 *
2775 * Regardless of the return value, the skb is consumed, so it is currently
2776 * difficult to retry a send to this method. (You can bump the ref count
2777 * before sending to hold a reference for retry if you are careful.)
2778 *
2779 * When calling this method, interrupts MUST be enabled. This is because
2780 * the BH enable code must have IRQs enabled so that it will not deadlock.
2781 * --BLG
2782 */
1da177e4
LT
2783int dev_queue_xmit(struct sk_buff *skb)
2784{
2785 struct net_device *dev = skb->dev;
dc2b4847 2786 struct netdev_queue *txq;
1da177e4
LT
2787 struct Qdisc *q;
2788 int rc = -ENOMEM;
2789
6d1ccff6
ED
2790 skb_reset_mac_header(skb);
2791
4ec93edb
YH
2792 /* Disable soft irqs for various locks below. Also
2793 * stops preemption for RCU.
1da177e4 2794 */
4ec93edb 2795 rcu_read_lock_bh();
1da177e4 2796
5bc1421e
NH
2797 skb_update_prio(skb);
2798
8c4c49df 2799 txq = netdev_pick_tx(dev, skb);
a898def2 2800 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2801
1da177e4 2802#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2803 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4 2804#endif
cf66ba58 2805 trace_net_dev_queue(skb);
1da177e4 2806 if (q->enqueue) {
bbd8a0d3 2807 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2808 goto out;
1da177e4
LT
2809 }
2810
2811 /* The device has no queue. Common case for software devices:
2812 loopback, all the sorts of tunnels...
2813
932ff279
HX
2814	   Really, it is unlikely that netif_tx_lock protection is necessary
2815	   here. (e.g. loopback and IP tunnels are clean, ignoring statistics
1da177e4
LT
2816	   counters.)
2817	   However, it is possible that they rely on the protection
2818	   provided by us here.
2819
2820	   Check this and remove the lock. It is not prone to deadlocks.
2821	   Or remove the noqueue qdisc entirely; that is even simpler 8)
2822 */
2823 if (dev->flags & IFF_UP) {
2824 int cpu = smp_processor_id(); /* ok because BHs are off */
2825
c773e847 2826 if (txq->xmit_lock_owner != cpu) {
1da177e4 2827
745e20f1
ED
2828 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2829 goto recursion_alert;
2830
c773e847 2831 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2832
73466498 2833 if (!netif_xmit_stopped(txq)) {
745e20f1 2834 __this_cpu_inc(xmit_recursion);
572a9d7b 2835 rc = dev_hard_start_xmit(skb, dev, txq);
745e20f1 2836 __this_cpu_dec(xmit_recursion);
572a9d7b 2837 if (dev_xmit_complete(rc)) {
c773e847 2838 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2839 goto out;
2840 }
2841 }
c773e847 2842 HARD_TX_UNLOCK(dev, txq);
e87cc472
JP
2843 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2844 dev->name);
1da177e4
LT
2845 } else {
2846 /* Recursion is detected! It is possible,
745e20f1
ED
2847 * unfortunately
2848 */
2849recursion_alert:
e87cc472
JP
2850 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2851 dev->name);
1da177e4
LT
2852 }
2853 }
2854
2855 rc = -ENETDOWN;
d4828d85 2856 rcu_read_unlock_bh();
1da177e4 2857
1da177e4
LT
2858 kfree_skb(skb);
2859 return rc;
2860out:
d4828d85 2861 rcu_read_unlock_bh();
1da177e4
LT
2862 return rc;
2863}
d1b19dff 2864EXPORT_SYMBOL(dev_queue_xmit);
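/* Illustrative sketch: a minimal "build and transmit" path that a kernel
 * module might use. The function name and the choice of the local
 * experimental ethertype ETH_P_802_EX1 are assumptions made for this
 * example; the essential steps are setting skb->dev, building the link
 * layer header and handing the buffer to dev_queue_xmit() with
 * interrupts enabled.
 */
static int example_xmit_raw(struct net_device *dev, const u8 *dst_mac,
			    const void *data, unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, LL_RESERVED_SPACE(dev) + len);
	if (!skb)
		return -ENOMEM;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	memcpy(skb_put(skb, len), data, len);

	skb->dev = dev;
	skb->protocol = htons(ETH_P_802_EX1);
	if (dev_hard_header(skb, dev, ETH_P_802_EX1, dst_mac,
			    dev->dev_addr, skb->len) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/* May also return positive qdisc codes such as NET_XMIT_DROP. */
	return dev_queue_xmit(skb);
}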
1da177e4
LT
2865
2866
2867/*=======================================================================
2868 Receiver routines
2869 =======================================================================*/
2870
6b2bedc3 2871int netdev_max_backlog __read_mostly = 1000;
c9e6bc64
ED
2872EXPORT_SYMBOL(netdev_max_backlog);
2873
3b098e2d 2874int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
2875int netdev_budget __read_mostly = 300;
2876int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 2877
eecfd7c4
ED
2878/* Called with irq disabled */
2879static inline void ____napi_schedule(struct softnet_data *sd,
2880 struct napi_struct *napi)
2881{
2882 list_add_tail(&napi->poll_list, &sd->poll_list);
2883 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2884}
2885
bfb564e7
KK
2886#ifdef CONFIG_RPS
2887
2888/* One global table that all flow-based protocols share. */
6e3f7faf 2889struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
bfb564e7
KK
2890EXPORT_SYMBOL(rps_sock_flow_table);
2891
c5905afb 2892struct static_key rps_needed __read_mostly;
adc9300e 2893
c445477d
BH
2894static struct rps_dev_flow *
2895set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2896 struct rps_dev_flow *rflow, u16 next_cpu)
2897{
09994d1b 2898 if (next_cpu != RPS_NO_CPU) {
c445477d
BH
2899#ifdef CONFIG_RFS_ACCEL
2900 struct netdev_rx_queue *rxqueue;
2901 struct rps_dev_flow_table *flow_table;
2902 struct rps_dev_flow *old_rflow;
2903 u32 flow_id;
2904 u16 rxq_index;
2905 int rc;
2906
2907 /* Should we steer this flow to a different hardware queue? */
69a19ee6
BH
2908 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2909 !(dev->features & NETIF_F_NTUPLE))
c445477d
BH
2910 goto out;
2911 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2912 if (rxq_index == skb_get_rx_queue(skb))
2913 goto out;
2914
2915 rxqueue = dev->_rx + rxq_index;
2916 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2917 if (!flow_table)
2918 goto out;
2919 flow_id = skb->rxhash & flow_table->mask;
2920 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2921 rxq_index, flow_id);
2922 if (rc < 0)
2923 goto out;
2924 old_rflow = rflow;
2925 rflow = &flow_table->flows[flow_id];
c445477d
BH
2926 rflow->filter = rc;
2927 if (old_rflow->filter == rflow->filter)
2928 old_rflow->filter = RPS_NO_FILTER;
2929 out:
2930#endif
2931 rflow->last_qtail =
09994d1b 2932 per_cpu(softnet_data, next_cpu).input_queue_head;
c445477d
BH
2933 }
2934
09994d1b 2935 rflow->cpu = next_cpu;
c445477d
BH
2936 return rflow;
2937}
2938
bfb564e7
KK
2939/*
2940 * get_rps_cpu is called from netif_receive_skb and returns the target
2941 * CPU from the RPS map of the receiving queue for a given skb.
2942 * rcu_read_lock must be held on entry.
2943 */
2944static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2945 struct rps_dev_flow **rflowp)
2946{
2947 struct netdev_rx_queue *rxqueue;
6e3f7faf 2948 struct rps_map *map;
bfb564e7
KK
2949 struct rps_dev_flow_table *flow_table;
2950 struct rps_sock_flow_table *sock_flow_table;
2951 int cpu = -1;
2952 u16 tcpu;
2953
2954 if (skb_rx_queue_recorded(skb)) {
2955 u16 index = skb_get_rx_queue(skb);
62fe0b40
BH
2956 if (unlikely(index >= dev->real_num_rx_queues)) {
2957 WARN_ONCE(dev->real_num_rx_queues > 1,
2958 "%s received packet on queue %u, but number "
2959 "of RX queues is %u\n",
2960 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
2961 goto done;
2962 }
2963 rxqueue = dev->_rx + index;
2964 } else
2965 rxqueue = dev->_rx;
2966
6e3f7faf
ED
2967 map = rcu_dereference(rxqueue->rps_map);
2968 if (map) {
85875236 2969 if (map->len == 1 &&
33d480ce 2970 !rcu_access_pointer(rxqueue->rps_flow_table)) {
6febfca9
CG
2971 tcpu = map->cpus[0];
2972 if (cpu_online(tcpu))
2973 cpu = tcpu;
2974 goto done;
2975 }
33d480ce 2976 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
bfb564e7 2977 goto done;
6febfca9 2978 }
bfb564e7 2979
2d47b459 2980 skb_reset_network_header(skb);
bfb564e7
KK
2981 if (!skb_get_rxhash(skb))
2982 goto done;
2983
fec5e652
TH
2984 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2985 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2986 if (flow_table && sock_flow_table) {
2987 u16 next_cpu;
2988 struct rps_dev_flow *rflow;
2989
2990 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2991 tcpu = rflow->cpu;
2992
2993 next_cpu = sock_flow_table->ents[skb->rxhash &
2994 sock_flow_table->mask];
2995
2996 /*
2997 * If the desired CPU (where last recvmsg was done) is
2998 * different from current CPU (one in the rx-queue flow
2999 * table entry), switch if one of the following holds:
3000 * - Current CPU is unset (equal to RPS_NO_CPU).
3001 * - Current CPU is offline.
3002 * - The current CPU's queue tail has advanced beyond the
3003 * last packet that was enqueued using this table entry.
3004 * This guarantees that all previous packets for the flow
3005 * have been dequeued, thus preserving in order delivery.
3006 */
3007 if (unlikely(tcpu != next_cpu) &&
3008 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3009 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
baefa31d
TH
3010 rflow->last_qtail)) >= 0)) {
3011 tcpu = next_cpu;
c445477d 3012 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
baefa31d 3013 }
c445477d 3014
fec5e652
TH
3015 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3016 *rflowp = rflow;
3017 cpu = tcpu;
3018 goto done;
3019 }
3020 }
3021
0a9627f2 3022 if (map) {
fec5e652 3023 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
0a9627f2
TH
3024
3025 if (cpu_online(tcpu)) {
3026 cpu = tcpu;
3027 goto done;
3028 }
3029 }
3030
3031done:
0a9627f2
TH
3032 return cpu;
3033}
3034
c445477d
BH
3035#ifdef CONFIG_RFS_ACCEL
3036
3037/**
3038 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3039 * @dev: Device on which the filter was set
3040 * @rxq_index: RX queue index
3041 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3042 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3043 *
3044 * Drivers that implement ndo_rx_flow_steer() should periodically call
3045 * this function for each installed filter and remove the filters for
3046 * which it returns %true.
3047 */
3048bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3049 u32 flow_id, u16 filter_id)
3050{
3051 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3052 struct rps_dev_flow_table *flow_table;
3053 struct rps_dev_flow *rflow;
3054 bool expire = true;
3055 int cpu;
3056
3057 rcu_read_lock();
3058 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3059 if (flow_table && flow_id <= flow_table->mask) {
3060 rflow = &flow_table->flows[flow_id];
3061 cpu = ACCESS_ONCE(rflow->cpu);
3062 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3063 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3064 rflow->last_qtail) <
3065 (int)(10 * flow_table->mask)))
3066 expire = false;
3067 }
3068 rcu_read_unlock();
3069 return expire;
3070}
3071EXPORT_SYMBOL(rps_may_expire_flow);
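/* Illustrative sketch: a driver implementing ndo_rx_flow_steer() scanning
 * its filter table from a periodic work item and releasing filters that
 * RFS no longer needs. The example_filter structure and table are
 * invented; rps_may_expire_flow() is the real API being shown.
 */
struct example_filter {
	bool in_use;
	u16 rxq_index;
	u16 filter_id;
	u32 flow_id;
};

static void example_expire_filters(struct net_device *dev,
				   struct example_filter *filters,
				   unsigned int n_filters)
{
	unsigned int i;

	for (i = 0; i < n_filters; i++) {
		struct example_filter *f = &filters[i];

		if (!f->in_use)
			continue;
		if (rps_may_expire_flow(dev, f->rxq_index,
					f->flow_id, f->filter_id)) {
			/* ... tell the hardware to drop this filter ... */
			f->in_use = false;
		}
	}
}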
3072
3073#endif /* CONFIG_RFS_ACCEL */
3074
0a9627f2 3075/* Called from hardirq (IPI) context */
e36fa2f7 3076static void rps_trigger_softirq(void *data)
0a9627f2 3077{
e36fa2f7
ED
3078 struct softnet_data *sd = data;
3079
eecfd7c4 3080 ____napi_schedule(sd, &sd->backlog);
dee42870 3081 sd->received_rps++;
0a9627f2 3082}
e36fa2f7 3083
fec5e652 3084#endif /* CONFIG_RPS */
0a9627f2 3085
e36fa2f7
ED
3086/*
3087 * Check if this softnet_data structure is another cpu one
3088 * If yes, queue it to our IPI list and return 1
3089 * If no, return 0
3090 */
3091static int rps_ipi_queued(struct softnet_data *sd)
3092{
3093#ifdef CONFIG_RPS
3094 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3095
3096 if (sd != mysd) {
3097 sd->rps_ipi_next = mysd->rps_ipi_list;
3098 mysd->rps_ipi_list = sd;
3099
3100 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3101 return 1;
3102 }
3103#endif /* CONFIG_RPS */
3104 return 0;
3105}
3106
0a9627f2
TH
3107/*
3108 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3109 * queue (may be a remote CPU queue).
3110 */
fec5e652
TH
3111static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3112 unsigned int *qtail)
0a9627f2 3113{
e36fa2f7 3114 struct softnet_data *sd;
0a9627f2
TH
3115 unsigned long flags;
3116
e36fa2f7 3117 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
3118
3119 local_irq_save(flags);
0a9627f2 3120
e36fa2f7 3121 rps_lock(sd);
6e7676c1
CG
3122 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3123 if (skb_queue_len(&sd->input_pkt_queue)) {
0a9627f2 3124enqueue:
e36fa2f7 3125 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 3126 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 3127 rps_unlock(sd);
152102c7 3128 local_irq_restore(flags);
0a9627f2
TH
3129 return NET_RX_SUCCESS;
3130 }
3131
ebda37c2
ED
3132 /* Schedule NAPI for the backlog device.
3133 * We can use a non-atomic operation since we own the queue lock
3134 */
3135 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 3136 if (!rps_ipi_queued(sd))
eecfd7c4 3137 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
3138 }
3139 goto enqueue;
3140 }
3141
dee42870 3142 sd->dropped++;
e36fa2f7 3143 rps_unlock(sd);
0a9627f2 3144
0a9627f2
TH
3145 local_irq_restore(flags);
3146
caf586e5 3147 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
3148 kfree_skb(skb);
3149 return NET_RX_DROP;
3150}
1da177e4 3151
1da177e4
LT
3152/**
3153 * netif_rx - post buffer to the network code
3154 * @skb: buffer to post
3155 *
3156 * This function receives a packet from a device driver and queues it for
3157 * the upper (protocol) levels to process. It always succeeds. The buffer
3158 * may be dropped during processing for congestion control or by the
3159 * protocol layers.
3160 *
3161 * return values:
3162 * NET_RX_SUCCESS (no congestion)
1da177e4
LT
3163 * NET_RX_DROP (packet was dropped)
3164 *
3165 */
3166
3167int netif_rx(struct sk_buff *skb)
3168{
b0e28f1e 3169 int ret;
1da177e4
LT
3170
3171 /* if netpoll wants it, pretend we never saw it */
3172 if (netpoll_rx(skb))
3173 return NET_RX_DROP;
3174
588f0330 3175 net_timestamp_check(netdev_tstamp_prequeue, skb);
1da177e4 3176
cf66ba58 3177 trace_netif_rx(skb);
df334545 3178#ifdef CONFIG_RPS
c5905afb 3179 if (static_key_false(&rps_needed)) {
fec5e652 3180 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
3181 int cpu;
3182
cece1945 3183 preempt_disable();
b0e28f1e 3184 rcu_read_lock();
fec5e652
TH
3185
3186 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
3187 if (cpu < 0)
3188 cpu = smp_processor_id();
fec5e652
TH
3189
3190 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3191
b0e28f1e 3192 rcu_read_unlock();
cece1945 3193 preempt_enable();
adc9300e
ED
3194 } else
3195#endif
fec5e652
TH
3196 {
3197 unsigned int qtail;
3198 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3199 put_cpu();
3200 }
b0e28f1e 3201 return ret;
1da177e4 3202}
d1b19dff 3203EXPORT_SYMBOL(netif_rx);
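/* Illustrative sketch: the classic interrupt-context receive path of a
 * non-NAPI driver. The frame is copied into a fresh skb, classified with
 * eth_type_trans() and queued to the per-CPU backlog via netif_rx();
 * function and variable names are invented.
 */
static void example_rx_frame(struct net_device *dev, const void *data,
			     unsigned int len)
{
	struct sk_buff *skb;

	skb = netdev_alloc_skb(dev, len + NET_IP_ALIGN);
	if (!skb) {
		dev->stats.rx_dropped++;
		return;
	}

	skb_reserve(skb, NET_IP_ALIGN);
	memcpy(skb_put(skb, len), data, len);
	skb->protocol = eth_type_trans(skb, dev);

	netif_rx(skb);

	dev->stats.rx_packets++;
	dev->stats.rx_bytes += len;
}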
1da177e4
LT
3204
3205int netif_rx_ni(struct sk_buff *skb)
3206{
3207 int err;
3208
3209 preempt_disable();
3210 err = netif_rx(skb);
3211 if (local_softirq_pending())
3212 do_softirq();
3213 preempt_enable();
3214
3215 return err;
3216}
1da177e4
LT
3217EXPORT_SYMBOL(netif_rx_ni);
3218
1da177e4
LT
3219static void net_tx_action(struct softirq_action *h)
3220{
3221 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3222
3223 if (sd->completion_queue) {
3224 struct sk_buff *clist;
3225
3226 local_irq_disable();
3227 clist = sd->completion_queue;
3228 sd->completion_queue = NULL;
3229 local_irq_enable();
3230
3231 while (clist) {
3232 struct sk_buff *skb = clist;
3233 clist = clist->next;
3234
547b792c 3235 WARN_ON(atomic_read(&skb->users));
07dc22e7 3236 trace_kfree_skb(skb, net_tx_action);
1da177e4
LT
3237 __kfree_skb(skb);
3238 }
3239 }
3240
3241 if (sd->output_queue) {
37437bb2 3242 struct Qdisc *head;
1da177e4
LT
3243
3244 local_irq_disable();
3245 head = sd->output_queue;
3246 sd->output_queue = NULL;
a9cbd588 3247 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
3248 local_irq_enable();
3249
3250 while (head) {
37437bb2
DM
3251 struct Qdisc *q = head;
3252 spinlock_t *root_lock;
3253
1da177e4
LT
3254 head = head->next_sched;
3255
5fb66229 3256 root_lock = qdisc_lock(q);
37437bb2 3257 if (spin_trylock(root_lock)) {
def82a1d
JP
3258 smp_mb__before_clear_bit();
3259 clear_bit(__QDISC_STATE_SCHED,
3260 &q->state);
37437bb2
DM
3261 qdisc_run(q);
3262 spin_unlock(root_lock);
1da177e4 3263 } else {
195648bb 3264 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 3265 &q->state)) {
195648bb 3266 __netif_reschedule(q);
e8a83e10
JP
3267 } else {
3268 smp_mb__before_clear_bit();
3269 clear_bit(__QDISC_STATE_SCHED,
3270 &q->state);
3271 }
1da177e4
LT
3272 }
3273 }
3274 }
3275}
3276
ab95bfe0
JP
3277#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3278 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
3279/* This hook is defined here for ATM LANE */
3280int (*br_fdb_test_addr_hook)(struct net_device *dev,
3281 unsigned char *addr) __read_mostly;
4fb019a0 3282EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 3283#endif
1da177e4 3284
1da177e4
LT
3285#ifdef CONFIG_NET_CLS_ACT
3286/* TODO: Maybe we should just force sch_ingress to be compiled in
3287 * whenever CONFIG_NET_CLS_ACT is? Otherwise we pay for a few useless
3288 * instructions (a compare and two extra stores) when it is not enabled
3289 * but CONFIG_NET_CLS_ACT is.
25985edc
LDM
3290 * NOTE: This doesn't stop any functionality; if you don't have
3291 * the ingress scheduler, you just can't add policies on ingress.
1da177e4
LT
3292 *
3293 */
24824a09 3294static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
1da177e4 3295{
1da177e4 3296 struct net_device *dev = skb->dev;
f697c3e8 3297 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
3298 int result = TC_ACT_OK;
3299 struct Qdisc *q;
4ec93edb 3300
de384830 3301 if (unlikely(MAX_RED_LOOP < ttl++)) {
e87cc472
JP
3302 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3303 skb->skb_iif, dev->ifindex);
f697c3e8
HX
3304 return TC_ACT_SHOT;
3305 }
1da177e4 3306
f697c3e8
HX
3307 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3308 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 3309
83874000 3310 q = rxq->qdisc;
8d50b53d 3311 if (q != &noop_qdisc) {
83874000 3312 spin_lock(qdisc_lock(q));
a9312ae8
DM
3313 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3314 result = qdisc_enqueue_root(skb, q);
83874000
DM
3315 spin_unlock(qdisc_lock(q));
3316 }
f697c3e8
HX
3317
3318 return result;
3319}
86e65da9 3320
f697c3e8
HX
3321static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3322 struct packet_type **pt_prev,
3323 int *ret, struct net_device *orig_dev)
3324{
24824a09
ED
3325 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3326
3327 if (!rxq || rxq->qdisc == &noop_qdisc)
f697c3e8 3328 goto out;
1da177e4 3329
f697c3e8
HX
3330 if (*pt_prev) {
3331 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3332 *pt_prev = NULL;
1da177e4
LT
3333 }
3334
24824a09 3335 switch (ing_filter(skb, rxq)) {
f697c3e8
HX
3336 case TC_ACT_SHOT:
3337 case TC_ACT_STOLEN:
3338 kfree_skb(skb);
3339 return NULL;
3340 }
3341
3342out:
3343 skb->tc_verd = 0;
3344 return skb;
1da177e4
LT
3345}
3346#endif
3347
ab95bfe0
JP
3348/**
3349 * netdev_rx_handler_register - register receive handler
3350 * @dev: device to register a handler for
3351 * @rx_handler: receive handler to register
93e2c32b 3352 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0
JP
3353 *
3354 * Register a receive handler for a device. This handler will then be
3355 * called from __netif_receive_skb. A negative errno code is returned
3356 * on a failure.
3357 *
3358 * The caller must hold the rtnl_mutex.
8a4eb573
JP
3359 *
3360 * For a general description of rx_handler, see enum rx_handler_result.
ab95bfe0
JP
3361 */
3362int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
3363 rx_handler_func_t *rx_handler,
3364 void *rx_handler_data)
ab95bfe0
JP
3365{
3366 ASSERT_RTNL();
3367
3368 if (dev->rx_handler)
3369 return -EBUSY;
3370
00cfec37 3371 /* Note: rx_handler_data must be set before rx_handler */
93e2c32b 3372 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
3373 rcu_assign_pointer(dev->rx_handler, rx_handler);
3374
3375 return 0;
3376}
3377EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
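/* Illustrative sketch: a trivial rx_handler in the spirit of what the
 * bridge and macvlan drivers do. "example_port" and the upper device
 * pointer are invented; the contract being shown is that the handler may
 * steal, re-inject or pass the skb by returning one of the RX_HANDLER_*
 * values, and that rx_handler_data is read under RCU.
 */
struct example_port {
	struct net_device *upper_dev;
};

static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
	struct sk_buff *skb = *pskb;
	struct example_port *port;

	port = rcu_dereference(skb->dev->rx_handler_data);
	skb->dev = port->upper_dev;
	*pskb = skb;

	return RX_HANDLER_ANOTHER;	/* re-run receive on the upper device */
}

static int example_add_port(struct net_device *lower, struct example_port *port)
{
	ASSERT_RTNL();
	return netdev_rx_handler_register(lower, example_handle_frame, port);
}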
3378
3379/**
3380 * netdev_rx_handler_unregister - unregister receive handler
3381 * @dev: device to unregister a handler from
3382 *
166ec369 3383 * Unregister a receive handler from a device.
ab95bfe0
JP
3384 *
3385 * The caller must hold the rtnl_mutex.
3386 */
3387void netdev_rx_handler_unregister(struct net_device *dev)
3388{
3389
3390 ASSERT_RTNL();
a9b3cd7f 3391 RCU_INIT_POINTER(dev->rx_handler, NULL);
00cfec37
ED
3392 /* a reader seeing a non-NULL rx_handler in a rcu_read_lock()
3393 * section is guaranteed to see a non-NULL rx_handler_data
3394 * as well.
3395 */
3396 synchronize_net();
a9b3cd7f 3397 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
ab95bfe0
JP
3398}
3399EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3400
b4b9e355
MG
3401/*
3402 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3403 * the special handling of PFMEMALLOC skbs.
3404 */
3405static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3406{
3407 switch (skb->protocol) {
3408 case __constant_htons(ETH_P_ARP):
3409 case __constant_htons(ETH_P_IP):
3410 case __constant_htons(ETH_P_IPV6):
3411 case __constant_htons(ETH_P_8021Q):
8ad227ff 3412 case __constant_htons(ETH_P_8021AD):
b4b9e355
MG
3413 return true;
3414 default:
3415 return false;
3416 }
3417}
3418
9754e293 3419static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
1da177e4
LT
3420{
3421 struct packet_type *ptype, *pt_prev;
ab95bfe0 3422 rx_handler_func_t *rx_handler;
f2ccd8fa 3423 struct net_device *orig_dev;
63d8ea7f 3424 struct net_device *null_or_dev;
8a4eb573 3425 bool deliver_exact = false;
1da177e4 3426 int ret = NET_RX_DROP;
252e3346 3427 __be16 type;
1da177e4 3428
588f0330 3429 net_timestamp_check(!netdev_tstamp_prequeue, skb);
81bbb3d4 3430
cf66ba58 3431 trace_netif_receive_skb(skb);
9b22ea56 3432
1da177e4 3433 /* if we've gotten here through NAPI, check netpoll */
bea3348e 3434 if (netpoll_receive_skb(skb))
b4b9e355 3435 goto out;
1da177e4 3436
cc9bd5ce 3437 orig_dev = skb->dev;
8f903c70 3438
c1d2bbe1 3439 skb_reset_network_header(skb);
fda55eca
ED
3440 if (!skb_transport_header_was_set(skb))
3441 skb_reset_transport_header(skb);
0b5c9db1 3442 skb_reset_mac_len(skb);
1da177e4
LT
3443
3444 pt_prev = NULL;
3445
63d8ea7f 3446another_round:
b6858177 3447 skb->skb_iif = skb->dev->ifindex;
63d8ea7f
DM
3448
3449 __this_cpu_inc(softnet_data.processed);
3450
8ad227ff
PM
3451 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3452 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
bcc6d479
JP
3453 skb = vlan_untag(skb);
3454 if (unlikely(!skb))
c987fa71 3455 goto out;
bcc6d479
JP
3456 }
3457
1da177e4
LT
3458#ifdef CONFIG_NET_CLS_ACT
3459 if (skb->tc_verd & TC_NCLS) {
3460 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3461 goto ncls;
3462 }
3463#endif
3464
9754e293 3465 if (pfmemalloc)
b4b9e355
MG
3466 goto skip_taps;
3467
1da177e4 3468 list_for_each_entry_rcu(ptype, &ptype_all, list) {
63d8ea7f 3469 if (!ptype->dev || ptype->dev == skb->dev) {
4ec93edb 3470 if (pt_prev)
f2ccd8fa 3471 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3472 pt_prev = ptype;
3473 }
3474 }
3475
b4b9e355 3476skip_taps:
1da177e4 3477#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
3478 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3479 if (!skb)
c987fa71 3480 goto out;
1da177e4
LT
3481ncls:
3482#endif
3483
9754e293 3484 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
b4b9e355
MG
3485 goto drop;
3486
2425717b
JF
3487 if (vlan_tx_tag_present(skb)) {
3488 if (pt_prev) {
3489 ret = deliver_skb(skb, pt_prev, orig_dev);
3490 pt_prev = NULL;
3491 }
48cc32d3 3492 if (vlan_do_receive(&skb))
2425717b
JF
3493 goto another_round;
3494 else if (unlikely(!skb))
c987fa71 3495 goto out;
2425717b
JF
3496 }
3497
48cc32d3 3498 rx_handler = rcu_dereference(skb->dev->rx_handler);
ab95bfe0
JP
3499 if (rx_handler) {
3500 if (pt_prev) {
3501 ret = deliver_skb(skb, pt_prev, orig_dev);
3502 pt_prev = NULL;
3503 }
8a4eb573
JP
3504 switch (rx_handler(&skb)) {
3505 case RX_HANDLER_CONSUMED:
3bc1b1ad 3506 ret = NET_RX_SUCCESS;
c987fa71 3507 goto out;
8a4eb573 3508 case RX_HANDLER_ANOTHER:
63d8ea7f 3509 goto another_round;
8a4eb573
JP
3510 case RX_HANDLER_EXACT:
3511 deliver_exact = true;
3512 case RX_HANDLER_PASS:
3513 break;
3514 default:
3515 BUG();
3516 }
ab95bfe0 3517 }
1da177e4 3518
37b25f3f
ED
3519 if (unlikely(vlan_tx_tag_present(skb))) {
3520 if (vlan_tx_tag_get_id(skb))
3521 skb->pkt_type = PACKET_OTHERHOST;
3522 /* Note: we might in the future use prio bits
3523 * and set skb->priority like in vlan_do_receive().
3524 * For the time being, just ignore Priority Code Point
3525 */
3526 skb->vlan_tci = 0;
3527 }
48cc32d3 3528
63d8ea7f 3529 /* deliver only exact match when indicated */
8a4eb573 3530 null_or_dev = deliver_exact ? skb->dev : NULL;
1f3c8804 3531
1da177e4 3532 type = skb->protocol;
82d8a867
PE
3533 list_for_each_entry_rcu(ptype,
3534 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
63d8ea7f 3535 if (ptype->type == type &&
e3f48d37
JP
3536 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3537 ptype->dev == orig_dev)) {
4ec93edb 3538 if (pt_prev)
f2ccd8fa 3539 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3540 pt_prev = ptype;
3541 }
3542 }
3543
3544 if (pt_prev) {
1080e512 3545 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
0e698bf6 3546 goto drop;
1080e512
MT
3547 else
3548 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 3549 } else {
b4b9e355 3550drop:
caf586e5 3551 atomic_long_inc(&skb->dev->rx_dropped);
1da177e4
LT
3552 kfree_skb(skb);
3553 /* Jamal, now you will not be able to escape explaining
3554 * me how you were going to use this. :-)
3555 */
3556 ret = NET_RX_DROP;
3557 }
3558
b4b9e355 3559out:
9754e293
DM
3560 return ret;
3561}
3562
3563static int __netif_receive_skb(struct sk_buff *skb)
3564{
3565 int ret;
3566
3567 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3568 unsigned long pflags = current->flags;
3569
3570 /*
3571 * PFMEMALLOC skbs are special, they should
3572 * - be delivered to SOCK_MEMALLOC sockets only
3573 * - stay away from userspace
3574 * - have bounded memory usage
3575 *
3576 * Use PF_MEMALLOC as this saves us from propagating the allocation
3577 * context down to all allocation sites.
3578 */
3579 current->flags |= PF_MEMALLOC;
3580 ret = __netif_receive_skb_core(skb, true);
3581 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3582 } else
3583 ret = __netif_receive_skb_core(skb, false);
3584
1da177e4
LT
3585 return ret;
3586}
0a9627f2
TH
3587
3588/**
3589 * netif_receive_skb - process receive buffer from network
3590 * @skb: buffer to process
3591 *
3592 * netif_receive_skb() is the main receive data processing function.
3593 * It always succeeds. The buffer may be dropped during processing
3594 * for congestion control or by the protocol layers.
3595 *
3596 * This function may only be called from softirq context and interrupts
3597 * should be enabled.
3598 *
3599 * Return values (usually ignored):
3600 * NET_RX_SUCCESS: no congestion
3601 * NET_RX_DROP: packet was dropped
3602 */
3603int netif_receive_skb(struct sk_buff *skb)
3604{
c987fa71
JA
3605 int ret;
3606
588f0330 3607 net_timestamp_check(netdev_tstamp_prequeue, skb);
3b098e2d 3608
c1f19b51
RC
3609 if (skb_defer_rx_timestamp(skb))
3610 return NET_RX_SUCCESS;
3611
c987fa71
JA
3612 rcu_read_lock();
3613
df334545 3614#ifdef CONFIG_RPS
c5905afb 3615 if (static_key_false(&rps_needed)) {
3b098e2d 3616 struct rps_dev_flow voidflow, *rflow = &voidflow;
c987fa71 3617 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 3618
3b098e2d
ED
3619 if (cpu >= 0) {
3620 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3621 rcu_read_unlock();
adc9300e 3622 return ret;
3b098e2d 3623 }
fec5e652 3624 }
1e94d72f 3625#endif
c987fa71
JA
3626 ret = __netif_receive_skb(skb);
3627 rcu_read_unlock();
3628 return ret;
0a9627f2 3629}
d1b19dff 3630EXPORT_SYMBOL(netif_receive_skb);
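/*
 * Illustrative sketch (not part of the original file): the usual way a
 * driver hands a completed frame to netif_receive_skb() from softirq
 * context.  example_deliver() and the "len" argument are made-up; the skb
 * is assumed to start at the Ethernet header.
 */
static void example_deliver(struct net_device *dev, struct sk_buff *skb,
			    unsigned int len)
{
	skb_put(skb, len);			/* frame length from the RX descriptor */
	skb->protocol = eth_type_trans(skb, dev);
	netif_receive_skb(skb);			/* return value is usually ignored */
}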
1da177e4 3631
88751275
ED
3632/* Network device is going away, flush any packets still pending
3633 * Called with irqs disabled.
3634 */
152102c7 3635static void flush_backlog(void *arg)
6e583ce5 3636{
152102c7 3637 struct net_device *dev = arg;
e36fa2f7 3638 struct softnet_data *sd = &__get_cpu_var(softnet_data);
6e583ce5
SH
3639 struct sk_buff *skb, *tmp;
3640
e36fa2f7 3641 rps_lock(sd);
6e7676c1 3642 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3643 if (skb->dev == dev) {
e36fa2f7 3644 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3645 kfree_skb(skb);
76cc8b13 3646 input_queue_head_incr(sd);
6e583ce5 3647 }
6e7676c1 3648 }
e36fa2f7 3649 rps_unlock(sd);
6e7676c1
CG
3650
3651 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3652 if (skb->dev == dev) {
3653 __skb_unlink(skb, &sd->process_queue);
3654 kfree_skb(skb);
76cc8b13 3655 input_queue_head_incr(sd);
6e7676c1
CG
3656 }
3657 }
6e583ce5
SH
3658}
3659
d565b0a1
HX
3660static int napi_gro_complete(struct sk_buff *skb)
3661{
22061d80 3662 struct packet_offload *ptype;
d565b0a1 3663 __be16 type = skb->protocol;
22061d80 3664 struct list_head *head = &offload_base;
d565b0a1
HX
3665 int err = -ENOENT;
3666
c3c7c254
ED
3667 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3668
fc59f9a3
HX
3669 if (NAPI_GRO_CB(skb)->count == 1) {
3670 skb_shinfo(skb)->gso_size = 0;
d565b0a1 3671 goto out;
fc59f9a3 3672 }
d565b0a1
HX
3673
3674 rcu_read_lock();
3675 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 3676 if (ptype->type != type || !ptype->callbacks.gro_complete)
d565b0a1
HX
3677 continue;
3678
f191a1d1 3679 err = ptype->callbacks.gro_complete(skb);
d565b0a1
HX
3680 break;
3681 }
3682 rcu_read_unlock();
3683
3684 if (err) {
3685 WARN_ON(&ptype->list == head);
3686 kfree_skb(skb);
3687 return NET_RX_SUCCESS;
3688 }
3689
3690out:
d565b0a1
HX
3691 return netif_receive_skb(skb);
3692}
3693
2e71a6f8
ED
3694/* napi->gro_list contains packets ordered by age.
3695 * youngest packets at the head of it.
3696 * Complete skbs in reverse order to reduce latencies.
3697 */
3698void napi_gro_flush(struct napi_struct *napi, bool flush_old)
d565b0a1 3699{
2e71a6f8 3700 struct sk_buff *skb, *prev = NULL;
d565b0a1 3701
2e71a6f8
ED
3702 /* scan list and build reverse chain */
3703 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3704 skb->prev = prev;
3705 prev = skb;
3706 }
3707
3708 for (skb = prev; skb; skb = prev) {
d565b0a1 3709 skb->next = NULL;
2e71a6f8
ED
3710
3711 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3712 return;
3713
3714 prev = skb->prev;
d565b0a1 3715 napi_gro_complete(skb);
2e71a6f8 3716 napi->gro_count--;
d565b0a1
HX
3717 }
3718
3719 napi->gro_list = NULL;
3720}
86cac58b 3721EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 3722
89c5fa33
ED
3723static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3724{
3725 struct sk_buff *p;
3726 unsigned int maclen = skb->dev->hard_header_len;
3727
3728 for (p = napi->gro_list; p; p = p->next) {
3729 unsigned long diffs;
3730
3731 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3732 diffs |= p->vlan_tci ^ skb->vlan_tci;
3733 if (maclen == ETH_HLEN)
3734 diffs |= compare_ether_header(skb_mac_header(p),
3735 skb_gro_mac_header(skb));
3736 else if (!diffs)
3737 diffs = memcmp(skb_mac_header(p),
3738 skb_gro_mac_header(skb),
3739 maclen);
3740 NAPI_GRO_CB(p)->same_flow = !diffs;
3741 NAPI_GRO_CB(p)->flush = 0;
3742 }
3743}
3744
bb728820 3745static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
3746{
3747 struct sk_buff **pp = NULL;
22061d80 3748 struct packet_offload *ptype;
d565b0a1 3749 __be16 type = skb->protocol;
22061d80 3750 struct list_head *head = &offload_base;
0da2afd5 3751 int same_flow;
5b252f0c 3752 enum gro_result ret;
d565b0a1 3753
ce9e76c8 3754 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
d565b0a1
HX
3755 goto normal;
3756
21dc3301 3757 if (skb_is_gso(skb) || skb_has_frag_list(skb))
f17f5c91
HX
3758 goto normal;
3759
89c5fa33
ED
3760 gro_list_prepare(napi, skb);
3761
d565b0a1
HX
3762 rcu_read_lock();
3763 list_for_each_entry_rcu(ptype, head, list) {
f191a1d1 3764 if (ptype->type != type || !ptype->callbacks.gro_receive)
d565b0a1
HX
3765 continue;
3766
86911732 3767 skb_set_network_header(skb, skb_gro_offset(skb));
efd9450e 3768 skb_reset_mac_len(skb);
d565b0a1
HX
3769 NAPI_GRO_CB(skb)->same_flow = 0;
3770 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 3771 NAPI_GRO_CB(skb)->free = 0;
d565b0a1 3772
f191a1d1 3773 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
d565b0a1
HX
3774 break;
3775 }
3776 rcu_read_unlock();
3777
3778 if (&ptype->list == head)
3779 goto normal;
3780
0da2afd5 3781 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 3782 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 3783
d565b0a1
HX
3784 if (pp) {
3785 struct sk_buff *nskb = *pp;
3786
3787 *pp = nskb->next;
3788 nskb->next = NULL;
3789 napi_gro_complete(nskb);
4ae5544f 3790 napi->gro_count--;
d565b0a1
HX
3791 }
3792
0da2afd5 3793 if (same_flow)
d565b0a1
HX
3794 goto ok;
3795
4ae5544f 3796 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
d565b0a1 3797 goto normal;
d565b0a1 3798
4ae5544f 3799 napi->gro_count++;
d565b0a1 3800 NAPI_GRO_CB(skb)->count = 1;
2e71a6f8 3801 NAPI_GRO_CB(skb)->age = jiffies;
86911732 3802 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
3803 skb->next = napi->gro_list;
3804 napi->gro_list = skb;
5d0d9be8 3805 ret = GRO_HELD;
d565b0a1 3806
ad0f9904 3807pull:
cb18978c
HX
3808 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3809 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3810
3811 BUG_ON(skb->end - skb->tail < grow);
3812
3813 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3814
3815 skb->tail += grow;
3816 skb->data_len -= grow;
3817
3818 skb_shinfo(skb)->frags[0].page_offset += grow;
9e903e08 3819 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
cb18978c 3820
9e903e08 3821 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
ea2ab693 3822 skb_frag_unref(skb, 0);
cb18978c
HX
3823 memmove(skb_shinfo(skb)->frags,
3824 skb_shinfo(skb)->frags + 1,
e5093aec 3825 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
cb18978c 3826 }
ad0f9904
HX
3827 }
3828
d565b0a1 3829ok:
5d0d9be8 3830 return ret;
d565b0a1
HX
3831
3832normal:
ad0f9904
HX
3833 ret = GRO_NORMAL;
3834 goto pull;
5d38a079 3835}
96e93eab 3836
5d38a079 3837
bb728820 3838static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 3839{
5d0d9be8
HX
3840 switch (ret) {
3841 case GRO_NORMAL:
c7c4b3b6
BH
3842 if (netif_receive_skb(skb))
3843 ret = GRO_DROP;
3844 break;
5d38a079 3845
5d0d9be8 3846 case GRO_DROP:
5d38a079
HX
3847 kfree_skb(skb);
3848 break;
5b252f0c 3849
daa86548 3850 case GRO_MERGED_FREE:
d7e8883c
ED
3851 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3852 kmem_cache_free(skbuff_head_cache, skb);
3853 else
3854 __kfree_skb(skb);
daa86548
ED
3855 break;
3856
5b252f0c
BH
3857 case GRO_HELD:
3858 case GRO_MERGED:
3859 break;
5d38a079
HX
3860 }
3861
c7c4b3b6 3862 return ret;
5d0d9be8 3863}
5d0d9be8 3864
ca07e43e 3865static void skb_gro_reset_offset(struct sk_buff *skb)
78a478d0 3866{
ca07e43e
ED
3867 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3868 const skb_frag_t *frag0 = &pinfo->frags[0];
3869
78a478d0
HX
3870 NAPI_GRO_CB(skb)->data_offset = 0;
3871 NAPI_GRO_CB(skb)->frag0 = NULL;
7489594c 3872 NAPI_GRO_CB(skb)->frag0_len = 0;
78a478d0 3873
78d3fd0b 3874 if (skb->mac_header == skb->tail &&
ca07e43e
ED
3875 pinfo->nr_frags &&
3876 !PageHighMem(skb_frag_page(frag0))) {
3877 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3878 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
7489594c 3879 }
78a478d0 3880}
78a478d0 3881
c7c4b3b6 3882gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 3883{
86911732
HX
3884 skb_gro_reset_offset(skb);
3885
89c5fa33 3886 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
d565b0a1
HX
3887}
3888EXPORT_SYMBOL(napi_gro_receive);
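/*
 * Illustrative sketch (not part of the original file): NAPI drivers
 * normally prefer napi_gro_receive() over netif_receive_skb() so that
 * dev_gro_receive() above can merge segments of the same flow.
 * example_gro_rx() is a made-up name.
 */
static void example_gro_rx(struct napi_struct *napi, struct sk_buff *skb)
{
	skb->protocol = eth_type_trans(skb, napi->dev);
	napi_gro_receive(napi, skb);		/* GRO_NORMAL falls back to netif_receive_skb() */
}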
3889
d0c2b0d2 3890static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 3891{
96e93eab 3892 __skb_pull(skb, skb_headlen(skb));
2a2a459e
ED
3893 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3894 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3701e513 3895 skb->vlan_tci = 0;
66c46d74 3896 skb->dev = napi->dev;
6d152e23 3897 skb->skb_iif = 0;
fd6b1194 3898 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
96e93eab
HX
3899
3900 napi->skb = skb;
3901}
96e93eab 3902
76620aaf 3903struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 3904{
5d38a079 3905 struct sk_buff *skb = napi->skb;
5d38a079
HX
3906
3907 if (!skb) {
89d71a66
ED
3908 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3909 if (skb)
3910 napi->skb = skb;
80595d59 3911 }
96e93eab
HX
3912 return skb;
3913}
76620aaf 3914EXPORT_SYMBOL(napi_get_frags);
96e93eab 3915
bb728820 3916static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
c7c4b3b6 3917 gro_result_t ret)
96e93eab 3918{
5d0d9be8
HX
3919 switch (ret) {
3920 case GRO_NORMAL:
86911732 3921 case GRO_HELD:
e76b69cc 3922 skb->protocol = eth_type_trans(skb, skb->dev);
86911732 3923
c7c4b3b6
BH
3924 if (ret == GRO_HELD)
3925 skb_gro_pull(skb, -ETH_HLEN);
3926 else if (netif_receive_skb(skb))
3927 ret = GRO_DROP;
86911732 3928 break;
5d38a079 3929
5d0d9be8 3930 case GRO_DROP:
5d0d9be8
HX
3931 case GRO_MERGED_FREE:
3932 napi_reuse_skb(napi, skb);
3933 break;
5b252f0c
BH
3934
3935 case GRO_MERGED:
3936 break;
5d0d9be8 3937 }
5d38a079 3938
c7c4b3b6 3939 return ret;
5d38a079 3940}
5d0d9be8 3941
4adb9c4a 3942static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
76620aaf
HX
3943{
3944 struct sk_buff *skb = napi->skb;
3945 struct ethhdr *eth;
a5b1cf28
HX
3946 unsigned int hlen;
3947 unsigned int off;
76620aaf
HX
3948
3949 napi->skb = NULL;
3950
3951 skb_reset_mac_header(skb);
3952 skb_gro_reset_offset(skb);
3953
a5b1cf28
HX
3954 off = skb_gro_offset(skb);
3955 hlen = off + sizeof(*eth);
3956 eth = skb_gro_header_fast(skb, off);
3957 if (skb_gro_header_hard(skb, hlen)) {
3958 eth = skb_gro_header_slow(skb, hlen, off);
3959 if (unlikely(!eth)) {
3960 napi_reuse_skb(napi, skb);
3961 skb = NULL;
3962 goto out;
3963 }
76620aaf
HX
3964 }
3965
3966 skb_gro_pull(skb, sizeof(*eth));
3967
3968 /*
3969 * This works because the only protocols we care about don't require
3970 * special handling. We'll fix it up properly at the end.
3971 */
3972 skb->protocol = eth->h_proto;
3973
3974out:
3975 return skb;
3976}
76620aaf 3977
c7c4b3b6 3978gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 3979{
76620aaf 3980 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
3981
3982 if (!skb)
c7c4b3b6 3983 return GRO_DROP;
5d0d9be8 3984
89c5fa33 3985 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
5d0d9be8 3986}
5d38a079
HX
3987EXPORT_SYMBOL(napi_gro_frags);
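/*
 * Illustrative sketch (not part of the original file): the header-less RX
 * model served by napi_get_frags()/napi_gro_frags().  The driver attaches
 * its receive page to frag 0 and napi_frags_skb() above pulls the Ethernet
 * header out of it.  example_rx_page() and its arguments are made-up, and
 * error handling is minimal.
 */
static void example_rx_page(struct napi_struct *napi, struct page *page,
			    unsigned int offset, unsigned int len,
			    unsigned int truesize)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb)) {
		put_page(page);			/* drop: no skb available */
		return;
	}

	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += truesize;

	napi_gro_frags(napi);			/* consumes or recycles napi->skb */
}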
3988
e326bed2
ED
3989/*
3990 * net_rps_action sends any pending IPIs for RPS.
3991 * Note: called with local irq disabled, but exits with local irq enabled.
3992 */
3993static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3994{
3995#ifdef CONFIG_RPS
3996 struct softnet_data *remsd = sd->rps_ipi_list;
3997
3998 if (remsd) {
3999 sd->rps_ipi_list = NULL;
4000
4001 local_irq_enable();
4002
4003 /* Send pending IPIs to kick RPS processing on remote cpus. */
4004 while (remsd) {
4005 struct softnet_data *next = remsd->rps_ipi_next;
4006
4007 if (cpu_online(remsd->cpu))
4008 __smp_call_function_single(remsd->cpu,
4009 &remsd->csd, 0);
4010 remsd = next;
4011 }
4012 } else
4013#endif
4014 local_irq_enable();
4015}
4016
bea3348e 4017static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
4018{
4019 int work = 0;
eecfd7c4 4020 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 4021
e326bed2
ED
4022#ifdef CONFIG_RPS
4023 /* Check if we have pending IPIs; it's better to send them now
4024 * rather than waiting for net_rx_action() to end.
4025 */
4026 if (sd->rps_ipi_list) {
4027 local_irq_disable();
4028 net_rps_action_and_irq_enable(sd);
4029 }
4030#endif
bea3348e 4031 napi->weight = weight_p;
6e7676c1
CG
4032 local_irq_disable();
4033 while (work < quota) {
1da177e4 4034 struct sk_buff *skb;
6e7676c1
CG
4035 unsigned int qlen;
4036
4037 while ((skb = __skb_dequeue(&sd->process_queue))) {
c987fa71 4038 rcu_read_lock();
6e7676c1
CG
4039 local_irq_enable();
4040 __netif_receive_skb(skb);
c987fa71 4041 rcu_read_unlock();
6e7676c1 4042 local_irq_disable();
76cc8b13
TH
4043 input_queue_head_incr(sd);
4044 if (++work >= quota) {
4045 local_irq_enable();
4046 return work;
4047 }
6e7676c1 4048 }
1da177e4 4049
e36fa2f7 4050 rps_lock(sd);
6e7676c1 4051 qlen = skb_queue_len(&sd->input_pkt_queue);
76cc8b13 4052 if (qlen)
6e7676c1
CG
4053 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4054 &sd->process_queue);
76cc8b13 4055
6e7676c1 4056 if (qlen < quota - work) {
eecfd7c4
ED
4057 /*
4058 * Inline a custom version of __napi_complete().
4059 * Only the current cpu owns and manipulates this napi,
4060 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4061 * We can use a plain write instead of clear_bit(),
4062 * and we don't need an smp_mb() memory barrier.
4063 */
4064 list_del(&napi->poll_list);
4065 napi->state = 0;
4066
6e7676c1 4067 quota = work + qlen;
bea3348e 4068 }
e36fa2f7 4069 rps_unlock(sd);
6e7676c1
CG
4070 }
4071 local_irq_enable();
1da177e4 4072
bea3348e
SH
4073 return work;
4074}
1da177e4 4075
bea3348e
SH
4076/**
4077 * __napi_schedule - schedule for receive
c4ea43c5 4078 * @n: entry to schedule
bea3348e
SH
4079 *
4080 * The entry's receive function will be scheduled to run
4081 */
b5606c2d 4082void __napi_schedule(struct napi_struct *n)
bea3348e
SH
4083{
4084 unsigned long flags;
1da177e4 4085
bea3348e 4086 local_irq_save(flags);
eecfd7c4 4087 ____napi_schedule(&__get_cpu_var(softnet_data), n);
bea3348e 4088 local_irq_restore(flags);
1da177e4 4089}
bea3348e
SH
4090EXPORT_SYMBOL(__napi_schedule);
4091
d565b0a1
HX
4092void __napi_complete(struct napi_struct *n)
4093{
4094 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4095 BUG_ON(n->gro_list);
4096
4097 list_del(&n->poll_list);
4098 smp_mb__before_clear_bit();
4099 clear_bit(NAPI_STATE_SCHED, &n->state);
4100}
4101EXPORT_SYMBOL(__napi_complete);
4102
4103void napi_complete(struct napi_struct *n)
4104{
4105 unsigned long flags;
4106
4107 /*
4108 * don't let napi dequeue from the cpu poll list
4109 * just in case it's running on a different cpu
4110 */
4111 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4112 return;
4113
2e71a6f8 4114 napi_gro_flush(n, false);
d565b0a1
HX
4115 local_irq_save(flags);
4116 __napi_complete(n);
4117 local_irq_restore(flags);
4118}
4119EXPORT_SYMBOL(napi_complete);
4120
4121void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4122 int (*poll)(struct napi_struct *, int), int weight)
4123{
4124 INIT_LIST_HEAD(&napi->poll_list);
4ae5544f 4125 napi->gro_count = 0;
d565b0a1 4126 napi->gro_list = NULL;
5d38a079 4127 napi->skb = NULL;
d565b0a1 4128 napi->poll = poll;
82dc3c63
ED
4129 if (weight > NAPI_POLL_WEIGHT)
4130 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4131 weight, dev->name);
d565b0a1
HX
4132 napi->weight = weight;
4133 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 4134 napi->dev = dev;
5d38a079 4135#ifdef CONFIG_NETPOLL
d565b0a1
HX
4136 spin_lock_init(&napi->poll_lock);
4137 napi->poll_owner = -1;
4138#endif
4139 set_bit(NAPI_STATE_SCHED, &napi->state);
4140}
4141EXPORT_SYMBOL(netif_napi_add);
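/*
 * Illustrative sketch (not part of the original file): the canonical NAPI
 * wiring built around netif_napi_add(), napi_schedule() and napi_complete().
 * "struct example_adapter" and the example_* helpers are made-up; only the
 * NAPI calls themselves are the real API.
 */
struct example_adapter {
	struct net_device *netdev;
	struct napi_struct napi;
};

static int example_poll(struct napi_struct *napi, int budget)
{
	struct example_adapter *adapter =
		container_of(napi, struct example_adapter, napi);
	int work_done = 0;

	/* work_done = example_clean_rx_ring(adapter, budget); */

	if (work_done < budget) {
		napi_complete(napi);
		/* re-enable the device's RX interrupt here */
	}
	return work_done;
}

static void example_rx_interrupt(struct example_adapter *adapter)
{
	/* called from the device's hard interrupt handler */
	napi_schedule(&adapter->napi);		/* defer the work to example_poll() */
}

static void example_setup_napi(struct example_adapter *adapter)
{
	netif_napi_add(adapter->netdev, &adapter->napi,
		       example_poll, NAPI_POLL_WEIGHT);
	napi_enable(&adapter->napi);		/* usually done from ndo_open */
}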
4142
4143void netif_napi_del(struct napi_struct *napi)
4144{
4145 struct sk_buff *skb, *next;
4146
d7b06636 4147 list_del_init(&napi->dev_list);
76620aaf 4148 napi_free_frags(napi);
d565b0a1
HX
4149
4150 for (skb = napi->gro_list; skb; skb = next) {
4151 next = skb->next;
4152 skb->next = NULL;
4153 kfree_skb(skb);
4154 }
4155
4156 napi->gro_list = NULL;
4ae5544f 4157 napi->gro_count = 0;
d565b0a1
HX
4158}
4159EXPORT_SYMBOL(netif_napi_del);
4160
1da177e4
LT
4161static void net_rx_action(struct softirq_action *h)
4162{
e326bed2 4163 struct softnet_data *sd = &__get_cpu_var(softnet_data);
24f8b238 4164 unsigned long time_limit = jiffies + 2;
51b0bded 4165 int budget = netdev_budget;
53fb95d3
MM
4166 void *have;
4167
1da177e4
LT
4168 local_irq_disable();
4169
e326bed2 4170 while (!list_empty(&sd->poll_list)) {
bea3348e
SH
4171 struct napi_struct *n;
4172 int work, weight;
1da177e4 4173
bea3348e 4174 /* If the softirq window is exhausted then punt.
24f8b238
SH
4175 * Allow this to run for 2 jiffies, which allows
4176 * an average latency of 1.5/HZ.
bea3348e 4177 */
d1f41b67 4178 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
1da177e4
LT
4179 goto softnet_break;
4180
4181 local_irq_enable();
4182
bea3348e
SH
4183 /* Even though interrupts have been re-enabled, this
4184 * access is safe because interrupts can only add new
4185 * entries to the tail of this list, and only ->poll()
4186 * calls can remove this head entry from the list.
4187 */
e326bed2 4188 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
1da177e4 4189
bea3348e
SH
4190 have = netpoll_poll_lock(n);
4191
4192 weight = n->weight;
4193
0a7606c1
DM
4194 /* This NAPI_STATE_SCHED test is for avoiding a race
4195 * with netpoll's poll_napi(). Only the entity which
4196 * obtains the lock and sees NAPI_STATE_SCHED set will
4197 * actually make the ->poll() call. Therefore we avoid
25985edc 4198 * accidentally calling ->poll() when NAPI is not scheduled.
0a7606c1
DM
4199 */
4200 work = 0;
4ea7e386 4201 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
0a7606c1 4202 work = n->poll(n, weight);
4ea7e386
NH
4203 trace_napi_poll(n);
4204 }
bea3348e
SH
4205
4206 WARN_ON_ONCE(work > weight);
4207
4208 budget -= work;
4209
4210 local_irq_disable();
4211
4212 /* Drivers must not modify the NAPI state if they
4213 * consume the entire weight. In such cases this code
4214 * still "owns" the NAPI instance and therefore can
4215 * move the instance around on the list at-will.
4216 */
fed17f30 4217 if (unlikely(work == weight)) {
ff780cd8
HX
4218 if (unlikely(napi_disable_pending(n))) {
4219 local_irq_enable();
4220 napi_complete(n);
4221 local_irq_disable();
2e71a6f8
ED
4222 } else {
4223 if (n->gro_list) {
4224 /* flush too old packets
4225 * If HZ < 1000, flush all packets.
4226 */
4227 local_irq_enable();
4228 napi_gro_flush(n, HZ >= 1000);
4229 local_irq_disable();
4230 }
e326bed2 4231 list_move_tail(&n->poll_list, &sd->poll_list);
2e71a6f8 4232 }
fed17f30 4233 }
bea3348e
SH
4234
4235 netpoll_poll_unlock(have);
1da177e4
LT
4236 }
4237out:
e326bed2 4238 net_rps_action_and_irq_enable(sd);
0a9627f2 4239
db217334
CL
4240#ifdef CONFIG_NET_DMA
4241 /*
4242 * There may not be any more sk_buffs coming right now, so push
4243 * any pending DMA copies to hardware
4244 */
2ba05622 4245 dma_issue_pending_all();
db217334 4246#endif
bea3348e 4247
1da177e4
LT
4248 return;
4249
4250softnet_break:
dee42870 4251 sd->time_squeeze++;
1da177e4
LT
4252 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4253 goto out;
4254}
4255
9ff162a8
JP
4256struct netdev_upper {
4257 struct net_device *dev;
4258 bool master;
4259 struct list_head list;
4260 struct rcu_head rcu;
4261 struct list_head search_list;
4262};
4263
4264static void __append_search_uppers(struct list_head *search_list,
4265 struct net_device *dev)
4266{
4267 struct netdev_upper *upper;
4268
4269 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4270 /* check if this upper is not already in search list */
4271 if (list_empty(&upper->search_list))
4272 list_add_tail(&upper->search_list, search_list);
4273 }
4274}
4275
4276static bool __netdev_search_upper_dev(struct net_device *dev,
4277 struct net_device *upper_dev)
4278{
4279 LIST_HEAD(search_list);
4280 struct netdev_upper *upper;
4281 struct netdev_upper *tmp;
4282 bool ret = false;
4283
4284 __append_search_uppers(&search_list, dev);
4285 list_for_each_entry(upper, &search_list, search_list) {
4286 if (upper->dev == upper_dev) {
4287 ret = true;
4288 break;
4289 }
4290 __append_search_uppers(&search_list, upper->dev);
4291 }
4292 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4293 INIT_LIST_HEAD(&upper->search_list);
4294 return ret;
4295}
4296
4297static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4298 struct net_device *upper_dev)
4299{
4300 struct netdev_upper *upper;
4301
4302 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4303 if (upper->dev == upper_dev)
4304 return upper;
4305 }
4306 return NULL;
4307}
4308
4309/**
4310 * netdev_has_upper_dev - Check if device is linked to an upper device
4311 * @dev: device
4312 * @upper_dev: upper device to check
4313 *
4314 * Find out if a device is linked to specified upper device and return true
4315 * in case it is. Note that this checks only immediate upper device,
4316 * not through a complete stack of devices. The caller must hold the RTNL lock.
4317 */
4318bool netdev_has_upper_dev(struct net_device *dev,
4319 struct net_device *upper_dev)
4320{
4321 ASSERT_RTNL();
4322
4323 return __netdev_find_upper(dev, upper_dev);
4324}
4325EXPORT_SYMBOL(netdev_has_upper_dev);
4326
4327/**
4328 * netdev_has_any_upper_dev - Check if device is linked to some device
4329 * @dev: device
4330 *
4331 * Find out if a device is linked to an upper device and return true in case
4332 * it is. The caller must hold the RTNL lock.
4333 */
4334bool netdev_has_any_upper_dev(struct net_device *dev)
4335{
4336 ASSERT_RTNL();
4337
4338 return !list_empty(&dev->upper_dev_list);
4339}
4340EXPORT_SYMBOL(netdev_has_any_upper_dev);
4341
4342/**
4343 * netdev_master_upper_dev_get - Get master upper device
4344 * @dev: device
4345 *
4346 * Find a master upper device and return pointer to it or NULL in case
4347 * it's not there. The caller must hold the RTNL lock.
4348 */
4349struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4350{
4351 struct netdev_upper *upper;
4352
4353 ASSERT_RTNL();
4354
4355 if (list_empty(&dev->upper_dev_list))
4356 return NULL;
4357
4358 upper = list_first_entry(&dev->upper_dev_list,
4359 struct netdev_upper, list);
4360 if (likely(upper->master))
4361 return upper->dev;
4362 return NULL;
4363}
4364EXPORT_SYMBOL(netdev_master_upper_dev_get);
4365
4366/**
4367 * netdev_master_upper_dev_get_rcu - Get master upper device
4368 * @dev: device
4369 *
4370 * Find a master upper device and return pointer to it or NULL in case
4371 * it's not there. The caller must hold the RCU read lock.
4372 */
4373struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4374{
4375 struct netdev_upper *upper;
4376
4377 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4378 struct netdev_upper, list);
4379 if (upper && likely(upper->master))
4380 return upper->dev;
4381 return NULL;
4382}
4383EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4384
4385static int __netdev_upper_dev_link(struct net_device *dev,
4386 struct net_device *upper_dev, bool master)
4387{
4388 struct netdev_upper *upper;
4389
4390 ASSERT_RTNL();
4391
4392 if (dev == upper_dev)
4393 return -EBUSY;
4394
4395 /* To prevent loops, check if dev is not upper device to upper_dev. */
4396 if (__netdev_search_upper_dev(upper_dev, dev))
4397 return -EBUSY;
4398
4399 if (__netdev_find_upper(dev, upper_dev))
4400 return -EEXIST;
4401
4402 if (master && netdev_master_upper_dev_get(dev))
4403 return -EBUSY;
4404
4405 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4406 if (!upper)
4407 return -ENOMEM;
4408
4409 upper->dev = upper_dev;
4410 upper->master = master;
4411 INIT_LIST_HEAD(&upper->search_list);
4412
4413 /* Ensure that master upper link is always the first item in list. */
4414 if (master)
4415 list_add_rcu(&upper->list, &dev->upper_dev_list);
4416 else
4417 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4418 dev_hold(upper_dev);
4419
4420 return 0;
4421}
4422
4423/**
4424 * netdev_upper_dev_link - Add a link to the upper device
4425 * @dev: device
4426 * @upper_dev: new upper device
4427 *
4428 * Adds a link to device which is upper to this one. The caller must hold
4429 * the RTNL lock. On a failure a negative errno code is returned.
4430 * On success the reference counts are adjusted and the function
4431 * returns zero.
4432 */
4433int netdev_upper_dev_link(struct net_device *dev,
4434 struct net_device *upper_dev)
4435{
4436 return __netdev_upper_dev_link(dev, upper_dev, false);
4437}
4438EXPORT_SYMBOL(netdev_upper_dev_link);
4439
4440/**
4441 * netdev_master_upper_dev_link - Add a master link to the upper device
4442 * @dev: device
4443 * @upper_dev: new upper device
4444 *
4445 * Adds a link to device which is upper to this one. In this case, only
4446 * one master upper device can be linked, although other non-master devices
4447 * might be linked as well. The caller must hold the RTNL lock.
4448 * On a failure a negative errno code is returned. On success the reference
4449 * counts are adjusted and the function returns zero.
4450 */
4451int netdev_master_upper_dev_link(struct net_device *dev,
4452 struct net_device *upper_dev)
4453{
4454 return __netdev_upper_dev_link(dev, upper_dev, true);
4455}
4456EXPORT_SYMBOL(netdev_master_upper_dev_link);
4457
4458/**
4459 * netdev_upper_dev_unlink - Removes a link to upper device
4460 * @dev: device
4461 * @upper_dev: new upper device
4462 *
4463 * Removes a link to device which is upper to this one. The caller must hold
4464 * the RTNL lock.
4465 */
4466void netdev_upper_dev_unlink(struct net_device *dev,
4467 struct net_device *upper_dev)
4468{
4469 struct netdev_upper *upper;
4470
4471 ASSERT_RTNL();
4472
4473 upper = __netdev_find_upper(dev, upper_dev);
4474 if (!upper)
4475 return;
4476 list_del_rcu(&upper->list);
4477 dev_put(upper_dev);
4478 kfree_rcu(upper, rcu);
4479}
4480EXPORT_SYMBOL(netdev_upper_dev_unlink);
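/*
 * Illustrative sketch (not part of the original file): how an aggregating
 * (bond/team-like) driver would use the helpers above when enslaving and
 * releasing a port.  example_enslave()/example_release() are made-up names;
 * both run under rtnl_lock().
 */
static int example_enslave(struct net_device *master_dev,
			   struct net_device *port_dev)
{
	ASSERT_RTNL();
	/* port_dev gains master_dev as its single master upper device */
	return netdev_master_upper_dev_link(port_dev, master_dev);
}

static void example_release(struct net_device *master_dev,
			    struct net_device *port_dev)
{
	ASSERT_RTNL();
	netdev_upper_dev_unlink(port_dev, master_dev);
}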
4481
b6c40d68
PM
4482static void dev_change_rx_flags(struct net_device *dev, int flags)
4483{
d314774c
SH
4484 const struct net_device_ops *ops = dev->netdev_ops;
4485
05cf2143 4486 if (ops->ndo_change_rx_flags)
d314774c 4487 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
4488}
4489
dad9b335 4490static int __dev_set_promiscuity(struct net_device *dev, int inc)
1da177e4 4491{
b536db93 4492 unsigned int old_flags = dev->flags;
d04a48b0
EB
4493 kuid_t uid;
4494 kgid_t gid;
1da177e4 4495
24023451
PM
4496 ASSERT_RTNL();
4497
dad9b335
WC
4498 dev->flags |= IFF_PROMISC;
4499 dev->promiscuity += inc;
4500 if (dev->promiscuity == 0) {
4501 /*
4502 * Avoid overflow.
4503 * If inc causes overflow, untouch promisc and return error.
4504 */
4505 if (inc < 0)
4506 dev->flags &= ~IFF_PROMISC;
4507 else {
4508 dev->promiscuity -= inc;
7b6cd1ce
JP
4509 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4510 dev->name);
dad9b335
WC
4511 return -EOVERFLOW;
4512 }
4513 }
52609c0b 4514 if (dev->flags != old_flags) {
7b6cd1ce
JP
4515 pr_info("device %s %s promiscuous mode\n",
4516 dev->name,
4517 dev->flags & IFF_PROMISC ? "entered" : "left");
8192b0c4
DH
4518 if (audit_enabled) {
4519 current_uid_gid(&uid, &gid);
7759db82
KHK
4520 audit_log(current->audit_context, GFP_ATOMIC,
4521 AUDIT_ANOM_PROMISCUOUS,
4522 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4523 dev->name, (dev->flags & IFF_PROMISC),
4524 (old_flags & IFF_PROMISC),
e1760bd5 4525 from_kuid(&init_user_ns, audit_get_loginuid(current)),
d04a48b0
EB
4526 from_kuid(&init_user_ns, uid),
4527 from_kgid(&init_user_ns, gid),
7759db82 4528 audit_get_sessionid(current));
8192b0c4 4529 }
24023451 4530
b6c40d68 4531 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 4532 }
dad9b335 4533 return 0;
1da177e4
LT
4534}
4535
4417da66
PM
4536/**
4537 * dev_set_promiscuity - update promiscuity count on a device
4538 * @dev: device
4539 * @inc: modifier
4540 *
4541 * Add or remove promiscuity from a device. While the count in the device
4542 * remains above zero the interface remains promiscuous. Once it hits zero
4543 * the device reverts back to normal filtering operation. A negative inc
4544 * value is used to drop promiscuity on the device.
dad9b335 4545 * Return 0 if successful or a negative errno code on error.
4417da66 4546 */
dad9b335 4547int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66 4548{
b536db93 4549 unsigned int old_flags = dev->flags;
dad9b335 4550 int err;
4417da66 4551
dad9b335 4552 err = __dev_set_promiscuity(dev, inc);
4b5a698e 4553 if (err < 0)
dad9b335 4554 return err;
4417da66
PM
4555 if (dev->flags != old_flags)
4556 dev_set_rx_mode(dev);
dad9b335 4557 return err;
4417da66 4558}
d1b19dff 4559EXPORT_SYMBOL(dev_set_promiscuity);
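/*
 * Illustrative sketch (not part of the original file): the promiscuity
 * count is a reference count, so every +1 must eventually be matched by a
 * -1.  example_start_capture()/example_stop_capture() are made-up names;
 * both must run with rtnl_lock() held.
 */
static int example_start_capture(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_set_promiscuity(dev, 1);	/* may fail with -EOVERFLOW */
}

static void example_stop_capture(struct net_device *dev)
{
	ASSERT_RTNL();
	dev_set_promiscuity(dev, -1);		/* drop our reference */
}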
4417da66 4560
1da177e4
LT
4561/**
4562 * dev_set_allmulti - update allmulti count on a device
4563 * @dev: device
4564 * @inc: modifier
4565 *
4566 * Add or remove reception of all multicast frames to a device. While the
4567 * count in the device remains above zero the interface remains listening
4568 * to all interfaces. Once it hits zero the device reverts back to normal
4569 * filtering operation. A negative @inc value is used to drop the counter
4570 * when releasing a resource needing all multicasts.
dad9b335 4571 * Return 0 if successful or a negative errno code on error.
1da177e4
LT
4572 */
4573
dad9b335 4574int dev_set_allmulti(struct net_device *dev, int inc)
1da177e4 4575{
b536db93 4576 unsigned int old_flags = dev->flags;
1da177e4 4577
24023451
PM
4578 ASSERT_RTNL();
4579
1da177e4 4580 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
4581 dev->allmulti += inc;
4582 if (dev->allmulti == 0) {
4583 /*
4584 * Avoid overflow.
4585 * If inc causes overflow, untouch allmulti and return error.
4586 */
4587 if (inc < 0)
4588 dev->flags &= ~IFF_ALLMULTI;
4589 else {
4590 dev->allmulti -= inc;
7b6cd1ce
JP
4591 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4592 dev->name);
dad9b335
WC
4593 return -EOVERFLOW;
4594 }
4595 }
24023451 4596 if (dev->flags ^ old_flags) {
b6c40d68 4597 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 4598 dev_set_rx_mode(dev);
24023451 4599 }
dad9b335 4600 return 0;
4417da66 4601}
d1b19dff 4602EXPORT_SYMBOL(dev_set_allmulti);
4417da66
PM
4603
4604/*
4605 * Upload unicast and multicast address lists to device and
4606 * configure RX filtering. When the device doesn't support unicast
53ccaae1 4607 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
4608 * are present.
4609 */
4610void __dev_set_rx_mode(struct net_device *dev)
4611{
d314774c
SH
4612 const struct net_device_ops *ops = dev->netdev_ops;
4613
4417da66
PM
4614 /* dev_open will call this function so the list will stay sane. */
4615 if (!(dev->flags&IFF_UP))
4616 return;
4617
4618 if (!netif_device_present(dev))
40b77c94 4619 return;
4417da66 4620
01789349 4621 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4417da66
PM
4622 /* Unicast addresses changes may only happen under the rtnl,
4623 * therefore calling __dev_set_promiscuity here is safe.
4624 */
32e7bfc4 4625 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4417da66 4626 __dev_set_promiscuity(dev, 1);
2d348d1f 4627 dev->uc_promisc = true;
32e7bfc4 4628 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4417da66 4629 __dev_set_promiscuity(dev, -1);
2d348d1f 4630 dev->uc_promisc = false;
4417da66 4631 }
4417da66 4632 }
01789349
JP
4633
4634 if (ops->ndo_set_rx_mode)
4635 ops->ndo_set_rx_mode(dev);
4417da66 4636}
bc433f76 4637EXPORT_SYMBOL(__dev_set_rx_mode);
4417da66
PM
4638
4639void dev_set_rx_mode(struct net_device *dev)
4640{
b9e40857 4641 netif_addr_lock_bh(dev);
4417da66 4642 __dev_set_rx_mode(dev);
b9e40857 4643 netif_addr_unlock_bh(dev);
1da177e4
LT
4644}
4645
f0db275a
SH
4646/**
4647 * dev_get_flags - get flags reported to userspace
4648 * @dev: device
4649 *
4650 * Get the combination of flag bits exported through APIs to userspace.
4651 */
95c96174 4652unsigned int dev_get_flags(const struct net_device *dev)
1da177e4 4653{
95c96174 4654 unsigned int flags;
1da177e4
LT
4655
4656 flags = (dev->flags & ~(IFF_PROMISC |
4657 IFF_ALLMULTI |
b00055aa
SR
4658 IFF_RUNNING |
4659 IFF_LOWER_UP |
4660 IFF_DORMANT)) |
1da177e4
LT
4661 (dev->gflags & (IFF_PROMISC |
4662 IFF_ALLMULTI));
4663
b00055aa
SR
4664 if (netif_running(dev)) {
4665 if (netif_oper_up(dev))
4666 flags |= IFF_RUNNING;
4667 if (netif_carrier_ok(dev))
4668 flags |= IFF_LOWER_UP;
4669 if (netif_dormant(dev))
4670 flags |= IFF_DORMANT;
4671 }
1da177e4
LT
4672
4673 return flags;
4674}
d1b19dff 4675EXPORT_SYMBOL(dev_get_flags);
1da177e4 4676
bd380811 4677int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 4678{
b536db93 4679 unsigned int old_flags = dev->flags;
bd380811 4680 int ret;
1da177e4 4681
24023451
PM
4682 ASSERT_RTNL();
4683
1da177e4
LT
4684 /*
4685 * Set the flags on our device.
4686 */
4687
4688 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4689 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4690 IFF_AUTOMEDIA)) |
4691 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4692 IFF_ALLMULTI));
4693
4694 /*
4695 * Load in the correct multicast list now the flags have changed.
4696 */
4697
b6c40d68
PM
4698 if ((old_flags ^ flags) & IFF_MULTICAST)
4699 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 4700
4417da66 4701 dev_set_rx_mode(dev);
1da177e4
LT
4702
4703 /*
4704 * Have we downed the interface? We handle IFF_UP ourselves
4705 * according to user attempts to set it, rather than blindly
4706 * setting it.
4707 */
4708
4709 ret = 0;
4710 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
bd380811 4711 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4
LT
4712
4713 if (!ret)
4417da66 4714 dev_set_rx_mode(dev);
1da177e4
LT
4715 }
4716
1da177e4 4717 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff
ED
4718 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4719
1da177e4
LT
4720 dev->gflags ^= IFF_PROMISC;
4721 dev_set_promiscuity(dev, inc);
4722 }
4723
4724 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4725 is important. Some (broken) drivers set IFF_PROMISC when
4726 IFF_ALLMULTI is requested, without asking us and without reporting it.
4727 */
4728 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
4729 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4730
1da177e4
LT
4731 dev->gflags ^= IFF_ALLMULTI;
4732 dev_set_allmulti(dev, inc);
4733 }
4734
bd380811
PM
4735 return ret;
4736}
4737
4738void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4739{
4740 unsigned int changes = dev->flags ^ old_flags;
4741
4742 if (changes & IFF_UP) {
4743 if (dev->flags & IFF_UP)
4744 call_netdevice_notifiers(NETDEV_UP, dev);
4745 else
4746 call_netdevice_notifiers(NETDEV_DOWN, dev);
4747 }
4748
4749 if (dev->flags & IFF_UP &&
4750 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4751 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4752}
4753
4754/**
4755 * dev_change_flags - change device settings
4756 * @dev: device
4757 * @flags: device state flags
4758 *
4759 * Change settings on device based state flags. The flags are
4760 * in the userspace exported format.
4761 */
b536db93 4762int dev_change_flags(struct net_device *dev, unsigned int flags)
bd380811 4763{
b536db93
ED
4764 int ret;
4765 unsigned int changes, old_flags = dev->flags;
bd380811
PM
4766
4767 ret = __dev_change_flags(dev, flags);
4768 if (ret < 0)
4769 return ret;
4770
4771 changes = old_flags ^ dev->flags;
7c355f53
TG
4772 if (changes)
4773 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
1da177e4 4774
bd380811 4775 __dev_notify_flags(dev, old_flags);
1da177e4
LT
4776 return ret;
4777}
d1b19dff 4778EXPORT_SYMBOL(dev_change_flags);
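/*
 * Illustrative sketch (not part of the original file): bringing an
 * interface administratively up from kernel code, the same way the
 * SIOCSIFFLAGS ioctl path does.  example_bring_up() is a made-up name.
 */
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev->flags | IFF_UP);
	rtnl_unlock();
	return err;				/* 0, or the error from __dev_open() */
}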
1da177e4 4779
f0db275a
SH
4780/**
4781 * dev_set_mtu - Change maximum transfer unit
4782 * @dev: device
4783 * @new_mtu: new transfer unit
4784 *
4785 * Change the maximum transfer size of the network device.
4786 */
1da177e4
LT
4787int dev_set_mtu(struct net_device *dev, int new_mtu)
4788{
d314774c 4789 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4790 int err;
4791
4792 if (new_mtu == dev->mtu)
4793 return 0;
4794
4795 /* MTU must be positive. */
4796 if (new_mtu < 0)
4797 return -EINVAL;
4798
4799 if (!netif_device_present(dev))
4800 return -ENODEV;
4801
4802 err = 0;
d314774c
SH
4803 if (ops->ndo_change_mtu)
4804 err = ops->ndo_change_mtu(dev, new_mtu);
1da177e4
LT
4805 else
4806 dev->mtu = new_mtu;
d314774c 4807
e3d8fabe 4808 if (!err)
056925ab 4809 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
1da177e4
LT
4810 return err;
4811}
d1b19dff 4812EXPORT_SYMBOL(dev_set_mtu);
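/*
 * Illustrative sketch (not part of the original file): changing the MTU
 * from kernel code.  Like the ioctl path, the caller takes the RTNL lock
 * so the NETDEV_CHANGEMTU notifier runs under it.  example_set_jumbo_mtu()
 * and the 9000-byte value are made-up.
 */
static int example_set_jumbo_mtu(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, 9000);		/* driver may refuse via ndo_change_mtu */
	rtnl_unlock();
	return err;
}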
1da177e4 4813
cbda10fa
VD
4814/**
4815 * dev_set_group - Change group this device belongs to
4816 * @dev: device
4817 * @new_group: group this device should belong to
4818 */
4819void dev_set_group(struct net_device *dev, int new_group)
4820{
4821 dev->group = new_group;
4822}
4823EXPORT_SYMBOL(dev_set_group);
4824
f0db275a
SH
4825/**
4826 * dev_set_mac_address - Change Media Access Control Address
4827 * @dev: device
4828 * @sa: new address
4829 *
4830 * Change the hardware (MAC) address of the device
4831 */
1da177e4
LT
4832int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4833{
d314774c 4834 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4835 int err;
4836
d314774c 4837 if (!ops->ndo_set_mac_address)
1da177e4
LT
4838 return -EOPNOTSUPP;
4839 if (sa->sa_family != dev->type)
4840 return -EINVAL;
4841 if (!netif_device_present(dev))
4842 return -ENODEV;
d314774c 4843 err = ops->ndo_set_mac_address(dev, sa);
f6521516
JP
4844 if (err)
4845 return err;
fbdeca2d 4846 dev->addr_assign_type = NET_ADDR_SET;
f6521516 4847 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
7bf23575 4848 add_device_randomness(dev->dev_addr, dev->addr_len);
f6521516 4849 return 0;
1da177e4 4850}
d1b19dff 4851EXPORT_SYMBOL(dev_set_mac_address);
1da177e4 4852
4bf84c35
JP
4853/**
4854 * dev_change_carrier - Change device carrier
4855 * @dev: device
691b3b7e 4856 * @new_carrier: new value
4bf84c35
JP
4857 *
4858 * Change device carrier
4859 */
4860int dev_change_carrier(struct net_device *dev, bool new_carrier)
4861{
4862 const struct net_device_ops *ops = dev->netdev_ops;
4863
4864 if (!ops->ndo_change_carrier)
4865 return -EOPNOTSUPP;
4866 if (!netif_device_present(dev))
4867 return -ENODEV;
4868 return ops->ndo_change_carrier(dev, new_carrier);
4869}
4870EXPORT_SYMBOL(dev_change_carrier);
4871
1da177e4
LT
4872/**
4873 * dev_new_index - allocate an ifindex
c4ea43c5 4874 * @net: the applicable net namespace
1da177e4
LT
4875 *
4876 * Returns a suitable unique value for a new device interface
4877 * number. The caller must hold the rtnl semaphore or the
4878 * dev_base_lock to be sure it remains unique.
4879 */
881d966b 4880static int dev_new_index(struct net *net)
1da177e4 4881{
aa79e66e 4882 int ifindex = net->ifindex;
1da177e4
LT
4883 for (;;) {
4884 if (++ifindex <= 0)
4885 ifindex = 1;
881d966b 4886 if (!__dev_get_by_index(net, ifindex))
aa79e66e 4887 return net->ifindex = ifindex;
1da177e4
LT
4888 }
4889}
4890
1da177e4 4891/* Delayed registration/unregisteration */
3b5b34fd 4892static LIST_HEAD(net_todo_list);
1da177e4 4893
6f05f629 4894static void net_set_todo(struct net_device *dev)
1da177e4 4895{
1da177e4 4896 list_add_tail(&dev->todo_list, &net_todo_list);
1da177e4
LT
4897}
4898
9b5e383c 4899static void rollback_registered_many(struct list_head *head)
93ee31f1 4900{
e93737b0 4901 struct net_device *dev, *tmp;
9b5e383c 4902
93ee31f1
DL
4903 BUG_ON(dev_boot_phase);
4904 ASSERT_RTNL();
4905
e93737b0 4906 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 4907 /* Some devices call without registering
e93737b0
KK
4908 * for initialization unwind. Remove those
4909 * devices and proceed with the remaining.
9b5e383c
ED
4910 */
4911 if (dev->reg_state == NETREG_UNINITIALIZED) {
7b6cd1ce
JP
4912 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4913 dev->name, dev);
93ee31f1 4914
9b5e383c 4915 WARN_ON(1);
e93737b0
KK
4916 list_del(&dev->unreg_list);
4917 continue;
9b5e383c 4918 }
449f4544 4919 dev->dismantle = true;
9b5e383c 4920 BUG_ON(dev->reg_state != NETREG_REGISTERED);
44345724 4921 }
93ee31f1 4922
44345724
OP
4923 /* If device is running, close it first. */
4924 dev_close_many(head);
93ee31f1 4925
44345724 4926 list_for_each_entry(dev, head, unreg_list) {
9b5e383c
ED
4927 /* And unlink it from device chain. */
4928 unlist_netdevice(dev);
93ee31f1 4929
9b5e383c
ED
4930 dev->reg_state = NETREG_UNREGISTERING;
4931 }
93ee31f1
DL
4932
4933 synchronize_net();
4934
9b5e383c
ED
4935 list_for_each_entry(dev, head, unreg_list) {
4936 /* Shutdown queueing discipline. */
4937 dev_shutdown(dev);
93ee31f1
DL
4938
4939
9b5e383c
ED
4940 /* Notify protocols, that we are about to destroy
4941 this device. They should clean all the things.
4942 */
4943 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 4944
a2835763
PM
4945 if (!dev->rtnl_link_ops ||
4946 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4947 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4948
9b5e383c
ED
4949 /*
4950 * Flush the unicast and multicast chains
4951 */
a748ee24 4952 dev_uc_flush(dev);
22bedad3 4953 dev_mc_flush(dev);
93ee31f1 4954
9b5e383c
ED
4955 if (dev->netdev_ops->ndo_uninit)
4956 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 4957
9ff162a8
JP
4958 /* Notifier chain MUST detach us all upper devices. */
4959 WARN_ON(netdev_has_any_upper_dev(dev));
93ee31f1 4960
9b5e383c
ED
4961 /* Remove entries from kobject tree */
4962 netdev_unregister_kobject(dev);
024e9679
AD
4963#ifdef CONFIG_XPS
4964 /* Remove XPS queueing entries */
4965 netif_reset_xps_queues_gt(dev, 0);
4966#endif
9b5e383c 4967 }
93ee31f1 4968
850a545b 4969 synchronize_net();
395264d5 4970
a5ee1551 4971 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
4972 dev_put(dev);
4973}
4974
4975static void rollback_registered(struct net_device *dev)
4976{
4977 LIST_HEAD(single);
4978
4979 list_add(&dev->unreg_list, &single);
4980 rollback_registered_many(&single);
ceaaec98 4981 list_del(&single);
93ee31f1
DL
4982}
4983
c8f44aff
MM
4984static netdev_features_t netdev_fix_features(struct net_device *dev,
4985 netdev_features_t features)
b63365a2 4986{
57422dc5
MM
4987 /* Fix illegal checksum combinations */
4988 if ((features & NETIF_F_HW_CSUM) &&
4989 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 4990 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
57422dc5
MM
4991 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4992 }
4993
b63365a2 4994 /* TSO requires that SG is present as well. */
ea2d3688 4995 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6f404e44 4996 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
ea2d3688 4997 features &= ~NETIF_F_ALL_TSO;
b63365a2
HX
4998 }
4999
ec5f0615
PS
5000 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5001 !(features & NETIF_F_IP_CSUM)) {
5002 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5003 features &= ~NETIF_F_TSO;
5004 features &= ~NETIF_F_TSO_ECN;
5005 }
5006
5007 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5008 !(features & NETIF_F_IPV6_CSUM)) {
5009 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5010 features &= ~NETIF_F_TSO6;
5011 }
5012
31d8b9e0
BH
5013 /* TSO ECN requires that TSO is present as well. */
5014 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5015 features &= ~NETIF_F_TSO_ECN;
5016
212b573f
MM
5017 /* Software GSO depends on SG. */
5018 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6f404e44 5019 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
212b573f
MM
5020 features &= ~NETIF_F_GSO;
5021 }
5022
acd1130e 5023 /* UFO needs SG and checksumming */
b63365a2 5024 if (features & NETIF_F_UFO) {
79032644
MM
5025 /* maybe split UFO into V4 and V6? */
5026 if (!((features & NETIF_F_GEN_CSUM) ||
5027 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5028 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5029 netdev_dbg(dev,
acd1130e 5030 "Dropping NETIF_F_UFO since no checksum offload features.\n");
b63365a2
HX
5031 features &= ~NETIF_F_UFO;
5032 }
5033
5034 if (!(features & NETIF_F_SG)) {
6f404e44 5035 netdev_dbg(dev,
acd1130e 5036 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
b63365a2
HX
5037 features &= ~NETIF_F_UFO;
5038 }
5039 }
5040
5041 return features;
5042}
b63365a2 5043
6cb6a27c 5044int __netdev_update_features(struct net_device *dev)
5455c699 5045{
c8f44aff 5046 netdev_features_t features;
5455c699
MM
5047 int err = 0;
5048
87267485
MM
5049 ASSERT_RTNL();
5050
5455c699
MM
5051 features = netdev_get_wanted_features(dev);
5052
5053 if (dev->netdev_ops->ndo_fix_features)
5054 features = dev->netdev_ops->ndo_fix_features(dev, features);
5055
5056 /* driver might be less strict about feature dependencies */
5057 features = netdev_fix_features(dev, features);
5058
5059 if (dev->features == features)
6cb6a27c 5060 return 0;
5455c699 5061
c8f44aff
MM
5062 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5063 &dev->features, &features);
5455c699
MM
5064
5065 if (dev->netdev_ops->ndo_set_features)
5066 err = dev->netdev_ops->ndo_set_features(dev, features);
5067
6cb6a27c 5068 if (unlikely(err < 0)) {
5455c699 5069 netdev_err(dev,
c8f44aff
MM
5070 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5071 err, &features, &dev->features);
6cb6a27c
MM
5072 return -1;
5073 }
5074
5075 if (!err)
5076 dev->features = features;
5077
5078 return 1;
5079}
5080
afe12cc8
MM
5081/**
5082 * netdev_update_features - recalculate device features
5083 * @dev: the device to check
5084 *
5085 * Recalculate dev->features set and send notifications if it
5086 * has changed. Should be called after driver or hardware dependent
5087 * conditions might have changed that influence the features.
5088 */
6cb6a27c
MM
5089void netdev_update_features(struct net_device *dev)
5090{
5091 if (__netdev_update_features(dev))
5092 netdev_features_change(dev);
5455c699
MM
5093}
5094EXPORT_SYMBOL(netdev_update_features);
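/*
 * Illustrative sketch (not part of the original file): turning a feature
 * off at runtime much like "ethtool -K ... tso off" does, by dropping it
 * from wanted_features and letting the core recompute dev->features.
 * This assumes NETIF_F_TSO is in dev->hw_features (i.e. user-toggleable);
 * example_drop_tso() is a made-up name.
 */
static void example_drop_tso(struct net_device *dev)
{
	ASSERT_RTNL();				/* __netdev_update_features() asserts this */
	dev->wanted_features &= ~NETIF_F_TSO;
	netdev_update_features(dev);		/* recomputes features, notifies if changed */
}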
5095
afe12cc8
MM
5096/**
5097 * netdev_change_features - recalculate device features
5098 * @dev: the device to check
5099 *
5100 * Recalculate dev->features set and send notifications even
5101 * if they have not changed. Should be called instead of
5102 * netdev_update_features() if also dev->vlan_features might
5103 * have changed to allow the changes to be propagated to stacked
5104 * VLAN devices.
5105 */
5106void netdev_change_features(struct net_device *dev)
5107{
5108 __netdev_update_features(dev);
5109 netdev_features_change(dev);
5110}
5111EXPORT_SYMBOL(netdev_change_features);
5112
fc4a7489
PM
5113/**
5114 * netif_stacked_transfer_operstate - transfer operstate
5115 * @rootdev: the root or lower level device to transfer state from
5116 * @dev: the device to transfer operstate to
5117 *
5118 * Transfer operational state from root to device. This is normally
5119 * called when a stacking relationship exists between the root
5120 * device and the device (a leaf device).
5121 */
5122void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5123 struct net_device *dev)
5124{
5125 if (rootdev->operstate == IF_OPER_DORMANT)
5126 netif_dormant_on(dev);
5127 else
5128 netif_dormant_off(dev);
5129
5130 if (netif_carrier_ok(rootdev)) {
5131 if (!netif_carrier_ok(dev))
5132 netif_carrier_on(dev);
5133 } else {
5134 if (netif_carrier_ok(dev))
5135 netif_carrier_off(dev);
5136 }
5137}
5138EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5139
bf264145 5140#ifdef CONFIG_RPS
1b4bf461
ED
5141static int netif_alloc_rx_queues(struct net_device *dev)
5142{
1b4bf461 5143 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 5144 struct netdev_rx_queue *rx;
1b4bf461 5145
bd25fa7b 5146 BUG_ON(count < 1);
1b4bf461 5147
bd25fa7b 5148 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
62b5942a 5149 if (!rx)
bd25fa7b 5150 return -ENOMEM;
62b5942a 5151
bd25fa7b
TH
5152 dev->_rx = rx;
5153
bd25fa7b 5154 for (i = 0; i < count; i++)
fe822240 5155 rx[i].dev = dev;
1b4bf461
ED
5156 return 0;
5157}
bf264145 5158#endif
1b4bf461 5159
aa942104
CG
5160static void netdev_init_one_queue(struct net_device *dev,
5161 struct netdev_queue *queue, void *_unused)
5162{
5163 /* Initialize queue lock */
5164 spin_lock_init(&queue->_xmit_lock);
5165 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5166 queue->xmit_lock_owner = -1;
b236da69 5167 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
aa942104 5168 queue->dev = dev;
114cf580
TH
5169#ifdef CONFIG_BQL
5170 dql_init(&queue->dql, HZ);
5171#endif
aa942104
CG
5172}
5173
e6484930
TH
5174static int netif_alloc_netdev_queues(struct net_device *dev)
5175{
5176 unsigned int count = dev->num_tx_queues;
5177 struct netdev_queue *tx;
5178
5179 BUG_ON(count < 1);
5180
5181 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
62b5942a 5182 if (!tx)
e6484930 5183 return -ENOMEM;
62b5942a 5184
e6484930 5185 dev->_tx = tx;
1d24eb48 5186
e6484930
TH
5187 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5188 spin_lock_init(&dev->tx_global_lock);
aa942104
CG
5189
5190 return 0;
e6484930
TH
5191}
5192
1da177e4
LT
5193/**
5194 * register_netdevice - register a network device
5195 * @dev: device to register
5196 *
5197 * Take a completed network device structure and add it to the kernel
5198 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5199 * chain. 0 is returned on success. A negative errno code is returned
5200 * on a failure to set up the device, or if the name is a duplicate.
5201 *
5202 * Callers must hold the rtnl semaphore. You may want
5203 * register_netdev() instead of this.
5204 *
5205 * BUGS:
5206 * The locking appears insufficient to guarantee two parallel registers
5207 * will not get the same name.
5208 */
5209
5210int register_netdevice(struct net_device *dev)
5211{
1da177e4 5212 int ret;
d314774c 5213 struct net *net = dev_net(dev);
1da177e4
LT
5214
5215 BUG_ON(dev_boot_phase);
5216 ASSERT_RTNL();
5217
b17a7c17
SH
5218 might_sleep();
5219
1da177e4
LT
5220 /* When net_device's are persistent, this will be fatal. */
5221 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 5222 BUG_ON(!net);
1da177e4 5223
f1f28aa3 5224 spin_lock_init(&dev->addr_list_lock);
cf508b12 5225 netdev_set_addr_lockdep_class(dev);
1da177e4 5226
1da177e4
LT
5227 dev->iflink = -1;
5228
828de4f6 5229 ret = dev_get_valid_name(net, dev, dev->name);
0696c3a8
PP
5230 if (ret < 0)
5231 goto out;
5232
1da177e4 5233 /* Init, if this function is available */
d314774c
SH
5234 if (dev->netdev_ops->ndo_init) {
5235 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
5236 if (ret) {
5237 if (ret > 0)
5238 ret = -EIO;
90833aa4 5239 goto out;
1da177e4
LT
5240 }
5241 }
4ec93edb 5242
f646968f
PM
5243 if (((dev->hw_features | dev->features) &
5244 NETIF_F_HW_VLAN_CTAG_FILTER) &&
d2ed273d
MM
5245 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5246 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5247 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5248 ret = -EINVAL;
5249 goto err_uninit;
5250 }
5251
9c7dafbf
PE
5252 ret = -EBUSY;
5253 if (!dev->ifindex)
5254 dev->ifindex = dev_new_index(net);
5255 else if (__dev_get_by_index(net, dev->ifindex))
5256 goto err_uninit;
5257
1da177e4
LT
5258 if (dev->iflink == -1)
5259 dev->iflink = dev->ifindex;
5260
5455c699
MM
5261 /* Transfer changeable features to wanted_features and enable
5262 * software offloads (GSO and GRO).
5263 */
5264 dev->hw_features |= NETIF_F_SOFT_FEATURES;
14d1232f
MM
5265 dev->features |= NETIF_F_SOFT_FEATURES;
5266 dev->wanted_features = dev->features & dev->hw_features;
1da177e4 5267
c6e1a0d1 5268 /* Turn on no cache copy if HW is doing checksum */
34324dc2
MM
5269 if (!(dev->flags & IFF_LOOPBACK)) {
5270 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5271 if (dev->features & NETIF_F_ALL_CSUM) {
5272 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5273 dev->features |= NETIF_F_NOCACHE_COPY;
5274 }
c6e1a0d1
TH
5275 }
5276
1180e7d6 5277 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
16c3ea78 5278 */
1180e7d6 5279 dev->vlan_features |= NETIF_F_HIGHDMA;
16c3ea78 5280
ee579677
PS
5281 /* Make NETIF_F_SG inheritable to tunnel devices.
5282 */
5283 dev->hw_enc_features |= NETIF_F_SG;
5284
7ffbe3fd
JB
5285 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5286 ret = notifier_to_errno(ret);
5287 if (ret)
5288 goto err_uninit;
5289
8b41d188 5290 ret = netdev_register_kobject(dev);
b17a7c17 5291 if (ret)
7ce1b0ed 5292 goto err_uninit;
b17a7c17
SH
5293 dev->reg_state = NETREG_REGISTERED;
5294
6cb6a27c 5295 __netdev_update_features(dev);
8e9b59b2 5296
1da177e4
LT
5297 /*
5298 * Default initial state at registry is that the
5299 * device is present.
5300 */
5301
5302 set_bit(__LINK_STATE_PRESENT, &dev->state);
5303
8f4cccbb
BH
5304 linkwatch_init_dev(dev);
5305
1da177e4 5306 dev_init_scheduler(dev);
1da177e4 5307 dev_hold(dev);
ce286d32 5308 list_netdevice(dev);
7bf23575 5309 add_device_randomness(dev->dev_addr, dev->addr_len);
1da177e4 5310
948b337e
JP
5311 /* If the device has a permanent device address, the driver should
5312 * set dev_addr, and addr_assign_type should be set to
5313 * NET_ADDR_PERM (the default value).
5314 */
5315 if (dev->addr_assign_type == NET_ADDR_PERM)
5316 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5317
1da177e4 5318 /* Notify protocols that a new device appeared. */
056925ab 5319 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 5320 ret = notifier_to_errno(ret);
93ee31f1
DL
5321 if (ret) {
5322 rollback_registered(dev);
5323 dev->reg_state = NETREG_UNREGISTERED;
5324 }
d90a909e
EB
5325 /*
5326 * Prevent userspace races by waiting until the network
5327 * device is fully setup before sending notifications.
5328 */
a2835763
PM
5329 if (!dev->rtnl_link_ops ||
5330 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5331 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
1da177e4
LT
5332
5333out:
5334 return ret;
7ce1b0ed
HX
5335
5336err_uninit:
d314774c
SH
5337 if (dev->netdev_ops->ndo_uninit)
5338 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 5339 goto out;
1da177e4 5340}
d1b19dff 5341EXPORT_SYMBOL(register_netdevice);
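/*
 * Editor's example, not part of dev.c: a hedged sketch of calling
 * register_netdevice() directly under the RTNL semaphore, the way
 * rtnl_link style code does. ether_setup() is the stock Ethernet
 * initializer; the "my_*" name is hypothetical.
 */
static int my_create_and_register(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_netdev(0, "myif%d", ether_setup);
	if (!dev)
		return -ENOMEM;

	rtnl_lock();
	err = register_netdevice(dev);	/* RTNL must be held here */
	rtnl_unlock();

	if (err)
		free_netdev(dev);	/* registration failed, just free it */
	return err;
}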
1da177e4 5342
937f1ba5
BH
5343/**
5344 * init_dummy_netdev - init a dummy network device for NAPI
5345 * @dev: device to init
5346 *
5347 * This takes a network device structure and initializes the minimum
5348 * number of fields so it can be used to schedule NAPI polls without
5349 * registering a full blown interface. This is to be used by drivers
5350 * that need to tie several hardware interfaces to a single NAPI
5351 * poll scheduler due to HW limitations.
5352 */
5353int init_dummy_netdev(struct net_device *dev)
5354{
5355 /* Clear everything. Note we don't initialize spinlocks
5356 * as they aren't supposed to be taken by any of the
5357 * NAPI code and this dummy netdev is supposed to be
5358 * only ever used for NAPI polls
5359 */
5360 memset(dev, 0, sizeof(struct net_device));
5361
5362 /* make sure we BUG if trying to hit standard
5363 * register/unregister code path
5364 */
5365 dev->reg_state = NETREG_DUMMY;
5366
937f1ba5
BH
5367 /* NAPI wants this */
5368 INIT_LIST_HEAD(&dev->napi_list);
5369
5370 /* a dummy interface is started by default */
5371 set_bit(__LINK_STATE_PRESENT, &dev->state);
5372 set_bit(__LINK_STATE_START, &dev->state);
5373
29b4433d
ED
5374 /* Note: We don't allocate pcpu_refcnt for dummy devices,
5375 * because users of this 'device' don't need to change
5376 * its refcount.
5377 */
5378
937f1ba5
BH
5379 return 0;
5380}
5381EXPORT_SYMBOL_GPL(init_dummy_netdev);
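/*
 * Editor's example, not part of dev.c: a hedged sketch of the intended
 * use of init_dummy_netdev() - a driver with one NAPI context shared by
 * several hardware interfaces. struct my_adapter and my_poll() are
 * hypothetical.
 */
struct my_adapter {
	struct net_device napi_dev;	/* dummy device, never registered */
	struct napi_struct napi;
};

static int my_poll(struct napi_struct *napi, int budget)
{
	/* hypothetical: process up to @budget packets, then complete */
	napi_complete(napi);
	return 0;
}

static void my_adapter_napi_init(struct my_adapter *ad)
{
	init_dummy_netdev(&ad->napi_dev);
	netif_napi_add(&ad->napi_dev, &ad->napi, my_poll, 64);
	napi_enable(&ad->napi);
}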
5382
5383
1da177e4
LT
5384/**
5385 * register_netdev - register a network device
5386 * @dev: device to register
5387 *
5388 * Take a completed network device structure and add it to the kernel
5389 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5390 * chain. 0 is returned on success. A negative errno code is returned
5391 * on a failure to set up the device, or if the name is a duplicate.
5392 *
38b4da38 5393 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
5394 * and expands the device name if you passed a format string to
5395 * alloc_netdev.
5396 */
5397int register_netdev(struct net_device *dev)
5398{
5399 int err;
5400
5401 rtnl_lock();
1da177e4 5402 err = register_netdevice(dev);
1da177e4
LT
5403 rtnl_unlock();
5404 return err;
5405}
5406EXPORT_SYMBOL(register_netdev);
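/*
 * Editor's example, not part of dev.c: the usual driver probe pattern
 * around register_netdev(). The "eth%d" name chosen by alloc_etherdev()
 * is expanded during registration. struct my_priv and my_hw_init() are
 * hypothetical placeholders for driver state and hardware setup.
 */
struct my_priv {
	int carrier;
};

static int my_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct my_priv));
	if (!dev)
		return -ENOMEM;

	err = my_hw_init(dev);		/* hypothetical hardware setup */
	if (err)
		goto err_free;

	err = register_netdev(dev);	/* takes and releases RTNL itself */
	if (err)
		goto err_free;

	return 0;

err_free:
	free_netdev(dev);
	return err;
}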
5407
29b4433d
ED
5408int netdev_refcnt_read(const struct net_device *dev)
5409{
5410 int i, refcnt = 0;
5411
5412 for_each_possible_cpu(i)
5413 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5414 return refcnt;
5415}
5416EXPORT_SYMBOL(netdev_refcnt_read);
5417
2c53040f 5418/**
1da177e4 5419 * netdev_wait_allrefs - wait until all references are gone.
3de7a37b 5420 * @dev: target net_device
1da177e4
LT
5421 *
5422 * This is called when unregistering network devices.
5423 *
5424 * Any protocol or device that holds a reference should register
5425 * for netdevice notification, and clean up and put back the
5426 * reference if they receive an UNREGISTER event.
5427 * We can get stuck here if buggy protocols don't correctly
4ec93edb 5428 * call dev_put.
1da177e4
LT
5429 */
5430static void netdev_wait_allrefs(struct net_device *dev)
5431{
5432 unsigned long rebroadcast_time, warning_time;
29b4433d 5433 int refcnt;
1da177e4 5434
e014debe
ED
5435 linkwatch_forget_dev(dev);
5436
1da177e4 5437 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
5438 refcnt = netdev_refcnt_read(dev);
5439
5440 while (refcnt != 0) {
1da177e4 5441 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 5442 rtnl_lock();
1da177e4
LT
5443
5444 /* Rebroadcast unregister notification */
056925ab 5445 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
1da177e4 5446
748e2d93 5447 __rtnl_unlock();
0115e8e3 5448 rcu_barrier();
748e2d93
ED
5449 rtnl_lock();
5450
0115e8e3 5451 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
1da177e4
LT
5452 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5453 &dev->state)) {
5454 /* We must not have linkwatch events
5455 * pending on unregister. If this
5456 * happens, we simply run the queue
5457 * unscheduled, resulting in a noop
5458 * for this device.
5459 */
5460 linkwatch_run_queue();
5461 }
5462
6756ae4b 5463 __rtnl_unlock();
1da177e4
LT
5464
5465 rebroadcast_time = jiffies;
5466 }
5467
5468 msleep(250);
5469
29b4433d
ED
5470 refcnt = netdev_refcnt_read(dev);
5471
1da177e4 5472 if (time_after(jiffies, warning_time + 10 * HZ)) {
7b6cd1ce
JP
5473 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5474 dev->name, refcnt);
1da177e4
LT
5475 warning_time = jiffies;
5476 }
5477 }
5478}
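/*
 * Editor's example, not part of dev.c: the reference protocol that lets
 * netdev_wait_allrefs() finish. A subsystem that took dev_hold() should
 * drop its reference from a netdevice notifier on NETDEV_UNREGISTER.
 * On this kernel the notifier payload is the net_device itself;
 * my_held_dev is a hypothetical cached pointer.
 */
static struct net_device *my_held_dev;

static int my_netdev_event(struct notifier_block *nb,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (event == NETDEV_UNREGISTER && dev == my_held_dev) {
		dev_put(my_held_dev);		/* give the reference back */
		my_held_dev = NULL;
	}
	return NOTIFY_DONE;
}

static struct notifier_block my_netdev_nb = {
	.notifier_call = my_netdev_event,
};
/* registered elsewhere with register_netdevice_notifier(&my_netdev_nb) */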
5479
5480/* The sequence is:
5481 *
5482 * rtnl_lock();
5483 * ...
5484 * register_netdevice(x1);
5485 * register_netdevice(x2);
5486 * ...
5487 * unregister_netdevice(y1);
5488 * unregister_netdevice(y2);
5489 * ...
5490 * rtnl_unlock();
5491 * free_netdev(y1);
5492 * free_netdev(y2);
5493 *
58ec3b4d 5494 * We are invoked by rtnl_unlock().
1da177e4 5495 * This allows us to deal with problems:
b17a7c17 5496 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
5497 * without deadlocking with linkwatch via keventd.
5498 * 2) Since we run with the RTNL semaphore not held, we can sleep
5499 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
5500 *
5501 * We must not return until all unregister events added during
5502 * the interval the lock was held have been completed.
1da177e4 5503 */
1da177e4
LT
5504void netdev_run_todo(void)
5505{
626ab0e6 5506 struct list_head list;
1da177e4 5507
1da177e4 5508 /* Snapshot list, allow later requests */
626ab0e6 5509 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
5510
5511 __rtnl_unlock();
626ab0e6 5512
0115e8e3
ED
5513
5514 /* Wait for rcu callbacks to finish before next phase */
850a545b
EB
5515 if (!list_empty(&list))
5516 rcu_barrier();
5517
1da177e4
LT
5518 while (!list_empty(&list)) {
5519 struct net_device *dev
e5e26d75 5520 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
5521 list_del(&dev->todo_list);
5522
748e2d93 5523 rtnl_lock();
0115e8e3 5524 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
748e2d93 5525 __rtnl_unlock();
0115e8e3 5526
b17a7c17 5527 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7b6cd1ce 5528 pr_err("network todo '%s' but state %d\n",
b17a7c17
SH
5529 dev->name, dev->reg_state);
5530 dump_stack();
5531 continue;
5532 }
1da177e4 5533
b17a7c17 5534 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 5535
152102c7 5536 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 5537
b17a7c17 5538 netdev_wait_allrefs(dev);
1da177e4 5539
b17a7c17 5540 /* paranoia */
29b4433d 5541 BUG_ON(netdev_refcnt_read(dev));
33d480ce
ED
5542 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5543 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
547b792c 5544 WARN_ON(dev->dn_ptr);
1da177e4 5545
b17a7c17
SH
5546 if (dev->destructor)
5547 dev->destructor(dev);
9093bbb2
SH
5548
5549 /* Free network device */
5550 kobject_put(&dev->dev.kobj);
1da177e4 5551 }
1da177e4
LT
5552}
5553
3cfde79c
BH
5554/* Convert net_device_stats to rtnl_link_stats64. They have the same
5555 * fields in the same order, with only the type differing.
5556 */
77a1abf5
ED
5557void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5558 const struct net_device_stats *netdev_stats)
3cfde79c
BH
5559{
5560#if BITS_PER_LONG == 64
77a1abf5
ED
5561 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5562 memcpy(stats64, netdev_stats, sizeof(*stats64));
3cfde79c
BH
5563#else
5564 size_t i, n = sizeof(*stats64) / sizeof(u64);
5565 const unsigned long *src = (const unsigned long *)netdev_stats;
5566 u64 *dst = (u64 *)stats64;
5567
5568 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5569 sizeof(*stats64) / sizeof(u64));
5570 for (i = 0; i < n; i++)
5571 dst[i] = src[i];
5572#endif
5573}
77a1abf5 5574EXPORT_SYMBOL(netdev_stats_to_stats64);
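/*
 * Editor's example, not part of dev.c: a hedged sketch of a driver that
 * keeps its counters in dev->stats (struct net_device_stats) but still
 * implements ndo_get_stats64 by converting with netdev_stats_to_stats64()
 * before patching in any genuinely 64-bit counters of its own.
 */
static struct rtnl_link_stats64 *my_get_stats64(struct net_device *dev,
						struct rtnl_link_stats64 *storage)
{
	netdev_stats_to_stats64(storage, &dev->stats);
	/* hypothetical: overwrite one field from a driver-private
	 * 64-bit counter here, e.g. storage->tx_bytes = ...;
	 */
	return storage;
}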
3cfde79c 5575
eeda3fd6
SH
5576/**
5577 * dev_get_stats - get network device statistics
5578 * @dev: device to get statistics from
28172739 5579 * @storage: place to store stats
eeda3fd6 5580 *
d7753516
BH
5581 * Get network statistics from device. Return @storage.
5582 * The device driver may provide its own method by setting
5583 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5584 * otherwise the internal statistics structure is used.
eeda3fd6 5585 */
d7753516
BH
5586struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5587 struct rtnl_link_stats64 *storage)
7004bf25 5588{
eeda3fd6
SH
5589 const struct net_device_ops *ops = dev->netdev_ops;
5590
28172739
ED
5591 if (ops->ndo_get_stats64) {
5592 memset(storage, 0, sizeof(*storage));
caf586e5
ED
5593 ops->ndo_get_stats64(dev, storage);
5594 } else if (ops->ndo_get_stats) {
3cfde79c 5595 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
caf586e5
ED
5596 } else {
5597 netdev_stats_to_stats64(storage, &dev->stats);
28172739 5598 }
caf586e5 5599 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
28172739 5600 return storage;
c45d286e 5601}
eeda3fd6 5602EXPORT_SYMBOL(dev_get_stats);
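/*
 * Editor's example, not part of dev.c: reading the 64-bit totals of a
 * device from a context that holds RTNL or rcu_read_lock(), as the
 * in-tree rtnetlink and procfs callers do. The stats structure lives on
 * the stack; dev_get_stats() fills it and returns it.
 */
static void my_print_stats(struct net_device *dev)
{
	struct rtnl_link_stats64 stats;

	dev_get_stats(dev, &stats);
	netdev_info(dev, "rx %llu / tx %llu packets, %llu rx dropped\n",
		    (unsigned long long)stats.rx_packets,
		    (unsigned long long)stats.tx_packets,
		    (unsigned long long)stats.rx_dropped);
}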
c45d286e 5603
24824a09 5604struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
dc2b4847 5605{
24824a09 5606 struct netdev_queue *queue = dev_ingress_queue(dev);
dc2b4847 5607
24824a09
ED
5608#ifdef CONFIG_NET_CLS_ACT
5609 if (queue)
5610 return queue;
5611 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5612 if (!queue)
5613 return NULL;
5614 netdev_init_one_queue(dev, queue, NULL);
24824a09
ED
5615 queue->qdisc = &noop_qdisc;
5616 queue->qdisc_sleeping = &noop_qdisc;
5617 rcu_assign_pointer(dev->ingress_queue, queue);
5618#endif
5619 return queue;
bb949fbd
DM
5620}
5621
2c60db03
ED
5622static const struct ethtool_ops default_ethtool_ops;
5623
d07d7507
SG
5624void netdev_set_default_ethtool_ops(struct net_device *dev,
5625 const struct ethtool_ops *ops)
5626{
5627 if (dev->ethtool_ops == &default_ethtool_ops)
5628 dev->ethtool_ops = ops;
5629}
5630EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
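/*
 * Editor's example, not part of dev.c: how a mid-layer framework might
 * install fallback ethtool ops with netdev_set_default_ethtool_ops()
 * before registration; the call is a no-op if the low-level driver
 * already assigned its own dev->ethtool_ops. my_framework_ethtool_ops
 * is hypothetical, ethtool_op_get_link() is the stock helper.
 */
static const struct ethtool_ops my_framework_ethtool_ops = {
	.get_link	= ethtool_op_get_link,
};

static void my_framework_setup(struct net_device *dev)
{
	netdev_set_default_ethtool_ops(dev, &my_framework_ethtool_ops);
}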
5631
1da177e4 5632/**
36909ea4 5633 * alloc_netdev_mqs - allocate network device
1da177e4
LT
5634 * @sizeof_priv: size of private data to allocate space for
5635 * @name: device name format string
5636 * @setup: callback to initialize device
36909ea4
TH
5637 * @txqs: the number of TX subqueues to allocate
5638 * @rxqs: the number of RX subqueues to allocate
1da177e4
LT
5639 *
5640 * Allocates a struct net_device with private data area for driver use
f25f4e44 5641 * and performs basic initialization. Also allocates subqueue structs
36909ea4 5642 * for each queue on the device.
1da177e4 5643 */
36909ea4
TH
5644struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5645 void (*setup)(struct net_device *),
5646 unsigned int txqs, unsigned int rxqs)
1da177e4 5647{
1da177e4 5648 struct net_device *dev;
7943986c 5649 size_t alloc_size;
1ce8e7b5 5650 struct net_device *p;
1da177e4 5651
b6fe17d6
SH
5652 BUG_ON(strlen(name) >= sizeof(dev->name));
5653
36909ea4 5654 if (txqs < 1) {
7b6cd1ce 5655 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
55513fb4
TH
5656 return NULL;
5657 }
5658
36909ea4
TH
5659#ifdef CONFIG_RPS
5660 if (rxqs < 1) {
7b6cd1ce 5661 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
36909ea4
TH
5662 return NULL;
5663 }
5664#endif
5665
fd2ea0a7 5666 alloc_size = sizeof(struct net_device);
d1643d24
AD
5667 if (sizeof_priv) {
5668 /* ensure 32-byte alignment of private area */
1ce8e7b5 5669 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
5670 alloc_size += sizeof_priv;
5671 }
5672 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 5673 alloc_size += NETDEV_ALIGN - 1;
1da177e4 5674
31380de9 5675 p = kzalloc(alloc_size, GFP_KERNEL);
62b5942a 5676 if (!p)
1da177e4 5677 return NULL;
1da177e4 5678
1ce8e7b5 5679 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 5680 dev->padded = (char *)dev - (char *)p;
ab9c73cc 5681
29b4433d
ED
5682 dev->pcpu_refcnt = alloc_percpu(int);
5683 if (!dev->pcpu_refcnt)
e6484930 5684 goto free_p;
ab9c73cc 5685
ab9c73cc 5686 if (dev_addr_init(dev))
29b4433d 5687 goto free_pcpu;
ab9c73cc 5688
22bedad3 5689 dev_mc_init(dev);
a748ee24 5690 dev_uc_init(dev);
ccffad25 5691
c346dca1 5692 dev_net_set(dev, &init_net);
1da177e4 5693
8d3bdbd5 5694 dev->gso_max_size = GSO_MAX_SIZE;
30b678d8 5695 dev->gso_max_segs = GSO_MAX_SEGS;
8d3bdbd5 5696
8d3bdbd5
DM
5697 INIT_LIST_HEAD(&dev->napi_list);
5698 INIT_LIST_HEAD(&dev->unreg_list);
5699 INIT_LIST_HEAD(&dev->link_watch_list);
9ff162a8 5700 INIT_LIST_HEAD(&dev->upper_dev_list);
8d3bdbd5
DM
5701 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5702 setup(dev);
5703
36909ea4
TH
5704 dev->num_tx_queues = txqs;
5705 dev->real_num_tx_queues = txqs;
ed9af2e8 5706 if (netif_alloc_netdev_queues(dev))
8d3bdbd5 5707 goto free_all;
e8a0464c 5708
df334545 5709#ifdef CONFIG_RPS
36909ea4
TH
5710 dev->num_rx_queues = rxqs;
5711 dev->real_num_rx_queues = rxqs;
fe822240 5712 if (netif_alloc_rx_queues(dev))
8d3bdbd5 5713 goto free_all;
df334545 5714#endif
0a9627f2 5715
1da177e4 5716 strcpy(dev->name, name);
cbda10fa 5717 dev->group = INIT_NETDEV_GROUP;
2c60db03
ED
5718 if (!dev->ethtool_ops)
5719 dev->ethtool_ops = &default_ethtool_ops;
1da177e4 5720 return dev;
ab9c73cc 5721
8d3bdbd5
DM
5722free_all:
5723 free_netdev(dev);
5724 return NULL;
5725
29b4433d
ED
5726free_pcpu:
5727 free_percpu(dev->pcpu_refcnt);
ed9af2e8 5728 kfree(dev->_tx);
fe822240
TH
5729#ifdef CONFIG_RPS
5730 kfree(dev->_rx);
5731#endif
5732
ab9c73cc
JP
5733free_p:
5734 kfree(p);
5735 return NULL;
1da177e4 5736}
36909ea4 5737EXPORT_SYMBOL(alloc_netdev_mqs);
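/*
 * Editor's example, not part of dev.c: allocating a multiqueue device
 * directly with alloc_netdev_mqs(). Four TX and four RX queues are
 * requested up front; ether_setup() is the stock initializer and
 * struct my_mq_priv is hypothetical.
 */
struct my_mq_priv {
	spinlock_t lock;
};

static struct net_device *my_alloc_mq_dev(void)
{
	struct my_mq_priv *priv;
	struct net_device *dev;

	dev = alloc_netdev_mqs(sizeof(struct my_mq_priv), "mymq%d",
			       ether_setup, 4, 4);
	if (!dev)
		return NULL;

	priv = netdev_priv(dev);
	spin_lock_init(&priv->lock);
	return dev;
}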
1da177e4
LT
5738
5739/**
5740 * free_netdev - free network device
5741 * @dev: device
5742 *
4ec93edb
YH
5743 * This function does the last stage of destroying an allocated device
5744 * interface. The reference to the device object is released.
1da177e4
LT
5745 * If this is the last reference then it will be freed.
5746 */
5747void free_netdev(struct net_device *dev)
5748{
d565b0a1
HX
5749 struct napi_struct *p, *n;
5750
f3005d7f
DL
5751 release_net(dev_net(dev));
5752
e8a0464c 5753 kfree(dev->_tx);
fe822240
TH
5754#ifdef CONFIG_RPS
5755 kfree(dev->_rx);
5756#endif
e8a0464c 5757
33d480ce 5758 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
24824a09 5759
f001fde5
JP
5760 /* Flush device addresses */
5761 dev_addr_flush(dev);
5762
d565b0a1
HX
5763 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5764 netif_napi_del(p);
5765
29b4433d
ED
5766 free_percpu(dev->pcpu_refcnt);
5767 dev->pcpu_refcnt = NULL;
5768
3041a069 5769 /* Compatibility with error handling in drivers */
1da177e4
LT
5770 if (dev->reg_state == NETREG_UNINITIALIZED) {
5771 kfree((char *)dev - dev->padded);
5772 return;
5773 }
5774
5775 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5776 dev->reg_state = NETREG_RELEASED;
5777
43cb76d9
GKH
5778 /* will free via device release */
5779 put_device(&dev->dev);
1da177e4 5780}
d1b19dff 5781EXPORT_SYMBOL(free_netdev);
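/*
 * Editor's example, not part of dev.c: the teardown ordering free_netdev()
 * expects. A registered device must be unregistered first; calling
 * free_netdev() on a device that was never registered (reg_state still
 * NETREG_UNINITIALIZED) is the one exception and simply frees the memory.
 */
static void my_remove(struct net_device *dev)
{
	unregister_netdev(dev);	/* waits until all references are gone */
	free_netdev(dev);	/* reg_state is now NETREG_UNREGISTERED */
}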
4ec93edb 5782
f0db275a
SH
5783/**
5784 * synchronize_net - Synchronize with packet receive processing
5785 *
5786 * Wait for packets currently being received to be done.
5787 * Does not block later packets from starting.
5788 */
4ec93edb 5789void synchronize_net(void)
1da177e4
LT
5790{
5791 might_sleep();
be3fc413
ED
5792 if (rtnl_is_locked())
5793 synchronize_rcu_expedited();
5794 else
5795 synchronize_rcu();
1da177e4 5796}
d1b19dff 5797EXPORT_SYMBOL(synchronize_net);
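/*
 * Editor's example, not part of dev.c: the classic use of
 * synchronize_net() - unpublish an RCU-protected hook, wait for packets
 * already inside the receive path to drain, then free the old state.
 * my_hook and struct my_state are hypothetical.
 */
struct my_state;
static struct my_state __rcu *my_hook;

static void my_hook_remove(struct my_state *old)
{
	RCU_INIT_POINTER(my_hook, NULL);
	synchronize_net();	/* no receiver can still see @old */
	kfree(old);
}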
1da177e4
LT
5798
5799/**
44a0873d 5800 * unregister_netdevice_queue - remove device from the kernel
1da177e4 5801 * @dev: device
44a0873d 5802 * @head: list
6ebfbc06 5803 *
1da177e4 5804 * This function shuts down a device interface and removes it
d59b54b1 5805 * from the kernel tables.
44a0873d 5806 * If head not NULL, device is queued to be unregistered later.
1da177e4
LT
5807 *
5808 * Callers must hold the rtnl semaphore. You may want
5809 * unregister_netdev() instead of this.
5810 */
5811
44a0873d 5812void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 5813{
a6620712
HX
5814 ASSERT_RTNL();
5815
44a0873d 5816 if (head) {
9fdce099 5817 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
5818 } else {
5819 rollback_registered(dev);
5820 /* Finish processing unregister after unlock */
5821 net_set_todo(dev);
5822 }
1da177e4 5823}
44a0873d 5824EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 5825
9b5e383c
ED
5826/**
5827 * unregister_netdevice_many - unregister many devices
5828 * @head: list of devices
6a827d8a
ED
5829 *
5830 * Note: As most callers use a stack allocated list_head,
5831 * we force a list_del() to make sure the stack won't be corrupted later.
9b5e383c
ED
5832 */
5833void unregister_netdevice_many(struct list_head *head)
5834{
5835 struct net_device *dev;
5836
5837 if (!list_empty(head)) {
5838 rollback_registered_many(head);
5839 list_for_each_entry(dev, head, unreg_list)
5840 net_set_todo(dev);
6a827d8a 5841 list_del(head);
9b5e383c
ED
5842 }
5843}
63c8099d 5844EXPORT_SYMBOL(unregister_netdevice_many);
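/*
 * Editor's example, not part of dev.c: batching several unregisters under
 * a single RTNL hold, the same pattern default_device_exit_batch() below
 * uses. The device array is hypothetical.
 */
static void my_remove_all(struct net_device *devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);	/* also empties kill_list */
	rtnl_unlock();
}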
9b5e383c 5845
1da177e4
LT
5846/**
5847 * unregister_netdev - remove device from the kernel
5848 * @dev: device
5849 *
5850 * This function shuts down a device interface and removes it
d59b54b1 5851 * from the kernel tables.
1da177e4
LT
5852 *
5853 * This is just a wrapper for unregister_netdevice that takes
5854 * the rtnl semaphore. In general you want to use this and not
5855 * unregister_netdevice.
5856 */
5857void unregister_netdev(struct net_device *dev)
5858{
5859 rtnl_lock();
5860 unregister_netdevice(dev);
5861 rtnl_unlock();
5862}
1da177e4
LT
5863EXPORT_SYMBOL(unregister_netdev);
5864
ce286d32
EB
5865/**
5866 * dev_change_net_namespace - move device to a different network namespace
5867 * @dev: device
5868 * @net: network namespace
5869 * @pat: If not NULL name pattern to try if the current device name
5870 * is already taken in the destination network namespace.
5871 *
5872 * This function shuts down a device interface and moves it
5873 * to a new network namespace. On success 0 is returned, on
5874 * a failure a negative errno code is returned.
5875 *
5876 * Callers must hold the rtnl semaphore.
5877 */
5878
5879int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5880{
ce286d32
EB
5881 int err;
5882
5883 ASSERT_RTNL();
5884
5885 /* Don't allow namespace local devices to be moved. */
5886 err = -EINVAL;
5887 if (dev->features & NETIF_F_NETNS_LOCAL)
5888 goto out;
5889
5890 /* Ensure the device has been registered */
ce286d32
EB
5891 if (dev->reg_state != NETREG_REGISTERED)
5892 goto out;
5893
5894 /* Get out if there is nothing to do */
5895 err = 0;
878628fb 5896 if (net_eq(dev_net(dev), net))
ce286d32
EB
5897 goto out;
5898
5899 /* Pick the destination device name, and ensure
5900 * we can use it in the destination network namespace.
5901 */
5902 err = -EEXIST;
d9031024 5903 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
5904 /* We get here if we can't use the current device name */
5905 if (!pat)
5906 goto out;
828de4f6 5907 if (dev_get_valid_name(net, dev, pat) < 0)
ce286d32
EB
5908 goto out;
5909 }
5910
5911 /*
5912 * And now a mini version of register_netdevice and unregister_netdevice.
5913 */
5914
5915 /* If device is running close it first. */
9b772652 5916 dev_close(dev);
ce286d32
EB
5917
5918 /* And unlink it from device chain */
5919 err = -ENODEV;
5920 unlist_netdevice(dev);
5921
5922 synchronize_net();
5923
5924 /* Shutdown queueing discipline. */
5925 dev_shutdown(dev);
5926
5927 /* Notify protocols that we are about to destroy
5928 this device. They should clean up all of their state.
3b27e105
DL
5929
5930 Note that dev->reg_state stays at NETREG_REGISTERED.
5931 This is wanted because this way 8021q and macvlan know
5932 the device is just moving and can keep their slaves up.
ce286d32
EB
5933 */
5934 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6549dd43
G
5935 rcu_barrier();
5936 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
d2237d35 5937 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
ce286d32
EB
5938
5939 /*
5940 * Flush the unicast and multicast chains
5941 */
a748ee24 5942 dev_uc_flush(dev);
22bedad3 5943 dev_mc_flush(dev);
ce286d32 5944
4e66ae2e
SH
5945 /* Send a netdev-removed uevent to the old namespace */
5946 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
5947
ce286d32 5948 /* Actually switch the network namespace */
c346dca1 5949 dev_net_set(dev, net);
ce286d32 5950
ce286d32
EB
5951 /* If there is an ifindex conflict assign a new one */
5952 if (__dev_get_by_index(net, dev->ifindex)) {
5953 int iflink = (dev->iflink == dev->ifindex);
5954 dev->ifindex = dev_new_index(net);
5955 if (iflink)
5956 dev->iflink = dev->ifindex;
5957 }
5958
4e66ae2e
SH
5959 /* Send a netdev-add uevent to the new namespace */
5960 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
5961
8b41d188 5962 /* Fixup kobjects */
a1b3f594 5963 err = device_rename(&dev->dev, dev->name);
8b41d188 5964 WARN_ON(err);
ce286d32
EB
5965
5966 /* Add the device back in the hashes */
5967 list_netdevice(dev);
5968
5969 /* Notify protocols that a new device appeared. */
5970 call_netdevice_notifiers(NETDEV_REGISTER, dev);
5971
d90a909e
EB
5972 /*
5973 * Prevent userspace races by waiting until the network
5974 * device is fully setup before sending notifications.
5975 */
5976 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5977
ce286d32
EB
5978 synchronize_net();
5979 err = 0;
5980out:
5981 return err;
5982}
463d0183 5983EXPORT_SYMBOL_GPL(dev_change_net_namespace);
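/*
 * Editor's example, not part of dev.c: a hedged sketch of moving a device
 * into another namespace, roughly what the rtnetlink IFLA_NET_NS_PID path
 * does. Obtaining and releasing @dest_net is assumed to happen elsewhere
 * (e.g. via get_net_ns_by_pid()); the "dev%d" pattern is only used when
 * the current name clashes in the destination namespace.
 */
static int my_move_dev(struct net_device *dev, struct net *dest_net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, dest_net, "dev%d");
	rtnl_unlock();
	return err;
}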
ce286d32 5984
1da177e4
LT
5985static int dev_cpu_callback(struct notifier_block *nfb,
5986 unsigned long action,
5987 void *ocpu)
5988{
5989 struct sk_buff **list_skb;
1da177e4
LT
5990 struct sk_buff *skb;
5991 unsigned int cpu, oldcpu = (unsigned long)ocpu;
5992 struct softnet_data *sd, *oldsd;
5993
8bb78442 5994 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
5995 return NOTIFY_OK;
5996
5997 local_irq_disable();
5998 cpu = smp_processor_id();
5999 sd = &per_cpu(softnet_data, cpu);
6000 oldsd = &per_cpu(softnet_data, oldcpu);
6001
6002 /* Find end of our completion_queue. */
6003 list_skb = &sd->completion_queue;
6004 while (*list_skb)
6005 list_skb = &(*list_skb)->next;
6006 /* Append completion queue from offline CPU. */
6007 *list_skb = oldsd->completion_queue;
6008 oldsd->completion_queue = NULL;
6009
1da177e4 6010 /* Append output queue from offline CPU. */
a9cbd588
CG
6011 if (oldsd->output_queue) {
6012 *sd->output_queue_tailp = oldsd->output_queue;
6013 sd->output_queue_tailp = oldsd->output_queue_tailp;
6014 oldsd->output_queue = NULL;
6015 oldsd->output_queue_tailp = &oldsd->output_queue;
6016 }
06b5ff9f
ED
6017 /* Append NAPI poll list from offline CPU, with one exception:
6018 * process_backlog() must be called by cpu owning percpu backlog.
6019 * We properly handle process_queue & input_pkt_queue later.
6020 */
6021 while (!list_empty(&oldsd->poll_list)) {
6022 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
6023 struct napi_struct,
6024 poll_list);
6025
6026 list_del_init(&napi->poll_list);
6027 if (napi->poll == process_backlog)
6028 napi->state = 0;
6029 else
6030 ____napi_schedule(sd, napi);
264524d5 6031 }
1da177e4
LT
6032
6033 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6034 local_irq_enable();
6035
6036 /* Process offline CPU's input_pkt_queue */
76cc8b13 6037 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
1da177e4 6038 netif_rx(skb);
76cc8b13 6039 input_queue_head_incr(oldsd);
fec5e652 6040 }
06b5ff9f 6041 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
6e7676c1 6042 netif_rx(skb);
76cc8b13
TH
6043 input_queue_head_incr(oldsd);
6044 }
1da177e4
LT
6045
6046 return NOTIFY_OK;
6047}
1da177e4
LT
6048
6049
7f353bf2 6050/**
b63365a2
HX
6051 * netdev_increment_features - increment feature set by one
6052 * @all: current feature set
6053 * @one: new feature set
6054 * @mask: mask feature set
7f353bf2
HX
6055 *
6056 * Computes a new feature set after adding a device with feature set
b63365a2
HX
6057 * @one to the master device with current feature set @all. Will not
6058 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 6059 */
c8f44aff
MM
6060netdev_features_t netdev_increment_features(netdev_features_t all,
6061 netdev_features_t one, netdev_features_t mask)
b63365a2 6062{
1742f183
MM
6063 if (mask & NETIF_F_GEN_CSUM)
6064 mask |= NETIF_F_ALL_CSUM;
6065 mask |= NETIF_F_VLAN_CHALLENGED;
7f353bf2 6066
1742f183
MM
6067 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6068 all &= one | ~NETIF_F_ALL_FOR_ALL;
c6e1a0d1 6069
1742f183
MM
6070 /* If one device supports hw checksumming, set for all. */
6071 if (all & NETIF_F_GEN_CSUM)
6072 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7f353bf2
HX
6073
6074 return all;
6075}
b63365a2 6076EXPORT_SYMBOL(netdev_increment_features);
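/*
 * Editor's example, not part of dev.c: how a master device (bonding/team
 * style) could fold the feature sets of its lower devices together with
 * netdev_increment_features(). The accumulation starts from the mask;
 * whether a given bit is ANDed or ORed across devices is decided inside
 * the helper via NETIF_F_ALL_FOR_ALL / NETIF_F_ONE_FOR_ALL. The slave
 * array is hypothetical.
 */
static netdev_features_t my_master_features(struct net_device *slaves[],
					    int n)
{
	netdev_features_t mask = NETIF_F_SG | NETIF_F_HIGHDMA |
				 NETIF_F_ALL_CSUM;
	netdev_features_t all = mask;
	int i;

	for (i = 0; i < n; i++)
		all = netdev_increment_features(all, slaves[i]->features,
						mask);
	return all;
}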
7f353bf2 6077
30d97d35
PE
6078static struct hlist_head *netdev_create_hash(void)
6079{
6080 int i;
6081 struct hlist_head *hash;
6082
6083 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6084 if (hash != NULL)
6085 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6086 INIT_HLIST_HEAD(&hash[i]);
6087
6088 return hash;
6089}
6090
881d966b 6091/* Initialize per network namespace state */
4665079c 6092static int __net_init netdev_init(struct net *net)
881d966b 6093{
734b6541
RM
6094 if (net != &init_net)
6095 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 6096
30d97d35
PE
6097 net->dev_name_head = netdev_create_hash();
6098 if (net->dev_name_head == NULL)
6099 goto err_name;
881d966b 6100
30d97d35
PE
6101 net->dev_index_head = netdev_create_hash();
6102 if (net->dev_index_head == NULL)
6103 goto err_idx;
881d966b
EB
6104
6105 return 0;
30d97d35
PE
6106
6107err_idx:
6108 kfree(net->dev_name_head);
6109err_name:
6110 return -ENOMEM;
881d966b
EB
6111}
6112
f0db275a
SH
6113/**
6114 * netdev_drivername - network driver for the device
6115 * @dev: network device
f0db275a
SH
6116 *
6117 * Determine network driver for device.
6118 */
3019de12 6119const char *netdev_drivername(const struct net_device *dev)
6579e57b 6120{
cf04a4c7
SH
6121 const struct device_driver *driver;
6122 const struct device *parent;
3019de12 6123 const char *empty = "";
6579e57b
AV
6124
6125 parent = dev->dev.parent;
6579e57b 6126 if (!parent)
3019de12 6127 return empty;
6579e57b
AV
6128
6129 driver = parent->driver;
6130 if (driver && driver->name)
3019de12
DM
6131 return driver->name;
6132 return empty;
6579e57b
AV
6133}
6134
b004ff49 6135static int __netdev_printk(const char *level, const struct net_device *dev,
256df2f3
JP
6136 struct va_format *vaf)
6137{
6138 int r;
6139
b004ff49 6140 if (dev && dev->dev.parent) {
666f355f
JP
6141 r = dev_printk_emit(level[1] - '0',
6142 dev->dev.parent,
6143 "%s %s %s: %pV",
6144 dev_driver_string(dev->dev.parent),
6145 dev_name(dev->dev.parent),
6146 netdev_name(dev), vaf);
b004ff49 6147 } else if (dev) {
256df2f3 6148 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
b004ff49 6149 } else {
256df2f3 6150 r = printk("%s(NULL net_device): %pV", level, vaf);
b004ff49 6151 }
256df2f3
JP
6152
6153 return r;
6154}
6155
6156int netdev_printk(const char *level, const struct net_device *dev,
6157 const char *format, ...)
6158{
6159 struct va_format vaf;
6160 va_list args;
6161 int r;
6162
6163 va_start(args, format);
6164
6165 vaf.fmt = format;
6166 vaf.va = &args;
6167
6168 r = __netdev_printk(level, dev, &vaf);
b004ff49 6169
256df2f3
JP
6170 va_end(args);
6171
6172 return r;
6173}
6174EXPORT_SYMBOL(netdev_printk);
6175
6176#define define_netdev_printk_level(func, level) \
6177int func(const struct net_device *dev, const char *fmt, ...) \
6178{ \
6179 int r; \
6180 struct va_format vaf; \
6181 va_list args; \
6182 \
6183 va_start(args, fmt); \
6184 \
6185 vaf.fmt = fmt; \
6186 vaf.va = &args; \
6187 \
6188 r = __netdev_printk(level, dev, &vaf); \
b004ff49 6189 \
256df2f3
JP
6190 va_end(args); \
6191 \
6192 return r; \
6193} \
6194EXPORT_SYMBOL(func);
6195
6196define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6197define_netdev_printk_level(netdev_alert, KERN_ALERT);
6198define_netdev_printk_level(netdev_crit, KERN_CRIT);
6199define_netdev_printk_level(netdev_err, KERN_ERR);
6200define_netdev_printk_level(netdev_warn, KERN_WARNING);
6201define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6202define_netdev_printk_level(netdev_info, KERN_INFO);
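/*
 * Editor's example, not part of dev.c: typical use of the per-level
 * helpers generated above. When the device has a parent (a PCI or USB
 * function, say) the bus/driver/device prefix is added automatically by
 * __netdev_printk(). my_link_report() is hypothetical.
 */
static void my_link_report(struct net_device *dev, bool up, unsigned int mbps)
{
	if (up)
		netdev_info(dev, "link up, %u Mb/s\n", mbps);
	else
		netdev_warn(dev, "link down\n");
}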
6203
4665079c 6204static void __net_exit netdev_exit(struct net *net)
881d966b
EB
6205{
6206 kfree(net->dev_name_head);
6207 kfree(net->dev_index_head);
6208}
6209
022cbae6 6210static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
6211 .init = netdev_init,
6212 .exit = netdev_exit,
6213};
6214
4665079c 6215static void __net_exit default_device_exit(struct net *net)
ce286d32 6216{
e008b5fc 6217 struct net_device *dev, *aux;
ce286d32 6218 /*
e008b5fc 6219 * Push all migratable network devices back to the
ce286d32
EB
6220 * initial network namespace
6221 */
6222 rtnl_lock();
e008b5fc 6223 for_each_netdev_safe(net, dev, aux) {
ce286d32 6224 int err;
aca51397 6225 char fb_name[IFNAMSIZ];
ce286d32
EB
6226
6227 /* Ignore unmoveable devices (i.e. loopback) */
6228 if (dev->features & NETIF_F_NETNS_LOCAL)
6229 continue;
6230
e008b5fc
EB
6231 /* Leave virtual devices for the generic cleanup */
6232 if (dev->rtnl_link_ops)
6233 continue;
d0c082ce 6234
25985edc 6235 /* Push remaining network devices to init_net */
aca51397
PE
6236 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6237 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 6238 if (err) {
7b6cd1ce
JP
6239 pr_emerg("%s: failed to move %s to init_net: %d\n",
6240 __func__, dev->name, err);
aca51397 6241 BUG();
ce286d32
EB
6242 }
6243 }
6244 rtnl_unlock();
6245}
6246
04dc7f6b
EB
6247static void __net_exit default_device_exit_batch(struct list_head *net_list)
6248{
6249 /* At exit all network devices must be removed from a network
b595076a 6250 * namespace. Do this in the reverse order of registration.
04dc7f6b
EB
6251 * Do this across as many network namespaces as possible to
6252 * improve batching efficiency.
6253 */
6254 struct net_device *dev;
6255 struct net *net;
6256 LIST_HEAD(dev_kill_list);
6257
6258 rtnl_lock();
6259 list_for_each_entry(net, net_list, exit_list) {
6260 for_each_netdev_reverse(net, dev) {
6261 if (dev->rtnl_link_ops)
6262 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6263 else
6264 unregister_netdevice_queue(dev, &dev_kill_list);
6265 }
6266 }
6267 unregister_netdevice_many(&dev_kill_list);
6268 rtnl_unlock();
6269}
6270
022cbae6 6271static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 6272 .exit = default_device_exit,
04dc7f6b 6273 .exit_batch = default_device_exit_batch,
ce286d32
EB
6274};
6275
1da177e4
LT
6276/*
6277 * Initialize the DEV module. At boot time this walks the device list and
6278 * unhooks any devices that fail to initialise (normally hardware not
6279 * present) and leaves us with a valid list of present and active devices.
6280 *
6281 */
6282
6283/*
6284 * This is called single threaded during boot, so no need
6285 * to take the rtnl semaphore.
6286 */
6287static int __init net_dev_init(void)
6288{
6289 int i, rc = -ENOMEM;
6290
6291 BUG_ON(!dev_boot_phase);
6292
1da177e4
LT
6293 if (dev_proc_init())
6294 goto out;
6295
8b41d188 6296 if (netdev_kobject_init())
1da177e4
LT
6297 goto out;
6298
6299 INIT_LIST_HEAD(&ptype_all);
82d8a867 6300 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
6301 INIT_LIST_HEAD(&ptype_base[i]);
6302
62532da9
VY
6303 INIT_LIST_HEAD(&offload_base);
6304
881d966b
EB
6305 if (register_pernet_subsys(&netdev_net_ops))
6306 goto out;
1da177e4
LT
6307
6308 /*
6309 * Initialise the packet receive queues.
6310 */
6311
6f912042 6312 for_each_possible_cpu(i) {
e36fa2f7 6313 struct softnet_data *sd = &per_cpu(softnet_data, i);
1da177e4 6314
dee42870 6315 memset(sd, 0, sizeof(*sd));
e36fa2f7 6316 skb_queue_head_init(&sd->input_pkt_queue);
6e7676c1 6317 skb_queue_head_init(&sd->process_queue);
e36fa2f7
ED
6318 sd->completion_queue = NULL;
6319 INIT_LIST_HEAD(&sd->poll_list);
a9cbd588
CG
6320 sd->output_queue = NULL;
6321 sd->output_queue_tailp = &sd->output_queue;
df334545 6322#ifdef CONFIG_RPS
e36fa2f7
ED
6323 sd->csd.func = rps_trigger_softirq;
6324 sd->csd.info = sd;
6325 sd->csd.flags = 0;
6326 sd->cpu = i;
1e94d72f 6327#endif
0a9627f2 6328
e36fa2f7
ED
6329 sd->backlog.poll = process_backlog;
6330 sd->backlog.weight = weight_p;
6331 sd->backlog.gro_list = NULL;
6332 sd->backlog.gro_count = 0;
1da177e4
LT
6333 }
6334
1da177e4
LT
6335 dev_boot_phase = 0;
6336
505d4f73
EB
6337 /* The loopback device is special: if any other network device
6338 * is present in a network namespace, the loopback device must
6339 * be present. Since we now dynamically allocate and free the
6340 * loopback device, ensure this invariant is maintained by
6341 * keeping the loopback device as the first device on the
6342 * list of network devices. This ensures the loopback device
6343 * is the first device that appears and the last network device
6344 * that disappears.
6345 */
6346 if (register_pernet_device(&loopback_net_ops))
6347 goto out;
6348
6349 if (register_pernet_device(&default_device_ops))
6350 goto out;
6351
962cf36c
CM
6352 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6353 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
6354
6355 hotcpu_notifier(dev_cpu_callback, 0);
6356 dst_init();
1da177e4
LT
6357 rc = 0;
6358out:
6359 return rc;
6360}
6361
6362subsys_initcall(net_dev_init);