net/core/dev.c
1/*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75#include <asm/uaccess.h>
76#include <linux/bitops.h>
77#include <linux/capability.h>
78#include <linux/cpu.h>
79#include <linux/types.h>
80#include <linux/kernel.h>
81#include <linux/hash.h>
82#include <linux/slab.h>
83#include <linux/sched.h>
84#include <linux/mutex.h>
85#include <linux/string.h>
86#include <linux/mm.h>
87#include <linux/socket.h>
88#include <linux/sockios.h>
89#include <linux/errno.h>
90#include <linux/interrupt.h>
91#include <linux/if_ether.h>
92#include <linux/netdevice.h>
93#include <linux/etherdevice.h>
94#include <linux/ethtool.h>
95#include <linux/notifier.h>
96#include <linux/skbuff.h>
97#include <net/net_namespace.h>
98#include <net/sock.h>
99#include <linux/rtnetlink.h>
100#include <linux/proc_fs.h>
101#include <linux/seq_file.h>
102#include <linux/stat.h>
103#include <net/dst.h>
104#include <net/pkt_sched.h>
105#include <net/checksum.h>
106#include <net/xfrm.h>
107#include <linux/highmem.h>
108#include <linux/init.h>
109#include <linux/kmod.h>
110#include <linux/module.h>
111#include <linux/netpoll.h>
112#include <linux/rcupdate.h>
113#include <linux/delay.h>
114#include <net/wext.h>
115#include <net/iw_handler.h>
116#include <asm/current.h>
117#include <linux/audit.h>
118#include <linux/dmaengine.h>
119#include <linux/err.h>
120#include <linux/ctype.h>
121#include <linux/if_arp.h>
122#include <linux/if_vlan.h>
123#include <linux/ip.h>
124#include <net/ip.h>
125#include <linux/ipv6.h>
126#include <linux/in.h>
127#include <linux/jhash.h>
128#include <linux/random.h>
129#include <trace/events/napi.h>
130#include <trace/events/net.h>
131#include <trace/events/skb.h>
132#include <linux/pci.h>
133#include <linux/inetdevice.h>
134#include <linux/cpu_rmap.h>
135#include <linux/net_tstamp.h>
136#include <linux/static_key.h>
137#include <net/flow_keys.h>
138
139#include "net-sysfs.h"
140
141/* Instead of increasing this, you should create a hash table. */
142#define MAX_GRO_SKBS 8
143
144/* This should be increased if a protocol with a bigger head is added. */
145#define GRO_MAX_HEAD (MAX_HEADER + 128)
146
147/*
148 * The list of packet types we will receive (as opposed to discard)
149 * and the routines to invoke.
150 *
151 * Why 16. Because with 16 the only overlap we get on a hash of the
152 * low nibble of the protocol value is RARP/SNAP/X.25.
153 *
154 * NOTE: That is no longer true with the addition of VLAN tags. Not
155 * sure which should go first, but I bet it won't make much
156 * difference if we are running VLANs. The good news is that
157 * this protocol won't be in the list unless compiled in, so
158 * the average user (w/out VLANs) will not be adversely affected.
159 * --BLG
160 *
161 *		0800	IP
162 *		8100	802.1Q VLAN
163 *		0001	802.3
164 *		0002	AX.25
165 *		0004	802.2
166 *		8035	RARP
167 *		0005	SNAP
168 *		0805	X.25
169 *		0806	ARP
170 *		8137	IPX
171 *		0009	Localtalk
172 *		86DD	IPv6
173 */
174
175#define PTYPE_HASH_SIZE	(16)
176#define PTYPE_HASH_MASK	(PTYPE_HASH_SIZE - 1)
177
178static DEFINE_SPINLOCK(ptype_lock);
179static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
180static struct list_head ptype_all __read_mostly;	/* Taps */
181
182/*
183 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
184 * semaphore.
185 *
186 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
187 *
188 * Writers must hold the rtnl semaphore while they loop through the
189 * dev_base_head list, and hold dev_base_lock for writing when they do the
190 * actual updates. This allows pure readers to access the list even
191 * while a writer is preparing to update it.
192 *
193 * To put it another way, dev_base_lock is held for writing only to
194 * protect against pure readers; the rtnl semaphore provides the
195 * protection against other writers.
196 *
197 * See, for example usages, register_netdevice() and
198 * unregister_netdevice(), which must be called with the rtnl
199 * semaphore held.
200 */
201DEFINE_RWLOCK(dev_base_lock);
202EXPORT_SYMBOL(dev_base_lock);
203
204static inline void dev_base_seq_inc(struct net *net)
205{
206	while (++net->dev_base_seq == 0);
207}
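/*
 * Editor's note: illustrative sketch, not part of the original file. It
 * shows the "pure reader" pattern described in the dev_base_lock comment
 * above: take dev_base_lock for reading (or rcu_read_lock() with the _rcu
 * iterators) while walking the device list. example_count_devices is a
 * hypothetical helper name.
 */
#if 0
static int example_count_devices(struct net *net)
{
	struct net_device *dev;
	int count = 0;

	read_lock(&dev_base_lock);
	for_each_netdev(net, dev)
		count++;
	read_unlock(&dev_base_lock);

	return count;
}
#endif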
208
209static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
210{
211	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
212	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
213}
214
215static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
216{
217	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
218}
219
220static inline void rps_lock(struct softnet_data *sd)
221{
222#ifdef CONFIG_RPS
223	spin_lock(&sd->input_pkt_queue.lock);
224#endif
225}
226
227static inline void rps_unlock(struct softnet_data *sd)
228{
229#ifdef CONFIG_RPS
230	spin_unlock(&sd->input_pkt_queue.lock);
231#endif
232}
233
234/* Device list insertion */
235static int list_netdevice(struct net_device *dev)
236{
237	struct net *net = dev_net(dev);
238
239	ASSERT_RTNL();
240
241	write_lock_bh(&dev_base_lock);
242	list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
243	hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
244	hlist_add_head_rcu(&dev->index_hlist,
245			   dev_index_hash(net, dev->ifindex));
246	write_unlock_bh(&dev_base_lock);
247
248	dev_base_seq_inc(net);
249
250	return 0;
251}
252
253/* Device list removal
254 * caller must respect a RCU grace period before freeing/reusing dev
255 */
256static void unlist_netdevice(struct net_device *dev)
257{
258	ASSERT_RTNL();
259
260	/* Unlink dev from the device chain */
261	write_lock_bh(&dev_base_lock);
262	list_del_rcu(&dev->dev_list);
263	hlist_del_rcu(&dev->name_hlist);
264	hlist_del_rcu(&dev->index_hlist);
265	write_unlock_bh(&dev_base_lock);
266
267	dev_base_seq_inc(dev_net(dev));
268}
269
270/*
271 *	Our notifier list
272 */
273
274static RAW_NOTIFIER_HEAD(netdev_chain);
275
276/*
277 *	Device drivers call our routines to queue packets here. We empty the
278 *	queue in the local softnet handler.
279 */
280
281DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
282EXPORT_PER_CPU_SYMBOL(softnet_data);
283
cf508b12 284#ifdef CONFIG_LOCKDEP
723e98b7 285/*
c773e847 286 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
287 * according to dev->type
288 */
289static const unsigned short netdev_lock_type[] =
290 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
291 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
292 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
293 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
294 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
295 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
296 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
297 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
298 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
299 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
300 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
301 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
302 ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
2d91d78b 303 ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
929122cd 304 ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
fcb94e42 305 ARPHRD_VOID, ARPHRD_NONE};
723e98b7 306
36cbd3dc 307static const char *const netdev_lock_name[] =
723e98b7
JP
308 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
309 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
310 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
311 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
312 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
313 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
314 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
315 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
316 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
317 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
318 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
319 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
320 "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
2d91d78b 321 "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
929122cd 322 "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
fcb94e42 323 "_xmit_VOID", "_xmit_NONE"};
324
325static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
cf508b12 326static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
327
328static inline unsigned short netdev_lock_pos(unsigned short dev_type)
329{
330 int i;
331
332 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
333 if (netdev_lock_type[i] == dev_type)
334 return i;
335 /* the last key is used by default */
336 return ARRAY_SIZE(netdev_lock_type) - 1;
337}
338
339static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
340 unsigned short dev_type)
341{
342 int i;
343
344 i = netdev_lock_pos(dev_type);
345 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
346 netdev_lock_name[i]);
347}
348
349static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
350{
351 int i;
352
353 i = netdev_lock_pos(dev->type);
354 lockdep_set_class_and_name(&dev->addr_list_lock,
355 &netdev_addr_lock_key[i],
356 netdev_lock_name[i]);
357}
723e98b7 358#else
359static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
360 unsigned short dev_type)
361{
362}
363static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
364{
365}
366#endif
367
368/*******************************************************************************
369
370		Protocol management and registration routines
371
372*******************************************************************************/
373
374/*
375 *	Add a protocol ID to the list. Now that the input handler is
376 *	smarter we can dispense with all the messy stuff that used to be
377 *	here.
378 *
379 *	BEWARE!!! Protocol handlers, mangling input packets,
380 *	MUST BE last in hash buckets and checking protocol handlers
381 *	MUST start from promiscuous ptype_all chain in net_bh.
382 *	It is true now, do not change it.
383 *	Explanation follows: if a protocol handler that mangles packets
384 *	is first on the list, it is not able to sense that the packet
385 *	is cloned and should be copied-on-write, so it will
386 *	change it and subsequent readers will get a broken packet.
387 *							--ANK (980803)
388 */
389
390static inline struct list_head *ptype_head(const struct packet_type *pt)
391{
392	if (pt->type == htons(ETH_P_ALL))
393		return &ptype_all;
394	else
395		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
396}
397
398/**
399 *	dev_add_pack - add packet handler
400 *	@pt: packet type declaration
401 *
402 *	Add a protocol handler to the networking stack. The passed &packet_type
403 *	is linked into kernel lists and may not be freed until it has been
404 *	removed from the kernel lists.
405 *
406 *	This call does not sleep, therefore it cannot
407 *	guarantee that all CPUs that are in the middle of receiving packets
408 *	will see the new packet type (until the next received packet).
409 */
410
411void dev_add_pack(struct packet_type *pt)
412{
413	struct list_head *head = ptype_head(pt);
414
415	spin_lock(&ptype_lock);
416	list_add_rcu(&pt->list, head);
417	spin_unlock(&ptype_lock);
418}
419EXPORT_SYMBOL(dev_add_pack);
420
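/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * It shows how a protocol module typically pairs dev_add_pack() with
 * dev_remove_pack(); example_rcv and example_packet_type are hypothetical
 * names, and ETH_P_802_2 is just a stand-in protocol number.
 */
#if 0
static int example_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	/* A real handler would process the skb; here we simply drop it. */
	kfree_skb(skb);
	return NET_RX_SUCCESS;
}

static struct packet_type example_packet_type __read_mostly = {
	.type = cpu_to_be16(ETH_P_802_2),
	.func = example_rcv,
};

static int __init example_init(void)
{
	dev_add_pack(&example_packet_type);
	return 0;
}
module_init(example_init);

static void __exit example_exit(void)
{
	/* dev_remove_pack() sleeps in synchronize_net(), so only call it
	 * from process context. */
	dev_remove_pack(&example_packet_type);
}
module_exit(example_exit);
#endif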
1da177e4
LT
421/**
422 * __dev_remove_pack - remove packet handler
423 * @pt: packet type declaration
424 *
425 * Remove a protocol handler that was previously added to the kernel
426 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
427 * from the kernel lists and can be freed or reused once this function
4ec93edb 428 * returns.
1da177e4
LT
429 *
430 * The packet type might still be in use by receivers
431 * and must not be freed until after all the CPU's have gone
432 * through a quiescent state.
433 */
434void __dev_remove_pack(struct packet_type *pt)
435{
c07b68e8 436 struct list_head *head = ptype_head(pt);
1da177e4
LT
437 struct packet_type *pt1;
438
c07b68e8 439 spin_lock(&ptype_lock);
1da177e4
LT
440
441 list_for_each_entry(pt1, head, list) {
442 if (pt == pt1) {
443 list_del_rcu(&pt->list);
444 goto out;
445 }
446 }
447
7b6cd1ce 448 pr_warn("dev_remove_pack: %p not found\n", pt);
1da177e4 449out:
c07b68e8 450 spin_unlock(&ptype_lock);
1da177e4 451}
d1b19dff
ED
452EXPORT_SYMBOL(__dev_remove_pack);
453
1da177e4
LT
454/**
455 * dev_remove_pack - remove packet handler
456 * @pt: packet type declaration
457 *
458 * Remove a protocol handler that was previously added to the kernel
459 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
460 * from the kernel lists and can be freed or reused once this function
461 * returns.
462 *
463 * This call sleeps to guarantee that no CPU is looking at the packet
464 * type after return.
465 */
466void dev_remove_pack(struct packet_type *pt)
467{
468 __dev_remove_pack(pt);
4ec93edb 469
1da177e4
LT
470 synchronize_net();
471}
d1b19dff 472EXPORT_SYMBOL(dev_remove_pack);
1da177e4
LT
473
474/******************************************************************************
475
476 Device Boot-time Settings Routines
477
478*******************************************************************************/
479
480/* Boot time configuration table */
481static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
482
483/**
484 * netdev_boot_setup_add - add new setup entry
485 * @name: name of the device
486 * @map: configured settings for the device
487 *
488 * Adds new setup entry to the dev_boot_setup list. The function
489 * returns 0 on error and 1 on success. This is a generic routine to
490 * all netdevices.
491 */
492static int netdev_boot_setup_add(char *name, struct ifmap *map)
493{
494 struct netdev_boot_setup *s;
495 int i;
496
497 s = dev_boot_setup;
498 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
499 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
500 memset(s[i].name, 0, sizeof(s[i].name));
93b3cff9 501 strlcpy(s[i].name, name, IFNAMSIZ);
1da177e4
LT
502 memcpy(&s[i].map, map, sizeof(s[i].map));
503 break;
504 }
505 }
506
507 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
508}
509
510/**
511 * netdev_boot_setup_check - check boot time settings
512 * @dev: the netdevice
513 *
514 * Check boot time settings for the device.
515 * The found settings are set for the device to be used
516 * later in the device probing.
517 * Returns 0 if no settings found, 1 if they are.
518 */
519int netdev_boot_setup_check(struct net_device *dev)
520{
521 struct netdev_boot_setup *s = dev_boot_setup;
522 int i;
523
524 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
525 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
93b3cff9 526 !strcmp(dev->name, s[i].name)) {
1da177e4
LT
527 dev->irq = s[i].map.irq;
528 dev->base_addr = s[i].map.base_addr;
529 dev->mem_start = s[i].map.mem_start;
530 dev->mem_end = s[i].map.mem_end;
531 return 1;
532 }
533 }
534 return 0;
535}
d1b19dff 536EXPORT_SYMBOL(netdev_boot_setup_check);
1da177e4
LT
537
538
539/**
540 * netdev_boot_base - get address from boot time settings
541 * @prefix: prefix for network device
542 * @unit: id for network device
543 *
544 * Check boot time settings for the base address of device.
545 * The found settings are set for the device to be used
546 * later in the device probing.
547 * Returns 0 if no settings found.
548 */
549unsigned long netdev_boot_base(const char *prefix, int unit)
550{
551 const struct netdev_boot_setup *s = dev_boot_setup;
552 char name[IFNAMSIZ];
553 int i;
554
555 sprintf(name, "%s%d", prefix, unit);
556
557 /*
558 * If device already registered then return base of 1
559 * to indicate not to probe for this interface
560 */
881d966b 561 if (__dev_get_by_name(&init_net, name))
1da177e4
LT
562 return 1;
563
564 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
565 if (!strcmp(name, s[i].name))
566 return s[i].map.base_addr;
567 return 0;
568}
569
570/*
571 * Saves at boot time configured settings for any netdevice.
572 */
573int __init netdev_boot_setup(char *str)
574{
575 int ints[5];
576 struct ifmap map;
577
578 str = get_options(str, ARRAY_SIZE(ints), ints);
579 if (!str || !*str)
580 return 0;
581
582 /* Save settings */
583 memset(&map, 0, sizeof(map));
584 if (ints[0] > 0)
585 map.irq = ints[1];
586 if (ints[0] > 1)
587 map.base_addr = ints[2];
588 if (ints[0] > 2)
589 map.mem_start = ints[3];
590 if (ints[0] > 3)
591 map.mem_end = ints[4];
592
593 /* Add new entry to the list */
594 return netdev_boot_setup_add(str, &map);
595}
596
597__setup("netdev=", netdev_boot_setup);
598
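/*
 * Editor's note: illustrative usage only, not part of the original file.
 * netdev_boot_setup() above parses a kernel command line option of the form
 *
 *	netdev=<irq>,<base_addr>,<mem_start>,<mem_end>,<name>
 *
 * so booting with, e.g., "netdev=5,0x240,0,0,eth0" would pre-set IRQ 5 and
 * I/O base 0x240 for the device that later registers as eth0. The numeric
 * values here are made up for the example.
 */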
599/*******************************************************************************
600
601 Device Interface Subroutines
602
603*******************************************************************************/
604
605/**
606 * __dev_get_by_name - find a device by its name
c4ea43c5 607 * @net: the applicable net namespace
1da177e4
LT
608 * @name: name to find
609 *
610 * Find an interface by name. Must be called under RTNL semaphore
611 * or @dev_base_lock. If the name is found a pointer to the device
612 * is returned. If the name is not found then %NULL is returned. The
613 * reference counters are not incremented so the caller must be
614 * careful with locks.
615 */
616
881d966b 617struct net_device *__dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
618{
619 struct hlist_node *p;
0bd8d536
ED
620 struct net_device *dev;
621 struct hlist_head *head = dev_name_hash(net, name);
1da177e4 622
0bd8d536 623 hlist_for_each_entry(dev, p, head, name_hlist)
1da177e4
LT
624 if (!strncmp(dev->name, name, IFNAMSIZ))
625 return dev;
0bd8d536 626
1da177e4
LT
627 return NULL;
628}
d1b19dff 629EXPORT_SYMBOL(__dev_get_by_name);
1da177e4 630
72c9528b
ED
631/**
632 * dev_get_by_name_rcu - find a device by its name
633 * @net: the applicable net namespace
634 * @name: name to find
635 *
636 * Find an interface by name.
637 * If the name is found a pointer to the device is returned.
638 * If the name is not found then %NULL is returned.
639 * The reference counters are not incremented so the caller must be
640 * careful with locks. The caller must hold RCU lock.
641 */
642
643struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
644{
645 struct hlist_node *p;
646 struct net_device *dev;
647 struct hlist_head *head = dev_name_hash(net, name);
648
649 hlist_for_each_entry_rcu(dev, p, head, name_hlist)
650 if (!strncmp(dev->name, name, IFNAMSIZ))
651 return dev;
652
653 return NULL;
654}
655EXPORT_SYMBOL(dev_get_by_name_rcu);
656
1da177e4
LT
657/**
658 * dev_get_by_name - find a device by its name
c4ea43c5 659 * @net: the applicable net namespace
1da177e4
LT
660 * @name: name to find
661 *
662 * Find an interface by name. This can be called from any
663 * context and does its own locking. The returned handle has
664 * the usage count incremented and the caller must use dev_put() to
665 * release it when it is no longer needed. %NULL is returned if no
666 * matching device is found.
667 */
668
881d966b 669struct net_device *dev_get_by_name(struct net *net, const char *name)
1da177e4
LT
670{
671 struct net_device *dev;
672
72c9528b
ED
673 rcu_read_lock();
674 dev = dev_get_by_name_rcu(net, name);
1da177e4
LT
675 if (dev)
676 dev_hold(dev);
72c9528b 677 rcu_read_unlock();
1da177e4
LT
678 return dev;
679}
d1b19dff 680EXPORT_SYMBOL(dev_get_by_name);
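/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * It shows the hold/put discipline described in the comment above;
 * example_find_mtu and the "eth0" name are hypothetical.
 */
#if 0
static int example_find_mtu(struct net *net)
{
	struct net_device *dev;
	int mtu;

	dev = dev_get_by_name(net, "eth0");
	if (!dev)
		return -ENODEV;

	mtu = dev->mtu;
	dev_put(dev);		/* drop the reference taken above */

	return mtu;
}
#endif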
1da177e4
LT
681
682/**
683 * __dev_get_by_index - find a device by its ifindex
c4ea43c5 684 * @net: the applicable net namespace
1da177e4
LT
685 * @ifindex: index of device
686 *
687 * Search for an interface by index. Returns %NULL if the device
688 * is not found or a pointer to the device. The device has not
689 * had its reference counter increased so the caller must be careful
690 * about locking. The caller must hold either the RTNL semaphore
691 * or @dev_base_lock.
692 */
693
881d966b 694struct net_device *__dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
695{
696 struct hlist_node *p;
0bd8d536
ED
697 struct net_device *dev;
698 struct hlist_head *head = dev_index_hash(net, ifindex);
1da177e4 699
0bd8d536 700 hlist_for_each_entry(dev, p, head, index_hlist)
1da177e4
LT
701 if (dev->ifindex == ifindex)
702 return dev;
0bd8d536 703
1da177e4
LT
704 return NULL;
705}
d1b19dff 706EXPORT_SYMBOL(__dev_get_by_index);
1da177e4 707
fb699dfd
ED
708/**
709 * dev_get_by_index_rcu - find a device by its ifindex
710 * @net: the applicable net namespace
711 * @ifindex: index of device
712 *
713 * Search for an interface by index. Returns %NULL if the device
714 * is not found or a pointer to the device. The device has not
715 * had its reference counter increased so the caller must be careful
716 * about locking. The caller must hold RCU lock.
717 */
718
719struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
720{
721 struct hlist_node *p;
722 struct net_device *dev;
723 struct hlist_head *head = dev_index_hash(net, ifindex);
724
725 hlist_for_each_entry_rcu(dev, p, head, index_hlist)
726 if (dev->ifindex == ifindex)
727 return dev;
728
729 return NULL;
730}
731EXPORT_SYMBOL(dev_get_by_index_rcu);
732
1da177e4
LT
733
734/**
735 * dev_get_by_index - find a device by its ifindex
c4ea43c5 736 * @net: the applicable net namespace
1da177e4
LT
737 * @ifindex: index of device
738 *
739 * Search for an interface by index. Returns NULL if the device
740 * is not found or a pointer to the device. The device returned has
741 * had a reference added and the pointer is safe until the user calls
742 * dev_put to indicate they have finished with it.
743 */
744
881d966b 745struct net_device *dev_get_by_index(struct net *net, int ifindex)
1da177e4
LT
746{
747 struct net_device *dev;
748
fb699dfd
ED
749 rcu_read_lock();
750 dev = dev_get_by_index_rcu(net, ifindex);
1da177e4
LT
751 if (dev)
752 dev_hold(dev);
fb699dfd 753 rcu_read_unlock();
1da177e4
LT
754 return dev;
755}
d1b19dff 756EXPORT_SYMBOL(dev_get_by_index);
1da177e4
LT
757
758/**
941666c2 759 * dev_getbyhwaddr_rcu - find a device by its hardware address
c4ea43c5 760 * @net: the applicable net namespace
1da177e4
LT
761 * @type: media type of device
762 * @ha: hardware address
763 *
764 * Search for an interface by MAC address. Returns NULL if the device
c506653d
ED
765 * is not found or a pointer to the device.
766 * The caller must hold RCU or RTNL.
941666c2 767 * The returned device has not had its ref count increased
1da177e4
LT
768 * and the caller must therefore be careful about locking
769 *
1da177e4
LT
770 */
771
941666c2
ED
772struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
773 const char *ha)
1da177e4
LT
774{
775 struct net_device *dev;
776
941666c2 777 for_each_netdev_rcu(net, dev)
1da177e4
LT
778 if (dev->type == type &&
779 !memcmp(dev->dev_addr, ha, dev->addr_len))
7562f876
PE
780 return dev;
781
782 return NULL;
1da177e4 783}
941666c2 784EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
cf309e3f 785
881d966b 786struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
1da177e4
LT
787{
788 struct net_device *dev;
789
4e9cac2b 790 ASSERT_RTNL();
881d966b 791 for_each_netdev(net, dev)
4e9cac2b 792 if (dev->type == type)
7562f876
PE
793 return dev;
794
795 return NULL;
4e9cac2b 796}
4e9cac2b
PM
797EXPORT_SYMBOL(__dev_getfirstbyhwtype);
798
881d966b 799struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
4e9cac2b 800{
99fe3c39 801 struct net_device *dev, *ret = NULL;
4e9cac2b 802
99fe3c39
ED
803 rcu_read_lock();
804 for_each_netdev_rcu(net, dev)
805 if (dev->type == type) {
806 dev_hold(dev);
807 ret = dev;
808 break;
809 }
810 rcu_read_unlock();
811 return ret;
1da177e4 812}
1da177e4
LT
813EXPORT_SYMBOL(dev_getfirstbyhwtype);
814
815/**
bb69ae04 816 * dev_get_by_flags_rcu - find any device with given flags
c4ea43c5 817 * @net: the applicable net namespace
1da177e4
LT
818 * @if_flags: IFF_* values
819 * @mask: bitmask of bits in if_flags to check
820 *
821 * Search for any interface with the given flags. Returns NULL if a device
bb69ae04
ED
822 * is not found or a pointer to the device. Must be called inside
823 * rcu_read_lock(), and result refcount is unchanged.
1da177e4
LT
824 */
825
bb69ae04 826struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
d1b19dff 827 unsigned short mask)
1da177e4 828{
7562f876 829 struct net_device *dev, *ret;
1da177e4 830
7562f876 831 ret = NULL;
c6d14c84 832 for_each_netdev_rcu(net, dev) {
1da177e4 833 if (((dev->flags ^ if_flags) & mask) == 0) {
7562f876 834 ret = dev;
1da177e4
LT
835 break;
836 }
837 }
7562f876 838 return ret;
1da177e4 839}
bb69ae04 840EXPORT_SYMBOL(dev_get_by_flags_rcu);
1da177e4
LT
841
842/**
843 * dev_valid_name - check if name is okay for network device
844 * @name: name string
845 *
846 * Network device names need to be valid file names to
c7fa9d18
DM
847 * to allow sysfs to work. We also disallow any kind of
848 * whitespace.
1da177e4 849 */
95f050bf 850bool dev_valid_name(const char *name)
1da177e4 851{
c7fa9d18 852 if (*name == '\0')
95f050bf 853 return false;
b6fe17d6 854 if (strlen(name) >= IFNAMSIZ)
95f050bf 855 return false;
c7fa9d18 856 if (!strcmp(name, ".") || !strcmp(name, ".."))
95f050bf 857 return false;
c7fa9d18
DM
858
859 while (*name) {
860 if (*name == '/' || isspace(*name))
95f050bf 861 return false;
c7fa9d18
DM
862 name++;
863 }
95f050bf 864 return true;
1da177e4 865}
d1b19dff 866EXPORT_SYMBOL(dev_valid_name);
1da177e4
LT
867
868/**
b267b179
EB
869 * __dev_alloc_name - allocate a name for a device
870 * @net: network namespace to allocate the device name in
1da177e4 871 * @name: name format string
b267b179 872 * @buf: scratch buffer and result name string
1da177e4
LT
873 *
874 * Passed a format string - eg "lt%d" it will try and find a suitable
3041a069
SH
875 * id. It scans list of devices to build up a free map, then chooses
876 * the first empty slot. The caller must hold the dev_base or rtnl lock
877 * while allocating the name and adding the device in order to avoid
878 * duplicates.
879 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
880 * Returns the number of the unit assigned or a negative errno code.
1da177e4
LT
881 */
882
b267b179 883static int __dev_alloc_name(struct net *net, const char *name, char *buf)
1da177e4
LT
884{
885 int i = 0;
1da177e4
LT
886 const char *p;
887 const int max_netdevices = 8*PAGE_SIZE;
cfcabdcc 888 unsigned long *inuse;
1da177e4
LT
889 struct net_device *d;
890
891 p = strnchr(name, IFNAMSIZ-1, '%');
892 if (p) {
893 /*
894 * Verify the string as this thing may have come from
895 * the user. There must be either one "%d" and no other "%"
896 * characters.
897 */
898 if (p[1] != 'd' || strchr(p + 2, '%'))
899 return -EINVAL;
900
901 /* Use one page as a bit array of possible slots */
cfcabdcc 902 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
1da177e4
LT
903 if (!inuse)
904 return -ENOMEM;
905
881d966b 906 for_each_netdev(net, d) {
1da177e4
LT
907 if (!sscanf(d->name, name, &i))
908 continue;
909 if (i < 0 || i >= max_netdevices)
910 continue;
911
912 /* avoid cases where sscanf is not exact inverse of printf */
b267b179 913 snprintf(buf, IFNAMSIZ, name, i);
1da177e4
LT
914 if (!strncmp(buf, d->name, IFNAMSIZ))
915 set_bit(i, inuse);
916 }
917
918 i = find_first_zero_bit(inuse, max_netdevices);
919 free_page((unsigned long) inuse);
920 }
921
d9031024
OP
922 if (buf != name)
923 snprintf(buf, IFNAMSIZ, name, i);
b267b179 924 if (!__dev_get_by_name(net, buf))
1da177e4 925 return i;
1da177e4
LT
926
927 /* It is possible to run out of possible slots
928 * when the name is long and there isn't enough space left
929 * for the digits, or if all bits are used.
930 */
931 return -ENFILE;
932}
933
b267b179
EB
934/**
935 * dev_alloc_name - allocate a name for a device
936 * @dev: device
937 * @name: name format string
938 *
939 * Passed a format string - eg "lt%d" it will try and find a suitable
940 * id. It scans list of devices to build up a free map, then chooses
941 * the first empty slot. The caller must hold the dev_base or rtnl lock
942 * while allocating the name and adding the device in order to avoid
943 * duplicates.
944 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
945 * Returns the number of the unit assigned or a negative errno code.
946 */
947
948int dev_alloc_name(struct net_device *dev, const char *name)
949{
950 char buf[IFNAMSIZ];
951 struct net *net;
952 int ret;
953
c346dca1
YH
954 BUG_ON(!dev_net(dev));
955 net = dev_net(dev);
b267b179
EB
956 ret = __dev_alloc_name(net, name, buf);
957 if (ret >= 0)
958 strlcpy(dev->name, buf, IFNAMSIZ);
959 return ret;
960}
d1b19dff 961EXPORT_SYMBOL(dev_alloc_name);
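/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * A caller that wants automatic unit numbering passes a "%d" format, as the
 * comment above describes; example_name_device and the "foo%d" pattern are
 * hypothetical.
 */
#if 0
static int example_name_device(struct net_device *dev)
{
	int err;

	/* Caller holds RTNL (or the dev_base lock), as noted above. */
	err = dev_alloc_name(dev, "foo%d");	/* assigns foo0, foo1, ... */
	if (err < 0)
		return err;

	return 0;
}
#endif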
b267b179 962
1c5cae81 963static int dev_get_valid_name(struct net_device *dev, const char *name)
d9031024 964{
8ce6cebc
DL
965 struct net *net;
966
967 BUG_ON(!dev_net(dev));
968 net = dev_net(dev);
969
d9031024
OP
970 if (!dev_valid_name(name))
971 return -EINVAL;
972
1c5cae81 973 if (strchr(name, '%'))
8ce6cebc 974 return dev_alloc_name(dev, name);
d9031024
OP
975 else if (__dev_get_by_name(net, name))
976 return -EEXIST;
8ce6cebc
DL
977 else if (dev->name != name)
978 strlcpy(dev->name, name, IFNAMSIZ);
d9031024
OP
979
980 return 0;
981}
1da177e4
LT
982
983/**
984 * dev_change_name - change name of a device
985 * @dev: device
986 * @newname: name (or format string) must be at least IFNAMSIZ
987 *
988 * Change name of a device, can pass format strings "eth%d".
989 * for wildcarding.
990 */
cf04a4c7 991int dev_change_name(struct net_device *dev, const char *newname)
1da177e4 992{
fcc5a03a 993 char oldname[IFNAMSIZ];
1da177e4 994 int err = 0;
fcc5a03a 995 int ret;
881d966b 996 struct net *net;
1da177e4
LT
997
998 ASSERT_RTNL();
c346dca1 999 BUG_ON(!dev_net(dev));
1da177e4 1000
c346dca1 1001 net = dev_net(dev);
1da177e4
LT
1002 if (dev->flags & IFF_UP)
1003 return -EBUSY;
1004
c8d90dca
SH
1005 if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
1006 return 0;
1007
fcc5a03a
HX
1008 memcpy(oldname, dev->name, IFNAMSIZ);
1009
1c5cae81 1010 err = dev_get_valid_name(dev, newname);
d9031024
OP
1011 if (err < 0)
1012 return err;
1da177e4 1013
fcc5a03a 1014rollback:
a1b3f594
EB
1015 ret = device_rename(&dev->dev, dev->name);
1016 if (ret) {
1017 memcpy(dev->name, oldname, IFNAMSIZ);
1018 return ret;
dcc99773 1019 }
7f988eab
HX
1020
1021 write_lock_bh(&dev_base_lock);
372b2312 1022 hlist_del_rcu(&dev->name_hlist);
72c9528b
ED
1023 write_unlock_bh(&dev_base_lock);
1024
1025 synchronize_rcu();
1026
1027 write_lock_bh(&dev_base_lock);
1028 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
7f988eab
HX
1029 write_unlock_bh(&dev_base_lock);
1030
056925ab 1031 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
fcc5a03a
HX
1032 ret = notifier_to_errno(ret);
1033
1034 if (ret) {
91e9c07b
ED
1035 /* err >= 0 after dev_alloc_name() or stores the first errno */
1036 if (err >= 0) {
fcc5a03a
HX
1037 err = ret;
1038 memcpy(dev->name, oldname, IFNAMSIZ);
1039 goto rollback;
91e9c07b 1040 } else {
7b6cd1ce 1041 pr_err("%s: name change rollback failed: %d\n",
91e9c07b 1042 dev->name, ret);
fcc5a03a
HX
1043 }
1044 }
1da177e4
LT
1045
1046 return err;
1047}
1048
0b815a1a
SH
1049/**
1050 * dev_set_alias - change ifalias of a device
1051 * @dev: device
1052 * @alias: name up to IFALIASZ
f0db275a 1053 * @len: limit of bytes to copy from info
0b815a1a
SH
1054 *
1055 * Set ifalias for a device,
1056 */
1057int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1058{
1059 ASSERT_RTNL();
1060
1061 if (len >= IFALIASZ)
1062 return -EINVAL;
1063
96ca4a2c
OH
1064 if (!len) {
1065 if (dev->ifalias) {
1066 kfree(dev->ifalias);
1067 dev->ifalias = NULL;
1068 }
1069 return 0;
1070 }
1071
d1b19dff 1072 dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
0b815a1a
SH
1073 if (!dev->ifalias)
1074 return -ENOMEM;
1075
1076 strlcpy(dev->ifalias, alias, len+1);
1077 return len;
1078}
1079
1080
d8a33ac4 1081/**
3041a069 1082 * netdev_features_change - device changes features
d8a33ac4
SH
1083 * @dev: device to cause notification
1084 *
1085 * Called to indicate a device has changed features.
1086 */
1087void netdev_features_change(struct net_device *dev)
1088{
056925ab 1089 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
d8a33ac4
SH
1090}
1091EXPORT_SYMBOL(netdev_features_change);
1092
1da177e4
LT
1093/**
1094 * netdev_state_change - device changes state
1095 * @dev: device to cause notification
1096 *
1097 * Called to indicate a device has changed state. This function calls
1098 * the notifier chains for netdev_chain and sends a NEWLINK message
1099 * to the routing socket.
1100 */
1101void netdev_state_change(struct net_device *dev)
1102{
1103 if (dev->flags & IFF_UP) {
056925ab 1104 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1da177e4
LT
1105 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1106 }
1107}
d1b19dff 1108EXPORT_SYMBOL(netdev_state_change);
1da177e4 1109
3ca5b404 1110int netdev_bonding_change(struct net_device *dev, unsigned long event)
c1da4ac7 1111{
3ca5b404 1112 return call_netdevice_notifiers(event, dev);
c1da4ac7
OG
1113}
1114EXPORT_SYMBOL(netdev_bonding_change);
1115
1da177e4
LT
1116/**
1117 * dev_load - load a network module
c4ea43c5 1118 * @net: the applicable net namespace
1da177e4
LT
1119 * @name: name of interface
1120 *
1121 * If a network interface is not present and the process has suitable
1122 * privileges this function loads the module. If module loading is not
1123 * available in this kernel then it becomes a nop.
1124 */
1125
881d966b 1126void dev_load(struct net *net, const char *name)
1da177e4 1127{
4ec93edb 1128 struct net_device *dev;
8909c9ad 1129 int no_module;
1da177e4 1130
72c9528b
ED
1131 rcu_read_lock();
1132 dev = dev_get_by_name_rcu(net, name);
1133 rcu_read_unlock();
1da177e4 1134
8909c9ad
VK
1135 no_module = !dev;
1136 if (no_module && capable(CAP_NET_ADMIN))
1137 no_module = request_module("netdev-%s", name);
1138 if (no_module && capable(CAP_SYS_MODULE)) {
1139 if (!request_module("%s", name))
7b6cd1ce
JP
1140 pr_err("Loading kernel module for a network device with CAP_SYS_MODULE (deprecated). Use CAP_NET_ADMIN and alias netdev-%s instead.\n",
1141 name);
8909c9ad 1142 }
1da177e4 1143}
d1b19dff 1144EXPORT_SYMBOL(dev_load);
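/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * For the CAP_NET_ADMIN path above to work, a driver advertises a
 * "netdev-<name>" alias so that request_module("netdev-%s", name) can find
 * it; "foo0" is a hypothetical interface/module name.
 */
#if 0
/* in the foo driver */
MODULE_ALIAS("netdev-foo0");

/* elsewhere in the kernel */
dev_load(&init_net, "foo0");	/* loads the module if foo0 is absent */
#endif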
1da177e4 1145
bd380811 1146static int __dev_open(struct net_device *dev)
1da177e4 1147{
d314774c 1148 const struct net_device_ops *ops = dev->netdev_ops;
3b8bcfd5 1149 int ret;
1da177e4 1150
e46b66bc
BH
1151 ASSERT_RTNL();
1152
1da177e4
LT
1153 if (!netif_device_present(dev))
1154 return -ENODEV;
1155
3b8bcfd5
JB
1156 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1157 ret = notifier_to_errno(ret);
1158 if (ret)
1159 return ret;
1160
1da177e4 1161 set_bit(__LINK_STATE_START, &dev->state);
bada339b 1162
d314774c
SH
1163 if (ops->ndo_validate_addr)
1164 ret = ops->ndo_validate_addr(dev);
bada339b 1165
d314774c
SH
1166 if (!ret && ops->ndo_open)
1167 ret = ops->ndo_open(dev);
1da177e4 1168
bada339b
JG
1169 if (ret)
1170 clear_bit(__LINK_STATE_START, &dev->state);
1171 else {
1da177e4 1172 dev->flags |= IFF_UP;
b4bd07c2 1173 net_dmaengine_get();
4417da66 1174 dev_set_rx_mode(dev);
1da177e4 1175 dev_activate(dev);
1da177e4 1176 }
bada339b 1177
1da177e4
LT
1178 return ret;
1179}
1180
1181/**
bd380811
PM
1182 * dev_open - prepare an interface for use.
1183 * @dev: device to open
1da177e4 1184 *
bd380811
PM
1185 * Takes a device from down to up state. The device's private open
1186 * function is invoked and then the multicast lists are loaded. Finally
1187 * the device is moved into the up state and a %NETDEV_UP message is
1188 * sent to the netdev notifier chain.
1189 *
1190 * Calling this function on an active interface is a nop. On a failure
1191 * a negative errno code is returned.
1da177e4 1192 */
bd380811
PM
1193int dev_open(struct net_device *dev)
1194{
1195 int ret;
1196
bd380811
PM
1197 if (dev->flags & IFF_UP)
1198 return 0;
1199
bd380811
PM
1200 ret = __dev_open(dev);
1201 if (ret < 0)
1202 return ret;
1203
bd380811
PM
1204 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1205 call_netdevice_notifiers(NETDEV_UP, dev);
1206
1207 return ret;
1208}
1209EXPORT_SYMBOL(dev_open);
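/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * dev_open() must run under RTNL, like most control-path entry points in
 * this file; example_bring_up is a hypothetical helper.
 */
#if 0
static int example_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_open(dev);	/* nop and returns 0 if already up */
	rtnl_unlock();

	return err;
}
#endif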
1210
44345724 1211static int __dev_close_many(struct list_head *head)
1da177e4 1212{
44345724 1213 struct net_device *dev;
e46b66bc 1214
bd380811 1215 ASSERT_RTNL();
9d5010db
DM
1216 might_sleep();
1217
44345724 1218 list_for_each_entry(dev, head, unreg_list) {
44345724 1219 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1da177e4 1220
44345724 1221 clear_bit(__LINK_STATE_START, &dev->state);
1da177e4 1222
44345724
OP
1223 /* Synchronize to scheduled poll. We cannot touch poll list, it
1224 * can be even on different cpu. So just clear netif_running().
1225 *
1226 * dev->stop() will invoke napi_disable() on all of it's
1227 * napi_struct instances on this device.
1228 */
1229 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1230 }
1da177e4 1231
44345724 1232 dev_deactivate_many(head);
d8b2a4d2 1233
44345724
OP
1234 list_for_each_entry(dev, head, unreg_list) {
1235 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4 1236
44345724
OP
1237 /*
1238 * Call the device specific close. This cannot fail.
1239 * Only if device is UP
1240 *
1241 * We allow it to be called even after a DETACH hot-plug
1242 * event.
1243 */
1244 if (ops->ndo_stop)
1245 ops->ndo_stop(dev);
1246
44345724 1247 dev->flags &= ~IFF_UP;
44345724
OP
1248 net_dmaengine_put();
1249 }
1250
1251 return 0;
1252}
1253
1254static int __dev_close(struct net_device *dev)
1255{
f87e6f47 1256 int retval;
44345724
OP
1257 LIST_HEAD(single);
1258
1259 list_add(&dev->unreg_list, &single);
f87e6f47
LT
1260 retval = __dev_close_many(&single);
1261 list_del(&single);
1262 return retval;
44345724
OP
1263}
1264
3fbd8758 1265static int dev_close_many(struct list_head *head)
44345724
OP
1266{
1267 struct net_device *dev, *tmp;
1268 LIST_HEAD(tmp_list);
1da177e4 1269
44345724
OP
1270 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1271 if (!(dev->flags & IFF_UP))
1272 list_move(&dev->unreg_list, &tmp_list);
1273
1274 __dev_close_many(head);
1da177e4 1275
44345724
OP
1276 list_for_each_entry(dev, head, unreg_list) {
1277 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1278 call_netdevice_notifiers(NETDEV_DOWN, dev);
1279 }
bd380811 1280
44345724
OP
1281 /* rollback_registered_many needs the complete original list */
1282 list_splice(&tmp_list, head);
bd380811
PM
1283 return 0;
1284}
1285
1286/**
1287 * dev_close - shutdown an interface.
1288 * @dev: device to shutdown
1289 *
1290 * This function moves an active device into down state. A
1291 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1292 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1293 * chain.
1294 */
1295int dev_close(struct net_device *dev)
1296{
e14a5993
ED
1297 if (dev->flags & IFF_UP) {
1298 LIST_HEAD(single);
1da177e4 1299
e14a5993
ED
1300 list_add(&dev->unreg_list, &single);
1301 dev_close_many(&single);
1302 list_del(&single);
1303 }
1da177e4
LT
1304 return 0;
1305}
d1b19dff 1306EXPORT_SYMBOL(dev_close);
1da177e4
LT
1307
1308
0187bdfb
BH
1309/**
1310 * dev_disable_lro - disable Large Receive Offload on a device
1311 * @dev: device
1312 *
1313 * Disable Large Receive Offload (LRO) on a net device. Must be
1314 * called under RTNL. This is needed if received packets may be
1315 * forwarded to another interface.
1316 */
1317void dev_disable_lro(struct net_device *dev)
1318{
f11970e3
NH
1319 /*
1320 * If we're trying to disable lro on a vlan device
1321 * use the underlying physical device instead
1322 */
1323 if (is_vlan_dev(dev))
1324 dev = vlan_dev_real_dev(dev);
1325
bc5787c6
MM
1326 dev->wanted_features &= ~NETIF_F_LRO;
1327 netdev_update_features(dev);
27660515 1328
22d5969f
MM
1329 if (unlikely(dev->features & NETIF_F_LRO))
1330 netdev_WARN(dev, "failed to disable LRO!\n");
0187bdfb
BH
1331}
1332EXPORT_SYMBOL(dev_disable_lro);
1333
1334
881d966b
EB
1335static int dev_boot_phase = 1;
1336
1da177e4
LT
1337/**
1338 * register_netdevice_notifier - register a network notifier block
1339 * @nb: notifier
1340 *
1341 * Register a notifier to be called when network device events occur.
1342 * The notifier passed is linked into the kernel structures and must
1343 * not be reused until it has been unregistered. A negative errno code
1344 * is returned on a failure.
1345 *
1346 * When registered all registration and up events are replayed
4ec93edb 1347 * to the new notifier to allow device to have a race free
1da177e4
LT
1348 * view of the network device list.
1349 */
1350
1351int register_netdevice_notifier(struct notifier_block *nb)
1352{
1353 struct net_device *dev;
fcc5a03a 1354 struct net_device *last;
881d966b 1355 struct net *net;
1da177e4
LT
1356 int err;
1357
1358 rtnl_lock();
f07d5b94 1359 err = raw_notifier_chain_register(&netdev_chain, nb);
fcc5a03a
HX
1360 if (err)
1361 goto unlock;
881d966b
EB
1362 if (dev_boot_phase)
1363 goto unlock;
1364 for_each_net(net) {
1365 for_each_netdev(net, dev) {
1366 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1367 err = notifier_to_errno(err);
1368 if (err)
1369 goto rollback;
1370
1371 if (!(dev->flags & IFF_UP))
1372 continue;
1da177e4 1373
881d966b
EB
1374 nb->notifier_call(nb, NETDEV_UP, dev);
1375 }
1da177e4 1376 }
fcc5a03a
HX
1377
1378unlock:
1da177e4
LT
1379 rtnl_unlock();
1380 return err;
fcc5a03a
HX
1381
1382rollback:
1383 last = dev;
881d966b
EB
1384 for_each_net(net) {
1385 for_each_netdev(net, dev) {
1386 if (dev == last)
8f891489 1387 goto outroll;
fcc5a03a 1388
881d966b
EB
1389 if (dev->flags & IFF_UP) {
1390 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1391 nb->notifier_call(nb, NETDEV_DOWN, dev);
1392 }
1393 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
a5ee1551 1394 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
fcc5a03a 1395 }
fcc5a03a 1396 }
c67625a1 1397
8f891489 1398outroll:
c67625a1 1399 raw_notifier_chain_unregister(&netdev_chain, nb);
fcc5a03a 1400 goto unlock;
1da177e4 1401}
d1b19dff 1402EXPORT_SYMBOL(register_netdevice_notifier);
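/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * A minimal netdevice notifier of the kind the comment above describes;
 * registration replays NETDEV_REGISTER/NETDEV_UP for existing devices.
 * The example_* names are hypothetical. In this kernel the notifier data
 * pointer is the struct net_device itself.
 */
#if 0
static int example_netdev_event(struct notifier_block *nb,
				unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	switch (event) {
	case NETDEV_UP:
		pr_info("%s is up\n", dev->name);
		break;
	case NETDEV_GOING_DOWN:
		pr_info("%s is going down\n", dev->name);
		break;
	}
	return NOTIFY_DONE;
}

static struct notifier_block example_netdev_notifier = {
	.notifier_call = example_netdev_event,
};

/* register_netdevice_notifier(&example_netdev_notifier) on init,
 * unregister_netdevice_notifier(&example_netdev_notifier) on exit. */
#endif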
1da177e4
LT
1403
1404/**
1405 * unregister_netdevice_notifier - unregister a network notifier block
1406 * @nb: notifier
1407 *
1408 * Unregister a notifier previously registered by
1409 * register_netdevice_notifier(). The notifier is unlinked into the
1410 * kernel structures and may then be reused. A negative errno code
1411 * is returned on a failure.
7d3d43da
EB
1412 *
1413 * After unregistering unregister and down device events are synthesized
1414 * for all devices on the device list to the removed notifier to remove
1415 * the need for special case cleanup code.
1da177e4
LT
1416 */
1417
1418int unregister_netdevice_notifier(struct notifier_block *nb)
1419{
7d3d43da
EB
1420 struct net_device *dev;
1421 struct net *net;
9f514950
HX
1422 int err;
1423
1424 rtnl_lock();
f07d5b94 1425 err = raw_notifier_chain_unregister(&netdev_chain, nb);
7d3d43da
EB
1426 if (err)
1427 goto unlock;
1428
1429 for_each_net(net) {
1430 for_each_netdev(net, dev) {
1431 if (dev->flags & IFF_UP) {
1432 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1433 nb->notifier_call(nb, NETDEV_DOWN, dev);
1434 }
1435 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1436 nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1437 }
1438 }
1439unlock:
9f514950
HX
1440 rtnl_unlock();
1441 return err;
1da177e4 1442}
d1b19dff 1443EXPORT_SYMBOL(unregister_netdevice_notifier);
1da177e4
LT
1444
1445/**
1446 * call_netdevice_notifiers - call all network notifier blocks
1447 * @val: value passed unmodified to notifier function
c4ea43c5 1448 * @dev: net_device pointer passed unmodified to notifier function
1da177e4
LT
1449 *
1450 * Call all network notifier blocks. Parameters and return value
f07d5b94 1451 * are as for raw_notifier_call_chain().
1da177e4
LT
1452 */
1453
ad7379d4 1454int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1da177e4 1455{
ab930471 1456 ASSERT_RTNL();
ad7379d4 1457 return raw_notifier_call_chain(&netdev_chain, val, dev);
1da177e4 1458}
edf947f1 1459EXPORT_SYMBOL(call_netdevice_notifiers);
1da177e4 1460
c5905afb 1461static struct static_key netstamp_needed __read_mostly;
b90e5794 1462#ifdef HAVE_JUMP_LABEL
c5905afb 1463/* We are not allowed to call static_key_slow_dec() from irq context
b90e5794 1464 * If net_disable_timestamp() is called from irq context, defer the
c5905afb 1465 * static_key_slow_dec() calls.
b90e5794
ED
1466 */
1467static atomic_t netstamp_needed_deferred;
1468#endif
1da177e4
LT
1469
1470void net_enable_timestamp(void)
1471{
b90e5794
ED
1472#ifdef HAVE_JUMP_LABEL
1473 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1474
1475 if (deferred) {
1476 while (--deferred)
c5905afb 1477 static_key_slow_dec(&netstamp_needed);
b90e5794
ED
1478 return;
1479 }
1480#endif
1481 WARN_ON(in_interrupt());
c5905afb 1482 static_key_slow_inc(&netstamp_needed);
1da177e4 1483}
d1b19dff 1484EXPORT_SYMBOL(net_enable_timestamp);
1da177e4
LT
1485
1486void net_disable_timestamp(void)
1487{
b90e5794
ED
1488#ifdef HAVE_JUMP_LABEL
1489 if (in_interrupt()) {
1490 atomic_inc(&netstamp_needed_deferred);
1491 return;
1492 }
1493#endif
c5905afb 1494 static_key_slow_dec(&netstamp_needed);
1da177e4 1495}
d1b19dff 1496EXPORT_SYMBOL(net_disable_timestamp);
1da177e4 1497
3b098e2d 1498static inline void net_timestamp_set(struct sk_buff *skb)
1da177e4 1499{
588f0330 1500 skb->tstamp.tv64 = 0;
c5905afb 1501 if (static_key_false(&netstamp_needed))
a61bbcf2 1502 __net_timestamp(skb);
1da177e4
LT
1503}
1504
588f0330 1505#define net_timestamp_check(COND, SKB) \
c5905afb 1506 if (static_key_false(&netstamp_needed)) { \
588f0330
ED
1507 if ((COND) && !(SKB)->tstamp.tv64) \
1508 __net_timestamp(SKB); \
1509 } \
3b098e2d 1510
4dc360c5
RC
1511static int net_hwtstamp_validate(struct ifreq *ifr)
1512{
1513 struct hwtstamp_config cfg;
1514 enum hwtstamp_tx_types tx_type;
1515 enum hwtstamp_rx_filters rx_filter;
1516 int tx_type_valid = 0;
1517 int rx_filter_valid = 0;
1518
1519 if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
1520 return -EFAULT;
1521
1522 if (cfg.flags) /* reserved for future extensions */
1523 return -EINVAL;
1524
1525 tx_type = cfg.tx_type;
1526 rx_filter = cfg.rx_filter;
1527
1528 switch (tx_type) {
1529 case HWTSTAMP_TX_OFF:
1530 case HWTSTAMP_TX_ON:
1531 case HWTSTAMP_TX_ONESTEP_SYNC:
1532 tx_type_valid = 1;
1533 break;
1534 }
1535
1536 switch (rx_filter) {
1537 case HWTSTAMP_FILTER_NONE:
1538 case HWTSTAMP_FILTER_ALL:
1539 case HWTSTAMP_FILTER_SOME:
1540 case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
1541 case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
1542 case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
1543 case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
1544 case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
1545 case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
1546 case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
1547 case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
1548 case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
1549 case HWTSTAMP_FILTER_PTP_V2_EVENT:
1550 case HWTSTAMP_FILTER_PTP_V2_SYNC:
1551 case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
1552 rx_filter_valid = 1;
1553 break;
1554 }
1555
1556 if (!tx_type_valid || !rx_filter_valid)
1557 return -ERANGE;
1558
1559 return 0;
1560}
1561
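/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * dev_ioctl() runs net_hwtstamp_validate() before handing SIOCSHWTSTAMP to
 * a driver's ndo_do_ioctl, so the driver only has to map or reject filters
 * its hardware cannot do. foo_hwtstamp_set is a hypothetical handler.
 */
#if 0
static int foo_hwtstamp_set(struct net_device *dev, struct ifreq *ifr)
{
	struct hwtstamp_config config;

	if (copy_from_user(&config, ifr->ifr_data, sizeof(config)))
		return -EFAULT;

	switch (config.rx_filter) {
	case HWTSTAMP_FILTER_NONE:
		break;
	default:
		/* imaginary hardware timestamps everything or nothing */
		config.rx_filter = HWTSTAMP_FILTER_ALL;
		break;
	}

	return copy_to_user(ifr->ifr_data, &config, sizeof(config)) ?
		-EFAULT : 0;
}
#endif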
79b569f0
DL
1562static inline bool is_skb_forwardable(struct net_device *dev,
1563 struct sk_buff *skb)
1564{
1565 unsigned int len;
1566
1567 if (!(dev->flags & IFF_UP))
1568 return false;
1569
1570 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1571 if (skb->len <= len)
1572 return true;
1573
1574 /* if TSO is enabled, we don't care about the length as the packet
1575 * could be forwarded without being segmented before
1576 */
1577 if (skb_is_gso(skb))
1578 return true;
1579
1580 return false;
1581}
1582
44540960
AB
1583/**
1584 * dev_forward_skb - loopback an skb to another netif
1585 *
1586 * @dev: destination network device
1587 * @skb: buffer to forward
1588 *
1589 * return values:
1590 * NET_RX_SUCCESS (no congestion)
6ec82562 1591 * NET_RX_DROP (packet was dropped, but freed)
44540960
AB
1592 *
1593 * dev_forward_skb can be used for injecting an skb from the
1594 * start_xmit function of one device into the receive queue
1595 * of another device.
1596 *
1597 * The receiving device may be in another namespace, so
1598 * we have to clear all information in the skb that could
1599 * impact namespace isolation.
1600 */
1601int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1602{
48c83012
MT
1603 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1604 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1605 atomic_long_inc(&dev->rx_dropped);
1606 kfree_skb(skb);
1607 return NET_RX_DROP;
1608 }
1609 }
1610
44540960 1611 skb_orphan(skb);
c736eefa 1612 nf_reset(skb);
44540960 1613
79b569f0 1614 if (unlikely(!is_skb_forwardable(dev, skb))) {
caf586e5 1615 atomic_long_inc(&dev->rx_dropped);
6ec82562 1616 kfree_skb(skb);
44540960 1617 return NET_RX_DROP;
6ec82562 1618 }
3b9785c6 1619 skb->skb_iif = 0;
8a83a00b 1620 skb_set_dev(skb, dev);
44540960
AB
1621 skb->tstamp.tv64 = 0;
1622 skb->pkt_type = PACKET_HOST;
1623 skb->protocol = eth_type_trans(skb, dev);
44540960
AB
1624 return netif_rx(skb);
1625}
1626EXPORT_SYMBOL_GPL(dev_forward_skb);
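/*
 * Editor's note: illustrative sketch only, not part of the original file.
 * The typical caller is a virtual device's ndo_start_xmit (veth is the
 * in-tree example); example_xmit and example_get_peer are hypothetical.
 * Note that dev_forward_skb() frees the skb on drop, so the skb must not
 * be touched afterwards.
 */
#if 0
static netdev_tx_t example_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct net_device *peer = example_get_peer(dev);	/* hypothetical */

	if (dev_forward_skb(peer, skb) != NET_RX_SUCCESS)
		dev->stats.tx_dropped++;

	return NETDEV_TX_OK;
}
#endif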
1627
71d9dec2
CG
1628static inline int deliver_skb(struct sk_buff *skb,
1629 struct packet_type *pt_prev,
1630 struct net_device *orig_dev)
1631{
1632 atomic_inc(&skb->users);
1633 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1634}
1635
1da177e4
LT
1636/*
1637 * Support routine. Sends outgoing frames to any network
1638 * taps currently in use.
1639 */
1640
f6a78bfc 1641static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1da177e4
LT
1642{
1643 struct packet_type *ptype;
71d9dec2
CG
1644 struct sk_buff *skb2 = NULL;
1645 struct packet_type *pt_prev = NULL;
a61bbcf2 1646
1da177e4
LT
1647 rcu_read_lock();
1648 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1649 /* Never send packets back to the socket
1650 * they originated from - MvS (miquels@drinkel.ow.org)
1651 */
1652 if ((ptype->dev == dev || !ptype->dev) &&
1653 (ptype->af_packet_priv == NULL ||
1654 (struct sock *)ptype->af_packet_priv != skb->sk)) {
71d9dec2
CG
1655 if (pt_prev) {
1656 deliver_skb(skb2, pt_prev, skb->dev);
1657 pt_prev = ptype;
1658 continue;
1659 }
1660
1661 skb2 = skb_clone(skb, GFP_ATOMIC);
1da177e4
LT
1662 if (!skb2)
1663 break;
1664
70978182
ED
1665 net_timestamp_set(skb2);
1666
1da177e4
LT
1667 /* skb->nh should be correctly
1668 set by sender, so that the second statement is
1669 just protection against buggy protocols.
1670 */
459a98ed 1671 skb_reset_mac_header(skb2);
1da177e4 1672
d56f90a7 1673 if (skb_network_header(skb2) < skb2->data ||
27a884dc 1674 skb2->network_header > skb2->tail) {
1da177e4 1675 if (net_ratelimit())
7b6cd1ce
JP
1676 pr_crit("protocol %04x is buggy, dev %s\n",
1677 ntohs(skb2->protocol),
1678 dev->name);
c1d2bbe1 1679 skb_reset_network_header(skb2);
1da177e4
LT
1680 }
1681
b0e380b1 1682 skb2->transport_header = skb2->network_header;
1da177e4 1683 skb2->pkt_type = PACKET_OUTGOING;
71d9dec2 1684 pt_prev = ptype;
1da177e4
LT
1685 }
1686 }
71d9dec2
CG
1687 if (pt_prev)
1688 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1da177e4
LT
1689 rcu_read_unlock();
1690}
1691
4f57c087
JF
1692/* netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1693 * @dev: Network device
1694 * @txq: number of queues available
1695 *
1696 * If real_num_tx_queues is changed the tc mappings may no longer be
1697 * valid. To resolve this verify the tc mapping remains valid and if
1698 * not NULL the mapping. With no priorities mapping to this
1699 * offset/count pair it will no longer be used. In the worst case TC0
1700 * is invalid nothing can be done so disable priority mappings. If is
1701 * expected that drivers will fix this mapping if they can before
1702 * calling netif_set_real_num_tx_queues.
1703 */
bb134d22 1704static void netif_setup_tc(struct net_device *dev, unsigned int txq)
4f57c087
JF
1705{
1706 int i;
1707 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1708
1709 /* If TC0 is invalidated disable TC mapping */
1710 if (tc->offset + tc->count > txq) {
7b6cd1ce 1711 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
4f57c087
JF
1712 dev->num_tc = 0;
1713 return;
1714 }
1715
1716 /* Invalidated prio to tc mappings set to TC0 */
1717 for (i = 1; i < TC_BITMASK + 1; i++) {
1718 int q = netdev_get_prio_tc_map(dev, i);
1719
1720 tc = &dev->tc_to_txq[q];
1721 if (tc->offset + tc->count > txq) {
7b6cd1ce
JP
1722 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1723 i, q);
4f57c087
JF
1724 netdev_set_prio_tc_map(dev, i, 0);
1725 }
1726 }
1727}
1728
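/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * the tc-to-txq and prio-to-tc maps that netif_setup_tc() validates are
 * normally established by a multiqueue driver roughly like this before it
 * changes real_num_tx_queues. Error handling is elided.
 */
static void example_setup_two_tcs(struct net_device *dev)
{
        int prio;

        netdev_set_num_tc(dev, 2);              /* two traffic classes */
        netdev_set_tc_queue(dev, 0, 4, 0);      /* TC0: 4 queues at offset 0 */
        netdev_set_tc_queue(dev, 1, 4, 4);      /* TC1: 4 queues at offset 4 */

        /* map priorities 0..3 to TC0 and 4..7 to TC1 */
        for (prio = 0; prio < 8; prio++)
                netdev_set_prio_tc_map(dev, prio, prio < 4 ? 0 : 1);
}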
f0796d5c
JF
1729/*
1730 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
 1731 * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
1732 */
e6484930 1733int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
f0796d5c 1734{
1d24eb48
TH
1735 int rc;
1736
e6484930
TH
1737 if (txq < 1 || txq > dev->num_tx_queues)
1738 return -EINVAL;
f0796d5c 1739
5c56580b
BH
1740 if (dev->reg_state == NETREG_REGISTERED ||
1741 dev->reg_state == NETREG_UNREGISTERING) {
e6484930
TH
1742 ASSERT_RTNL();
1743
1d24eb48
TH
1744 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
1745 txq);
bf264145
TH
1746 if (rc)
1747 return rc;
1748
4f57c087
JF
1749 if (dev->num_tc)
1750 netif_setup_tc(dev, txq);
1751
e6484930
TH
1752 if (txq < dev->real_num_tx_queues)
1753 qdisc_reset_all_tx_gt(dev, txq);
f0796d5c 1754 }
e6484930
TH
1755
1756 dev->real_num_tx_queues = txq;
1757 return 0;
f0796d5c
JF
1758}
1759EXPORT_SYMBOL(netif_set_real_num_tx_queues);
56079431 1760
62fe0b40
BH
1761#ifdef CONFIG_RPS
1762/**
1763 * netif_set_real_num_rx_queues - set actual number of RX queues used
1764 * @dev: Network device
1765 * @rxq: Actual number of RX queues
1766 *
1767 * This must be called either with the rtnl_lock held or before
1768 * registration of the net device. Returns 0 on success, or a
4e7f7951
BH
1769 * negative error code. If called before registration, it always
1770 * succeeds.
62fe0b40
BH
1771 */
1772int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
1773{
1774 int rc;
1775
bd25fa7b
TH
1776 if (rxq < 1 || rxq > dev->num_rx_queues)
1777 return -EINVAL;
1778
62fe0b40
BH
1779 if (dev->reg_state == NETREG_REGISTERED) {
1780 ASSERT_RTNL();
1781
62fe0b40
BH
1782 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
1783 rxq);
1784 if (rc)
1785 return rc;
62fe0b40
BH
1786 }
1787
1788 dev->real_num_rx_queues = rxq;
1789 return 0;
1790}
1791EXPORT_SYMBOL(netif_set_real_num_rx_queues);
1792#endif
1793
def82a1d 1794static inline void __netif_reschedule(struct Qdisc *q)
56079431 1795{
def82a1d
JP
1796 struct softnet_data *sd;
1797 unsigned long flags;
56079431 1798
def82a1d
JP
1799 local_irq_save(flags);
1800 sd = &__get_cpu_var(softnet_data);
a9cbd588
CG
1801 q->next_sched = NULL;
1802 *sd->output_queue_tailp = q;
1803 sd->output_queue_tailp = &q->next_sched;
def82a1d
JP
1804 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1805 local_irq_restore(flags);
1806}
1807
1808void __netif_schedule(struct Qdisc *q)
1809{
1810 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1811 __netif_reschedule(q);
56079431
DV
1812}
1813EXPORT_SYMBOL(__netif_schedule);
1814
bea3348e 1815void dev_kfree_skb_irq(struct sk_buff *skb)
56079431 1816{
3578b0c8 1817 if (atomic_dec_and_test(&skb->users)) {
bea3348e
SH
1818 struct softnet_data *sd;
1819 unsigned long flags;
56079431 1820
bea3348e
SH
1821 local_irq_save(flags);
1822 sd = &__get_cpu_var(softnet_data);
1823 skb->next = sd->completion_queue;
1824 sd->completion_queue = skb;
1825 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1826 local_irq_restore(flags);
1827 }
56079431 1828}
bea3348e 1829EXPORT_SYMBOL(dev_kfree_skb_irq);
56079431
DV
1830
1831void dev_kfree_skb_any(struct sk_buff *skb)
1832{
1833 if (in_irq() || irqs_disabled())
1834 dev_kfree_skb_irq(skb);
1835 else
1836 dev_kfree_skb(skb);
1837}
1838EXPORT_SYMBOL(dev_kfree_skb_any);
1839
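/*
 * Illustrative sketch (hypothetical, not part of this file): a TX completion
 * handler that may run in hard-IRQ context frees transmitted skbs with
 * dev_kfree_skb_any() (or dev_kfree_skb_irq()) rather than dev_kfree_skb(),
 * which must not run with IRQs disabled.
 */
static void example_tx_complete(struct sk_buff *skb)
{
        /* safe from any context: defers the free to net_tx_action() if needed */
        dev_kfree_skb_any(skb);
}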
1840
bea3348e
SH
1841/**
1842 * netif_device_detach - mark device as removed
1843 * @dev: network device
1844 *
 1845 * Mark the device as removed from the system and therefore no longer available.
1846 */
56079431
DV
1847void netif_device_detach(struct net_device *dev)
1848{
1849 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1850 netif_running(dev)) {
d543103a 1851 netif_tx_stop_all_queues(dev);
56079431
DV
1852 }
1853}
1854EXPORT_SYMBOL(netif_device_detach);
1855
bea3348e
SH
1856/**
1857 * netif_device_attach - mark device as attached
1858 * @dev: network device
1859 *
 1860 * Mark the device as attached to the system and restart it if needed.
1861 */
56079431
DV
1862void netif_device_attach(struct net_device *dev)
1863{
1864 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1865 netif_running(dev)) {
d543103a 1866 netif_tx_wake_all_queues(dev);
4ec93edb 1867 __netdev_watchdog_up(dev);
56079431
DV
1868 }
1869}
1870EXPORT_SYMBOL(netif_device_attach);
1871
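/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * the detach/attach pair above is typically used around suspend/resume so
 * the stack stops handing packets to a device whose hardware is down.
 */
static int example_suspend(struct net_device *dev)
{
        netif_device_detach(dev);       /* stop all TX queues, clear PRESENT */
        /* ... put the hardware to sleep ... */
        return 0;
}

static int example_resume(struct net_device *dev)
{
        /* ... bring the hardware back up ... */
        netif_device_attach(dev);       /* wake queues, restart the watchdog */
        return 0;
}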
8a83a00b
AB
1872/**
1873 * skb_dev_set -- assign a new device to a buffer
1874 * @skb: buffer for the new device
1875 * @dev: network device
1876 *
1877 * If an skb is owned by a device already, we have to reset
1878 * all data private to the namespace a device belongs to
1879 * before assigning it a new device.
1880 */
1881#ifdef CONFIG_NET_NS
1882void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1883{
1884 skb_dst_drop(skb);
1885 if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1886 secpath_reset(skb);
1887 nf_reset(skb);
1888 skb_init_secmark(skb);
1889 skb->mark = 0;
1890 skb->priority = 0;
1891 skb->nf_trace = 0;
1892 skb->ipvs_property = 0;
1893#ifdef CONFIG_NET_SCHED
1894 skb->tc_index = 0;
1895#endif
1896 }
1897 skb->dev = dev;
1898}
1899EXPORT_SYMBOL(skb_set_dev);
1900#endif /* CONFIG_NET_NS */
1901
36c92474
BH
1902static void skb_warn_bad_offload(const struct sk_buff *skb)
1903{
65e9d2fa 1904 static const netdev_features_t null_features = 0;
36c92474
BH
1905 struct net_device *dev = skb->dev;
1906 const char *driver = "";
1907
1908 if (dev && dev->dev.parent)
1909 driver = dev_driver_string(dev->dev.parent);
1910
1911 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
1912 "gso_type=%d ip_summed=%d\n",
65e9d2fa
MM
1913 driver, dev ? &dev->features : &null_features,
1914 skb->sk ? &skb->sk->sk_route_caps : &null_features,
36c92474
BH
1915 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
1916 skb_shinfo(skb)->gso_type, skb->ip_summed);
1917}
1918
1da177e4
LT
1919/*
1920 * Invalidate hardware checksum when packet is to be mangled, and
1921 * complete checksum manually on outgoing path.
1922 */
84fa7933 1923int skb_checksum_help(struct sk_buff *skb)
1da177e4 1924{
d3bc23e7 1925 __wsum csum;
663ead3b 1926 int ret = 0, offset;
1da177e4 1927
84fa7933 1928 if (skb->ip_summed == CHECKSUM_COMPLETE)
a430a43d
HX
1929 goto out_set_summed;
1930
1931 if (unlikely(skb_shinfo(skb)->gso_size)) {
36c92474
BH
1932 skb_warn_bad_offload(skb);
1933 return -EINVAL;
1da177e4
LT
1934 }
1935
55508d60 1936 offset = skb_checksum_start_offset(skb);
a030847e
HX
1937 BUG_ON(offset >= skb_headlen(skb));
1938 csum = skb_checksum(skb, offset, skb->len - offset, 0);
1939
1940 offset += skb->csum_offset;
1941 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1942
1943 if (skb_cloned(skb) &&
1944 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1da177e4
LT
1945 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1946 if (ret)
1947 goto out;
1948 }
1949
a030847e 1950 *(__sum16 *)(skb->data + offset) = csum_fold(csum);
a430a43d 1951out_set_summed:
1da177e4 1952 skb->ip_summed = CHECKSUM_NONE;
4ec93edb 1953out:
1da177e4
LT
1954 return ret;
1955}
d1b19dff 1956EXPORT_SYMBOL(skb_checksum_help);
1da177e4 1957
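/*
 * Illustrative sketch (hypothetical, not part of this file), mirroring what
 * dev_hard_start_xmit() does further below: a CHECKSUM_PARTIAL skb is
 * resolved in software when the device lacks the needed checksum offload.
 */
static int example_tx_checksum(struct sk_buff *skb, netdev_features_t features)
{
        if (skb->ip_summed == CHECKSUM_PARTIAL &&
            !(features & NETIF_F_ALL_CSUM))
                return skb_checksum_help(skb);  /* fill the checksum in now */
        return 0;
}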
f6a78bfc
HX
1958/**
1959 * skb_gso_segment - Perform segmentation on skb.
1960 * @skb: buffer to segment
576a30eb 1961 * @features: features for the output path (see dev->features)
f6a78bfc
HX
1962 *
1963 * This function segments the given skb and returns a list of segments.
576a30eb
HX
1964 *
1965 * It may return NULL if the skb requires no segmentation. This is
1966 * only possible when GSO is used for verifying header integrity.
f6a78bfc 1967 */
c8f44aff
MM
1968struct sk_buff *skb_gso_segment(struct sk_buff *skb,
1969 netdev_features_t features)
f6a78bfc
HX
1970{
1971 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1972 struct packet_type *ptype;
252e3346 1973 __be16 type = skb->protocol;
c8d5bcd1 1974 int vlan_depth = ETH_HLEN;
a430a43d 1975 int err;
f6a78bfc 1976
c8d5bcd1
JG
1977 while (type == htons(ETH_P_8021Q)) {
1978 struct vlan_hdr *vh;
7b9c6090 1979
c8d5bcd1 1980 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
7b9c6090
JG
1981 return ERR_PTR(-EINVAL);
1982
c8d5bcd1
JG
1983 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
1984 type = vh->h_vlan_encapsulated_proto;
1985 vlan_depth += VLAN_HLEN;
7b9c6090
JG
1986 }
1987
459a98ed 1988 skb_reset_mac_header(skb);
b0e380b1 1989 skb->mac_len = skb->network_header - skb->mac_header;
f6a78bfc
HX
1990 __skb_pull(skb, skb->mac_len);
1991
67fd1a73 1992 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
36c92474 1993 skb_warn_bad_offload(skb);
67fd1a73 1994
a430a43d
HX
1995 if (skb_header_cloned(skb) &&
1996 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1997 return ERR_PTR(err);
1998 }
1999
f6a78bfc 2000 rcu_read_lock();
82d8a867
PE
2001 list_for_each_entry_rcu(ptype,
2002 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
f6a78bfc 2003 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
84fa7933 2004 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
a430a43d
HX
2005 err = ptype->gso_send_check(skb);
2006 segs = ERR_PTR(err);
2007 if (err || skb_gso_ok(skb, features))
2008 break;
d56f90a7
ACM
2009 __skb_push(skb, (skb->data -
2010 skb_network_header(skb)));
a430a43d 2011 }
576a30eb 2012 segs = ptype->gso_segment(skb, features);
f6a78bfc
HX
2013 break;
2014 }
2015 }
2016 rcu_read_unlock();
2017
98e399f8 2018 __skb_push(skb, skb->data - skb_mac_header(skb));
576a30eb 2019
f6a78bfc
HX
2020 return segs;
2021}
f6a78bfc
HX
2022EXPORT_SYMBOL(skb_gso_segment);
2023
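/*
 * Illustrative sketch (hypothetical, not part of this file): a caller of
 * skb_gso_segment() walks the returned list, which is chained through
 * skb->next, unlinking each segment before handing it on.
 */
static void example_consume_segs(struct sk_buff *segs)
{
        while (segs) {
                struct sk_buff *nskb = segs->next;

                segs->next = NULL;      /* detach before passing it on */
                /* ... transmit or otherwise consume 'segs' here ... */
                kfree_skb(segs);        /* placeholder for the real consumer */
                segs = nskb;
        }
}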
fb286bb2
HX
2024/* Take action when hardware reception checksum errors are detected. */
2025#ifdef CONFIG_BUG
2026void netdev_rx_csum_fault(struct net_device *dev)
2027{
2028 if (net_ratelimit()) {
7b6cd1ce 2029 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
fb286bb2
HX
2030 dump_stack();
2031 }
2032}
2033EXPORT_SYMBOL(netdev_rx_csum_fault);
2034#endif
2035
1da177e4
LT
2036/* Actually, we should eliminate this check as soon as we know, that:
2037 * 1. IOMMU is present and allows to map all the memory.
2038 * 2. No high memory really exists on this machine.
2039 */
2040
9092c658 2041static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1da177e4 2042{
3d3a8533 2043#ifdef CONFIG_HIGHMEM
1da177e4 2044 int i;
5acbbd42 2045 if (!(dev->features & NETIF_F_HIGHDMA)) {
ea2ab693
IC
2046 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2047 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2048 if (PageHighMem(skb_frag_page(frag)))
5acbbd42 2049 return 1;
ea2ab693 2050 }
5acbbd42 2051 }
1da177e4 2052
5acbbd42
FT
2053 if (PCI_DMA_BUS_IS_PHYS) {
2054 struct device *pdev = dev->dev.parent;
1da177e4 2055
9092c658
ED
2056 if (!pdev)
2057 return 0;
5acbbd42 2058 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
ea2ab693
IC
2059 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2060 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
5acbbd42
FT
2061 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2062 return 1;
2063 }
2064 }
3d3a8533 2065#endif
1da177e4
LT
2066 return 0;
2067}
1da177e4 2068
f6a78bfc
HX
2069struct dev_gso_cb {
2070 void (*destructor)(struct sk_buff *skb);
2071};
2072
2073#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2074
2075static void dev_gso_skb_destructor(struct sk_buff *skb)
2076{
2077 struct dev_gso_cb *cb;
2078
2079 do {
2080 struct sk_buff *nskb = skb->next;
2081
2082 skb->next = nskb->next;
2083 nskb->next = NULL;
2084 kfree_skb(nskb);
2085 } while (skb->next);
2086
2087 cb = DEV_GSO_CB(skb);
2088 if (cb->destructor)
2089 cb->destructor(skb);
2090}
2091
2092/**
2093 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2094 * @skb: buffer to segment
91ecb63c 2095 * @features: device features as applicable to this skb
f6a78bfc
HX
2096 *
2097 * This function segments the given skb and stores the list of segments
2098 * in skb->next.
2099 */
c8f44aff 2100static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
f6a78bfc 2101{
f6a78bfc 2102 struct sk_buff *segs;
576a30eb
HX
2103
2104 segs = skb_gso_segment(skb, features);
2105
2106 /* Verifying header integrity only. */
2107 if (!segs)
2108 return 0;
f6a78bfc 2109
801678c5 2110 if (IS_ERR(segs))
f6a78bfc
HX
2111 return PTR_ERR(segs);
2112
2113 skb->next = segs;
2114 DEV_GSO_CB(skb)->destructor = skb->destructor;
2115 skb->destructor = dev_gso_skb_destructor;
2116
2117 return 0;
2118}
2119
fc6055a5
ED
2120/*
2121 * Try to orphan skb early, right before transmission by the device.
2244d07b
OH
2122 * We cannot orphan skb if tx timestamp is requested or the sk-reference
 2123 * is needed at the driver level for other reasons, e.g. see net/can/raw.c.
fc6055a5
ED
2124 */
2125static inline void skb_orphan_try(struct sk_buff *skb)
2126{
87fd308c
ED
2127 struct sock *sk = skb->sk;
2128
2244d07b 2129 if (sk && !skb_shinfo(skb)->tx_flags) {
87fd308c
ED
 2130 /* skb_tx_hash() won't be able to get the sk,
 2131 * so we copy sk_hash into skb->rxhash.
2132 */
2133 if (!skb->rxhash)
2134 skb->rxhash = sk->sk_hash;
fc6055a5 2135 skb_orphan(skb);
87fd308c 2136 }
fc6055a5
ED
2137}
2138
c8f44aff 2139static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
03634668
JG
2140{
2141 return ((features & NETIF_F_GEN_CSUM) ||
2142 ((features & NETIF_F_V4_CSUM) &&
2143 protocol == htons(ETH_P_IP)) ||
2144 ((features & NETIF_F_V6_CSUM) &&
2145 protocol == htons(ETH_P_IPV6)) ||
2146 ((features & NETIF_F_FCOE_CRC) &&
2147 protocol == htons(ETH_P_FCOE)));
2148}
2149
c8f44aff
MM
2150static netdev_features_t harmonize_features(struct sk_buff *skb,
2151 __be16 protocol, netdev_features_t features)
f01a5236 2152{
d402786e 2153 if (!can_checksum_protocol(features, protocol)) {
f01a5236
JG
2154 features &= ~NETIF_F_ALL_CSUM;
2155 features &= ~NETIF_F_SG;
2156 } else if (illegal_highdma(skb->dev, skb)) {
2157 features &= ~NETIF_F_SG;
2158 }
2159
2160 return features;
2161}
2162
c8f44aff 2163netdev_features_t netif_skb_features(struct sk_buff *skb)
58e998c6
JG
2164{
2165 __be16 protocol = skb->protocol;
c8f44aff 2166 netdev_features_t features = skb->dev->features;
58e998c6
JG
2167
2168 if (protocol == htons(ETH_P_8021Q)) {
2169 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2170 protocol = veh->h_vlan_encapsulated_proto;
f01a5236
JG
2171 } else if (!vlan_tx_tag_present(skb)) {
2172 return harmonize_features(skb, protocol, features);
2173 }
58e998c6 2174
6ee400aa 2175 features &= (skb->dev->vlan_features | NETIF_F_HW_VLAN_TX);
f01a5236
JG
2176
2177 if (protocol != htons(ETH_P_8021Q)) {
2178 return harmonize_features(skb, protocol, features);
2179 } else {
2180 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
6ee400aa 2181 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_TX;
f01a5236
JG
2182 return harmonize_features(skb, protocol, features);
2183 }
58e998c6 2184}
f01a5236 2185EXPORT_SYMBOL(netif_skb_features);
58e998c6 2186
6afff0ca
JF
2187/*
2188 * Returns true if either:
2189 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2190 * 2. skb is fragmented and the device does not support SG, or if
 2191 * at least one of the fragments is in highmem and the device does not
2192 * support DMA from it.
2193 */
2194static inline int skb_needs_linearize(struct sk_buff *skb,
02932ce9 2195 int features)
6afff0ca 2196{
02932ce9
JG
2197 return skb_is_nonlinear(skb) &&
2198 ((skb_has_frag_list(skb) &&
2199 !(features & NETIF_F_FRAGLIST)) ||
e1e78db6 2200 (skb_shinfo(skb)->nr_frags &&
02932ce9 2201 !(features & NETIF_F_SG)));
6afff0ca
JF
2202}
2203
fd2ea0a7
DM
2204int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2205 struct netdev_queue *txq)
f6a78bfc 2206{
00829823 2207 const struct net_device_ops *ops = dev->netdev_ops;
572a9d7b 2208 int rc = NETDEV_TX_OK;
ec764bf0 2209 unsigned int skb_len;
00829823 2210
f6a78bfc 2211 if (likely(!skb->next)) {
c8f44aff 2212 netdev_features_t features;
fc741216 2213
93f154b5 2214 /*
25985edc 2215 * If device doesn't need skb->dst, release it right now while
93f154b5
ED
 2216 * it's hot in this CPU's cache.
2217 */
adf30907
ED
2218 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2219 skb_dst_drop(skb);
2220
15c2d75f
ED
2221 if (!list_empty(&ptype_all))
2222 dev_queue_xmit_nit(skb, dev);
2223
fc6055a5 2224 skb_orphan_try(skb);
9ccb8975 2225
fc741216
JG
2226 features = netif_skb_features(skb);
2227
7b9c6090 2228 if (vlan_tx_tag_present(skb) &&
fc741216 2229 !(features & NETIF_F_HW_VLAN_TX)) {
7b9c6090
JG
2230 skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb));
2231 if (unlikely(!skb))
2232 goto out;
2233
2234 skb->vlan_tci = 0;
2235 }
2236
fc741216 2237 if (netif_needs_gso(skb, features)) {
91ecb63c 2238 if (unlikely(dev_gso_segment(skb, features)))
9ccb8975
DM
2239 goto out_kfree_skb;
2240 if (skb->next)
2241 goto gso;
6afff0ca 2242 } else {
02932ce9 2243 if (skb_needs_linearize(skb, features) &&
6afff0ca
JF
2244 __skb_linearize(skb))
2245 goto out_kfree_skb;
2246
2247 /* If packet is not checksummed and device does not
2248 * support checksumming for this protocol, complete
2249 * checksumming here.
2250 */
2251 if (skb->ip_summed == CHECKSUM_PARTIAL) {
55508d60
MM
2252 skb_set_transport_header(skb,
2253 skb_checksum_start_offset(skb));
03634668 2254 if (!(features & NETIF_F_ALL_CSUM) &&
6afff0ca
JF
2255 skb_checksum_help(skb))
2256 goto out_kfree_skb;
2257 }
9ccb8975
DM
2258 }
2259
ec764bf0 2260 skb_len = skb->len;
ac45f602 2261 rc = ops->ndo_start_xmit(skb, dev);
ec764bf0 2262 trace_net_dev_xmit(skb, rc, dev, skb_len);
ec634fe3 2263 if (rc == NETDEV_TX_OK)
08baf561 2264 txq_trans_update(txq);
ac45f602 2265 return rc;
f6a78bfc
HX
2266 }
2267
576a30eb 2268gso:
f6a78bfc
HX
2269 do {
2270 struct sk_buff *nskb = skb->next;
f6a78bfc
HX
2271
2272 skb->next = nskb->next;
2273 nskb->next = NULL;
068a2de5
KK
2274
2275 /*
25985edc 2276 * If device doesn't need nskb->dst, release it right now while
068a2de5
KK
 2277 * it's hot in this CPU's cache.
2278 */
2279 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2280 skb_dst_drop(nskb);
2281
ec764bf0 2282 skb_len = nskb->len;
00829823 2283 rc = ops->ndo_start_xmit(nskb, dev);
ec764bf0 2284 trace_net_dev_xmit(nskb, rc, dev, skb_len);
ec634fe3 2285 if (unlikely(rc != NETDEV_TX_OK)) {
572a9d7b
PM
2286 if (rc & ~NETDEV_TX_MASK)
2287 goto out_kfree_gso_skb;
f54d9e8d 2288 nskb->next = skb->next;
f6a78bfc
HX
2289 skb->next = nskb;
2290 return rc;
2291 }
08baf561 2292 txq_trans_update(txq);
73466498 2293 if (unlikely(netif_xmit_stopped(txq) && skb->next))
f54d9e8d 2294 return NETDEV_TX_BUSY;
f6a78bfc 2295 } while (skb->next);
4ec93edb 2296
572a9d7b
PM
2297out_kfree_gso_skb:
2298 if (likely(skb->next == NULL))
2299 skb->destructor = DEV_GSO_CB(skb)->destructor;
f6a78bfc
HX
2300out_kfree_skb:
2301 kfree_skb(skb);
7b9c6090 2302out:
572a9d7b 2303 return rc;
f6a78bfc
HX
2304}
2305
0a9627f2 2306static u32 hashrnd __read_mostly;
b6b2fed1 2307
a3d22a68
VZ
2308/*
 2309 * Returns a Tx hash based on the given packet descriptor and the number of
 2310 * Tx queues to be used as a distribution range.
2311 */
2312u16 __skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb,
2313 unsigned int num_tx_queues)
8f0f2223 2314{
7019298a 2315 u32 hash;
4f57c087
JF
2316 u16 qoffset = 0;
2317 u16 qcount = num_tx_queues;
b6b2fed1 2318
513de11b
DM
2319 if (skb_rx_queue_recorded(skb)) {
2320 hash = skb_get_rx_queue(skb);
a3d22a68
VZ
2321 while (unlikely(hash >= num_tx_queues))
2322 hash -= num_tx_queues;
513de11b
DM
2323 return hash;
2324 }
ec581f6a 2325
4f57c087
JF
2326 if (dev->num_tc) {
2327 u8 tc = netdev_get_prio_tc_map(dev, skb->priority);
2328 qoffset = dev->tc_to_txq[tc].offset;
2329 qcount = dev->tc_to_txq[tc].count;
2330 }
2331
ec581f6a 2332 if (skb->sk && skb->sk->sk_hash)
7019298a 2333 hash = skb->sk->sk_hash;
ec581f6a 2334 else
87fd308c 2335 hash = (__force u16) skb->protocol ^ skb->rxhash;
0a9627f2 2336 hash = jhash_1word(hash, hashrnd);
b6b2fed1 2337
4f57c087 2338 return (u16) (((u64) hash * qcount) >> 32) + qoffset;
8f0f2223 2339}
a3d22a68 2340EXPORT_SYMBOL(__skb_tx_hash);
8f0f2223 2341
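/*
 * Worked example (not part of this file): the return statement above maps a
 * 32-bit hash uniformly onto [qoffset, qoffset + qcount) without a modulo:
 * ((u64)hash * qcount) >> 32 scales hash / 2^32, a value in [0, 1), by
 * qcount. For instance, with qcount = 8 and hash = 0xC0000000 (3/4 of the
 * 32-bit range), ((u64)0xC0000000 * 8) >> 32 == 6, so the skb is steered to
 * queue qoffset + 6.
 */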
ed04642f
ED
2342static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
2343{
2344 if (unlikely(queue_index >= dev->real_num_tx_queues)) {
2345 if (net_ratelimit()) {
7b6cd1ce
JP
2346 pr_warn("%s selects TX queue %d, but real number of TX queues is %d\n",
2347 dev->name, queue_index,
2348 dev->real_num_tx_queues);
ed04642f
ED
2349 }
2350 return 0;
2351 }
2352 return queue_index;
2353}
2354
1d24eb48
TH
2355static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
2356{
bf264145 2357#ifdef CONFIG_XPS
1d24eb48
TH
2358 struct xps_dev_maps *dev_maps;
2359 struct xps_map *map;
2360 int queue_index = -1;
2361
2362 rcu_read_lock();
2363 dev_maps = rcu_dereference(dev->xps_maps);
2364 if (dev_maps) {
2365 map = rcu_dereference(
2366 dev_maps->cpu_map[raw_smp_processor_id()]);
2367 if (map) {
2368 if (map->len == 1)
2369 queue_index = map->queues[0];
2370 else {
2371 u32 hash;
2372 if (skb->sk && skb->sk->sk_hash)
2373 hash = skb->sk->sk_hash;
2374 else
2375 hash = (__force u16) skb->protocol ^
2376 skb->rxhash;
2377 hash = jhash_1word(hash, hashrnd);
2378 queue_index = map->queues[
2379 ((u64)hash * map->len) >> 32];
2380 }
2381 if (unlikely(queue_index >= dev->real_num_tx_queues))
2382 queue_index = -1;
2383 }
2384 }
2385 rcu_read_unlock();
2386
2387 return queue_index;
2388#else
2389 return -1;
2390#endif
2391}
2392
e8a0464c
DM
2393static struct netdev_queue *dev_pick_tx(struct net_device *dev,
2394 struct sk_buff *skb)
2395{
b0f77d0e 2396 int queue_index;
deabc772 2397 const struct net_device_ops *ops = dev->netdev_ops;
a4ee3ce3 2398
3853b584
TH
2399 if (dev->real_num_tx_queues == 1)
2400 queue_index = 0;
2401 else if (ops->ndo_select_queue) {
deabc772
HS
2402 queue_index = ops->ndo_select_queue(dev, skb);
2403 queue_index = dev_cap_txqueue(dev, queue_index);
2404 } else {
2405 struct sock *sk = skb->sk;
2406 queue_index = sk_tx_queue_get(sk);
a4ee3ce3 2407
3853b584
TH
2408 if (queue_index < 0 || skb->ooo_okay ||
2409 queue_index >= dev->real_num_tx_queues) {
2410 int old_index = queue_index;
fd2ea0a7 2411
1d24eb48
TH
2412 queue_index = get_xps_queue(dev, skb);
2413 if (queue_index < 0)
2414 queue_index = skb_tx_hash(dev, skb);
3853b584
TH
2415
2416 if (queue_index != old_index && sk) {
2417 struct dst_entry *dst =
2418 rcu_dereference_check(sk->sk_dst_cache, 1);
8728c544
ED
2419
2420 if (dst && skb_dst(skb) == dst)
2421 sk_tx_queue_set(sk, queue_index);
2422 }
a4ee3ce3
KK
2423 }
2424 }
eae792b7 2425
fd2ea0a7
DM
2426 skb_set_queue_mapping(skb, queue_index);
2427 return netdev_get_tx_queue(dev, queue_index);
e8a0464c
DM
2428}
2429
bbd8a0d3
KK
2430static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2431 struct net_device *dev,
2432 struct netdev_queue *txq)
2433{
2434 spinlock_t *root_lock = qdisc_lock(q);
a2da570d 2435 bool contended;
bbd8a0d3
KK
2436 int rc;
2437
a2da570d
ED
2438 qdisc_skb_cb(skb)->pkt_len = skb->len;
2439 qdisc_calculate_pkt_len(skb, q);
79640a4c
ED
2440 /*
2441 * Heuristic to force contended enqueues to serialize on a
 2442 * separate lock before trying to get the qdisc main lock.
 2443 * This permits the __QDISC_STATE_RUNNING owner to get the lock more often
2444 * and dequeue packets faster.
2445 */
a2da570d 2446 contended = qdisc_is_running(q);
79640a4c
ED
2447 if (unlikely(contended))
2448 spin_lock(&q->busylock);
2449
bbd8a0d3
KK
2450 spin_lock(root_lock);
2451 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2452 kfree_skb(skb);
2453 rc = NET_XMIT_DROP;
2454 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
bc135b23 2455 qdisc_run_begin(q)) {
bbd8a0d3
KK
2456 /*
2457 * This is a work-conserving queue; there are no old skbs
2458 * waiting to be sent out; and the qdisc is not running -
2459 * xmit the skb directly.
2460 */
7fee226a
ED
2461 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2462 skb_dst_force(skb);
bfe0d029 2463
bfe0d029
ED
2464 qdisc_bstats_update(q, skb);
2465
79640a4c
ED
2466 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2467 if (unlikely(contended)) {
2468 spin_unlock(&q->busylock);
2469 contended = false;
2470 }
bbd8a0d3 2471 __qdisc_run(q);
79640a4c 2472 } else
bc135b23 2473 qdisc_run_end(q);
bbd8a0d3
KK
2474
2475 rc = NET_XMIT_SUCCESS;
2476 } else {
7fee226a 2477 skb_dst_force(skb);
a2da570d 2478 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
79640a4c
ED
2479 if (qdisc_run_begin(q)) {
2480 if (unlikely(contended)) {
2481 spin_unlock(&q->busylock);
2482 contended = false;
2483 }
2484 __qdisc_run(q);
2485 }
bbd8a0d3
KK
2486 }
2487 spin_unlock(root_lock);
79640a4c
ED
2488 if (unlikely(contended))
2489 spin_unlock(&q->busylock);
bbd8a0d3
KK
2490 return rc;
2491}
2492
5bc1421e
NH
2493#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2494static void skb_update_prio(struct sk_buff *skb)
2495{
6977a79d 2496 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
5bc1421e
NH
2497
2498 if ((!skb->priority) && (skb->sk) && map)
2499 skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx];
2500}
2501#else
2502#define skb_update_prio(skb)
2503#endif
2504
745e20f1 2505static DEFINE_PER_CPU(int, xmit_recursion);
11a766ce 2506#define RECURSION_LIMIT 10
745e20f1 2507
d29f749e
DJ
2508/**
2509 * dev_queue_xmit - transmit a buffer
2510 * @skb: buffer to transmit
2511 *
2512 * Queue a buffer for transmission to a network device. The caller must
2513 * have set the device and priority and built the buffer before calling
2514 * this function. The function can be called from an interrupt.
2515 *
2516 * A negative errno code is returned on a failure. A success does not
2517 * guarantee the frame will be transmitted as it may be dropped due
2518 * to congestion or traffic shaping.
2519 *
2520 * -----------------------------------------------------------------------------------
2521 * I notice this method can also return errors from the queue disciplines,
2522 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2523 * be positive.
2524 *
2525 * Regardless of the return value, the skb is consumed, so it is currently
2526 * difficult to retry a send to this method. (You can bump the ref count
2527 * before sending to hold a reference for retry if you are careful.)
2528 *
2529 * When calling this method, interrupts MUST be enabled. This is because
2530 * the BH enable code must have IRQs enabled so that it will not deadlock.
2531 * --BLG
2532 */
1da177e4
LT
2533int dev_queue_xmit(struct sk_buff *skb)
2534{
2535 struct net_device *dev = skb->dev;
dc2b4847 2536 struct netdev_queue *txq;
1da177e4
LT
2537 struct Qdisc *q;
2538 int rc = -ENOMEM;
2539
4ec93edb
YH
2540 /* Disable soft irqs for various locks below. Also
2541 * stops preemption for RCU.
1da177e4 2542 */
4ec93edb 2543 rcu_read_lock_bh();
1da177e4 2544
5bc1421e
NH
2545 skb_update_prio(skb);
2546
eae792b7 2547 txq = dev_pick_tx(dev, skb);
a898def2 2548 q = rcu_dereference_bh(txq->qdisc);
37437bb2 2549
1da177e4 2550#ifdef CONFIG_NET_CLS_ACT
d1b19dff 2551 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
1da177e4 2552#endif
cf66ba58 2553 trace_net_dev_queue(skb);
1da177e4 2554 if (q->enqueue) {
bbd8a0d3 2555 rc = __dev_xmit_skb(skb, q, dev, txq);
37437bb2 2556 goto out;
1da177e4
LT
2557 }
2558
2559 /* The device has no queue. Common case for software devices:
 2560 loopback, all sorts of tunnels...
2561
932ff279
HX
2562 Really, it is unlikely that netif_tx_lock protection is necessary
 2563 here. (e.g. loopback and IP tunnels are clean, ignoring statistics
1da177e4
LT
2564 counters.)
 2565 However, it is possible that they rely on the protection
 2566 we provide here.
 2567
 2568 Check this and take the lock. It is not prone to deadlocks.
 2569 Or shoot the noqueue qdisc; it is even simpler 8)
2570 */
2571 if (dev->flags & IFF_UP) {
2572 int cpu = smp_processor_id(); /* ok because BHs are off */
2573
c773e847 2574 if (txq->xmit_lock_owner != cpu) {
1da177e4 2575
745e20f1
ED
2576 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2577 goto recursion_alert;
2578
c773e847 2579 HARD_TX_LOCK(dev, txq, cpu);
1da177e4 2580
73466498 2581 if (!netif_xmit_stopped(txq)) {
745e20f1 2582 __this_cpu_inc(xmit_recursion);
572a9d7b 2583 rc = dev_hard_start_xmit(skb, dev, txq);
745e20f1 2584 __this_cpu_dec(xmit_recursion);
572a9d7b 2585 if (dev_xmit_complete(rc)) {
c773e847 2586 HARD_TX_UNLOCK(dev, txq);
1da177e4
LT
2587 goto out;
2588 }
2589 }
c773e847 2590 HARD_TX_UNLOCK(dev, txq);
1da177e4 2591 if (net_ratelimit())
7b6cd1ce
JP
2592 pr_crit("Virtual device %s asks to queue packet!\n",
2593 dev->name);
1da177e4
LT
2594 } else {
2595 /* Recursion is detected! It is possible,
745e20f1
ED
2596 * unfortunately
2597 */
2598recursion_alert:
1da177e4 2599 if (net_ratelimit())
7b6cd1ce
JP
2600 pr_crit("Dead loop on virtual device %s, fix it urgently!\n",
2601 dev->name);
1da177e4
LT
2602 }
2603 }
2604
2605 rc = -ENETDOWN;
d4828d85 2606 rcu_read_unlock_bh();
1da177e4 2607
1da177e4
LT
2608 kfree_skb(skb);
2609 return rc;
2610out:
d4828d85 2611 rcu_read_unlock_bh();
1da177e4
LT
2612 return rc;
2613}
d1b19dff 2614EXPORT_SYMBOL(dev_queue_xmit);
1da177e4
LT
2615
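/*
 * Illustrative sketch (hypothetical, not part of this file): a minimal
 * caller of dev_queue_xmit(). As documented above, the buffer, device and
 * protocol must be set up before the call, and the skb is consumed
 * regardless of the result. 'frame' is assumed to be a complete Ethernet
 * frame of 'len' bytes whose EtherType is 'proto'.
 */
static int example_send_frame(struct net_device *dev, const void *frame,
                              unsigned int len, __be16 proto)
{
        struct sk_buff *skb;

        skb = alloc_skb(len, GFP_ATOMIC);
        if (!skb)
                return -ENOMEM;

        memcpy(skb_put(skb, len), frame, len);
        skb_reset_mac_header(skb);
        skb_set_network_header(skb, ETH_HLEN);
        skb->dev = dev;
        skb->protocol = proto;

        return dev_queue_xmit(skb);     /* skb is consumed here */
}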
2616
2617/*=======================================================================
2618 Receiver routines
2619 =======================================================================*/
2620
6b2bedc3 2621int netdev_max_backlog __read_mostly = 1000;
3b098e2d 2622int netdev_tstamp_prequeue __read_mostly = 1;
6b2bedc3
SH
2623int netdev_budget __read_mostly = 300;
2624int weight_p __read_mostly = 64; /* old backlog weight */
1da177e4 2625
eecfd7c4
ED
2626/* Called with irq disabled */
2627static inline void ____napi_schedule(struct softnet_data *sd,
2628 struct napi_struct *napi)
2629{
2630 list_add_tail(&napi->poll_list, &sd->poll_list);
2631 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2632}
2633
0a9627f2 2634/*
bfb564e7 2635 * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
bdeab991
TH
 2636 * and src/dst port numbers. Sets rxhash in skb to a non-zero hash value
 2637 * on success; zero indicates no valid hash. Also sets l4_rxhash in skb
2638 * if hash is a canonical 4-tuple hash over transport ports.
0a9627f2 2639 */
bdeab991 2640void __skb_get_rxhash(struct sk_buff *skb)
0a9627f2 2641{
4504b861
ED
2642 struct flow_keys keys;
2643 u32 hash;
c6865cb3 2644
4504b861
ED
2645 if (!skb_flow_dissect(skb, &keys))
2646 return;
e971b722 2647
4504b861
ED
2648 if (keys.ports) {
2649 if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
2650 swap(keys.port16[0], keys.port16[1]);
2651 skb->l4_rxhash = 1;
0a9627f2
TH
2652 }
2653
b249dcb8 2654 /* get a consistent hash (same value on both flow directions) */
4504b861
ED
2655 if ((__force u32)keys.dst < (__force u32)keys.src)
2656 swap(keys.dst, keys.src);
0a9627f2 2657
4504b861
ED
2658 hash = jhash_3words((__force u32)keys.dst,
2659 (__force u32)keys.src,
2660 (__force u32)keys.ports, hashrnd);
bfb564e7
KK
2661 if (!hash)
2662 hash = 1;
2663
bdeab991 2664 skb->rxhash = hash;
bfb564e7
KK
2665}
2666EXPORT_SYMBOL(__skb_get_rxhash);
2667
2668#ifdef CONFIG_RPS
2669
2670/* One global table that all flow-based protocols share. */
6e3f7faf 2671struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
bfb564e7
KK
2672EXPORT_SYMBOL(rps_sock_flow_table);
2673
c5905afb 2674struct static_key rps_needed __read_mostly;
adc9300e 2675
c445477d
BH
2676static struct rps_dev_flow *
2677set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2678 struct rps_dev_flow *rflow, u16 next_cpu)
2679{
09994d1b 2680 if (next_cpu != RPS_NO_CPU) {
c445477d
BH
2681#ifdef CONFIG_RFS_ACCEL
2682 struct netdev_rx_queue *rxqueue;
2683 struct rps_dev_flow_table *flow_table;
2684 struct rps_dev_flow *old_rflow;
2685 u32 flow_id;
2686 u16 rxq_index;
2687 int rc;
2688
2689 /* Should we steer this flow to a different hardware queue? */
69a19ee6
BH
2690 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2691 !(dev->features & NETIF_F_NTUPLE))
c445477d
BH
2692 goto out;
2693 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2694 if (rxq_index == skb_get_rx_queue(skb))
2695 goto out;
2696
2697 rxqueue = dev->_rx + rxq_index;
2698 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2699 if (!flow_table)
2700 goto out;
2701 flow_id = skb->rxhash & flow_table->mask;
2702 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2703 rxq_index, flow_id);
2704 if (rc < 0)
2705 goto out;
2706 old_rflow = rflow;
2707 rflow = &flow_table->flows[flow_id];
c445477d
BH
2708 rflow->filter = rc;
2709 if (old_rflow->filter == rflow->filter)
2710 old_rflow->filter = RPS_NO_FILTER;
2711 out:
2712#endif
2713 rflow->last_qtail =
09994d1b 2714 per_cpu(softnet_data, next_cpu).input_queue_head;
c445477d
BH
2715 }
2716
09994d1b 2717 rflow->cpu = next_cpu;
c445477d
BH
2718 return rflow;
2719}
2720
bfb564e7
KK
2721/*
2722 * get_rps_cpu is called from netif_receive_skb and returns the target
2723 * CPU from the RPS map of the receiving queue for a given skb.
2724 * rcu_read_lock must be held on entry.
2725 */
2726static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2727 struct rps_dev_flow **rflowp)
2728{
2729 struct netdev_rx_queue *rxqueue;
6e3f7faf 2730 struct rps_map *map;
bfb564e7
KK
2731 struct rps_dev_flow_table *flow_table;
2732 struct rps_sock_flow_table *sock_flow_table;
2733 int cpu = -1;
2734 u16 tcpu;
2735
2736 if (skb_rx_queue_recorded(skb)) {
2737 u16 index = skb_get_rx_queue(skb);
62fe0b40
BH
2738 if (unlikely(index >= dev->real_num_rx_queues)) {
2739 WARN_ONCE(dev->real_num_rx_queues > 1,
2740 "%s received packet on queue %u, but number "
2741 "of RX queues is %u\n",
2742 dev->name, index, dev->real_num_rx_queues);
bfb564e7
KK
2743 goto done;
2744 }
2745 rxqueue = dev->_rx + index;
2746 } else
2747 rxqueue = dev->_rx;
2748
6e3f7faf
ED
2749 map = rcu_dereference(rxqueue->rps_map);
2750 if (map) {
85875236 2751 if (map->len == 1 &&
33d480ce 2752 !rcu_access_pointer(rxqueue->rps_flow_table)) {
6febfca9
CG
2753 tcpu = map->cpus[0];
2754 if (cpu_online(tcpu))
2755 cpu = tcpu;
2756 goto done;
2757 }
33d480ce 2758 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
bfb564e7 2759 goto done;
6febfca9 2760 }
bfb564e7 2761
2d47b459 2762 skb_reset_network_header(skb);
bfb564e7
KK
2763 if (!skb_get_rxhash(skb))
2764 goto done;
2765
fec5e652
TH
2766 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2767 sock_flow_table = rcu_dereference(rps_sock_flow_table);
2768 if (flow_table && sock_flow_table) {
2769 u16 next_cpu;
2770 struct rps_dev_flow *rflow;
2771
2772 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
2773 tcpu = rflow->cpu;
2774
2775 next_cpu = sock_flow_table->ents[skb->rxhash &
2776 sock_flow_table->mask];
2777
2778 /*
2779 * If the desired CPU (where last recvmsg was done) is
2780 * different from current CPU (one in the rx-queue flow
2781 * table entry), switch if one of the following holds:
2782 * - Current CPU is unset (equal to RPS_NO_CPU).
2783 * - Current CPU is offline.
2784 * - The current CPU's queue tail has advanced beyond the
2785 * last packet that was enqueued using this table entry.
2786 * This guarantees that all previous packets for the flow
2787 * have been dequeued, thus preserving in order delivery.
2788 */
2789 if (unlikely(tcpu != next_cpu) &&
2790 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
2791 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
c445477d
BH
2792 rflow->last_qtail)) >= 0))
2793 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
2794
fec5e652
TH
2795 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
2796 *rflowp = rflow;
2797 cpu = tcpu;
2798 goto done;
2799 }
2800 }
2801
0a9627f2 2802 if (map) {
fec5e652 2803 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
0a9627f2
TH
2804
2805 if (cpu_online(tcpu)) {
2806 cpu = tcpu;
2807 goto done;
2808 }
2809 }
2810
2811done:
0a9627f2
TH
2812 return cpu;
2813}
2814
c445477d
BH
2815#ifdef CONFIG_RFS_ACCEL
2816
2817/**
2818 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
2819 * @dev: Device on which the filter was set
2820 * @rxq_index: RX queue index
2821 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
2822 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
2823 *
2824 * Drivers that implement ndo_rx_flow_steer() should periodically call
2825 * this function for each installed filter and remove the filters for
2826 * which it returns %true.
2827 */
2828bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
2829 u32 flow_id, u16 filter_id)
2830{
2831 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
2832 struct rps_dev_flow_table *flow_table;
2833 struct rps_dev_flow *rflow;
2834 bool expire = true;
2835 int cpu;
2836
2837 rcu_read_lock();
2838 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2839 if (flow_table && flow_id <= flow_table->mask) {
2840 rflow = &flow_table->flows[flow_id];
2841 cpu = ACCESS_ONCE(rflow->cpu);
2842 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
2843 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
2844 rflow->last_qtail) <
2845 (int)(10 * flow_table->mask)))
2846 expire = false;
2847 }
2848 rcu_read_unlock();
2849 return expire;
2850}
2851EXPORT_SYMBOL(rps_may_expire_flow);
2852
2853#endif /* CONFIG_RFS_ACCEL */
2854
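#ifdef CONFIG_RFS_ACCEL
/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * as documented above, a driver implementing ndo_rx_flow_steer() scans its
 * installed filters periodically and removes the ones rps_may_expire_flow()
 * reports as stale. The example_filter bookkeeping is made up here.
 */
struct example_filter {
        u32 flow_id;
        u16 filter_id;
        u16 rxq_index;
        bool in_use;
};

static void example_expire_filters(struct net_device *dev,
                                   struct example_filter *tbl, int n)
{
        int i;

        for (i = 0; i < n; i++) {
                if (!tbl[i].in_use)
                        continue;
                if (rps_may_expire_flow(dev, tbl[i].rxq_index,
                                        tbl[i].flow_id, tbl[i].filter_id)) {
                        /* ... remove the hardware filter here ... */
                        tbl[i].in_use = false;
                }
        }
}
#endif /* CONFIG_RFS_ACCEL */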
0a9627f2 2855/* Called from hardirq (IPI) context */
e36fa2f7 2856static void rps_trigger_softirq(void *data)
0a9627f2 2857{
e36fa2f7
ED
2858 struct softnet_data *sd = data;
2859
eecfd7c4 2860 ____napi_schedule(sd, &sd->backlog);
dee42870 2861 sd->received_rps++;
0a9627f2 2862}
e36fa2f7 2863
fec5e652 2864#endif /* CONFIG_RPS */
0a9627f2 2865
e36fa2f7
ED
2866/*
 2867 * Check if this softnet_data structure belongs to another CPU.
 2868 * If yes, queue it to our IPI list and return 1.
 2869 * If no, return 0.
2870 */
2871static int rps_ipi_queued(struct softnet_data *sd)
2872{
2873#ifdef CONFIG_RPS
2874 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
2875
2876 if (sd != mysd) {
2877 sd->rps_ipi_next = mysd->rps_ipi_list;
2878 mysd->rps_ipi_list = sd;
2879
2880 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2881 return 1;
2882 }
2883#endif /* CONFIG_RPS */
2884 return 0;
2885}
2886
0a9627f2
TH
2887/*
2888 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2889 * queue (may be a remote CPU queue).
2890 */
fec5e652
TH
2891static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
2892 unsigned int *qtail)
0a9627f2 2893{
e36fa2f7 2894 struct softnet_data *sd;
0a9627f2
TH
2895 unsigned long flags;
2896
e36fa2f7 2897 sd = &per_cpu(softnet_data, cpu);
0a9627f2
TH
2898
2899 local_irq_save(flags);
0a9627f2 2900
e36fa2f7 2901 rps_lock(sd);
6e7676c1
CG
2902 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
2903 if (skb_queue_len(&sd->input_pkt_queue)) {
0a9627f2 2904enqueue:
e36fa2f7 2905 __skb_queue_tail(&sd->input_pkt_queue, skb);
76cc8b13 2906 input_queue_tail_incr_save(sd, qtail);
e36fa2f7 2907 rps_unlock(sd);
152102c7 2908 local_irq_restore(flags);
0a9627f2
TH
2909 return NET_RX_SUCCESS;
2910 }
2911
ebda37c2
ED
2912 /* Schedule NAPI for backlog device
 2913 * We can use a non-atomic operation since we own the queue lock.
2914 */
2915 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
e36fa2f7 2916 if (!rps_ipi_queued(sd))
eecfd7c4 2917 ____napi_schedule(sd, &sd->backlog);
0a9627f2
TH
2918 }
2919 goto enqueue;
2920 }
2921
dee42870 2922 sd->dropped++;
e36fa2f7 2923 rps_unlock(sd);
0a9627f2 2924
0a9627f2
TH
2925 local_irq_restore(flags);
2926
caf586e5 2927 atomic_long_inc(&skb->dev->rx_dropped);
0a9627f2
TH
2928 kfree_skb(skb);
2929 return NET_RX_DROP;
2930}
1da177e4 2931
1da177e4
LT
2932/**
2933 * netif_rx - post buffer to the network code
2934 * @skb: buffer to post
2935 *
2936 * This function receives a packet from a device driver and queues it for
2937 * the upper (protocol) levels to process. It always succeeds. The buffer
2938 * may be dropped during processing for congestion control or by the
2939 * protocol layers.
2940 *
2941 * return values:
2942 * NET_RX_SUCCESS (no congestion)
1da177e4
LT
2943 * NET_RX_DROP (packet was dropped)
2944 *
2945 */
2946
2947int netif_rx(struct sk_buff *skb)
2948{
b0e28f1e 2949 int ret;
1da177e4
LT
2950
2951 /* if netpoll wants it, pretend we never saw it */
2952 if (netpoll_rx(skb))
2953 return NET_RX_DROP;
2954
588f0330 2955 net_timestamp_check(netdev_tstamp_prequeue, skb);
1da177e4 2956
cf66ba58 2957 trace_netif_rx(skb);
df334545 2958#ifdef CONFIG_RPS
c5905afb 2959 if (static_key_false(&rps_needed)) {
fec5e652 2960 struct rps_dev_flow voidflow, *rflow = &voidflow;
b0e28f1e
ED
2961 int cpu;
2962
cece1945 2963 preempt_disable();
b0e28f1e 2964 rcu_read_lock();
fec5e652
TH
2965
2966 cpu = get_rps_cpu(skb->dev, skb, &rflow);
b0e28f1e
ED
2967 if (cpu < 0)
2968 cpu = smp_processor_id();
fec5e652
TH
2969
2970 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
2971
b0e28f1e 2972 rcu_read_unlock();
cece1945 2973 preempt_enable();
adc9300e
ED
2974 } else
2975#endif
fec5e652
TH
2976 {
2977 unsigned int qtail;
2978 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
2979 put_cpu();
2980 }
b0e28f1e 2981 return ret;
1da177e4 2982}
d1b19dff 2983EXPORT_SYMBOL(netif_rx);
1da177e4
LT
2984
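/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * the classic non-NAPI receive path ends with netif_rx() from the device's
 * interrupt handler. 'buf' holds a complete received Ethernet frame.
 */
static void example_rx_irq(struct net_device *dev, const void *buf,
                           unsigned int len)
{
        struct sk_buff *skb;

        skb = netdev_alloc_skb_ip_align(dev, len);
        if (!skb)
                return;                         /* frame is dropped */

        memcpy(skb_put(skb, len), buf, len);
        skb->protocol = eth_type_trans(skb, dev);       /* also sets skb->dev */
        netif_rx(skb);                          /* queue to the backlog */
}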
2985int netif_rx_ni(struct sk_buff *skb)
2986{
2987 int err;
2988
2989 preempt_disable();
2990 err = netif_rx(skb);
2991 if (local_softirq_pending())
2992 do_softirq();
2993 preempt_enable();
2994
2995 return err;
2996}
1da177e4
LT
2997EXPORT_SYMBOL(netif_rx_ni);
2998
1da177e4
LT
2999static void net_tx_action(struct softirq_action *h)
3000{
3001 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3002
3003 if (sd->completion_queue) {
3004 struct sk_buff *clist;
3005
3006 local_irq_disable();
3007 clist = sd->completion_queue;
3008 sd->completion_queue = NULL;
3009 local_irq_enable();
3010
3011 while (clist) {
3012 struct sk_buff *skb = clist;
3013 clist = clist->next;
3014
547b792c 3015 WARN_ON(atomic_read(&skb->users));
07dc22e7 3016 trace_kfree_skb(skb, net_tx_action);
1da177e4
LT
3017 __kfree_skb(skb);
3018 }
3019 }
3020
3021 if (sd->output_queue) {
37437bb2 3022 struct Qdisc *head;
1da177e4
LT
3023
3024 local_irq_disable();
3025 head = sd->output_queue;
3026 sd->output_queue = NULL;
a9cbd588 3027 sd->output_queue_tailp = &sd->output_queue;
1da177e4
LT
3028 local_irq_enable();
3029
3030 while (head) {
37437bb2
DM
3031 struct Qdisc *q = head;
3032 spinlock_t *root_lock;
3033
1da177e4
LT
3034 head = head->next_sched;
3035
5fb66229 3036 root_lock = qdisc_lock(q);
37437bb2 3037 if (spin_trylock(root_lock)) {
def82a1d
JP
3038 smp_mb__before_clear_bit();
3039 clear_bit(__QDISC_STATE_SCHED,
3040 &q->state);
37437bb2
DM
3041 qdisc_run(q);
3042 spin_unlock(root_lock);
1da177e4 3043 } else {
195648bb 3044 if (!test_bit(__QDISC_STATE_DEACTIVATED,
e8a83e10 3045 &q->state)) {
195648bb 3046 __netif_reschedule(q);
e8a83e10
JP
3047 } else {
3048 smp_mb__before_clear_bit();
3049 clear_bit(__QDISC_STATE_SCHED,
3050 &q->state);
3051 }
1da177e4
LT
3052 }
3053 }
3054 }
3055}
3056
ab95bfe0
JP
3057#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3058 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
da678292
MM
3059/* This hook is defined here for ATM LANE */
3060int (*br_fdb_test_addr_hook)(struct net_device *dev,
3061 unsigned char *addr) __read_mostly;
4fb019a0 3062EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
da678292 3063#endif
1da177e4 3064
1da177e4
LT
3065#ifdef CONFIG_NET_CLS_ACT
3066/* TODO: Maybe we should just force sch_ingress to be compiled in
3067 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3068 * a compare and 2 stores extra right now if we dont have it on
3069 * but have CONFIG_NET_CLS_ACT
25985edc
LDM
3070 * NOTE: This doesn't stop any functionality; if you dont have
3071 * the ingress scheduler, you just can't add policies on ingress.
1da177e4
LT
3072 *
3073 */
24824a09 3074static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
1da177e4 3075{
1da177e4 3076 struct net_device *dev = skb->dev;
f697c3e8 3077 u32 ttl = G_TC_RTTL(skb->tc_verd);
555353cf
DM
3078 int result = TC_ACT_OK;
3079 struct Qdisc *q;
4ec93edb 3080
de384830
SH
3081 if (unlikely(MAX_RED_LOOP < ttl++)) {
3082 if (net_ratelimit())
7b6cd1ce
JP
3083 pr_warn("Redir loop detected Dropping packet (%d->%d)\n",
3084 skb->skb_iif, dev->ifindex);
f697c3e8
HX
3085 return TC_ACT_SHOT;
3086 }
1da177e4 3087
f697c3e8
HX
3088 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3089 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
1da177e4 3090
83874000 3091 q = rxq->qdisc;
8d50b53d 3092 if (q != &noop_qdisc) {
83874000 3093 spin_lock(qdisc_lock(q));
a9312ae8
DM
3094 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3095 result = qdisc_enqueue_root(skb, q);
83874000
DM
3096 spin_unlock(qdisc_lock(q));
3097 }
f697c3e8
HX
3098
3099 return result;
3100}
86e65da9 3101
f697c3e8
HX
3102static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3103 struct packet_type **pt_prev,
3104 int *ret, struct net_device *orig_dev)
3105{
24824a09
ED
3106 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3107
3108 if (!rxq || rxq->qdisc == &noop_qdisc)
f697c3e8 3109 goto out;
1da177e4 3110
f697c3e8
HX
3111 if (*pt_prev) {
3112 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3113 *pt_prev = NULL;
1da177e4
LT
3114 }
3115
24824a09 3116 switch (ing_filter(skb, rxq)) {
f697c3e8
HX
3117 case TC_ACT_SHOT:
3118 case TC_ACT_STOLEN:
3119 kfree_skb(skb);
3120 return NULL;
3121 }
3122
3123out:
3124 skb->tc_verd = 0;
3125 return skb;
1da177e4
LT
3126}
3127#endif
3128
ab95bfe0
JP
3129/**
3130 * netdev_rx_handler_register - register receive handler
3131 * @dev: device to register a handler for
3132 * @rx_handler: receive handler to register
93e2c32b 3133 * @rx_handler_data: data pointer that is used by rx handler
ab95bfe0
JP
3134 *
 3135 * Register a receive handler for a device. This handler will then be
3136 * called from __netif_receive_skb. A negative errno code is returned
3137 * on a failure.
3138 *
3139 * The caller must hold the rtnl_mutex.
8a4eb573
JP
3140 *
3141 * For a general description of rx_handler, see enum rx_handler_result.
ab95bfe0
JP
3142 */
3143int netdev_rx_handler_register(struct net_device *dev,
93e2c32b
JP
3144 rx_handler_func_t *rx_handler,
3145 void *rx_handler_data)
ab95bfe0
JP
3146{
3147 ASSERT_RTNL();
3148
3149 if (dev->rx_handler)
3150 return -EBUSY;
3151
93e2c32b 3152 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
ab95bfe0
JP
3153 rcu_assign_pointer(dev->rx_handler, rx_handler);
3154
3155 return 0;
3156}
3157EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3158
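/*
 * Illustrative sketch (hypothetical, not part of this file): a minimal
 * rx_handler in the style of bridge/macvlan/bonding that lets every frame
 * continue up the stack. __netif_receive_skb() below invokes it under
 * rcu_read_lock() and acts on the returned rx_handler_result_t.
 */
static rx_handler_result_t example_handle_frame(struct sk_buff **pskb)
{
        struct sk_buff *skb = *pskb;
        void *port = rcu_dereference(skb->dev->rx_handler_data);

        (void)port;             /* per-port state passed at registration time */
        return RX_HANDLER_PASS; /* see enum rx_handler_result for the others */
}

/*
 * Registration, with rtnl_lock() held:
 *      err = netdev_rx_handler_register(dev, example_handle_frame, port_priv);
 */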
3159/**
3160 * netdev_rx_handler_unregister - unregister receive handler
3161 * @dev: device to unregister a handler from
3162 *
 3163 * Unregister a receive handler from a device.
3164 *
3165 * The caller must hold the rtnl_mutex.
3166 */
3167void netdev_rx_handler_unregister(struct net_device *dev)
3168{
3169
3170 ASSERT_RTNL();
a9b3cd7f
SH
3171 RCU_INIT_POINTER(dev->rx_handler, NULL);
3172 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
ab95bfe0
JP
3173}
3174EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3175
10f744d2 3176static int __netif_receive_skb(struct sk_buff *skb)
1da177e4
LT
3177{
3178 struct packet_type *ptype, *pt_prev;
ab95bfe0 3179 rx_handler_func_t *rx_handler;
f2ccd8fa 3180 struct net_device *orig_dev;
63d8ea7f 3181 struct net_device *null_or_dev;
8a4eb573 3182 bool deliver_exact = false;
1da177e4 3183 int ret = NET_RX_DROP;
252e3346 3184 __be16 type;
1da177e4 3185
588f0330 3186 net_timestamp_check(!netdev_tstamp_prequeue, skb);
81bbb3d4 3187
cf66ba58 3188 trace_netif_receive_skb(skb);
9b22ea56 3189
1da177e4 3190 /* if we've gotten here through NAPI, check netpoll */
bea3348e 3191 if (netpoll_receive_skb(skb))
1da177e4
LT
3192 return NET_RX_DROP;
3193
8964be4a
ED
3194 if (!skb->skb_iif)
3195 skb->skb_iif = skb->dev->ifindex;
cc9bd5ce 3196 orig_dev = skb->dev;
8f903c70 3197
c1d2bbe1 3198 skb_reset_network_header(skb);
badff6d0 3199 skb_reset_transport_header(skb);
0b5c9db1 3200 skb_reset_mac_len(skb);
1da177e4
LT
3201
3202 pt_prev = NULL;
3203
3204 rcu_read_lock();
3205
63d8ea7f
DM
3206another_round:
3207
3208 __this_cpu_inc(softnet_data.processed);
3209
bcc6d479
JP
3210 if (skb->protocol == cpu_to_be16(ETH_P_8021Q)) {
3211 skb = vlan_untag(skb);
3212 if (unlikely(!skb))
3213 goto out;
3214 }
3215
1da177e4
LT
3216#ifdef CONFIG_NET_CLS_ACT
3217 if (skb->tc_verd & TC_NCLS) {
3218 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3219 goto ncls;
3220 }
3221#endif
3222
3223 list_for_each_entry_rcu(ptype, &ptype_all, list) {
63d8ea7f 3224 if (!ptype->dev || ptype->dev == skb->dev) {
4ec93edb 3225 if (pt_prev)
f2ccd8fa 3226 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3227 pt_prev = ptype;
3228 }
3229 }
3230
3231#ifdef CONFIG_NET_CLS_ACT
f697c3e8
HX
3232 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3233 if (!skb)
1da177e4 3234 goto out;
1da177e4
LT
3235ncls:
3236#endif
3237
6a32e4f9 3238 rx_handler = rcu_dereference(skb->dev->rx_handler);
2425717b
JF
3239 if (vlan_tx_tag_present(skb)) {
3240 if (pt_prev) {
3241 ret = deliver_skb(skb, pt_prev, orig_dev);
3242 pt_prev = NULL;
3243 }
6a32e4f9 3244 if (vlan_do_receive(&skb, !rx_handler))
2425717b
JF
3245 goto another_round;
3246 else if (unlikely(!skb))
3247 goto out;
3248 }
3249
ab95bfe0
JP
3250 if (rx_handler) {
3251 if (pt_prev) {
3252 ret = deliver_skb(skb, pt_prev, orig_dev);
3253 pt_prev = NULL;
3254 }
8a4eb573
JP
3255 switch (rx_handler(&skb)) {
3256 case RX_HANDLER_CONSUMED:
ab95bfe0 3257 goto out;
8a4eb573 3258 case RX_HANDLER_ANOTHER:
63d8ea7f 3259 goto another_round;
8a4eb573
JP
3260 case RX_HANDLER_EXACT:
3261 deliver_exact = true;
3262 case RX_HANDLER_PASS:
3263 break;
3264 default:
3265 BUG();
3266 }
ab95bfe0 3267 }
1da177e4 3268
63d8ea7f 3269 /* deliver only exact match when indicated */
8a4eb573 3270 null_or_dev = deliver_exact ? skb->dev : NULL;
1f3c8804 3271
1da177e4 3272 type = skb->protocol;
82d8a867
PE
3273 list_for_each_entry_rcu(ptype,
3274 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
63d8ea7f 3275 if (ptype->type == type &&
e3f48d37
JP
3276 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3277 ptype->dev == orig_dev)) {
4ec93edb 3278 if (pt_prev)
f2ccd8fa 3279 ret = deliver_skb(skb, pt_prev, orig_dev);
1da177e4
LT
3280 pt_prev = ptype;
3281 }
3282 }
3283
3284 if (pt_prev) {
f2ccd8fa 3285 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1da177e4 3286 } else {
caf586e5 3287 atomic_long_inc(&skb->dev->rx_dropped);
1da177e4
LT
3288 kfree_skb(skb);
 3289 /* Jamal, now you will not be able to escape explaining
 3290 * to me how you were going to use this. :-)
3291 */
3292 ret = NET_RX_DROP;
3293 }
3294
3295out:
3296 rcu_read_unlock();
3297 return ret;
3298}
0a9627f2
TH
3299
3300/**
3301 * netif_receive_skb - process receive buffer from network
3302 * @skb: buffer to process
3303 *
3304 * netif_receive_skb() is the main receive data processing function.
3305 * It always succeeds. The buffer may be dropped during processing
3306 * for congestion control or by the protocol layers.
3307 *
3308 * This function may only be called from softirq context and interrupts
3309 * should be enabled.
3310 *
3311 * Return values (usually ignored):
3312 * NET_RX_SUCCESS: no congestion
3313 * NET_RX_DROP: packet was dropped
3314 */
3315int netif_receive_skb(struct sk_buff *skb)
3316{
588f0330 3317 net_timestamp_check(netdev_tstamp_prequeue, skb);
3b098e2d 3318
c1f19b51
RC
3319 if (skb_defer_rx_timestamp(skb))
3320 return NET_RX_SUCCESS;
3321
df334545 3322#ifdef CONFIG_RPS
c5905afb 3323 if (static_key_false(&rps_needed)) {
3b098e2d
ED
3324 struct rps_dev_flow voidflow, *rflow = &voidflow;
3325 int cpu, ret;
fec5e652 3326
3b098e2d
ED
3327 rcu_read_lock();
3328
3329 cpu = get_rps_cpu(skb->dev, skb, &rflow);
0a9627f2 3330
3b098e2d
ED
3331 if (cpu >= 0) {
3332 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3333 rcu_read_unlock();
adc9300e 3334 return ret;
3b098e2d 3335 }
adc9300e 3336 rcu_read_unlock();
fec5e652 3337 }
1e94d72f 3338#endif
adc9300e 3339 return __netif_receive_skb(skb);
0a9627f2 3340}
d1b19dff 3341EXPORT_SYMBOL(netif_receive_skb);
1da177e4 3342
88751275
ED
3343/* Network device is going away, flush any packets still pending
3344 * Called with irqs disabled.
3345 */
152102c7 3346static void flush_backlog(void *arg)
6e583ce5 3347{
152102c7 3348 struct net_device *dev = arg;
e36fa2f7 3349 struct softnet_data *sd = &__get_cpu_var(softnet_data);
6e583ce5
SH
3350 struct sk_buff *skb, *tmp;
3351
e36fa2f7 3352 rps_lock(sd);
6e7676c1 3353 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
6e583ce5 3354 if (skb->dev == dev) {
e36fa2f7 3355 __skb_unlink(skb, &sd->input_pkt_queue);
6e583ce5 3356 kfree_skb(skb);
76cc8b13 3357 input_queue_head_incr(sd);
6e583ce5 3358 }
6e7676c1 3359 }
e36fa2f7 3360 rps_unlock(sd);
6e7676c1
CG
3361
3362 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3363 if (skb->dev == dev) {
3364 __skb_unlink(skb, &sd->process_queue);
3365 kfree_skb(skb);
76cc8b13 3366 input_queue_head_incr(sd);
6e7676c1
CG
3367 }
3368 }
6e583ce5
SH
3369}
3370
d565b0a1
HX
3371static int napi_gro_complete(struct sk_buff *skb)
3372{
3373 struct packet_type *ptype;
3374 __be16 type = skb->protocol;
3375 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
3376 int err = -ENOENT;
3377
fc59f9a3
HX
3378 if (NAPI_GRO_CB(skb)->count == 1) {
3379 skb_shinfo(skb)->gso_size = 0;
d565b0a1 3380 goto out;
fc59f9a3 3381 }
d565b0a1
HX
3382
3383 rcu_read_lock();
3384 list_for_each_entry_rcu(ptype, head, list) {
3385 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
3386 continue;
3387
3388 err = ptype->gro_complete(skb);
3389 break;
3390 }
3391 rcu_read_unlock();
3392
3393 if (err) {
3394 WARN_ON(&ptype->list == head);
3395 kfree_skb(skb);
3396 return NET_RX_SUCCESS;
3397 }
3398
3399out:
d565b0a1
HX
3400 return netif_receive_skb(skb);
3401}
3402
86cac58b 3403inline void napi_gro_flush(struct napi_struct *napi)
d565b0a1
HX
3404{
3405 struct sk_buff *skb, *next;
3406
3407 for (skb = napi->gro_list; skb; skb = next) {
3408 next = skb->next;
3409 skb->next = NULL;
3410 napi_gro_complete(skb);
3411 }
3412
4ae5544f 3413 napi->gro_count = 0;
d565b0a1
HX
3414 napi->gro_list = NULL;
3415}
86cac58b 3416EXPORT_SYMBOL(napi_gro_flush);
d565b0a1 3417
5b252f0c 3418enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
d565b0a1
HX
3419{
3420 struct sk_buff **pp = NULL;
3421 struct packet_type *ptype;
3422 __be16 type = skb->protocol;
3423 struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
0da2afd5 3424 int same_flow;
d565b0a1 3425 int mac_len;
5b252f0c 3426 enum gro_result ret;
d565b0a1 3427
ce9e76c8 3428 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
d565b0a1
HX
3429 goto normal;
3430
21dc3301 3431 if (skb_is_gso(skb) || skb_has_frag_list(skb))
f17f5c91
HX
3432 goto normal;
3433
d565b0a1
HX
3434 rcu_read_lock();
3435 list_for_each_entry_rcu(ptype, head, list) {
d565b0a1
HX
3436 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
3437 continue;
3438
86911732 3439 skb_set_network_header(skb, skb_gro_offset(skb));
d565b0a1
HX
3440 mac_len = skb->network_header - skb->mac_header;
3441 skb->mac_len = mac_len;
3442 NAPI_GRO_CB(skb)->same_flow = 0;
3443 NAPI_GRO_CB(skb)->flush = 0;
5d38a079 3444 NAPI_GRO_CB(skb)->free = 0;
d565b0a1 3445
d565b0a1
HX
3446 pp = ptype->gro_receive(&napi->gro_list, skb);
3447 break;
3448 }
3449 rcu_read_unlock();
3450
3451 if (&ptype->list == head)
3452 goto normal;
3453
0da2afd5 3454 same_flow = NAPI_GRO_CB(skb)->same_flow;
5d0d9be8 3455 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
0da2afd5 3456
d565b0a1
HX
3457 if (pp) {
3458 struct sk_buff *nskb = *pp;
3459
3460 *pp = nskb->next;
3461 nskb->next = NULL;
3462 napi_gro_complete(nskb);
4ae5544f 3463 napi->gro_count--;
d565b0a1
HX
3464 }
3465
0da2afd5 3466 if (same_flow)
d565b0a1
HX
3467 goto ok;
3468
4ae5544f 3469 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
d565b0a1 3470 goto normal;
d565b0a1 3471
4ae5544f 3472 napi->gro_count++;
d565b0a1 3473 NAPI_GRO_CB(skb)->count = 1;
86911732 3474 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
d565b0a1
HX
3475 skb->next = napi->gro_list;
3476 napi->gro_list = skb;
5d0d9be8 3477 ret = GRO_HELD;
d565b0a1 3478
ad0f9904 3479pull:
cb18978c
HX
3480 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3481 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3482
3483 BUG_ON(skb->end - skb->tail < grow);
3484
3485 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3486
3487 skb->tail += grow;
3488 skb->data_len -= grow;
3489
3490 skb_shinfo(skb)->frags[0].page_offset += grow;
9e903e08 3491 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
cb18978c 3492
9e903e08 3493 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
ea2ab693 3494 skb_frag_unref(skb, 0);
cb18978c
HX
3495 memmove(skb_shinfo(skb)->frags,
3496 skb_shinfo(skb)->frags + 1,
e5093aec 3497 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
cb18978c 3498 }
ad0f9904
HX
3499 }
3500
d565b0a1 3501ok:
5d0d9be8 3502 return ret;
d565b0a1
HX
3503
3504normal:
ad0f9904
HX
3505 ret = GRO_NORMAL;
3506 goto pull;
5d38a079 3507}
96e93eab
HX
3508EXPORT_SYMBOL(dev_gro_receive);
3509
40d0802b 3510static inline gro_result_t
5b252f0c 3511__napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
96e93eab
HX
3512{
3513 struct sk_buff *p;
5ca3b72c 3514 unsigned int maclen = skb->dev->hard_header_len;
96e93eab
HX
3515
3516 for (p = napi->gro_list; p; p = p->next) {
40d0802b
ED
3517 unsigned long diffs;
3518
3519 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3701e513 3520 diffs |= p->vlan_tci ^ skb->vlan_tci;
5ca3b72c
ED
3521 if (maclen == ETH_HLEN)
3522 diffs |= compare_ether_header(skb_mac_header(p),
3523 skb_gro_mac_header(skb));
3524 else if (!diffs)
3525 diffs = memcmp(skb_mac_header(p),
3526 skb_gro_mac_header(skb),
3527 maclen);
40d0802b 3528 NAPI_GRO_CB(p)->same_flow = !diffs;
96e93eab
HX
3529 NAPI_GRO_CB(p)->flush = 0;
3530 }
3531
3532 return dev_gro_receive(napi, skb);
3533}
5d38a079 3534
c7c4b3b6 3535gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
5d38a079 3536{
5d0d9be8
HX
3537 switch (ret) {
3538 case GRO_NORMAL:
c7c4b3b6
BH
3539 if (netif_receive_skb(skb))
3540 ret = GRO_DROP;
3541 break;
5d38a079 3542
5d0d9be8 3543 case GRO_DROP:
5d0d9be8 3544 case GRO_MERGED_FREE:
5d38a079
HX
3545 kfree_skb(skb);
3546 break;
5b252f0c
BH
3547
3548 case GRO_HELD:
3549 case GRO_MERGED:
3550 break;
5d38a079
HX
3551 }
3552
c7c4b3b6 3553 return ret;
5d0d9be8
HX
3554}
3555EXPORT_SYMBOL(napi_skb_finish);
3556
78a478d0
HX
3557void skb_gro_reset_offset(struct sk_buff *skb)
3558{
3559 NAPI_GRO_CB(skb)->data_offset = 0;
3560 NAPI_GRO_CB(skb)->frag0 = NULL;
7489594c 3561 NAPI_GRO_CB(skb)->frag0_len = 0;
78a478d0 3562
78d3fd0b 3563 if (skb->mac_header == skb->tail &&
ea2ab693 3564 !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
78a478d0 3565 NAPI_GRO_CB(skb)->frag0 =
ea2ab693 3566 skb_frag_address(&skb_shinfo(skb)->frags[0]);
9e903e08 3567 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
7489594c 3568 }
78a478d0
HX
3569}
3570EXPORT_SYMBOL(skb_gro_reset_offset);
3571
c7c4b3b6 3572gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
5d0d9be8 3573{
86911732
HX
3574 skb_gro_reset_offset(skb);
3575
5d0d9be8 3576 return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
d565b0a1
HX
3577}
3578EXPORT_SYMBOL(napi_gro_receive);
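/*
 * Illustrative sketch (not part of dev.c): how a driver's NAPI poll
 * routine typically feeds received buffers into GRO through
 * napi_gro_receive() above.  struct my_adapter, my_hw_rx_pending() and
 * my_hw_build_skb() are hypothetical driver helpers; only the calling
 * pattern is the point here.
 */
static int my_poll(struct napi_struct *napi, int budget)
{
	struct my_adapter *ap = container_of(napi, struct my_adapter, napi);
	int work = 0;

	while (work < budget && my_hw_rx_pending(ap)) {
		struct sk_buff *skb = my_hw_build_skb(ap);

		if (!skb)
			break;
		/* GRO decides whether to merge, hold or pass the skb up. */
		napi_gro_receive(napi, skb);
		work++;
	}

	/* Done early: complete NAPI, which also flushes the GRO list. */
	if (work < budget)
		napi_complete(napi);

	return work;
}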
3579
d0c2b0d2 3580static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
96e93eab 3581{
96e93eab 3582 __skb_pull(skb, skb_headlen(skb));
2a2a459e
ED
3583 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3584 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3701e513 3585 skb->vlan_tci = 0;
66c46d74 3586 skb->dev = napi->dev;
6d152e23 3587 skb->skb_iif = 0;
96e93eab
HX
3588
3589 napi->skb = skb;
3590}
96e93eab 3591
76620aaf 3592struct sk_buff *napi_get_frags(struct napi_struct *napi)
5d38a079 3593{
5d38a079 3594 struct sk_buff *skb = napi->skb;
5d38a079
HX
3595
3596 if (!skb) {
89d71a66
ED
3597 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3598 if (skb)
3599 napi->skb = skb;
80595d59 3600 }
96e93eab
HX
3601 return skb;
3602}
76620aaf 3603EXPORT_SYMBOL(napi_get_frags);
96e93eab 3604
c7c4b3b6
BH
3605gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3606 gro_result_t ret)
96e93eab 3607{
5d0d9be8
HX
3608 switch (ret) {
3609 case GRO_NORMAL:
86911732 3610 case GRO_HELD:
e76b69cc 3611 skb->protocol = eth_type_trans(skb, skb->dev);
86911732 3612
c7c4b3b6
BH
3613 if (ret == GRO_HELD)
3614 skb_gro_pull(skb, -ETH_HLEN);
3615 else if (netif_receive_skb(skb))
3616 ret = GRO_DROP;
86911732 3617 break;
5d38a079 3618
5d0d9be8 3619 case GRO_DROP:
5d0d9be8
HX
3620 case GRO_MERGED_FREE:
3621 napi_reuse_skb(napi, skb);
3622 break;
5b252f0c
BH
3623
3624 case GRO_MERGED:
3625 break;
5d0d9be8 3626 }
5d38a079 3627
c7c4b3b6 3628 return ret;
5d38a079 3629}
5d0d9be8
HX
3630EXPORT_SYMBOL(napi_frags_finish);
3631
76620aaf
HX
3632struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3633{
3634 struct sk_buff *skb = napi->skb;
3635 struct ethhdr *eth;
a5b1cf28
HX
3636 unsigned int hlen;
3637 unsigned int off;
76620aaf
HX
3638
3639 napi->skb = NULL;
3640
3641 skb_reset_mac_header(skb);
3642 skb_gro_reset_offset(skb);
3643
a5b1cf28
HX
3644 off = skb_gro_offset(skb);
3645 hlen = off + sizeof(*eth);
3646 eth = skb_gro_header_fast(skb, off);
3647 if (skb_gro_header_hard(skb, hlen)) {
3648 eth = skb_gro_header_slow(skb, hlen, off);
3649 if (unlikely(!eth)) {
3650 napi_reuse_skb(napi, skb);
3651 skb = NULL;
3652 goto out;
3653 }
76620aaf
HX
3654 }
3655
3656 skb_gro_pull(skb, sizeof(*eth));
3657
3658 /*
3659 * This works because the only protocols we care about don't require
3660 * special handling. We'll fix it up properly at the end.
3661 */
3662 skb->protocol = eth->h_proto;
3663
3664out:
3665 return skb;
3666}
3667EXPORT_SYMBOL(napi_frags_skb);
3668
c7c4b3b6 3669gro_result_t napi_gro_frags(struct napi_struct *napi)
5d0d9be8 3670{
76620aaf 3671 struct sk_buff *skb = napi_frags_skb(napi);
5d0d9be8
HX
3672
3673 if (!skb)
c7c4b3b6 3674 return GRO_DROP;
5d0d9be8
HX
3675
3676 return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3677}
5d38a079
HX
3678EXPORT_SYMBOL(napi_gro_frags);
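/*
 * Illustrative sketch (not part of dev.c): the page-based GRO entry
 * point.  A driver that DMAs whole frames into pages can borrow the
 * per-NAPI skb from napi_get_frags(), attach the page as a fragment and
 * hand it back via napi_gro_frags(), which parses the Ethernet header
 * itself in napi_frags_skb() above.  my_gro_frag_rx() is a hypothetical
 * driver helper; the skb length accounting shown is the usual pattern.
 */
static void my_gro_frag_rx(struct napi_struct *napi, struct page *page,
			   unsigned int offset, unsigned int len)
{
	struct sk_buff *skb = napi_get_frags(napi);

	if (unlikely(!skb)) {
		put_page(page);		/* drop on allocation failure */
		return;
	}

	skb_fill_page_desc(skb, 0, page, offset, len);
	skb->len += len;
	skb->data_len += len;
	skb->truesize += len;

	napi_gro_frags(napi);
}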
3679
e326bed2
ED
3680/*
 3681 * net_rps_action sends any pending IPIs for RPS.
3682 * Note: called with local irq disabled, but exits with local irq enabled.
3683 */
3684static void net_rps_action_and_irq_enable(struct softnet_data *sd)
3685{
3686#ifdef CONFIG_RPS
3687 struct softnet_data *remsd = sd->rps_ipi_list;
3688
3689 if (remsd) {
3690 sd->rps_ipi_list = NULL;
3691
3692 local_irq_enable();
3693
 3694 /* Send pending IPIs to kick RPS processing on remote cpus. */
3695 while (remsd) {
3696 struct softnet_data *next = remsd->rps_ipi_next;
3697
3698 if (cpu_online(remsd->cpu))
3699 __smp_call_function_single(remsd->cpu,
3700 &remsd->csd, 0);
3701 remsd = next;
3702 }
3703 } else
3704#endif
3705 local_irq_enable();
3706}
3707
bea3348e 3708static int process_backlog(struct napi_struct *napi, int quota)
1da177e4
LT
3709{
3710 int work = 0;
eecfd7c4 3711 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
1da177e4 3712
e326bed2
ED
3713#ifdef CONFIG_RPS
 3714 /* Check if we have pending IPIs; it's better to send them now
 3715 * rather than waiting for net_rx_action() to end.
3716 */
3717 if (sd->rps_ipi_list) {
3718 local_irq_disable();
3719 net_rps_action_and_irq_enable(sd);
3720 }
3721#endif
bea3348e 3722 napi->weight = weight_p;
6e7676c1
CG
3723 local_irq_disable();
3724 while (work < quota) {
1da177e4 3725 struct sk_buff *skb;
6e7676c1
CG
3726 unsigned int qlen;
3727
3728 while ((skb = __skb_dequeue(&sd->process_queue))) {
3729 local_irq_enable();
3730 __netif_receive_skb(skb);
6e7676c1 3731 local_irq_disable();
76cc8b13
TH
3732 input_queue_head_incr(sd);
3733 if (++work >= quota) {
3734 local_irq_enable();
3735 return work;
3736 }
6e7676c1 3737 }
1da177e4 3738
e36fa2f7 3739 rps_lock(sd);
6e7676c1 3740 qlen = skb_queue_len(&sd->input_pkt_queue);
76cc8b13 3741 if (qlen)
6e7676c1
CG
3742 skb_queue_splice_tail_init(&sd->input_pkt_queue,
3743 &sd->process_queue);
76cc8b13 3744
6e7676c1 3745 if (qlen < quota - work) {
eecfd7c4
ED
3746 /*
3747 * Inline a custom version of __napi_complete().
 3748 * Only the current cpu owns and manipulates this napi,
 3749 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
 3750 * We can use a plain write instead of clear_bit(),
 3751 * and we don't need an smp_mb() memory barrier.
3752 */
3753 list_del(&napi->poll_list);
3754 napi->state = 0;
3755
6e7676c1 3756 quota = work + qlen;
bea3348e 3757 }
e36fa2f7 3758 rps_unlock(sd);
6e7676c1
CG
3759 }
3760 local_irq_enable();
1da177e4 3761
bea3348e
SH
3762 return work;
3763}
1da177e4 3764
bea3348e
SH
3765/**
3766 * __napi_schedule - schedule for receive
c4ea43c5 3767 * @n: entry to schedule
bea3348e
SH
3768 *
3769 * The entry's receive function will be scheduled to run
3770 */
b5606c2d 3771void __napi_schedule(struct napi_struct *n)
bea3348e
SH
3772{
3773 unsigned long flags;
1da177e4 3774
bea3348e 3775 local_irq_save(flags);
eecfd7c4 3776 ____napi_schedule(&__get_cpu_var(softnet_data), n);
bea3348e 3777 local_irq_restore(flags);
1da177e4 3778}
bea3348e
SH
3779EXPORT_SYMBOL(__napi_schedule);
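/*
 * Illustrative sketch (not part of dev.c): the interrupt half of the
 * NAPI contract.  The hard IRQ handler only masks further RX interrupts
 * and schedules the poll routine; napi_schedule_prep() and
 * __napi_schedule() are the real APIs, while struct my_adapter and
 * my_hw_disable_rx_irq() are hypothetical.
 */
static irqreturn_t my_interrupt(int irq, void *dev_id)
{
	struct my_adapter *ap = dev_id;

	if (napi_schedule_prep(&ap->napi)) {
		my_hw_disable_rx_irq(ap);	/* stop RX IRQs until poll is done */
		__napi_schedule(&ap->napi);
	}

	return IRQ_HANDLED;
}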
3780
d565b0a1
HX
3781void __napi_complete(struct napi_struct *n)
3782{
3783 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3784 BUG_ON(n->gro_list);
3785
3786 list_del(&n->poll_list);
3787 smp_mb__before_clear_bit();
3788 clear_bit(NAPI_STATE_SCHED, &n->state);
3789}
3790EXPORT_SYMBOL(__napi_complete);
3791
3792void napi_complete(struct napi_struct *n)
3793{
3794 unsigned long flags;
3795
3796 /*
3797 * don't let napi dequeue from the cpu poll list
3798 * just in case its running on a different cpu
3799 */
3800 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3801 return;
3802
3803 napi_gro_flush(n);
3804 local_irq_save(flags);
3805 __napi_complete(n);
3806 local_irq_restore(flags);
3807}
3808EXPORT_SYMBOL(napi_complete);
3809
3810void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3811 int (*poll)(struct napi_struct *, int), int weight)
3812{
3813 INIT_LIST_HEAD(&napi->poll_list);
4ae5544f 3814 napi->gro_count = 0;
d565b0a1 3815 napi->gro_list = NULL;
5d38a079 3816 napi->skb = NULL;
d565b0a1
HX
3817 napi->poll = poll;
3818 napi->weight = weight;
3819 list_add(&napi->dev_list, &dev->napi_list);
d565b0a1 3820 napi->dev = dev;
5d38a079 3821#ifdef CONFIG_NETPOLL
d565b0a1
HX
3822 spin_lock_init(&napi->poll_lock);
3823 napi->poll_owner = -1;
3824#endif
3825 set_bit(NAPI_STATE_SCHED, &napi->state);
3826}
3827EXPORT_SYMBOL(netif_napi_add);
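/*
 * Illustrative sketch (not part of dev.c): registering and tearing down
 * a NAPI instance.  netif_napi_add() is usually called at probe time and
 * napi_enable()/napi_disable() from ndo_open()/ndo_stop(); the weight of
 * 64 is the customary default.  struct my_adapter and my_poll() are
 * hypothetical.
 */
static void my_setup_napi(struct net_device *dev, struct my_adapter *ap)
{
	netif_napi_add(dev, &ap->napi, my_poll, 64);
}

static void my_teardown_napi(struct my_adapter *ap)
{
	netif_napi_del(&ap->napi);	/* also frees any held GRO skbs */
}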
3828
3829void netif_napi_del(struct napi_struct *napi)
3830{
3831 struct sk_buff *skb, *next;
3832
d7b06636 3833 list_del_init(&napi->dev_list);
76620aaf 3834 napi_free_frags(napi);
d565b0a1
HX
3835
3836 for (skb = napi->gro_list; skb; skb = next) {
3837 next = skb->next;
3838 skb->next = NULL;
3839 kfree_skb(skb);
3840 }
3841
3842 napi->gro_list = NULL;
4ae5544f 3843 napi->gro_count = 0;
d565b0a1
HX
3844}
3845EXPORT_SYMBOL(netif_napi_del);
3846
1da177e4
LT
3847static void net_rx_action(struct softirq_action *h)
3848{
e326bed2 3849 struct softnet_data *sd = &__get_cpu_var(softnet_data);
24f8b238 3850 unsigned long time_limit = jiffies + 2;
51b0bded 3851 int budget = netdev_budget;
53fb95d3
MM
3852 void *have;
3853
1da177e4
LT
3854 local_irq_disable();
3855
e326bed2 3856 while (!list_empty(&sd->poll_list)) {
bea3348e
SH
3857 struct napi_struct *n;
3858 int work, weight;
1da177e4 3859
bea3348e 3860 /* If the softirq window is exhausted then punt.
24f8b238
SH
 3861 * Allow this to run for 2 jiffies, which allows
 3862 * an average latency of 1.5/HZ.
bea3348e 3863 */
24f8b238 3864 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
1da177e4
LT
3865 goto softnet_break;
3866
3867 local_irq_enable();
3868
bea3348e
SH
3869 /* Even though interrupts have been re-enabled, this
3870 * access is safe because interrupts can only add new
3871 * entries to the tail of this list, and only ->poll()
3872 * calls can remove this head entry from the list.
3873 */
e326bed2 3874 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
1da177e4 3875
bea3348e
SH
3876 have = netpoll_poll_lock(n);
3877
3878 weight = n->weight;
3879
0a7606c1
DM
3880 /* This NAPI_STATE_SCHED test is for avoiding a race
3881 * with netpoll's poll_napi(). Only the entity which
3882 * obtains the lock and sees NAPI_STATE_SCHED set will
3883 * actually make the ->poll() call. Therefore we avoid
25985edc 3884 * accidentally calling ->poll() when NAPI is not scheduled.
0a7606c1
DM
3885 */
3886 work = 0;
4ea7e386 3887 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
0a7606c1 3888 work = n->poll(n, weight);
4ea7e386
NH
3889 trace_napi_poll(n);
3890 }
bea3348e
SH
3891
3892 WARN_ON_ONCE(work > weight);
3893
3894 budget -= work;
3895
3896 local_irq_disable();
3897
3898 /* Drivers must not modify the NAPI state if they
3899 * consume the entire weight. In such cases this code
3900 * still "owns" the NAPI instance and therefore can
3901 * move the instance around on the list at-will.
3902 */
fed17f30 3903 if (unlikely(work == weight)) {
ff780cd8
HX
3904 if (unlikely(napi_disable_pending(n))) {
3905 local_irq_enable();
3906 napi_complete(n);
3907 local_irq_disable();
3908 } else
e326bed2 3909 list_move_tail(&n->poll_list, &sd->poll_list);
fed17f30 3910 }
bea3348e
SH
3911
3912 netpoll_poll_unlock(have);
1da177e4
LT
3913 }
3914out:
e326bed2 3915 net_rps_action_and_irq_enable(sd);
0a9627f2 3916
db217334
CL
3917#ifdef CONFIG_NET_DMA
3918 /*
3919 * There may not be any more sk_buffs coming right now, so push
3920 * any pending DMA copies to hardware
3921 */
2ba05622 3922 dma_issue_pending_all();
db217334 3923#endif
bea3348e 3924
1da177e4
LT
3925 return;
3926
3927softnet_break:
dee42870 3928 sd->time_squeeze++;
1da177e4
LT
3929 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3930 goto out;
3931}
3932
d1b19dff 3933static gifconf_func_t *gifconf_list[NPROTO];
1da177e4
LT
3934
3935/**
3936 * register_gifconf - register a SIOCGIF handler
3937 * @family: Address family
3938 * @gifconf: Function handler
3939 *
3940 * Register protocol dependent address dumping routines. The handler
3941 * that is passed must not be freed or reused until it has been replaced
3942 * by another handler.
3943 */
d1b19dff 3944int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
1da177e4
LT
3945{
3946 if (family >= NPROTO)
3947 return -EINVAL;
3948 gifconf_list[family] = gifconf;
3949 return 0;
3950}
d1b19dff 3951EXPORT_SYMBOL(register_gifconf);
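/*
 * Hedged sketch (not part of dev.c): what a protocol's SIOCGIFCONF hook
 * could look like.  dev_ifconf() below probes each handler with a NULL
 * buffer to learn how much space it needs, then calls it again with the
 * remaining user buffer.  my_fill_ifreq() is hypothetical, and the
 * address family is purely illustrative (AF_INET is really served by
 * inet_gifconf()).
 */
static int my_gifconf(struct net_device *dev, char __user *buf, int len)
{
	if (!buf)
		return sizeof(struct ifreq);	/* size probe only */

	if (len < (int)sizeof(struct ifreq))
		return 0;

	return my_fill_ifreq(dev, buf) ? -EFAULT : sizeof(struct ifreq);
}

static int __init my_proto_init(void)
{
	return register_gifconf(AF_INET, my_gifconf);
}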
1da177e4
LT
3952
3953
3954/*
3955 * Map an interface index to its name (SIOCGIFNAME)
3956 */
3957
3958/*
3959 * We need this ioctl for efficient implementation of the
3960 * if_indextoname() function required by the IPv6 API. Without
3961 * it, we would have to search all the interfaces to find a
3962 * match. --pb
3963 */
3964
881d966b 3965static int dev_ifname(struct net *net, struct ifreq __user *arg)
1da177e4
LT
3966{
3967 struct net_device *dev;
3968 struct ifreq ifr;
3969
3970 /*
3971 * Fetch the caller's info block.
3972 */
3973
3974 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3975 return -EFAULT;
3976
fb699dfd
ED
3977 rcu_read_lock();
3978 dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
1da177e4 3979 if (!dev) {
fb699dfd 3980 rcu_read_unlock();
1da177e4
LT
3981 return -ENODEV;
3982 }
3983
3984 strcpy(ifr.ifr_name, dev->name);
fb699dfd 3985 rcu_read_unlock();
1da177e4
LT
3986
3987 if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3988 return -EFAULT;
3989 return 0;
3990}
3991
3992/*
3993 * Perform a SIOCGIFCONF call. This structure will change
3994 * size eventually, and there is nothing I can do about it.
3995 * Thus we will need a 'compatibility mode'.
3996 */
3997
881d966b 3998static int dev_ifconf(struct net *net, char __user *arg)
1da177e4
LT
3999{
4000 struct ifconf ifc;
4001 struct net_device *dev;
4002 char __user *pos;
4003 int len;
4004 int total;
4005 int i;
4006
4007 /*
4008 * Fetch the caller's info block.
4009 */
4010
4011 if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
4012 return -EFAULT;
4013
4014 pos = ifc.ifc_buf;
4015 len = ifc.ifc_len;
4016
4017 /*
4018 * Loop over the interfaces, and write an info block for each.
4019 */
4020
4021 total = 0;
881d966b 4022 for_each_netdev(net, dev) {
1da177e4
LT
4023 for (i = 0; i < NPROTO; i++) {
4024 if (gifconf_list[i]) {
4025 int done;
4026 if (!pos)
4027 done = gifconf_list[i](dev, NULL, 0);
4028 else
4029 done = gifconf_list[i](dev, pos + total,
4030 len - total);
4031 if (done < 0)
4032 return -EFAULT;
4033 total += done;
4034 }
4035 }
4ec93edb 4036 }
1da177e4
LT
4037
4038 /*
4039 * All done. Write the updated control block back to the caller.
4040 */
4041 ifc.ifc_len = total;
4042
4043 /*
4044 * Both BSD and Solaris return 0 here, so we do too.
4045 */
4046 return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
4047}
4048
4049#ifdef CONFIG_PROC_FS
f04565dd 4050
2def16ae 4051#define BUCKET_SPACE (32 - NETDEV_HASHBITS - 1)
f04565dd
MM
4052
4053#define get_bucket(x) ((x) >> BUCKET_SPACE)
4054#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1))
4055#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
4056
2def16ae 4057static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos)
f04565dd 4058{
f04565dd
MM
4059 struct net *net = seq_file_net(seq);
4060 struct net_device *dev;
4061 struct hlist_node *p;
4062 struct hlist_head *h;
2def16ae 4063 unsigned int count = 0, offset = get_offset(*pos);
f04565dd 4064
2def16ae 4065 h = &net->dev_name_head[get_bucket(*pos)];
f04565dd 4066 hlist_for_each_entry_rcu(dev, p, h, name_hlist) {
2def16ae 4067 if (++count == offset)
f04565dd 4068 return dev;
f04565dd
MM
4069 }
4070
4071 return NULL;
4072}
4073
2def16ae 4074static inline struct net_device *dev_from_bucket(struct seq_file *seq, loff_t *pos)
f04565dd 4075{
f04565dd
MM
4076 struct net_device *dev;
4077 unsigned int bucket;
4078
f04565dd 4079 do {
2def16ae 4080 dev = dev_from_same_bucket(seq, pos);
f04565dd
MM
4081 if (dev)
4082 return dev;
4083
2def16ae
ED
4084 bucket = get_bucket(*pos) + 1;
4085 *pos = set_bucket_offset(bucket, 1);
f04565dd
MM
4086 } while (bucket < NETDEV_HASHENTRIES);
4087
4088 return NULL;
4089}
4090
1da177e4
LT
4091/*
4092 * This is invoked by the /proc filesystem handler to display a device
4093 * in detail.
4094 */
7562f876 4095void *dev_seq_start(struct seq_file *seq, loff_t *pos)
c6d14c84 4096 __acquires(RCU)
1da177e4 4097{
c6d14c84 4098 rcu_read_lock();
7562f876
PE
4099 if (!*pos)
4100 return SEQ_START_TOKEN;
1da177e4 4101
2def16ae 4102 if (get_bucket(*pos) >= NETDEV_HASHENTRIES)
f04565dd 4103 return NULL;
1da177e4 4104
2def16ae 4105 return dev_from_bucket(seq, pos);
1da177e4
LT
4106}
4107
4108void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4109{
f04565dd 4110 ++*pos;
2def16ae 4111 return dev_from_bucket(seq, pos);
1da177e4
LT
4112}
4113
4114void dev_seq_stop(struct seq_file *seq, void *v)
c6d14c84 4115 __releases(RCU)
1da177e4 4116{
c6d14c84 4117 rcu_read_unlock();
1da177e4
LT
4118}
4119
4120static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
4121{
28172739
ED
4122 struct rtnl_link_stats64 temp;
4123 const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp);
1da177e4 4124
be1f3c2c
BH
4125 seq_printf(seq, "%6s: %7llu %7llu %4llu %4llu %4llu %5llu %10llu %9llu "
4126 "%8llu %7llu %4llu %4llu %4llu %5llu %7llu %10llu\n",
5a1b5898
RR
4127 dev->name, stats->rx_bytes, stats->rx_packets,
4128 stats->rx_errors,
4129 stats->rx_dropped + stats->rx_missed_errors,
4130 stats->rx_fifo_errors,
4131 stats->rx_length_errors + stats->rx_over_errors +
4132 stats->rx_crc_errors + stats->rx_frame_errors,
4133 stats->rx_compressed, stats->multicast,
4134 stats->tx_bytes, stats->tx_packets,
4135 stats->tx_errors, stats->tx_dropped,
4136 stats->tx_fifo_errors, stats->collisions,
4137 stats->tx_carrier_errors +
4138 stats->tx_aborted_errors +
4139 stats->tx_window_errors +
4140 stats->tx_heartbeat_errors,
4141 stats->tx_compressed);
1da177e4
LT
4142}
4143
4144/*
4145 * Called from the PROCfs module. This now uses the new arbitrary sized
4146 * /proc/net interface to create /proc/net/dev
4147 */
4148static int dev_seq_show(struct seq_file *seq, void *v)
4149{
4150 if (v == SEQ_START_TOKEN)
4151 seq_puts(seq, "Inter-| Receive "
4152 " | Transmit\n"
4153 " face |bytes packets errs drop fifo frame "
4154 "compressed multicast|bytes packets errs "
4155 "drop fifo colls carrier compressed\n");
4156 else
4157 dev_seq_printf_stats(seq, v);
4158 return 0;
4159}
4160
dee42870 4161static struct softnet_data *softnet_get_online(loff_t *pos)
1da177e4 4162{
dee42870 4163 struct softnet_data *sd = NULL;
1da177e4 4164
0c0b0aca 4165 while (*pos < nr_cpu_ids)
4ec93edb 4166 if (cpu_online(*pos)) {
dee42870 4167 sd = &per_cpu(softnet_data, *pos);
1da177e4
LT
4168 break;
4169 } else
4170 ++*pos;
dee42870 4171 return sd;
1da177e4
LT
4172}
4173
4174static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
4175{
4176 return softnet_get_online(pos);
4177}
4178
4179static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4180{
4181 ++*pos;
4182 return softnet_get_online(pos);
4183}
4184
4185static void softnet_seq_stop(struct seq_file *seq, void *v)
4186{
4187}
4188
4189static int softnet_seq_show(struct seq_file *seq, void *v)
4190{
dee42870 4191 struct softnet_data *sd = v;
1da177e4 4192
0a9627f2 4193 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
dee42870 4194 sd->processed, sd->dropped, sd->time_squeeze, 0,
c1ebcdb8 4195 0, 0, 0, 0, /* was fastroute */
dee42870 4196 sd->cpu_collision, sd->received_rps);
1da177e4
LT
4197 return 0;
4198}
4199
f690808e 4200static const struct seq_operations dev_seq_ops = {
1da177e4
LT
4201 .start = dev_seq_start,
4202 .next = dev_seq_next,
4203 .stop = dev_seq_stop,
4204 .show = dev_seq_show,
4205};
4206
4207static int dev_seq_open(struct inode *inode, struct file *file)
4208{
e372c414 4209 return seq_open_net(inode, file, &dev_seq_ops,
2def16ae 4210 sizeof(struct seq_net_private));
5cac98dd
AB
4211}
4212
9a32144e 4213static const struct file_operations dev_seq_fops = {
1da177e4
LT
4214 .owner = THIS_MODULE,
4215 .open = dev_seq_open,
4216 .read = seq_read,
4217 .llseek = seq_lseek,
e372c414 4218 .release = seq_release_net,
1da177e4
LT
4219};
4220
f690808e 4221static const struct seq_operations softnet_seq_ops = {
1da177e4
LT
4222 .start = softnet_seq_start,
4223 .next = softnet_seq_next,
4224 .stop = softnet_seq_stop,
4225 .show = softnet_seq_show,
4226};
4227
4228static int softnet_seq_open(struct inode *inode, struct file *file)
4229{
4230 return seq_open(file, &softnet_seq_ops);
4231}
4232
9a32144e 4233static const struct file_operations softnet_seq_fops = {
1da177e4
LT
4234 .owner = THIS_MODULE,
4235 .open = softnet_seq_open,
4236 .read = seq_read,
4237 .llseek = seq_lseek,
4238 .release = seq_release,
4239};
4240
0e1256ff
SH
4241static void *ptype_get_idx(loff_t pos)
4242{
4243 struct packet_type *pt = NULL;
4244 loff_t i = 0;
4245 int t;
4246
4247 list_for_each_entry_rcu(pt, &ptype_all, list) {
4248 if (i == pos)
4249 return pt;
4250 ++i;
4251 }
4252
82d8a867 4253 for (t = 0; t < PTYPE_HASH_SIZE; t++) {
0e1256ff
SH
4254 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
4255 if (i == pos)
4256 return pt;
4257 ++i;
4258 }
4259 }
4260 return NULL;
4261}
4262
4263static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
72348a42 4264 __acquires(RCU)
0e1256ff
SH
4265{
4266 rcu_read_lock();
4267 return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
4268}
4269
4270static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
4271{
4272 struct packet_type *pt;
4273 struct list_head *nxt;
4274 int hash;
4275
4276 ++*pos;
4277 if (v == SEQ_START_TOKEN)
4278 return ptype_get_idx(0);
4279
4280 pt = v;
4281 nxt = pt->list.next;
4282 if (pt->type == htons(ETH_P_ALL)) {
4283 if (nxt != &ptype_all)
4284 goto found;
4285 hash = 0;
4286 nxt = ptype_base[0].next;
4287 } else
82d8a867 4288 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
0e1256ff
SH
4289
4290 while (nxt == &ptype_base[hash]) {
82d8a867 4291 if (++hash >= PTYPE_HASH_SIZE)
0e1256ff
SH
4292 return NULL;
4293 nxt = ptype_base[hash].next;
4294 }
4295found:
4296 return list_entry(nxt, struct packet_type, list);
4297}
4298
4299static void ptype_seq_stop(struct seq_file *seq, void *v)
72348a42 4300 __releases(RCU)
0e1256ff
SH
4301{
4302 rcu_read_unlock();
4303}
4304
0e1256ff
SH
4305static int ptype_seq_show(struct seq_file *seq, void *v)
4306{
4307 struct packet_type *pt = v;
4308
4309 if (v == SEQ_START_TOKEN)
4310 seq_puts(seq, "Type Device Function\n");
c346dca1 4311 else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
0e1256ff
SH
4312 if (pt->type == htons(ETH_P_ALL))
4313 seq_puts(seq, "ALL ");
4314 else
4315 seq_printf(seq, "%04x", ntohs(pt->type));
4316
908cd2da
AD
4317 seq_printf(seq, " %-8s %pF\n",
4318 pt->dev ? pt->dev->name : "", pt->func);
0e1256ff
SH
4319 }
4320
4321 return 0;
4322}
4323
4324static const struct seq_operations ptype_seq_ops = {
4325 .start = ptype_seq_start,
4326 .next = ptype_seq_next,
4327 .stop = ptype_seq_stop,
4328 .show = ptype_seq_show,
4329};
4330
4331static int ptype_seq_open(struct inode *inode, struct file *file)
4332{
2feb27db
PE
4333 return seq_open_net(inode, file, &ptype_seq_ops,
4334 sizeof(struct seq_net_private));
0e1256ff
SH
4335}
4336
4337static const struct file_operations ptype_seq_fops = {
4338 .owner = THIS_MODULE,
4339 .open = ptype_seq_open,
4340 .read = seq_read,
4341 .llseek = seq_lseek,
2feb27db 4342 .release = seq_release_net,
0e1256ff
SH
4343};
4344
4345
4665079c 4346static int __net_init dev_proc_net_init(struct net *net)
1da177e4
LT
4347{
4348 int rc = -ENOMEM;
4349
881d966b 4350 if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
1da177e4 4351 goto out;
881d966b 4352 if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
1da177e4 4353 goto out_dev;
881d966b 4354 if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
457c4cbc 4355 goto out_softnet;
0e1256ff 4356
881d966b 4357 if (wext_proc_init(net))
457c4cbc 4358 goto out_ptype;
1da177e4
LT
4359 rc = 0;
4360out:
4361 return rc;
457c4cbc 4362out_ptype:
881d966b 4363 proc_net_remove(net, "ptype");
1da177e4 4364out_softnet:
881d966b 4365 proc_net_remove(net, "softnet_stat");
1da177e4 4366out_dev:
881d966b 4367 proc_net_remove(net, "dev");
1da177e4
LT
4368 goto out;
4369}
881d966b 4370
4665079c 4371static void __net_exit dev_proc_net_exit(struct net *net)
881d966b
EB
4372{
4373 wext_proc_exit(net);
4374
4375 proc_net_remove(net, "ptype");
4376 proc_net_remove(net, "softnet_stat");
4377 proc_net_remove(net, "dev");
4378}
4379
022cbae6 4380static struct pernet_operations __net_initdata dev_proc_ops = {
881d966b
EB
4381 .init = dev_proc_net_init,
4382 .exit = dev_proc_net_exit,
4383};
4384
4385static int __init dev_proc_init(void)
4386{
4387 return register_pernet_subsys(&dev_proc_ops);
4388}
1da177e4
LT
4389#else
4390#define dev_proc_init() 0
4391#endif /* CONFIG_PROC_FS */
4392
4393
4394/**
1765a575 4395 * netdev_set_master - set up master pointer
1da177e4
LT
4396 * @slave: slave device
4397 * @master: new master device
4398 *
4399 * Changes the master device of the slave. Pass %NULL to break the
4400 * bonding. The caller must hold the RTNL semaphore. On a failure
4401 * a negative errno code is returned. On success the reference counts
1765a575 4402 * are adjusted and the function returns zero.
1da177e4
LT
4403 */
4404int netdev_set_master(struct net_device *slave, struct net_device *master)
4405{
4406 struct net_device *old = slave->master;
4407
4408 ASSERT_RTNL();
4409
4410 if (master) {
4411 if (old)
4412 return -EBUSY;
4413 dev_hold(master);
4414 }
4415
4416 slave->master = master;
4ec93edb 4417
6df427fe 4418 if (old)
1da177e4 4419 dev_put(old);
1765a575
JP
4420 return 0;
4421}
4422EXPORT_SYMBOL(netdev_set_master);
4423
4424/**
4425 * netdev_set_bond_master - set up bonding master/slave pair
4426 * @slave: slave device
4427 * @master: new master device
4428 *
4429 * Changes the master device of the slave. Pass %NULL to break the
4430 * bonding. The caller must hold the RTNL semaphore. On a failure
4431 * a negative errno code is returned. On success %RTM_NEWLINK is sent
4432 * to the routing socket and the function returns zero.
4433 */
4434int netdev_set_bond_master(struct net_device *slave, struct net_device *master)
4435{
4436 int err;
4437
4438 ASSERT_RTNL();
4439
4440 err = netdev_set_master(slave, master);
4441 if (err)
4442 return err;
1da177e4
LT
4443 if (master)
4444 slave->flags |= IFF_SLAVE;
4445 else
4446 slave->flags &= ~IFF_SLAVE;
4447
4448 rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
4449 return 0;
4450}
1765a575 4451EXPORT_SYMBOL(netdev_set_bond_master);
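/*
 * Illustrative sketch (not part of dev.c): how a bonding-style driver
 * attaches and detaches a slave, letting netdev_set_bond_master() above
 * manage the refcounts, the IFF_SLAVE flag and the RTM_NEWLINK message.
 * my_slave_init() is a hypothetical driver step.
 */
static int my_enslave(struct net_device *bond_dev, struct net_device *slave_dev)
{
	int err;

	ASSERT_RTNL();

	err = netdev_set_bond_master(slave_dev, bond_dev);
	if (err)
		return err;

	err = my_slave_init(bond_dev, slave_dev);
	if (err)
		netdev_set_bond_master(slave_dev, NULL);
	return err;
}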
1da177e4 4452
b6c40d68
PM
4453static void dev_change_rx_flags(struct net_device *dev, int flags)
4454{
d314774c
SH
4455 const struct net_device_ops *ops = dev->netdev_ops;
4456
4457 if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
4458 ops->ndo_change_rx_flags(dev, flags);
b6c40d68
PM
4459}
4460
dad9b335 4461static int __dev_set_promiscuity(struct net_device *dev, int inc)
1da177e4 4462{
b536db93 4463 unsigned int old_flags = dev->flags;
8192b0c4
DH
4464 uid_t uid;
4465 gid_t gid;
1da177e4 4466
24023451
PM
4467 ASSERT_RTNL();
4468
dad9b335
WC
4469 dev->flags |= IFF_PROMISC;
4470 dev->promiscuity += inc;
4471 if (dev->promiscuity == 0) {
4472 /*
4473 * Avoid overflow.
 4474 * If inc causes overflow, leave promiscuity untouched and return an error.
4475 */
4476 if (inc < 0)
4477 dev->flags &= ~IFF_PROMISC;
4478 else {
4479 dev->promiscuity -= inc;
7b6cd1ce
JP
4480 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4481 dev->name);
dad9b335
WC
4482 return -EOVERFLOW;
4483 }
4484 }
52609c0b 4485 if (dev->flags != old_flags) {
7b6cd1ce
JP
4486 pr_info("device %s %s promiscuous mode\n",
4487 dev->name,
4488 dev->flags & IFF_PROMISC ? "entered" : "left");
8192b0c4
DH
4489 if (audit_enabled) {
4490 current_uid_gid(&uid, &gid);
7759db82
KHK
4491 audit_log(current->audit_context, GFP_ATOMIC,
4492 AUDIT_ANOM_PROMISCUOUS,
4493 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4494 dev->name, (dev->flags & IFF_PROMISC),
4495 (old_flags & IFF_PROMISC),
4496 audit_get_loginuid(current),
8192b0c4 4497 uid, gid,
7759db82 4498 audit_get_sessionid(current));
8192b0c4 4499 }
24023451 4500
b6c40d68 4501 dev_change_rx_flags(dev, IFF_PROMISC);
1da177e4 4502 }
dad9b335 4503 return 0;
1da177e4
LT
4504}
4505
4417da66
PM
4506/**
4507 * dev_set_promiscuity - update promiscuity count on a device
4508 * @dev: device
4509 * @inc: modifier
4510 *
4511 * Add or remove promiscuity from a device. While the count in the device
4512 * remains above zero the interface remains promiscuous. Once it hits zero
4513 * the device reverts back to normal filtering operation. A negative inc
4514 * value is used to drop promiscuity on the device.
dad9b335 4515 * Return 0 if successful or a negative errno code on error.
4417da66 4516 */
dad9b335 4517int dev_set_promiscuity(struct net_device *dev, int inc)
4417da66 4518{
b536db93 4519 unsigned int old_flags = dev->flags;
dad9b335 4520 int err;
4417da66 4521
dad9b335 4522 err = __dev_set_promiscuity(dev, inc);
4b5a698e 4523 if (err < 0)
dad9b335 4524 return err;
4417da66
PM
4525 if (dev->flags != old_flags)
4526 dev_set_rx_mode(dev);
dad9b335 4527 return err;
4417da66 4528}
d1b19dff 4529EXPORT_SYMBOL(dev_set_promiscuity);
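/*
 * Illustrative sketch (not part of dev.c): a packet-capture style user
 * taking and releasing a promiscuity reference.  Both calls must run
 * under the RTNL lock, which __dev_set_promiscuity() asserts above.
 */
static int my_capture_start(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_set_promiscuity(dev, 1);
	rtnl_unlock();
	return err;
}

static void my_capture_stop(struct net_device *dev)
{
	rtnl_lock();
	dev_set_promiscuity(dev, -1);
	rtnl_unlock();
}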
4417da66 4530
1da177e4
LT
4531/**
4532 * dev_set_allmulti - update allmulti count on a device
4533 * @dev: device
4534 * @inc: modifier
4535 *
4536 * Add or remove reception of all multicast frames to a device. While the
4537 * count in the device remains above zero the interface remains listening
4538 * to all interfaces. Once it hits zero the device reverts back to normal
4539 * filtering operation. A negative @inc value is used to drop the counter
4540 * when releasing a resource needing all multicasts.
dad9b335 4541 * Return 0 if successful or a negative errno code on error.
1da177e4
LT
4542 */
4543
dad9b335 4544int dev_set_allmulti(struct net_device *dev, int inc)
1da177e4 4545{
b536db93 4546 unsigned int old_flags = dev->flags;
1da177e4 4547
24023451
PM
4548 ASSERT_RTNL();
4549
1da177e4 4550 dev->flags |= IFF_ALLMULTI;
dad9b335
WC
4551 dev->allmulti += inc;
4552 if (dev->allmulti == 0) {
4553 /*
4554 * Avoid overflow.
 4555 * If inc causes overflow, leave allmulti untouched and return an error.
4556 */
4557 if (inc < 0)
4558 dev->flags &= ~IFF_ALLMULTI;
4559 else {
4560 dev->allmulti -= inc;
7b6cd1ce
JP
4561 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4562 dev->name);
dad9b335
WC
4563 return -EOVERFLOW;
4564 }
4565 }
24023451 4566 if (dev->flags ^ old_flags) {
b6c40d68 4567 dev_change_rx_flags(dev, IFF_ALLMULTI);
4417da66 4568 dev_set_rx_mode(dev);
24023451 4569 }
dad9b335 4570 return 0;
4417da66 4571}
d1b19dff 4572EXPORT_SYMBOL(dev_set_allmulti);
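/*
 * Illustrative sketch (not part of dev.c): a multicast-routing style
 * user bumping and dropping the allmulti count.  As with promiscuity,
 * the caller is expected to hold the RTNL lock.
 */
static int my_mroute_attach(struct net_device *dev)
{
	ASSERT_RTNL();
	return dev_set_allmulti(dev, 1);
}

static void my_mroute_detach(struct net_device *dev)
{
	ASSERT_RTNL();
	dev_set_allmulti(dev, -1);
}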
4417da66
PM
4573
4574/*
4575 * Upload unicast and multicast address lists to device and
4576 * configure RX filtering. When the device doesn't support unicast
53ccaae1 4577 * filtering it is put in promiscuous mode while unicast addresses
4417da66
PM
4578 * are present.
4579 */
4580void __dev_set_rx_mode(struct net_device *dev)
4581{
d314774c
SH
4582 const struct net_device_ops *ops = dev->netdev_ops;
4583
4417da66
PM
4584 /* dev_open will call this function so the list will stay sane. */
4585 if (!(dev->flags&IFF_UP))
4586 return;
4587
4588 if (!netif_device_present(dev))
40b77c94 4589 return;
4417da66 4590
01789349 4591 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4417da66
PM
4592 /* Unicast addresses changes may only happen under the rtnl,
4593 * therefore calling __dev_set_promiscuity here is safe.
4594 */
32e7bfc4 4595 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4417da66 4596 __dev_set_promiscuity(dev, 1);
2d348d1f 4597 dev->uc_promisc = true;
32e7bfc4 4598 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4417da66 4599 __dev_set_promiscuity(dev, -1);
2d348d1f 4600 dev->uc_promisc = false;
4417da66 4601 }
4417da66 4602 }
01789349
JP
4603
4604 if (ops->ndo_set_rx_mode)
4605 ops->ndo_set_rx_mode(dev);
4417da66
PM
4606}
4607
4608void dev_set_rx_mode(struct net_device *dev)
4609{
b9e40857 4610 netif_addr_lock_bh(dev);
4417da66 4611 __dev_set_rx_mode(dev);
b9e40857 4612 netif_addr_unlock_bh(dev);
1da177e4
LT
4613}
4614
f0db275a
SH
4615/**
4616 * dev_get_flags - get flags reported to userspace
4617 * @dev: device
4618 *
4619 * Get the combination of flag bits exported through APIs to userspace.
4620 */
1da177e4
LT
4621unsigned dev_get_flags(const struct net_device *dev)
4622{
4623 unsigned flags;
4624
4625 flags = (dev->flags & ~(IFF_PROMISC |
4626 IFF_ALLMULTI |
b00055aa
SR
4627 IFF_RUNNING |
4628 IFF_LOWER_UP |
4629 IFF_DORMANT)) |
1da177e4
LT
4630 (dev->gflags & (IFF_PROMISC |
4631 IFF_ALLMULTI));
4632
b00055aa
SR
4633 if (netif_running(dev)) {
4634 if (netif_oper_up(dev))
4635 flags |= IFF_RUNNING;
4636 if (netif_carrier_ok(dev))
4637 flags |= IFF_LOWER_UP;
4638 if (netif_dormant(dev))
4639 flags |= IFF_DORMANT;
4640 }
1da177e4
LT
4641
4642 return flags;
4643}
d1b19dff 4644EXPORT_SYMBOL(dev_get_flags);
1da177e4 4645
bd380811 4646int __dev_change_flags(struct net_device *dev, unsigned int flags)
1da177e4 4647{
b536db93 4648 unsigned int old_flags = dev->flags;
bd380811 4649 int ret;
1da177e4 4650
24023451
PM
4651 ASSERT_RTNL();
4652
1da177e4
LT
4653 /*
4654 * Set the flags on our device.
4655 */
4656
4657 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4658 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4659 IFF_AUTOMEDIA)) |
4660 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4661 IFF_ALLMULTI));
4662
4663 /*
4664 * Load in the correct multicast list now the flags have changed.
4665 */
4666
b6c40d68
PM
4667 if ((old_flags ^ flags) & IFF_MULTICAST)
4668 dev_change_rx_flags(dev, IFF_MULTICAST);
24023451 4669
4417da66 4670 dev_set_rx_mode(dev);
1da177e4
LT
4671
4672 /*
 4673 * Have we downed the interface? We handle IFF_UP ourselves
4674 * according to user attempts to set it, rather than blindly
4675 * setting it.
4676 */
4677
4678 ret = 0;
4679 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
bd380811 4680 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
1da177e4
LT
4681
4682 if (!ret)
4417da66 4683 dev_set_rx_mode(dev);
1da177e4
LT
4684 }
4685
1da177e4 4686 if ((flags ^ dev->gflags) & IFF_PROMISC) {
d1b19dff
ED
4687 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4688
1da177e4
LT
4689 dev->gflags ^= IFF_PROMISC;
4690 dev_set_promiscuity(dev, inc);
4691 }
4692
 4693 /* NOTE: the order of synchronization of IFF_PROMISC and IFF_ALLMULTI
 4694 is important. Some (broken) drivers set IFF_PROMISC when
 4695 IFF_ALLMULTI is requested, without asking us and without reporting it.
4696 */
4697 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
d1b19dff
ED
4698 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4699
1da177e4
LT
4700 dev->gflags ^= IFF_ALLMULTI;
4701 dev_set_allmulti(dev, inc);
4702 }
4703
bd380811
PM
4704 return ret;
4705}
4706
4707void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4708{
4709 unsigned int changes = dev->flags ^ old_flags;
4710
4711 if (changes & IFF_UP) {
4712 if (dev->flags & IFF_UP)
4713 call_netdevice_notifiers(NETDEV_UP, dev);
4714 else
4715 call_netdevice_notifiers(NETDEV_DOWN, dev);
4716 }
4717
4718 if (dev->flags & IFF_UP &&
4719 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4720 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4721}
4722
4723/**
4724 * dev_change_flags - change device settings
4725 * @dev: device
4726 * @flags: device state flags
4727 *
4728 * Change settings on device based state flags. The flags are
4729 * in the userspace exported format.
4730 */
b536db93 4731int dev_change_flags(struct net_device *dev, unsigned int flags)
bd380811 4732{
b536db93
ED
4733 int ret;
4734 unsigned int changes, old_flags = dev->flags;
bd380811
PM
4735
4736 ret = __dev_change_flags(dev, flags);
4737 if (ret < 0)
4738 return ret;
4739
4740 changes = old_flags ^ dev->flags;
7c355f53
TG
4741 if (changes)
4742 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
1da177e4 4743
bd380811 4744 __dev_notify_flags(dev, old_flags);
1da177e4
LT
4745 return ret;
4746}
d1b19dff 4747EXPORT_SYMBOL(dev_change_flags);
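/*
 * Illustrative sketch (not part of dev.c): bringing an interface
 * administratively up from kernel code the same way SIOCSIFFLAGS does,
 * by adding IFF_UP to the flags reported by dev_get_flags() and passing
 * them to dev_change_flags() under the RTNL lock.
 */
static int my_bring_up(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_flags(dev, dev_get_flags(dev) | IFF_UP);
	rtnl_unlock();
	return err;
}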
1da177e4 4748
f0db275a
SH
4749/**
4750 * dev_set_mtu - Change maximum transfer unit
4751 * @dev: device
4752 * @new_mtu: new transfer unit
4753 *
4754 * Change the maximum transfer size of the network device.
4755 */
1da177e4
LT
4756int dev_set_mtu(struct net_device *dev, int new_mtu)
4757{
d314774c 4758 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4759 int err;
4760
4761 if (new_mtu == dev->mtu)
4762 return 0;
4763
4764 /* MTU must be positive. */
4765 if (new_mtu < 0)
4766 return -EINVAL;
4767
4768 if (!netif_device_present(dev))
4769 return -ENODEV;
4770
4771 err = 0;
d314774c
SH
4772 if (ops->ndo_change_mtu)
4773 err = ops->ndo_change_mtu(dev, new_mtu);
1da177e4
LT
4774 else
4775 dev->mtu = new_mtu;
d314774c 4776
1da177e4 4777 if (!err && dev->flags & IFF_UP)
056925ab 4778 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
1da177e4
LT
4779 return err;
4780}
d1b19dff 4781EXPORT_SYMBOL(dev_set_mtu);
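/*
 * Illustrative sketch (not part of dev.c): shrinking a device's MTU to
 * make room for encapsulation overhead.  dev_set_mtu() validates the
 * value, calls the driver's ndo_change_mtu() when present and raises
 * NETDEV_CHANGEMTU; the overhead value here is arbitrary.
 */
static int my_reserve_headroom(struct net_device *dev, int overhead)
{
	int err;

	rtnl_lock();
	err = dev_set_mtu(dev, dev->mtu - overhead);
	rtnl_unlock();
	return err;
}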
1da177e4 4782
cbda10fa
VD
4783/**
4784 * dev_set_group - Change group this device belongs to
4785 * @dev: device
4786 * @new_group: group this device should belong to
4787 */
4788void dev_set_group(struct net_device *dev, int new_group)
4789{
4790 dev->group = new_group;
4791}
4792EXPORT_SYMBOL(dev_set_group);
4793
f0db275a
SH
4794/**
4795 * dev_set_mac_address - Change Media Access Control Address
4796 * @dev: device
4797 * @sa: new address
4798 *
4799 * Change the hardware (MAC) address of the device
4800 */
1da177e4
LT
4801int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4802{
d314774c 4803 const struct net_device_ops *ops = dev->netdev_ops;
1da177e4
LT
4804 int err;
4805
d314774c 4806 if (!ops->ndo_set_mac_address)
1da177e4
LT
4807 return -EOPNOTSUPP;
4808 if (sa->sa_family != dev->type)
4809 return -EINVAL;
4810 if (!netif_device_present(dev))
4811 return -ENODEV;
d314774c 4812 err = ops->ndo_set_mac_address(dev, sa);
1da177e4 4813 if (!err)
056925ab 4814 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
1da177e4
LT
4815 return err;
4816}
d1b19dff 4817EXPORT_SYMBOL(dev_set_mac_address);
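/*
 * Illustrative sketch (not part of dev.c): programming a new hardware
 * address from kernel code.  The sockaddr carries the device type in
 * sa_family, which dev_set_mac_address() checks against dev->type.
 * Assumes dev->addr_len fits in sa_data (true for Ethernet).
 */
static int my_set_mac(struct net_device *dev, const u8 *addr)
{
	struct sockaddr sa;
	int err;

	sa.sa_family = dev->type;
	memcpy(sa.sa_data, addr, dev->addr_len);

	rtnl_lock();
	err = dev_set_mac_address(dev, &sa);
	rtnl_unlock();
	return err;
}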
1da177e4
LT
4818
4819/*
3710becf 4820 * Perform the SIOCxIFxxx calls, inside rcu_read_lock()
1da177e4 4821 */
14e3e079 4822static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
1da177e4
LT
4823{
4824 int err;
3710becf 4825 struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
1da177e4
LT
4826
4827 if (!dev)
4828 return -ENODEV;
4829
4830 switch (cmd) {
d1b19dff
ED
4831 case SIOCGIFFLAGS: /* Get interface flags */
4832 ifr->ifr_flags = (short) dev_get_flags(dev);
4833 return 0;
1da177e4 4834
d1b19dff
ED
4835 case SIOCGIFMETRIC: /* Get the metric on the interface
4836 (currently unused) */
4837 ifr->ifr_metric = 0;
4838 return 0;
1da177e4 4839
d1b19dff
ED
4840 case SIOCGIFMTU: /* Get the MTU of a device */
4841 ifr->ifr_mtu = dev->mtu;
4842 return 0;
1da177e4 4843
d1b19dff
ED
4844 case SIOCGIFHWADDR:
4845 if (!dev->addr_len)
4846 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4847 else
4848 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4849 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4850 ifr->ifr_hwaddr.sa_family = dev->type;
4851 return 0;
1da177e4 4852
d1b19dff
ED
4853 case SIOCGIFSLAVE:
4854 err = -EINVAL;
4855 break;
14e3e079 4856
d1b19dff
ED
4857 case SIOCGIFMAP:
4858 ifr->ifr_map.mem_start = dev->mem_start;
4859 ifr->ifr_map.mem_end = dev->mem_end;
4860 ifr->ifr_map.base_addr = dev->base_addr;
4861 ifr->ifr_map.irq = dev->irq;
4862 ifr->ifr_map.dma = dev->dma;
4863 ifr->ifr_map.port = dev->if_port;
4864 return 0;
14e3e079 4865
d1b19dff
ED
4866 case SIOCGIFINDEX:
4867 ifr->ifr_ifindex = dev->ifindex;
4868 return 0;
14e3e079 4869
d1b19dff
ED
4870 case SIOCGIFTXQLEN:
4871 ifr->ifr_qlen = dev->tx_queue_len;
4872 return 0;
14e3e079 4873
d1b19dff
ED
4874 default:
4875 /* dev_ioctl() should ensure this case
4876 * is never reached
4877 */
4878 WARN_ON(1);
41c31f31 4879 err = -ENOTTY;
d1b19dff 4880 break;
14e3e079
JG
4881
4882 }
4883 return err;
4884}
4885
4886/*
4887 * Perform the SIOCxIFxxx calls, inside rtnl_lock()
4888 */
4889static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4890{
4891 int err;
4892 struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
5f2f6da7 4893 const struct net_device_ops *ops;
14e3e079
JG
4894
4895 if (!dev)
4896 return -ENODEV;
4897
5f2f6da7
JP
4898 ops = dev->netdev_ops;
4899
14e3e079 4900 switch (cmd) {
d1b19dff
ED
4901 case SIOCSIFFLAGS: /* Set interface flags */
4902 return dev_change_flags(dev, ifr->ifr_flags);
14e3e079 4903
d1b19dff
ED
4904 case SIOCSIFMETRIC: /* Set the metric on the interface
4905 (currently unused) */
4906 return -EOPNOTSUPP;
14e3e079 4907
d1b19dff
ED
4908 case SIOCSIFMTU: /* Set the MTU of a device */
4909 return dev_set_mtu(dev, ifr->ifr_mtu);
1da177e4 4910
d1b19dff
ED
4911 case SIOCSIFHWADDR:
4912 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
1da177e4 4913
d1b19dff
ED
4914 case SIOCSIFHWBROADCAST:
4915 if (ifr->ifr_hwaddr.sa_family != dev->type)
4916 return -EINVAL;
4917 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4918 min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4919 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4920 return 0;
1da177e4 4921
d1b19dff
ED
4922 case SIOCSIFMAP:
4923 if (ops->ndo_set_config) {
1da177e4
LT
4924 if (!netif_device_present(dev))
4925 return -ENODEV;
d1b19dff
ED
4926 return ops->ndo_set_config(dev, &ifr->ifr_map);
4927 }
4928 return -EOPNOTSUPP;
1da177e4 4929
d1b19dff 4930 case SIOCADDMULTI:
b81693d9 4931 if (!ops->ndo_set_rx_mode ||
d1b19dff
ED
4932 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4933 return -EINVAL;
4934 if (!netif_device_present(dev))
4935 return -ENODEV;
22bedad3 4936 return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
d1b19dff
ED
4937
4938 case SIOCDELMULTI:
b81693d9 4939 if (!ops->ndo_set_rx_mode ||
d1b19dff
ED
4940 ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4941 return -EINVAL;
4942 if (!netif_device_present(dev))
4943 return -ENODEV;
22bedad3 4944 return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
1da177e4 4945
d1b19dff
ED
4946 case SIOCSIFTXQLEN:
4947 if (ifr->ifr_qlen < 0)
4948 return -EINVAL;
4949 dev->tx_queue_len = ifr->ifr_qlen;
4950 return 0;
1da177e4 4951
d1b19dff
ED
4952 case SIOCSIFNAME:
4953 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4954 return dev_change_name(dev, ifr->ifr_newname);
1da177e4 4955
4dc360c5
RC
4956 case SIOCSHWTSTAMP:
4957 err = net_hwtstamp_validate(ifr);
4958 if (err)
4959 return err;
4960 /* fall through */
4961
d1b19dff
ED
4962 /*
4963 * Unknown or private ioctl
4964 */
4965 default:
4966 if ((cmd >= SIOCDEVPRIVATE &&
4967 cmd <= SIOCDEVPRIVATE + 15) ||
4968 cmd == SIOCBONDENSLAVE ||
4969 cmd == SIOCBONDRELEASE ||
4970 cmd == SIOCBONDSETHWADDR ||
4971 cmd == SIOCBONDSLAVEINFOQUERY ||
4972 cmd == SIOCBONDINFOQUERY ||
4973 cmd == SIOCBONDCHANGEACTIVE ||
4974 cmd == SIOCGMIIPHY ||
4975 cmd == SIOCGMIIREG ||
4976 cmd == SIOCSMIIREG ||
4977 cmd == SIOCBRADDIF ||
4978 cmd == SIOCBRDELIF ||
4979 cmd == SIOCSHWTSTAMP ||
4980 cmd == SIOCWANDEV) {
4981 err = -EOPNOTSUPP;
4982 if (ops->ndo_do_ioctl) {
4983 if (netif_device_present(dev))
4984 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4985 else
4986 err = -ENODEV;
4987 }
4988 } else
4989 err = -EINVAL;
1da177e4
LT
4990
4991 }
4992 return err;
4993}
4994
4995/*
4996 * This function handles all "interface"-type I/O control requests. The actual
4997 * 'doing' part of this is dev_ifsioc above.
4998 */
4999
5000/**
5001 * dev_ioctl - network device ioctl
c4ea43c5 5002 * @net: the applicable net namespace
1da177e4
LT
5003 * @cmd: command to issue
5004 * @arg: pointer to a struct ifreq in user space
5005 *
5006 * Issue ioctl functions to devices. This is normally called by the
5007 * user space syscall interfaces but can sometimes be useful for
5008 * other purposes. The return value is the return from the syscall if
5009 * positive or a negative errno code on error.
5010 */
5011
881d966b 5012int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1da177e4
LT
5013{
5014 struct ifreq ifr;
5015 int ret;
5016 char *colon;
5017
5018 /* One special case: SIOCGIFCONF takes ifconf argument
5019 and requires shared lock, because it sleeps writing
5020 to user space.
5021 */
5022
5023 if (cmd == SIOCGIFCONF) {
6756ae4b 5024 rtnl_lock();
881d966b 5025 ret = dev_ifconf(net, (char __user *) arg);
6756ae4b 5026 rtnl_unlock();
1da177e4
LT
5027 return ret;
5028 }
5029 if (cmd == SIOCGIFNAME)
881d966b 5030 return dev_ifname(net, (struct ifreq __user *)arg);
1da177e4
LT
5031
5032 if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
5033 return -EFAULT;
5034
5035 ifr.ifr_name[IFNAMSIZ-1] = 0;
5036
5037 colon = strchr(ifr.ifr_name, ':');
5038 if (colon)
5039 *colon = 0;
5040
5041 /*
5042 * See which interface the caller is talking about.
5043 */
5044
5045 switch (cmd) {
d1b19dff
ED
5046 /*
5047 * These ioctl calls:
5048 * - can be done by all.
5049 * - atomic and do not require locking.
5050 * - return a value
5051 */
5052 case SIOCGIFFLAGS:
5053 case SIOCGIFMETRIC:
5054 case SIOCGIFMTU:
5055 case SIOCGIFHWADDR:
5056 case SIOCGIFSLAVE:
5057 case SIOCGIFMAP:
5058 case SIOCGIFINDEX:
5059 case SIOCGIFTXQLEN:
5060 dev_load(net, ifr.ifr_name);
3710becf 5061 rcu_read_lock();
d1b19dff 5062 ret = dev_ifsioc_locked(net, &ifr, cmd);
3710becf 5063 rcu_read_unlock();
d1b19dff
ED
5064 if (!ret) {
5065 if (colon)
5066 *colon = ':';
5067 if (copy_to_user(arg, &ifr,
5068 sizeof(struct ifreq)))
5069 ret = -EFAULT;
5070 }
5071 return ret;
1da177e4 5072
d1b19dff
ED
5073 case SIOCETHTOOL:
5074 dev_load(net, ifr.ifr_name);
5075 rtnl_lock();
5076 ret = dev_ethtool(net, &ifr);
5077 rtnl_unlock();
5078 if (!ret) {
5079 if (colon)
5080 *colon = ':';
5081 if (copy_to_user(arg, &ifr,
5082 sizeof(struct ifreq)))
5083 ret = -EFAULT;
5084 }
5085 return ret;
1da177e4 5086
d1b19dff
ED
5087 /*
5088 * These ioctl calls:
5089 * - require superuser power.
5090 * - require strict serialization.
5091 * - return a value
5092 */
5093 case SIOCGMIIPHY:
5094 case SIOCGMIIREG:
5095 case SIOCSIFNAME:
5096 if (!capable(CAP_NET_ADMIN))
5097 return -EPERM;
5098 dev_load(net, ifr.ifr_name);
5099 rtnl_lock();
5100 ret = dev_ifsioc(net, &ifr, cmd);
5101 rtnl_unlock();
5102 if (!ret) {
5103 if (colon)
5104 *colon = ':';
5105 if (copy_to_user(arg, &ifr,
5106 sizeof(struct ifreq)))
5107 ret = -EFAULT;
5108 }
5109 return ret;
1da177e4 5110
d1b19dff
ED
5111 /*
5112 * These ioctl calls:
5113 * - require superuser power.
5114 * - require strict serialization.
5115 * - do not return a value
5116 */
5117 case SIOCSIFFLAGS:
5118 case SIOCSIFMETRIC:
5119 case SIOCSIFMTU:
5120 case SIOCSIFMAP:
5121 case SIOCSIFHWADDR:
5122 case SIOCSIFSLAVE:
5123 case SIOCADDMULTI:
5124 case SIOCDELMULTI:
5125 case SIOCSIFHWBROADCAST:
5126 case SIOCSIFTXQLEN:
5127 case SIOCSMIIREG:
5128 case SIOCBONDENSLAVE:
5129 case SIOCBONDRELEASE:
5130 case SIOCBONDSETHWADDR:
5131 case SIOCBONDCHANGEACTIVE:
5132 case SIOCBRADDIF:
5133 case SIOCBRDELIF:
5134 case SIOCSHWTSTAMP:
5135 if (!capable(CAP_NET_ADMIN))
5136 return -EPERM;
5137 /* fall through */
5138 case SIOCBONDSLAVEINFOQUERY:
5139 case SIOCBONDINFOQUERY:
5140 dev_load(net, ifr.ifr_name);
5141 rtnl_lock();
5142 ret = dev_ifsioc(net, &ifr, cmd);
5143 rtnl_unlock();
5144 return ret;
5145
5146 case SIOCGIFMEM:
5147 /* Get the per device memory space. We can add this but
5148 * currently do not support it */
5149 case SIOCSIFMEM:
5150 /* Set the per device memory buffer space.
5151 * Not applicable in our case */
5152 case SIOCSIFLINK:
41c31f31 5153 return -ENOTTY;
d1b19dff
ED
5154
5155 /*
5156 * Unknown or private ioctl.
5157 */
5158 default:
5159 if (cmd == SIOCWANDEV ||
5160 (cmd >= SIOCDEVPRIVATE &&
5161 cmd <= SIOCDEVPRIVATE + 15)) {
881d966b 5162 dev_load(net, ifr.ifr_name);
1da177e4 5163 rtnl_lock();
881d966b 5164 ret = dev_ifsioc(net, &ifr, cmd);
1da177e4 5165 rtnl_unlock();
d1b19dff
ED
5166 if (!ret && copy_to_user(arg, &ifr,
5167 sizeof(struct ifreq)))
5168 ret = -EFAULT;
1da177e4 5169 return ret;
d1b19dff
ED
5170 }
5171 /* Take care of Wireless Extensions */
5172 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5173 return wext_handle_ioctl(net, &ifr, cmd, arg);
41c31f31 5174 return -ENOTTY;
1da177e4
LT
5175 }
5176}
5177
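/*
 * Userspace sketch (not part of dev.c): exercising the unprivileged,
 * read-only branch of dev_ioctl() above with SIOCGIFMTU, which is
 * answered from dev_ifsioc_locked() under rcu_read_lock().
 */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>

int print_mtu(const char *ifname)
{
	struct ifreq ifr;
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;

	memset(&ifr, 0, sizeof(ifr));
	strncpy(ifr.ifr_name, ifname, IFNAMSIZ - 1);

	if (ioctl(fd, SIOCGIFMTU, &ifr) < 0) {
		close(fd);
		return -1;
	}

	printf("%s: mtu %d\n", ifname, ifr.ifr_mtu);
	close(fd);
	return 0;
}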
5178
5179/**
5180 * dev_new_index - allocate an ifindex
c4ea43c5 5181 * @net: the applicable net namespace
1da177e4
LT
5182 *
5183 * Returns a suitable unique value for a new device interface
5184 * number. The caller must hold the rtnl semaphore or the
5185 * dev_base_lock to be sure it remains unique.
5186 */
881d966b 5187static int dev_new_index(struct net *net)
1da177e4
LT
5188{
5189 static int ifindex;
5190 for (;;) {
5191 if (++ifindex <= 0)
5192 ifindex = 1;
881d966b 5193 if (!__dev_get_by_index(net, ifindex))
1da177e4
LT
5194 return ifindex;
5195 }
5196}
5197
1da177e4 5198/* Delayed registration/unregistration */
3b5b34fd 5199static LIST_HEAD(net_todo_list);
1da177e4 5200
6f05f629 5201static void net_set_todo(struct net_device *dev)
1da177e4 5202{
1da177e4 5203 list_add_tail(&dev->todo_list, &net_todo_list);
1da177e4
LT
5204}
5205
9b5e383c 5206static void rollback_registered_many(struct list_head *head)
93ee31f1 5207{
e93737b0 5208 struct net_device *dev, *tmp;
9b5e383c 5209
93ee31f1
DL
5210 BUG_ON(dev_boot_phase);
5211 ASSERT_RTNL();
5212
e93737b0 5213 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
9b5e383c 5214 /* Some devices call unregister without ever having registered,
e93737b0
KK
 5215 * as part of initialization unwind. Remove those
 5216 * devices and proceed with the remaining.
9b5e383c
ED
5217 */
5218 if (dev->reg_state == NETREG_UNINITIALIZED) {
7b6cd1ce
JP
5219 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
5220 dev->name, dev);
93ee31f1 5221
9b5e383c 5222 WARN_ON(1);
e93737b0
KK
5223 list_del(&dev->unreg_list);
5224 continue;
9b5e383c 5225 }
449f4544 5226 dev->dismantle = true;
9b5e383c 5227 BUG_ON(dev->reg_state != NETREG_REGISTERED);
44345724 5228 }
93ee31f1 5229
44345724
OP
5230 /* If device is running, close it first. */
5231 dev_close_many(head);
93ee31f1 5232
44345724 5233 list_for_each_entry(dev, head, unreg_list) {
9b5e383c
ED
5234 /* And unlink it from device chain. */
5235 unlist_netdevice(dev);
93ee31f1 5236
9b5e383c
ED
5237 dev->reg_state = NETREG_UNREGISTERING;
5238 }
93ee31f1
DL
5239
5240 synchronize_net();
5241
9b5e383c
ED
5242 list_for_each_entry(dev, head, unreg_list) {
5243 /* Shutdown queueing discipline. */
5244 dev_shutdown(dev);
93ee31f1
DL
5245
5246
9b5e383c
ED
5247 /* Notify protocols, that we are about to destroy
5248 this device. They should clean all the things.
5249 */
5250 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
93ee31f1 5251
a2835763
PM
5252 if (!dev->rtnl_link_ops ||
5253 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5254 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5255
9b5e383c
ED
5256 /*
5257 * Flush the unicast and multicast chains
5258 */
a748ee24 5259 dev_uc_flush(dev);
22bedad3 5260 dev_mc_flush(dev);
93ee31f1 5261
9b5e383c
ED
5262 if (dev->netdev_ops->ndo_uninit)
5263 dev->netdev_ops->ndo_uninit(dev);
93ee31f1 5264
9b5e383c
ED
5265 /* Notifier chain MUST detach us from master device. */
5266 WARN_ON(dev->master);
93ee31f1 5267
9b5e383c
ED
5268 /* Remove entries from kobject tree */
5269 netdev_unregister_kobject(dev);
5270 }
93ee31f1 5271
a5ee1551 5272 /* Process any work delayed until the end of the batch */
e5e26d75 5273 dev = list_first_entry(head, struct net_device, unreg_list);
a5ee1551 5274 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
93ee31f1 5275
850a545b 5276 synchronize_net();
395264d5 5277
a5ee1551 5278 list_for_each_entry(dev, head, unreg_list)
9b5e383c
ED
5279 dev_put(dev);
5280}
5281
5282static void rollback_registered(struct net_device *dev)
5283{
5284 LIST_HEAD(single);
5285
5286 list_add(&dev->unreg_list, &single);
5287 rollback_registered_many(&single);
ceaaec98 5288 list_del(&single);
93ee31f1
DL
5289}
5290
c8f44aff
MM
5291static netdev_features_t netdev_fix_features(struct net_device *dev,
5292 netdev_features_t features)
b63365a2 5293{
57422dc5
MM
5294 /* Fix illegal checksum combinations */
5295 if ((features & NETIF_F_HW_CSUM) &&
5296 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5297 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
57422dc5
MM
5298 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5299 }
5300
b63365a2
HX
5301 /* Fix illegal SG+CSUM combinations. */
5302 if ((features & NETIF_F_SG) &&
5303 !(features & NETIF_F_ALL_CSUM)) {
6f404e44
MM
5304 netdev_dbg(dev,
5305 "Dropping NETIF_F_SG since no checksum feature.\n");
b63365a2
HX
5306 features &= ~NETIF_F_SG;
5307 }
5308
5309 /* TSO requires that SG is present as well. */
ea2d3688 5310 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
6f404e44 5311 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
ea2d3688 5312 features &= ~NETIF_F_ALL_TSO;
b63365a2
HX
5313 }
5314
31d8b9e0
BH
5315 /* TSO ECN requires that TSO is present as well. */
5316 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5317 features &= ~NETIF_F_TSO_ECN;
5318
212b573f
MM
5319 /* Software GSO depends on SG. */
5320 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
6f404e44 5321 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
212b573f
MM
5322 features &= ~NETIF_F_GSO;
5323 }
5324
acd1130e 5325 /* UFO needs SG and checksumming */
b63365a2 5326 if (features & NETIF_F_UFO) {
79032644
MM
5327 /* maybe split UFO into V4 and V6? */
5328 if (!((features & NETIF_F_GEN_CSUM) ||
5329 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5330 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
6f404e44 5331 netdev_dbg(dev,
acd1130e 5332 "Dropping NETIF_F_UFO since no checksum offload features.\n");
b63365a2
HX
5333 features &= ~NETIF_F_UFO;
5334 }
5335
5336 if (!(features & NETIF_F_SG)) {
6f404e44 5337 netdev_dbg(dev,
acd1130e 5338 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
b63365a2
HX
5339 features &= ~NETIF_F_UFO;
5340 }
5341 }
5342
5343 return features;
5344}
b63365a2 5345
6cb6a27c 5346int __netdev_update_features(struct net_device *dev)
5455c699 5347{
c8f44aff 5348 netdev_features_t features;
5455c699
MM
5349 int err = 0;
5350
87267485
MM
5351 ASSERT_RTNL();
5352
5455c699
MM
5353 features = netdev_get_wanted_features(dev);
5354
5355 if (dev->netdev_ops->ndo_fix_features)
5356 features = dev->netdev_ops->ndo_fix_features(dev, features);
5357
5358 /* driver might be less strict about feature dependencies */
5359 features = netdev_fix_features(dev, features);
5360
5361 if (dev->features == features)
6cb6a27c 5362 return 0;
5455c699 5363
c8f44aff
MM
5364 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5365 &dev->features, &features);
5455c699
MM
5366
5367 if (dev->netdev_ops->ndo_set_features)
5368 err = dev->netdev_ops->ndo_set_features(dev, features);
5369
6cb6a27c 5370 if (unlikely(err < 0)) {
5455c699 5371 netdev_err(dev,
c8f44aff
MM
5372 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5373 err, &features, &dev->features);
6cb6a27c
MM
5374 return -1;
5375 }
5376
5377 if (!err)
5378 dev->features = features;
5379
5380 return 1;
5381}
5382
afe12cc8
MM
5383/**
5384 * netdev_update_features - recalculate device features
5385 * @dev: the device to check
5386 *
 5387 * Recalculate the dev->features set and send notifications if it
 5388 * has changed. Should be called after driver- or hardware-dependent
 5389 * conditions that influence the features might have changed.
5390 */
6cb6a27c
MM
5391void netdev_update_features(struct net_device *dev)
5392{
5393 if (__netdev_update_features(dev))
5394 netdev_features_change(dev);
5455c699
MM
5395}
5396EXPORT_SYMBOL(netdev_update_features);
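/*
 * Illustrative sketch, not part of dev.c: how a driver might plug into the
 * feature-recalculation path above. The "foo" driver, its private struct and
 * the hw_can_csum flag are hypothetical; ndo_fix_features/ndo_set_features
 * and netdev_update_features() are the real hooks used by
 * __netdev_update_features().
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

struct foo_priv {
	bool hw_can_csum;		/* hypothetical hardware capability */
};

static netdev_features_t foo_fix_features(struct net_device *dev,
					  netdev_features_t features)
{
	struct foo_priv *priv = netdev_priv(dev);

	/* Drop checksum offloads the hardware cannot provide right now;
	 * netdev_fix_features() will then also drop whatever depends on them.
	 */
	if (!priv->hw_can_csum)
		features &= ~NETIF_F_ALL_CSUM;
	return features;
}

static int foo_set_features(struct net_device *dev, netdev_features_t features)
{
	/* Program the hardware for the new feature set; return 0 on success. */
	return 0;
}

/* Assigned to dev->netdev_ops when the hypothetical device is set up. */
static const struct net_device_ops foo_netdev_ops = {
	.ndo_fix_features	= foo_fix_features,
	.ndo_set_features	= foo_set_features,
};

/* Called by the driver when the relevant hardware condition changes. */
static void foo_csum_capability_changed(struct net_device *dev, bool on)
{
	struct foo_priv *priv = netdev_priv(dev);

	priv->hw_can_csum = on;
	rtnl_lock();
	netdev_update_features(dev);	/* recompute, notify only if changed */
	rtnl_unlock();
}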
5397
afe12cc8
MM
5398/**
5399 * netdev_change_features - recalculate device features
5400 * @dev: the device to check
5401 *
 5402 * Recalculate the dev->features set and send notifications even
 5403 * if it has not changed. Should be called instead of
 5404 * netdev_update_features() if dev->vlan_features might also
 5405 * have changed, so that the changes can be propagated to stacked
 5406 * VLAN devices.
5407 */
5408void netdev_change_features(struct net_device *dev)
5409{
5410 __netdev_update_features(dev);
5411 netdev_features_change(dev);
5412}
5413EXPORT_SYMBOL(netdev_change_features);
5414
fc4a7489
PM
5415/**
5416 * netif_stacked_transfer_operstate - transfer operstate
5417 * @rootdev: the root or lower level device to transfer state from
5418 * @dev: the device to transfer operstate to
5419 *
5420 * Transfer operational state from root to device. This is normally
5421 * called when a stacking relationship exists between the root
 5422 * device and the device (a leaf device).
5423 */
5424void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5425 struct net_device *dev)
5426{
5427 if (rootdev->operstate == IF_OPER_DORMANT)
5428 netif_dormant_on(dev);
5429 else
5430 netif_dormant_off(dev);
5431
5432 if (netif_carrier_ok(rootdev)) {
5433 if (!netif_carrier_ok(dev))
5434 netif_carrier_on(dev);
5435 } else {
5436 if (netif_carrier_ok(dev))
5437 netif_carrier_off(dev);
5438 }
5439}
5440EXPORT_SYMBOL(netif_stacked_transfer_operstate);
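/*
 * Illustrative sketch, not part of dev.c: a stacked driver (in the spirit of
 * 802.1q or macvlan) mirroring its lower device's state with the helper
 * above. foo_upper_dev/foo_lower_dev stand in for whatever pairing the real
 * driver maintains; in this kernel the notifier payload is the
 * struct net_device pointer itself.
 */
#include <linux/netdevice.h>
#include <linux/notifier.h>

static struct net_device *foo_upper_dev;	/* hypothetical upper device */
static struct net_device *foo_lower_dev;	/* hypothetical lower device */

static int foo_lower_event(struct notifier_block *unused,
			   unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;

	if (dev != foo_lower_dev || !foo_upper_dev)
		return NOTIFY_DONE;

	switch (event) {
	case NETDEV_UP:
	case NETDEV_DOWN:
	case NETDEV_CHANGE:
		/* Copy dormant and carrier state from lower to upper. */
		netif_stacked_transfer_operstate(foo_lower_dev, foo_upper_dev);
		break;
	}
	return NOTIFY_DONE;
}

/* Registered with register_netdevice_notifier(&foo_lower_notifier). */
static struct notifier_block foo_lower_notifier = {
	.notifier_call = foo_lower_event,
};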
5441
bf264145 5442#ifdef CONFIG_RPS
1b4bf461
ED
5443static int netif_alloc_rx_queues(struct net_device *dev)
5444{
1b4bf461 5445 unsigned int i, count = dev->num_rx_queues;
bd25fa7b 5446 struct netdev_rx_queue *rx;
1b4bf461 5447
bd25fa7b 5448 BUG_ON(count < 1);
1b4bf461 5449
bd25fa7b
TH
5450 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5451 if (!rx) {
7b6cd1ce 5452 pr_err("netdev: Unable to allocate %u rx queues\n", count);
bd25fa7b 5453 return -ENOMEM;
1b4bf461 5454 }
bd25fa7b
TH
5455 dev->_rx = rx;
5456
bd25fa7b 5457 for (i = 0; i < count; i++)
fe822240 5458 rx[i].dev = dev;
1b4bf461
ED
5459 return 0;
5460}
bf264145 5461#endif
1b4bf461 5462
aa942104
CG
5463static void netdev_init_one_queue(struct net_device *dev,
5464 struct netdev_queue *queue, void *_unused)
5465{
5466 /* Initialize queue lock */
5467 spin_lock_init(&queue->_xmit_lock);
5468 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5469 queue->xmit_lock_owner = -1;
b236da69 5470 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
aa942104 5471 queue->dev = dev;
114cf580
TH
5472#ifdef CONFIG_BQL
5473 dql_init(&queue->dql, HZ);
5474#endif
aa942104
CG
5475}
5476
e6484930
TH
5477static int netif_alloc_netdev_queues(struct net_device *dev)
5478{
5479 unsigned int count = dev->num_tx_queues;
5480 struct netdev_queue *tx;
5481
5482 BUG_ON(count < 1);
5483
5484 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5485 if (!tx) {
7b6cd1ce 5486 pr_err("netdev: Unable to allocate %u tx queues\n", count);
e6484930
TH
5487 return -ENOMEM;
5488 }
5489 dev->_tx = tx;
1d24eb48 5490
e6484930
TH
5491 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5492 spin_lock_init(&dev->tx_global_lock);
aa942104
CG
5493
5494 return 0;
e6484930
TH
5495}
5496
1da177e4
LT
5497/**
5498 * register_netdevice - register a network device
5499 * @dev: device to register
5500 *
5501 * Take a completed network device structure and add it to the kernel
5502 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5503 * chain. 0 is returned on success. A negative errno code is returned
5504 * on a failure to set up the device, or if the name is a duplicate.
5505 *
5506 * Callers must hold the rtnl semaphore. You may want
5507 * register_netdev() instead of this.
5508 *
5509 * BUGS:
5510 * The locking appears insufficient to guarantee two parallel registers
5511 * will not get the same name.
5512 */
5513
5514int register_netdevice(struct net_device *dev)
5515{
1da177e4 5516 int ret;
d314774c 5517 struct net *net = dev_net(dev);
1da177e4
LT
5518
5519 BUG_ON(dev_boot_phase);
5520 ASSERT_RTNL();
5521
b17a7c17
SH
5522 might_sleep();
5523
1da177e4
LT
 5524 /* When net_device structures are persistent, this will be fatal. */
5525 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
d314774c 5526 BUG_ON(!net);
1da177e4 5527
f1f28aa3 5528 spin_lock_init(&dev->addr_list_lock);
cf508b12 5529 netdev_set_addr_lockdep_class(dev);
1da177e4 5530
1da177e4
LT
5531 dev->iflink = -1;
5532
0696c3a8
PP
5533 ret = dev_get_valid_name(dev, dev->name);
5534 if (ret < 0)
5535 goto out;
5536
1da177e4 5537 /* Init, if this function is available */
d314774c
SH
5538 if (dev->netdev_ops->ndo_init) {
5539 ret = dev->netdev_ops->ndo_init(dev);
1da177e4
LT
5540 if (ret) {
5541 if (ret > 0)
5542 ret = -EIO;
90833aa4 5543 goto out;
1da177e4
LT
5544 }
5545 }
4ec93edb 5546
881d966b 5547 dev->ifindex = dev_new_index(net);
1da177e4
LT
5548 if (dev->iflink == -1)
5549 dev->iflink = dev->ifindex;
5550
5455c699
MM
5551 /* Transfer changeable features to wanted_features and enable
5552 * software offloads (GSO and GRO).
5553 */
5554 dev->hw_features |= NETIF_F_SOFT_FEATURES;
14d1232f
MM
5555 dev->features |= NETIF_F_SOFT_FEATURES;
5556 dev->wanted_features = dev->features & dev->hw_features;
1da177e4 5557
c6e1a0d1 5558 /* Turn on no cache copy if HW is doing checksum */
34324dc2
MM
5559 if (!(dev->flags & IFF_LOOPBACK)) {
5560 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5561 if (dev->features & NETIF_F_ALL_CSUM) {
5562 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5563 dev->features |= NETIF_F_NOCACHE_COPY;
5564 }
c6e1a0d1
TH
5565 }
5566
1180e7d6 5567 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
16c3ea78 5568 */
1180e7d6 5569 dev->vlan_features |= NETIF_F_HIGHDMA;
16c3ea78 5570
7ffbe3fd
JB
5571 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5572 ret = notifier_to_errno(ret);
5573 if (ret)
5574 goto err_uninit;
5575
8b41d188 5576 ret = netdev_register_kobject(dev);
b17a7c17 5577 if (ret)
7ce1b0ed 5578 goto err_uninit;
b17a7c17
SH
5579 dev->reg_state = NETREG_REGISTERED;
5580
6cb6a27c 5581 __netdev_update_features(dev);
8e9b59b2 5582
1da177e4
LT
5583 /*
5584 * Default initial state at registry is that the
5585 * device is present.
5586 */
5587
5588 set_bit(__LINK_STATE_PRESENT, &dev->state);
5589
1da177e4 5590 dev_init_scheduler(dev);
1da177e4 5591 dev_hold(dev);
ce286d32 5592 list_netdevice(dev);
1da177e4
LT
5593
 5594 /* Notify protocols that a new device appeared. */
056925ab 5595 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
fcc5a03a 5596 ret = notifier_to_errno(ret);
93ee31f1
DL
5597 if (ret) {
5598 rollback_registered(dev);
5599 dev->reg_state = NETREG_UNREGISTERED;
5600 }
d90a909e
EB
5601 /*
5602 * Prevent userspace races by waiting until the network
5603 * device is fully setup before sending notifications.
5604 */
a2835763
PM
5605 if (!dev->rtnl_link_ops ||
5606 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5607 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
1da177e4
LT
5608
5609out:
5610 return ret;
7ce1b0ed
HX
5611
5612err_uninit:
d314774c
SH
5613 if (dev->netdev_ops->ndo_uninit)
5614 dev->netdev_ops->ndo_uninit(dev);
7ce1b0ed 5615 goto out;
1da177e4 5616}
d1b19dff 5617EXPORT_SYMBOL(register_netdevice);
1da177e4 5618
937f1ba5
BH
5619/**
5620 * init_dummy_netdev - init a dummy network device for NAPI
5621 * @dev: device to init
5622 *
 5623 * This takes a network device structure and initializes the minimum
5624 * amount of fields so it can be used to schedule NAPI polls without
5625 * registering a full blown interface. This is to be used by drivers
5626 * that need to tie several hardware interfaces to a single NAPI
5627 * poll scheduler due to HW limitations.
5628 */
5629int init_dummy_netdev(struct net_device *dev)
5630{
5631 /* Clear everything. Note we don't initialize spinlocks
 5632 * as they aren't supposed to be taken by any of the
5633 * NAPI code and this dummy netdev is supposed to be
5634 * only ever used for NAPI polls
5635 */
5636 memset(dev, 0, sizeof(struct net_device));
5637
5638 /* make sure we BUG if trying to hit standard
5639 * register/unregister code path
5640 */
5641 dev->reg_state = NETREG_DUMMY;
5642
937f1ba5
BH
5643 /* NAPI wants this */
5644 INIT_LIST_HEAD(&dev->napi_list);
5645
5646 /* a dummy interface is started by default */
5647 set_bit(__LINK_STATE_PRESENT, &dev->state);
5648 set_bit(__LINK_STATE_START, &dev->state);
5649
29b4433d
ED
 5650 /* Note : We don't allocate pcpu_refcnt for dummy devices,
 5651 * because users of this 'device' don't need to change
5652 * its refcount.
5653 */
5654
937f1ba5
BH
5655 return 0;
5656}
5657EXPORT_SYMBOL_GPL(init_dummy_netdev);
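/*
 * Illustrative sketch, not part of dev.c: using init_dummy_netdev() to get a
 * NAPI context that is not tied to a registered interface. The foo_* names
 * and the poll-budget handling are hypothetical; init_dummy_netdev(),
 * netif_napi_add() and napi_enable() are the real interfaces.
 */
#include <linux/netdevice.h>

static struct net_device foo_dummy_dev;		/* never registered */
static struct napi_struct foo_napi;

static int foo_poll(struct napi_struct *napi, int budget)
{
	int work_done = 0;

	/* ... process up to @budget packets from the shared hardware ... */

	if (work_done < budget)
		napi_complete(napi);
	return work_done;
}

static void foo_setup_napi(void)
{
	init_dummy_netdev(&foo_dummy_dev);
	netif_napi_add(&foo_dummy_dev, &foo_napi, foo_poll, 64);
	napi_enable(&foo_napi);
	/* The interrupt handler would then call napi_schedule(&foo_napi). */
}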
5658
5659
1da177e4
LT
5660/**
5661 * register_netdev - register a network device
5662 * @dev: device to register
5663 *
5664 * Take a completed network device structure and add it to the kernel
5665 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5666 * chain. 0 is returned on success. A negative errno code is returned
5667 * on a failure to set up the device, or if the name is a duplicate.
5668 *
38b4da38 5669 * This is a wrapper around register_netdevice that takes the rtnl semaphore
1da177e4
LT
5670 * and expands the device name if you passed a format string to
5671 * alloc_netdev.
5672 */
5673int register_netdev(struct net_device *dev)
5674{
5675 int err;
5676
5677 rtnl_lock();
1da177e4 5678 err = register_netdevice(dev);
1da177e4
LT
5679 rtnl_unlock();
5680 return err;
5681}
5682EXPORT_SYMBOL(register_netdev);
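/*
 * Illustrative sketch, not part of dev.c: the usual driver-side pairing of
 * allocation, registration and teardown. "foo", foo_priv and foo_netdev_ops
 * are the hypothetical names from the earlier sketch; alloc_etherdev(),
 * register_netdev(), unregister_netdev() and free_netdev() are the real
 * interfaces (register_netdev() takes the rtnl lock itself).
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

static struct net_device *foo_netdev;

static int foo_probe(void)
{
	struct net_device *dev;
	int err;

	dev = alloc_etherdev(sizeof(struct foo_priv));	/* name "eth%d" */
	if (!dev)
		return -ENOMEM;

	dev->netdev_ops = &foo_netdev_ops;	/* hypothetical ops */
	random_ether_addr(dev->dev_addr);

	err = register_netdev(dev);
	if (err) {
		/* Still NETREG_UNINITIALIZED, so free_netdev() just kfrees. */
		free_netdev(dev);
		return err;
	}
	foo_netdev = dev;
	return 0;
}

static void foo_remove(void)
{
	unregister_netdev(foo_netdev);	/* waits for refs via netdev_run_todo() */
	free_netdev(foo_netdev);
	foo_netdev = NULL;
}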
5683
29b4433d
ED
5684int netdev_refcnt_read(const struct net_device *dev)
5685{
5686 int i, refcnt = 0;
5687
5688 for_each_possible_cpu(i)
5689 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5690 return refcnt;
5691}
5692EXPORT_SYMBOL(netdev_refcnt_read);
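/*
 * Illustrative sketch, not part of dev.c: netdev_refcnt_read() sums the
 * per-cpu reference counters, so it is only a snapshot. A debugging helper
 * like this hypothetical one is about all it is good for outside of
 * netdev_wait_allrefs()/netdev_run_todo(); the printed value includes the
 * temporary reference taken here.
 */
static void foo_dump_refcnt(struct net_device *dev)
{
	dev_hold(dev);
	netdev_info(dev, "refcount is now %d\n", netdev_refcnt_read(dev));
	dev_put(dev);
}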
5693
1da177e4
LT
5694/*
5695 * netdev_wait_allrefs - wait until all references are gone.
5696 *
5697 * This is called when unregistering network devices.
5698 *
5699 * Any protocol or device that holds a reference should register
5700 * for netdevice notification, and cleanup and put back the
5701 * reference if they receive an UNREGISTER event.
5702 * We can get stuck here if buggy protocols don't correctly
4ec93edb 5703 * call dev_put.
1da177e4
LT
5704 */
5705static void netdev_wait_allrefs(struct net_device *dev)
5706{
5707 unsigned long rebroadcast_time, warning_time;
29b4433d 5708 int refcnt;
1da177e4 5709
e014debe
ED
5710 linkwatch_forget_dev(dev);
5711
1da177e4 5712 rebroadcast_time = warning_time = jiffies;
29b4433d
ED
5713 refcnt = netdev_refcnt_read(dev);
5714
5715 while (refcnt != 0) {
1da177e4 5716 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
6756ae4b 5717 rtnl_lock();
1da177e4
LT
5718
5719 /* Rebroadcast unregister notification */
056925ab 5720 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 5721 /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
395264d5 5722 * should have already handled it the first time */
1da177e4
LT
5723
5724 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5725 &dev->state)) {
5726 /* We must not have linkwatch events
5727 * pending on unregister. If this
5728 * happens, we simply run the queue
5729 * unscheduled, resulting in a noop
5730 * for this device.
5731 */
5732 linkwatch_run_queue();
5733 }
5734
6756ae4b 5735 __rtnl_unlock();
1da177e4
LT
5736
5737 rebroadcast_time = jiffies;
5738 }
5739
5740 msleep(250);
5741
29b4433d
ED
5742 refcnt = netdev_refcnt_read(dev);
5743
1da177e4 5744 if (time_after(jiffies, warning_time + 10 * HZ)) {
7b6cd1ce
JP
5745 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5746 dev->name, refcnt);
1da177e4
LT
5747 warning_time = jiffies;
5748 }
5749 }
5750}
5751
5752/* The sequence is:
5753 *
5754 * rtnl_lock();
5755 * ...
5756 * register_netdevice(x1);
5757 * register_netdevice(x2);
5758 * ...
5759 * unregister_netdevice(y1);
5760 * unregister_netdevice(y2);
5761 * ...
5762 * rtnl_unlock();
5763 * free_netdev(y1);
5764 * free_netdev(y2);
5765 *
58ec3b4d 5766 * We are invoked by rtnl_unlock().
1da177e4 5767 * This allows us to deal with problems:
b17a7c17 5768 * 1) We can delete sysfs objects which invoke hotplug
1da177e4
LT
5769 * without deadlocking with linkwatch via keventd.
5770 * 2) Since we run with the RTNL semaphore not held, we can sleep
5771 * safely in order to wait for the netdev refcnt to drop to zero.
58ec3b4d
HX
5772 *
5773 * We must not return until all unregister events added during
5774 * the interval the lock was held have been completed.
1da177e4 5775 */
1da177e4
LT
5776void netdev_run_todo(void)
5777{
626ab0e6 5778 struct list_head list;
1da177e4 5779
1da177e4 5780 /* Snapshot list, allow later requests */
626ab0e6 5781 list_replace_init(&net_todo_list, &list);
58ec3b4d
HX
5782
5783 __rtnl_unlock();
626ab0e6 5784
850a545b
EB
5785 /* Wait for rcu callbacks to finish before attempting to drain
5786 * the device list. This usually avoids a 250ms wait.
5787 */
5788 if (!list_empty(&list))
5789 rcu_barrier();
5790
1da177e4
LT
5791 while (!list_empty(&list)) {
5792 struct net_device *dev
e5e26d75 5793 = list_first_entry(&list, struct net_device, todo_list);
1da177e4
LT
5794 list_del(&dev->todo_list);
5795
b17a7c17 5796 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
7b6cd1ce 5797 pr_err("network todo '%s' but state %d\n",
b17a7c17
SH
5798 dev->name, dev->reg_state);
5799 dump_stack();
5800 continue;
5801 }
1da177e4 5802
b17a7c17 5803 dev->reg_state = NETREG_UNREGISTERED;
1da177e4 5804
152102c7 5805 on_each_cpu(flush_backlog, dev, 1);
6e583ce5 5806
b17a7c17 5807 netdev_wait_allrefs(dev);
1da177e4 5808
b17a7c17 5809 /* paranoia */
29b4433d 5810 BUG_ON(netdev_refcnt_read(dev));
33d480ce
ED
5811 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5812 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
547b792c 5813 WARN_ON(dev->dn_ptr);
1da177e4 5814
b17a7c17
SH
5815 if (dev->destructor)
5816 dev->destructor(dev);
9093bbb2
SH
5817
5818 /* Free network device */
5819 kobject_put(&dev->dev.kobj);
1da177e4 5820 }
1da177e4
LT
5821}
5822
3cfde79c
BH
5823/* Convert net_device_stats to rtnl_link_stats64. They have the same
5824 * fields in the same order, with only the type differing.
5825 */
77a1abf5
ED
5826void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5827 const struct net_device_stats *netdev_stats)
3cfde79c
BH
5828{
5829#if BITS_PER_LONG == 64
77a1abf5
ED
5830 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5831 memcpy(stats64, netdev_stats, sizeof(*stats64));
3cfde79c
BH
5832#else
5833 size_t i, n = sizeof(*stats64) / sizeof(u64);
5834 const unsigned long *src = (const unsigned long *)netdev_stats;
5835 u64 *dst = (u64 *)stats64;
5836
5837 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5838 sizeof(*stats64) / sizeof(u64));
5839 for (i = 0; i < n; i++)
5840 dst[i] = src[i];
5841#endif
5842}
77a1abf5 5843EXPORT_SYMBOL(netdev_stats_to_stats64);
3cfde79c 5844
eeda3fd6
SH
5845/**
5846 * dev_get_stats - get network device statistics
5847 * @dev: device to get statistics from
28172739 5848 * @storage: place to store stats
eeda3fd6 5849 *
d7753516
BH
5850 * Get network statistics from device. Return @storage.
5851 * The device driver may provide its own method by setting
5852 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5853 * otherwise the internal statistics structure is used.
eeda3fd6 5854 */
d7753516
BH
5855struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5856 struct rtnl_link_stats64 *storage)
7004bf25 5857{
eeda3fd6
SH
5858 const struct net_device_ops *ops = dev->netdev_ops;
5859
28172739
ED
5860 if (ops->ndo_get_stats64) {
5861 memset(storage, 0, sizeof(*storage));
caf586e5
ED
5862 ops->ndo_get_stats64(dev, storage);
5863 } else if (ops->ndo_get_stats) {
3cfde79c 5864 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
caf586e5
ED
5865 } else {
5866 netdev_stats_to_stats64(storage, &dev->stats);
28172739 5867 }
caf586e5 5868 storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
28172739 5869 return storage;
c45d286e 5870}
eeda3fd6 5871EXPORT_SYMBOL(dev_get_stats);
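/*
 * Illustrative sketch, not part of dev.c: the two ways dev_get_stats() can be
 * fed by a driver. A driver that only maintains dev->stats needs no callback
 * at all; one with its own 64-bit counters implements ndo_get_stats64(), as
 * in the hypothetical example below (foo_stats_priv and its counters are
 * made up).
 */
#include <linux/if_link.h>
#include <linux/netdevice.h>

struct foo_stats_priv {
	u64 rx_packets;		/* hypothetical driver-private counters */
	u64 tx_packets;
};

static struct rtnl_link_stats64 *foo_get_stats64(struct net_device *dev,
						 struct rtnl_link_stats64 *storage)
{
	struct foo_stats_priv *priv = netdev_priv(dev);

	/* Start from the generic counters, then overlay the 64-bit ones. */
	netdev_stats_to_stats64(storage, &dev->stats);
	storage->rx_packets = priv->rx_packets;
	storage->tx_packets = priv->tx_packets;
	return storage;
}

static const struct net_device_ops foo_stats_ops = {
	.ndo_get_stats64 = foo_get_stats64,
};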
c45d286e 5872
24824a09 5873struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
dc2b4847 5874{
24824a09 5875 struct netdev_queue *queue = dev_ingress_queue(dev);
dc2b4847 5876
24824a09
ED
5877#ifdef CONFIG_NET_CLS_ACT
5878 if (queue)
5879 return queue;
5880 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5881 if (!queue)
5882 return NULL;
5883 netdev_init_one_queue(dev, queue, NULL);
24824a09
ED
5884 queue->qdisc = &noop_qdisc;
5885 queue->qdisc_sleeping = &noop_qdisc;
5886 rcu_assign_pointer(dev->ingress_queue, queue);
5887#endif
5888 return queue;
bb949fbd
DM
5889}
5890
1da177e4 5891/**
36909ea4 5892 * alloc_netdev_mqs - allocate network device
1da177e4
LT
5893 * @sizeof_priv: size of private data to allocate space for
5894 * @name: device name format string
5895 * @setup: callback to initialize device
36909ea4
TH
5896 * @txqs: the number of TX subqueues to allocate
5897 * @rxqs: the number of RX subqueues to allocate
1da177e4
LT
5898 *
5899 * Allocates a struct net_device with private data area for driver use
f25f4e44 5900 * and performs basic initialization. Also allocates subqueue structs
36909ea4 5901 * for each queue on the device.
1da177e4 5902 */
36909ea4
TH
5903struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5904 void (*setup)(struct net_device *),
5905 unsigned int txqs, unsigned int rxqs)
1da177e4 5906{
1da177e4 5907 struct net_device *dev;
7943986c 5908 size_t alloc_size;
1ce8e7b5 5909 struct net_device *p;
1da177e4 5910
b6fe17d6
SH
5911 BUG_ON(strlen(name) >= sizeof(dev->name));
5912
36909ea4 5913 if (txqs < 1) {
7b6cd1ce 5914 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
55513fb4
TH
5915 return NULL;
5916 }
5917
36909ea4
TH
5918#ifdef CONFIG_RPS
5919 if (rxqs < 1) {
7b6cd1ce 5920 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
36909ea4
TH
5921 return NULL;
5922 }
5923#endif
5924
fd2ea0a7 5925 alloc_size = sizeof(struct net_device);
d1643d24
AD
5926 if (sizeof_priv) {
5927 /* ensure 32-byte alignment of private area */
1ce8e7b5 5928 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
d1643d24
AD
5929 alloc_size += sizeof_priv;
5930 }
5931 /* ensure 32-byte alignment of whole construct */
1ce8e7b5 5932 alloc_size += NETDEV_ALIGN - 1;
1da177e4 5933
31380de9 5934 p = kzalloc(alloc_size, GFP_KERNEL);
1da177e4 5935 if (!p) {
7b6cd1ce 5936 pr_err("alloc_netdev: Unable to allocate device\n");
1da177e4
LT
5937 return NULL;
5938 }
1da177e4 5939
1ce8e7b5 5940 dev = PTR_ALIGN(p, NETDEV_ALIGN);
1da177e4 5941 dev->padded = (char *)dev - (char *)p;
ab9c73cc 5942
29b4433d
ED
5943 dev->pcpu_refcnt = alloc_percpu(int);
5944 if (!dev->pcpu_refcnt)
e6484930 5945 goto free_p;
ab9c73cc 5946
ab9c73cc 5947 if (dev_addr_init(dev))
29b4433d 5948 goto free_pcpu;
ab9c73cc 5949
22bedad3 5950 dev_mc_init(dev);
a748ee24 5951 dev_uc_init(dev);
ccffad25 5952
c346dca1 5953 dev_net_set(dev, &init_net);
1da177e4 5954
8d3bdbd5
DM
5955 dev->gso_max_size = GSO_MAX_SIZE;
5956
8d3bdbd5
DM
5957 INIT_LIST_HEAD(&dev->napi_list);
5958 INIT_LIST_HEAD(&dev->unreg_list);
5959 INIT_LIST_HEAD(&dev->link_watch_list);
5960 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5961 setup(dev);
5962
36909ea4
TH
5963 dev->num_tx_queues = txqs;
5964 dev->real_num_tx_queues = txqs;
ed9af2e8 5965 if (netif_alloc_netdev_queues(dev))
8d3bdbd5 5966 goto free_all;
e8a0464c 5967
df334545 5968#ifdef CONFIG_RPS
36909ea4
TH
5969 dev->num_rx_queues = rxqs;
5970 dev->real_num_rx_queues = rxqs;
fe822240 5971 if (netif_alloc_rx_queues(dev))
8d3bdbd5 5972 goto free_all;
df334545 5973#endif
0a9627f2 5974
1da177e4 5975 strcpy(dev->name, name);
cbda10fa 5976 dev->group = INIT_NETDEV_GROUP;
1da177e4 5977 return dev;
ab9c73cc 5978
8d3bdbd5
DM
5979free_all:
5980 free_netdev(dev);
5981 return NULL;
5982
29b4433d
ED
5983free_pcpu:
5984 free_percpu(dev->pcpu_refcnt);
ed9af2e8 5985 kfree(dev->_tx);
fe822240
TH
5986#ifdef CONFIG_RPS
5987 kfree(dev->_rx);
5988#endif
5989
ab9c73cc
JP
5990free_p:
5991 kfree(p);
5992 return NULL;
1da177e4 5993}
36909ea4 5994EXPORT_SYMBOL(alloc_netdev_mqs);
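/*
 * Illustrative sketch, not part of dev.c: allocating a multiqueue Ethernet
 * device directly with alloc_netdev_mqs(). The queue counts and the "foo%d"
 * name template are made up; ether_setup() is the standard setup callback,
 * and the "%d" is expanded by dev_get_valid_name() when the device is
 * registered.
 */
#include <linux/etherdevice.h>
#include <linux/netdevice.h>

static struct net_device *foo_alloc_mq(unsigned int txqs, unsigned int rxqs)
{
	struct net_device *dev;

	dev = alloc_netdev_mqs(0, "foo%d", ether_setup, txqs, rxqs);
	if (!dev)
		return NULL;

	/* num_tx_queues/num_rx_queues are fixed for the lifetime of dev;
	 * the real_num_* values may still be lowered later by the driver.
	 */
	return dev;
}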
1da177e4
LT
5995
5996/**
5997 * free_netdev - free network device
5998 * @dev: device
5999 *
4ec93edb
YH
6000 * This function does the last stage of destroying an allocated device
6001 * interface. The reference to the device object is released.
1da177e4
LT
6002 * If this is the last reference then it will be freed.
6003 */
6004void free_netdev(struct net_device *dev)
6005{
d565b0a1
HX
6006 struct napi_struct *p, *n;
6007
f3005d7f
DL
6008 release_net(dev_net(dev));
6009
e8a0464c 6010 kfree(dev->_tx);
fe822240
TH
6011#ifdef CONFIG_RPS
6012 kfree(dev->_rx);
6013#endif
e8a0464c 6014
33d480ce 6015 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
24824a09 6016
f001fde5
JP
6017 /* Flush device addresses */
6018 dev_addr_flush(dev);
6019
d565b0a1
HX
6020 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
6021 netif_napi_del(p);
6022
29b4433d
ED
6023 free_percpu(dev->pcpu_refcnt);
6024 dev->pcpu_refcnt = NULL;
6025
3041a069 6026 /* Compatibility with error handling in drivers */
1da177e4
LT
6027 if (dev->reg_state == NETREG_UNINITIALIZED) {
6028 kfree((char *)dev - dev->padded);
6029 return;
6030 }
6031
6032 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
6033 dev->reg_state = NETREG_RELEASED;
6034
43cb76d9
GKH
6035 /* will free via device release */
6036 put_device(&dev->dev);
1da177e4 6037}
d1b19dff 6038EXPORT_SYMBOL(free_netdev);
4ec93edb 6039
f0db275a
SH
6040/**
6041 * synchronize_net - Synchronize with packet receive processing
6042 *
6043 * Wait for packets currently being received to be done.
6044 * Does not block later packets from starting.
6045 */
4ec93edb 6046void synchronize_net(void)
1da177e4
LT
6047{
6048 might_sleep();
be3fc413
ED
6049 if (rtnl_is_locked())
6050 synchronize_rcu_expedited();
6051 else
6052 synchronize_rcu();
1da177e4 6053}
d1b19dff 6054EXPORT_SYMBOL(synchronize_net);
1da177e4
LT
6055
6056/**
44a0873d 6057 * unregister_netdevice_queue - remove device from the kernel
1da177e4 6058 * @dev: device
44a0873d 6059 * @head: list
6ebfbc06 6060 *
1da177e4 6061 * This function shuts down a device interface and removes it
d59b54b1 6062 * from the kernel tables.
44a0873d 6063 * If head not NULL, device is queued to be unregistered later.
1da177e4
LT
6064 *
6065 * Callers must hold the rtnl semaphore. You may want
6066 * unregister_netdev() instead of this.
6067 */
6068
44a0873d 6069void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
1da177e4 6070{
a6620712
HX
6071 ASSERT_RTNL();
6072
44a0873d 6073 if (head) {
9fdce099 6074 list_move_tail(&dev->unreg_list, head);
44a0873d
ED
6075 } else {
6076 rollback_registered(dev);
6077 /* Finish processing unregister after unlock */
6078 net_set_todo(dev);
6079 }
1da177e4 6080}
44a0873d 6081EXPORT_SYMBOL(unregister_netdevice_queue);
1da177e4 6082
9b5e383c
ED
6083/**
6084 * unregister_netdevice_many - unregister many devices
6085 * @head: list of devices
9b5e383c
ED
6086 */
6087void unregister_netdevice_many(struct list_head *head)
6088{
6089 struct net_device *dev;
6090
6091 if (!list_empty(head)) {
6092 rollback_registered_many(head);
6093 list_for_each_entry(dev, head, unreg_list)
6094 net_set_todo(dev);
6095 }
6096}
63c8099d 6097EXPORT_SYMBOL(unregister_netdevice_many);
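/*
 * Illustrative sketch, not part of dev.c: tearing down several devices under
 * a single rtnl_lock()/rtnl_unlock() so rollback_registered_many() can batch
 * the notifier calls and RCU grace periods. foo_devs[] is hypothetical; the
 * queue/many/list_del pattern mirrors default_device_exit_batch() below.
 */
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

static void foo_destroy_all(struct net_device *foo_devs[], int n)
{
	LIST_HEAD(kill_list);
	int i;

	rtnl_lock();
	for (i = 0; i < n; i++)
		unregister_netdevice_queue(foo_devs[i], &kill_list);
	unregister_netdevice_many(&kill_list);
	list_del(&kill_list);
	rtnl_unlock();

	/* free_netdev() each device once the todo list has run. */
}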
9b5e383c 6098
1da177e4
LT
6099/**
6100 * unregister_netdev - remove device from the kernel
6101 * @dev: device
6102 *
6103 * This function shuts down a device interface and removes it
d59b54b1 6104 * from the kernel tables.
1da177e4
LT
6105 *
6106 * This is just a wrapper for unregister_netdevice that takes
6107 * the rtnl semaphore. In general you want to use this and not
6108 * unregister_netdevice.
6109 */
6110void unregister_netdev(struct net_device *dev)
6111{
6112 rtnl_lock();
6113 unregister_netdevice(dev);
6114 rtnl_unlock();
6115}
1da177e4
LT
6116EXPORT_SYMBOL(unregister_netdev);
6117
ce286d32
EB
6118/**
 6119 * dev_change_net_namespace - move device to a different network namespace
6120 * @dev: device
6121 * @net: network namespace
6122 * @pat: If not NULL name pattern to try if the current device name
6123 * is already taken in the destination network namespace.
6124 *
6125 * This function shuts down a device interface and moves it
6126 * to a new network namespace. On success 0 is returned, on
 6127 * a failure a negative errno code is returned.
6128 *
6129 * Callers must hold the rtnl semaphore.
6130 */
6131
6132int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
6133{
ce286d32
EB
6134 int err;
6135
6136 ASSERT_RTNL();
6137
6138 /* Don't allow namespace local devices to be moved. */
6139 err = -EINVAL;
6140 if (dev->features & NETIF_F_NETNS_LOCAL)
6141 goto out;
6142
 6143 /* Ensure the device has been registered */
6144 err = -EINVAL;
6145 if (dev->reg_state != NETREG_REGISTERED)
6146 goto out;
6147
 6148 /* Get out if there is nothing to do */
6149 err = 0;
878628fb 6150 if (net_eq(dev_net(dev), net))
ce286d32
EB
6151 goto out;
6152
6153 /* Pick the destination device name, and ensure
6154 * we can use it in the destination network namespace.
6155 */
6156 err = -EEXIST;
d9031024 6157 if (__dev_get_by_name(net, dev->name)) {
ce286d32
EB
6158 /* We get here if we can't use the current device name */
6159 if (!pat)
6160 goto out;
1c5cae81 6161 if (dev_get_valid_name(dev, pat) < 0)
ce286d32
EB
6162 goto out;
6163 }
6164
6165 /*
6166 * And now a mini version of register_netdevice unregister_netdevice.
6167 */
6168
6169 /* If device is running close it first. */
9b772652 6170 dev_close(dev);
ce286d32
EB
6171
6172 /* And unlink it from device chain */
6173 err = -ENODEV;
6174 unlist_netdevice(dev);
6175
6176 synchronize_net();
6177
6178 /* Shutdown queueing discipline. */
6179 dev_shutdown(dev);
6180
 6181 /* Notify protocols that we are about to destroy
6182 this device. They should clean all the things.
3b27e105
DL
6183
6184 Note that dev->reg_state stays at NETREG_REGISTERED.
6185 This is wanted because this way 8021q and macvlan know
6186 the device is just moving and can keep their slaves up.
ce286d32
EB
6187 */
6188 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
a5ee1551 6189 call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
d2237d35 6190 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
ce286d32
EB
6191
6192 /*
6193 * Flush the unicast and multicast chains
6194 */
a748ee24 6195 dev_uc_flush(dev);
22bedad3 6196 dev_mc_flush(dev);
ce286d32
EB
6197
6198 /* Actually switch the network namespace */
c346dca1 6199 dev_net_set(dev, net);
ce286d32 6200
ce286d32
EB
6201 /* If there is an ifindex conflict assign a new one */
6202 if (__dev_get_by_index(net, dev->ifindex)) {
6203 int iflink = (dev->iflink == dev->ifindex);
6204 dev->ifindex = dev_new_index(net);
6205 if (iflink)
6206 dev->iflink = dev->ifindex;
6207 }
6208
8b41d188 6209 /* Fixup kobjects */
a1b3f594 6210 err = device_rename(&dev->dev, dev->name);
8b41d188 6211 WARN_ON(err);
ce286d32
EB
6212
6213 /* Add the device back in the hashes */
6214 list_netdevice(dev);
6215
 6216 /* Notify protocols that a new device appeared. */
6217 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6218
d90a909e
EB
6219 /*
6220 * Prevent userspace races by waiting until the network
6221 * device is fully setup before sending notifications.
6222 */
6223 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6224
ce286d32
EB
6225 synchronize_net();
6226 err = 0;
6227out:
6228 return err;
6229}
463d0183 6230EXPORT_SYMBOL_GPL(dev_change_net_namespace);
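/*
 * Illustrative sketch, not part of dev.c: moving a device back to init_net
 * under RTNL, the way default_device_exit() below does it. The "dev%d"
 * fallback pattern is only used if the current name is already taken in the
 * destination namespace.
 */
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
#include <net/net_namespace.h>

static int foo_move_to_init_net(struct net_device *dev)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, &init_net, "dev%d");
	rtnl_unlock();
	return err;
}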
ce286d32 6231
1da177e4
LT
6232static int dev_cpu_callback(struct notifier_block *nfb,
6233 unsigned long action,
6234 void *ocpu)
6235{
6236 struct sk_buff **list_skb;
1da177e4
LT
6237 struct sk_buff *skb;
6238 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6239 struct softnet_data *sd, *oldsd;
6240
8bb78442 6241 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1da177e4
LT
6242 return NOTIFY_OK;
6243
6244 local_irq_disable();
6245 cpu = smp_processor_id();
6246 sd = &per_cpu(softnet_data, cpu);
6247 oldsd = &per_cpu(softnet_data, oldcpu);
6248
6249 /* Find end of our completion_queue. */
6250 list_skb = &sd->completion_queue;
6251 while (*list_skb)
6252 list_skb = &(*list_skb)->next;
6253 /* Append completion queue from offline CPU. */
6254 *list_skb = oldsd->completion_queue;
6255 oldsd->completion_queue = NULL;
6256
1da177e4 6257 /* Append output queue from offline CPU. */
a9cbd588
CG
6258 if (oldsd->output_queue) {
6259 *sd->output_queue_tailp = oldsd->output_queue;
6260 sd->output_queue_tailp = oldsd->output_queue_tailp;
6261 oldsd->output_queue = NULL;
6262 oldsd->output_queue_tailp = &oldsd->output_queue;
6263 }
264524d5
HC
6264 /* Append NAPI poll list from offline CPU. */
6265 if (!list_empty(&oldsd->poll_list)) {
6266 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6267 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6268 }
1da177e4
LT
6269
6270 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6271 local_irq_enable();
6272
6273 /* Process offline CPU's input_pkt_queue */
76cc8b13 6274 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
1da177e4 6275 netif_rx(skb);
76cc8b13 6276 input_queue_head_incr(oldsd);
fec5e652 6277 }
76cc8b13 6278 while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6e7676c1 6279 netif_rx(skb);
76cc8b13
TH
6280 input_queue_head_incr(oldsd);
6281 }
1da177e4
LT
6282
6283 return NOTIFY_OK;
6284}
1da177e4
LT
6285
6286
7f353bf2 6287/**
b63365a2
HX
6288 * netdev_increment_features - increment feature set by one
6289 * @all: current feature set
6290 * @one: new feature set
6291 * @mask: mask feature set
7f353bf2
HX
6292 *
6293 * Computes a new feature set after adding a device with feature set
b63365a2
HX
6294 * @one to the master device with current feature set @all. Will not
6295 * enable anything that is off in @mask. Returns the new feature set.
7f353bf2 6296 */
c8f44aff
MM
6297netdev_features_t netdev_increment_features(netdev_features_t all,
6298 netdev_features_t one, netdev_features_t mask)
b63365a2 6299{
1742f183
MM
6300 if (mask & NETIF_F_GEN_CSUM)
6301 mask |= NETIF_F_ALL_CSUM;
6302 mask |= NETIF_F_VLAN_CHALLENGED;
7f353bf2 6303
1742f183
MM
6304 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6305 all &= one | ~NETIF_F_ALL_FOR_ALL;
c6e1a0d1 6306
1742f183
MM
6307 /* If one device supports hw checksumming, set for all. */
6308 if (all & NETIF_F_GEN_CSUM)
6309 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
7f353bf2
HX
6310
6311 return all;
6312}
b63365a2 6313EXPORT_SYMBOL(netdev_increment_features);
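/*
 * Illustrative sketch, not part of dev.c: how a master/aggregating driver
 * (bonding-style) might recompute a feature set across its slaves with
 * netdev_increment_features(). foo_slaves[], the slave count and
 * FOO_FEATURE_MASK are hypothetical.
 */
#include <linux/netdevice.h>

#define FOO_FEATURE_MASK	(NETIF_F_ALL_CSUM | NETIF_F_SG | NETIF_F_ALL_TSO)

static void foo_compute_features(struct net_device *master,
				 struct net_device *foo_slaves[], int n)
{
	netdev_features_t features = FOO_FEATURE_MASK;
	int i;

	/* Start from the full mask and fold in each slave in turn. */
	for (i = 0; i < n; i++)
		features = netdev_increment_features(features,
						     foo_slaves[i]->features,
						     FOO_FEATURE_MASK);

	master->vlan_features = features;
	netdev_change_features(master);	/* propagate to stacked VLAN devices */
}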
7f353bf2 6314
30d97d35
PE
6315static struct hlist_head *netdev_create_hash(void)
6316{
6317 int i;
6318 struct hlist_head *hash;
6319
6320 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6321 if (hash != NULL)
6322 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6323 INIT_HLIST_HEAD(&hash[i]);
6324
6325 return hash;
6326}
6327
881d966b 6328/* Initialize per network namespace state */
4665079c 6329static int __net_init netdev_init(struct net *net)
881d966b 6330{
881d966b 6331 INIT_LIST_HEAD(&net->dev_base_head);
881d966b 6332
30d97d35
PE
6333 net->dev_name_head = netdev_create_hash();
6334 if (net->dev_name_head == NULL)
6335 goto err_name;
881d966b 6336
30d97d35
PE
6337 net->dev_index_head = netdev_create_hash();
6338 if (net->dev_index_head == NULL)
6339 goto err_idx;
881d966b
EB
6340
6341 return 0;
30d97d35
PE
6342
6343err_idx:
6344 kfree(net->dev_name_head);
6345err_name:
6346 return -ENOMEM;
881d966b
EB
6347}
6348
f0db275a
SH
6349/**
6350 * netdev_drivername - network driver for the device
6351 * @dev: network device
f0db275a
SH
6352 *
6353 * Determine network driver for device.
6354 */
3019de12 6355const char *netdev_drivername(const struct net_device *dev)
6579e57b 6356{
cf04a4c7
SH
6357 const struct device_driver *driver;
6358 const struct device *parent;
3019de12 6359 const char *empty = "";
6579e57b
AV
6360
6361 parent = dev->dev.parent;
6579e57b 6362 if (!parent)
3019de12 6363 return empty;
6579e57b
AV
6364
6365 driver = parent->driver;
6366 if (driver && driver->name)
3019de12
DM
6367 return driver->name;
6368 return empty;
6579e57b
AV
6369}
6370
ffa10cb4 6371int __netdev_printk(const char *level, const struct net_device *dev,
256df2f3
JP
6372 struct va_format *vaf)
6373{
6374 int r;
6375
6376 if (dev && dev->dev.parent)
6377 r = dev_printk(level, dev->dev.parent, "%s: %pV",
6378 netdev_name(dev), vaf);
6379 else if (dev)
6380 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6381 else
6382 r = printk("%s(NULL net_device): %pV", level, vaf);
6383
6384 return r;
6385}
ffa10cb4 6386EXPORT_SYMBOL(__netdev_printk);
256df2f3
JP
6387
6388int netdev_printk(const char *level, const struct net_device *dev,
6389 const char *format, ...)
6390{
6391 struct va_format vaf;
6392 va_list args;
6393 int r;
6394
6395 va_start(args, format);
6396
6397 vaf.fmt = format;
6398 vaf.va = &args;
6399
6400 r = __netdev_printk(level, dev, &vaf);
6401 va_end(args);
6402
6403 return r;
6404}
6405EXPORT_SYMBOL(netdev_printk);
6406
6407#define define_netdev_printk_level(func, level) \
6408int func(const struct net_device *dev, const char *fmt, ...) \
6409{ \
6410 int r; \
6411 struct va_format vaf; \
6412 va_list args; \
6413 \
6414 va_start(args, fmt); \
6415 \
6416 vaf.fmt = fmt; \
6417 vaf.va = &args; \
6418 \
6419 r = __netdev_printk(level, dev, &vaf); \
6420 va_end(args); \
6421 \
6422 return r; \
6423} \
6424EXPORT_SYMBOL(func);
6425
6426define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6427define_netdev_printk_level(netdev_alert, KERN_ALERT);
6428define_netdev_printk_level(netdev_crit, KERN_CRIT);
6429define_netdev_printk_level(netdev_err, KERN_ERR);
6430define_netdev_printk_level(netdev_warn, KERN_WARNING);
6431define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6432define_netdev_printk_level(netdev_info, KERN_INFO);
6433
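/*
 * Illustrative sketch, not part of dev.c: the per-level helpers generated
 * above prefix messages with the driver and device name, so callers only
 * supply the format string. foo_report_link() and its arguments are
 * hypothetical.
 */
#include <linux/netdevice.h>

static void foo_report_link(struct net_device *dev, bool up, int err)
{
	if (err)
		netdev_err(dev, "link state read failed (%d)\n", err);
	else if (up)
		netdev_info(dev, "link is up\n");
	else
		netdev_warn(dev, "link is down\n");
}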
4665079c 6434static void __net_exit netdev_exit(struct net *net)
881d966b
EB
6435{
6436 kfree(net->dev_name_head);
6437 kfree(net->dev_index_head);
6438}
6439
022cbae6 6440static struct pernet_operations __net_initdata netdev_net_ops = {
881d966b
EB
6441 .init = netdev_init,
6442 .exit = netdev_exit,
6443};
6444
4665079c 6445static void __net_exit default_device_exit(struct net *net)
ce286d32 6446{
e008b5fc 6447 struct net_device *dev, *aux;
ce286d32 6448 /*
e008b5fc 6449 * Push all migratable network devices back to the
ce286d32
EB
6450 * initial network namespace
6451 */
6452 rtnl_lock();
e008b5fc 6453 for_each_netdev_safe(net, dev, aux) {
ce286d32 6454 int err;
aca51397 6455 char fb_name[IFNAMSIZ];
ce286d32
EB
6456
6457 /* Ignore unmoveable devices (i.e. loopback) */
6458 if (dev->features & NETIF_F_NETNS_LOCAL)
6459 continue;
6460
e008b5fc
EB
6461 /* Leave virtual devices for the generic cleanup */
6462 if (dev->rtnl_link_ops)
6463 continue;
d0c082ce 6464
25985edc 6465 /* Push remaining network devices to init_net */
aca51397
PE
6466 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6467 err = dev_change_net_namespace(dev, &init_net, fb_name);
ce286d32 6468 if (err) {
7b6cd1ce
JP
6469 pr_emerg("%s: failed to move %s to init_net: %d\n",
6470 __func__, dev->name, err);
aca51397 6471 BUG();
ce286d32
EB
6472 }
6473 }
6474 rtnl_unlock();
6475}
6476
04dc7f6b
EB
6477static void __net_exit default_device_exit_batch(struct list_head *net_list)
6478{
 6479 /* At exit all network devices must be removed from a network
b595076a 6480 * namespace. Do this in the reverse order of registration.
04dc7f6b
EB
6481 * Do this across as many network namespaces as possible to
6482 * improve batching efficiency.
6483 */
6484 struct net_device *dev;
6485 struct net *net;
6486 LIST_HEAD(dev_kill_list);
6487
6488 rtnl_lock();
6489 list_for_each_entry(net, net_list, exit_list) {
6490 for_each_netdev_reverse(net, dev) {
6491 if (dev->rtnl_link_ops)
6492 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6493 else
6494 unregister_netdevice_queue(dev, &dev_kill_list);
6495 }
6496 }
6497 unregister_netdevice_many(&dev_kill_list);
ceaaec98 6498 list_del(&dev_kill_list);
04dc7f6b
EB
6499 rtnl_unlock();
6500}
6501
022cbae6 6502static struct pernet_operations __net_initdata default_device_ops = {
ce286d32 6503 .exit = default_device_exit,
04dc7f6b 6504 .exit_batch = default_device_exit_batch,
ce286d32
EB
6505};
6506
1da177e4
LT
6507/*
6508 * Initialize the DEV module. At boot time this walks the device list and
6509 * unhooks any devices that fail to initialise (normally hardware not
6510 * present) and leaves us with a valid list of present and active devices.
6511 *
6512 */
6513
6514/*
6515 * This is called single threaded during boot, so no need
6516 * to take the rtnl semaphore.
6517 */
6518static int __init net_dev_init(void)
6519{
6520 int i, rc = -ENOMEM;
6521
6522 BUG_ON(!dev_boot_phase);
6523
1da177e4
LT
6524 if (dev_proc_init())
6525 goto out;
6526
8b41d188 6527 if (netdev_kobject_init())
1da177e4
LT
6528 goto out;
6529
6530 INIT_LIST_HEAD(&ptype_all);
82d8a867 6531 for (i = 0; i < PTYPE_HASH_SIZE; i++)
1da177e4
LT
6532 INIT_LIST_HEAD(&ptype_base[i]);
6533
881d966b
EB
6534 if (register_pernet_subsys(&netdev_net_ops))
6535 goto out;
1da177e4
LT
6536
6537 /*
6538 * Initialise the packet receive queues.
6539 */
6540
6f912042 6541 for_each_possible_cpu(i) {
e36fa2f7 6542 struct softnet_data *sd = &per_cpu(softnet_data, i);
1da177e4 6543
dee42870 6544 memset(sd, 0, sizeof(*sd));
e36fa2f7 6545 skb_queue_head_init(&sd->input_pkt_queue);
6e7676c1 6546 skb_queue_head_init(&sd->process_queue);
e36fa2f7
ED
6547 sd->completion_queue = NULL;
6548 INIT_LIST_HEAD(&sd->poll_list);
a9cbd588
CG
6549 sd->output_queue = NULL;
6550 sd->output_queue_tailp = &sd->output_queue;
df334545 6551#ifdef CONFIG_RPS
e36fa2f7
ED
6552 sd->csd.func = rps_trigger_softirq;
6553 sd->csd.info = sd;
6554 sd->csd.flags = 0;
6555 sd->cpu = i;
1e94d72f 6556#endif
0a9627f2 6557
e36fa2f7
ED
6558 sd->backlog.poll = process_backlog;
6559 sd->backlog.weight = weight_p;
6560 sd->backlog.gro_list = NULL;
6561 sd->backlog.gro_count = 0;
1da177e4
LT
6562 }
6563
1da177e4
LT
6564 dev_boot_phase = 0;
6565
505d4f73
EB
 6566 /* The loopback device is special: if any other network device
 6567 * is present in a network namespace, the loopback device must
 6568 * be present. Since we now dynamically allocate and free the
 6569 * loopback device, ensure this invariant is maintained by
 6570 * keeping the loopback device as the first device on the
 6571 * list of network devices, ensuring the loopback device
 6572 * is the first device that appears and the last network device
 6573 * that disappears.
6574 */
6575 if (register_pernet_device(&loopback_net_ops))
6576 goto out;
6577
6578 if (register_pernet_device(&default_device_ops))
6579 goto out;
6580
962cf36c
CM
6581 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6582 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
1da177e4
LT
6583
6584 hotcpu_notifier(dev_cpu_callback, 0);
6585 dst_init();
6586 dev_mcast_init();
6587 rc = 0;
6588out:
6589 return rc;
6590}
6591
6592subsys_initcall(net_dev_init);
6593
e88721f8
KK
6594static int __init initialize_hashrnd(void)
6595{
0a9627f2 6596 get_random_bytes(&hashrnd, sizeof(hashrnd));
e88721f8
KK
6597 return 0;
6598}
6599
6600late_initcall_sync(initialize_hashrnd);
6601