1 /*
2 * NET3 Protocol independent device support routines.
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
8 *
9 * Derived from the non IP parts of dev.c 1.0.19
10 * Authors: Ross Biro
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Mark Evans, <evansmp@uhura.aston.ac.uk>
13 *
14 * Additional Authors:
15 * Florian la Roche <rzsfl@rz.uni-sb.de>
16 * Alan Cox <gw4pts@gw4pts.ampr.org>
17 * David Hinds <dahinds@users.sourceforge.net>
18 * Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
19 * Adam Sulmicki <adam@cfar.umd.edu>
20 * Pekka Riikonen <priikone@poesidon.pspt.fi>
21 *
22 * Changes:
23 * D.J. Barrow : Fixed bug where dev->refcnt gets set
24 * to 2 if register_netdev gets called
25 * before net_dev_init & also removed a
26 * few lines of code in the process.
27 * Alan Cox : device private ioctl copies fields back.
28 * Alan Cox : Transmit queue code does relevant
29 * stunts to keep the queue safe.
30 * Alan Cox : Fixed double lock.
31 * Alan Cox : Fixed promisc NULL pointer trap
32 * ???????? : Support the full private ioctl range
33 * Alan Cox : Moved ioctl permission check into
34 * drivers
35 * Tim Kordas : SIOCADDMULTI/SIOCDELMULTI
36 * Alan Cox : 100 backlog just doesn't cut it when
37 * you start doing multicast video 8)
38 * Alan Cox : Rewrote net_bh and list manager.
39 * Alan Cox : Fix ETH_P_ALL echoback lengths.
40 * Alan Cox : Took out transmit every packet pass
41 * Saved a few bytes in the ioctl handler
42 * Alan Cox : Network driver sets packet type before
43 * calling netif_rx. Saves a function
44 * call a packet.
45 * Alan Cox : Hashed net_bh()
46 * Richard Kooijman: Timestamp fixes.
47 * Alan Cox : Wrong field in SIOCGIFDSTADDR
48 * Alan Cox : Device lock protection.
49 * Alan Cox : Fixed nasty side effect of device close
50 * changes.
51 * Rudi Cilibrasi : Pass the right thing to
52 * set_mac_address()
53 * Dave Miller : 32bit quantity for the device lock to
54 * make it work out on a Sparc.
55 * Bjorn Ekwall : Added KERNELD hack.
56 * Alan Cox : Cleaned up the backlog initialise.
57 * Craig Metz : SIOCGIFCONF fix if space for under
58 * 1 device.
59 * Thomas Bogendoerfer : Return ENODEV for dev_open, if there
60 * is no device open function.
61 * Andi Kleen : Fix error reporting for SIOCGIFCONF
62 * Michael Chastain : Fix signed/unsigned for SIOCGIFCONF
63 * Cyrus Durgin : Cleaned for KMOD
64 * Adam Sulmicki : Bug Fix : Network Device Unload
65 * A network device unload needs to purge
66 * the backlog queue.
67 * Paul Rusty Russell : SIOCSIFNAME
68 * Pekka Riikonen : Netdev boot-time settings code
69 * Andrew Morton : Make unregister_netdevice wait
70 * indefinitely on dev->refcnt
71 * J Hadi Salim : - Backlog queue sampling
72 * - netif_rx() feedback
73 */
74
75 #include <asm/uaccess.h>
76 #include <linux/bitops.h>
77 #include <linux/capability.h>
78 #include <linux/cpu.h>
79 #include <linux/types.h>
80 #include <linux/kernel.h>
81 #include <linux/hash.h>
82 #include <linux/slab.h>
83 #include <linux/sched.h>
84 #include <linux/mutex.h>
85 #include <linux/string.h>
86 #include <linux/mm.h>
87 #include <linux/socket.h>
88 #include <linux/sockios.h>
89 #include <linux/errno.h>
90 #include <linux/interrupt.h>
91 #include <linux/if_ether.h>
92 #include <linux/netdevice.h>
93 #include <linux/etherdevice.h>
94 #include <linux/ethtool.h>
95 #include <linux/notifier.h>
96 #include <linux/skbuff.h>
97 #include <net/net_namespace.h>
98 #include <net/sock.h>
99 #include <linux/rtnetlink.h>
100 #include <linux/stat.h>
101 #include <net/dst.h>
102 #include <net/pkt_sched.h>
103 #include <net/checksum.h>
104 #include <net/xfrm.h>
105 #include <linux/highmem.h>
106 #include <linux/init.h>
107 #include <linux/module.h>
108 #include <linux/netpoll.h>
109 #include <linux/rcupdate.h>
110 #include <linux/delay.h>
111 #include <net/iw_handler.h>
112 #include <asm/current.h>
113 #include <linux/audit.h>
114 #include <linux/dmaengine.h>
115 #include <linux/err.h>
116 #include <linux/ctype.h>
117 #include <linux/if_arp.h>
118 #include <linux/if_vlan.h>
119 #include <linux/ip.h>
120 #include <net/ip.h>
121 #include <linux/ipv6.h>
122 #include <linux/in.h>
123 #include <linux/jhash.h>
124 #include <linux/random.h>
125 #include <trace/events/napi.h>
126 #include <trace/events/net.h>
127 #include <trace/events/skb.h>
128 #include <linux/pci.h>
129 #include <linux/inetdevice.h>
130 #include <linux/cpu_rmap.h>
131 #include <linux/static_key.h>
132 #include <net/udp.h>
133 #include "net-sysfs.h"
134
135 #ifdef UDP_SKT_WIFI
136 #include <linux/ftrace_event.h>
137 #endif
138
139 /* Instead of increasing this, you should create a hash table. */
140 #define MAX_GRO_SKBS 8
141
142 /* This should be increased if a protocol with a bigger head is added. */
143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
144
145 static DEFINE_SPINLOCK(ptype_lock);
146 static DEFINE_SPINLOCK(offload_lock);
147 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
148 struct list_head ptype_all __read_mostly; /* Taps */
149 static struct list_head offload_base __read_mostly;
150
151 /*
152 * The @dev_base_head list is protected by @dev_base_lock and the rtnl
153 * semaphore.
154 *
155 * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
156 *
157 * Writers must hold the rtnl semaphore while they loop through the
158 * dev_base_head list, and hold dev_base_lock for writing when they do the
159 * actual updates. This allows pure readers to access the list even
160 * while a writer is preparing to update it.
161 *
162 * To put it another way, dev_base_lock is held for writing only to
163 * protect against pure readers; the rtnl semaphore provides the
164 * protection against other writers.
165 *
166 * See, for example usages, register_netdevice() and
167 * unregister_netdevice(), which must be called with the rtnl
168 * semaphore held.
169 */
170 DEFINE_RWLOCK(dev_base_lock);
171 EXPORT_SYMBOL(dev_base_lock);
172
173 seqcount_t devnet_rename_seq;
174
175 static inline void dev_base_seq_inc(struct net *net)
176 {
177 while (++net->dev_base_seq == 0);
178 }
179
180 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
181 {
182 unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
183
184 return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
185 }
186
187 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
188 {
189 return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
190 }
191
192 static inline void rps_lock(struct softnet_data *sd)
193 {
194 #ifdef CONFIG_RPS
195 spin_lock(&sd->input_pkt_queue.lock);
196 #endif
197 }
198
199 static inline void rps_unlock(struct softnet_data *sd)
200 {
201 #ifdef CONFIG_RPS
202 spin_unlock(&sd->input_pkt_queue.lock);
203 #endif
204 }
205
206 /* Device list insertion */
207 static void list_netdevice(struct net_device *dev)
208 {
209 struct net *net = dev_net(dev);
210
211 ASSERT_RTNL();
212
213 write_lock_bh(&dev_base_lock);
214 list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
215 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
216 hlist_add_head_rcu(&dev->index_hlist,
217 dev_index_hash(net, dev->ifindex));
218 write_unlock_bh(&dev_base_lock);
219
220 dev_base_seq_inc(net);
221 }
222
223 /* Device list removal
224  * caller must respect an RCU grace period before freeing/reusing dev
225 */
226 static void unlist_netdevice(struct net_device *dev)
227 {
228 ASSERT_RTNL();
229
230 /* Unlink dev from the device chain */
231 write_lock_bh(&dev_base_lock);
232 list_del_rcu(&dev->dev_list);
233 hlist_del_rcu(&dev->name_hlist);
234 hlist_del_rcu(&dev->index_hlist);
235 write_unlock_bh(&dev_base_lock);
236
237 dev_base_seq_inc(dev_net(dev));
238 }
239
240 /*
241 * Our notifier list
242 */
243
244 static RAW_NOTIFIER_HEAD(netdev_chain);
245
246 /*
247 * Device drivers call our routines to queue packets here. We empty the
248 * queue in the local softnet handler.
249 */
250
251 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
252 EXPORT_PER_CPU_SYMBOL(softnet_data);
253
254 #ifdef CONFIG_LOCKDEP
255 /*
256 * register_netdevice() inits txq->_xmit_lock and sets lockdep class
257 * according to dev->type
258 */
259 static const unsigned short netdev_lock_type[] =
260 {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
261 ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
262 ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
263 ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
264 ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
265 ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
266 ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
267 ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
268 ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
269 ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
270 ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
271 ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
272 ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
273 ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
274 ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
275
276 static const char *const netdev_lock_name[] =
277 {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
278 "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
279 "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
280 "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
281 "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
282 "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
283 "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
284 "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
285 "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
286 "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
287 "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
288 "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
289 "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
290 "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
291 "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
292
293 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
294 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
295
296 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
297 {
298 int i;
299
300 for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
301 if (netdev_lock_type[i] == dev_type)
302 return i;
303 /* the last key is used by default */
304 return ARRAY_SIZE(netdev_lock_type) - 1;
305 }
306
307 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
308 unsigned short dev_type)
309 {
310 int i;
311
312 i = netdev_lock_pos(dev_type);
313 lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
314 netdev_lock_name[i]);
315 }
316
317 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
318 {
319 int i;
320
321 i = netdev_lock_pos(dev->type);
322 lockdep_set_class_and_name(&dev->addr_list_lock,
323 &netdev_addr_lock_key[i],
324 netdev_lock_name[i]);
325 }
326 #else
327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
328 unsigned short dev_type)
329 {
330 }
331 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
332 {
333 }
334 #endif
335
336 /*******************************************************************************
337
338 Protocol management and registration routines
339
340 *******************************************************************************/
341
342 /*
343 * Add a protocol ID to the list. Now that the input handler is
344 * smarter we can dispense with all the messy stuff that used to be
345 * here.
346 *
347 * BEWARE!!! Protocol handlers, mangling input packets,
348 * MUST BE last in hash buckets and checking protocol handlers
349 * MUST start from promiscuous ptype_all chain in net_bh.
350 * It is true now, do not change it.
351  * Explanation follows: if a protocol handler that mangles packets were
352  * first on the list, it would not be able to sense that the packet
353  * is cloned and should be copied-on-write, so it would
354  * change it and subsequent readers would get a broken packet.
355 * --ANK (980803)
356 */
357
358 static inline struct list_head *ptype_head(const struct packet_type *pt)
359 {
360 if (pt->type == htons(ETH_P_ALL))
361 return &ptype_all;
362 else
363 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
364 }
365
366 /**
367 * dev_add_pack - add packet handler
368 * @pt: packet type declaration
369 *
370 * Add a protocol handler to the networking stack. The passed &packet_type
371 * is linked into kernel lists and may not be freed until it has been
372 * removed from the kernel lists.
373 *
374  * This call does not sleep, therefore it cannot
375  * guarantee that all CPUs that are in the middle of receiving packets
376 * will see the new packet type (until the next received packet).
377 */
378
379 void dev_add_pack(struct packet_type *pt)
380 {
381 struct list_head *head = ptype_head(pt);
382
383 spin_lock(&ptype_lock);
384 list_add_rcu(&pt->list, head);
385 spin_unlock(&ptype_lock);
386 }
387 EXPORT_SYMBOL(dev_add_pack);
388
389 /**
390 * __dev_remove_pack - remove packet handler
391 * @pt: packet type declaration
392 *
393 * Remove a protocol handler that was previously added to the kernel
394 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
395 * from the kernel lists and can be freed or reused once this function
396 * returns.
397 *
398 * The packet type might still be in use by receivers
399  * and must not be freed until after all the CPUs have gone
400 * through a quiescent state.
401 */
402 void __dev_remove_pack(struct packet_type *pt)
403 {
404 struct list_head *head = ptype_head(pt);
405 struct packet_type *pt1;
406
407 spin_lock(&ptype_lock);
408
409 list_for_each_entry(pt1, head, list) {
410 if (pt == pt1) {
411 list_del_rcu(&pt->list);
412 goto out;
413 }
414 }
415
416 pr_warn("dev_remove_pack: %p not found\n", pt);
417 out:
418 spin_unlock(&ptype_lock);
419 }
420 EXPORT_SYMBOL(__dev_remove_pack);
421
422 /**
423 * dev_remove_pack - remove packet handler
424 * @pt: packet type declaration
425 *
426 * Remove a protocol handler that was previously added to the kernel
427 * protocol handlers by dev_add_pack(). The passed &packet_type is removed
428 * from the kernel lists and can be freed or reused once this function
429 * returns.
430 *
431 * This call sleeps to guarantee that no CPU is looking at the packet
432 * type after return.
433 */
434 void dev_remove_pack(struct packet_type *pt)
435 {
436 __dev_remove_pack(pt);
437
438 synchronize_net();
439 }
440 EXPORT_SYMBOL(dev_remove_pack);
441
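/*
 * Illustrative sketch (not part of this file): a module tapping a
 * hypothetical EtherType 0x88b5 could register a handler roughly like
 * this; my_rcv() and the type value are assumptions for illustration only.
 *
 *	static int my_rcv(struct sk_buff *skb, struct net_device *dev,
 *			  struct packet_type *pt, struct net_device *orig_dev)
 *	{
 *		kfree_skb(skb);
 *		return NET_RX_SUCCESS;
 *	}
 *
 *	static struct packet_type my_ptype __read_mostly = {
 *		.type	= cpu_to_be16(0x88b5),
 *		.func	= my_rcv,
 *	};
 *
 *	dev_add_pack(&my_ptype);	(in module init)
 *	dev_remove_pack(&my_ptype);	(in module exit; may sleep)
 */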
442
443 /**
444 * dev_add_offload - register offload handlers
445 * @po: protocol offload declaration
446 *
447 * Add protocol offload handlers to the networking stack. The passed
448 * &proto_offload is linked into kernel lists and may not be freed until
449 * it has been removed from the kernel lists.
450 *
451  * This call does not sleep, therefore it cannot
452  * guarantee that all CPUs that are in the middle of receiving packets
453 * will see the new offload handlers (until the next received packet).
454 */
455 void dev_add_offload(struct packet_offload *po)
456 {
457 struct list_head *head = &offload_base;
458
459 spin_lock(&offload_lock);
460 list_add_rcu(&po->list, head);
461 spin_unlock(&offload_lock);
462 }
463 EXPORT_SYMBOL(dev_add_offload);
464
465 /**
466 * __dev_remove_offload - remove offload handler
467 * @po: packet offload declaration
468 *
469 * Remove a protocol offload handler that was previously added to the
470 * kernel offload handlers by dev_add_offload(). The passed &offload_type
471 * is removed from the kernel lists and can be freed or reused once this
472 * function returns.
473 *
474 * The packet type might still be in use by receivers
475  * and must not be freed until after all the CPUs have gone
476 * through a quiescent state.
477 */
478 void __dev_remove_offload(struct packet_offload *po)
479 {
480 struct list_head *head = &offload_base;
481 struct packet_offload *po1;
482
483 spin_lock(&offload_lock);
484
485 list_for_each_entry(po1, head, list) {
486 if (po == po1) {
487 list_del_rcu(&po->list);
488 goto out;
489 }
490 }
491
492 pr_warn("dev_remove_offload: %p not found\n", po);
493 out:
494 spin_unlock(&offload_lock);
495 }
496 EXPORT_SYMBOL(__dev_remove_offload);
497
498 /**
499 * dev_remove_offload - remove packet offload handler
500 * @po: packet offload declaration
501 *
502 * Remove a packet offload handler that was previously added to the kernel
503 * offload handlers by dev_add_offload(). The passed &offload_type is
504 * removed from the kernel lists and can be freed or reused once this
505 * function returns.
506 *
507 * This call sleeps to guarantee that no CPU is looking at the packet
508 * type after return.
509 */
510 void dev_remove_offload(struct packet_offload *po)
511 {
512 __dev_remove_offload(po);
513
514 synchronize_net();
515 }
516 EXPORT_SYMBOL(dev_remove_offload);
517
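/*
 * Illustrative sketch (not part of this file): an L3 protocol typically
 * registers its GSO/GRO callbacks once at init time; the my_* callback
 * names below are assumptions for illustration only.
 *
 *	static struct packet_offload my_offload __read_mostly = {
 *		.type = cpu_to_be16(ETH_P_IP),
 *		.callbacks = {
 *			.gso_segment  = my_gso_segment,
 *			.gro_receive  = my_gro_receive,
 *			.gro_complete = my_gro_complete,
 *		},
 *	};
 *
 *	dev_add_offload(&my_offload);
 */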
518 /******************************************************************************
519
520 Device Boot-time Settings Routines
521
522 *******************************************************************************/
523
524 /* Boot time configuration table */
525 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
526
527 /**
528 * netdev_boot_setup_add - add new setup entry
529 * @name: name of the device
530 * @map: configured settings for the device
531 *
532 * Adds new setup entry to the dev_boot_setup list. The function
533  * returns 0 on error and 1 on success. This is a generic routine for
534  * all netdevices.
535 */
536 static int netdev_boot_setup_add(char *name, struct ifmap *map)
537 {
538 struct netdev_boot_setup *s;
539 int i;
540
541 s = dev_boot_setup;
542 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
543 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
544 memset(s[i].name, 0, sizeof(s[i].name));
545 strlcpy(s[i].name, name, IFNAMSIZ);
546 memcpy(&s[i].map, map, sizeof(s[i].map));
547 break;
548 }
549 }
550
551 return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
552 }
553
554 /**
555 * netdev_boot_setup_check - check boot time settings
556 * @dev: the netdevice
557 *
558 * Check boot time settings for the device.
559 * The found settings are set for the device to be used
560 * later in the device probing.
561  * Returns 0 if no settings are found, 1 if they are.
562 */
563 int netdev_boot_setup_check(struct net_device *dev)
564 {
565 struct netdev_boot_setup *s = dev_boot_setup;
566 int i;
567
568 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
569 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
570 !strcmp(dev->name, s[i].name)) {
571 dev->irq = s[i].map.irq;
572 dev->base_addr = s[i].map.base_addr;
573 dev->mem_start = s[i].map.mem_start;
574 dev->mem_end = s[i].map.mem_end;
575 return 1;
576 }
577 }
578 return 0;
579 }
580 EXPORT_SYMBOL(netdev_boot_setup_check);
581
582
583 /**
584 * netdev_boot_base - get address from boot time settings
585 * @prefix: prefix for network device
586 * @unit: id for network device
587 *
588 * Check boot time settings for the base address of device.
589 * The found settings are set for the device to be used
590 * later in the device probing.
591 * Returns 0 if no settings found.
592 */
593 unsigned long netdev_boot_base(const char *prefix, int unit)
594 {
595 const struct netdev_boot_setup *s = dev_boot_setup;
596 char name[IFNAMSIZ];
597 int i;
598
599 sprintf(name, "%s%d", prefix, unit);
600
601 /*
602 * If device already registered then return base of 1
603 * to indicate not to probe for this interface
604 */
605 if (__dev_get_by_name(&init_net, name))
606 return 1;
607
608 for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
609 if (!strcmp(name, s[i].name))
610 return s[i].map.base_addr;
611 return 0;
612 }
613
614 /*
615 * Saves at boot time configured settings for any netdevice.
616 */
617 int __init netdev_boot_setup(char *str)
618 {
619 int ints[5];
620 struct ifmap map;
621
622 str = get_options(str, ARRAY_SIZE(ints), ints);
623 if (!str || !*str)
624 return 0;
625
626 /* Save settings */
627 memset(&map, 0, sizeof(map));
628 if (ints[0] > 0)
629 map.irq = ints[1];
630 if (ints[0] > 1)
631 map.base_addr = ints[2];
632 if (ints[0] > 2)
633 map.mem_start = ints[3];
634 if (ints[0] > 3)
635 map.mem_end = ints[4];
636
637 /* Add new entry to the list */
638 return netdev_boot_setup_add(str, &map);
639 }
640
641 __setup("netdev=", netdev_boot_setup);
642
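/*
 * Illustrative sketch (not part of this file): the option parsed above
 * takes up to four integers (irq, base I/O address, memory start, memory
 * end) followed by the interface name, e.g. on the kernel command line
 * (the values are made up):
 *
 *	netdev=9,0x300,0,0,eth0
 *
 * The entry is stored via netdev_boot_setup_add() and later applied by
 * netdev_boot_setup_check() during device probing.
 */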
643 /*******************************************************************************
644
645 Device Interface Subroutines
646
647 *******************************************************************************/
648
649 /**
650 * __dev_get_by_name - find a device by its name
651 * @net: the applicable net namespace
652 * @name: name to find
653 *
654 * Find an interface by name. Must be called under RTNL semaphore
655 * or @dev_base_lock. If the name is found a pointer to the device
656 * is returned. If the name is not found then %NULL is returned. The
657 * reference counters are not incremented so the caller must be
658 * careful with locks.
659 */
660
661 struct net_device *__dev_get_by_name(struct net *net, const char *name)
662 {
663 struct net_device *dev;
664 struct hlist_head *head = dev_name_hash(net, name);
665
666 hlist_for_each_entry(dev, head, name_hlist)
667 if (!strncmp(dev->name, name, IFNAMSIZ))
668 return dev;
669
670 return NULL;
671 }
672 EXPORT_SYMBOL(__dev_get_by_name);
673
674 /**
675 * dev_get_by_name_rcu - find a device by its name
676 * @net: the applicable net namespace
677 * @name: name to find
678 *
679 * Find an interface by name.
680 * If the name is found a pointer to the device is returned.
681 * If the name is not found then %NULL is returned.
682 * The reference counters are not incremented so the caller must be
683 * careful with locks. The caller must hold RCU lock.
684 */
685
686 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
687 {
688 struct net_device *dev;
689 struct hlist_head *head = dev_name_hash(net, name);
690
691 hlist_for_each_entry_rcu(dev, head, name_hlist)
692 if (!strncmp(dev->name, name, IFNAMSIZ))
693 return dev;
694
695 return NULL;
696 }
697 EXPORT_SYMBOL(dev_get_by_name_rcu);
698
699 /**
700 * dev_get_by_name - find a device by its name
701 * @net: the applicable net namespace
702 * @name: name to find
703 *
704 * Find an interface by name. This can be called from any
705 * context and does its own locking. The returned handle has
706 * the usage count incremented and the caller must use dev_put() to
707 * release it when it is no longer needed. %NULL is returned if no
708 * matching device is found.
709 */
710
711 struct net_device *dev_get_by_name(struct net *net, const char *name)
712 {
713 struct net_device *dev;
714
715 rcu_read_lock();
716 dev = dev_get_by_name_rcu(net, name);
717 if (dev)
718 dev_hold(dev);
719 rcu_read_unlock();
720 return dev;
721 }
722 EXPORT_SYMBOL(dev_get_by_name);
723
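/*
 * Illustrative sketch (not part of this file): typical lookup patterns,
 * assuming process context; "eth0" is just an example name.
 *
 *	struct net_device *dev = dev_get_by_name(net, "eth0");
 *	if (dev) {
 *		... use dev ...
 *		dev_put(dev);
 *	}
 *
 *	rcu_read_lock();
 *	dev = dev_get_by_name_rcu(net, "eth0");
 *	if (dev) {
 *		... dev is only valid inside this RCU section,
 *		    no reference has been taken ...
 *	}
 *	rcu_read_unlock();
 */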
724 /**
725 * __dev_get_by_index - find a device by its ifindex
726 * @net: the applicable net namespace
727 * @ifindex: index of device
728 *
729 * Search for an interface by index. Returns %NULL if the device
730 * is not found or a pointer to the device. The device has not
731 * had its reference counter increased so the caller must be careful
732 * about locking. The caller must hold either the RTNL semaphore
733 * or @dev_base_lock.
734 */
735
736 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
737 {
738 struct net_device *dev;
739 struct hlist_head *head = dev_index_hash(net, ifindex);
740
741 hlist_for_each_entry(dev, head, index_hlist)
742 if (dev->ifindex == ifindex)
743 return dev;
744
745 return NULL;
746 }
747 EXPORT_SYMBOL(__dev_get_by_index);
748
749 /**
750 * dev_get_by_index_rcu - find a device by its ifindex
751 * @net: the applicable net namespace
752 * @ifindex: index of device
753 *
754 * Search for an interface by index. Returns %NULL if the device
755 * is not found or a pointer to the device. The device has not
756 * had its reference counter increased so the caller must be careful
757 * about locking. The caller must hold RCU lock.
758 */
759
760 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
761 {
762 struct net_device *dev;
763 struct hlist_head *head = dev_index_hash(net, ifindex);
764
765 hlist_for_each_entry_rcu(dev, head, index_hlist)
766 if (dev->ifindex == ifindex)
767 return dev;
768
769 return NULL;
770 }
771 EXPORT_SYMBOL(dev_get_by_index_rcu);
772
773
774 /**
775 * dev_get_by_index - find a device by its ifindex
776 * @net: the applicable net namespace
777 * @ifindex: index of device
778 *
779 * Search for an interface by index. Returns NULL if the device
780 * is not found or a pointer to the device. The device returned has
781 * had a reference added and the pointer is safe until the user calls
782 * dev_put to indicate they have finished with it.
783 */
784
785 struct net_device *dev_get_by_index(struct net *net, int ifindex)
786 {
787 struct net_device *dev;
788
789 rcu_read_lock();
790 dev = dev_get_by_index_rcu(net, ifindex);
791 if (dev)
792 dev_hold(dev);
793 rcu_read_unlock();
794 return dev;
795 }
796 EXPORT_SYMBOL(dev_get_by_index);
797
798 /**
799 * netdev_get_name - get a netdevice name, knowing its ifindex.
800 * @net: network namespace
801 * @name: a pointer to the buffer where the name will be stored.
802 * @ifindex: the ifindex of the interface to get the name from.
803 *
804 * The use of raw_seqcount_begin() and cond_resched() before
805 * retrying is required as we want to give the writers a chance
806 * to complete when CONFIG_PREEMPT is not set.
807 */
808 int netdev_get_name(struct net *net, char *name, int ifindex)
809 {
810 struct net_device *dev;
811 unsigned int seq;
812
813 retry:
814 seq = raw_seqcount_begin(&devnet_rename_seq);
815 rcu_read_lock();
816 dev = dev_get_by_index_rcu(net, ifindex);
817 if (!dev) {
818 rcu_read_unlock();
819 return -ENODEV;
820 }
821
822 strcpy(name, dev->name);
823 rcu_read_unlock();
824 if (read_seqcount_retry(&devnet_rename_seq, seq)) {
825 cond_resched();
826 goto retry;
827 }
828
829 return 0;
830 }
831
832 /**
833 * dev_getbyhwaddr_rcu - find a device by its hardware address
834 * @net: the applicable net namespace
835 * @type: media type of device
836 * @ha: hardware address
837 *
838 * Search for an interface by MAC address. Returns NULL if the device
839 * is not found or a pointer to the device.
840 * The caller must hold RCU or RTNL.
841 * The returned device has not had its ref count increased
842  * and the caller must therefore be careful about locking.
843 *
844 */
845
846 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
847 const char *ha)
848 {
849 struct net_device *dev;
850
851 for_each_netdev_rcu(net, dev)
852 if (dev->type == type &&
853 !memcmp(dev->dev_addr, ha, dev->addr_len))
854 return dev;
855
856 return NULL;
857 }
858 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
859
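/*
 * Illustrative sketch (not part of this file): looking up an Ethernet
 * device by MAC address; "addr" is an assumed ETH_ALEN byte array.
 *
 *	rcu_read_lock();
 *	dev = dev_getbyhwaddr_rcu(net, ARPHRD_ETHER, addr);
 *	if (dev)
 *		pr_info("%pM belongs to %s\n", addr, dev->name);
 *	rcu_read_unlock();
 */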
860 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
861 {
862 struct net_device *dev;
863
864 ASSERT_RTNL();
865 for_each_netdev(net, dev)
866 if (dev->type == type)
867 return dev;
868
869 return NULL;
870 }
871 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
872
873 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
874 {
875 struct net_device *dev, *ret = NULL;
876
877 rcu_read_lock();
878 for_each_netdev_rcu(net, dev)
879 if (dev->type == type) {
880 dev_hold(dev);
881 ret = dev;
882 break;
883 }
884 rcu_read_unlock();
885 return ret;
886 }
887 EXPORT_SYMBOL(dev_getfirstbyhwtype);
888
889 /**
890 * dev_get_by_flags_rcu - find any device with given flags
891 * @net: the applicable net namespace
892 * @if_flags: IFF_* values
893 * @mask: bitmask of bits in if_flags to check
894 *
895 * Search for any interface with the given flags. Returns NULL if a device
896 * is not found or a pointer to the device. Must be called inside
897 * rcu_read_lock(), and result refcount is unchanged.
898 */
899
900 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
901 unsigned short mask)
902 {
903 struct net_device *dev, *ret;
904
905 ret = NULL;
906 for_each_netdev_rcu(net, dev) {
907 if (((dev->flags ^ if_flags) & mask) == 0) {
908 ret = dev;
909 break;
910 }
911 }
912 return ret;
913 }
914 EXPORT_SYMBOL(dev_get_by_flags_rcu);
915
916 /**
917 * dev_valid_name - check if name is okay for network device
918 * @name: name string
919 *
920  * Network device names need to be valid file names
921  * to allow sysfs to work. We also disallow any kind of
922 * whitespace.
923 */
924 bool dev_valid_name(const char *name)
925 {
926 if (*name == '\0')
927 return false;
928 if (strlen(name) >= IFNAMSIZ)
929 return false;
930 if (!strcmp(name, ".") || !strcmp(name, ".."))
931 return false;
932
933 while (*name) {
934 if (*name == '/' || *name == ':' || isspace(*name))
935 return false;
936 name++;
937 }
938 return true;
939 }
940 EXPORT_SYMBOL(dev_valid_name);
941
942 /**
943 * __dev_alloc_name - allocate a name for a device
944 * @net: network namespace to allocate the device name in
945 * @name: name format string
946 * @buf: scratch buffer and result name string
947 *
948 * Passed a format string - eg "lt%d" it will try and find a suitable
949 * id. It scans list of devices to build up a free map, then chooses
950 * the first empty slot. The caller must hold the dev_base or rtnl lock
951 * while allocating the name and adding the device in order to avoid
952 * duplicates.
953 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
954 * Returns the number of the unit assigned or a negative errno code.
955 */
956
957 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
958 {
959 int i = 0;
960 const char *p;
961 const int max_netdevices = 8*PAGE_SIZE;
962 unsigned long *inuse;
963 struct net_device *d;
964
965 p = strnchr(name, IFNAMSIZ-1, '%');
966 if (p) {
967 /*
968 * Verify the string as this thing may have come from
969 * the user. There must be either one "%d" and no other "%"
970 * characters.
971 */
972 if (p[1] != 'd' || strchr(p + 2, '%'))
973 return -EINVAL;
974
975 /* Use one page as a bit array of possible slots */
976 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
977 if (!inuse)
978 return -ENOMEM;
979
980 for_each_netdev(net, d) {
981 if (!sscanf(d->name, name, &i))
982 continue;
983 if (i < 0 || i >= max_netdevices)
984 continue;
985
986 /* avoid cases where sscanf is not exact inverse of printf */
987 snprintf(buf, IFNAMSIZ, name, i);
988 if (!strncmp(buf, d->name, IFNAMSIZ))
989 set_bit(i, inuse);
990 }
991
992 i = find_first_zero_bit(inuse, max_netdevices);
993 free_page((unsigned long) inuse);
994 }
995
996 if (buf != name)
997 snprintf(buf, IFNAMSIZ, name, i);
998 if (!__dev_get_by_name(net, buf))
999 return i;
1000
1001 /* It is possible to run out of possible slots
1002 * when the name is long and there isn't enough space left
1003 * for the digits, or if all bits are used.
1004 */
1005 return -ENFILE;
1006 }
1007
1008 /**
1009 * dev_alloc_name - allocate a name for a device
1010 * @dev: device
1011 * @name: name format string
1012 *
1013 * Passed a format string - eg "lt%d" it will try and find a suitable
1014 * id. It scans list of devices to build up a free map, then chooses
1015 * the first empty slot. The caller must hold the dev_base or rtnl lock
1016 * while allocating the name and adding the device in order to avoid
1017 * duplicates.
1018 * Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1019 * Returns the number of the unit assigned or a negative errno code.
1020 */
1021
1022 int dev_alloc_name(struct net_device *dev, const char *name)
1023 {
1024 char buf[IFNAMSIZ];
1025 struct net *net;
1026 int ret;
1027
1028 BUG_ON(!dev_net(dev));
1029 net = dev_net(dev);
1030 ret = __dev_alloc_name(net, name, buf);
1031 if (ret >= 0)
1032 strlcpy(dev->name, buf, IFNAMSIZ);
1033 return ret;
1034 }
1035 EXPORT_SYMBOL(dev_alloc_name);
1036
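/*
 * Illustrative sketch (not part of this file): a driver wanting an
 * automatically numbered name would typically do, under RTNL:
 *
 *	err = dev_alloc_name(dev, "eth%d");
 *	if (err < 0)
 *		goto fail;
 *	(dev->name is now e.g. "eth2" and err holds the unit number)
 */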
1037 static int dev_alloc_name_ns(struct net *net,
1038 struct net_device *dev,
1039 const char *name)
1040 {
1041 char buf[IFNAMSIZ];
1042 int ret;
1043
1044 ret = __dev_alloc_name(net, name, buf);
1045 if (ret >= 0)
1046 strlcpy(dev->name, buf, IFNAMSIZ);
1047 return ret;
1048 }
1049
1050 static int dev_get_valid_name(struct net *net,
1051 struct net_device *dev,
1052 const char *name)
1053 {
1054 BUG_ON(!net);
1055
1056 if (!dev_valid_name(name))
1057 return -EINVAL;
1058
1059 if (strchr(name, '%'))
1060 return dev_alloc_name_ns(net, dev, name);
1061 else if (__dev_get_by_name(net, name))
1062 return -EEXIST;
1063 else if (dev->name != name)
1064 strlcpy(dev->name, name, IFNAMSIZ);
1065
1066 return 0;
1067 }
1068
1069 /**
1070 * dev_change_name - change name of a device
1071 * @dev: device
1072 * @newname: name (or format string) must be at least IFNAMSIZ
1073 *
1074  * Change the name of a device; a format string such as "eth%d"
1075  * can be passed for wildcarding.
1076 */
1077 int dev_change_name(struct net_device *dev, const char *newname)
1078 {
1079 char oldname[IFNAMSIZ];
1080 int err = 0;
1081 int ret;
1082 struct net *net;
1083
1084 ASSERT_RTNL();
1085 BUG_ON(!dev_net(dev));
1086
1087 net = dev_net(dev);
1088 if (dev->flags & IFF_UP)
1089 return -EBUSY;
1090
1091 write_seqcount_begin(&devnet_rename_seq);
1092
1093 if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1094 write_seqcount_end(&devnet_rename_seq);
1095 return 0;
1096 }
1097
1098 memcpy(oldname, dev->name, IFNAMSIZ);
1099
1100 err = dev_get_valid_name(net, dev, newname);
1101 if (err < 0) {
1102 write_seqcount_end(&devnet_rename_seq);
1103 return err;
1104 }
1105
1106 rollback:
1107 ret = device_rename(&dev->dev, dev->name);
1108 if (ret) {
1109 memcpy(dev->name, oldname, IFNAMSIZ);
1110 write_seqcount_end(&devnet_rename_seq);
1111 return ret;
1112 }
1113
1114 write_seqcount_end(&devnet_rename_seq);
1115
1116 write_lock_bh(&dev_base_lock);
1117 hlist_del_rcu(&dev->name_hlist);
1118 write_unlock_bh(&dev_base_lock);
1119
1120 synchronize_rcu();
1121
1122 write_lock_bh(&dev_base_lock);
1123 hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1124 write_unlock_bh(&dev_base_lock);
1125
1126 ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1127 ret = notifier_to_errno(ret);
1128
1129 if (ret) {
1130 /* err >= 0 after dev_alloc_name() or stores the first errno */
1131 if (err >= 0) {
1132 err = ret;
1133 write_seqcount_begin(&devnet_rename_seq);
1134 memcpy(dev->name, oldname, IFNAMSIZ);
1135 goto rollback;
1136 } else {
1137 pr_err("%s: name change rollback failed: %d\n",
1138 dev->name, ret);
1139 }
1140 }
1141
1142 return err;
1143 }
1144
1145 /**
1146 * dev_set_alias - change ifalias of a device
1147 * @dev: device
1148 * @alias: name up to IFALIASZ
1149 * @len: limit of bytes to copy from info
1150 *
1151 * Set ifalias for a device,
1152 */
1153 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1154 {
1155 char *new_ifalias;
1156
1157 ASSERT_RTNL();
1158
1159 if (len >= IFALIASZ)
1160 return -EINVAL;
1161
1162 if (!len) {
1163 kfree(dev->ifalias);
1164 dev->ifalias = NULL;
1165 return 0;
1166 }
1167
1168 new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1169 if (!new_ifalias)
1170 return -ENOMEM;
1171 dev->ifalias = new_ifalias;
1172
1173 strlcpy(dev->ifalias, alias, len+1);
1174 return len;
1175 }
1176
1177
1178 /**
1179 * netdev_features_change - device changes features
1180 * @dev: device to cause notification
1181 *
1182 * Called to indicate a device has changed features.
1183 */
1184 void netdev_features_change(struct net_device *dev)
1185 {
1186 call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1187 }
1188 EXPORT_SYMBOL(netdev_features_change);
1189
1190 /**
1191 * netdev_state_change - device changes state
1192 * @dev: device to cause notification
1193 *
1194 * Called to indicate a device has changed state. This function calls
1195 * the notifier chains for netdev_chain and sends a NEWLINK message
1196 * to the routing socket.
1197 */
1198 void netdev_state_change(struct net_device *dev)
1199 {
1200 if (dev->flags & IFF_UP) {
1201 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1202 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1203 }
1204 }
1205 EXPORT_SYMBOL(netdev_state_change);
1206
1207 /**
1208 * netdev_notify_peers - notify network peers about existence of @dev
1209 * @dev: network device
1210 *
1211 * Generate traffic such that interested network peers are aware of
1212 * @dev, such as by generating a gratuitous ARP. This may be used when
1213 * a device wants to inform the rest of the network about some sort of
1214 * reconfiguration such as a failover event or virtual machine
1215 * migration.
1216 */
1217 void netdev_notify_peers(struct net_device *dev)
1218 {
1219 rtnl_lock();
1220 call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1221 rtnl_unlock();
1222 }
1223 EXPORT_SYMBOL(netdev_notify_peers);
1224
1225 static int __dev_open(struct net_device *dev)
1226 {
1227 const struct net_device_ops *ops = dev->netdev_ops;
1228 int ret;
1229
1230 ASSERT_RTNL();
1231
1232 if (!netif_device_present(dev))
1233 return -ENODEV;
1234
1235 /* Block netpoll from trying to do any rx path servicing.
1236 * If we don't do this there is a chance ndo_poll_controller
1237 * or ndo_poll may be running while we open the device
1238 */
1239 ret = netpoll_rx_disable(dev);
1240 if (ret)
1241 return ret;
1242
1243 ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1244 ret = notifier_to_errno(ret);
1245 if (ret)
1246 return ret;
1247
1248 set_bit(__LINK_STATE_START, &dev->state);
1249
1250 if (ops->ndo_validate_addr)
1251 ret = ops->ndo_validate_addr(dev);
1252
1253 if (!ret && ops->ndo_open)
1254 ret = ops->ndo_open(dev);
1255
1256 netpoll_rx_enable(dev);
1257
1258 if (ret)
1259 clear_bit(__LINK_STATE_START, &dev->state);
1260 else {
1261 dev->flags |= IFF_UP;
1262 net_dmaengine_get();
1263 dev_set_rx_mode(dev);
1264 dev_activate(dev);
1265 add_device_randomness(dev->dev_addr, dev->addr_len);
1266 }
1267
1268 return ret;
1269 }
1270
1271 /**
1272 * dev_open - prepare an interface for use.
1273 * @dev: device to open
1274 *
1275 * Takes a device from down to up state. The device's private open
1276 * function is invoked and then the multicast lists are loaded. Finally
1277 * the device is moved into the up state and a %NETDEV_UP message is
1278 * sent to the netdev notifier chain.
1279 *
1280 * Calling this function on an active interface is a nop. On a failure
1281 * a negative errno code is returned.
1282 */
1283 int dev_open(struct net_device *dev)
1284 {
1285 int ret;
1286
1287 if (dev->flags & IFF_UP)
1288 return 0;
1289
1290 ret = __dev_open(dev);
1291 if (ret < 0)
1292 return ret;
1293
1294 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1295 call_netdevice_notifiers(NETDEV_UP, dev);
1296
1297 return ret;
1298 }
1299 EXPORT_SYMBOL(dev_open);
1300
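/*
 * Illustrative sketch (not part of this file): bringing an interface up
 * from kernel code; the RTNL lock must be held around the call.
 *
 *	rtnl_lock();
 *	err = dev_open(dev);
 *	rtnl_unlock();
 */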
1301 static int __dev_close_many(struct list_head *head)
1302 {
1303 struct net_device *dev;
1304
1305 ASSERT_RTNL();
1306 might_sleep();
1307
1308 list_for_each_entry(dev, head, unreg_list) {
1309 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1310
1311 clear_bit(__LINK_STATE_START, &dev->state);
1312
1313 /* Synchronize to scheduled poll. We cannot touch poll list, it
1314 * can be even on different cpu. So just clear netif_running().
1315 *
1316  * dev->stop() will invoke napi_disable() on all of its
1317 * napi_struct instances on this device.
1318 */
1319 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1320 }
1321
1322 dev_deactivate_many(head);
1323
1324 list_for_each_entry(dev, head, unreg_list) {
1325 const struct net_device_ops *ops = dev->netdev_ops;
1326
1327 /*
1328 * Call the device specific close. This cannot fail.
1329 * Only if device is UP
1330 *
1331 * We allow it to be called even after a DETACH hot-plug
1332 * event.
1333 */
1334 if (ops->ndo_stop)
1335 ops->ndo_stop(dev);
1336
1337 dev->flags &= ~IFF_UP;
1338 net_dmaengine_put();
1339 }
1340
1341 return 0;
1342 }
1343
1344 static int __dev_close(struct net_device *dev)
1345 {
1346 int retval;
1347 LIST_HEAD(single);
1348
1349 /* Temporarily disable netpoll until the interface is down */
1350 retval = netpoll_rx_disable(dev);
1351 if (retval)
1352 return retval;
1353
1354 list_add(&dev->unreg_list, &single);
1355 retval = __dev_close_many(&single);
1356 list_del(&single);
1357
1358 netpoll_rx_enable(dev);
1359 return retval;
1360 }
1361
1362 static int dev_close_many(struct list_head *head)
1363 {
1364 struct net_device *dev, *tmp;
1365 LIST_HEAD(tmp_list);
1366
1367 list_for_each_entry_safe(dev, tmp, head, unreg_list)
1368 if (!(dev->flags & IFF_UP))
1369 list_move(&dev->unreg_list, &tmp_list);
1370
1371 __dev_close_many(head);
1372
1373 list_for_each_entry(dev, head, unreg_list) {
1374 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1375 call_netdevice_notifiers(NETDEV_DOWN, dev);
1376 }
1377
1378 /* rollback_registered_many needs the complete original list */
1379 list_splice(&tmp_list, head);
1380 return 0;
1381 }
1382
1383 /**
1384 * dev_close - shutdown an interface.
1385 * @dev: device to shutdown
1386 *
1387 * This function moves an active device into down state. A
1388 * %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1389 * is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1390 * chain.
1391 */
1392 int dev_close(struct net_device *dev)
1393 {
1394 int ret = 0;
1395 if (dev->flags & IFF_UP) {
1396 LIST_HEAD(single);
1397
1398 /* Block netpoll rx while the interface is going down */
1399 ret = netpoll_rx_disable(dev);
1400 if (ret)
1401 return ret;
1402
1403 list_add(&dev->unreg_list, &single);
1404 dev_close_many(&single);
1405 list_del(&single);
1406
1407 netpoll_rx_enable(dev);
1408 }
1409 return ret;
1410 }
1411 EXPORT_SYMBOL(dev_close);
1412
1413
1414 /**
1415 * dev_disable_lro - disable Large Receive Offload on a device
1416 * @dev: device
1417 *
1418 * Disable Large Receive Offload (LRO) on a net device. Must be
1419 * called under RTNL. This is needed if received packets may be
1420 * forwarded to another interface.
1421 */
1422 void dev_disable_lro(struct net_device *dev)
1423 {
1424 /*
1425 * If we're trying to disable lro on a vlan device
1426 * use the underlying physical device instead
1427 */
1428 if (is_vlan_dev(dev))
1429 dev = vlan_dev_real_dev(dev);
1430
1431 dev->wanted_features &= ~NETIF_F_LRO;
1432 netdev_update_features(dev);
1433
1434 if (unlikely(dev->features & NETIF_F_LRO))
1435 netdev_WARN(dev, "failed to disable LRO!\n");
1436 }
1437 EXPORT_SYMBOL(dev_disable_lro);
1438
1439
1440 static int dev_boot_phase = 1;
1441
1442 /**
1443 * register_netdevice_notifier - register a network notifier block
1444 * @nb: notifier
1445 *
1446 * Register a notifier to be called when network device events occur.
1447 * The notifier passed is linked into the kernel structures and must
1448 * not be reused until it has been unregistered. A negative errno code
1449 * is returned on a failure.
1450 *
1451 * When registered all registration and up events are replayed
1452  * to the new notifier so that it has a race-free
1453 * view of the network device list.
1454 */
1455
1456 int register_netdevice_notifier(struct notifier_block *nb)
1457 {
1458 struct net_device *dev;
1459 struct net_device *last;
1460 struct net *net;
1461 int err;
1462
1463 rtnl_lock();
1464 err = raw_notifier_chain_register(&netdev_chain, nb);
1465 if (err)
1466 goto unlock;
1467 if (dev_boot_phase)
1468 goto unlock;
1469 for_each_net(net) {
1470 for_each_netdev(net, dev) {
1471 err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1472 err = notifier_to_errno(err);
1473 if (err)
1474 goto rollback;
1475
1476 if (!(dev->flags & IFF_UP))
1477 continue;
1478
1479 nb->notifier_call(nb, NETDEV_UP, dev);
1480 }
1481 }
1482
1483 unlock:
1484 rtnl_unlock();
1485 return err;
1486
1487 rollback:
1488 last = dev;
1489 for_each_net(net) {
1490 for_each_netdev(net, dev) {
1491 if (dev == last)
1492 goto outroll;
1493
1494 if (dev->flags & IFF_UP) {
1495 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1496 nb->notifier_call(nb, NETDEV_DOWN, dev);
1497 }
1498 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1499 }
1500 }
1501
1502 outroll:
1503 raw_notifier_chain_unregister(&netdev_chain, nb);
1504 goto unlock;
1505 }
1506 EXPORT_SYMBOL(register_netdevice_notifier);
1507
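/*
 * Illustrative sketch (not part of this file): a minimal notifier;
 * my_netdev_event() is an assumed name. In this kernel the notifier is
 * handed the struct net_device pointer directly.
 *
 *	static int my_netdev_event(struct notifier_block *nb,
 *				   unsigned long event, void *ptr)
 *	{
 *		struct net_device *dev = ptr;
 *
 *		if (event == NETDEV_UP)
 *			pr_info("%s is up\n", dev->name);
 *		return NOTIFY_DONE;
 *	}
 *
 *	static struct notifier_block my_nb = {
 *		.notifier_call = my_netdev_event,
 *	};
 *
 *	register_netdevice_notifier(&my_nb);
 *	...
 *	unregister_netdevice_notifier(&my_nb);
 */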
1508 /**
1509 * unregister_netdevice_notifier - unregister a network notifier block
1510 * @nb: notifier
1511 *
1512 * Unregister a notifier previously registered by
1513  * register_netdevice_notifier(). The notifier is unlinked from the
1514 * kernel structures and may then be reused. A negative errno code
1515 * is returned on a failure.
1516 *
1517  * After unregistering, unregister and down device events are synthesized
1518 * for all devices on the device list to the removed notifier to remove
1519 * the need for special case cleanup code.
1520 */
1521
1522 int unregister_netdevice_notifier(struct notifier_block *nb)
1523 {
1524 struct net_device *dev;
1525 struct net *net;
1526 int err;
1527
1528 rtnl_lock();
1529 err = raw_notifier_chain_unregister(&netdev_chain, nb);
1530 if (err)
1531 goto unlock;
1532
1533 for_each_net(net) {
1534 for_each_netdev(net, dev) {
1535 if (dev->flags & IFF_UP) {
1536 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1537 nb->notifier_call(nb, NETDEV_DOWN, dev);
1538 }
1539 nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1540 }
1541 }
1542 unlock:
1543 rtnl_unlock();
1544 return err;
1545 }
1546 EXPORT_SYMBOL(unregister_netdevice_notifier);
1547
1548 /**
1549 * call_netdevice_notifiers - call all network notifier blocks
1550 * @val: value passed unmodified to notifier function
1551 * @dev: net_device pointer passed unmodified to notifier function
1552 *
1553 * Call all network notifier blocks. Parameters and return value
1554 * are as for raw_notifier_call_chain().
1555 */
1556
1557 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1558 {
1559 ASSERT_RTNL();
1560 return raw_notifier_call_chain(&netdev_chain, val, dev);
1561 }
1562 EXPORT_SYMBOL(call_netdevice_notifiers);
1563
1564 static struct static_key netstamp_needed __read_mostly;
1565 #ifdef HAVE_JUMP_LABEL
1566 static atomic_t netstamp_needed_deferred;
1567 static atomic_t netstamp_wanted;
1568 static void netstamp_clear(struct work_struct *work)
1569 {
1570 int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1571 int wanted;
1572
1573 wanted = atomic_add_return(deferred, &netstamp_wanted);
1574 if (wanted > 0)
1575 static_key_enable(&netstamp_needed);
1576 else
1577 static_key_disable(&netstamp_needed);
1578 }
1579 static DECLARE_WORK(netstamp_work, netstamp_clear);
1580 #endif
1581
1582 void net_enable_timestamp(void)
1583 {
1584 #ifdef HAVE_JUMP_LABEL
1585 int wanted;
1586
1587 while (1) {
1588 wanted = atomic_read(&netstamp_wanted);
1589 if (wanted <= 0)
1590 break;
1591 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted + 1) == wanted)
1592 return;
1593 }
1594 atomic_inc(&netstamp_needed_deferred);
1595 schedule_work(&netstamp_work);
1596 #else
1597 static_key_slow_inc(&netstamp_needed);
1598 #endif
1599 }
1600 EXPORT_SYMBOL(net_enable_timestamp);
1601
1602 void net_disable_timestamp(void)
1603 {
1604 #ifdef HAVE_JUMP_LABEL
1605 int wanted;
1606
1607 while (1) {
1608 wanted = atomic_read(&netstamp_wanted);
1609 if (wanted <= 1)
1610 break;
1611 if (atomic_cmpxchg(&netstamp_wanted, wanted, wanted - 1) == wanted)
1612 return;
1613 }
1614 atomic_dec(&netstamp_needed_deferred);
1615 schedule_work(&netstamp_work);
1616 #else
1617 static_key_slow_dec(&netstamp_needed);
1618 #endif
1619 }
1620 EXPORT_SYMBOL(net_disable_timestamp);
1621
1622 static inline void net_timestamp_set(struct sk_buff *skb)
1623 {
1624 skb->tstamp.tv64 = 0;
1625 if (static_key_false(&netstamp_needed))
1626 __net_timestamp(skb);
1627 }
1628
1629 #define net_timestamp_check(COND, SKB) \
1630 if (static_key_false(&netstamp_needed)) { \
1631 if ((COND) && !(SKB)->tstamp.tv64) \
1632 __net_timestamp(SKB); \
1633 } \
1634
1635 static inline bool is_skb_forwardable(struct net_device *dev,
1636 struct sk_buff *skb)
1637 {
1638 unsigned int len;
1639
1640 if (!(dev->flags & IFF_UP))
1641 return false;
1642
1643 len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1644 if (skb->len <= len)
1645 return true;
1646
1647 /* if TSO is enabled, we don't care about the length as the packet
1648 * could be forwarded without being segmented before
1649 */
1650 if (skb_is_gso(skb))
1651 return true;
1652
1653 return false;
1654 }
1655
1656 /**
1657 * dev_forward_skb - loopback an skb to another netif
1658 *
1659 * @dev: destination network device
1660 * @skb: buffer to forward
1661 *
1662 * return values:
1663 * NET_RX_SUCCESS (no congestion)
1664 * NET_RX_DROP (packet was dropped, but freed)
1665 *
1666 * dev_forward_skb can be used for injecting an skb from the
1667 * start_xmit function of one device into the receive queue
1668 * of another device.
1669 *
1670 * The receiving device may be in another namespace, so
1671 * we have to clear all information in the skb that could
1672 * impact namespace isolation.
1673 */
1674 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1675 {
1676 if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1677 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1678 atomic_long_inc(&dev->rx_dropped);
1679 kfree_skb(skb);
1680 return NET_RX_DROP;
1681 }
1682 }
1683
1684 skb_orphan(skb);
1685
1686 if (unlikely(!is_skb_forwardable(dev, skb))) {
1687 atomic_long_inc(&dev->rx_dropped);
1688 kfree_skb(skb);
1689 return NET_RX_DROP;
1690 }
1691 skb->skb_iif = 0;
1692 skb->dev = dev;
1693 skb_dst_drop(skb);
1694 skb->tstamp.tv64 = 0;
1695 skb->pkt_type = PACKET_HOST;
1696 skb->protocol = eth_type_trans(skb, dev);
1697 skb->mark = 0;
1698 secpath_reset(skb);
1699 nf_reset(skb);
1700 nf_reset_trace(skb);
1701 return netif_rx(skb);
1702 }
1703 EXPORT_SYMBOL_GPL(dev_forward_skb);
1704
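/*
 * Illustrative sketch (not part of this file): how a veth-style virtual
 * device might hand a frame to its peer from ndo_start_xmit(); "peer"
 * and my_get_peer() are assumptions kept by such a driver.
 *
 *	static netdev_tx_t my_xmit(struct sk_buff *skb, struct net_device *dev)
 *	{
 *		struct net_device *peer = my_get_peer(dev);
 *
 *		if (dev_forward_skb(peer, skb) == NET_RX_SUCCESS)
 *			dev->stats.tx_packets++;
 *		else
 *			dev->stats.tx_dropped++;
 *		return NETDEV_TX_OK;
 *	}
 */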
1705 static inline int deliver_skb(struct sk_buff *skb,
1706 struct packet_type *pt_prev,
1707 struct net_device *orig_dev)
1708 {
1709 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1710 return -ENOMEM;
1711 atomic_inc(&skb->users);
1712 return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1713 }
1714
1715 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1716 {
1717 if (!ptype->af_packet_priv || !skb->sk)
1718 return false;
1719
1720 if (ptype->id_match)
1721 return ptype->id_match(ptype, skb->sk);
1722 else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1723 return true;
1724
1725 return false;
1726 }
1727
1728 /*
1729 * Support routine. Sends outgoing frames to any network
1730 * taps currently in use.
1731 */
1732
1733 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1734 {
1735 struct packet_type *ptype;
1736 struct sk_buff *skb2 = NULL;
1737 struct packet_type *pt_prev = NULL;
1738
1739 rcu_read_lock();
1740 list_for_each_entry_rcu(ptype, &ptype_all, list) {
1741 /* Never send packets back to the socket
1742 * they originated from - MvS (miquels@drinkel.ow.org)
1743 */
1744 if ((ptype->dev == dev || !ptype->dev) &&
1745 (!skb_loop_sk(ptype, skb))) {
1746 if (pt_prev) {
1747 deliver_skb(skb2, pt_prev, skb->dev);
1748 pt_prev = ptype;
1749 continue;
1750 }
1751
1752 skb2 = skb_clone(skb, GFP_ATOMIC);
1753 if (!skb2)
1754 break;
1755
1756 net_timestamp_set(skb2);
1757
1758 /* skb->nh should be correctly
1759 set by sender, so that the second statement is
1760 just protection against buggy protocols.
1761 */
1762 skb_reset_mac_header(skb2);
1763
1764 if (skb_network_header(skb2) < skb2->data ||
1765 skb2->network_header > skb2->tail) {
1766 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1767 ntohs(skb2->protocol),
1768 dev->name);
1769 skb_reset_network_header(skb2);
1770 }
1771
1772 skb2->transport_header = skb2->network_header;
1773 skb2->pkt_type = PACKET_OUTGOING;
1774 pt_prev = ptype;
1775 }
1776 }
1777 if (pt_prev)
1778 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1779 rcu_read_unlock();
1780 }
1781
1782 /**
1783 * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1784 * @dev: Network device
1785 * @txq: number of queues available
1786 *
1787 * If real_num_tx_queues is changed the tc mappings may no longer be
1788  * valid. To resolve this verify the tc mapping remains valid and if
1789  * not, NULL the mapping. With no priorities mapping to this
1790  * offset/count pair it will no longer be used. In the worst case, if TC0
1791  * is invalid, nothing can be done, so disable priority mappings. It is
1792  * expected that drivers will fix this mapping if they can before
1793 * calling netif_set_real_num_tx_queues.
1794 */
1795 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1796 {
1797 int i;
1798 struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1799
1800 /* If TC0 is invalidated disable TC mapping */
1801 if (tc->offset + tc->count > txq) {
1802 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1803 dev->num_tc = 0;
1804 return;
1805 }
1806
1807 /* Invalidated prio to tc mappings set to TC0 */
1808 for (i = 1; i < TC_BITMASK + 1; i++) {
1809 int q = netdev_get_prio_tc_map(dev, i);
1810
1811 tc = &dev->tc_to_txq[q];
1812 if (tc->offset + tc->count > txq) {
1813 pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1814 i, q);
1815 netdev_set_prio_tc_map(dev, i, 0);
1816 }
1817 }
1818 }
1819
1820 #ifdef CONFIG_XPS
1821 static DEFINE_MUTEX(xps_map_mutex);
1822 #define xmap_dereference(P) \
1823 rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1824
1825 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1826 int cpu, u16 index)
1827 {
1828 struct xps_map *map = NULL;
1829 int pos;
1830
1831 if (dev_maps)
1832 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1833
1834 for (pos = 0; map && pos < map->len; pos++) {
1835 if (map->queues[pos] == index) {
1836 if (map->len > 1) {
1837 map->queues[pos] = map->queues[--map->len];
1838 } else {
1839 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1840 kfree_rcu(map, rcu);
1841 map = NULL;
1842 }
1843 break;
1844 }
1845 }
1846
1847 return map;
1848 }
1849
1850 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1851 {
1852 struct xps_dev_maps *dev_maps;
1853 int cpu, i;
1854 bool active = false;
1855
1856 mutex_lock(&xps_map_mutex);
1857 dev_maps = xmap_dereference(dev->xps_maps);
1858
1859 if (!dev_maps)
1860 goto out_no_maps;
1861
1862 for_each_possible_cpu(cpu) {
1863 for (i = index; i < dev->num_tx_queues; i++) {
1864 if (!remove_xps_queue(dev_maps, cpu, i))
1865 break;
1866 }
1867 if (i == dev->num_tx_queues)
1868 active = true;
1869 }
1870
1871 if (!active) {
1872 RCU_INIT_POINTER(dev->xps_maps, NULL);
1873 kfree_rcu(dev_maps, rcu);
1874 }
1875
1876 for (i = index; i < dev->num_tx_queues; i++)
1877 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1878 NUMA_NO_NODE);
1879
1880 out_no_maps:
1881 mutex_unlock(&xps_map_mutex);
1882 }
1883
1884 static struct xps_map *expand_xps_map(struct xps_map *map,
1885 int cpu, u16 index)
1886 {
1887 struct xps_map *new_map;
1888 int alloc_len = XPS_MIN_MAP_ALLOC;
1889 int i, pos;
1890
1891 for (pos = 0; map && pos < map->len; pos++) {
1892 if (map->queues[pos] != index)
1893 continue;
1894 return map;
1895 }
1896
1897 /* Need to add queue to this CPU's existing map */
1898 if (map) {
1899 if (pos < map->alloc_len)
1900 return map;
1901
1902 alloc_len = map->alloc_len * 2;
1903 }
1904
1905 /* Need to allocate new map to store queue on this CPU's map */
1906 new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1907 cpu_to_node(cpu));
1908 if (!new_map)
1909 return NULL;
1910
1911 for (i = 0; i < pos; i++)
1912 new_map->queues[i] = map->queues[i];
1913 new_map->alloc_len = alloc_len;
1914 new_map->len = pos;
1915
1916 return new_map;
1917 }
1918
1919 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1920 {
1921 struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1922 struct xps_map *map, *new_map;
1923 int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1924 int cpu, numa_node_id = -2;
1925 bool active = false;
1926
1927 mutex_lock(&xps_map_mutex);
1928
1929 dev_maps = xmap_dereference(dev->xps_maps);
1930
1931 /* allocate memory for queue storage */
1932 for_each_online_cpu(cpu) {
1933 if (!cpumask_test_cpu(cpu, mask))
1934 continue;
1935
1936 if (!new_dev_maps)
1937 new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1938 if (!new_dev_maps) {
1939 mutex_unlock(&xps_map_mutex);
1940 return -ENOMEM;
1941 }
1942
1943 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1944 NULL;
1945
1946 map = expand_xps_map(map, cpu, index);
1947 if (!map)
1948 goto error;
1949
1950 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1951 }
1952
1953 if (!new_dev_maps)
1954 goto out_no_new_maps;
1955
1956 for_each_possible_cpu(cpu) {
1957 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1958 /* add queue to CPU maps */
1959 int pos = 0;
1960
1961 map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1962 while ((pos < map->len) && (map->queues[pos] != index))
1963 pos++;
1964
1965 if (pos == map->len)
1966 map->queues[map->len++] = index;
1967 #ifdef CONFIG_NUMA
1968 if (numa_node_id == -2)
1969 numa_node_id = cpu_to_node(cpu);
1970 else if (numa_node_id != cpu_to_node(cpu))
1971 numa_node_id = -1;
1972 #endif
1973 } else if (dev_maps) {
1974 /* fill in the new device map from the old device map */
1975 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1976 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1977 }
1978
1979 }
1980
1981 rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1982
1983 /* Cleanup old maps */
1984 if (dev_maps) {
1985 for_each_possible_cpu(cpu) {
1986 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1987 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1988 if (map && map != new_map)
1989 kfree_rcu(map, rcu);
1990 }
1991
1992 kfree_rcu(dev_maps, rcu);
1993 }
1994
1995 dev_maps = new_dev_maps;
1996 active = true;
1997
1998 out_no_new_maps:
1999 /* update Tx queue numa node */
2000 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
2001 (numa_node_id >= 0) ? numa_node_id :
2002 NUMA_NO_NODE);
2003
2004 if (!dev_maps)
2005 goto out_no_maps;
2006
2007 	/* remove the queue from any CPUs not in the new mask */
2008 for_each_possible_cpu(cpu) {
2009 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
2010 continue;
2011
2012 if (remove_xps_queue(dev_maps, cpu, index))
2013 active = true;
2014 }
2015
2016 /* free map if not active */
2017 if (!active) {
2018 RCU_INIT_POINTER(dev->xps_maps, NULL);
2019 kfree_rcu(dev_maps, rcu);
2020 }
2021
2022 out_no_maps:
2023 mutex_unlock(&xps_map_mutex);
2024
2025 return 0;
2026 error:
2027 /* remove any maps that we added */
2028 for_each_possible_cpu(cpu) {
2029 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2030 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2031 NULL;
2032 if (new_map && new_map != map)
2033 kfree(new_map);
2034 }
2035
2036 mutex_unlock(&xps_map_mutex);
2037
2038 kfree(new_dev_maps);
2039 return -ENOMEM;
2040 }
2041 EXPORT_SYMBOL(netif_set_xps_queue);
2042
2043 #endif
2044 /*
2045 * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2046  * greater than real_num_tx_queues, stale skbs on the qdisc must be flushed.
2047 */
2048 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2049 {
2050 int rc;
2051
2052 if (txq < 1 || txq > dev->num_tx_queues)
2053 return -EINVAL;
2054
2055 if (dev->reg_state == NETREG_REGISTERED ||
2056 dev->reg_state == NETREG_UNREGISTERING) {
2057 ASSERT_RTNL();
2058
2059 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2060 txq);
2061 if (rc)
2062 return rc;
2063
2064 if (dev->num_tc)
2065 netif_setup_tc(dev, txq);
2066
2067 if (txq < dev->real_num_tx_queues) {
2068 qdisc_reset_all_tx_gt(dev, txq);
2069 #ifdef CONFIG_XPS
2070 netif_reset_xps_queues_gt(dev, txq);
2071 #endif
2072 }
2073 }
2074
2075 dev->real_num_tx_queues = txq;
2076 return 0;
2077 }
2078 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2079
2080 #ifdef CONFIG_RPS
2081 /**
2082 * netif_set_real_num_rx_queues - set actual number of RX queues used
2083 * @dev: Network device
2084 * @rxq: Actual number of RX queues
2085 *
2086 * This must be called either with the rtnl_lock held or before
2087 * registration of the net device. Returns 0 on success, or a
2088 * negative error code. If called before registration, it always
2089 * succeeds.
2090 */
2091 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2092 {
2093 int rc;
2094
2095 if (rxq < 1 || rxq > dev->num_rx_queues)
2096 return -EINVAL;
2097
2098 if (dev->reg_state == NETREG_REGISTERED) {
2099 ASSERT_RTNL();
2100
2101 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2102 rxq);
2103 if (rc)
2104 return rc;
2105 }
2106
2107 dev->real_num_rx_queues = rxq;
2108 return 0;
2109 }
2110 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2111 #endif
2112
2113 /**
2114 * netif_get_num_default_rss_queues - default number of RSS queues
2115 *
2116 * This routine should set an upper limit on the number of RSS queues
2117 * used by default by multiqueue devices.
2118 */
2119 int netif_get_num_default_rss_queues(void)
2120 {
2121 return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2122 }
2123 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
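
/*
 * Illustrative (hypothetical) driver usage: a multiqueue driver would
 * typically clamp its queue count with this helper, e.g.
 *
 *	queues = min_t(int, hw_max_queues,
 *		       netif_get_num_default_rss_queues());
 *	netif_set_real_num_tx_queues(dev, queues);
 *
 * where hw_max_queues stands for whatever the hardware supports.
 */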
2124
2125 static inline void __netif_reschedule(struct Qdisc *q)
2126 {
2127 struct softnet_data *sd;
2128 unsigned long flags;
2129
2130 local_irq_save(flags);
2131 sd = &__get_cpu_var(softnet_data);
2132 q->next_sched = NULL;
2133 *sd->output_queue_tailp = q;
2134 sd->output_queue_tailp = &q->next_sched;
2135 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2136 local_irq_restore(flags);
2137 }
2138
2139 void __netif_schedule(struct Qdisc *q)
2140 {
2141 if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2142 __netif_reschedule(q);
2143 }
2144 EXPORT_SYMBOL(__netif_schedule);
2145
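/*
 * dev_kfree_skb_irq - free an skb from hardirq context.  The skb is placed
 * on the per-CPU completion queue and actually freed later by
 * net_tx_action() in softirq context.
 */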
2146 void dev_kfree_skb_irq(struct sk_buff *skb)
2147 {
2148 if (atomic_dec_and_test(&skb->users)) {
2149 struct softnet_data *sd;
2150 unsigned long flags;
2151
2152 local_irq_save(flags);
2153 sd = &__get_cpu_var(softnet_data);
2154 skb->next = sd->completion_queue;
2155 sd->completion_queue = skb;
2156 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2157 local_irq_restore(flags);
2158 }
2159 }
2160 EXPORT_SYMBOL(dev_kfree_skb_irq);
2161
2162 void dev_kfree_skb_any(struct sk_buff *skb)
2163 {
2164 if (in_irq() || irqs_disabled())
2165 dev_kfree_skb_irq(skb);
2166 else
2167 dev_kfree_skb(skb);
2168 }
2169 EXPORT_SYMBOL(dev_kfree_skb_any);
2170
2171
2172 /**
2173 * netif_device_detach - mark device as removed
2174 * @dev: network device
2175 *
2176 * Mark device as removed from system and therefore no longer available.
2177 */
2178 void netif_device_detach(struct net_device *dev)
2179 {
2180 if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2181 netif_running(dev)) {
2182 netif_tx_stop_all_queues(dev);
2183 }
2184 }
2185 EXPORT_SYMBOL(netif_device_detach);
2186
2187 /**
2188 * netif_device_attach - mark device as attached
2189 * @dev: network device
2190 *
2191  * Mark device as attached to the system and restart it if needed.
2192 */
2193 void netif_device_attach(struct net_device *dev)
2194 {
2195 if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2196 netif_running(dev)) {
2197 netif_tx_wake_all_queues(dev);
2198 __netdev_watchdog_up(dev);
2199 }
2200 }
2201 EXPORT_SYMBOL(netif_device_attach);
2202
2203 static void skb_warn_bad_offload(const struct sk_buff *skb)
2204 {
2205 static const netdev_features_t null_features = 0;
2206 struct net_device *dev = skb->dev;
2207 const char *driver = "";
2208
2209 if (!net_ratelimit())
2210 return;
2211
2212 if (dev && dev->dev.parent)
2213 driver = dev_driver_string(dev->dev.parent);
2214
2215 WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2216 "gso_type=%d ip_summed=%d\n",
2217 driver, dev ? &dev->features : &null_features,
2218 skb->sk ? &skb->sk->sk_route_caps : &null_features,
2219 skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2220 skb_shinfo(skb)->gso_type, skb->ip_summed);
2221 }
2222
2223 /*
2224 * Invalidate hardware checksum when packet is to be mangled, and
2225 * complete checksum manually on outgoing path.
2226 */
2227 int skb_checksum_help(struct sk_buff *skb)
2228 {
2229 __wsum csum;
2230 int ret = 0, offset;
2231
2232 if (skb->ip_summed == CHECKSUM_COMPLETE)
2233 goto out_set_summed;
2234
2235 if (unlikely(skb_shinfo(skb)->gso_size)) {
2236 skb_warn_bad_offload(skb);
2237 return -EINVAL;
2238 }
2239
2240 /* Before computing a checksum, we should make sure no frag could
2241 	 * be modified by an external entity: the checksum could be wrong.
2242 */
2243 if (skb_has_shared_frag(skb)) {
2244 ret = __skb_linearize(skb);
2245 if (ret)
2246 goto out;
2247 }
2248
2249 offset = skb_checksum_start_offset(skb);
2250 BUG_ON(offset >= skb_headlen(skb));
2251 csum = skb_checksum(skb, offset, skb->len - offset, 0);
2252
2253 offset += skb->csum_offset;
2254 BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2255
2256 if (skb_cloned(skb) &&
2257 !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2258 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2259 if (ret)
2260 goto out;
2261 }
2262
2263 *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0;
2264 out_set_summed:
2265 skb->ip_summed = CHECKSUM_NONE;
2266 out:
2267 return ret;
2268 }
2269 EXPORT_SYMBOL(skb_checksum_help);
2270
2271 __be16 skb_network_protocol(struct sk_buff *skb)
2272 {
2273 __be16 type = skb->protocol;
2274 int vlan_depth = ETH_HLEN;
2275
2276 /* Tunnel gso handlers can set protocol to ethernet. */
2277 if (type == htons(ETH_P_TEB)) {
2278 struct ethhdr *eth;
2279
2280 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2281 return 0;
2282
2283 eth = (struct ethhdr *)skb_mac_header(skb);
2284 type = eth->h_proto;
2285 }
2286
2287 while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2288 struct vlan_hdr *vh;
2289
2290 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2291 return 0;
2292
2293 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2294 type = vh->h_vlan_encapsulated_proto;
2295 vlan_depth += VLAN_HLEN;
2296 }
2297
2298 return type;
2299 }
2300
2301 /**
2302 * skb_mac_gso_segment - mac layer segmentation handler.
2303 * @skb: buffer to segment
2304 * @features: features for the output path (see dev->features)
2305 */
2306 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2307 netdev_features_t features)
2308 {
2309 struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2310 struct packet_offload *ptype;
2311 __be16 type = skb_network_protocol(skb);
2312
2313 if (unlikely(!type))
2314 return ERR_PTR(-EINVAL);
2315
2316 __skb_pull(skb, skb->mac_len);
2317
2318 rcu_read_lock();
2319 list_for_each_entry_rcu(ptype, &offload_base, list) {
2320 if (ptype->type == type && ptype->callbacks.gso_segment) {
2321 if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2322 int err;
2323
2324 err = ptype->callbacks.gso_send_check(skb);
2325 segs = ERR_PTR(err);
2326 if (err || skb_gso_ok(skb, features))
2327 break;
2328 __skb_push(skb, (skb->data -
2329 skb_network_header(skb)));
2330 }
2331 segs = ptype->callbacks.gso_segment(skb, features);
2332 break;
2333 }
2334 }
2335 rcu_read_unlock();
2336
2337 __skb_push(skb, skb->data - skb_mac_header(skb));
2338
2339 return segs;
2340 }
2341 EXPORT_SYMBOL(skb_mac_gso_segment);
2342
2343
2344 /* openvswitch calls this on rx path, so we need a different check.
2345 */
2346 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2347 {
2348 if (tx_path)
2349 return skb->ip_summed != CHECKSUM_PARTIAL &&
2350 skb->ip_summed != CHECKSUM_NONE;
2351
2352 return skb->ip_summed == CHECKSUM_NONE;
2353 }
2354
2355 /**
2356 * __skb_gso_segment - Perform segmentation on skb.
2357 * @skb: buffer to segment
2358 * @features: features for the output path (see dev->features)
2359 * @tx_path: whether it is called in TX path
2360 *
2361 * This function segments the given skb and returns a list of segments.
2362 *
2363 * It may return NULL if the skb requires no segmentation. This is
2364 * only possible when GSO is used for verifying header integrity.
2365 */
2366 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2367 netdev_features_t features, bool tx_path)
2368 {
2369 struct sk_buff *segs;
2370
2371 if (unlikely(skb_needs_check(skb, tx_path))) {
2372 int err;
2373
2374 /* We're going to init ->check field in TCP or UDP header */
2375 if (skb_header_cloned(skb) &&
2376 (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2377 return ERR_PTR(err);
2378 }
2379
2380 SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2381 skb_reset_mac_header(skb);
2382 skb_reset_mac_len(skb);
2383
2384 segs = skb_mac_gso_segment(skb, features);
2385
2386 if (unlikely(skb_needs_check(skb, tx_path)))
2387 skb_warn_bad_offload(skb);
2388
2389 return segs;
2390 }
2391 EXPORT_SYMBOL(__skb_gso_segment);
2392
2393 /* Take action when hardware reception checksum errors are detected. */
2394 #ifdef CONFIG_BUG
2395 void netdev_rx_csum_fault(struct net_device *dev)
2396 {
2397 if (net_ratelimit()) {
2398 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2399 dump_stack();
2400 }
2401 }
2402 EXPORT_SYMBOL(netdev_rx_csum_fault);
2403 #endif
2404
2405 /* Actually, we should eliminate this check as soon as we know that:
2406  * 1. An IOMMU is present and can map all of the memory.
2407 * 2. No high memory really exists on this machine.
2408 */
2409
2410 static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb)
2411 {
2412 #ifdef CONFIG_HIGHMEM
2413 int i;
2414 if (!(dev->features & NETIF_F_HIGHDMA)) {
2415 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2416 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2417 if (PageHighMem(skb_frag_page(frag)))
2418 return 1;
2419 }
2420 }
2421
2422 if (PCI_DMA_BUS_IS_PHYS) {
2423 struct device *pdev = dev->dev.parent;
2424
2425 if (!pdev)
2426 return 0;
2427 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
2428 skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
2429 dma_addr_t addr = page_to_phys(skb_frag_page(frag));
2430 if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
2431 return 1;
2432 }
2433 }
2434 #endif
2435 return 0;
2436 }
2437
2438 struct dev_gso_cb {
2439 void (*destructor)(struct sk_buff *skb);
2440 };
2441
2442 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2443
2444 static void dev_gso_skb_destructor(struct sk_buff *skb)
2445 {
2446 struct dev_gso_cb *cb;
2447
2448 do {
2449 struct sk_buff *nskb = skb->next;
2450
2451 skb->next = nskb->next;
2452 nskb->next = NULL;
2453 kfree_skb(nskb);
2454 } while (skb->next);
2455
2456 cb = DEV_GSO_CB(skb);
2457 if (cb->destructor)
2458 cb->destructor(skb);
2459 }
2460
2461 /**
2462 * dev_gso_segment - Perform emulated hardware segmentation on skb.
2463 * @skb: buffer to segment
2464 * @features: device features as applicable to this skb
2465 *
2466 * This function segments the given skb and stores the list of segments
2467 * in skb->next.
2468 */
2469 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2470 {
2471 struct sk_buff *segs;
2472
2473 segs = skb_gso_segment(skb, features);
2474
2475 /* Verifying header integrity only. */
2476 if (!segs)
2477 return 0;
2478
2479 if (IS_ERR(segs))
2480 return PTR_ERR(segs);
2481
2482 skb->next = segs;
2483 DEV_GSO_CB(skb)->destructor = skb->destructor;
2484 skb->destructor = dev_gso_skb_destructor;
2485
2486 return 0;
2487 }
2488
2489 static netdev_features_t harmonize_features(struct sk_buff *skb,
2490 __be16 protocol,
2491 const struct net_device *dev,
2492 netdev_features_t features)
2493 {
2494 if (skb->ip_summed != CHECKSUM_NONE &&
2495 !can_checksum_protocol(features, protocol)) {
2496 features &= ~NETIF_F_ALL_CSUM;
2497 }
2498 if (illegal_highdma(dev, skb))
2499 features &= ~NETIF_F_SG;
2500
2501 return features;
2502 }
2503
2504 netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
2505 const struct net_device *dev)
2506 {
2507 __be16 protocol = skb->protocol;
2508 netdev_features_t features = dev->features;
2509
2510 if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs)
2511 features &= ~NETIF_F_GSO_MASK;
2512
2513 if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2514 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2515 protocol = veh->h_vlan_encapsulated_proto;
2516 } else if (!vlan_tx_tag_present(skb)) {
2517 return harmonize_features(skb, protocol, dev, features);
2518 }
2519
2520 features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2521 NETIF_F_HW_VLAN_STAG_TX);
2522
2523 if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
2524 return harmonize_features(skb, protocol, dev, features);
2525 } else {
2526 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2527 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2528 NETIF_F_HW_VLAN_STAG_TX;
2529 return harmonize_features(skb, protocol, dev, features);
2530 }
2533 }
2534 EXPORT_SYMBOL(netif_skb_dev_features);
2535
2536 /*
2537 * Returns true if either:
2538 * 1. skb has frag_list and the device doesn't support FRAGLIST, or
2539 * 2. skb is fragmented and the device does not support SG.
2540 */
2541 static inline int skb_needs_linearize(struct sk_buff *skb,
2542 netdev_features_t features)
2543 {
2544 return skb_is_nonlinear(skb) &&
2545 ((skb_has_frag_list(skb) &&
2546 !(features & NETIF_F_FRAGLIST)) ||
2547 (skb_shinfo(skb)->nr_frags &&
2548 !(features & NETIF_F_SG)));
2549 }
2550
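/*
 * dev_hard_start_xmit - hand an skb (or a GSO segment list) to the driver's
 * ndo_start_xmit().  Performs the last software fixups the device cannot do
 * itself: VLAN tag insertion, GSO segmentation, linearization and checksum
 * completion, and delivers a copy to any taps registered in ptype_all.
 */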
2551 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2552 struct netdev_queue *txq)
2553 {
2554 const struct net_device_ops *ops = dev->netdev_ops;
2555 int rc = NETDEV_TX_OK;
2556 unsigned int skb_len;
2557
2558 if (likely(!skb->next)) {
2559 netdev_features_t features;
2560
2561 /*
2562 * If device doesn't need skb->dst, release it right now while
2563 		 * it's hot in this CPU's cache.
2564 */
2565 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2566 skb_dst_drop(skb);
2567
2568 features = netif_skb_features(skb);
2569
2570 if (vlan_tx_tag_present(skb) &&
2571 !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2572 skb = __vlan_put_tag(skb, skb->vlan_proto,
2573 vlan_tx_tag_get(skb));
2574 if (unlikely(!skb))
2575 goto out;
2576
2577 skb->vlan_tci = 0;
2578 }
2579
2580 		/* If this is an encapsulation offload request, verify we are testing
2581 * hardware encapsulation features instead of standard
2582 * features for the netdev
2583 */
2584 if (skb->encapsulation)
2585 features &= dev->hw_enc_features;
2586
2587 if (netif_needs_gso(skb, features)) {
2588 if (unlikely(dev_gso_segment(skb, features)))
2589 goto out_kfree_skb;
2590 if (skb->next)
2591 goto gso;
2592 } else {
2593 if (skb_needs_linearize(skb, features) &&
2594 __skb_linearize(skb))
2595 goto out_kfree_skb;
2596
2597 /* If packet is not checksummed and device does not
2598 * support checksumming for this protocol, complete
2599 * checksumming here.
2600 */
2601 if (skb->ip_summed == CHECKSUM_PARTIAL) {
2602 if (skb->encapsulation)
2603 skb_set_inner_transport_header(skb,
2604 skb_checksum_start_offset(skb));
2605 else
2606 skb_set_transport_header(skb,
2607 skb_checksum_start_offset(skb));
2608 if (!(features & NETIF_F_ALL_CSUM) &&
2609 skb_checksum_help(skb))
2610 goto out_kfree_skb;
2611 }
2612 }
2613
2614 if (!list_empty(&ptype_all))
2615 dev_queue_xmit_nit(skb, dev);
2616
2617 skb_len = skb->len;
2618 rc = ops->ndo_start_xmit(skb, dev);
2619 trace_net_dev_xmit(skb, rc, dev, skb_len);
2620 if (rc == NETDEV_TX_OK)
2621 txq_trans_update(txq);
2622 return rc;
2623 }
2624
2625 gso:
2626 do {
2627 struct sk_buff *nskb = skb->next;
2628
2629 skb->next = nskb->next;
2630 nskb->next = NULL;
2631
2632 if (!list_empty(&ptype_all))
2633 dev_queue_xmit_nit(nskb, dev);
2634
2635 skb_len = nskb->len;
2636 rc = ops->ndo_start_xmit(nskb, dev);
2637 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2638 if (unlikely(rc != NETDEV_TX_OK)) {
2639 if (rc & ~NETDEV_TX_MASK)
2640 goto out_kfree_gso_skb;
2641 nskb->next = skb->next;
2642 skb->next = nskb;
2643 return rc;
2644 }
2645 txq_trans_update(txq);
2646 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2647 return NETDEV_TX_BUSY;
2648 } while (skb->next);
2649
2650 out_kfree_gso_skb:
2651 if (likely(skb->next == NULL)) {
2652 skb->destructor = DEV_GSO_CB(skb)->destructor;
2653 consume_skb(skb);
2654 return rc;
2655 }
2656 out_kfree_skb:
2657 kfree_skb(skb);
2658 out:
2659 return rc;
2660 }
2661
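/*
 * Initialize qdisc_skb_cb(skb)->pkt_len.  For GSO skbs the header size of
 * every segment is added so that byte accounting reflects what will actually
 * go out on the wire.
 */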
2662 static void qdisc_pkt_len_init(struct sk_buff *skb)
2663 {
2664 const struct skb_shared_info *shinfo = skb_shinfo(skb);
2665
2666 qdisc_skb_cb(skb)->pkt_len = skb->len;
2667
2668 	/* To get a more precise estimate of the bytes sent on the wire,
2669 	 * we add the header size of every segment to pkt_len
2670 */
2671 if (shinfo->gso_size) {
2672 unsigned int hdr_len;
2673 u16 gso_segs = shinfo->gso_segs;
2674
2675 /* mac layer + network layer */
2676 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2677
2678 /* + transport layer */
2679 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2680 hdr_len += tcp_hdrlen(skb);
2681 else
2682 hdr_len += sizeof(struct udphdr);
2683
2684 if (shinfo->gso_type & SKB_GSO_DODGY)
2685 gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2686 shinfo->gso_size);
2687
2688 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2689 }
2690 }
2691
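/*
 * Enqueue an skb to a qdisc and run it.  A work-conserving qdisc that is
 * empty and not already running is bypassed and the skb transmitted
 * directly (TCQ_F_CAN_BYPASS); otherwise the skb is enqueued under the
 * qdisc root lock, with a secondary "busylock" used to reduce contention
 * on the root lock.
 */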
2692 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2693 struct net_device *dev,
2694 struct netdev_queue *txq)
2695 {
2696 spinlock_t *root_lock = qdisc_lock(q);
2697 bool contended;
2698 int rc;
2699
2700 qdisc_pkt_len_init(skb);
2701 qdisc_calculate_pkt_len(skb, q);
2702 /*
2703 * Heuristic to force contended enqueues to serialize on a
2704 * separate lock before trying to get qdisc main lock.
2705 * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2706 * and dequeue packets faster.
2707 */
2708 contended = qdisc_is_running(q);
2709 if (unlikely(contended))
2710 spin_lock(&q->busylock);
2711
2712 spin_lock(root_lock);
2713 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2714 		printk(KERN_WARNING "[mtk_net]__dev_xmit_skb drop skb_len = %d\n", skb->len);
2715 kfree_skb(skb);
2716 rc = NET_XMIT_DROP;
2717 } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2718 qdisc_run_begin(q)) {
2719 /*
2720 * This is a work-conserving queue; there are no old skbs
2721 * waiting to be sent out; and the qdisc is not running -
2722 * xmit the skb directly.
2723 */
2724 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2725 skb_dst_force(skb);
2726
2727 qdisc_bstats_update(q, skb);
2728
2729 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2730 if (unlikely(contended)) {
2731 spin_unlock(&q->busylock);
2732 contended = false;
2733 }
2734 __qdisc_run(q);
2735 } else
2736 qdisc_run_end(q);
2737
2738 rc = NET_XMIT_SUCCESS;
2739 } else {
2740 skb_dst_force(skb);
2741 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2742 if (qdisc_run_begin(q)) {
2743 if (unlikely(contended)) {
2744 spin_unlock(&q->busylock);
2745 contended = false;
2746 }
2747 __qdisc_run(q);
2748 }
2749 }
2750 spin_unlock(root_lock);
2751 if (unlikely(contended))
2752 spin_unlock(&q->busylock);
2753 return rc;
2754 }
2755
2756 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2757 static void skb_update_prio(struct sk_buff *skb)
2758 {
2759 struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2760
2761 if (!skb->priority && skb->sk && map) {
2762 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2763
2764 if (prioidx < map->priomap_len)
2765 skb->priority = map->priomap[prioidx];
2766 }
2767 }
2768 #else
2769 #define skb_update_prio(skb)
2770 #endif
2771
2772 static DEFINE_PER_CPU(int, xmit_recursion);
2773 #define RECURSION_LIMIT 10
2774
2775 /**
2776 * dev_loopback_xmit - loop back @skb
2777 * @skb: buffer to transmit
2778 */
2779 int dev_loopback_xmit(struct sk_buff *skb)
2780 {
2781 skb_reset_mac_header(skb);
2782 __skb_pull(skb, skb_network_offset(skb));
2783 skb->pkt_type = PACKET_LOOPBACK;
2784 skb->ip_summed = CHECKSUM_UNNECESSARY;
2785 WARN_ON(!skb_dst(skb));
2786 skb_dst_force(skb);
2787 netif_rx_ni(skb);
2788 return 0;
2789 }
2790 EXPORT_SYMBOL(dev_loopback_xmit);
2791
2792 /**
2793 * dev_queue_xmit - transmit a buffer
2794 * @skb: buffer to transmit
2795 *
2796 * Queue a buffer for transmission to a network device. The caller must
2797 * have set the device and priority and built the buffer before calling
2798 * this function. The function can be called from an interrupt.
2799 *
2800 * A negative errno code is returned on a failure. A success does not
2801 * guarantee the frame will be transmitted as it may be dropped due
2802 * to congestion or traffic shaping.
2803 *
2804 * -----------------------------------------------------------------------------------
2805 * I notice this method can also return errors from the queue disciplines,
2806 * including NET_XMIT_DROP, which is a positive value. So, errors can also
2807 * be positive.
2808 *
2809 * Regardless of the return value, the skb is consumed, so it is currently
2810 * difficult to retry a send to this method. (You can bump the ref count
2811 * before sending to hold a reference for retry if you are careful.)
2812 *
2813 * When calling this method, interrupts MUST be enabled. This is because
2814 * the BH enable code must have IRQs enabled so that it will not deadlock.
2815 * --BLG
2816 */
2817 int dev_queue_xmit(struct sk_buff *skb)
2818 {
2819 struct net_device *dev = skb->dev;
2820 struct netdev_queue *txq;
2821 struct Qdisc *q;
2822 int rc = -ENOMEM;
2823
2824 skb_reset_mac_header(skb);
2825
2826 #ifdef UDP_SKT_WIFI
2827
2828 if (unlikely((sysctl_met_is_enable == 1) && (sysctl_udp_met_port > 0)
2829 && (ip_hdr(skb)->protocol == IPPROTO_UDP) && skb->sk)) {
2830
2831 if (sysctl_udp_met_port == ntohs((inet_sk(skb->sk))->inet_sport)) {
2832 struct udphdr * udp_iphdr = udp_hdr(skb);
2833 if (udp_iphdr && (ntohs(udp_iphdr->len) >= 12)) {
2834 __u16 * seq_id = (__u16 *)((char *)udp_iphdr + 10);
2835 udp_event_trace_printk("F|%d|%s|%d\n", current->pid, *seq_id);
2836
2837 }
2838 }
2839 }
2840 #endif
2841
2842 /* Disable soft irqs for various locks below. Also
2843 * stops preemption for RCU.
2844 */
2845 rcu_read_lock_bh();
2846
2847 skb_update_prio(skb);
2848
2849 txq = netdev_pick_tx(dev, skb);
2850 q = rcu_dereference_bh(txq->qdisc);
2851
2852 #ifdef CONFIG_NET_CLS_ACT
2853 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2854 #endif
2855 trace_net_dev_queue(skb);
2856 if (q->enqueue) {
2857 rc = __dev_xmit_skb(skb, q, dev, txq);
2858 goto out;
2859 }
2860
2861 /* The device has no queue. Common case for software devices:
2862 loopback, all the sorts of tunnels...
2863
2864 	   Really, it is unlikely that netif_tx_lock protection is necessary
2865 	   here. (f.e. loopback and IP tunnels are clean, ignoring statistics
2866 	   counters.)
2867 	   However, it is possible that they rely on the protection
2868 	   made by us here.
2869 
2870 	   Check this and take the lock. It is not prone to deadlocks.
2871 	   Or take the noqueue qdisc path, which is even simpler 8)
2872 */
2873 if (dev->flags & IFF_UP) {
2874 int cpu = smp_processor_id(); /* ok because BHs are off */
2875
2876 if (txq->xmit_lock_owner != cpu) {
2877
2878 if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
2879 goto recursion_alert;
2880
2881 HARD_TX_LOCK(dev, txq, cpu);
2882
2883 if (!netif_xmit_stopped(txq)) {
2884 __this_cpu_inc(xmit_recursion);
2885 rc = dev_hard_start_xmit(skb, dev, txq);
2886 __this_cpu_dec(xmit_recursion);
2887 if (dev_xmit_complete(rc)) {
2888 HARD_TX_UNLOCK(dev, txq);
2889 goto out;
2890 }
2891 }
2892 HARD_TX_UNLOCK(dev, txq);
2893 net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
2894 dev->name);
2895 } else {
2896 /* Recursion is detected! It is possible,
2897 * unfortunately
2898 */
2899 recursion_alert:
2900 net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
2901 dev->name);
2902 }
2903 }
2904
2905 rc = -ENETDOWN;
2906 rcu_read_unlock_bh();
2907
2908 kfree_skb(skb);
2909 return rc;
2910 out:
2911 rcu_read_unlock_bh();
2912 return rc;
2913 }
2914 EXPORT_SYMBOL(dev_queue_xmit);
2915
2916
2917 /*=======================================================================
2918 Receiver routines
2919 =======================================================================*/
2920
2921 int netdev_max_backlog __read_mostly = 1000;
2922 EXPORT_SYMBOL(netdev_max_backlog);
2923
2924 int netdev_tstamp_prequeue __read_mostly = 1;
2925 int netdev_budget __read_mostly = 300;
2926 int weight_p __read_mostly = 64; /* old backlog weight */
2927
2928 /* Called with irq disabled */
2929 static inline void ____napi_schedule(struct softnet_data *sd,
2930 struct napi_struct *napi)
2931 {
2932 list_add_tail(&napi->poll_list, &sd->poll_list);
2933 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2934 }
2935
2936 #ifdef CONFIG_RPS
2937
2938 /* One global table that all flow-based protocols share. */
2939 struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
2940 EXPORT_SYMBOL(rps_sock_flow_table);
2941
2942 struct static_key rps_needed __read_mostly;
2943
2944 static struct rps_dev_flow *
2945 set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2946 struct rps_dev_flow *rflow, u16 next_cpu)
2947 {
2948 if (next_cpu != RPS_NO_CPU) {
2949 #ifdef CONFIG_RFS_ACCEL
2950 struct netdev_rx_queue *rxqueue;
2951 struct rps_dev_flow_table *flow_table;
2952 struct rps_dev_flow *old_rflow;
2953 u32 flow_id;
2954 u16 rxq_index;
2955 int rc;
2956
2957 /* Should we steer this flow to a different hardware queue? */
2958 if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
2959 !(dev->features & NETIF_F_NTUPLE))
2960 goto out;
2961 rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
2962 if (rxq_index == skb_get_rx_queue(skb))
2963 goto out;
2964
2965 rxqueue = dev->_rx + rxq_index;
2966 flow_table = rcu_dereference(rxqueue->rps_flow_table);
2967 if (!flow_table)
2968 goto out;
2969 flow_id = skb->rxhash & flow_table->mask;
2970 rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
2971 rxq_index, flow_id);
2972 if (rc < 0)
2973 goto out;
2974 old_rflow = rflow;
2975 rflow = &flow_table->flows[flow_id];
2976 rflow->filter = rc;
2977 if (old_rflow->filter == rflow->filter)
2978 old_rflow->filter = RPS_NO_FILTER;
2979 out:
2980 #endif
2981 rflow->last_qtail =
2982 per_cpu(softnet_data, next_cpu).input_queue_head;
2983 }
2984
2985 rflow->cpu = next_cpu;
2986 return rflow;
2987 }
2988
2989 /*
2990 * get_rps_cpu is called from netif_receive_skb and returns the target
2991 * CPU from the RPS map of the receiving queue for a given skb.
2992 * rcu_read_lock must be held on entry.
2993 */
2994 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
2995 struct rps_dev_flow **rflowp)
2996 {
2997 struct netdev_rx_queue *rxqueue;
2998 struct rps_map *map;
2999 struct rps_dev_flow_table *flow_table;
3000 struct rps_sock_flow_table *sock_flow_table;
3001 int cpu = -1;
3002 u16 tcpu;
3003
3004 if (skb_rx_queue_recorded(skb)) {
3005 u16 index = skb_get_rx_queue(skb);
3006 if (unlikely(index >= dev->real_num_rx_queues)) {
3007 WARN_ONCE(dev->real_num_rx_queues > 1,
3008 "%s received packet on queue %u, but number "
3009 "of RX queues is %u\n",
3010 dev->name, index, dev->real_num_rx_queues);
3011 goto done;
3012 }
3013 rxqueue = dev->_rx + index;
3014 } else
3015 rxqueue = dev->_rx;
3016
3017 map = rcu_dereference(rxqueue->rps_map);
3018 if (map) {
3019 if (map->len == 1 &&
3020 !rcu_access_pointer(rxqueue->rps_flow_table)) {
3021 tcpu = map->cpus[0];
3022 if (cpu_online(tcpu))
3023 cpu = tcpu;
3024 goto done;
3025 }
3026 } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
3027 goto done;
3028 }
3029
3030 skb_reset_network_header(skb);
3031 if (!skb_get_rxhash(skb))
3032 goto done;
3033
3034 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3035 sock_flow_table = rcu_dereference(rps_sock_flow_table);
3036 if (flow_table && sock_flow_table) {
3037 u16 next_cpu;
3038 struct rps_dev_flow *rflow;
3039
3040 rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
3041 tcpu = rflow->cpu;
3042
3043 next_cpu = sock_flow_table->ents[skb->rxhash &
3044 sock_flow_table->mask];
3045
3046 /*
3047 * If the desired CPU (where last recvmsg was done) is
3048 * different from current CPU (one in the rx-queue flow
3049 * table entry), switch if one of the following holds:
3050 * - Current CPU is unset (equal to RPS_NO_CPU).
3051 * - Current CPU is offline.
3052 * - The current CPU's queue tail has advanced beyond the
3053 * last packet that was enqueued using this table entry.
3054 * This guarantees that all previous packets for the flow
3055 * have been dequeued, thus preserving in order delivery.
3056 */
3057 if (unlikely(tcpu != next_cpu) &&
3058 (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
3059 ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
3060 rflow->last_qtail)) >= 0)) {
3061 tcpu = next_cpu;
3062 rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
3063 }
3064
3065 if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
3066 *rflowp = rflow;
3067 cpu = tcpu;
3068 goto done;
3069 }
3070 }
3071
3072 if (map) {
3073 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
3074
3075 if (cpu_online(tcpu)) {
3076 cpu = tcpu;
3077 goto done;
3078 }
3079 }
3080
3081 done:
3082 return cpu;
3083 }
3084
3085 #ifdef CONFIG_RFS_ACCEL
3086
3087 /**
3088 * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3089 * @dev: Device on which the filter was set
3090 * @rxq_index: RX queue index
3091 * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3092 * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3093 *
3094 * Drivers that implement ndo_rx_flow_steer() should periodically call
3095 * this function for each installed filter and remove the filters for
3096 * which it returns %true.
3097 */
3098 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3099 u32 flow_id, u16 filter_id)
3100 {
3101 struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3102 struct rps_dev_flow_table *flow_table;
3103 struct rps_dev_flow *rflow;
3104 bool expire = true;
3105 int cpu;
3106
3107 rcu_read_lock();
3108 flow_table = rcu_dereference(rxqueue->rps_flow_table);
3109 if (flow_table && flow_id <= flow_table->mask) {
3110 rflow = &flow_table->flows[flow_id];
3111 cpu = ACCESS_ONCE(rflow->cpu);
3112 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3113 ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3114 rflow->last_qtail) <
3115 (int)(10 * flow_table->mask)))
3116 expire = false;
3117 }
3118 rcu_read_unlock();
3119 return expire;
3120 }
3121 EXPORT_SYMBOL(rps_may_expire_flow);
3122
3123 #endif /* CONFIG_RFS_ACCEL */
3124
3125 /* Called from hardirq (IPI) context */
3126 static void rps_trigger_softirq(void *data)
3127 {
3128 struct softnet_data *sd = data;
3129
3130 ____napi_schedule(sd, &sd->backlog);
3131 sd->received_rps++;
3132 }
3133
3134 #endif /* CONFIG_RPS */
3135
3136 /*
3137  * Check if this softnet_data structure belongs to another CPU.
3138  * If yes, queue it to our IPI list and return 1.
3139  * If no, return 0.
3140 */
3141 static int rps_ipi_queued(struct softnet_data *sd)
3142 {
3143 #ifdef CONFIG_RPS
3144 struct softnet_data *mysd = &__get_cpu_var(softnet_data);
3145
3146 if (sd != mysd) {
3147 sd->rps_ipi_next = mysd->rps_ipi_list;
3148 mysd->rps_ipi_list = sd;
3149
3150 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3151 return 1;
3152 }
3153 #endif /* CONFIG_RPS */
3154 return 0;
3155 }
3156
3157 /*
3158 * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3159 * queue (may be a remote CPU queue).
3160 */
3161 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3162 unsigned int *qtail)
3163 {
3164 struct softnet_data *sd;
3165 unsigned long flags;
3166
3167 sd = &per_cpu(softnet_data, cpu);
3168
3169 local_irq_save(flags);
3170
3171 rps_lock(sd);
3172 if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3173 if (skb_queue_len(&sd->input_pkt_queue)) {
3174 enqueue:
3175 __skb_queue_tail(&sd->input_pkt_queue, skb);
3176 input_queue_tail_incr_save(sd, qtail);
3177 rps_unlock(sd);
3178 local_irq_restore(flags);
3179 return NET_RX_SUCCESS;
3180 }
3181
3182 		/* Schedule NAPI for the backlog device.
3183 		 * We can use a non-atomic operation since we own the queue lock.
3184 */
3185 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3186 if (!rps_ipi_queued(sd))
3187 ____napi_schedule(sd, &sd->backlog);
3188 }
3189 goto enqueue;
3190 }
3191
3192 sd->dropped++;
3193 rps_unlock(sd);
3194
3195 local_irq_restore(flags);
3196
3197 atomic_long_inc(&skb->dev->rx_dropped);
3198 kfree_skb(skb);
3199 return NET_RX_DROP;
3200 }
3201
3202 /**
3203 * netif_rx - post buffer to the network code
3204 * @skb: buffer to post
3205 *
3206 * This function receives a packet from a device driver and queues it for
3207 * the upper (protocol) levels to process. It always succeeds. The buffer
3208 * may be dropped during processing for congestion control or by the
3209 * protocol layers.
3210 *
3211 * return values:
3212 * NET_RX_SUCCESS (no congestion)
3213 * NET_RX_DROP (packet was dropped)
3214 *
3215 */
3216
3217 int netif_rx(struct sk_buff *skb)
3218 {
3219 int ret;
3220
3221 /* if netpoll wants it, pretend we never saw it */
3222 if (netpoll_rx(skb))
3223 return NET_RX_DROP;
3224
3225 net_timestamp_check(netdev_tstamp_prequeue, skb);
3226
3227 trace_netif_rx(skb);
3228 #ifdef CONFIG_RPS
3229 if (static_key_false(&rps_needed)) {
3230 struct rps_dev_flow voidflow, *rflow = &voidflow;
3231 int cpu;
3232
3233 preempt_disable();
3234 rcu_read_lock();
3235
3236 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3237 if (cpu < 0)
3238 cpu = smp_processor_id();
3239
3240 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3241
3242 rcu_read_unlock();
3243 preempt_enable();
3244 } else
3245 #endif
3246 {
3247 unsigned int qtail;
3248 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3249 put_cpu();
3250 }
3251 return ret;
3252 }
3253 EXPORT_SYMBOL(netif_rx);
3254
3255 int netif_rx_ni(struct sk_buff *skb)
3256 {
3257 int err;
3258
3259 preempt_disable();
3260 err = netif_rx(skb);
3261 if (local_softirq_pending())
3262 do_softirq();
3263 preempt_enable();
3264
3265 return err;
3266 }
3267 EXPORT_SYMBOL(netif_rx_ni);
3268
3269 static void net_tx_action(struct softirq_action *h)
3270 {
3271 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3272
3273 if (sd->completion_queue) {
3274 struct sk_buff *clist;
3275
3276 local_irq_disable();
3277 clist = sd->completion_queue;
3278 sd->completion_queue = NULL;
3279 local_irq_enable();
3280
3281 while (clist) {
3282 struct sk_buff *skb = clist;
3283 clist = clist->next;
3284
3285 WARN_ON(atomic_read(&skb->users));
3286 trace_kfree_skb(skb, net_tx_action);
3287 __kfree_skb(skb);
3288 }
3289 }
3290
3291 if (sd->output_queue) {
3292 struct Qdisc *head;
3293
3294 local_irq_disable();
3295 head = sd->output_queue;
3296 sd->output_queue = NULL;
3297 sd->output_queue_tailp = &sd->output_queue;
3298 local_irq_enable();
3299
3300 while (head) {
3301 struct Qdisc *q = head;
3302 spinlock_t *root_lock;
3303
3304 head = head->next_sched;
3305
3306 root_lock = qdisc_lock(q);
3307 if (spin_trylock(root_lock)) {
3308 smp_mb__before_clear_bit();
3309 clear_bit(__QDISC_STATE_SCHED,
3310 &q->state);
3311 qdisc_run(q);
3312 spin_unlock(root_lock);
3313 } else {
3314 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3315 &q->state)) {
3316 __netif_reschedule(q);
3317 } else {
3318 smp_mb__before_clear_bit();
3319 clear_bit(__QDISC_STATE_SCHED,
3320 &q->state);
3321 }
3322 }
3323 }
3324 }
3325 }
3326
3327 #if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
3328 (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
3329 /* This hook is defined here for ATM LANE */
3330 int (*br_fdb_test_addr_hook)(struct net_device *dev,
3331 unsigned char *addr) __read_mostly;
3332 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
3333 #endif
3334
3335 #ifdef CONFIG_NET_CLS_ACT
3336 /* TODO: Maybe we should just force sch_ingress to be compiled in
3337  * when CONFIG_NET_CLS_ACT is? Otherwise we execute some useless
3338  * instructions (a compare and two extra stores) when it is not built in
3339  * but CONFIG_NET_CLS_ACT is enabled.
3340  * NOTE: This doesn't stop any functionality; if you don't have
3341  * the ingress scheduler, you just can't add policies on ingress.
3342 *
3343 */
3344 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3345 {
3346 struct net_device *dev = skb->dev;
3347 u32 ttl = G_TC_RTTL(skb->tc_verd);
3348 int result = TC_ACT_OK;
3349 struct Qdisc *q;
3350
3351 if (unlikely(MAX_RED_LOOP < ttl++)) {
3352 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3353 skb->skb_iif, dev->ifindex);
3354 return TC_ACT_SHOT;
3355 }
3356
3357 skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3358 skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3359
3360 q = rxq->qdisc;
3361 if (q != &noop_qdisc) {
3362 spin_lock(qdisc_lock(q));
3363 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3364 result = qdisc_enqueue_root(skb, q);
3365 spin_unlock(qdisc_lock(q));
3366 }
3367
3368 return result;
3369 }
3370
3371 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3372 struct packet_type **pt_prev,
3373 int *ret, struct net_device *orig_dev)
3374 {
3375 struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3376
3377 if (!rxq || rxq->qdisc == &noop_qdisc)
3378 goto out;
3379
3380 if (*pt_prev) {
3381 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3382 *pt_prev = NULL;
3383 }
3384
3385 switch (ing_filter(skb, rxq)) {
3386 case TC_ACT_SHOT:
3387 case TC_ACT_STOLEN:
3388 kfree_skb(skb);
3389 return NULL;
3390 }
3391
3392 out:
3393 skb->tc_verd = 0;
3394 return skb;
3395 }
3396 #endif
3397
3398 /**
3399 * netdev_is_rx_handler_busy - check if receive handler is registered
3400 * @dev: device to check
3401 *
3402 * Check if a receive handler is already registered for a given device.
3403  * Return true if there is one.
3404 *
3405 * The caller must hold the rtnl_mutex.
3406 */
3407 bool netdev_is_rx_handler_busy(struct net_device *dev)
3408 {
3409 ASSERT_RTNL();
3410 return dev && rtnl_dereference(dev->rx_handler);
3411 }
3412 EXPORT_SYMBOL_GPL(netdev_is_rx_handler_busy);
3413
3414 /**
3415 * netdev_rx_handler_register - register receive handler
3416 * @dev: device to register a handler for
3417 * @rx_handler: receive handler to register
3418 * @rx_handler_data: data pointer that is used by rx handler
3419 *
3420  * Register a receive handler for a device. This handler will then be
3421 * called from __netif_receive_skb. A negative errno code is returned
3422 * on a failure.
3423 *
3424 * The caller must hold the rtnl_mutex.
3425 *
3426 * For a general description of rx_handler, see enum rx_handler_result.
3427 */
3428 int netdev_rx_handler_register(struct net_device *dev,
3429 rx_handler_func_t *rx_handler,
3430 void *rx_handler_data)
3431 {
3432 ASSERT_RTNL();
3433
3434 if (dev->rx_handler)
3435 return -EBUSY;
3436
3437 /* Note: rx_handler_data must be set before rx_handler */
3438 rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3439 rcu_assign_pointer(dev->rx_handler, rx_handler);
3440
3441 return 0;
3442 }
3443 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3444
3445 /**
3446 * netdev_rx_handler_unregister - unregister receive handler
3447 * @dev: device to unregister a handler from
3448 *
3449 * Unregister a receive handler from a device.
3450 *
3451 * The caller must hold the rtnl_mutex.
3452 */
3453 void netdev_rx_handler_unregister(struct net_device *dev)
3454 {
3456 ASSERT_RTNL();
3457 RCU_INIT_POINTER(dev->rx_handler, NULL);
3458 	/* a reader seeing a non-NULL rx_handler in a rcu_read_lock()
3459 	 * section is guaranteed to see a non-NULL rx_handler_data
3460 	 * as well.
3461 */
3462 synchronize_net();
3463 RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3464 }
3465 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3466
3467 /*
3468 * Limit the use of PFMEMALLOC reserves to those protocols that implement
3469 * the special handling of PFMEMALLOC skbs.
3470 */
3471 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3472 {
3473 switch (skb->protocol) {
3474 case __constant_htons(ETH_P_ARP):
3475 case __constant_htons(ETH_P_IP):
3476 case __constant_htons(ETH_P_IPV6):
3477 case __constant_htons(ETH_P_8021Q):
3478 case __constant_htons(ETH_P_8021AD):
3479 return true;
3480 default:
3481 return false;
3482 }
3483 }
3484
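/*
 * __netif_receive_skb_core - core of the receive path.  Delivers the skb to
 * taps (ptype_all), the ingress qdisc, VLAN and rx_handler hooks (bridge,
 * bonding, ...), and finally to the matching protocol handlers in
 * ptype_base.  When @pfmemalloc is true, only protocols marked as
 * PFMEMALLOC-safe are allowed to see the packet.
 */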
3485 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3486 {
3487 struct packet_type *ptype, *pt_prev;
3488 rx_handler_func_t *rx_handler;
3489 struct net_device *orig_dev;
3490 struct net_device *null_or_dev;
3491 bool deliver_exact = false;
3492 int ret = NET_RX_DROP;
3493 __be16 type;
3494
3495 net_timestamp_check(!netdev_tstamp_prequeue, skb);
3496
3497 trace_netif_receive_skb(skb);
3498
3499 /* if we've gotten here through NAPI, check netpoll */
3500 if (netpoll_receive_skb(skb))
3501 goto out;
3502
3503 orig_dev = skb->dev;
3504
3505 skb_reset_network_header(skb);
3506 if (!skb_transport_header_was_set(skb))
3507 skb_reset_transport_header(skb);
3508 skb_reset_mac_len(skb);
3509
3510 pt_prev = NULL;
3511
3512 another_round:
3513 skb->skb_iif = skb->dev->ifindex;
3514
3515 __this_cpu_inc(softnet_data.processed);
3516
3517 if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3518 skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3519 skb = vlan_untag(skb);
3520 if (unlikely(!skb))
3521 goto out;
3522 }
3523
3524 #ifdef CONFIG_NET_CLS_ACT
3525 if (skb->tc_verd & TC_NCLS) {
3526 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3527 goto ncls;
3528 }
3529 #endif
3530
3531 if (pfmemalloc)
3532 goto skip_taps;
3533
3534 list_for_each_entry_rcu(ptype, &ptype_all, list) {
3535 if (!ptype->dev || ptype->dev == skb->dev) {
3536 if (pt_prev)
3537 ret = deliver_skb(skb, pt_prev, orig_dev);
3538 pt_prev = ptype;
3539 }
3540 }
3541
3542 skip_taps:
3543 #ifdef CONFIG_NET_CLS_ACT
3544 skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3545 if (!skb)
3546 goto out;
3547 ncls:
3548 #endif
3549
3550 if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3551 goto drop;
3552
3553 if (vlan_tx_tag_present(skb)) {
3554 if (pt_prev) {
3555 ret = deliver_skb(skb, pt_prev, orig_dev);
3556 pt_prev = NULL;
3557 }
3558 if (vlan_do_receive(&skb))
3559 goto another_round;
3560 else if (unlikely(!skb))
3561 goto out;
3562 }
3563
3564 rx_handler = rcu_dereference(skb->dev->rx_handler);
3565 if (rx_handler) {
3566 if (pt_prev) {
3567 ret = deliver_skb(skb, pt_prev, orig_dev);
3568 pt_prev = NULL;
3569 }
3570 switch (rx_handler(&skb)) {
3571 case RX_HANDLER_CONSUMED:
3572 ret = NET_RX_SUCCESS;
3573 goto out;
3574 case RX_HANDLER_ANOTHER:
3575 goto another_round;
3576 case RX_HANDLER_EXACT:
3577 deliver_exact = true;
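			/* fall through */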
3578 case RX_HANDLER_PASS:
3579 break;
3580 default:
3581 BUG();
3582 }
3583 }
3584
3585 if (unlikely(vlan_tx_tag_present(skb))) {
3586 if (vlan_tx_tag_get_id(skb))
3587 skb->pkt_type = PACKET_OTHERHOST;
3588 /* Note: we might in the future use prio bits
3589 * and set skb->priority like in vlan_do_receive()
3590 * For the time being, just ignore Priority Code Point
3591 */
3592 skb->vlan_tci = 0;
3593 }
3594
3595 /* deliver only exact match when indicated */
3596 null_or_dev = deliver_exact ? skb->dev : NULL;
3597
3598 type = skb->protocol;
3599 list_for_each_entry_rcu(ptype,
3600 &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3601 if (ptype->type == type &&
3602 (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3603 ptype->dev == orig_dev)) {
3604 if (pt_prev)
3605 ret = deliver_skb(skb, pt_prev, orig_dev);
3606 pt_prev = ptype;
3607 }
3608 }
3609
3610 if (pt_prev) {
3611 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3612 goto drop;
3613 else
3614 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3615 } else {
3616 drop:
3617 atomic_long_inc(&skb->dev->rx_dropped);
3618 kfree_skb(skb);
3619 		/* Jamal, now you will not be able to escape explaining
3620 		 * to me how you were going to use this. :-)
3621 */
3622 ret = NET_RX_DROP;
3623 }
3624
3625 out:
3626 return ret;
3627 }
3628
3629 static int __netif_receive_skb(struct sk_buff *skb)
3630 {
3631 int ret;
3632
3633 if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3634 unsigned long pflags = current->flags;
3635
3636 /*
3637 * PFMEMALLOC skbs are special, they should
3638 * - be delivered to SOCK_MEMALLOC sockets only
3639 * - stay away from userspace
3640 * - have bounded memory usage
3641 *
3642 * Use PF_MEMALLOC as this saves us from propagating the allocation
3643 * context down to all allocation sites.
3644 */
3645 current->flags |= PF_MEMALLOC;
3646 ret = __netif_receive_skb_core(skb, true);
3647 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3648 } else
3649 ret = __netif_receive_skb_core(skb, false);
3650
3651 return ret;
3652 }
3653
3654 /**
3655 * netif_receive_skb - process receive buffer from network
3656 * @skb: buffer to process
3657 *
3658 * netif_receive_skb() is the main receive data processing function.
3659 * It always succeeds. The buffer may be dropped during processing
3660 * for congestion control or by the protocol layers.
3661 *
3662 * This function may only be called from softirq context and interrupts
3663 * should be enabled.
3664 *
3665 * Return values (usually ignored):
3666 * NET_RX_SUCCESS: no congestion
3667 * NET_RX_DROP: packet was dropped
3668 */
3669 int netif_receive_skb(struct sk_buff *skb)
3670 {
3671 int ret;
3672
3673 net_timestamp_check(netdev_tstamp_prequeue, skb);
3674
3675 if (skb_defer_rx_timestamp(skb))
3676 return NET_RX_SUCCESS;
3677
3678 rcu_read_lock();
3679
3680 #ifdef CONFIG_RPS
3681 if (static_key_false(&rps_needed)) {
3682 struct rps_dev_flow voidflow, *rflow = &voidflow;
3683 int cpu = get_rps_cpu(skb->dev, skb, &rflow);
3684
3685 if (cpu >= 0) {
3686 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3687 rcu_read_unlock();
3688 return ret;
3689 }
3690 }
3691 #endif
3692 ret = __netif_receive_skb(skb);
3693 rcu_read_unlock();
3694 return ret;
3695 }
3696 EXPORT_SYMBOL(netif_receive_skb);
3697
3698 /* Network device is going away; flush any packets still pending.
3699 * Called with irqs disabled.
3700 */
3701 static void flush_backlog(void *arg)
3702 {
3703 struct net_device *dev = arg;
3704 struct softnet_data *sd = &__get_cpu_var(softnet_data);
3705 struct sk_buff *skb, *tmp;
3706
3707 rps_lock(sd);
3708 skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3709 if (skb->dev == dev) {
3710 __skb_unlink(skb, &sd->input_pkt_queue);
3711 kfree_skb(skb);
3712 input_queue_head_incr(sd);
3713 }
3714 }
3715 rps_unlock(sd);
3716
3717 skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3718 if (skb->dev == dev) {
3719 __skb_unlink(skb, &sd->process_queue);
3720 kfree_skb(skb);
3721 input_queue_head_incr(sd);
3722 }
3723 }
3724 }
3725
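/*
 * Hand a held GRO skb to the protocol's gro_complete() callback (to fix up
 * the headers of the merged packet) and then pass it up the stack via
 * netif_receive_skb().
 */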
3726 static int napi_gro_complete(struct sk_buff *skb)
3727 {
3728 struct packet_offload *ptype;
3729 __be16 type = skb->protocol;
3730 struct list_head *head = &offload_base;
3731 int err = -ENOENT;
3732
3733 BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3734
3735 if (NAPI_GRO_CB(skb)->count == 1) {
3736 skb_shinfo(skb)->gso_size = 0;
3737 goto out;
3738 }
3739
3740 rcu_read_lock();
3741 list_for_each_entry_rcu(ptype, head, list) {
3742 if (ptype->type != type || !ptype->callbacks.gro_complete)
3743 continue;
3744
3745 err = ptype->callbacks.gro_complete(skb);
3746 break;
3747 }
3748 rcu_read_unlock();
3749
3750 if (err) {
3751 WARN_ON(&ptype->list == head);
3752 kfree_skb(skb);
3753 return NET_RX_SUCCESS;
3754 }
3755
3756 out:
3757 return netif_receive_skb(skb);
3758 }
3759
3760 /* napi->gro_list contains packets ordered by age.
3761 * youngest packets at the head of it.
3762 * Complete skbs in reverse order to reduce latencies.
3763 */
3764 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3765 {
3766 struct sk_buff *skb, *prev = NULL;
3767
3768 /* scan list and build reverse chain */
3769 for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3770 skb->prev = prev;
3771 prev = skb;
3772 }
3773
3774 for (skb = prev; skb; skb = prev) {
3775 skb->next = NULL;
3776
3777 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3778 return;
3779
3780 prev = skb->prev;
3781 napi_gro_complete(skb);
3782 napi->gro_count--;
3783 }
3784
3785 napi->gro_list = NULL;
3786 }
3787 EXPORT_SYMBOL(napi_gro_flush);
3788
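/*
 * Pre-compute, for every skb already held on napi->gro_list, whether it
 * could belong to the same flow as @skb by comparing the device, VLAN tag
 * and MAC header.  The per-protocol gro_receive() callbacks refine this.
 */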
3789 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3790 {
3791 struct sk_buff *p;
3792 unsigned int maclen = skb->dev->hard_header_len;
3793
3794 for (p = napi->gro_list; p; p = p->next) {
3795 unsigned long diffs;
3796
3797 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3798 diffs |= p->vlan_tci ^ skb->vlan_tci;
3799 if (maclen == ETH_HLEN)
3800 diffs |= compare_ether_header(skb_mac_header(p),
3801 skb_gro_mac_header(skb));
3802 else if (!diffs)
3803 diffs = memcmp(skb_mac_header(p),
3804 skb_gro_mac_header(skb),
3805 maclen);
3806 NAPI_GRO_CB(p)->same_flow = !diffs;
3807 NAPI_GRO_CB(p)->flush = 0;
3808 }
3809 }
3810
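/*
 * Core of the GRO engine: try to merge @skb into an skb already held on
 * napi->gro_list via the matching protocol's gro_receive() callback.
 * Returns GRO_MERGED/GRO_MERGED_FREE when merged, GRO_HELD when the skb is
 * kept on the list, or GRO_NORMAL when it should go up the stack directly.
 */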
3811 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3812 {
3813 struct sk_buff **pp = NULL;
3814 struct packet_offload *ptype;
3815 __be16 type = skb->protocol;
3816 struct list_head *head = &offload_base;
3817 int same_flow;
3818 enum gro_result ret;
3819
3820 if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3821 goto normal;
3822
3823 if (skb_is_gso(skb) || skb_has_frag_list(skb))
3824 goto normal;
3825
3826 gro_list_prepare(napi, skb);
3827
3828 rcu_read_lock();
3829 list_for_each_entry_rcu(ptype, head, list) {
3830 if (ptype->type != type || !ptype->callbacks.gro_receive)
3831 continue;
3832
3833 skb_set_network_header(skb, skb_gro_offset(skb));
3834 skb_reset_mac_len(skb);
3835 NAPI_GRO_CB(skb)->same_flow = 0;
3836 NAPI_GRO_CB(skb)->flush = 0;
3837 NAPI_GRO_CB(skb)->free = 0;
3838
3839 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3840 break;
3841 }
3842 rcu_read_unlock();
3843
3844 if (&ptype->list == head)
3845 goto normal;
3846
3847 same_flow = NAPI_GRO_CB(skb)->same_flow;
3848 ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3849
3850 if (pp) {
3851 struct sk_buff *nskb = *pp;
3852
3853 *pp = nskb->next;
3854 nskb->next = NULL;
3855 napi_gro_complete(nskb);
3856 napi->gro_count--;
3857 }
3858
3859 if (same_flow)
3860 goto ok;
3861
3862 if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3863 goto normal;
3864
3865 napi->gro_count++;
3866 NAPI_GRO_CB(skb)->count = 1;
3867 NAPI_GRO_CB(skb)->age = jiffies;
3868 skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3869 skb->next = napi->gro_list;
3870 napi->gro_list = skb;
3871 ret = GRO_HELD;
3872
3873 pull:
3874 if (skb_headlen(skb) < skb_gro_offset(skb)) {
3875 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3876
3877 BUG_ON(skb->end - skb->tail < grow);
3878
3879 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3880
3881 skb->tail += grow;
3882 skb->data_len -= grow;
3883
3884 skb_shinfo(skb)->frags[0].page_offset += grow;
3885 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3886
3887 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3888 skb_frag_unref(skb, 0);
3889 memmove(skb_shinfo(skb)->frags,
3890 skb_shinfo(skb)->frags + 1,
3891 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3892 }
3893 }
3894
3895 ok:
3896 return ret;
3897
3898 normal:
3899 ret = GRO_NORMAL;
3900 goto pull;
3901 }
3902
3903
3904 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3905 {
3906 switch (ret) {
3907 case GRO_NORMAL:
3908 if (netif_receive_skb(skb))
3909 ret = GRO_DROP;
3910 break;
3911
3912 case GRO_DROP:
3913 kfree_skb(skb);
3914 break;
3915
3916 case GRO_MERGED_FREE:
3917 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3918 kmem_cache_free(skbuff_head_cache, skb);
3919 else
3920 __kfree_skb(skb);
3921 break;
3922
3923 case GRO_HELD:
3924 case GRO_MERGED:
3925 break;
3926 }
3927
3928 return ret;
3929 }
3930
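/*
 * Reset the GRO offsets for a freshly received skb and, when the linear
 * area is empty and the data starts in the first page fragment, record a
 * direct pointer to it in frag0 so GRO header access can avoid pulling.
 */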
3931 static void skb_gro_reset_offset(struct sk_buff *skb)
3932 {
3933 const struct skb_shared_info *pinfo = skb_shinfo(skb);
3934 const skb_frag_t *frag0 = &pinfo->frags[0];
3935
3936 NAPI_GRO_CB(skb)->data_offset = 0;
3937 NAPI_GRO_CB(skb)->frag0 = NULL;
3938 NAPI_GRO_CB(skb)->frag0_len = 0;
3939
3940 if (skb->mac_header == skb->tail &&
3941 pinfo->nr_frags &&
3942 !PageHighMem(skb_frag_page(frag0))) {
3943 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3944 NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int,
3945 skb_frag_size(frag0),
3946 skb->end - skb->tail);
3947 }
3948 }
3949
3950 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3951 {
3952 skb_gro_reset_offset(skb);
3953
3954 return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3955 }
3956 EXPORT_SYMBOL(napi_gro_receive);
3957
3958 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3959 {
3960 __skb_pull(skb, skb_headlen(skb));
3961 /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3962 skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3963 skb->vlan_tci = 0;
3964 skb->dev = napi->dev;
3965 skb->skb_iif = 0;
3966 skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
3967
3968 napi->skb = skb;
3969 }
3970
3971 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3972 {
3973 struct sk_buff *skb = napi->skb;
3974
3975 if (!skb) {
3976 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3977 if (skb)
3978 napi->skb = skb;
3979 }
3980 return skb;
3981 }
3982 EXPORT_SYMBOL(napi_get_frags);
3983
3984 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3985 gro_result_t ret)
3986 {
3987 switch (ret) {
3988 case GRO_NORMAL:
3989 case GRO_HELD:
3990 skb->protocol = eth_type_trans(skb, skb->dev);
3991
3992 if (ret == GRO_HELD)
3993 skb_gro_pull(skb, -ETH_HLEN);
3994 else if (netif_receive_skb(skb))
3995 ret = GRO_DROP;
3996 break;
3997
3998 case GRO_DROP:
3999 case GRO_MERGED_FREE:
4000 napi_reuse_skb(napi, skb);
4001 break;
4002
4003 case GRO_MERGED:
4004 break;
4005 }
4006
4007 return ret;
4008 }
4009
4010 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
4011 {
4012 struct sk_buff *skb = napi->skb;
4013 struct ethhdr *eth;
4014 unsigned int hlen;
4015 unsigned int off;
4016
4017 napi->skb = NULL;
4018
4019 skb_reset_mac_header(skb);
4020 skb_gro_reset_offset(skb);
4021
4022 off = skb_gro_offset(skb);
4023 hlen = off + sizeof(*eth);
4024 eth = skb_gro_header_fast(skb, off);
4025 if (skb_gro_header_hard(skb, hlen)) {
4026 eth = skb_gro_header_slow(skb, hlen, off);
4027 if (unlikely(!eth)) {
4028 napi_reuse_skb(napi, skb);
4029 skb = NULL;
4030 goto out;
4031 }
4032 }
4033
4034 skb_gro_pull(skb, sizeof(*eth));
4035
4036 /*
4037 * This works because the only protocols we care about don't require
4038 * special handling. We'll fix it up properly at the end.
4039 */
4040 skb->protocol = eth->h_proto;
4041
4042 out:
4043 return skb;
4044 }
4045
4046 gro_result_t napi_gro_frags(struct napi_struct *napi)
4047 {
4048 struct sk_buff *skb = napi_frags_skb(napi);
4049
4050 if (!skb)
4051 return GRO_DROP;
4052
4053 return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4054 }
4055 EXPORT_SYMBOL(napi_gro_frags);
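/*
 * Illustrative sketch (hypothetical driver code, not from this file): drivers
 * that receive into pages rather than linear buffers can use napi_get_frags()
 * together with napi_gro_frags(). The core supplies (and recycles) the skb
 * head; the driver only attaches the page fragments. skb_fill_page_desc() is
 * the regular skbuff helper; page/offset/len come from the driver's RX ring.
 *
 *	struct sk_buff *skb = napi_get_frags(napi);
 *
 *	if (!skb)
 *		return;			// allocation failure, retry later
 *
 *	skb_fill_page_desc(skb, 0, page, offset, len);
 *	skb->len += len;
 *	skb->data_len += len;
 *	skb->truesize += PAGE_SIZE;	// account for the attached page
 *
 *	napi_gro_frags(napi);		// parses the Ethernet header itself
 */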
4056
4057 /*
4058 * net_rps_action sends any pending IPIs for RPS.
4059 * Note: called with local irq disabled, but exits with local irq enabled.
4060 */
4061 static void net_rps_action_and_irq_enable(struct softnet_data *sd)
4062 {
4063 #ifdef CONFIG_RPS
4064 struct softnet_data *remsd = sd->rps_ipi_list;
4065
4066 if (remsd) {
4067 sd->rps_ipi_list = NULL;
4068
4069 local_irq_enable();
4070
4071 /* Send pending IPIs to kick RPS processing on remote CPUs. */
4072 while (remsd) {
4073 struct softnet_data *next = remsd->rps_ipi_next;
4074
4075 if (cpu_online(remsd->cpu))
4076 __smp_call_function_single(remsd->cpu,
4077 &remsd->csd, 0);
4078 remsd = next;
4079 }
4080 } else
4081 #endif
4082 local_irq_enable();
4083 }
4084
4085 static int process_backlog(struct napi_struct *napi, int quota)
4086 {
4087 int work = 0;
4088 struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4089
4090 #ifdef CONFIG_RPS
4091 /* If we have pending IPIs, it is better to send them now
4092 * rather than waiting for net_rx_action() to end.
4093 */
4094 if (sd->rps_ipi_list) {
4095 local_irq_disable();
4096 net_rps_action_and_irq_enable(sd);
4097 }
4098 #endif
4099 napi->weight = weight_p;
4100 local_irq_disable();
4101 while (work < quota) {
4102 struct sk_buff *skb;
4103 unsigned int qlen;
4104
4105 while ((skb = __skb_dequeue(&sd->process_queue))) {
4106 rcu_read_lock();
4107 local_irq_enable();
4108 __netif_receive_skb(skb);
4109 rcu_read_unlock();
4110 local_irq_disable();
4111 input_queue_head_incr(sd);
4112 if (++work >= quota) {
4113 local_irq_enable();
4114 return work;
4115 }
4116 }
4117
4118 rps_lock(sd);
4119 qlen = skb_queue_len(&sd->input_pkt_queue);
4120 if (qlen)
4121 skb_queue_splice_tail_init(&sd->input_pkt_queue,
4122 &sd->process_queue);
4123
4124 if (qlen < quota - work) {
4125 /*
4126 * Inline a custom version of __napi_complete().
4127 * Only the current CPU owns and manipulates this napi,
4128 * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4129 * We can use a plain write instead of clear_bit(),
4130 * and we don't need an smp_mb() memory barrier.
4131 */
4132 list_del(&napi->poll_list);
4133 napi->state = 0;
4134
4135 quota = work + qlen;
4136 }
4137 rps_unlock(sd);
4138 }
4139 local_irq_enable();
4140
4141 return work;
4142 }
4143
4144 /**
4145 * __napi_schedule - schedule for receive
4146 * @n: entry to schedule
4147 *
4148 * The entry's receive function will be scheduled to run
4149 */
4150 void __napi_schedule(struct napi_struct *n)
4151 {
4152 unsigned long flags;
4153
4154 local_irq_save(flags);
4155 ____napi_schedule(&__get_cpu_var(softnet_data), n);
4156 local_irq_restore(flags);
4157 }
4158 EXPORT_SYMBOL(__napi_schedule);
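/*
 * Illustrative sketch (hypothetical driver code, not part of this file): a
 * typical RX interrupt handler masks further device interrupts and schedules
 * NAPI. Most drivers use the napi_schedule() wrapper, which combines the
 * napi_schedule_prep() test with __napi_schedule():
 *
 *	static irqreturn_t mydrv_intr(int irq, void *data)
 *	{
 *		struct mydrv_priv *priv = data;
 *
 *		if (napi_schedule_prep(&priv->napi)) {
 *			mydrv_disable_rx_irq(priv);	// hypothetical helper
 *			__napi_schedule(&priv->napi);
 *		}
 *		return IRQ_HANDLED;
 *	}
 */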
4159
4160 void __napi_complete(struct napi_struct *n)
4161 {
4162 BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4163 BUG_ON(n->gro_list);
4164
4165 list_del(&n->poll_list);
4166 smp_mb__before_clear_bit();
4167 clear_bit(NAPI_STATE_SCHED, &n->state);
4168 }
4169 EXPORT_SYMBOL(__napi_complete);
4170
4171 void napi_complete(struct napi_struct *n)
4172 {
4173 unsigned long flags;
4174
4175 /*
4176 * Don't let napi dequeue from the CPU poll list
4177 * just in case it's running on a different CPU.
4178 */
4179 if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4180 return;
4181
4182 napi_gro_flush(n, false);
4183 local_irq_save(flags);
4184 __napi_complete(n);
4185 local_irq_restore(flags);
4186 }
4187 EXPORT_SYMBOL(napi_complete);
4188
4189 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4190 int (*poll)(struct napi_struct *, int), int weight)
4191 {
4192 INIT_LIST_HEAD(&napi->poll_list);
4193 napi->gro_count = 0;
4194 napi->gro_list = NULL;
4195 napi->skb = NULL;
4196 napi->poll = poll;
4197 if (weight > NAPI_POLL_WEIGHT)
4198 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4199 weight, dev->name);
4200 napi->weight = weight;
4201 list_add(&napi->dev_list, &dev->napi_list);
4202 napi->dev = dev;
4203 #ifdef CONFIG_NETPOLL
4204 spin_lock_init(&napi->poll_lock);
4205 napi->poll_owner = -1;
4206 #endif
4207 set_bit(NAPI_STATE_SCHED, &napi->state);
4208 }
4209 EXPORT_SYMBOL(netif_napi_add);
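/*
 * Illustrative sketch (not part of this file): a driver registers its NAPI
 * context once at probe time and completes it from the poll routine when it
 * has consumed less than the budget. A weight of 64 matches the common
 * NAPI_POLL_WEIGHT default; the mydrv_* names are hypothetical.
 *
 *	// probe path
 *	netif_napi_add(netdev, &priv->napi, mydrv_poll, 64);
 *
 *	// in mydrv_poll(), after processing 'done' packets
 *	if (done < budget) {
 *		napi_complete(napi);
 *		mydrv_enable_rx_irq(priv);	// hypothetical helper
 *	}
 */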
4210
4211 void netif_napi_del(struct napi_struct *napi)
4212 {
4213 struct sk_buff *skb, *next;
4214
4215 list_del_init(&napi->dev_list);
4216 napi_free_frags(napi);
4217
4218 for (skb = napi->gro_list; skb; skb = next) {
4219 next = skb->next;
4220 skb->next = NULL;
4221 kfree_skb(skb);
4222 }
4223
4224 napi->gro_list = NULL;
4225 napi->gro_count = 0;
4226 }
4227 EXPORT_SYMBOL(netif_napi_del);
4228
4229 static void net_rx_action(struct softirq_action *h)
4230 {
4231 struct softnet_data *sd = &__get_cpu_var(softnet_data);
4232 unsigned long time_limit = jiffies + 2;
4233 int budget = netdev_budget;
4234 void *have;
4235
4236 local_irq_disable();
4237
4238 while (!list_empty(&sd->poll_list)) {
4239 struct napi_struct *n;
4240 int work, weight;
4241
4242 /* If the softirq window is exhausted then punt.
4243 * Allow this to run for 2 jiffies, which allows
4244 * an average latency of 1.5/HZ.
4245 */
4246 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4247 goto softnet_break;
4248
4249 local_irq_enable();
4250
4251 /* Even though interrupts have been re-enabled, this
4252 * access is safe because interrupts can only add new
4253 * entries to the tail of this list, and only ->poll()
4254 * calls can remove this head entry from the list.
4255 */
4256 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4257
4258 have = netpoll_poll_lock(n);
4259
4260 weight = n->weight;
4261
4262 /* This NAPI_STATE_SCHED test is for avoiding a race
4263 * with netpoll's poll_napi(). Only the entity which
4264 * obtains the lock and sees NAPI_STATE_SCHED set will
4265 * actually make the ->poll() call. Therefore we avoid
4266 * accidentally calling ->poll() when NAPI is not scheduled.
4267 */
4268 work = 0;
4269 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4270 work = n->poll(n, weight);
4271 trace_napi_poll(n);
4272 }
4273
4274 WARN_ON_ONCE(work > weight);
4275
4276 budget -= work;
4277
4278 local_irq_disable();
4279
4280 /* Drivers must not modify the NAPI state if they
4281 * consume the entire weight. In such cases this code
4282 * still "owns" the NAPI instance and therefore can
4283 * move the instance around on the list at-will.
4284 */
4285 if (unlikely(work == weight)) {
4286 if (unlikely(napi_disable_pending(n))) {
4287 local_irq_enable();
4288 napi_complete(n);
4289 local_irq_disable();
4290 } else {
4291 if (n->gro_list) {
4292 /* flush too old packets
4293 * If HZ < 1000, flush all packets.
4294 */
4295 local_irq_enable();
4296 napi_gro_flush(n, HZ >= 1000);
4297 local_irq_disable();
4298 }
4299 list_move_tail(&n->poll_list, &sd->poll_list);
4300 }
4301 }
4302
4303 netpoll_poll_unlock(have);
4304 }
4305 out:
4306 net_rps_action_and_irq_enable(sd);
4307
4308 #ifdef CONFIG_NET_DMA
4309 /*
4310 * There may not be any more sk_buffs coming right now, so push
4311 * any pending DMA copies to hardware
4312 */
4313 dma_issue_pending_all();
4314 #endif
4315
4316 return;
4317
4318 softnet_break:
4319 sd->time_squeeze++;
4320 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4321 goto out;
4322 }
4323
4324 struct netdev_upper {
4325 struct net_device *dev;
4326 bool master;
4327 struct list_head list;
4328 struct rcu_head rcu;
4329 struct list_head search_list;
4330 };
4331
4332 static void __append_search_uppers(struct list_head *search_list,
4333 struct net_device *dev)
4334 {
4335 struct netdev_upper *upper;
4336
4337 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4338 /* check that this upper is not already in the search list */
4339 if (list_empty(&upper->search_list))
4340 list_add_tail(&upper->search_list, search_list);
4341 }
4342 }
4343
4344 static bool __netdev_search_upper_dev(struct net_device *dev,
4345 struct net_device *upper_dev)
4346 {
4347 LIST_HEAD(search_list);
4348 struct netdev_upper *upper;
4349 struct netdev_upper *tmp;
4350 bool ret = false;
4351
4352 __append_search_uppers(&search_list, dev);
4353 list_for_each_entry(upper, &search_list, search_list) {
4354 if (upper->dev == upper_dev) {
4355 ret = true;
4356 break;
4357 }
4358 __append_search_uppers(&search_list, upper->dev);
4359 }
4360 list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4361 INIT_LIST_HEAD(&upper->search_list);
4362 return ret;
4363 }
4364
4365 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4366 struct net_device *upper_dev)
4367 {
4368 struct netdev_upper *upper;
4369
4370 list_for_each_entry(upper, &dev->upper_dev_list, list) {
4371 if (upper->dev == upper_dev)
4372 return upper;
4373 }
4374 return NULL;
4375 }
4376
4377 /**
4378 * netdev_has_upper_dev - Check if device is linked to an upper device
4379 * @dev: device
4380 * @upper_dev: upper device to check
4381 *
4382 * Find out if a device is linked to the specified upper device and return true
4383 * if it is. Note that this checks only the immediate upper device,
4384 * not through a complete stack of devices. The caller must hold the RTNL lock.
4385 */
4386 bool netdev_has_upper_dev(struct net_device *dev,
4387 struct net_device *upper_dev)
4388 {
4389 ASSERT_RTNL();
4390
4391 return __netdev_find_upper(dev, upper_dev);
4392 }
4393 EXPORT_SYMBOL(netdev_has_upper_dev);
4394
4395 /**
4396 * netdev_has_any_upper_dev - Check if device is linked to some device
4397 * @dev: device
4398 *
4399 * Find out if a device is linked to an upper device and return true in case
4400 * it is. The caller must hold the RTNL lock.
4401 */
4402 bool netdev_has_any_upper_dev(struct net_device *dev)
4403 {
4404 ASSERT_RTNL();
4405
4406 return !list_empty(&dev->upper_dev_list);
4407 }
4408 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4409
4410 /**
4411 * netdev_master_upper_dev_get - Get master upper device
4412 * @dev: device
4413 *
4414 * Find a master upper device and return pointer to it or NULL in case
4415 * it's not there. The caller must hold the RTNL lock.
4416 */
4417 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4418 {
4419 struct netdev_upper *upper;
4420
4421 ASSERT_RTNL();
4422
4423 if (list_empty(&dev->upper_dev_list))
4424 return NULL;
4425
4426 upper = list_first_entry(&dev->upper_dev_list,
4427 struct netdev_upper, list);
4428 if (likely(upper->master))
4429 return upper->dev;
4430 return NULL;
4431 }
4432 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4433
4434 /**
4435 * netdev_master_upper_dev_get_rcu - Get master upper device
4436 * @dev: device
4437 *
4438 * Find a master upper device and return pointer to it or NULL in case
4439 * it's not there. The caller must hold the RCU read lock.
4440 */
4441 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4442 {
4443 struct netdev_upper *upper;
4444
4445 upper = list_first_or_null_rcu(&dev->upper_dev_list,
4446 struct netdev_upper, list);
4447 if (upper && likely(upper->master))
4448 return upper->dev;
4449 return NULL;
4450 }
4451 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4452
4453 static int __netdev_upper_dev_link(struct net_device *dev,
4454 struct net_device *upper_dev, bool master)
4455 {
4456 struct netdev_upper *upper;
4457
4458 ASSERT_RTNL();
4459
4460 if (dev == upper_dev)
4461 return -EBUSY;
4462
4463 /* To prevent loops, check that dev is not already an upper device of upper_dev. */
4464 if (__netdev_search_upper_dev(upper_dev, dev))
4465 return -EBUSY;
4466
4467 if (__netdev_find_upper(dev, upper_dev))
4468 return -EEXIST;
4469
4470 if (master && netdev_master_upper_dev_get(dev))
4471 return -EBUSY;
4472
4473 upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4474 if (!upper)
4475 return -ENOMEM;
4476
4477 upper->dev = upper_dev;
4478 upper->master = master;
4479 INIT_LIST_HEAD(&upper->search_list);
4480
4481 /* Ensure that master upper link is always the first item in list. */
4482 if (master)
4483 list_add_rcu(&upper->list, &dev->upper_dev_list);
4484 else
4485 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4486 dev_hold(upper_dev);
4487
4488 return 0;
4489 }
4490
4491 /**
4492 * netdev_upper_dev_link - Add a link to the upper device
4493 * @dev: device
4494 * @upper_dev: new upper device
4495 *
4496 * Adds a link to a device which is upper to this one. The caller must hold
4497 * the RTNL lock. On a failure a negative errno code is returned.
4498 * On success the reference counts are adjusted and the function
4499 * returns zero.
4500 */
4501 int netdev_upper_dev_link(struct net_device *dev,
4502 struct net_device *upper_dev)
4503 {
4504 return __netdev_upper_dev_link(dev, upper_dev, false);
4505 }
4506 EXPORT_SYMBOL(netdev_upper_dev_link);
4507
4508 /**
4509 * netdev_master_upper_dev_link - Add a master link to the upper device
4510 * @dev: device
4511 * @upper_dev: new upper device
4512 *
4513 * Adds a link to a device which is upper to this one. In this case, only
4514 * one master upper device can be linked, although other non-master devices
4515 * might be linked as well. The caller must hold the RTNL lock.
4516 * On a failure a negative errno code is returned. On success the reference
4517 * counts are adjusted and the function returns zero.
4518 */
4519 int netdev_master_upper_dev_link(struct net_device *dev,
4520 struct net_device *upper_dev)
4521 {
4522 return __netdev_upper_dev_link(dev, upper_dev, true);
4523 }
4524 EXPORT_SYMBOL(netdev_master_upper_dev_link);
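/*
 * Illustrative sketch (hypothetical, not from this file): a bonding/bridge
 * style master links a newly enslaved device as that device's master upper
 * under RTNL, and undoes the link on release:
 *
 *	ASSERT_RTNL();
 *
 *	err = netdev_master_upper_dev_link(slave_dev, master_dev);
 *	if (err)
 *		goto err_unwind;	// -EBUSY, -EEXIST or -ENOMEM
 *	...
 *	netdev_upper_dev_unlink(slave_dev, master_dev);
 */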
4525
4526 /**
4527 * netdev_upper_dev_unlink - Removes a link to upper device
4528 * @dev: device
4529 * @upper_dev: upper device to remove the link to
4530 *
4531 * Removes the link to a device which is upper to this one. The caller must hold
4532 * the RTNL lock.
4533 */
4534 void netdev_upper_dev_unlink(struct net_device *dev,
4535 struct net_device *upper_dev)
4536 {
4537 struct netdev_upper *upper;
4538
4539 ASSERT_RTNL();
4540
4541 upper = __netdev_find_upper(dev, upper_dev);
4542 if (!upper)
4543 return;
4544 list_del_rcu(&upper->list);
4545 dev_put(upper_dev);
4546 kfree_rcu(upper, rcu);
4547 }
4548 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4549
4550 static void dev_change_rx_flags(struct net_device *dev, int flags)
4551 {
4552 const struct net_device_ops *ops = dev->netdev_ops;
4553
4554 if (ops->ndo_change_rx_flags)
4555 ops->ndo_change_rx_flags(dev, flags);
4556 }
4557
4558 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4559 {
4560 unsigned int old_flags = dev->flags;
4561 kuid_t uid;
4562 kgid_t gid;
4563
4564 ASSERT_RTNL();
4565
4566 dev->flags |= IFF_PROMISC;
4567 dev->promiscuity += inc;
4568 if (dev->promiscuity == 0) {
4569 /*
4570 * Avoid overflow.
4571 * If inc causes overflow, leave promiscuity unchanged and return an error.
4572 */
4573 if (inc < 0)
4574 dev->flags &= ~IFF_PROMISC;
4575 else {
4576 dev->promiscuity -= inc;
4577 pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4578 dev->name);
4579 return -EOVERFLOW;
4580 }
4581 }
4582 if (dev->flags != old_flags) {
4583 pr_info("device %s %s promiscuous mode\n",
4584 dev->name,
4585 dev->flags & IFF_PROMISC ? "entered" : "left");
4586 if (audit_enabled) {
4587 current_uid_gid(&uid, &gid);
4588 audit_log(current->audit_context, GFP_ATOMIC,
4589 AUDIT_ANOM_PROMISCUOUS,
4590 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4591 dev->name, (dev->flags & IFF_PROMISC),
4592 (old_flags & IFF_PROMISC),
4593 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4594 from_kuid(&init_user_ns, uid),
4595 from_kgid(&init_user_ns, gid),
4596 audit_get_sessionid(current));
4597 }
4598
4599 dev_change_rx_flags(dev, IFF_PROMISC);
4600 }
4601 return 0;
4602 }
4603
4604 /**
4605 * dev_set_promiscuity - update promiscuity count on a device
4606 * @dev: device
4607 * @inc: modifier
4608 *
4609 * Add or remove promiscuity from a device. While the count in the device
4610 * remains above zero the interface remains promiscuous. Once it hits zero
4611 * the device reverts to normal filtering operation. A negative inc
4612 * value is used to drop promiscuity on the device.
4613 * Return 0 if successful or a negative errno code on error.
4614 */
4615 int dev_set_promiscuity(struct net_device *dev, int inc)
4616 {
4617 unsigned int old_flags = dev->flags;
4618 int err;
4619
4620 err = __dev_set_promiscuity(dev, inc);
4621 if (err < 0)
4622 return err;
4623 if (dev->flags != old_flags)
4624 dev_set_rx_mode(dev);
4625 return err;
4626 }
4627 EXPORT_SYMBOL(dev_set_promiscuity);
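/*
 * Illustrative sketch (not part of this file): a packet-capture style user
 * takes one promiscuity reference while capturing and drops it afterwards,
 * holding RTNL around both calls:
 *
 *	rtnl_lock();
 *	err = dev_set_promiscuity(dev, 1);	// enter promiscuous mode
 *	rtnl_unlock();
 *	...
 *	rtnl_lock();
 *	dev_set_promiscuity(dev, -1);		// drop our reference
 *	rtnl_unlock();
 */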
4628
4629 /**
4630 * dev_set_allmulti - update allmulti count on a device
4631 * @dev: device
4632 * @inc: modifier
4633 *
4634 * Add or remove reception of all multicast frames to a device. While the
4635 * count in the device remains above zero the interface remains listening
4636 * to all interfaces. Once it hits zero the device reverts back to normal
4637 * filtering operation. A negative @inc value is used to drop the counter
4638 * when releasing a resource needing all multicasts.
4639 * Return 0 if successful or a negative errno code on error.
4640 */
4641
4642 int dev_set_allmulti(struct net_device *dev, int inc)
4643 {
4644 unsigned int old_flags = dev->flags;
4645
4646 ASSERT_RTNL();
4647
4648 dev->flags |= IFF_ALLMULTI;
4649 dev->allmulti += inc;
4650 if (dev->allmulti == 0) {
4651 /*
4652 * Avoid overflow.
4653 * If inc causes overflow, leave allmulti unchanged and return an error.
4654 */
4655 if (inc < 0)
4656 dev->flags &= ~IFF_ALLMULTI;
4657 else {
4658 dev->allmulti -= inc;
4659 pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4660 dev->name);
4661 return -EOVERFLOW;
4662 }
4663 }
4664 if (dev->flags ^ old_flags) {
4665 dev_change_rx_flags(dev, IFF_ALLMULTI);
4666 dev_set_rx_mode(dev);
4667 }
4668 return 0;
4669 }
4670 EXPORT_SYMBOL(dev_set_allmulti);
4671
4672 /*
4673 * Upload unicast and multicast address lists to device and
4674 * configure RX filtering. When the device doesn't support unicast
4675 * filtering it is put in promiscuous mode while unicast addresses
4676 * are present.
4677 */
4678 void __dev_set_rx_mode(struct net_device *dev)
4679 {
4680 const struct net_device_ops *ops = dev->netdev_ops;
4681
4682 /* dev_open will call this function so the list will stay sane. */
4683 if (!(dev->flags&IFF_UP))
4684 return;
4685
4686 if (!netif_device_present(dev))
4687 return;
4688
4689 if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4690 /* Unicast address changes may only happen under the rtnl,
4691 * therefore calling __dev_set_promiscuity here is safe.
4692 */
4693 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4694 __dev_set_promiscuity(dev, 1);
4695 dev->uc_promisc = true;
4696 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4697 __dev_set_promiscuity(dev, -1);
4698 dev->uc_promisc = false;
4699 }
4700 }
4701
4702 if (ops->ndo_set_rx_mode)
4703 ops->ndo_set_rx_mode(dev);
4704 }
4705 EXPORT_SYMBOL(__dev_set_rx_mode);
4706
4707 void dev_set_rx_mode(struct net_device *dev)
4708 {
4709 netif_addr_lock_bh(dev);
4710 __dev_set_rx_mode(dev);
4711 netif_addr_unlock_bh(dev);
4712 }
4713
4714 /**
4715 * dev_get_flags - get flags reported to userspace
4716 * @dev: device
4717 *
4718 * Get the combination of flag bits exported through APIs to userspace.
4719 */
4720 unsigned int dev_get_flags(const struct net_device *dev)
4721 {
4722 unsigned int flags;
4723
4724 flags = (dev->flags & ~(IFF_PROMISC |
4725 IFF_ALLMULTI |
4726 IFF_RUNNING |
4727 IFF_LOWER_UP |
4728 IFF_DORMANT)) |
4729 (dev->gflags & (IFF_PROMISC |
4730 IFF_ALLMULTI));
4731
4732 if (netif_running(dev)) {
4733 if (netif_oper_up(dev))
4734 flags |= IFF_RUNNING;
4735 if (netif_carrier_ok(dev))
4736 flags |= IFF_LOWER_UP;
4737 if (netif_dormant(dev))
4738 flags |= IFF_DORMANT;
4739 }
4740
4741 return flags;
4742 }
4743 EXPORT_SYMBOL(dev_get_flags);
4744
4745 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4746 {
4747 unsigned int old_flags = dev->flags;
4748 int ret;
4749
4750 ASSERT_RTNL();
4751
4752 /*
4753 * Set the flags on our device.
4754 */
4755
4756 dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4757 IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4758 IFF_AUTOMEDIA)) |
4759 (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4760 IFF_ALLMULTI));
4761
4762 /*
4763 * Load in the correct multicast list now the flags have changed.
4764 */
4765
4766 if ((old_flags ^ flags) & IFF_MULTICAST)
4767 dev_change_rx_flags(dev, IFF_MULTICAST);
4768
4769 dev_set_rx_mode(dev);
4770
4771 /*
4772 * Have we downed the interface? We handle IFF_UP ourselves
4773 * according to user attempts to set it, rather than blindly
4774 * setting it.
4775 */
4776
4777 ret = 0;
4778 if ((old_flags ^ flags) & IFF_UP) { /* Bit is different ? */
4779 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4780
4781 if (!ret)
4782 dev_set_rx_mode(dev);
4783 }
4784
4785 if ((flags ^ dev->gflags) & IFF_PROMISC) {
4786 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4787
4788 dev->gflags ^= IFF_PROMISC;
4789 dev_set_promiscuity(dev, inc);
4790 }
4791
4792 /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4793 is important. Some (broken) drivers set IFF_PROMISC when
4794 IFF_ALLMULTI is requested, without asking us and without reporting it.
4795 */
4796 if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4797 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4798
4799 dev->gflags ^= IFF_ALLMULTI;
4800 dev_set_allmulti(dev, inc);
4801 }
4802
4803 return ret;
4804 }
4805
4806 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4807 {
4808 unsigned int changes = dev->flags ^ old_flags;
4809
4810 if (changes & IFF_UP) {
4811 if (dev->flags & IFF_UP)
4812 call_netdevice_notifiers(NETDEV_UP, dev);
4813 else
4814 call_netdevice_notifiers(NETDEV_DOWN, dev);
4815 }
4816
4817 if (dev->flags & IFF_UP &&
4818 (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4819 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4820 }
4821
4822 /**
4823 * dev_change_flags - change device settings
4824 * @dev: device
4825 * @flags: device state flags
4826 *
4827 * Change settings on a device based on the state flags. The flags are
4828 * in the userspace exported format.
4829 */
4830 int dev_change_flags(struct net_device *dev, unsigned int flags)
4831 {
4832 int ret;
4833 unsigned int changes, old_flags = dev->flags;
4834
4835 ret = __dev_change_flags(dev, flags);
4836 if (ret < 0)
4837 return ret;
4838
4839 changes = old_flags ^ dev->flags;
4840 if (changes)
4841 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4842
4843 __dev_notify_flags(dev, old_flags);
4844 return ret;
4845 }
4846 EXPORT_SYMBOL(dev_change_flags);
4847
4848 /**
4849 * dev_set_mtu - Change maximum transfer unit
4850 * @dev: device
4851 * @new_mtu: new transfer unit
4852 *
4853 * Change the maximum transfer size of the network device.
4854 */
4855 int dev_set_mtu(struct net_device *dev, int new_mtu)
4856 {
4857 const struct net_device_ops *ops = dev->netdev_ops;
4858 int err;
4859
4860 if (new_mtu == dev->mtu)
4861 return 0;
4862
4863 /* MTU must be positive. */
4864 if (new_mtu < 0)
4865 return -EINVAL;
4866
4867 if (!netif_device_present(dev))
4868 return -ENODEV;
4869
4870 err = 0;
4871 if (ops->ndo_change_mtu)
4872 err = ops->ndo_change_mtu(dev, new_mtu);
4873 else
4874 dev->mtu = new_mtu;
4875
4876 if (!err)
4877 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4878 return err;
4879 }
4880 EXPORT_SYMBOL(dev_set_mtu);
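/*
 * Illustrative sketch (hypothetical caller, not from this file): changing the
 * MTU from kernel code mirrors the SIOCSIFMTU ioctl path, i.e. the call is
 * made under RTNL and a negative errno is propagated:
 *
 *	rtnl_lock();
 *	err = dev_set_mtu(dev, 9000);	// e.g. enable jumbo frames
 *	rtnl_unlock();
 *	if (err)
 *		pr_warn("failed to set MTU: %d\n", err);
 */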
4881
4882 /**
4883 * dev_set_group - Change group this device belongs to
4884 * @dev: device
4885 * @new_group: group this device should belong to
4886 */
4887 void dev_set_group(struct net_device *dev, int new_group)
4888 {
4889 dev->group = new_group;
4890 }
4891 EXPORT_SYMBOL(dev_set_group);
4892
4893 /**
4894 * dev_set_mac_address - Change Media Access Control Address
4895 * @dev: device
4896 * @sa: new address
4897 *
4898 * Change the hardware (MAC) address of the device
4899 */
4900 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4901 {
4902 const struct net_device_ops *ops = dev->netdev_ops;
4903 int err;
4904
4905 if (!ops->ndo_set_mac_address)
4906 return -EOPNOTSUPP;
4907 if (sa->sa_family != dev->type)
4908 return -EINVAL;
4909 if (!netif_device_present(dev))
4910 return -ENODEV;
4911 err = ops->ndo_set_mac_address(dev, sa);
4912 if (err)
4913 return err;
4914 dev->addr_assign_type = NET_ADDR_SET;
4915 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4916 add_device_randomness(dev->dev_addr, dev->addr_len);
4917 return 0;
4918 }
4919 EXPORT_SYMBOL(dev_set_mac_address);
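/*
 * Illustrative sketch (not part of this file): setting a MAC address from
 * kernel code fills in a struct sockaddr whose sa_family matches dev->type
 * (ARPHRD_ETHER for Ethernet) and calls dev_set_mac_address() under RTNL.
 * new_mac is assumed to be a u8[ETH_ALEN] supplied by the caller.
 *
 *	struct sockaddr sa;
 *
 *	sa.sa_family = dev->type;
 *	memcpy(sa.sa_data, new_mac, ETH_ALEN);
 *
 *	rtnl_lock();
 *	err = dev_set_mac_address(dev, &sa);
 *	rtnl_unlock();
 */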
4920
4921 /**
4922 * dev_change_carrier - Change device carrier
4923 * @dev: device
4924 * @new_carrier: new value
4925 *
4926 * Change device carrier
4927 */
4928 int dev_change_carrier(struct net_device *dev, bool new_carrier)
4929 {
4930 const struct net_device_ops *ops = dev->netdev_ops;
4931
4932 if (!ops->ndo_change_carrier)
4933 return -EOPNOTSUPP;
4934 if (!netif_device_present(dev))
4935 return -ENODEV;
4936 return ops->ndo_change_carrier(dev, new_carrier);
4937 }
4938 EXPORT_SYMBOL(dev_change_carrier);
4939
4940 /**
4941 * dev_new_index - allocate an ifindex
4942 * @net: the applicable net namespace
4943 *
4944 * Returns a suitable unique value for a new device interface
4945 * number. The caller must hold the rtnl semaphore or the
4946 * dev_base_lock to be sure it remains unique.
4947 */
4948 static int dev_new_index(struct net *net)
4949 {
4950 int ifindex = net->ifindex;
4951 for (;;) {
4952 if (++ifindex <= 0)
4953 ifindex = 1;
4954 if (!__dev_get_by_index(net, ifindex))
4955 return net->ifindex = ifindex;
4956 }
4957 }
4958
4959 /* Delayed registration/unregistration */
4960 static LIST_HEAD(net_todo_list);
4961
4962 static void net_set_todo(struct net_device *dev)
4963 {
4964 list_add_tail(&dev->todo_list, &net_todo_list);
4965 }
4966
4967 static void rollback_registered_many(struct list_head *head)
4968 {
4969 struct net_device *dev, *tmp;
4970
4971 BUG_ON(dev_boot_phase);
4972 ASSERT_RTNL();
4973
4974 list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4975 /* Some devices reach here without ever having been
4976 * registered, as part of initialization unwind. Remove
4977 * those devices and proceed with the remaining.
4978 */
4979 if (dev->reg_state == NETREG_UNINITIALIZED) {
4980 pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4981 dev->name, dev);
4982
4983 WARN_ON(1);
4984 list_del(&dev->unreg_list);
4985 continue;
4986 }
4987 dev->dismantle = true;
4988 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4989 }
4990
4991 /* If device is running, close it first. */
4992 dev_close_many(head);
4993
4994 list_for_each_entry(dev, head, unreg_list) {
4995 /* And unlink it from device chain. */
4996 unlist_netdevice(dev);
4997
4998 dev->reg_state = NETREG_UNREGISTERING;
4999 }
5000
5001 synchronize_net();
5002
5003 list_for_each_entry(dev, head, unreg_list) {
5004 /* Shutdown queueing discipline. */
5005 dev_shutdown(dev);
5006
5007
5008 /* Notify protocols that we are about to destroy
5009 this device. They should clean up all their state.
5010 */
5011 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5012
5013 if (!dev->rtnl_link_ops ||
5014 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5015 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5016
5017 /*
5018 * Flush the unicast and multicast chains
5019 */
5020 dev_uc_flush(dev);
5021 dev_mc_flush(dev);
5022
5023 if (dev->netdev_ops->ndo_uninit)
5024 dev->netdev_ops->ndo_uninit(dev);
5025
5026 /* The notifier chain MUST detach all our upper devices. */
5027 WARN_ON(netdev_has_any_upper_dev(dev));
5028
5029 /* Remove entries from kobject tree */
5030 netdev_unregister_kobject(dev);
5031 #ifdef CONFIG_XPS
5032 /* Remove XPS queueing entries */
5033 netif_reset_xps_queues_gt(dev, 0);
5034 #endif
5035 }
5036
5037 synchronize_net();
5038
5039 list_for_each_entry(dev, head, unreg_list)
5040 dev_put(dev);
5041 }
5042
5043 static void rollback_registered(struct net_device *dev)
5044 {
5045 LIST_HEAD(single);
5046
5047 list_add(&dev->unreg_list, &single);
5048 rollback_registered_many(&single);
5049 list_del(&single);
5050 }
5051
5052 static netdev_features_t netdev_fix_features(struct net_device *dev,
5053 netdev_features_t features)
5054 {
5055 /* Fix illegal checksum combinations */
5056 if ((features & NETIF_F_HW_CSUM) &&
5057 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5058 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5059 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5060 }
5061
5062 /* TSO requires that SG is present as well. */
5063 if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5064 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5065 features &= ~NETIF_F_ALL_TSO;
5066 }
5067
5068 if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5069 !(features & NETIF_F_IP_CSUM)) {
5070 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5071 features &= ~NETIF_F_TSO;
5072 features &= ~NETIF_F_TSO_ECN;
5073 }
5074
5075 if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5076 !(features & NETIF_F_IPV6_CSUM)) {
5077 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5078 features &= ~NETIF_F_TSO6;
5079 }
5080
5081 /* TSO ECN requires that TSO is present as well. */
5082 if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5083 features &= ~NETIF_F_TSO_ECN;
5084
5085 /* Software GSO depends on SG. */
5086 if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5087 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5088 features &= ~NETIF_F_GSO;
5089 }
5090
5091 /* UFO needs SG and checksumming */
5092 if (features & NETIF_F_UFO) {
5093 /* maybe split UFO into V4 and V6? */
5094 if (!((features & NETIF_F_GEN_CSUM) ||
5095 (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5096 == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5097 netdev_dbg(dev,
5098 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5099 features &= ~NETIF_F_UFO;
5100 }
5101
5102 if (!(features & NETIF_F_SG)) {
5103 netdev_dbg(dev,
5104 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5105 features &= ~NETIF_F_UFO;
5106 }
5107 }
5108
5109 return features;
5110 }
5111
5112 int __netdev_update_features(struct net_device *dev)
5113 {
5114 netdev_features_t features;
5115 int err = 0;
5116
5117 ASSERT_RTNL();
5118
5119 features = netdev_get_wanted_features(dev);
5120
5121 if (dev->netdev_ops->ndo_fix_features)
5122 features = dev->netdev_ops->ndo_fix_features(dev, features);
5123
5124 /* driver might be less strict about feature dependencies */
5125 features = netdev_fix_features(dev, features);
5126
5127 if (dev->features == features)
5128 return 0;
5129
5130 netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5131 &dev->features, &features);
5132
5133 if (dev->netdev_ops->ndo_set_features)
5134 err = dev->netdev_ops->ndo_set_features(dev, features);
5135
5136 if (unlikely(err < 0)) {
5137 netdev_err(dev,
5138 "set_features() failed (%d); wanted %pNF, left %pNF\n",
5139 err, &features, &dev->features);
5140 return -1;
5141 }
5142
5143 if (!err)
5144 dev->features = features;
5145
5146 return 1;
5147 }
5148
5149 /**
5150 * netdev_update_features - recalculate device features
5151 * @dev: the device to check
5152 *
5153 * Recalculate dev->features set and send notifications if it
5154 * has changed. Should be called after driver or hardware dependent
5155 * conditions might have changed that influence the features.
5156 */
5157 void netdev_update_features(struct net_device *dev)
5158 {
5159 if (__netdev_update_features(dev))
5160 netdev_features_change(dev);
5161 }
5162 EXPORT_SYMBOL(netdev_update_features);
5163
5164 /**
5165 * netdev_change_features - recalculate device features
5166 * @dev: the device to check
5167 *
5168 * Recalculate dev->features set and send notifications even
5169 * if they have not changed. Should be called instead of
5170 * netdev_update_features() if also dev->vlan_features might
5171 * have changed to allow the changes to be propagated to stacked
5172 * VLAN devices.
5173 */
5174 void netdev_change_features(struct net_device *dev)
5175 {
5176 __netdev_update_features(dev);
5177 netdev_features_change(dev);
5178 }
5179 EXPORT_SYMBOL(netdev_change_features);
5180
5181 /**
5182 * netif_stacked_transfer_operstate - transfer operstate
5183 * @rootdev: the root or lower level device to transfer state from
5184 * @dev: the device to transfer operstate to
5185 *
5186 * Transfer operational state from root to device. This is normally
5187 * called when a stacking relationship exists between the root
5188 * device and the device(a leaf device).
5189 */
5190 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5191 struct net_device *dev)
5192 {
5193 if (rootdev->operstate == IF_OPER_DORMANT)
5194 netif_dormant_on(dev);
5195 else
5196 netif_dormant_off(dev);
5197
5198 if (netif_carrier_ok(rootdev)) {
5199 if (!netif_carrier_ok(dev))
5200 netif_carrier_on(dev);
5201 } else {
5202 if (netif_carrier_ok(dev))
5203 netif_carrier_off(dev);
5204 }
5205 }
5206 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5207
5208 #ifdef CONFIG_RPS
5209 static int netif_alloc_rx_queues(struct net_device *dev)
5210 {
5211 unsigned int i, count = dev->num_rx_queues;
5212 struct netdev_rx_queue *rx;
5213
5214 BUG_ON(count < 1);
5215
5216 rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5217 if (!rx)
5218 return -ENOMEM;
5219
5220 dev->_rx = rx;
5221
5222 for (i = 0; i < count; i++)
5223 rx[i].dev = dev;
5224 return 0;
5225 }
5226 #endif
5227
5228 static void netdev_init_one_queue(struct net_device *dev,
5229 struct netdev_queue *queue, void *_unused)
5230 {
5231 /* Initialize queue lock */
5232 spin_lock_init(&queue->_xmit_lock);
5233 netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5234 queue->xmit_lock_owner = -1;
5235 netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5236 queue->dev = dev;
5237 #ifdef CONFIG_BQL
5238 dql_init(&queue->dql, HZ);
5239 #endif
5240 }
5241
5242 static int netif_alloc_netdev_queues(struct net_device *dev)
5243 {
5244 unsigned int count = dev->num_tx_queues;
5245 struct netdev_queue *tx;
5246
5247 BUG_ON(count < 1);
5248
5249 tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5250 if (!tx)
5251 return -ENOMEM;
5252
5253 dev->_tx = tx;
5254
5255 netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5256 spin_lock_init(&dev->tx_global_lock);
5257
5258 return 0;
5259 }
5260
5261 /**
5262 * register_netdevice - register a network device
5263 * @dev: device to register
5264 *
5265 * Take a completed network device structure and add it to the kernel
5266 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5267 * chain. 0 is returned on success. A negative errno code is returned
5268 * on a failure to set up the device, or if the name is a duplicate.
5269 *
5270 * Callers must hold the rtnl semaphore. You may want
5271 * register_netdev() instead of this.
5272 *
5273 * BUGS:
5274 * The locking appears insufficient to guarantee two parallel registers
5275 * will not get the same name.
5276 */
5277
5278 int register_netdevice(struct net_device *dev)
5279 {
5280 int ret;
5281 struct net *net = dev_net(dev);
5282
5283 BUG_ON(dev_boot_phase);
5284 ASSERT_RTNL();
5285
5286 might_sleep();
5287
5288 /* When net_device's are persistent, this will be fatal. */
5289 BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5290 BUG_ON(!net);
5291
5292 spin_lock_init(&dev->addr_list_lock);
5293 netdev_set_addr_lockdep_class(dev);
5294
5295 dev->iflink = -1;
5296
5297 ret = dev_get_valid_name(net, dev, dev->name);
5298 if (ret < 0)
5299 goto out;
5300
5301 /* Init, if this function is available */
5302 if (dev->netdev_ops->ndo_init) {
5303 ret = dev->netdev_ops->ndo_init(dev);
5304 if (ret) {
5305 if (ret > 0)
5306 ret = -EIO;
5307 goto out;
5308 }
5309 }
5310
5311 if (((dev->hw_features | dev->features) &
5312 NETIF_F_HW_VLAN_CTAG_FILTER) &&
5313 (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5314 !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5315 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5316 ret = -EINVAL;
5317 goto err_uninit;
5318 }
5319
5320 ret = -EBUSY;
5321 if (!dev->ifindex)
5322 dev->ifindex = dev_new_index(net);
5323 else if (__dev_get_by_index(net, dev->ifindex))
5324 goto err_uninit;
5325
5326 if (dev->iflink == -1)
5327 dev->iflink = dev->ifindex;
5328
5329 /* Transfer changeable features to wanted_features and enable
5330 * software offloads (GSO and GRO).
5331 */
5332 dev->hw_features |= NETIF_F_SOFT_FEATURES;
5333 dev->features |= NETIF_F_SOFT_FEATURES;
5334 dev->wanted_features = dev->features & dev->hw_features;
5335
5336 /* Turn on no cache copy if HW is doing checksum */
5337 if (!(dev->flags & IFF_LOOPBACK)) {
5338 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5339 if (dev->features & NETIF_F_ALL_CSUM) {
5340 dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5341 dev->features |= NETIF_F_NOCACHE_COPY;
5342 }
5343 }
5344
5345 /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5346 */
5347 dev->vlan_features |= NETIF_F_HIGHDMA;
5348
5349 /* Make NETIF_F_SG inheritable to tunnel devices.
5350 */
5351 dev->hw_enc_features |= NETIF_F_SG;
5352
5353 ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5354 ret = notifier_to_errno(ret);
5355 if (ret)
5356 goto err_uninit;
5357
5358 ret = netdev_register_kobject(dev);
5359 if (ret)
5360 goto err_uninit;
5361 dev->reg_state = NETREG_REGISTERED;
5362
5363 __netdev_update_features(dev);
5364
5365 /*
5366 * Default initial state at registry is that the
5367 * device is present.
5368 */
5369
5370 set_bit(__LINK_STATE_PRESENT, &dev->state);
5371
5372 linkwatch_init_dev(dev);
5373
5374 dev_init_scheduler(dev);
5375 dev_hold(dev);
5376 list_netdevice(dev);
5377 add_device_randomness(dev->dev_addr, dev->addr_len);
5378
5379 /* If the device has a permanent device address, the driver should
5380 * set dev_addr, and addr_assign_type should be set to
5381 * NET_ADDR_PERM (the default value).
5382 */
5383 if (dev->addr_assign_type == NET_ADDR_PERM)
5384 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5385
5386 /* Notify protocols, that a new device appeared. */
5387 ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5388 ret = notifier_to_errno(ret);
5389 if (ret) {
5390 rollback_registered(dev);
5391 dev->reg_state = NETREG_UNREGISTERED;
5392 }
5393 /*
5394 * Prevent userspace races by waiting until the network
5395 * device is fully setup before sending notifications.
5396 */
5397 if (!dev->rtnl_link_ops ||
5398 dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5399 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5400
5401 out:
5402 return ret;
5403
5404 err_uninit:
5405 if (dev->netdev_ops->ndo_uninit)
5406 dev->netdev_ops->ndo_uninit(dev);
5407 goto out;
5408 }
5409 EXPORT_SYMBOL(register_netdevice);
5410
5411 /**
5412 * init_dummy_netdev - init a dummy network device for NAPI
5413 * @dev: device to init
5414 *
5415 * This takes a network device structure and initializes the minimum
5416 * amount of fields so it can be used to schedule NAPI polls without
5417 * registering a full blown interface. This is to be used by drivers
5418 * that need to tie several hardware interfaces to a single NAPI
5419 * poll scheduler due to HW limitations.
5420 */
5421 int init_dummy_netdev(struct net_device *dev)
5422 {
5423 /* Clear everything. Note we don't initialize spinlocks
5424 * as they aren't supposed to be taken by any of the
5425 * NAPI code, and this dummy netdev is supposed to be
5426 * used only for NAPI polls.
5427 */
5428 memset(dev, 0, sizeof(struct net_device));
5429
5430 /* make sure we BUG if trying to hit standard
5431 * register/unregister code path
5432 */
5433 dev->reg_state = NETREG_DUMMY;
5434
5435 /* NAPI wants this */
5436 INIT_LIST_HEAD(&dev->napi_list);
5437
5438 /* a dummy interface is started by default */
5439 set_bit(__LINK_STATE_PRESENT, &dev->state);
5440 set_bit(__LINK_STATE_START, &dev->state);
5441
5442 /* Note: We don't allocate pcpu_refcnt for dummy devices,
5443 * because users of this 'device' don't need to change
5444 * its refcount.
5445 */
5446
5447 return 0;
5448 }
5449 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5450
5451
5452 /**
5453 * register_netdev - register a network device
5454 * @dev: device to register
5455 *
5456 * Take a completed network device structure and add it to the kernel
5457 * interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5458 * chain. 0 is returned on success. A negative errno code is returned
5459 * on a failure to set up the device, or if the name is a duplicate.
5460 *
5461 * This is a wrapper around register_netdevice that takes the rtnl semaphore
5462 * and expands the device name if you passed a format string to
5463 * alloc_netdev.
5464 */
5465 int register_netdev(struct net_device *dev)
5466 {
5467 int err;
5468
5469 rtnl_lock();
5470 err = register_netdevice(dev);
5471 rtnl_unlock();
5472 return err;
5473 }
5474 EXPORT_SYMBOL(register_netdev);
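/*
 * Illustrative sketch (hypothetical driver probe, not part of this file): the
 * usual pattern is to allocate the net_device, fill in the ops and features,
 * and only then call register_netdev(), unwinding with free_netdev() on
 * failure. mydrv_priv and mydrv_netdev_ops are made-up names.
 *
 *	struct net_device *netdev;
 *	int err;
 *
 *	netdev = alloc_etherdev(sizeof(struct mydrv_priv));
 *	if (!netdev)
 *		return -ENOMEM;
 *
 *	netdev->netdev_ops = &mydrv_netdev_ops;
 *
 *	err = register_netdev(netdev);
 *	if (err) {
 *		free_netdev(netdev);
 *		return err;
 *	}
 */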
5475
5476 int netdev_refcnt_read(const struct net_device *dev)
5477 {
5478 int i, refcnt = 0;
5479
5480 for_each_possible_cpu(i)
5481 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5482 return refcnt;
5483 }
5484 EXPORT_SYMBOL(netdev_refcnt_read);
5485
5486 /**
5487 * netdev_wait_allrefs - wait until all references are gone.
5488 * @dev: target net_device
5489 *
5490 * This is called when unregistering network devices.
5491 *
5492 * Any protocol or device that holds a reference should register
5493 * for netdevice notification, and cleanup and put back the
5494 * reference if they receive an UNREGISTER event.
5495 * We can get stuck here if buggy protocols don't correctly
5496 * call dev_put.
5497 */
5498 static void netdev_wait_allrefs(struct net_device *dev)
5499 {
5500 unsigned long rebroadcast_time, warning_time;
5501 int refcnt;
5502
5503 linkwatch_forget_dev(dev);
5504
5505 rebroadcast_time = warning_time = jiffies;
5506 refcnt = netdev_refcnt_read(dev);
5507
5508 while (refcnt != 0) {
5509 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5510 rtnl_lock();
5511
5512 /* Rebroadcast unregister notification */
5513 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5514
5515 __rtnl_unlock();
5516 rcu_barrier();
5517 rtnl_lock();
5518
5519 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5520 if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5521 &dev->state)) {
5522 /* We must not have linkwatch events
5523 * pending on unregister. If this
5524 * happens, we simply run the queue
5525 * unscheduled, resulting in a noop
5526 * for this device.
5527 */
5528 linkwatch_run_queue();
5529 }
5530
5531 __rtnl_unlock();
5532
5533 rebroadcast_time = jiffies;
5534 }
5535
5536 msleep(250);
5537
5538 refcnt = netdev_refcnt_read(dev);
5539
5540 if (time_after(jiffies, warning_time + 10 * HZ)) {
5541 pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5542 dev->name, refcnt);
5543 warning_time = jiffies;
5544 }
5545 }
5546 }
5547
5548 /* The sequence is:
5549 *
5550 * rtnl_lock();
5551 * ...
5552 * register_netdevice(x1);
5553 * register_netdevice(x2);
5554 * ...
5555 * unregister_netdevice(y1);
5556 * unregister_netdevice(y2);
5557 * ...
5558 * rtnl_unlock();
5559 * free_netdev(y1);
5560 * free_netdev(y2);
5561 *
5562 * We are invoked by rtnl_unlock().
5563 * This allows us to deal with problems:
5564 * 1) We can delete sysfs objects which invoke hotplug
5565 * without deadlocking with linkwatch via keventd.
5566 * 2) Since we run with the RTNL semaphore not held, we can sleep
5567 * safely in order to wait for the netdev refcnt to drop to zero.
5568 *
5569 * We must not return until all unregister events added during
5570 * the interval the lock was held have been completed.
5571 */
5572 void netdev_run_todo(void)
5573 {
5574 struct list_head list;
5575
5576 /* Snapshot list, allow later requests */
5577 list_replace_init(&net_todo_list, &list);
5578
5579 __rtnl_unlock();
5580
5581
5582 /* Wait for rcu callbacks to finish before next phase */
5583 if (!list_empty(&list))
5584 rcu_barrier();
5585
5586 while (!list_empty(&list)) {
5587 struct net_device *dev
5588 = list_first_entry(&list, struct net_device, todo_list);
5589 list_del(&dev->todo_list);
5590
5591 rtnl_lock();
5592 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5593 __rtnl_unlock();
5594
5595 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5596 pr_err("network todo '%s' but state %d\n",
5597 dev->name, dev->reg_state);
5598 dump_stack();
5599 continue;
5600 }
5601
5602 dev->reg_state = NETREG_UNREGISTERED;
5603
5604 on_each_cpu(flush_backlog, dev, 1);
5605
5606 netdev_wait_allrefs(dev);
5607
5608 /* paranoia */
5609 BUG_ON(netdev_refcnt_read(dev));
5610 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5611 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5612 WARN_ON(dev->dn_ptr);
5613
5614 if (dev->destructor)
5615 dev->destructor(dev);
5616
5617 /* Free network device */
5618 kobject_put(&dev->dev.kobj);
5619 }
5620 }
5621
5622 /* Convert net_device_stats to rtnl_link_stats64. They have the same
5623 * fields in the same order, with only the type differing.
5624 */
5625 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5626 const struct net_device_stats *netdev_stats)
5627 {
5628 #if BITS_PER_LONG == 64
5629 BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5630 memcpy(stats64, netdev_stats, sizeof(*stats64));
5631 #else
5632 size_t i, n = sizeof(*stats64) / sizeof(u64);
5633 const unsigned long *src = (const unsigned long *)netdev_stats;
5634 u64 *dst = (u64 *)stats64;
5635
5636 BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5637 sizeof(*stats64) / sizeof(u64));
5638 for (i = 0; i < n; i++)
5639 dst[i] = src[i];
5640 #endif
5641 }
5642 EXPORT_SYMBOL(netdev_stats_to_stats64);
5643
5644 /**
5645 * dev_get_stats - get network device statistics
5646 * @dev: device to get statistics from
5647 * @storage: place to store stats
5648 *
5649 * Get network statistics from device. Return @storage.
5650 * The device driver may provide its own method by setting
5651 * dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5652 * otherwise the internal statistics structure is used.
5653 */
5654 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5655 struct rtnl_link_stats64 *storage)
5656 {
5657 const struct net_device_ops *ops = dev->netdev_ops;
5658
5659 if (ops->ndo_get_stats64) {
5660 memset(storage, 0, sizeof(*storage));
5661 ops->ndo_get_stats64(dev, storage);
5662 } else if (ops->ndo_get_stats) {
5663 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5664 } else {
5665 netdev_stats_to_stats64(storage, &dev->stats);
5666 }
5667 storage->rx_dropped += (unsigned long)atomic_long_read(&dev->rx_dropped);
5668 return storage;
5669 }
5670 EXPORT_SYMBOL(dev_get_stats);
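/*
 * Illustrative sketch (not part of this file): callers supply the
 * rtnl_link_stats64 storage themselves, typically on the stack, and get the
 * same pointer back:
 *
 *	struct rtnl_link_stats64 stats;
 *
 *	dev_get_stats(dev, &stats);
 *	pr_info("%s: %llu rx packets, %llu tx packets\n",
 *		dev->name, stats.rx_packets, stats.tx_packets);
 */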
5671
5672 struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
5673 {
5674 struct netdev_queue *queue = dev_ingress_queue(dev);
5675
5676 #ifdef CONFIG_NET_CLS_ACT
5677 if (queue)
5678 return queue;
5679 queue = kzalloc(sizeof(*queue), GFP_KERNEL);
5680 if (!queue)
5681 return NULL;
5682 netdev_init_one_queue(dev, queue, NULL);
5683 queue->qdisc = &noop_qdisc;
5684 queue->qdisc_sleeping = &noop_qdisc;
5685 rcu_assign_pointer(dev->ingress_queue, queue);
5686 #endif
5687 return queue;
5688 }
5689
5690 static const struct ethtool_ops default_ethtool_ops;
5691
5692 void netdev_set_default_ethtool_ops(struct net_device *dev,
5693 const struct ethtool_ops *ops)
5694 {
5695 if (dev->ethtool_ops == &default_ethtool_ops)
5696 dev->ethtool_ops = ops;
5697 }
5698 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5699
5700 /**
5701 * alloc_netdev_mqs - allocate network device
5702 * @sizeof_priv: size of private data to allocate space for
5703 * @name: device name format string
5704 * @setup: callback to initialize device
5705 * @txqs: the number of TX subqueues to allocate
5706 * @rxqs: the number of RX subqueues to allocate
5707 *
5708 * Allocates a struct net_device with private data area for driver use
5709 * and performs basic initialization. Also allocates subquue structs
5710 * for each queue on the device.
5711 */
5712 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5713 void (*setup)(struct net_device *),
5714 unsigned int txqs, unsigned int rxqs)
5715 {
5716 struct net_device *dev;
5717 size_t alloc_size;
5718 struct net_device *p;
5719
5720 BUG_ON(strlen(name) >= sizeof(dev->name));
5721
5722 if (txqs < 1) {
5723 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5724 return NULL;
5725 }
5726
5727 #ifdef CONFIG_RPS
5728 if (rxqs < 1) {
5729 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5730 return NULL;
5731 }
5732 #endif
5733
5734 alloc_size = sizeof(struct net_device);
5735 if (sizeof_priv) {
5736 /* ensure 32-byte alignment of private area */
5737 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5738 alloc_size += sizeof_priv;
5739 }
5740 /* ensure 32-byte alignment of whole construct */
5741 alloc_size += NETDEV_ALIGN - 1;
5742
5743 p = kzalloc(alloc_size, GFP_KERNEL);
5744 if (!p)
5745 return NULL;
5746
5747 dev = PTR_ALIGN(p, NETDEV_ALIGN);
5748 dev->padded = (char *)dev - (char *)p;
5749
5750 dev->pcpu_refcnt = alloc_percpu(int);
5751 if (!dev->pcpu_refcnt)
5752 goto free_p;
5753
5754 if (dev_addr_init(dev))
5755 goto free_pcpu;
5756
5757 dev_mc_init(dev);
5758 dev_uc_init(dev);
5759
5760 dev_net_set(dev, &init_net);
5761
5762 dev->gso_max_size = GSO_MAX_SIZE;
5763 dev->gso_max_segs = GSO_MAX_SEGS;
5764
5765 INIT_LIST_HEAD(&dev->napi_list);
5766 INIT_LIST_HEAD(&dev->unreg_list);
5767 INIT_LIST_HEAD(&dev->link_watch_list);
5768 INIT_LIST_HEAD(&dev->upper_dev_list);
5769 dev->priv_flags = IFF_XMIT_DST_RELEASE;
5770 setup(dev);
5771
5772 dev->num_tx_queues = txqs;
5773 dev->real_num_tx_queues = txqs;
5774 if (netif_alloc_netdev_queues(dev))
5775 goto free_all;
5776
5777 #ifdef CONFIG_RPS
5778 dev->num_rx_queues = rxqs;
5779 dev->real_num_rx_queues = rxqs;
5780 if (netif_alloc_rx_queues(dev))
5781 goto free_all;
5782 #endif
5783
5784 strcpy(dev->name, name);
5785 dev->group = INIT_NETDEV_GROUP;
5786 if (!dev->ethtool_ops)
5787 dev->ethtool_ops = &default_ethtool_ops;
5788 return dev;
5789
5790 free_all:
5791 free_netdev(dev);
5792 return NULL;
5793
5794 free_pcpu:
5795 free_percpu(dev->pcpu_refcnt);
5796 kfree(dev->_tx);
5797 #ifdef CONFIG_RPS
5798 kfree(dev->_rx);
5799 #endif
5800
5801 free_p:
5802 kfree(p);
5803 return NULL;
5804 }
5805 EXPORT_SYMBOL(alloc_netdev_mqs);
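/*
 * Illustrative sketch (hypothetical, not from this file): a multiqueue
 * Ethernet driver would normally allocate its device through the
 * alloc_etherdev_mq()/alloc_etherdev_mqs() wrappers, which boil down to this
 * call with ether_setup() as the setup callback:
 *
 *	// 8 TX queues and 8 RX queues, "eth%d" name template
 *	dev = alloc_netdev_mqs(sizeof(struct mydrv_priv), "eth%d",
 *			       ether_setup, 8, 8);
 *	if (!dev)
 *		return -ENOMEM;
 */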
5806
5807 /**
5808 * free_netdev - free network device
5809 * @dev: device
5810 *
5811 * This function does the last stage of destroying an allocated device
5812 * interface. The reference to the device object is released.
5813 * If this is the last reference then it will be freed.
5814 */
5815 void free_netdev(struct net_device *dev)
5816 {
5817 struct napi_struct *p, *n;
5818
5819 release_net(dev_net(dev));
5820
5821 kfree(dev->_tx);
5822 #ifdef CONFIG_RPS
5823 kfree(dev->_rx);
5824 #endif
5825
5826 kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5827
5828 /* Flush device addresses */
5829 dev_addr_flush(dev);
5830
5831 list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5832 netif_napi_del(p);
5833
5834 free_percpu(dev->pcpu_refcnt);
5835 dev->pcpu_refcnt = NULL;
5836
5837 /* Compatibility with error handling in drivers */
5838 if (dev->reg_state == NETREG_UNINITIALIZED) {
5839 kfree((char *)dev - dev->padded);
5840 return;
5841 }
5842
5843 BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5844 dev->reg_state = NETREG_RELEASED;
5845
5846 /* will free via device release */
5847 put_device(&dev->dev);
5848 }
5849 EXPORT_SYMBOL(free_netdev);
5850
5851 /**
5852 * synchronize_net - Synchronize with packet receive processing
5853 *
5854 * Wait for packets currently being received to be done.
5855 * Does not block later packets from starting.
5856 */
5857 void synchronize_net(void)
5858 {
5859 might_sleep();
5860 if (rtnl_is_locked())
5861 synchronize_rcu_expedited();
5862 else
5863 synchronize_rcu();
5864 }
5865 EXPORT_SYMBOL(synchronize_net);
5866
5867 /**
5868 * unregister_netdevice_queue - remove device from the kernel
5869 * @dev: device
5870 * @head: list
5871 *
5872 * This function shuts down a device interface and removes it
5873 * from the kernel tables.
5874 * If head not NULL, device is queued to be unregistered later.
5875 *
5876 * Callers must hold the rtnl semaphore. You may want
5877 * unregister_netdev() instead of this.
5878 */
5879
5880 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5881 {
5882 ASSERT_RTNL();
5883
5884 if (head) {
5885 list_move_tail(&dev->unreg_list, head);
5886 } else {
5887 rollback_registered(dev);
5888 /* Finish processing unregister after unlock */
5889 net_set_todo(dev);
5890 }
5891 }
5892 EXPORT_SYMBOL(unregister_netdevice_queue);
5893
5894 /**
5895 * unregister_netdevice_many - unregister many devices
5896 * @head: list of devices
5897 *
5898 * Note: As most callers use a stack-allocated list_head,
5899 * we force a list_del() to make sure the stack won't be corrupted later.
5900 */
5901 void unregister_netdevice_many(struct list_head *head)
5902 {
5903 struct net_device *dev;
5904
5905 if (!list_empty(head)) {
5906 rollback_registered_many(head);
5907 list_for_each_entry(dev, head, unreg_list)
5908 net_set_todo(dev);
5909 list_del(head);
5910 }
5911 }
5912 EXPORT_SYMBOL(unregister_netdevice_many);
5913
5914 /**
5915 * unregister_netdev - remove device from the kernel
5916 * @dev: device
5917 *
5918 * This function shuts down a device interface and removes it
5919 * from the kernel tables.
5920 *
5921 * This is just a wrapper for unregister_netdevice that takes
5922 * the rtnl semaphore. In general you want to use this and not
5923 * unregister_netdevice.
5924 */
5925 void unregister_netdev(struct net_device *dev)
5926 {
5927 rtnl_lock();
5928 unregister_netdevice(dev);
5929 rtnl_unlock();
5930 }
5931 EXPORT_SYMBOL(unregister_netdev);
5932
5933 /**
5934 * dev_change_net_namespace - move device to a different network namespace
5935 * @dev: device
5936 * @net: network namespace
5937 * @pat: If not NULL name pattern to try if the current device name
5938 * is already taken in the destination network namespace.
5939 *
5940 * This function shuts down a device interface and moves it
5941 * to a new network namespace. On success 0 is returned, on
5942 * a failure a netagive errno code is returned.
5943 *
5944 * Callers must hold the rtnl semaphore.
5945 */
5946
5947 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5948 {
5949 int err;
5950
5951 ASSERT_RTNL();
5952
5953 /* Don't allow namespace local devices to be moved. */
5954 err = -EINVAL;
5955 if (dev->features & NETIF_F_NETNS_LOCAL)
5956 goto out;
5957
5958 /* Ensure the device has been registered */
5959 if (dev->reg_state != NETREG_REGISTERED)
5960 goto out;
5961
5962 /* Get out if there is nothing to do */
5963 err = 0;
5964 if (net_eq(dev_net(dev), net))
5965 goto out;
5966
5967 /* Pick the destination device name, and ensure
5968 * we can use it in the destination network namespace.
5969 */
5970 err = -EEXIST;
5971 if (__dev_get_by_name(net, dev->name)) {
5972 /* We get here if we can't use the current device name */
5973 if (!pat)
5974 goto out;
5975 if (dev_get_valid_name(net, dev, pat) < 0)
5976 goto out;
5977 }
5978
5979 /*
5980 * And now a mini version of register_netdevice and unregister_netdevice.
5981 */
5982
5983 /* If the device is running, close it first. */
5984 dev_close(dev);
5985
5986 /* And unlink it from device chain */
5987 err = -ENODEV;
5988 unlist_netdevice(dev);
5989
5990 synchronize_net();
5991
5992 /* Shutdown queueing discipline. */
5993 dev_shutdown(dev);
5994
5995 /* Notify protocols that we are about to destroy
5996 this device. They should clean up all their state.
5997
5998 Note that dev->reg_state stays at NETREG_REGISTERED.
5999 This is intentional: this way 8021q and macvlan know
6000 the device is just moving and can keep their slaves up.
6001 */
6002 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
6003 rcu_barrier();
6004 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
6005 rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
6006
6007 /*
6008 * Flush the unicast and multicast chains
6009 */
6010 dev_uc_flush(dev);
6011 dev_mc_flush(dev);
6012
6013 /* Send a netdev-removed uevent to the old namespace */
6014 kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
6015
6016 /* Actually switch the network namespace */
6017 dev_net_set(dev, net);
6018
6019 /* If there is an ifindex conflict assign a new one */
6020 if (__dev_get_by_index(net, dev->ifindex)) {
6021 int iflink = (dev->iflink == dev->ifindex);
6022 dev->ifindex = dev_new_index(net);
6023 if (iflink)
6024 dev->iflink = dev->ifindex;
6025 }
6026
6027 /* Send a netdev-add uevent to the new namespace */
6028 kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
6029
6030 /* Fixup kobjects */
6031 err = device_rename(&dev->dev, dev->name);
6032 WARN_ON(err);
6033
6034 /* Add the device back in the hashes */
6035 list_netdevice(dev);
6036
6037 /* Notify protocols that a new device appeared. */
6038 call_netdevice_notifiers(NETDEV_REGISTER, dev);
6039
6040 /*
6041 * Prevent userspace races by waiting until the network
6042 * device is fully set up before sending notifications.
6043 */
6044 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6045
6046 synchronize_net();
6047 err = 0;
6048 out:
6049 return err;
6050 }
6051 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
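/*
 * A minimal sketch of moving a device into another namespace, assuming
 * the caller already holds a reference on "target_net": the "eth%d"
 * pattern is only used when the current name is taken in the target,
 * which is exactly the __dev_get_by_name() case handled above.
 */
#if 0
static int foo_move_dev_sketch(struct net_device *dev, struct net *target_net)
{
	int err;

	rtnl_lock();
	err = dev_change_net_namespace(dev, target_net, "eth%d");
	rtnl_unlock();

	return err;	/* 0 on success, negative errno on failure */
}
#endif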
6052
6053 static int dev_cpu_callback(struct notifier_block *nfb,
6054 unsigned long action,
6055 void *ocpu)
6056 {
6057 struct sk_buff **list_skb;
6058 struct sk_buff *skb;
6059 unsigned int cpu, oldcpu = (unsigned long)ocpu;
6060 struct softnet_data *sd, *oldsd;
6061
6062 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6063 return NOTIFY_OK;
6064
6065 local_irq_disable();
6066 cpu = smp_processor_id();
6067 sd = &per_cpu(softnet_data, cpu);
6068 oldsd = &per_cpu(softnet_data, oldcpu);
6069
6070 /* Find end of our completion_queue. */
6071 list_skb = &sd->completion_queue;
6072 while (*list_skb)
6073 list_skb = &(*list_skb)->next;
6074 /* Append completion queue from offline CPU. */
6075 *list_skb = oldsd->completion_queue;
6076 oldsd->completion_queue = NULL;
6077
6078 /* Append output queue from offline CPU. */
6079 if (oldsd->output_queue) {
6080 *sd->output_queue_tailp = oldsd->output_queue;
6081 sd->output_queue_tailp = oldsd->output_queue_tailp;
6082 oldsd->output_queue = NULL;
6083 oldsd->output_queue_tailp = &oldsd->output_queue;
6084 }
6085 /* Append NAPI poll list from offline CPU, with one exception:
6086 * process_backlog() must be called by the cpu owning the percpu backlog.
6087 * We properly handle process_queue & input_pkt_queue later.
6088 */
6089 while (!list_empty(&oldsd->poll_list)) {
6090 struct napi_struct *napi = list_first_entry(&oldsd->poll_list,
6091 struct napi_struct,
6092 poll_list);
6093
6094 list_del_init(&napi->poll_list);
6095 if (napi->poll == process_backlog)
6096 napi->state = 0;
6097 else
6098 ____napi_schedule(sd, napi);
6099 }
6100
6101 raise_softirq_irqoff(NET_TX_SOFTIRQ);
6102 local_irq_enable();
6103
6104 /* Process offline CPU's input_pkt_queue */
6105 while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6106 netif_rx(skb);
6107 input_queue_head_incr(oldsd);
6108 }
6109 while ((skb = skb_dequeue(&oldsd->input_pkt_queue))) {
6110 netif_rx(skb);
6111 input_queue_head_incr(oldsd);
6112 }
6113
6114 return NOTIFY_OK;
6115 }
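/*
 * A minimal sketch of how such a CPU-hotplug hook is wired up, mirroring
 * the hotcpu_notifier(dev_cpu_callback, 0) call in net_dev_init() below;
 * the "foo" callback is hypothetical.
 */
#if 0
static int foo_cpu_callback_sketch(struct notifier_block *nfb,
				   unsigned long action, void *ocpu)
{
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return NOTIFY_OK;

	/* migrate per-cpu state away from CPU (unsigned long)ocpu here */
	return NOTIFY_OK;
}

/* registered at init time, e.g.:
 *	hotcpu_notifier(foo_cpu_callback_sketch, 0);
 */
#endif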
6116
6117
6118 /**
6119 * netdev_increment_features - increment feature set by one
6120 * @all: current feature set
6121 * @one: new feature set
6122 * @mask: mask feature set
6123 *
6124 * Computes a new feature set after adding a device with feature set
6125 * @one to the master device with current feature set @all. Will not
6126 * enable anything that is off in @mask. Returns the new feature set.
6127 */
6128 netdev_features_t netdev_increment_features(netdev_features_t all,
6129 netdev_features_t one, netdev_features_t mask)
6130 {
6131 if (mask & NETIF_F_GEN_CSUM)
6132 mask |= NETIF_F_ALL_CSUM;
6133 mask |= NETIF_F_VLAN_CHALLENGED;
6134
6135 all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6136 all &= one | ~NETIF_F_ALL_FOR_ALL;
6137
6138 /* If one device supports hw checksumming, set for all. */
6139 if (all & NETIF_F_GEN_CSUM)
6140 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6141
6142 return all;
6143 }
6144 EXPORT_SYMBOL(netdev_increment_features);
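/*
 * A minimal sketch of how an aggregating master (bond/bridge/team style)
 * uses netdev_increment_features(), assuming a hypothetical slave list
 * and feature mask: each slave's features are folded into a running
 * value, and the mask bounds what the master is willing to offload.
 */
#if 0
#define FOO_MASTER_FEATURES (NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_HIGHDMA)

struct foo_slave {			/* hypothetical per-slave bookkeeping */
	struct list_head list;
	struct net_device *dev;
};

static netdev_features_t foo_compute_features_sketch(struct list_head *slaves)
{
	struct foo_slave *s;
	netdev_features_t features = FOO_MASTER_FEATURES;

	list_for_each_entry(s, slaves, list)
		features = netdev_increment_features(features,
						     s->dev->features,
						     FOO_MASTER_FEATURES);
	return features;
}
#endif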
6145
6146 static struct hlist_head *netdev_create_hash(void)
6147 {
6148 int i;
6149 struct hlist_head *hash;
6150
6151 hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6152 if (hash != NULL)
6153 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6154 INIT_HLIST_HEAD(&hash[i]);
6155
6156 return hash;
6157 }
6158
6159 /* Initialize per network namespace state */
6160 static int __net_init netdev_init(struct net *net)
6161 {
6162 if (net != &init_net)
6163 INIT_LIST_HEAD(&net->dev_base_head);
6164
6165 net->dev_name_head = netdev_create_hash();
6166 if (net->dev_name_head == NULL)
6167 goto err_name;
6168
6169 net->dev_index_head = netdev_create_hash();
6170 if (net->dev_index_head == NULL)
6171 goto err_idx;
6172
6173 return 0;
6174
6175 err_idx:
6176 kfree(net->dev_name_head);
6177 err_name:
6178 return -ENOMEM;
6179 }
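/*
 * A minimal sketch of a lookup backed by the hash tables initialised
 * above, assuming the caller holds neither RCU nor the RTNL and so uses
 * the reference-taking dev_get_by_name() variant.
 */
#if 0
static bool foo_dev_exists_sketch(struct net *net, const char *name)
{
	struct net_device *dev = dev_get_by_name(net, name);

	if (!dev)
		return false;

	dev_put(dev);		/* dev_get_by_name() took a reference */
	return true;
}
#endif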
6180
6181 /**
6182 * netdev_drivername - network driver for the device
6183 * @dev: network device
6184 *
6185 * Determine network driver for device.
6186 */
6187 const char *netdev_drivername(const struct net_device *dev)
6188 {
6189 const struct device_driver *driver;
6190 const struct device *parent;
6191 const char *empty = "";
6192
6193 parent = dev->dev.parent;
6194 if (!parent)
6195 return empty;
6196
6197 driver = parent->driver;
6198 if (driver && driver->name)
6199 return driver->name;
6200 return empty;
6201 }
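/*
 * A minimal sketch of the intended use of netdev_drivername(): purely
 * diagnostic output, as in the qdisc watchdog's transmit-timeout
 * message.  It never returns NULL, so it can be fed straight to printk.
 */
#if 0
static void foo_report_timeout_sketch(struct net_device *dev)
{
	pr_warn("transmit queue timed out on %s (driver %s)\n",
		dev->name, netdev_drivername(dev));
}
#endif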
6202
6203 static int __netdev_printk(const char *level, const struct net_device *dev,
6204 struct va_format *vaf)
6205 {
6206 int r;
6207
6208 if (dev && dev->dev.parent) {
6209 r = dev_printk_emit(level[1] - '0',
6210 dev->dev.parent,
6211 "%s %s %s: %pV",
6212 dev_driver_string(dev->dev.parent),
6213 dev_name(dev->dev.parent),
6214 netdev_name(dev), vaf);
6215 } else if (dev) {
6216 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6217 } else {
6218 r = printk("%s(NULL net_device): %pV", level, vaf);
6219 }
6220
6221 return r;
6222 }
6223
6224 int netdev_printk(const char *level, const struct net_device *dev,
6225 const char *format, ...)
6226 {
6227 struct va_format vaf;
6228 va_list args;
6229 int r;
6230
6231 va_start(args, format);
6232
6233 vaf.fmt = format;
6234 vaf.va = &args;
6235
6236 r = __netdev_printk(level, dev, &vaf);
6237
6238 va_end(args);
6239
6240 return r;
6241 }
6242 EXPORT_SYMBOL(netdev_printk);
6243
6244 #define define_netdev_printk_level(func, level) \
6245 int func(const struct net_device *dev, const char *fmt, ...) \
6246 { \
6247 int r; \
6248 struct va_format vaf; \
6249 va_list args; \
6250 \
6251 va_start(args, fmt); \
6252 \
6253 vaf.fmt = fmt; \
6254 vaf.va = &args; \
6255 \
6256 r = __netdev_printk(level, dev, &vaf); \
6257 \
6258 va_end(args); \
6259 \
6260 return r; \
6261 } \
6262 EXPORT_SYMBOL(func);
6263
6264 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6265 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6266 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6267 define_netdev_printk_level(netdev_err, KERN_ERR);
6268 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6269 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6270 define_netdev_printk_level(netdev_info, KERN_INFO);
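/*
 * A minimal sketch of driver-side use of the wrappers generated above,
 * assuming a hypothetical "foo" ndo_open implementation: the messages
 * come out prefixed with driver, bus device and interface name, as
 * assembled by __netdev_printk().
 */
#if 0
static int foo_open_sketch(struct net_device *dev)
{
	if (!netif_carrier_ok(dev))
		netdev_warn(dev, "link is down at open time\n");

	netdev_info(dev, "interface up, mtu %u\n", dev->mtu);
	return 0;
}
#endif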
6271
6272 static void __net_exit netdev_exit(struct net *net)
6273 {
6274 kfree(net->dev_name_head);
6275 kfree(net->dev_index_head);
6276 }
6277
6278 static struct pernet_operations __net_initdata netdev_net_ops = {
6279 .init = netdev_init,
6280 .exit = netdev_exit,
6281 };
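/*
 * A minimal sketch of the same per-namespace pattern for a hypothetical
 * "foo" subsystem: .init/.exit callbacks in a pernet_operations struct,
 * registered with register_pernet_subsys() at module init.
 */
#if 0
static int __net_init foo_net_init_sketch(struct net *net)
{
	/* allocate per-namespace state here (or let the core do it by
	 * setting .id/.size and retrieving it later via net_generic())
	 */
	return 0;
}

static void __net_exit foo_net_exit_sketch(struct net *net)
{
	/* free whatever foo_net_init_sketch() allocated */
}

static struct pernet_operations foo_net_ops_sketch = {
	.init = foo_net_init_sketch,
	.exit = foo_net_exit_sketch,
};

/* from the subsystem's init code:
 *	register_pernet_subsys(&foo_net_ops_sketch);
 */
#endif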
6282
6283 static void __net_exit default_device_exit(struct net *net)
6284 {
6285 struct net_device *dev, *aux;
6286 /*
6287 * Push all migratable network devices back to the
6288 * initial network namespace
6289 */
6290 rtnl_lock();
6291 for_each_netdev_safe(net, dev, aux) {
6292 int err;
6293 char fb_name[IFNAMSIZ];
6294
6295 /* Ignore unmovable devices (e.g. loopback) */
6296 if (dev->features & NETIF_F_NETNS_LOCAL)
6297 continue;
6298
6299 /* Leave virtual devices for the generic cleanup */
6300 if (dev->rtnl_link_ops)
6301 continue;
6302
6303 /* Push remaining network devices to init_net */
6304 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6305 err = dev_change_net_namespace(dev, &init_net, fb_name);
6306 if (err) {
6307 pr_emerg("%s: failed to move %s to init_net: %d\n",
6308 __func__, dev->name, err);
6309 BUG();
6310 }
6311 }
6312 rtnl_unlock();
6313 }
6314
6315 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6316 {
6317 /* At exit all network devices must be removed from a network
6318 * namespace. Do this in the reverse order of registration.
6319 * Do this across as many network namespaces as possible to
6320 * improve batching efficiency.
6321 */
6322 struct net_device *dev;
6323 struct net *net;
6324 LIST_HEAD(dev_kill_list);
6325
6326 rtnl_lock();
6327 list_for_each_entry(net, net_list, exit_list) {
6328 for_each_netdev_reverse(net, dev) {
6329 if (dev->rtnl_link_ops)
6330 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6331 else
6332 unregister_netdevice_queue(dev, &dev_kill_list);
6333 }
6334 }
6335 unregister_netdevice_many(&dev_kill_list);
6336 rtnl_unlock();
6337 }
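/*
 * A minimal sketch of why the ->dellink() branch above batches well,
 * assuming a hypothetical virtual "foo" device (struct rtnl_link_ops
 * comes from <net/rtnetlink.h>): dellink implementations queue onto the
 * list they are handed instead of unregistering immediately, so the
 * whole namespace is torn down by one unregister_netdevice_many() call.
 */
#if 0
static void foo_dellink_sketch(struct net_device *dev, struct list_head *head)
{
	/* driver-specific teardown would go here */
	unregister_netdevice_queue(dev, head);
}

static struct rtnl_link_ops foo_link_ops_sketch = {
	.kind    = "foo",
	.dellink = foo_dellink_sketch,
};
#endif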
6338
6339 static struct pernet_operations __net_initdata default_device_ops = {
6340 .exit = default_device_exit,
6341 .exit_batch = default_device_exit_batch,
6342 };
6343
6344 /*
6345 * Initialize the DEV module. At boot time this walks the device list and
6346 * unhooks any devices that fail to initialise (normally hardware not
6347 * present) and leaves us with a valid list of present and active devices.
6348 *
6349 */
6350
6351 /*
6352 * This is called single threaded during boot, so no need
6353 * to take the rtnl semaphore.
6354 */
6355 static int __init net_dev_init(void)
6356 {
6357 int i, rc = -ENOMEM;
6358
6359 BUG_ON(!dev_boot_phase);
6360
6361 if (dev_proc_init())
6362 goto out;
6363
6364 if (netdev_kobject_init())
6365 goto out;
6366
6367 INIT_LIST_HEAD(&ptype_all);
6368 for (i = 0; i < PTYPE_HASH_SIZE; i++)
6369 INIT_LIST_HEAD(&ptype_base[i]);
6370
6371 INIT_LIST_HEAD(&offload_base);
6372
6373 if (register_pernet_subsys(&netdev_net_ops))
6374 goto out;
6375
6376 /*
6377 * Initialise the packet receive queues.
6378 */
6379
6380 for_each_possible_cpu(i) {
6381 struct softnet_data *sd = &per_cpu(softnet_data, i);
6382
6383 memset(sd, 0, sizeof(*sd));
6384 skb_queue_head_init(&sd->input_pkt_queue);
6385 skb_queue_head_init(&sd->process_queue);
6386 sd->completion_queue = NULL;
6387 INIT_LIST_HEAD(&sd->poll_list);
6388 sd->output_queue = NULL;
6389 sd->output_queue_tailp = &sd->output_queue;
6390 #ifdef CONFIG_RPS
6391 sd->csd.func = rps_trigger_softirq;
6392 sd->csd.info = sd;
6393 sd->csd.flags = 0;
6394 sd->cpu = i;
6395 #endif
6396
6397 sd->backlog.poll = process_backlog;
6398 sd->backlog.weight = weight_p;
6399 sd->backlog.gro_list = NULL;
6400 sd->backlog.gro_count = 0;
6401 }
6402
6403 dev_boot_phase = 0;
6404
6405 /* The loopback device is special: if any other network device
6406 * is present in a network namespace, the loopback device must
6407 * be present. Since we now dynamically allocate and free the
6408 * loopback device, ensure this invariant is maintained by
6409 * keeping the loopback device as the first device on the
6410 * list of network devices, ensuring the loopback device
6411 * is the first device that appears and the last network device
6412 * that disappears.
6413 */
6414 if (register_pernet_device(&loopback_net_ops))
6415 goto out;
6416
6417 if (register_pernet_device(&default_device_ops))
6418 goto out;
6419
6420 open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6421 open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6422
6423 hotcpu_notifier(dev_cpu_callback, 0);
6424 dst_init();
6425 rc = 0;
6426 out:
6427 return rc;
6428 }
6429
6430 subsys_initcall(net_dev_init);