net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <linux/bitops.h>
  77 #include <linux/capability.h>
  78 #include <linux/cpu.h>
  79 #include <linux/types.h>
  80 #include <linux/kernel.h>
  81 #include <linux/hash.h>
  82 #include <linux/slab.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/stat.h>
 101 #include <net/dst.h>
 102 #include <net/pkt_sched.h>
 103 #include <net/checksum.h>
 104 #include <net/xfrm.h>
 105 #include <linux/highmem.h>
 106 #include <linux/init.h>
 107 #include <linux/module.h>
 108 #include <linux/netpoll.h>
 109 #include <linux/rcupdate.h>
 110 #include <linux/delay.h>
 111 #include <net/iw_handler.h>
 112 #include <asm/current.h>
 113 #include <linux/audit.h>
 114 #include <linux/dmaengine.h>
 115 #include <linux/err.h>
 116 #include <linux/ctype.h>
 117 #include <linux/if_arp.h>
 118 #include <linux/if_vlan.h>
 119 #include <linux/ip.h>
 120 #include <net/ip.h>
 121 #include <linux/ipv6.h>
 122 #include <linux/in.h>
 123 #include <linux/jhash.h>
 124 #include <linux/random.h>
 125 #include <trace/events/napi.h>
 126 #include <trace/events/net.h>
 127 #include <trace/events/skb.h>
 128 #include <linux/pci.h>
 129 #include <linux/inetdevice.h>
 130 #include <linux/cpu_rmap.h>
 131 #include <linux/static_key.h>
 132 #include <net/udp.h>
 133 #include "net-sysfs.h"
 134
 135 #ifdef UDP_SKT_WIFI
 136 #include <linux/ftrace_event.h>
 137 #endif
 138
 139 /* Instead of increasing this, you should create a hash table. */
 140 #define MAX_GRO_SKBS 8
 141
 142 /* This should be increased if a protocol with a bigger head is added. */
 143 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 144
 145 static DEFINE_SPINLOCK(ptype_lock);
 146 static DEFINE_SPINLOCK(offload_lock);
 147 struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 148 struct list_head ptype_all __read_mostly;       /* Taps */
 149 static struct list_head offload_base __read_mostly;
 150
 151 /*
 152  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 153  * semaphore.
 154  *
 155  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 156  *
 157  * Writers must hold the rtnl semaphore while they loop through the
 158  * dev_base_head list, and hold dev_base_lock for writing when they do the
 159  * actual updates.  This allows pure readers to access the list even
 160  * while a writer is preparing to update it.
 161  *
 162  * To put it another way, dev_base_lock is held for writing only to
 163  * protect against pure readers; the rtnl semaphore provides the
 164  * protection against other writers.
 165  *
 166  * See, for example usages, register_netdevice() and
 167  * unregister_netdevice(), which must be called with the rtnl
 168  * semaphore held.
 169  */
 170 DEFINE_RWLOCK(dev_base_lock);
 171 EXPORT_SYMBOL(dev_base_lock);
 172
 173 seqcount_t devnet_rename_seq;
 174
 175 static inline void dev_base_seq_inc(struct net *net)
 176 {
 177         while (++net->dev_base_seq == 0);
 178 }
 179
 180 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 181 {
 182         unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 183
 184         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 185 }
 186
 187 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 188 {
 189         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 190 }
 191
 192 static inline void rps_lock(struct softnet_data *sd)
 193 {
 194 #ifdef CONFIG_RPS
 195         spin_lock(&sd->input_pkt_queue.lock);
 196 #endif
 197 }
 198
 199 static inline void rps_unlock(struct softnet_data *sd)
 200 {
 201 #ifdef CONFIG_RPS
 202         spin_unlock(&sd->input_pkt_queue.lock);
 203 #endif
 204 }
 205
 206 /* Device list insertion */
 207 static void list_netdevice(struct net_device *dev)
 208 {
 209         struct net *net = dev_net(dev);
 210
 211         ASSERT_RTNL();
 212
 213         write_lock_bh(&dev_base_lock);
 214         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 215         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 216         hlist_add_head_rcu(&dev->index_hlist,
 217                            dev_index_hash(net, dev->ifindex));
 218         write_unlock_bh(&dev_base_lock);
 219
 220         dev_base_seq_inc(net);
 221 }
 222
 223 /* Device list removal
 224  * caller must respect a RCU grace period before freeing/reusing dev
 225  */
 226 static void unlist_netdevice(struct net_device *dev)
 227 {
 228         ASSERT_RTNL();
 229
 230         /* Unlink dev from the device chain */
 231         write_lock_bh(&dev_base_lock);
 232         list_del_rcu(&dev->dev_list);
 233         hlist_del_rcu(&dev->name_hlist);
 234         hlist_del_rcu(&dev->index_hlist);
 235         write_unlock_bh(&dev_base_lock);
 236
 237         dev_base_seq_inc(dev_net(dev));
 238 }
 239
 240 /*
 241  *      Our notifier list
 242  */
 243
 244 static RAW_NOTIFIER_HEAD(netdev_chain);
 245
 246 /*
 247  *      Device drivers call our routines to queue packets here. We empty the
 248  *      queue in the local softnet handler.
 249  */
 250
 251 DEFINE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
 252 EXPORT_PER_CPU_SYMBOL(softnet_data);
 253
 254 #ifdef CONFIG_LOCKDEP
 255 /*
 256  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 257  * according to dev->type
 258  */
 259 static const unsigned short netdev_lock_type[] =
 260         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 261          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 262          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 263          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 264          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 265          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 266          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 267          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 268          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 269          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 270          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 271          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 272          ARPHRD_FCFABRIC, ARPHRD_IEEE80211, ARPHRD_IEEE80211_PRISM,
 273          ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET, ARPHRD_PHONET_PIPE,
 274          ARPHRD_IEEE802154, ARPHRD_VOID, ARPHRD_NONE};
 275
 276 static const char *const netdev_lock_name[] =
 277         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 278          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 279          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 280          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 281          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 282          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 283          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 284          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 285          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 286          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 287          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 288          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 289          "_xmit_FCFABRIC", "_xmit_IEEE80211", "_xmit_IEEE80211_PRISM",
 290          "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET", "_xmit_PHONET_PIPE",
 291          "_xmit_IEEE802154", "_xmit_VOID", "_xmit_NONE"};
 292
 293 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 294 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 295
 296 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 297 {
 298         int i;
 299
 300         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 301                 if (netdev_lock_type[i] == dev_type)
 302                         return i;
 303         /* the last key is used by default */
 304         return ARRAY_SIZE(netdev_lock_type) - 1;
 305 }
 306
 307 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 308                                                  unsigned short dev_type)
 309 {
 310         int i;
 311
 312         i = netdev_lock_pos(dev_type);
 313         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 314                                    netdev_lock_name[i]);
 315 }
 316
 317 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 318 {
 319         int i;
 320
 321         i = netdev_lock_pos(dev->type);
 322         lockdep_set_class_and_name(&dev->addr_list_lock,
 323                                    &netdev_addr_lock_key[i],
 324                                    netdev_lock_name[i]);
 325 }
 326 #else
 327 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 328                                                  unsigned short dev_type)
 329 {
 330 }
 331 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 332 {
 333 }
 334 #endif
 335
 336 /*******************************************************************************
 337
 338                 Protocol management and registration routines
 339
 340 *******************************************************************************/
 341
/*
 *	Add a protocol ID to the list. Now that the input handler is
 *	smarter we can dispense with all the messy stuff that used to be
 *	here.
 *
 *	BEWARE!!! Protocol handlers that mangle input packets
 *	MUST BE last in their hash buckets, and the search for protocol
 *	handlers MUST start from the promiscuous ptype_all chain in net_bh.
 *	This is true now; do not change it.
 *	Explanation: if a protocol handler that mangles packets were
 *	first on the list, it would be unable to sense that the packet
 *	is cloned and should be copied-on-write, so it would change
 *	the packet and subsequent readers would get a broken packet.
 *							--ANK (980803)
 */
 357
 358 static inline struct list_head *ptype_head(const struct packet_type *pt)
 359 {
 360         if (pt->type == htons(ETH_P_ALL))
 361                 return &ptype_all;
 362         else
 363                 return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 364 }
 365
 366 /**
 367  *      dev_add_pack - add packet handler
 368  *      @pt: packet type declaration
 369  *
 370  *      Add a protocol handler to the networking stack. The passed &packet_type
 371  *      is linked into kernel lists and may not be freed until it has been
 372  *      removed from the kernel lists.
 373  *
 374  *      This call does not sleep therefore it can not
 375  *      guarantee all CPU's that are in middle of receiving packets
 376  *      will see the new packet type (until the next received packet).
 377  */
 378
 379 void dev_add_pack(struct packet_type *pt)
 380 {
 381         struct list_head *head = ptype_head(pt);
 382
 383         spin_lock(&ptype_lock);
 384         list_add_rcu(&pt->list, head);
 385         spin_unlock(&ptype_lock);
 386 }
 387 EXPORT_SYMBOL(dev_add_pack);
 388
 389 /**
 390  *      __dev_remove_pack        - remove packet handler
 391  *      @pt: packet type declaration
 392  *
 393  *      Remove a protocol handler that was previously added to the kernel
 394  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 395  *      from the kernel lists and can be freed or reused once this function
 396  *      returns.
 397  *
 398  *      The packet type might still be in use by receivers
 399  *      and must not be freed until after all the CPU's have gone
 400  *      through a quiescent state.
 401  */
 402 void __dev_remove_pack(struct packet_type *pt)
 403 {
 404         struct list_head *head = ptype_head(pt);
 405         struct packet_type *pt1;
 406
 407         spin_lock(&ptype_lock);
 408
 409         list_for_each_entry(pt1, head, list) {
 410                 if (pt == pt1) {
 411                         list_del_rcu(&pt->list);
 412                         goto out;
 413                 }
 414         }
 415
 416         pr_warn("dev_remove_pack: %p not found\n", pt);
 417 out:
 418         spin_unlock(&ptype_lock);
 419 }
 420 EXPORT_SYMBOL(__dev_remove_pack);
 421
 422 /**
 423  *      dev_remove_pack  - remove packet handler
 424  *      @pt: packet type declaration
 425  *
 426  *      Remove a protocol handler that was previously added to the kernel
 427  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 428  *      from the kernel lists and can be freed or reused once this function
 429  *      returns.
 430  *
 431  *      This call sleeps to guarantee that no CPU is looking at the packet
 432  *      type after return.
 433  */
 434 void dev_remove_pack(struct packet_type *pt)
 435 {
 436         __dev_remove_pack(pt);
 437
 438         synchronize_net();
 439 }
 440 EXPORT_SYMBOL(dev_remove_pack);
 441
 442
 443 /**
 444  *      dev_add_offload - register offload handlers
 445  *      @po: protocol offload declaration
 446  *
 447  *      Add protocol offload handlers to the networking stack. The passed
 448  *      &proto_offload is linked into kernel lists and may not be freed until
 449  *      it has been removed from the kernel lists.
 450  *
 451  *      This call does not sleep therefore it can not
 452  *      guarantee all CPU's that are in middle of receiving packets
 453  *      will see the new offload handlers (until the next received packet).
 454  */
 455 void dev_add_offload(struct packet_offload *po)
 456 {
 457         struct list_head *head = &offload_base;
 458
 459         spin_lock(&offload_lock);
 460         list_add_rcu(&po->list, head);
 461         spin_unlock(&offload_lock);
 462 }
 463 EXPORT_SYMBOL(dev_add_offload);
 464
 465 /**
 466  *      __dev_remove_offload     - remove offload handler
 467  *      @po: packet offload declaration
 468  *
 469  *      Remove a protocol offload handler that was previously added to the
 470  *      kernel offload handlers by dev_add_offload(). The passed &offload_type
 471  *      is removed from the kernel lists and can be freed or reused once this
 472  *      function returns.
 473  *
 474  *      The packet type might still be in use by receivers
 475  *      and must not be freed until after all the CPU's have gone
 476  *      through a quiescent state.
 477  */
 478 void __dev_remove_offload(struct packet_offload *po)
 479 {
 480         struct list_head *head = &offload_base;
 481         struct packet_offload *po1;
 482
 483         spin_lock(&offload_lock);
 484
 485         list_for_each_entry(po1, head, list) {
 486                 if (po == po1) {
 487                         list_del_rcu(&po->list);
 488                         goto out;
 489                 }
 490         }
 491
 492         pr_warn("dev_remove_offload: %p not found\n", po);
 493 out:
 494         spin_unlock(&offload_lock);
 495 }
 496 EXPORT_SYMBOL(__dev_remove_offload);
 497
 498 /**
 499  *      dev_remove_offload       - remove packet offload handler
 500  *      @po: packet offload declaration
 501  *
 502  *      Remove a packet offload handler that was previously added to the kernel
 503  *      offload handlers by dev_add_offload(). The passed &offload_type is
 504  *      removed from the kernel lists and can be freed or reused once this
 505  *      function returns.
 506  *
 507  *      This call sleeps to guarantee that no CPU is looking at the packet
 508  *      type after return.
 509  */
 510 void dev_remove_offload(struct packet_offload *po)
 511 {
 512         __dev_remove_offload(po);
 513
 514         synchronize_net();
 515 }
 516 EXPORT_SYMBOL(dev_remove_offload);
 517
 518 /******************************************************************************
 519
 520                       Device Boot-time Settings Routines
 521
 522 *******************************************************************************/
 523
 524 /* Boot time configuration table */
 525 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 526
 527 /**
 528  *      netdev_boot_setup_add   - add new setup entry
 529  *      @name: name of the device
 530  *      @map: configured settings for the device
 531  *
 532  *      Adds new setup entry to the dev_boot_setup list.  The function
 533  *      returns 0 on error and 1 on success.  This is a generic routine to
 534  *      all netdevices.
 535  */
 536 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 537 {
 538         struct netdev_boot_setup *s;
 539         int i;
 540
 541         s = dev_boot_setup;
 542         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 543                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 544                         memset(s[i].name, 0, sizeof(s[i].name));
 545                         strlcpy(s[i].name, name, IFNAMSIZ);
 546                         memcpy(&s[i].map, map, sizeof(s[i].map));
 547                         break;
 548                 }
 549         }
 550
 551         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 552 }
 553
 554 /**
 555  *      netdev_boot_setup_check - check boot time settings
 556  *      @dev: the netdevice
 557  *
 558  *      Check boot time settings for the device.
 559  *      The found settings are set for the device to be used
 560  *      later in the device probing.
 561  *      Returns 0 if no settings found, 1 if they are.
 562  */
 563 int netdev_boot_setup_check(struct net_device *dev)
 564 {
 565         struct netdev_boot_setup *s = dev_boot_setup;
 566         int i;
 567
 568         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 569                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 570                     !strcmp(dev->name, s[i].name)) {
 571                         dev->irq        = s[i].map.irq;
 572                         dev->base_addr  = s[i].map.base_addr;
 573                         dev->mem_start  = s[i].map.mem_start;
 574                         dev->mem_end    = s[i].map.mem_end;
 575                         return 1;
 576                 }
 577         }
 578         return 0;
 579 }
 580 EXPORT_SYMBOL(netdev_boot_setup_check);
 581
 582
 583 /**
 584  *      netdev_boot_base        - get address from boot time settings
 585  *      @prefix: prefix for network device
 586  *      @unit: id for network device
 587  *
 588  *      Check boot time settings for the base address of device.
 589  *      The found settings are set for the device to be used
 590  *      later in the device probing.
 591  *      Returns 0 if no settings found.
 592  */
 593 unsigned long netdev_boot_base(const char *prefix, int unit)
 594 {
 595         const struct netdev_boot_setup *s = dev_boot_setup;
 596         char name[IFNAMSIZ];
 597         int i;
 598
 599         sprintf(name, "%s%d", prefix, unit);
 600
 601         /*
 602          * If device already registered then return base of 1
 603          * to indicate not to probe for this interface
 604          */
 605         if (__dev_get_by_name(&init_net, name))
 606                 return 1;
 607
 608         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 609                 if (!strcmp(name, s[i].name))
 610                         return s[i].map.base_addr;
 611         return 0;
 612 }
 613
 614 /*
 615  * Saves at boot time configured settings for any netdevice.
 616  */
 617 int __init netdev_boot_setup(char *str)
 618 {
 619         int ints[5];
 620         struct ifmap map;
 621
 622         str = get_options(str, ARRAY_SIZE(ints), ints);
 623         if (!str || !*str)
 624                 return 0;
 625
 626         /* Save settings */
 627         memset(&map, 0, sizeof(map));
 628         if (ints[0] > 0)
 629                 map.irq = ints[1];
 630         if (ints[0] > 1)
 631                 map.base_addr = ints[2];
 632         if (ints[0] > 2)
 633                 map.mem_start = ints[3];
 634         if (ints[0] > 3)
 635                 map.mem_end = ints[4];
 636
 637         /* Add new entry to the list */
 638         return netdev_boot_setup_add(str, &map);
 639 }
 640
 641 __setup("netdev=", netdev_boot_setup);
 642
 643 /*******************************************************************************
 644
 645                             Device Interface Subroutines
 646
 647 *******************************************************************************/
 648
 649 /**
 650  *      __dev_get_by_name       - find a device by its name
 651  *      @net: the applicable net namespace
 652  *      @name: name to find
 653  *
 654  *      Find an interface by name. Must be called under RTNL semaphore
 655  *      or @dev_base_lock. If the name is found a pointer to the device
 656  *      is returned. If the name is not found then %NULL is returned. The
 657  *      reference counters are not incremented so the caller must be
 658  *      careful with locks.
 659  */
 660
 661 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 662 {
 663         struct net_device *dev;
 664         struct hlist_head *head = dev_name_hash(net, name);
 665
 666         hlist_for_each_entry(dev, head, name_hlist)
 667                 if (!strncmp(dev->name, name, IFNAMSIZ))
 668                         return dev;
 669
 670         return NULL;
 671 }
 672 EXPORT_SYMBOL(__dev_get_by_name);
 673
 674 /**
 675  *      dev_get_by_name_rcu     - find a device by its name
 676  *      @net: the applicable net namespace
 677  *      @name: name to find
 678  *
 679  *      Find an interface by name.
 680  *      If the name is found a pointer to the device is returned.
 681  *      If the name is not found then %NULL is returned.
 682  *      The reference counters are not incremented so the caller must be
 683  *      careful with locks. The caller must hold RCU lock.
 684  */
 685
 686 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 687 {
 688         struct net_device *dev;
 689         struct hlist_head *head = dev_name_hash(net, name);
 690
 691         hlist_for_each_entry_rcu(dev, head, name_hlist)
 692                 if (!strncmp(dev->name, name, IFNAMSIZ))
 693                         return dev;
 694
 695         return NULL;
 696 }
 697 EXPORT_SYMBOL(dev_get_by_name_rcu);
 698
 699 /**
 700  *      dev_get_by_name         - find a device by its name
 701  *      @net: the applicable net namespace
 702  *      @name: name to find
 703  *
 704  *      Find an interface by name. This can be called from any
 705  *      context and does its own locking. The returned handle has
 706  *      the usage count incremented and the caller must use dev_put() to
 707  *      release it when it is no longer needed. %NULL is returned if no
 708  *      matching device is found.
 709  */
 710
 711 struct net_device *dev_get_by_name(struct net *net, const char *name)
 712 {
 713         struct net_device *dev;
 714
 715         rcu_read_lock();
 716         dev = dev_get_by_name_rcu(net, name);
 717         if (dev)
 718                 dev_hold(dev);
 719         rcu_read_unlock();
 720         return dev;
 721 }
 722 EXPORT_SYMBOL(dev_get_by_name);
 723
 724 /**
 725  *      __dev_get_by_index - find a device by its ifindex
 726  *      @net: the applicable net namespace
 727  *      @ifindex: index of device
 728  *
 729  *      Search for an interface by index. Returns %NULL if the device
 730  *      is not found or a pointer to the device. The device has not
 731  *      had its reference counter increased so the caller must be careful
 732  *      about locking. The caller must hold either the RTNL semaphore
 733  *      or @dev_base_lock.
 734  */
 735
 736 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 737 {
 738         struct net_device *dev;
 739         struct hlist_head *head = dev_index_hash(net, ifindex);
 740
 741         hlist_for_each_entry(dev, head, index_hlist)
 742                 if (dev->ifindex == ifindex)
 743                         return dev;
 744
 745         return NULL;
 746 }
 747 EXPORT_SYMBOL(__dev_get_by_index);
 748
 749 /**
 750  *      dev_get_by_index_rcu - find a device by its ifindex
 751  *      @net: the applicable net namespace
 752  *      @ifindex: index of device
 753  *
 754  *      Search for an interface by index. Returns %NULL if the device
 755  *      is not found or a pointer to the device. The device has not
 756  *      had its reference counter increased so the caller must be careful
 757  *      about locking. The caller must hold RCU lock.
 758  */
 759
 760 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 761 {
 762         struct net_device *dev;
 763         struct hlist_head *head = dev_index_hash(net, ifindex);
 764
 765         hlist_for_each_entry_rcu(dev, head, index_hlist)
 766                 if (dev->ifindex == ifindex)
 767                         return dev;
 768
 769         return NULL;
 770 }
 771 EXPORT_SYMBOL(dev_get_by_index_rcu);
 772
 773
 774 /**
 775  *      dev_get_by_index - find a device by its ifindex
 776  *      @net: the applicable net namespace
 777  *      @ifindex: index of device
 778  *
 779  *      Search for an interface by index. Returns NULL if the device
 780  *      is not found or a pointer to the device. The device returned has
 781  *      had a reference added and the pointer is safe until the user calls
 782  *      dev_put to indicate they have finished with it.
 783  */
 784
 785 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 786 {
 787         struct net_device *dev;
 788
 789         rcu_read_lock();
 790         dev = dev_get_by_index_rcu(net, ifindex);
 791         if (dev)
 792                 dev_hold(dev);
 793         rcu_read_unlock();
 794         return dev;
 795 }
 796 EXPORT_SYMBOL(dev_get_by_index);
 797
 798 /**
 799  *      netdev_get_name - get a netdevice name, knowing its ifindex.
 800  *      @net: network namespace
 801  *      @name: a pointer to the buffer where the name will be stored.
 802  *      @ifindex: the ifindex of the interface to get the name from.
 803  *
 804  *      The use of raw_seqcount_begin() and cond_resched() before
 805  *      retrying is required as we want to give the writers a chance
 806  *      to complete when CONFIG_PREEMPT is not set.
 807  */
 808 int netdev_get_name(struct net *net, char *name, int ifindex)
 809 {
 810         struct net_device *dev;
 811         unsigned int seq;
 812
 813 retry:
 814         seq = raw_seqcount_begin(&devnet_rename_seq);
 815         rcu_read_lock();
 816         dev = dev_get_by_index_rcu(net, ifindex);
 817         if (!dev) {
 818                 rcu_read_unlock();
 819                 return -ENODEV;
 820         }
 821
 822         strcpy(name, dev->name);
 823         rcu_read_unlock();
 824         if (read_seqcount_retry(&devnet_rename_seq, seq)) {
 825                 cond_resched();
 826                 goto retry;
 827         }
 828
 829         return 0;
 830 }
 831
 832 /**
 833  *      dev_getbyhwaddr_rcu - find a device by its hardware address
 834  *      @net: the applicable net namespace
 835  *      @type: media type of device
 836  *      @ha: hardware address
 837  *
 838  *      Search for an interface by MAC address. Returns NULL if the device
 839  *      is not found or a pointer to the device.
 840  *      The caller must hold RCU or RTNL.
 841  *      The returned device has not had its ref count increased
 842  *      and the caller must therefore be careful about locking
 843  *
 844  */
 845
 846 struct net_device *dev_getbyhwaddr_rcu(struct net *net, unsigned short type,
 847                                        const char *ha)
 848 {
 849         struct net_device *dev;
 850
 851         for_each_netdev_rcu(net, dev)
 852                 if (dev->type == type &&
 853                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 854                         return dev;
 855
 856         return NULL;
 857 }
 858 EXPORT_SYMBOL(dev_getbyhwaddr_rcu);
 859
 860 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 861 {
 862         struct net_device *dev;
 863
 864         ASSERT_RTNL();
 865         for_each_netdev(net, dev)
 866                 if (dev->type == type)
 867                         return dev;
 868
 869         return NULL;
 870 }
 871 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 872
 873 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 874 {
 875         struct net_device *dev, *ret = NULL;
 876
 877         rcu_read_lock();
 878         for_each_netdev_rcu(net, dev)
 879                 if (dev->type == type) {
 880                         dev_hold(dev);
 881                         ret = dev;
 882                         break;
 883                 }
 884         rcu_read_unlock();
 885         return ret;
 886 }
 887 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 888
 889 /**
 890  *      dev_get_by_flags_rcu - find any device with given flags
 891  *      @net: the applicable net namespace
 892  *      @if_flags: IFF_* values
 893  *      @mask: bitmask of bits in if_flags to check
 894  *
 895  *      Search for any interface with the given flags. Returns NULL if a device
 896  *      is not found or a pointer to the device. Must be called inside
 897  *      rcu_read_lock(), and result refcount is unchanged.
 898  */
 899
 900 struct net_device *dev_get_by_flags_rcu(struct net *net, unsigned short if_flags,
 901                                     unsigned short mask)
 902 {
 903         struct net_device *dev, *ret;
 904
 905         ret = NULL;
 906         for_each_netdev_rcu(net, dev) {
 907                 if (((dev->flags ^ if_flags) & mask) == 0) {
 908                         ret = dev;
 909                         break;
 910                 }
 911         }
 912         return ret;
 913 }
 914 EXPORT_SYMBOL(dev_get_by_flags_rcu);
 915
 916 /**
 917  *      dev_valid_name - check if name is okay for network device
 918  *      @name: name string
 919  *
 920  *      Network device names need to be valid file names to
 921  *      to allow sysfs to work.  We also disallow any kind of
 922  *      whitespace.
 923  */
 924 bool dev_valid_name(const char *name)
 925 {
 926         if (*name == '\0')
 927                 return false;
 928         if (strlen(name) >= IFNAMSIZ)
 929                 return false;
 930         if (!strcmp(name, ".") || !strcmp(name, ".."))
 931                 return false;
 932
 933         while (*name) {
 934                 if (*name == '/' || isspace(*name))
 935                         return false;
 936                 name++;
 937         }
 938         return true;
 939 }
 940 EXPORT_SYMBOL(dev_valid_name);
 941
 942 /**
 943  *      __dev_alloc_name - allocate a name for a device
 944  *      @net: network namespace to allocate the device name in
 945  *      @name: name format string
 946  *      @buf:  scratch buffer and result name string
 947  *
 948  *      Passed a format string - eg "lt%d" it will try and find a suitable
 949  *      id. It scans list of devices to build up a free map, then chooses
 950  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 951  *      while allocating the name and adding the device in order to avoid
 952  *      duplicates.
 953  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 954  *      Returns the number of the unit assigned or a negative errno code.
 955  */
 956
 957 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 958 {
 959         int i = 0;
 960         const char *p;
 961         const int max_netdevices = 8*PAGE_SIZE;
 962         unsigned long *inuse;
 963         struct net_device *d;
 964
 965         p = strnchr(name, IFNAMSIZ-1, '%');
 966         if (p) {
 967                 /*
 968                  * Verify the string as this thing may have come from
 969                  * the user.  There must be either one "%d" and no other "%"
 970                  * characters.
 971                  */
 972                 if (p[1] != 'd' || strchr(p + 2, '%'))
 973                         return -EINVAL;
 974
 975                 /* Use one page as a bit array of possible slots */
 976                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 977                 if (!inuse)
 978                         return -ENOMEM;
 979
 980                 for_each_netdev(net, d) {
 981                         if (!sscanf(d->name, name, &i))
 982                                 continue;
 983                         if (i < 0 || i >= max_netdevices)
 984                                 continue;
 985
 986                         /*  avoid cases where sscanf is not exact inverse of printf */
 987                         snprintf(buf, IFNAMSIZ, name, i);
 988                         if (!strncmp(buf, d->name, IFNAMSIZ))
 989                                 set_bit(i, inuse);
 990                 }
 991
 992                 i = find_first_zero_bit(inuse, max_netdevices);
 993                 free_page((unsigned long) inuse);
 994         }
 995
 996         if (buf != name)
 997                 snprintf(buf, IFNAMSIZ, name, i);
 998         if (!__dev_get_by_name(net, buf))
 999                 return i;
1000
1001         /* It is possible to run out of possible slots
1002          * when the name is long and there isn't enough space left
1003          * for the digits, or if all bits are used.
1004          */
1005         return -ENFILE;
1006 }
1007
1008 /**
1009  *      dev_alloc_name - allocate a name for a device
1010  *      @dev: device
1011  *      @name: name format string
1012  *
1013  *      Passed a format string - eg "lt%d" it will try and find a suitable
1014  *      id. It scans list of devices to build up a free map, then chooses
1015  *      the first empty slot. The caller must hold the dev_base or rtnl lock
1016  *      while allocating the name and adding the device in order to avoid
1017  *      duplicates.
1018  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
1019  *      Returns the number of the unit assigned or a negative errno code.
1020  */
1021
1022 int dev_alloc_name(struct net_device *dev, const char *name)
1023 {
1024         char buf[IFNAMSIZ];
1025         struct net *net;
1026         int ret;
1027
1028         BUG_ON(!dev_net(dev));
1029         net = dev_net(dev);
1030         ret = __dev_alloc_name(net, name, buf);
1031         if (ret >= 0)
1032                 strlcpy(dev->name, buf, IFNAMSIZ);
1033         return ret;
1034 }
1035 EXPORT_SYMBOL(dev_alloc_name);
1036
1037 static int dev_alloc_name_ns(struct net *net,
1038                              struct net_device *dev,
1039                              const char *name)
1040 {
1041         char buf[IFNAMSIZ];
1042         int ret;
1043
1044         ret = __dev_alloc_name(net, name, buf);
1045         if (ret >= 0)
1046                 strlcpy(dev->name, buf, IFNAMSIZ);
1047         return ret;
1048 }
1049
1050 static int dev_get_valid_name(struct net *net,
1051                               struct net_device *dev,
1052                               const char *name)
1053 {
1054         BUG_ON(!net);
1055
1056         if (!dev_valid_name(name))
1057                 return -EINVAL;
1058
1059         if (strchr(name, '%'))
1060                 return dev_alloc_name_ns(net, dev, name);
1061         else if (__dev_get_by_name(net, name))
1062                 return -EEXIST;
1063         else if (dev->name != name)
1064                 strlcpy(dev->name, name, IFNAMSIZ);
1065
1066         return 0;
1067 }
1068
1069 /**
1070  *      dev_change_name - change name of a device
1071  *      @dev: device
1072  *      @newname: name (or format string) must be at least IFNAMSIZ
1073  *
1074  *      Change name of a device, can pass format strings "eth%d".
1075  *      for wildcarding.
1076  */
1077 int dev_change_name(struct net_device *dev, const char *newname)
1078 {
1079         char oldname[IFNAMSIZ];
1080         int err = 0;
1081         int ret;
1082         struct net *net;
1083
1084         ASSERT_RTNL();
1085         BUG_ON(!dev_net(dev));
1086
1087         net = dev_net(dev);
1088         if (dev->flags & IFF_UP)
1089                 return -EBUSY;
1090
1091         write_seqcount_begin(&devnet_rename_seq);
1092
1093         if (strncmp(newname, dev->name, IFNAMSIZ) == 0) {
1094                 write_seqcount_end(&devnet_rename_seq);
1095                 return 0;
1096         }
1097
1098         memcpy(oldname, dev->name, IFNAMSIZ);
1099
1100         err = dev_get_valid_name(net, dev, newname);
1101         if (err < 0) {
1102                 write_seqcount_end(&devnet_rename_seq);
1103                 return err;
1104         }
1105
1106 rollback:
1107         ret = device_rename(&dev->dev, dev->name);
1108         if (ret) {
1109                 memcpy(dev->name, oldname, IFNAMSIZ);
1110                 write_seqcount_end(&devnet_rename_seq);
1111                 return ret;
1112         }
1113
1114         write_seqcount_end(&devnet_rename_seq);
1115
1116         write_lock_bh(&dev_base_lock);
1117         hlist_del_rcu(&dev->name_hlist);
1118         write_unlock_bh(&dev_base_lock);
1119
1120         synchronize_rcu();
1121
1122         write_lock_bh(&dev_base_lock);
1123         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1124         write_unlock_bh(&dev_base_lock);
1125
1126         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1127         ret = notifier_to_errno(ret);
1128
1129         if (ret) {
1130                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1131                 if (err >= 0) {
1132                         err = ret;
1133                         write_seqcount_begin(&devnet_rename_seq);
1134                         memcpy(dev->name, oldname, IFNAMSIZ);
1135                         goto rollback;
1136                 } else {
1137                         pr_err("%s: name change rollback failed: %d\n",
1138                                dev->name, ret);
1139                 }
1140         }
1141
1142         return err;
1143 }
1144
1145 /**
1146  *      dev_set_alias - change ifalias of a device
1147  *      @dev: device
1148  *      @alias: name up to IFALIASZ
1149  *      @len: limit of bytes to copy from info
1150  *
1151  *      Set ifalias for a device,
1152  */
1153 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1154 {
1155         char *new_ifalias;
1156
1157         ASSERT_RTNL();
1158
1159         if (len >= IFALIASZ)
1160                 return -EINVAL;
1161
1162         if (!len) {
1163                 kfree(dev->ifalias);
1164                 dev->ifalias = NULL;
1165                 return 0;
1166         }
1167
1168         new_ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1169         if (!new_ifalias)
1170                 return -ENOMEM;
1171         dev->ifalias = new_ifalias;
1172
1173         strlcpy(dev->ifalias, alias, len+1);
1174         return len;
1175 }
1176
1177
1178 /**
1179  *      netdev_features_change - device changes features
1180  *      @dev: device to cause notification
1181  *
1182  *      Called to indicate a device has changed features.
1183  */
1184 void netdev_features_change(struct net_device *dev)
1185 {
1186         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1187 }
1188 EXPORT_SYMBOL(netdev_features_change);
1189
1190 /**
1191  *      netdev_state_change - device changes state
1192  *      @dev: device to cause notification
1193  *
1194  *      Called to indicate a device has changed state. This function calls
1195  *      the notifier chains for netdev_chain and sends a NEWLINK message
1196  *      to the routing socket.
1197  */
1198 void netdev_state_change(struct net_device *dev)
1199 {
1200         if (dev->flags & IFF_UP) {
1201                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1202                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1203         }
1204 }
1205 EXPORT_SYMBOL(netdev_state_change);
1206
1207 /**
1208  *      netdev_notify_peers - notify network peers about existence of @dev
1209  *      @dev: network device
1210  *
1211  * Generate traffic such that interested network peers are aware of
1212  * @dev, such as by generating a gratuitous ARP. This may be used when
1213  * a device wants to inform the rest of the network about some sort of
1214  * reconfiguration such as a failover event or virtual machine
1215  * migration.
1216  */
1217 void netdev_notify_peers(struct net_device *dev)
1218 {
1219         rtnl_lock();
1220         call_netdevice_notifiers(NETDEV_NOTIFY_PEERS, dev);
1221         rtnl_unlock();
1222 }
1223 EXPORT_SYMBOL(netdev_notify_peers);
1224
1225 static int __dev_open(struct net_device *dev)
1226 {
1227         const struct net_device_ops *ops = dev->netdev_ops;
1228         int ret;
1229
1230         ASSERT_RTNL();
1231
1232         if (!netif_device_present(dev))
1233                 return -ENODEV;
1234
1235         /* Block netpoll from trying to do any rx path servicing.
1236          * If we don't do this there is a chance ndo_poll_controller
1237          * or ndo_poll may be running while we open the device
1238          */
1239         ret = netpoll_rx_disable(dev);
1240         if (ret)
1241                 return ret;
1242
1243         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1244         ret = notifier_to_errno(ret);
1245         if (ret)
1246                 return ret;
1247
1248         set_bit(__LINK_STATE_START, &dev->state);
1249
1250         if (ops->ndo_validate_addr)
1251                 ret = ops->ndo_validate_addr(dev);
1252
1253         if (!ret && ops->ndo_open)
1254                 ret = ops->ndo_open(dev);
1255
1256         netpoll_rx_enable(dev);
1257
1258         if (ret)
1259                 clear_bit(__LINK_STATE_START, &dev->state);
1260         else {
1261                 dev->flags |= IFF_UP;
1262                 net_dmaengine_get();
1263                 dev_set_rx_mode(dev);
1264                 dev_activate(dev);
1265                 add_device_randomness(dev->dev_addr, dev->addr_len);
1266         }
1267
1268         return ret;
1269 }
1270
1271 /**
1272  *      dev_open        - prepare an interface for use.
1273  *      @dev:   device to open
1274  *
1275  *      Takes a device from down to up state. The device's private open
1276  *      function is invoked and then the multicast lists are loaded. Finally
1277  *      the device is moved into the up state and a %NETDEV_UP message is
1278  *      sent to the netdev notifier chain.
1279  *
1280  *      Calling this function on an active interface is a nop. On a failure
1281  *      a negative errno code is returned.
1282  */
1283 int dev_open(struct net_device *dev)
1284 {
1285         int ret;
1286
1287         if (dev->flags & IFF_UP)
1288                 return 0;
1289
1290         ret = __dev_open(dev);
1291         if (ret < 0)
1292                 return ret;
1293
1294         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1295         call_netdevice_notifiers(NETDEV_UP, dev);
1296
1297         return ret;
1298 }
1299 EXPORT_SYMBOL(dev_open);
1300
1301 static int __dev_close_many(struct list_head *head)
1302 {
1303         struct net_device *dev;
1304
1305         ASSERT_RTNL();
1306         might_sleep();
1307
1308         list_for_each_entry(dev, head, unreg_list) {
1309                 call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1310
1311                 clear_bit(__LINK_STATE_START, &dev->state);
1312
1313                 /* Synchronize to scheduled poll. We cannot touch poll list, it
1314                  * can be even on different cpu. So just clear netif_running().
1315                  *
1316                  * dev->stop() will invoke napi_disable() on all of it's
1317                  * napi_struct instances on this device.
1318                  */
1319                 smp_mb__after_clear_bit(); /* Commit netif_running(). */
1320         }
1321
1322         dev_deactivate_many(head);
1323
1324         list_for_each_entry(dev, head, unreg_list) {
1325                 const struct net_device_ops *ops = dev->netdev_ops;
1326
1327                 /*
1328                  *      Call the device specific close. This cannot fail.
1329                  *      Only if device is UP
1330                  *
1331                  *      We allow it to be called even after a DETACH hot-plug
1332                  *      event.
1333                  */
1334                 if (ops->ndo_stop)
1335                         ops->ndo_stop(dev);
1336
1337                 dev->flags &= ~IFF_UP;
1338                 net_dmaengine_put();
1339         }
1340
1341         return 0;
1342 }
1343
1344 static int __dev_close(struct net_device *dev)
1345 {
1346         int retval;
1347         LIST_HEAD(single);
1348
1349         /* Temporarily disable netpoll until the interface is down */
1350         retval = netpoll_rx_disable(dev);
1351         if (retval)
1352                 return retval;
1353
1354         list_add(&dev->unreg_list, &single);
1355         retval = __dev_close_many(&single);
1356         list_del(&single);
1357
1358         netpoll_rx_enable(dev);
1359         return retval;
1360 }
1361
1362 static int dev_close_many(struct list_head *head)
1363 {
1364         struct net_device *dev, *tmp;
1365         LIST_HEAD(tmp_list);
1366
1367         list_for_each_entry_safe(dev, tmp, head, unreg_list)
1368                 if (!(dev->flags & IFF_UP))
1369                         list_move(&dev->unreg_list, &tmp_list);
1370
1371         __dev_close_many(head);
1372
1373         list_for_each_entry(dev, head, unreg_list) {
1374                 rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1375                 call_netdevice_notifiers(NETDEV_DOWN, dev);
1376         }
1377
1378         /* rollback_registered_many needs the complete original list */
1379         list_splice(&tmp_list, head);
1380         return 0;
1381 }
1382
1383 /**
1384  *      dev_close - shutdown an interface.
1385  *      @dev: device to shutdown
1386  *
1387  *      This function moves an active device into down state. A
1388  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1389  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1390  *      chain.
1391  */
1392 int dev_close(struct net_device *dev)
1393 {
1394         int ret = 0;
1395         if (dev->flags & IFF_UP) {
1396                 LIST_HEAD(single);
1397
1398                 /* Block netpoll rx while the interface is going down */
1399                 ret = netpoll_rx_disable(dev);
1400                 if (ret)
1401                         return ret;
1402
1403                 list_add(&dev->unreg_list, &single);
1404                 dev_close_many(&single);
1405                 list_del(&single);
1406
1407                 netpoll_rx_enable(dev);
1408         }
1409         return ret;
1410 }
1411 EXPORT_SYMBOL(dev_close);
1412
1413
1414 /**
1415  *      dev_disable_lro - disable Large Receive Offload on a device
1416  *      @dev: device
1417  *
1418  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1419  *      called under RTNL.  This is needed if received packets may be
1420  *      forwarded to another interface.
1421  */
1422 void dev_disable_lro(struct net_device *dev)
1423 {
1424         /*
1425          * If we're trying to disable lro on a vlan device
1426          * use the underlying physical device instead
1427          */
1428         if (is_vlan_dev(dev))
1429                 dev = vlan_dev_real_dev(dev);
1430
1431         dev->wanted_features &= ~NETIF_F_LRO;
1432         netdev_update_features(dev);
1433
1434         if (unlikely(dev->features & NETIF_F_LRO))
1435                 netdev_WARN(dev, "failed to disable LRO!\n");
1436 }
1437 EXPORT_SYMBOL(dev_disable_lro);
1438
1439
/*
 * Starts out as 1. While it is nonzero, register_netdevice_notifier()
 * adds the notifier to the chain but does not replay REGISTER/UP events.
 * NOTE(review): presumably cleared once boot-time device init has
 * finished; the code doing so is not in this part of the file.
 */
static int dev_boot_phase = 1;
1441
1442 /**
1443  *      register_netdevice_notifier - register a network notifier block
1444  *      @nb: notifier
1445  *
1446  *      Register a notifier to be called when network device events occur.
1447  *      The notifier passed is linked into the kernel structures and must
1448  *      not be reused until it has been unregistered. A negative errno code
1449  *      is returned on a failure.
1450  *
1451  *      When registered all registration and up events are replayed
1452  *      to the new notifier to allow device to have a race free
1453  *      view of the network device list.
1454  */
1455
1456 int register_netdevice_notifier(struct notifier_block *nb)
1457 {
1458         struct net_device *dev;
1459         struct net_device *last;
1460         struct net *net;
1461         int err;
1462
1463         rtnl_lock();
1464         err = raw_notifier_chain_register(&netdev_chain, nb);
1465         if (err)
1466                 goto unlock;
1467         if (dev_boot_phase)
1468                 goto unlock;
1469         for_each_net(net) {
1470                 for_each_netdev(net, dev) {
1471                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1472                         err = notifier_to_errno(err);
1473                         if (err)
1474                                 goto rollback;
1475
1476                         if (!(dev->flags & IFF_UP))
1477                                 continue;
1478
1479                         nb->notifier_call(nb, NETDEV_UP, dev);
1480                 }
1481         }
1482
1483 unlock:
1484         rtnl_unlock();
1485         return err;
1486
1487 rollback:
1488         last = dev;
1489         for_each_net(net) {
1490                 for_each_netdev(net, dev) {
1491                         if (dev == last)
1492                                 goto outroll;
1493
1494                         if (dev->flags & IFF_UP) {
1495                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1496                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1497                         }
1498                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1499                 }
1500         }
1501
1502 outroll:
1503         raw_notifier_chain_unregister(&netdev_chain, nb);
1504         goto unlock;
1505 }
1506 EXPORT_SYMBOL(register_netdevice_notifier);
1507
1508 /**
1509  *      unregister_netdevice_notifier - unregister a network notifier block
1510  *      @nb: notifier
1511  *
1512  *      Unregister a notifier previously registered by
1513  *      register_netdevice_notifier(). The notifier is unlinked into the
1514  *      kernel structures and may then be reused. A negative errno code
1515  *      is returned on a failure.
1516  *
1517  *      After unregistering unregister and down device events are synthesized
1518  *      for all devices on the device list to the removed notifier to remove
1519  *      the need for special case cleanup code.
1520  */
1521
1522 int unregister_netdevice_notifier(struct notifier_block *nb)
1523 {
1524         struct net_device *dev;
1525         struct net *net;
1526         int err;
1527
1528         rtnl_lock();
1529         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1530         if (err)
1531                 goto unlock;
1532
1533         for_each_net(net) {
1534                 for_each_netdev(net, dev) {
1535                         if (dev->flags & IFF_UP) {
1536                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1537                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1538                         }
1539                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1540                 }
1541         }
1542 unlock:
1543         rtnl_unlock();
1544         return err;
1545 }
1546 EXPORT_SYMBOL(unregister_netdevice_notifier);
1547
1548 /**
1549  *      call_netdevice_notifiers - call all network notifier blocks
1550  *      @val: value passed unmodified to notifier function
1551  *      @dev: net_device pointer passed unmodified to notifier function
1552  *
1553  *      Call all network notifier blocks.  Parameters and return value
1554  *      are as for raw_notifier_call_chain().
1555  */
1556
1557 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1558 {
1559         ASSERT_RTNL();
1560         return raw_notifier_call_chain(&netdev_chain, val, dev);
1561 }
1562 EXPORT_SYMBOL(call_netdevice_notifiers);
1563
/*
 * Static key tested by net_timestamp_set() and net_timestamp_check().
 * net_enable_timestamp() increments it and net_disable_timestamp()
 * decrements it.
 */
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
 * If net_disable_timestamp() is called from irq context, defer the
 * static_key_slow_dec() calls.
 *
 * This counter holds the number of deferred decrements; it is drained
 * by the next net_enable_timestamp() call.
 */
static atomic_t netstamp_needed_deferred;
#endif
1572
1573 void net_enable_timestamp(void)
1574 {
1575 #ifdef HAVE_JUMP_LABEL
1576         int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
1577
1578         if (deferred) {
1579                 while (--deferred)
1580                         static_key_slow_dec(&netstamp_needed);
1581                 return;
1582         }
1583 #endif
1584         static_key_slow_inc(&netstamp_needed);
1585 }
1586 EXPORT_SYMBOL(net_enable_timestamp);
1587
1588 void net_disable_timestamp(void)
1589 {
1590 #ifdef HAVE_JUMP_LABEL
1591         if (in_interrupt()) {
1592                 atomic_inc(&netstamp_needed_deferred);
1593                 return;
1594         }
1595 #endif
1596         static_key_slow_dec(&netstamp_needed);
1597 }
1598 EXPORT_SYMBOL(net_disable_timestamp);
1599
1600 static inline void net_timestamp_set(struct sk_buff *skb)
1601 {
1602         skb->tstamp.tv64 = 0;
1603         if (static_key_false(&netstamp_needed))
1604                 __net_timestamp(skb);
1605 }
1606
/*
 * Stamp SKB with __net_timestamp() when the netstamp_needed static key
 * is enabled, COND is true and SKB has no timestamp yet
 * (tstamp.tv64 == 0).
 *
 * The macro expands to a bare if statement, and its last line ends in a
 * line continuation, so the line right after the definition must stay
 * blank.
 */
#define net_timestamp_check(COND, SKB)			\
	if (static_key_false(&netstamp_needed)) {		\
		if ((COND) && !(SKB)->tstamp.tv64)	\
			__net_timestamp(SKB);		\
	}						\
1612
1613 static inline bool is_skb_forwardable(struct net_device *dev,
1614                                       struct sk_buff *skb)
1615 {
1616         unsigned int len;
1617
1618         if (!(dev->flags & IFF_UP))
1619                 return false;
1620
1621         len = dev->mtu + dev->hard_header_len + VLAN_HLEN;
1622         if (skb->len <= len)
1623                 return true;
1624
1625         /* if TSO is enabled, we don't care about the length as the packet
1626          * could be forwarded without being segmented before
1627          */
1628         if (skb_is_gso(skb))
1629                 return true;
1630
1631         return false;
1632 }
1633
1634 /**
1635  * dev_forward_skb - loopback an skb to another netif
1636  *
1637  * @dev: destination network device
1638  * @skb: buffer to forward
1639  *
1640  * return values:
1641  *      NET_RX_SUCCESS  (no congestion)
1642  *      NET_RX_DROP     (packet was dropped, but freed)
1643  *
1644  * dev_forward_skb can be used for injecting an skb from the
1645  * start_xmit function of one device into the receive queue
1646  * of another device.
1647  *
1648  * The receiving device may be in another namespace, so
1649  * we have to clear all information in the skb that could
1650  * impact namespace isolation.
1651  */
1652 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1653 {
1654         if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
1655                 if (skb_copy_ubufs(skb, GFP_ATOMIC)) {
1656                         atomic_long_inc(&dev->rx_dropped);
1657                         kfree_skb(skb);
1658                         return NET_RX_DROP;
1659                 }
1660         }
1661
1662         skb_orphan(skb);
1663
1664         if (unlikely(!is_skb_forwardable(dev, skb))) {
1665                 atomic_long_inc(&dev->rx_dropped);
1666                 kfree_skb(skb);
1667                 return NET_RX_DROP;
1668         }
1669         skb->skb_iif = 0;
1670         skb->dev = dev;
1671         skb_dst_drop(skb);
1672         skb->tstamp.tv64 = 0;
1673         skb->pkt_type = PACKET_HOST;
1674         skb->protocol = eth_type_trans(skb, dev);
1675         skb->mark = 0;
1676         secpath_reset(skb);
1677         nf_reset(skb);
1678         nf_reset_trace(skb);
1679         return netif_rx(skb);
1680 }
1681 EXPORT_SYMBOL_GPL(dev_forward_skb);
1682
1683 static inline int deliver_skb(struct sk_buff *skb,
1684                               struct packet_type *pt_prev,
1685                               struct net_device *orig_dev)
1686 {
1687         if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
1688                 return -ENOMEM;
1689         atomic_inc(&skb->users);
1690         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1691 }
1692
1693 static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
1694 {
1695         if (!ptype->af_packet_priv || !skb->sk)
1696                 return false;
1697
1698         if (ptype->id_match)
1699                 return ptype->id_match(ptype, skb->sk);
1700         else if ((struct sock *)ptype->af_packet_priv == skb->sk)
1701                 return true;
1702
1703         return false;
1704 }
1705
1706 /*
1707  *      Support routine. Sends outgoing frames to any network
1708  *      taps currently in use.
1709  */
1710
1711 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1712 {
1713         struct packet_type *ptype;
1714         struct sk_buff *skb2 = NULL;
1715         struct packet_type *pt_prev = NULL;
1716
1717         rcu_read_lock();
1718         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1719                 /* Never send packets back to the socket
1720                  * they originated from - MvS (miquels@drinkel.ow.org)
1721                  */
1722                 if ((ptype->dev == dev || !ptype->dev) &&
1723                     (!skb_loop_sk(ptype, skb))) {
1724                         if (pt_prev) {
1725                                 deliver_skb(skb2, pt_prev, skb->dev);
1726                                 pt_prev = ptype;
1727                                 continue;
1728                         }
1729
1730                         skb2 = skb_clone(skb, GFP_ATOMIC);
1731                         if (!skb2)
1732                                 break;
1733
1734                         net_timestamp_set(skb2);
1735
1736                         /* skb->nh should be correctly
1737                            set by sender, so that the second statement is
1738                            just protection against buggy protocols.
1739                          */
1740                         skb_reset_mac_header(skb2);
1741
1742                         if (skb_network_header(skb2) < skb2->data ||
1743                             skb2->network_header > skb2->tail) {
1744                                 net_crit_ratelimited("protocol %04x is buggy, dev %s\n",
1745                                                      ntohs(skb2->protocol),
1746                                                      dev->name);
1747                                 skb_reset_network_header(skb2);
1748                         }
1749
1750                         skb2->transport_header = skb2->network_header;
1751                         skb2->pkt_type = PACKET_OUTGOING;
1752                         pt_prev = ptype;
1753                 }
1754         }
1755         if (pt_prev)
1756                 pt_prev->func(skb2, skb->dev, pt_prev, skb->dev);
1757         rcu_read_unlock();
1758 }
1759
1760 /**
1761  * netif_setup_tc - Handle tc mappings on real_num_tx_queues change
1762  * @dev: Network device
1763  * @txq: number of queues available
1764  *
1765  * If real_num_tx_queues is changed the tc mappings may no longer be
1766  * valid. To resolve this verify the tc mapping remains valid and if
1767  * not NULL the mapping. With no priorities mapping to this
1768  * offset/count pair it will no longer be used. In the worst case TC0
1769  * is invalid nothing can be done so disable priority mappings. If is
1770  * expected that drivers will fix this mapping if they can before
1771  * calling netif_set_real_num_tx_queues.
1772  */
1773 static void netif_setup_tc(struct net_device *dev, unsigned int txq)
1774 {
1775         int i;
1776         struct netdev_tc_txq *tc = &dev->tc_to_txq[0];
1777
1778         /* If TC0 is invalidated disable TC mapping */
1779         if (tc->offset + tc->count > txq) {
1780                 pr_warn("Number of in use tx queues changed invalidating tc mappings. Priority traffic classification disabled!\n");
1781                 dev->num_tc = 0;
1782                 return;
1783         }
1784
1785         /* Invalidated prio to tc mappings set to TC0 */
1786         for (i = 1; i < TC_BITMASK + 1; i++) {
1787                 int q = netdev_get_prio_tc_map(dev, i);
1788
1789                 tc = &dev->tc_to_txq[q];
1790                 if (tc->offset + tc->count > txq) {
1791                         pr_warn("Number of in use tx queues changed. Priority %i to tc mapping %i is no longer valid. Setting map to 0\n",
1792                                 i, q);
1793                         netdev_set_prio_tc_map(dev, i, 0);
1794                 }
1795         }
1796 }
1797
1798 #ifdef CONFIG_XPS
/* Mutex that xmap_dereference() expects to be held. */
static DEFINE_MUTEX(xps_map_mutex);

/*
 * Dereference an RCU-managed XPS pointer from update-side code; lockdep
 * is told that xps_map_mutex must be held.
 */
#define xmap_dereference(P)						\
	rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
1802
1803 static struct xps_map *remove_xps_queue(struct xps_dev_maps *dev_maps,
1804                                         int cpu, u16 index)
1805 {
1806         struct xps_map *map = NULL;
1807         int pos;
1808
1809         if (dev_maps)
1810                 map = xmap_dereference(dev_maps->cpu_map[cpu]);
1811
1812         for (pos = 0; map && pos < map->len; pos++) {
1813                 if (map->queues[pos] == index) {
1814                         if (map->len > 1) {
1815                                 map->queues[pos] = map->queues[--map->len];
1816                         } else {
1817                                 RCU_INIT_POINTER(dev_maps->cpu_map[cpu], NULL);
1818                                 kfree_rcu(map, rcu);
1819                                 map = NULL;
1820                         }
1821                         break;
1822                 }
1823         }
1824
1825         return map;
1826 }
1827
1828 static void netif_reset_xps_queues_gt(struct net_device *dev, u16 index)
1829 {
1830         struct xps_dev_maps *dev_maps;
1831         int cpu, i;
1832         bool active = false;
1833
1834         mutex_lock(&xps_map_mutex);
1835         dev_maps = xmap_dereference(dev->xps_maps);
1836
1837         if (!dev_maps)
1838                 goto out_no_maps;
1839
1840         for_each_possible_cpu(cpu) {
1841                 for (i = index; i < dev->num_tx_queues; i++) {
1842                         if (!remove_xps_queue(dev_maps, cpu, i))
1843                                 break;
1844                 }
1845                 if (i == dev->num_tx_queues)
1846                         active = true;
1847         }
1848
1849         if (!active) {
1850                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1851                 kfree_rcu(dev_maps, rcu);
1852         }
1853
1854         for (i = index; i < dev->num_tx_queues; i++)
1855                 netdev_queue_numa_node_write(netdev_get_tx_queue(dev, i),
1856                                              NUMA_NO_NODE);
1857
1858 out_no_maps:
1859         mutex_unlock(&xps_map_mutex);
1860 }
1861
1862 static struct xps_map *expand_xps_map(struct xps_map *map,
1863                                       int cpu, u16 index)
1864 {
1865         struct xps_map *new_map;
1866         int alloc_len = XPS_MIN_MAP_ALLOC;
1867         int i, pos;
1868
1869         for (pos = 0; map && pos < map->len; pos++) {
1870                 if (map->queues[pos] != index)
1871                         continue;
1872                 return map;
1873         }
1874
1875         /* Need to add queue to this CPU's existing map */
1876         if (map) {
1877                 if (pos < map->alloc_len)
1878                         return map;
1879
1880                 alloc_len = map->alloc_len * 2;
1881         }
1882
1883         /* Need to allocate new map to store queue on this CPU's map */
1884         new_map = kzalloc_node(XPS_MAP_SIZE(alloc_len), GFP_KERNEL,
1885                                cpu_to_node(cpu));
1886         if (!new_map)
1887                 return NULL;
1888
1889         for (i = 0; i < pos; i++)
1890                 new_map->queues[i] = map->queues[i];
1891         new_map->alloc_len = alloc_len;
1892         new_map->len = pos;
1893
1894         return new_map;
1895 }
1896
1897 int netif_set_xps_queue(struct net_device *dev, struct cpumask *mask, u16 index)
1898 {
1899         struct xps_dev_maps *dev_maps, *new_dev_maps = NULL;
1900         struct xps_map *map, *new_map;
1901         int maps_sz = max_t(unsigned int, XPS_DEV_MAPS_SIZE, L1_CACHE_BYTES);
1902         int cpu, numa_node_id = -2;
1903         bool active = false;
1904
1905         mutex_lock(&xps_map_mutex);
1906
1907         dev_maps = xmap_dereference(dev->xps_maps);
1908
1909         /* allocate memory for queue storage */
1910         for_each_online_cpu(cpu) {
1911                 if (!cpumask_test_cpu(cpu, mask))
1912                         continue;
1913
1914                 if (!new_dev_maps)
1915                         new_dev_maps = kzalloc(maps_sz, GFP_KERNEL);
1916                 if (!new_dev_maps) {
1917                         mutex_unlock(&xps_map_mutex);
1918                         return -ENOMEM;
1919                 }
1920
1921                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
1922                                  NULL;
1923
1924                 map = expand_xps_map(map, cpu, index);
1925                 if (!map)
1926                         goto error;
1927
1928                 RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1929         }
1930
1931         if (!new_dev_maps)
1932                 goto out_no_new_maps;
1933
1934         for_each_possible_cpu(cpu) {
1935                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu)) {
1936                         /* add queue to CPU maps */
1937                         int pos = 0;
1938
1939                         map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1940                         while ((pos < map->len) && (map->queues[pos] != index))
1941                                 pos++;
1942
1943                         if (pos == map->len)
1944                                 map->queues[map->len++] = index;
1945 #ifdef CONFIG_NUMA
1946                         if (numa_node_id == -2)
1947                                 numa_node_id = cpu_to_node(cpu);
1948                         else if (numa_node_id != cpu_to_node(cpu))
1949                                 numa_node_id = -1;
1950 #endif
1951                 } else if (dev_maps) {
1952                         /* fill in the new device map from the old device map */
1953                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1954                         RCU_INIT_POINTER(new_dev_maps->cpu_map[cpu], map);
1955                 }
1956
1957         }
1958
1959         rcu_assign_pointer(dev->xps_maps, new_dev_maps);
1960
1961         /* Cleanup old maps */
1962         if (dev_maps) {
1963                 for_each_possible_cpu(cpu) {
1964                         new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
1965                         map = xmap_dereference(dev_maps->cpu_map[cpu]);
1966                         if (map && map != new_map)
1967                                 kfree_rcu(map, rcu);
1968                 }
1969
1970                 kfree_rcu(dev_maps, rcu);
1971         }
1972
1973         dev_maps = new_dev_maps;
1974         active = true;
1975
1976 out_no_new_maps:
1977         /* update Tx queue numa node */
1978         netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index),
1979                                      (numa_node_id >= 0) ? numa_node_id :
1980                                      NUMA_NO_NODE);
1981
1982         if (!dev_maps)
1983                 goto out_no_maps;
1984
1985         /* removes queue from unused CPUs */
1986         for_each_possible_cpu(cpu) {
1987                 if (cpumask_test_cpu(cpu, mask) && cpu_online(cpu))
1988                         continue;
1989
1990                 if (remove_xps_queue(dev_maps, cpu, index))
1991                         active = true;
1992         }
1993
1994         /* free map if not active */
1995         if (!active) {
1996                 RCU_INIT_POINTER(dev->xps_maps, NULL);
1997                 kfree_rcu(dev_maps, rcu);
1998         }
1999
2000 out_no_maps:
2001         mutex_unlock(&xps_map_mutex);
2002
2003         return 0;
2004 error:
2005         /* remove any maps that we added */
2006         for_each_possible_cpu(cpu) {
2007                 new_map = xmap_dereference(new_dev_maps->cpu_map[cpu]);
2008                 map = dev_maps ? xmap_dereference(dev_maps->cpu_map[cpu]) :
2009                                  NULL;
2010                 if (new_map && new_map != map)
2011                         kfree(new_map);
2012         }
2013
2014         mutex_unlock(&xps_map_mutex);
2015
2016         kfree(new_dev_maps);
2017         return -ENOMEM;
2018 }
2019 EXPORT_SYMBOL(netif_set_xps_queue);
2020
2021 #endif
2022 /*
2023  * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues
2024  * greater then real_num_tx_queues stale skbs on the qdisc must be flushed.
2025  */
2026 int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq)
2027 {
2028         int rc;
2029
2030         if (txq < 1 || txq > dev->num_tx_queues)
2031                 return -EINVAL;
2032
2033         if (dev->reg_state == NETREG_REGISTERED ||
2034             dev->reg_state == NETREG_UNREGISTERING) {
2035                 ASSERT_RTNL();
2036
2037                 rc = netdev_queue_update_kobjects(dev, dev->real_num_tx_queues,
2038                                                   txq);
2039                 if (rc)
2040                         return rc;
2041
2042                 if (dev->num_tc)
2043                         netif_setup_tc(dev, txq);
2044
2045                 if (txq < dev->real_num_tx_queues) {
2046                         qdisc_reset_all_tx_gt(dev, txq);
2047 #ifdef CONFIG_XPS
2048                         netif_reset_xps_queues_gt(dev, txq);
2049 #endif
2050                 }
2051         }
2052
2053         dev->real_num_tx_queues = txq;
2054         return 0;
2055 }
2056 EXPORT_SYMBOL(netif_set_real_num_tx_queues);
2057
2058 #ifdef CONFIG_RPS
2059 /**
2060  *      netif_set_real_num_rx_queues - set actual number of RX queues used
2061  *      @dev: Network device
2062  *      @rxq: Actual number of RX queues
2063  *
2064  *      This must be called either with the rtnl_lock held or before
2065  *      registration of the net device.  Returns 0 on success, or a
2066  *      negative error code.  If called before registration, it always
2067  *      succeeds.
2068  */
2069 int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq)
2070 {
2071         int rc;
2072
2073         if (rxq < 1 || rxq > dev->num_rx_queues)
2074                 return -EINVAL;
2075
2076         if (dev->reg_state == NETREG_REGISTERED) {
2077                 ASSERT_RTNL();
2078
2079                 rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues,
2080                                                   rxq);
2081                 if (rc)
2082                         return rc;
2083         }
2084
2085         dev->real_num_rx_queues = rxq;
2086         return 0;
2087 }
2088 EXPORT_SYMBOL(netif_set_real_num_rx_queues);
2089 #endif
2090
2091 /**
2092  * netif_get_num_default_rss_queues - default number of RSS queues
2093  *
2094  * This routine should set an upper limit on the number of RSS queues
2095  * used by default by multiqueue devices.
2096  */
2097 int netif_get_num_default_rss_queues(void)
2098 {
2099         return min_t(int, DEFAULT_MAX_NUM_RSS_QUEUES, num_online_cpus());
2100 }
2101 EXPORT_SYMBOL(netif_get_num_default_rss_queues);
2102
2103 static inline void __netif_reschedule(struct Qdisc *q)
2104 {
2105         struct softnet_data *sd;
2106         unsigned long flags;
2107
2108         local_irq_save(flags);
2109         sd = &__get_cpu_var(softnet_data);
2110         q->next_sched = NULL;
2111         *sd->output_queue_tailp = q;
2112         sd->output_queue_tailp = &q->next_sched;
2113         raise_softirq_irqoff(NET_TX_SOFTIRQ);
2114         local_irq_restore(flags);
2115 }
2116
2117 void __netif_schedule(struct Qdisc *q)
2118 {
2119         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
2120                 __netif_reschedule(q);
2121 }
2122 EXPORT_SYMBOL(__netif_schedule);
2123
2124 void dev_kfree_skb_irq(struct sk_buff *skb)
2125 {
2126         if (atomic_dec_and_test(&skb->users)) {
2127                 struct softnet_data *sd;
2128                 unsigned long flags;
2129
2130                 local_irq_save(flags);
2131                 sd = &__get_cpu_var(softnet_data);
2132                 skb->next = sd->completion_queue;
2133                 sd->completion_queue = skb;
2134                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
2135                 local_irq_restore(flags);
2136         }
2137 }
2138 EXPORT_SYMBOL(dev_kfree_skb_irq);
2139
/*
 * Release @skb from any context: uses dev_kfree_skb_irq() when running in
 * hard interrupt context or with interrupts disabled, dev_kfree_skb()
 * otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
2148
2149
2150 /**
2151  * netif_device_detach - mark device as removed
2152  * @dev: network device
2153  *
2154  * Mark device as removed from system and therefore no longer available.
2155  */
2156 void netif_device_detach(struct net_device *dev)
2157 {
2158         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
2159             netif_running(dev)) {
2160                 netif_tx_stop_all_queues(dev);
2161         }
2162 }
2163 EXPORT_SYMBOL(netif_device_detach);
2164
2165 /**
2166  * netif_device_attach - mark device as attached
2167  * @dev: network device
2168  *
2169  * Mark device as attached from system and restart if needed.
2170  */
2171 void netif_device_attach(struct net_device *dev)
2172 {
2173         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
2174             netif_running(dev)) {
2175                 netif_tx_wake_all_queues(dev);
2176                 __netdev_watchdog_up(dev);
2177         }
2178 }
2179 EXPORT_SYMBOL(netif_device_attach);
2180
2181 static void skb_warn_bad_offload(const struct sk_buff *skb)
2182 {
2183         static const netdev_features_t null_features = 0;
2184         struct net_device *dev = skb->dev;
2185         const char *driver = "";
2186
2187         if (!net_ratelimit())
2188                 return;
2189
2190         if (dev && dev->dev.parent)
2191                 driver = dev_driver_string(dev->dev.parent);
2192
2193         WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
2194              "gso_type=%d ip_summed=%d\n",
2195              driver, dev ? &dev->features : &null_features,
2196              skb->sk ? &skb->sk->sk_route_caps : &null_features,
2197              skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
2198              skb_shinfo(skb)->gso_type, skb->ip_summed);
2199 }
2200
2201 /*
2202  * Invalidate hardware checksum when packet is to be mangled, and
2203  * complete checksum manually on outgoing path.
2204  */
2205 int skb_checksum_help(struct sk_buff *skb)
2206 {
2207         __wsum csum;
2208         int ret = 0, offset;
2209
2210         if (skb->ip_summed == CHECKSUM_COMPLETE)
2211                 goto out_set_summed;
2212
2213         if (unlikely(skb_shinfo(skb)->gso_size)) {
2214                 skb_warn_bad_offload(skb);
2215                 return -EINVAL;
2216         }
2217
2218         /* Before computing a checksum, we should make sure no frag could
2219          * be modified by an external entity : checksum could be wrong.
2220          */
2221         if (skb_has_shared_frag(skb)) {
2222                 ret = __skb_linearize(skb);
2223                 if (ret)
2224                         goto out;
2225         }
2226
2227         offset = skb_checksum_start_offset(skb);
2228         BUG_ON(offset >= skb_headlen(skb));
2229         csum = skb_checksum(skb, offset, skb->len - offset, 0);
2230
2231         offset += skb->csum_offset;
2232         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
2233
2234         if (skb_cloned(skb) &&
2235             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
2236                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
2237                 if (ret)
2238                         goto out;
2239         }
2240
2241         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
2242 out_set_summed:
2243         skb->ip_summed = CHECKSUM_NONE;
2244 out:
2245         return ret;
2246 }
2247 EXPORT_SYMBOL(skb_checksum_help);
2248
2249 __be16 skb_network_protocol(struct sk_buff *skb)
2250 {
2251         __be16 type = skb->protocol;
2252         int vlan_depth = ETH_HLEN;
2253
2254         /* Tunnel gso handlers can set protocol to ethernet. */
2255         if (type == htons(ETH_P_TEB)) {
2256                 struct ethhdr *eth;
2257
2258                 if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr))))
2259                         return 0;
2260
2261                 eth = (struct ethhdr *)skb_mac_header(skb);
2262                 type = eth->h_proto;
2263         }
2264
2265         while (type == htons(ETH_P_8021Q) || type == htons(ETH_P_8021AD)) {
2266                 struct vlan_hdr *vh;
2267
2268                 if (unlikely(!pskb_may_pull(skb, vlan_depth + VLAN_HLEN)))
2269                         return 0;
2270
2271                 vh = (struct vlan_hdr *)(skb->data + vlan_depth);
2272                 type = vh->h_vlan_encapsulated_proto;
2273                 vlan_depth += VLAN_HLEN;
2274         }
2275
2276         return type;
2277 }
2278
2279 /**
2280  *      skb_mac_gso_segment - mac layer segmentation handler.
2281  *      @skb: buffer to segment
2282  *      @features: features for the output path (see dev->features)
2283  */
2284 struct sk_buff *skb_mac_gso_segment(struct sk_buff *skb,
2285                                     netdev_features_t features)
2286 {
2287         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
2288         struct packet_offload *ptype;
2289         __be16 type = skb_network_protocol(skb);
2290
2291         if (unlikely(!type))
2292                 return ERR_PTR(-EINVAL);
2293
2294         __skb_pull(skb, skb->mac_len);
2295
2296         rcu_read_lock();
2297         list_for_each_entry_rcu(ptype, &offload_base, list) {
2298                 if (ptype->type == type && ptype->callbacks.gso_segment) {
2299                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
2300                                 int err;
2301
2302                                 err = ptype->callbacks.gso_send_check(skb);
2303                                 segs = ERR_PTR(err);
2304                                 if (err || skb_gso_ok(skb, features))
2305                                         break;
2306                                 __skb_push(skb, (skb->data -
2307                                                  skb_network_header(skb)));
2308                         }
2309                         segs = ptype->callbacks.gso_segment(skb, features);
2310                         break;
2311                 }
2312         }
2313         rcu_read_unlock();
2314
2315         __skb_push(skb, skb->data - skb_mac_header(skb));
2316
2317         return segs;
2318 }
2319 EXPORT_SYMBOL(skb_mac_gso_segment);
2320
2321
2322 /* openvswitch calls this on rx path, so we need a different check.
2323  */
2324 static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
2325 {
2326         if (tx_path)
2327                 return skb->ip_summed != CHECKSUM_PARTIAL;
2328         else
2329                 return skb->ip_summed == CHECKSUM_NONE;
2330 }
2331
2332 /**
2333  *      __skb_gso_segment - Perform segmentation on skb.
2334  *      @skb: buffer to segment
2335  *      @features: features for the output path (see dev->features)
2336  *      @tx_path: whether it is called in TX path
2337  *
2338  *      This function segments the given skb and returns a list of segments.
2339  *
2340  *      It may return NULL if the skb requires no segmentation.  This is
2341  *      only possible when GSO is used for verifying header integrity.
2342  */
2343 struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
2344                                   netdev_features_t features, bool tx_path)
2345 {
2346         if (unlikely(skb_needs_check(skb, tx_path))) {
2347                 int err;
2348
2349                 skb_warn_bad_offload(skb);
2350
2351                 if (skb_header_cloned(skb) &&
2352                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
2353                         return ERR_PTR(err);
2354         }
2355
2356         SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
2357         skb_reset_mac_header(skb);
2358         skb_reset_mac_len(skb);
2359
2360         return skb_mac_gso_segment(skb, features);
2361 }
2362 EXPORT_SYMBOL(__skb_gso_segment);
2363
2364 /* Take action when hardware reception checksum errors are detected. */
2365 #ifdef CONFIG_BUG
2366 void netdev_rx_csum_fault(struct net_device *dev)
2367 {
2368         if (net_ratelimit()) {
2369                 pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
2370                 dump_stack();
2371         }
2372 }
2373 EXPORT_SYMBOL(netdev_rx_csum_fault);
2374 #endif
2375
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if one of the skb's page fragments cannot be handled by
 * @dev: it lies in high memory while the device lacks NETIF_F_HIGHDMA,
 * or (when PCI_DMA_BUS_IS_PHYS) the parent device has no DMA mask or the
 * fragment's page extends beyond it.  Returns 0 otherwise, and always
 * when CONFIG_HIGHMEM is not set.
 */

static int illegal_highdma(const struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	struct device *pdev;
	int i;

	if (!(dev->features & NETIF_F_HIGHDMA)) {
		for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
			skb_frag_t *frag = &skb_shinfo(skb)->frags[i];

			if (PageHighMem(skb_frag_page(frag)))
				return 1;
		}
	}

	if (!(PCI_DMA_BUS_IS_PHYS))
		return 0;

	pdev = dev->dev.parent;
	if (!pdev)
		return 0;

	for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
		skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
		dma_addr_t addr = page_to_phys(skb_frag_page(frag));

		if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
			return 1;
	}
#endif
	return 0;
}
2408
/* Per-skb data kept in skb->cb and accessed through DEV_GSO_CB(). */
struct dev_gso_cb {
	/* Destructor callback for the skb; may be NULL. */
	void (*destructor)(struct sk_buff *skb);
};

/* View the control buffer of @skb as a struct dev_gso_cb. */
#define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
2414
2415 static void dev_gso_skb_destructor(struct sk_buff *skb)
2416 {
2417         struct dev_gso_cb *cb;
2418
2419         do {
2420                 struct sk_buff *nskb = skb->next;
2421
2422                 skb->next = nskb->next;
2423                 nskb->next = NULL;
2424                 kfree_skb(nskb);
2425         } while (skb->next);
2426
2427         cb = DEV_GSO_CB(skb);
2428         if (cb->destructor)
2429                 cb->destructor(skb);
2430 }
2431
2432 /**
2433  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
2434  *      @skb: buffer to segment
2435  *      @features: device features as applicable to this skb
2436  *
2437  *      This function segments the given skb and stores the list of segments
2438  *      in skb->next.
2439  */
2440 static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
2441 {
2442         struct sk_buff *segs;
2443
2444         segs = skb_gso_segment(skb, features);
2445
2446         /* Verifying header integrity only. */
2447         if (!segs)
2448                 return 0;
2449
2450         if (IS_ERR(segs))
2451                 return PTR_ERR(segs);
2452
2453         skb->next = segs;
2454         DEV_GSO_CB(skb)->destructor = skb->destructor;
2455         skb->destructor = dev_gso_skb_destructor;
2456
2457         return 0;
2458 }
2459
2460 static netdev_features_t harmonize_features(struct sk_buff *skb,
2461                                             __be16 protocol,
2462                                             const struct net_device *dev,
2463                                             netdev_features_t features)
2464 {
2465         if (skb->ip_summed != CHECKSUM_NONE &&
2466             !can_checksum_protocol(features, protocol)) {
2467                 features &= ~NETIF_F_ALL_CSUM;
2468         } else if (illegal_highdma(dev, skb)) {
2469                 features &= ~NETIF_F_SG;
2470         }
2471
2472         return features;
2473 }
2474
2475 netdev_features_t netif_skb_dev_features(struct sk_buff *skb,
2476                                          const struct net_device *dev)
2477 {
2478         __be16 protocol = skb->protocol;
2479         netdev_features_t features = dev->features;
2480
2481         if (skb_shinfo(skb)->gso_segs > dev->gso_max_segs)
2482                 features &= ~NETIF_F_GSO_MASK;
2483
2484         if (protocol == htons(ETH_P_8021Q) || protocol == htons(ETH_P_8021AD)) {
2485                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
2486                 protocol = veh->h_vlan_encapsulated_proto;
2487         } else if (!vlan_tx_tag_present(skb)) {
2488                 return harmonize_features(skb, protocol, dev, features);
2489         }
2490
2491         features &= (dev->vlan_features | NETIF_F_HW_VLAN_CTAG_TX |
2492                                                NETIF_F_HW_VLAN_STAG_TX);
2493
2494         if (protocol != htons(ETH_P_8021Q) && protocol != htons(ETH_P_8021AD)) {
2495                 return harmonize_features(skb, protocol, dev, features);
2496         } else {
2497                 features &= NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST |
2498                                 NETIF_F_GEN_CSUM | NETIF_F_HW_VLAN_CTAG_TX |
2499                                 NETIF_F_HW_VLAN_STAG_TX;
2500                 return harmonize_features(skb, protocol, dev, features);
2501         }
2502
2503         return harmonize_features(skb, protocol, dev, features);
2504 }
2505 EXPORT_SYMBOL(netif_skb_dev_features);
2506
2507 /*
2508  * Returns true if either:
2509  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2510  *      2. skb is fragmented and the device does not support SG.
2511  */
2512 static inline int skb_needs_linearize(struct sk_buff *skb,
2513                                       netdev_features_t features)
2514 {
2515         return skb_is_nonlinear(skb) &&
2516                         ((skb_has_frag_list(skb) &&
2517                                 !(features & NETIF_F_FRAGLIST)) ||
2518                         (skb_shinfo(skb)->nr_frags &&
2519                                 !(features & NETIF_F_SG)));
2520 }
2521
2522 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
2523                         struct netdev_queue *txq)
2524 {
2525         const struct net_device_ops *ops = dev->netdev_ops;
2526         int rc = NETDEV_TX_OK;
2527         unsigned int skb_len;
2528
2529         if (likely(!skb->next)) {
2530                 netdev_features_t features;
2531
2532                 /*
2533                  * If device doesn't need skb->dst, release it right now while
2534                  * its hot in this cpu cache
2535                  */
2536                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
2537                         skb_dst_drop(skb);
2538
2539                 features = netif_skb_features(skb);
2540
2541                 if (vlan_tx_tag_present(skb) &&
2542                     !vlan_hw_offload_capable(features, skb->vlan_proto)) {
2543                         skb = __vlan_put_tag(skb, skb->vlan_proto,
2544                                              vlan_tx_tag_get(skb));
2545                         if (unlikely(!skb))
2546                                 goto out;
2547
2548                         skb->vlan_tci = 0;
2549                 }
2550
2551                 /* If encapsulation offload request, verify we are testing
2552                  * hardware encapsulation features instead of standard
2553                  * features for the netdev
2554                  */
2555                 if (skb->encapsulation)
2556                         features &= dev->hw_enc_features;
2557
2558                 if (netif_needs_gso(skb, features)) {
2559                         if (unlikely(dev_gso_segment(skb, features)))
2560                                 goto out_kfree_skb;
2561                         if (skb->next)
2562                                 goto gso;
2563                 } else {
2564                         if (skb_needs_linearize(skb, features) &&
2565                             __skb_linearize(skb))
2566                                 goto out_kfree_skb;
2567
2568                         /* If packet is not checksummed and device does not
2569                          * support checksumming for this protocol, complete
2570                          * checksumming here.
2571                          */
2572                         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2573                                 if (skb->encapsulation)
2574                                         skb_set_inner_transport_header(skb,
2575                                                 skb_checksum_start_offset(skb));
2576                                 else
2577                                         skb_set_transport_header(skb,
2578                                                 skb_checksum_start_offset(skb));
2579                                 if (!(features & NETIF_F_ALL_CSUM) &&
2580                                      skb_checksum_help(skb))
2581                                         goto out_kfree_skb;
2582                         }
2583                 }
2584
2585                 if (!list_empty(&ptype_all))
2586                         dev_queue_xmit_nit(skb, dev);
2587
2588                 skb_len = skb->len;
2589                 rc = ops->ndo_start_xmit(skb, dev);
2590                 trace_net_dev_xmit(skb, rc, dev, skb_len);
2591                 if (rc == NETDEV_TX_OK)
2592                         txq_trans_update(txq);
2593                 return rc;
2594         }
2595
2596 gso:
2597         do {
2598                 struct sk_buff *nskb = skb->next;
2599
2600                 skb->next = nskb->next;
2601                 nskb->next = NULL;
2602
2603                 if (!list_empty(&ptype_all))
2604                         dev_queue_xmit_nit(nskb, dev);
2605
2606                 skb_len = nskb->len;
2607                 rc = ops->ndo_start_xmit(nskb, dev);
2608                 trace_net_dev_xmit(nskb, rc, dev, skb_len);
2609                 if (unlikely(rc != NETDEV_TX_OK)) {
2610                         if (rc & ~NETDEV_TX_MASK)
2611                                 goto out_kfree_gso_skb;
2612                         nskb->next = skb->next;
2613                         skb->next = nskb;
2614                         return rc;
2615                 }
2616                 txq_trans_update(txq);
2617                 if (unlikely(netif_xmit_stopped(txq) && skb->next))
2618                         return NETDEV_TX_BUSY;
2619         } while (skb->next);
2620
2621 out_kfree_gso_skb:
2622         if (likely(skb->next == NULL)) {
2623                 skb->destructor = DEV_GSO_CB(skb)->destructor;
2624                 consume_skb(skb);
2625                 return rc;
2626         }
2627 out_kfree_skb:
2628         kfree_skb(skb);
2629 out:
2630         return rc;
2631 }
2632
2633 static void qdisc_pkt_len_init(struct sk_buff *skb)
2634 {
2635         const struct skb_shared_info *shinfo = skb_shinfo(skb);
2636
2637         qdisc_skb_cb(skb)->pkt_len = skb->len;
2638
2639         /* To get more precise estimation of bytes sent on wire,
2640          * we add to pkt_len the headers size of all segments
2641          */
2642         if (shinfo->gso_size)  {
2643                 unsigned int hdr_len;
2644                 u16 gso_segs = shinfo->gso_segs;
2645
2646                 /* mac layer + network layer */
2647                 hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
2648
2649                 /* + transport layer */
2650                 if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))
2651                         hdr_len += tcp_hdrlen(skb);
2652                 else
2653                         hdr_len += sizeof(struct udphdr);
2654
2655                 if (shinfo->gso_type & SKB_GSO_DODGY)
2656                         gso_segs = DIV_ROUND_UP(skb->len - hdr_len,
2657                                                 shinfo->gso_size);
2658
2659                 qdisc_skb_cb(skb)->pkt_len += (gso_segs - 1) * hdr_len;
2660         }
2661 }
2662
2663 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2664                                  struct net_device *dev,
2665                                  struct netdev_queue *txq)
2666 {
2667         spinlock_t *root_lock = qdisc_lock(q);
2668         bool contended;
2669         int rc;
2670
2671         qdisc_pkt_len_init(skb);
2672         qdisc_calculate_pkt_len(skb, q);
2673         /*
2674          * Heuristic to force contended enqueues to serialize on a
2675          * separate lock before trying to get qdisc main lock.
2676          * This permits __QDISC_STATE_RUNNING owner to get the lock more often
2677          * and dequeue packets faster.
2678          */
2679         contended = qdisc_is_running(q);
2680         if (unlikely(contended))
2681                 spin_lock(&q->busylock);
2682
2683         spin_lock(root_lock);
2684         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2685                 printk(KERN_WARNING "[mtk_net]__dev_xmit_skb drop skb_len = %d \n", skb->len);
2686                 kfree_skb(skb);
2687                 rc = NET_XMIT_DROP;
2688         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2689                    qdisc_run_begin(q)) {
2690                 /*
2691                  * This is a work-conserving queue; there are no old skbs
2692                  * waiting to be sent out; and the qdisc is not running -
2693                  * xmit the skb directly.
2694                  */
2695                 if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE))
2696                         skb_dst_force(skb);
2697
2698                 qdisc_bstats_update(q, skb);
2699
2700                 if (sch_direct_xmit(skb, q, dev, txq, root_lock)) {
2701                         if (unlikely(contended)) {
2702                                 spin_unlock(&q->busylock);
2703                                 contended = false;
2704                         }
2705                         __qdisc_run(q);
2706                 } else
2707                         qdisc_run_end(q);
2708
2709                 rc = NET_XMIT_SUCCESS;
2710         } else {
2711                 skb_dst_force(skb);
2712                 rc = q->enqueue(skb, q) & NET_XMIT_MASK;
2713                 if (qdisc_run_begin(q)) {
2714                         if (unlikely(contended)) {
2715                                 spin_unlock(&q->busylock);
2716                                 contended = false;
2717                         }
2718                         __qdisc_run(q);
2719                 }
2720         }
2721         spin_unlock(root_lock);
2722         if (unlikely(contended))
2723                 spin_unlock(&q->busylock);
2724         return rc;
2725 }
2726
2727 #if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
2728 static void skb_update_prio(struct sk_buff *skb)
2729 {
2730         struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
2731
2732         if (!skb->priority && skb->sk && map) {
2733                 unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
2734
2735                 if (prioidx < map->priomap_len)
2736                         skb->priority = map->priomap[prioidx];
2737         }
2738 }
2739 #else
2740 #define skb_update_prio(skb)
2741 #endif
2742
/*
 * Per-CPU counter that dev_queue_xmit() increments around its direct call
 * to dev_hard_start_xmit() on queueless devices.  Once the counter exceeds
 * RECURSION_LIMIT the transmit is refused and reported as a dead loop.
 */
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
2745
2746 /**
2747  *      dev_loopback_xmit - loop back @skb
2748  *      @skb: buffer to transmit
2749  */
2750 int dev_loopback_xmit(struct sk_buff *skb)
2751 {
2752         skb_reset_mac_header(skb);
2753         __skb_pull(skb, skb_network_offset(skb));
2754         skb->pkt_type = PACKET_LOOPBACK;
2755         skb->ip_summed = CHECKSUM_UNNECESSARY;
2756         WARN_ON(!skb_dst(skb));
2757         skb_dst_force(skb);
2758         netif_rx_ni(skb);
2759         return 0;
2760 }
2761 EXPORT_SYMBOL(dev_loopback_xmit);
2762
/**
 *      dev_queue_xmit - transmit a buffer
 *      @skb: buffer to transmit
 *
 *      Queue a buffer for transmission to a network device. The caller must
 *      have set the device and priority and built the buffer before calling
 *      this function. The function can be called from an interrupt.
 *
 *      A negative errno code is returned on a failure. A success does not
 *      guarantee the frame will be transmitted as it may be dropped due
 *      to congestion or traffic shaping.
 *
 * -----------------------------------------------------------------------------------
 *      I notice this method can also return errors from the queue disciplines,
 *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
 *      be positive.
 *
 *      Regardless of the return value, the skb is consumed, so it is currently
 *      difficult to retry a send to this method.  (You can bump the ref count
 *      before sending to hold a reference for retry if you are careful.)
 *
 *      When calling this method, interrupts MUST be enabled.  This is because
 *      the BH enable code must have IRQs enabled so that it will not deadlock.
 *          --BLG
 *
 *      Paths taken below:
 *      - the selected queue's qdisc has an enqueue operation: the skb goes
 *        through __dev_xmit_skb() and its result is returned;
 *      - otherwise, if the device is IFF_UP, the skb is passed directly to
 *        dev_hard_start_xmit() under HARD_TX_LOCK, guarded by a per-CPU
 *        recursion counter;
 *      - in every remaining case the skb is freed here and -ENETDOWN is
 *        returned.
 */
int dev_queue_xmit(struct sk_buff *skb)
{
        struct net_device *dev = skb->dev;
        struct netdev_queue *txq;
        struct Qdisc *q;
        int rc = -ENOMEM;

        skb_reset_mac_header(skb);

#ifdef UDP_SKT_WIFI

        /*
         * Vendor trace hook, built only when UDP_SKT_WIFI is defined.  When
         * the sysctl switch is on and the sending socket's source port
         * equals sysctl_udp_met_port, a trace event is emitted carrying the
         * 16-bit value found 10 bytes after the start of the UDP header.
         *
         * NOTE(review): ip_hdr(skb)->protocol is read without first checking
         * that the skb is IPv4, and inet_sk() is applied to skb->sk without
         * checking the socket family - confirm both are safe for every
         * caller.
         * NOTE(review): the format string below has three conversions
         * (%d, %s, %d) but only two arguments are passed; verify against the
         * definition of udp_event_trace_printk.
         */
        if (unlikely((sysctl_met_is_enable == 1) && (sysctl_udp_met_port > 0)
                 && (ip_hdr(skb)->protocol == IPPROTO_UDP) && skb->sk)) {

            if (sysctl_udp_met_port == ntohs((inet_sk(skb->sk))->inet_sport)) {
                struct udphdr * udp_iphdr = udp_hdr(skb);
                /* len >= 12 covers the two bytes read at offset 10 */
                if (udp_iphdr && (ntohs(udp_iphdr->len) >= 12)) {
                __u16 * seq_id = (__u16 *)((char *)udp_iphdr + 10);
                    udp_event_trace_printk("F|%d|%s|%d\n", current->pid, *seq_id);

                }
            }
        }
#endif

        /* Disable soft irqs for various locks below. Also
         * stops preemption for RCU.
         */
        rcu_read_lock_bh();

        skb_update_prio(skb);

        txq = netdev_pick_tx(dev, skb);
        q = rcu_dereference_bh(txq->qdisc);

#ifdef CONFIG_NET_CLS_ACT
        skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
#endif
        trace_net_dev_queue(skb);
        /* A qdisc with an enqueue operation takes the normal queued path. */
        if (q->enqueue) {
                rc = __dev_xmit_skb(skb, q, dev, txq);
                goto out;
        }

        /* The device has no queue. Common case for software devices:
           loopback, all the sorts of tunnels...

           Really, it is unlikely that netif_tx_lock protection is necessary
           here.  (f.e. loopback and IP tunnels are clean ignoring statistics
           counters.)
           However, it is possible, that they rely on protection
           made by us here.

           Check this and shot the lock. It is not prone from deadlocks.
           Either shot noqueue qdisc, it is even simpler 8)
         */
        if (dev->flags & IFF_UP) {
                int cpu = smp_processor_id(); /* ok because BHs are off */

                /* Owning the xmit lock already means we re-entered on this CPU. */
                if (txq->xmit_lock_owner != cpu) {

                        if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT)
                                goto recursion_alert;

                        HARD_TX_LOCK(dev, txq, cpu);

                        if (!netif_xmit_stopped(txq)) {
                                /* Count nesting depth across the driver call. */
                                __this_cpu_inc(xmit_recursion);
                                rc = dev_hard_start_xmit(skb, dev, txq);
                                __this_cpu_dec(xmit_recursion);
                                if (dev_xmit_complete(rc)) {
                                        HARD_TX_UNLOCK(dev, txq);
                                        goto out;
                                }
                        }
                        HARD_TX_UNLOCK(dev, txq);
                        net_crit_ratelimited("Virtual device %s asks to queue packet!\n",
                                             dev->name);
                } else {
                        /* Recursion is detected! It is possible,
                         * unfortunately
                         */
recursion_alert:
                        net_crit_ratelimited("Dead loop on virtual device %s, fix it urgently!\n",
                                             dev->name);
                }
        }

        /*
         * Reached when the device is not up, the queue was stopped, the
         * transmit did not complete, or recursion was detected: the skb is
         * freed here and any earlier rc is replaced.
         */
        rc = -ENETDOWN;
        rcu_read_unlock_bh();

        kfree_skb(skb);
        return rc;
out:
        rcu_read_unlock_bh();
        return rc;
}
EXPORT_SYMBOL(dev_queue_xmit);
2886
2887
2888 /*=======================================================================
2889                         Receiver routines
2890   =======================================================================*/
2891
/*
 * Threshold used by enqueue_to_backlog(): an skb is dropped once the target
 * CPU's input_pkt_queue already holds more than this many buffers.
 */
int netdev_max_backlog __read_mostly = 1000;
EXPORT_SYMBOL(netdev_max_backlog);

/* Condition handed to net_timestamp_check() by netif_rx(), before queueing. */
int netdev_tstamp_prequeue __read_mostly = 1;
/*
 * NOTE(review): not referenced in this part of the file; presumably the
 * packet budget of one receive softirq run - confirm at the use site.
 */
int netdev_budget __read_mostly = 300;
int weight_p __read_mostly = 64;            /* old backlog weight */
2898
/*
 * Append @napi to the tail of @sd's poll_list and raise NET_RX_SOFTIRQ.
 * Called with irq disabled.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
                                     struct napi_struct *napi)
{
        list_add_tail(&napi->poll_list, &sd->poll_list);
        __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
2906
2907 #ifdef CONFIG_RPS
2908
/*
 * One global table that all flow-based protocols share.
 * get_rps_cpu() reads it, indexed by rxhash, to find the desired CPU.
 */
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);

/* Static key tested by netif_rx() to decide whether to take the RPS path. */
struct static_key rps_needed __read_mostly;
2914
/*
 * set_rps_cpu - record @next_cpu as the CPU handling a flow
 * @dev: receiving device
 * @skb: packet belonging to the flow
 * @rflow: flow entry currently used for the packet
 * @next_cpu: CPU to switch to, or RPS_NO_CPU
 *
 * With CONFIG_RFS_ACCEL, and when the device supports it, the driver is
 * also asked to steer the flow to the RX queue that cpu_rmap maps to
 * @next_cpu; on success the flow entry of that queue's table replaces
 * @rflow.  The entry that ends up being used gets its last_qtail set to
 * @next_cpu's input_queue_head (unless @next_cpu is RPS_NO_CPU) and its cpu
 * set to @next_cpu.
 *
 * Returns the flow entry that was updated.
 */
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
            struct rps_dev_flow *rflow, u16 next_cpu)
{
        if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
                struct netdev_rx_queue *rxqueue;
                struct rps_dev_flow_table *flow_table;
                struct rps_dev_flow *old_rflow;
                u32 flow_id;
                u16 rxq_index;
                int rc;

                /* Should we steer this flow to a different hardware queue? */
                if (!skb_rx_queue_recorded(skb) || !dev->rx_cpu_rmap ||
                    !(dev->features & NETIF_F_NTUPLE))
                        goto out;
                rxq_index = cpu_rmap_lookup_index(dev->rx_cpu_rmap, next_cpu);
                /* Already arriving on the queue that serves next_cpu. */
                if (rxq_index == skb_get_rx_queue(skb))
                        goto out;

                rxqueue = dev->_rx + rxq_index;
                flow_table = rcu_dereference(rxqueue->rps_flow_table);
                if (!flow_table)
                        goto out;
                flow_id = skb->rxhash & flow_table->mask;
                /* A non-negative return value is stored as the filter id. */
                rc = dev->netdev_ops->ndo_rx_flow_steer(dev, skb,
                                                        rxq_index, flow_id);
                if (rc < 0)
                        goto out;
                old_rflow = rflow;
                rflow = &flow_table->flows[flow_id];
                rflow->filter = rc;
                /* Clear the old entry's filter if it carried the same id. */
                if (old_rflow->filter == rflow->filter)
                        old_rflow->filter = RPS_NO_FILTER;
        out:
#endif
                rflow->last_qtail =
                        per_cpu(softnet_data, next_cpu).input_queue_head;
        }

        rflow->cpu = next_cpu;
        return rflow;
}
2959
/*
 * get_rps_cpu is called from netif_receive_skb and returns the target
 * CPU from the RPS map of the receiving queue for a given skb.
 * rcu_read_lock must be held on entry.
 *
 * netif_rx() calls it as well.  The return value is -1 when no online
 * target CPU was selected.  *rflowp is written only when the CPU is chosen
 * through the flow tables; on every other path it is left as passed in.
 */
static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                       struct rps_dev_flow **rflowp)
{
        struct netdev_rx_queue *rxqueue;
        struct rps_map *map;
        struct rps_dev_flow_table *flow_table;
        struct rps_sock_flow_table *sock_flow_table;
        int cpu = -1;
        u16 tcpu;

        /* Pick the RX queue the skb was recorded on, else the first one. */
        if (skb_rx_queue_recorded(skb)) {
                u16 index = skb_get_rx_queue(skb);
                if (unlikely(index >= dev->real_num_rx_queues)) {
                        WARN_ONCE(dev->real_num_rx_queues > 1,
                                  "%s received packet on queue %u, but number "
                                  "of RX queues is %u\n",
                                  dev->name, index, dev->real_num_rx_queues);
                        goto done;
                }
                rxqueue = dev->_rx + index;
        } else
                rxqueue = dev->_rx;

        map = rcu_dereference(rxqueue->rps_map);
        if (map) {
                /* Single-CPU map and no flow table: no hash is needed. */
                if (map->len == 1 &&
                    !rcu_access_pointer(rxqueue->rps_flow_table)) {
                        tcpu = map->cpus[0];
                        if (cpu_online(tcpu))
                                cpu = tcpu;
                        goto done;
                }
        } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
                /* Neither a map nor a flow table is configured. */
                goto done;
        }

        skb_reset_network_header(skb);
        /* A zero hash means no steering decision can be made. */
        if (!skb_get_rxhash(skb))
                goto done;

        flow_table = rcu_dereference(rxqueue->rps_flow_table);
        sock_flow_table = rcu_dereference(rps_sock_flow_table);
        if (flow_table && sock_flow_table) {
                u16 next_cpu;
                struct rps_dev_flow *rflow;

                rflow = &flow_table->flows[skb->rxhash & flow_table->mask];
                tcpu = rflow->cpu;

                next_cpu = sock_flow_table->ents[skb->rxhash &
                    sock_flow_table->mask];

                /*
                 * If the desired CPU (where last recvmsg was done) is
                 * different from current CPU (one in the rx-queue flow
                 * table entry), switch if one of the following holds:
                 *   - Current CPU is unset (equal to RPS_NO_CPU).
                 *   - Current CPU is offline.
                 *   - The current CPU's queue tail has advanced beyond the
                 *     last packet that was enqueued using this table entry.
                 *     This guarantees that all previous packets for the flow
                 *     have been dequeued, thus preserving in order delivery.
                 */
                if (unlikely(tcpu != next_cpu) &&
                    (tcpu == RPS_NO_CPU || !cpu_online(tcpu) ||
                     ((int)(per_cpu(softnet_data, tcpu).input_queue_head -
                      rflow->last_qtail)) >= 0)) {
                        tcpu = next_cpu;
                        rflow = set_rps_cpu(dev, skb, rflow, next_cpu);
                }

                if (tcpu != RPS_NO_CPU && cpu_online(tcpu)) {
                        *rflowp = rflow;
                        cpu = tcpu;
                        goto done;
                }
        }

        /* Fall back to the map: scale the 32-bit hash onto [0, map->len). */
        if (map) {
                tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];

                if (cpu_online(tcpu)) {
                        cpu = tcpu;
                        goto done;
                }
        }

done:
        return cpu;
}
3055
3056 #ifdef CONFIG_RFS_ACCEL
3057
3058 /**
3059  * rps_may_expire_flow - check whether an RFS hardware filter may be removed
3060  * @dev: Device on which the filter was set
3061  * @rxq_index: RX queue index
3062  * @flow_id: Flow ID passed to ndo_rx_flow_steer()
3063  * @filter_id: Filter ID returned by ndo_rx_flow_steer()
3064  *
3065  * Drivers that implement ndo_rx_flow_steer() should periodically call
3066  * this function for each installed filter and remove the filters for
3067  * which it returns %true.
3068  */
3069 bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
3070                          u32 flow_id, u16 filter_id)
3071 {
3072         struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
3073         struct rps_dev_flow_table *flow_table;
3074         struct rps_dev_flow *rflow;
3075         bool expire = true;
3076         int cpu;
3077
3078         rcu_read_lock();
3079         flow_table = rcu_dereference(rxqueue->rps_flow_table);
3080         if (flow_table && flow_id <= flow_table->mask) {
3081                 rflow = &flow_table->flows[flow_id];
3082                 cpu = ACCESS_ONCE(rflow->cpu);
3083                 if (rflow->filter == filter_id && cpu != RPS_NO_CPU &&
3084                     ((int)(per_cpu(softnet_data, cpu).input_queue_head -
3085                            rflow->last_qtail) <
3086                      (int)(10 * flow_table->mask)))
3087                         expire = false;
3088         }
3089         rcu_read_unlock();
3090         return expire;
3091 }
3092 EXPORT_SYMBOL(rps_may_expire_flow);
3093
3094 #endif /* CONFIG_RFS_ACCEL */
3095
3096 /* Called from hardirq (IPI) context */
3097 static void rps_trigger_softirq(void *data)
3098 {
3099         struct softnet_data *sd = data;
3100
3101         ____napi_schedule(sd, &sd->backlog);
3102         sd->received_rps++;
3103 }
3104
3105 #endif /* CONFIG_RPS */
3106
/*
 * Check whether @sd belongs to a CPU other than the one running this code.
 * If it does, chain it onto the local CPU's rps_ipi_list, raise
 * NET_RX_SOFTIRQ and return 1.  Return 0 when @sd is the local structure or
 * when CONFIG_RPS is not enabled.
 */
static int rps_ipi_queued(struct softnet_data *sd)
{
        int queued = 0;
#ifdef CONFIG_RPS
        struct softnet_data *local_sd = &__get_cpu_var(softnet_data);

        if (sd != local_sd) {
                /* Push @sd onto the head of the local IPI list. */
                sd->rps_ipi_next = local_sd->rps_ipi_list;
                local_sd->rps_ipi_list = sd;

                __raise_softirq_irqoff(NET_RX_SOFTIRQ);
                queued = 1;
        }
#endif /* CONFIG_RPS */
        return queued;
}
3127
3128 /*
3129  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
3130  * queue (may be a remote CPU queue).
3131  */
3132 static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
3133                               unsigned int *qtail)
3134 {
3135         struct softnet_data *sd;
3136         unsigned long flags;
3137
3138         sd = &per_cpu(softnet_data, cpu);
3139
3140         local_irq_save(flags);
3141
3142         rps_lock(sd);
3143         if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
3144                 if (skb_queue_len(&sd->input_pkt_queue)) {
3145 enqueue:
3146                         __skb_queue_tail(&sd->input_pkt_queue, skb);
3147                         input_queue_tail_incr_save(sd, qtail);
3148                         rps_unlock(sd);
3149                         local_irq_restore(flags);
3150                         return NET_RX_SUCCESS;
3151                 }
3152
3153                 /* Schedule NAPI for backlog device
3154                  * We can use non atomic operation since we own the queue lock
3155                  */
3156                 if (!__test_and_set_bit(NAPI_STATE_SCHED, &sd->backlog.state)) {
3157                         if (!rps_ipi_queued(sd))
3158                                 ____napi_schedule(sd, &sd->backlog);
3159                 }
3160                 goto enqueue;
3161         }
3162
3163         sd->dropped++;
3164         rps_unlock(sd);
3165
3166         local_irq_restore(flags);
3167
3168         atomic_long_inc(&skb->dev->rx_dropped);
3169         kfree_skb(skb);
3170         return NET_RX_DROP;
3171 }
3172
3173 /**
3174  *      netif_rx        -       post buffer to the network code
3175  *      @skb: buffer to post
3176  *
3177  *      This function receives a packet from a device driver and queues it for
3178  *      the upper (protocol) levels to process.  It always succeeds. The buffer
3179  *      may be dropped during processing for congestion control or by the
3180  *      protocol layers.
3181  *
3182  *      return values:
3183  *      NET_RX_SUCCESS  (no congestion)
3184  *      NET_RX_DROP     (packet was dropped)
3185  *
3186  */
3187
3188 int netif_rx(struct sk_buff *skb)
3189 {
3190         int ret;
3191
3192         /* if netpoll wants it, pretend we never saw it */
3193         if (netpoll_rx(skb))
3194                 return NET_RX_DROP;
3195
3196         net_timestamp_check(netdev_tstamp_prequeue, skb);
3197
3198         trace_netif_rx(skb);
3199 #ifdef CONFIG_RPS
3200         if (static_key_false(&rps_needed)) {
3201                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3202                 int cpu;
3203
3204                 preempt_disable();
3205                 rcu_read_lock();
3206
3207                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3208                 if (cpu < 0)
3209                         cpu = smp_processor_id();
3210
3211                 ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3212
3213                 rcu_read_unlock();
3214                 preempt_enable();
3215         } else
3216 #endif
3217         {
3218                 unsigned int qtail;
3219                 ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
3220                 put_cpu();
3221         }
3222         return ret;
3223 }
3224 EXPORT_SYMBOL(netif_rx);
3225
/*
 * netif_rx_ni - netif_rx() followed by a softirq run if one is pending
 * @skb: buffer to post
 *
 * Preemption is disabled around the netif_rx() call and the
 * local_softirq_pending() check that follows it.
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
        int ret;

        preempt_disable();

        ret = netif_rx(skb);
        if (local_softirq_pending())
                do_softirq();

        preempt_enable();

        return ret;
}
EXPORT_SYMBOL(netif_rx_ni);
3239
3240 static void net_tx_action(struct softirq_action *h)
3241 {
3242         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3243
3244         if (sd->completion_queue) {
3245                 struct sk_buff *clist;
3246
3247                 local_irq_disable();
3248                 clist = sd->completion_queue;
3249                 sd->completion_queue = NULL;
3250                 local_irq_enable();
3251
3252                 while (clist) {
3253                         struct sk_buff *skb = clist;
3254                         clist = clist->next;
3255
3256                         WARN_ON(atomic_read(&skb->users));
3257                         trace_kfree_skb(skb, net_tx_action);
3258                         __kfree_skb(skb);
3259                 }
3260         }
3261
3262         if (sd->output_queue) {
3263                 struct Qdisc *head;
3264
3265                 local_irq_disable();
3266                 head = sd->output_queue;
3267                 sd->output_queue = NULL;
3268                 sd->output_queue_tailp = &sd->output_queue;
3269                 local_irq_enable();
3270
3271                 while (head) {
3272                         struct Qdisc *q = head;
3273                         spinlock_t *root_lock;
3274
3275                         head = head->next_sched;
3276
3277                         root_lock = qdisc_lock(q);
3278                         if (spin_trylock(root_lock)) {
3279                                 smp_mb__before_clear_bit();
3280                                 clear_bit(__QDISC_STATE_SCHED,
3281                                           &q->state);
3282                                 qdisc_run(q);
3283                                 spin_unlock(root_lock);
3284                         } else {
3285                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
3286                                               &q->state)) {
3287                                         __netif_reschedule(q);
3288                                 } else {
3289                                         smp_mb__before_clear_bit();
3290                                         clear_bit(__QDISC_STATE_SCHED,
3291                                                   &q->state);
3292                                 }
3293                         }
3294                 }
3295         }
3296 }
3297
#if (defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)) && \
    (defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE))
/* This hook is defined here for ATM LANE */
/* Only built when both bridge and ATM LANE support are configured. */
int (*br_fdb_test_addr_hook)(struct net_device *dev,
                             unsigned char *addr) __read_mostly;
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
3305
3306 #ifdef CONFIG_NET_CLS_ACT
3307 /* TODO: Maybe we should just force sch_ingress to be compiled in
3308  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
3309  * a compare and 2 stores extra right now if we dont have it on
3310  * but have CONFIG_NET_CLS_ACT
3311  * NOTE: This doesn't stop any functionality; if you dont have
3312  * the ingress scheduler, you just can't add policies on ingress.
3313  *
3314  */
3315 static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq)
3316 {
3317         struct net_device *dev = skb->dev;
3318         u32 ttl = G_TC_RTTL(skb->tc_verd);
3319         int result = TC_ACT_OK;
3320         struct Qdisc *q;
3321
3322         if (unlikely(MAX_RED_LOOP < ttl++)) {
3323                 net_warn_ratelimited("Redir loop detected Dropping packet (%d->%d)\n",
3324                                      skb->skb_iif, dev->ifindex);
3325                 return TC_ACT_SHOT;
3326         }
3327
3328         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
3329         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
3330
3331         q = rxq->qdisc;
3332         if (q != &noop_qdisc) {
3333                 spin_lock(qdisc_lock(q));
3334                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
3335                         result = qdisc_enqueue_root(skb, q);
3336                 spin_unlock(qdisc_lock(q));
3337         }
3338
3339         return result;
3340 }
3341
3342 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
3343                                          struct packet_type **pt_prev,
3344                                          int *ret, struct net_device *orig_dev)
3345 {
3346         struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue);
3347
3348         if (!rxq || rxq->qdisc == &noop_qdisc)
3349                 goto out;
3350
3351         if (*pt_prev) {
3352                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
3353                 *pt_prev = NULL;
3354         }
3355
3356         switch (ing_filter(skb, rxq)) {
3357         case TC_ACT_SHOT:
3358         case TC_ACT_STOLEN:
3359                 kfree_skb(skb);
3360                 return NULL;
3361         }
3362
3363 out:
3364         skb->tc_verd = 0;
3365         return skb;
3366 }
3367 #endif
3368
3369 /**
3370  *      netdev_rx_handler_register - register receive handler
3371  *      @dev: device to register a handler for
3372  *      @rx_handler: receive handler to register
3373  *      @rx_handler_data: data pointer that is used by rx handler
3374  *
3375  *      Register a receive hander for a device. This handler will then be
3376  *      called from __netif_receive_skb. A negative errno code is returned
3377  *      on a failure.
3378  *
3379  *      The caller must hold the rtnl_mutex.
3380  *
3381  *      For a general description of rx_handler, see enum rx_handler_result.
3382  */
3383 int netdev_rx_handler_register(struct net_device *dev,
3384                                rx_handler_func_t *rx_handler,
3385                                void *rx_handler_data)
3386 {
3387         ASSERT_RTNL();
3388
3389         if (dev->rx_handler)
3390                 return -EBUSY;
3391
3392         /* Note: rx_handler_data must be set before rx_handler */
3393         rcu_assign_pointer(dev->rx_handler_data, rx_handler_data);
3394         rcu_assign_pointer(dev->rx_handler, rx_handler);
3395
3396         return 0;
3397 }
3398 EXPORT_SYMBOL_GPL(netdev_rx_handler_register);
3399
3400 /**
3401  *      netdev_rx_handler_unregister - unregister receive handler
3402  *      @dev: device to unregister a handler from
3403  *
3404  *      Unregister a receive handler from a device.
3405  *
3406  *      The caller must hold the rtnl_mutex.
3407  */
3408 void netdev_rx_handler_unregister(struct net_device *dev)
3409 {
3410
3411         ASSERT_RTNL();
3412         RCU_INIT_POINTER(dev->rx_handler, NULL);
3413         /* a reader seeing a non NULL rx_handler in a rcu_read_lock()
3414          * section has a guarantee to see a non NULL rx_handler_data
3415          * as well.
3416          */
3417         synchronize_net();
3418         RCU_INIT_POINTER(dev->rx_handler_data, NULL);
3419 }
3420 EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
3421
3422 /*
3423  * Limit the use of PFMEMALLOC reserves to those protocols that implement
3424  * the special handling of PFMEMALLOC skbs.
3425  */
3426 static bool skb_pfmemalloc_protocol(struct sk_buff *skb)
3427 {
3428         switch (skb->protocol) {
3429         case __constant_htons(ETH_P_ARP):
3430         case __constant_htons(ETH_P_IP):
3431         case __constant_htons(ETH_P_IPV6):
3432         case __constant_htons(ETH_P_8021Q):
3433         case __constant_htons(ETH_P_8021AD):
3434                 return true;
3435         default:
3436                 return false;
3437         }
3438 }
3439
3440 static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc)
3441 {
3442         struct packet_type *ptype, *pt_prev;
3443         rx_handler_func_t *rx_handler;
3444         struct net_device *orig_dev;
3445         struct net_device *null_or_dev;
3446         bool deliver_exact = false;
3447         int ret = NET_RX_DROP;
3448         __be16 type;
3449
3450         net_timestamp_check(!netdev_tstamp_prequeue, skb);
3451
3452         trace_netif_receive_skb(skb);
3453
3454         /* if we've gotten here through NAPI, check netpoll */
3455         if (netpoll_receive_skb(skb))
3456                 goto out;
3457
3458         orig_dev = skb->dev;
3459
3460         skb_reset_network_header(skb);
3461         if (!skb_transport_header_was_set(skb))
3462                 skb_reset_transport_header(skb);
3463         skb_reset_mac_len(skb);
3464
3465         pt_prev = NULL;
3466
3467         rcu_read_lock();
3468
3469 another_round:
3470         skb->skb_iif = skb->dev->ifindex;
3471
3472         __this_cpu_inc(softnet_data.processed);
3473
3474         if (skb->protocol == cpu_to_be16(ETH_P_8021Q) ||
3475             skb->protocol == cpu_to_be16(ETH_P_8021AD)) {
3476                 skb = vlan_untag(skb);
3477                 if (unlikely(!skb))
3478                         goto unlock;
3479         }
3480
3481 #ifdef CONFIG_NET_CLS_ACT
3482         if (skb->tc_verd & TC_NCLS) {
3483                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
3484                 goto ncls;
3485         }
3486 #endif
3487
3488         if (pfmemalloc)
3489                 goto skip_taps;
3490
3491         list_for_each_entry_rcu(ptype, &ptype_all, list) {
3492                 if (!ptype->dev || ptype->dev == skb->dev) {
3493                         if (pt_prev)
3494                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3495                         pt_prev = ptype;
3496                 }
3497         }
3498
3499 skip_taps:
3500 #ifdef CONFIG_NET_CLS_ACT
3501         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
3502         if (!skb)
3503                 goto unlock;
3504 ncls:
3505 #endif
3506
3507         if (pfmemalloc && !skb_pfmemalloc_protocol(skb))
3508                 goto drop;
3509
3510         if (vlan_tx_tag_present(skb)) {
3511                 if (pt_prev) {
3512                         ret = deliver_skb(skb, pt_prev, orig_dev);
3513                         pt_prev = NULL;
3514                 }
3515                 if (vlan_do_receive(&skb))
3516                         goto another_round;
3517                 else if (unlikely(!skb))
3518                         goto unlock;
3519         }
3520
3521         rx_handler = rcu_dereference(skb->dev->rx_handler);
3522         if (rx_handler) {
3523                 if (pt_prev) {
3524                         ret = deliver_skb(skb, pt_prev, orig_dev);
3525                         pt_prev = NULL;
3526                 }
3527                 switch (rx_handler(&skb)) {
3528                 case RX_HANDLER_CONSUMED:
3529                         ret = NET_RX_SUCCESS;
3530                         goto unlock;
3531                 case RX_HANDLER_ANOTHER:
3532                         goto another_round;
3533                 case RX_HANDLER_EXACT:
3534                         deliver_exact = true;
3535                 case RX_HANDLER_PASS:
3536                         break;
3537                 default:
3538                         BUG();
3539                 }
3540         }
3541
3542         if (unlikely(vlan_tx_tag_present(skb))) {
3543                 if (vlan_tx_tag_get_id(skb))
3544                         skb->pkt_type = PACKET_OTHERHOST;
3545                 /* Note: we might in the future use prio bits
3546                  * and set skb->priority like in vlan_do_receive()
3547                  * For the time being, just ignore Priority Code Point
3548                  */
3549                 skb->vlan_tci = 0;
3550         }
3551
3552         /* deliver only exact match when indicated */
3553         null_or_dev = deliver_exact ? skb->dev : NULL;
3554
3555         type = skb->protocol;
3556         list_for_each_entry_rcu(ptype,
3557                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
3558                 if (ptype->type == type &&
3559                     (ptype->dev == null_or_dev || ptype->dev == skb->dev ||
3560                      ptype->dev == orig_dev)) {
3561                         if (pt_prev)
3562                                 ret = deliver_skb(skb, pt_prev, orig_dev);
3563                         pt_prev = ptype;
3564                 }
3565         }
3566
3567         if (pt_prev) {
3568                 if (unlikely(skb_orphan_frags(skb, GFP_ATOMIC)))
3569                         goto drop;
3570                 else
3571                         ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
3572         } else {
3573 drop:
3574                 atomic_long_inc(&skb->dev->rx_dropped);
3575                 kfree_skb(skb);
3576                 /* Jamal, now you will not able to escape explaining
3577                  * me how you were going to use this. :-)
3578                  */
3579                 ret = NET_RX_DROP;
3580         }
3581
3582 unlock:
3583         rcu_read_unlock();
3584 out:
3585         return ret;
3586 }
3587
3588 static int __netif_receive_skb(struct sk_buff *skb)
3589 {
3590         int ret;
3591
3592         if (sk_memalloc_socks() && skb_pfmemalloc(skb)) {
3593                 unsigned long pflags = current->flags;
3594
3595                 /*
3596                  * PFMEMALLOC skbs are special, they should
3597                  * - be delivered to SOCK_MEMALLOC sockets only
3598                  * - stay away from userspace
3599                  * - have bounded memory usage
3600                  *
3601                  * Use PF_MEMALLOC as this saves us from propagating the allocation
3602                  * context down to all allocation sites.
3603                  */
3604                 current->flags |= PF_MEMALLOC;
3605                 ret = __netif_receive_skb_core(skb, true);
3606                 tsk_restore_flags(current, pflags, PF_MEMALLOC);
3607         } else
3608                 ret = __netif_receive_skb_core(skb, false);
3609
3610         return ret;
3611 }
3612
3613 /**
3614  *      netif_receive_skb - process receive buffer from network
3615  *      @skb: buffer to process
3616  *
3617  *      netif_receive_skb() is the main receive data processing function.
3618  *      It always succeeds. The buffer may be dropped during processing
3619  *      for congestion control or by the protocol layers.
3620  *
3621  *      This function may only be called from softirq context and interrupts
3622  *      should be enabled.
3623  *
3624  *      Return values (usually ignored):
3625  *      NET_RX_SUCCESS: no congestion
3626  *      NET_RX_DROP: packet was dropped
3627  */
3628 int netif_receive_skb(struct sk_buff *skb)
3629 {
3630         net_timestamp_check(netdev_tstamp_prequeue, skb);
3631
3632         if (skb_defer_rx_timestamp(skb))
3633                 return NET_RX_SUCCESS;
3634
3635 #ifdef CONFIG_RPS
3636         if (static_key_false(&rps_needed)) {
3637                 struct rps_dev_flow voidflow, *rflow = &voidflow;
3638                 int cpu, ret;
3639
3640                 rcu_read_lock();
3641
3642                 cpu = get_rps_cpu(skb->dev, skb, &rflow);
3643
3644                 if (cpu >= 0) {
3645                         ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
3646                         rcu_read_unlock();
3647                         return ret;
3648                 }
3649                 rcu_read_unlock();
3650         }
3651 #endif
3652         return __netif_receive_skb(skb);
3653 }
3654 EXPORT_SYMBOL(netif_receive_skb);
3655
3656 /* Network device is going away, flush any packets still pending
3657  * Called with irqs disabled.
3658  */
3659 static void flush_backlog(void *arg)
3660 {
3661         struct net_device *dev = arg;
3662         struct softnet_data *sd = &__get_cpu_var(softnet_data);
3663         struct sk_buff *skb, *tmp;
3664
3665         rps_lock(sd);
3666         skb_queue_walk_safe(&sd->input_pkt_queue, skb, tmp) {
3667                 if (skb->dev == dev) {
3668                         __skb_unlink(skb, &sd->input_pkt_queue);
3669                         kfree_skb(skb);
3670                         input_queue_head_incr(sd);
3671                 }
3672         }
3673         rps_unlock(sd);
3674
3675         skb_queue_walk_safe(&sd->process_queue, skb, tmp) {
3676                 if (skb->dev == dev) {
3677                         __skb_unlink(skb, &sd->process_queue);
3678                         kfree_skb(skb);
3679                         input_queue_head_incr(sd);
3680                 }
3681         }
3682 }
3683
3684 static int napi_gro_complete(struct sk_buff *skb)
3685 {
3686         struct packet_offload *ptype;
3687         __be16 type = skb->protocol;
3688         struct list_head *head = &offload_base;
3689         int err = -ENOENT;
3690
3691         BUILD_BUG_ON(sizeof(struct napi_gro_cb) > sizeof(skb->cb));
3692
3693         if (NAPI_GRO_CB(skb)->count == 1) {
3694                 skb_shinfo(skb)->gso_size = 0;
3695                 goto out;
3696         }
3697
3698         rcu_read_lock();
3699         list_for_each_entry_rcu(ptype, head, list) {
3700                 if (ptype->type != type || !ptype->callbacks.gro_complete)
3701                         continue;
3702
3703                 err = ptype->callbacks.gro_complete(skb);
3704                 break;
3705         }
3706         rcu_read_unlock();
3707
3708         if (err) {
3709                 WARN_ON(&ptype->list == head);
3710                 kfree_skb(skb);
3711                 return NET_RX_SUCCESS;
3712         }
3713
3714 out:
3715         return netif_receive_skb(skb);
3716 }
3717
3718 /* napi->gro_list contains packets ordered by age.
3719  * youngest packets at the head of it.
3720  * Complete skbs in reverse order to reduce latencies.
3721  */
3722 void napi_gro_flush(struct napi_struct *napi, bool flush_old)
3723 {
3724         struct sk_buff *skb, *prev = NULL;
3725
3726         /* scan list and build reverse chain */
3727         for (skb = napi->gro_list; skb != NULL; skb = skb->next) {
3728                 skb->prev = prev;
3729                 prev = skb;
3730         }
3731
3732         for (skb = prev; skb; skb = prev) {
3733                 skb->next = NULL;
3734
3735                 if (flush_old && NAPI_GRO_CB(skb)->age == jiffies)
3736                         return;
3737
3738                 prev = skb->prev;
3739                 napi_gro_complete(skb);
3740                 napi->gro_count--;
3741         }
3742
3743         napi->gro_list = NULL;
3744 }
3745 EXPORT_SYMBOL(napi_gro_flush);
3746
3747 static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
3748 {
3749         struct sk_buff *p;
3750         unsigned int maclen = skb->dev->hard_header_len;
3751
3752         for (p = napi->gro_list; p; p = p->next) {
3753                 unsigned long diffs;
3754
3755                 diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
3756                 diffs |= p->vlan_tci ^ skb->vlan_tci;
3757                 if (maclen == ETH_HLEN)
3758                         diffs |= compare_ether_header(skb_mac_header(p),
3759                                                       skb_gro_mac_header(skb));
3760                 else if (!diffs)
3761                         diffs = memcmp(skb_mac_header(p),
3762                                        skb_gro_mac_header(skb),
3763                                        maclen);
3764                 NAPI_GRO_CB(p)->same_flow = !diffs;
3765                 NAPI_GRO_CB(p)->flush = 0;
3766         }
3767 }
3768
3769 static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3770 {
3771         struct sk_buff **pp = NULL;
3772         struct packet_offload *ptype;
3773         __be16 type = skb->protocol;
3774         struct list_head *head = &offload_base;
3775         int same_flow;
3776         enum gro_result ret;
3777
3778         if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
3779                 goto normal;
3780
3781         if (skb_is_gso(skb) || skb_has_frag_list(skb))
3782                 goto normal;
3783
3784         gro_list_prepare(napi, skb);
3785
3786         rcu_read_lock();
3787         list_for_each_entry_rcu(ptype, head, list) {
3788                 if (ptype->type != type || !ptype->callbacks.gro_receive)
3789                         continue;
3790
3791                 skb_set_network_header(skb, skb_gro_offset(skb));
3792                 skb_reset_mac_len(skb);
3793                 NAPI_GRO_CB(skb)->same_flow = 0;
3794                 NAPI_GRO_CB(skb)->flush = 0;
3795                 NAPI_GRO_CB(skb)->free = 0;
3796
3797                 pp = ptype->callbacks.gro_receive(&napi->gro_list, skb);
3798                 break;
3799         }
3800         rcu_read_unlock();
3801
3802         if (&ptype->list == head)
3803                 goto normal;
3804
3805         same_flow = NAPI_GRO_CB(skb)->same_flow;
3806         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
3807
3808         if (pp) {
3809                 struct sk_buff *nskb = *pp;
3810
3811                 *pp = nskb->next;
3812                 nskb->next = NULL;
3813                 napi_gro_complete(nskb);
3814                 napi->gro_count--;
3815         }
3816
3817         if (same_flow)
3818                 goto ok;
3819
3820         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
3821                 goto normal;
3822
3823         napi->gro_count++;
3824         NAPI_GRO_CB(skb)->count = 1;
3825         NAPI_GRO_CB(skb)->age = jiffies;
3826         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
3827         skb->next = napi->gro_list;
3828         napi->gro_list = skb;
3829         ret = GRO_HELD;
3830
3831 pull:
3832         if (skb_headlen(skb) < skb_gro_offset(skb)) {
3833                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
3834
3835                 BUG_ON(skb->end - skb->tail < grow);
3836
3837                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
3838
3839                 skb->tail += grow;
3840                 skb->data_len -= grow;
3841
3842                 skb_shinfo(skb)->frags[0].page_offset += grow;
3843                 skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
3844
3845                 if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
3846                         skb_frag_unref(skb, 0);
3847                         memmove(skb_shinfo(skb)->frags,
3848                                 skb_shinfo(skb)->frags + 1,
3849                                 --skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
3850                 }
3851         }
3852
3853 ok:
3854         return ret;
3855
3856 normal:
3857         ret = GRO_NORMAL;
3858         goto pull;
3859 }
3860
3861
3862 static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
3863 {
3864         switch (ret) {
3865         case GRO_NORMAL:
3866                 if (netif_receive_skb(skb))
3867                         ret = GRO_DROP;
3868                 break;
3869
3870         case GRO_DROP:
3871                 kfree_skb(skb);
3872                 break;
3873
3874         case GRO_MERGED_FREE:
3875                 if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
3876                         kmem_cache_free(skbuff_head_cache, skb);
3877                 else
3878                         __kfree_skb(skb);
3879                 break;
3880
3881         case GRO_HELD:
3882         case GRO_MERGED:
3883                 break;
3884         }
3885
3886         return ret;
3887 }
3888
3889 static void skb_gro_reset_offset(struct sk_buff *skb)
3890 {
3891         const struct skb_shared_info *pinfo = skb_shinfo(skb);
3892         const skb_frag_t *frag0 = &pinfo->frags[0];
3893
3894         NAPI_GRO_CB(skb)->data_offset = 0;
3895         NAPI_GRO_CB(skb)->frag0 = NULL;
3896         NAPI_GRO_CB(skb)->frag0_len = 0;
3897
3898         if (skb->mac_header == skb->tail &&
3899             pinfo->nr_frags &&
3900             !PageHighMem(skb_frag_page(frag0))) {
3901                 NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0);
3902                 NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(frag0);
3903         }
3904 }
3905
3906 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3907 {
3908         skb_gro_reset_offset(skb);
3909
3910         return napi_skb_finish(dev_gro_receive(napi, skb), skb);
3911 }
3912 EXPORT_SYMBOL(napi_gro_receive);
3913
3914 static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3915 {
3916         __skb_pull(skb, skb_headlen(skb));
3917         /* restore the reserve we had after netdev_alloc_skb_ip_align() */
3918         skb_reserve(skb, NET_SKB_PAD + NET_IP_ALIGN - skb_headroom(skb));
3919         skb->vlan_tci = 0;
3920         skb->dev = napi->dev;
3921         skb->skb_iif = 0;
3922         skb->truesize = SKB_TRUESIZE(skb_end_offset(skb));
3923
3924         napi->skb = skb;
3925 }
3926
3927 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3928 {
3929         struct sk_buff *skb = napi->skb;
3930
3931         if (!skb) {
3932                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3933                 if (skb)
3934                         napi->skb = skb;
3935         }
3936         return skb;
3937 }
3938 EXPORT_SYMBOL(napi_get_frags);
3939
3940 static gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3941                                gro_result_t ret)
3942 {
3943         switch (ret) {
3944         case GRO_NORMAL:
3945         case GRO_HELD:
3946                 skb->protocol = eth_type_trans(skb, skb->dev);
3947
3948                 if (ret == GRO_HELD)
3949                         skb_gro_pull(skb, -ETH_HLEN);
3950                 else if (netif_receive_skb(skb))
3951                         ret = GRO_DROP;
3952                 break;
3953
3954         case GRO_DROP:
3955         case GRO_MERGED_FREE:
3956                 napi_reuse_skb(napi, skb);
3957                 break;
3958
3959         case GRO_MERGED:
3960                 break;
3961         }
3962
3963         return ret;
3964 }
3965
3966 static struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3967 {
3968         struct sk_buff *skb = napi->skb;
3969         struct ethhdr *eth;
3970         unsigned int hlen;
3971         unsigned int off;
3972
3973         napi->skb = NULL;
3974
3975         skb_reset_mac_header(skb);
3976         skb_gro_reset_offset(skb);
3977
3978         off = skb_gro_offset(skb);
3979         hlen = off + sizeof(*eth);
3980         eth = skb_gro_header_fast(skb, off);
3981         if (skb_gro_header_hard(skb, hlen)) {
3982                 eth = skb_gro_header_slow(skb, hlen, off);
3983                 if (unlikely(!eth)) {
3984                         napi_reuse_skb(napi, skb);
3985                         skb = NULL;
3986                         goto out;
3987                 }
3988         }
3989
3990         skb_gro_pull(skb, sizeof(*eth));
3991
3992         /*
3993          * This works because the only protocols we care about don't require
3994          * special handling.  We'll fix it up properly at the end.
3995          */
3996         skb->protocol = eth->h_proto;
3997
3998 out:
3999         return skb;
4000 }
4001
4002 gro_result_t napi_gro_frags(struct napi_struct *napi)
4003 {
4004         struct sk_buff *skb = napi_frags_skb(napi);
4005
4006         if (!skb)
4007                 return GRO_DROP;
4008
4009         return napi_frags_finish(napi, skb, dev_gro_receive(napi, skb));
4010 }
4011 EXPORT_SYMBOL(napi_gro_frags);
4012
/*
 * net_rps_action sends any pending IPI's for rps.
 * Note: called with local irq disabled, but exits with local irq enabled.
 *
 * Without CONFIG_RPS this only re-enables local interrupts.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)
{
#ifdef CONFIG_RPS
        struct softnet_data *remote = sd->rps_ipi_list;
        struct softnet_data *next;

        if (!remote) {
                local_irq_enable();
                return;
        }

        /* detach the whole list while interrupts are still off */
        sd->rps_ipi_list = NULL;

        local_irq_enable();

        /* Send pending IPI's to kick RPS processing on remote cpus. */
        for (; remote; remote = next) {
                next = remote->rps_ipi_next;

                if (cpu_online(remote->cpu))
                        __smp_call_function_single(remote->cpu,
                                                   &remote->csd, 0);
        }
#else
        local_irq_enable();
#endif
}
4040
4041 static int process_backlog(struct napi_struct *napi, int quota)
4042 {
4043         int work = 0;
4044         struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);
4045
4046 #ifdef CONFIG_RPS
4047         /* Check if we have pending ipi, its better to send them now,
4048          * not waiting net_rx_action() end.
4049          */
4050         if (sd->rps_ipi_list) {
4051                 local_irq_disable();
4052                 net_rps_action_and_irq_enable(sd);
4053         }
4054 #endif
4055         napi->weight = weight_p;
4056         local_irq_disable();
4057         while (work < quota) {
4058                 struct sk_buff *skb;
4059                 unsigned int qlen;
4060
4061                 while ((skb = __skb_dequeue(&sd->process_queue))) {
4062                         local_irq_enable();
4063                         __netif_receive_skb(skb);
4064                         local_irq_disable();
4065                         input_queue_head_incr(sd);
4066                         if (++work >= quota) {
4067                                 local_irq_enable();
4068                                 return work;
4069                         }
4070                 }
4071
4072                 rps_lock(sd);
4073                 qlen = skb_queue_len(&sd->input_pkt_queue);
4074                 if (qlen)
4075                         skb_queue_splice_tail_init(&sd->input_pkt_queue,
4076                                                    &sd->process_queue);
4077
4078                 if (qlen < quota - work) {
4079                         /*
4080                          * Inline a custom version of __napi_complete().
4081                          * only current cpu owns and manipulates this napi,
4082                          * and NAPI_STATE_SCHED is the only possible flag set on backlog.
4083                          * we can use a plain write instead of clear_bit(),
4084                          * and we dont need an smp_mb() memory barrier.
4085                          */
4086                         list_del(&napi->poll_list);
4087                         napi->state = 0;
4088
4089                         quota = work + qlen;
4090                 }
4091                 rps_unlock(sd);
4092         }
4093         local_irq_enable();
4094
4095         return work;
4096 }
4097
4098 /**
4099  * __napi_schedule - schedule for receive
4100  * @n: entry to schedule
4101  *
4102  * The entry's receive function will be scheduled to run
4103  */
4104 void __napi_schedule(struct napi_struct *n)
4105 {
4106         unsigned long flags;
4107
4108         local_irq_save(flags);
4109         ____napi_schedule(&__get_cpu_var(softnet_data), n);
4110         local_irq_restore(flags);
4111 }
4112 EXPORT_SYMBOL(__napi_schedule);
4113
4114 void __napi_complete(struct napi_struct *n)
4115 {
4116         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
4117         BUG_ON(n->gro_list);
4118
4119         list_del(&n->poll_list);
4120         smp_mb__before_clear_bit();
4121         clear_bit(NAPI_STATE_SCHED, &n->state);
4122 }
4123 EXPORT_SYMBOL(__napi_complete);
4124
4125 void napi_complete(struct napi_struct *n)
4126 {
4127         unsigned long flags;
4128
4129         /*
4130          * don't let napi dequeue from the cpu poll list
4131          * just in case its running on a different cpu
4132          */
4133         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
4134                 return;
4135
4136         napi_gro_flush(n, false);
4137         local_irq_save(flags);
4138         __napi_complete(n);
4139         local_irq_restore(flags);
4140 }
4141 EXPORT_SYMBOL(napi_complete);
4142
4143 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
4144                     int (*poll)(struct napi_struct *, int), int weight)
4145 {
4146         INIT_LIST_HEAD(&napi->poll_list);
4147         napi->gro_count = 0;
4148         napi->gro_list = NULL;
4149         napi->skb = NULL;
4150         napi->poll = poll;
4151         if (weight > NAPI_POLL_WEIGHT)
4152                 pr_err_once("netif_napi_add() called with weight %d on device %s\n",
4153                             weight, dev->name);
4154         napi->weight = weight;
4155         list_add(&napi->dev_list, &dev->napi_list);
4156         napi->dev = dev;
4157 #ifdef CONFIG_NETPOLL
4158         spin_lock_init(&napi->poll_lock);
4159         napi->poll_owner = -1;
4160 #endif
4161         set_bit(NAPI_STATE_SCHED, &napi->state);
4162 }
4163 EXPORT_SYMBOL(netif_napi_add);
4164
4165 void netif_napi_del(struct napi_struct *napi)
4166 {
4167         struct sk_buff *skb, *next;
4168
4169         list_del_init(&napi->dev_list);
4170         napi_free_frags(napi);
4171
4172         for (skb = napi->gro_list; skb; skb = next) {
4173                 next = skb->next;
4174                 skb->next = NULL;
4175                 kfree_skb(skb);
4176         }
4177
4178         napi->gro_list = NULL;
4179         napi->gro_count = 0;
4180 }
4181 EXPORT_SYMBOL(netif_napi_del);
4182
4183 static void net_rx_action(struct softirq_action *h)
4184 {
4185         struct softnet_data *sd = &__get_cpu_var(softnet_data);
4186         unsigned long time_limit = jiffies + 2;
4187         int budget = netdev_budget;
4188         void *have;
4189
4190         local_irq_disable();
4191
4192         while (!list_empty(&sd->poll_list)) {
4193                 struct napi_struct *n;
4194                 int work, weight;
4195
4196                 /* If softirq window is exhuasted then punt.
4197                  * Allow this to run for 2 jiffies since which will allow
4198                  * an average latency of 1.5/HZ.
4199                  */
4200                 if (unlikely(budget <= 0 || time_after_eq(jiffies, time_limit)))
4201                         goto softnet_break;
4202
4203                 local_irq_enable();
4204
4205                 /* Even though interrupts have been re-enabled, this
4206                  * access is safe because interrupts can only add new
4207                  * entries to the tail of this list, and only ->poll()
4208                  * calls can remove this head entry from the list.
4209                  */
4210                 n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
4211
4212                 have = netpoll_poll_lock(n);
4213
4214                 weight = n->weight;
4215
4216                 /* This NAPI_STATE_SCHED test is for avoiding a race
4217                  * with netpoll's poll_napi().  Only the entity which
4218                  * obtains the lock and sees NAPI_STATE_SCHED set will
4219                  * actually make the ->poll() call.  Therefore we avoid
4220                  * accidentally calling ->poll() when NAPI is not scheduled.
4221                  */
4222                 work = 0;
4223                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4224                         work = n->poll(n, weight);
4225                         trace_napi_poll(n);
4226                 }
4227
4228                 WARN_ON_ONCE(work > weight);
4229
4230                 budget -= work;
4231
4232                 local_irq_disable();
4233
4234                 /* Drivers must not modify the NAPI state if they
4235                  * consume the entire weight.  In such cases this code
4236                  * still "owns" the NAPI instance and therefore can
4237                  * move the instance around on the list at-will.
4238                  */
4239                 if (unlikely(work == weight)) {
4240                         if (unlikely(napi_disable_pending(n))) {
4241                                 local_irq_enable();
4242                                 napi_complete(n);
4243                                 local_irq_disable();
4244                         } else {
4245                                 if (n->gro_list) {
4246                                         /* flush too old packets
4247                                          * If HZ < 1000, flush all packets.
4248                                          */
4249                                         local_irq_enable();
4250                                         napi_gro_flush(n, HZ >= 1000);
4251                                         local_irq_disable();
4252                                 }
4253                                 list_move_tail(&n->poll_list, &sd->poll_list);
4254                         }
4255                 }
4256
4257                 netpoll_poll_unlock(have);
4258         }
4259 out:
4260         net_rps_action_and_irq_enable(sd);
4261
4262 #ifdef CONFIG_NET_DMA
4263         /*
4264          * There may not be any more sk_buffs coming right now, so push
4265          * any pending DMA copies to hardware
4266          */
4267         dma_issue_pending_all();
4268 #endif
4269
4270         return;
4271
4272 softnet_break:
4273         sd->time_squeeze++;
4274         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4275         goto out;
4276 }
4277
/*
 * One link from a lower device to a device stacked directly above it.
 * Links are kept on the lower device's upper_dev_list.
 */
struct netdev_upper {
	/* the upper device this link points at */
	struct net_device *dev;
	/* true when this link is the master upper link */
	bool master;
	/* node in the lower device's upper_dev_list */
	struct list_head list;
	/* handed to kfree_rcu() when the link is removed */
	struct rcu_head rcu;
	/* scratch node used while __netdev_search_upper_dev() walks the stack */
	struct list_head search_list;
};
4285
4286 static void __append_search_uppers(struct list_head *search_list,
4287                                    struct net_device *dev)
4288 {
4289         struct netdev_upper *upper;
4290
4291         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4292                 /* check if this upper is not already in search list */
4293                 if (list_empty(&upper->search_list))
4294                         list_add_tail(&upper->search_list, search_list);
4295         }
4296 }
4297
4298 static bool __netdev_search_upper_dev(struct net_device *dev,
4299                                       struct net_device *upper_dev)
4300 {
4301         LIST_HEAD(search_list);
4302         struct netdev_upper *upper;
4303         struct netdev_upper *tmp;
4304         bool ret = false;
4305
4306         __append_search_uppers(&search_list, dev);
4307         list_for_each_entry(upper, &search_list, search_list) {
4308                 if (upper->dev == upper_dev) {
4309                         ret = true;
4310                         break;
4311                 }
4312                 __append_search_uppers(&search_list, upper->dev);
4313         }
4314         list_for_each_entry_safe(upper, tmp, &search_list, search_list)
4315                 INIT_LIST_HEAD(&upper->search_list);
4316         return ret;
4317 }
4318
4319 static struct netdev_upper *__netdev_find_upper(struct net_device *dev,
4320                                                 struct net_device *upper_dev)
4321 {
4322         struct netdev_upper *upper;
4323
4324         list_for_each_entry(upper, &dev->upper_dev_list, list) {
4325                 if (upper->dev == upper_dev)
4326                         return upper;
4327         }
4328         return NULL;
4329 }
4330
4331 /**
4332  * netdev_has_upper_dev - Check if device is linked to an upper device
4333  * @dev: device
4334  * @upper_dev: upper device to check
4335  *
4336  * Find out if a device is linked to specified upper device and return true
4337  * in case it is. Note that this checks only immediate upper device,
4338  * not through a complete stack of devices. The caller must hold the RTNL lock.
4339  */
4340 bool netdev_has_upper_dev(struct net_device *dev,
4341                           struct net_device *upper_dev)
4342 {
4343         ASSERT_RTNL();
4344
4345         return __netdev_find_upper(dev, upper_dev);
4346 }
4347 EXPORT_SYMBOL(netdev_has_upper_dev);
4348
4349 /**
4350  * netdev_has_any_upper_dev - Check if device is linked to some device
4351  * @dev: device
4352  *
4353  * Find out if a device is linked to an upper device and return true in case
4354  * it is. The caller must hold the RTNL lock.
4355  */
4356 bool netdev_has_any_upper_dev(struct net_device *dev)
4357 {
4358         ASSERT_RTNL();
4359
4360         return !list_empty(&dev->upper_dev_list);
4361 }
4362 EXPORT_SYMBOL(netdev_has_any_upper_dev);
4363
4364 /**
4365  * netdev_master_upper_dev_get - Get master upper device
4366  * @dev: device
4367  *
4368  * Find a master upper device and return pointer to it or NULL in case
4369  * it's not there. The caller must hold the RTNL lock.
4370  */
4371 struct net_device *netdev_master_upper_dev_get(struct net_device *dev)
4372 {
4373         struct netdev_upper *upper;
4374
4375         ASSERT_RTNL();
4376
4377         if (list_empty(&dev->upper_dev_list))
4378                 return NULL;
4379
4380         upper = list_first_entry(&dev->upper_dev_list,
4381                                  struct netdev_upper, list);
4382         if (likely(upper->master))
4383                 return upper->dev;
4384         return NULL;
4385 }
4386 EXPORT_SYMBOL(netdev_master_upper_dev_get);
4387
4388 /**
4389  * netdev_master_upper_dev_get_rcu - Get master upper device
4390  * @dev: device
4391  *
4392  * Find a master upper device and return pointer to it or NULL in case
4393  * it's not there. The caller must hold the RCU read lock.
4394  */
4395 struct net_device *netdev_master_upper_dev_get_rcu(struct net_device *dev)
4396 {
4397         struct netdev_upper *upper;
4398
4399         upper = list_first_or_null_rcu(&dev->upper_dev_list,
4400                                        struct netdev_upper, list);
4401         if (upper && likely(upper->master))
4402                 return upper->dev;
4403         return NULL;
4404 }
4405 EXPORT_SYMBOL(netdev_master_upper_dev_get_rcu);
4406
4407 static int __netdev_upper_dev_link(struct net_device *dev,
4408                                    struct net_device *upper_dev, bool master)
4409 {
4410         struct netdev_upper *upper;
4411
4412         ASSERT_RTNL();
4413
4414         if (dev == upper_dev)
4415                 return -EBUSY;
4416
4417         /* To prevent loops, check if dev is not upper device to upper_dev. */
4418         if (__netdev_search_upper_dev(upper_dev, dev))
4419                 return -EBUSY;
4420
4421         if (__netdev_find_upper(dev, upper_dev))
4422                 return -EEXIST;
4423
4424         if (master && netdev_master_upper_dev_get(dev))
4425                 return -EBUSY;
4426
4427         upper = kmalloc(sizeof(*upper), GFP_KERNEL);
4428         if (!upper)
4429                 return -ENOMEM;
4430
4431         upper->dev = upper_dev;
4432         upper->master = master;
4433         INIT_LIST_HEAD(&upper->search_list);
4434
4435         /* Ensure that master upper link is always the first item in list. */
4436         if (master)
4437                 list_add_rcu(&upper->list, &dev->upper_dev_list);
4438         else
4439                 list_add_tail_rcu(&upper->list, &dev->upper_dev_list);
4440         dev_hold(upper_dev);
4441
4442         return 0;
4443 }
4444
4445 /**
4446  * netdev_upper_dev_link - Add a link to the upper device
4447  * @dev: device
4448  * @upper_dev: new upper device
4449  *
4450  * Adds a link to device which is upper to this one. The caller must hold
4451  * the RTNL lock. On a failure a negative errno code is returned.
4452  * On success the reference counts are adjusted and the function
4453  * returns zero.
4454  */
4455 int netdev_upper_dev_link(struct net_device *dev,
4456                           struct net_device *upper_dev)
4457 {
4458         return __netdev_upper_dev_link(dev, upper_dev, false);
4459 }
4460 EXPORT_SYMBOL(netdev_upper_dev_link);
4461
4462 /**
4463  * netdev_master_upper_dev_link - Add a master link to the upper device
4464  * @dev: device
4465  * @upper_dev: new upper device
4466  *
4467  * Adds a link to device which is upper to this one. In this case, only
4468  * one master upper device can be linked, although other non-master devices
4469  * might be linked as well. The caller must hold the RTNL lock.
4470  * On a failure a negative errno code is returned. On success the reference
4471  * counts are adjusted and the function returns zero.
4472  */
4473 int netdev_master_upper_dev_link(struct net_device *dev,
4474                                  struct net_device *upper_dev)
4475 {
4476         return __netdev_upper_dev_link(dev, upper_dev, true);
4477 }
4478 EXPORT_SYMBOL(netdev_master_upper_dev_link);
4479
4480 /**
4481  * netdev_upper_dev_unlink - Removes a link to upper device
4482  * @dev: device
4483  * @upper_dev: new upper device
4484  *
4485  * Removes a link to device which is upper to this one. The caller must hold
4486  * the RTNL lock.
4487  */
4488 void netdev_upper_dev_unlink(struct net_device *dev,
4489                              struct net_device *upper_dev)
4490 {
4491         struct netdev_upper *upper;
4492
4493         ASSERT_RTNL();
4494
4495         upper = __netdev_find_upper(dev, upper_dev);
4496         if (!upper)
4497                 return;
4498         list_del_rcu(&upper->list);
4499         dev_put(upper_dev);
4500         kfree_rcu(upper, rcu);
4501 }
4502 EXPORT_SYMBOL(netdev_upper_dev_unlink);
4503
4504 static void dev_change_rx_flags(struct net_device *dev, int flags)
4505 {
4506         const struct net_device_ops *ops = dev->netdev_ops;
4507
4508         if (ops->ndo_change_rx_flags)
4509                 ops->ndo_change_rx_flags(dev, flags);
4510 }
4511
4512 static int __dev_set_promiscuity(struct net_device *dev, int inc)
4513 {
4514         unsigned int old_flags = dev->flags;
4515         kuid_t uid;
4516         kgid_t gid;
4517
4518         ASSERT_RTNL();
4519
4520         dev->flags |= IFF_PROMISC;
4521         dev->promiscuity += inc;
4522         if (dev->promiscuity == 0) {
4523                 /*
4524                  * Avoid overflow.
4525                  * If inc causes overflow, untouch promisc and return error.
4526                  */
4527                 if (inc < 0)
4528                         dev->flags &= ~IFF_PROMISC;
4529                 else {
4530                         dev->promiscuity -= inc;
4531                         pr_warn("%s: promiscuity touches roof, set promiscuity failed. promiscuity feature of device might be broken.\n",
4532                                 dev->name);
4533                         return -EOVERFLOW;
4534                 }
4535         }
4536         if (dev->flags != old_flags) {
4537                 pr_info("device %s %s promiscuous mode\n",
4538                         dev->name,
4539                         dev->flags & IFF_PROMISC ? "entered" : "left");
4540                 if (audit_enabled) {
4541                         current_uid_gid(&uid, &gid);
4542                         audit_log(current->audit_context, GFP_ATOMIC,
4543                                 AUDIT_ANOM_PROMISCUOUS,
4544                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
4545                                 dev->name, (dev->flags & IFF_PROMISC),
4546                                 (old_flags & IFF_PROMISC),
4547                                 from_kuid(&init_user_ns, audit_get_loginuid(current)),
4548                                 from_kuid(&init_user_ns, uid),
4549                                 from_kgid(&init_user_ns, gid),
4550                                 audit_get_sessionid(current));
4551                 }
4552
4553                 dev_change_rx_flags(dev, IFF_PROMISC);
4554         }
4555         return 0;
4556 }
4557
4558 /**
4559  *      dev_set_promiscuity     - update promiscuity count on a device
4560  *      @dev: device
4561  *      @inc: modifier
4562  *
4563  *      Add or remove promiscuity from a device. While the count in the device
4564  *      remains above zero the interface remains promiscuous. Once it hits zero
4565  *      the device reverts back to normal filtering operation. A negative inc
4566  *      value is used to drop promiscuity on the device.
4567  *      Return 0 if successful or a negative errno code on error.
4568  */
4569 int dev_set_promiscuity(struct net_device *dev, int inc)
4570 {
4571         unsigned int old_flags = dev->flags;
4572         int err;
4573
4574         err = __dev_set_promiscuity(dev, inc);
4575         if (err < 0)
4576                 return err;
4577         if (dev->flags != old_flags)
4578                 dev_set_rx_mode(dev);
4579         return err;
4580 }
4581 EXPORT_SYMBOL(dev_set_promiscuity);
4582
4583 /**
4584  *      dev_set_allmulti        - update allmulti count on a device
4585  *      @dev: device
4586  *      @inc: modifier
4587  *
4588  *      Add or remove reception of all multicast frames to a device. While the
4589  *      count in the device remains above zero the interface remains listening
4590  *      to all interfaces. Once it hits zero the device reverts back to normal
4591  *      filtering operation. A negative @inc value is used to drop the counter
4592  *      when releasing a resource needing all multicasts.
4593  *      Return 0 if successful or a negative errno code on error.
4594  */
4595
4596 int dev_set_allmulti(struct net_device *dev, int inc)
4597 {
4598         unsigned int old_flags = dev->flags;
4599
4600         ASSERT_RTNL();
4601
4602         dev->flags |= IFF_ALLMULTI;
4603         dev->allmulti += inc;
4604         if (dev->allmulti == 0) {
4605                 /*
4606                  * Avoid overflow.
4607                  * If inc causes overflow, untouch allmulti and return error.
4608                  */
4609                 if (inc < 0)
4610                         dev->flags &= ~IFF_ALLMULTI;
4611                 else {
4612                         dev->allmulti -= inc;
4613                         pr_warn("%s: allmulti touches roof, set allmulti failed. allmulti feature of device might be broken.\n",
4614                                 dev->name);
4615                         return -EOVERFLOW;
4616                 }
4617         }
4618         if (dev->flags ^ old_flags) {
4619                 dev_change_rx_flags(dev, IFF_ALLMULTI);
4620                 dev_set_rx_mode(dev);
4621         }
4622         return 0;
4623 }
4624 EXPORT_SYMBOL(dev_set_allmulti);
4625
4626 /*
4627  *      Upload unicast and multicast address lists to device and
4628  *      configure RX filtering. When the device doesn't support unicast
4629  *      filtering it is put in promiscuous mode while unicast addresses
4630  *      are present.
4631  */
4632 void __dev_set_rx_mode(struct net_device *dev)
4633 {
4634         const struct net_device_ops *ops = dev->netdev_ops;
4635
4636         /* dev_open will call this function so the list will stay sane. */
4637         if (!(dev->flags&IFF_UP))
4638                 return;
4639
4640         if (!netif_device_present(dev))
4641                 return;
4642
4643         if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
4644                 /* Unicast addresses changes may only happen under the rtnl,
4645                  * therefore calling __dev_set_promiscuity here is safe.
4646                  */
4647                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
4648                         __dev_set_promiscuity(dev, 1);
4649                         dev->uc_promisc = true;
4650                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
4651                         __dev_set_promiscuity(dev, -1);
4652                         dev->uc_promisc = false;
4653                 }
4654         }
4655
4656         if (ops->ndo_set_rx_mode)
4657                 ops->ndo_set_rx_mode(dev);
4658 }
4659 EXPORT_SYMBOL(__dev_set_rx_mode);
4660
/*
 *	Locked wrapper around __dev_set_rx_mode(): the device address lock is
 *	taken (BH-disabling variant) for the duration of the update.
 */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
4667
4668 /**
4669  *      dev_get_flags - get flags reported to userspace
4670  *      @dev: device
4671  *
4672  *      Get the combination of flag bits exported through APIs to userspace.
4673  */
4674 unsigned int dev_get_flags(const struct net_device *dev)
4675 {
4676         unsigned int flags;
4677
4678         flags = (dev->flags & ~(IFF_PROMISC |
4679                                 IFF_ALLMULTI |
4680                                 IFF_RUNNING |
4681                                 IFF_LOWER_UP |
4682                                 IFF_DORMANT)) |
4683                 (dev->gflags & (IFF_PROMISC |
4684                                 IFF_ALLMULTI));
4685
4686         if (netif_running(dev)) {
4687                 if (netif_oper_up(dev))
4688                         flags |= IFF_RUNNING;
4689                 if (netif_carrier_ok(dev))
4690                         flags |= IFF_LOWER_UP;
4691                 if (netif_dormant(dev))
4692                         flags |= IFF_DORMANT;
4693         }
4694
4695         return flags;
4696 }
4697 EXPORT_SYMBOL(dev_get_flags);
4698
4699 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4700 {
4701         unsigned int old_flags = dev->flags;
4702         int ret;
4703
4704         ASSERT_RTNL();
4705
4706         /*
4707          *      Set the flags on our device.
4708          */
4709
4710         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4711                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4712                                IFF_AUTOMEDIA)) |
4713                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4714                                     IFF_ALLMULTI));
4715
4716         /*
4717          *      Load in the correct multicast list now the flags have changed.
4718          */
4719
4720         if ((old_flags ^ flags) & IFF_MULTICAST)
4721                 dev_change_rx_flags(dev, IFF_MULTICAST);
4722
4723         dev_set_rx_mode(dev);
4724
4725         /*
4726          *      Have we downed the interface. We handle IFF_UP ourselves
4727          *      according to user attempts to set it, rather than blindly
4728          *      setting it.
4729          */
4730
4731         ret = 0;
4732         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4733                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4734
4735                 if (!ret)
4736                         dev_set_rx_mode(dev);
4737         }
4738
4739         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4740                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4741
4742                 dev->gflags ^= IFF_PROMISC;
4743                 dev_set_promiscuity(dev, inc);
4744         }
4745
4746         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4747            is important. Some (broken) drivers set IFF_PROMISC, when
4748            IFF_ALLMULTI is requested not asking us and not reporting.
4749          */
4750         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4751                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4752
4753                 dev->gflags ^= IFF_ALLMULTI;
4754                 dev_set_allmulti(dev, inc);
4755         }
4756
4757         return ret;
4758 }
4759
4760 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4761 {
4762         unsigned int changes = dev->flags ^ old_flags;
4763
4764         if (changes & IFF_UP) {
4765                 if (dev->flags & IFF_UP)
4766                         call_netdevice_notifiers(NETDEV_UP, dev);
4767                 else
4768                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4769         }
4770
4771         if (dev->flags & IFF_UP &&
4772             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4773                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4774 }
4775
4776 /**
4777  *      dev_change_flags - change device settings
4778  *      @dev: device
4779  *      @flags: device state flags
4780  *
4781  *      Change settings on device based state flags. The flags are
4782  *      in the userspace exported format.
4783  */
4784 int dev_change_flags(struct net_device *dev, unsigned int flags)
4785 {
4786         int ret;
4787         unsigned int changes, old_flags = dev->flags;
4788
4789         ret = __dev_change_flags(dev, flags);
4790         if (ret < 0)
4791                 return ret;
4792
4793         changes = old_flags ^ dev->flags;
4794         if (changes)
4795                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4796
4797         __dev_notify_flags(dev, old_flags);
4798         return ret;
4799 }
4800 EXPORT_SYMBOL(dev_change_flags);
4801
4802 /**
4803  *      dev_set_mtu - Change maximum transfer unit
4804  *      @dev: device
4805  *      @new_mtu: new transfer unit
4806  *
4807  *      Change the maximum transfer size of the network device.
4808  */
4809 int dev_set_mtu(struct net_device *dev, int new_mtu)
4810 {
4811         const struct net_device_ops *ops = dev->netdev_ops;
4812         int err;
4813
4814         if (new_mtu == dev->mtu)
4815                 return 0;
4816
4817         /*      MTU must be positive.    */
4818         if (new_mtu < 0)
4819                 return -EINVAL;
4820
4821         if (!netif_device_present(dev))
4822                 return -ENODEV;
4823
4824         err = 0;
4825         if (ops->ndo_change_mtu)
4826                 err = ops->ndo_change_mtu(dev, new_mtu);
4827         else
4828                 dev->mtu = new_mtu;
4829
4830         if (!err)
4831                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4832         return err;
4833 }
4834 EXPORT_SYMBOL(dev_set_mtu);
4835
4836 /**
4837  *      dev_set_group - Change group this device belongs to
4838  *      @dev: device
4839  *      @new_group: group this device should belong to
4840  */
4841 void dev_set_group(struct net_device *dev, int new_group)
4842 {
4843         dev->group = new_group;
4844 }
4845 EXPORT_SYMBOL(dev_set_group);
4846
4847 /**
4848  *      dev_set_mac_address - Change Media Access Control Address
4849  *      @dev: device
4850  *      @sa: new address
4851  *
4852  *      Change the hardware (MAC) address of the device
4853  */
4854 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4855 {
4856         const struct net_device_ops *ops = dev->netdev_ops;
4857         int err;
4858
4859         if (!ops->ndo_set_mac_address)
4860                 return -EOPNOTSUPP;
4861         if (sa->sa_family != dev->type)
4862                 return -EINVAL;
4863         if (!netif_device_present(dev))
4864                 return -ENODEV;
4865         err = ops->ndo_set_mac_address(dev, sa);
4866         if (err)
4867                 return err;
4868         dev->addr_assign_type = NET_ADDR_SET;
4869         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4870         add_device_randomness(dev->dev_addr, dev->addr_len);
4871         return 0;
4872 }
4873 EXPORT_SYMBOL(dev_set_mac_address);
4874
4875 /**
4876  *      dev_change_carrier - Change device carrier
4877  *      @dev: device
4878  *      @new_carrier: new value
4879  *
4880  *      Change device carrier
4881  */
4882 int dev_change_carrier(struct net_device *dev, bool new_carrier)
4883 {
4884         const struct net_device_ops *ops = dev->netdev_ops;
4885
4886         if (!ops->ndo_change_carrier)
4887                 return -EOPNOTSUPP;
4888         if (!netif_device_present(dev))
4889                 return -ENODEV;
4890         return ops->ndo_change_carrier(dev, new_carrier);
4891 }
4892 EXPORT_SYMBOL(dev_change_carrier);
4893
4894 /**
4895  *      dev_new_index   -       allocate an ifindex
4896  *      @net: the applicable net namespace
4897  *
4898  *      Returns a suitable unique value for a new device interface
4899  *      number.  The caller must hold the rtnl semaphore or the
4900  *      dev_base_lock to be sure it remains unique.
4901  */
4902 static int dev_new_index(struct net *net)
4903 {
4904         int ifindex = net->ifindex;
4905         for (;;) {
4906                 if (++ifindex <= 0)
4907                         ifindex = 1;
4908                 if (!__dev_get_by_index(net, ifindex))
4909                         return net->ifindex = ifindex;
4910         }
4911 }
4912
4913 /* Delayed registration/unregisteration */
4914 static LIST_HEAD(net_todo_list);
4915
4916 static void net_set_todo(struct net_device *dev)
4917 {
4918         list_add_tail(&dev->todo_list, &net_todo_list);
4919 }
4920
4921 static void rollback_registered_many(struct list_head *head)
4922 {
4923         struct net_device *dev, *tmp;
4924
4925         BUG_ON(dev_boot_phase);
4926         ASSERT_RTNL();
4927
4928         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
4929                 /* Some devices call without registering
4930                  * for initialization unwind. Remove those
4931                  * devices and proceed with the remaining.
4932                  */
4933                 if (dev->reg_state == NETREG_UNINITIALIZED) {
4934                         pr_debug("unregister_netdevice: device %s/%p never was registered\n",
4935                                  dev->name, dev);
4936
4937                         WARN_ON(1);
4938                         list_del(&dev->unreg_list);
4939                         continue;
4940                 }
4941                 dev->dismantle = true;
4942                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
4943         }
4944
4945         /* If device is running, close it first. */
4946         dev_close_many(head);
4947
4948         list_for_each_entry(dev, head, unreg_list) {
4949                 /* And unlink it from device chain. */
4950                 unlist_netdevice(dev);
4951
4952                 dev->reg_state = NETREG_UNREGISTERING;
4953         }
4954
4955         synchronize_net();
4956
4957         list_for_each_entry(dev, head, unreg_list) {
4958                 /* Shutdown queueing discipline. */
4959                 dev_shutdown(dev);
4960
4961
4962                 /* Notify protocols, that we are about to destroy
4963                    this device. They should clean all the things.
4964                 */
4965                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4966
4967                 if (!dev->rtnl_link_ops ||
4968                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
4969                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
4970
4971                 /*
4972                  *      Flush the unicast and multicast chains
4973                  */
4974                 dev_uc_flush(dev);
4975                 dev_mc_flush(dev);
4976
4977                 if (dev->netdev_ops->ndo_uninit)
4978                         dev->netdev_ops->ndo_uninit(dev);
4979
4980                 /* Notifier chain MUST detach us all upper devices. */
4981                 WARN_ON(netdev_has_any_upper_dev(dev));
4982
4983                 /* Remove entries from kobject tree */
4984                 netdev_unregister_kobject(dev);
4985 #ifdef CONFIG_XPS
4986                 /* Remove XPS queueing entries */
4987                 netif_reset_xps_queues_gt(dev, 0);
4988 #endif
4989         }
4990
4991         synchronize_net();
4992
4993         list_for_each_entry(dev, head, unreg_list)
4994                 dev_put(dev);
4995 }
4996
4997 static void rollback_registered(struct net_device *dev)
4998 {
4999         LIST_HEAD(single);
5000
5001         list_add(&dev->unreg_list, &single);
5002         rollback_registered_many(&single);
5003         list_del(&single);
5004 }
5005
5006 static netdev_features_t netdev_fix_features(struct net_device *dev,
5007         netdev_features_t features)
5008 {
5009         /* Fix illegal checksum combinations */
5010         if ((features & NETIF_F_HW_CSUM) &&
5011             (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5012                 netdev_warn(dev, "mixed HW and IP checksum settings.\n");
5013                 features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5014         }
5015
5016         /* TSO requires that SG is present as well. */
5017         if ((features & NETIF_F_ALL_TSO) && !(features & NETIF_F_SG)) {
5018                 netdev_dbg(dev, "Dropping TSO features since no SG feature.\n");
5019                 features &= ~NETIF_F_ALL_TSO;
5020         }
5021
5022         if ((features & NETIF_F_TSO) && !(features & NETIF_F_HW_CSUM) &&
5023                                         !(features & NETIF_F_IP_CSUM)) {
5024                 netdev_dbg(dev, "Dropping TSO features since no CSUM feature.\n");
5025                 features &= ~NETIF_F_TSO;
5026                 features &= ~NETIF_F_TSO_ECN;
5027         }
5028
5029         if ((features & NETIF_F_TSO6) && !(features & NETIF_F_HW_CSUM) &&
5030                                          !(features & NETIF_F_IPV6_CSUM)) {
5031                 netdev_dbg(dev, "Dropping TSO6 features since no CSUM feature.\n");
5032                 features &= ~NETIF_F_TSO6;
5033         }
5034
5035         /* TSO ECN requires that TSO is present as well. */
5036         if ((features & NETIF_F_ALL_TSO) == NETIF_F_TSO_ECN)
5037                 features &= ~NETIF_F_TSO_ECN;
5038
5039         /* Software GSO depends on SG. */
5040         if ((features & NETIF_F_GSO) && !(features & NETIF_F_SG)) {
5041                 netdev_dbg(dev, "Dropping NETIF_F_GSO since no SG feature.\n");
5042                 features &= ~NETIF_F_GSO;
5043         }
5044
5045         /* UFO needs SG and checksumming */
5046         if (features & NETIF_F_UFO) {
5047                 /* maybe split UFO into V4 and V6? */
5048                 if (!((features & NETIF_F_GEN_CSUM) ||
5049                     (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
5050                             == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5051                         netdev_dbg(dev,
5052                                 "Dropping NETIF_F_UFO since no checksum offload features.\n");
5053                         features &= ~NETIF_F_UFO;
5054                 }
5055
5056                 if (!(features & NETIF_F_SG)) {
5057                         netdev_dbg(dev,
5058                                 "Dropping NETIF_F_UFO since no NETIF_F_SG feature.\n");
5059                         features &= ~NETIF_F_UFO;
5060                 }
5061         }
5062
5063         return features;
5064 }
5065
5066 int __netdev_update_features(struct net_device *dev)
5067 {
5068         netdev_features_t features;
5069         int err = 0;
5070
5071         ASSERT_RTNL();
5072
5073         features = netdev_get_wanted_features(dev);
5074
5075         if (dev->netdev_ops->ndo_fix_features)
5076                 features = dev->netdev_ops->ndo_fix_features(dev, features);
5077
5078         /* driver might be less strict about feature dependencies */
5079         features = netdev_fix_features(dev, features);
5080
5081         if (dev->features == features)
5082                 return 0;
5083
5084         netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
5085                 &dev->features, &features);
5086
5087         if (dev->netdev_ops->ndo_set_features)
5088                 err = dev->netdev_ops->ndo_set_features(dev, features);
5089
5090         if (unlikely(err < 0)) {
5091                 netdev_err(dev,
5092                         "set_features() failed (%d); wanted %pNF, left %pNF\n",
5093                         err, &features, &dev->features);
5094                 return -1;
5095         }
5096
5097         if (!err)
5098                 dev->features = features;
5099
5100         return 1;
5101 }
5102
/**
 *	netdev_update_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications if
 *	__netdev_update_features() reports anything other than "unchanged".
 *	Should be called after driver or hardware dependent conditions
 *	might have changed that influence the features.
 */
void netdev_update_features(struct net_device *dev)
{
	int changed = __netdev_update_features(dev);

	if (changed)
		netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_update_features);
5117
/**
 *	netdev_change_features - recalculate device features
 *	@dev: the device to check
 *
 *	Recalculate dev->features set and send notifications
 *	unconditionally, even when the set did not change. Use this instead
 *	of netdev_update_features() when dev->vlan_features might have
 *	changed as well, so the change is propagated to stacked VLAN
 *	devices.
 */
void netdev_change_features(struct net_device *dev)
{
	/* the result is deliberately ignored: notify in every case */
	__netdev_update_features(dev);

	netdev_features_change(dev);
}
EXPORT_SYMBOL(netdev_change_features);
5134
5135 /**
5136  *      netif_stacked_transfer_operstate -      transfer operstate
5137  *      @rootdev: the root or lower level device to transfer state from
5138  *      @dev: the device to transfer operstate to
5139  *
5140  *      Transfer operational state from root to device. This is normally
5141  *      called when a stacking relationship exists between the root
5142  *      device and the device(a leaf device).
5143  */
5144 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5145                                         struct net_device *dev)
5146 {
5147         if (rootdev->operstate == IF_OPER_DORMANT)
5148                 netif_dormant_on(dev);
5149         else
5150                 netif_dormant_off(dev);
5151
5152         if (netif_carrier_ok(rootdev)) {
5153                 if (!netif_carrier_ok(dev))
5154                         netif_carrier_on(dev);
5155         } else {
5156                 if (netif_carrier_ok(dev))
5157                         netif_carrier_off(dev);
5158         }
5159 }
5160 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5161
#ifdef CONFIG_RPS
/* Allocate dev->num_rx_queues zeroed RX queue structures, point each of
 * them back at @dev and store the array in dev->_rx.
 *
 * Returns 0 on success or -ENOMEM when the allocation fails. A queue
 * count of zero is treated as a bug.
 */
static int netif_alloc_rx_queues(struct net_device *dev)
{
	unsigned int nr_queues = dev->num_rx_queues;
	struct netdev_rx_queue *queues;
	unsigned int idx;

	BUG_ON(nr_queues < 1);

	queues = kcalloc(nr_queues, sizeof(*queues), GFP_KERNEL);
	if (!queues)
		return -ENOMEM;

	for (idx = 0; idx < nr_queues; idx++)
		queues[idx].dev = dev;

	dev->_rx = queues;
	return 0;
}
#endif
5181
5182 static void netdev_init_one_queue(struct net_device *dev,
5183                                   struct netdev_queue *queue, void *_unused)
5184 {
5185         /* Initialize queue lock */
5186         spin_lock_init(&queue->_xmit_lock);
5187         netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type);
5188         queue->xmit_lock_owner = -1;
5189         netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
5190         queue->dev = dev;
5191 #ifdef CONFIG_BQL
5192         dql_init(&queue->dql, HZ);
5193 #endif
5194 }
5195
5196 static int netif_alloc_netdev_queues(struct net_device *dev)
5197 {
5198         unsigned int count = dev->num_tx_queues;
5199         struct netdev_queue *tx;
5200
5201         BUG_ON(count < 1);
5202
5203         tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL);
5204         if (!tx)
5205                 return -ENOMEM;
5206
5207         dev->_tx = tx;
5208
5209         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5210         spin_lock_init(&dev->tx_global_lock);
5211
5212         return 0;
5213 }
5214
5215 /**
5216  *      register_netdevice      - register a network device
5217  *      @dev: device to register
5218  *
5219  *      Take a completed network device structure and add it to the kernel
5220  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5221  *      chain. 0 is returned on success. A negative errno code is returned
5222  *      on a failure to set up the device, or if the name is a duplicate.
5223  *
5224  *      Callers must hold the rtnl semaphore. You may want
5225  *      register_netdev() instead of this.
5226  *
5227  *      BUGS:
5228  *      The locking appears insufficient to guarantee two parallel registers
5229  *      will not get the same name.
5230  */
5231
5232 int register_netdevice(struct net_device *dev)
5233 {
5234         int ret;
5235         struct net *net = dev_net(dev);
5236
5237         BUG_ON(dev_boot_phase);
5238         ASSERT_RTNL();
5239
5240         might_sleep();
5241
5242         /* When net_device's are persistent, this will be fatal. */
5243         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5244         BUG_ON(!net);
5245
5246         spin_lock_init(&dev->addr_list_lock);
5247         netdev_set_addr_lockdep_class(dev);
5248
5249         dev->iflink = -1;
5250
5251         ret = dev_get_valid_name(net, dev, dev->name);
5252         if (ret < 0)
5253                 goto out;
5254
5255         /* Init, if this function is available */
5256         if (dev->netdev_ops->ndo_init) {
5257                 ret = dev->netdev_ops->ndo_init(dev);
5258                 if (ret) {
5259                         if (ret > 0)
5260                                 ret = -EIO;
5261                         goto out;
5262                 }
5263         }
5264
5265         if (((dev->hw_features | dev->features) &
5266              NETIF_F_HW_VLAN_CTAG_FILTER) &&
5267             (!dev->netdev_ops->ndo_vlan_rx_add_vid ||
5268              !dev->netdev_ops->ndo_vlan_rx_kill_vid)) {
5269                 netdev_WARN(dev, "Buggy VLAN acceleration in driver!\n");
5270                 ret = -EINVAL;
5271                 goto err_uninit;
5272         }
5273
5274         ret = -EBUSY;
5275         if (!dev->ifindex)
5276                 dev->ifindex = dev_new_index(net);
5277         else if (__dev_get_by_index(net, dev->ifindex))
5278                 goto err_uninit;
5279
5280         if (dev->iflink == -1)
5281                 dev->iflink = dev->ifindex;
5282
5283         /* Transfer changeable features to wanted_features and enable
5284          * software offloads (GSO and GRO).
5285          */
5286         dev->hw_features |= NETIF_F_SOFT_FEATURES;
5287         dev->features |= NETIF_F_SOFT_FEATURES;
5288         dev->wanted_features = dev->features & dev->hw_features;
5289
5290         /* Turn on no cache copy if HW is doing checksum */
5291         if (!(dev->flags & IFF_LOOPBACK)) {
5292                 dev->hw_features |= NETIF_F_NOCACHE_COPY;
5293                 if (dev->features & NETIF_F_ALL_CSUM) {
5294                         dev->wanted_features |= NETIF_F_NOCACHE_COPY;
5295                         dev->features |= NETIF_F_NOCACHE_COPY;
5296                 }
5297         }
5298
5299         /* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
5300          */
5301         dev->vlan_features |= NETIF_F_HIGHDMA;
5302
5303         /* Make NETIF_F_SG inheritable to tunnel devices.
5304          */
5305         dev->hw_enc_features |= NETIF_F_SG;
5306
5307         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5308         ret = notifier_to_errno(ret);
5309         if (ret)
5310                 goto err_uninit;
5311
5312         ret = netdev_register_kobject(dev);
5313         if (ret)
5314                 goto err_uninit;
5315         dev->reg_state = NETREG_REGISTERED;
5316
5317         __netdev_update_features(dev);
5318
5319         /*
5320          *      Default initial state at registry is that the
5321          *      device is present.
5322          */
5323
5324         set_bit(__LINK_STATE_PRESENT, &dev->state);
5325
5326         linkwatch_init_dev(dev);
5327
5328         dev_init_scheduler(dev);
5329         dev_hold(dev);
5330         list_netdevice(dev);
5331         add_device_randomness(dev->dev_addr, dev->addr_len);
5332
5333         /* If the device has permanent device address, driver should
5334          * set dev_addr and also addr_assign_type should be set to
5335          * NET_ADDR_PERM (default value).
5336          */
5337         if (dev->addr_assign_type == NET_ADDR_PERM)
5338                 memcpy(dev->perm_addr, dev->dev_addr, dev->addr_len);
5339
5340         /* Notify protocols, that a new device appeared. */
5341         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5342         ret = notifier_to_errno(ret);
5343         if (ret) {
5344                 rollback_registered(dev);
5345                 dev->reg_state = NETREG_UNREGISTERED;
5346         }
5347         /*
5348          *      Prevent userspace races by waiting until the network
5349          *      device is fully setup before sending notifications.
5350          */
5351         if (!dev->rtnl_link_ops ||
5352             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5353                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5354
5355 out:
5356         return ret;
5357
5358 err_uninit:
5359         if (dev->netdev_ops->ndo_uninit)
5360                 dev->netdev_ops->ndo_uninit(dev);
5361         goto out;
5362 }
5363 EXPORT_SYMBOL(register_netdevice);
5364
5365 /**
5366  *      init_dummy_netdev       - init a dummy network device for NAPI
5367  *      @dev: device to init
5368  *
5369  *      This takes a network device structure and initialize the minimum
5370  *      amount of fields so it can be used to schedule NAPI polls without
5371  *      registering a full blown interface. This is to be used by drivers
5372  *      that need to tie several hardware interfaces to a single NAPI
5373  *      poll scheduler due to HW limitations.
5374  */
5375 int init_dummy_netdev(struct net_device *dev)
5376 {
5377         /* Clear everything. Note we don't initialize spinlocks
5378          * are they aren't supposed to be taken by any of the
5379          * NAPI code and this dummy netdev is supposed to be
5380          * only ever used for NAPI polls
5381          */
5382         memset(dev, 0, sizeof(struct net_device));
5383
5384         /* make sure we BUG if trying to hit standard
5385          * register/unregister code path
5386          */
5387         dev->reg_state = NETREG_DUMMY;
5388
5389         /* NAPI wants this */
5390         INIT_LIST_HEAD(&dev->napi_list);
5391
5392         /* a dummy interface is started by default */
5393         set_bit(__LINK_STATE_PRESENT, &dev->state);
5394         set_bit(__LINK_STATE_START, &dev->state);
5395
5396         /* Note : We dont allocate pcpu_refcnt for dummy devices,
5397          * because users of this 'device' dont need to change
5398          * its refcount.
5399          */
5400
5401         return 0;
5402 }
5403 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5404
5405
/**
 *	register_netdev	- register a network device
 *	@dev: device to register
 *
 *	Take a completed network device structure and add it to the kernel
 *	interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
 *	chain. 0 is returned on success. A negative errno code is returned
 *	on a failure to set up the device, or if the name is a duplicate.
 *
 *	This is a wrapper around register_netdevice() that takes the rtnl
 *	semaphore for the duration of the call; the device name is expanded
 *	if a format string was passed to alloc_netdev.
 */
int register_netdev(struct net_device *dev)
{
	int ret;

	rtnl_lock();
	ret = register_netdevice(dev);
	rtnl_unlock();

	return ret;
}
EXPORT_SYMBOL(register_netdev);
5429
5430 int netdev_refcnt_read(const struct net_device *dev)
5431 {
5432         int i, refcnt = 0;
5433
5434         for_each_possible_cpu(i)
5435                 refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i);
5436         return refcnt;
5437 }
5438 EXPORT_SYMBOL(netdev_refcnt_read);
5439
5440 /**
5441  * netdev_wait_allrefs - wait until all references are gone.
5442  * @dev: target net_device
5443  *
5444  * This is called when unregistering network devices.
5445  *
5446  * Any protocol or device that holds a reference should register
5447  * for netdevice notification, and cleanup and put back the
5448  * reference if they receive an UNREGISTER event.
5449  * We can get stuck here if buggy protocols don't correctly
5450  * call dev_put.
5451  */
5452 static void netdev_wait_allrefs(struct net_device *dev)
5453 {
5454         unsigned long rebroadcast_time, warning_time;
5455         int refcnt;
5456
5457         linkwatch_forget_dev(dev);
5458
5459         rebroadcast_time = warning_time = jiffies;
5460         refcnt = netdev_refcnt_read(dev);
5461
5462         while (refcnt != 0) {
5463                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5464                         rtnl_lock();
5465
5466                         /* Rebroadcast unregister notification */
5467                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5468
5469                         __rtnl_unlock();
5470                         rcu_barrier();
5471                         rtnl_lock();
5472
5473                         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5474                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5475                                      &dev->state)) {
5476                                 /* We must not have linkwatch events
5477                                  * pending on unregister. If this
5478                                  * happens, we simply run the queue
5479                                  * unscheduled, resulting in a noop
5480                                  * for this device.
5481                                  */
5482                                 linkwatch_run_queue();
5483                         }
5484
5485                         __rtnl_unlock();
5486
5487                         rebroadcast_time = jiffies;
5488                 }
5489
5490                 msleep(250);
5491
5492                 refcnt = netdev_refcnt_read(dev);
5493
5494                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5495                         pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
5496                                  dev->name, refcnt);
5497                         warning_time = jiffies;
5498                 }
5499         }
5500 }
5501
5502 /* The sequence is:
5503  *
5504  *      rtnl_lock();
5505  *      ...
5506  *      register_netdevice(x1);
5507  *      register_netdevice(x2);
5508  *      ...
5509  *      unregister_netdevice(y1);
5510  *      unregister_netdevice(y2);
5511  *      ...
5512  *      rtnl_unlock();
5513  *      free_netdev(y1);
5514  *      free_netdev(y2);
5515  *
5516  * We are invoked by rtnl_unlock().
5517  * This allows us to deal with problems:
5518  * 1) We can delete sysfs objects which invoke hotplug
5519  *    without deadlocking with linkwatch via keventd.
5520  * 2) Since we run with the RTNL semaphore not held, we can sleep
5521  *    safely in order to wait for the netdev refcnt to drop to zero.
5522  *
5523  * We must not return until all unregister events added during
5524  * the interval the lock was held have been completed.
5525  */
5526 void netdev_run_todo(void)
5527 {
5528         struct list_head list;
5529
5530         /* Snapshot list, allow later requests */
5531         list_replace_init(&net_todo_list, &list);
5532
5533         __rtnl_unlock();
5534
5535
5536         /* Wait for rcu callbacks to finish before next phase */
5537         if (!list_empty(&list))
5538                 rcu_barrier();
5539
5540         while (!list_empty(&list)) {
5541                 struct net_device *dev
5542                         = list_first_entry(&list, struct net_device, todo_list);
5543                 list_del(&dev->todo_list);
5544
5545                 rtnl_lock();
5546                 call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5547                 __rtnl_unlock();
5548
5549                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5550                         pr_err("network todo '%s' but state %d\n",
5551                                dev->name, dev->reg_state);
5552                         dump_stack();
5553                         continue;
5554                 }
5555
5556                 dev->reg_state = NETREG_UNREGISTERED;
5557
5558                 on_each_cpu(flush_backlog, dev, 1);
5559
5560                 netdev_wait_allrefs(dev);
5561
5562                 /* paranoia */
5563                 BUG_ON(netdev_refcnt_read(dev));
5564                 WARN_ON(rcu_access_pointer(dev->ip_ptr));
5565                 WARN_ON(rcu_access_pointer(dev->ip6_ptr));
5566                 WARN_ON(dev->dn_ptr);
5567
5568                 if (dev->destructor)
5569                         dev->destructor(dev);
5570
5571                 /* Free network device */
5572                 kobject_put(&dev->dev.kobj);
5573         }
5574 }
5575
5576 /* Convert net_device_stats to rtnl_link_stats64.  They have the same
5577  * fields in the same order, with only the type differing.
5578  */
5579 void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
5580                              const struct net_device_stats *netdev_stats)
5581 {
5582 #if BITS_PER_LONG == 64
5583         BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
5584         memcpy(stats64, netdev_stats, sizeof(*stats64));
5585 #else
5586         size_t i, n = sizeof(*stats64) / sizeof(u64);
5587         const unsigned long *src = (const unsigned long *)netdev_stats;
5588         u64 *dst = (u64 *)stats64;
5589
5590         BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
5591                      sizeof(*stats64) / sizeof(u64));
5592         for (i = 0; i < n; i++)
5593                 dst[i] = src[i];
5594 #endif
5595 }
5596 EXPORT_SYMBOL(netdev_stats_to_stats64);
5597
5598 /**
5599  *      dev_get_stats   - get network device statistics
5600  *      @dev: device to get statistics from
5601  *      @storage: place to store stats
5602  *
5603  *      Get network statistics from device. Return @storage.
5604  *      The device driver may provide its own method by setting
5605  *      dev->netdev_ops->get_stats64 or dev->netdev_ops->get_stats;
5606  *      otherwise the internal statistics structure is used.
5607  */
5608 struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
5609                                         struct rtnl_link_stats64 *storage)
5610 {
5611         const struct net_device_ops *ops = dev->netdev_ops;
5612
5613         if (ops->ndo_get_stats64) {
5614                 memset(storage, 0, sizeof(*storage));
5615                 ops->ndo_get_stats64(dev, storage);
5616         } else if (ops->ndo_get_stats) {
5617                 netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev));
5618         } else {
5619                 netdev_stats_to_stats64(storage, &dev->stats);
5620         }
5621         storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
5622         return storage;
5623 }
5624 EXPORT_SYMBOL(dev_get_stats);
5625
/* Return the ingress queue of @dev. With CONFIG_NET_CLS_ACT, a missing
 * queue is allocated, initialized with noop_qdisc and stored in
 * dev->ingress_queue; NULL is returned if that allocation fails.
 * Without CONFIG_NET_CLS_ACT the result of dev_ingress_queue() is
 * returned unchanged.
 */
struct netdev_queue *dev_ingress_queue_create(struct net_device *dev)
{
	struct netdev_queue *queue = dev_ingress_queue(dev);

#ifdef CONFIG_NET_CLS_ACT
	if (!queue) {
		queue = kzalloc(sizeof(*queue), GFP_KERNEL);
		if (queue) {
			netdev_init_one_queue(dev, queue, NULL);
			queue->qdisc = &noop_qdisc;
			queue->qdisc_sleeping = &noop_qdisc;
			/* Set only after the queue is fully initialized. */
			rcu_assign_pointer(dev->ingress_queue, queue);
		}
	}
#endif
	return queue;
}
5643
5644 static const struct ethtool_ops default_ethtool_ops;
5645
5646 void netdev_set_default_ethtool_ops(struct net_device *dev,
5647                                     const struct ethtool_ops *ops)
5648 {
5649         if (dev->ethtool_ops == &default_ethtool_ops)
5650                 dev->ethtool_ops = ops;
5651 }
5652 EXPORT_SYMBOL_GPL(netdev_set_default_ethtool_ops);
5653
5654 /**
5655  *      alloc_netdev_mqs - allocate network device
5656  *      @sizeof_priv:   size of private data to allocate space for
5657  *      @name:          device name format string
5658  *      @setup:         callback to initialize device
5659  *      @txqs:          the number of TX subqueues to allocate
5660  *      @rxqs:          the number of RX subqueues to allocate
5661  *
5662  *      Allocates a struct net_device with private data area for driver use
5663  *      and performs basic initialization.  Also allocates subquue structs
5664  *      for each queue on the device.
5665  */
5666 struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
5667                 void (*setup)(struct net_device *),
5668                 unsigned int txqs, unsigned int rxqs)
5669 {
5670         struct net_device *dev;
5671         size_t alloc_size;
5672         struct net_device *p;
5673
5674         BUG_ON(strlen(name) >= sizeof(dev->name));
5675
5676         if (txqs < 1) {
5677                 pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
5678                 return NULL;
5679         }
5680
5681 #ifdef CONFIG_RPS
5682         if (rxqs < 1) {
5683                 pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
5684                 return NULL;
5685         }
5686 #endif
5687
5688         alloc_size = sizeof(struct net_device);
5689         if (sizeof_priv) {
5690                 /* ensure 32-byte alignment of private area */
5691                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5692                 alloc_size += sizeof_priv;
5693         }
5694         /* ensure 32-byte alignment of whole construct */
5695         alloc_size += NETDEV_ALIGN - 1;
5696
5697         p = kzalloc(alloc_size, GFP_KERNEL);
5698         if (!p)
5699                 return NULL;
5700
5701         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5702         dev->padded = (char *)dev - (char *)p;
5703
5704         dev->pcpu_refcnt = alloc_percpu(int);
5705         if (!dev->pcpu_refcnt)
5706                 goto free_p;
5707
5708         if (dev_addr_init(dev))
5709                 goto free_pcpu;
5710
5711         dev_mc_init(dev);
5712         dev_uc_init(dev);
5713
5714         dev_net_set(dev, &init_net);
5715
5716         dev->gso_max_size = GSO_MAX_SIZE;
5717         dev->gso_max_segs = GSO_MAX_SEGS;
5718
5719         INIT_LIST_HEAD(&dev->napi_list);
5720         INIT_LIST_HEAD(&dev->unreg_list);
5721         INIT_LIST_HEAD(&dev->link_watch_list);
5722         INIT_LIST_HEAD(&dev->upper_dev_list);
5723         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5724         setup(dev);
5725
5726         dev->num_tx_queues = txqs;
5727         dev->real_num_tx_queues = txqs;
5728         if (netif_alloc_netdev_queues(dev))
5729                 goto free_all;
5730
5731 #ifdef CONFIG_RPS
5732         dev->num_rx_queues = rxqs;
5733         dev->real_num_rx_queues = rxqs;
5734         if (netif_alloc_rx_queues(dev))
5735                 goto free_all;
5736 #endif
5737
5738         strcpy(dev->name, name);
5739         dev->group = INIT_NETDEV_GROUP;
5740         if (!dev->ethtool_ops)
5741                 dev->ethtool_ops = &default_ethtool_ops;
5742         return dev;
5743
5744 free_all:
5745         free_netdev(dev);
5746         return NULL;
5747
5748 free_pcpu:
5749         free_percpu(dev->pcpu_refcnt);
5750         kfree(dev->_tx);
5751 #ifdef CONFIG_RPS
5752         kfree(dev->_rx);
5753 #endif
5754
5755 free_p:
5756         kfree(p);
5757         return NULL;
5758 }
5759 EXPORT_SYMBOL(alloc_netdev_mqs);
5760
5761 /**
5762  *      free_netdev - free network device
5763  *      @dev: device
5764  *
5765  *      This function does the last stage of destroying an allocated device
5766  *      interface. The reference to the device object is released.
5767  *      If this is the last reference then it will be freed.
5768  */
5769 void free_netdev(struct net_device *dev)
5770 {
5771         struct napi_struct *p, *n;
5772
5773         release_net(dev_net(dev));
5774
5775         kfree(dev->_tx);
5776 #ifdef CONFIG_RPS
5777         kfree(dev->_rx);
5778 #endif
5779
5780         kfree(rcu_dereference_protected(dev->ingress_queue, 1));
5781
5782         /* Flush device addresses */
5783         dev_addr_flush(dev);
5784
5785         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5786                 netif_napi_del(p);
5787
5788         free_percpu(dev->pcpu_refcnt);
5789         dev->pcpu_refcnt = NULL;
5790
5791         /*  Compatibility with error handling in drivers */
5792         if (dev->reg_state == NETREG_UNINITIALIZED) {
5793                 kfree((char *)dev - dev->padded);
5794                 return;
5795         }
5796
5797         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5798         dev->reg_state = NETREG_RELEASED;
5799
5800         /* will free via device release */
5801         put_device(&dev->dev);
5802 }
5803 EXPORT_SYMBOL(free_netdev);
5804
/**
 *	synchronize_net -  Synchronize with packet receive processing
 *
 *	Wait for packets currently being received to be done.
 *	Does not block later packets from starting.
 *
 *	May sleep. The expedited RCU grace period is used while the
 *	rtnl lock is held, the regular one otherwise.
 */
void synchronize_net(void)
{
	might_sleep();

	if (!rtnl_is_locked()) {
		synchronize_rcu();
		return;
	}

	synchronize_rcu_expedited();
}
EXPORT_SYMBOL(synchronize_net);
5820
5821 /**
5822  *      unregister_netdevice_queue - remove device from the kernel
5823  *      @dev: device
5824  *      @head: list
5825  *
5826  *      This function shuts down a device interface and removes it
5827  *      from the kernel tables.
5828  *      If head not NULL, device is queued to be unregistered later.
5829  *
5830  *      Callers must hold the rtnl semaphore.  You may want
5831  *      unregister_netdev() instead of this.
5832  */
5833
5834 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5835 {
5836         ASSERT_RTNL();
5837
5838         if (head) {
5839                 list_move_tail(&dev->unreg_list, head);
5840         } else {
5841                 rollback_registered(dev);
5842                 /* Finish processing unregister after unlock */
5843                 net_set_todo(dev);
5844         }
5845 }
5846 EXPORT_SYMBOL(unregister_netdevice_queue);
5847
5848 /**
5849  *      unregister_netdevice_many - unregister many devices
5850  *      @head: list of devices
5851  *
5852  *  Note: As most callers use a stack allocated list_head,
5853  *  we force a list_del() to make sure stack wont be corrupted later.
5854  */
5855 void unregister_netdevice_many(struct list_head *head)
5856 {
5857         struct net_device *dev;
5858
5859         if (!list_empty(head)) {
5860                 rollback_registered_many(head);
5861                 list_for_each_entry(dev, head, unreg_list)
5862                         net_set_todo(dev);
5863                 list_del(head);
5864         }
5865 }
5866 EXPORT_SYMBOL(unregister_netdevice_many);
5867
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables.
 *
 *	It is a thin wrapper that calls unregister_netdevice() with the
 *	rtnl semaphore held.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();

	unregister_netdevice(dev);

	rtnl_unlock();
}
EXPORT_SYMBOL(unregister_netdev);
5886
5887 /**
5888  *      dev_change_net_namespace - move device to different nethost namespace
5889  *      @dev: device
5890  *      @net: network namespace
5891  *      @pat: If not NULL name pattern to try if the current device name
5892  *            is already taken in the destination network namespace.
5893  *
5894  *      This function shuts down a device interface and moves it
5895  *      to a new network namespace. On success 0 is returned, on
5896  *      a failure a netagive errno code is returned.
5897  *
5898  *      Callers must hold the rtnl semaphore.
5899  */
5900
5901 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5902 {
5903         int err;
5904
5905         ASSERT_RTNL();
5906
5907         /* Don't allow namespace local devices to be moved. */
5908         err = -EINVAL;
5909         if (dev->features & NETIF_F_NETNS_LOCAL)
5910                 goto out;
5911
5912         /* Ensure the device has been registrered */
5913         if (dev->reg_state != NETREG_REGISTERED)
5914                 goto out;
5915
5916         /* Get out if there is nothing todo */
5917         err = 0;
5918         if (net_eq(dev_net(dev), net))
5919                 goto out;
5920
5921         /* Pick the destination device name, and ensure
5922          * we can use it in the destination network namespace.
5923          */
5924         err = -EEXIST;
5925         if (__dev_get_by_name(net, dev->name)) {
5926                 /* We get here if we can't use the current device name */
5927                 if (!pat)
5928                         goto out;
5929                 if (dev_get_valid_name(net, dev, pat) < 0)
5930                         goto out;
5931         }
5932
5933         /*
5934          * And now a mini version of register_netdevice unregister_netdevice.
5935          */
5936
5937         /* If device is running close it first. */
5938         dev_close(dev);
5939
5940         /* And unlink it from device chain */
5941         err = -ENODEV;
5942         unlist_netdevice(dev);
5943
5944         synchronize_net();
5945
5946         /* Shutdown queueing discipline. */
5947         dev_shutdown(dev);
5948
5949         /* Notify protocols, that we are about to destroy
5950            this device. They should clean all the things.
5951
5952            Note that dev->reg_state stays at NETREG_REGISTERED.
5953            This is wanted because this way 8021q and macvlan know
5954            the device is just moving and can keep their slaves up.
5955         */
5956         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5957         rcu_barrier();
5958         call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev);
5959         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5960
5961         /*
5962          *      Flush the unicast and multicast chains
5963          */
5964         dev_uc_flush(dev);
5965         dev_mc_flush(dev);
5966
5967         /* Send a netdev-removed uevent to the old namespace */
5968         kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
5969
5970         /* Actually switch the network namespace */
5971         dev_net_set(dev, net);
5972
5973         /* If there is an ifindex conflict assign a new one */
5974         if (__dev_get_by_index(net, dev->ifindex)) {
5975                 int iflink = (dev->iflink == dev->ifindex);
5976                 dev->ifindex = dev_new_index(net);
5977                 if (iflink)
5978                         dev->iflink = dev->ifindex;
5979         }
5980
5981         /* Send a netdev-add uevent to the new namespace */
5982         kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
5983
5984         /* Fixup kobjects */
5985         err = device_rename(&dev->dev, dev->name);
5986         WARN_ON(err);
5987
5988         /* Add the device back in the hashes */
5989         list_netdevice(dev);
5990
5991         /* Notify protocols, that a new device appeared. */
5992         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5993
5994         /*
5995          *      Prevent userspace races by waiting until the network
5996          *      device is fully setup before sending notifications.
5997          */
5998         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5999
6000         synchronize_net();
6001         err = 0;
6002 out:
6003         return err;
6004 }
6005 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6006
6007 static int dev_cpu_callback(struct notifier_block *nfb,
6008                             unsigned long action,
6009                             void *ocpu)
6010 {
6011         struct sk_buff **list_skb;
6012         struct sk_buff *skb;
6013         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6014         struct softnet_data *sd, *oldsd;
6015
6016         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6017                 return NOTIFY_OK;
6018
6019         local_irq_disable();
6020         cpu = smp_processor_id();
6021         sd = &per_cpu(softnet_data, cpu);
6022         oldsd = &per_cpu(softnet_data, oldcpu);
6023
6024         /* Find end of our completion_queue. */
6025         list_skb = &sd->completion_queue;
6026         while (*list_skb)
6027                 list_skb = &(*list_skb)->next;
6028         /* Append completion queue from offline CPU. */
6029         *list_skb = oldsd->completion_queue;
6030         oldsd->completion_queue = NULL;
6031
6032         /* Append output queue from offline CPU. */
6033         if (oldsd->output_queue) {
6034                 *sd->output_queue_tailp = oldsd->output_queue;
6035                 sd->output_queue_tailp = oldsd->output_queue_tailp;
6036                 oldsd->output_queue = NULL;
6037                 oldsd->output_queue_tailp = &oldsd->output_queue;
6038         }
6039         /* Append NAPI poll list from offline CPU. */
6040         if (!list_empty(&oldsd->poll_list)) {
6041                 list_splice_init(&oldsd->poll_list, &sd->poll_list);
6042                 raise_softirq_irqoff(NET_RX_SOFTIRQ);
6043         }
6044
6045         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6046         local_irq_enable();
6047
6048         /* Process offline CPU's input_pkt_queue */
6049         while ((skb = __skb_dequeue(&oldsd->process_queue))) {
6050                 netif_rx(skb);
6051                 input_queue_head_incr(oldsd);
6052         }
6053         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue))) {
6054                 netif_rx(skb);
6055                 input_queue_head_incr(oldsd);
6056         }
6057
6058         return NOTIFY_OK;
6059 }
6060
6061
6062 /**
6063  *      netdev_increment_features - increment feature set by one
6064  *      @all: current feature set
6065  *      @one: new feature set
6066  *      @mask: mask feature set
6067  *
6068  *      Computes a new feature set after adding a device with feature set
6069  *      @one to the master device with current feature set @all.  Will not
6070  *      enable anything that is off in @mask. Returns the new feature set.
6071  */
6072 netdev_features_t netdev_increment_features(netdev_features_t all,
6073         netdev_features_t one, netdev_features_t mask)
6074 {
6075         if (mask & NETIF_F_GEN_CSUM)
6076                 mask |= NETIF_F_ALL_CSUM;
6077         mask |= NETIF_F_VLAN_CHALLENGED;
6078
6079         all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
6080         all &= one | ~NETIF_F_ALL_FOR_ALL;
6081
6082         /* If one device supports hw checksumming, set for all. */
6083         if (all & NETIF_F_GEN_CSUM)
6084                 all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
6085
6086         return all;
6087 }
6088 EXPORT_SYMBOL(netdev_increment_features);
6089
6090 static struct hlist_head *netdev_create_hash(void)
6091 {
6092         int i;
6093         struct hlist_head *hash;
6094
6095         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6096         if (hash != NULL)
6097                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6098                         INIT_HLIST_HEAD(&hash[i]);
6099
6100         return hash;
6101 }
6102
6103 /* Initialize per network namespace state */
6104 static int __net_init netdev_init(struct net *net)
6105 {
6106         if (net != &init_net)
6107                 INIT_LIST_HEAD(&net->dev_base_head);
6108
6109         net->dev_name_head = netdev_create_hash();
6110         if (net->dev_name_head == NULL)
6111                 goto err_name;
6112
6113         net->dev_index_head = netdev_create_hash();
6114         if (net->dev_index_head == NULL)
6115                 goto err_idx;
6116
6117         return 0;
6118
6119 err_idx:
6120         kfree(net->dev_name_head);
6121 err_name:
6122         return -ENOMEM;
6123 }
6124
6125 /**
6126  *      netdev_drivername - network driver for the device
6127  *      @dev: network device
6128  *
6129  *      Determine network driver for device.
6130  */
6131 const char *netdev_drivername(const struct net_device *dev)
6132 {
6133         const struct device_driver *driver;
6134         const struct device *parent;
6135         const char *empty = "";
6136
6137         parent = dev->dev.parent;
6138         if (!parent)
6139                 return empty;
6140
6141         driver = parent->driver;
6142         if (driver && driver->name)
6143                 return driver->name;
6144         return empty;
6145 }
6146
6147 static int __netdev_printk(const char *level, const struct net_device *dev,
6148                            struct va_format *vaf)
6149 {
6150         int r;
6151
6152         if (dev && dev->dev.parent) {
6153                 r = dev_printk_emit(level[1] - '0',
6154                                     dev->dev.parent,
6155                                     "%s %s %s: %pV",
6156                                     dev_driver_string(dev->dev.parent),
6157                                     dev_name(dev->dev.parent),
6158                                     netdev_name(dev), vaf);
6159         } else if (dev) {
6160                 r = printk("%s%s: %pV", level, netdev_name(dev), vaf);
6161         } else {
6162                 r = printk("%s(NULL net_device): %pV", level, vaf);
6163         }
6164
6165         return r;
6166 }
6167
6168 int netdev_printk(const char *level, const struct net_device *dev,
6169                   const char *format, ...)
6170 {
6171         struct va_format vaf;
6172         va_list args;
6173         int r;
6174
6175         va_start(args, format);
6176
6177         vaf.fmt = format;
6178         vaf.va = &args;
6179
6180         r = __netdev_printk(level, dev, &vaf);
6181
6182         va_end(args);
6183
6184         return r;
6185 }
6186 EXPORT_SYMBOL(netdev_printk);
6187
6188 #define define_netdev_printk_level(func, level)                 \
6189 int func(const struct net_device *dev, const char *fmt, ...)    \
6190 {                                                               \
6191         int r;                                                  \
6192         struct va_format vaf;                                   \
6193         va_list args;                                           \
6194                                                                 \
6195         va_start(args, fmt);                                    \
6196                                                                 \
6197         vaf.fmt = fmt;                                          \
6198         vaf.va = &args;                                         \
6199                                                                 \
6200         r = __netdev_printk(level, dev, &vaf);                  \
6201                                                                 \
6202         va_end(args);                                           \
6203                                                                 \
6204         return r;                                               \
6205 }                                                               \
6206 EXPORT_SYMBOL(func);
6207
6208 define_netdev_printk_level(netdev_emerg, KERN_EMERG);
6209 define_netdev_printk_level(netdev_alert, KERN_ALERT);
6210 define_netdev_printk_level(netdev_crit, KERN_CRIT);
6211 define_netdev_printk_level(netdev_err, KERN_ERR);
6212 define_netdev_printk_level(netdev_warn, KERN_WARNING);
6213 define_netdev_printk_level(netdev_notice, KERN_NOTICE);
6214 define_netdev_printk_level(netdev_info, KERN_INFO);
6215
6216 static void __net_exit netdev_exit(struct net *net)
6217 {
6218         kfree(net->dev_name_head);
6219         kfree(net->dev_index_head);
6220 }
6221
6222 static struct pernet_operations __net_initdata netdev_net_ops = {
6223         .init = netdev_init,
6224         .exit = netdev_exit,
6225 };
6226
6227 static void __net_exit default_device_exit(struct net *net)
6228 {
6229         struct net_device *dev, *aux;
6230         /*
6231          * Push all migratable network devices back to the
6232          * initial network namespace
6233          */
6234         rtnl_lock();
6235         for_each_netdev_safe(net, dev, aux) {
6236                 int err;
6237                 char fb_name[IFNAMSIZ];
6238
6239                 /* Ignore unmoveable devices (i.e. loopback) */
6240                 if (dev->features & NETIF_F_NETNS_LOCAL)
6241                         continue;
6242
6243                 /* Leave virtual devices for the generic cleanup */
6244                 if (dev->rtnl_link_ops)
6245                         continue;
6246
6247                 /* Push remaining network devices to init_net */
6248                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6249                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6250                 if (err) {
6251                         pr_emerg("%s: failed to move %s to init_net: %d\n",
6252                                  __func__, dev->name, err);
6253                         BUG();
6254                 }
6255         }
6256         rtnl_unlock();
6257 }
6258
6259 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6260 {
6261         /* At exit all network devices most be removed from a network
6262          * namespace.  Do this in the reverse order of registration.
6263          * Do this across as many network namespaces as possible to
6264          * improve batching efficiency.
6265          */
6266         struct net_device *dev;
6267         struct net *net;
6268         LIST_HEAD(dev_kill_list);
6269
6270         rtnl_lock();
6271         list_for_each_entry(net, net_list, exit_list) {
6272                 for_each_netdev_reverse(net, dev) {
6273                         if (dev->rtnl_link_ops)
6274                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6275                         else
6276                                 unregister_netdevice_queue(dev, &dev_kill_list);
6277                 }
6278         }
6279         unregister_netdevice_many(&dev_kill_list);
6280         rtnl_unlock();
6281 }
6282
6283 static struct pernet_operations __net_initdata default_device_ops = {
6284         .exit = default_device_exit,
6285         .exit_batch = default_device_exit_batch,
6286 };
6287
6288 /*
6289  *      Initialize the DEV module. At boot time this walks the device list and
6290  *      unhooks any devices that fail to initialise (normally hardware not
6291  *      present) and leaves us with a valid list of present and active devices.
6292  *
6293  */
6294
6295 /*
6296  *       This is called single threaded during boot, so no need
6297  *       to take the rtnl semaphore.
6298  */
6299 static int __init net_dev_init(void)
6300 {
6301         int i, rc = -ENOMEM;
6302
6303         BUG_ON(!dev_boot_phase);
6304
6305         if (dev_proc_init())
6306                 goto out;
6307
6308         if (netdev_kobject_init())
6309                 goto out;
6310
6311         INIT_LIST_HEAD(&ptype_all);
6312         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6313                 INIT_LIST_HEAD(&ptype_base[i]);
6314
6315         INIT_LIST_HEAD(&offload_base);
6316
6317         if (register_pernet_subsys(&netdev_net_ops))
6318                 goto out;
6319
6320         /*
6321          *      Initialise the packet receive queues.
6322          */
6323
6324         for_each_possible_cpu(i) {
6325                 struct softnet_data *sd = &per_cpu(softnet_data, i);
6326
6327                 memset(sd, 0, sizeof(*sd));
6328                 skb_queue_head_init(&sd->input_pkt_queue);
6329                 skb_queue_head_init(&sd->process_queue);
6330                 sd->completion_queue = NULL;
6331                 INIT_LIST_HEAD(&sd->poll_list);
6332                 sd->output_queue = NULL;
6333                 sd->output_queue_tailp = &sd->output_queue;
6334 #ifdef CONFIG_RPS
6335                 sd->csd.func = rps_trigger_softirq;
6336                 sd->csd.info = sd;
6337                 sd->csd.flags = 0;
6338                 sd->cpu = i;
6339 #endif
6340
6341                 sd->backlog.poll = process_backlog;
6342                 sd->backlog.weight = weight_p;
6343                 sd->backlog.gro_list = NULL;
6344                 sd->backlog.gro_count = 0;
6345         }
6346
6347         dev_boot_phase = 0;
6348
6349         /* The loopback device is special if any other network devices
6350          * is present in a network namespace the loopback device must
6351          * be present. Since we now dynamically allocate and free the
6352          * loopback device ensure this invariant is maintained by
6353          * keeping the loopback device as the first device on the
6354          * list of network devices.  Ensuring the loopback devices
6355          * is the first device that appears and the last network device
6356          * that disappears.
6357          */
6358         if (register_pernet_device(&loopback_net_ops))
6359                 goto out;
6360
6361         if (register_pernet_device(&default_device_ops))
6362                 goto out;
6363
6364         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6365         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6366
6367         hotcpu_notifier(dev_cpu_callback, 0);
6368         dst_init();
6369         rc = 0;
6370 out:
6371         return rc;
6372 }
6373
6374 subsys_initcall(net_dev_init);