SAFE public projects git trees. - safe/jmp/linux-2.6/blob - net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/hash.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/ethtool.h>
  95 #include <linux/notifier.h>
  96 #include <linux/skbuff.h>
  97 #include <net/net_namespace.h>
  98 #include <net/sock.h>
  99 #include <linux/rtnetlink.h>
 100 #include <linux/proc_fs.h>
 101 #include <linux/seq_file.h>
 102 #include <linux/stat.h>
 103 #include <linux/if_bridge.h>
 104 #include <linux/if_macvlan.h>
 105 #include <net/dst.h>
 106 #include <net/pkt_sched.h>
 107 #include <net/checksum.h>
 108 #include <net/xfrm.h>
 109 #include <linux/highmem.h>
 110 #include <linux/init.h>
 111 #include <linux/kmod.h>
 112 #include <linux/module.h>
 113 #include <linux/netpoll.h>
 114 #include <linux/rcupdate.h>
 115 #include <linux/delay.h>
 116 #include <net/wext.h>
 117 #include <net/iw_handler.h>
 118 #include <asm/current.h>
 119 #include <linux/audit.h>
 120 #include <linux/dmaengine.h>
 121 #include <linux/err.h>
 122 #include <linux/ctype.h>
 123 #include <linux/if_arp.h>
 124 #include <linux/if_vlan.h>
 125 #include <linux/ip.h>
 126 #include <net/ip.h>
 127 #include <linux/ipv6.h>
 128 #include <linux/in.h>
 129 #include <linux/jhash.h>
 130 #include <linux/random.h>
 131 #include <trace/events/napi.h>
 132 #include <linux/pci.h>
 133
 134 #include "net-sysfs.h"
 135
 136 /* Instead of increasing this, you should create a hash table. */
 137 #define MAX_GRO_SKBS 8
 138
 139 /* This should be increased if a protocol with a bigger head is added. */
 140 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 141
 142 /*
 143  *      The list of packet types we will receive (as opposed to discard)
 144  *      and the routines to invoke.
 145  *
 146  *      Why 16. Because with 16 the only overlap we get on a hash of the
 147  *      low nibble of the protocol value is RARP/SNAP/X.25.
 148  *
 149  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 150  *             sure which should go first, but I bet it won't make much
 151  *             difference if we are running VLANs.  The good news is that
 152  *             this protocol won't be in the list unless compiled in, so
 153  *             the average user (w/out VLANs) will not be adversely affected.
 154  *             --BLG
 155  *
 156  *              0800    IP
 157  *              8100    802.1Q VLAN
 158  *              0001    802.3
 159  *              0002    AX.25
 160  *              0004    802.2
 161  *              8035    RARP
 162  *              0005    SNAP
 163  *              0805    X.25
 164  *              0806    ARP
 165  *              8137    IPX
 166  *              0009    Localtalk
 167  *              86DD    IPv6
 168  */
 169
 170 #define PTYPE_HASH_SIZE (16)
 171 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)
 172
 173 static DEFINE_SPINLOCK(ptype_lock);
 174 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
 175 static struct list_head ptype_all __read_mostly;        /* Taps */
 176
 177 /*
 178  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 179  * semaphore.
 180  *
 181  * Pure readers hold dev_base_lock for reading, or rcu_read_lock()
 182  *
 183  * Writers must hold the rtnl semaphore while they loop through the
 184  * dev_base_head list, and hold dev_base_lock for writing when they do the
 185  * actual updates.  This allows pure readers to access the list even
 186  * while a writer is preparing to update it.
 187  *
 188  * To put it another way, dev_base_lock is held for writing only to
 189  * protect against pure readers; the rtnl semaphore provides the
 190  * protection against other writers.
 191  *
 192  * See, for example usages, register_netdevice() and
 193  * unregister_netdevice(), which must be called with the rtnl
 194  * semaphore held.
 195  */
 196 DEFINE_RWLOCK(dev_base_lock);
 197 EXPORT_SYMBOL(dev_base_lock);
 198
 199 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 200 {
 201         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 202         return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
 203 }
 204
 205 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 206 {
 207         return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
 208 }
 209
 210 static inline void rps_lock(struct softnet_data *queue)
 211 {
 212 #ifdef CONFIG_RPS
 213         spin_lock(&queue->input_pkt_queue.lock);
 214 #endif
 215 }
 216
 217 static inline void rps_unlock(struct softnet_data *queue)
 218 {
 219 #ifdef CONFIG_RPS
 220         spin_unlock(&queue->input_pkt_queue.lock);
 221 #endif
 222 }
 223
 224 /* Device list insertion */
 225 static int list_netdevice(struct net_device *dev)
 226 {
 227         struct net *net = dev_net(dev);
 228
 229         ASSERT_RTNL();
 230
 231         write_lock_bh(&dev_base_lock);
 232         list_add_tail_rcu(&dev->dev_list, &net->dev_base_head);
 233         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
 234         hlist_add_head_rcu(&dev->index_hlist,
 235                            dev_index_hash(net, dev->ifindex));
 236         write_unlock_bh(&dev_base_lock);
 237         return 0;
 238 }
 239
 240 /* Device list removal
 241  * caller must respect a RCU grace period before freeing/reusing dev
 242  */
 243 static void unlist_netdevice(struct net_device *dev)
 244 {
 245         ASSERT_RTNL();
 246
 247         /* Unlink dev from the device chain */
 248         write_lock_bh(&dev_base_lock);
 249         list_del_rcu(&dev->dev_list);
 250         hlist_del_rcu(&dev->name_hlist);
 251         hlist_del_rcu(&dev->index_hlist);
 252         write_unlock_bh(&dev_base_lock);
 253 }
 254
 255 /*
 256  *      Our notifier list
 257  */
 258
 259 static RAW_NOTIFIER_HEAD(netdev_chain);
 260
 261 /*
 262  *      Device drivers call our routines to queue packets here. We empty the
 263  *      queue in the local softnet handler.
 264  */
 265
 266 DEFINE_PER_CPU(struct softnet_data, softnet_data);
 267 EXPORT_PER_CPU_SYMBOL(softnet_data);
 268
 269 #ifdef CONFIG_LOCKDEP
 270 /*
 271  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 272  * according to dev->type
 273  */
 274 static const unsigned short netdev_lock_type[] =
 275         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 276          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 277          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 278          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 279          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 280          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 281          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 282          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 283          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 284          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 285          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 286          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 287          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 288          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 289          ARPHRD_PHONET_PIPE, ARPHRD_IEEE802154,
 290          ARPHRD_VOID, ARPHRD_NONE};
 291
 292 static const char *const netdev_lock_name[] =
 293         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 294          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 295          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 296          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 297          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 298          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 299          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 300          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 301          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 302          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 303          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 304          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 305          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 306          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 307          "_xmit_PHONET_PIPE", "_xmit_IEEE802154",
 308          "_xmit_VOID", "_xmit_NONE"};
 309
 310 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 311 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 312
 313 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 314 {
 315         int i;
 316
 317         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 318                 if (netdev_lock_type[i] == dev_type)
 319                         return i;
 320         /* the last key is used by default */
 321         return ARRAY_SIZE(netdev_lock_type) - 1;
 322 }
 323
 324 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 325                                                  unsigned short dev_type)
 326 {
 327         int i;
 328
 329         i = netdev_lock_pos(dev_type);
 330         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 331                                    netdev_lock_name[i]);
 332 }
 333
 334 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 335 {
 336         int i;
 337
 338         i = netdev_lock_pos(dev->type);
 339         lockdep_set_class_and_name(&dev->addr_list_lock,
 340                                    &netdev_addr_lock_key[i],
 341                                    netdev_lock_name[i]);
 342 }
 343 #else
 344 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 345                                                  unsigned short dev_type)
 346 {
 347 }
 348 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 349 {
 350 }
 351 #endif
 352
 353 /*******************************************************************************
 354
 355                 Protocol management and registration routines
 356
 357 *******************************************************************************/
 358
 359 /*
 360  *      Add a protocol ID to the list. Now that the input handler is
 361  *      smarter we can dispense with all the messy stuff that used to be
 362  *      here.
 363  *
 364  *      BEWARE!!! Protocol handlers, mangling input packets,
 365  *      MUST BE last in hash buckets and checking protocol handlers
 366  *      MUST start from promiscuous ptype_all chain in net_bh.
 367  *      It is true now, do not change it.
 368  *      Explanation follows: if protocol handler, mangling packet, will
 369  *      be the first on list, it is not able to sense, that packet
 370  *      is cloned and should be copied-on-write, so that it will
 371  *      change it and subsequent readers will get broken packet.
 372  *                                                      --ANK (980803)
 373  */
 374
 375 /**
 376  *      dev_add_pack - add packet handler
 377  *      @pt: packet type declaration
 378  *
 379  *      Add a protocol handler to the networking stack. The passed &packet_type
 380  *      is linked into kernel lists and may not be freed until it has been
 381  *      removed from the kernel lists.
 382  *
 383  *      This call does not sleep therefore it can not
 384  *      guarantee all CPU's that are in middle of receiving packets
 385  *      will see the new packet type (until the next received packet).
 386  */
 387
 388 void dev_add_pack(struct packet_type *pt)
 389 {
 390         int hash;
 391
 392         spin_lock_bh(&ptype_lock);
 393         if (pt->type == htons(ETH_P_ALL))
 394                 list_add_rcu(&pt->list, &ptype_all);
 395         else {
 396                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 397                 list_add_rcu(&pt->list, &ptype_base[hash]);
 398         }
 399         spin_unlock_bh(&ptype_lock);
 400 }
 401 EXPORT_SYMBOL(dev_add_pack);
 402
 403 /**
 404  *      __dev_remove_pack        - remove packet handler
 405  *      @pt: packet type declaration
 406  *
 407  *      Remove a protocol handler that was previously added to the kernel
 408  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 409  *      from the kernel lists and can be freed or reused once this function
 410  *      returns.
 411  *
 412  *      The packet type might still be in use by receivers
 413  *      and must not be freed until after all the CPU's have gone
 414  *      through a quiescent state.
 415  */
 416 void __dev_remove_pack(struct packet_type *pt)
 417 {
 418         struct list_head *head;
 419         struct packet_type *pt1;
 420
 421         spin_lock_bh(&ptype_lock);
 422
 423         if (pt->type == htons(ETH_P_ALL))
 424                 head = &ptype_all;
 425         else
 426                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 427
 428         list_for_each_entry(pt1, head, list) {
 429                 if (pt == pt1) {
 430                         list_del_rcu(&pt->list);
 431                         goto out;
 432                 }
 433         }
 434
 435         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 436 out:
 437         spin_unlock_bh(&ptype_lock);
 438 }
 439 EXPORT_SYMBOL(__dev_remove_pack);
 440
 441 /**
 442  *      dev_remove_pack  - remove packet handler
 443  *      @pt: packet type declaration
 444  *
 445  *      Remove a protocol handler that was previously added to the kernel
 446  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 447  *      from the kernel lists and can be freed or reused once this function
 448  *      returns.
 449  *
 450  *      This call sleeps to guarantee that no CPU is looking at the packet
 451  *      type after return.
 452  */
 453 void dev_remove_pack(struct packet_type *pt)
 454 {
 455         __dev_remove_pack(pt);
 456
 457         synchronize_net();
 458 }
 459 EXPORT_SYMBOL(dev_remove_pack);
 460
 461 /******************************************************************************
 462
 463                       Device Boot-time Settings Routines
 464
 465 *******************************************************************************/
 466
 467 /* Boot time configuration table */
 468 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 469
 470 /**
 471  *      netdev_boot_setup_add   - add new setup entry
 472  *      @name: name of the device
 473  *      @map: configured settings for the device
 474  *
 475  *      Adds new setup entry to the dev_boot_setup list.  The function
 476  *      returns 0 on error and 1 on success.  This is a generic routine to
 477  *      all netdevices.
 478  */
 479 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 480 {
 481         struct netdev_boot_setup *s;
 482         int i;
 483
 484         s = dev_boot_setup;
 485         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 486                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 487                         memset(s[i].name, 0, sizeof(s[i].name));
 488                         strlcpy(s[i].name, name, IFNAMSIZ);
 489                         memcpy(&s[i].map, map, sizeof(s[i].map));
 490                         break;
 491                 }
 492         }
 493
 494         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 495 }
 496
 497 /**
 498  *      netdev_boot_setup_check - check boot time settings
 499  *      @dev: the netdevice
 500  *
 501  *      Check boot time settings for the device.
 502  *      The found settings are set for the device to be used
 503  *      later in the device probing.
 504  *      Returns 0 if no settings found, 1 if they are.
 505  */
 506 int netdev_boot_setup_check(struct net_device *dev)
 507 {
 508         struct netdev_boot_setup *s = dev_boot_setup;
 509         int i;
 510
 511         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 512                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 513                     !strcmp(dev->name, s[i].name)) {
 514                         dev->irq        = s[i].map.irq;
 515                         dev->base_addr  = s[i].map.base_addr;
 516                         dev->mem_start  = s[i].map.mem_start;
 517                         dev->mem_end    = s[i].map.mem_end;
 518                         return 1;
 519                 }
 520         }
 521         return 0;
 522 }
 523 EXPORT_SYMBOL(netdev_boot_setup_check);
 524
 525
 526 /**
 527  *      netdev_boot_base        - get address from boot time settings
 528  *      @prefix: prefix for network device
 529  *      @unit: id for network device
 530  *
 531  *      Check boot time settings for the base address of device.
 532  *      The found settings are set for the device to be used
 533  *      later in the device probing.
 534  *      Returns 0 if no settings found.
 535  */
 536 unsigned long netdev_boot_base(const char *prefix, int unit)
 537 {
 538         const struct netdev_boot_setup *s = dev_boot_setup;
 539         char name[IFNAMSIZ];
 540         int i;
 541
 542         sprintf(name, "%s%d", prefix, unit);
 543
 544         /*
 545          * If device already registered then return base of 1
 546          * to indicate not to probe for this interface
 547          */
 548         if (__dev_get_by_name(&init_net, name))
 549                 return 1;
 550
 551         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 552                 if (!strcmp(name, s[i].name))
 553                         return s[i].map.base_addr;
 554         return 0;
 555 }
 556
 557 /*
 558  * Saves at boot time configured settings for any netdevice.
 559  */
 560 int __init netdev_boot_setup(char *str)
 561 {
 562         int ints[5];
 563         struct ifmap map;
 564
 565         str = get_options(str, ARRAY_SIZE(ints), ints);
 566         if (!str || !*str)
 567                 return 0;
 568
 569         /* Save settings */
 570         memset(&map, 0, sizeof(map));
 571         if (ints[0] > 0)
 572                 map.irq = ints[1];
 573         if (ints[0] > 1)
 574                 map.base_addr = ints[2];
 575         if (ints[0] > 2)
 576                 map.mem_start = ints[3];
 577         if (ints[0] > 3)
 578                 map.mem_end = ints[4];
 579
 580         /* Add new entry to the list */
 581         return netdev_boot_setup_add(str, &map);
 582 }
 583
 584 __setup("netdev=", netdev_boot_setup);
 585
 586 /*******************************************************************************
 587
 588                             Device Interface Subroutines
 589
 590 *******************************************************************************/
 591
 592 /**
 593  *      __dev_get_by_name       - find a device by its name
 594  *      @net: the applicable net namespace
 595  *      @name: name to find
 596  *
 597  *      Find an interface by name. Must be called under RTNL semaphore
 598  *      or @dev_base_lock. If the name is found a pointer to the device
 599  *      is returned. If the name is not found then %NULL is returned. The
 600  *      reference counters are not incremented so the caller must be
 601  *      careful with locks.
 602  */
 603
 604 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 605 {
 606         struct hlist_node *p;
 607         struct net_device *dev;
 608         struct hlist_head *head = dev_name_hash(net, name);
 609
 610         hlist_for_each_entry(dev, p, head, name_hlist)
 611                 if (!strncmp(dev->name, name, IFNAMSIZ))
 612                         return dev;
 613
 614         return NULL;
 615 }
 616 EXPORT_SYMBOL(__dev_get_by_name);
 617
 618 /**
 619  *      dev_get_by_name_rcu     - find a device by its name
 620  *      @net: the applicable net namespace
 621  *      @name: name to find
 622  *
 623  *      Find an interface by name.
 624  *      If the name is found a pointer to the device is returned.
 625  *      If the name is not found then %NULL is returned.
 626  *      The reference counters are not incremented so the caller must be
 627  *      careful with locks. The caller must hold RCU lock.
 628  */
 629
 630 struct net_device *dev_get_by_name_rcu(struct net *net, const char *name)
 631 {
 632         struct hlist_node *p;
 633         struct net_device *dev;
 634         struct hlist_head *head = dev_name_hash(net, name);
 635
 636         hlist_for_each_entry_rcu(dev, p, head, name_hlist)
 637                 if (!strncmp(dev->name, name, IFNAMSIZ))
 638                         return dev;
 639
 640         return NULL;
 641 }
 642 EXPORT_SYMBOL(dev_get_by_name_rcu);
 643
 644 /**
 645  *      dev_get_by_name         - find a device by its name
 646  *      @net: the applicable net namespace
 647  *      @name: name to find
 648  *
 649  *      Find an interface by name. This can be called from any
 650  *      context and does its own locking. The returned handle has
 651  *      the usage count incremented and the caller must use dev_put() to
 652  *      release it when it is no longer needed. %NULL is returned if no
 653  *      matching device is found.
 654  */
 655
 656 struct net_device *dev_get_by_name(struct net *net, const char *name)
 657 {
 658         struct net_device *dev;
 659
 660         rcu_read_lock();
 661         dev = dev_get_by_name_rcu(net, name);
 662         if (dev)
 663                 dev_hold(dev);
 664         rcu_read_unlock();
 665         return dev;
 666 }
 667 EXPORT_SYMBOL(dev_get_by_name);
 668
 669 /**
 670  *      __dev_get_by_index - find a device by its ifindex
 671  *      @net: the applicable net namespace
 672  *      @ifindex: index of device
 673  *
 674  *      Search for an interface by index. Returns %NULL if the device
 675  *      is not found or a pointer to the device. The device has not
 676  *      had its reference counter increased so the caller must be careful
 677  *      about locking. The caller must hold either the RTNL semaphore
 678  *      or @dev_base_lock.
 679  */
 680
 681 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 682 {
 683         struct hlist_node *p;
 684         struct net_device *dev;
 685         struct hlist_head *head = dev_index_hash(net, ifindex);
 686
 687         hlist_for_each_entry(dev, p, head, index_hlist)
 688                 if (dev->ifindex == ifindex)
 689                         return dev;
 690
 691         return NULL;
 692 }
 693 EXPORT_SYMBOL(__dev_get_by_index);
 694
 695 /**
 696  *      dev_get_by_index_rcu - find a device by its ifindex
 697  *      @net: the applicable net namespace
 698  *      @ifindex: index of device
 699  *
 700  *      Search for an interface by index. Returns %NULL if the device
 701  *      is not found or a pointer to the device. The device has not
 702  *      had its reference counter increased so the caller must be careful
 703  *      about locking. The caller must hold RCU lock.
 704  */
 705
 706 struct net_device *dev_get_by_index_rcu(struct net *net, int ifindex)
 707 {
 708         struct hlist_node *p;
 709         struct net_device *dev;
 710         struct hlist_head *head = dev_index_hash(net, ifindex);
 711
 712         hlist_for_each_entry_rcu(dev, p, head, index_hlist)
 713                 if (dev->ifindex == ifindex)
 714                         return dev;
 715
 716         return NULL;
 717 }
 718 EXPORT_SYMBOL(dev_get_by_index_rcu);
 719
 720
 721 /**
 722  *      dev_get_by_index - find a device by its ifindex
 723  *      @net: the applicable net namespace
 724  *      @ifindex: index of device
 725  *
 726  *      Search for an interface by index. Returns NULL if the device
 727  *      is not found or a pointer to the device. The device returned has
 728  *      had a reference added and the pointer is safe until the user calls
 729  *      dev_put to indicate they have finished with it.
 730  */
 731
 732 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 733 {
 734         struct net_device *dev;
 735
 736         rcu_read_lock();
 737         dev = dev_get_by_index_rcu(net, ifindex);
 738         if (dev)
 739                 dev_hold(dev);
 740         rcu_read_unlock();
 741         return dev;
 742 }
 743 EXPORT_SYMBOL(dev_get_by_index);
 744
 745 /**
 746  *      dev_getbyhwaddr - find a device by its hardware address
 747  *      @net: the applicable net namespace
 748  *      @type: media type of device
 749  *      @ha: hardware address
 750  *
 751  *      Search for an interface by MAC address. Returns NULL if the device
 752  *      is not found or a pointer to the device. The caller must hold the
 753  *      rtnl semaphore. The returned device has not had its ref count increased
 754  *      and the caller must therefore be careful about locking
 755  *
 756  *      BUGS:
 757  *      If the API was consistent this would be __dev_get_by_hwaddr
 758  */
 759
 760 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 761 {
 762         struct net_device *dev;
 763
 764         ASSERT_RTNL();
 765
 766         for_each_netdev(net, dev)
 767                 if (dev->type == type &&
 768                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 769                         return dev;
 770
 771         return NULL;
 772 }
 773 EXPORT_SYMBOL(dev_getbyhwaddr);
 774
 775 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 776 {
 777         struct net_device *dev;
 778
 779         ASSERT_RTNL();
 780         for_each_netdev(net, dev)
 781                 if (dev->type == type)
 782                         return dev;
 783
 784         return NULL;
 785 }
 786 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 787
 788 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 789 {
 790         struct net_device *dev, *ret = NULL;
 791
 792         rcu_read_lock();
 793         for_each_netdev_rcu(net, dev)
 794                 if (dev->type == type) {
 795                         dev_hold(dev);
 796                         ret = dev;
 797                         break;
 798                 }
 799         rcu_read_unlock();
 800         return ret;
 801 }
 802 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 803
 804 /**
 805  *      dev_get_by_flags - find any device with given flags
 806  *      @net: the applicable net namespace
 807  *      @if_flags: IFF_* values
 808  *      @mask: bitmask of bits in if_flags to check
 809  *
 810  *      Search for any interface with the given flags. Returns NULL if a device
 811  *      is not found or a pointer to the device. The device returned has
 812  *      had a reference added and the pointer is safe until the user calls
 813  *      dev_put to indicate they have finished with it.
 814  */
 815
 816 struct net_device *dev_get_by_flags(struct net *net, unsigned short if_flags,
 817                                     unsigned short mask)
 818 {
 819         struct net_device *dev, *ret;
 820
 821         ret = NULL;
 822         rcu_read_lock();
 823         for_each_netdev_rcu(net, dev) {
 824                 if (((dev->flags ^ if_flags) & mask) == 0) {
 825                         dev_hold(dev);
 826                         ret = dev;
 827                         break;
 828                 }
 829         }
 830         rcu_read_unlock();
 831         return ret;
 832 }
 833 EXPORT_SYMBOL(dev_get_by_flags);
 834
 835 /**
 836  *      dev_valid_name - check if name is okay for network device
 837  *      @name: name string
 838  *
 839  *      Network device names need to be valid file names to
 840  *      to allow sysfs to work.  We also disallow any kind of
 841  *      whitespace.
 842  */
 843 int dev_valid_name(const char *name)
 844 {
 845         if (*name == '\0')
 846                 return 0;
 847         if (strlen(name) >= IFNAMSIZ)
 848                 return 0;
 849         if (!strcmp(name, ".") || !strcmp(name, ".."))
 850                 return 0;
 851
 852         while (*name) {
 853                 if (*name == '/' || isspace(*name))
 854                         return 0;
 855                 name++;
 856         }
 857         return 1;
 858 }
 859 EXPORT_SYMBOL(dev_valid_name);
 860
 861 /**
 862  *      __dev_alloc_name - allocate a name for a device
 863  *      @net: network namespace to allocate the device name in
 864  *      @name: name format string
 865  *      @buf:  scratch buffer and result name string
 866  *
 867  *      Passed a format string - eg "lt%d" it will try and find a suitable
 868  *      id. It scans list of devices to build up a free map, then chooses
 869  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 870  *      while allocating the name and adding the device in order to avoid
 871  *      duplicates.
 872  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 873  *      Returns the number of the unit assigned or a negative errno code.
 874  */
 875
 876 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 877 {
 878         int i = 0;
 879         const char *p;
 880         const int max_netdevices = 8*PAGE_SIZE;
 881         unsigned long *inuse;
 882         struct net_device *d;
 883
 884         p = strnchr(name, IFNAMSIZ-1, '%');
 885         if (p) {
 886                 /*
 887                  * Verify the string as this thing may have come from
 888                  * the user.  There must be either one "%d" and no other "%"
 889                  * characters.
 890                  */
 891                 if (p[1] != 'd' || strchr(p + 2, '%'))
 892                         return -EINVAL;
 893
 894                 /* Use one page as a bit array of possible slots */
 895                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 896                 if (!inuse)
 897                         return -ENOMEM;
 898
 899                 for_each_netdev(net, d) {
 900                         if (!sscanf(d->name, name, &i))
 901                                 continue;
 902                         if (i < 0 || i >= max_netdevices)
 903                                 continue;
 904
 905                         /*  avoid cases where sscanf is not exact inverse of printf */
 906                         snprintf(buf, IFNAMSIZ, name, i);
 907                         if (!strncmp(buf, d->name, IFNAMSIZ))
 908                                 set_bit(i, inuse);
 909                 }
 910
 911                 i = find_first_zero_bit(inuse, max_netdevices);
 912                 free_page((unsigned long) inuse);
 913         }
 914
 915         if (buf != name)
 916                 snprintf(buf, IFNAMSIZ, name, i);
 917         if (!__dev_get_by_name(net, buf))
 918                 return i;
 919
 920         /* It is possible to run out of possible slots
 921          * when the name is long and there isn't enough space left
 922          * for the digits, or if all bits are used.
 923          */
 924         return -ENFILE;
 925 }
 926
 927 /**
 928  *      dev_alloc_name - allocate a name for a device
 929  *      @dev: device
 930  *      @name: name format string
 931  *
 932  *      Passed a format string - eg "lt%d" it will try and find a suitable
 933  *      id. It scans list of devices to build up a free map, then chooses
 934  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 935  *      while allocating the name and adding the device in order to avoid
 936  *      duplicates.
 937  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 938  *      Returns the number of the unit assigned or a negative errno code.
 939  */
 940
 941 int dev_alloc_name(struct net_device *dev, const char *name)
 942 {
 943         char buf[IFNAMSIZ];
 944         struct net *net;
 945         int ret;
 946
 947         BUG_ON(!dev_net(dev));
 948         net = dev_net(dev);
 949         ret = __dev_alloc_name(net, name, buf);
 950         if (ret >= 0)
 951                 strlcpy(dev->name, buf, IFNAMSIZ);
 952         return ret;
 953 }
 954 EXPORT_SYMBOL(dev_alloc_name);
 955
 956 static int dev_get_valid_name(struct net *net, const char *name, char *buf,
 957                               bool fmt)
 958 {
 959         if (!dev_valid_name(name))
 960                 return -EINVAL;
 961
 962         if (fmt && strchr(name, '%'))
 963                 return __dev_alloc_name(net, name, buf);
 964         else if (__dev_get_by_name(net, name))
 965                 return -EEXIST;
 966         else if (buf != name)
 967                 strlcpy(buf, name, IFNAMSIZ);
 968
 969         return 0;
 970 }
 971
 972 /**
 973  *      dev_change_name - change name of a device
 974  *      @dev: device
 975  *      @newname: name (or format string) must be at least IFNAMSIZ
 976  *
 977  *      Change name of a device, can pass format strings "eth%d".
 978  *      for wildcarding.
 979  */
 980 int dev_change_name(struct net_device *dev, const char *newname)
 981 {
 982         char oldname[IFNAMSIZ];
 983         int err = 0;
 984         int ret;
 985         struct net *net;
 986
 987         ASSERT_RTNL();
 988         BUG_ON(!dev_net(dev));
 989
 990         net = dev_net(dev);
 991         if (dev->flags & IFF_UP)
 992                 return -EBUSY;
 993
 994         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 995                 return 0;
 996
 997         memcpy(oldname, dev->name, IFNAMSIZ);
 998
 999         err = dev_get_valid_name(net, newname, dev->name, 1);
1000         if (err < 0)
1001                 return err;
1002
1003 rollback:
1004         /* For now only devices in the initial network namespace
1005          * are in sysfs.
1006          */
1007         if (net_eq(net, &init_net)) {
1008                 ret = device_rename(&dev->dev, dev->name);
1009                 if (ret) {
1010                         memcpy(dev->name, oldname, IFNAMSIZ);
1011                         return ret;
1012                 }
1013         }
1014
1015         write_lock_bh(&dev_base_lock);
1016         hlist_del(&dev->name_hlist);
1017         write_unlock_bh(&dev_base_lock);
1018
1019         synchronize_rcu();
1020
1021         write_lock_bh(&dev_base_lock);
1022         hlist_add_head_rcu(&dev->name_hlist, dev_name_hash(net, dev->name));
1023         write_unlock_bh(&dev_base_lock);
1024
1025         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
1026         ret = notifier_to_errno(ret);
1027
1028         if (ret) {
1029                 /* err >= 0 after dev_alloc_name() or stores the first errno */
1030                 if (err >= 0) {
1031                         err = ret;
1032                         memcpy(dev->name, oldname, IFNAMSIZ);
1033                         goto rollback;
1034                 } else {
1035                         printk(KERN_ERR
1036                                "%s: name change rollback failed: %d.\n",
1037                                dev->name, ret);
1038                 }
1039         }
1040
1041         return err;
1042 }
1043
1044 /**
1045  *      dev_set_alias - change ifalias of a device
1046  *      @dev: device
1047  *      @alias: name up to IFALIASZ
1048  *      @len: limit of bytes to copy from info
1049  *
1050  *      Set ifalias for a device,
1051  */
1052 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
1053 {
1054         ASSERT_RTNL();
1055
1056         if (len >= IFALIASZ)
1057                 return -EINVAL;
1058
1059         if (!len) {
1060                 if (dev->ifalias) {
1061                         kfree(dev->ifalias);
1062                         dev->ifalias = NULL;
1063                 }
1064                 return 0;
1065         }
1066
1067         dev->ifalias = krealloc(dev->ifalias, len + 1, GFP_KERNEL);
1068         if (!dev->ifalias)
1069                 return -ENOMEM;
1070
1071         strlcpy(dev->ifalias, alias, len+1);
1072         return len;
1073 }
1074
1075
1076 /**
1077  *      netdev_features_change - device changes features
1078  *      @dev: device to cause notification
1079  *
1080  *      Called to indicate a device has changed features.
1081  */
1082 void netdev_features_change(struct net_device *dev)
1083 {
1084         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
1085 }
1086 EXPORT_SYMBOL(netdev_features_change);
1087
1088 /**
1089  *      netdev_state_change - device changes state
1090  *      @dev: device to cause notification
1091  *
1092  *      Called to indicate a device has changed state. This function calls
1093  *      the notifier chains for netdev_chain and sends a NEWLINK message
1094  *      to the routing socket.
1095  */
1096 void netdev_state_change(struct net_device *dev)
1097 {
1098         if (dev->flags & IFF_UP) {
1099                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1100                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1101         }
1102 }
1103 EXPORT_SYMBOL(netdev_state_change);
1104
1105 int netdev_bonding_change(struct net_device *dev, unsigned long event)
1106 {
1107         return call_netdevice_notifiers(event, dev);
1108 }
1109 EXPORT_SYMBOL(netdev_bonding_change);
1110
1111 /**
1112  *      dev_load        - load a network module
1113  *      @net: the applicable net namespace
1114  *      @name: name of interface
1115  *
1116  *      If a network interface is not present and the process has suitable
1117  *      privileges this function loads the module. If module loading is not
1118  *      available in this kernel then it becomes a nop.
1119  */
1120
1121 void dev_load(struct net *net, const char *name)
1122 {
1123         struct net_device *dev;
1124
1125         rcu_read_lock();
1126         dev = dev_get_by_name_rcu(net, name);
1127         rcu_read_unlock();
1128
1129         if (!dev && capable(CAP_NET_ADMIN))
1130                 request_module("%s", name);
1131 }
1132 EXPORT_SYMBOL(dev_load);
1133
1134 static int __dev_open(struct net_device *dev)
1135 {
1136         const struct net_device_ops *ops = dev->netdev_ops;
1137         int ret;
1138
1139         ASSERT_RTNL();
1140
1141         /*
1142          *      Is it even present?
1143          */
1144         if (!netif_device_present(dev))
1145                 return -ENODEV;
1146
1147         ret = call_netdevice_notifiers(NETDEV_PRE_UP, dev);
1148         ret = notifier_to_errno(ret);
1149         if (ret)
1150                 return ret;
1151
1152         /*
1153          *      Call device private open method
1154          */
1155         set_bit(__LINK_STATE_START, &dev->state);
1156
1157         if (ops->ndo_validate_addr)
1158                 ret = ops->ndo_validate_addr(dev);
1159
1160         if (!ret && ops->ndo_open)
1161                 ret = ops->ndo_open(dev);
1162
1163         /*
1164          *      If it went open OK then:
1165          */
1166
1167         if (ret)
1168                 clear_bit(__LINK_STATE_START, &dev->state);
1169         else {
1170                 /*
1171                  *      Set the flags.
1172                  */
1173                 dev->flags |= IFF_UP;
1174
1175                 /*
1176                  *      Enable NET_DMA
1177                  */
1178                 net_dmaengine_get();
1179
1180                 /*
1181                  *      Initialize multicasting status
1182                  */
1183                 dev_set_rx_mode(dev);
1184
1185                 /*
1186                  *      Wakeup transmit queue engine
1187                  */
1188                 dev_activate(dev);
1189         }
1190
1191         return ret;
1192 }
1193
1194 /**
1195  *      dev_open        - prepare an interface for use.
1196  *      @dev:   device to open
1197  *
1198  *      Takes a device from down to up state. The device's private open
1199  *      function is invoked and then the multicast lists are loaded. Finally
1200  *      the device is moved into the up state and a %NETDEV_UP message is
1201  *      sent to the netdev notifier chain.
1202  *
1203  *      Calling this function on an active interface is a nop. On a failure
1204  *      a negative errno code is returned.
1205  */
1206 int dev_open(struct net_device *dev)
1207 {
1208         int ret;
1209
1210         /*
1211          *      Is it already up?
1212          */
1213         if (dev->flags & IFF_UP)
1214                 return 0;
1215
1216         /*
1217          *      Open device
1218          */
1219         ret = __dev_open(dev);
1220         if (ret < 0)
1221                 return ret;
1222
1223         /*
1224          *      ... and announce new interface.
1225          */
1226         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1227         call_netdevice_notifiers(NETDEV_UP, dev);
1228
1229         return ret;
1230 }
1231 EXPORT_SYMBOL(dev_open);
1232
1233 static int __dev_close(struct net_device *dev)
1234 {
1235         const struct net_device_ops *ops = dev->netdev_ops;
1236
1237         ASSERT_RTNL();
1238         might_sleep();
1239
1240         /*
1241          *      Tell people we are going down, so that they can
1242          *      prepare to death, when device is still operating.
1243          */
1244         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1245
1246         clear_bit(__LINK_STATE_START, &dev->state);
1247
1248         /* Synchronize to scheduled poll. We cannot touch poll list,
1249          * it can be even on different cpu. So just clear netif_running().
1250          *
1251          * dev->stop() will invoke napi_disable() on all of it's
1252          * napi_struct instances on this device.
1253          */
1254         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1255
1256         dev_deactivate(dev);
1257
1258         /*
1259          *      Call the device specific close. This cannot fail.
1260          *      Only if device is UP
1261          *
1262          *      We allow it to be called even after a DETACH hot-plug
1263          *      event.
1264          */
1265         if (ops->ndo_stop)
1266                 ops->ndo_stop(dev);
1267
1268         /*
1269          *      Device is now down.
1270          */
1271
1272         dev->flags &= ~IFF_UP;
1273
1274         /*
1275          *      Shutdown NET_DMA
1276          */
1277         net_dmaengine_put();
1278
1279         return 0;
1280 }
1281
1282 /**
1283  *      dev_close - shutdown an interface.
1284  *      @dev: device to shutdown
1285  *
1286  *      This function moves an active device into down state. A
1287  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1288  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1289  *      chain.
1290  */
1291 int dev_close(struct net_device *dev)
1292 {
1293         if (!(dev->flags & IFF_UP))
1294                 return 0;
1295
1296         __dev_close(dev);
1297
1298         /*
1299          * Tell people we are down
1300          */
1301         rtmsg_ifinfo(RTM_NEWLINK, dev, IFF_UP|IFF_RUNNING);
1302         call_netdevice_notifiers(NETDEV_DOWN, dev);
1303
1304         return 0;
1305 }
1306 EXPORT_SYMBOL(dev_close);
1307
1308
1309 /**
1310  *      dev_disable_lro - disable Large Receive Offload on a device
1311  *      @dev: device
1312  *
1313  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1314  *      called under RTNL.  This is needed if received packets may be
1315  *      forwarded to another interface.
1316  */
1317 void dev_disable_lro(struct net_device *dev)
1318 {
1319         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1320             dev->ethtool_ops->set_flags) {
1321                 u32 flags = dev->ethtool_ops->get_flags(dev);
1322                 if (flags & ETH_FLAG_LRO) {
1323                         flags &= ~ETH_FLAG_LRO;
1324                         dev->ethtool_ops->set_flags(dev, flags);
1325                 }
1326         }
1327         WARN_ON(dev->features & NETIF_F_LRO);
1328 }
1329 EXPORT_SYMBOL(dev_disable_lro);
1330
1331
1332 static int dev_boot_phase = 1;
1333
1334 /*
1335  *      Device change register/unregister. These are not inline or static
1336  *      as we export them to the world.
1337  */
1338
1339 /**
1340  *      register_netdevice_notifier - register a network notifier block
1341  *      @nb: notifier
1342  *
1343  *      Register a notifier to be called when network device events occur.
1344  *      The notifier passed is linked into the kernel structures and must
1345  *      not be reused until it has been unregistered. A negative errno code
1346  *      is returned on a failure.
1347  *
1348  *      When registered all registration and up events are replayed
1349  *      to the new notifier to allow device to have a race free
1350  *      view of the network device list.
1351  */
1352
1353 int register_netdevice_notifier(struct notifier_block *nb)
1354 {
1355         struct net_device *dev;
1356         struct net_device *last;
1357         struct net *net;
1358         int err;
1359
1360         rtnl_lock();
1361         err = raw_notifier_chain_register(&netdev_chain, nb);
1362         if (err)
1363                 goto unlock;
1364         if (dev_boot_phase)
1365                 goto unlock;
1366         for_each_net(net) {
1367                 for_each_netdev(net, dev) {
1368                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1369                         err = notifier_to_errno(err);
1370                         if (err)
1371                                 goto rollback;
1372
1373                         if (!(dev->flags & IFF_UP))
1374                                 continue;
1375
1376                         nb->notifier_call(nb, NETDEV_UP, dev);
1377                 }
1378         }
1379
1380 unlock:
1381         rtnl_unlock();
1382         return err;
1383
1384 rollback:
1385         last = dev;
1386         for_each_net(net) {
1387                 for_each_netdev(net, dev) {
1388                         if (dev == last)
1389                                 break;
1390
1391                         if (dev->flags & IFF_UP) {
1392                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1393                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1394                         }
1395                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1396                         nb->notifier_call(nb, NETDEV_UNREGISTER_BATCH, dev);
1397                 }
1398         }
1399
1400         raw_notifier_chain_unregister(&netdev_chain, nb);
1401         goto unlock;
1402 }
1403 EXPORT_SYMBOL(register_netdevice_notifier);
1404
1405 /**
1406  *      unregister_netdevice_notifier - unregister a network notifier block
1407  *      @nb: notifier
1408  *
1409  *      Unregister a notifier previously registered by
1410  *      register_netdevice_notifier(). The notifier is unlinked into the
1411  *      kernel structures and may then be reused. A negative errno code
1412  *      is returned on a failure.
1413  */
1414
1415 int unregister_netdevice_notifier(struct notifier_block *nb)
1416 {
1417         int err;
1418
1419         rtnl_lock();
1420         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1421         rtnl_unlock();
1422         return err;
1423 }
1424 EXPORT_SYMBOL(unregister_netdevice_notifier);
1425
1426 /**
1427  *      call_netdevice_notifiers - call all network notifier blocks
1428  *      @val: value passed unmodified to notifier function
1429  *      @dev: net_device pointer passed unmodified to notifier function
1430  *
1431  *      Call all network notifier blocks.  Parameters and return value
1432  *      are as for raw_notifier_call_chain().
1433  */
1434
1435 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1436 {
1437         return raw_notifier_call_chain(&netdev_chain, val, dev);
1438 }
1439
1440 /* When > 0 there are consumers of rx skb time stamps */
1441 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1442
1443 void net_enable_timestamp(void)
1444 {
1445         atomic_inc(&netstamp_needed);
1446 }
1447 EXPORT_SYMBOL(net_enable_timestamp);
1448
1449 void net_disable_timestamp(void)
1450 {
1451         atomic_dec(&netstamp_needed);
1452 }
1453 EXPORT_SYMBOL(net_disable_timestamp);
1454
1455 static inline void net_timestamp(struct sk_buff *skb)
1456 {
1457         if (atomic_read(&netstamp_needed))
1458                 __net_timestamp(skb);
1459         else
1460                 skb->tstamp.tv64 = 0;
1461 }
1462
1463 /**
1464  * dev_forward_skb - loopback an skb to another netif
1465  *
1466  * @dev: destination network device
1467  * @skb: buffer to forward
1468  *
1469  * return values:
1470  *      NET_RX_SUCCESS  (no congestion)
1471  *      NET_RX_DROP     (packet was dropped)
1472  *
1473  * dev_forward_skb can be used for injecting an skb from the
1474  * start_xmit function of one device into the receive queue
1475  * of another device.
1476  *
1477  * The receiving device may be in another namespace, so
1478  * we have to clear all information in the skb that could
1479  * impact namespace isolation.
1480  */
1481 int dev_forward_skb(struct net_device *dev, struct sk_buff *skb)
1482 {
1483         skb_orphan(skb);
1484
1485         if (!(dev->flags & IFF_UP))
1486                 return NET_RX_DROP;
1487
1488         if (skb->len > (dev->mtu + dev->hard_header_len))
1489                 return NET_RX_DROP;
1490
1491         skb_set_dev(skb, dev);
1492         skb->tstamp.tv64 = 0;
1493         skb->pkt_type = PACKET_HOST;
1494         skb->protocol = eth_type_trans(skb, dev);
1495         return netif_rx(skb);
1496 }
1497 EXPORT_SYMBOL_GPL(dev_forward_skb);
1498
1499 /*
1500  *      Support routine. Sends outgoing frames to any network
1501  *      taps currently in use.
1502  */
1503
1504 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1505 {
1506         struct packet_type *ptype;
1507
1508 #ifdef CONFIG_NET_CLS_ACT
1509         if (!(skb->tstamp.tv64 && (G_TC_FROM(skb->tc_verd) & AT_INGRESS)))
1510                 net_timestamp(skb);
1511 #else
1512         net_timestamp(skb);
1513 #endif
1514
1515         rcu_read_lock();
1516         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1517                 /* Never send packets back to the socket
1518                  * they originated from - MvS (miquels@drinkel.ow.org)
1519                  */
1520                 if ((ptype->dev == dev || !ptype->dev) &&
1521                     (ptype->af_packet_priv == NULL ||
1522                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1523                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1524                         if (!skb2)
1525                                 break;
1526
1527                         /* skb->nh should be correctly
1528                            set by sender, so that the second statement is
1529                            just protection against buggy protocols.
1530                          */
1531                         skb_reset_mac_header(skb2);
1532
1533                         if (skb_network_header(skb2) < skb2->data ||
1534                             skb2->network_header > skb2->tail) {
1535                                 if (net_ratelimit())
1536                                         printk(KERN_CRIT "protocol %04x is "
1537                                                "buggy, dev %s\n",
1538                                                skb2->protocol, dev->name);
1539                                 skb_reset_network_header(skb2);
1540                         }
1541
1542                         skb2->transport_header = skb2->network_header;
1543                         skb2->pkt_type = PACKET_OUTGOING;
1544                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1545                 }
1546         }
1547         rcu_read_unlock();
1548 }
1549
1550
1551 static inline void __netif_reschedule(struct Qdisc *q)
1552 {
1553         struct softnet_data *sd;
1554         unsigned long flags;
1555
1556         local_irq_save(flags);
1557         sd = &__get_cpu_var(softnet_data);
1558         q->next_sched = sd->output_queue;
1559         sd->output_queue = q;
1560         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1561         local_irq_restore(flags);
1562 }
1563
1564 void __netif_schedule(struct Qdisc *q)
1565 {
1566         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1567                 __netif_reschedule(q);
1568 }
1569 EXPORT_SYMBOL(__netif_schedule);
1570
1571 void dev_kfree_skb_irq(struct sk_buff *skb)
1572 {
1573         if (atomic_dec_and_test(&skb->users)) {
1574                 struct softnet_data *sd;
1575                 unsigned long flags;
1576
1577                 local_irq_save(flags);
1578                 sd = &__get_cpu_var(softnet_data);
1579                 skb->next = sd->completion_queue;
1580                 sd->completion_queue = skb;
1581                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1582                 local_irq_restore(flags);
1583         }
1584 }
1585 EXPORT_SYMBOL(dev_kfree_skb_irq);
1586
1587 void dev_kfree_skb_any(struct sk_buff *skb)
1588 {
1589         if (in_irq() || irqs_disabled())
1590                 dev_kfree_skb_irq(skb);
1591         else
1592                 dev_kfree_skb(skb);
1593 }
1594 EXPORT_SYMBOL(dev_kfree_skb_any);
1595
1596
1597 /**
1598  * netif_device_detach - mark device as removed
1599  * @dev: network device
1600  *
1601  * Mark device as removed from system and therefore no longer available.
1602  */
1603 void netif_device_detach(struct net_device *dev)
1604 {
1605         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1606             netif_running(dev)) {
1607                 netif_tx_stop_all_queues(dev);
1608         }
1609 }
1610 EXPORT_SYMBOL(netif_device_detach);
1611
1612 /**
1613  * netif_device_attach - mark device as attached
1614  * @dev: network device
1615  *
1616  * Mark device as attached from system and restart if needed.
1617  */
1618 void netif_device_attach(struct net_device *dev)
1619 {
1620         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1621             netif_running(dev)) {
1622                 netif_tx_wake_all_queues(dev);
1623                 __netdev_watchdog_up(dev);
1624         }
1625 }
1626 EXPORT_SYMBOL(netif_device_attach);
1627
1628 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1629 {
1630         return ((features & NETIF_F_GEN_CSUM) ||
1631                 ((features & NETIF_F_IP_CSUM) &&
1632                  protocol == htons(ETH_P_IP)) ||
1633                 ((features & NETIF_F_IPV6_CSUM) &&
1634                  protocol == htons(ETH_P_IPV6)) ||
1635                 ((features & NETIF_F_FCOE_CRC) &&
1636                  protocol == htons(ETH_P_FCOE)));
1637 }
1638
1639 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1640 {
1641         if (can_checksum_protocol(dev->features, skb->protocol))
1642                 return true;
1643
1644         if (skb->protocol == htons(ETH_P_8021Q)) {
1645                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1646                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1647                                           veh->h_vlan_encapsulated_proto))
1648                         return true;
1649         }
1650
1651         return false;
1652 }
1653
1654 /**
1655  * skb_dev_set -- assign a new device to a buffer
1656  * @skb: buffer for the new device
1657  * @dev: network device
1658  *
1659  * If an skb is owned by a device already, we have to reset
1660  * all data private to the namespace a device belongs to
1661  * before assigning it a new device.
1662  */
1663 #ifdef CONFIG_NET_NS
1664 void skb_set_dev(struct sk_buff *skb, struct net_device *dev)
1665 {
1666         skb_dst_drop(skb);
1667         if (skb->dev && !net_eq(dev_net(skb->dev), dev_net(dev))) {
1668                 secpath_reset(skb);
1669                 nf_reset(skb);
1670                 skb_init_secmark(skb);
1671                 skb->mark = 0;
1672                 skb->priority = 0;
1673                 skb->nf_trace = 0;
1674                 skb->ipvs_property = 0;
1675 #ifdef CONFIG_NET_SCHED
1676                 skb->tc_index = 0;
1677 #endif
1678         }
1679         skb->dev = dev;
1680 }
1681 EXPORT_SYMBOL(skb_set_dev);
1682 #endif /* CONFIG_NET_NS */
1683
1684 /*
1685  * Invalidate hardware checksum when packet is to be mangled, and
1686  * complete checksum manually on outgoing path.
1687  */
1688 int skb_checksum_help(struct sk_buff *skb)
1689 {
1690         __wsum csum;
1691         int ret = 0, offset;
1692
1693         if (skb->ip_summed == CHECKSUM_COMPLETE)
1694                 goto out_set_summed;
1695
1696         if (unlikely(skb_shinfo(skb)->gso_size)) {
1697                 /* Let GSO fix up the checksum. */
1698                 goto out_set_summed;
1699         }
1700
1701         offset = skb->csum_start - skb_headroom(skb);
1702         BUG_ON(offset >= skb_headlen(skb));
1703         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1704
1705         offset += skb->csum_offset;
1706         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1707
1708         if (skb_cloned(skb) &&
1709             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1710                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1711                 if (ret)
1712                         goto out;
1713         }
1714
1715         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1716 out_set_summed:
1717         skb->ip_summed = CHECKSUM_NONE;
1718 out:
1719         return ret;
1720 }
1721 EXPORT_SYMBOL(skb_checksum_help);
1722
1723 /**
1724  *      skb_gso_segment - Perform segmentation on skb.
1725  *      @skb: buffer to segment
1726  *      @features: features for the output path (see dev->features)
1727  *
1728  *      This function segments the given skb and returns a list of segments.
1729  *
1730  *      It may return NULL if the skb requires no segmentation.  This is
1731  *      only possible when GSO is used for verifying header integrity.
1732  */
1733 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1734 {
1735         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1736         struct packet_type *ptype;
1737         __be16 type = skb->protocol;
1738         int err;
1739
1740         skb_reset_mac_header(skb);
1741         skb->mac_len = skb->network_header - skb->mac_header;
1742         __skb_pull(skb, skb->mac_len);
1743
1744         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1745                 struct net_device *dev = skb->dev;
1746                 struct ethtool_drvinfo info = {};
1747
1748                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1749                         dev->ethtool_ops->get_drvinfo(dev, &info);
1750
1751                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1752                         "ip_summed=%d",
1753                      info.driver, dev ? dev->features : 0L,
1754                      skb->sk ? skb->sk->sk_route_caps : 0L,
1755                      skb->len, skb->data_len, skb->ip_summed);
1756
1757                 if (skb_header_cloned(skb) &&
1758                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1759                         return ERR_PTR(err);
1760         }
1761
1762         rcu_read_lock();
1763         list_for_each_entry_rcu(ptype,
1764                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1765                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1766                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1767                                 err = ptype->gso_send_check(skb);
1768                                 segs = ERR_PTR(err);
1769                                 if (err || skb_gso_ok(skb, features))
1770                                         break;
1771                                 __skb_push(skb, (skb->data -
1772                                                  skb_network_header(skb)));
1773                         }
1774                         segs = ptype->gso_segment(skb, features);
1775                         break;
1776                 }
1777         }
1778         rcu_read_unlock();
1779
1780         __skb_push(skb, skb->data - skb_mac_header(skb));
1781
1782         return segs;
1783 }
1784 EXPORT_SYMBOL(skb_gso_segment);
1785
1786 /* Take action when hardware reception checksum errors are detected. */
1787 #ifdef CONFIG_BUG
1788 void netdev_rx_csum_fault(struct net_device *dev)
1789 {
1790         if (net_ratelimit()) {
1791                 printk(KERN_ERR "%s: hw csum failure.\n",
1792                         dev ? dev->name : "<unknown>");
1793                 dump_stack();
1794         }
1795 }
1796 EXPORT_SYMBOL(netdev_rx_csum_fault);
1797 #endif
1798
1799 /* Actually, we should eliminate this check as soon as we know, that:
1800  * 1. IOMMU is present and allows to map all the memory.
1801  * 2. No high memory really exists on this machine.
1802  */
1803
1804 static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
1805 {
1806 #ifdef CONFIG_HIGHMEM
1807         int i;
1808         if (!(dev->features & NETIF_F_HIGHDMA)) {
1809                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
1810                         if (PageHighMem(skb_shinfo(skb)->frags[i].page))
1811                                 return 1;
1812         }
1813
1814         if (PCI_DMA_BUS_IS_PHYS) {
1815                 struct device *pdev = dev->dev.parent;
1816
1817                 for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
1818                         dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
1819                         if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
1820                                 return 1;
1821                 }
1822         }
1823 #endif
1824         return 0;
1825 }
1826
1827 struct dev_gso_cb {
1828         void (*destructor)(struct sk_buff *skb);
1829 };
1830
1831 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1832
1833 static void dev_gso_skb_destructor(struct sk_buff *skb)
1834 {
1835         struct dev_gso_cb *cb;
1836
1837         do {
1838                 struct sk_buff *nskb = skb->next;
1839
1840                 skb->next = nskb->next;
1841                 nskb->next = NULL;
1842                 kfree_skb(nskb);
1843         } while (skb->next);
1844
1845         cb = DEV_GSO_CB(skb);
1846         if (cb->destructor)
1847                 cb->destructor(skb);
1848 }
1849
1850 /**
1851  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1852  *      @skb: buffer to segment
1853  *
1854  *      This function segments the given skb and stores the list of segments
1855  *      in skb->next.
1856  */
1857 static int dev_gso_segment(struct sk_buff *skb)
1858 {
1859         struct net_device *dev = skb->dev;
1860         struct sk_buff *segs;
1861         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1862                                          NETIF_F_SG : 0);
1863
1864         segs = skb_gso_segment(skb, features);
1865
1866         /* Verifying header integrity only. */
1867         if (!segs)
1868                 return 0;
1869
1870         if (IS_ERR(segs))
1871                 return PTR_ERR(segs);
1872
1873         skb->next = segs;
1874         DEV_GSO_CB(skb)->destructor = skb->destructor;
1875         skb->destructor = dev_gso_skb_destructor;
1876
1877         return 0;
1878 }
1879
1880 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1881                         struct netdev_queue *txq)
1882 {
1883         const struct net_device_ops *ops = dev->netdev_ops;
1884         int rc = NETDEV_TX_OK;
1885
1886         if (likely(!skb->next)) {
1887                 if (!list_empty(&ptype_all))
1888                         dev_queue_xmit_nit(skb, dev);
1889
1890                 if (netif_needs_gso(dev, skb)) {
1891                         if (unlikely(dev_gso_segment(skb)))
1892                                 goto out_kfree_skb;
1893                         if (skb->next)
1894                                 goto gso;
1895                 }
1896
1897                 /*
1898                  * If device doesnt need skb->dst, release it right now while
1899                  * its hot in this cpu cache
1900                  */
1901                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1902                         skb_dst_drop(skb);
1903
1904                 rc = ops->ndo_start_xmit(skb, dev);
1905                 if (rc == NETDEV_TX_OK)
1906                         txq_trans_update(txq);
1907                 /*
1908                  * TODO: if skb_orphan() was called by
1909                  * dev->hard_start_xmit() (for example, the unmodified
1910                  * igb driver does that; bnx2 doesn't), then
1911                  * skb_tx_software_timestamp() will be unable to send
1912                  * back the time stamp.
1913                  *
1914                  * How can this be prevented? Always create another
1915                  * reference to the socket before calling
1916                  * dev->hard_start_xmit()? Prevent that skb_orphan()
1917                  * does anything in dev->hard_start_xmit() by clearing
1918                  * the skb destructor before the call and restoring it
1919                  * afterwards, then doing the skb_orphan() ourselves?
1920                  */
1921                 return rc;
1922         }
1923
1924 gso:
1925         do {
1926                 struct sk_buff *nskb = skb->next;
1927
1928                 skb->next = nskb->next;
1929                 nskb->next = NULL;
1930
1931                 /*
1932                  * If device doesnt need nskb->dst, release it right now while
1933                  * its hot in this cpu cache
1934                  */
1935                 if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
1936                         skb_dst_drop(nskb);
1937
1938                 rc = ops->ndo_start_xmit(nskb, dev);
1939                 if (unlikely(rc != NETDEV_TX_OK)) {
1940                         if (rc & ~NETDEV_TX_MASK)
1941                                 goto out_kfree_gso_skb;
1942                         nskb->next = skb->next;
1943                         skb->next = nskb;
1944                         return rc;
1945                 }
1946                 txq_trans_update(txq);
1947                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1948                         return NETDEV_TX_BUSY;
1949         } while (skb->next);
1950
1951 out_kfree_gso_skb:
1952         if (likely(skb->next == NULL))
1953                 skb->destructor = DEV_GSO_CB(skb)->destructor;
1954 out_kfree_skb:
1955         kfree_skb(skb);
1956         return rc;
1957 }
1958
1959 static u32 hashrnd __read_mostly;
1960
1961 u16 skb_tx_hash(const struct net_device *dev, const struct sk_buff *skb)
1962 {
1963         u32 hash;
1964
1965         if (skb_rx_queue_recorded(skb)) {
1966                 hash = skb_get_rx_queue(skb);
1967                 while (unlikely(hash >= dev->real_num_tx_queues))
1968                         hash -= dev->real_num_tx_queues;
1969                 return hash;
1970         }
1971
1972         if (skb->sk && skb->sk->sk_hash)
1973                 hash = skb->sk->sk_hash;
1974         else
1975                 hash = skb->protocol;
1976
1977         hash = jhash_1word(hash, hashrnd);
1978
1979         return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1980 }
1981 EXPORT_SYMBOL(skb_tx_hash);
1982
1983 static inline u16 dev_cap_txqueue(struct net_device *dev, u16 queue_index)
1984 {
1985         if (unlikely(queue_index >= dev->real_num_tx_queues)) {
1986                 if (net_ratelimit()) {
1987                         netdev_warn(dev, "selects TX queue %d, but "
1988                              "real number of TX queues is %d\n",
1989                              queue_index, dev->real_num_tx_queues);
1990                 }
1991                 return 0;
1992         }
1993         return queue_index;
1994 }
1995
1996 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1997                                         struct sk_buff *skb)
1998 {
1999         u16 queue_index;
2000         struct sock *sk = skb->sk;
2001
2002         if (sk_tx_queue_recorded(sk)) {
2003                 queue_index = sk_tx_queue_get(sk);
2004         } else {
2005                 const struct net_device_ops *ops = dev->netdev_ops;
2006
2007                 if (ops->ndo_select_queue) {
2008                         queue_index = ops->ndo_select_queue(dev, skb);
2009                         queue_index = dev_cap_txqueue(dev, queue_index);
2010                 } else {
2011                         queue_index = 0;
2012                         if (dev->real_num_tx_queues > 1)
2013                                 queue_index = skb_tx_hash(dev, skb);
2014
2015                         if (sk && sk->sk_dst_cache)
2016                                 sk_tx_queue_set(sk, queue_index);
2017                 }
2018         }
2019
2020         skb_set_queue_mapping(skb, queue_index);
2021         return netdev_get_tx_queue(dev, queue_index);
2022 }
2023
2024 static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
2025                                  struct net_device *dev,
2026                                  struct netdev_queue *txq)
2027 {
2028         spinlock_t *root_lock = qdisc_lock(q);
2029         int rc;
2030
2031         spin_lock(root_lock);
2032         if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
2033                 kfree_skb(skb);
2034                 rc = NET_XMIT_DROP;
2035         } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
2036                    !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
2037                 /*
2038                  * This is a work-conserving queue; there are no old skbs
2039                  * waiting to be sent out; and the qdisc is not running -
2040                  * xmit the skb directly.
2041                  */
2042                 __qdisc_update_bstats(q, skb->len);
2043                 if (sch_direct_xmit(skb, q, dev, txq, root_lock))
2044                         __qdisc_run(q);
2045                 else
2046                         clear_bit(__QDISC_STATE_RUNNING, &q->state);
2047
2048                 rc = NET_XMIT_SUCCESS;
2049         } else {
2050                 rc = qdisc_enqueue_root(skb, q);
2051                 qdisc_run(q);
2052         }
2053         spin_unlock(root_lock);
2054
2055         return rc;
2056 }
2057
2058 /*
2059  * Returns true if either:
2060  *      1. skb has frag_list and the device doesn't support FRAGLIST, or
2061  *      2. skb is fragmented and the device does not support SG, or if
2062  *         at least one of fragments is in highmem and device does not
2063  *         support DMA from it.
2064  */
2065 static inline int skb_needs_linearize(struct sk_buff *skb,
2066                                       struct net_device *dev)
2067 {
2068         return (skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
2069                (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
2070                                               illegal_highdma(dev, skb)));
2071 }
2072
2073 /**
2074  *      dev_queue_xmit - transmit a buffer
2075  *      @skb: buffer to transmit
2076  *
2077  *      Queue a buffer for transmission to a network device. The caller must
2078  *      have set the device and priority and built the buffer before calling
2079  *      this function. The function can be called from an interrupt.
2080  *
2081  *      A negative errno code is returned on a failure. A success does not
2082  *      guarantee the frame will be transmitted as it may be dropped due
2083  *      to congestion or traffic shaping.
2084  *
2085  * -----------------------------------------------------------------------------------
2086  *      I notice this method can also return errors from the queue disciplines,
2087  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
2088  *      be positive.
2089  *
2090  *      Regardless of the return value, the skb is consumed, so it is currently
2091  *      difficult to retry a send to this method.  (You can bump the ref count
2092  *      before sending to hold a reference for retry if you are careful.)
2093  *
2094  *      When calling this method, interrupts MUST be enabled.  This is because
2095  *      the BH enable code must have IRQs enabled so that it will not deadlock.
2096  *          --BLG
2097  */
2098 int dev_queue_xmit(struct sk_buff *skb)
2099 {
2100         struct net_device *dev = skb->dev;
2101         struct netdev_queue *txq;
2102         struct Qdisc *q;
2103         int rc = -ENOMEM;
2104
2105         /* GSO will handle the following emulations directly. */
2106         if (netif_needs_gso(dev, skb))
2107                 goto gso;
2108
2109         /* Convert a paged skb to linear, if required */
2110         if (skb_needs_linearize(skb, dev) && __skb_linearize(skb))
2111                 goto out_kfree_skb;
2112
2113         /* If packet is not checksummed and device does not support
2114          * checksumming for this protocol, complete checksumming here.
2115          */
2116         if (skb->ip_summed == CHECKSUM_PARTIAL) {
2117                 skb_set_transport_header(skb, skb->csum_start -
2118                                               skb_headroom(skb));
2119                 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
2120                         goto out_kfree_skb;
2121         }
2122
2123 gso:
2124         /* Disable soft irqs for various locks below. Also
2125          * stops preemption for RCU.
2126          */
2127         rcu_read_lock_bh();
2128
2129         txq = dev_pick_tx(dev, skb);
2130         q = rcu_dereference_bh(txq->qdisc);
2131
2132 #ifdef CONFIG_NET_CLS_ACT
2133         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
2134 #endif
2135         if (q->enqueue) {
2136                 rc = __dev_xmit_skb(skb, q, dev, txq);
2137                 goto out;
2138         }
2139
2140         /* The device has no queue. Common case for software devices:
2141            loopback, all the sorts of tunnels...
2142
2143            Really, it is unlikely that netif_tx_lock protection is necessary
2144            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
2145            counters.)
2146            However, it is possible, that they rely on protection
2147            made by us here.
2148
2149            Check this and shot the lock. It is not prone from deadlocks.
2150            Either shot noqueue qdisc, it is even simpler 8)
2151          */
2152         if (dev->flags & IFF_UP) {
2153                 int cpu = smp_processor_id(); /* ok because BHs are off */
2154
2155                 if (txq->xmit_lock_owner != cpu) {
2156
2157                         HARD_TX_LOCK(dev, txq, cpu);
2158
2159                         if (!netif_tx_queue_stopped(txq)) {
2160                                 rc = dev_hard_start_xmit(skb, dev, txq);
2161                                 if (dev_xmit_complete(rc)) {
2162                                         HARD_TX_UNLOCK(dev, txq);
2163                                         goto out;
2164                                 }
2165                         }
2166                         HARD_TX_UNLOCK(dev, txq);
2167                         if (net_ratelimit())
2168                                 printk(KERN_CRIT "Virtual device %s asks to "
2169                                        "queue packet!\n", dev->name);
2170                 } else {
2171                         /* Recursion is detected! It is possible,
2172                          * unfortunately */
2173                         if (net_ratelimit())
2174                                 printk(KERN_CRIT "Dead loop on virtual device "
2175                                        "%s, fix it urgently!\n", dev->name);
2176                 }
2177         }
2178
2179         rc = -ENETDOWN;
2180         rcu_read_unlock_bh();
2181
2182 out_kfree_skb:
2183         kfree_skb(skb);
2184         return rc;
2185 out:
2186         rcu_read_unlock_bh();
2187         return rc;
2188 }
2189 EXPORT_SYMBOL(dev_queue_xmit);
2190
2191
2192 /*=======================================================================
2193                         Receiver routines
2194   =======================================================================*/
2195
2196 int netdev_max_backlog __read_mostly = 1000;
2197 int netdev_budget __read_mostly = 300;
2198 int weight_p __read_mostly = 64;            /* old backlog weight */
2199
2200 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
2201
2202 #ifdef CONFIG_RPS
2203 /*
2204  * get_rps_cpu is called from netif_receive_skb and returns the target
2205  * CPU from the RPS map of the receiving queue for a given skb.
2206  */
2207 static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb)
2208 {
2209         struct ipv6hdr *ip6;
2210         struct iphdr *ip;
2211         struct netdev_rx_queue *rxqueue;
2212         struct rps_map *map;
2213         int cpu = -1;
2214         u8 ip_proto;
2215         u32 addr1, addr2, ports, ihl;
2216
2217         rcu_read_lock();
2218
2219         if (skb_rx_queue_recorded(skb)) {
2220                 u16 index = skb_get_rx_queue(skb);
2221                 if (unlikely(index >= dev->num_rx_queues)) {
2222                         if (net_ratelimit()) {
2223                                 netdev_warn(dev, "received packet on queue "
2224                                     "%u, but number of RX queues is %u\n",
2225                                      index, dev->num_rx_queues);
2226                         }
2227                         goto done;
2228                 }
2229                 rxqueue = dev->_rx + index;
2230         } else
2231                 rxqueue = dev->_rx;
2232
2233         if (!rxqueue->rps_map)
2234                 goto done;
2235
2236         if (skb->rxhash)
2237                 goto got_hash; /* Skip hash computation on packet header */
2238
2239         switch (skb->protocol) {
2240         case __constant_htons(ETH_P_IP):
2241                 if (!pskb_may_pull(skb, sizeof(*ip)))
2242                         goto done;
2243
2244                 ip = (struct iphdr *) skb->data;
2245                 ip_proto = ip->protocol;
2246                 addr1 = ip->saddr;
2247                 addr2 = ip->daddr;
2248                 ihl = ip->ihl;
2249                 break;
2250         case __constant_htons(ETH_P_IPV6):
2251                 if (!pskb_may_pull(skb, sizeof(*ip6)))
2252                         goto done;
2253
2254                 ip6 = (struct ipv6hdr *) skb->data;
2255                 ip_proto = ip6->nexthdr;
2256                 addr1 = ip6->saddr.s6_addr32[3];
2257                 addr2 = ip6->daddr.s6_addr32[3];
2258                 ihl = (40 >> 2);
2259                 break;
2260         default:
2261                 goto done;
2262         }
2263         ports = 0;
2264         switch (ip_proto) {
2265         case IPPROTO_TCP:
2266         case IPPROTO_UDP:
2267         case IPPROTO_DCCP:
2268         case IPPROTO_ESP:
2269         case IPPROTO_AH:
2270         case IPPROTO_SCTP:
2271         case IPPROTO_UDPLITE:
2272                 if (pskb_may_pull(skb, (ihl * 4) + 4))
2273                         ports = *((u32 *) (skb->data + (ihl * 4)));
2274                 break;
2275
2276         default:
2277                 break;
2278         }
2279
2280         skb->rxhash = jhash_3words(addr1, addr2, ports, hashrnd);
2281         if (!skb->rxhash)
2282                 skb->rxhash = 1;
2283
2284 got_hash:
2285         map = rcu_dereference(rxqueue->rps_map);
2286         if (map) {
2287                 u16 tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
2288
2289                 if (cpu_online(tcpu)) {
2290                         cpu = tcpu;
2291                         goto done;
2292                 }
2293         }
2294
2295 done:
2296         rcu_read_unlock();
2297         return cpu;
2298 }
2299
2300 /*
2301  * This structure holds the per-CPU mask of CPUs for which IPIs are scheduled
2302  * to be sent to kick remote softirq processing.  There are two masks since
2303  * the sending of IPIs must be done with interrupts enabled.  The select field
2304  * indicates the current mask that enqueue_backlog uses to schedule IPIs.
2305  * select is flipped before net_rps_action is called while still under lock,
2306  * net_rps_action then uses the non-selected mask to send the IPIs and clears
2307  * it without conflicting with enqueue_backlog operation.
2308  */
2309 struct rps_remote_softirq_cpus {
2310         cpumask_t mask[2];
2311         int select;
2312 };
2313 static DEFINE_PER_CPU(struct rps_remote_softirq_cpus, rps_remote_softirq_cpus);
2314
2315 /* Called from hardirq (IPI) context */
2316 static void trigger_softirq(void *data)
2317 {
2318         struct softnet_data *queue = data;
2319         __napi_schedule(&queue->backlog);
2320         __get_cpu_var(netdev_rx_stat).received_rps++;
2321 }
2322 #endif /* CONFIG_SMP */
2323
2324 /*
2325  * enqueue_to_backlog is called to queue an skb to a per CPU backlog
2326  * queue (may be a remote CPU queue).
2327  */
2328 static int enqueue_to_backlog(struct sk_buff *skb, int cpu)
2329 {
2330         struct softnet_data *queue;
2331         unsigned long flags;
2332
2333         queue = &per_cpu(softnet_data, cpu);
2334
2335         local_irq_save(flags);
2336         __get_cpu_var(netdev_rx_stat).total++;
2337
2338         rps_lock(queue);
2339         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
2340                 if (queue->input_pkt_queue.qlen) {
2341 enqueue:
2342                         __skb_queue_tail(&queue->input_pkt_queue, skb);
2343                         rps_unlock(queue);
2344                         local_irq_restore(flags);
2345                         return NET_RX_SUCCESS;
2346                 }
2347
2348                 /* Schedule NAPI for backlog device */
2349                 if (napi_schedule_prep(&queue->backlog)) {
2350 #ifdef CONFIG_RPS
2351                         if (cpu != smp_processor_id()) {
2352                                 struct rps_remote_softirq_cpus *rcpus =
2353                                     &__get_cpu_var(rps_remote_softirq_cpus);
2354
2355                                 cpu_set(cpu, rcpus->mask[rcpus->select]);
2356                                 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2357                         } else
2358                                 __napi_schedule(&queue->backlog);
2359 #else
2360                         __napi_schedule(&queue->backlog);
2361 #endif
2362                 }
2363                 goto enqueue;
2364         }
2365
2366         rps_unlock(queue);
2367
2368         __get_cpu_var(netdev_rx_stat).dropped++;
2369         local_irq_restore(flags);
2370
2371         kfree_skb(skb);
2372         return NET_RX_DROP;
2373 }
2374
2375 /**
2376  *      netif_rx        -       post buffer to the network code
2377  *      @skb: buffer to post
2378  *
2379  *      This function receives a packet from a device driver and queues it for
2380  *      the upper (protocol) levels to process.  It always succeeds. The buffer
2381  *      may be dropped during processing for congestion control or by the
2382  *      protocol layers.
2383  *
2384  *      return values:
2385  *      NET_RX_SUCCESS  (no congestion)
2386  *      NET_RX_DROP     (packet was dropped)
2387  *
2388  */
2389
2390 int netif_rx(struct sk_buff *skb)
2391 {
2392         int cpu;
2393
2394         /* if netpoll wants it, pretend we never saw it */
2395         if (netpoll_rx(skb))
2396                 return NET_RX_DROP;
2397
2398         if (!skb->tstamp.tv64)
2399                 net_timestamp(skb);
2400
2401 #ifdef CONFIG_RPS
2402         cpu = get_rps_cpu(skb->dev, skb);
2403         if (cpu < 0)
2404                 cpu = smp_processor_id();
2405 #else
2406         cpu = smp_processor_id();
2407 #endif
2408
2409         return enqueue_to_backlog(skb, cpu);
2410 }
2411 EXPORT_SYMBOL(netif_rx);
2412
2413 int netif_rx_ni(struct sk_buff *skb)
2414 {
2415         int err;
2416
2417         preempt_disable();
2418         err = netif_rx(skb);
2419         if (local_softirq_pending())
2420                 do_softirq();
2421         preempt_enable();
2422
2423         return err;
2424 }
2425 EXPORT_SYMBOL(netif_rx_ni);
2426
2427 static void net_tx_action(struct softirq_action *h)
2428 {
2429         struct softnet_data *sd = &__get_cpu_var(softnet_data);
2430
2431         if (sd->completion_queue) {
2432                 struct sk_buff *clist;
2433
2434                 local_irq_disable();
2435                 clist = sd->completion_queue;
2436                 sd->completion_queue = NULL;
2437                 local_irq_enable();
2438
2439                 while (clist) {
2440                         struct sk_buff *skb = clist;
2441                         clist = clist->next;
2442
2443                         WARN_ON(atomic_read(&skb->users));
2444                         __kfree_skb(skb);
2445                 }
2446         }
2447
2448         if (sd->output_queue) {
2449                 struct Qdisc *head;
2450
2451                 local_irq_disable();
2452                 head = sd->output_queue;
2453                 sd->output_queue = NULL;
2454                 local_irq_enable();
2455
2456                 while (head) {
2457                         struct Qdisc *q = head;
2458                         spinlock_t *root_lock;
2459
2460                         head = head->next_sched;
2461
2462                         root_lock = qdisc_lock(q);
2463                         if (spin_trylock(root_lock)) {
2464                                 smp_mb__before_clear_bit();
2465                                 clear_bit(__QDISC_STATE_SCHED,
2466                                           &q->state);
2467                                 qdisc_run(q);
2468                                 spin_unlock(root_lock);
2469                         } else {
2470                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2471                                               &q->state)) {
2472                                         __netif_reschedule(q);
2473                                 } else {
2474                                         smp_mb__before_clear_bit();
2475                                         clear_bit(__QDISC_STATE_SCHED,
2476                                                   &q->state);
2477                                 }
2478                         }
2479                 }
2480         }
2481 }
2482
2483 static inline int deliver_skb(struct sk_buff *skb,
2484                               struct packet_type *pt_prev,
2485                               struct net_device *orig_dev)
2486 {
2487         atomic_inc(&skb->users);
2488         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2489 }
2490
2491 #if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
2492
2493 #if defined(CONFIG_ATM_LANE) || defined(CONFIG_ATM_LANE_MODULE)
2494 /* This hook is defined here for ATM LANE */
2495 int (*br_fdb_test_addr_hook)(struct net_device *dev,
2496                              unsigned char *addr) __read_mostly;
2497 EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
2498 #endif
2499
2500 /*
2501  * If bridge module is loaded call bridging hook.
2502  *  returns NULL if packet was consumed.
2503  */
2504 struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
2505                                         struct sk_buff *skb) __read_mostly;
2506 EXPORT_SYMBOL_GPL(br_handle_frame_hook);
2507
2508 static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
2509                                             struct packet_type **pt_prev, int *ret,
2510                                             struct net_device *orig_dev)
2511 {
2512         struct net_bridge_port *port;
2513
2514         if (skb->pkt_type == PACKET_LOOPBACK ||
2515             (port = rcu_dereference(skb->dev->br_port)) == NULL)
2516                 return skb;
2517
2518         if (*pt_prev) {
2519                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2520                 *pt_prev = NULL;
2521         }
2522
2523         return br_handle_frame_hook(port, skb);
2524 }
2525 #else
2526 #define handle_bridge(skb, pt_prev, ret, orig_dev)      (skb)
2527 #endif
2528
2529 #if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
2530 struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
2531 EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);
2532
2533 static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
2534                                              struct packet_type **pt_prev,
2535                                              int *ret,
2536                                              struct net_device *orig_dev)
2537 {
2538         if (skb->dev->macvlan_port == NULL)
2539                 return skb;
2540
2541         if (*pt_prev) {
2542                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2543                 *pt_prev = NULL;
2544         }
2545         return macvlan_handle_frame_hook(skb);
2546 }
2547 #else
2548 #define handle_macvlan(skb, pt_prev, ret, orig_dev)     (skb)
2549 #endif
2550
2551 #ifdef CONFIG_NET_CLS_ACT
2552 /* TODO: Maybe we should just force sch_ingress to be compiled in
2553  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2554  * a compare and 2 stores extra right now if we dont have it on
2555  * but have CONFIG_NET_CLS_ACT
2556  * NOTE: This doesnt stop any functionality; if you dont have
2557  * the ingress scheduler, you just cant add policies on ingress.
2558  *
2559  */
2560 static int ing_filter(struct sk_buff *skb)
2561 {
2562         struct net_device *dev = skb->dev;
2563         u32 ttl = G_TC_RTTL(skb->tc_verd);
2564         struct netdev_queue *rxq;
2565         int result = TC_ACT_OK;
2566         struct Qdisc *q;
2567
2568         if (MAX_RED_LOOP < ttl++) {
2569                 printk(KERN_WARNING
2570                        "Redir loop detected Dropping packet (%d->%d)\n",
2571                        skb->skb_iif, dev->ifindex);
2572                 return TC_ACT_SHOT;
2573         }
2574
2575         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2576         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2577
2578         rxq = &dev->rx_queue;
2579
2580         q = rxq->qdisc;
2581         if (q != &noop_qdisc) {
2582                 spin_lock(qdisc_lock(q));
2583                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2584                         result = qdisc_enqueue_root(skb, q);
2585                 spin_unlock(qdisc_lock(q));
2586         }
2587
2588         return result;
2589 }
2590
2591 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2592                                          struct packet_type **pt_prev,
2593                                          int *ret, struct net_device *orig_dev)
2594 {
2595         if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2596                 goto out;
2597
2598         if (*pt_prev) {
2599                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2600                 *pt_prev = NULL;
2601         } else {
2602                 /* Huh? Why does turning on AF_PACKET affect this? */
2603                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2604         }
2605
2606         switch (ing_filter(skb)) {
2607         case TC_ACT_SHOT:
2608         case TC_ACT_STOLEN:
2609                 kfree_skb(skb);
2610                 return NULL;
2611         }
2612
2613 out:
2614         skb->tc_verd = 0;
2615         return skb;
2616 }
2617 #endif
2618
2619 /*
2620  *      netif_nit_deliver - deliver received packets to network taps
2621  *      @skb: buffer
2622  *
2623  *      This function is used to deliver incoming packets to network
2624  *      taps. It should be used when the normal netif_receive_skb path
2625  *      is bypassed, for example because of VLAN acceleration.
2626  */
2627 void netif_nit_deliver(struct sk_buff *skb)
2628 {
2629         struct packet_type *ptype;
2630
2631         if (list_empty(&ptype_all))
2632                 return;
2633
2634         skb_reset_network_header(skb);
2635         skb_reset_transport_header(skb);
2636         skb->mac_len = skb->network_header - skb->mac_header;
2637
2638         rcu_read_lock();
2639         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2640                 if (!ptype->dev || ptype->dev == skb->dev)
2641                         deliver_skb(skb, ptype, skb->dev);
2642         }
2643         rcu_read_unlock();
2644 }
2645
2646 static int __netif_receive_skb(struct sk_buff *skb)
2647 {
2648         struct packet_type *ptype, *pt_prev;
2649         struct net_device *orig_dev;
2650         struct net_device *master;
2651         struct net_device *null_or_orig;
2652         struct net_device *null_or_bond;
2653         int ret = NET_RX_DROP;
2654         __be16 type;
2655
2656         if (!skb->tstamp.tv64)
2657                 net_timestamp(skb);
2658
2659         if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
2660                 return NET_RX_SUCCESS;
2661
2662         /* if we've gotten here through NAPI, check netpoll */
2663         if (netpoll_receive_skb(skb))
2664                 return NET_RX_DROP;
2665
2666         if (!skb->skb_iif)
2667                 skb->skb_iif = skb->dev->ifindex;
2668
2669         null_or_orig = NULL;
2670         orig_dev = skb->dev;
2671         master = ACCESS_ONCE(orig_dev->master);
2672         if (master) {
2673                 if (skb_bond_should_drop(skb, master))
2674                         null_or_orig = orig_dev; /* deliver only exact match */
2675                 else
2676                         skb->dev = master;
2677         }
2678
2679         __get_cpu_var(netdev_rx_stat).total++;
2680
2681         skb_reset_network_header(skb);
2682         skb_reset_transport_header(skb);
2683         skb->mac_len = skb->network_header - skb->mac_header;
2684
2685         pt_prev = NULL;
2686
2687         rcu_read_lock();
2688
2689 #ifdef CONFIG_NET_CLS_ACT
2690         if (skb->tc_verd & TC_NCLS) {
2691                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2692                 goto ncls;
2693         }
2694 #endif
2695
2696         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2697                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2698                     ptype->dev == orig_dev) {
2699                         if (pt_prev)
2700                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2701                         pt_prev = ptype;
2702                 }
2703         }
2704
2705 #ifdef CONFIG_NET_CLS_ACT
2706         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2707         if (!skb)
2708                 goto out;
2709 ncls:
2710 #endif
2711
2712         skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2713         if (!skb)
2714                 goto out;
2715         skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2716         if (!skb)
2717                 goto out;
2718
2719         /*
2720          * Make sure frames received on VLAN interfaces stacked on
2721          * bonding interfaces still make their way to any base bonding
2722          * device that may have registered for a specific ptype.  The
2723          * handler may have to adjust skb->dev and orig_dev.
2724          */
2725         null_or_bond = NULL;
2726         if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
2727             (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
2728                 null_or_bond = vlan_dev_real_dev(skb->dev);
2729         }
2730
2731         type = skb->protocol;
2732         list_for_each_entry_rcu(ptype,
2733                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2734                 if (ptype->type == type && (ptype->dev == null_or_orig ||
2735                      ptype->dev == skb->dev || ptype->dev == orig_dev ||
2736                      ptype->dev == null_or_bond)) {
2737                         if (pt_prev)
2738                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2739                         pt_prev = ptype;
2740                 }
2741         }
2742
2743         if (pt_prev) {
2744                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2745         } else {
2746                 kfree_skb(skb);
2747                 /* Jamal, now you will not able to escape explaining
2748                  * me how you were going to use this. :-)
2749                  */
2750                 ret = NET_RX_DROP;
2751         }
2752
2753 out:
2754         rcu_read_unlock();
2755         return ret;
2756 }
2757
2758 /**
2759  *      netif_receive_skb - process receive buffer from network
2760  *      @skb: buffer to process
2761  *
2762  *      netif_receive_skb() is the main receive data processing function.
2763  *      It always succeeds. The buffer may be dropped during processing
2764  *      for congestion control or by the protocol layers.
2765  *
2766  *      This function may only be called from softirq context and interrupts
2767  *      should be enabled.
2768  *
2769  *      Return values (usually ignored):
2770  *      NET_RX_SUCCESS: no congestion
2771  *      NET_RX_DROP: packet was dropped
2772  */
2773 int netif_receive_skb(struct sk_buff *skb)
2774 {
2775 #ifdef CONFIG_RPS
2776         int cpu;
2777
2778         cpu = get_rps_cpu(skb->dev, skb);
2779
2780         if (cpu < 0)
2781                 return __netif_receive_skb(skb);
2782         else
2783                 return enqueue_to_backlog(skb, cpu);
2784 #else
2785         return __netif_receive_skb(skb);
2786 #endif
2787 }
2788 EXPORT_SYMBOL(netif_receive_skb);
2789
2790 /* Network device is going away, flush any packets still pending  */
2791 static void flush_backlog(void *arg)
2792 {
2793         struct net_device *dev = arg;
2794         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2795         struct sk_buff *skb, *tmp;
2796
2797         rps_lock(queue);
2798         skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2799                 if (skb->dev == dev) {
2800                         __skb_unlink(skb, &queue->input_pkt_queue);
2801                         kfree_skb(skb);
2802                 }
2803         rps_unlock(queue);
2804 }
2805
2806 static int napi_gro_complete(struct sk_buff *skb)
2807 {
2808         struct packet_type *ptype;
2809         __be16 type = skb->protocol;
2810         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2811         int err = -ENOENT;
2812
2813         if (NAPI_GRO_CB(skb)->count == 1) {
2814                 skb_shinfo(skb)->gso_size = 0;
2815                 goto out;
2816         }
2817
2818         rcu_read_lock();
2819         list_for_each_entry_rcu(ptype, head, list) {
2820                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2821                         continue;
2822
2823                 err = ptype->gro_complete(skb);
2824                 break;
2825         }
2826         rcu_read_unlock();
2827
2828         if (err) {
2829                 WARN_ON(&ptype->list == head);
2830                 kfree_skb(skb);
2831                 return NET_RX_SUCCESS;
2832         }
2833
2834 out:
2835         return netif_receive_skb(skb);
2836 }
2837
2838 static void napi_gro_flush(struct napi_struct *napi)
2839 {
2840         struct sk_buff *skb, *next;
2841
2842         for (skb = napi->gro_list; skb; skb = next) {
2843                 next = skb->next;
2844                 skb->next = NULL;
2845                 napi_gro_complete(skb);
2846         }
2847
2848         napi->gro_count = 0;
2849         napi->gro_list = NULL;
2850 }
2851
2852 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2853 {
2854         struct sk_buff **pp = NULL;
2855         struct packet_type *ptype;
2856         __be16 type = skb->protocol;
2857         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2858         int same_flow;
2859         int mac_len;
2860         enum gro_result ret;
2861
2862         if (!(skb->dev->features & NETIF_F_GRO))
2863                 goto normal;
2864
2865         if (skb_is_gso(skb) || skb_has_frags(skb))
2866                 goto normal;
2867
2868         rcu_read_lock();
2869         list_for_each_entry_rcu(ptype, head, list) {
2870                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2871                         continue;
2872
2873                 skb_set_network_header(skb, skb_gro_offset(skb));
2874                 mac_len = skb->network_header - skb->mac_header;
2875                 skb->mac_len = mac_len;
2876                 NAPI_GRO_CB(skb)->same_flow = 0;
2877                 NAPI_GRO_CB(skb)->flush = 0;
2878                 NAPI_GRO_CB(skb)->free = 0;
2879
2880                 pp = ptype->gro_receive(&napi->gro_list, skb);
2881                 break;
2882         }
2883         rcu_read_unlock();
2884
2885         if (&ptype->list == head)
2886                 goto normal;
2887
2888         same_flow = NAPI_GRO_CB(skb)->same_flow;
2889         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2890
2891         if (pp) {
2892                 struct sk_buff *nskb = *pp;
2893
2894                 *pp = nskb->next;
2895                 nskb->next = NULL;
2896                 napi_gro_complete(nskb);
2897                 napi->gro_count--;
2898         }
2899
2900         if (same_flow)
2901                 goto ok;
2902
2903         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2904                 goto normal;
2905
2906         napi->gro_count++;
2907         NAPI_GRO_CB(skb)->count = 1;
2908         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2909         skb->next = napi->gro_list;
2910         napi->gro_list = skb;
2911         ret = GRO_HELD;
2912
2913 pull:
2914         if (skb_headlen(skb) < skb_gro_offset(skb)) {
2915                 int grow = skb_gro_offset(skb) - skb_headlen(skb);
2916
2917                 BUG_ON(skb->end - skb->tail < grow);
2918
2919                 memcpy(skb_tail_pointer(skb), NAPI_GRO_CB(skb)->frag0, grow);
2920
2921                 skb->tail += grow;
2922                 skb->data_len -= grow;
2923
2924                 skb_shinfo(skb)->frags[0].page_offset += grow;
2925                 skb_shinfo(skb)->frags[0].size -= grow;
2926
2927                 if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
2928                         put_page(skb_shinfo(skb)->frags[0].page);
2929                         memmove(skb_shinfo(skb)->frags,
2930                                 skb_shinfo(skb)->frags + 1,
2931                                 --skb_shinfo(skb)->nr_frags);
2932                 }
2933         }
2934
2935 ok:
2936         return ret;
2937
2938 normal:
2939         ret = GRO_NORMAL;
2940         goto pull;
2941 }
2942 EXPORT_SYMBOL(dev_gro_receive);
2943
2944 static gro_result_t
2945 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2946 {
2947         struct sk_buff *p;
2948
2949         if (netpoll_rx_on(skb))
2950                 return GRO_NORMAL;
2951
2952         for (p = napi->gro_list; p; p = p->next) {
2953                 NAPI_GRO_CB(p)->same_flow =
2954                         (p->dev == skb->dev) &&
2955                         !compare_ether_header(skb_mac_header(p),
2956                                               skb_gro_mac_header(skb));
2957                 NAPI_GRO_CB(p)->flush = 0;
2958         }
2959
2960         return dev_gro_receive(napi, skb);
2961 }
2962
2963 gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
2964 {
2965         switch (ret) {
2966         case GRO_NORMAL:
2967                 if (netif_receive_skb(skb))
2968                         ret = GRO_DROP;
2969                 break;
2970
2971         case GRO_DROP:
2972         case GRO_MERGED_FREE:
2973                 kfree_skb(skb);
2974                 break;
2975
2976         case GRO_HELD:
2977         case GRO_MERGED:
2978                 break;
2979         }
2980
2981         return ret;
2982 }
2983 EXPORT_SYMBOL(napi_skb_finish);
2984
2985 void skb_gro_reset_offset(struct sk_buff *skb)
2986 {
2987         NAPI_GRO_CB(skb)->data_offset = 0;
2988         NAPI_GRO_CB(skb)->frag0 = NULL;
2989         NAPI_GRO_CB(skb)->frag0_len = 0;
2990
2991         if (skb->mac_header == skb->tail &&
2992             !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
2993                 NAPI_GRO_CB(skb)->frag0 =
2994                         page_address(skb_shinfo(skb)->frags[0].page) +
2995                         skb_shinfo(skb)->frags[0].page_offset;
2996                 NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
2997         }
2998 }
2999 EXPORT_SYMBOL(skb_gro_reset_offset);
3000
3001 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
3002 {
3003         skb_gro_reset_offset(skb);
3004
3005         return napi_skb_finish(__napi_gro_receive(napi, skb), skb);
3006 }
3007 EXPORT_SYMBOL(napi_gro_receive);
3008
3009 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
3010 {
3011         __skb_pull(skb, skb_headlen(skb));
3012         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
3013
3014         napi->skb = skb;
3015 }
3016 EXPORT_SYMBOL(napi_reuse_skb);
3017
3018 struct sk_buff *napi_get_frags(struct napi_struct *napi)
3019 {
3020         struct sk_buff *skb = napi->skb;
3021
3022         if (!skb) {
3023                 skb = netdev_alloc_skb_ip_align(napi->dev, GRO_MAX_HEAD);
3024                 if (skb)
3025                         napi->skb = skb;
3026         }
3027         return skb;
3028 }
3029 EXPORT_SYMBOL(napi_get_frags);
3030
3031 gro_result_t napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb,
3032                                gro_result_t ret)
3033 {
3034         switch (ret) {
3035         case GRO_NORMAL:
3036         case GRO_HELD:
3037                 skb->protocol = eth_type_trans(skb, skb->dev);
3038
3039                 if (ret == GRO_HELD)
3040                         skb_gro_pull(skb, -ETH_HLEN);
3041                 else if (netif_receive_skb(skb))
3042                         ret = GRO_DROP;
3043                 break;
3044
3045         case GRO_DROP:
3046         case GRO_MERGED_FREE:
3047                 napi_reuse_skb(napi, skb);
3048                 break;
3049
3050         case GRO_MERGED:
3051                 break;
3052         }
3053
3054         return ret;
3055 }
3056 EXPORT_SYMBOL(napi_frags_finish);
3057
3058 struct sk_buff *napi_frags_skb(struct napi_struct *napi)
3059 {
3060         struct sk_buff *skb = napi->skb;
3061         struct ethhdr *eth;
3062         unsigned int hlen;
3063         unsigned int off;
3064
3065         napi->skb = NULL;
3066
3067         skb_reset_mac_header(skb);
3068         skb_gro_reset_offset(skb);
3069
3070         off = skb_gro_offset(skb);
3071         hlen = off + sizeof(*eth);
3072         eth = skb_gro_header_fast(skb, off);
3073         if (skb_gro_header_hard(skb, hlen)) {
3074                 eth = skb_gro_header_slow(skb, hlen, off);
3075                 if (unlikely(!eth)) {
3076                         napi_reuse_skb(napi, skb);
3077                         skb = NULL;
3078                         goto out;
3079                 }
3080         }
3081
3082         skb_gro_pull(skb, sizeof(*eth));
3083
3084         /*
3085          * This works because the only protocols we care about don't require
3086          * special handling.  We'll fix it up properly at the end.
3087          */
3088         skb->protocol = eth->h_proto;
3089
3090 out:
3091         return skb;
3092 }
3093 EXPORT_SYMBOL(napi_frags_skb);
3094
3095 gro_result_t napi_gro_frags(struct napi_struct *napi)
3096 {
3097         struct sk_buff *skb = napi_frags_skb(napi);
3098
3099         if (!skb)
3100                 return GRO_DROP;
3101
3102         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
3103 }
3104 EXPORT_SYMBOL(napi_gro_frags);
3105
3106 static int process_backlog(struct napi_struct *napi, int quota)
3107 {
3108         int work = 0;
3109         struct softnet_data *queue = &__get_cpu_var(softnet_data);
3110         unsigned long start_time = jiffies;
3111
3112         napi->weight = weight_p;
3113         do {
3114                 struct sk_buff *skb;
3115
3116                 local_irq_disable();
3117                 rps_lock(queue);
3118                 skb = __skb_dequeue(&queue->input_pkt_queue);
3119                 if (!skb) {
3120                         __napi_complete(napi);
3121                         spin_unlock_irq(&queue->input_pkt_queue.lock);
3122                         break;
3123                 }
3124                 rps_unlock(queue);
3125                 local_irq_enable();
3126
3127                 __netif_receive_skb(skb);
3128         } while (++work < quota && jiffies == start_time);
3129
3130         return work;
3131 }
3132
3133 /**
3134  * __napi_schedule - schedule for receive
3135  * @n: entry to schedule
3136  *
3137  * The entry's receive function will be scheduled to run
3138  */
3139 void __napi_schedule(struct napi_struct *n)
3140 {
3141         unsigned long flags;
3142
3143         local_irq_save(flags);
3144         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
3145         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3146         local_irq_restore(flags);
3147 }
3148 EXPORT_SYMBOL(__napi_schedule);
3149
3150 void __napi_complete(struct napi_struct *n)
3151 {
3152         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
3153         BUG_ON(n->gro_list);
3154
3155         list_del(&n->poll_list);
3156         smp_mb__before_clear_bit();
3157         clear_bit(NAPI_STATE_SCHED, &n->state);
3158 }
3159 EXPORT_SYMBOL(__napi_complete);
3160
3161 void napi_complete(struct napi_struct *n)
3162 {
3163         unsigned long flags;
3164
3165         /*
3166          * don't let napi dequeue from the cpu poll list
3167          * just in case its running on a different cpu
3168          */
3169         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
3170                 return;
3171
3172         napi_gro_flush(n);
3173         local_irq_save(flags);
3174         __napi_complete(n);
3175         local_irq_restore(flags);
3176 }
3177 EXPORT_SYMBOL(napi_complete);
3178
3179 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
3180                     int (*poll)(struct napi_struct *, int), int weight)
3181 {
3182         INIT_LIST_HEAD(&napi->poll_list);
3183         napi->gro_count = 0;
3184         napi->gro_list = NULL;
3185         napi->skb = NULL;
3186         napi->poll = poll;
3187         napi->weight = weight;
3188         list_add(&napi->dev_list, &dev->napi_list);
3189         napi->dev = dev;
3190 #ifdef CONFIG_NETPOLL
3191         spin_lock_init(&napi->poll_lock);
3192         napi->poll_owner = -1;
3193 #endif
3194         set_bit(NAPI_STATE_SCHED, &napi->state);
3195 }
3196 EXPORT_SYMBOL(netif_napi_add);
3197
3198 void netif_napi_del(struct napi_struct *napi)
3199 {
3200         struct sk_buff *skb, *next;
3201
3202         list_del_init(&napi->dev_list);
3203         napi_free_frags(napi);
3204
3205         for (skb = napi->gro_list; skb; skb = next) {
3206                 next = skb->next;
3207                 skb->next = NULL;
3208                 kfree_skb(skb);
3209         }
3210
3211         napi->gro_list = NULL;
3212         napi->gro_count = 0;
3213 }
3214 EXPORT_SYMBOL(netif_napi_del);
3215
3216 #ifdef CONFIG_RPS
3217 /*
3218  * net_rps_action sends any pending IPI's for rps.  This is only called from
3219  * softirq and interrupts must be enabled.
3220  */
3221 static void net_rps_action(cpumask_t *mask)
3222 {
3223         int cpu;
3224
3225         /* Send pending IPI's to kick RPS processing on remote cpus. */
3226         for_each_cpu_mask_nr(cpu, *mask) {
3227                 struct softnet_data *queue = &per_cpu(softnet_data, cpu);
3228                 if (cpu_online(cpu))
3229                         __smp_call_function_single(cpu, &queue->csd, 0);
3230         }
3231         cpus_clear(*mask);
3232 }
3233 #endif
3234
3235 static void net_rx_action(struct softirq_action *h)
3236 {
3237         struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
3238         unsigned long time_limit = jiffies + 2;
3239         int budget = netdev_budget;
3240         void *have;
3241 #ifdef CONFIG_RPS
3242         int select;
3243         struct rps_remote_softirq_cpus *rcpus;
3244 #endif
3245
3246         local_irq_disable();
3247
3248         while (!list_empty(list)) {
3249                 struct napi_struct *n;
3250                 int work, weight;
3251
3252                 /* If softirq window is exhuasted then punt.
3253                  * Allow this to run for 2 jiffies since which will allow
3254                  * an average latency of 1.5/HZ.
3255                  */
3256                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
3257                         goto softnet_break;
3258
3259                 local_irq_enable();
3260
3261                 /* Even though interrupts have been re-enabled, this
3262                  * access is safe because interrupts can only add new
3263                  * entries to the tail of this list, and only ->poll()
3264                  * calls can remove this head entry from the list.
3265                  */
3266                 n = list_first_entry(list, struct napi_struct, poll_list);
3267
3268                 have = netpoll_poll_lock(n);
3269
3270                 weight = n->weight;
3271
3272                 /* This NAPI_STATE_SCHED test is for avoiding a race
3273                  * with netpoll's poll_napi().  Only the entity which
3274                  * obtains the lock and sees NAPI_STATE_SCHED set will
3275                  * actually make the ->poll() call.  Therefore we avoid
3276                  * accidently calling ->poll() when NAPI is not scheduled.
3277                  */
3278                 work = 0;
3279                 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
3280                         work = n->poll(n, weight);
3281                         trace_napi_poll(n);
3282                 }
3283
3284                 WARN_ON_ONCE(work > weight);
3285
3286                 budget -= work;
3287
3288                 local_irq_disable();
3289
3290                 /* Drivers must not modify the NAPI state if they
3291                  * consume the entire weight.  In such cases this code
3292                  * still "owns" the NAPI instance and therefore can
3293                  * move the instance around on the list at-will.
3294                  */
3295                 if (unlikely(work == weight)) {
3296                         if (unlikely(napi_disable_pending(n))) {
3297                                 local_irq_enable();
3298                                 napi_complete(n);
3299                                 local_irq_disable();
3300                         } else
3301                                 list_move_tail(&n->poll_list, list);
3302                 }
3303
3304                 netpoll_poll_unlock(have);
3305         }
3306 out:
3307 #ifdef CONFIG_RPS
3308         rcpus = &__get_cpu_var(rps_remote_softirq_cpus);
3309         select = rcpus->select;
3310         rcpus->select ^= 1;
3311
3312         local_irq_enable();
3313
3314         net_rps_action(&rcpus->mask[select]);
3315 #else
3316         local_irq_enable();
3317 #endif
3318
3319 #ifdef CONFIG_NET_DMA
3320         /*
3321          * There may not be any more sk_buffs coming right now, so push
3322          * any pending DMA copies to hardware
3323          */
3324         dma_issue_pending_all();
3325 #endif
3326
3327         return;
3328
3329 softnet_break:
3330         __get_cpu_var(netdev_rx_stat).time_squeeze++;
3331         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
3332         goto out;
3333 }
3334
3335 static gifconf_func_t *gifconf_list[NPROTO];
3336
3337 /**
3338  *      register_gifconf        -       register a SIOCGIF handler
3339  *      @family: Address family
3340  *      @gifconf: Function handler
3341  *
3342  *      Register protocol dependent address dumping routines. The handler
3343  *      that is passed must not be freed or reused until it has been replaced
3344  *      by another handler.
3345  */
3346 int register_gifconf(unsigned int family, gifconf_func_t *gifconf)
3347 {
3348         if (family >= NPROTO)
3349                 return -EINVAL;
3350         gifconf_list[family] = gifconf;
3351         return 0;
3352 }
3353 EXPORT_SYMBOL(register_gifconf);
3354
3355
3356 /*
3357  *      Map an interface index to its name (SIOCGIFNAME)
3358  */
3359
3360 /*
3361  *      We need this ioctl for efficient implementation of the
3362  *      if_indextoname() function required by the IPv6 API.  Without
3363  *      it, we would have to search all the interfaces to find a
3364  *      match.  --pb
3365  */
3366
3367 static int dev_ifname(struct net *net, struct ifreq __user *arg)
3368 {
3369         struct net_device *dev;
3370         struct ifreq ifr;
3371
3372         /*
3373          *      Fetch the caller's info block.
3374          */
3375
3376         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
3377                 return -EFAULT;
3378
3379         rcu_read_lock();
3380         dev = dev_get_by_index_rcu(net, ifr.ifr_ifindex);
3381         if (!dev) {
3382                 rcu_read_unlock();
3383                 return -ENODEV;
3384         }
3385
3386         strcpy(ifr.ifr_name, dev->name);
3387         rcu_read_unlock();
3388
3389         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
3390                 return -EFAULT;
3391         return 0;
3392 }
3393
3394 /*
3395  *      Perform a SIOCGIFCONF call. This structure will change
3396  *      size eventually, and there is nothing I can do about it.
3397  *      Thus we will need a 'compatibility mode'.
3398  */
3399
3400 static int dev_ifconf(struct net *net, char __user *arg)
3401 {
3402         struct ifconf ifc;
3403         struct net_device *dev;
3404         char __user *pos;
3405         int len;
3406         int total;
3407         int i;
3408
3409         /*
3410          *      Fetch the caller's info block.
3411          */
3412
3413         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
3414                 return -EFAULT;
3415
3416         pos = ifc.ifc_buf;
3417         len = ifc.ifc_len;
3418
3419         /*
3420          *      Loop over the interfaces, and write an info block for each.
3421          */
3422
3423         total = 0;
3424         for_each_netdev(net, dev) {
3425                 for (i = 0; i < NPROTO; i++) {
3426                         if (gifconf_list[i]) {
3427                                 int done;
3428                                 if (!pos)
3429                                         done = gifconf_list[i](dev, NULL, 0);
3430                                 else
3431                                         done = gifconf_list[i](dev, pos + total,
3432                                                                len - total);
3433                                 if (done < 0)
3434                                         return -EFAULT;
3435                                 total += done;
3436                         }
3437                 }
3438         }
3439
3440         /*
3441          *      All done.  Write the updated control block back to the caller.
3442          */
3443         ifc.ifc_len = total;
3444
3445         /*
3446          *      Both BSD and Solaris return 0 here, so we do too.
3447          */
3448         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
3449 }
3450
3451 #ifdef CONFIG_PROC_FS
3452 /*
3453  *      This is invoked by the /proc filesystem handler to display a device
3454  *      in detail.
3455  */
3456 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
3457         __acquires(RCU)
3458 {
3459         struct net *net = seq_file_net(seq);
3460         loff_t off;
3461         struct net_device *dev;
3462
3463         rcu_read_lock();
3464         if (!*pos)
3465                 return SEQ_START_TOKEN;
3466
3467         off = 1;
3468         for_each_netdev_rcu(net, dev)
3469                 if (off++ == *pos)
3470                         return dev;
3471
3472         return NULL;
3473 }
3474
3475 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3476 {
3477         struct net_device *dev = (v == SEQ_START_TOKEN) ?
3478                                   first_net_device(seq_file_net(seq)) :
3479                                   next_net_device((struct net_device *)v);
3480
3481         ++*pos;
3482         return rcu_dereference(dev);
3483 }
3484
3485 void dev_seq_stop(struct seq_file *seq, void *v)
3486         __releases(RCU)
3487 {
3488         rcu_read_unlock();
3489 }
3490
3491 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
3492 {
3493         const struct net_device_stats *stats = dev_get_stats(dev);
3494
3495         seq_printf(seq, "%6s: %7lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
3496                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
3497                    dev->name, stats->rx_bytes, stats->rx_packets,
3498                    stats->rx_errors,
3499                    stats->rx_dropped + stats->rx_missed_errors,
3500                    stats->rx_fifo_errors,
3501                    stats->rx_length_errors + stats->rx_over_errors +
3502                     stats->rx_crc_errors + stats->rx_frame_errors,
3503                    stats->rx_compressed, stats->multicast,
3504                    stats->tx_bytes, stats->tx_packets,
3505                    stats->tx_errors, stats->tx_dropped,
3506                    stats->tx_fifo_errors, stats->collisions,
3507                    stats->tx_carrier_errors +
3508                     stats->tx_aborted_errors +
3509                     stats->tx_window_errors +
3510                     stats->tx_heartbeat_errors,
3511                    stats->tx_compressed);
3512 }
3513
3514 /*
3515  *      Called from the PROCfs module. This now uses the new arbitrary sized
3516  *      /proc/net interface to create /proc/net/dev
3517  */
3518 static int dev_seq_show(struct seq_file *seq, void *v)
3519 {
3520         if (v == SEQ_START_TOKEN)
3521                 seq_puts(seq, "Inter-|   Receive                            "
3522                               "                    |  Transmit\n"
3523                               " face |bytes    packets errs drop fifo frame "
3524                               "compressed multicast|bytes    packets errs "
3525                               "drop fifo colls carrier compressed\n");
3526         else
3527                 dev_seq_printf_stats(seq, v);
3528         return 0;
3529 }
3530
3531 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3532 {
3533         struct netif_rx_stats *rc = NULL;
3534
3535         while (*pos < nr_cpu_ids)
3536                 if (cpu_online(*pos)) {
3537                         rc = &per_cpu(netdev_rx_stat, *pos);
3538                         break;
3539                 } else
3540                         ++*pos;
3541         return rc;
3542 }
3543
3544 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3545 {
3546         return softnet_get_online(pos);
3547 }
3548
3549 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3550 {
3551         ++*pos;
3552         return softnet_get_online(pos);
3553 }
3554
3555 static void softnet_seq_stop(struct seq_file *seq, void *v)
3556 {
3557 }
3558
3559 static int softnet_seq_show(struct seq_file *seq, void *v)
3560 {
3561         struct netif_rx_stats *s = v;
3562
3563         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3564                    s->total, s->dropped, s->time_squeeze, 0,
3565                    0, 0, 0, 0, /* was fastroute */
3566                    s->cpu_collision, s->received_rps);
3567         return 0;
3568 }
3569
3570 static const struct seq_operations dev_seq_ops = {
3571         .start = dev_seq_start,
3572         .next  = dev_seq_next,
3573         .stop  = dev_seq_stop,
3574         .show  = dev_seq_show,
3575 };
3576
3577 static int dev_seq_open(struct inode *inode, struct file *file)
3578 {
3579         return seq_open_net(inode, file, &dev_seq_ops,
3580                             sizeof(struct seq_net_private));
3581 }
3582
3583 static const struct file_operations dev_seq_fops = {
3584         .owner   = THIS_MODULE,
3585         .open    = dev_seq_open,
3586         .read    = seq_read,
3587         .llseek  = seq_lseek,
3588         .release = seq_release_net,
3589 };
3590
3591 static const struct seq_operations softnet_seq_ops = {
3592         .start = softnet_seq_start,
3593         .next  = softnet_seq_next,
3594         .stop  = softnet_seq_stop,
3595         .show  = softnet_seq_show,
3596 };
3597
3598 static int softnet_seq_open(struct inode *inode, struct file *file)
3599 {
3600         return seq_open(file, &softnet_seq_ops);
3601 }
3602
3603 static const struct file_operations softnet_seq_fops = {
3604         .owner   = THIS_MODULE,
3605         .open    = softnet_seq_open,
3606         .read    = seq_read,
3607         .llseek  = seq_lseek,
3608         .release = seq_release,
3609 };
3610
3611 static void *ptype_get_idx(loff_t pos)
3612 {
3613         struct packet_type *pt = NULL;
3614         loff_t i = 0;
3615         int t;
3616
3617         list_for_each_entry_rcu(pt, &ptype_all, list) {
3618                 if (i == pos)
3619                         return pt;
3620                 ++i;
3621         }
3622
3623         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3624                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3625                         if (i == pos)
3626                                 return pt;
3627                         ++i;
3628                 }
3629         }
3630         return NULL;
3631 }
3632
3633 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3634         __acquires(RCU)
3635 {
3636         rcu_read_lock();
3637         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3638 }
3639
3640 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3641 {
3642         struct packet_type *pt;
3643         struct list_head *nxt;
3644         int hash;
3645
3646         ++*pos;
3647         if (v == SEQ_START_TOKEN)
3648                 return ptype_get_idx(0);
3649
3650         pt = v;
3651         nxt = pt->list.next;
3652         if (pt->type == htons(ETH_P_ALL)) {
3653                 if (nxt != &ptype_all)
3654                         goto found;
3655                 hash = 0;
3656                 nxt = ptype_base[0].next;
3657         } else
3658                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3659
3660         while (nxt == &ptype_base[hash]) {
3661                 if (++hash >= PTYPE_HASH_SIZE)
3662                         return NULL;
3663                 nxt = ptype_base[hash].next;
3664         }
3665 found:
3666         return list_entry(nxt, struct packet_type, list);
3667 }
3668
3669 static void ptype_seq_stop(struct seq_file *seq, void *v)
3670         __releases(RCU)
3671 {
3672         rcu_read_unlock();
3673 }
3674
3675 static int ptype_seq_show(struct seq_file *seq, void *v)
3676 {
3677         struct packet_type *pt = v;
3678
3679         if (v == SEQ_START_TOKEN)
3680                 seq_puts(seq, "Type Device      Function\n");
3681         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3682                 if (pt->type == htons(ETH_P_ALL))
3683                         seq_puts(seq, "ALL ");
3684                 else
3685                         seq_printf(seq, "%04x", ntohs(pt->type));
3686
3687                 seq_printf(seq, " %-8s %pF\n",
3688                            pt->dev ? pt->dev->name : "", pt->func);
3689         }
3690
3691         return 0;
3692 }
3693
3694 static const struct seq_operations ptype_seq_ops = {
3695         .start = ptype_seq_start,
3696         .next  = ptype_seq_next,
3697         .stop  = ptype_seq_stop,
3698         .show  = ptype_seq_show,
3699 };
3700
3701 static int ptype_seq_open(struct inode *inode, struct file *file)
3702 {
3703         return seq_open_net(inode, file, &ptype_seq_ops,
3704                         sizeof(struct seq_net_private));
3705 }
3706
3707 static const struct file_operations ptype_seq_fops = {
3708         .owner   = THIS_MODULE,
3709         .open    = ptype_seq_open,
3710         .read    = seq_read,
3711         .llseek  = seq_lseek,
3712         .release = seq_release_net,
3713 };
3714
3715
3716 static int __net_init dev_proc_net_init(struct net *net)
3717 {
3718         int rc = -ENOMEM;
3719
3720         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3721                 goto out;
3722         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3723                 goto out_dev;
3724         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3725                 goto out_softnet;
3726
3727         if (wext_proc_init(net))
3728                 goto out_ptype;
3729         rc = 0;
3730 out:
3731         return rc;
3732 out_ptype:
3733         proc_net_remove(net, "ptype");
3734 out_softnet:
3735         proc_net_remove(net, "softnet_stat");
3736 out_dev:
3737         proc_net_remove(net, "dev");
3738         goto out;
3739 }
3740
3741 static void __net_exit dev_proc_net_exit(struct net *net)
3742 {
3743         wext_proc_exit(net);
3744
3745         proc_net_remove(net, "ptype");
3746         proc_net_remove(net, "softnet_stat");
3747         proc_net_remove(net, "dev");
3748 }
3749
3750 static struct pernet_operations __net_initdata dev_proc_ops = {
3751         .init = dev_proc_net_init,
3752         .exit = dev_proc_net_exit,
3753 };
3754
3755 static int __init dev_proc_init(void)
3756 {
3757         return register_pernet_subsys(&dev_proc_ops);
3758 }
3759 #else
3760 #define dev_proc_init() 0
3761 #endif  /* CONFIG_PROC_FS */
3762
3763
3764 /**
3765  *      netdev_set_master       -       set up master/slave pair
3766  *      @slave: slave device
3767  *      @master: new master device
3768  *
3769  *      Changes the master device of the slave. Pass %NULL to break the
3770  *      bonding. The caller must hold the RTNL semaphore. On a failure
3771  *      a negative errno code is returned. On success the reference counts
3772  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3773  *      function returns zero.
3774  */
3775 int netdev_set_master(struct net_device *slave, struct net_device *master)
3776 {
3777         struct net_device *old = slave->master;
3778
3779         ASSERT_RTNL();
3780
3781         if (master) {
3782                 if (old)
3783                         return -EBUSY;
3784                 dev_hold(master);
3785         }
3786
3787         slave->master = master;
3788
3789         if (old) {
3790                 synchronize_net();
3791                 dev_put(old);
3792         }
3793         if (master)
3794                 slave->flags |= IFF_SLAVE;
3795         else
3796                 slave->flags &= ~IFF_SLAVE;
3797
3798         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3799         return 0;
3800 }
3801 EXPORT_SYMBOL(netdev_set_master);
3802
3803 static void dev_change_rx_flags(struct net_device *dev, int flags)
3804 {
3805         const struct net_device_ops *ops = dev->netdev_ops;
3806
3807         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3808                 ops->ndo_change_rx_flags(dev, flags);
3809 }
3810
3811 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3812 {
3813         unsigned short old_flags = dev->flags;
3814         uid_t uid;
3815         gid_t gid;
3816
3817         ASSERT_RTNL();
3818
3819         dev->flags |= IFF_PROMISC;
3820         dev->promiscuity += inc;
3821         if (dev->promiscuity == 0) {
3822                 /*
3823                  * Avoid overflow.
3824                  * If inc causes overflow, untouch promisc and return error.
3825                  */
3826                 if (inc < 0)
3827                         dev->flags &= ~IFF_PROMISC;
3828                 else {
3829                         dev->promiscuity -= inc;
3830                         printk(KERN_WARNING "%s: promiscuity touches roof, "
3831                                 "set promiscuity failed, promiscuity feature "
3832                                 "of device might be broken.\n", dev->name);
3833                         return -EOVERFLOW;
3834                 }
3835         }
3836         if (dev->flags != old_flags) {
3837                 printk(KERN_INFO "device %s %s promiscuous mode\n",
3838                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3839                                                                "left");
3840                 if (audit_enabled) {
3841                         current_uid_gid(&uid, &gid);
3842                         audit_log(current->audit_context, GFP_ATOMIC,
3843                                 AUDIT_ANOM_PROMISCUOUS,
3844                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3845                                 dev->name, (dev->flags & IFF_PROMISC),
3846                                 (old_flags & IFF_PROMISC),
3847                                 audit_get_loginuid(current),
3848                                 uid, gid,
3849                                 audit_get_sessionid(current));
3850                 }
3851
3852                 dev_change_rx_flags(dev, IFF_PROMISC);
3853         }
3854         return 0;
3855 }
3856
3857 /**
3858  *      dev_set_promiscuity     - update promiscuity count on a device
3859  *      @dev: device
3860  *      @inc: modifier
3861  *
3862  *      Add or remove promiscuity from a device. While the count in the device
3863  *      remains above zero the interface remains promiscuous. Once it hits zero
3864  *      the device reverts back to normal filtering operation. A negative inc
3865  *      value is used to drop promiscuity on the device.
3866  *      Return 0 if successful or a negative errno code on error.
3867  */
3868 int dev_set_promiscuity(struct net_device *dev, int inc)
3869 {
3870         unsigned short old_flags = dev->flags;
3871         int err;
3872
3873         err = __dev_set_promiscuity(dev, inc);
3874         if (err < 0)
3875                 return err;
3876         if (dev->flags != old_flags)
3877                 dev_set_rx_mode(dev);
3878         return err;
3879 }
3880 EXPORT_SYMBOL(dev_set_promiscuity);
3881
3882 /**
3883  *      dev_set_allmulti        - update allmulti count on a device
3884  *      @dev: device
3885  *      @inc: modifier
3886  *
3887  *      Add or remove reception of all multicast frames to a device. While the
3888  *      count in the device remains above zero the interface remains listening
3889  *      to all interfaces. Once it hits zero the device reverts back to normal
3890  *      filtering operation. A negative @inc value is used to drop the counter
3891  *      when releasing a resource needing all multicasts.
3892  *      Return 0 if successful or a negative errno code on error.
3893  */
3894
3895 int dev_set_allmulti(struct net_device *dev, int inc)
3896 {
3897         unsigned short old_flags = dev->flags;
3898
3899         ASSERT_RTNL();
3900
3901         dev->flags |= IFF_ALLMULTI;
3902         dev->allmulti += inc;
3903         if (dev->allmulti == 0) {
3904                 /*
3905                  * Avoid overflow.
3906                  * If inc causes overflow, untouch allmulti and return error.
3907                  */
3908                 if (inc < 0)
3909                         dev->flags &= ~IFF_ALLMULTI;
3910                 else {
3911                         dev->allmulti -= inc;
3912                         printk(KERN_WARNING "%s: allmulti touches roof, "
3913                                 "set allmulti failed, allmulti feature of "
3914                                 "device might be broken.\n", dev->name);
3915                         return -EOVERFLOW;
3916                 }
3917         }
3918         if (dev->flags ^ old_flags) {
3919                 dev_change_rx_flags(dev, IFF_ALLMULTI);
3920                 dev_set_rx_mode(dev);
3921         }
3922         return 0;
3923 }
3924 EXPORT_SYMBOL(dev_set_allmulti);
3925
3926 /*
3927  *      Upload unicast and multicast address lists to device and
3928  *      configure RX filtering. When the device doesn't support unicast
3929  *      filtering it is put in promiscuous mode while unicast addresses
3930  *      are present.
3931  */
3932 void __dev_set_rx_mode(struct net_device *dev)
3933 {
3934         const struct net_device_ops *ops = dev->netdev_ops;
3935
3936         /* dev_open will call this function so the list will stay sane. */
3937         if (!(dev->flags&IFF_UP))
3938                 return;
3939
3940         if (!netif_device_present(dev))
3941                 return;
3942
3943         if (ops->ndo_set_rx_mode)
3944                 ops->ndo_set_rx_mode(dev);
3945         else {
3946                 /* Unicast addresses changes may only happen under the rtnl,
3947                  * therefore calling __dev_set_promiscuity here is safe.
3948                  */
3949                 if (!netdev_uc_empty(dev) && !dev->uc_promisc) {
3950                         __dev_set_promiscuity(dev, 1);
3951                         dev->uc_promisc = 1;
3952                 } else if (netdev_uc_empty(dev) && dev->uc_promisc) {
3953                         __dev_set_promiscuity(dev, -1);
3954                         dev->uc_promisc = 0;
3955                 }
3956
3957                 if (ops->ndo_set_multicast_list)
3958                         ops->ndo_set_multicast_list(dev);
3959         }
3960 }
3961
3962 void dev_set_rx_mode(struct net_device *dev)
3963 {
3964         netif_addr_lock_bh(dev);
3965         __dev_set_rx_mode(dev);
3966         netif_addr_unlock_bh(dev);
3967 }
3968
3969 /* hw addresses list handling functions */
3970
3971 static int __hw_addr_add(struct netdev_hw_addr_list *list, unsigned char *addr,
3972                          int addr_len, unsigned char addr_type)
3973 {
3974         struct netdev_hw_addr *ha;
3975         int alloc_size;
3976
3977         if (addr_len > MAX_ADDR_LEN)
3978                 return -EINVAL;
3979
3980         list_for_each_entry(ha, &list->list, list) {
3981                 if (!memcmp(ha->addr, addr, addr_len) &&
3982                     ha->type == addr_type) {
3983                         ha->refcount++;
3984                         return 0;
3985                 }
3986         }
3987
3988
3989         alloc_size = sizeof(*ha);
3990         if (alloc_size < L1_CACHE_BYTES)
3991                 alloc_size = L1_CACHE_BYTES;
3992         ha = kmalloc(alloc_size, GFP_ATOMIC);
3993         if (!ha)
3994                 return -ENOMEM;
3995         memcpy(ha->addr, addr, addr_len);
3996         ha->type = addr_type;
3997         ha->refcount = 1;
3998         ha->synced = false;
3999         list_add_tail_rcu(&ha->list, &list->list);
4000         list->count++;
4001         return 0;
4002 }
4003
4004 static void ha_rcu_free(struct rcu_head *head)
4005 {
4006         struct netdev_hw_addr *ha;
4007
4008         ha = container_of(head, struct netdev_hw_addr, rcu_head);
4009         kfree(ha);
4010 }
4011
4012 static int __hw_addr_del(struct netdev_hw_addr_list *list, unsigned char *addr,
4013                          int addr_len, unsigned char addr_type)
4014 {
4015         struct netdev_hw_addr *ha;
4016
4017         list_for_each_entry(ha, &list->list, list) {
4018                 if (!memcmp(ha->addr, addr, addr_len) &&
4019                     (ha->type == addr_type || !addr_type)) {
4020                         if (--ha->refcount)
4021                                 return 0;
4022                         list_del_rcu(&ha->list);
4023                         call_rcu(&ha->rcu_head, ha_rcu_free);
4024                         list->count--;
4025                         return 0;
4026                 }
4027         }
4028         return -ENOENT;
4029 }
4030
4031 static int __hw_addr_add_multiple(struct netdev_hw_addr_list *to_list,
4032                                   struct netdev_hw_addr_list *from_list,
4033                                   int addr_len,
4034                                   unsigned char addr_type)
4035 {
4036         int err;
4037         struct netdev_hw_addr *ha, *ha2;
4038         unsigned char type;
4039
4040         list_for_each_entry(ha, &from_list->list, list) {
4041                 type = addr_type ? addr_type : ha->type;
4042                 err = __hw_addr_add(to_list, ha->addr, addr_len, type);
4043                 if (err)
4044                         goto unroll;
4045         }
4046         return 0;
4047
4048 unroll:
4049         list_for_each_entry(ha2, &from_list->list, list) {
4050                 if (ha2 == ha)
4051                         break;
4052                 type = addr_type ? addr_type : ha2->type;
4053                 __hw_addr_del(to_list, ha2->addr, addr_len, type);
4054         }
4055         return err;
4056 }
4057
4058 static void __hw_addr_del_multiple(struct netdev_hw_addr_list *to_list,
4059                                    struct netdev_hw_addr_list *from_list,
4060                                    int addr_len,
4061                                    unsigned char addr_type)
4062 {
4063         struct netdev_hw_addr *ha;
4064         unsigned char type;
4065
4066         list_for_each_entry(ha, &from_list->list, list) {
4067                 type = addr_type ? addr_type : ha->type;
4068                 __hw_addr_del(to_list, ha->addr, addr_len, addr_type);
4069         }
4070 }
4071
4072 static int __hw_addr_sync(struct netdev_hw_addr_list *to_list,
4073                           struct netdev_hw_addr_list *from_list,
4074                           int addr_len)
4075 {
4076         int err = 0;
4077         struct netdev_hw_addr *ha, *tmp;
4078
4079         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
4080                 if (!ha->synced) {
4081                         err = __hw_addr_add(to_list, ha->addr,
4082                                             addr_len, ha->type);
4083                         if (err)
4084                                 break;
4085                         ha->synced = true;
4086                         ha->refcount++;
4087                 } else if (ha->refcount == 1) {
4088                         __hw_addr_del(to_list, ha->addr, addr_len, ha->type);
4089                         __hw_addr_del(from_list, ha->addr, addr_len, ha->type);
4090                 }
4091         }
4092         return err;
4093 }
4094
4095 static void __hw_addr_unsync(struct netdev_hw_addr_list *to_list,
4096                              struct netdev_hw_addr_list *from_list,
4097                              int addr_len)
4098 {
4099         struct netdev_hw_addr *ha, *tmp;
4100
4101         list_for_each_entry_safe(ha, tmp, &from_list->list, list) {
4102                 if (ha->synced) {
4103                         __hw_addr_del(to_list, ha->addr,
4104                                       addr_len, ha->type);
4105                         ha->synced = false;
4106                         __hw_addr_del(from_list, ha->addr,
4107                                       addr_len, ha->type);
4108                 }
4109         }
4110 }
4111
4112 static void __hw_addr_flush(struct netdev_hw_addr_list *list)
4113 {
4114         struct netdev_hw_addr *ha, *tmp;
4115
4116         list_for_each_entry_safe(ha, tmp, &list->list, list) {
4117                 list_del_rcu(&ha->list);
4118                 call_rcu(&ha->rcu_head, ha_rcu_free);
4119         }
4120         list->count = 0;
4121 }
4122
4123 static void __hw_addr_init(struct netdev_hw_addr_list *list)
4124 {
4125         INIT_LIST_HEAD(&list->list);
4126         list->count = 0;
4127 }
4128
4129 /* Device addresses handling functions */
4130
4131 static void dev_addr_flush(struct net_device *dev)
4132 {
4133         /* rtnl_mutex must be held here */
4134
4135         __hw_addr_flush(&dev->dev_addrs);
4136         dev->dev_addr = NULL;
4137 }
4138
4139 static int dev_addr_init(struct net_device *dev)
4140 {
4141         unsigned char addr[MAX_ADDR_LEN];
4142         struct netdev_hw_addr *ha;
4143         int err;
4144
4145         /* rtnl_mutex must be held here */
4146
4147         __hw_addr_init(&dev->dev_addrs);
4148         memset(addr, 0, sizeof(addr));
4149         err = __hw_addr_add(&dev->dev_addrs, addr, sizeof(addr),
4150                             NETDEV_HW_ADDR_T_LAN);
4151         if (!err) {
4152                 /*
4153                  * Get the first (previously created) address from the list
4154                  * and set dev_addr pointer to this location.
4155                  */
4156                 ha = list_first_entry(&dev->dev_addrs.list,
4157                                       struct netdev_hw_addr, list);
4158                 dev->dev_addr = ha->addr;
4159         }
4160         return err;
4161 }
4162
4163 /**
4164  *      dev_addr_add    - Add a device address
4165  *      @dev: device
4166  *      @addr: address to add
4167  *      @addr_type: address type
4168  *
4169  *      Add a device address to the device or increase the reference count if
4170  *      it already exists.
4171  *
4172  *      The caller must hold the rtnl_mutex.
4173  */
4174 int dev_addr_add(struct net_device *dev, unsigned char *addr,
4175                  unsigned char addr_type)
4176 {
4177         int err;
4178
4179         ASSERT_RTNL();
4180
4181         err = __hw_addr_add(&dev->dev_addrs, addr, dev->addr_len, addr_type);
4182         if (!err)
4183                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4184         return err;
4185 }
4186 EXPORT_SYMBOL(dev_addr_add);
4187
4188 /**
4189  *      dev_addr_del    - Release a device address.
4190  *      @dev: device
4191  *      @addr: address to delete
4192  *      @addr_type: address type
4193  *
4194  *      Release reference to a device address and remove it from the device
4195  *      if the reference count drops to zero.
4196  *
4197  *      The caller must hold the rtnl_mutex.
4198  */
4199 int dev_addr_del(struct net_device *dev, unsigned char *addr,
4200                  unsigned char addr_type)
4201 {
4202         int err;
4203         struct netdev_hw_addr *ha;
4204
4205         ASSERT_RTNL();
4206
4207         /*
4208          * We can not remove the first address from the list because
4209          * dev->dev_addr points to that.
4210          */
4211         ha = list_first_entry(&dev->dev_addrs.list,
4212                               struct netdev_hw_addr, list);
4213         if (ha->addr == dev->dev_addr && ha->refcount == 1)
4214                 return -ENOENT;
4215
4216         err = __hw_addr_del(&dev->dev_addrs, addr, dev->addr_len,
4217                             addr_type);
4218         if (!err)
4219                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4220         return err;
4221 }
4222 EXPORT_SYMBOL(dev_addr_del);
4223
4224 /**
4225  *      dev_addr_add_multiple   - Add device addresses from another device
4226  *      @to_dev: device to which addresses will be added
4227  *      @from_dev: device from which addresses will be added
4228  *      @addr_type: address type - 0 means type will be used from from_dev
4229  *
4230  *      Add device addresses of the one device to another.
4231  **
4232  *      The caller must hold the rtnl_mutex.
4233  */
4234 int dev_addr_add_multiple(struct net_device *to_dev,
4235                           struct net_device *from_dev,
4236                           unsigned char addr_type)
4237 {
4238         int err;
4239
4240         ASSERT_RTNL();
4241
4242         if (from_dev->addr_len != to_dev->addr_len)
4243                 return -EINVAL;
4244         err = __hw_addr_add_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
4245                                      to_dev->addr_len, addr_type);
4246         if (!err)
4247                 call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
4248         return err;
4249 }
4250 EXPORT_SYMBOL(dev_addr_add_multiple);
4251
4252 /**
4253  *      dev_addr_del_multiple   - Delete device addresses by another device
4254  *      @to_dev: device where the addresses will be deleted
4255  *      @from_dev: device by which addresses the addresses will be deleted
4256  *      @addr_type: address type - 0 means type will used from from_dev
4257  *
4258  *      Deletes addresses in to device by the list of addresses in from device.
4259  *
4260  *      The caller must hold the rtnl_mutex.
4261  */
4262 int dev_addr_del_multiple(struct net_device *to_dev,
4263                           struct net_device *from_dev,
4264                           unsigned char addr_type)
4265 {
4266         ASSERT_RTNL();
4267
4268         if (from_dev->addr_len != to_dev->addr_len)
4269                 return -EINVAL;
4270         __hw_addr_del_multiple(&to_dev->dev_addrs, &from_dev->dev_addrs,
4271                                to_dev->addr_len, addr_type);
4272         call_netdevice_notifiers(NETDEV_CHANGEADDR, to_dev);
4273         return 0;
4274 }
4275 EXPORT_SYMBOL(dev_addr_del_multiple);
4276
4277 /* multicast addresses handling functions */
4278
4279 int __dev_addr_delete(struct dev_addr_list **list, int *count,
4280                       void *addr, int alen, int glbl)
4281 {
4282         struct dev_addr_list *da;
4283
4284         for (; (da = *list) != NULL; list = &da->next) {
4285                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4286                     alen == da->da_addrlen) {
4287                         if (glbl) {
4288                                 int old_glbl = da->da_gusers;
4289                                 da->da_gusers = 0;
4290                                 if (old_glbl == 0)
4291                                         break;
4292                         }
4293                         if (--da->da_users)
4294                                 return 0;
4295
4296                         *list = da->next;
4297                         kfree(da);
4298                         (*count)--;
4299                         return 0;
4300                 }
4301         }
4302         return -ENOENT;
4303 }
4304
4305 int __dev_addr_add(struct dev_addr_list **list, int *count,
4306                    void *addr, int alen, int glbl)
4307 {
4308         struct dev_addr_list *da;
4309
4310         for (da = *list; da != NULL; da = da->next) {
4311                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
4312                     da->da_addrlen == alen) {
4313                         if (glbl) {
4314                                 int old_glbl = da->da_gusers;
4315                                 da->da_gusers = 1;
4316                                 if (old_glbl)
4317                                         return 0;
4318                         }
4319                         da->da_users++;
4320                         return 0;
4321                 }
4322         }
4323
4324         da = kzalloc(sizeof(*da), GFP_ATOMIC);
4325         if (da == NULL)
4326                 return -ENOMEM;
4327         memcpy(da->da_addr, addr, alen);
4328         da->da_addrlen = alen;
4329         da->da_users = 1;
4330         da->da_gusers = glbl ? 1 : 0;
4331         da->next = *list;
4332         *list = da;
4333         (*count)++;
4334         return 0;
4335 }
4336
4337 /**
4338  *      dev_unicast_delete      - Release secondary unicast address.
4339  *      @dev: device
4340  *      @addr: address to delete
4341  *
4342  *      Release reference to a secondary unicast address and remove it
4343  *      from the device if the reference count drops to zero.
4344  *
4345  *      The caller must hold the rtnl_mutex.
4346  */
4347 int dev_unicast_delete(struct net_device *dev, void *addr)
4348 {
4349         int err;
4350
4351         ASSERT_RTNL();
4352
4353         netif_addr_lock_bh(dev);
4354         err = __hw_addr_del(&dev->uc, addr, dev->addr_len,
4355                             NETDEV_HW_ADDR_T_UNICAST);
4356         if (!err)
4357                 __dev_set_rx_mode(dev);
4358         netif_addr_unlock_bh(dev);
4359         return err;
4360 }
4361 EXPORT_SYMBOL(dev_unicast_delete);
4362
4363 /**
4364  *      dev_unicast_add         - add a secondary unicast address
4365  *      @dev: device
4366  *      @addr: address to add
4367  *
4368  *      Add a secondary unicast address to the device or increase
4369  *      the reference count if it already exists.
4370  *
4371  *      The caller must hold the rtnl_mutex.
4372  */
4373 int dev_unicast_add(struct net_device *dev, void *addr)
4374 {
4375         int err;
4376
4377         ASSERT_RTNL();
4378
4379         netif_addr_lock_bh(dev);
4380         err = __hw_addr_add(&dev->uc, addr, dev->addr_len,
4381                             NETDEV_HW_ADDR_T_UNICAST);
4382         if (!err)
4383                 __dev_set_rx_mode(dev);
4384         netif_addr_unlock_bh(dev);
4385         return err;
4386 }
4387 EXPORT_SYMBOL(dev_unicast_add);
4388
4389 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
4390                     struct dev_addr_list **from, int *from_count)
4391 {
4392         struct dev_addr_list *da, *next;
4393         int err = 0;
4394
4395         da = *from;
4396         while (da != NULL) {
4397                 next = da->next;
4398                 if (!da->da_synced) {
4399                         err = __dev_addr_add(to, to_count,
4400                                              da->da_addr, da->da_addrlen, 0);
4401                         if (err < 0)
4402                                 break;
4403                         da->da_synced = 1;
4404                         da->da_users++;
4405                 } else if (da->da_users == 1) {
4406                         __dev_addr_delete(to, to_count,
4407                                           da->da_addr, da->da_addrlen, 0);
4408                         __dev_addr_delete(from, from_count,
4409                                           da->da_addr, da->da_addrlen, 0);
4410                 }
4411                 da = next;
4412         }
4413         return err;
4414 }
4415 EXPORT_SYMBOL_GPL(__dev_addr_sync);
4416
4417 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
4418                        struct dev_addr_list **from, int *from_count)
4419 {
4420         struct dev_addr_list *da, *next;
4421
4422         da = *from;
4423         while (da != NULL) {
4424                 next = da->next;
4425                 if (da->da_synced) {
4426                         __dev_addr_delete(to, to_count,
4427                                           da->da_addr, da->da_addrlen, 0);
4428                         da->da_synced = 0;
4429                         __dev_addr_delete(from, from_count,
4430                                           da->da_addr, da->da_addrlen, 0);
4431                 }
4432                 da = next;
4433         }
4434 }
4435 EXPORT_SYMBOL_GPL(__dev_addr_unsync);
4436
4437 /**
4438  *      dev_unicast_sync - Synchronize device's unicast list to another device
4439  *      @to: destination device
4440  *      @from: source device
4441  *
4442  *      Add newly added addresses to the destination device and release
4443  *      addresses that have no users left. The source device must be
4444  *      locked by netif_tx_lock_bh.
4445  *
4446  *      This function is intended to be called from the dev->set_rx_mode
4447  *      function of layered software devices.
4448  */
4449 int dev_unicast_sync(struct net_device *to, struct net_device *from)
4450 {
4451         int err = 0;
4452
4453         if (to->addr_len != from->addr_len)
4454                 return -EINVAL;
4455
4456         netif_addr_lock_bh(to);
4457         err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
4458         if (!err)
4459                 __dev_set_rx_mode(to);
4460         netif_addr_unlock_bh(to);
4461         return err;
4462 }
4463 EXPORT_SYMBOL(dev_unicast_sync);
4464
4465 /**
4466  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
4467  *      @to: destination device
4468  *      @from: source device
4469  *
4470  *      Remove all addresses that were added to the destination device by
4471  *      dev_unicast_sync(). This function is intended to be called from the
4472  *      dev->stop function of layered software devices.
4473  */
4474 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
4475 {
4476         if (to->addr_len != from->addr_len)
4477                 return;
4478
4479         netif_addr_lock_bh(from);
4480         netif_addr_lock(to);
4481         __hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
4482         __dev_set_rx_mode(to);
4483         netif_addr_unlock(to);
4484         netif_addr_unlock_bh(from);
4485 }
4486 EXPORT_SYMBOL(dev_unicast_unsync);
4487
4488 void dev_unicast_flush(struct net_device *dev)
4489 {
4490         netif_addr_lock_bh(dev);
4491         __hw_addr_flush(&dev->uc);
4492         netif_addr_unlock_bh(dev);
4493 }
4494 EXPORT_SYMBOL(dev_unicast_flush);
4495
4496 static void dev_unicast_init(struct net_device *dev)
4497 {
4498         __hw_addr_init(&dev->uc);
4499 }
4500
4501
4502 static void __dev_addr_discard(struct dev_addr_list **list)
4503 {
4504         struct dev_addr_list *tmp;
4505
4506         while (*list != NULL) {
4507                 tmp = *list;
4508                 *list = tmp->next;
4509                 if (tmp->da_users > tmp->da_gusers)
4510                         printk("__dev_addr_discard: address leakage! "
4511                                "da_users=%d\n", tmp->da_users);
4512                 kfree(tmp);
4513         }
4514 }
4515
4516 void dev_addr_discard(struct net_device *dev)
4517 {
4518         netif_addr_lock_bh(dev);
4519
4520         __dev_addr_discard(&dev->mc_list);
4521         netdev_mc_count(dev) = 0;
4522
4523         netif_addr_unlock_bh(dev);
4524 }
4525 EXPORT_SYMBOL(dev_addr_discard);
4526
4527 /**
4528  *      dev_get_flags - get flags reported to userspace
4529  *      @dev: device
4530  *
4531  *      Get the combination of flag bits exported through APIs to userspace.
4532  */
4533 unsigned dev_get_flags(const struct net_device *dev)
4534 {
4535         unsigned flags;
4536
4537         flags = (dev->flags & ~(IFF_PROMISC |
4538                                 IFF_ALLMULTI |
4539                                 IFF_RUNNING |
4540                                 IFF_LOWER_UP |
4541                                 IFF_DORMANT)) |
4542                 (dev->gflags & (IFF_PROMISC |
4543                                 IFF_ALLMULTI));
4544
4545         if (netif_running(dev)) {
4546                 if (netif_oper_up(dev))
4547                         flags |= IFF_RUNNING;
4548                 if (netif_carrier_ok(dev))
4549                         flags |= IFF_LOWER_UP;
4550                 if (netif_dormant(dev))
4551                         flags |= IFF_DORMANT;
4552         }
4553
4554         return flags;
4555 }
4556 EXPORT_SYMBOL(dev_get_flags);
4557
4558 int __dev_change_flags(struct net_device *dev, unsigned int flags)
4559 {
4560         int old_flags = dev->flags;
4561         int ret;
4562
4563         ASSERT_RTNL();
4564
4565         /*
4566          *      Set the flags on our device.
4567          */
4568
4569         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
4570                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
4571                                IFF_AUTOMEDIA)) |
4572                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
4573                                     IFF_ALLMULTI));
4574
4575         /*
4576          *      Load in the correct multicast list now the flags have changed.
4577          */
4578
4579         if ((old_flags ^ flags) & IFF_MULTICAST)
4580                 dev_change_rx_flags(dev, IFF_MULTICAST);
4581
4582         dev_set_rx_mode(dev);
4583
4584         /*
4585          *      Have we downed the interface. We handle IFF_UP ourselves
4586          *      according to user attempts to set it, rather than blindly
4587          *      setting it.
4588          */
4589
4590         ret = 0;
4591         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
4592                 ret = ((old_flags & IFF_UP) ? __dev_close : __dev_open)(dev);
4593
4594                 if (!ret)
4595                         dev_set_rx_mode(dev);
4596         }
4597
4598         if ((flags ^ dev->gflags) & IFF_PROMISC) {
4599                 int inc = (flags & IFF_PROMISC) ? 1 : -1;
4600
4601                 dev->gflags ^= IFF_PROMISC;
4602                 dev_set_promiscuity(dev, inc);
4603         }
4604
4605         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
4606            is important. Some (broken) drivers set IFF_PROMISC, when
4607            IFF_ALLMULTI is requested not asking us and not reporting.
4608          */
4609         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
4610                 int inc = (flags & IFF_ALLMULTI) ? 1 : -1;
4611
4612                 dev->gflags ^= IFF_ALLMULTI;
4613                 dev_set_allmulti(dev, inc);
4614         }
4615
4616         return ret;
4617 }
4618
4619 void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
4620 {
4621         unsigned int changes = dev->flags ^ old_flags;
4622
4623         if (changes & IFF_UP) {
4624                 if (dev->flags & IFF_UP)
4625                         call_netdevice_notifiers(NETDEV_UP, dev);
4626                 else
4627                         call_netdevice_notifiers(NETDEV_DOWN, dev);
4628         }
4629
4630         if (dev->flags & IFF_UP &&
4631             (changes & ~(IFF_UP | IFF_PROMISC | IFF_ALLMULTI | IFF_VOLATILE)))
4632                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
4633 }
4634
4635 /**
4636  *      dev_change_flags - change device settings
4637  *      @dev: device
4638  *      @flags: device state flags
4639  *
4640  *      Change settings on device based state flags. The flags are
4641  *      in the userspace exported format.
4642  */
4643 int dev_change_flags(struct net_device *dev, unsigned flags)
4644 {
4645         int ret, changes;
4646         int old_flags = dev->flags;
4647
4648         ret = __dev_change_flags(dev, flags);
4649         if (ret < 0)
4650                 return ret;
4651
4652         changes = old_flags ^ dev->flags;
4653         if (changes)
4654                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
4655
4656         __dev_notify_flags(dev, old_flags);
4657         return ret;
4658 }
4659 EXPORT_SYMBOL(dev_change_flags);
4660
4661 /**
4662  *      dev_set_mtu - Change maximum transfer unit
4663  *      @dev: device
4664  *      @new_mtu: new transfer unit
4665  *
4666  *      Change the maximum transfer size of the network device.
4667  */
4668 int dev_set_mtu(struct net_device *dev, int new_mtu)
4669 {
4670         const struct net_device_ops *ops = dev->netdev_ops;
4671         int err;
4672
4673         if (new_mtu == dev->mtu)
4674                 return 0;
4675
4676         /*      MTU must be positive.    */
4677         if (new_mtu < 0)
4678                 return -EINVAL;
4679
4680         if (!netif_device_present(dev))
4681                 return -ENODEV;
4682
4683         err = 0;
4684         if (ops->ndo_change_mtu)
4685                 err = ops->ndo_change_mtu(dev, new_mtu);
4686         else
4687                 dev->mtu = new_mtu;
4688
4689         if (!err && dev->flags & IFF_UP)
4690                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
4691         return err;
4692 }
4693 EXPORT_SYMBOL(dev_set_mtu);
4694
4695 /**
4696  *      dev_set_mac_address - Change Media Access Control Address
4697  *      @dev: device
4698  *      @sa: new address
4699  *
4700  *      Change the hardware (MAC) address of the device
4701  */
4702 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
4703 {
4704         const struct net_device_ops *ops = dev->netdev_ops;
4705         int err;
4706
4707         if (!ops->ndo_set_mac_address)
4708                 return -EOPNOTSUPP;
4709         if (sa->sa_family != dev->type)
4710                 return -EINVAL;
4711         if (!netif_device_present(dev))
4712                 return -ENODEV;
4713         err = ops->ndo_set_mac_address(dev, sa);
4714         if (!err)
4715                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4716         return err;
4717 }
4718 EXPORT_SYMBOL(dev_set_mac_address);
4719
4720 /*
4721  *      Perform the SIOCxIFxxx calls, inside rcu_read_lock()
4722  */
4723 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
4724 {
4725         int err;
4726         struct net_device *dev = dev_get_by_name_rcu(net, ifr->ifr_name);
4727
4728         if (!dev)
4729                 return -ENODEV;
4730
4731         switch (cmd) {
4732         case SIOCGIFFLAGS:      /* Get interface flags */
4733                 ifr->ifr_flags = (short) dev_get_flags(dev);
4734                 return 0;
4735
4736         case SIOCGIFMETRIC:     /* Get the metric on the interface
4737                                    (currently unused) */
4738                 ifr->ifr_metric = 0;
4739                 return 0;
4740
4741         case SIOCGIFMTU:        /* Get the MTU of a device */
4742                 ifr->ifr_mtu = dev->mtu;
4743                 return 0;
4744
4745         case SIOCGIFHWADDR:
4746                 if (!dev->addr_len)
4747                         memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
4748                 else
4749                         memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
4750                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4751                 ifr->ifr_hwaddr.sa_family = dev->type;
4752                 return 0;
4753
4754         case SIOCGIFSLAVE:
4755                 err = -EINVAL;
4756                 break;
4757
4758         case SIOCGIFMAP:
4759                 ifr->ifr_map.mem_start = dev->mem_start;
4760                 ifr->ifr_map.mem_end   = dev->mem_end;
4761                 ifr->ifr_map.base_addr = dev->base_addr;
4762                 ifr->ifr_map.irq       = dev->irq;
4763                 ifr->ifr_map.dma       = dev->dma;
4764                 ifr->ifr_map.port      = dev->if_port;
4765                 return 0;
4766
4767         case SIOCGIFINDEX:
4768                 ifr->ifr_ifindex = dev->ifindex;
4769                 return 0;
4770
4771         case SIOCGIFTXQLEN:
4772                 ifr->ifr_qlen = dev->tx_queue_len;
4773                 return 0;
4774
4775         default:
4776                 /* dev_ioctl() should ensure this case
4777                  * is never reached
4778                  */
4779                 WARN_ON(1);
4780                 err = -EINVAL;
4781                 break;
4782
4783         }
4784         return err;
4785 }
4786
4787 /*
4788  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
4789  */
4790 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
4791 {
4792         int err;
4793         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
4794         const struct net_device_ops *ops;
4795
4796         if (!dev)
4797                 return -ENODEV;
4798
4799         ops = dev->netdev_ops;
4800
4801         switch (cmd) {
4802         case SIOCSIFFLAGS:      /* Set interface flags */
4803                 return dev_change_flags(dev, ifr->ifr_flags);
4804
4805         case SIOCSIFMETRIC:     /* Set the metric on the interface
4806                                    (currently unused) */
4807                 return -EOPNOTSUPP;
4808
4809         case SIOCSIFMTU:        /* Set the MTU of a device */
4810                 return dev_set_mtu(dev, ifr->ifr_mtu);
4811
4812         case SIOCSIFHWADDR:
4813                 return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
4814
4815         case SIOCSIFHWBROADCAST:
4816                 if (ifr->ifr_hwaddr.sa_family != dev->type)
4817                         return -EINVAL;
4818                 memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
4819                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
4820                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
4821                 return 0;
4822
4823         case SIOCSIFMAP:
4824                 if (ops->ndo_set_config) {
4825                         if (!netif_device_present(dev))
4826                                 return -ENODEV;
4827                         return ops->ndo_set_config(dev, &ifr->ifr_map);
4828                 }
4829                 return -EOPNOTSUPP;
4830
4831         case SIOCADDMULTI:
4832                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4833                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4834                         return -EINVAL;
4835                 if (!netif_device_present(dev))
4836                         return -ENODEV;
4837                 return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
4838                                   dev->addr_len, 1);
4839
4840         case SIOCDELMULTI:
4841                 if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
4842                     ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
4843                         return -EINVAL;
4844                 if (!netif_device_present(dev))
4845                         return -ENODEV;
4846                 return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
4847                                      dev->addr_len, 1);
4848
4849         case SIOCSIFTXQLEN:
4850                 if (ifr->ifr_qlen < 0)
4851                         return -EINVAL;
4852                 dev->tx_queue_len = ifr->ifr_qlen;
4853                 return 0;
4854
4855         case SIOCSIFNAME:
4856                 ifr->ifr_newname[IFNAMSIZ-1] = '\0';
4857                 return dev_change_name(dev, ifr->ifr_newname);
4858
4859         /*
4860          *      Unknown or private ioctl
4861          */
4862         default:
4863                 if ((cmd >= SIOCDEVPRIVATE &&
4864                     cmd <= SIOCDEVPRIVATE + 15) ||
4865                     cmd == SIOCBONDENSLAVE ||
4866                     cmd == SIOCBONDRELEASE ||
4867                     cmd == SIOCBONDSETHWADDR ||
4868                     cmd == SIOCBONDSLAVEINFOQUERY ||
4869                     cmd == SIOCBONDINFOQUERY ||
4870                     cmd == SIOCBONDCHANGEACTIVE ||
4871                     cmd == SIOCGMIIPHY ||
4872                     cmd == SIOCGMIIREG ||
4873                     cmd == SIOCSMIIREG ||
4874                     cmd == SIOCBRADDIF ||
4875                     cmd == SIOCBRDELIF ||
4876                     cmd == SIOCSHWTSTAMP ||
4877                     cmd == SIOCWANDEV) {
4878                         err = -EOPNOTSUPP;
4879                         if (ops->ndo_do_ioctl) {
4880                                 if (netif_device_present(dev))
4881                                         err = ops->ndo_do_ioctl(dev, ifr, cmd);
4882                                 else
4883                                         err = -ENODEV;
4884                         }
4885                 } else
4886                         err = -EINVAL;
4887
4888         }
4889         return err;
4890 }
4891
4892 /*
4893  *      This function handles all "interface"-type I/O control requests. The actual
4894  *      'doing' part of this is dev_ifsioc above.
4895  */
4896
4897 /**
4898  *      dev_ioctl       -       network device ioctl
4899  *      @net: the applicable net namespace
4900  *      @cmd: command to issue
4901  *      @arg: pointer to a struct ifreq in user space
4902  *
4903  *      Issue ioctl functions to devices. This is normally called by the
4904  *      user space syscall interfaces but can sometimes be useful for
4905  *      other purposes. The return value is the return from the syscall if
4906  *      positive or a negative errno code on error.
4907  */
4908
4909 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4910 {
4911         struct ifreq ifr;
4912         int ret;
4913         char *colon;
4914
4915         /* One special case: SIOCGIFCONF takes ifconf argument
4916            and requires shared lock, because it sleeps writing
4917            to user space.
4918          */
4919
4920         if (cmd == SIOCGIFCONF) {
4921                 rtnl_lock();
4922                 ret = dev_ifconf(net, (char __user *) arg);
4923                 rtnl_unlock();
4924                 return ret;
4925         }
4926         if (cmd == SIOCGIFNAME)
4927                 return dev_ifname(net, (struct ifreq __user *)arg);
4928
4929         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4930                 return -EFAULT;
4931
4932         ifr.ifr_name[IFNAMSIZ-1] = 0;
4933
4934         colon = strchr(ifr.ifr_name, ':');
4935         if (colon)
4936                 *colon = 0;
4937
4938         /*
4939          *      See which interface the caller is talking about.
4940          */
4941
4942         switch (cmd) {
4943         /*
4944          *      These ioctl calls:
4945          *      - can be done by all.
4946          *      - atomic and do not require locking.
4947          *      - return a value
4948          */
4949         case SIOCGIFFLAGS:
4950         case SIOCGIFMETRIC:
4951         case SIOCGIFMTU:
4952         case SIOCGIFHWADDR:
4953         case SIOCGIFSLAVE:
4954         case SIOCGIFMAP:
4955         case SIOCGIFINDEX:
4956         case SIOCGIFTXQLEN:
4957                 dev_load(net, ifr.ifr_name);
4958                 rcu_read_lock();
4959                 ret = dev_ifsioc_locked(net, &ifr, cmd);
4960                 rcu_read_unlock();
4961                 if (!ret) {
4962                         if (colon)
4963                                 *colon = ':';
4964                         if (copy_to_user(arg, &ifr,
4965                                          sizeof(struct ifreq)))
4966                                 ret = -EFAULT;
4967                 }
4968                 return ret;
4969
4970         case SIOCETHTOOL:
4971                 dev_load(net, ifr.ifr_name);
4972                 rtnl_lock();
4973                 ret = dev_ethtool(net, &ifr);
4974                 rtnl_unlock();
4975                 if (!ret) {
4976                         if (colon)
4977                                 *colon = ':';
4978                         if (copy_to_user(arg, &ifr,
4979                                          sizeof(struct ifreq)))
4980                                 ret = -EFAULT;
4981                 }
4982                 return ret;
4983
4984         /*
4985          *      These ioctl calls:
4986          *      - require superuser power.
4987          *      - require strict serialization.
4988          *      - return a value
4989          */
4990         case SIOCGMIIPHY:
4991         case SIOCGMIIREG:
4992         case SIOCSIFNAME:
4993                 if (!capable(CAP_NET_ADMIN))
4994                         return -EPERM;
4995                 dev_load(net, ifr.ifr_name);
4996                 rtnl_lock();
4997                 ret = dev_ifsioc(net, &ifr, cmd);
4998                 rtnl_unlock();
4999                 if (!ret) {
5000                         if (colon)
5001                                 *colon = ':';
5002                         if (copy_to_user(arg, &ifr,
5003                                          sizeof(struct ifreq)))
5004                                 ret = -EFAULT;
5005                 }
5006                 return ret;
5007
5008         /*
5009          *      These ioctl calls:
5010          *      - require superuser power.
5011          *      - require strict serialization.
5012          *      - do not return a value
5013          */
5014         case SIOCSIFFLAGS:
5015         case SIOCSIFMETRIC:
5016         case SIOCSIFMTU:
5017         case SIOCSIFMAP:
5018         case SIOCSIFHWADDR:
5019         case SIOCSIFSLAVE:
5020         case SIOCADDMULTI:
5021         case SIOCDELMULTI:
5022         case SIOCSIFHWBROADCAST:
5023         case SIOCSIFTXQLEN:
5024         case SIOCSMIIREG:
5025         case SIOCBONDENSLAVE:
5026         case SIOCBONDRELEASE:
5027         case SIOCBONDSETHWADDR:
5028         case SIOCBONDCHANGEACTIVE:
5029         case SIOCBRADDIF:
5030         case SIOCBRDELIF:
5031         case SIOCSHWTSTAMP:
5032                 if (!capable(CAP_NET_ADMIN))
5033                         return -EPERM;
5034                 /* fall through */
5035         case SIOCBONDSLAVEINFOQUERY:
5036         case SIOCBONDINFOQUERY:
5037                 dev_load(net, ifr.ifr_name);
5038                 rtnl_lock();
5039                 ret = dev_ifsioc(net, &ifr, cmd);
5040                 rtnl_unlock();
5041                 return ret;
5042
5043         case SIOCGIFMEM:
5044                 /* Get the per device memory space. We can add this but
5045                  * currently do not support it */
5046         case SIOCSIFMEM:
5047                 /* Set the per device memory buffer space.
5048                  * Not applicable in our case */
5049         case SIOCSIFLINK:
5050                 return -EINVAL;
5051
5052         /*
5053          *      Unknown or private ioctl.
5054          */
5055         default:
5056                 if (cmd == SIOCWANDEV ||
5057                     (cmd >= SIOCDEVPRIVATE &&
5058                      cmd <= SIOCDEVPRIVATE + 15)) {
5059                         dev_load(net, ifr.ifr_name);
5060                         rtnl_lock();
5061                         ret = dev_ifsioc(net, &ifr, cmd);
5062                         rtnl_unlock();
5063                         if (!ret && copy_to_user(arg, &ifr,
5064                                                  sizeof(struct ifreq)))
5065                                 ret = -EFAULT;
5066                         return ret;
5067                 }
5068                 /* Take care of Wireless Extensions */
5069                 if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
5070                         return wext_handle_ioctl(net, &ifr, cmd, arg);
5071                 return -EINVAL;
5072         }
5073 }
5074
5075
5076 /**
5077  *      dev_new_index   -       allocate an ifindex
5078  *      @net: the applicable net namespace
5079  *
5080  *      Returns a suitable unique value for a new device interface
5081  *      number.  The caller must hold the rtnl semaphore or the
5082  *      dev_base_lock to be sure it remains unique.
5083  */
5084 static int dev_new_index(struct net *net)
5085 {
5086         static int ifindex;
5087         for (;;) {
5088                 if (++ifindex <= 0)
5089                         ifindex = 1;
5090                 if (!__dev_get_by_index(net, ifindex))
5091                         return ifindex;
5092         }
5093 }
5094
5095 /* Delayed registration/unregisteration */
5096 static LIST_HEAD(net_todo_list);
5097
5098 static void net_set_todo(struct net_device *dev)
5099 {
5100         list_add_tail(&dev->todo_list, &net_todo_list);
5101 }
5102
5103 static void rollback_registered_many(struct list_head *head)
5104 {
5105         struct net_device *dev, *tmp;
5106
5107         BUG_ON(dev_boot_phase);
5108         ASSERT_RTNL();
5109
5110         list_for_each_entry_safe(dev, tmp, head, unreg_list) {
5111                 /* Some devices call without registering
5112                  * for initialization unwind. Remove those
5113                  * devices and proceed with the remaining.
5114                  */
5115                 if (dev->reg_state == NETREG_UNINITIALIZED) {
5116                         pr_debug("unregister_netdevice: device %s/%p never "
5117                                  "was registered\n", dev->name, dev);
5118
5119                         WARN_ON(1);
5120                         list_del(&dev->unreg_list);
5121                         continue;
5122                 }
5123
5124                 BUG_ON(dev->reg_state != NETREG_REGISTERED);
5125
5126                 /* If device is running, close it first. */
5127                 dev_close(dev);
5128
5129                 /* And unlink it from device chain. */
5130                 unlist_netdevice(dev);
5131
5132                 dev->reg_state = NETREG_UNREGISTERING;
5133         }
5134
5135         synchronize_net();
5136
5137         list_for_each_entry(dev, head, unreg_list) {
5138                 /* Shutdown queueing discipline. */
5139                 dev_shutdown(dev);
5140
5141
5142                 /* Notify protocols, that we are about to destroy
5143                    this device. They should clean all the things.
5144                 */
5145                 call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5146
5147                 if (!dev->rtnl_link_ops ||
5148                     dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5149                         rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
5150
5151                 /*
5152                  *      Flush the unicast and multicast chains
5153                  */
5154                 dev_unicast_flush(dev);
5155                 dev_addr_discard(dev);
5156
5157                 if (dev->netdev_ops->ndo_uninit)
5158                         dev->netdev_ops->ndo_uninit(dev);
5159
5160                 /* Notifier chain MUST detach us from master device. */
5161                 WARN_ON(dev->master);
5162
5163                 /* Remove entries from kobject tree */
5164                 netdev_unregister_kobject(dev);
5165         }
5166
5167         /* Process any work delayed until the end of the batch */
5168         dev = list_first_entry(head, struct net_device, unreg_list);
5169         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5170
5171         synchronize_net();
5172
5173         list_for_each_entry(dev, head, unreg_list)
5174                 dev_put(dev);
5175 }
5176
5177 static void rollback_registered(struct net_device *dev)
5178 {
5179         LIST_HEAD(single);
5180
5181         list_add(&dev->unreg_list, &single);
5182         rollback_registered_many(&single);
5183 }
5184
5185 static void __netdev_init_queue_locks_one(struct net_device *dev,
5186                                           struct netdev_queue *dev_queue,
5187                                           void *_unused)
5188 {
5189         spin_lock_init(&dev_queue->_xmit_lock);
5190         netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
5191         dev_queue->xmit_lock_owner = -1;
5192 }
5193
5194 static void netdev_init_queue_locks(struct net_device *dev)
5195 {
5196         netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
5197         __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
5198 }
5199
5200 unsigned long netdev_fix_features(unsigned long features, const char *name)
5201 {
5202         /* Fix illegal SG+CSUM combinations. */
5203         if ((features & NETIF_F_SG) &&
5204             !(features & NETIF_F_ALL_CSUM)) {
5205                 if (name)
5206                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
5207                                "checksum feature.\n", name);
5208                 features &= ~NETIF_F_SG;
5209         }
5210
5211         /* TSO requires that SG is present as well. */
5212         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
5213                 if (name)
5214                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
5215                                "SG feature.\n", name);
5216                 features &= ~NETIF_F_TSO;
5217         }
5218
5219         if (features & NETIF_F_UFO) {
5220                 if (!(features & NETIF_F_GEN_CSUM)) {
5221                         if (name)
5222                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5223                                        "since no NETIF_F_HW_CSUM feature.\n",
5224                                        name);
5225                         features &= ~NETIF_F_UFO;
5226                 }
5227
5228                 if (!(features & NETIF_F_SG)) {
5229                         if (name)
5230                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
5231                                        "since no NETIF_F_SG feature.\n", name);
5232                         features &= ~NETIF_F_UFO;
5233                 }
5234         }
5235
5236         return features;
5237 }
5238 EXPORT_SYMBOL(netdev_fix_features);
5239
5240 /**
5241  *      netif_stacked_transfer_operstate -      transfer operstate
5242  *      @rootdev: the root or lower level device to transfer state from
5243  *      @dev: the device to transfer operstate to
5244  *
5245  *      Transfer operational state from root to device. This is normally
5246  *      called when a stacking relationship exists between the root
5247  *      device and the device(a leaf device).
5248  */
5249 void netif_stacked_transfer_operstate(const struct net_device *rootdev,
5250                                         struct net_device *dev)
5251 {
5252         if (rootdev->operstate == IF_OPER_DORMANT)
5253                 netif_dormant_on(dev);
5254         else
5255                 netif_dormant_off(dev);
5256
5257         if (netif_carrier_ok(rootdev)) {
5258                 if (!netif_carrier_ok(dev))
5259                         netif_carrier_on(dev);
5260         } else {
5261                 if (netif_carrier_ok(dev))
5262                         netif_carrier_off(dev);
5263         }
5264 }
5265 EXPORT_SYMBOL(netif_stacked_transfer_operstate);
5266
5267 /**
5268  *      register_netdevice      - register a network device
5269  *      @dev: device to register
5270  *
5271  *      Take a completed network device structure and add it to the kernel
5272  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5273  *      chain. 0 is returned on success. A negative errno code is returned
5274  *      on a failure to set up the device, or if the name is a duplicate.
5275  *
5276  *      Callers must hold the rtnl semaphore. You may want
5277  *      register_netdev() instead of this.
5278  *
5279  *      BUGS:
5280  *      The locking appears insufficient to guarantee two parallel registers
5281  *      will not get the same name.
5282  */
5283
5284 int register_netdevice(struct net_device *dev)
5285 {
5286         int ret;
5287         struct net *net = dev_net(dev);
5288
5289         BUG_ON(dev_boot_phase);
5290         ASSERT_RTNL();
5291
5292         might_sleep();
5293
5294         /* When net_device's are persistent, this will be fatal. */
5295         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
5296         BUG_ON(!net);
5297
5298         spin_lock_init(&dev->addr_list_lock);
5299         netdev_set_addr_lockdep_class(dev);
5300         netdev_init_queue_locks(dev);
5301
5302         dev->iflink = -1;
5303
5304 #ifdef CONFIG_RPS
5305         if (!dev->num_rx_queues) {
5306                 /*
5307                  * Allocate a single RX queue if driver never called
5308                  * alloc_netdev_mq
5309                  */
5310
5311                 dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL);
5312                 if (!dev->_rx) {
5313                         ret = -ENOMEM;
5314                         goto out;
5315                 }
5316
5317                 dev->_rx->first = dev->_rx;
5318                 atomic_set(&dev->_rx->count, 1);
5319                 dev->num_rx_queues = 1;
5320         }
5321 #endif
5322         /* Init, if this function is available */
5323         if (dev->netdev_ops->ndo_init) {
5324                 ret = dev->netdev_ops->ndo_init(dev);
5325                 if (ret) {
5326                         if (ret > 0)
5327                                 ret = -EIO;
5328                         goto out;
5329                 }
5330         }
5331
5332         ret = dev_get_valid_name(net, dev->name, dev->name, 0);
5333         if (ret)
5334                 goto err_uninit;
5335
5336         dev->ifindex = dev_new_index(net);
5337         if (dev->iflink == -1)
5338                 dev->iflink = dev->ifindex;
5339
5340         /* Fix illegal checksum combinations */
5341         if ((dev->features & NETIF_F_HW_CSUM) &&
5342             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5343                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
5344                        dev->name);
5345                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
5346         }
5347
5348         if ((dev->features & NETIF_F_NO_CSUM) &&
5349             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
5350                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
5351                        dev->name);
5352                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
5353         }
5354
5355         dev->features = netdev_fix_features(dev->features, dev->name);
5356
5357         /* Enable software GSO if SG is supported. */
5358         if (dev->features & NETIF_F_SG)
5359                 dev->features |= NETIF_F_GSO;
5360
5361         netdev_initialize_kobject(dev);
5362
5363         ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev);
5364         ret = notifier_to_errno(ret);
5365         if (ret)
5366                 goto err_uninit;
5367
5368         ret = netdev_register_kobject(dev);
5369         if (ret)
5370                 goto err_uninit;
5371         dev->reg_state = NETREG_REGISTERED;
5372
5373         /*
5374          *      Default initial state at registry is that the
5375          *      device is present.
5376          */
5377
5378         set_bit(__LINK_STATE_PRESENT, &dev->state);
5379
5380         dev_init_scheduler(dev);
5381         dev_hold(dev);
5382         list_netdevice(dev);
5383
5384         /* Notify protocols, that a new device appeared. */
5385         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
5386         ret = notifier_to_errno(ret);
5387         if (ret) {
5388                 rollback_registered(dev);
5389                 dev->reg_state = NETREG_UNREGISTERED;
5390         }
5391         /*
5392          *      Prevent userspace races by waiting until the network
5393          *      device is fully setup before sending notifications.
5394          */
5395         if (!dev->rtnl_link_ops ||
5396             dev->rtnl_link_state == RTNL_LINK_INITIALIZED)
5397                 rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
5398
5399 out:
5400         return ret;
5401
5402 err_uninit:
5403         if (dev->netdev_ops->ndo_uninit)
5404                 dev->netdev_ops->ndo_uninit(dev);
5405         goto out;
5406 }
5407 EXPORT_SYMBOL(register_netdevice);
5408
5409 /**
5410  *      init_dummy_netdev       - init a dummy network device for NAPI
5411  *      @dev: device to init
5412  *
5413  *      This takes a network device structure and initialize the minimum
5414  *      amount of fields so it can be used to schedule NAPI polls without
5415  *      registering a full blown interface. This is to be used by drivers
5416  *      that need to tie several hardware interfaces to a single NAPI
5417  *      poll scheduler due to HW limitations.
5418  */
5419 int init_dummy_netdev(struct net_device *dev)
5420 {
5421         /* Clear everything. Note we don't initialize spinlocks
5422          * are they aren't supposed to be taken by any of the
5423          * NAPI code and this dummy netdev is supposed to be
5424          * only ever used for NAPI polls
5425          */
5426         memset(dev, 0, sizeof(struct net_device));
5427
5428         /* make sure we BUG if trying to hit standard
5429          * register/unregister code path
5430          */
5431         dev->reg_state = NETREG_DUMMY;
5432
5433         /* initialize the ref count */
5434         atomic_set(&dev->refcnt, 1);
5435
5436         /* NAPI wants this */
5437         INIT_LIST_HEAD(&dev->napi_list);
5438
5439         /* a dummy interface is started by default */
5440         set_bit(__LINK_STATE_PRESENT, &dev->state);
5441         set_bit(__LINK_STATE_START, &dev->state);
5442
5443         return 0;
5444 }
5445 EXPORT_SYMBOL_GPL(init_dummy_netdev);
5446
5447
5448 /**
5449  *      register_netdev - register a network device
5450  *      @dev: device to register
5451  *
5452  *      Take a completed network device structure and add it to the kernel
5453  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
5454  *      chain. 0 is returned on success. A negative errno code is returned
5455  *      on a failure to set up the device, or if the name is a duplicate.
5456  *
5457  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
5458  *      and expands the device name if you passed a format string to
5459  *      alloc_netdev.
5460  */
5461 int register_netdev(struct net_device *dev)
5462 {
5463         int err;
5464
5465         rtnl_lock();
5466
5467         /*
5468          * If the name is a format string the caller wants us to do a
5469          * name allocation.
5470          */
5471         if (strchr(dev->name, '%')) {
5472                 err = dev_alloc_name(dev, dev->name);
5473                 if (err < 0)
5474                         goto out;
5475         }
5476
5477         err = register_netdevice(dev);
5478 out:
5479         rtnl_unlock();
5480         return err;
5481 }
5482 EXPORT_SYMBOL(register_netdev);
5483
5484 /*
5485  * netdev_wait_allrefs - wait until all references are gone.
5486  *
5487  * This is called when unregistering network devices.
5488  *
5489  * Any protocol or device that holds a reference should register
5490  * for netdevice notification, and cleanup and put back the
5491  * reference if they receive an UNREGISTER event.
5492  * We can get stuck here if buggy protocols don't correctly
5493  * call dev_put.
5494  */
5495 static void netdev_wait_allrefs(struct net_device *dev)
5496 {
5497         unsigned long rebroadcast_time, warning_time;
5498
5499         linkwatch_forget_dev(dev);
5500
5501         rebroadcast_time = warning_time = jiffies;
5502         while (atomic_read(&dev->refcnt) != 0) {
5503                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
5504                         rtnl_lock();
5505
5506                         /* Rebroadcast unregister notification */
5507                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5508                         /* don't resend NETDEV_UNREGISTER_BATCH, _BATCH users
5509                          * should have already handle it the first time */
5510
5511                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
5512                                      &dev->state)) {
5513                                 /* We must not have linkwatch events
5514                                  * pending on unregister. If this
5515                                  * happens, we simply run the queue
5516                                  * unscheduled, resulting in a noop
5517                                  * for this device.
5518                                  */
5519                                 linkwatch_run_queue();
5520                         }
5521
5522                         __rtnl_unlock();
5523
5524                         rebroadcast_time = jiffies;
5525                 }
5526
5527                 msleep(250);
5528
5529                 if (time_after(jiffies, warning_time + 10 * HZ)) {
5530                         printk(KERN_EMERG "unregister_netdevice: "
5531                                "waiting for %s to become free. Usage "
5532                                "count = %d\n",
5533                                dev->name, atomic_read(&dev->refcnt));
5534                         warning_time = jiffies;
5535                 }
5536         }
5537 }
5538
5539 /* The sequence is:
5540  *
5541  *      rtnl_lock();
5542  *      ...
5543  *      register_netdevice(x1);
5544  *      register_netdevice(x2);
5545  *      ...
5546  *      unregister_netdevice(y1);
5547  *      unregister_netdevice(y2);
5548  *      ...
5549  *      rtnl_unlock();
5550  *      free_netdev(y1);
5551  *      free_netdev(y2);
5552  *
5553  * We are invoked by rtnl_unlock().
5554  * This allows us to deal with problems:
5555  * 1) We can delete sysfs objects which invoke hotplug
5556  *    without deadlocking with linkwatch via keventd.
5557  * 2) Since we run with the RTNL semaphore not held, we can sleep
5558  *    safely in order to wait for the netdev refcnt to drop to zero.
5559  *
5560  * We must not return until all unregister events added during
5561  * the interval the lock was held have been completed.
5562  */
5563 void netdev_run_todo(void)
5564 {
5565         struct list_head list;
5566
5567         /* Snapshot list, allow later requests */
5568         list_replace_init(&net_todo_list, &list);
5569
5570         __rtnl_unlock();
5571
5572         while (!list_empty(&list)) {
5573                 struct net_device *dev
5574                         = list_first_entry(&list, struct net_device, todo_list);
5575                 list_del(&dev->todo_list);
5576
5577                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
5578                         printk(KERN_ERR "network todo '%s' but state %d\n",
5579                                dev->name, dev->reg_state);
5580                         dump_stack();
5581                         continue;
5582                 }
5583
5584                 dev->reg_state = NETREG_UNREGISTERED;
5585
5586                 on_each_cpu(flush_backlog, dev, 1);
5587
5588                 netdev_wait_allrefs(dev);
5589
5590                 /* paranoia */
5591                 BUG_ON(atomic_read(&dev->refcnt));
5592                 WARN_ON(dev->ip_ptr);
5593                 WARN_ON(dev->ip6_ptr);
5594                 WARN_ON(dev->dn_ptr);
5595
5596                 if (dev->destructor)
5597                         dev->destructor(dev);
5598
5599                 /* Free network device */
5600                 kobject_put(&dev->dev.kobj);
5601         }
5602 }
5603
5604 /**
5605  *      dev_txq_stats_fold - fold tx_queues stats
5606  *      @dev: device to get statistics from
5607  *      @stats: struct net_device_stats to hold results
5608  */
5609 void dev_txq_stats_fold(const struct net_device *dev,
5610                         struct net_device_stats *stats)
5611 {
5612         unsigned long tx_bytes = 0, tx_packets = 0, tx_dropped = 0;
5613         unsigned int i;
5614         struct netdev_queue *txq;
5615
5616         for (i = 0; i < dev->num_tx_queues; i++) {
5617                 txq = netdev_get_tx_queue(dev, i);
5618                 tx_bytes   += txq->tx_bytes;
5619                 tx_packets += txq->tx_packets;
5620                 tx_dropped += txq->tx_dropped;
5621         }
5622         if (tx_bytes || tx_packets || tx_dropped) {
5623                 stats->tx_bytes   = tx_bytes;
5624                 stats->tx_packets = tx_packets;
5625                 stats->tx_dropped = tx_dropped;
5626         }
5627 }
5628 EXPORT_SYMBOL(dev_txq_stats_fold);
5629
5630 /**
5631  *      dev_get_stats   - get network device statistics
5632  *      @dev: device to get statistics from
5633  *
5634  *      Get network statistics from device. The device driver may provide
5635  *      its own method by setting dev->netdev_ops->get_stats; otherwise
5636  *      the internal statistics structure is used.
5637  */
5638 const struct net_device_stats *dev_get_stats(struct net_device *dev)
5639 {
5640         const struct net_device_ops *ops = dev->netdev_ops;
5641
5642         if (ops->ndo_get_stats)
5643                 return ops->ndo_get_stats(dev);
5644
5645         dev_txq_stats_fold(dev, &dev->stats);
5646         return &dev->stats;
5647 }
5648 EXPORT_SYMBOL(dev_get_stats);
5649
5650 static void netdev_init_one_queue(struct net_device *dev,
5651                                   struct netdev_queue *queue,
5652                                   void *_unused)
5653 {
5654         queue->dev = dev;
5655 }
5656
5657 static void netdev_init_queues(struct net_device *dev)
5658 {
5659         netdev_init_one_queue(dev, &dev->rx_queue, NULL);
5660         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
5661         spin_lock_init(&dev->tx_global_lock);
5662 }
5663
5664 /**
5665  *      alloc_netdev_mq - allocate network device
5666  *      @sizeof_priv:   size of private data to allocate space for
5667  *      @name:          device name format string
5668  *      @setup:         callback to initialize device
5669  *      @queue_count:   the number of subqueues to allocate
5670  *
5671  *      Allocates a struct net_device with private data area for driver use
5672  *      and performs basic initialization.  Also allocates subquue structs
5673  *      for each queue on the device at the end of the netdevice.
5674  */
5675 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
5676                 void (*setup)(struct net_device *), unsigned int queue_count)
5677 {
5678         struct netdev_queue *tx;
5679         struct net_device *dev;
5680         size_t alloc_size;
5681         struct net_device *p;
5682 #ifdef CONFIG_RPS
5683         struct netdev_rx_queue *rx;
5684         int i;
5685 #endif
5686
5687         BUG_ON(strlen(name) >= sizeof(dev->name));
5688
5689         alloc_size = sizeof(struct net_device);
5690         if (sizeof_priv) {
5691                 /* ensure 32-byte alignment of private area */
5692                 alloc_size = ALIGN(alloc_size, NETDEV_ALIGN);
5693                 alloc_size += sizeof_priv;
5694         }
5695         /* ensure 32-byte alignment of whole construct */
5696         alloc_size += NETDEV_ALIGN - 1;
5697
5698         p = kzalloc(alloc_size, GFP_KERNEL);
5699         if (!p) {
5700                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
5701                 return NULL;
5702         }
5703
5704         tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
5705         if (!tx) {
5706                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5707                        "tx qdiscs.\n");
5708                 goto free_p;
5709         }
5710
5711 #ifdef CONFIG_RPS
5712         rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL);
5713         if (!rx) {
5714                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
5715                        "rx queues.\n");
5716                 goto free_tx;
5717         }
5718
5719         atomic_set(&rx->count, queue_count);
5720
5721         /*
5722          * Set a pointer to first element in the array which holds the
5723          * reference count.
5724          */
5725         for (i = 0; i < queue_count; i++)
5726                 rx[i].first = rx;
5727 #endif
5728
5729         dev = PTR_ALIGN(p, NETDEV_ALIGN);
5730         dev->padded = (char *)dev - (char *)p;
5731
5732         if (dev_addr_init(dev))
5733                 goto free_rx;
5734
5735         dev_unicast_init(dev);
5736
5737         dev_net_set(dev, &init_net);
5738
5739         dev->_tx = tx;
5740         dev->num_tx_queues = queue_count;
5741         dev->real_num_tx_queues = queue_count;
5742
5743 #ifdef CONFIG_RPS
5744         dev->_rx = rx;
5745         dev->num_rx_queues = queue_count;
5746 #endif
5747
5748         dev->gso_max_size = GSO_MAX_SIZE;
5749
5750         netdev_init_queues(dev);
5751
5752         INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list);
5753         dev->ethtool_ntuple_list.count = 0;
5754         INIT_LIST_HEAD(&dev->napi_list);
5755         INIT_LIST_HEAD(&dev->unreg_list);
5756         INIT_LIST_HEAD(&dev->link_watch_list);
5757         dev->priv_flags = IFF_XMIT_DST_RELEASE;
5758         setup(dev);
5759         strcpy(dev->name, name);
5760         return dev;
5761
5762 free_rx:
5763 #ifdef CONFIG_RPS
5764         kfree(rx);
5765 free_tx:
5766 #endif
5767         kfree(tx);
5768 free_p:
5769         kfree(p);
5770         return NULL;
5771 }
5772 EXPORT_SYMBOL(alloc_netdev_mq);
5773
5774 /**
5775  *      free_netdev - free network device
5776  *      @dev: device
5777  *
5778  *      This function does the last stage of destroying an allocated device
5779  *      interface. The reference to the device object is released.
5780  *      If this is the last reference then it will be freed.
5781  */
5782 void free_netdev(struct net_device *dev)
5783 {
5784         struct napi_struct *p, *n;
5785
5786         release_net(dev_net(dev));
5787
5788         kfree(dev->_tx);
5789
5790         /* Flush device addresses */
5791         dev_addr_flush(dev);
5792
5793         /* Clear ethtool n-tuple list */
5794         ethtool_ntuple_flush(dev);
5795
5796         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
5797                 netif_napi_del(p);
5798
5799         /*  Compatibility with error handling in drivers */
5800         if (dev->reg_state == NETREG_UNINITIALIZED) {
5801                 kfree((char *)dev - dev->padded);
5802                 return;
5803         }
5804
5805         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
5806         dev->reg_state = NETREG_RELEASED;
5807
5808         /* will free via device release */
5809         put_device(&dev->dev);
5810 }
5811 EXPORT_SYMBOL(free_netdev);
5812
5813 /**
5814  *      synchronize_net -  Synchronize with packet receive processing
5815  *
5816  *      Wait for packets currently being received to be done.
5817  *      Does not block later packets from starting.
5818  */
5819 void synchronize_net(void)
5820 {
5821         might_sleep();
5822         synchronize_rcu();
5823 }
5824 EXPORT_SYMBOL(synchronize_net);
5825
5826 /**
5827  *      unregister_netdevice_queue - remove device from the kernel
5828  *      @dev: device
5829  *      @head: list
5830  *
5831  *      This function shuts down a device interface and removes it
5832  *      from the kernel tables.
5833  *      If head not NULL, device is queued to be unregistered later.
5834  *
5835  *      Callers must hold the rtnl semaphore.  You may want
5836  *      unregister_netdev() instead of this.
5837  */
5838
5839 void unregister_netdevice_queue(struct net_device *dev, struct list_head *head)
5840 {
5841         ASSERT_RTNL();
5842
5843         if (head) {
5844                 list_move_tail(&dev->unreg_list, head);
5845         } else {
5846                 rollback_registered(dev);
5847                 /* Finish processing unregister after unlock */
5848                 net_set_todo(dev);
5849         }
5850 }
5851 EXPORT_SYMBOL(unregister_netdevice_queue);
5852
5853 /**
5854  *      unregister_netdevice_many - unregister many devices
5855  *      @head: list of devices
5856  */
5857 void unregister_netdevice_many(struct list_head *head)
5858 {
5859         struct net_device *dev;
5860
5861         if (!list_empty(head)) {
5862                 rollback_registered_many(head);
5863                 list_for_each_entry(dev, head, unreg_list)
5864                         net_set_todo(dev);
5865         }
5866 }
5867 EXPORT_SYMBOL(unregister_netdevice_many);
5868
5869 /**
5870  *      unregister_netdev - remove device from the kernel
5871  *      @dev: device
5872  *
5873  *      This function shuts down a device interface and removes it
5874  *      from the kernel tables.
5875  *
5876  *      This is just a wrapper for unregister_netdevice that takes
5877  *      the rtnl semaphore.  In general you want to use this and not
5878  *      unregister_netdevice.
5879  */
5880 void unregister_netdev(struct net_device *dev)
5881 {
5882         rtnl_lock();
5883         unregister_netdevice(dev);
5884         rtnl_unlock();
5885 }
5886 EXPORT_SYMBOL(unregister_netdev);
5887
5888 /**
5889  *      dev_change_net_namespace - move device to different nethost namespace
5890  *      @dev: device
5891  *      @net: network namespace
5892  *      @pat: If not NULL name pattern to try if the current device name
5893  *            is already taken in the destination network namespace.
5894  *
5895  *      This function shuts down a device interface and moves it
5896  *      to a new network namespace. On success 0 is returned, on
5897  *      a failure a netagive errno code is returned.
5898  *
5899  *      Callers must hold the rtnl semaphore.
5900  */
5901
5902 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
5903 {
5904         int err;
5905
5906         ASSERT_RTNL();
5907
5908         /* Don't allow namespace local devices to be moved. */
5909         err = -EINVAL;
5910         if (dev->features & NETIF_F_NETNS_LOCAL)
5911                 goto out;
5912
5913 #ifdef CONFIG_SYSFS
5914         /* Don't allow real devices to be moved when sysfs
5915          * is enabled.
5916          */
5917         err = -EINVAL;
5918         if (dev->dev.parent)
5919                 goto out;
5920 #endif
5921
5922         /* Ensure the device has been registrered */
5923         err = -EINVAL;
5924         if (dev->reg_state != NETREG_REGISTERED)
5925                 goto out;
5926
5927         /* Get out if there is nothing todo */
5928         err = 0;
5929         if (net_eq(dev_net(dev), net))
5930                 goto out;
5931
5932         /* Pick the destination device name, and ensure
5933          * we can use it in the destination network namespace.
5934          */
5935         err = -EEXIST;
5936         if (__dev_get_by_name(net, dev->name)) {
5937                 /* We get here if we can't use the current device name */
5938                 if (!pat)
5939                         goto out;
5940                 if (dev_get_valid_name(net, pat, dev->name, 1))
5941                         goto out;
5942         }
5943
5944         /*
5945          * And now a mini version of register_netdevice unregister_netdevice.
5946          */
5947
5948         /* If device is running close it first. */
5949         dev_close(dev);
5950
5951         /* And unlink it from device chain */
5952         err = -ENODEV;
5953         unlist_netdevice(dev);
5954
5955         synchronize_net();
5956
5957         /* Shutdown queueing discipline. */
5958         dev_shutdown(dev);
5959
5960         /* Notify protocols, that we are about to destroy
5961            this device. They should clean all the things.
5962         */
5963         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
5964         call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
5965
5966         /*
5967          *      Flush the unicast and multicast chains
5968          */
5969         dev_unicast_flush(dev);
5970         dev_addr_discard(dev);
5971
5972         netdev_unregister_kobject(dev);
5973
5974         /* Actually switch the network namespace */
5975         dev_net_set(dev, net);
5976
5977         /* If there is an ifindex conflict assign a new one */
5978         if (__dev_get_by_index(net, dev->ifindex)) {
5979                 int iflink = (dev->iflink == dev->ifindex);
5980                 dev->ifindex = dev_new_index(net);
5981                 if (iflink)
5982                         dev->iflink = dev->ifindex;
5983         }
5984
5985         /* Fixup kobjects */
5986         err = netdev_register_kobject(dev);
5987         WARN_ON(err);
5988
5989         /* Add the device back in the hashes */
5990         list_netdevice(dev);
5991
5992         /* Notify protocols, that a new device appeared. */
5993         call_netdevice_notifiers(NETDEV_REGISTER, dev);
5994
5995         /*
5996          *      Prevent userspace races by waiting until the network
5997          *      device is fully setup before sending notifications.
5998          */
5999         rtmsg_ifinfo(RTM_NEWLINK, dev, ~0U);
6000
6001         synchronize_net();
6002         err = 0;
6003 out:
6004         return err;
6005 }
6006 EXPORT_SYMBOL_GPL(dev_change_net_namespace);
6007
6008 static int dev_cpu_callback(struct notifier_block *nfb,
6009                             unsigned long action,
6010                             void *ocpu)
6011 {
6012         struct sk_buff **list_skb;
6013         struct Qdisc **list_net;
6014         struct sk_buff *skb;
6015         unsigned int cpu, oldcpu = (unsigned long)ocpu;
6016         struct softnet_data *sd, *oldsd;
6017
6018         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
6019                 return NOTIFY_OK;
6020
6021         local_irq_disable();
6022         cpu = smp_processor_id();
6023         sd = &per_cpu(softnet_data, cpu);
6024         oldsd = &per_cpu(softnet_data, oldcpu);
6025
6026         /* Find end of our completion_queue. */
6027         list_skb = &sd->completion_queue;
6028         while (*list_skb)
6029                 list_skb = &(*list_skb)->next;
6030         /* Append completion queue from offline CPU. */
6031         *list_skb = oldsd->completion_queue;
6032         oldsd->completion_queue = NULL;
6033
6034         /* Find end of our output_queue. */
6035         list_net = &sd->output_queue;
6036         while (*list_net)
6037                 list_net = &(*list_net)->next_sched;
6038         /* Append output queue from offline CPU. */
6039         *list_net = oldsd->output_queue;
6040         oldsd->output_queue = NULL;
6041
6042         raise_softirq_irqoff(NET_TX_SOFTIRQ);
6043         local_irq_enable();
6044
6045         /* Process offline CPU's input_pkt_queue */
6046         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
6047                 netif_rx(skb);
6048
6049         return NOTIFY_OK;
6050 }
6051
6052
6053 /**
6054  *      netdev_increment_features - increment feature set by one
6055  *      @all: current feature set
6056  *      @one: new feature set
6057  *      @mask: mask feature set
6058  *
6059  *      Computes a new feature set after adding a device with feature set
6060  *      @one to the master device with current feature set @all.  Will not
6061  *      enable anything that is off in @mask. Returns the new feature set.
6062  */
6063 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
6064                                         unsigned long mask)
6065 {
6066         /* If device needs checksumming, downgrade to it. */
6067         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
6068                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
6069         else if (mask & NETIF_F_ALL_CSUM) {
6070                 /* If one device supports v4/v6 checksumming, set for all. */
6071                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
6072                     !(all & NETIF_F_GEN_CSUM)) {
6073                         all &= ~NETIF_F_ALL_CSUM;
6074                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
6075                 }
6076
6077                 /* If one device supports hw checksumming, set for all. */
6078                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
6079                         all &= ~NETIF_F_ALL_CSUM;
6080                         all |= NETIF_F_HW_CSUM;
6081                 }
6082         }
6083
6084         one |= NETIF_F_ALL_CSUM;
6085
6086         one |= all & NETIF_F_ONE_FOR_ALL;
6087         all &= one | NETIF_F_LLTX | NETIF_F_GSO | NETIF_F_UFO;
6088         all |= one & mask & NETIF_F_ONE_FOR_ALL;
6089
6090         return all;
6091 }
6092 EXPORT_SYMBOL(netdev_increment_features);
6093
6094 static struct hlist_head *netdev_create_hash(void)
6095 {
6096         int i;
6097         struct hlist_head *hash;
6098
6099         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
6100         if (hash != NULL)
6101                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
6102                         INIT_HLIST_HEAD(&hash[i]);
6103
6104         return hash;
6105 }
6106
6107 /* Initialize per network namespace state */
6108 static int __net_init netdev_init(struct net *net)
6109 {
6110         INIT_LIST_HEAD(&net->dev_base_head);
6111
6112         net->dev_name_head = netdev_create_hash();
6113         if (net->dev_name_head == NULL)
6114                 goto err_name;
6115
6116         net->dev_index_head = netdev_create_hash();
6117         if (net->dev_index_head == NULL)
6118                 goto err_idx;
6119
6120         return 0;
6121
6122 err_idx:
6123         kfree(net->dev_name_head);
6124 err_name:
6125         return -ENOMEM;
6126 }
6127
6128 /**
6129  *      netdev_drivername - network driver for the device
6130  *      @dev: network device
6131  *      @buffer: buffer for resulting name
6132  *      @len: size of buffer
6133  *
6134  *      Determine network driver for device.
6135  */
6136 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
6137 {
6138         const struct device_driver *driver;
6139         const struct device *parent;
6140
6141         if (len <= 0 || !buffer)
6142                 return buffer;
6143         buffer[0] = 0;
6144
6145         parent = dev->dev.parent;
6146
6147         if (!parent)
6148                 return buffer;
6149
6150         driver = parent->driver;
6151         if (driver && driver->name)
6152                 strlcpy(buffer, driver->name, len);
6153         return buffer;
6154 }
6155
6156 static void __net_exit netdev_exit(struct net *net)
6157 {
6158         kfree(net->dev_name_head);
6159         kfree(net->dev_index_head);
6160 }
6161
6162 static struct pernet_operations __net_initdata netdev_net_ops = {
6163         .init = netdev_init,
6164         .exit = netdev_exit,
6165 };
6166
6167 static void __net_exit default_device_exit(struct net *net)
6168 {
6169         struct net_device *dev, *aux;
6170         /*
6171          * Push all migratable network devices back to the
6172          * initial network namespace
6173          */
6174         rtnl_lock();
6175         for_each_netdev_safe(net, dev, aux) {
6176                 int err;
6177                 char fb_name[IFNAMSIZ];
6178
6179                 /* Ignore unmoveable devices (i.e. loopback) */
6180                 if (dev->features & NETIF_F_NETNS_LOCAL)
6181                         continue;
6182
6183                 /* Leave virtual devices for the generic cleanup */
6184                 if (dev->rtnl_link_ops)
6185                         continue;
6186
6187                 /* Push remaing network devices to init_net */
6188                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
6189                 err = dev_change_net_namespace(dev, &init_net, fb_name);
6190                 if (err) {
6191                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
6192                                 __func__, dev->name, err);
6193                         BUG();
6194                 }
6195         }
6196         rtnl_unlock();
6197 }
6198
6199 static void __net_exit default_device_exit_batch(struct list_head *net_list)
6200 {
6201         /* At exit all network devices most be removed from a network
6202          * namespace.  Do this in the reverse order of registeration.
6203          * Do this across as many network namespaces as possible to
6204          * improve batching efficiency.
6205          */
6206         struct net_device *dev;
6207         struct net *net;
6208         LIST_HEAD(dev_kill_list);
6209
6210         rtnl_lock();
6211         list_for_each_entry(net, net_list, exit_list) {
6212                 for_each_netdev_reverse(net, dev) {
6213                         if (dev->rtnl_link_ops)
6214                                 dev->rtnl_link_ops->dellink(dev, &dev_kill_list);
6215                         else
6216                                 unregister_netdevice_queue(dev, &dev_kill_list);
6217                 }
6218         }
6219         unregister_netdevice_many(&dev_kill_list);
6220         rtnl_unlock();
6221 }
6222
6223 static struct pernet_operations __net_initdata default_device_ops = {
6224         .exit = default_device_exit,
6225         .exit_batch = default_device_exit_batch,
6226 };
6227
6228 /*
6229  *      Initialize the DEV module. At boot time this walks the device list and
6230  *      unhooks any devices that fail to initialise (normally hardware not
6231  *      present) and leaves us with a valid list of present and active devices.
6232  *
6233  */
6234
6235 /*
6236  *       This is called single threaded during boot, so no need
6237  *       to take the rtnl semaphore.
6238  */
6239 static int __init net_dev_init(void)
6240 {
6241         int i, rc = -ENOMEM;
6242
6243         BUG_ON(!dev_boot_phase);
6244
6245         if (dev_proc_init())
6246                 goto out;
6247
6248         if (netdev_kobject_init())
6249                 goto out;
6250
6251         INIT_LIST_HEAD(&ptype_all);
6252         for (i = 0; i < PTYPE_HASH_SIZE; i++)
6253                 INIT_LIST_HEAD(&ptype_base[i]);
6254
6255         if (register_pernet_subsys(&netdev_net_ops))
6256                 goto out;
6257
6258         /*
6259          *      Initialise the packet receive queues.
6260          */
6261
6262         for_each_possible_cpu(i) {
6263                 struct softnet_data *queue;
6264
6265                 queue = &per_cpu(softnet_data, i);
6266                 skb_queue_head_init(&queue->input_pkt_queue);
6267                 queue->completion_queue = NULL;
6268                 INIT_LIST_HEAD(&queue->poll_list);
6269
6270 #ifdef CONFIG_RPS
6271                 queue->csd.func = trigger_softirq;
6272                 queue->csd.info = queue;
6273                 queue->csd.flags = 0;
6274 #endif
6275
6276                 queue->backlog.poll = process_backlog;
6277                 queue->backlog.weight = weight_p;
6278                 queue->backlog.gro_list = NULL;
6279                 queue->backlog.gro_count = 0;
6280         }
6281
6282         dev_boot_phase = 0;
6283
6284         /* The loopback device is special if any other network devices
6285          * is present in a network namespace the loopback device must
6286          * be present. Since we now dynamically allocate and free the
6287          * loopback device ensure this invariant is maintained by
6288          * keeping the loopback device as the first device on the
6289          * list of network devices.  Ensuring the loopback devices
6290          * is the first device that appears and the last network device
6291          * that disappears.
6292          */
6293         if (register_pernet_device(&loopback_net_ops))
6294                 goto out;
6295
6296         if (register_pernet_device(&default_device_ops))
6297                 goto out;
6298
6299         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
6300         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
6301
6302         hotcpu_notifier(dev_cpu_callback, 0);
6303         dst_init();
6304         dev_mcast_init();
6305         rc = 0;
6306 out:
6307         return rc;
6308 }
6309
6310 subsys_initcall(net_dev_init);
6311
6312 static int __init initialize_hashrnd(void)
6313 {
6314         get_random_bytes(&hashrnd, sizeof(hashrnd));
6315         return 0;
6316 }
6317
6318 late_initcall_sync(initialize_hashrnd);
6319