SAFE public projects git trees. - safe/jmp/linux-2.6/blob - net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/cpu.h>
  80 #include <linux/types.h>
  81 #include <linux/kernel.h>
  82 #include <linux/sched.h>
  83 #include <linux/mutex.h>
  84 #include <linux/string.h>
  85 #include <linux/mm.h>
  86 #include <linux/socket.h>
  87 #include <linux/sockios.h>
  88 #include <linux/errno.h>
  89 #include <linux/interrupt.h>
  90 #include <linux/if_ether.h>
  91 #include <linux/netdevice.h>
  92 #include <linux/etherdevice.h>
  93 #include <linux/ethtool.h>
  94 #include <linux/notifier.h>
  95 #include <linux/skbuff.h>
  96 #include <net/net_namespace.h>
  97 #include <net/sock.h>
  98 #include <linux/rtnetlink.h>
  99 #include <linux/proc_fs.h>
 100 #include <linux/seq_file.h>
 101 #include <linux/stat.h>
 102 #include <linux/if_bridge.h>
 103 #include <linux/if_macvlan.h>
 104 #include <net/dst.h>
 105 #include <net/pkt_sched.h>
 106 #include <net/checksum.h>
 107 #include <linux/highmem.h>
 108 #include <linux/init.h>
 109 #include <linux/kmod.h>
 110 #include <linux/module.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <net/wext.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118 #include <linux/dmaengine.h>
 119 #include <linux/err.h>
 120 #include <linux/ctype.h>
 121 #include <linux/if_arp.h>
 122 #include <linux/if_vlan.h>
 123 #include <linux/ip.h>
 124 #include <net/ip.h>
 125 #include <linux/ipv6.h>
 126 #include <linux/in.h>
 127 #include <linux/jhash.h>
 128 #include <linux/random.h>
 129
 130 #include "net-sysfs.h"
 131
 132 /* Instead of increasing this, you should create a hash table. */
 133 #define MAX_GRO_SKBS 8
 134
 135 /* This should be increased if a protocol with a bigger head is added. */
 136 #define GRO_MAX_HEAD (MAX_HEADER + 128)
 137
 138 enum {
 139         GRO_MERGED,
 140         GRO_MERGED_FREE,
 141         GRO_HELD,
 142         GRO_NORMAL,
 143         GRO_DROP,
 144 };
 145
 146 /*
 147  *      The list of packet types we will receive (as opposed to discard)
 148  *      and the routines to invoke.
 149  *
 150  *      Why 16. Because with 16 the only overlap we get on a hash of the
 151  *      low nibble of the protocol value is RARP/SNAP/X.25.
 152  *
 153  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 154  *             sure which should go first, but I bet it won't make much
 155  *             difference if we are running VLANs.  The good news is that
 156  *             this protocol won't be in the list unless compiled in, so
 157  *             the average user (w/out VLANs) will not be adversely affected.
 158  *             --BLG
 159  *
 160  *              0800    IP
 161  *              8100    802.1Q VLAN
 162  *              0001    802.3
 163  *              0002    AX.25
 164  *              0004    802.2
 165  *              8035    RARP
 166  *              0005    SNAP
 167  *              0805    X.25
 168  *              0806    ARP
 169  *              8137    IPX
 170  *              0009    Localtalk
 171  *              86DD    IPv6
 172  */
 173
 174 #define PTYPE_HASH_SIZE (16)        /* number of buckets in ptype_base[] */
 175 #define PTYPE_HASH_MASK (PTYPE_HASH_SIZE - 1)       /* bucket = ntohs(type) & PTYPE_HASH_MASK */
 176
/* Taken with BHs disabled by dev_add_pack() and __dev_remove_pack() while they edit the two lists below. */
 177 static DEFINE_SPINLOCK(ptype_lock);
/* Handlers bound to one protocol, bucketed on the host-order protocol value. */
 178 static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly;
/* Handlers registered with type htons(ETH_P_ALL). */
 179 static struct list_head ptype_all __read_mostly;        /* Taps */
 180
 181 /*
 182  * The @dev_base_head list is protected by @dev_base_lock and the rtnl
 183  * semaphore.
 184  *
 185  * Pure readers hold dev_base_lock for reading.
 186  *
 187  * Writers must hold the rtnl semaphore while they loop through the
 188  * dev_base_head list, and hold dev_base_lock for writing when they do the
 189  * actual updates.  This allows pure readers to access the list even
 190  * while a writer is preparing to update it.
 191  *
 192  * To put it another way, dev_base_lock is held for writing only to
 193  * protect against pure readers; the rtnl semaphore provides the
 194  * protection against other writers.
 195  *
 196  * See, for example usages, register_netdevice() and
 197  * unregister_netdevice(), which must be called with the rtnl
 198  * semaphore held.
 199  */
 200 DEFINE_RWLOCK(dev_base_lock);       /* write-locked (_bh) in list_netdevice()/unlist_netdevice(); read-locked in dev_get_by_name(), dev_get_by_index() and dev_get_by_flags() */
 201
 202 EXPORT_SYMBOL(dev_base_lock);       /* made available outside this file */
 203
 204 #define NETDEV_HASHBITS 8   /* dev_name_hash()/dev_index_hash() keep this many low bits as the bucket index */
 205 #define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)   /* 256; NOTE(review): presumably the size of the per-net hash tables - confirm where they are allocated */
 206
 207 static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
 208 {
 209         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 210         return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
 211 }
 212
 213 static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
 214 {
 215         return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
 216 }
 217
 218 /* Device list insertion */
 219 static int list_netdevice(struct net_device *dev)
 220 {
 221         struct net *net = dev_net(dev);
 222
 223         ASSERT_RTNL();
 224
 225         write_lock_bh(&dev_base_lock);
 226         list_add_tail(&dev->dev_list, &net->dev_base_head);
 227         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 228         hlist_add_head(&dev->index_hlist, dev_index_hash(net, dev->ifindex));
 229         write_unlock_bh(&dev_base_lock);
 230         return 0;
 231 }
 232
 233 /* Device list removal */
 234 static void unlist_netdevice(struct net_device *dev)
 235 {
 236         ASSERT_RTNL();
 237
 238         /* Unlink dev from the device chain */
 239         write_lock_bh(&dev_base_lock);
 240         list_del(&dev->dev_list);
 241         hlist_del(&dev->name_hlist);
 242         hlist_del(&dev->index_hlist);
 243         write_unlock_bh(&dev_base_lock);
 244 }
 245
 246 /*
 247  *      Our notifier list
 248  */
 249
 250 static RAW_NOTIFIER_HEAD(netdev_chain);     /* file-private chain head; NOTE(review): a raw chain presumably relies on its callers (RTNL?) for serialisation - confirm */
 251
 252 /*
 253  *      Device drivers call our routines to queue packets here. We empty the
 254  *      queue in the local softnet handler.
 255  */
 256
 257 DEFINE_PER_CPU(struct softnet_data, softnet_data);  /* one struct softnet_data instance per CPU */
 258
 259 #ifdef CONFIG_LOCKDEP
 260 /*
 261  * register_netdevice() inits txq->_xmit_lock and sets lockdep class
 262  * according to dev->type
 263  */
 264 static const unsigned short netdev_lock_type[] =
 265         {ARPHRD_NETROM, ARPHRD_ETHER, ARPHRD_EETHER, ARPHRD_AX25,
 266          ARPHRD_PRONET, ARPHRD_CHAOS, ARPHRD_IEEE802, ARPHRD_ARCNET,
 267          ARPHRD_APPLETLK, ARPHRD_DLCI, ARPHRD_ATM, ARPHRD_METRICOM,
 268          ARPHRD_IEEE1394, ARPHRD_EUI64, ARPHRD_INFINIBAND, ARPHRD_SLIP,
 269          ARPHRD_CSLIP, ARPHRD_SLIP6, ARPHRD_CSLIP6, ARPHRD_RSRVD,
 270          ARPHRD_ADAPT, ARPHRD_ROSE, ARPHRD_X25, ARPHRD_HWX25,
 271          ARPHRD_PPP, ARPHRD_CISCO, ARPHRD_LAPB, ARPHRD_DDCMP,
 272          ARPHRD_RAWHDLC, ARPHRD_TUNNEL, ARPHRD_TUNNEL6, ARPHRD_FRAD,
 273          ARPHRD_SKIP, ARPHRD_LOOPBACK, ARPHRD_LOCALTLK, ARPHRD_FDDI,
 274          ARPHRD_BIF, ARPHRD_SIT, ARPHRD_IPDDP, ARPHRD_IPGRE,
 275          ARPHRD_PIMREG, ARPHRD_HIPPI, ARPHRD_ASH, ARPHRD_ECONET,
 276          ARPHRD_IRDA, ARPHRD_FCPP, ARPHRD_FCAL, ARPHRD_FCPL,
 277          ARPHRD_FCFABRIC, ARPHRD_IEEE802_TR, ARPHRD_IEEE80211,
 278          ARPHRD_IEEE80211_PRISM, ARPHRD_IEEE80211_RADIOTAP, ARPHRD_PHONET,
 279          ARPHRD_PHONET_PIPE, ARPHRD_VOID, ARPHRD_NONE};
 280
 281 static const char *netdev_lock_name[] =
 282         {"_xmit_NETROM", "_xmit_ETHER", "_xmit_EETHER", "_xmit_AX25",
 283          "_xmit_PRONET", "_xmit_CHAOS", "_xmit_IEEE802", "_xmit_ARCNET",
 284          "_xmit_APPLETLK", "_xmit_DLCI", "_xmit_ATM", "_xmit_METRICOM",
 285          "_xmit_IEEE1394", "_xmit_EUI64", "_xmit_INFINIBAND", "_xmit_SLIP",
 286          "_xmit_CSLIP", "_xmit_SLIP6", "_xmit_CSLIP6", "_xmit_RSRVD",
 287          "_xmit_ADAPT", "_xmit_ROSE", "_xmit_X25", "_xmit_HWX25",
 288          "_xmit_PPP", "_xmit_CISCO", "_xmit_LAPB", "_xmit_DDCMP",
 289          "_xmit_RAWHDLC", "_xmit_TUNNEL", "_xmit_TUNNEL6", "_xmit_FRAD",
 290          "_xmit_SKIP", "_xmit_LOOPBACK", "_xmit_LOCALTLK", "_xmit_FDDI",
 291          "_xmit_BIF", "_xmit_SIT", "_xmit_IPDDP", "_xmit_IPGRE",
 292          "_xmit_PIMREG", "_xmit_HIPPI", "_xmit_ASH", "_xmit_ECONET",
 293          "_xmit_IRDA", "_xmit_FCPP", "_xmit_FCAL", "_xmit_FCPL",
 294          "_xmit_FCFABRIC", "_xmit_IEEE802_TR", "_xmit_IEEE80211",
 295          "_xmit_IEEE80211_PRISM", "_xmit_IEEE80211_RADIOTAP", "_xmit_PHONET",
 296          "_xmit_PHONET_PIPE", "_xmit_VOID", "_xmit_NONE"};
 297
 298 static struct lock_class_key netdev_xmit_lock_key[ARRAY_SIZE(netdev_lock_type)];
 299 static struct lock_class_key netdev_addr_lock_key[ARRAY_SIZE(netdev_lock_type)];
 300
 301 static inline unsigned short netdev_lock_pos(unsigned short dev_type)
 302 {
 303         int i;
 304
 305         for (i = 0; i < ARRAY_SIZE(netdev_lock_type); i++)
 306                 if (netdev_lock_type[i] == dev_type)
 307                         return i;
 308         /* the last key is used by default */
 309         return ARRAY_SIZE(netdev_lock_type) - 1;
 310 }
 311
 312 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 313                                                  unsigned short dev_type)
 314 {
 315         int i;
 316
 317         i = netdev_lock_pos(dev_type);
 318         lockdep_set_class_and_name(lock, &netdev_xmit_lock_key[i],
 319                                    netdev_lock_name[i]);
 320 }
 321
 322 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 323 {
 324         int i;
 325
 326         i = netdev_lock_pos(dev->type);
 327         lockdep_set_class_and_name(&dev->addr_list_lock,
 328                                    &netdev_addr_lock_key[i],
 329                                    netdev_lock_name[i]);
 330 }
 331 #else
 332 static inline void netdev_set_xmit_lockdep_class(spinlock_t *lock,
 333                                                  unsigned short dev_type)
 334 {
 335 }
 336 static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 337 {
 338 }
 339 #endif
 340
 341 /*******************************************************************************
 342
 343                 Protocol management and registration routines
 344
 345 *******************************************************************************/
 346
 347 /*
 348  *      Add a protocol ID to the list. Now that the input handler is
 349  *      smarter we can dispense with all the messy stuff that used to be
 350  *      here.
 351  *
 352  *      BEWARE!!! Protocol handlers, mangling input packets,
 353  *      MUST BE last in hash buckets and checking protocol handlers
 354  *      MUST start from promiscuous ptype_all chain in net_bh.
 355  *      It is true now, do not change it.
 356  *      Explanation follows: if protocol handler, mangling packet, will
 357  *      be the first on list, it is not able to sense, that packet
 358  *      is cloned and should be copied-on-write, so that it will
 359  *      change it and subsequent readers will get broken packet.
 360  *                                                      --ANK (980803)
 361  */
 362
 363 /**
 364  *      dev_add_pack - add packet handler
 365  *      @pt: packet type declaration
 366  *
 367  *      Add a protocol handler to the networking stack. The passed &packet_type
 368  *      is linked into kernel lists and may not be freed until it has been
 369  *      removed from the kernel lists.
 370  *
 371  *      This call does not sleep therefore it can not
 372  *      guarantee all CPU's that are in middle of receiving packets
 373  *      will see the new packet type (until the next received packet).
 374  */
 375
 376 void dev_add_pack(struct packet_type *pt)
 377 {
 378         int hash;
 379
 380         spin_lock_bh(&ptype_lock);
 381         if (pt->type == htons(ETH_P_ALL))
 382                 list_add_rcu(&pt->list, &ptype_all);
 383         else {
 384                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
 385                 list_add_rcu(&pt->list, &ptype_base[hash]);
 386         }
 387         spin_unlock_bh(&ptype_lock);
 388 }
 389
 390 /**
 391  *      __dev_remove_pack        - remove packet handler
 392  *      @pt: packet type declaration
 393  *
 394  *      Remove a protocol handler that was previously added to the kernel
 395  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 396  *      from the kernel lists and can be freed or reused once this function
 397  *      returns.
 398  *
 399  *      The packet type might still be in use by receivers
 400  *      and must not be freed until after all the CPU's have gone
 401  *      through a quiescent state.
 402  */
 403 void __dev_remove_pack(struct packet_type *pt)
 404 {
 405         struct list_head *head;
 406         struct packet_type *pt1;
 407
 408         spin_lock_bh(&ptype_lock);
 409
 410         if (pt->type == htons(ETH_P_ALL))
 411                 head = &ptype_all;
 412         else
 413                 head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 414
 415         list_for_each_entry(pt1, head, list) {
 416                 if (pt == pt1) {
 417                         list_del_rcu(&pt->list);
 418                         goto out;
 419                 }
 420         }
 421
 422         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 423 out:
 424         spin_unlock_bh(&ptype_lock);
 425 }
 426 /**
 427  *      dev_remove_pack  - remove packet handler
 428  *      @pt: packet type declaration
 429  *
 430  *      Remove a protocol handler that was previously added to the kernel
 431  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 432  *      from the kernel lists and can be freed or reused once this function
 433  *      returns.
 434  *
 435  *      This call sleeps to guarantee that no CPU is looking at the packet
 436  *      type after return.
 437  */
 438 void dev_remove_pack(struct packet_type *pt)
 439 {
 440         __dev_remove_pack(pt);
 441
 442         synchronize_net();
 443 }
 444
 445 /******************************************************************************
 446
 447                       Device Boot-time Settings Routines
 448
 449 *******************************************************************************/
 450
 451 /* Boot time configuration table */
 452 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];  /* a slot is free while name[0] is '\0' or ' '; filled by netdev_boot_setup_add() */
 453
 454 /**
 455  *      netdev_boot_setup_add   - add new setup entry
 456  *      @name: name of the device
 457  *      @map: configured settings for the device
 458  *
 459  *      Adds new setup entry to the dev_boot_setup list.  The function
 460  *      returns 0 on error and 1 on success.  This is a generic routine to
 461  *      all netdevices.
 462  */
 463 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 464 {
 465         struct netdev_boot_setup *s;
 466         int i;
 467
 468         s = dev_boot_setup;
 469         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 470                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 471                         memset(s[i].name, 0, sizeof(s[i].name));
 472                         strlcpy(s[i].name, name, IFNAMSIZ);
 473                         memcpy(&s[i].map, map, sizeof(s[i].map));
 474                         break;
 475                 }
 476         }
 477
 478         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 479 }
 480
 481 /**
 482  *      netdev_boot_setup_check - check boot time settings
 483  *      @dev: the netdevice
 484  *
 485  *      Check boot time settings for the device.
 486  *      The found settings are set for the device to be used
 487  *      later in the device probing.
 488  *      Returns 0 if no settings found, 1 if they are.
 489  */
 490 int netdev_boot_setup_check(struct net_device *dev)
 491 {
 492         struct netdev_boot_setup *s = dev_boot_setup;
 493         int i;
 494
 495         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 496                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 497                     !strcmp(dev->name, s[i].name)) {
 498                         dev->irq        = s[i].map.irq;
 499                         dev->base_addr  = s[i].map.base_addr;
 500                         dev->mem_start  = s[i].map.mem_start;
 501                         dev->mem_end    = s[i].map.mem_end;
 502                         return 1;
 503                 }
 504         }
 505         return 0;
 506 }
 507
 508
 509 /**
 510  *      netdev_boot_base        - get address from boot time settings
 511  *      @prefix: prefix for network device
 512  *      @unit: id for network device
 513  *
 514  *      Check boot time settings for the base address of device.
 515  *      The found settings are set for the device to be used
 516  *      later in the device probing.
 517  *      Returns 0 if no settings found.
 518  */
 519 unsigned long netdev_boot_base(const char *prefix, int unit)
 520 {
 521         const struct netdev_boot_setup *s = dev_boot_setup;
 522         char name[IFNAMSIZ];
 523         int i;
 524
 525         sprintf(name, "%s%d", prefix, unit);
 526
 527         /*
 528          * If device already registered then return base of 1
 529          * to indicate not to probe for this interface
 530          */
 531         if (__dev_get_by_name(&init_net, name))
 532                 return 1;
 533
 534         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 535                 if (!strcmp(name, s[i].name))
 536                         return s[i].map.base_addr;
 537         return 0;
 538 }
 539
 540 /*
 541  * Saves at boot time configured settings for any netdevice.
 542  */
 543 int __init netdev_boot_setup(char *str)
 544 {
 545         int ints[5];
 546         struct ifmap map;
 547
 548         str = get_options(str, ARRAY_SIZE(ints), ints);
 549         if (!str || !*str)
 550                 return 0;
 551
 552         /* Save settings */
 553         memset(&map, 0, sizeof(map));
 554         if (ints[0] > 0)
 555                 map.irq = ints[1];
 556         if (ints[0] > 1)
 557                 map.base_addr = ints[2];
 558         if (ints[0] > 2)
 559                 map.mem_start = ints[3];
 560         if (ints[0] > 3)
 561                 map.mem_end = ints[4];
 562
 563         /* Add new entry to the list */
 564         return netdev_boot_setup_add(str, &map);
 565 }
 566
 567 __setup("netdev=", netdev_boot_setup);
 568
 569 /*******************************************************************************
 570
 571                             Device Interface Subroutines
 572
 573 *******************************************************************************/
 574
 575 /**
 576  *      __dev_get_by_name       - find a device by its name
 577  *      @net: the applicable net namespace
 578  *      @name: name to find
 579  *
 580  *      Find an interface by name. Must be called under RTNL semaphore
 581  *      or @dev_base_lock. If the name is found a pointer to the device
 582  *      is returned. If the name is not found then %NULL is returned. The
 583  *      reference counters are not incremented so the caller must be
 584  *      careful with locks.
 585  */
 586
 587 struct net_device *__dev_get_by_name(struct net *net, const char *name)
 588 {
 589         struct hlist_node *p;
 590
 591         hlist_for_each(p, dev_name_hash(net, name)) {
 592                 struct net_device *dev
 593                         = hlist_entry(p, struct net_device, name_hlist);
 594                 if (!strncmp(dev->name, name, IFNAMSIZ))
 595                         return dev;
 596         }
 597         return NULL;
 598 }
 599
 600 /**
 601  *      dev_get_by_name         - find a device by its name
 602  *      @net: the applicable net namespace
 603  *      @name: name to find
 604  *
 605  *      Find an interface by name. This can be called from any
 606  *      context and does its own locking. The returned handle has
 607  *      the usage count incremented and the caller must use dev_put() to
 608  *      release it when it is no longer needed. %NULL is returned if no
 609  *      matching device is found.
 610  */
 611
 612 struct net_device *dev_get_by_name(struct net *net, const char *name)
 613 {
 614         struct net_device *dev;
 615
 616         read_lock(&dev_base_lock);
 617         dev = __dev_get_by_name(net, name);
 618         if (dev)
 619                 dev_hold(dev);
 620         read_unlock(&dev_base_lock);
 621         return dev;
 622 }
 623
 624 /**
 625  *      __dev_get_by_index - find a device by its ifindex
 626  *      @net: the applicable net namespace
 627  *      @ifindex: index of device
 628  *
 629  *      Search for an interface by index. Returns %NULL if the device
 630  *      is not found or a pointer to the device. The device has not
 631  *      had its reference counter increased so the caller must be careful
 632  *      about locking. The caller must hold either the RTNL semaphore
 633  *      or @dev_base_lock.
 634  */
 635
 636 struct net_device *__dev_get_by_index(struct net *net, int ifindex)
 637 {
 638         struct hlist_node *p;
 639
 640         hlist_for_each(p, dev_index_hash(net, ifindex)) {
 641                 struct net_device *dev
 642                         = hlist_entry(p, struct net_device, index_hlist);
 643                 if (dev->ifindex == ifindex)
 644                         return dev;
 645         }
 646         return NULL;
 647 }
 648
 649
 650 /**
 651  *      dev_get_by_index - find a device by its ifindex
 652  *      @net: the applicable net namespace
 653  *      @ifindex: index of device
 654  *
 655  *      Search for an interface by index. Returns NULL if the device
 656  *      is not found or a pointer to the device. The device returned has
 657  *      had a reference added and the pointer is safe until the user calls
 658  *      dev_put to indicate they have finished with it.
 659  */
 660
 661 struct net_device *dev_get_by_index(struct net *net, int ifindex)
 662 {
 663         struct net_device *dev;
 664
 665         read_lock(&dev_base_lock);
 666         dev = __dev_get_by_index(net, ifindex);
 667         if (dev)
 668                 dev_hold(dev);
 669         read_unlock(&dev_base_lock);
 670         return dev;
 671 }
 672
 673 /**
 674  *      dev_getbyhwaddr - find a device by its hardware address
 675  *      @net: the applicable net namespace
 676  *      @type: media type of device
 677  *      @ha: hardware address
 678  *
 679  *      Search for an interface by MAC address. Returns NULL if the device
 680  *      is not found or a pointer to the device. The caller must hold the
 681  *      rtnl semaphore. The returned device has not had its ref count increased
 682  *      and the caller must therefore be careful about locking
 683  *
 684  *      BUGS:
 685  *      If the API was consistent this would be __dev_get_by_hwaddr
 686  */
 687
 688 struct net_device *dev_getbyhwaddr(struct net *net, unsigned short type, char *ha)
 689 {
 690         struct net_device *dev;
 691
 692         ASSERT_RTNL();
 693
 694         for_each_netdev(net, dev)
 695                 if (dev->type == type &&
 696                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 697                         return dev;
 698
 699         return NULL;
 700 }
 701
 702 EXPORT_SYMBOL(dev_getbyhwaddr);
 703
 704 struct net_device *__dev_getfirstbyhwtype(struct net *net, unsigned short type)
 705 {
 706         struct net_device *dev;
 707
 708         ASSERT_RTNL();
 709         for_each_netdev(net, dev)
 710                 if (dev->type == type)
 711                         return dev;
 712
 713         return NULL;
 714 }
 715
 716 EXPORT_SYMBOL(__dev_getfirstbyhwtype);
 717
 718 struct net_device *dev_getfirstbyhwtype(struct net *net, unsigned short type)
 719 {
 720         struct net_device *dev;
 721
 722         rtnl_lock();
 723         dev = __dev_getfirstbyhwtype(net, type);
 724         if (dev)
 725                 dev_hold(dev);
 726         rtnl_unlock();
 727         return dev;
 728 }
 729
 730 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 731
 732 /**
 733  *      dev_get_by_flags - find any device with given flags
 734  *      @net: the applicable net namespace
 735  *      @if_flags: IFF_* values
 736  *      @mask: bitmask of bits in if_flags to check
 737  *
 738  *      Search for any interface with the given flags. Returns NULL if a device
 739  *      is not found or a pointer to the device. The device returned has
 740  *      had a reference added and the pointer is safe until the user calls
 741  *      dev_put to indicate they have finished with it.
 742  */
 743
 744 struct net_device * dev_get_by_flags(struct net *net, unsigned short if_flags, unsigned short mask)
 745 {
 746         struct net_device *dev, *ret;
 747
 748         ret = NULL;
 749         read_lock(&dev_base_lock);
 750         for_each_netdev(net, dev) {
 751                 if (((dev->flags ^ if_flags) & mask) == 0) {
 752                         dev_hold(dev);
 753                         ret = dev;
 754                         break;
 755                 }
 756         }
 757         read_unlock(&dev_base_lock);
 758         return ret;
 759 }
 760
 761 /**
 762  *      dev_valid_name - check if name is okay for network device
 763  *      @name: name string
 764  *
 765  *      Network device names need to be valid file names to
 766  *      to allow sysfs to work.  We also disallow any kind of
 767  *      whitespace.
 768  */
 769 int dev_valid_name(const char *name)
 770 {
 771         if (*name == '\0')
 772                 return 0;
 773         if (strlen(name) >= IFNAMSIZ)
 774                 return 0;
 775         if (!strcmp(name, ".") || !strcmp(name, ".."))
 776                 return 0;
 777
 778         while (*name) {
 779                 if (*name == '/' || isspace(*name))
 780                         return 0;
 781                 name++;
 782         }
 783         return 1;
 784 }
 785
 786 /**
 787  *      __dev_alloc_name - allocate a name for a device
 788  *      @net: network namespace to allocate the device name in
 789  *      @name: name format string
 790  *      @buf:  scratch buffer and result name string
 791  *
 792  *      Passed a format string - eg "lt%d" it will try and find a suitable
 793  *      id. It scans list of devices to build up a free map, then chooses
 794  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 795  *      while allocating the name and adding the device in order to avoid
 796  *      duplicates.
 797  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 798  *      Returns the number of the unit assigned or a negative errno code.
 799  */
 800
 801 static int __dev_alloc_name(struct net *net, const char *name, char *buf)
 802 {
 803         int i = 0;
 804         const char *p;
 805         const int max_netdevices = 8*PAGE_SIZE;
 806         unsigned long *inuse;
 807         struct net_device *d;
 808
 809         p = strnchr(name, IFNAMSIZ-1, '%');
 810         if (p) {
 811                 /*
 812                  * Verify the string as this thing may have come from
 813                  * the user.  There must be either one "%d" and no other "%"
 814                  * characters.
 815                  */
 816                 if (p[1] != 'd' || strchr(p + 2, '%'))
 817                         return -EINVAL;
 818
 819                 /* Use one page as a bit array of possible slots */
 820                 inuse = (unsigned long *) get_zeroed_page(GFP_ATOMIC);
 821                 if (!inuse)
 822                         return -ENOMEM;
 823
 824                 for_each_netdev(net, d) {
 825                         if (!sscanf(d->name, name, &i))
 826                                 continue;
 827                         if (i < 0 || i >= max_netdevices)
 828                                 continue;
 829
 830                         /*  avoid cases where sscanf is not exact inverse of printf */
 831                         snprintf(buf, IFNAMSIZ, name, i);
 832                         if (!strncmp(buf, d->name, IFNAMSIZ))
 833                                 set_bit(i, inuse);
 834                 }
 835
 836                 i = find_first_zero_bit(inuse, max_netdevices);
 837                 free_page((unsigned long) inuse);
 838         }
 839
 840         snprintf(buf, IFNAMSIZ, name, i);
 841         if (!__dev_get_by_name(net, buf))
 842                 return i;
 843
 844         /* It is possible to run out of possible slots
 845          * when the name is long and there isn't enough space left
 846          * for the digits, or if all bits are used.
 847          */
 848         return -ENFILE;
 849 }
 850
 851 /**
 852  *      dev_alloc_name - allocate a name for a device
 853  *      @dev: device
 854  *      @name: name format string
 855  *
 856  *      Passed a format string - eg "lt%d" it will try and find a suitable
 857  *      id. It scans list of devices to build up a free map, then chooses
 858  *      the first empty slot. The caller must hold the dev_base or rtnl lock
 859  *      while allocating the name and adding the device in order to avoid
 860  *      duplicates.
 861  *      Limited to bits_per_byte * page size devices (ie 32K on most platforms).
 862  *      Returns the number of the unit assigned or a negative errno code.
 863  */
 864
 865 int dev_alloc_name(struct net_device *dev, const char *name)
 866 {
 867         char buf[IFNAMSIZ];
 868         struct net *net;
 869         int ret;
 870
 871         BUG_ON(!dev_net(dev));
 872         net = dev_net(dev);
 873         ret = __dev_alloc_name(net, name, buf);
 874         if (ret >= 0)
 875                 strlcpy(dev->name, buf, IFNAMSIZ);
 876         return ret;
 877 }
 878
 879
 880 /**
 881  *      dev_change_name - change name of a device
 882  *      @dev: device
 883  *      @newname: name (or format string) must be at least IFNAMSIZ
 884  *
 885  *      Change name of a device, can pass format strings "eth%d".
 886  *      for wildcarding.
 887  */
 888 int dev_change_name(struct net_device *dev, const char *newname)
 889 {
 890         char oldname[IFNAMSIZ];
 891         int err = 0;
 892         int ret;
 893         struct net *net;
 894
 895         ASSERT_RTNL();
 896         BUG_ON(!dev_net(dev));
 897
 898         net = dev_net(dev);
 899         if (dev->flags & IFF_UP)
 900                 return -EBUSY;
 901
 902         if (!dev_valid_name(newname))
 903                 return -EINVAL;
 904
 905         if (strncmp(newname, dev->name, IFNAMSIZ) == 0)
 906                 return 0;
 907
 908         memcpy(oldname, dev->name, IFNAMSIZ);
 909
 910         if (strchr(newname, '%')) {
 911                 err = dev_alloc_name(dev, newname);
 912                 if (err < 0)
 913                         return err;
 914         }
 915         else if (__dev_get_by_name(net, newname))
 916                 return -EEXIST;
 917         else
 918                 strlcpy(dev->name, newname, IFNAMSIZ);
 919
 920 rollback:
 921         /* For now only devices in the initial network namespace
 922          * are in sysfs.
 923          */
 924         if (net == &init_net) {
 925                 ret = device_rename(&dev->dev, dev->name);
 926                 if (ret) {
 927                         memcpy(dev->name, oldname, IFNAMSIZ);
 928                         return ret;
 929                 }
 930         }
 931
 932         write_lock_bh(&dev_base_lock);
 933         hlist_del(&dev->name_hlist);
 934         hlist_add_head(&dev->name_hlist, dev_name_hash(net, dev->name));
 935         write_unlock_bh(&dev_base_lock);
 936
 937         ret = call_netdevice_notifiers(NETDEV_CHANGENAME, dev);
 938         ret = notifier_to_errno(ret);
 939
 940         if (ret) {
 941                 if (err) {
 942                         printk(KERN_ERR
 943                                "%s: name change rollback failed: %d.\n",
 944                                dev->name, ret);
 945                 } else {
 946                         err = ret;
 947                         memcpy(dev->name, oldname, IFNAMSIZ);
 948                         goto rollback;
 949                 }
 950         }
 951
 952         return err;
 953 }
 954
 955 /**
 956  *      dev_set_alias - change ifalias of a device
 957  *      @dev: device
 958  *      @alias: name up to IFALIASZ
 959  *      @len: limit of bytes to copy from info
 960  *
 961  *      Set ifalias for a device,
 962  */
 963 int dev_set_alias(struct net_device *dev, const char *alias, size_t len)
 964 {
 965         ASSERT_RTNL();
 966
 967         if (len >= IFALIASZ)
 968                 return -EINVAL;
 969
 970         if (!len) {
 971                 if (dev->ifalias) {
 972                         kfree(dev->ifalias);
 973                         dev->ifalias = NULL;
 974                 }
 975                 return 0;
 976         }
 977
 978         dev->ifalias = krealloc(dev->ifalias, len+1, GFP_KERNEL);
 979         if (!dev->ifalias)
 980                 return -ENOMEM;
 981
 982         strlcpy(dev->ifalias, alias, len+1);
 983         return len;
 984 }
 985
 986
 987 /**
 988  *      netdev_features_change - device changes features
 989  *      @dev: device to cause notification
 990  *
 991  *      Called to indicate a device has changed features.
 992  */
 993 void netdev_features_change(struct net_device *dev)
 994 {
 995         call_netdevice_notifiers(NETDEV_FEAT_CHANGE, dev);
 996 }
 997 EXPORT_SYMBOL(netdev_features_change);
 998
 999 /**
1000  *      netdev_state_change - device changes state
1001  *      @dev: device to cause notification
1002  *
1003  *      Called to indicate a device has changed state. This function calls
1004  *      the notifier chains for netdev_chain and sends a NEWLINK message
1005  *      to the routing socket.
1006  */
1007 void netdev_state_change(struct net_device *dev)
1008 {
1009         if (dev->flags & IFF_UP) {
1010                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
1011                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
1012         }
1013 }
1014
1015 void netdev_bonding_change(struct net_device *dev)
1016 {
1017         call_netdevice_notifiers(NETDEV_BONDING_FAILOVER, dev);
1018 }
1019 EXPORT_SYMBOL(netdev_bonding_change);
1020
1021 /**
1022  *      dev_load        - load a network module
1023  *      @net: the applicable net namespace
1024  *      @name: name of interface
1025  *
1026  *      If a network interface is not present and the process has suitable
1027  *      privileges this function loads the module. If module loading is not
1028  *      available in this kernel then it becomes a nop.
1029  */
1030
1031 void dev_load(struct net *net, const char *name)
1032 {
1033         struct net_device *dev;
1034
1035         read_lock(&dev_base_lock);
1036         dev = __dev_get_by_name(net, name);
1037         read_unlock(&dev_base_lock);
1038
1039         if (!dev && capable(CAP_SYS_MODULE))
1040                 request_module("%s", name);
1041 }
1042
1043 /**
1044  *      dev_open        - prepare an interface for use.
1045  *      @dev:   device to open
1046  *
1047  *      Takes a device from down to up state. The device's private open
1048  *      function is invoked and then the multicast lists are loaded. Finally
1049  *      the device is moved into the up state and a %NETDEV_UP message is
1050  *      sent to the netdev notifier chain.
1051  *
1052  *      Calling this function on an active interface is a nop. On a failure
1053  *      a negative errno code is returned.
1054  */
1055 int dev_open(struct net_device *dev)
1056 {
1057         const struct net_device_ops *ops = dev->netdev_ops;
1058         int ret = 0;
1059
1060         ASSERT_RTNL();
1061
1062         /*
1063          *      Is it already up?
1064          */
1065
1066         if (dev->flags & IFF_UP)
1067                 return 0;
1068
1069         /*
1070          *      Is it even present?
1071          */
1072         if (!netif_device_present(dev))
1073                 return -ENODEV;
1074
1075         /*
1076          *      Call device private open method
1077          */
1078         set_bit(__LINK_STATE_START, &dev->state);
1079
1080         if (ops->ndo_validate_addr)
1081                 ret = ops->ndo_validate_addr(dev);
1082
1083         if (!ret && ops->ndo_open)
1084                 ret = ops->ndo_open(dev);
1085
1086         /*
1087          *      If it went open OK then:
1088          */
1089
1090         if (ret)
1091                 clear_bit(__LINK_STATE_START, &dev->state);
1092         else {
1093                 /*
1094                  *      Set the flags.
1095                  */
1096                 dev->flags |= IFF_UP;
1097
1098                 /*
1099                  *      Enable NET_DMA
1100                  */
1101                 net_dmaengine_get();
1102
1103                 /*
1104                  *      Initialize multicasting status
1105                  */
1106                 dev_set_rx_mode(dev);
1107
1108                 /*
1109                  *      Wakeup transmit queue engine
1110                  */
1111                 dev_activate(dev);
1112
1113                 /*
1114                  *      ... and announce new interface.
1115                  */
1116                 call_netdevice_notifiers(NETDEV_UP, dev);
1117         }
1118
1119         return ret;
1120 }
1121
1122 /**
1123  *      dev_close - shutdown an interface.
1124  *      @dev: device to shutdown
1125  *
1126  *      This function moves an active device into down state. A
1127  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
1128  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
1129  *      chain.
1130  */
1131 int dev_close(struct net_device *dev)
1132 {
1133         const struct net_device_ops *ops = dev->netdev_ops;
1134         ASSERT_RTNL();
1135
1136         might_sleep();
1137
1138         if (!(dev->flags & IFF_UP))
1139                 return 0;
1140
1141         /*
1142          *      Tell people we are going down, so that they can
1143          *      prepare to death, when device is still operating.
1144          */
1145         call_netdevice_notifiers(NETDEV_GOING_DOWN, dev);
1146
1147         clear_bit(__LINK_STATE_START, &dev->state);
1148
1149         /* Synchronize to scheduled poll. We cannot touch poll list,
1150          * it can be even on different cpu. So just clear netif_running().
1151          *
1152          * dev->stop() will invoke napi_disable() on all of it's
1153          * napi_struct instances on this device.
1154          */
1155         smp_mb__after_clear_bit(); /* Commit netif_running(). */
1156
1157         dev_deactivate(dev);
1158
1159         /*
1160          *      Call the device specific close. This cannot fail.
1161          *      Only if device is UP
1162          *
1163          *      We allow it to be called even after a DETACH hot-plug
1164          *      event.
1165          */
1166         if (ops->ndo_stop)
1167                 ops->ndo_stop(dev);
1168
1169         /*
1170          *      Device is now down.
1171          */
1172
1173         dev->flags &= ~IFF_UP;
1174
1175         /*
1176          * Tell people we are down
1177          */
1178         call_netdevice_notifiers(NETDEV_DOWN, dev);
1179
1180         /*
1181          *      Shutdown NET_DMA
1182          */
1183         net_dmaengine_put();
1184
1185         return 0;
1186 }
1187
1188
1189 /**
1190  *      dev_disable_lro - disable Large Receive Offload on a device
1191  *      @dev: device
1192  *
1193  *      Disable Large Receive Offload (LRO) on a net device.  Must be
1194  *      called under RTNL.  This is needed if received packets may be
1195  *      forwarded to another interface.
1196  */
1197 void dev_disable_lro(struct net_device *dev)
1198 {
1199         if (dev->ethtool_ops && dev->ethtool_ops->get_flags &&
1200             dev->ethtool_ops->set_flags) {
1201                 u32 flags = dev->ethtool_ops->get_flags(dev);
1202                 if (flags & ETH_FLAG_LRO) {
1203                         flags &= ~ETH_FLAG_LRO;
1204                         dev->ethtool_ops->set_flags(dev, flags);
1205                 }
1206         }
1207         WARN_ON(dev->features & NETIF_F_LRO);
1208 }
1209 EXPORT_SYMBOL(dev_disable_lro);
1210
1211
1212 static int dev_boot_phase = 1;
1213
1214 /*
1215  *      Device change register/unregister. These are not inline or static
1216  *      as we export them to the world.
1217  */
1218
1219 /**
1220  *      register_netdevice_notifier - register a network notifier block
1221  *      @nb: notifier
1222  *
1223  *      Register a notifier to be called when network device events occur.
1224  *      The notifier passed is linked into the kernel structures and must
1225  *      not be reused until it has been unregistered. A negative errno code
1226  *      is returned on a failure.
1227  *
1228  *      When registered all registration and up events are replayed
1229  *      to the new notifier to allow device to have a race free
1230  *      view of the network device list.
1231  */
1232
1233 int register_netdevice_notifier(struct notifier_block *nb)
1234 {
1235         struct net_device *dev;
1236         struct net_device *last;
1237         struct net *net;
1238         int err;
1239
1240         rtnl_lock();
1241         err = raw_notifier_chain_register(&netdev_chain, nb);
1242         if (err)
1243                 goto unlock;
1244         if (dev_boot_phase)
1245                 goto unlock;
1246         for_each_net(net) {
1247                 for_each_netdev(net, dev) {
1248                         err = nb->notifier_call(nb, NETDEV_REGISTER, dev);
1249                         err = notifier_to_errno(err);
1250                         if (err)
1251                                 goto rollback;
1252
1253                         if (!(dev->flags & IFF_UP))
1254                                 continue;
1255
1256                         nb->notifier_call(nb, NETDEV_UP, dev);
1257                 }
1258         }
1259
1260 unlock:
1261         rtnl_unlock();
1262         return err;
1263
1264 rollback:
1265         last = dev;
1266         for_each_net(net) {
1267                 for_each_netdev(net, dev) {
1268                         if (dev == last)
1269                                 break;
1270
1271                         if (dev->flags & IFF_UP) {
1272                                 nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
1273                                 nb->notifier_call(nb, NETDEV_DOWN, dev);
1274                         }
1275                         nb->notifier_call(nb, NETDEV_UNREGISTER, dev);
1276                 }
1277         }
1278
1279         raw_notifier_chain_unregister(&netdev_chain, nb);
1280         goto unlock;
1281 }
1282
1283 /**
1284  *      unregister_netdevice_notifier - unregister a network notifier block
1285  *      @nb: notifier
1286  *
1287  *      Unregister a notifier previously registered by
1288  *      register_netdevice_notifier(). The notifier is unlinked into the
1289  *      kernel structures and may then be reused. A negative errno code
1290  *      is returned on a failure.
1291  */
1292
1293 int unregister_netdevice_notifier(struct notifier_block *nb)
1294 {
1295         int err;
1296
1297         rtnl_lock();
1298         err = raw_notifier_chain_unregister(&netdev_chain, nb);
1299         rtnl_unlock();
1300         return err;
1301 }
1302
1303 /**
1304  *      call_netdevice_notifiers - call all network notifier blocks
1305  *      @val: value passed unmodified to notifier function
1306  *      @dev: net_device pointer passed unmodified to notifier function
1307  *
1308  *      Call all network notifier blocks.  Parameters and return value
1309  *      are as for raw_notifier_call_chain().
1310  */
1311
1312 int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
1313 {
1314         return raw_notifier_call_chain(&netdev_chain, val, dev);
1315 }
1316
1317 /* When > 0 there are consumers of rx skb time stamps */
1318 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1319
1320 void net_enable_timestamp(void)
1321 {
1322         atomic_inc(&netstamp_needed);
1323 }
1324
1325 void net_disable_timestamp(void)
1326 {
1327         atomic_dec(&netstamp_needed);
1328 }
1329
1330 static inline void net_timestamp(struct sk_buff *skb)
1331 {
1332         if (atomic_read(&netstamp_needed))
1333                 __net_timestamp(skb);
1334         else
1335                 skb->tstamp.tv64 = 0;
1336 }
1337
1338 /*
1339  *      Support routine. Sends outgoing frames to any network
1340  *      taps currently in use.
1341  */
1342
1343 static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1344 {
1345         struct packet_type *ptype;
1346
1347         net_timestamp(skb);
1348
1349         rcu_read_lock();
1350         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1351                 /* Never send packets back to the socket
1352                  * they originated from - MvS (miquels@drinkel.ow.org)
1353                  */
1354                 if ((ptype->dev == dev || !ptype->dev) &&
1355                     (ptype->af_packet_priv == NULL ||
1356                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1357                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1358                         if (!skb2)
1359                                 break;
1360
1361                         /* skb->nh should be correctly
1362                            set by sender, so that the second statement is
1363                            just protection against buggy protocols.
1364                          */
1365                         skb_reset_mac_header(skb2);
1366
1367                         if (skb_network_header(skb2) < skb2->data ||
1368                             skb2->network_header > skb2->tail) {
1369                                 if (net_ratelimit())
1370                                         printk(KERN_CRIT "protocol %04x is "
1371                                                "buggy, dev %s\n",
1372                                                skb2->protocol, dev->name);
1373                                 skb_reset_network_header(skb2);
1374                         }
1375
1376                         skb2->transport_header = skb2->network_header;
1377                         skb2->pkt_type = PACKET_OUTGOING;
1378                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1379                 }
1380         }
1381         rcu_read_unlock();
1382 }
1383
1384
1385 static inline void __netif_reschedule(struct Qdisc *q)
1386 {
1387         struct softnet_data *sd;
1388         unsigned long flags;
1389
1390         local_irq_save(flags);
1391         sd = &__get_cpu_var(softnet_data);
1392         q->next_sched = sd->output_queue;
1393         sd->output_queue = q;
1394         raise_softirq_irqoff(NET_TX_SOFTIRQ);
1395         local_irq_restore(flags);
1396 }
1397
1398 void __netif_schedule(struct Qdisc *q)
1399 {
1400         if (!test_and_set_bit(__QDISC_STATE_SCHED, &q->state))
1401                 __netif_reschedule(q);
1402 }
1403 EXPORT_SYMBOL(__netif_schedule);
1404
1405 void dev_kfree_skb_irq(struct sk_buff *skb)
1406 {
1407         if (atomic_dec_and_test(&skb->users)) {
1408                 struct softnet_data *sd;
1409                 unsigned long flags;
1410
1411                 local_irq_save(flags);
1412                 sd = &__get_cpu_var(softnet_data);
1413                 skb->next = sd->completion_queue;
1414                 sd->completion_queue = skb;
1415                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1416                 local_irq_restore(flags);
1417         }
1418 }
1419 EXPORT_SYMBOL(dev_kfree_skb_irq);
1420
/*
 *	Free @skb from any context: use dev_kfree_skb_irq() in hard
 *	interrupt context or with interrupts disabled, and
 *	dev_kfree_skb() otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1429
1430
1431 /**
1432  * netif_device_detach - mark device as removed
1433  * @dev: network device
1434  *
1435  * Mark device as removed from system and therefore no longer available.
1436  */
1437 void netif_device_detach(struct net_device *dev)
1438 {
1439         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1440             netif_running(dev)) {
1441                 netif_stop_queue(dev);
1442         }
1443 }
1444 EXPORT_SYMBOL(netif_device_detach);
1445
1446 /**
1447  * netif_device_attach - mark device as attached
1448  * @dev: network device
1449  *
1450  * Mark device as attached from system and restart if needed.
1451  */
1452 void netif_device_attach(struct net_device *dev)
1453 {
1454         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1455             netif_running(dev)) {
1456                 netif_wake_queue(dev);
1457                 __netdev_watchdog_up(dev);
1458         }
1459 }
1460 EXPORT_SYMBOL(netif_device_attach);
1461
1462 static bool can_checksum_protocol(unsigned long features, __be16 protocol)
1463 {
1464         return ((features & NETIF_F_GEN_CSUM) ||
1465                 ((features & NETIF_F_IP_CSUM) &&
1466                  protocol == htons(ETH_P_IP)) ||
1467                 ((features & NETIF_F_IPV6_CSUM) &&
1468                  protocol == htons(ETH_P_IPV6)));
1469 }
1470
1471 static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb)
1472 {
1473         if (can_checksum_protocol(dev->features, skb->protocol))
1474                 return true;
1475
1476         if (skb->protocol == htons(ETH_P_8021Q)) {
1477                 struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
1478                 if (can_checksum_protocol(dev->features & dev->vlan_features,
1479                                           veh->h_vlan_encapsulated_proto))
1480                         return true;
1481         }
1482
1483         return false;
1484 }
1485
1486 /*
1487  * Invalidate hardware checksum when packet is to be mangled, and
1488  * complete checksum manually on outgoing path.
1489  */
1490 int skb_checksum_help(struct sk_buff *skb)
1491 {
1492         __wsum csum;
1493         int ret = 0, offset;
1494
1495         if (skb->ip_summed == CHECKSUM_COMPLETE)
1496                 goto out_set_summed;
1497
1498         if (unlikely(skb_shinfo(skb)->gso_size)) {
1499                 /* Let GSO fix up the checksum. */
1500                 goto out_set_summed;
1501         }
1502
1503         offset = skb->csum_start - skb_headroom(skb);
1504         BUG_ON(offset >= skb_headlen(skb));
1505         csum = skb_checksum(skb, offset, skb->len - offset, 0);
1506
1507         offset += skb->csum_offset;
1508         BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));
1509
1510         if (skb_cloned(skb) &&
1511             !skb_clone_writable(skb, offset + sizeof(__sum16))) {
1512                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1513                 if (ret)
1514                         goto out;
1515         }
1516
1517         *(__sum16 *)(skb->data + offset) = csum_fold(csum);
1518 out_set_summed:
1519         skb->ip_summed = CHECKSUM_NONE;
1520 out:
1521         return ret;
1522 }
1523
1524 /**
1525  *      skb_gso_segment - Perform segmentation on skb.
1526  *      @skb: buffer to segment
1527  *      @features: features for the output path (see dev->features)
1528  *
1529  *      This function segments the given skb and returns a list of segments.
1530  *
1531  *      It may return NULL if the skb requires no segmentation.  This is
1532  *      only possible when GSO is used for verifying header integrity.
1533  */
1534 struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features)
1535 {
1536         struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
1537         struct packet_type *ptype;
1538         __be16 type = skb->protocol;
1539         int err;
1540
1541         skb_reset_mac_header(skb);
1542         skb->mac_len = skb->network_header - skb->mac_header;
1543         __skb_pull(skb, skb->mac_len);
1544
1545         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1546                 struct net_device *dev = skb->dev;
1547                 struct ethtool_drvinfo info = {};
1548
1549                 if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
1550                         dev->ethtool_ops->get_drvinfo(dev, &info);
1551
1552                 WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d "
1553                         "ip_summed=%d",
1554                      info.driver, dev ? dev->features : 0L,
1555                      skb->sk ? skb->sk->sk_route_caps : 0L,
1556                      skb->len, skb->data_len, skb->ip_summed);
1557
1558                 if (skb_header_cloned(skb) &&
1559                     (err = pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
1560                         return ERR_PTR(err);
1561         }
1562
1563         rcu_read_lock();
1564         list_for_each_entry_rcu(ptype,
1565                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
1566                 if (ptype->type == type && !ptype->dev && ptype->gso_segment) {
1567                         if (unlikely(skb->ip_summed != CHECKSUM_PARTIAL)) {
1568                                 err = ptype->gso_send_check(skb);
1569                                 segs = ERR_PTR(err);
1570                                 if (err || skb_gso_ok(skb, features))
1571                                         break;
1572                                 __skb_push(skb, (skb->data -
1573                                                  skb_network_header(skb)));
1574                         }
1575                         segs = ptype->gso_segment(skb, features);
1576                         break;
1577                 }
1578         }
1579         rcu_read_unlock();
1580
1581         __skb_push(skb, skb->data - skb_mac_header(skb));
1582
1583         return segs;
1584 }
1585
1586 EXPORT_SYMBOL(skb_gso_segment);
1587
/*
 * Take action when hardware reception checksum errors are detected:
 * log the device name (or "<unknown>" when @dev is NULL) and dump the
 * stack, rate limited by net_ratelimit().
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	if (!net_ratelimit())
		return;

	printk(KERN_ERR "%s: hw csum failure.\n",
		dev ? dev->name : "<unknown>");
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1600
/* Actually, we should eliminate this check as soon as we know, that:
 * 1. IOMMU is present and allows to map all the memory.
 * 2. No high memory really exists on this machine.
 *
 * Returns 1 if @skb has a page fragment in high memory while @dev
 * lacks NETIF_F_HIGHDMA, 0 otherwise (always 0 without CONFIG_HIGHMEM).
 */

static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_HIGHMEM
	if (!(dev->features & NETIF_F_HIGHDMA)) {
		int frag;

		for (frag = 0; frag < skb_shinfo(skb)->nr_frags; frag++) {
			if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
				return 1;
		}
	}
#endif
	return 0;
}
1621
1622 struct dev_gso_cb {
1623         void (*destructor)(struct sk_buff *skb);
1624 };
1625
1626 #define DEV_GSO_CB(skb) ((struct dev_gso_cb *)(skb)->cb)
1627
1628 static void dev_gso_skb_destructor(struct sk_buff *skb)
1629 {
1630         struct dev_gso_cb *cb;
1631
1632         do {
1633                 struct sk_buff *nskb = skb->next;
1634
1635                 skb->next = nskb->next;
1636                 nskb->next = NULL;
1637                 kfree_skb(nskb);
1638         } while (skb->next);
1639
1640         cb = DEV_GSO_CB(skb);
1641         if (cb->destructor)
1642                 cb->destructor(skb);
1643 }
1644
1645 /**
1646  *      dev_gso_segment - Perform emulated hardware segmentation on skb.
1647  *      @skb: buffer to segment
1648  *
1649  *      This function segments the given skb and stores the list of segments
1650  *      in skb->next.
1651  */
1652 static int dev_gso_segment(struct sk_buff *skb)
1653 {
1654         struct net_device *dev = skb->dev;
1655         struct sk_buff *segs;
1656         int features = dev->features & ~(illegal_highdma(dev, skb) ?
1657                                          NETIF_F_SG : 0);
1658
1659         segs = skb_gso_segment(skb, features);
1660
1661         /* Verifying header integrity only. */
1662         if (!segs)
1663                 return 0;
1664
1665         if (IS_ERR(segs))
1666                 return PTR_ERR(segs);
1667
1668         skb->next = segs;
1669         DEV_GSO_CB(skb)->destructor = skb->destructor;
1670         skb->destructor = dev_gso_skb_destructor;
1671
1672         return 0;
1673 }
1674
1675 int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
1676                         struct netdev_queue *txq)
1677 {
1678         const struct net_device_ops *ops = dev->netdev_ops;
1679         int rc;
1680
1681         prefetch(&dev->netdev_ops->ndo_start_xmit);
1682         if (likely(!skb->next)) {
1683                 if (!list_empty(&ptype_all))
1684                         dev_queue_xmit_nit(skb, dev);
1685
1686                 if (netif_needs_gso(dev, skb)) {
1687                         if (unlikely(dev_gso_segment(skb)))
1688                                 goto out_kfree_skb;
1689                         if (skb->next)
1690                                 goto gso;
1691                 }
1692
1693                 rc = ops->ndo_start_xmit(skb, dev);
1694                 /*
1695                  * TODO: if skb_orphan() was called by
1696                  * dev->hard_start_xmit() (for example, the unmodified
1697                  * igb driver does that; bnx2 doesn't), then
1698                  * skb_tx_software_timestamp() will be unable to send
1699                  * back the time stamp.
1700                  *
1701                  * How can this be prevented? Always create another
1702                  * reference to the socket before calling
1703                  * dev->hard_start_xmit()? Prevent that skb_orphan()
1704                  * does anything in dev->hard_start_xmit() by clearing
1705                  * the skb destructor before the call and restoring it
1706                  * afterwards, then doing the skb_orphan() ourselves?
1707                  */
1708                 return rc;
1709         }
1710
1711 gso:
1712         do {
1713                 struct sk_buff *nskb = skb->next;
1714
1715                 skb->next = nskb->next;
1716                 nskb->next = NULL;
1717                 rc = ops->ndo_start_xmit(nskb, dev);
1718                 if (unlikely(rc)) {
1719                         nskb->next = skb->next;
1720                         skb->next = nskb;
1721                         return rc;
1722                 }
1723                 if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
1724                         return NETDEV_TX_BUSY;
1725         } while (skb->next);
1726
1727         skb->destructor = DEV_GSO_CB(skb)->destructor;
1728
1729 out_kfree_skb:
1730         kfree_skb(skb);
1731         return 0;
1732 }
1733
1734 static u32 skb_tx_hashrnd;
1735
1736 static u16 skb_tx_hash(struct net_device *dev, struct sk_buff *skb)
1737 {
1738         u32 hash;
1739
1740         if (skb_rx_queue_recorded(skb)) {
1741                 hash = skb_get_rx_queue(skb);
1742         } else if (skb->sk && skb->sk->sk_hash) {
1743                 hash = skb->sk->sk_hash;
1744         } else
1745                 hash = skb->protocol;
1746
1747         hash = jhash_1word(hash, skb_tx_hashrnd);
1748
1749         return (u16) (((u64) hash * dev->real_num_tx_queues) >> 32);
1750 }
1751
1752 static struct netdev_queue *dev_pick_tx(struct net_device *dev,
1753                                         struct sk_buff *skb)
1754 {
1755         const struct net_device_ops *ops = dev->netdev_ops;
1756         u16 queue_index = 0;
1757
1758         if (ops->ndo_select_queue)
1759                 queue_index = ops->ndo_select_queue(dev, skb);
1760         else if (dev->real_num_tx_queues > 1)
1761                 queue_index = skb_tx_hash(dev, skb);
1762
1763         skb_set_queue_mapping(skb, queue_index);
1764         return netdev_get_tx_queue(dev, queue_index);
1765 }
1766
1767 /**
1768  *      dev_queue_xmit - transmit a buffer
1769  *      @skb: buffer to transmit
1770  *
1771  *      Queue a buffer for transmission to a network device. The caller must
1772  *      have set the device and priority and built the buffer before calling
1773  *      this function. The function can be called from an interrupt.
1774  *
1775  *      A negative errno code is returned on a failure. A success does not
1776  *      guarantee the frame will be transmitted as it may be dropped due
1777  *      to congestion or traffic shaping.
1778  *
1779  * -----------------------------------------------------------------------------------
1780  *      I notice this method can also return errors from the queue disciplines,
1781  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1782  *      be positive.
1783  *
1784  *      Regardless of the return value, the skb is consumed, so it is currently
1785  *      difficult to retry a send to this method.  (You can bump the ref count
1786  *      before sending to hold a reference for retry if you are careful.)
1787  *
1788  *      When calling this method, interrupts MUST be enabled.  This is because
1789  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1790  *          --BLG
1791  */
1792 int dev_queue_xmit(struct sk_buff *skb)
1793 {
1794         struct net_device *dev = skb->dev;
1795         struct netdev_queue *txq;
1796         struct Qdisc *q;
1797         int rc = -ENOMEM;
1798
1799         /* GSO will handle the following emulations directly. */
1800         if (netif_needs_gso(dev, skb))
1801                 goto gso;
1802
1803         if (skb_shinfo(skb)->frag_list &&
1804             !(dev->features & NETIF_F_FRAGLIST) &&
1805             __skb_linearize(skb))
1806                 goto out_kfree_skb;
1807
1808         /* Fragmented skb is linearized if device does not support SG,
1809          * or if at least one of fragments is in highmem and device
1810          * does not support DMA from it.
1811          */
1812         if (skb_shinfo(skb)->nr_frags &&
1813             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1814             __skb_linearize(skb))
1815                 goto out_kfree_skb;
1816
1817         /* If packet is not checksummed and device does not support
1818          * checksumming for this protocol, complete checksumming here.
1819          */
1820         if (skb->ip_summed == CHECKSUM_PARTIAL) {
1821                 skb_set_transport_header(skb, skb->csum_start -
1822                                               skb_headroom(skb));
1823                 if (!dev_can_checksum(dev, skb) && skb_checksum_help(skb))
1824                         goto out_kfree_skb;
1825         }
1826
1827 gso:
1828         /* Disable soft irqs for various locks below. Also
1829          * stops preemption for RCU.
1830          */
1831         rcu_read_lock_bh();
1832
1833         txq = dev_pick_tx(dev, skb);
1834         q = rcu_dereference(txq->qdisc);
1835
1836 #ifdef CONFIG_NET_CLS_ACT
1837         skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1838 #endif
1839         if (q->enqueue) {
1840                 spinlock_t *root_lock = qdisc_lock(q);
1841
1842                 spin_lock(root_lock);
1843
1844                 if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
1845                         kfree_skb(skb);
1846                         rc = NET_XMIT_DROP;
1847                 } else {
1848                         rc = qdisc_enqueue_root(skb, q);
1849                         qdisc_run(q);
1850                 }
1851                 spin_unlock(root_lock);
1852
1853                 goto out;
1854         }
1855
1856         /* The device has no queue. Common case for software devices:
1857            loopback, all the sorts of tunnels...
1858
1859            Really, it is unlikely that netif_tx_lock protection is necessary
1860            here.  (f.e. loopback and IP tunnels are clean ignoring statistics
1861            counters.)
1862            However, it is possible, that they rely on protection
1863            made by us here.
1864
1865            Check this and shot the lock. It is not prone from deadlocks.
1866            Either shot noqueue qdisc, it is even simpler 8)
1867          */
1868         if (dev->flags & IFF_UP) {
1869                 int cpu = smp_processor_id(); /* ok because BHs are off */
1870
1871                 if (txq->xmit_lock_owner != cpu) {
1872
1873                         HARD_TX_LOCK(dev, txq, cpu);
1874
1875                         if (!netif_tx_queue_stopped(txq)) {
1876                                 rc = 0;
1877                                 if (!dev_hard_start_xmit(skb, dev, txq)) {
1878                                         HARD_TX_UNLOCK(dev, txq);
1879                                         goto out;
1880                                 }
1881                         }
1882                         HARD_TX_UNLOCK(dev, txq);
1883                         if (net_ratelimit())
1884                                 printk(KERN_CRIT "Virtual device %s asks to "
1885                                        "queue packet!\n", dev->name);
1886                 } else {
1887                         /* Recursion is detected! It is possible,
1888                          * unfortunately */
1889                         if (net_ratelimit())
1890                                 printk(KERN_CRIT "Dead loop on virtual device "
1891                                        "%s, fix it urgently!\n", dev->name);
1892                 }
1893         }
1894
1895         rc = -ENETDOWN;
1896         rcu_read_unlock_bh();
1897
1898 out_kfree_skb:
1899         kfree_skb(skb);
1900         return rc;
1901 out:
1902         rcu_read_unlock_bh();
1903         return rc;
1904 }
1905
1906
1907 /*=======================================================================
1908                         Receiver routines
1909   =======================================================================*/
1910
/*
 * netdev_max_backlog: netif_rx() drops a packet once the per-CPU
 * input_pkt_queue is longer than this.
 */
1911 int netdev_max_backlog __read_mostly = 1000;
/* NOTE(review): not referenced in this part of the file - presumably the
 * per-softirq packet budget; confirm against its user. */
1912 int netdev_budget __read_mostly = 300;
/* process_backlog() copies weight_p into napi->weight on every poll. */
1913 int weight_p __read_mostly = 64;            /* old backlog weight */
1914
/* Per-CPU receive counters; .total and .dropped are bumped by netif_rx(). */
1915 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1916
1917
1918 /**
1919  *      netif_rx        -       post buffer to the network code
1920  *      @skb: buffer to post
1921  *
1922  *      This function receives a packet from a device driver and queues it for
1923  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1924  *      may be dropped during processing for congestion control or by the
1925  *      protocol layers.
1926  *
1927  *      return values:
1928  *      NET_RX_SUCCESS  (no congestion)
1929  *      NET_RX_DROP     (packet was dropped)
1930  *
1931  */
1932
1933 int netif_rx(struct sk_buff *skb)
1934 {
1935         struct softnet_data *queue;
1936         unsigned long flags;
1937
1938         /* if netpoll wants it, pretend we never saw it */
1939         if (netpoll_rx(skb))
1940                 return NET_RX_DROP;
1941
1942         if (!skb->tstamp.tv64)
1943                 net_timestamp(skb);
1944
1945         /*
1946          * The code is rearranged so that the path is the most
1947          * short when CPU is congested, but is still operating.
1948          */
1949         local_irq_save(flags);
1950         queue = &__get_cpu_var(softnet_data);
1951
1952         __get_cpu_var(netdev_rx_stat).total++;
1953         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1954                 if (queue->input_pkt_queue.qlen) {
1955 enqueue:
1956                         __skb_queue_tail(&queue->input_pkt_queue, skb);
1957                         local_irq_restore(flags);
1958                         return NET_RX_SUCCESS;
1959                 }
1960
1961                 napi_schedule(&queue->backlog);
1962                 goto enqueue;
1963         }
1964
1965         __get_cpu_var(netdev_rx_stat).dropped++;
1966         local_irq_restore(flags);
1967
1968         kfree_skb(skb);
1969         return NET_RX_DROP;
1970 }
1971
/*
 * netif_rx() wrapper that, with preemption disabled, runs any softirq
 * left pending after the packet was queued.  Returns netif_rx()'s result.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int ret;

	preempt_disable();

	ret = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();

	preempt_enable();

	return ret;
}

EXPORT_SYMBOL(netif_rx_ni);
1986
1987 static void net_tx_action(struct softirq_action *h)
1988 {
1989         struct softnet_data *sd = &__get_cpu_var(softnet_data);
1990
1991         if (sd->completion_queue) {
1992                 struct sk_buff *clist;
1993
1994                 local_irq_disable();
1995                 clist = sd->completion_queue;
1996                 sd->completion_queue = NULL;
1997                 local_irq_enable();
1998
1999                 while (clist) {
2000                         struct sk_buff *skb = clist;
2001                         clist = clist->next;
2002
2003                         WARN_ON(atomic_read(&skb->users));
2004                         __kfree_skb(skb);
2005                 }
2006         }
2007
2008         if (sd->output_queue) {
2009                 struct Qdisc *head;
2010
2011                 local_irq_disable();
2012                 head = sd->output_queue;
2013                 sd->output_queue = NULL;
2014                 local_irq_enable();
2015
2016                 while (head) {
2017                         struct Qdisc *q = head;
2018                         spinlock_t *root_lock;
2019
2020                         head = head->next_sched;
2021
2022                         root_lock = qdisc_lock(q);
2023                         if (spin_trylock(root_lock)) {
2024                                 smp_mb__before_clear_bit();
2025                                 clear_bit(__QDISC_STATE_SCHED,
2026                                           &q->state);
2027                                 qdisc_run(q);
2028                                 spin_unlock(root_lock);
2029                         } else {
2030                                 if (!test_bit(__QDISC_STATE_DEACTIVATED,
2031                                               &q->state)) {
2032                                         __netif_reschedule(q);
2033                                 } else {
2034                                         smp_mb__before_clear_bit();
2035                                         clear_bit(__QDISC_STATE_SCHED,
2036                                                   &q->state);
2037                                 }
2038                         }
2039                 }
2040         }
2041 }
2042
2043 static inline int deliver_skb(struct sk_buff *skb,
2044                               struct packet_type *pt_prev,
2045                               struct net_device *orig_dev)
2046 {
2047         atomic_inc(&skb->users);
2048         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2049 }
2050
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* These hooks are defined here for ATM. */
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent) __read_mostly;

/*
 * Bridging hook, called for packets arriving on a device that has a
 * bridge port attached.  Returns NULL if the packet was consumed.
 */
struct sk_buff *(*br_handle_frame_hook)(struct net_bridge_port *p,
					struct sk_buff *skb) __read_mostly;

/*
 * Offer @skb to the bridge.  Loopback packets and packets from devices
 * without a bridge port are returned untouched.  Otherwise any handler
 * pending in *@pt_prev is run first (result stored in *@ret) and the
 * packet is handed to br_handle_frame_hook(), whose result is returned.
 */
static inline struct sk_buff *handle_bridge(struct sk_buff *skb,
					    struct packet_type **pt_prev, int *ret,
					    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if (skb->pkt_type == PACKET_LOOPBACK)
		return skb;

	port = rcu_dereference(skb->dev->br_port);
	if (!port)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, skb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2084
#if defined(CONFIG_MACVLAN) || defined(CONFIG_MACVLAN_MODULE)
struct sk_buff *(*macvlan_handle_frame_hook)(struct sk_buff *skb) __read_mostly;
EXPORT_SYMBOL_GPL(macvlan_handle_frame_hook);

/*
 * Offer @skb to macvlan.  Packets from devices without a macvlan port are
 * returned untouched.  Otherwise any handler pending in *@pt_prev is run
 * first (result stored in *@ret) and the packet is handed to
 * macvlan_handle_frame_hook(), whose result is returned.
 */
static inline struct sk_buff *handle_macvlan(struct sk_buff *skb,
					     struct packet_type **pt_prev,
					     int *ret,
					     struct net_device *orig_dev)
{
	if (!skb->dev->macvlan_port)
		return skb;

	if (*pt_prev) {
		*ret = deliver_skb(skb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return macvlan_handle_frame_hook(skb);
}
#else
#define handle_macvlan(skb, pt_prev, ret, orig_dev)	(skb)
#endif
2106
2107 #ifdef CONFIG_NET_CLS_ACT
2108 /* TODO: Maybe we should just force sch_ingress to be compiled in
2109  * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
2110  * a compare and 2 stores extra right now if we dont have it on
2111  * but have CONFIG_NET_CLS_ACT
2112  * NOTE: This doesnt stop any functionality; if you dont have
2113  * the ingress scheduler, you just cant add policies on ingress.
2114  *
2115  */
2116 static int ing_filter(struct sk_buff *skb)
2117 {
2118         struct net_device *dev = skb->dev;
2119         u32 ttl = G_TC_RTTL(skb->tc_verd);
2120         struct netdev_queue *rxq;
2121         int result = TC_ACT_OK;
2122         struct Qdisc *q;
2123
2124         if (MAX_RED_LOOP < ttl++) {
2125                 printk(KERN_WARNING
2126                        "Redir loop detected Dropping packet (%d->%d)\n",
2127                        skb->iif, dev->ifindex);
2128                 return TC_ACT_SHOT;
2129         }
2130
2131         skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl);
2132         skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS);
2133
2134         rxq = &dev->rx_queue;
2135
2136         q = rxq->qdisc;
2137         if (q != &noop_qdisc) {
2138                 spin_lock(qdisc_lock(q));
2139                 if (likely(!test_bit(__QDISC_STATE_DEACTIVATED, &q->state)))
2140                         result = qdisc_enqueue_root(skb, q);
2141                 spin_unlock(qdisc_lock(q));
2142         }
2143
2144         return result;
2145 }
2146
2147 static inline struct sk_buff *handle_ing(struct sk_buff *skb,
2148                                          struct packet_type **pt_prev,
2149                                          int *ret, struct net_device *orig_dev)
2150 {
2151         if (skb->dev->rx_queue.qdisc == &noop_qdisc)
2152                 goto out;
2153
2154         if (*pt_prev) {
2155                 *ret = deliver_skb(skb, *pt_prev, orig_dev);
2156                 *pt_prev = NULL;
2157         } else {
2158                 /* Huh? Why does turning on AF_PACKET affect this? */
2159                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
2160         }
2161
2162         switch (ing_filter(skb)) {
2163         case TC_ACT_SHOT:
2164         case TC_ACT_STOLEN:
2165                 kfree_skb(skb);
2166                 return NULL;
2167         }
2168
2169 out:
2170         skb->tc_verd = 0;
2171         return skb;
2172 }
2173 #endif
2174
2175 /*
2176  *      netif_nit_deliver - deliver received packets to network taps
2177  *      @skb: buffer
2178  *
2179  *      This function is used to deliver incoming packets to network
2180  *      taps. It should be used when the normal netif_receive_skb path
2181  *      is bypassed, for example because of VLAN acceleration.
2182  */
2183 void netif_nit_deliver(struct sk_buff *skb)
2184 {
2185         struct packet_type *ptype;
2186
2187         if (list_empty(&ptype_all))
2188                 return;
2189
2190         skb_reset_network_header(skb);
2191         skb_reset_transport_header(skb);
2192         skb->mac_len = skb->network_header - skb->mac_header;
2193
2194         rcu_read_lock();
2195         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2196                 if (!ptype->dev || ptype->dev == skb->dev)
2197                         deliver_skb(skb, ptype, skb->dev);
2198         }
2199         rcu_read_unlock();
2200 }
2201
2202 /**
2203  *      netif_receive_skb - process receive buffer from network
2204  *      @skb: buffer to process
2205  *
2206  *      netif_receive_skb() is the main receive data processing function.
2207  *      It always succeeds. The buffer may be dropped during processing
2208  *      for congestion control or by the protocol layers.
2209  *
2210  *      This function may only be called from softirq context and interrupts
2211  *      should be enabled.
2212  *
2213  *      Return values (usually ignored):
2214  *      NET_RX_SUCCESS: no congestion
2215  *      NET_RX_DROP: packet was dropped
2216  */
2217 int netif_receive_skb(struct sk_buff *skb)
2218 {
2219         struct packet_type *ptype, *pt_prev;
2220         struct net_device *orig_dev;
2221         struct net_device *null_or_orig;
2222         int ret = NET_RX_DROP;
2223         __be16 type;
2224
2225         if (skb->vlan_tci && vlan_hwaccel_do_receive(skb))
2226                 return NET_RX_SUCCESS;
2227
2228         /* if we've gotten here through NAPI, check netpoll */
2229         if (netpoll_receive_skb(skb))
2230                 return NET_RX_DROP;
2231
2232         if (!skb->tstamp.tv64)
2233                 net_timestamp(skb);
2234
2235         if (!skb->iif)
2236                 skb->iif = skb->dev->ifindex;
2237
2238         null_or_orig = NULL;
2239         orig_dev = skb->dev;
2240         if (orig_dev->master) {
2241                 if (skb_bond_should_drop(skb))
2242                         null_or_orig = orig_dev; /* deliver only exact match */
2243                 else
2244                         skb->dev = orig_dev->master;
2245         }
2246
2247         __get_cpu_var(netdev_rx_stat).total++;
2248
2249         skb_reset_network_header(skb);
2250         skb_reset_transport_header(skb);
2251         skb->mac_len = skb->network_header - skb->mac_header;
2252
2253         pt_prev = NULL;
2254
2255         rcu_read_lock();
2256
2257 #ifdef CONFIG_NET_CLS_ACT
2258         if (skb->tc_verd & TC_NCLS) {
2259                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
2260                 goto ncls;
2261         }
2262 #endif
2263
2264         list_for_each_entry_rcu(ptype, &ptype_all, list) {
2265                 if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2266                     ptype->dev == orig_dev) {
2267                         if (pt_prev)
2268                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2269                         pt_prev = ptype;
2270                 }
2271         }
2272
2273 #ifdef CONFIG_NET_CLS_ACT
2274         skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
2275         if (!skb)
2276                 goto out;
2277 ncls:
2278 #endif
2279
2280         skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
2281         if (!skb)
2282                 goto out;
2283         skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
2284         if (!skb)
2285                 goto out;
2286
2287         skb_orphan(skb);
2288
2289         type = skb->protocol;
2290         list_for_each_entry_rcu(ptype,
2291                         &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
2292                 if (ptype->type == type &&
2293                     (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
2294                      ptype->dev == orig_dev)) {
2295                         if (pt_prev)
2296                                 ret = deliver_skb(skb, pt_prev, orig_dev);
2297                         pt_prev = ptype;
2298                 }
2299         }
2300
2301         if (pt_prev) {
2302                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
2303         } else {
2304                 kfree_skb(skb);
2305                 /* Jamal, now you will not able to escape explaining
2306                  * me how you were going to use this. :-)
2307                  */
2308                 ret = NET_RX_DROP;
2309         }
2310
2311 out:
2312         rcu_read_unlock();
2313         return ret;
2314 }
2315
2316 /* Network device is going away, flush any packets still pending  */
2317 static void flush_backlog(void *arg)
2318 {
2319         struct net_device *dev = arg;
2320         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2321         struct sk_buff *skb, *tmp;
2322
2323         skb_queue_walk_safe(&queue->input_pkt_queue, skb, tmp)
2324                 if (skb->dev == dev) {
2325                         __skb_unlink(skb, &queue->input_pkt_queue);
2326                         kfree_skb(skb);
2327                 }
2328 }
2329
2330 static int napi_gro_complete(struct sk_buff *skb)
2331 {
2332         struct packet_type *ptype;
2333         __be16 type = skb->protocol;
2334         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2335         int err = -ENOENT;
2336
2337         if (NAPI_GRO_CB(skb)->count == 1)
2338                 goto out;
2339
2340         rcu_read_lock();
2341         list_for_each_entry_rcu(ptype, head, list) {
2342                 if (ptype->type != type || ptype->dev || !ptype->gro_complete)
2343                         continue;
2344
2345                 err = ptype->gro_complete(skb);
2346                 break;
2347         }
2348         rcu_read_unlock();
2349
2350         if (err) {
2351                 WARN_ON(&ptype->list == head);
2352                 kfree_skb(skb);
2353                 return NET_RX_SUCCESS;
2354         }
2355
2356 out:
2357         skb_shinfo(skb)->gso_size = 0;
2358         return netif_receive_skb(skb);
2359 }
2360
2361 void napi_gro_flush(struct napi_struct *napi)
2362 {
2363         struct sk_buff *skb, *next;
2364
2365         for (skb = napi->gro_list; skb; skb = next) {
2366                 next = skb->next;
2367                 skb->next = NULL;
2368                 napi_gro_complete(skb);
2369         }
2370
2371         napi->gro_count = 0;
2372         napi->gro_list = NULL;
2373 }
2374 EXPORT_SYMBOL(napi_gro_flush);
2375
2376 void *skb_gro_header(struct sk_buff *skb, unsigned int hlen)
2377 {
2378         unsigned int offset = skb_gro_offset(skb);
2379
2380         hlen += offset;
2381         if (hlen <= skb_headlen(skb))
2382                 return skb->data + offset;
2383
2384         if (unlikely(!skb_shinfo(skb)->nr_frags ||
2385                      skb_shinfo(skb)->frags[0].size <=
2386                      hlen - skb_headlen(skb) ||
2387                      PageHighMem(skb_shinfo(skb)->frags[0].page)))
2388                 return pskb_may_pull(skb, hlen) ? skb->data + offset : NULL;
2389
2390         return page_address(skb_shinfo(skb)->frags[0].page) +
2391                skb_shinfo(skb)->frags[0].page_offset +
2392                offset - skb_headlen(skb);
2393 }
2394 EXPORT_SYMBOL(skb_gro_header);
2395
2396 int dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2397 {
2398         struct sk_buff **pp = NULL;
2399         struct packet_type *ptype;
2400         __be16 type = skb->protocol;
2401         struct list_head *head = &ptype_base[ntohs(type) & PTYPE_HASH_MASK];
2402         int same_flow;
2403         int mac_len;
2404         int ret;
2405
2406         if (!(skb->dev->features & NETIF_F_GRO))
2407                 goto normal;
2408
2409         if (skb_is_gso(skb) || skb_shinfo(skb)->frag_list)
2410                 goto normal;
2411
2412         rcu_read_lock();
2413         list_for_each_entry_rcu(ptype, head, list) {
2414                 if (ptype->type != type || ptype->dev || !ptype->gro_receive)
2415                         continue;
2416
2417                 skb_set_network_header(skb, skb_gro_offset(skb));
2418                 mac_len = skb->network_header - skb->mac_header;
2419                 skb->mac_len = mac_len;
2420                 NAPI_GRO_CB(skb)->same_flow = 0;
2421                 NAPI_GRO_CB(skb)->flush = 0;
2422                 NAPI_GRO_CB(skb)->free = 0;
2423
2424                 pp = ptype->gro_receive(&napi->gro_list, skb);
2425                 break;
2426         }
2427         rcu_read_unlock();
2428
2429         if (&ptype->list == head)
2430                 goto normal;
2431
2432         same_flow = NAPI_GRO_CB(skb)->same_flow;
2433         ret = NAPI_GRO_CB(skb)->free ? GRO_MERGED_FREE : GRO_MERGED;
2434
2435         if (pp) {
2436                 struct sk_buff *nskb = *pp;
2437
2438                 *pp = nskb->next;
2439                 nskb->next = NULL;
2440                 napi_gro_complete(nskb);
2441                 napi->gro_count--;
2442         }
2443
2444         if (same_flow)
2445                 goto ok;
2446
2447         if (NAPI_GRO_CB(skb)->flush || napi->gro_count >= MAX_GRO_SKBS)
2448                 goto normal;
2449
2450         napi->gro_count++;
2451         NAPI_GRO_CB(skb)->count = 1;
2452         skb_shinfo(skb)->gso_size = skb_gro_len(skb);
2453         skb->next = napi->gro_list;
2454         napi->gro_list = skb;
2455         ret = GRO_HELD;
2456
2457 pull:
2458         if (unlikely(!pskb_may_pull(skb, skb_gro_offset(skb)))) {
2459                 if (napi->gro_list == skb)
2460                         napi->gro_list = skb->next;
2461                 ret = GRO_DROP;
2462         }
2463
2464 ok:
2465         return ret;
2466
2467 normal:
2468         ret = GRO_NORMAL;
2469         goto pull;
2470 }
2471 EXPORT_SYMBOL(dev_gro_receive);
2472
2473 static int __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
2474 {
2475         struct sk_buff *p;
2476
2477         for (p = napi->gro_list; p; p = p->next) {
2478                 NAPI_GRO_CB(p)->same_flow = !compare_ether_header(
2479                         skb_mac_header(p), skb_gro_mac_header(skb));
2480                 NAPI_GRO_CB(p)->flush = 0;
2481         }
2482
2483         return dev_gro_receive(napi, skb);
2484 }
2485
2486 int napi_skb_finish(int ret, struct sk_buff *skb)
2487 {
2488         int err = NET_RX_SUCCESS;
2489
2490         switch (ret) {
2491         case GRO_NORMAL:
2492                 return netif_receive_skb(skb);
2493
2494         case GRO_DROP:
2495                 err = NET_RX_DROP;
2496                 /* fall through */
2497
2498         case GRO_MERGED_FREE:
2499                 kfree_skb(skb);
2500                 break;
2501         }
2502
2503         return err;
2504 }
2505 EXPORT_SYMBOL(napi_skb_finish);
2506
/*
 * GRO entry point for a fully built skb: reset the GRO offset, attempt a
 * merge and act on the verdict.  Returns NET_RX_SUCCESS/NET_RX_DROP as
 * produced by napi_skb_finish().
 */
int napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
	int verdict;

	skb_gro_reset_offset(skb);
	verdict = __napi_gro_receive(napi, skb);

	return napi_skb_finish(verdict, skb);
}
EXPORT_SYMBOL(napi_gro_receive);
2514
2515 void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb)
2516 {
2517         __skb_pull(skb, skb_headlen(skb));
2518         skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb));
2519
2520         napi->skb = skb;
2521 }
2522 EXPORT_SYMBOL(napi_reuse_skb);
2523
2524 struct sk_buff *napi_fraginfo_skb(struct napi_struct *napi,
2525                                   struct napi_gro_fraginfo *info)
2526 {
2527         struct net_device *dev = napi->dev;
2528         struct sk_buff *skb = napi->skb;
2529         struct ethhdr *eth;
2530         skb_frag_t *frag;
2531         int i;
2532
2533         napi->skb = NULL;
2534
2535         if (!skb) {
2536                 skb = netdev_alloc_skb(dev, GRO_MAX_HEAD + NET_IP_ALIGN);
2537                 if (!skb)
2538                         goto out;
2539
2540                 skb_reserve(skb, NET_IP_ALIGN);
2541         }
2542
2543         BUG_ON(info->nr_frags > MAX_SKB_FRAGS);
2544         frag = &info->frags[info->nr_frags - 1];
2545
2546         for (i = skb_shinfo(skb)->nr_frags; i < info->nr_frags; i++) {
2547                 skb_fill_page_desc(skb, i, frag->page, frag->page_offset,
2548                                    frag->size);
2549                 frag++;
2550         }
2551         skb_shinfo(skb)->nr_frags = info->nr_frags;
2552
2553         skb->data_len = info->len;
2554         skb->len += info->len;
2555         skb->truesize += info->len;
2556
2557         skb_reset_mac_header(skb);
2558         skb_gro_reset_offset(skb);
2559
2560         eth = skb_gro_header(skb, sizeof(*eth));
2561         if (!eth) {
2562                 napi_reuse_skb(napi, skb);
2563                 skb = NULL;
2564                 goto out;
2565         }
2566
2567         skb_gro_pull(skb, sizeof(*eth));
2568
2569         /*
2570          * This works because the only protocols we care about don't require
2571          * special handling.  We'll fix it up properly at the end.
2572          */
2573         skb->protocol = eth->h_proto;
2574
2575         skb->ip_summed = info->ip_summed;
2576         skb->csum = info->csum;
2577
2578 out:
2579         return skb;
2580 }
2581 EXPORT_SYMBOL(napi_fraginfo_skb);
2582
2583 int napi_frags_finish(struct napi_struct *napi, struct sk_buff *skb, int ret)
2584 {
2585         int err = NET_RX_SUCCESS;
2586
2587         switch (ret) {
2588         case GRO_NORMAL:
2589         case GRO_HELD:
2590                 skb->protocol = eth_type_trans(skb, napi->dev);
2591
2592                 if (ret == GRO_NORMAL)
2593                         return netif_receive_skb(skb);
2594
2595                 skb_gro_pull(skb, -ETH_HLEN);
2596                 break;
2597
2598         case GRO_DROP:
2599                 err = NET_RX_DROP;
2600                 /* fall through */
2601
2602         case GRO_MERGED_FREE:
2603                 napi_reuse_skb(napi, skb);
2604                 break;
2605         }
2606
2607         return err;
2608 }
2609 EXPORT_SYMBOL(napi_frags_finish);
2610
2611 int napi_gro_frags(struct napi_struct *napi, struct napi_gro_fraginfo *info)
2612 {
2613         struct sk_buff *skb = napi_fraginfo_skb(napi, info);
2614
2615         if (!skb)
2616                 return NET_RX_DROP;
2617
2618         return napi_frags_finish(napi, skb, __napi_gro_receive(napi, skb));
2619 }
2620 EXPORT_SYMBOL(napi_gro_frags);
2621
2622 static int process_backlog(struct napi_struct *napi, int quota)
2623 {
2624         int work = 0;
2625         struct softnet_data *queue = &__get_cpu_var(softnet_data);
2626         unsigned long start_time = jiffies;
2627
2628         napi->weight = weight_p;
2629         do {
2630                 struct sk_buff *skb;
2631
2632                 local_irq_disable();
2633                 skb = __skb_dequeue(&queue->input_pkt_queue);
2634                 if (!skb) {
2635                         __napi_complete(napi);
2636                         local_irq_enable();
2637                         break;
2638                 }
2639                 local_irq_enable();
2640
2641                 napi_gro_receive(napi, skb);
2642         } while (++work < quota && jiffies == start_time);
2643
2644         napi_gro_flush(napi);
2645
2646         return work;
2647 }
2648
2649 /**
2650  * __napi_schedule - schedule for receive
2651  * @n: entry to schedule
2652  *
2653  * The entry's receive function will be scheduled to run
2654  */
2655 void __napi_schedule(struct napi_struct *n)
2656 {
2657         unsigned long flags;
2658
2659         local_irq_save(flags);
2660         list_add_tail(&n->poll_list, &__get_cpu_var(softnet_data).poll_list);
2661         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2662         local_irq_restore(flags);
2663 }
2664 EXPORT_SYMBOL(__napi_schedule);
2665
2666 void __napi_complete(struct napi_struct *n)
2667 {
2668         BUG_ON(!test_bit(NAPI_STATE_SCHED, &n->state));
2669         BUG_ON(n->gro_list);
2670
2671         list_del(&n->poll_list);
2672         smp_mb__before_clear_bit();
2673         clear_bit(NAPI_STATE_SCHED, &n->state);
2674 }
2675 EXPORT_SYMBOL(__napi_complete);
2676
2677 void napi_complete(struct napi_struct *n)
2678 {
2679         unsigned long flags;
2680
2681         /*
2682          * don't let napi dequeue from the cpu poll list
2683          * just in case its running on a different cpu
2684          */
2685         if (unlikely(test_bit(NAPI_STATE_NPSVC, &n->state)))
2686                 return;
2687
2688         napi_gro_flush(n);
2689         local_irq_save(flags);
2690         __napi_complete(n);
2691         local_irq_restore(flags);
2692 }
2693 EXPORT_SYMBOL(napi_complete);
2694
2695 void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
2696                     int (*poll)(struct napi_struct *, int), int weight)
2697 {
2698         INIT_LIST_HEAD(&napi->poll_list);
2699         napi->gro_count = 0;
2700         napi->gro_list = NULL;
2701         napi->skb = NULL;
2702         napi->poll = poll;
2703         napi->weight = weight;
2704         list_add(&napi->dev_list, &dev->napi_list);
2705         napi->dev = dev;
2706 #ifdef CONFIG_NETPOLL
2707         spin_lock_init(&napi->poll_lock);
2708         napi->poll_owner = -1;
2709 #endif
2710         set_bit(NAPI_STATE_SCHED, &napi->state);
2711 }
2712 EXPORT_SYMBOL(netif_napi_add);
2713
2714 void netif_napi_del(struct napi_struct *napi)
2715 {
2716         struct sk_buff *skb, *next;
2717
2718         list_del_init(&napi->dev_list);
2719         kfree(napi->skb);
2720
2721         for (skb = napi->gro_list; skb; skb = next) {
2722                 next = skb->next;
2723                 skb->next = NULL;
2724                 kfree_skb(skb);
2725         }
2726
2727         napi->gro_list = NULL;
2728         napi->gro_count = 0;
2729 }
2730 EXPORT_SYMBOL(netif_napi_del);
2731
2732
2733 static void net_rx_action(struct softirq_action *h)
2734 {
2735         struct list_head *list = &__get_cpu_var(softnet_data).poll_list;
2736         unsigned long time_limit = jiffies + 2;
2737         int budget = netdev_budget;
2738         void *have;
2739
2740         local_irq_disable();
2741
2742         while (!list_empty(list)) {
2743                 struct napi_struct *n;
2744                 int work, weight;
2745
2746                 /* If softirq window is exhuasted then punt.
2747                  * Allow this to run for 2 jiffies since which will allow
2748                  * an average latency of 1.5/HZ.
2749                  */
2750                 if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
2751                         goto softnet_break;
2752
2753                 local_irq_enable();
2754
2755                 /* Even though interrupts have been re-enabled, this
2756                  * access is safe because interrupts can only add new
2757                  * entries to the tail of this list, and only ->poll()
2758                  * calls can remove this head entry from the list.
2759                  */
2760                 n = list_entry(list->next, struct napi_struct, poll_list);
2761
2762                 have = netpoll_poll_lock(n);
2763
2764                 weight = n->weight;
2765
2766                 /* This NAPI_STATE_SCHED test is for avoiding a race
2767                  * with netpoll's poll_napi().  Only the entity which
2768                  * obtains the lock and sees NAPI_STATE_SCHED set will
2769                  * actually make the ->poll() call.  Therefore we avoid
2770                  * accidently calling ->poll() when NAPI is not scheduled.
2771                  */
2772                 work = 0;
2773                 if (test_bit(NAPI_STATE_SCHED, &n->state))
2774                         work = n->poll(n, weight);
2775
2776                 WARN_ON_ONCE(work > weight);
2777
2778                 budget -= work;
2779
2780                 local_irq_disable();
2781
2782                 /* Drivers must not modify the NAPI state if they
2783                  * consume the entire weight.  In such cases this code
2784                  * still "owns" the NAPI instance and therefore can
2785                  * move the instance around on the list at-will.
2786                  */
2787                 if (unlikely(work == weight)) {
2788                         if (unlikely(napi_disable_pending(n)))
2789                                 __napi_complete(n);
2790                         else
2791                                 list_move_tail(&n->poll_list, list);
2792                 }
2793
2794                 netpoll_poll_unlock(have);
2795         }
2796 out:
2797         local_irq_enable();
2798
2799 #ifdef CONFIG_NET_DMA
2800         /*
2801          * There may not be any more sk_buffs coming right now, so push
2802          * any pending DMA copies to hardware
2803          */
2804         dma_issue_pending_all();
2805 #endif
2806
2807         return;
2808
2809 softnet_break:
2810         __get_cpu_var(netdev_rx_stat).time_squeeze++;
2811         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
2812         goto out;
2813 }
2814
2815 static gifconf_func_t * gifconf_list [NPROTO];
2816
2817 /**
2818  *      register_gifconf        -       register a SIOCGIF handler
2819  *      @family: Address family
2820  *      @gifconf: Function handler
2821  *
2822  *      Register protocol dependent address dumping routines. The handler
2823  *      that is passed must not be freed or reused until it has been replaced
2824  *      by another handler.
2825  */
2826 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
2827 {
2828         if (family >= NPROTO)
2829                 return -EINVAL;
2830         gifconf_list[family] = gifconf;
2831         return 0;
2832 }
2833
2834
2835 /*
2836  *      Map an interface index to its name (SIOCGIFNAME)
2837  */
2838
2839 /*
2840  *      We need this ioctl for efficient implementation of the
2841  *      if_indextoname() function required by the IPv6 API.  Without
2842  *      it, we would have to search all the interfaces to find a
2843  *      match.  --pb
2844  */
2845
2846 static int dev_ifname(struct net *net, struct ifreq __user *arg)
2847 {
2848         struct net_device *dev;
2849         struct ifreq ifr;
2850
2851         /*
2852          *      Fetch the caller's info block.
2853          */
2854
2855         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2856                 return -EFAULT;
2857
2858         read_lock(&dev_base_lock);
2859         dev = __dev_get_by_index(net, ifr.ifr_ifindex);
2860         if (!dev) {
2861                 read_unlock(&dev_base_lock);
2862                 return -ENODEV;
2863         }
2864
2865         strcpy(ifr.ifr_name, dev->name);
2866         read_unlock(&dev_base_lock);
2867
2868         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
2869                 return -EFAULT;
2870         return 0;
2871 }
2872
2873 /*
2874  *      Perform a SIOCGIFCONF call. This structure will change
2875  *      size eventually, and there is nothing I can do about it.
2876  *      Thus we will need a 'compatibility mode'.
2877  */
2878
2879 static int dev_ifconf(struct net *net, char __user *arg)
2880 {
2881         struct ifconf ifc;
2882         struct net_device *dev;
2883         char __user *pos;
2884         int len;
2885         int total;
2886         int i;
2887
2888         /*
2889          *      Fetch the caller's info block.
2890          */
2891
2892         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
2893                 return -EFAULT;
2894
2895         pos = ifc.ifc_buf;
2896         len = ifc.ifc_len;
2897
2898         /*
2899          *      Loop over the interfaces, and write an info block for each.
2900          */
2901
2902         total = 0;
2903         for_each_netdev(net, dev) {
2904                 for (i = 0; i < NPROTO; i++) {
2905                         if (gifconf_list[i]) {
2906                                 int done;
2907                                 if (!pos)
2908                                         done = gifconf_list[i](dev, NULL, 0);
2909                                 else
2910                                         done = gifconf_list[i](dev, pos + total,
2911                                                                len - total);
2912                                 if (done < 0)
2913                                         return -EFAULT;
2914                                 total += done;
2915                         }
2916                 }
2917         }
2918
2919         /*
2920          *      All done.  Write the updated control block back to the caller.
2921          */
2922         ifc.ifc_len = total;
2923
2924         /*
2925          *      Both BSD and Solaris return 0 here, so we do too.
2926          */
2927         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
2928 }
2929
2930 #ifdef CONFIG_PROC_FS
2931 /*
2932  *      This is invoked by the /proc filesystem handler to display a device
2933  *      in detail.
2934  */
2935 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
2936         __acquires(dev_base_lock)
2937 {
2938         struct net *net = seq_file_net(seq);
2939         loff_t off;
2940         struct net_device *dev;
2941
2942         read_lock(&dev_base_lock);
2943         if (!*pos)
2944                 return SEQ_START_TOKEN;
2945
2946         off = 1;
2947         for_each_netdev(net, dev)
2948                 if (off++ == *pos)
2949                         return dev;
2950
2951         return NULL;
2952 }
2953
2954 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2955 {
2956         struct net *net = seq_file_net(seq);
2957         ++*pos;
2958         return v == SEQ_START_TOKEN ?
2959                 first_net_device(net) : next_net_device((struct net_device *)v);
2960 }
2961
2962 void dev_seq_stop(struct seq_file *seq, void *v)
2963         __releases(dev_base_lock)
2964 {
2965         read_unlock(&dev_base_lock);
2966 }
2967
2968 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2969 {
2970         const struct net_device_stats *stats = dev_get_stats(dev);
2971
2972         seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2973                    "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2974                    dev->name, stats->rx_bytes, stats->rx_packets,
2975                    stats->rx_errors,
2976                    stats->rx_dropped + stats->rx_missed_errors,
2977                    stats->rx_fifo_errors,
2978                    stats->rx_length_errors + stats->rx_over_errors +
2979                     stats->rx_crc_errors + stats->rx_frame_errors,
2980                    stats->rx_compressed, stats->multicast,
2981                    stats->tx_bytes, stats->tx_packets,
2982                    stats->tx_errors, stats->tx_dropped,
2983                    stats->tx_fifo_errors, stats->collisions,
2984                    stats->tx_carrier_errors +
2985                     stats->tx_aborted_errors +
2986                     stats->tx_window_errors +
2987                     stats->tx_heartbeat_errors,
2988                    stats->tx_compressed);
2989 }
2990
2991 /*
2992  *      Called from the PROCfs module. This now uses the new arbitrary sized
2993  *      /proc/net interface to create /proc/net/dev
2994  */
2995 static int dev_seq_show(struct seq_file *seq, void *v)
2996 {
2997         if (v == SEQ_START_TOKEN)
2998                 seq_puts(seq, "Inter-|   Receive                            "
2999                               "                    |  Transmit\n"
3000                               " face |bytes    packets errs drop fifo frame "
3001                               "compressed multicast|bytes    packets errs "
3002                               "drop fifo colls carrier compressed\n");
3003         else
3004                 dev_seq_printf_stats(seq, v);
3005         return 0;
3006 }
3007
3008 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
3009 {
3010         struct netif_rx_stats *rc = NULL;
3011
3012         while (*pos < nr_cpu_ids)
3013                 if (cpu_online(*pos)) {
3014                         rc = &per_cpu(netdev_rx_stat, *pos);
3015                         break;
3016                 } else
3017                         ++*pos;
3018         return rc;
3019 }
3020
3021 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
3022 {
3023         return softnet_get_online(pos);
3024 }
3025
3026 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3027 {
3028         ++*pos;
3029         return softnet_get_online(pos);
3030 }
3031
/* seq_file stop callback: the softnet iterator holds nothing to release. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
3035
3036 static int softnet_seq_show(struct seq_file *seq, void *v)
3037 {
3038         struct netif_rx_stats *s = v;
3039
3040         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
3041                    s->total, s->dropped, s->time_squeeze, 0,
3042                    0, 0, 0, 0, /* was fastroute */
3043                    s->cpu_collision );
3044         return 0;
3045 }
3046
3047 static const struct seq_operations dev_seq_ops = {
3048         .start = dev_seq_start,
3049         .next  = dev_seq_next,
3050         .stop  = dev_seq_stop,
3051         .show  = dev_seq_show,
3052 };
3053
3054 static int dev_seq_open(struct inode *inode, struct file *file)
3055 {
3056         return seq_open_net(inode, file, &dev_seq_ops,
3057                             sizeof(struct seq_net_private));
3058 }
3059
3060 static const struct file_operations dev_seq_fops = {
3061         .owner   = THIS_MODULE,
3062         .open    = dev_seq_open,
3063         .read    = seq_read,
3064         .llseek  = seq_lseek,
3065         .release = seq_release_net,
3066 };
3067
3068 static const struct seq_operations softnet_seq_ops = {
3069         .start = softnet_seq_start,
3070         .next  = softnet_seq_next,
3071         .stop  = softnet_seq_stop,
3072         .show  = softnet_seq_show,
3073 };
3074
3075 static int softnet_seq_open(struct inode *inode, struct file *file)
3076 {
3077         return seq_open(file, &softnet_seq_ops);
3078 }
3079
3080 static const struct file_operations softnet_seq_fops = {
3081         .owner   = THIS_MODULE,
3082         .open    = softnet_seq_open,
3083         .read    = seq_read,
3084         .llseek  = seq_lseek,
3085         .release = seq_release,
3086 };
3087
3088 static void *ptype_get_idx(loff_t pos)
3089 {
3090         struct packet_type *pt = NULL;
3091         loff_t i = 0;
3092         int t;
3093
3094         list_for_each_entry_rcu(pt, &ptype_all, list) {
3095                 if (i == pos)
3096                         return pt;
3097                 ++i;
3098         }
3099
3100         for (t = 0; t < PTYPE_HASH_SIZE; t++) {
3101                 list_for_each_entry_rcu(pt, &ptype_base[t], list) {
3102                         if (i == pos)
3103                                 return pt;
3104                         ++i;
3105                 }
3106         }
3107         return NULL;
3108 }
3109
3110 static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
3111         __acquires(RCU)
3112 {
3113         rcu_read_lock();
3114         return *pos ? ptype_get_idx(*pos - 1) : SEQ_START_TOKEN;
3115 }
3116
3117 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
3118 {
3119         struct packet_type *pt;
3120         struct list_head *nxt;
3121         int hash;
3122
3123         ++*pos;
3124         if (v == SEQ_START_TOKEN)
3125                 return ptype_get_idx(0);
3126
3127         pt = v;
3128         nxt = pt->list.next;
3129         if (pt->type == htons(ETH_P_ALL)) {
3130                 if (nxt != &ptype_all)
3131                         goto found;
3132                 hash = 0;
3133                 nxt = ptype_base[0].next;
3134         } else
3135                 hash = ntohs(pt->type) & PTYPE_HASH_MASK;
3136
3137         while (nxt == &ptype_base[hash]) {
3138                 if (++hash >= PTYPE_HASH_SIZE)
3139                         return NULL;
3140                 nxt = ptype_base[hash].next;
3141         }
3142 found:
3143         return list_entry(nxt, struct packet_type, list);
3144 }
3145
3146 static void ptype_seq_stop(struct seq_file *seq, void *v)
3147         __releases(RCU)
3148 {
3149         rcu_read_unlock();
3150 }
3151
3152 static int ptype_seq_show(struct seq_file *seq, void *v)
3153 {
3154         struct packet_type *pt = v;
3155
3156         if (v == SEQ_START_TOKEN)
3157                 seq_puts(seq, "Type Device      Function\n");
3158         else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) {
3159                 if (pt->type == htons(ETH_P_ALL))
3160                         seq_puts(seq, "ALL ");
3161                 else
3162                         seq_printf(seq, "%04x", ntohs(pt->type));
3163
3164                 seq_printf(seq, " %-8s %pF\n",
3165                            pt->dev ? pt->dev->name : "", pt->func);
3166         }
3167
3168         return 0;
3169 }
3170
3171 static const struct seq_operations ptype_seq_ops = {
3172         .start = ptype_seq_start,
3173         .next  = ptype_seq_next,
3174         .stop  = ptype_seq_stop,
3175         .show  = ptype_seq_show,
3176 };
3177
3178 static int ptype_seq_open(struct inode *inode, struct file *file)
3179 {
3180         return seq_open_net(inode, file, &ptype_seq_ops,
3181                         sizeof(struct seq_net_private));
3182 }
3183
3184 static const struct file_operations ptype_seq_fops = {
3185         .owner   = THIS_MODULE,
3186         .open    = ptype_seq_open,
3187         .read    = seq_read,
3188         .llseek  = seq_lseek,
3189         .release = seq_release_net,
3190 };
3191
3192
3193 static int __net_init dev_proc_net_init(struct net *net)
3194 {
3195         int rc = -ENOMEM;
3196
3197         if (!proc_net_fops_create(net, "dev", S_IRUGO, &dev_seq_fops))
3198                 goto out;
3199         if (!proc_net_fops_create(net, "softnet_stat", S_IRUGO, &softnet_seq_fops))
3200                 goto out_dev;
3201         if (!proc_net_fops_create(net, "ptype", S_IRUGO, &ptype_seq_fops))
3202                 goto out_softnet;
3203
3204         if (wext_proc_init(net))
3205                 goto out_ptype;
3206         rc = 0;
3207 out:
3208         return rc;
3209 out_ptype:
3210         proc_net_remove(net, "ptype");
3211 out_softnet:
3212         proc_net_remove(net, "softnet_stat");
3213 out_dev:
3214         proc_net_remove(net, "dev");
3215         goto out;
3216 }
3217
3218 static void __net_exit dev_proc_net_exit(struct net *net)
3219 {
3220         wext_proc_exit(net);
3221
3222         proc_net_remove(net, "ptype");
3223         proc_net_remove(net, "softnet_stat");
3224         proc_net_remove(net, "dev");
3225 }
3226
3227 static struct pernet_operations __net_initdata dev_proc_ops = {
3228         .init = dev_proc_net_init,
3229         .exit = dev_proc_net_exit,
3230 };
3231
3232 static int __init dev_proc_init(void)
3233 {
3234         return register_pernet_subsys(&dev_proc_ops);
3235 }
3236 #else
3237 #define dev_proc_init() 0
3238 #endif  /* CONFIG_PROC_FS */
3239
3240
3241 /**
3242  *      netdev_set_master       -       set up master/slave pair
3243  *      @slave: slave device
3244  *      @master: new master device
3245  *
3246  *      Changes the master device of the slave. Pass %NULL to break the
3247  *      bonding. The caller must hold the RTNL semaphore. On a failure
3248  *      a negative errno code is returned. On success the reference counts
3249  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
3250  *      function returns zero.
3251  */
3252 int netdev_set_master(struct net_device *slave, struct net_device *master)
3253 {
3254         struct net_device *old = slave->master;
3255
3256         ASSERT_RTNL();
3257
3258         if (master) {
3259                 if (old)
3260                         return -EBUSY;
3261                 dev_hold(master);
3262         }
3263
3264         slave->master = master;
3265
3266         synchronize_net();
3267
3268         if (old)
3269                 dev_put(old);
3270
3271         if (master)
3272                 slave->flags |= IFF_SLAVE;
3273         else
3274                 slave->flags &= ~IFF_SLAVE;
3275
3276         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
3277         return 0;
3278 }
3279
3280 static void dev_change_rx_flags(struct net_device *dev, int flags)
3281 {
3282         const struct net_device_ops *ops = dev->netdev_ops;
3283
3284         if ((dev->flags & IFF_UP) && ops->ndo_change_rx_flags)
3285                 ops->ndo_change_rx_flags(dev, flags);
3286 }
3287
3288 static int __dev_set_promiscuity(struct net_device *dev, int inc)
3289 {
3290         unsigned short old_flags = dev->flags;
3291         uid_t uid;
3292         gid_t gid;
3293
3294         ASSERT_RTNL();
3295
3296         dev->flags |= IFF_PROMISC;
3297         dev->promiscuity += inc;
3298         if (dev->promiscuity == 0) {
3299                 /*
3300                  * Avoid overflow.
3301                  * If inc causes overflow, untouch promisc and return error.
3302                  */
3303                 if (inc < 0)
3304                         dev->flags &= ~IFF_PROMISC;
3305                 else {
3306                         dev->promiscuity -= inc;
3307                         printk(KERN_WARNING "%s: promiscuity touches roof, "
3308                                 "set promiscuity failed, promiscuity feature "
3309                                 "of device might be broken.\n", dev->name);
3310                         return -EOVERFLOW;
3311                 }
3312         }
3313         if (dev->flags != old_flags) {
3314                 printk(KERN_INFO "device %s %s promiscuous mode\n",
3315                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
3316                                                                "left");
3317                 if (audit_enabled) {
3318                         current_uid_gid(&uid, &gid);
3319                         audit_log(current->audit_context, GFP_ATOMIC,
3320                                 AUDIT_ANOM_PROMISCUOUS,
3321                                 "dev=%s prom=%d old_prom=%d auid=%u uid=%u gid=%u ses=%u",
3322                                 dev->name, (dev->flags & IFF_PROMISC),
3323                                 (old_flags & IFF_PROMISC),
3324                                 audit_get_loginuid(current),
3325                                 uid, gid,
3326                                 audit_get_sessionid(current));
3327                 }
3328
3329                 dev_change_rx_flags(dev, IFF_PROMISC);
3330         }
3331         return 0;
3332 }
3333
3334 /**
3335  *      dev_set_promiscuity     - update promiscuity count on a device
3336  *      @dev: device
3337  *      @inc: modifier
3338  *
3339  *      Add or remove promiscuity from a device. While the count in the device
3340  *      remains above zero the interface remains promiscuous. Once it hits zero
3341  *      the device reverts back to normal filtering operation. A negative inc
3342  *      value is used to drop promiscuity on the device.
3343  *      Return 0 if successful or a negative errno code on error.
3344  */
3345 int dev_set_promiscuity(struct net_device *dev, int inc)
3346 {
3347         unsigned short old_flags = dev->flags;
3348         int err;
3349
3350         err = __dev_set_promiscuity(dev, inc);
3351         if (err < 0)
3352                 return err;
3353         if (dev->flags != old_flags)
3354                 dev_set_rx_mode(dev);
3355         return err;
3356 }
3357
3358 /**
3359  *      dev_set_allmulti        - update allmulti count on a device
3360  *      @dev: device
3361  *      @inc: modifier
3362  *
3363  *      Add or remove reception of all multicast frames to a device. While the
3364  *      count in the device remains above zero the interface remains listening
3365  *      to all interfaces. Once it hits zero the device reverts back to normal
3366  *      filtering operation. A negative @inc value is used to drop the counter
3367  *      when releasing a resource needing all multicasts.
3368  *      Return 0 if successful or a negative errno code on error.
3369  */
3370
3371 int dev_set_allmulti(struct net_device *dev, int inc)
3372 {
3373         unsigned short old_flags = dev->flags;
3374
3375         ASSERT_RTNL();
3376
3377         dev->flags |= IFF_ALLMULTI;
3378         dev->allmulti += inc;
3379         if (dev->allmulti == 0) {
3380                 /*
3381                  * Avoid overflow.
3382                  * If inc causes overflow, untouch allmulti and return error.
3383                  */
3384                 if (inc < 0)
3385                         dev->flags &= ~IFF_ALLMULTI;
3386                 else {
3387                         dev->allmulti -= inc;
3388                         printk(KERN_WARNING "%s: allmulti touches roof, "
3389                                 "set allmulti failed, allmulti feature of "
3390                                 "device might be broken.\n", dev->name);
3391                         return -EOVERFLOW;
3392                 }
3393         }
3394         if (dev->flags ^ old_flags) {
3395                 dev_change_rx_flags(dev, IFF_ALLMULTI);
3396                 dev_set_rx_mode(dev);
3397         }
3398         return 0;
3399 }
3400
3401 /*
3402  *      Upload unicast and multicast address lists to device and
3403  *      configure RX filtering. When the device doesn't support unicast
3404  *      filtering it is put in promiscuous mode while unicast addresses
3405  *      are present.
3406  */
3407 void __dev_set_rx_mode(struct net_device *dev)
3408 {
3409         const struct net_device_ops *ops = dev->netdev_ops;
3410
3411         /* dev_open will call this function so the list will stay sane. */
3412         if (!(dev->flags&IFF_UP))
3413                 return;
3414
3415         if (!netif_device_present(dev))
3416                 return;
3417
3418         if (ops->ndo_set_rx_mode)
3419                 ops->ndo_set_rx_mode(dev);
3420         else {
3421                 /* Unicast addresses changes may only happen under the rtnl,
3422                  * therefore calling __dev_set_promiscuity here is safe.
3423                  */
3424                 if (dev->uc_count > 0 && !dev->uc_promisc) {
3425                         __dev_set_promiscuity(dev, 1);
3426                         dev->uc_promisc = 1;
3427                 } else if (dev->uc_count == 0 && dev->uc_promisc) {
3428                         __dev_set_promiscuity(dev, -1);
3429                         dev->uc_promisc = 0;
3430                 }
3431
3432                 if (ops->ndo_set_multicast_list)
3433                         ops->ndo_set_multicast_list(dev);
3434         }
3435 }
3436
/* Reprogram the receive filter of @dev while holding its address lock. */
void dev_set_rx_mode(struct net_device *dev)
{
	netif_addr_lock_bh(dev);

	__dev_set_rx_mode(dev);

	netif_addr_unlock_bh(dev);
}
3443
3444 int __dev_addr_delete(struct dev_addr_list **list, int *count,
3445                       void *addr, int alen, int glbl)
3446 {
3447         struct dev_addr_list *da;
3448
3449         for (; (da = *list) != NULL; list = &da->next) {
3450                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3451                     alen == da->da_addrlen) {
3452                         if (glbl) {
3453                                 int old_glbl = da->da_gusers;
3454                                 da->da_gusers = 0;
3455                                 if (old_glbl == 0)
3456                                         break;
3457                         }
3458                         if (--da->da_users)
3459                                 return 0;
3460
3461                         *list = da->next;
3462                         kfree(da);
3463                         (*count)--;
3464                         return 0;
3465                 }
3466         }
3467         return -ENOENT;
3468 }
3469
3470 int __dev_addr_add(struct dev_addr_list **list, int *count,
3471                    void *addr, int alen, int glbl)
3472 {
3473         struct dev_addr_list *da;
3474
3475         for (da = *list; da != NULL; da = da->next) {
3476                 if (memcmp(da->da_addr, addr, da->da_addrlen) == 0 &&
3477                     da->da_addrlen == alen) {
3478                         if (glbl) {
3479                                 int old_glbl = da->da_gusers;
3480                                 da->da_gusers = 1;
3481                                 if (old_glbl)
3482                                         return 0;
3483                         }
3484                         da->da_users++;
3485                         return 0;
3486                 }
3487         }
3488
3489         da = kzalloc(sizeof(*da), GFP_ATOMIC);
3490         if (da == NULL)
3491                 return -ENOMEM;
3492         memcpy(da->da_addr, addr, alen);
3493         da->da_addrlen = alen;
3494         da->da_users = 1;
3495         da->da_gusers = glbl ? 1 : 0;
3496         da->next = *list;
3497         *list = da;
3498         (*count)++;
3499         return 0;
3500 }
3501
3502 /**
3503  *      dev_unicast_delete      - Release secondary unicast address.
3504  *      @dev: device
3505  *      @addr: address to delete
3506  *      @alen: length of @addr
3507  *
3508  *      Release reference to a secondary unicast address and remove it
3509  *      from the device if the reference count drops to zero.
3510  *
3511  *      The caller must hold the rtnl_mutex.
3512  */
3513 int dev_unicast_delete(struct net_device *dev, void *addr, int alen)
3514 {
3515         int err;
3516
3517         ASSERT_RTNL();
3518
3519         netif_addr_lock_bh(dev);
3520         err = __dev_addr_delete(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3521         if (!err)
3522                 __dev_set_rx_mode(dev);
3523         netif_addr_unlock_bh(dev);
3524         return err;
3525 }
3526 EXPORT_SYMBOL(dev_unicast_delete);
3527
3528 /**
3529  *      dev_unicast_add         - add a secondary unicast address
3530  *      @dev: device
3531  *      @addr: address to add
3532  *      @alen: length of @addr
3533  *
3534  *      Add a secondary unicast address to the device or increase
3535  *      the reference count if it already exists.
3536  *
3537  *      The caller must hold the rtnl_mutex.
3538  */
3539 int dev_unicast_add(struct net_device *dev, void *addr, int alen)
3540 {
3541         int err;
3542
3543         ASSERT_RTNL();
3544
3545         netif_addr_lock_bh(dev);
3546         err = __dev_addr_add(&dev->uc_list, &dev->uc_count, addr, alen, 0);
3547         if (!err)
3548                 __dev_set_rx_mode(dev);
3549         netif_addr_unlock_bh(dev);
3550         return err;
3551 }
3552 EXPORT_SYMBOL(dev_unicast_add);
3553
3554 int __dev_addr_sync(struct dev_addr_list **to, int *to_count,
3555                     struct dev_addr_list **from, int *from_count)
3556 {
3557         struct dev_addr_list *da, *next;
3558         int err = 0;
3559
3560         da = *from;
3561         while (da != NULL) {
3562                 next = da->next;
3563                 if (!da->da_synced) {
3564                         err = __dev_addr_add(to, to_count,
3565                                              da->da_addr, da->da_addrlen, 0);
3566                         if (err < 0)
3567                                 break;
3568                         da->da_synced = 1;
3569                         da->da_users++;
3570                 } else if (da->da_users == 1) {
3571                         __dev_addr_delete(to, to_count,
3572                                           da->da_addr, da->da_addrlen, 0);
3573                         __dev_addr_delete(from, from_count,
3574                                           da->da_addr, da->da_addrlen, 0);
3575                 }
3576                 da = next;
3577         }
3578         return err;
3579 }
3580
3581 void __dev_addr_unsync(struct dev_addr_list **to, int *to_count,
3582                        struct dev_addr_list **from, int *from_count)
3583 {
3584         struct dev_addr_list *da, *next;
3585
3586         da = *from;
3587         while (da != NULL) {
3588                 next = da->next;
3589                 if (da->da_synced) {
3590                         __dev_addr_delete(to, to_count,
3591                                           da->da_addr, da->da_addrlen, 0);
3592                         da->da_synced = 0;
3593                         __dev_addr_delete(from, from_count,
3594                                           da->da_addr, da->da_addrlen, 0);
3595                 }
3596                 da = next;
3597         }
3598 }
3599
3600 /**
3601  *      dev_unicast_sync - Synchronize device's unicast list to another device
3602  *      @to: destination device
3603  *      @from: source device
3604  *
3605  *      Add newly added addresses to the destination device and release
3606  *      addresses that have no users left. The source device must be
3607  *      locked by netif_tx_lock_bh.
3608  *
3609  *      This function is intended to be called from the dev->set_rx_mode
3610  *      function of layered software devices.
3611  */
3612 int dev_unicast_sync(struct net_device *to, struct net_device *from)
3613 {
3614         int err = 0;
3615
3616         netif_addr_lock_bh(to);
3617         err = __dev_addr_sync(&to->uc_list, &to->uc_count,
3618                               &from->uc_list, &from->uc_count);
3619         if (!err)
3620                 __dev_set_rx_mode(to);
3621         netif_addr_unlock_bh(to);
3622         return err;
3623 }
3624 EXPORT_SYMBOL(dev_unicast_sync);
3625
3626 /**
3627  *      dev_unicast_unsync - Remove synchronized addresses from the destination device
3628  *      @to: destination device
3629  *      @from: source device
3630  *
3631  *      Remove all addresses that were added to the destination device by
3632  *      dev_unicast_sync(). This function is intended to be called from the
3633  *      dev->stop function of layered software devices.
3634  */
3635 void dev_unicast_unsync(struct net_device *to, struct net_device *from)
3636 {
3637         netif_addr_lock_bh(from);
3638         netif_addr_lock(to);
3639
3640         __dev_addr_unsync(&to->uc_list, &to->uc_count,
3641                           &from->uc_list, &from->uc_count);
3642         __dev_set_rx_mode(to);
3643
3644         netif_addr_unlock(to);
3645         netif_addr_unlock_bh(from);
3646 }
3647 EXPORT_SYMBOL(dev_unicast_unsync);
3648
3649 static void __dev_addr_discard(struct dev_addr_list **list)
3650 {
3651         struct dev_addr_list *tmp;
3652
3653         while (*list != NULL) {
3654                 tmp = *list;
3655                 *list = tmp->next;
3656                 if (tmp->da_users > tmp->da_gusers)
3657                         printk("__dev_addr_discard: address leakage! "
3658                                "da_users=%d\n", tmp->da_users);
3659                 kfree(tmp);
3660         }
3661 }
3662
3663 static void dev_addr_discard(struct net_device *dev)
3664 {
3665         netif_addr_lock_bh(dev);
3666
3667         __dev_addr_discard(&dev->uc_list);
3668         dev->uc_count = 0;
3669
3670         __dev_addr_discard(&dev->mc_list);
3671         dev->mc_count = 0;
3672
3673         netif_addr_unlock_bh(dev);
3674 }
3675
3676 /**
3677  *      dev_get_flags - get flags reported to userspace
3678  *      @dev: device
3679  *
3680  *      Get the combination of flag bits exported through APIs to userspace.
3681  */
3682 unsigned dev_get_flags(const struct net_device *dev)
3683 {
3684         unsigned flags;
3685
3686         flags = (dev->flags & ~(IFF_PROMISC |
3687                                 IFF_ALLMULTI |
3688                                 IFF_RUNNING |
3689                                 IFF_LOWER_UP |
3690                                 IFF_DORMANT)) |
3691                 (dev->gflags & (IFF_PROMISC |
3692                                 IFF_ALLMULTI));
3693
3694         if (netif_running(dev)) {
3695                 if (netif_oper_up(dev))
3696                         flags |= IFF_RUNNING;
3697                 if (netif_carrier_ok(dev))
3698                         flags |= IFF_LOWER_UP;
3699                 if (netif_dormant(dev))
3700                         flags |= IFF_DORMANT;
3701         }
3702
3703         return flags;
3704 }
3705
3706 /**
3707  *      dev_change_flags - change device settings
3708  *      @dev: device
3709  *      @flags: device state flags
3710  *
3711  *      Change settings on device based state flags. The flags are
3712  *      in the userspace exported format.
3713  */
3714 int dev_change_flags(struct net_device *dev, unsigned flags)
3715 {
3716         int ret, changes;
3717         int old_flags = dev->flags;
3718
3719         ASSERT_RTNL();
3720
3721         /*
3722          *      Set the flags on our device.
3723          */
3724
3725         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
3726                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
3727                                IFF_AUTOMEDIA)) |
3728                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
3729                                     IFF_ALLMULTI));
3730
3731         /*
3732          *      Load in the correct multicast list now the flags have changed.
3733          */
3734
3735         if ((old_flags ^ flags) & IFF_MULTICAST)
3736                 dev_change_rx_flags(dev, IFF_MULTICAST);
3737
3738         dev_set_rx_mode(dev);
3739
3740         /*
3741          *      Have we downed the interface. We handle IFF_UP ourselves
3742          *      according to user attempts to set it, rather than blindly
3743          *      setting it.
3744          */
3745
3746         ret = 0;
3747         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
3748                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
3749
3750                 if (!ret)
3751                         dev_set_rx_mode(dev);
3752         }
3753
3754         if (dev->flags & IFF_UP &&
3755             ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
3756                                           IFF_VOLATILE)))
3757                 call_netdevice_notifiers(NETDEV_CHANGE, dev);
3758
3759         if ((flags ^ dev->gflags) & IFF_PROMISC) {
3760                 int inc = (flags & IFF_PROMISC) ? +1 : -1;
3761                 dev->gflags ^= IFF_PROMISC;
3762                 dev_set_promiscuity(dev, inc);
3763         }
3764
3765         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
3766            is important. Some (broken) drivers set IFF_PROMISC, when
3767            IFF_ALLMULTI is requested not asking us and not reporting.
3768          */
3769         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
3770                 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
3771                 dev->gflags ^= IFF_ALLMULTI;
3772                 dev_set_allmulti(dev, inc);
3773         }
3774
3775         /* Exclude state transition flags, already notified */
3776         changes = (old_flags ^ dev->flags) & ~(IFF_UP | IFF_RUNNING);
3777         if (changes)
3778                 rtmsg_ifinfo(RTM_NEWLINK, dev, changes);
3779
3780         return ret;
3781 }
3782
3783 /**
3784  *      dev_set_mtu - Change maximum transfer unit
3785  *      @dev: device
3786  *      @new_mtu: new transfer unit
3787  *
3788  *      Change the maximum transfer size of the network device.
3789  */
3790 int dev_set_mtu(struct net_device *dev, int new_mtu)
3791 {
3792         const struct net_device_ops *ops = dev->netdev_ops;
3793         int err;
3794
3795         if (new_mtu == dev->mtu)
3796                 return 0;
3797
3798         /*      MTU must be positive.    */
3799         if (new_mtu < 0)
3800                 return -EINVAL;
3801
3802         if (!netif_device_present(dev))
3803                 return -ENODEV;
3804
3805         err = 0;
3806         if (ops->ndo_change_mtu)
3807                 err = ops->ndo_change_mtu(dev, new_mtu);
3808         else
3809                 dev->mtu = new_mtu;
3810
3811         if (!err && dev->flags & IFF_UP)
3812                 call_netdevice_notifiers(NETDEV_CHANGEMTU, dev);
3813         return err;
3814 }
3815
3816 /**
3817  *      dev_set_mac_address - Change Media Access Control Address
3818  *      @dev: device
3819  *      @sa: new address
3820  *
3821  *      Change the hardware (MAC) address of the device
3822  */
3823 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
3824 {
3825         const struct net_device_ops *ops = dev->netdev_ops;
3826         int err;
3827
3828         if (!ops->ndo_set_mac_address)
3829                 return -EOPNOTSUPP;
3830         if (sa->sa_family != dev->type)
3831                 return -EINVAL;
3832         if (!netif_device_present(dev))
3833                 return -ENODEV;
3834         err = ops->ndo_set_mac_address(dev, sa);
3835         if (!err)
3836                 call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3837         return err;
3838 }
3839
3840 /*
3841  *      Perform the SIOCxIFxxx calls, inside read_lock(dev_base_lock)
3842  */
3843 static int dev_ifsioc_locked(struct net *net, struct ifreq *ifr, unsigned int cmd)
3844 {
3845         int err;
3846         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3847
3848         if (!dev)
3849                 return -ENODEV;
3850
3851         switch (cmd) {
3852                 case SIOCGIFFLAGS:      /* Get interface flags */
3853                         ifr->ifr_flags = dev_get_flags(dev);
3854                         return 0;
3855
3856                 case SIOCGIFMETRIC:     /* Get the metric on the interface
3857                                            (currently unused) */
3858                         ifr->ifr_metric = 0;
3859                         return 0;
3860
3861                 case SIOCGIFMTU:        /* Get the MTU of a device */
3862                         ifr->ifr_mtu = dev->mtu;
3863                         return 0;
3864
3865                 case SIOCGIFHWADDR:
3866                         if (!dev->addr_len)
3867                                 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
3868                         else
3869                                 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
3870                                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3871                         ifr->ifr_hwaddr.sa_family = dev->type;
3872                         return 0;
3873
3874                 case SIOCGIFSLAVE:
3875                         err = -EINVAL;
3876                         break;
3877
3878                 case SIOCGIFMAP:
3879                         ifr->ifr_map.mem_start = dev->mem_start;
3880                         ifr->ifr_map.mem_end   = dev->mem_end;
3881                         ifr->ifr_map.base_addr = dev->base_addr;
3882                         ifr->ifr_map.irq       = dev->irq;
3883                         ifr->ifr_map.dma       = dev->dma;
3884                         ifr->ifr_map.port      = dev->if_port;
3885                         return 0;
3886
3887                 case SIOCGIFINDEX:
3888                         ifr->ifr_ifindex = dev->ifindex;
3889                         return 0;
3890
3891                 case SIOCGIFTXQLEN:
3892                         ifr->ifr_qlen = dev->tx_queue_len;
3893                         return 0;
3894
3895                 default:
3896                         /* dev_ioctl() should ensure this case
3897                          * is never reached
3898                          */
3899                         WARN_ON(1);
3900                         err = -EINVAL;
3901                         break;
3902
3903         }
3904         return err;
3905 }
3906
3907 /*
3908  *      Perform the SIOCxIFxxx calls, inside rtnl_lock()
3909  */
3910 static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
3911 {
3912         int err;
3913         struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
3914         const struct net_device_ops *ops;
3915
3916         if (!dev)
3917                 return -ENODEV;
3918
3919         ops = dev->netdev_ops;
3920
3921         switch (cmd) {
3922                 case SIOCSIFFLAGS:      /* Set interface flags */
3923                         return dev_change_flags(dev, ifr->ifr_flags);
3924
3925                 case SIOCSIFMETRIC:     /* Set the metric on the interface
3926                                            (currently unused) */
3927                         return -EOPNOTSUPP;
3928
3929                 case SIOCSIFMTU:        /* Set the MTU of a device */
3930                         return dev_set_mtu(dev, ifr->ifr_mtu);
3931
3932                 case SIOCSIFHWADDR:
3933                         return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
3934
3935                 case SIOCSIFHWBROADCAST:
3936                         if (ifr->ifr_hwaddr.sa_family != dev->type)
3937                                 return -EINVAL;
3938                         memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
3939                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
3940                         call_netdevice_notifiers(NETDEV_CHANGEADDR, dev);
3941                         return 0;
3942
3943                 case SIOCSIFMAP:
3944                         if (ops->ndo_set_config) {
3945                                 if (!netif_device_present(dev))
3946                                         return -ENODEV;
3947                                 return ops->ndo_set_config(dev, &ifr->ifr_map);
3948                         }
3949                         return -EOPNOTSUPP;
3950
3951                 case SIOCADDMULTI:
3952                         if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3953                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3954                                 return -EINVAL;
3955                         if (!netif_device_present(dev))
3956                                 return -ENODEV;
3957                         return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
3958                                           dev->addr_len, 1);
3959
3960                 case SIOCDELMULTI:
3961                         if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
3962                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
3963                                 return -EINVAL;
3964                         if (!netif_device_present(dev))
3965                                 return -ENODEV;
3966                         return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
3967                                              dev->addr_len, 1);
3968
3969                 case SIOCSIFTXQLEN:
3970                         if (ifr->ifr_qlen < 0)
3971                                 return -EINVAL;
3972                         dev->tx_queue_len = ifr->ifr_qlen;
3973                         return 0;
3974
3975                 case SIOCSIFNAME:
3976                         ifr->ifr_newname[IFNAMSIZ-1] = '\0';
3977                         return dev_change_name(dev, ifr->ifr_newname);
3978
3979                 /*
3980                  *      Unknown or private ioctl
3981                  */
3982
3983                 default:
3984                         if ((cmd >= SIOCDEVPRIVATE &&
3985                             cmd <= SIOCDEVPRIVATE + 15) ||
3986                             cmd == SIOCBONDENSLAVE ||
3987                             cmd == SIOCBONDRELEASE ||
3988                             cmd == SIOCBONDSETHWADDR ||
3989                             cmd == SIOCBONDSLAVEINFOQUERY ||
3990                             cmd == SIOCBONDINFOQUERY ||
3991                             cmd == SIOCBONDCHANGEACTIVE ||
3992                             cmd == SIOCGMIIPHY ||
3993                             cmd == SIOCGMIIREG ||
3994                             cmd == SIOCSMIIREG ||
3995                             cmd == SIOCBRADDIF ||
3996                             cmd == SIOCBRDELIF ||
3997                             cmd == SIOCSHWTSTAMP ||
3998                             cmd == SIOCWANDEV) {
3999                                 err = -EOPNOTSUPP;
4000                                 if (ops->ndo_do_ioctl) {
4001                                         if (netif_device_present(dev))
4002                                                 err = ops->ndo_do_ioctl(dev, ifr, cmd);
4003                                         else
4004                                                 err = -ENODEV;
4005                                 }
4006                         } else
4007                                 err = -EINVAL;
4008
4009         }
4010         return err;
4011 }
4012
4013 /*
4014  *      This function handles all "interface"-type I/O control requests. The actual
4015  *      'doing' part of this is dev_ifsioc above.
4016  */
4017
4018 /**
4019  *      dev_ioctl       -       network device ioctl
4020  *      @net: the applicable net namespace
4021  *      @cmd: command to issue
4022  *      @arg: pointer to a struct ifreq in user space
4023  *
4024  *      Issue ioctl functions to devices. This is normally called by the
4025  *      user space syscall interfaces but can sometimes be useful for
4026  *      other purposes. The return value is the return from the syscall if
4027  *      positive or a negative errno code on error.
4028  */
4029
4030 int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
4031 {
4032         struct ifreq ifr;
4033         int ret;
4034         char *colon;
4035
4036         /* One special case: SIOCGIFCONF takes ifconf argument
4037            and requires shared lock, because it sleeps writing
4038            to user space.
4039          */
4040
4041         if (cmd == SIOCGIFCONF) {
4042                 rtnl_lock();
4043                 ret = dev_ifconf(net, (char __user *) arg);
4044                 rtnl_unlock();
4045                 return ret;
4046         }
4047         if (cmd == SIOCGIFNAME)
4048                 return dev_ifname(net, (struct ifreq __user *)arg);
4049
4050         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
4051                 return -EFAULT;
4052
4053         ifr.ifr_name[IFNAMSIZ-1] = 0;
4054
4055         colon = strchr(ifr.ifr_name, ':');
4056         if (colon)
4057                 *colon = 0;
4058
4059         /*
4060          *      See which interface the caller is talking about.
4061          */
4062
4063         switch (cmd) {
4064                 /*
4065                  *      These ioctl calls:
4066                  *      - can be done by all.
4067                  *      - atomic and do not require locking.
4068                  *      - return a value
4069                  */
4070                 case SIOCGIFFLAGS:
4071                 case SIOCGIFMETRIC:
4072                 case SIOCGIFMTU:
4073                 case SIOCGIFHWADDR:
4074                 case SIOCGIFSLAVE:
4075                 case SIOCGIFMAP:
4076                 case SIOCGIFINDEX:
4077                 case SIOCGIFTXQLEN:
4078                         dev_load(net, ifr.ifr_name);
4079                         read_lock(&dev_base_lock);
4080                         ret = dev_ifsioc_locked(net, &ifr, cmd);
4081                         read_unlock(&dev_base_lock);
4082                         if (!ret) {
4083                                 if (colon)
4084                                         *colon = ':';
4085                                 if (copy_to_user(arg, &ifr,
4086                                                  sizeof(struct ifreq)))
4087                                         ret = -EFAULT;
4088                         }
4089                         return ret;
4090
4091                 case SIOCETHTOOL:
4092                         dev_load(net, ifr.ifr_name);
4093                         rtnl_lock();
4094                         ret = dev_ethtool(net, &ifr);
4095                         rtnl_unlock();
4096                         if (!ret) {
4097                                 if (colon)
4098                                         *colon = ':';
4099                                 if (copy_to_user(arg, &ifr,
4100                                                  sizeof(struct ifreq)))
4101                                         ret = -EFAULT;
4102                         }
4103                         return ret;
4104
4105                 /*
4106                  *      These ioctl calls:
4107                  *      - require superuser power.
4108                  *      - require strict serialization.
4109                  *      - return a value
4110                  */
4111                 case SIOCGMIIPHY:
4112                 case SIOCGMIIREG:
4113                 case SIOCSIFNAME:
4114                         if (!capable(CAP_NET_ADMIN))
4115                                 return -EPERM;
4116                         dev_load(net, ifr.ifr_name);
4117                         rtnl_lock();
4118                         ret = dev_ifsioc(net, &ifr, cmd);
4119                         rtnl_unlock();
4120                         if (!ret) {
4121                                 if (colon)
4122                                         *colon = ':';
4123                                 if (copy_to_user(arg, &ifr,
4124                                                  sizeof(struct ifreq)))
4125                                         ret = -EFAULT;
4126                         }
4127                         return ret;
4128
4129                 /*
4130                  *      These ioctl calls:
4131                  *      - require superuser power.
4132                  *      - require strict serialization.
4133                  *      - do not return a value
4134                  */
4135                 case SIOCSIFFLAGS:
4136                 case SIOCSIFMETRIC:
4137                 case SIOCSIFMTU:
4138                 case SIOCSIFMAP:
4139                 case SIOCSIFHWADDR:
4140                 case SIOCSIFSLAVE:
4141                 case SIOCADDMULTI:
4142                 case SIOCDELMULTI:
4143                 case SIOCSIFHWBROADCAST:
4144                 case SIOCSIFTXQLEN:
4145                 case SIOCSMIIREG:
4146                 case SIOCBONDENSLAVE:
4147                 case SIOCBONDRELEASE:
4148                 case SIOCBONDSETHWADDR:
4149                 case SIOCBONDCHANGEACTIVE:
4150                 case SIOCBRADDIF:
4151                 case SIOCBRDELIF:
4152                 case SIOCSHWTSTAMP:
4153                         if (!capable(CAP_NET_ADMIN))
4154                                 return -EPERM;
4155                         /* fall through */
4156                 case SIOCBONDSLAVEINFOQUERY:
4157                 case SIOCBONDINFOQUERY:
4158                         dev_load(net, ifr.ifr_name);
4159                         rtnl_lock();
4160                         ret = dev_ifsioc(net, &ifr, cmd);
4161                         rtnl_unlock();
4162                         return ret;
4163
4164                 case SIOCGIFMEM:
4165                         /* Get the per device memory space. We can add this but
4166                          * currently do not support it */
4167                 case SIOCSIFMEM:
4168                         /* Set the per device memory buffer space.
4169                          * Not applicable in our case */
4170                 case SIOCSIFLINK:
4171                         return -EINVAL;
4172
4173                 /*
4174                  *      Unknown or private ioctl.
4175                  */
4176                 default:
4177                         if (cmd == SIOCWANDEV ||
4178                             (cmd >= SIOCDEVPRIVATE &&
4179                              cmd <= SIOCDEVPRIVATE + 15)) {
4180                                 dev_load(net, ifr.ifr_name);
4181                                 rtnl_lock();
4182                                 ret = dev_ifsioc(net, &ifr, cmd);
4183                                 rtnl_unlock();
4184                                 if (!ret && copy_to_user(arg, &ifr,
4185                                                          sizeof(struct ifreq)))
4186                                         ret = -EFAULT;
4187                                 return ret;
4188                         }
4189                         /* Take care of Wireless Extensions */
4190                         if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST)
4191                                 return wext_handle_ioctl(net, &ifr, cmd, arg);
4192                         return -EINVAL;
4193         }
4194 }
4195
4196
/**
 *	dev_new_index	-	allocate an ifindex
 *	@net: the applicable net namespace
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  Candidates come from a counter shared by all callers;
 *	it restarts at 1 whenever it is no longer positive, and values
 *	already used by a device in @net are skipped.  The caller must
 *	hold the rtnl semaphore or the dev_base_lock to be sure the
 *	result remains unique.
 */
static int dev_new_index(struct net *net)
{
	static int last_ifindex;

	do {
		if (++last_ifindex <= 0)
			last_ifindex = 1;
	} while (__dev_get_by_index(net, last_ifindex));

	return last_ifindex;
}
4215
/* Delayed registration/unregistration */
4217 static LIST_HEAD(net_todo_list);
4218
4219 static void net_set_todo(struct net_device *dev)
4220 {
4221         list_add_tail(&dev->todo_list, &net_todo_list);
4222 }
4223
4224 static void rollback_registered(struct net_device *dev)
4225 {
4226         BUG_ON(dev_boot_phase);
4227         ASSERT_RTNL();
4228
4229         /* Some devices call without registering for initialization unwind. */
4230         if (dev->reg_state == NETREG_UNINITIALIZED) {
4231                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
4232                                   "was registered\n", dev->name, dev);
4233
4234                 WARN_ON(1);
4235                 return;
4236         }
4237
4238         BUG_ON(dev->reg_state != NETREG_REGISTERED);
4239
4240         /* If device is running, close it first. */
4241         dev_close(dev);
4242
4243         /* And unlink it from device chain. */
4244         unlist_netdevice(dev);
4245
4246         dev->reg_state = NETREG_UNREGISTERING;
4247
4248         synchronize_net();
4249
4250         /* Shutdown queueing discipline. */
4251         dev_shutdown(dev);
4252
4253
4254         /* Notify protocols, that we are about to destroy
4255            this device. They should clean all the things.
4256         */
4257         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4258
4259         /*
4260          *      Flush the unicast and multicast chains
4261          */
4262         dev_addr_discard(dev);
4263
4264         if (dev->netdev_ops->ndo_uninit)
4265                 dev->netdev_ops->ndo_uninit(dev);
4266
4267         /* Notifier chain MUST detach us from master device. */
4268         WARN_ON(dev->master);
4269
4270         /* Remove entries from kobject tree */
4271         netdev_unregister_kobject(dev);
4272
4273         synchronize_net();
4274
4275         dev_put(dev);
4276 }
4277
4278 static void __netdev_init_queue_locks_one(struct net_device *dev,
4279                                           struct netdev_queue *dev_queue,
4280                                           void *_unused)
4281 {
4282         spin_lock_init(&dev_queue->_xmit_lock);
4283         netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type);
4284         dev_queue->xmit_lock_owner = -1;
4285 }
4286
4287 static void netdev_init_queue_locks(struct net_device *dev)
4288 {
4289         netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL);
4290         __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL);
4291 }
4292
4293 unsigned long netdev_fix_features(unsigned long features, const char *name)
4294 {
4295         /* Fix illegal SG+CSUM combinations. */
4296         if ((features & NETIF_F_SG) &&
4297             !(features & NETIF_F_ALL_CSUM)) {
4298                 if (name)
4299                         printk(KERN_NOTICE "%s: Dropping NETIF_F_SG since no "
4300                                "checksum feature.\n", name);
4301                 features &= ~NETIF_F_SG;
4302         }
4303
4304         /* TSO requires that SG is present as well. */
4305         if ((features & NETIF_F_TSO) && !(features & NETIF_F_SG)) {
4306                 if (name)
4307                         printk(KERN_NOTICE "%s: Dropping NETIF_F_TSO since no "
4308                                "SG feature.\n", name);
4309                 features &= ~NETIF_F_TSO;
4310         }
4311
4312         if (features & NETIF_F_UFO) {
4313                 if (!(features & NETIF_F_GEN_CSUM)) {
4314                         if (name)
4315                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4316                                        "since no NETIF_F_HW_CSUM feature.\n",
4317                                        name);
4318                         features &= ~NETIF_F_UFO;
4319                 }
4320
4321                 if (!(features & NETIF_F_SG)) {
4322                         if (name)
4323                                 printk(KERN_ERR "%s: Dropping NETIF_F_UFO "
4324                                        "since no NETIF_F_SG feature.\n", name);
4325                         features &= ~NETIF_F_UFO;
4326                 }
4327         }
4328
4329         return features;
4330 }
4331 EXPORT_SYMBOL(netdev_fix_features);
4332
4333 /**
4334  *      register_netdevice      - register a network device
4335  *      @dev: device to register
4336  *
4337  *      Take a completed network device structure and add it to the kernel
4338  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4339  *      chain. 0 is returned on success. A negative errno code is returned
4340  *      on a failure to set up the device, or if the name is a duplicate.
4341  *
4342  *      Callers must hold the rtnl semaphore. You may want
4343  *      register_netdev() instead of this.
4344  *
4345  *      BUGS:
4346  *      The locking appears insufficient to guarantee two parallel registers
4347  *      will not get the same name.
4348  */
4349
4350 int register_netdevice(struct net_device *dev)
4351 {
4352         struct hlist_head *head;
4353         struct hlist_node *p;
4354         int ret;
4355         struct net *net = dev_net(dev);
4356
4357         BUG_ON(dev_boot_phase);
4358         ASSERT_RTNL();
4359
4360         might_sleep();
4361
4362         /* When net_device's are persistent, this will be fatal. */
4363         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
4364         BUG_ON(!net);
4365
4366         spin_lock_init(&dev->addr_list_lock);
4367         netdev_set_addr_lockdep_class(dev);
4368         netdev_init_queue_locks(dev);
4369
4370         dev->iflink = -1;
4371
4372 #ifdef CONFIG_COMPAT_NET_DEV_OPS
4373         /* Netdevice_ops API compatiability support.
4374          * This is temporary until all network devices are converted.
4375          */
4376         if (dev->netdev_ops) {
4377                 const struct net_device_ops *ops = dev->netdev_ops;
4378
4379                 dev->init = ops->ndo_init;
4380                 dev->uninit = ops->ndo_uninit;
4381                 dev->open = ops->ndo_open;
4382                 dev->change_rx_flags = ops->ndo_change_rx_flags;
4383                 dev->set_rx_mode = ops->ndo_set_rx_mode;
4384                 dev->set_multicast_list = ops->ndo_set_multicast_list;
4385                 dev->set_mac_address = ops->ndo_set_mac_address;
4386                 dev->validate_addr = ops->ndo_validate_addr;
4387                 dev->do_ioctl = ops->ndo_do_ioctl;
4388                 dev->set_config = ops->ndo_set_config;
4389                 dev->change_mtu = ops->ndo_change_mtu;
4390                 dev->tx_timeout = ops->ndo_tx_timeout;
4391                 dev->get_stats = ops->ndo_get_stats;
4392                 dev->vlan_rx_register = ops->ndo_vlan_rx_register;
4393                 dev->vlan_rx_add_vid = ops->ndo_vlan_rx_add_vid;
4394                 dev->vlan_rx_kill_vid = ops->ndo_vlan_rx_kill_vid;
4395 #ifdef CONFIG_NET_POLL_CONTROLLER
4396                 dev->poll_controller = ops->ndo_poll_controller;
4397 #endif
4398         } else {
4399                 char drivername[64];
4400                 pr_info("%s (%s): not using net_device_ops yet\n",
4401                         dev->name, netdev_drivername(dev, drivername, 64));
4402
4403                 /* This works only because net_device_ops and the
4404                    compatiablity structure are the same. */
4405                 dev->netdev_ops = (void *) &(dev->init);
4406         }
4407 #endif
4408
4409         /* Init, if this function is available */
4410         if (dev->netdev_ops->ndo_init) {
4411                 ret = dev->netdev_ops->ndo_init(dev);
4412                 if (ret) {
4413                         if (ret > 0)
4414                                 ret = -EIO;
4415                         goto out;
4416                 }
4417         }
4418
4419         if (!dev_valid_name(dev->name)) {
4420                 ret = -EINVAL;
4421                 goto err_uninit;
4422         }
4423
4424         dev->ifindex = dev_new_index(net);
4425         if (dev->iflink == -1)
4426                 dev->iflink = dev->ifindex;
4427
4428         /* Check for existence of name */
4429         head = dev_name_hash(net, dev->name);
4430         hlist_for_each(p, head) {
4431                 struct net_device *d
4432                         = hlist_entry(p, struct net_device, name_hlist);
4433                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
4434                         ret = -EEXIST;
4435                         goto err_uninit;
4436                 }
4437         }
4438
4439         /* Fix illegal checksum combinations */
4440         if ((dev->features & NETIF_F_HW_CSUM) &&
4441             (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4442                 printk(KERN_NOTICE "%s: mixed HW and IP checksum settings.\n",
4443                        dev->name);
4444                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
4445         }
4446
4447         if ((dev->features & NETIF_F_NO_CSUM) &&
4448             (dev->features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
4449                 printk(KERN_NOTICE "%s: mixed no checksumming and other settings.\n",
4450                        dev->name);
4451                 dev->features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
4452         }
4453
4454         dev->features = netdev_fix_features(dev->features, dev->name);
4455
4456         /* Enable software GSO if SG is supported. */
4457         if (dev->features & NETIF_F_SG)
4458                 dev->features |= NETIF_F_GSO;
4459
4460         netdev_initialize_kobject(dev);
4461         ret = netdev_register_kobject(dev);
4462         if (ret)
4463                 goto err_uninit;
4464         dev->reg_state = NETREG_REGISTERED;
4465
4466         /*
4467          *      Default initial state at registry is that the
4468          *      device is present.
4469          */
4470
4471         set_bit(__LINK_STATE_PRESENT, &dev->state);
4472
4473         dev_init_scheduler(dev);
4474         dev_hold(dev);
4475         list_netdevice(dev);
4476
4477         /* Notify protocols, that a new device appeared. */
4478         ret = call_netdevice_notifiers(NETDEV_REGISTER, dev);
4479         ret = notifier_to_errno(ret);
4480         if (ret) {
4481                 rollback_registered(dev);
4482                 dev->reg_state = NETREG_UNREGISTERED;
4483         }
4484
4485 out:
4486         return ret;
4487
4488 err_uninit:
4489         if (dev->netdev_ops->ndo_uninit)
4490                 dev->netdev_ops->ndo_uninit(dev);
4491         goto out;
4492 }
4493
4494 /**
4495  *      init_dummy_netdev       - init a dummy network device for NAPI
4496  *      @dev: device to init
4497  *
4498  *      This takes a network device structure and initialize the minimum
4499  *      amount of fields so it can be used to schedule NAPI polls without
4500  *      registering a full blown interface. This is to be used by drivers
4501  *      that need to tie several hardware interfaces to a single NAPI
4502  *      poll scheduler due to HW limitations.
4503  */
4504 int init_dummy_netdev(struct net_device *dev)
4505 {
4506         /* Clear everything. Note we don't initialize spinlocks
4507          * are they aren't supposed to be taken by any of the
4508          * NAPI code and this dummy netdev is supposed to be
4509          * only ever used for NAPI polls
4510          */
4511         memset(dev, 0, sizeof(struct net_device));
4512
4513         /* make sure we BUG if trying to hit standard
4514          * register/unregister code path
4515          */
4516         dev->reg_state = NETREG_DUMMY;
4517
4518         /* initialize the ref count */
4519         atomic_set(&dev->refcnt, 1);
4520
4521         /* NAPI wants this */
4522         INIT_LIST_HEAD(&dev->napi_list);
4523
4524         /* a dummy interface is started by default */
4525         set_bit(__LINK_STATE_PRESENT, &dev->state);
4526         set_bit(__LINK_STATE_START, &dev->state);
4527
4528         return 0;
4529 }
4530 EXPORT_SYMBOL_GPL(init_dummy_netdev);
4531
4532
4533 /**
4534  *      register_netdev - register a network device
4535  *      @dev: device to register
4536  *
4537  *      Take a completed network device structure and add it to the kernel
4538  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
4539  *      chain. 0 is returned on success. A negative errno code is returned
4540  *      on a failure to set up the device, or if the name is a duplicate.
4541  *
4542  *      This is a wrapper around register_netdevice that takes the rtnl semaphore
4543  *      and expands the device name if you passed a format string to
4544  *      alloc_netdev.
4545  */
4546 int register_netdev(struct net_device *dev)
4547 {
4548         int err;
4549
4550         rtnl_lock();
4551
4552         /*
4553          * If the name is a format string the caller wants us to do a
4554          * name allocation.
4555          */
4556         if (strchr(dev->name, '%')) {
4557                 err = dev_alloc_name(dev, dev->name);
4558                 if (err < 0)
4559                         goto out;
4560         }
4561
4562         err = register_netdevice(dev);
4563 out:
4564         rtnl_unlock();
4565         return err;
4566 }
4567 EXPORT_SYMBOL(register_netdev);
4568
4569 /*
4570  * netdev_wait_allrefs - wait until all references are gone.
4571  *
4572  * This is called when unregistering network devices.
4573  *
4574  * Any protocol or device that holds a reference should register
4575  * for netdevice notification, and cleanup and put back the
4576  * reference if they receive an UNREGISTER event.
4577  * We can get stuck here if buggy protocols don't correctly
4578  * call dev_put.
4579  */
4580 static void netdev_wait_allrefs(struct net_device *dev)
4581 {
4582         unsigned long rebroadcast_time, warning_time;
4583
4584         rebroadcast_time = warning_time = jiffies;
4585         while (atomic_read(&dev->refcnt) != 0) {
4586                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
4587                         rtnl_lock();
4588
4589                         /* Rebroadcast unregister notification */
4590                         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4591
4592                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
4593                                      &dev->state)) {
4594                                 /* We must not have linkwatch events
4595                                  * pending on unregister. If this
4596                                  * happens, we simply run the queue
4597                                  * unscheduled, resulting in a noop
4598                                  * for this device.
4599                                  */
4600                                 linkwatch_run_queue();
4601                         }
4602
4603                         __rtnl_unlock();
4604
4605                         rebroadcast_time = jiffies;
4606                 }
4607
4608                 msleep(250);
4609
4610                 if (time_after(jiffies, warning_time + 10 * HZ)) {
4611                         printk(KERN_EMERG "unregister_netdevice: "
4612                                "waiting for %s to become free. Usage "
4613                                "count = %d\n",
4614                                dev->name, atomic_read(&dev->refcnt));
4615                         warning_time = jiffies;
4616                 }
4617         }
4618 }
4619
4620 /* The sequence is:
4621  *
4622  *      rtnl_lock();
4623  *      ...
4624  *      register_netdevice(x1);
4625  *      register_netdevice(x2);
4626  *      ...
4627  *      unregister_netdevice(y1);
4628  *      unregister_netdevice(y2);
4629  *      ...
4630  *      rtnl_unlock();
4631  *      free_netdev(y1);
4632  *      free_netdev(y2);
4633  *
4634  * We are invoked by rtnl_unlock().
4635  * This allows us to deal with problems:
4636  * 1) We can delete sysfs objects which invoke hotplug
4637  *    without deadlocking with linkwatch via keventd.
4638  * 2) Since we run with the RTNL semaphore not held, we can sleep
4639  *    safely in order to wait for the netdev refcnt to drop to zero.
4640  *
4641  * We must not return until all unregister events added during
4642  * the interval the lock was held have been completed.
4643  */
4644 void netdev_run_todo(void)
4645 {
4646         struct list_head list;
4647
4648         /* Snapshot list, allow later requests */
4649         list_replace_init(&net_todo_list, &list);
4650
4651         __rtnl_unlock();
4652
4653         while (!list_empty(&list)) {
4654                 struct net_device *dev
4655                         = list_entry(list.next, struct net_device, todo_list);
4656                 list_del(&dev->todo_list);
4657
4658                 if (unlikely(dev->reg_state != NETREG_UNREGISTERING)) {
4659                         printk(KERN_ERR "network todo '%s' but state %d\n",
4660                                dev->name, dev->reg_state);
4661                         dump_stack();
4662                         continue;
4663                 }
4664
4665                 dev->reg_state = NETREG_UNREGISTERED;
4666
4667                 on_each_cpu(flush_backlog, dev, 1);
4668
4669                 netdev_wait_allrefs(dev);
4670
4671                 /* paranoia */
4672                 BUG_ON(atomic_read(&dev->refcnt));
4673                 WARN_ON(dev->ip_ptr);
4674                 WARN_ON(dev->ip6_ptr);
4675                 WARN_ON(dev->dn_ptr);
4676
4677                 if (dev->destructor)
4678                         dev->destructor(dev);
4679
4680                 /* Free network device */
4681                 kobject_put(&dev->dev.kobj);
4682         }
4683 }
4684
4685 /**
4686  *      dev_get_stats   - get network device statistics
4687  *      @dev: device to get statistics from
4688  *
4689  *      Get network statistics from device. The device driver may provide
4690  *      its own method by setting dev->netdev_ops->get_stats; otherwise
4691  *      the internal statistics structure is used.
4692  */
4693 const struct net_device_stats *dev_get_stats(struct net_device *dev)
4694  {
4695         const struct net_device_ops *ops = dev->netdev_ops;
4696
4697         if (ops->ndo_get_stats)
4698                 return ops->ndo_get_stats(dev);
4699         else
4700                 return &dev->stats;
4701 }
4702 EXPORT_SYMBOL(dev_get_stats);
4703
4704 static void netdev_init_one_queue(struct net_device *dev,
4705                                   struct netdev_queue *queue,
4706                                   void *_unused)
4707 {
4708         queue->dev = dev;
4709 }
4710
4711 static void netdev_init_queues(struct net_device *dev)
4712 {
4713         netdev_init_one_queue(dev, &dev->rx_queue, NULL);
4714         netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL);
4715         spin_lock_init(&dev->tx_global_lock);
4716 }
4717
4718 /**
4719  *      alloc_netdev_mq - allocate network device
4720  *      @sizeof_priv:   size of private data to allocate space for
4721  *      @name:          device name format string
4722  *      @setup:         callback to initialize device
4723  *      @queue_count:   the number of subqueues to allocate
4724  *
4725  *      Allocates a struct net_device with private data area for driver use
4726  *      and performs basic initialization.  Also allocates subquue structs
4727  *      for each queue on the device at the end of the netdevice.
4728  */
4729 struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name,
4730                 void (*setup)(struct net_device *), unsigned int queue_count)
4731 {
4732         struct netdev_queue *tx;
4733         struct net_device *dev;
4734         size_t alloc_size;
4735         void *p;
4736
4737         BUG_ON(strlen(name) >= sizeof(dev->name));
4738
4739         alloc_size = sizeof(struct net_device);
4740         if (sizeof_priv) {
4741                 /* ensure 32-byte alignment of private area */
4742                 alloc_size = (alloc_size + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
4743                 alloc_size += sizeof_priv;
4744         }
4745         /* ensure 32-byte alignment of whole construct */
4746         alloc_size += NETDEV_ALIGN_CONST;
4747
4748         p = kzalloc(alloc_size, GFP_KERNEL);
4749         if (!p) {
4750                 printk(KERN_ERR "alloc_netdev: Unable to allocate device.\n");
4751                 return NULL;
4752         }
4753
4754         tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL);
4755         if (!tx) {
4756                 printk(KERN_ERR "alloc_netdev: Unable to allocate "
4757                        "tx qdiscs.\n");
4758                 kfree(p);
4759                 return NULL;
4760         }
4761
4762         dev = (struct net_device *)
4763                 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
4764         dev->padded = (char *)dev - (char *)p;
4765         dev_net_set(dev, &init_net);
4766
4767         dev->_tx = tx;
4768         dev->num_tx_queues = queue_count;
4769         dev->real_num_tx_queues = queue_count;
4770
4771         dev->gso_max_size = GSO_MAX_SIZE;
4772
4773         netdev_init_queues(dev);
4774
4775         INIT_LIST_HEAD(&dev->napi_list);
4776         setup(dev);
4777         strcpy(dev->name, name);
4778         return dev;
4779 }
4780 EXPORT_SYMBOL(alloc_netdev_mq);
4781
4782 /**
4783  *      free_netdev - free network device
4784  *      @dev: device
4785  *
4786  *      This function does the last stage of destroying an allocated device
4787  *      interface. The reference to the device object is released.
4788  *      If this is the last reference then it will be freed.
4789  */
4790 void free_netdev(struct net_device *dev)
4791 {
4792         struct napi_struct *p, *n;
4793
4794         release_net(dev_net(dev));
4795
4796         kfree(dev->_tx);
4797
4798         list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
4799                 netif_napi_del(p);
4800
4801         /*  Compatibility with error handling in drivers */
4802         if (dev->reg_state == NETREG_UNINITIALIZED) {
4803                 kfree((char *)dev - dev->padded);
4804                 return;
4805         }
4806
4807         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
4808         dev->reg_state = NETREG_RELEASED;
4809
4810         /* will free via device release */
4811         put_device(&dev->dev);
4812 }
4813
/**
 *      synchronize_net -  Synchronize with packet receive processing
 *
 *      Waits until the packets being received right now are done with.
 *      Packets that start later are not held back. May sleep.
 */
void synchronize_net(void)
{
        might_sleep();

        /* The wait itself is an RCU synchronization. */
        synchronize_rcu();
}
4825
/**
 *      unregister_netdevice - remove device from the kernel
 *      @dev: device
 *
 *      Shuts down a device interface and takes it out of the kernel
 *      tables. The remainder of the unregistration is queued on the
 *      todo list.
 *
 *      Callers must hold the rtnl semaphore.  You may want
 *      unregister_netdev() instead of this.
 */

void unregister_netdevice(struct net_device *dev)
{
        ASSERT_RTNL();

        rollback_registered(dev);

        /* The rest of the unregister is processed after unlock. */
        net_set_todo(dev);
}
4845
/**
 *      unregister_netdev - remove device from the kernel
 *      @dev: device
 *
 *      Shuts down a device interface and takes it out of the kernel
 *      tables.
 *
 *      Nothing more than unregister_netdevice() called with the rtnl
 *      semaphore taken around it.  In general this is the function to
 *      use, rather than unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
        rtnl_lock();

        unregister_netdevice(dev);

        rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
4865
4866 /**
4867  *      dev_change_net_namespace - move device to different nethost namespace
4868  *      @dev: device
4869  *      @net: network namespace
4870  *      @pat: If not NULL name pattern to try if the current device name
4871  *            is already taken in the destination network namespace.
4872  *
4873  *      This function shuts down a device interface and moves it
4874  *      to a new network namespace. On success 0 is returned, on
4875  *      a failure a netagive errno code is returned.
4876  *
4877  *      Callers must hold the rtnl semaphore.
4878  */
4879
4880 int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat)
4881 {
4882         char buf[IFNAMSIZ];
4883         const char *destname;
4884         int err;
4885
4886         ASSERT_RTNL();
4887
4888         /* Don't allow namespace local devices to be moved. */
4889         err = -EINVAL;
4890         if (dev->features & NETIF_F_NETNS_LOCAL)
4891                 goto out;
4892
4893 #ifdef CONFIG_SYSFS
4894         /* Don't allow real devices to be moved when sysfs
4895          * is enabled.
4896          */
4897         err = -EINVAL;
4898         if (dev->dev.parent)
4899                 goto out;
4900 #endif
4901
4902         /* Ensure the device has been registrered */
4903         err = -EINVAL;
4904         if (dev->reg_state != NETREG_REGISTERED)
4905                 goto out;
4906
4907         /* Get out if there is nothing todo */
4908         err = 0;
4909         if (net_eq(dev_net(dev), net))
4910                 goto out;
4911
4912         /* Pick the destination device name, and ensure
4913          * we can use it in the destination network namespace.
4914          */
4915         err = -EEXIST;
4916         destname = dev->name;
4917         if (__dev_get_by_name(net, destname)) {
4918                 /* We get here if we can't use the current device name */
4919                 if (!pat)
4920                         goto out;
4921                 if (!dev_valid_name(pat))
4922                         goto out;
4923                 if (strchr(pat, '%')) {
4924                         if (__dev_alloc_name(net, pat, buf) < 0)
4925                                 goto out;
4926                         destname = buf;
4927                 } else
4928                         destname = pat;
4929                 if (__dev_get_by_name(net, destname))
4930                         goto out;
4931         }
4932
4933         /*
4934          * And now a mini version of register_netdevice unregister_netdevice.
4935          */
4936
4937         /* If device is running close it first. */
4938         dev_close(dev);
4939
4940         /* And unlink it from device chain */
4941         err = -ENODEV;
4942         unlist_netdevice(dev);
4943
4944         synchronize_net();
4945
4946         /* Shutdown queueing discipline. */
4947         dev_shutdown(dev);
4948
4949         /* Notify protocols, that we are about to destroy
4950            this device. They should clean all the things.
4951         */
4952         call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
4953
4954         /*
4955          *      Flush the unicast and multicast chains
4956          */
4957         dev_addr_discard(dev);
4958
4959         netdev_unregister_kobject(dev);
4960
4961         /* Actually switch the network namespace */
4962         dev_net_set(dev, net);
4963
4964         /* Assign the new device name */
4965         if (destname != dev->name)
4966                 strcpy(dev->name, destname);
4967
4968         /* If there is an ifindex conflict assign a new one */
4969         if (__dev_get_by_index(net, dev->ifindex)) {
4970                 int iflink = (dev->iflink == dev->ifindex);
4971                 dev->ifindex = dev_new_index(net);
4972                 if (iflink)
4973                         dev->iflink = dev->ifindex;
4974         }
4975
4976         /* Fixup kobjects */
4977         err = netdev_register_kobject(dev);
4978         WARN_ON(err);
4979
4980         /* Add the device back in the hashes */
4981         list_netdevice(dev);
4982
4983         /* Notify protocols, that a new device appeared. */
4984         call_netdevice_notifiers(NETDEV_REGISTER, dev);
4985
4986         synchronize_net();
4987         err = 0;
4988 out:
4989         return err;
4990 }
4991
4992 static int dev_cpu_callback(struct notifier_block *nfb,
4993                             unsigned long action,
4994                             void *ocpu)
4995 {
4996         struct sk_buff **list_skb;
4997         struct Qdisc **list_net;
4998         struct sk_buff *skb;
4999         unsigned int cpu, oldcpu = (unsigned long)ocpu;
5000         struct softnet_data *sd, *oldsd;
5001
5002         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
5003                 return NOTIFY_OK;
5004
5005         local_irq_disable();
5006         cpu = smp_processor_id();
5007         sd = &per_cpu(softnet_data, cpu);
5008         oldsd = &per_cpu(softnet_data, oldcpu);
5009
5010         /* Find end of our completion_queue. */
5011         list_skb = &sd->completion_queue;
5012         while (*list_skb)
5013                 list_skb = &(*list_skb)->next;
5014         /* Append completion queue from offline CPU. */
5015         *list_skb = oldsd->completion_queue;
5016         oldsd->completion_queue = NULL;
5017
5018         /* Find end of our output_queue. */
5019         list_net = &sd->output_queue;
5020         while (*list_net)
5021                 list_net = &(*list_net)->next_sched;
5022         /* Append output queue from offline CPU. */
5023         *list_net = oldsd->output_queue;
5024         oldsd->output_queue = NULL;
5025
5026         raise_softirq_irqoff(NET_TX_SOFTIRQ);
5027         local_irq_enable();
5028
5029         /* Process offline CPU's input_pkt_queue */
5030         while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
5031                 netif_rx(skb);
5032
5033         return NOTIFY_OK;
5034 }
5035
5036
5037 /**
5038  *      netdev_increment_features - increment feature set by one
5039  *      @all: current feature set
5040  *      @one: new feature set
5041  *      @mask: mask feature set
5042  *
5043  *      Computes a new feature set after adding a device with feature set
5044  *      @one to the master device with current feature set @all.  Will not
5045  *      enable anything that is off in @mask. Returns the new feature set.
5046  */
5047 unsigned long netdev_increment_features(unsigned long all, unsigned long one,
5048                                         unsigned long mask)
5049 {
5050         /* If device needs checksumming, downgrade to it. */
5051         if (all & NETIF_F_NO_CSUM && !(one & NETIF_F_NO_CSUM))
5052                 all ^= NETIF_F_NO_CSUM | (one & NETIF_F_ALL_CSUM);
5053         else if (mask & NETIF_F_ALL_CSUM) {
5054                 /* If one device supports v4/v6 checksumming, set for all. */
5055                 if (one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM) &&
5056                     !(all & NETIF_F_GEN_CSUM)) {
5057                         all &= ~NETIF_F_ALL_CSUM;
5058                         all |= one & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
5059                 }
5060
5061                 /* If one device supports hw checksumming, set for all. */
5062                 if (one & NETIF_F_GEN_CSUM && !(all & NETIF_F_GEN_CSUM)) {
5063                         all &= ~NETIF_F_ALL_CSUM;
5064                         all |= NETIF_F_HW_CSUM;
5065                 }
5066         }
5067
5068         one |= NETIF_F_ALL_CSUM;
5069
5070         one |= all & NETIF_F_ONE_FOR_ALL;
5071         all &= one | NETIF_F_LLTX | NETIF_F_GSO;
5072         all |= one & mask & NETIF_F_ONE_FOR_ALL;
5073
5074         return all;
5075 }
5076 EXPORT_SYMBOL(netdev_increment_features);
5077
5078 static struct hlist_head *netdev_create_hash(void)
5079 {
5080         int i;
5081         struct hlist_head *hash;
5082
5083         hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
5084         if (hash != NULL)
5085                 for (i = 0; i < NETDEV_HASHENTRIES; i++)
5086                         INIT_HLIST_HEAD(&hash[i]);
5087
5088         return hash;
5089 }
5090
5091 /* Initialize per network namespace state */
5092 static int __net_init netdev_init(struct net *net)
5093 {
5094         INIT_LIST_HEAD(&net->dev_base_head);
5095
5096         net->dev_name_head = netdev_create_hash();
5097         if (net->dev_name_head == NULL)
5098                 goto err_name;
5099
5100         net->dev_index_head = netdev_create_hash();
5101         if (net->dev_index_head == NULL)
5102                 goto err_idx;
5103
5104         return 0;
5105
5106 err_idx:
5107         kfree(net->dev_name_head);
5108 err_name:
5109         return -ENOMEM;
5110 }
5111
5112 /**
5113  *      netdev_drivername - network driver for the device
5114  *      @dev: network device
5115  *      @buffer: buffer for resulting name
5116  *      @len: size of buffer
5117  *
5118  *      Determine network driver for device.
5119  */
5120 char *netdev_drivername(const struct net_device *dev, char *buffer, int len)
5121 {
5122         const struct device_driver *driver;
5123         const struct device *parent;
5124
5125         if (len <= 0 || !buffer)
5126                 return buffer;
5127         buffer[0] = 0;
5128
5129         parent = dev->dev.parent;
5130
5131         if (!parent)
5132                 return buffer;
5133
5134         driver = parent->driver;
5135         if (driver && driver->name)
5136                 strlcpy(buffer, driver->name, len);
5137         return buffer;
5138 }
5139
5140 static void __net_exit netdev_exit(struct net *net)
5141 {
5142         kfree(net->dev_name_head);
5143         kfree(net->dev_index_head);
5144 }
5145
5146 static struct pernet_operations __net_initdata netdev_net_ops = {
5147         .init = netdev_init,
5148         .exit = netdev_exit,
5149 };
5150
5151 static void __net_exit default_device_exit(struct net *net)
5152 {
5153         struct net_device *dev;
5154         /*
5155          * Push all migratable of the network devices back to the
5156          * initial network namespace
5157          */
5158         rtnl_lock();
5159 restart:
5160         for_each_netdev(net, dev) {
5161                 int err;
5162                 char fb_name[IFNAMSIZ];
5163
5164                 /* Ignore unmoveable devices (i.e. loopback) */
5165                 if (dev->features & NETIF_F_NETNS_LOCAL)
5166                         continue;
5167
5168                 /* Delete virtual devices */
5169                 if (dev->rtnl_link_ops && dev->rtnl_link_ops->dellink) {
5170                         dev->rtnl_link_ops->dellink(dev);
5171                         goto restart;
5172                 }
5173
5174                 /* Push remaing network devices to init_net */
5175                 snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
5176                 err = dev_change_net_namespace(dev, &init_net, fb_name);
5177                 if (err) {
5178                         printk(KERN_EMERG "%s: failed to move %s to init_net: %d\n",
5179                                 __func__, dev->name, err);
5180                         BUG();
5181                 }
5182                 goto restart;
5183         }
5184         rtnl_unlock();
5185 }
5186
5187 static struct pernet_operations __net_initdata default_device_ops = {
5188         .exit = default_device_exit,
5189 };
5190
5191 /*
5192  *      Initialize the DEV module. At boot time this walks the device list and
5193  *      unhooks any devices that fail to initialise (normally hardware not
5194  *      present) and leaves us with a valid list of present and active devices.
5195  *
5196  */
5197
5198 /*
5199  *       This is called single threaded during boot, so no need
5200  *       to take the rtnl semaphore.
5201  */
5202 static int __init net_dev_init(void)
5203 {
5204         int i, rc = -ENOMEM;
5205
5206         BUG_ON(!dev_boot_phase);
5207
5208         if (dev_proc_init())
5209                 goto out;
5210
5211         if (netdev_kobject_init())
5212                 goto out;
5213
5214         INIT_LIST_HEAD(&ptype_all);
5215         for (i = 0; i < PTYPE_HASH_SIZE; i++)
5216                 INIT_LIST_HEAD(&ptype_base[i]);
5217
5218         if (register_pernet_subsys(&netdev_net_ops))
5219                 goto out;
5220
5221         /*
5222          *      Initialise the packet receive queues.
5223          */
5224
5225         for_each_possible_cpu(i) {
5226                 struct softnet_data *queue;
5227
5228                 queue = &per_cpu(softnet_data, i);
5229                 skb_queue_head_init(&queue->input_pkt_queue);
5230                 queue->completion_queue = NULL;
5231                 INIT_LIST_HEAD(&queue->poll_list);
5232
5233                 queue->backlog.poll = process_backlog;
5234                 queue->backlog.weight = weight_p;
5235                 queue->backlog.gro_list = NULL;
5236                 queue->backlog.gro_count = 0;
5237         }
5238
5239         dev_boot_phase = 0;
5240
5241         /* The loopback device is special if any other network devices
5242          * is present in a network namespace the loopback device must
5243          * be present. Since we now dynamically allocate and free the
5244          * loopback device ensure this invariant is maintained by
5245          * keeping the loopback device as the first device on the
5246          * list of network devices.  Ensuring the loopback devices
5247          * is the first device that appears and the last network device
5248          * that disappears.
5249          */
5250         if (register_pernet_device(&loopback_net_ops))
5251                 goto out;
5252
5253         if (register_pernet_device(&default_device_ops))
5254                 goto out;
5255
5256         open_softirq(NET_TX_SOFTIRQ, net_tx_action);
5257         open_softirq(NET_RX_SOFTIRQ, net_rx_action);
5258
5259         hotcpu_notifier(dev_cpu_callback, 0);
5260         dst_init();
5261         dev_mcast_init();
5262         rc = 0;
5263 out:
5264         return rc;
5265 }
5266
5267 subsys_initcall(net_dev_init);
5268
5269 static int __init initialize_hashrnd(void)
5270 {
5271         get_random_bytes(&skb_tx_hashrnd, sizeof(skb_tx_hashrnd));
5272         return 0;
5273 }
5274
5275 late_initcall_sync(initialize_hashrnd);
5276
/* Symbols exported to modules, kept in alphabetical order. */
EXPORT_SYMBOL(__dev_get_by_index);
EXPORT_SYMBOL(__dev_get_by_name);
EXPORT_SYMBOL(__dev_remove_pack);
EXPORT_SYMBOL(dev_add_pack);
EXPORT_SYMBOL(dev_alloc_name);
EXPORT_SYMBOL(dev_change_flags);
EXPORT_SYMBOL(dev_close);
EXPORT_SYMBOL(dev_get_by_flags);
EXPORT_SYMBOL(dev_get_by_index);
EXPORT_SYMBOL(dev_get_by_name);
EXPORT_SYMBOL(dev_get_flags);
EXPORT_SYMBOL(dev_load);
EXPORT_SYMBOL(dev_open);
EXPORT_SYMBOL(dev_queue_xmit);
EXPORT_SYMBOL(dev_remove_pack);
EXPORT_SYMBOL(dev_set_allmulti);
EXPORT_SYMBOL(dev_set_mac_address);
EXPORT_SYMBOL(dev_set_mtu);
EXPORT_SYMBOL(dev_set_promiscuity);
EXPORT_SYMBOL(dev_valid_name);
EXPORT_SYMBOL(free_netdev);
EXPORT_SYMBOL(net_disable_timestamp);
EXPORT_SYMBOL(net_enable_timestamp);
EXPORT_SYMBOL(netdev_boot_setup_check);
EXPORT_SYMBOL(netdev_set_master);
EXPORT_SYMBOL(netdev_state_change);
EXPORT_SYMBOL(netif_receive_skb);
EXPORT_SYMBOL(netif_rx);
EXPORT_SYMBOL(register_gifconf);
EXPORT_SYMBOL(register_netdevice);
EXPORT_SYMBOL(register_netdevice_notifier);
EXPORT_SYMBOL(skb_checksum_help);
EXPORT_SYMBOL(synchronize_net);
EXPORT_SYMBOL(unregister_netdevice);
EXPORT_SYMBOL(unregister_netdevice_notifier);

/* Bridge hooks are exported only when bridging is built in or modular. */
#if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
EXPORT_SYMBOL(br_fdb_get_hook);
EXPORT_SYMBOL(br_fdb_put_hook);
EXPORT_SYMBOL(br_handle_frame_hook);
#endif

/* Per-CPU receive queue state. */
EXPORT_PER_CPU_SYMBOL(softnet_data);