err.no Git - linux-2.6/blob - net/core/dev.c

   1 /*
   2  *      NET3    Protocol independent device support routines.
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License, or (at your option) any later version.
   8  *
   9  *      Derived from the non IP parts of dev.c 1.0.19
  10  *              Authors:        Ross Biro
  11  *                              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *                              Mark Evans, <evansmp@uhura.aston.ac.uk>
  13  *
  14  *      Additional Authors:
  15  *              Florian la Roche <rzsfl@rz.uni-sb.de>
  16  *              Alan Cox <gw4pts@gw4pts.ampr.org>
  17  *              David Hinds <dahinds@users.sourceforge.net>
  18  *              Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
  19  *              Adam Sulmicki <adam@cfar.umd.edu>
  20  *              Pekka Riikonen <priikone@poesidon.pspt.fi>
  21  *
  22  *      Changes:
  23  *              D.J. Barrow     :       Fixed bug where dev->refcnt gets set
  24  *                                      to 2 if register_netdev gets called
  25  *                                      before net_dev_init & also removed a
  26  *                                      few lines of code in the process.
  27  *              Alan Cox        :       device private ioctl copies fields back.
  28  *              Alan Cox        :       Transmit queue code does relevant
  29  *                                      stunts to keep the queue safe.
  30  *              Alan Cox        :       Fixed double lock.
  31  *              Alan Cox        :       Fixed promisc NULL pointer trap
  32  *              ????????        :       Support the full private ioctl range
  33  *              Alan Cox        :       Moved ioctl permission check into
  34  *                                      drivers
  35  *              Tim Kordas      :       SIOCADDMULTI/SIOCDELMULTI
  36  *              Alan Cox        :       100 backlog just doesn't cut it when
  37  *                                      you start doing multicast video 8)
  38  *              Alan Cox        :       Rewrote net_bh and list manager.
  39  *              Alan Cox        :       Fix ETH_P_ALL echoback lengths.
  40  *              Alan Cox        :       Took out transmit every packet pass
  41  *                                      Saved a few bytes in the ioctl handler
  42  *              Alan Cox        :       Network driver sets packet type before
  43  *                                      calling netif_rx. Saves a function
  44  *                                      call a packet.
  45  *              Alan Cox        :       Hashed net_bh()
  46  *              Richard Kooijman:       Timestamp fixes.
  47  *              Alan Cox        :       Wrong field in SIOCGIFDSTADDR
  48  *              Alan Cox        :       Device lock protection.
  49  *              Alan Cox        :       Fixed nasty side effect of device close
  50  *                                      changes.
  51  *              Rudi Cilibrasi  :       Pass the right thing to
  52  *                                      set_mac_address()
  53  *              Dave Miller     :       32bit quantity for the device lock to
  54  *                                      make it work out on a Sparc.
  55  *              Bjorn Ekwall    :       Added KERNELD hack.
  56  *              Alan Cox        :       Cleaned up the backlog initialise.
  57  *              Craig Metz      :       SIOCGIFCONF fix if space for under
  58  *                                      1 device.
  59  *          Thomas Bogendoerfer :       Return ENODEV for dev_open, if there
  60  *                                      is no device open function.
  61  *              Andi Kleen      :       Fix error reporting for SIOCGIFCONF
  62  *          Michael Chastain    :       Fix signed/unsigned for SIOCGIFCONF
  63  *              Cyrus Durgin    :       Cleaned for KMOD
  64  *              Adam Sulmicki   :       Bug Fix : Network Device Unload
  65  *                                      A network device unload needs to purge
  66  *                                      the backlog queue.
  67  *      Paul Rusty Russell      :       SIOCSIFNAME
  68  *              Pekka Riikonen  :       Netdev boot-time settings code
  69  *              Andrew Morton   :       Make unregister_netdevice wait
  70  *                                      indefinitely on dev->refcnt
  71  *              J Hadi Salim    :       - Backlog queue sampling
  72  *                                      - netif_rx() feedback
  73  */
  74
  75 #include <asm/uaccess.h>
  76 #include <asm/system.h>
  77 #include <linux/bitops.h>
  78 #include <linux/capability.h>
  79 #include <linux/config.h>
  80 #include <linux/cpu.h>
  81 #include <linux/types.h>
  82 #include <linux/kernel.h>
  83 #include <linux/sched.h>
  84 #include <linux/mutex.h>
  85 #include <linux/string.h>
  86 #include <linux/mm.h>
  87 #include <linux/socket.h>
  88 #include <linux/sockios.h>
  89 #include <linux/errno.h>
  90 #include <linux/interrupt.h>
  91 #include <linux/if_ether.h>
  92 #include <linux/netdevice.h>
  93 #include <linux/etherdevice.h>
  94 #include <linux/notifier.h>
  95 #include <linux/skbuff.h>
  96 #include <net/sock.h>
  97 #include <linux/rtnetlink.h>
  98 #include <linux/proc_fs.h>
  99 #include <linux/seq_file.h>
 100 #include <linux/stat.h>
 101 #include <linux/if_bridge.h>
 102 #include <linux/divert.h>
 103 #include <net/dst.h>
 104 #include <net/pkt_sched.h>
 105 #include <net/checksum.h>
 106 #include <linux/highmem.h>
 107 #include <linux/init.h>
 108 #include <linux/kmod.h>
 109 #include <linux/module.h>
 110 #include <linux/kallsyms.h>
 111 #include <linux/netpoll.h>
 112 #include <linux/rcupdate.h>
 113 #include <linux/delay.h>
 114 #include <linux/wireless.h>
 115 #include <net/iw_handler.h>
 116 #include <asm/current.h>
 117 #include <linux/audit.h>
 118
 119 /*
 120  *      The list of packet types we will receive (as opposed to discard)
 121  *      and the routines to invoke.
 122  *
 123  *      Why 16. Because with 16 the only overlap we get on a hash of the
 124  *      low nibble of the protocol value is RARP/SNAP/X.25.
 125  *
 126  *      NOTE:  That is no longer true with the addition of VLAN tags.  Not
 127  *             sure which should go first, but I bet it won't make much
 128  *             difference if we are running VLANs.  The good news is that
 129  *             this protocol won't be in the list unless compiled in, so
 130  *             the average user (w/out VLANs) will not be adversely affected.
 131  *             --BLG
 132  *
 133  *              0800    IP
 134  *              8100    802.1Q VLAN
 135  *              0001    802.3
 136  *              0002    AX.25
 137  *              0004    802.2
 138  *              8035    RARP
 139  *              0005    SNAP
 140  *              0805    X.25
 141  *              0806    ARP
 142  *              8137    IPX
 143  *              0009    Localtalk
 144  *              86DD    IPv6
 145  */
 146
 147 static DEFINE_SPINLOCK(ptype_lock);     /* held (BH-safe) by dev_add_pack()/__dev_remove_pack() around list updates */
 148 static struct list_head ptype_base[16]; /* 16 way hashed list, bucket = ntohs(type) & 15 */
 149 static struct list_head ptype_all;              /* Taps: handlers registered for ETH_P_ALL */
 150
 151 /*
 152  * The @dev_base list is protected by @dev_base_lock and the rtnl
 153  * semaphore.
 154  *
 155  * Pure readers hold dev_base_lock for reading.
 156  *
 157  * Writers must hold the rtnl semaphore while they loop through the
 158  * dev_base list, and hold dev_base_lock for writing when they do the
 159  * actual updates.  This allows pure readers to access the list even
 160  * while a writer is preparing to update it.
 161  *
 162  * To put it another way, dev_base_lock is held for writing only to
 163  * protect against pure readers; the rtnl semaphore provides the
 164  * protection against other writers.
 165  *
 166  * See, for example usages, register_netdevice() and
 167  * unregister_netdevice(), which must be called with the rtnl
 168  * semaphore held.
 169  */
 170 struct net_device *dev_base;            /* head of the device list, chained through ->next */
 171 static struct net_device **dev_tail = &dev_base;        /* starts out pointing at dev_base itself */
 172 DEFINE_RWLOCK(dev_base_lock);           /* read-held by the dev_get_by_*() lookups in this file */
 173
 174 EXPORT_SYMBOL(dev_base);
 175 EXPORT_SYMBOL(dev_base_lock);
 176
 177 #define NETDEV_HASHBITS 8       /* each table below has 1 << 8 = 256 buckets */
 178 static struct hlist_head dev_name_head[1<<NETDEV_HASHBITS];     /* bucket picked by dev_name_hash() */
 179 static struct hlist_head dev_index_head[1<<NETDEV_HASHBITS];    /* bucket picked by dev_index_hash() */
 180
 181 static inline struct hlist_head *dev_name_hash(const char *name)
 182 {
 183         unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
 184         return &dev_name_head[hash & ((1<<NETDEV_HASHBITS)-1)];
 185 }
 186
 187 static inline struct hlist_head *dev_index_hash(int ifindex)
 188 {
 189         return &dev_index_head[ifindex & ((1<<NETDEV_HASHBITS)-1)];
 190 }
 191
 192 /*
 193  *      Our notifier list
 194  */
 195
 196 static BLOCKING_NOTIFIER_HEAD(netdev_chain);    /* fired via blocking_notifier_call_chain() for NETDEV_* events */
 197
 198 /*
 199  *      Device drivers call our routines to queue packets here. We empty the
 200  *      queue in the local softnet handler.
 201  */
 202 DEFINE_PER_CPU(struct softnet_data, softnet_data) = { NULL };   /* defined per CPU; first member starts out NULL */
 203
 204 #ifdef CONFIG_SYSFS
 205 extern int netdev_sysfs_init(void);
 206 extern int netdev_register_sysfs(struct net_device *);
 207 extern void netdev_unregister_sysfs(struct net_device *);
 208 #else   /* !CONFIG_SYSFS: stubs that evaluate to 0 or expand to an empty statement */
 209 #define netdev_sysfs_init()             (0)
 210 #define netdev_register_sysfs(dev)      (0)
 211 #define netdev_unregister_sysfs(dev)    do { } while(0)
 212 #endif  /* CONFIG_SYSFS */
 213
 214
 215 /*******************************************************************************
 216
 217                 Protocol management and registration routines
 218
 219 *******************************************************************************/
 220
 221 /*
 222  *      For efficiency
 223  */
 224
 225 int netdev_nit; /* number of ETH_P_ALL handlers: ++ in dev_add_pack(), -- in __dev_remove_pack() */
 226
 227 /*
 228  *      Add a protocol ID to the list. Now that the input handler is
 229  *      smarter we can dispense with all the messy stuff that used to be
 230  *      here.
 231  *
 232  *      BEWARE!!! Protocol handlers that mangle input packets
 233  *      MUST BE last in their hash buckets, and checking of protocol
 234  *      handlers MUST start from the promiscuous ptype_all chain in net_bh.
 235  *      It is true now, do not change it.
 236  *      Explanation follows: if a protocol handler that mangles packets
 237  *      is first on the list, it is not able to sense that the packet
 238  *      is cloned and should be copied-on-write, so it will
 239  *      change it and subsequent readers will get a broken packet.
 240  *                                                      --ANK (980803)
 241  */
 242
 243 /**
 244  *      dev_add_pack - add packet handler
 245  *      @pt: packet type declaration
 246  *
 247  *      Add a protocol handler to the networking stack. The passed &packet_type
 248  *      is linked into kernel lists and may not be freed until it has been
 249  *      removed from the kernel lists.
 250  *
 251  *      This call does not sleep therefore it can not
 252  *      guarantee all CPU's that are in middle of receiving packets
 253  *      will see the new packet type (until the next received packet).
 254  */
 255
 256 void dev_add_pack(struct packet_type *pt)
 257 {
 258         int hash;
 259
 260         spin_lock_bh(&ptype_lock);
 261         if (pt->type == htons(ETH_P_ALL)) {
 262                 netdev_nit++;
 263                 list_add_rcu(&pt->list, &ptype_all);
 264         } else {
 265                 hash = ntohs(pt->type) & 15;
 266                 list_add_rcu(&pt->list, &ptype_base[hash]);
 267         }
 268         spin_unlock_bh(&ptype_lock);
 269 }
 270
 271 /**
 272  *      __dev_remove_pack        - remove packet handler
 273  *      @pt: packet type declaration
 274  *
 275  *      Remove a protocol handler that was previously added to the kernel
 276  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 277  *      from the kernel lists and can be freed or reused once this function
 278  *      returns.
 279  *
 280  *      The packet type might still be in use by receivers
 281  *      and must not be freed until after all the CPU's have gone
 282  *      through a quiescent state.
 283  */
 284 void __dev_remove_pack(struct packet_type *pt)
 285 {
 286         struct list_head *head;
 287         struct packet_type *pt1;
 288
 289         spin_lock_bh(&ptype_lock);
 290
 291         if (pt->type == htons(ETH_P_ALL)) {
 292                 netdev_nit--;
 293                 head = &ptype_all;
 294         } else
 295                 head = &ptype_base[ntohs(pt->type) & 15];
 296
 297         list_for_each_entry(pt1, head, list) {
 298                 if (pt == pt1) {
 299                         list_del_rcu(&pt->list);
 300                         goto out;
 301                 }
 302         }
 303
 304         printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 305 out:
 306         spin_unlock_bh(&ptype_lock);
 307 }
 308 /**
 309  *      dev_remove_pack  - remove packet handler
 310  *      @pt: packet type declaration
 311  *
 312  *      Remove a protocol handler that was previously added to the kernel
 313  *      protocol handlers by dev_add_pack(). The passed &packet_type is removed
 314  *      from the kernel lists and can be freed or reused once this function
 315  *      returns.
 316  *
 317  *      This call sleeps to guarantee that no CPU is looking at the packet
 318  *      type after return.
 319  */
 320 void dev_remove_pack(struct packet_type *pt)
 321 {
 322         __dev_remove_pack(pt);
 323
 324         synchronize_net();
 325 }
 326
 327 /******************************************************************************
 328
 329                       Device Boot-time Settings Routines
 330
 331 *******************************************************************************/
 332
 333 /* Boot time configuration table; a slot whose name starts with NUL or ' ' counts as free */
 334 static struct netdev_boot_setup dev_boot_setup[NETDEV_BOOT_SETUP_MAX];
 335
 336 /**
 337  *      netdev_boot_setup_add   - add new setup entry
 338  *      @name: name of the device
 339  *      @map: configured settings for the device
 340  *
 341  *      Adds new setup entry to the dev_boot_setup list.  The function
 342  *      returns 0 on error and 1 on success.  This is a generic routine to
 343  *      all netdevices.
 344  */
 345 static int netdev_boot_setup_add(char *name, struct ifmap *map)
 346 {
 347         struct netdev_boot_setup *s;
 348         int i;
 349
 350         s = dev_boot_setup;
 351         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 352                 if (s[i].name[0] == '\0' || s[i].name[0] == ' ') {
 353                         memset(s[i].name, 0, sizeof(s[i].name));
 354                         strcpy(s[i].name, name);
 355                         memcpy(&s[i].map, map, sizeof(s[i].map));
 356                         break;
 357                 }
 358         }
 359
 360         return i >= NETDEV_BOOT_SETUP_MAX ? 0 : 1;
 361 }
 362
 363 /**
 364  *      netdev_boot_setup_check - check boot time settings
 365  *      @dev: the netdevice
 366  *
 367  *      Check boot time settings for the device.
 368  *      The found settings are set for the device to be used
 369  *      later in the device probing.
 370  *      Returns 0 if no settings found, 1 if they are.
 371  */
 372 int netdev_boot_setup_check(struct net_device *dev)
 373 {
 374         struct netdev_boot_setup *s = dev_boot_setup;
 375         int i;
 376
 377         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++) {
 378                 if (s[i].name[0] != '\0' && s[i].name[0] != ' ' &&
 379                     !strncmp(dev->name, s[i].name, strlen(s[i].name))) {
 380                         dev->irq        = s[i].map.irq;
 381                         dev->base_addr  = s[i].map.base_addr;
 382                         dev->mem_start  = s[i].map.mem_start;
 383                         dev->mem_end    = s[i].map.mem_end;
 384                         return 1;
 385                 }
 386         }
 387         return 0;
 388 }
 389
 390
 391 /**
 392  *      netdev_boot_base        - get address from boot time settings
 393  *      @prefix: prefix for network device
 394  *      @unit: id for network device
 395  *
 396  *      Check boot time settings for the base address of device.
 397  *      The found settings are set for the device to be used
 398  *      later in the device probing.
 399  *      Returns 0 if no settings found.
 400  */
 401 unsigned long netdev_boot_base(const char *prefix, int unit)
 402 {
 403         const struct netdev_boot_setup *s = dev_boot_setup;
 404         char name[IFNAMSIZ];
 405         int i;
 406
 407         sprintf(name, "%s%d", prefix, unit);
 408
 409         /*
 410          * If device already registered then return base of 1
 411          * to indicate not to probe for this interface
 412          */
 413         if (__dev_get_by_name(name))
 414                 return 1;
 415
 416         for (i = 0; i < NETDEV_BOOT_SETUP_MAX; i++)
 417                 if (!strcmp(name, s[i].name))
 418                         return s[i].map.base_addr;
 419         return 0;
 420 }
 421
 422 /*
 423  * Saves at boot time configured settings for any netdevice.
 424  */
 425 int __init netdev_boot_setup(char *str)
 426 {
 427         int ints[5];
 428         struct ifmap map;
 429
 430         str = get_options(str, ARRAY_SIZE(ints), ints);
 431         if (!str || !*str)
 432                 return 0;
 433
 434         /* Save settings */
 435         memset(&map, 0, sizeof(map));
 436         if (ints[0] > 0)
 437                 map.irq = ints[1];
 438         if (ints[0] > 1)
 439                 map.base_addr = ints[2];
 440         if (ints[0] > 2)
 441                 map.mem_start = ints[3];
 442         if (ints[0] > 3)
 443                 map.mem_end = ints[4];
 444
 445         /* Add new entry to the list */
 446         return netdev_boot_setup_add(str, &map);
 447 }
 448
 449 __setup("netdev=", netdev_boot_setup);
 450
 451 /*******************************************************************************
 452
 453                             Device Interface Subroutines
 454
 455 *******************************************************************************/
 456
 457 /**
 458  *      __dev_get_by_name       - find a device by its name
 459  *      @name: name to find
 460  *
 461  *      Find an interface by name. Must be called under RTNL semaphore
 462  *      or @dev_base_lock. If the name is found a pointer to the device
 463  *      is returned. If the name is not found then %NULL is returned. The
 464  *      reference counters are not incremented so the caller must be
 465  *      careful with locks.
 466  */
 467
 468 struct net_device *__dev_get_by_name(const char *name)
 469 {
 470         struct hlist_node *p;
 471
 472         hlist_for_each(p, dev_name_hash(name)) {
 473                 struct net_device *dev
 474                         = hlist_entry(p, struct net_device, name_hlist);
 475                 if (!strncmp(dev->name, name, IFNAMSIZ))
 476                         return dev;
 477         }
 478         return NULL;
 479 }
 480
 481 /**
 482  *      dev_get_by_name         - find a device by its name
 483  *      @name: name to find
 484  *
 485  *      Find an interface by name. This can be called from any
 486  *      context and does its own locking. The returned handle has
 487  *      the usage count incremented and the caller must use dev_put() to
 488  *      release it when it is no longer needed. %NULL is returned if no
 489  *      matching device is found.
 490  */
 491
 492 struct net_device *dev_get_by_name(const char *name)
 493 {
 494         struct net_device *dev;
 495
 496         read_lock(&dev_base_lock);
 497         dev = __dev_get_by_name(name);
 498         if (dev)
 499                 dev_hold(dev);
 500         read_unlock(&dev_base_lock);
 501         return dev;
 502 }
 503
 504 /**
 505  *      __dev_get_by_index - find a device by its ifindex
 506  *      @ifindex: index of device
 507  *
 508  *      Search for an interface by index. Returns %NULL if the device
 509  *      is not found or a pointer to the device. The device has not
 510  *      had its reference counter increased so the caller must be careful
 511  *      about locking. The caller must hold either the RTNL semaphore
 512  *      or @dev_base_lock.
 513  */
 514
 515 struct net_device *__dev_get_by_index(int ifindex)
 516 {
 517         struct hlist_node *p;
 518
 519         hlist_for_each(p, dev_index_hash(ifindex)) {
 520                 struct net_device *dev
 521                         = hlist_entry(p, struct net_device, index_hlist);
 522                 if (dev->ifindex == ifindex)
 523                         return dev;
 524         }
 525         return NULL;
 526 }
 527
 528
 529 /**
 530  *      dev_get_by_index - find a device by its ifindex
 531  *      @ifindex: index of device
 532  *
 533  *      Search for an interface by index. Returns NULL if the device
 534  *      is not found or a pointer to the device. The device returned has
 535  *      had a reference added and the pointer is safe until the user calls
 536  *      dev_put to indicate they have finished with it.
 537  */
 538
 539 struct net_device *dev_get_by_index(int ifindex)
 540 {
 541         struct net_device *dev;
 542
 543         read_lock(&dev_base_lock);
 544         dev = __dev_get_by_index(ifindex);
 545         if (dev)
 546                 dev_hold(dev);
 547         read_unlock(&dev_base_lock);
 548         return dev;
 549 }
 550
 551 /**
 552  *      dev_getbyhwaddr - find a device by its hardware address
 553  *      @type: media type of device
 554  *      @ha: hardware address
 555  *
 556  *      Search for an interface by MAC address. Returns NULL if the device
 557  *      is not found or a pointer to the device. The caller must hold the
 558  *      rtnl semaphore. The returned device has not had its ref count increased
 559  *      and the caller must therefore be careful about locking
 560  *
 561  *      BUGS:
 562  *      If the API was consistent this would be __dev_get_by_hwaddr
 563  */
 564
 565 struct net_device *dev_getbyhwaddr(unsigned short type, char *ha)
 566 {
 567         struct net_device *dev;
 568
 569         ASSERT_RTNL();
 570
 571         for (dev = dev_base; dev; dev = dev->next)
 572                 if (dev->type == type &&
 573                     !memcmp(dev->dev_addr, ha, dev->addr_len))
 574                         break;
 575         return dev;
 576 }
 577
 578 EXPORT_SYMBOL(dev_getbyhwaddr);
 579
 580 struct net_device *dev_getfirstbyhwtype(unsigned short type)
 581 {
 582         struct net_device *dev;
 583
 584         rtnl_lock();
 585         for (dev = dev_base; dev; dev = dev->next) {
 586                 if (dev->type == type) {
 587                         dev_hold(dev);
 588                         break;
 589                 }
 590         }
 591         rtnl_unlock();
 592         return dev;
 593 }
 594
 595 EXPORT_SYMBOL(dev_getfirstbyhwtype);
 596
 597 /**
 598  *      dev_get_by_flags - find any device with given flags
 599  *      @if_flags: IFF_* values
 600  *      @mask: bitmask of bits in if_flags to check
 601  *
 602  *      Search for any interface with the given flags. Returns NULL if a device
 603  *      is not found or a pointer to the device. The device returned has
 604  *      had a reference added and the pointer is safe until the user calls
 605  *      dev_put to indicate they have finished with it.
 606  */
 607
 608 struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask)
 609 {
 610         struct net_device *dev;
 611
 612         read_lock(&dev_base_lock);
 613         for (dev = dev_base; dev != NULL; dev = dev->next) {
 614                 if (((dev->flags ^ if_flags) & mask) == 0) {
 615                         dev_hold(dev);
 616                         break;
 617                 }
 618         }
 619         read_unlock(&dev_base_lock);
 620         return dev;
 621 }
 622
 623 /**
 624  *      dev_valid_name - check if name is okay for network device
 625  *      @name: name string
 626  *
 627  *      Network device names need to be valid file names to
 628  *      to allow sysfs to work
 629  */
 630 int dev_valid_name(const char *name)
 631 {
 632         return !(*name == '\0'
 633                  || !strcmp(name, ".")
 634                  || !strcmp(name, "..")
 635                  || strchr(name, '/'));
 636 }
 637
 638 /**
 639  *      dev_alloc_name - allocate a name for a device
 640  *      @dev: device
 641  *      @name: name format string
 642  *
 643  *      Passed a format string - eg "lt%d" it will try and find a suitable
 644  *      id. Not efficient for many devices, not called a lot. The caller
 645  *      must hold the dev_base or rtnl lock while allocating the name and
 646  *      adding the device in order to avoid duplicates. Returns the number
 647  *      of the unit assigned or a negative errno code.
 648  */
 649
 650 int dev_alloc_name(struct net_device *dev, const char *name)
 651 {
 652         int i = 0;
 653         char buf[IFNAMSIZ];
 654         const char *p;
 655         const int max_netdevices = 8*PAGE_SIZE;
 656         long *inuse;
 657         struct net_device *d;
 658
 659         p = strnchr(name, IFNAMSIZ-1, '%');
 660         if (p) {
 661                 /*
 662                  * Verify the string as this thing may have come from
 663                  * the user.  There must be either one "%d" and no other "%"
 664                  * characters.
 665                  */
 666                 if (p[1] != 'd' || strchr(p + 2, '%'))
 667                         return -EINVAL;
 668
 669                 /* Use one page as a bit array of possible slots */
 670                 inuse = (long *) get_zeroed_page(GFP_ATOMIC);
 671                 if (!inuse)
 672                         return -ENOMEM;
 673
 674                 for (d = dev_base; d; d = d->next) {
 675                         if (!sscanf(d->name, name, &i))
 676                                 continue;
 677                         if (i < 0 || i >= max_netdevices)
 678                                 continue;
 679
 680                         /*  avoid cases where sscanf is not exact inverse of printf */
 681                         snprintf(buf, sizeof(buf), name, i);
 682                         if (!strncmp(buf, d->name, IFNAMSIZ))
 683                                 set_bit(i, inuse);
 684                 }
 685
 686                 i = find_first_zero_bit(inuse, max_netdevices);
 687                 free_page((unsigned long) inuse);
 688         }
 689
 690         snprintf(buf, sizeof(buf), name, i);
 691         if (!__dev_get_by_name(buf)) {
 692                 strlcpy(dev->name, buf, IFNAMSIZ);
 693                 return i;
 694         }
 695
 696         /* It is possible to run out of possible slots
 697          * when the name is long and there isn't enough space left
 698          * for the digits, or if all bits are used.
 699          */
 700         return -ENFILE;
 701 }
 702
 703
 704 /**
 705  *      dev_change_name - change name of a device
 706  *      @dev: device
 707  *      @newname: name (or format string) must be at least IFNAMSIZ
 708  *
 709  *      Change name of a device, can pass format strings "eth%d".
 710  *      for wildcarding.
 711  */
 712 int dev_change_name(struct net_device *dev, char *newname)
 713 {
 714         int err = 0;
 715
 716         ASSERT_RTNL();
 717
 718         if (dev->flags & IFF_UP)
 719                 return -EBUSY;
 720
 721         if (!dev_valid_name(newname))
 722                 return -EINVAL;
 723
 724         if (strchr(newname, '%')) {
 725                 err = dev_alloc_name(dev, newname);
 726                 if (err < 0)
 727                         return err;
 728                 strcpy(newname, dev->name);
 729         }
 730         else if (__dev_get_by_name(newname))
 731                 return -EEXIST;
 732         else
 733                 strlcpy(dev->name, newname, IFNAMSIZ);
 734
 735         err = class_device_rename(&dev->class_dev, dev->name);
 736         if (!err) {
 737                 hlist_del(&dev->name_hlist);
 738                 hlist_add_head(&dev->name_hlist, dev_name_hash(dev->name));
 739                 blocking_notifier_call_chain(&netdev_chain,
 740                                 NETDEV_CHANGENAME, dev);
 741         }
 742
 743         return err;
 744 }
 745
 746 /**
 747  *      netdev_features_change - device changes fatures
 748  *      @dev: device to cause notification
 749  *
 750  *      Called to indicate a device has changed features.
 751  */
 752 void netdev_features_change(struct net_device *dev)
 753 {
 754         blocking_notifier_call_chain(&netdev_chain, NETDEV_FEAT_CHANGE, dev);
 755 }
 756 EXPORT_SYMBOL(netdev_features_change);
 757
 758 /**
 759  *      netdev_state_change - device changes state
 760  *      @dev: device to cause notification
 761  *
 762  *      Called to indicate a device has changed state. This function calls
 763  *      the notifier chains for netdev_chain and sends a NEWLINK message
 764  *      to the routing socket.
 765  */
 766 void netdev_state_change(struct net_device *dev)
 767 {
 768         if (dev->flags & IFF_UP) {
 769                 blocking_notifier_call_chain(&netdev_chain,
 770                                 NETDEV_CHANGE, dev);
 771                 rtmsg_ifinfo(RTM_NEWLINK, dev, 0);
 772         }
 773 }
 774
 775 /**
 776  *      dev_load        - load a network module
 777  *      @name: name of interface
 778  *
 779  *      If a network interface is not present and the process has suitable
 780  *      privileges this function loads the module. If module loading is not
 781  *      available in this kernel then it becomes a nop.
 782  */
 783
 784 void dev_load(const char *name)
 785 {
 786         struct net_device *dev;
 787
 788         read_lock(&dev_base_lock);
 789         dev = __dev_get_by_name(name);
 790         read_unlock(&dev_base_lock);
 791
 792         if (!dev && capable(CAP_SYS_MODULE))
 793                 request_module("%s", name);
 794 }
 795
 796 static int default_rebuild_header(struct sk_buff *skb)
 797 {
 798         printk(KERN_DEBUG "%s: default_rebuild_header called -- BUG!\n",
 799                skb->dev ? skb->dev->name : "NULL!!!");
 800         kfree_skb(skb);
 801         return 1;
 802 }
 803
 804
 805 /**
 806  *      dev_open        - prepare an interface for use.
 807  *      @dev:   device to open
 808  *
 809  *      Takes a device from down to up state. The device's private open
 810  *      function is invoked and then the multicast lists are loaded. Finally
 811  *      the device is moved into the up state and a %NETDEV_UP message is
 812  *      sent to the netdev notifier chain.
 813  *
 814  *      Calling this function on an active interface is a nop. On a failure
 815  *      a negative errno code is returned.
 816  */
 817 int dev_open(struct net_device *dev)
 818 {
 819         int ret = 0;
 820
 821         /*
 822          *      Is it already up?
 823          */
 824
 825         if (dev->flags & IFF_UP)
 826                 return 0;
 827
 828         /*
 829          *      Is it even present?
 830          */
 831         if (!netif_device_present(dev))
 832                 return -ENODEV;
 833
 834         /*
 835          *      Call device private open method
 836          */
 837         set_bit(__LINK_STATE_START, &dev->state);
 838         if (dev->open) {
 839                 ret = dev->open(dev);
 840                 if (ret)
 841                         clear_bit(__LINK_STATE_START, &dev->state);
 842         }
 843
 844         /*
 845          *      If it went open OK then:
 846          */
 847
 848         if (!ret) {
 849                 /*
 850                  *      Set the flags.
 851                  */
 852                 dev->flags |= IFF_UP;
 853
 854                 /*
 855                  *      Initialize multicasting status
 856                  */
 857                 dev_mc_upload(dev);
 858
 859                 /*
 860                  *      Wakeup transmit queue engine
 861                  */
 862                 dev_activate(dev);
 863
 864                 /*
 865                  *      ... and announce new interface.
 866                  */
 867                 blocking_notifier_call_chain(&netdev_chain, NETDEV_UP, dev);
 868         }
 869         return ret;
 870 }
 871
 872 /**
 873  *      dev_close - shutdown an interface.
 874  *      @dev: device to shutdown
 875  *
 876  *      This function moves an active device into down state. A
 877  *      %NETDEV_GOING_DOWN is sent to the netdev notifier chain. The device
 878  *      is then deactivated and finally a %NETDEV_DOWN is sent to the notifier
 879  *      chain.
 880  */
 881 int dev_close(struct net_device *dev)
 882 {
 883         if (!(dev->flags & IFF_UP))
 884                 return 0;
 885
 886         /*
 887          *      Tell people we are going down, so that they can
 888          *      prepare to death, when device is still operating.
 889          */
 890         blocking_notifier_call_chain(&netdev_chain, NETDEV_GOING_DOWN, dev);
 891
 892         dev_deactivate(dev);
 893
 894         clear_bit(__LINK_STATE_START, &dev->state);
 895
 896         /* Synchronize to scheduled poll. We cannot touch poll list,
 897          * it can be even on different cpu. So just clear netif_running(),
 898          * and wait when poll really will happen. Actually, the best place
 899          * for this is inside dev->stop() after device stopped its irq
 900          * engine, but this requires more changes in devices. */
 901
 902         smp_mb__after_clear_bit(); /* Commit netif_running(). */
 903         while (test_bit(__LINK_STATE_RX_SCHED, &dev->state)) {
 904                 /* No hurry. */
 905                 msleep(1);
 906         }
 907
 908         /*
 909          *      Call the device specific close. This cannot fail.
 910          *      Only if device is UP
 911          *
 912          *      We allow it to be called even after a DETACH hot-plug
 913          *      event.
 914          */
 915         if (dev->stop)
 916                 dev->stop(dev);
 917
 918         /*
 919          *      Device is now down.
 920          */
 921
 922         dev->flags &= ~IFF_UP;
 923
 924         /*
 925          * Tell people we are down
 926          */
 927         blocking_notifier_call_chain(&netdev_chain, NETDEV_DOWN, dev);
 928
 929         return 0;
 930 }
 931
 932
 933 /*
 934  *      Device change register/unregister. These are not inline or static
 935  *      as we export them to the world.
 936  */
 937
 938 /**
 939  *      register_netdevice_notifier - register a network notifier block
 940  *      @nb: notifier
 941  *
 942  *      Register a notifier to be called when network device events occur.
 943  *      The notifier passed is linked into the kernel structures and must
 944  *      not be reused until it has been unregistered. A negative errno code
 945  *      is returned on a failure.
 946  *
 947  *      When registered all registration and up events are replayed
 948  *      to the new notifier to allow device to have a race free
 949  *      view of the network device list.
 950  */
 951
 952 int register_netdevice_notifier(struct notifier_block *nb)
 953 {
 954         struct net_device *dev;
 955         int err;
 956
 957         rtnl_lock();
 958         err = blocking_notifier_chain_register(&netdev_chain, nb);
 959         if (!err) {
 960                 for (dev = dev_base; dev; dev = dev->next) {
 961                         nb->notifier_call(nb, NETDEV_REGISTER, dev);
 962
 963                         if (dev->flags & IFF_UP)
 964                                 nb->notifier_call(nb, NETDEV_UP, dev);
 965                 }
 966         }
 967         rtnl_unlock();
 968         return err;
 969 }
 970
 971 /**
 972  *      unregister_netdevice_notifier - unregister a network notifier block
 973  *      @nb: notifier
 974  *
 975  *      Unregister a notifier previously registered by
 976  *      register_netdevice_notifier(). The notifier is unlinked into the
 977  *      kernel structures and may then be reused. A negative errno code
 978  *      is returned on a failure.
 979  */
 980
 981 int unregister_netdevice_notifier(struct notifier_block *nb)
 982 {
 983         int err;
 984
 985         rtnl_lock();
 986         err = blocking_notifier_chain_unregister(&netdev_chain, nb);
 987         rtnl_unlock();
 988         return err;
 989 }
 990
 991 /**
 992  *      call_netdevice_notifiers - call all network notifier blocks
 993  *      @val: value passed unmodified to notifier function
 994  *      @v:   pointer passed unmodified to notifier function
 995  *
 996  *      Call all network notifier blocks.  Parameters and return value
 997  *      are as for blocking_notifier_call_chain().
 998  */
 999
1000 int call_netdevice_notifiers(unsigned long val, void *v)
1001 {
1002         return blocking_notifier_call_chain(&netdev_chain, val, v);
1003 }
1004
1005 /* When > 0 there are consumers of rx skb time stamps */
1006 static atomic_t netstamp_needed = ATOMIC_INIT(0);
1007
1008 void net_enable_timestamp(void)
1009 {
1010         atomic_inc(&netstamp_needed);
1011 }
1012
1013 void net_disable_timestamp(void)
1014 {
1015         atomic_dec(&netstamp_needed);
1016 }
1017
1018 void __net_timestamp(struct sk_buff *skb)
1019 {
1020         struct timeval tv;
1021
1022         do_gettimeofday(&tv);
1023         skb_set_timestamp(skb, &tv);
1024 }
1025 EXPORT_SYMBOL(__net_timestamp);
1026
1027 static inline void net_timestamp(struct sk_buff *skb)
1028 {
1029         if (atomic_read(&netstamp_needed))
1030                 __net_timestamp(skb);
1031         else {
1032                 skb->tstamp.off_sec = 0;
1033                 skb->tstamp.off_usec = 0;
1034         }
1035 }
1036
1037 /*
1038  *      Support routine. Sends outgoing frames to any network
1039  *      taps currently in use.
1040  */
1041
1042 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
1043 {
1044         struct packet_type *ptype;
1045
1046         net_timestamp(skb);
1047
1048         rcu_read_lock();
1049         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1050                 /* Never send packets back to the socket
1051                  * they originated from - MvS (miquels@drinkel.ow.org)
1052                  */
1053                 if ((ptype->dev == dev || !ptype->dev) &&
1054                     (ptype->af_packet_priv == NULL ||
1055                      (struct sock *)ptype->af_packet_priv != skb->sk)) {
1056                         struct sk_buff *skb2= skb_clone(skb, GFP_ATOMIC);
1057                         if (!skb2)
1058                                 break;
1059
1060                         /* skb->nh should be correctly
1061                            set by sender, so that the second statement is
1062                            just protection against buggy protocols.
1063                          */
1064                         skb2->mac.raw = skb2->data;
1065
1066                         if (skb2->nh.raw < skb2->data ||
1067                             skb2->nh.raw > skb2->tail) {
1068                                 if (net_ratelimit())
1069                                         printk(KERN_CRIT "protocol %04x is "
1070                                                "buggy, dev %s\n",
1071                                                skb2->protocol, dev->name);
1072                                 skb2->nh.raw = skb2->data;
1073                         }
1074
1075                         skb2->h.raw = skb2->nh.raw;
1076                         skb2->pkt_type = PACKET_OUTGOING;
1077                         ptype->func(skb2, skb->dev, ptype, skb->dev);
1078                 }
1079         }
1080         rcu_read_unlock();
1081 }
1082
1083
1084 void __netif_schedule(struct net_device *dev)
1085 {
1086         if (!test_and_set_bit(__LINK_STATE_SCHED, &dev->state)) {
1087                 unsigned long flags;
1088                 struct softnet_data *sd;
1089
1090                 local_irq_save(flags);
1091                 sd = &__get_cpu_var(softnet_data);
1092                 dev->next_sched = sd->output_queue;
1093                 sd->output_queue = dev;
1094                 raise_softirq_irqoff(NET_TX_SOFTIRQ);
1095                 local_irq_restore(flags);
1096         }
1097 }
1098 EXPORT_SYMBOL(__netif_schedule);
1099
1100 void __netif_rx_schedule(struct net_device *dev)
1101 {
1102         unsigned long flags;
1103
1104         local_irq_save(flags);
1105         dev_hold(dev);
1106         list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
1107         if (dev->quota < 0)
1108                 dev->quota += dev->weight;
1109         else
1110                 dev->quota = dev->weight;
1111         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
1112         local_irq_restore(flags);
1113 }
1114 EXPORT_SYMBOL(__netif_rx_schedule);
1115
/*
 * Free @skb from any context: the irq-safe variant is used when running
 * in hard-irq context or with interrupts disabled, the plain one otherwise.
 */
void dev_kfree_skb_any(struct sk_buff *skb)
{
	if (!in_irq() && !irqs_disabled()) {
		dev_kfree_skb(skb);
		return;
	}

	dev_kfree_skb_irq(skb);
}
EXPORT_SYMBOL(dev_kfree_skb_any);
1124
1125
1126 /* Hot-plugging. */
1127 void netif_device_detach(struct net_device *dev)
1128 {
1129         if (test_and_clear_bit(__LINK_STATE_PRESENT, &dev->state) &&
1130             netif_running(dev)) {
1131                 netif_stop_queue(dev);
1132         }
1133 }
1134 EXPORT_SYMBOL(netif_device_detach);
1135
1136 void netif_device_attach(struct net_device *dev)
1137 {
1138         if (!test_and_set_bit(__LINK_STATE_PRESENT, &dev->state) &&
1139             netif_running(dev)) {
1140                 netif_wake_queue(dev);
1141                 __netdev_watchdog_up(dev);
1142         }
1143 }
1144 EXPORT_SYMBOL(netif_device_attach);
1145
1146
1147 /*
1148  * Invalidate hardware checksum when packet is to be mangled, and
1149  * complete checksum manually on outgoing path.
1150  */
1151 int skb_checksum_help(struct sk_buff *skb, int inward)
1152 {
1153         unsigned int csum;
1154         int ret = 0, offset = skb->h.raw - skb->data;
1155
1156         if (inward) {
1157                 skb->ip_summed = CHECKSUM_NONE;
1158                 goto out;
1159         }
1160
1161         if (skb_cloned(skb)) {
1162                 ret = pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
1163                 if (ret)
1164                         goto out;
1165         }
1166
1167         BUG_ON(offset > (int)skb->len);
1168         csum = skb_checksum(skb, offset, skb->len-offset, 0);
1169
1170         offset = skb->tail - skb->h.raw;
1171         BUG_ON(offset <= 0);
1172         BUG_ON(skb->csum + 2 > offset);
1173
1174         *(u16*)(skb->h.raw + skb->csum) = csum_fold(csum);
1175         skb->ip_summed = CHECKSUM_NONE;
1176 out:
1177         return ret;
1178 }
1179
/*
 * Called when a hardware receive checksum error is detected: logs the
 * device name (rate limited) together with a stack dump.
 */
#ifdef CONFIG_BUG
void netdev_rx_csum_fault(struct net_device *dev)
{
	const char *ifname;

	if (!net_ratelimit())
		return;

	ifname = dev ? dev->name : "<unknown>";
	printk(KERN_ERR "%s: hw csum failure.\n", ifname);
	dump_stack();
}
EXPORT_SYMBOL(netdev_rx_csum_fault);
#endif
1192
#ifdef CONFIG_HIGHMEM
/* Actually, this check should go away as soon as we know that:
 * 1. an IOMMU is present and allows all of memory to be mapped, or
 * 2. no high memory really exists on this machine.
 */

/*
 * Returns 1 when @skb has a page fragment in high memory and @dev does
 * not advertise NETIF_F_HIGHDMA, 0 otherwise.
 */
static inline int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
{
	int frag = 0;

	if (dev->features & NETIF_F_HIGHDMA)
		return 0;

	while (frag < skb_shinfo(skb)->nr_frags) {
		if (PageHighMem(skb_shinfo(skb)->frags[frag].page))
			return 1;
		frag++;
	}

	return 0;
}
#else
#define illegal_highdma(dev, skb)       (0)
#endif
1215
1216 /* Keep head the same: replace data */
1217 int __skb_linearize(struct sk_buff *skb, gfp_t gfp_mask)
1218 {
1219         unsigned int size;
1220         u8 *data;
1221         long offset;
1222         struct skb_shared_info *ninfo;
1223         int headerlen = skb->data - skb->head;
1224         int expand = (skb->tail + skb->data_len) - skb->end;
1225
1226         if (skb_shared(skb))
1227                 BUG();
1228
1229         if (expand <= 0)
1230                 expand = 0;
1231
1232         size = skb->end - skb->head + expand;
1233         size = SKB_DATA_ALIGN(size);
1234         data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
1235         if (!data)
1236                 return -ENOMEM;
1237
1238         /* Copy entire thing */
1239         if (skb_copy_bits(skb, -headerlen, data, headerlen + skb->len))
1240                 BUG();
1241
1242         /* Set up shinfo */
1243         ninfo = (struct skb_shared_info*)(data + size);
1244         atomic_set(&ninfo->dataref, 1);
1245         ninfo->tso_size = skb_shinfo(skb)->tso_size;
1246         ninfo->tso_segs = skb_shinfo(skb)->tso_segs;
1247         ninfo->nr_frags = 0;
1248         ninfo->frag_list = NULL;
1249
1250         /* Offset between the two in bytes */
1251         offset = data - skb->head;
1252
1253         /* Free old data. */
1254         skb_release_data(skb);
1255
1256         skb->head = data;
1257         skb->end  = data + size;
1258
1259         /* Set up new pointers */
1260         skb->h.raw   += offset;
1261         skb->nh.raw  += offset;
1262         skb->mac.raw += offset;
1263         skb->tail    += offset;
1264         skb->data    += offset;
1265
1266         /* We are no longer a clone, even if we were. */
1267         skb->cloned    = 0;
1268
1269         skb->tail     += skb->data_len;
1270         skb->data_len  = 0;
1271         return 0;
1272 }
1273
/*
 * Take / release the device transmit lock around a driver transmit call
 * and record which cpu owns it. Devices that advertise NETIF_F_LLTX are
 * left alone: neither the lock nor xmit_lock_owner is touched for them.
 *
 * Both macros expand to a single statement (do { } while (0)) so they can
 * be used in an unbraced if/else, and their arguments are parenthesised
 * so that any pointer or cpu expression may be passed.
 */
#define HARD_TX_LOCK(dev, cpu) do {				\
	if (((dev)->features & NETIF_F_LLTX) == 0) {		\
		spin_lock(&(dev)->xmit_lock);			\
		(dev)->xmit_lock_owner = (cpu);			\
	}							\
} while (0)

#define HARD_TX_UNLOCK(dev) do {				\
	if (((dev)->features & NETIF_F_LLTX) == 0) {		\
		(dev)->xmit_lock_owner = -1;			\
		spin_unlock(&(dev)->xmit_lock);			\
	}							\
} while (0)
1287
1288 /**
1289  *      dev_queue_xmit - transmit a buffer
1290  *      @skb: buffer to transmit
1291  *
1292  *      Queue a buffer for transmission to a network device. The caller must
1293  *      have set the device and priority and built the buffer before calling
1294  *      this function. The function can be called from an interrupt.
1295  *
1296  *      A negative errno code is returned on a failure. A success does not
1297  *      guarantee the frame will be transmitted as it may be dropped due
1298  *      to congestion or traffic shaping.
1299  *
1300  * -----------------------------------------------------------------------------------
1301  *      I notice this method can also return errors from the queue disciplines,
1302  *      including NET_XMIT_DROP, which is a positive value.  So, errors can also
1303  *      be positive.
1304  *
1305  *      Regardless of the return value, the skb is consumed, so it is currently
1306  *      difficult to retry a send to this method.  (You can bump the ref count
1307  *      before sending to hold a reference for retry if you are careful.)
1308  *
1309  *      When calling this method, interrupts MUST be enabled.  This is because
1310  *      the BH enable code must have IRQs enabled so that it will not deadlock.
1311  *          --BLG
1312  */
1313
1314 int dev_queue_xmit(struct sk_buff *skb)
1315 {
1316         struct net_device *dev = skb->dev;
1317         struct Qdisc *q;
1318         int rc = -ENOMEM;
1319
1320         if (skb_shinfo(skb)->frag_list &&
1321             !(dev->features & NETIF_F_FRAGLIST) &&
1322             __skb_linearize(skb, GFP_ATOMIC))
1323                 goto out_kfree_skb;
1324
1325         /* Fragmented skb is linearized if device does not support SG,
1326          * or if at least one of fragments is in highmem and device
1327          * does not support DMA from it.
1328          */
1329         if (skb_shinfo(skb)->nr_frags &&
1330             (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)) &&
1331             __skb_linearize(skb, GFP_ATOMIC))
1332                 goto out_kfree_skb;
1333
1334         /* If packet is not checksummed and device does not support
1335          * checksumming for this protocol, complete checksumming here.
1336          */
1337         if (skb->ip_summed == CHECKSUM_HW &&
1338             (!(dev->features & (NETIF_F_HW_CSUM | NETIF_F_NO_CSUM)) &&
1339              (!(dev->features & NETIF_F_IP_CSUM) ||
1340               skb->protocol != htons(ETH_P_IP))))
1341                 if (skb_checksum_help(skb, 0))
1342                         goto out_kfree_skb;
1343
1344         spin_lock_prefetch(&dev->queue_lock);
1345
1346         /* Disable soft irqs for various locks below. Also
1347          * stops preemption for RCU.
1348          */
1349         local_bh_disable();
1350
1351         /* Updates of qdisc are serialized by queue_lock.
1352          * The struct Qdisc which is pointed to by qdisc is now a
1353          * rcu structure - it may be accessed without acquiring
1354          * a lock (but the structure may be stale.) The freeing of the
1355          * qdisc will be deferred until it's known that there are no
1356          * more references to it.
1357          *
1358          * If the qdisc has an enqueue function, we still need to
1359          * hold the queue_lock before calling it, since queue_lock
1360          * also serializes access to the device queue.
1361          */
1362
1363         q = rcu_dereference(dev->qdisc);
1364 #ifdef CONFIG_NET_CLS_ACT
1365         skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_EGRESS);
1366 #endif
1367         if (q->enqueue) {
1368                 /* Grab device queue */
1369                 spin_lock(&dev->queue_lock);
1370
1371                 rc = q->enqueue(skb, q);
1372
1373                 qdisc_run(dev);
1374
1375                 spin_unlock(&dev->queue_lock);
1376                 rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc;
1377                 goto out;
1378         }
1379
1380         /* The device has no queue. Common case for software devices:
1381            loopback, all the sorts of tunnels...
1382
1383            Really, it is unlikely that xmit_lock protection is necessary here.
1384            (f.e. loopback and IP tunnels are clean ignoring statistics
1385            counters.)
1386            However, it is possible, that they rely on protection
1387            made by us here.
1388
1389            Check this and shot the lock. It is not prone from deadlocks.
1390            Either shot noqueue qdisc, it is even simpler 8)
1391          */
1392         if (dev->flags & IFF_UP) {
1393                 int cpu = smp_processor_id(); /* ok because BHs are off */
1394
1395                 if (dev->xmit_lock_owner != cpu) {
1396
1397                         HARD_TX_LOCK(dev, cpu);
1398
1399                         if (!netif_queue_stopped(dev)) {
1400                                 if (netdev_nit)
1401                                         dev_queue_xmit_nit(skb, dev);
1402
1403                                 rc = 0;
1404                                 if (!dev->hard_start_xmit(skb, dev)) {
1405                                         HARD_TX_UNLOCK(dev);
1406                                         goto out;
1407                                 }
1408                         }
1409                         HARD_TX_UNLOCK(dev);
1410                         if (net_ratelimit())
1411                                 printk(KERN_CRIT "Virtual device %s asks to "
1412                                        "queue packet!\n", dev->name);
1413                 } else {
1414                         /* Recursion is detected! It is possible,
1415                          * unfortunately */
1416                         if (net_ratelimit())
1417                                 printk(KERN_CRIT "Dead loop on virtual device "
1418                                        "%s, fix it urgently!\n", dev->name);
1419                 }
1420         }
1421
1422         rc = -ENETDOWN;
1423         local_bh_enable();
1424
1425 out_kfree_skb:
1426         kfree_skb(skb);
1427         return rc;
1428 out:
1429         local_bh_enable();
1430         return rc;
1431 }
1432
1433
1434 /*=======================================================================
1435                         Receiver routines
1436   =======================================================================*/
1437
1438 int netdev_max_backlog = 1000;
1439 int netdev_budget = 300;
1440 int weight_p = 64;            /* old backlog weight */
1441
1442 DEFINE_PER_CPU(struct netif_rx_stats, netdev_rx_stat) = { 0, };
1443
1444
1445 /**
1446  *      netif_rx        -       post buffer to the network code
1447  *      @skb: buffer to post
1448  *
1449  *      This function receives a packet from a device driver and queues it for
1450  *      the upper (protocol) levels to process.  It always succeeds. The buffer
1451  *      may be dropped during processing for congestion control or by the
1452  *      protocol layers.
1453  *
1454  *      return values:
1455  *      NET_RX_SUCCESS  (no congestion)
1456  *      NET_RX_CN_LOW   (low congestion)
1457  *      NET_RX_CN_MOD   (moderate congestion)
1458  *      NET_RX_CN_HIGH  (high congestion)
1459  *      NET_RX_DROP     (packet was dropped)
1460  *
1461  */
1462
1463 int netif_rx(struct sk_buff *skb)
1464 {
1465         struct softnet_data *queue;
1466         unsigned long flags;
1467
1468         /* if netpoll wants it, pretend we never saw it */
1469         if (netpoll_rx(skb))
1470                 return NET_RX_DROP;
1471
1472         if (!skb->tstamp.off_sec)
1473                 net_timestamp(skb);
1474
1475         /*
1476          * The code is rearranged so that the path is the most
1477          * short when CPU is congested, but is still operating.
1478          */
1479         local_irq_save(flags);
1480         queue = &__get_cpu_var(softnet_data);
1481
1482         __get_cpu_var(netdev_rx_stat).total++;
1483         if (queue->input_pkt_queue.qlen <= netdev_max_backlog) {
1484                 if (queue->input_pkt_queue.qlen) {
1485 enqueue:
1486                         dev_hold(skb->dev);
1487                         __skb_queue_tail(&queue->input_pkt_queue, skb);
1488                         local_irq_restore(flags);
1489                         return NET_RX_SUCCESS;
1490                 }
1491
1492                 netif_rx_schedule(&queue->backlog_dev);
1493                 goto enqueue;
1494         }
1495
1496         __get_cpu_var(netdev_rx_stat).dropped++;
1497         local_irq_restore(flags);
1498
1499         kfree_skb(skb);
1500         return NET_RX_DROP;
1501 }
1502
/*
 * Variant of netif_rx() that also runs any softirqs pending afterwards,
 * all with preemption disabled. Returns what netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
	int rc;

	preempt_disable();
	rc = netif_rx(skb);
	if (local_softirq_pending())
		do_softirq();
	preempt_enable();

	return rc;
}

EXPORT_SYMBOL(netif_rx_ni);
1517
1518 static inline struct net_device *skb_bond(struct sk_buff *skb)
1519 {
1520         struct net_device *dev = skb->dev;
1521
1522         if (dev->master) {
1523                 /*
1524                  * On bonding slaves other than the currently active
1525                  * slave, suppress duplicates except for 802.3ad
1526                  * ETH_P_SLOW and alb non-mcast/bcast.
1527                  */
1528                 if (dev->priv_flags & IFF_SLAVE_INACTIVE) {
1529                         if (dev->master->priv_flags & IFF_MASTER_ALB) {
1530                                 if (skb->pkt_type != PACKET_BROADCAST &&
1531                                     skb->pkt_type != PACKET_MULTICAST)
1532                                         goto keep;
1533                         }
1534
1535                         if (dev->master->priv_flags & IFF_MASTER_8023AD &&
1536                             skb->protocol == __constant_htons(ETH_P_SLOW))
1537                                 goto keep;
1538
1539                         kfree_skb(skb);
1540                         return NULL;
1541                 }
1542 keep:
1543                 skb->dev = dev->master;
1544         }
1545
1546         return dev;
1547 }
1548
1549 static void net_tx_action(struct softirq_action *h)
1550 {
1551         struct softnet_data *sd = &__get_cpu_var(softnet_data);
1552
1553         if (sd->completion_queue) {
1554                 struct sk_buff *clist;
1555
1556                 local_irq_disable();
1557                 clist = sd->completion_queue;
1558                 sd->completion_queue = NULL;
1559                 local_irq_enable();
1560
1561                 while (clist) {
1562                         struct sk_buff *skb = clist;
1563                         clist = clist->next;
1564
1565                         BUG_TRAP(!atomic_read(&skb->users));
1566                         __kfree_skb(skb);
1567                 }
1568         }
1569
1570         if (sd->output_queue) {
1571                 struct net_device *head;
1572
1573                 local_irq_disable();
1574                 head = sd->output_queue;
1575                 sd->output_queue = NULL;
1576                 local_irq_enable();
1577
1578                 while (head) {
1579                         struct net_device *dev = head;
1580                         head = head->next_sched;
1581
1582                         smp_mb__before_clear_bit();
1583                         clear_bit(__LINK_STATE_SCHED, &dev->state);
1584
1585                         if (spin_trylock(&dev->queue_lock)) {
1586                                 qdisc_run(dev);
1587                                 spin_unlock(&dev->queue_lock);
1588                         } else {
1589                                 netif_schedule(dev);
1590                         }
1591                 }
1592         }
1593 }
1594
1595 static __inline__ int deliver_skb(struct sk_buff *skb,
1596                                   struct packet_type *pt_prev,
1597                                   struct net_device *orig_dev)
1598 {
1599         atomic_inc(&skb->users);
1600         return pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1601 }
1602
#if defined(CONFIG_BRIDGE) || defined (CONFIG_BRIDGE_MODULE)
/* Hooks for the bridge code; they are only declared here. */
int (*br_handle_frame_hook)(struct net_bridge_port *p, struct sk_buff **pskb);
struct net_bridge;
struct net_bridge_fdb_entry *(*br_fdb_get_hook)(struct net_bridge *br,
						unsigned char *addr);
void (*br_fdb_put_hook)(struct net_bridge_fdb_entry *ent);

/*
 * Pass a received frame to the bridge when its device is a bridge port.
 *
 * Returns 0 when the frame is not for the bridge (loopback packet or no
 * port on the device), otherwise the result of br_handle_frame_hook().
 * A handler still pending in *@pt_prev is delivered to first and then
 * cleared, with its result stored in *@ret.
 */
static __inline__ int handle_bridge(struct sk_buff **pskb,
				    struct packet_type **pt_prev, int *ret,
				    struct net_device *orig_dev)
{
	struct net_bridge_port *port;

	if ((*pskb)->pkt_type == PACKET_LOOPBACK)
		return 0;

	port = rcu_dereference((*pskb)->dev->br_port);
	if (port == NULL)
		return 0;

	if (*pt_prev) {
		*ret = deliver_skb(*pskb, *pt_prev, orig_dev);
		*pt_prev = NULL;
	}

	return br_handle_frame_hook(port, pskb);
}
#else
#define handle_bridge(skb, pt_prev, ret, orig_dev)      (0)
#endif
1630
#ifdef CONFIG_NET_CLS_ACT
/* TODO: Maybe we should just force sch_ingress to be compiled in
 * when CONFIG_NET_CLS_ACT is? otherwise some useless instructions
 * a compare and 2 stores extra right now if we dont have it on
 * but have CONFIG_NET_CLS_ACT
 * NOTE: This doesnt stop any functionality; if you dont have
 * the ingress scheduler, you just cant add policies on ingress.
 *
 */

/*
 * Run a received packet through the ingress qdisc of its device.
 *
 * Returns TC_ACT_OK when the device has no ingress qdisc, TC_ACT_SHOT
 * when the packet's redirect ttl already exceeds MAX_RED_LOOP, and
 * otherwise the verdict of the qdisc's enqueue method.
 */
static int ing_filter(struct sk_buff *skb)
{
	struct Qdisc *q;
	struct net_device *dev = skb->dev;
	int result = TC_ACT_OK;

	if (dev->qdisc_ingress) {
		__u32 ttl = (__u32) G_TC_RTTL(skb->tc_verd);
		if (MAX_RED_LOOP < ttl++) {
			/* Every looping packet ends up here, so keep the
			 * message rate limited and give it a log level.
			 */
			if (net_ratelimit())
				printk(KERN_WARNING "Redir loop detected Dropping packet (%s->%s)\n",
					skb->input_dev->name, skb->dev->name);
			return TC_ACT_SHOT;
		}

		skb->tc_verd = SET_TC_RTTL(skb->tc_verd,ttl);

		skb->tc_verd = SET_TC_AT(skb->tc_verd,AT_INGRESS);

		/* Re-read the qdisc under the lock before using it. */
		spin_lock(&dev->ingress_lock);
		if ((q = dev->qdisc_ingress) != NULL)
			result = q->enqueue(skb, q);
		spin_unlock(&dev->ingress_lock);

	}

	return result;
}
#endif
1668
1669 int netif_receive_skb(struct sk_buff *skb)
1670 {
1671         struct packet_type *ptype, *pt_prev;
1672         struct net_device *orig_dev;
1673         int ret = NET_RX_DROP;
1674         unsigned short type;
1675
1676         /* if we've gotten here through NAPI, check netpoll */
1677         if (skb->dev->poll && netpoll_rx(skb))
1678                 return NET_RX_DROP;
1679
1680         if (!skb->tstamp.off_sec)
1681                 net_timestamp(skb);
1682
1683         if (!skb->input_dev)
1684                 skb->input_dev = skb->dev;
1685
1686         orig_dev = skb_bond(skb);
1687
1688         if (!orig_dev)
1689                 return NET_RX_DROP;
1690
1691         __get_cpu_var(netdev_rx_stat).total++;
1692
1693         skb->h.raw = skb->nh.raw = skb->data;
1694         skb->mac_len = skb->nh.raw - skb->mac.raw;
1695
1696         pt_prev = NULL;
1697
1698         rcu_read_lock();
1699
1700 #ifdef CONFIG_NET_CLS_ACT
1701         if (skb->tc_verd & TC_NCLS) {
1702                 skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
1703                 goto ncls;
1704         }
1705 #endif
1706
1707         list_for_each_entry_rcu(ptype, &ptype_all, list) {
1708                 if (!ptype->dev || ptype->dev == skb->dev) {
1709                         if (pt_prev)
1710                                 ret = deliver_skb(skb, pt_prev, orig_dev);
1711                         pt_prev = ptype;
1712                 }
1713         }
1714
1715 #ifdef CONFIG_NET_CLS_ACT
1716         if (pt_prev) {
1717                 ret = deliver_skb(skb, pt_prev, orig_dev);
1718                 pt_prev = NULL; /* noone else should process this after*/
1719         } else {
1720                 skb->tc_verd = SET_TC_OK2MUNGE(skb->tc_verd);
1721         }
1722
1723         ret = ing_filter(skb);
1724
1725         if (ret == TC_ACT_SHOT || (ret == TC_ACT_STOLEN)) {
1726                 kfree_skb(skb);
1727                 goto out;
1728         }
1729
1730         skb->tc_verd = 0;
1731 ncls:
1732 #endif
1733
1734         handle_diverter(skb);
1735
1736         if (handle_bridge(&skb, &pt_prev, &ret, orig_dev))
1737                 goto out;
1738
1739         type = skb->protocol;
1740         list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type)&15], list) {
1741                 if (ptype->type == type &&
1742                     (!ptype->dev || ptype->dev == skb->dev)) {
1743                         if (pt_prev)
1744                                 ret = deliver_skb(skb, pt_prev, orig_dev);
1745                         pt_prev = ptype;
1746                 }
1747         }
1748
1749         if (pt_prev) {
1750                 ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
1751         } else {
1752                 kfree_skb(skb);
1753                 /* Jamal, now you will not able to escape explaining
1754                  * me how you were going to use this. :-)
1755                  */
1756                 ret = NET_RX_DROP;
1757         }
1758
1759 out:
1760         rcu_read_unlock();
1761         return ret;
1762 }
1763
1764 static int process_backlog(struct net_device *backlog_dev, int *budget)
1765 {
1766         int work = 0;
1767         int quota = min(backlog_dev->quota, *budget);
1768         struct softnet_data *queue = &__get_cpu_var(softnet_data);
1769         unsigned long start_time = jiffies;
1770
1771         backlog_dev->weight = weight_p;
1772         for (;;) {
1773                 struct sk_buff *skb;
1774                 struct net_device *dev;
1775
1776                 local_irq_disable();
1777                 skb = __skb_dequeue(&queue->input_pkt_queue);
1778                 if (!skb)
1779                         goto job_done;
1780                 local_irq_enable();
1781
1782                 dev = skb->dev;
1783
1784                 netif_receive_skb(skb);
1785
1786                 dev_put(dev);
1787
1788                 work++;
1789
1790                 if (work >= quota || jiffies - start_time > 1)
1791                         break;
1792
1793         }
1794
1795         backlog_dev->quota -= work;
1796         *budget -= work;
1797         return -1;
1798
1799 job_done:
1800         backlog_dev->quota -= work;
1801         *budget -= work;
1802
1803         list_del(&backlog_dev->poll_list);
1804         smp_mb__before_clear_bit();
1805         netif_poll_enable(backlog_dev);
1806
1807         local_irq_enable();
1808         return 0;
1809 }
1810
1811 static void net_rx_action(struct softirq_action *h)
1812 {
1813         struct softnet_data *queue = &__get_cpu_var(softnet_data);
1814         unsigned long start_time = jiffies;
1815         int budget = netdev_budget;
1816         void *have;
1817
1818         local_irq_disable();
1819
1820         while (!list_empty(&queue->poll_list)) {
1821                 struct net_device *dev;
1822
1823                 if (budget <= 0 || jiffies - start_time > 1)
1824                         goto softnet_break;
1825
1826                 local_irq_enable();
1827
1828                 dev = list_entry(queue->poll_list.next,
1829                                  struct net_device, poll_list);
1830                 have = netpoll_poll_lock(dev);
1831
1832                 if (dev->quota <= 0 || dev->poll(dev, &budget)) {
1833                         netpoll_poll_unlock(have);
1834                         local_irq_disable();
1835                         list_move_tail(&dev->poll_list, &queue->poll_list);
1836                         if (dev->quota < 0)
1837                                 dev->quota += dev->weight;
1838                         else
1839                                 dev->quota = dev->weight;
1840                 } else {
1841                         netpoll_poll_unlock(have);
1842                         dev_put(dev);
1843                         local_irq_disable();
1844                 }
1845         }
1846 out:
1847         local_irq_enable();
1848         return;
1849
1850 softnet_break:
1851         __get_cpu_var(netdev_rx_stat).time_squeeze++;
1852         __raise_softirq_irqoff(NET_RX_SOFTIRQ);
1853         goto out;
1854 }
1855
1856 static gifconf_func_t * gifconf_list [NPROTO];
1857
1858 /**
1859  *      register_gifconf        -       register a SIOCGIF handler
1860  *      @family: Address family
1861  *      @gifconf: Function handler
1862  *
1863  *      Register protocol dependent address dumping routines. The handler
1864  *      that is passed must not be freed or reused until it has been replaced
1865  *      by another handler.
1866  */
1867 int register_gifconf(unsigned int family, gifconf_func_t * gifconf)
1868 {
1869         if (family >= NPROTO)
1870                 return -EINVAL;
1871         gifconf_list[family] = gifconf;
1872         return 0;
1873 }
1874
1875
1876 /*
1877  *      Map an interface index to its name (SIOCGIFNAME)
1878  */
1879
1880 /*
1881  *      We need this ioctl for efficient implementation of the
1882  *      if_indextoname() function required by the IPv6 API.  Without
1883  *      it, we would have to search all the interfaces to find a
1884  *      match.  --pb
1885  */
1886
1887 static int dev_ifname(struct ifreq __user *arg)
1888 {
1889         struct net_device *dev;
1890         struct ifreq ifr;
1891
1892         /*
1893          *      Fetch the caller's info block.
1894          */
1895
1896         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
1897                 return -EFAULT;
1898
1899         read_lock(&dev_base_lock);
1900         dev = __dev_get_by_index(ifr.ifr_ifindex);
1901         if (!dev) {
1902                 read_unlock(&dev_base_lock);
1903                 return -ENODEV;
1904         }
1905
1906         strcpy(ifr.ifr_name, dev->name);
1907         read_unlock(&dev_base_lock);
1908
1909         if (copy_to_user(arg, &ifr, sizeof(struct ifreq)))
1910                 return -EFAULT;
1911         return 0;
1912 }
1913
1914 /*
1915  *      Perform a SIOCGIFCONF call. This structure will change
1916  *      size eventually, and there is nothing I can do about it.
1917  *      Thus we will need a 'compatibility mode'.
1918  */
1919
1920 static int dev_ifconf(char __user *arg)
1921 {
1922         struct ifconf ifc;
1923         struct net_device *dev;
1924         char __user *pos;
1925         int len;
1926         int total;
1927         int i;
1928
1929         /*
1930          *      Fetch the caller's info block.
1931          */
1932
1933         if (copy_from_user(&ifc, arg, sizeof(struct ifconf)))
1934                 return -EFAULT;
1935
1936         pos = ifc.ifc_buf;
1937         len = ifc.ifc_len;
1938
1939         /*
1940          *      Loop over the interfaces, and write an info block for each.
1941          */
1942
1943         total = 0;
1944         for (dev = dev_base; dev; dev = dev->next) {
1945                 for (i = 0; i < NPROTO; i++) {
1946                         if (gifconf_list[i]) {
1947                                 int done;
1948                                 if (!pos)
1949                                         done = gifconf_list[i](dev, NULL, 0);
1950                                 else
1951                                         done = gifconf_list[i](dev, pos + total,
1952                                                                len - total);
1953                                 if (done < 0)
1954                                         return -EFAULT;
1955                                 total += done;
1956                         }
1957                 }
1958         }
1959
1960         /*
1961          *      All done.  Write the updated control block back to the caller.
1962          */
1963         ifc.ifc_len = total;
1964
1965         /*
1966          *      Both BSD and Solaris return 0 here, so we do too.
1967          */
1968         return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0;
1969 }
1970
1971 #ifdef CONFIG_PROC_FS
1972 /*
1973  *      This is invoked by the /proc filesystem handler to display a device
1974  *      in detail.
1975  */
1976 static __inline__ struct net_device *dev_get_idx(loff_t pos)
1977 {
1978         struct net_device *dev;
1979         loff_t i;
1980
1981         for (i = 0, dev = dev_base; dev && i < pos; ++i, dev = dev->next);
1982
1983         return i == pos ? dev : NULL;
1984 }
1985
1986 void *dev_seq_start(struct seq_file *seq, loff_t *pos)
1987 {
1988         read_lock(&dev_base_lock);
1989         return *pos ? dev_get_idx(*pos - 1) : SEQ_START_TOKEN;
1990 }
1991
1992 void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1993 {
1994         ++*pos;
1995         return v == SEQ_START_TOKEN ? dev_base : ((struct net_device *)v)->next;
1996 }
1997
1998 void dev_seq_stop(struct seq_file *seq, void *v)
1999 {
2000         read_unlock(&dev_base_lock);
2001 }
2002
2003 static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev)
2004 {
2005         if (dev->get_stats) {
2006                 struct net_device_stats *stats = dev->get_stats(dev);
2007
2008                 seq_printf(seq, "%6s:%8lu %7lu %4lu %4lu %4lu %5lu %10lu %9lu "
2009                                 "%8lu %7lu %4lu %4lu %4lu %5lu %7lu %10lu\n",
2010                            dev->name, stats->rx_bytes, stats->rx_packets,
2011                            stats->rx_errors,
2012                            stats->rx_dropped + stats->rx_missed_errors,
2013                            stats->rx_fifo_errors,
2014                            stats->rx_length_errors + stats->rx_over_errors +
2015                              stats->rx_crc_errors + stats->rx_frame_errors,
2016                            stats->rx_compressed, stats->multicast,
2017                            stats->tx_bytes, stats->tx_packets,
2018                            stats->tx_errors, stats->tx_dropped,
2019                            stats->tx_fifo_errors, stats->collisions,
2020                            stats->tx_carrier_errors +
2021                              stats->tx_aborted_errors +
2022                              stats->tx_window_errors +
2023                              stats->tx_heartbeat_errors,
2024                            stats->tx_compressed);
2025         } else
2026                 seq_printf(seq, "%6s: No statistics available.\n", dev->name);
2027 }
2028
2029 /*
2030  *      Called from the PROCfs module. This now uses the new arbitrary sized
2031  *      /proc/net interface to create /proc/net/dev
2032  */
2033 static int dev_seq_show(struct seq_file *seq, void *v)
2034 {
2035         if (v == SEQ_START_TOKEN)
2036                 seq_puts(seq, "Inter-|   Receive                            "
2037                               "                    |  Transmit\n"
2038                               " face |bytes    packets errs drop fifo frame "
2039                               "compressed multicast|bytes    packets errs "
2040                               "drop fifo colls carrier compressed\n");
2041         else
2042                 dev_seq_printf_stats(seq, v);
2043         return 0;
2044 }
2045
2046 static struct netif_rx_stats *softnet_get_online(loff_t *pos)
2047 {
2048         struct netif_rx_stats *rc = NULL;
2049
2050         while (*pos < NR_CPUS)
2051                 if (cpu_online(*pos)) {
2052                         rc = &per_cpu(netdev_rx_stat, *pos);
2053                         break;
2054                 } else
2055                         ++*pos;
2056         return rc;
2057 }
2058
2059 static void *softnet_seq_start(struct seq_file *seq, loff_t *pos)
2060 {
2061         return softnet_get_online(pos);
2062 }
2063
2064 static void *softnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2065 {
2066         ++*pos;
2067         return softnet_get_online(pos);
2068 }
2069
/* seq_file ->stop: intentionally empty; the start/next callbacks take no lock. */
static void softnet_seq_stop(struct seq_file *seq, void *v)
{
}
2073
2074 static int softnet_seq_show(struct seq_file *seq, void *v)
2075 {
2076         struct netif_rx_stats *s = v;
2077
2078         seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
2079                    s->total, s->dropped, s->time_squeeze, 0,
2080                    0, 0, 0, 0, /* was fastroute */
2081                    s->cpu_collision );
2082         return 0;
2083 }
2084
2085 static struct seq_operations dev_seq_ops = {
2086         .start = dev_seq_start,
2087         .next  = dev_seq_next,
2088         .stop  = dev_seq_stop,
2089         .show  = dev_seq_show,
2090 };
2091
2092 static int dev_seq_open(struct inode *inode, struct file *file)
2093 {
2094         return seq_open(file, &dev_seq_ops);
2095 }
2096
2097 static struct file_operations dev_seq_fops = {
2098         .owner   = THIS_MODULE,
2099         .open    = dev_seq_open,
2100         .read    = seq_read,
2101         .llseek  = seq_lseek,
2102         .release = seq_release,
2103 };
2104
2105 static struct seq_operations softnet_seq_ops = {
2106         .start = softnet_seq_start,
2107         .next  = softnet_seq_next,
2108         .stop  = softnet_seq_stop,
2109         .show  = softnet_seq_show,
2110 };
2111
2112 static int softnet_seq_open(struct inode *inode, struct file *file)
2113 {
2114         return seq_open(file, &softnet_seq_ops);
2115 }
2116
2117 static struct file_operations softnet_seq_fops = {
2118         .owner   = THIS_MODULE,
2119         .open    = softnet_seq_open,
2120         .read    = seq_read,
2121         .llseek  = seq_lseek,
2122         .release = seq_release,
2123 };
2124
2125 #ifdef CONFIG_WIRELESS_EXT
2126 extern int wireless_proc_init(void);
2127 #else
2128 #define wireless_proc_init() 0
2129 #endif
2130
2131 static int __init dev_proc_init(void)
2132 {
2133         int rc = -ENOMEM;
2134
2135         if (!proc_net_fops_create("dev", S_IRUGO, &dev_seq_fops))
2136                 goto out;
2137         if (!proc_net_fops_create("softnet_stat", S_IRUGO, &softnet_seq_fops))
2138                 goto out_dev;
2139         if (wireless_proc_init())
2140                 goto out_softnet;
2141         rc = 0;
2142 out:
2143         return rc;
2144 out_softnet:
2145         proc_net_remove("softnet_stat");
2146 out_dev:
2147         proc_net_remove("dev");
2148         goto out;
2149 }
2150 #else
2151 #define dev_proc_init() 0
2152 #endif  /* CONFIG_PROC_FS */
2153
2154
2155 /**
2156  *      netdev_set_master       -       set up master/slave pair
2157  *      @slave: slave device
2158  *      @master: new master device
2159  *
2160  *      Changes the master device of the slave. Pass %NULL to break the
2161  *      bonding. The caller must hold the RTNL semaphore. On a failure
2162  *      a negative errno code is returned. On success the reference counts
2163  *      are adjusted, %RTM_NEWLINK is sent to the routing socket and the
2164  *      function returns zero.
2165  */
2166 int netdev_set_master(struct net_device *slave, struct net_device *master)
2167 {
2168         struct net_device *old = slave->master;
2169
2170         ASSERT_RTNL();
2171
2172         if (master) {
2173                 if (old)
2174                         return -EBUSY;
2175                 dev_hold(master);
2176         }
2177
2178         slave->master = master;
2179
2180         synchronize_net();
2181
2182         if (old)
2183                 dev_put(old);
2184
2185         if (master)
2186                 slave->flags |= IFF_SLAVE;
2187         else
2188                 slave->flags &= ~IFF_SLAVE;
2189
2190         rtmsg_ifinfo(RTM_NEWLINK, slave, IFF_SLAVE);
2191         return 0;
2192 }
2193
2194 /**
2195  *      dev_set_promiscuity     - update promiscuity count on a device
2196  *      @dev: device
2197  *      @inc: modifier
2198  *
2199  *      Add or remove promsicuity from a device. While the count in the device
2200  *      remains above zero the interface remains promiscuous. Once it hits zero
2201  *      the device reverts back to normal filtering operation. A negative inc
2202  *      value is used to drop promiscuity on the device.
2203  */
2204 void dev_set_promiscuity(struct net_device *dev, int inc)
2205 {
2206         unsigned short old_flags = dev->flags;
2207
2208         if ((dev->promiscuity += inc) == 0)
2209                 dev->flags &= ~IFF_PROMISC;
2210         else
2211                 dev->flags |= IFF_PROMISC;
2212         if (dev->flags != old_flags) {
2213                 dev_mc_upload(dev);
2214                 printk(KERN_INFO "device %s %s promiscuous mode\n",
2215                        dev->name, (dev->flags & IFF_PROMISC) ? "entered" :
2216                                                                "left");
2217                 audit_log(current->audit_context, GFP_ATOMIC,
2218                         AUDIT_ANOM_PROMISCUOUS,
2219                         "dev=%s prom=%d old_prom=%d auid=%u",
2220                         dev->name, (dev->flags & IFF_PROMISC),
2221                         (old_flags & IFF_PROMISC),
2222                         audit_get_loginuid(current->audit_context));
2223         }
2224 }
2225
2226 /**
2227  *      dev_set_allmulti        - update allmulti count on a device
2228  *      @dev: device
2229  *      @inc: modifier
2230  *
2231  *      Add or remove reception of all multicast frames to a device. While the
2232  *      count in the device remains above zero the interface remains listening
2233  *      to all interfaces. Once it hits zero the device reverts back to normal
2234  *      filtering operation. A negative @inc value is used to drop the counter
2235  *      when releasing a resource needing all multicasts.
2236  */
2237
2238 void dev_set_allmulti(struct net_device *dev, int inc)
2239 {
2240         unsigned short old_flags = dev->flags;
2241
2242         dev->flags |= IFF_ALLMULTI;
2243         if ((dev->allmulti += inc) == 0)
2244                 dev->flags &= ~IFF_ALLMULTI;
2245         if (dev->flags ^ old_flags)
2246                 dev_mc_upload(dev);
2247 }
2248
2249 unsigned dev_get_flags(const struct net_device *dev)
2250 {
2251         unsigned flags;
2252
2253         flags = (dev->flags & ~(IFF_PROMISC |
2254                                 IFF_ALLMULTI |
2255                                 IFF_RUNNING |
2256                                 IFF_LOWER_UP |
2257                                 IFF_DORMANT)) |
2258                 (dev->gflags & (IFF_PROMISC |
2259                                 IFF_ALLMULTI));
2260
2261         if (netif_running(dev)) {
2262                 if (netif_oper_up(dev))
2263                         flags |= IFF_RUNNING;
2264                 if (netif_carrier_ok(dev))
2265                         flags |= IFF_LOWER_UP;
2266                 if (netif_dormant(dev))
2267                         flags |= IFF_DORMANT;
2268         }
2269
2270         return flags;
2271 }
2272
2273 int dev_change_flags(struct net_device *dev, unsigned flags)
2274 {
2275         int ret;
2276         int old_flags = dev->flags;
2277
2278         /*
2279          *      Set the flags on our device.
2280          */
2281
2282         dev->flags = (flags & (IFF_DEBUG | IFF_NOTRAILERS | IFF_NOARP |
2283                                IFF_DYNAMIC | IFF_MULTICAST | IFF_PORTSEL |
2284                                IFF_AUTOMEDIA)) |
2285                      (dev->flags & (IFF_UP | IFF_VOLATILE | IFF_PROMISC |
2286                                     IFF_ALLMULTI));
2287
2288         /*
2289          *      Load in the correct multicast list now the flags have changed.
2290          */
2291
2292         dev_mc_upload(dev);
2293
2294         /*
2295          *      Have we downed the interface. We handle IFF_UP ourselves
2296          *      according to user attempts to set it, rather than blindly
2297          *      setting it.
2298          */
2299
2300         ret = 0;
2301         if ((old_flags ^ flags) & IFF_UP) {     /* Bit is different  ? */
2302                 ret = ((old_flags & IFF_UP) ? dev_close : dev_open)(dev);
2303
2304                 if (!ret)
2305                         dev_mc_upload(dev);
2306         }
2307
2308         if (dev->flags & IFF_UP &&
2309             ((old_flags ^ dev->flags) &~ (IFF_UP | IFF_PROMISC | IFF_ALLMULTI |
2310                                           IFF_VOLATILE)))
2311                 blocking_notifier_call_chain(&netdev_chain,
2312                                 NETDEV_CHANGE, dev);
2313
2314         if ((flags ^ dev->gflags) & IFF_PROMISC) {
2315                 int inc = (flags & IFF_PROMISC) ? +1 : -1;
2316                 dev->gflags ^= IFF_PROMISC;
2317                 dev_set_promiscuity(dev, inc);
2318         }
2319
2320         /* NOTE: order of synchronization of IFF_PROMISC and IFF_ALLMULTI
2321            is important. Some (broken) drivers set IFF_PROMISC, when
2322            IFF_ALLMULTI is requested not asking us and not reporting.
2323          */
2324         if ((flags ^ dev->gflags) & IFF_ALLMULTI) {
2325                 int inc = (flags & IFF_ALLMULTI) ? +1 : -1;
2326                 dev->gflags ^= IFF_ALLMULTI;
2327                 dev_set_allmulti(dev, inc);
2328         }
2329
2330         if (old_flags ^ dev->flags)
2331                 rtmsg_ifinfo(RTM_NEWLINK, dev, old_flags ^ dev->flags);
2332
2333         return ret;
2334 }
2335
2336 int dev_set_mtu(struct net_device *dev, int new_mtu)
2337 {
2338         int err;
2339
2340         if (new_mtu == dev->mtu)
2341                 return 0;
2342
2343         /*      MTU must be positive.    */
2344         if (new_mtu < 0)
2345                 return -EINVAL;
2346
2347         if (!netif_device_present(dev))
2348                 return -ENODEV;
2349
2350         err = 0;
2351         if (dev->change_mtu)
2352                 err = dev->change_mtu(dev, new_mtu);
2353         else
2354                 dev->mtu = new_mtu;
2355         if (!err && dev->flags & IFF_UP)
2356                 blocking_notifier_call_chain(&netdev_chain,
2357                                 NETDEV_CHANGEMTU, dev);
2358         return err;
2359 }
2360
2361 int dev_set_mac_address(struct net_device *dev, struct sockaddr *sa)
2362 {
2363         int err;
2364
2365         if (!dev->set_mac_address)
2366                 return -EOPNOTSUPP;
2367         if (sa->sa_family != dev->type)
2368                 return -EINVAL;
2369         if (!netif_device_present(dev))
2370                 return -ENODEV;
2371         err = dev->set_mac_address(dev, sa);
2372         if (!err)
2373                 blocking_notifier_call_chain(&netdev_chain,
2374                                 NETDEV_CHANGEADDR, dev);
2375         return err;
2376 }
2377
2378 /*
2379  *      Perform the SIOCxIFxxx calls.
2380  */
2381 static int dev_ifsioc(struct ifreq *ifr, unsigned int cmd)
2382 {
2383         int err;
2384         struct net_device *dev = __dev_get_by_name(ifr->ifr_name);
2385
2386         if (!dev)
2387                 return -ENODEV;
2388
2389         switch (cmd) {
2390                 case SIOCGIFFLAGS:      /* Get interface flags */
2391                         ifr->ifr_flags = dev_get_flags(dev);
2392                         return 0;
2393
2394                 case SIOCSIFFLAGS:      /* Set interface flags */
2395                         return dev_change_flags(dev, ifr->ifr_flags);
2396
2397                 case SIOCGIFMETRIC:     /* Get the metric on the interface
2398                                            (currently unused) */
2399                         ifr->ifr_metric = 0;
2400                         return 0;
2401
2402                 case SIOCSIFMETRIC:     /* Set the metric on the interface
2403                                            (currently unused) */
2404                         return -EOPNOTSUPP;
2405
2406                 case SIOCGIFMTU:        /* Get the MTU of a device */
2407                         ifr->ifr_mtu = dev->mtu;
2408                         return 0;
2409
2410                 case SIOCSIFMTU:        /* Set the MTU of a device */
2411                         return dev_set_mtu(dev, ifr->ifr_mtu);
2412
2413                 case SIOCGIFHWADDR:
2414                         if (!dev->addr_len)
2415                                 memset(ifr->ifr_hwaddr.sa_data, 0, sizeof ifr->ifr_hwaddr.sa_data);
2416                         else
2417                                 memcpy(ifr->ifr_hwaddr.sa_data, dev->dev_addr,
2418                                        min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2419                         ifr->ifr_hwaddr.sa_family = dev->type;
2420                         return 0;
2421
2422                 case SIOCSIFHWADDR:
2423                         return dev_set_mac_address(dev, &ifr->ifr_hwaddr);
2424
2425                 case SIOCSIFHWBROADCAST:
2426                         if (ifr->ifr_hwaddr.sa_family != dev->type)
2427                                 return -EINVAL;
2428                         memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data,
2429                                min(sizeof ifr->ifr_hwaddr.sa_data, (size_t) dev->addr_len));
2430                         blocking_notifier_call_chain(&netdev_chain,
2431                                             NETDEV_CHANGEADDR, dev);
2432                         return 0;
2433
2434                 case SIOCGIFMAP:
2435                         ifr->ifr_map.mem_start = dev->mem_start;
2436                         ifr->ifr_map.mem_end   = dev->mem_end;
2437                         ifr->ifr_map.base_addr = dev->base_addr;
2438                         ifr->ifr_map.irq       = dev->irq;
2439                         ifr->ifr_map.dma       = dev->dma;
2440                         ifr->ifr_map.port      = dev->if_port;
2441                         return 0;
2442
2443                 case SIOCSIFMAP:
2444                         if (dev->set_config) {
2445                                 if (!netif_device_present(dev))
2446                                         return -ENODEV;
2447                                 return dev->set_config(dev, &ifr->ifr_map);
2448                         }
2449                         return -EOPNOTSUPP;
2450
2451                 case SIOCADDMULTI:
2452                         if (!dev->set_multicast_list ||
2453                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2454                                 return -EINVAL;
2455                         if (!netif_device_present(dev))
2456                                 return -ENODEV;
2457                         return dev_mc_add(dev, ifr->ifr_hwaddr.sa_data,
2458                                           dev->addr_len, 1);
2459
2460                 case SIOCDELMULTI:
2461                         if (!dev->set_multicast_list ||
2462                             ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
2463                                 return -EINVAL;
2464                         if (!netif_device_present(dev))
2465                                 return -ENODEV;
2466                         return dev_mc_delete(dev, ifr->ifr_hwaddr.sa_data,
2467                                              dev->addr_len, 1);
2468
2469                 case SIOCGIFINDEX:
2470                         ifr->ifr_ifindex = dev->ifindex;
2471                         return 0;
2472
2473                 case SIOCGIFTXQLEN:
2474                         ifr->ifr_qlen = dev->tx_queue_len;
2475                         return 0;
2476
2477                 case SIOCSIFTXQLEN:
2478                         if (ifr->ifr_qlen < 0)
2479                                 return -EINVAL;
2480                         dev->tx_queue_len = ifr->ifr_qlen;
2481                         return 0;
2482
2483                 case SIOCSIFNAME:
2484                         ifr->ifr_newname[IFNAMSIZ-1] = '\0';
2485                         return dev_change_name(dev, ifr->ifr_newname);
2486
2487                 /*
2488                  *      Unknown or private ioctl
2489                  */
2490
2491                 default:
2492                         if ((cmd >= SIOCDEVPRIVATE &&
2493                             cmd <= SIOCDEVPRIVATE + 15) ||
2494                             cmd == SIOCBONDENSLAVE ||
2495                             cmd == SIOCBONDRELEASE ||
2496                             cmd == SIOCBONDSETHWADDR ||
2497                             cmd == SIOCBONDSLAVEINFOQUERY ||
2498                             cmd == SIOCBONDINFOQUERY ||
2499                             cmd == SIOCBONDCHANGEACTIVE ||
2500                             cmd == SIOCGMIIPHY ||
2501                             cmd == SIOCGMIIREG ||
2502                             cmd == SIOCSMIIREG ||
2503                             cmd == SIOCBRADDIF ||
2504                             cmd == SIOCBRDELIF ||
2505                             cmd == SIOCWANDEV) {
2506                                 err = -EOPNOTSUPP;
2507                                 if (dev->do_ioctl) {
2508                                         if (netif_device_present(dev))
2509                                                 err = dev->do_ioctl(dev, ifr,
2510                                                                     cmd);
2511                                         else
2512                                                 err = -ENODEV;
2513                                 }
2514                         } else
2515                                 err = -EINVAL;
2516
2517         }
2518         return err;
2519 }
2520
2521 /*
2522  *      This function handles all "interface"-type I/O control requests. The actual
2523  *      'doing' part of this is dev_ifsioc above.
2524  */
2525
2526 /**
2527  *      dev_ioctl       -       network device ioctl
2528  *      @cmd: command to issue
2529  *      @arg: pointer to a struct ifreq in user space
2530  *
2531  *      Issue ioctl functions to devices. This is normally called by the
2532  *      user space syscall interfaces but can sometimes be useful for
2533  *      other purposes. The return value is the return from the syscall if
2534  *      positive or a negative errno code on error.
2535  */
2536
2537 int dev_ioctl(unsigned int cmd, void __user *arg)
2538 {
2539         struct ifreq ifr;
2540         int ret;
2541         char *colon;
2542
2543         /* One special case: SIOCGIFCONF takes ifconf argument
2544            and requires shared lock, because it sleeps writing
2545            to user space.
2546          */
2547
2548         if (cmd == SIOCGIFCONF) {
2549                 rtnl_lock();
2550                 ret = dev_ifconf((char __user *) arg);
2551                 rtnl_unlock();
2552                 return ret;
2553         }
2554         if (cmd == SIOCGIFNAME)
2555                 return dev_ifname((struct ifreq __user *)arg);
2556
2557         if (copy_from_user(&ifr, arg, sizeof(struct ifreq)))
2558                 return -EFAULT;
2559
2560         ifr.ifr_name[IFNAMSIZ-1] = 0;
2561
2562         colon = strchr(ifr.ifr_name, ':');
2563         if (colon)
2564                 *colon = 0;
2565
2566         /*
2567          *      See which interface the caller is talking about.
2568          */
2569
2570         switch (cmd) {
2571                 /*
2572                  *      These ioctl calls:
2573                  *      - can be done by all.
2574                  *      - atomic and do not require locking.
2575                  *      - return a value
2576                  */
2577                 case SIOCGIFFLAGS:
2578                 case SIOCGIFMETRIC:
2579                 case SIOCGIFMTU:
2580                 case SIOCGIFHWADDR:
2581                 case SIOCGIFSLAVE:
2582                 case SIOCGIFMAP:
2583                 case SIOCGIFINDEX:
2584                 case SIOCGIFTXQLEN:
2585                         dev_load(ifr.ifr_name);
2586                         read_lock(&dev_base_lock);
2587                         ret = dev_ifsioc(&ifr, cmd);
2588                         read_unlock(&dev_base_lock);
2589                         if (!ret) {
2590                                 if (colon)
2591                                         *colon = ':';
2592                                 if (copy_to_user(arg, &ifr,
2593                                                  sizeof(struct ifreq)))
2594                                         ret = -EFAULT;
2595                         }
2596                         return ret;
2597
2598                 case SIOCETHTOOL:
2599                         dev_load(ifr.ifr_name);
2600                         rtnl_lock();
2601                         ret = dev_ethtool(&ifr);
2602                         rtnl_unlock();
2603                         if (!ret) {
2604                                 if (colon)
2605                                         *colon = ':';
2606                                 if (copy_to_user(arg, &ifr,
2607                                                  sizeof(struct ifreq)))
2608                                         ret = -EFAULT;
2609                         }
2610                         return ret;
2611
2612                 /*
2613                  *      These ioctl calls:
2614                  *      - require superuser power.
2615                  *      - require strict serialization.
2616                  *      - return a value
2617                  */
2618                 case SIOCGMIIPHY:
2619                 case SIOCGMIIREG:
2620                 case SIOCSIFNAME:
2621                         if (!capable(CAP_NET_ADMIN))
2622                                 return -EPERM;
2623                         dev_load(ifr.ifr_name);
2624                         rtnl_lock();
2625                         ret = dev_ifsioc(&ifr, cmd);
2626                         rtnl_unlock();
2627                         if (!ret) {
2628                                 if (colon)
2629                                         *colon = ':';
2630                                 if (copy_to_user(arg, &ifr,
2631                                                  sizeof(struct ifreq)))
2632                                         ret = -EFAULT;
2633                         }
2634                         return ret;
2635
2636                 /*
2637                  *      These ioctl calls:
2638                  *      - require superuser power.
2639                  *      - require strict serialization.
2640                  *      - do not return a value
2641                  */
2642                 case SIOCSIFFLAGS:
2643                 case SIOCSIFMETRIC:
2644                 case SIOCSIFMTU:
2645                 case SIOCSIFMAP:
2646                 case SIOCSIFHWADDR:
2647                 case SIOCSIFSLAVE:
2648                 case SIOCADDMULTI:
2649                 case SIOCDELMULTI:
2650                 case SIOCSIFHWBROADCAST:
2651                 case SIOCSIFTXQLEN:
2652                 case SIOCSMIIREG:
2653                 case SIOCBONDENSLAVE:
2654                 case SIOCBONDRELEASE:
2655                 case SIOCBONDSETHWADDR:
2656                 case SIOCBONDCHANGEACTIVE:
2657                 case SIOCBRADDIF:
2658                 case SIOCBRDELIF:
2659                         if (!capable(CAP_NET_ADMIN))
2660                                 return -EPERM;
2661                         /* fall through */
2662                 case SIOCBONDSLAVEINFOQUERY:
2663                 case SIOCBONDINFOQUERY:
2664                         dev_load(ifr.ifr_name);
2665                         rtnl_lock();
2666                         ret = dev_ifsioc(&ifr, cmd);
2667                         rtnl_unlock();
2668                         return ret;
2669
2670                 case SIOCGIFMEM:
2671                         /* Get the per device memory space. We can add this but
2672                          * currently do not support it */
2673                 case SIOCSIFMEM:
2674                         /* Set the per device memory buffer space.
2675                          * Not applicable in our case */
2676                 case SIOCSIFLINK:
2677                         return -EINVAL;
2678
2679                 /*
2680                  *      Unknown or private ioctl.
2681                  */
2682                 default:
2683                         if (cmd == SIOCWANDEV ||
2684                             (cmd >= SIOCDEVPRIVATE &&
2685                              cmd <= SIOCDEVPRIVATE + 15)) {
2686                                 dev_load(ifr.ifr_name);
2687                                 rtnl_lock();
2688                                 ret = dev_ifsioc(&ifr, cmd);
2689                                 rtnl_unlock();
2690                                 if (!ret && copy_to_user(arg, &ifr,
2691                                                          sizeof(struct ifreq)))
2692                                         ret = -EFAULT;
2693                                 return ret;
2694                         }
2695 #ifdef CONFIG_WIRELESS_EXT
2696                         /* Take care of Wireless Extensions */
2697                         if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) {
2698                                 /* If command is `set a parameter', or
2699                                  * `get the encoding parameters', check if
2700                                  * the user has the right to do it */
2701                                 if (IW_IS_SET(cmd) || cmd == SIOCGIWENCODE) {
2702                                         if (!capable(CAP_NET_ADMIN))
2703                                                 return -EPERM;
2704                                 }
2705                                 dev_load(ifr.ifr_name);
2706                                 rtnl_lock();
2707                                 /* Follow me in net/core/wireless.c */
2708                                 ret = wireless_process_ioctl(&ifr, cmd);
2709                                 rtnl_unlock();
2710                                 if (IW_IS_GET(cmd) &&
2711                                     copy_to_user(arg, &ifr,
2712                                                  sizeof(struct ifreq)))
2713                                         ret = -EFAULT;
2714                                 return ret;
2715                         }
2716 #endif  /* CONFIG_WIRELESS_EXT */
2717                         return -EINVAL;
2718         }
2719 }
2720
2721
/**
 *	dev_new_index	-	allocate an ifindex
 *
 *	Returns a suitable unique value for a new device interface
 *	number.  The caller must hold the rtnl semaphore or the
 *	dev_base_lock to be sure it remains unique.
 *
 *	The candidate is kept in a function-local static counter, which
 *	is advanced (restarting at 1 whenever it is no longer positive)
 *	until __dev_get_by_index() finds no device using it.
 */
static int dev_new_index(void)
{
	static int ifindex;

	do {
		ifindex++;
		if (ifindex <= 0)
			ifindex = 1;
	} while (__dev_get_by_index(ifindex));

	return ifindex;
}
2739
2740 static int dev_boot_phase = 1;
2741
2742 /* Delayed registration/unregisteration */
2743 static DEFINE_SPINLOCK(net_todo_list_lock);
2744 static struct list_head net_todo_list = LIST_HEAD_INIT(net_todo_list);
2745
2746 static inline void net_set_todo(struct net_device *dev)
2747 {
2748         spin_lock(&net_todo_list_lock);
2749         list_add_tail(&dev->todo_list, &net_todo_list);
2750         spin_unlock(&net_todo_list_lock);
2751 }
2752
2753 /**
2754  *      register_netdevice      - register a network device
2755  *      @dev: device to register
2756  *
2757  *      Take a completed network device structure and add it to the kernel
2758  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2759  *      chain. 0 is returned on success. A negative errno code is returned
2760  *      on a failure to set up the device, or if the name is a duplicate.
2761  *
2762  *      Callers must hold the rtnl semaphore. You may want
2763  *      register_netdev() instead of this.
2764  *
2765  *      BUGS:
2766  *      The locking appears insufficient to guarantee two parallel registers
2767  *      will not get the same name.
2768  */
2769
2770 int register_netdevice(struct net_device *dev)
2771 {
2772         struct hlist_head *head;
2773         struct hlist_node *p;
2774         int ret;
2775
2776         BUG_ON(dev_boot_phase);
2777         ASSERT_RTNL();
2778
2779         /* When net_device's are persistent, this will be fatal. */
2780         BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
2781
2782         spin_lock_init(&dev->queue_lock);
2783         spin_lock_init(&dev->xmit_lock);
2784         dev->xmit_lock_owner = -1;
2785 #ifdef CONFIG_NET_CLS_ACT
2786         spin_lock_init(&dev->ingress_lock);
2787 #endif
2788
2789         ret = alloc_divert_blk(dev);
2790         if (ret)
2791                 goto out;
2792
2793         dev->iflink = -1;
2794
2795         /* Init, if this function is available */
2796         if (dev->init) {
2797                 ret = dev->init(dev);
2798                 if (ret) {
2799                         if (ret > 0)
2800                                 ret = -EIO;
2801                         goto out_err;
2802                 }
2803         }
2804
2805         if (!dev_valid_name(dev->name)) {
2806                 ret = -EINVAL;
2807                 goto out_err;
2808         }
2809
2810         dev->ifindex = dev_new_index();
2811         if (dev->iflink == -1)
2812                 dev->iflink = dev->ifindex;
2813
2814         /* Check for existence of name */
2815         head = dev_name_hash(dev->name);
2816         hlist_for_each(p, head) {
2817                 struct net_device *d
2818                         = hlist_entry(p, struct net_device, name_hlist);
2819                 if (!strncmp(d->name, dev->name, IFNAMSIZ)) {
2820                         ret = -EEXIST;
2821                         goto out_err;
2822                 }
2823         }
2824
2825         /* Fix illegal SG+CSUM combinations. */
2826         if ((dev->features & NETIF_F_SG) &&
2827             !(dev->features & (NETIF_F_IP_CSUM |
2828                                NETIF_F_NO_CSUM |
2829                                NETIF_F_HW_CSUM))) {
2830                 printk("%s: Dropping NETIF_F_SG since no checksum feature.\n",
2831                        dev->name);
2832                 dev->features &= ~NETIF_F_SG;
2833         }
2834
2835         /* TSO requires that SG is present as well. */
2836         if ((dev->features & NETIF_F_TSO) &&
2837             !(dev->features & NETIF_F_SG)) {
2838                 printk("%s: Dropping NETIF_F_TSO since no SG feature.\n",
2839                        dev->name);
2840                 dev->features &= ~NETIF_F_TSO;
2841         }
2842         if (dev->features & NETIF_F_UFO) {
2843                 if (!(dev->features & NETIF_F_HW_CSUM)) {
2844                         printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
2845                                         "NETIF_F_HW_CSUM feature.\n",
2846                                                         dev->name);
2847                         dev->features &= ~NETIF_F_UFO;
2848                 }
2849                 if (!(dev->features & NETIF_F_SG)) {
2850                         printk(KERN_ERR "%s: Dropping NETIF_F_UFO since no "
2851                                         "NETIF_F_SG feature.\n",
2852                                         dev->name);
2853                         dev->features &= ~NETIF_F_UFO;
2854                 }
2855         }
2856
2857         /*
2858          *      nil rebuild_header routine,
2859          *      that should be never called and used as just bug trap.
2860          */
2861
2862         if (!dev->rebuild_header)
2863                 dev->rebuild_header = default_rebuild_header;
2864
2865         /*
2866          *      Default initial state at registry is that the
2867          *      device is present.
2868          */
2869
2870         set_bit(__LINK_STATE_PRESENT, &dev->state);
2871
2872         dev->next = NULL;
2873         dev_init_scheduler(dev);
2874         write_lock_bh(&dev_base_lock);
2875         *dev_tail = dev;
2876         dev_tail = &dev->next;
2877         hlist_add_head(&dev->name_hlist, head);
2878         hlist_add_head(&dev->index_hlist, dev_index_hash(dev->ifindex));
2879         dev_hold(dev);
2880         dev->reg_state = NETREG_REGISTERING;
2881         write_unlock_bh(&dev_base_lock);
2882
2883         /* Notify protocols, that a new device appeared. */
2884         blocking_notifier_call_chain(&netdev_chain, NETDEV_REGISTER, dev);
2885
2886         /* Finish registration after unlock */
2887         net_set_todo(dev);
2888         ret = 0;
2889
2890 out:
2891         return ret;
2892 out_err:
2893         free_divert_blk(dev);
2894         goto out;
2895 }
2896
2897 /**
2898  *      register_netdev - register a network device
2899  *      @dev: device to register
2900  *
2901  *      Take a completed network device structure and add it to the kernel
2902  *      interfaces. A %NETDEV_REGISTER message is sent to the netdev notifier
2903  *      chain. 0 is returned on success. A negative errno code is returned
2904  *      on a failure to set up the device, or if the name is a duplicate.
2905  *
2906  *      This is a wrapper around register_netdev that takes the rtnl semaphore
2907  *      and expands the device name if you passed a format string to
2908  *      alloc_netdev.
2909  */
2910 int register_netdev(struct net_device *dev)
2911 {
2912         int err;
2913
2914         rtnl_lock();
2915
2916         /*
2917          * If the name is a format string the caller wants us to do a
2918          * name allocation.
2919          */
2920         if (strchr(dev->name, '%')) {
2921                 err = dev_alloc_name(dev, dev->name);
2922                 if (err < 0)
2923                         goto out;
2924         }
2925
2926         /*
2927          * Back compatibility hook. Kill this one in 2.5
2928          */
2929         if (dev->name[0] == 0 || dev->name[0] == ' ') {
2930                 err = dev_alloc_name(dev, "eth%d");
2931                 if (err < 0)
2932                         goto out;
2933         }
2934
2935         err = register_netdevice(dev);
2936 out:
2937         rtnl_unlock();
2938         return err;
2939 }
2940 EXPORT_SYMBOL(register_netdev);
2941
2942 /*
2943  * netdev_wait_allrefs - wait until all references are gone.
2944  *
2945  * This is called when unregistering network devices.
2946  *
2947  * Any protocol or device that holds a reference should register
2948  * for netdevice notification, and cleanup and put back the
2949  * reference if they receive an UNREGISTER event.
2950  * We can get stuck here if buggy protocols don't correctly
2951  * call dev_put.
2952  */
2953 static void netdev_wait_allrefs(struct net_device *dev)
2954 {
2955         unsigned long rebroadcast_time, warning_time;
2956
2957         rebroadcast_time = warning_time = jiffies;
2958         while (atomic_read(&dev->refcnt) != 0) {
2959                 if (time_after(jiffies, rebroadcast_time + 1 * HZ)) {
2960                         rtnl_lock();
2961
2962                         /* Rebroadcast unregister notification */
2963                         blocking_notifier_call_chain(&netdev_chain,
2964                                             NETDEV_UNREGISTER, dev);
2965
2966                         if (test_bit(__LINK_STATE_LINKWATCH_PENDING,
2967                                      &dev->state)) {
2968                                 /* We must not have linkwatch events
2969                                  * pending on unregister. If this
2970                                  * happens, we simply run the queue
2971                                  * unscheduled, resulting in a noop
2972                                  * for this device.
2973                                  */
2974                                 linkwatch_run_queue();
2975                         }
2976
2977                         __rtnl_unlock();
2978
2979                         rebroadcast_time = jiffies;
2980                 }
2981
2982                 msleep(250);
2983
2984                 if (time_after(jiffies, warning_time + 10 * HZ)) {
2985                         printk(KERN_EMERG "unregister_netdevice: "
2986                                "waiting for %s to become free. Usage "
2987                                "count = %d\n",
2988                                dev->name, atomic_read(&dev->refcnt));
2989                         warning_time = jiffies;
2990                 }
2991         }
2992 }
2993
2994 /* The sequence is:
2995  *
2996  *      rtnl_lock();
2997  *      ...
2998  *      register_netdevice(x1);
2999  *      register_netdevice(x2);
3000  *      ...
3001  *      unregister_netdevice(y1);
3002  *      unregister_netdevice(y2);
3003  *      ...
3004  *      rtnl_unlock();
3005  *      free_netdev(y1);
3006  *      free_netdev(y2);
3007  *
3008  * We are invoked by rtnl_unlock() after it drops the semaphore.
3009  * This allows us to deal with problems:
3010  * 1) We can create/delete sysfs objects which invoke hotplug
3011  *    without deadlocking with linkwatch via keventd.
3012  * 2) Since we run with the RTNL semaphore not held, we can sleep
3013  *    safely in order to wait for the netdev refcnt to drop to zero.
3014  */
3015 static DEFINE_MUTEX(net_todo_run_mutex);
3016 void netdev_run_todo(void)
3017 {
3018         struct list_head list = LIST_HEAD_INIT(list);
3019         int err;
3020
3021
3022         /* Need to guard against multiple cpu's getting out of order. */
3023         mutex_lock(&net_todo_run_mutex);
3024
3025         /* Not safe to do outside the semaphore.  We must not return
3026          * until all unregister events invoked by the local processor
3027          * have been completed (either by this todo run, or one on
3028          * another cpu).
3029          */
3030         if (list_empty(&net_todo_list))
3031                 goto out;
3032
3033         /* Snapshot list, allow later requests */
3034         spin_lock(&net_todo_list_lock);
3035         list_splice_init(&net_todo_list, &list);
3036         spin_unlock(&net_todo_list_lock);
3037
3038         while (!list_empty(&list)) {
3039                 struct net_device *dev
3040                         = list_entry(list.next, struct net_device, todo_list);
3041                 list_del(&dev->todo_list);
3042
3043                 switch(dev->reg_state) {
3044                 case NETREG_REGISTERING:
3045                         dev->reg_state = NETREG_REGISTERED;
3046                         err = netdev_register_sysfs(dev);
3047                         if (err)
3048                                 printk(KERN_ERR "%s: failed sysfs registration (%d)\n",
3049                                        dev->name, err);
3050                         break;
3051
3052                 case NETREG_UNREGISTERING:
3053                         netdev_unregister_sysfs(dev);
3054                         dev->reg_state = NETREG_UNREGISTERED;
3055
3056                         netdev_wait_allrefs(dev);
3057
3058                         /* paranoia */
3059                         BUG_ON(atomic_read(&dev->refcnt));
3060                         BUG_TRAP(!dev->ip_ptr);
3061                         BUG_TRAP(!dev->ip6_ptr);
3062                         BUG_TRAP(!dev->dn_ptr);
3063
3064
3065                         /* It must be the very last action,
3066                          * after this 'dev' may point to freed up memory.
3067                          */
3068                         if (dev->destructor)
3069                                 dev->destructor(dev);
3070                         break;
3071
3072                 default:
3073                         printk(KERN_ERR "network todo '%s' but state %d\n",
3074                                dev->name, dev->reg_state);
3075                         break;
3076                 }
3077         }
3078
3079 out:
3080         mutex_unlock(&net_todo_run_mutex);
3081 }
3082
3083 /**
3084  *      alloc_netdev - allocate network device
3085  *      @sizeof_priv:   size of private data to allocate space for
3086  *      @name:          device name format string
3087  *      @setup:         callback to initialize device
3088  *
3089  *      Allocates a struct net_device with private data area for driver use
3090  *      and performs basic initialization.
3091  */
3092 struct net_device *alloc_netdev(int sizeof_priv, const char *name,
3093                 void (*setup)(struct net_device *))
3094 {
3095         void *p;
3096         struct net_device *dev;
3097         int alloc_size;
3098
3099         /* ensure 32-byte alignment of both the device and private area */
3100         alloc_size = (sizeof(*dev) + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST;
3101         alloc_size += sizeof_priv + NETDEV_ALIGN_CONST;
3102
3103         p = kzalloc(alloc_size, GFP_KERNEL);
3104         if (!p) {
3105                 printk(KERN_ERR "alloc_dev: Unable to allocate device.\n");
3106                 return NULL;
3107         }
3108
3109         dev = (struct net_device *)
3110                 (((long)p + NETDEV_ALIGN_CONST) & ~NETDEV_ALIGN_CONST);
3111         dev->padded = (char *)dev - (char *)p;
3112
3113         if (sizeof_priv)
3114                 dev->priv = netdev_priv(dev);
3115
3116         setup(dev);
3117         strcpy(dev->name, name);
3118         return dev;
3119 }
3120 EXPORT_SYMBOL(alloc_netdev);
3121
3122 /**
3123  *      free_netdev - free network device
3124  *      @dev: device
3125  *
3126  *      This function does the last stage of destroying an allocated device
3127  *      interface. The reference to the device object is released.
3128  *      If this is the last reference then it will be freed.
3129  */
3130 void free_netdev(struct net_device *dev)
3131 {
3132 #ifdef CONFIG_SYSFS
3133         /*  Compatiablity with error handling in drivers */
3134         if (dev->reg_state == NETREG_UNINITIALIZED) {
3135                 kfree((char *)dev - dev->padded);
3136                 return;
3137         }
3138
3139         BUG_ON(dev->reg_state != NETREG_UNREGISTERED);
3140         dev->reg_state = NETREG_RELEASED;
3141
3142         /* will free via class release */
3143         class_device_put(&dev->class_dev);
3144 #else
3145         kfree((char *)dev - dev->padded);
3146 #endif
3147 }
3148
/*
 * synchronize_net - synchronize with packet receive processing.
 *
 * Implemented as synchronize_rcu(); might_sleep() is called first.
 */
void synchronize_net(void)
{
	might_sleep();
	synchronize_rcu();
}
3155
3156 /**
3157  *      unregister_netdevice - remove device from the kernel
3158  *      @dev: device
3159  *
3160  *      This function shuts down a device interface and removes it
3161  *      from the kernel tables. On success 0 is returned, on a failure
3162  *      a negative errno code is returned.
3163  *
3164  *      Callers must hold the rtnl semaphore.  You may want
3165  *      unregister_netdev() instead of this.
3166  */
3167
3168 int unregister_netdevice(struct net_device *dev)
3169 {
3170         struct net_device *d, **dp;
3171
3172         BUG_ON(dev_boot_phase);
3173         ASSERT_RTNL();
3174
3175         /* Some devices call without registering for initialization unwind. */
3176         if (dev->reg_state == NETREG_UNINITIALIZED) {
3177                 printk(KERN_DEBUG "unregister_netdevice: device %s/%p never "
3178                                   "was registered\n", dev->name, dev);
3179                 return -ENODEV;
3180         }
3181
3182         BUG_ON(dev->reg_state != NETREG_REGISTERED);
3183
3184         /* If device is running, close it first. */
3185         if (dev->flags & IFF_UP)
3186                 dev_close(dev);
3187
3188         /* And unlink it from device chain. */
3189         for (dp = &dev_base; (d = *dp) != NULL; dp = &d->next) {
3190                 if (d == dev) {
3191                         write_lock_bh(&dev_base_lock);
3192                         hlist_del(&dev->name_hlist);
3193                         hlist_del(&dev->index_hlist);
3194                         if (dev_tail == &dev->next)
3195                                 dev_tail = dp;
3196                         *dp = d->next;
3197                         write_unlock_bh(&dev_base_lock);
3198                         break;
3199                 }
3200         }
3201         if (!d) {
3202                 printk(KERN_ERR "unregister net_device: '%s' not found\n",
3203                        dev->name);
3204                 return -ENODEV;
3205         }
3206
3207         dev->reg_state = NETREG_UNREGISTERING;
3208
3209         synchronize_net();
3210
3211         /* Shutdown queueing discipline. */
3212         dev_shutdown(dev);
3213
3214
3215         /* Notify protocols, that we are about to destroy
3216            this device. They should clean all the things.
3217         */
3218         blocking_notifier_call_chain(&netdev_chain, NETDEV_UNREGISTER, dev);
3219
3220         /*
3221          *      Flush the multicast chain
3222          */
3223         dev_mc_discard(dev);
3224
3225         if (dev->uninit)
3226                 dev->uninit(dev);
3227
3228         /* Notifier chain MUST detach us from master device. */
3229         BUG_TRAP(!dev->master);
3230
3231         free_divert_blk(dev);
3232
3233         /* Finish processing unregister after unlock */
3234         net_set_todo(dev);
3235
3236         synchronize_net();
3237
3238         dev_put(dev);
3239         return 0;
3240 }
3241
/**
 *	unregister_netdev - remove device from the kernel
 *	@dev: device
 *
 *	This function shuts down a device interface and removes it
 *	from the kernel tables. It returns nothing: the result of
 *	unregister_netdevice() is not passed on to the caller.
 *
 *	This is just a wrapper for unregister_netdevice that takes
 *	the rtnl semaphore.  In general you want to use this and not
 *	unregister_netdevice.
 */
void unregister_netdev(struct net_device *dev)
{
	rtnl_lock();
	unregister_netdevice(dev);
	rtnl_unlock();
}

EXPORT_SYMBOL(unregister_netdev);
3262
#ifdef CONFIG_HOTPLUG_CPU
/*
 * CPU notifier callback.  On CPU_DEAD the softnet queues of the CPU
 * passed in @ocpu are moved onto the CPU running this callback; every
 * other action is ignored.  Always returns NOTIFY_OK.
 */
static int dev_cpu_callback(struct notifier_block *nfb,
			    unsigned long action,
			    void *ocpu)
{
	unsigned int dead_cpu = (unsigned long)ocpu;
	struct softnet_data *local_sd, *dead_sd;
	struct sk_buff **skb_link;
	struct net_device **out_link;
	struct sk_buff *skb;

	if (action != CPU_DEAD)
		return NOTIFY_OK;

	local_irq_disable();
	local_sd = &per_cpu(softnet_data, smp_processor_id());
	dead_sd = &per_cpu(softnet_data, dead_cpu);

	/* Walk to the end of our completion_queue ... */
	for (skb_link = &local_sd->completion_queue; *skb_link;
	     skb_link = &(*skb_link)->next)
		continue;
	/* ... and append the completion queue from the offline CPU. */
	*skb_link = dead_sd->completion_queue;
	dead_sd->completion_queue = NULL;

	/* Walk to the end of our output_queue ... */
	for (out_link = &local_sd->output_queue; *out_link;
	     out_link = &(*out_link)->next_sched)
		continue;
	/* ... and append the output queue from the offline CPU. */
	*out_link = dead_sd->output_queue;
	dead_sd->output_queue = NULL;

	raise_softirq_irqoff(NET_TX_SOFTIRQ);
	local_irq_enable();

	/* Process offline CPU's input_pkt_queue */
	while ((skb = __skb_dequeue(&dead_sd->input_pkt_queue)))
		netif_rx(skb);

	return NOTIFY_OK;
}
#endif /* CONFIG_HOTPLUG_CPU */
3308
3309
3310 /*
3311  *      Initialize the DEV module. At boot time this walks the device list and
3312  *      unhooks any devices that fail to initialise (normally hardware not
3313  *      present) and leaves us with a valid list of present and active devices.
3314  *
3315  */
3316
3317 /*
3318  *       This is called single threaded during boot, so no need
3319  *       to take the rtnl semaphore.
3320  */
3321 static int __init net_dev_init(void)
3322 {
3323         int i, rc = -ENOMEM;
3324
3325         BUG_ON(!dev_boot_phase);
3326
3327         net_random_init();
3328
3329         if (dev_proc_init())
3330                 goto out;
3331
3332         if (netdev_sysfs_init())
3333                 goto out;
3334
3335         INIT_LIST_HEAD(&ptype_all);
3336         for (i = 0; i < 16; i++)
3337                 INIT_LIST_HEAD(&ptype_base[i]);
3338
3339         for (i = 0; i < ARRAY_SIZE(dev_name_head); i++)
3340                 INIT_HLIST_HEAD(&dev_name_head[i]);
3341
3342         for (i = 0; i < ARRAY_SIZE(dev_index_head); i++)
3343                 INIT_HLIST_HEAD(&dev_index_head[i]);
3344
3345         /*
3346          *      Initialise the packet receive queues.
3347          */
3348
3349         for_each_possible_cpu(i) {
3350                 struct softnet_data *queue;
3351
3352                 queue = &per_cpu(softnet_data, i);
3353                 skb_queue_head_init(&queue->input_pkt_queue);
3354                 queue->completion_queue = NULL;
3355                 INIT_LIST_HEAD(&queue->poll_list);
3356                 set_bit(__LINK_STATE_START, &queue->backlog_dev.state);
3357                 queue->backlog_dev.weight = weight_p;
3358                 queue->backlog_dev.poll = process_backlog;
3359                 atomic_set(&queue->backlog_dev.refcnt, 1);
3360         }
3361
3362         dev_boot_phase = 0;
3363
3364         open_softirq(NET_TX_SOFTIRQ, net_tx_action, NULL);
3365         open_softirq(NET_RX_SOFTIRQ, net_rx_action, NULL);
3366
3367         hotcpu_notifier(dev_cpu_callback, 0);
3368         dst_init();
3369         dev_mcast_init();
3370         rc = 0;
3371 out:
3372         return rc;
3373 }
3374
3375 subsys_initcall(net_dev_init);
3376
3377 EXPORT_SYMBOL(__dev_get_by_index);
3378 EXPORT_SYMBOL(__dev_get_by_name);
3379 EXPORT_SYMBOL(__dev_remove_pack);
3380 EXPORT_SYMBOL(__skb_linearize);
3381 EXPORT_SYMBOL(dev_valid_name);
3382 EXPORT_SYMBOL(dev_add_pack);
3383 EXPORT_SYMBOL(dev_alloc_name);
3384 EXPORT_SYMBOL(dev_close);
3385 EXPORT_SYMBOL(dev_get_by_flags);
3386 EXPORT_SYMBOL(dev_get_by_index);
3387 EXPORT_SYMBOL(dev_get_by_name);
3388 EXPORT_SYMBOL(dev_open);
3389 EXPORT_SYMBOL(dev_queue_xmit);
3390 EXPORT_SYMBOL(dev_remove_pack);
3391 EXPORT_SYMBOL(dev_set_allmulti);
3392 EXPORT_SYMBOL(dev_set_promiscuity);
3393 EXPORT_SYMBOL(dev_change_flags);
3394 EXPORT_SYMBOL(dev_set_mtu);
3395 EXPORT_SYMBOL(dev_set_mac_address);
3396 EXPORT_SYMBOL(free_netdev);
3397 EXPORT_SYMBOL(netdev_boot_setup_check);
3398 EXPORT_SYMBOL(netdev_set_master);
3399 EXPORT_SYMBOL(netdev_state_change);
3400 EXPORT_SYMBOL(netif_receive_skb);
3401 EXPORT_SYMBOL(netif_rx);
3402 EXPORT_SYMBOL(register_gifconf);
3403 EXPORT_SYMBOL(register_netdevice);
3404 EXPORT_SYMBOL(register_netdevice_notifier);
3405 EXPORT_SYMBOL(skb_checksum_help);
3406 EXPORT_SYMBOL(synchronize_net);
3407 EXPORT_SYMBOL(unregister_netdevice);
3408 EXPORT_SYMBOL(unregister_netdevice_notifier);
3409 EXPORT_SYMBOL(net_enable_timestamp);
3410 EXPORT_SYMBOL(net_disable_timestamp);
3411 EXPORT_SYMBOL(dev_get_flags);
3412
3413 #if defined(CONFIG_BRIDGE) || defined(CONFIG_BRIDGE_MODULE)
3414 EXPORT_SYMBOL(br_handle_frame_hook);
3415 EXPORT_SYMBOL(br_fdb_get_hook);
3416 EXPORT_SYMBOL(br_fdb_put_hook);
3417 #endif
3418
3419 #ifdef CONFIG_KMOD
3420 EXPORT_SYMBOL(dev_load);
3421 #endif
3422
3423 EXPORT_PER_CPU_SYMBOL(softnet_data);