nfsd: track last inode only in use_wgather case
[safe/jmp/linux-2.6] / net / packet / af_packet.c
index bf26990..f546e81 100644 (file)
@@ -5,8 +5,6 @@
  *
  *             PACKET - implements raw packet sockets.
  *
- * Version:    $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
- *
  * Authors:    Ross Biro
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *             Alan Cox, <gw4pts@gw4pts.ampr.org>
@@ -61,6 +59,7 @@
 #include <linux/wireless.h>
 #include <linux/kernel.h>
 #include <linux/kmod.h>
+#include <net/net_namespace.h>
 #include <net/ip.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
 #include <linux/poll.h>
 #include <linux/module.h>
 #include <linux/init.h>
+#include <linux/mutex.h>
 
 #ifdef CONFIG_INET
 #include <net/inet_common.h>
 #endif
 
-#define CONFIG_SOCK_PACKET     1
-
-/*
-   Proposed replacement for SIOC{ADD,DEL}MULTI and
-   IFF_PROMISC, IFF_ALLMULTI flags.
-
-   It is more expensive, but I believe,
-   it is really correct solution: reentereble, safe and fault tolerant.
-
-   IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping
-   reference count and global flag, so that real status is
-   (gflag|(count != 0)), so that we can use obsolete faulty interface
-   not harming clever users.
- */
-#define CONFIG_PACKET_MULTICAST        1
-
 /*
    Assumptions:
    - if device has no dev->hard_header routine, it adds and removes ll header
@@ -114,22 +98,22 @@ On receive:
 -----------
 
 Incoming, dev->hard_header!=NULL
-   mac.raw -> ll header
-   data    -> data
+   mac_header -> ll header
+   data       -> data
 
 Outgoing, dev->hard_header!=NULL
-   mac.raw -> ll header
-   data    -> ll header
+   mac_header -> ll header
+   data       -> ll header
 
 Incoming, dev->hard_header==NULL
-   mac.raw -> UNKNOWN position. It is very likely, that it points to ll header.
-             PPP makes it, that is wrong, because introduce assymetry
-             between rx and tx paths.
-   data    -> data
+   mac_header -> UNKNOWN position. It is very likely, that it points to ll
+                header.  PPP makes it, that is wrong, because introduce
+                asymmetry between rx and tx paths.
+   data       -> data
 
 Outgoing, dev->hard_header==NULL
-   mac.raw -> data. ll header is still not built!
-   data    -> data
+   mac_header -> data. ll header is still not built!
+   data       -> data
 
 Resume
   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
@@ -139,27 +123,19 @@ On transmit:
 ------------
 
 dev->hard_header != NULL
-   mac.raw -> ll header
-   data    -> ll header
+   mac_header -> ll header
+   data       -> ll header
 
 dev->hard_header == NULL (ll header is added by device, we cannot control it)
-   mac.raw -> data
-   data -> data
+   mac_header -> data
+   data       -> data
 
    We should set nh.raw on output to correct posistion,
    packet classifier depends on it.
  */
 
-/* List of all packet sockets. */
-static HLIST_HEAD(packet_sklist);
-static DEFINE_RWLOCK(packet_sklist_lock);
-
-static atomic_t packet_socks_nr;
-
-
 /* Private packet socket structures. */
 
-#ifdef CONFIG_PACKET_MULTICAST
 struct packet_mclist
 {
        struct packet_mclist    *next;
@@ -179,7 +155,7 @@ struct packet_mreq_max
        unsigned short  mr_alen;
        unsigned char   mr_address[MAX_ADDR_LEN];
 };
-#endif
+
 #ifdef CONFIG_PACKET_MMAP
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
 #endif
@@ -200,18 +176,21 @@ struct packet_sock {
 #endif
        struct packet_type      prot_hook;
        spinlock_t              bind_lock;
+       struct mutex            pg_vec_lock;
        unsigned int            running:1,      /* prot_hook is attached*/
-                               auxdata:1;
+                               auxdata:1,
+                               origdev:1;
        int                     ifindex;        /* bound device         */
        __be16                  num;
-#ifdef CONFIG_PACKET_MULTICAST
        struct packet_mclist    *mclist;
-#endif
 #ifdef CONFIG_PACKET_MMAP
        atomic_t                mapped;
        unsigned int            pg_vec_order;
        unsigned int            pg_vec_pages;
        unsigned int            pg_vec_len;
+       enum tpacket_versions   tp_version;
+       unsigned int            tp_hdrlen;
+       unsigned int            tp_reserve;
 #endif
 };
 
@@ -227,17 +206,52 @@ struct packet_skb_cb {
 
 #ifdef CONFIG_PACKET_MMAP
 
-static inline char *packet_lookup_frame(struct packet_sock *po, unsigned int position)
+static void *packet_lookup_frame(struct packet_sock *po, unsigned int position,
+                                int status)
 {
        unsigned int pg_vec_pos, frame_offset;
-       char *frame;
+       union {
+               struct tpacket_hdr *h1;
+               struct tpacket2_hdr *h2;
+               void *raw;
+       } h;
 
        pg_vec_pos = position / po->frames_per_block;
        frame_offset = position % po->frames_per_block;
 
-       frame = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
+       h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
+       switch (po->tp_version) {
+       case TPACKET_V1:
+               if (status != (h.h1->tp_status ? TP_STATUS_USER :
+                                               TP_STATUS_KERNEL))
+                       return NULL;
+               break;
+       case TPACKET_V2:
+               if (status != (h.h2->tp_status ? TP_STATUS_USER :
+                                               TP_STATUS_KERNEL))
+                       return NULL;
+               break;
+       }
+       return h.raw;
+}
 
-       return frame;
+static void __packet_set_status(struct packet_sock *po, void *frame, int status)
+{
+       union {
+               struct tpacket_hdr *h1;
+               struct tpacket2_hdr *h2;
+               void *raw;
+       } h;
+
+       h.raw = frame;
+       switch (po->tp_version) {
+       case TPACKET_V1:
+               h.h1->tp_status = status;
+               break;
+       case TPACKET_V2:
+               h.h2->tp_status = status;
+               break;
+       }
 }
 #endif
 
@@ -248,24 +262,20 @@ static inline struct packet_sock *pkt_sk(struct sock *sk)
 
 static void packet_sock_destruct(struct sock *sk)
 {
-       BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
-       BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
+       WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+       WARN_ON(atomic_read(&sk->sk_wmem_alloc));
 
        if (!sock_flag(sk, SOCK_DEAD)) {
                printk("Attempt to release alive packet socket: %p\n", sk);
                return;
        }
 
-       atomic_dec(&packet_socks_nr);
-#ifdef PACKET_REFCNT_DEBUG
-       printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
-#endif
+       sk_refcnt_debug_dec(sk);
 }
 
 
 static const struct proto_ops packet_ops;
 
-#ifdef CONFIG_SOCK_PACKET
 static const struct proto_ops packet_ops_spkt;
 
 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
@@ -287,13 +297,16 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct
         *      Incoming packets have ll header pulled,
         *      push it back.
         *
-        *      For outgoing ones skb->data == skb->mac.raw
+        *      For outgoing ones skb->data == skb_mac_header(skb)
         *      so that this procedure is noop.
         */
 
        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;
 
+       if (dev_net(dev) != sock_net(sk))
+               goto out;
+
        if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
                goto oom;
 
@@ -306,7 +319,7 @@ static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct
 
        spkt = &PACKET_SKB_CB(skb)->sa.pkt;
 
-       skb_push(skb, skb->data-skb->mac.raw);
+       skb_push(skb, skb->data - skb_mac_header(skb));
 
        /*
         *      The SOCK_PACKET socket receives _all_ frames.
@@ -365,7 +378,7 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
         */
 
        saddr->spkt_device[13] = 0;
-       dev = dev_get_by_name(saddr->spkt_device);
+       dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;
@@ -404,14 +417,14 @@ static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
         * notable one here. This should really be fixed at the driver level.
         */
        skb_reserve(skb, LL_RESERVED_SPACE(dev));
-       skb->nh.raw = skb->data;
+       skb_reset_network_header(skb);
 
        /* Try to align data part correctly */
-       if (dev->hard_header) {
+       if (dev->header_ops) {
                skb->data -= dev->hard_header_len;
                skb->tail -= dev->hard_header_len;
                if (len < dev->hard_header_len)
-                       skb->nh.raw = skb->data;
+                       skb_reset_network_header(skb);
        }
 
        /* Returns -EFAULT on error */
@@ -437,7 +450,6 @@ out_unlock:
                dev_put(dev);
        return err;
 }
-#endif
 
 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
                                      unsigned int res)
@@ -480,9 +492,12 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet
        sk = pt->af_packet_priv;
        po = pkt_sk(sk);
 
+       if (dev_net(dev) != sock_net(sk))
+               goto drop;
+
        skb->dev = dev;
 
-       if (dev->hard_header) {
+       if (dev->header_ops) {
                /* The device has an explicit notion of ll header,
                   exported to higher levels.
 
@@ -491,10 +506,10 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet
                   never delivered to user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
-                       skb_push(skb, skb->data - skb->mac.raw);
+                       skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
-                       skb_pull(skb, skb->nh.raw - skb->data);
+                       skb_pull(skb, skb_network_offset(skb));
                }
        }
 
@@ -531,11 +546,12 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
-       sll->sll_ifindex = dev->ifindex;
-       sll->sll_halen = 0;
+       if (unlikely(po->origdev))
+               sll->sll_ifindex = orig_dev->ifindex;
+       else
+               sll->sll_ifindex = dev->ifindex;
 
-       if (dev->hard_header_parse)
-               sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
+       sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
 
        PACKET_SKB_CB(skb)->origlen = skb->len;
 
@@ -568,7 +584,7 @@ drop_n_restore:
                skb->len = skb_len;
        }
 drop:
-       kfree_skb(skb);
+       consume_skb(skb);
        return 0;
 }
 
@@ -578,13 +594,19 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
        struct sock *sk;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
-       struct tpacket_hdr *h;
+       union {
+               struct tpacket_hdr *h1;
+               struct tpacket2_hdr *h2;
+               void *raw;
+       } h;
        u8 * skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
-       unsigned short macoff, netoff;
+       unsigned short macoff, netoff, hdrlen;
        struct sk_buff *copy_skb = NULL;
+       struct timeval tv;
+       struct timespec ts;
 
        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;
@@ -592,12 +614,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
        sk = pt->af_packet_priv;
        po = pkt_sk(sk);
 
-       if (dev->hard_header) {
+       if (dev_net(dev) != sock_net(sk))
+               goto drop;
+
+       if (dev->header_ops) {
                if (sk->sk_type != SOCK_DGRAM)
-                       skb_push(skb, skb->data - skb->mac.raw);
+                       skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
-                       skb_pull(skb, skb->nh.raw - skb->data);
+                       skb_pull(skb, skb_network_offset(skb));
                }
        }
 
@@ -613,10 +638,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
                snaplen = res;
 
        if (sk->sk_type == SOCK_DGRAM) {
-               macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
+               macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
+                                 po->tp_reserve;
        } else {
-               unsigned maclen = skb->nh.raw - skb->data;
-               netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
+               unsigned maclen = skb_network_offset(skb);
+               netoff = TPACKET_ALIGN(po->tp_hdrlen +
+                                      (maclen < 16 ? 16 : maclen)) +
+                       po->tp_reserve;
                macoff = netoff - maclen;
        }
 
@@ -639,9 +667,8 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
        }
 
        spin_lock(&sk->sk_receive_queue.lock);
-       h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
-
-       if (h->tp_status)
+       h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL);
+       if (!h.raw)
                goto ring_is_full;
        po->head = po->head != po->frame_max ? po->head+1 : 0;
        po->stats.tp_packets++;
@@ -653,37 +680,59 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packe
                status &= ~TP_STATUS_LOSING;
        spin_unlock(&sk->sk_receive_queue.lock);
 
-       skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
+       skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
 
-       h->tp_len = skb->len;
-       h->tp_snaplen = snaplen;
-       h->tp_mac = macoff;
-       h->tp_net = netoff;
-       if (skb->tstamp.off_sec == 0) {
-               __net_timestamp(skb);
-               sock_enable_timestamp(sk);
+       switch (po->tp_version) {
+       case TPACKET_V1:
+               h.h1->tp_len = skb->len;
+               h.h1->tp_snaplen = snaplen;
+               h.h1->tp_mac = macoff;
+               h.h1->tp_net = netoff;
+               if (skb->tstamp.tv64)
+                       tv = ktime_to_timeval(skb->tstamp);
+               else
+                       do_gettimeofday(&tv);
+               h.h1->tp_sec = tv.tv_sec;
+               h.h1->tp_usec = tv.tv_usec;
+               hdrlen = sizeof(*h.h1);
+               break;
+       case TPACKET_V2:
+               h.h2->tp_len = skb->len;
+               h.h2->tp_snaplen = snaplen;
+               h.h2->tp_mac = macoff;
+               h.h2->tp_net = netoff;
+               if (skb->tstamp.tv64)
+                       ts = ktime_to_timespec(skb->tstamp);
+               else
+                       getnstimeofday(&ts);
+               h.h2->tp_sec = ts.tv_sec;
+               h.h2->tp_nsec = ts.tv_nsec;
+               h.h2->tp_vlan_tci = skb->vlan_tci;
+               hdrlen = sizeof(*h.h2);
+               break;
+       default:
+               BUG();
        }
-       h->tp_sec = skb->tstamp.off_sec;
-       h->tp_usec = skb->tstamp.off_usec;
 
-       sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
-       sll->sll_halen = 0;
-       if (dev->hard_header_parse)
-               sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
+       sll = h.raw + TPACKET_ALIGN(hdrlen);
+       sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
-       sll->sll_ifindex = dev->ifindex;
+       if (unlikely(po->origdev))
+               sll->sll_ifindex = orig_dev->ifindex;
+       else
+               sll->sll_ifindex = dev->ifindex;
 
-       h->tp_status = status;
+       __packet_set_status(po, h.raw, status);
        smp_mb();
 
        {
                struct page *p_start, *p_end;
-               u8 *h_end = (u8 *)h + macoff + snaplen - 1;
+               u8 *h_end = h.raw + macoff + snaplen - 1;
 
-               p_start = virt_to_page(h);
+               p_start = virt_to_page(h.raw);
                p_end = virt_to_page(h_end);
                while (p_start <= p_end) {
                        flush_dcache_page(p_start);
@@ -707,8 +756,7 @@ ring_is_full:
        spin_unlock(&sk->sk_receive_queue.lock);
 
        sk->sk_data_ready(sk, 0);
-       if (copy_skb)
-               kfree_skb(copy_skb);
+       kfree_skb(copy_skb);
        goto drop_n_restore;
 }
 
@@ -748,7 +796,7 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
        }
 
 
-       dev = dev_get_by_index(ifindex);
+       dev = dev_get_by_index(sock_net(sk), ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
@@ -763,24 +811,18 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
        if (len > dev->mtu+reserve)
                goto out_unlock;
 
-       skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
+       skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
                                msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb==NULL)
                goto out_unlock;
 
        skb_reserve(skb, LL_RESERVED_SPACE(dev));
-       skb->nh.raw = skb->data;
+       skb_reset_network_header(skb);
 
-       if (dev->hard_header) {
-               int res;
-               err = -EINVAL;
-               res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
-               if (sock->type != SOCK_DGRAM) {
-                       skb->tail = skb->data;
-                       skb->len = 0;
-               } else if (res < 0)
-                       goto out_free;
-       }
+       err = -EINVAL;
+       if (sock->type == SOCK_DGRAM &&
+           dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
+               goto out_free;
 
        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
@@ -821,15 +863,18 @@ static int packet_release(struct socket *sock)
 {
        struct sock *sk = sock->sk;
        struct packet_sock *po;
+       struct net *net;
 
        if (!sk)
                return 0;
 
+       net = sock_net(sk);
        po = pkt_sk(sk);
 
-       write_lock_bh(&packet_sklist_lock);
+       write_lock_bh(&net->packet.sklist_lock);
        sk_del_node_init(sk);
-       write_unlock_bh(&packet_sklist_lock);
+       sock_prot_inuse_add(net, sk->sk_prot, -1);
+       write_unlock_bh(&net->packet.sklist_lock);
 
        /*
         *      Unhook packet receive handler.
@@ -845,9 +890,7 @@ static int packet_release(struct socket *sock)
                __sock_put(sk);
        }
 
-#ifdef CONFIG_PACKET_MULTICAST
        packet_flush_mclist(sk);
-#endif
 
 #ifdef CONFIG_PACKET_MMAP
        if (po->pg_vec) {
@@ -867,6 +910,7 @@ static int packet_release(struct socket *sock)
        /* Purge queues */
 
        skb_queue_purge(&sk->sk_receive_queue);
+       sk_refcnt_debug_release(sk);
 
        sock_put(sk);
        return 0;
@@ -904,20 +948,14 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protoc
        if (protocol == 0)
                goto out_unlock;
 
-       if (dev) {
-               if (dev->flags&IFF_UP) {
-                       dev_add_pack(&po->prot_hook);
-                       sock_hold(sk);
-                       po->running = 1;
-               } else {
-                       sk->sk_err = ENETDOWN;
-                       if (!sock_flag(sk, SOCK_DEAD))
-                               sk->sk_error_report(sk);
-               }
-       } else {
+       if (!dev || (dev->flags & IFF_UP)) {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
+       } else {
+               sk->sk_err = ENETDOWN;
+               if (!sock_flag(sk, SOCK_DEAD))
+                       sk->sk_error_report(sk);
        }
 
 out_unlock:
@@ -930,8 +968,6 @@ out_unlock:
  *     Bind a packet socket to a device
  */
 
-#ifdef CONFIG_SOCK_PACKET
-
 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
        struct sock *sk=sock->sk;
@@ -947,14 +983,13 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int add
                return -EINVAL;
        strlcpy(name,uaddr->sa_data,sizeof(name));
 
-       dev = dev_get_by_name(name);
+       dev = dev_get_by_name(sock_net(sk), name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
 }
-#endif
 
 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
@@ -975,7 +1010,7 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len
 
        if (sll->sll_ifindex) {
                err = -ENODEV;
-               dev = dev_get_by_index(sll->sll_ifindex);
+               dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
                if (dev == NULL)
                        goto out;
        }
@@ -997,7 +1032,7 @@ static struct proto packet_proto = {
  *     Create a packet of type SOCK_PACKET.
  */
 
-static int packet_create(struct socket *sock, int protocol)
+static int packet_create(struct net *net, struct socket *sock, int protocol)
 {
        struct sock *sk;
        struct packet_sock *po;
@@ -1006,25 +1041,21 @@ static int packet_create(struct socket *sock, int protocol)
 
        if (!capable(CAP_NET_RAW))
                return -EPERM;
-       if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
-#ifdef CONFIG_SOCK_PACKET
-           && sock->type != SOCK_PACKET
-#endif
-           )
+       if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
+           sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;
 
        sock->state = SS_UNCONNECTED;
 
        err = -ENOBUFS;
-       sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
+       sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
        if (sk == NULL)
                goto out;
 
        sock->ops = &packet_ops;
-#ifdef CONFIG_SOCK_PACKET
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;
-#endif
+
        sock_init_data(sock, sk);
 
        po = pkt_sk(sk);
@@ -1032,18 +1063,19 @@ static int packet_create(struct socket *sock, int protocol)
        po->num = proto;
 
        sk->sk_destruct = packet_sock_destruct;
-       atomic_inc(&packet_socks_nr);
+       sk_refcnt_debug_inc(sk);
 
        /*
         *      Attach a protocol block
         */
 
        spin_lock_init(&po->bind_lock);
+       mutex_init(&po->pg_vec_lock);
        po->prot_hook.func = packet_rcv;
-#ifdef CONFIG_SOCK_PACKET
+
        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;
-#endif
+
        po->prot_hook.af_packet_priv = sk;
 
        if (proto) {
@@ -1053,9 +1085,10 @@ static int packet_create(struct socket *sock, int protocol)
                po->running = 1;
        }
 
-       write_lock_bh(&packet_sklist_lock);
-       sk_add_node(sk, &packet_sklist);
-       write_unlock_bh(&packet_sklist_lock);
+       write_lock_bh(&net->packet.sklist_lock);
+       sk_add_node(sk, &net->packet.sklist);
+       sock_prot_inuse_add(net, &packet_proto, 1);
+       write_unlock_bh(&net->packet.sklist_lock);
        return(0);
 out:
        return err;
@@ -1146,7 +1179,8 @@ static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                aux.tp_len = PACKET_SKB_CB(skb)->origlen;
                aux.tp_snaplen = skb->len;
                aux.tp_mac = 0;
-               aux.tp_net = skb->nh.raw - skb->data;
+               aux.tp_net = skb_network_offset(skb);
+               aux.tp_vlan_tci = skb->vlan_tci;
 
                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
        }
@@ -1163,7 +1197,6 @@ out:
        return err;
 }
 
-#ifdef CONFIG_SOCK_PACKET
 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                               int *uaddr_len, int peer)
 {
@@ -1174,7 +1207,7 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                return -EOPNOTSUPP;
 
        uaddr->sa_family = AF_PACKET;
-       dev = dev_get_by_index(pkt_sk(sk)->ifindex);
+       dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
        if (dev) {
                strlcpy(uaddr->sa_data, dev->name, 15);
                dev_put(dev);
@@ -1184,7 +1217,6 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
 
        return 0;
 }
-#endif
 
 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
                          int *uaddr_len, int peer)
@@ -1200,7 +1232,7 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
        sll->sll_family = AF_PACKET;
        sll->sll_ifindex = po->ifindex;
        sll->sll_protocol = po->num;
-       dev = dev_get_by_index(po->ifindex);
+       dev = dev_get_by_index(sock_net(sk), po->ifindex);
        if (dev) {
                sll->sll_hatype = dev->type;
                sll->sll_halen = dev->addr_len;
@@ -1215,8 +1247,8 @@ static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
        return 0;
 }
 
-#ifdef CONFIG_PACKET_MULTICAST
-static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
+static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
+                        int what)
 {
        switch (i->type) {
        case PACKET_MR_MULTICAST:
@@ -1226,13 +1258,14 @@ static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int w
                        dev_mc_delete(dev, i->addr, i->alen, 0);
                break;
        case PACKET_MR_PROMISC:
-               dev_set_promiscuity(dev, what);
+               return dev_set_promiscuity(dev, what);
                break;
        case PACKET_MR_ALLMULTI:
-               dev_set_allmulti(dev, what);
+               return dev_set_allmulti(dev, what);
                break;
        default:;
        }
+       return 0;
 }
 
 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
@@ -1253,7 +1286,7 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
        rtnl_lock();
 
        err = -ENODEV;
-       dev = __dev_get_by_index(mreq->mr_ifindex);
+       dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
        if (!dev)
                goto done;
 
@@ -1286,7 +1319,11 @@ static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
        i->count = 1;
        i->next = po->mclist;
        po->mclist = i;
-       packet_dev_mc(dev, i, +1);
+       err = packet_dev_mc(dev, i, 1);
+       if (err) {
+               po->mclist = i->next;
+               kfree(i);
+       }
 
 done:
        rtnl_unlock();
@@ -1307,7 +1344,7 @@ static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
                        if (--ml->count == 0) {
                                struct net_device *dev;
                                *mlp = ml->next;
-                               dev = dev_get_by_index(ml->ifindex);
+                               dev = dev_get_by_index(sock_net(sk), ml->ifindex);
                                if (dev) {
                                        packet_dev_mc(dev, ml, -1);
                                        dev_put(dev);
@@ -1335,7 +1372,7 @@ static void packet_flush_mclist(struct sock *sk)
                struct net_device *dev;
 
                po->mclist = ml->next;
-               if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
+               if ((dev = dev_get_by_index(sock_net(sk), ml->ifindex)) != NULL) {
                        packet_dev_mc(dev, ml, -1);
                        dev_put(dev);
                }
@@ -1343,7 +1380,6 @@ static void packet_flush_mclist(struct sock *sk)
        }
        rtnl_unlock();
 }
-#endif
 
 static int
 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
@@ -1356,7 +1392,6 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
                return -ENOPROTOOPT;
 
        switch(optname) {
-#ifdef CONFIG_PACKET_MULTICAST
        case PACKET_ADD_MEMBERSHIP:
        case PACKET_DROP_MEMBERSHIP:
        {
@@ -1377,7 +1412,7 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
                        ret = packet_mc_drop(sk, &mreq);
                return ret;
        }
-#endif
+
 #ifdef CONFIG_PACKET_MMAP
        case PACKET_RX_RING:
        {
@@ -1401,6 +1436,38 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
                pkt_sk(sk)->copy_thresh = val;
                return 0;
        }
+       case PACKET_VERSION:
+       {
+               int val;
+
+               if (optlen != sizeof(val))
+                       return -EINVAL;
+               if (po->pg_vec)
+                       return -EBUSY;
+               if (copy_from_user(&val, optval, sizeof(val)))
+                       return -EFAULT;
+               switch (val) {
+               case TPACKET_V1:
+               case TPACKET_V2:
+                       po->tp_version = val;
+                       return 0;
+               default:
+                       return -EINVAL;
+               }
+       }
+       case PACKET_RESERVE:
+       {
+               unsigned int val;
+
+               if (optlen != sizeof(val))
+                       return -EINVAL;
+               if (po->pg_vec)
+                       return -EBUSY;
+               if (copy_from_user(&val, optval, sizeof(val)))
+                       return -EFAULT;
+               po->tp_reserve = val;
+               return 0;
+       }
 #endif
        case PACKET_AUXDATA:
        {
@@ -1414,6 +1481,18 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
                po->auxdata = !!val;
                return 0;
        }
+       case PACKET_ORIGDEV:
+       {
+               int val;
+
+               if (optlen < sizeof(val))
+                       return -EINVAL;
+               if (copy_from_user(&val, optval, sizeof(val)))
+                       return -EFAULT;
+
+               po->origdev = !!val;
+               return 0;
+       }
        default:
                return -ENOPROTOOPT;
        }
@@ -1457,6 +1536,47 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
 
                data = &val;
                break;
+       case PACKET_ORIGDEV:
+               if (len > sizeof(int))
+                       len = sizeof(int);
+               val = po->origdev;
+
+               data = &val;
+               break;
+#ifdef CONFIG_PACKET_MMAP
+       case PACKET_VERSION:
+               if (len > sizeof(int))
+                       len = sizeof(int);
+               val = po->tp_version;
+               data = &val;
+               break;
+       case PACKET_HDRLEN:
+               if (len > sizeof(int))
+                       len = sizeof(int);
+               /* val is read from user space here: a short buffer would leave it partly uninitialised */
+               if (len < sizeof(int))
+                       return -EINVAL;
+               if (copy_from_user(&val, optval, len))
+                       return -EFAULT;
+               switch (val) {
+               case TPACKET_V1:
+                       val = sizeof(struct tpacket_hdr);
+                       break;
+               case TPACKET_V2:
+                       val = sizeof(struct tpacket2_hdr);
+                       break;
+               default:
+                       return -EINVAL;
+               }
+               data = &val;
+               break;
+       case PACKET_RESERVE:
+               if (len > sizeof(unsigned int))
+                       len = sizeof(unsigned int);
+               val = po->tp_reserve;
+               data = &val;
+               break;
+#endif
        default:
                return -ENOPROTOOPT;
        }
@@ -1473,19 +1590,19 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void
 {
        struct sock *sk;
        struct hlist_node *node;
-       struct net_device *dev = (struct net_device*)data;
+       struct net_device *dev = data;
+       struct net *net = dev_net(dev);
 
-       read_lock(&packet_sklist_lock);
-       sk_for_each(sk, node, &packet_sklist) {
+       read_lock(&net->packet.sklist_lock);
+       sk_for_each(sk, node, &net->packet.sklist) {
                struct packet_sock *po = pkt_sk(sk);
 
                switch (msg) {
                case NETDEV_UNREGISTER:
-#ifdef CONFIG_PACKET_MULTICAST
                        if (po->mclist)
                                packet_dev_mclist(dev, po->mclist, -1);
-                       // fallthrough
-#endif
+                       /* fallthrough */
+
                case NETDEV_DOWN:
                        if (dev->ifindex == po->ifindex) {
                                spin_lock(&po->bind_lock);
@@ -1516,7 +1633,7 @@ static int packet_notifier(struct notifier_block *this, unsigned long msg, void
                        break;
                }
        }
-       read_unlock(&packet_sklist_lock);
+       read_unlock(&net->packet.sklist_lock);
        return NOTIFY_DONE;
 }
 
@@ -1546,6 +1663,8 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
                }
                case SIOCGSTAMP:
                        return sock_get_timestamp(sk, (struct timeval __user *)arg);
+               case SIOCGSTAMPNS:
+                       return sock_get_timestampns(sk, (struct timespec __user *)arg);
 
 #ifdef CONFIG_INET
                case SIOCADDRT:
@@ -1562,6 +1681,8 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd,
                case SIOCGIFDSTADDR:
                case SIOCSIFDSTADDR:
                case SIOCSIFFLAGS:
+                       if (!net_eq(sock_net(sk), &init_net))
+                               return -ENOIOCTLCMD;
                        return inet_dgram_ops.ioctl(sock, cmd, arg);
 #endif
 
@@ -1586,11 +1707,8 @@ static unsigned int packet_poll(struct file * file, struct socket *sock,
        spin_lock_bh(&sk->sk_receive_queue.lock);
        if (po->pg_vec) {
                unsigned last = po->head ? po->head-1 : po->frame_max;
-               struct tpacket_hdr *h;
-
-               h = (struct tpacket_hdr *)packet_lookup_frame(po, last);
 
-               if (h->tp_status)
+               if (packet_lookup_frame(po, last, TP_STATUS_USER))
                        mask |= POLLIN | POLLRDNORM;
        }
        spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -1627,11 +1745,6 @@ static struct vm_operations_struct packet_mmap_ops = {
        .close =packet_mm_close,
 };
 
-static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
-{
-       return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
-}
-
 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
 {
        int i;
@@ -1645,8 +1758,9 @@ static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
 
 static inline char *alloc_one_pg_vec_page(unsigned long order)
 {
-       return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
-                                        order);
+       gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
+
+       return (char *) __get_free_pages(gfp_flags, order);
 }
 
 static char **alloc_pg_vec(struct tpacket_req *req, int order)
@@ -1683,18 +1797,28 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
        int err = 0;
 
        if (req->tp_block_nr) {
-               int i, l;
+               int i;
 
                /* Sanity tests and some calculations */
 
                if (unlikely(po->pg_vec))
                        return -EBUSY;
 
+               switch (po->tp_version) {
+               case TPACKET_V1:
+                       po->tp_hdrlen = TPACKET_HDRLEN;
+                       break;
+               case TPACKET_V2:
+                       po->tp_hdrlen = TPACKET2_HDRLEN;
+                       break;
+               }
+
                if (unlikely((int)req->tp_block_size <= 0))
                        return -EINVAL;
                if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
                        return -EINVAL;
-               if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
+               if (unlikely(req->tp_frame_size < po->tp_hdrlen +
+                                                 po->tp_reserve))
                        return -EINVAL;
                if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
                        return -EINVAL;
@@ -1712,15 +1836,12 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
                if (unlikely(!pg_vec))
                        goto out;
 
-               l = 0;
                for (i = 0; i < req->tp_block_nr; i++) {
-                       char *ptr = pg_vec[i];
-                       struct tpacket_hdr *header;
+                       void *ptr = pg_vec[i];
                        int k;
 
                        for (k = 0; k < po->frames_per_block; k++) {
-                               header = (struct tpacket_hdr *) ptr;
-                               header->tp_status = TP_STATUS_KERNEL;
+                               __packet_set_status(po, ptr, TP_STATUS_KERNEL);
                                ptr += req->tp_frame_size;
                        }
                }
@@ -1747,6 +1868,7 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
        synchronize_net();
 
        err = -EBUSY;
+       mutex_lock(&po->pg_vec_lock);
        if (closing || atomic_read(&po->mapped) == 0) {
                err = 0;
 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
@@ -1768,6 +1890,7 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
                if (atomic_read(&po->mapped))
                        printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
        }
+       mutex_unlock(&po->pg_vec_lock);
 
        spin_lock(&po->bind_lock);
        if (was_running && !po->running) {
@@ -1800,7 +1923,7 @@ static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_st
 
        size = vma->vm_end - vma->vm_start;
 
-       lock_sock(sk);
+       mutex_lock(&po->pg_vec_lock);
        if (po->pg_vec == NULL)
                goto out;
        if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
@@ -1823,13 +1946,12 @@ static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_st
        err = 0;
 
 out:
-       release_sock(sk);
+       mutex_unlock(&po->pg_vec_lock);
        return err;
 }
 #endif
 
 
-#ifdef CONFIG_SOCK_PACKET
 static const struct proto_ops packet_ops_spkt = {
        .family =       PF_PACKET,
        .owner =        THIS_MODULE,
@@ -1850,7 +1972,6 @@ static const struct proto_ops packet_ops_spkt = {
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
 };
-#endif
 
 static const struct proto_ops packet_ops = {
        .family =       PF_PACKET,
@@ -1884,12 +2005,12 @@ static struct notifier_block packet_netdev_notifier = {
 };
 
 #ifdef CONFIG_PROC_FS
-static inline struct sock *packet_seq_idx(loff_t off)
+static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
 {
        struct sock *s;
        struct hlist_node *node;
 
-       sk_for_each(s, node, &packet_sklist) {
+       sk_for_each(s, node, &net->packet.sklist) {
                if (!off--)
                        return s;
        }
@@ -1897,22 +2018,27 @@ static inline struct sock *packet_seq_idx(loff_t off)
 }
 
 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
+       __acquires(seq_file_net(seq)->packet.sklist_lock)
 {
-       read_lock(&packet_sklist_lock);
-       return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
+       struct net *net = seq_file_net(seq);
+       read_lock(&net->packet.sklist_lock);
+       return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
 }
 
 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+       struct net *net = seq_file_net(seq);
        ++*pos;
        return  (v == SEQ_START_TOKEN)
-               ? sk_head(&packet_sklist)
+               ? sk_head(&net->packet.sklist)
                : sk_next((struct sock*)v) ;
 }
 
 static void packet_seq_stop(struct seq_file *seq, void *v)
+       __releases(seq_file_net(seq)->packet.sklist_lock)
 {
-       read_unlock(&packet_sklist_lock);
+       struct net *net = seq_file_net(seq);
+       read_unlock(&net->packet.sklist_lock);
 }
 
 static int packet_seq_show(struct seq_file *seq, void *v)
@@ -1939,7 +2065,7 @@ static int packet_seq_show(struct seq_file *seq, void *v)
        return 0;
 }
 
-static struct seq_operations packet_seq_ops = {
+static const struct seq_operations packet_seq_ops = {
        .start  = packet_seq_start,
        .next   = packet_seq_next,
        .stop   = packet_seq_stop,
@@ -1948,7 +2074,8 @@ static struct seq_operations packet_seq_ops = {
 
 static int packet_seq_open(struct inode *inode, struct file *file)
 {
-       return seq_open(file, &packet_seq_ops);
+       return seq_open_net(inode, file, &packet_seq_ops,
+                           sizeof(struct seq_net_private));
 }
 
 static const struct file_operations packet_seq_fops = {
@@ -1956,15 +2083,37 @@ static const struct file_operations packet_seq_fops = {
        .open           = packet_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
-       .release        = seq_release,
+       .release        = seq_release_net,
 };
 
 #endif
 
+static int packet_net_init(struct net *net)
+{
+       rwlock_init(&net->packet.sklist_lock);
+       INIT_HLIST_HEAD(&net->packet.sklist);
+
+       if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
+               return -ENOMEM;
+
+       return 0;
+}
+
+static void packet_net_exit(struct net *net)
+{
+       proc_net_remove(net, "packet");
+}
+
+static struct pernet_operations packet_net_ops = {
+       .init = packet_net_init,
+       .exit = packet_net_exit,
+};
+
+
 static void __exit packet_exit(void)
 {
-       proc_net_remove("packet");
        unregister_netdevice_notifier(&packet_netdev_notifier);
+       unregister_pernet_subsys(&packet_net_ops);
        sock_unregister(PF_PACKET);
        proto_unregister(&packet_proto);
 }
@@ -1977,8 +2126,8 @@ static int __init packet_init(void)
                goto out;
 
        sock_register(&packet_family_ops);
+       register_pernet_subsys(&packet_net_ops);
        register_netdevice_notifier(&packet_netdev_notifier);
-       proc_net_fops_create("packet", 0, &packet_seq_fops);
 out:
        return rc;
 }