remove final fastcall users
diff --git a/net/core/sock.c b/net/core/sock.c
index 96e00b0..09cb3a7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -34,7 +34,7 @@
  *             Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
  *                                     was buggy. Put a remove_sock() in the handler
  *                                     for memory when we hit 0. Also altered the timer
- *                                     code. The ACK stuff can wait and needs major 
+ *                                     code. The ACK stuff can wait and needs major
  *                                     TCP layer surgery.
  *             Alan Cox        :       Fixed TCP ack bug, removed remove sock
  *                                     and fixed timer/inet_bh race.
@@ -91,7 +91,7 @@
  *             2 of the License, or (at your option) any later version.
  */
 
-#include <linux/config.h>
+#include <linux/capability.h>
 #include <linux/errno.h>
 #include <linux/types.h>
 #include <linux/socket.h>
 #include <linux/poll.h>
 #include <linux/tcp.h>
 #include <linux/init.h>
+#include <linux/highmem.h>
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/netdevice.h>
 #include <net/protocol.h>
 #include <linux/skbuff.h>
+#include <net/net_namespace.h>
+#include <net/request_sock.h>
 #include <net/sock.h>
 #include <net/xfrm.h>
 #include <linux/ipsec.h>
 #include <net/tcp.h>
 #endif
 
+/*
+ * Each address family might have different locking rules, so we have
+ * one slock key per address family:
+ */
+static struct lock_class_key af_family_keys[AF_MAX];
+static struct lock_class_key af_family_slock_keys[AF_MAX];
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Make lock validator output more readable. (we pre-construct these
+ * strings at build time, so that runtime initialization of socket
+ * locks is fast):
+ */
+static const char *af_family_key_strings[AF_MAX+1] = {
+  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
+  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
+  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
+  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
+  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
+  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
+  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
+  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
+  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
+  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
+  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
+  "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
+};
+static const char *af_family_slock_key_strings[AF_MAX+1] = {
+  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
+  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
+  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
+  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
+  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
+  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
+  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
+  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
+  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
+  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
+  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
+  "slock-AF_RXRPC" , "slock-AF_MAX"
+};
+static const char *af_family_clock_key_strings[AF_MAX+1] = {
+  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
+  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
+  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
+  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
+  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
+  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
+  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
+  "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
+  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
+  "clock-27"       , "clock-28"          , "clock-29"          ,
+  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
+  "clock-AF_RXRPC" , "clock-AF_MAX"
+};
+#endif
+
+/*
+ * sk_callback_lock locking rules are per-address-family,
+ * so split the lock classes by using a per-AF key:
+ */
+static struct lock_class_key af_callback_keys[AF_MAX];
+
 /* Take into consideration the size of the struct sk_buff overhead in the
  * determination of these values, since that is non-constant across
  * platforms.  This makes socket queueing behavior and performance
 #define SK_RMEM_MAX            (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
 
 /* Run time adjustable parameters. */
-__u32 sysctl_wmem_max = SK_WMEM_MAX;
-__u32 sysctl_rmem_max = SK_RMEM_MAX;
-__u32 sysctl_wmem_default = SK_WMEM_MAX;
-__u32 sysctl_rmem_default = SK_RMEM_MAX;
+__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
 
 /* Maximal space eaten by iovec or ancillary data plus some space */
-int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
+int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
 
 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 {
@@ -155,7 +221,20 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
                return -EINVAL;
        if (copy_from_user(&tv, optval, sizeof(tv)))
                return -EFAULT;
-
+       if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
+               return -EDOM;
+
+       if (tv.tv_sec < 0) {
+               static int warned __read_mostly;
+
+               *timeo_p = 0;
+               if (warned < 10 && net_ratelimit()) {
+                       warned++;
+                       printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
+                              "tries to set negative timeout\n",
+                              current->comm, task_pid_nr(current));
+               }
+               return 0;
+       }
        *timeo_p = MAX_SCHEDULE_TIMEOUT;
        if (tv.tv_sec == 0 && tv.tv_usec == 0)
                return 0;
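
For reference, this sanitized path is what userspace reaches through SO_RCVTIMEO and SO_SNDTIMEO. A minimal sketch of a caller (assuming fd is an already open socket descriptor):

    #include <stdio.h>
    #include <sys/socket.h>
    #include <sys/time.h>

    /* Set a 5 second receive timeout; tv_sec < 0 now yields an
     * immediate timeout, and an out-of-range tv_usec returns EDOM. */
    static int set_rcv_timeout(int fd)
    {
            struct timeval tv = { .tv_sec = 5, .tv_usec = 0 };

            if (setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv)) < 0) {
                    perror("SO_RCVTIMEO");
                    return -1;
            }
            return 0;
    }
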
@@ -168,8 +247,8 @@ static void sock_warn_obsolete_bsdism(const char *name)
 {
        static int warned;
        static char warncomm[TASK_COMM_LEN];
-       if (strcmp(warncomm, current->comm) && warned < 5) { 
-               strcpy(warncomm,  current->comm); 
+       if (strcmp(warncomm, current->comm) && warned < 5) {
+               strcpy(warncomm,  current->comm);
                printk(KERN_WARNING "process `%s' is using obsolete "
                       "%s SO_BSDCOMPAT\n", warncomm, name);
                warned++;
@@ -177,14 +256,182 @@ static void sock_warn_obsolete_bsdism(const char *name)
 }
 
 static void sock_disable_timestamp(struct sock *sk)
-{      
-       if (sock_flag(sk, SOCK_TIMESTAMP)) { 
+{
+       if (sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_reset_flag(sk, SOCK_TIMESTAMP);
                net_disable_timestamp();
        }
 }
 
 
+int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+       int err = 0;
+       int skb_len;
+
+       /* Cast sk->sk_rcvbuf to unsigned... It's pointless, but reduces
+          number of warnings when compiling with -W --ANK
+        */
+       if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
+           (unsigned)sk->sk_rcvbuf) {
+               err = -ENOMEM;
+               goto out;
+       }
+
+       err = sk_filter(sk, skb);
+       if (err)
+               goto out;
+
+       if (!sk_rmem_schedule(sk, skb->truesize)) {
+               err = -ENOBUFS;
+               goto out;
+       }
+
+       skb->dev = NULL;
+       skb_set_owner_r(skb, sk);
+
+       /* Cache the SKB length before we tack it onto the receive
+        * queue.  Once it is added it no longer belongs to us and
+        * may be freed by other threads of control pulling packets
+        * from the queue.
+        */
+       skb_len = skb->len;
+
+       skb_queue_tail(&sk->sk_receive_queue, skb);
+
+       if (!sock_flag(sk, SOCK_DEAD))
+               sk->sk_data_ready(sk, skb_len);
+out:
+       return err;
+}
+EXPORT_SYMBOL(sock_queue_rcv_skb);
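
To make the call contract concrete, here is a sketch of how a protocol's input path would hand a received skb to an already looked-up socket; the fragment is illustrative, not part of this patch:

    /* Sketch: deliver a received skb to the socket layer.
     * sock_queue_rcv_skb() charges the receive buffer, runs the
     * attached socket filter and wakes readers via sk_data_ready();
     * on error the skb still belongs to the caller. */
    if (sock_queue_rcv_skb(sk, skb) < 0) {
            kfree_skb(skb);
            return NET_RX_DROP;
    }
    return NET_RX_SUCCESS;
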
+
+int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
+{
+       int rc = NET_RX_SUCCESS;
+
+       if (sk_filter(sk, skb))
+               goto discard_and_relse;
+
+       skb->dev = NULL;
+
+       if (nested)
+               bh_lock_sock_nested(sk);
+       else
+               bh_lock_sock(sk);
+       if (!sock_owned_by_user(sk)) {
+               /*
+                * trylock + unlock semantics:
+                */
+               mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
+
+               rc = sk->sk_backlog_rcv(sk, skb);
+
+               mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+       } else
+               sk_add_backlog(sk, skb);
+       bh_unlock_sock(sk);
+out:
+       sock_put(sk);
+       return rc;
+discard_and_relse:
+       kfree_skb(skb);
+       goto out;
+}
+EXPORT_SYMBOL(sk_receive_skb);
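
sk_receive_skb() ends with an unconditional sock_put(), so it consumes the reference the caller's lookup took; the nested flag selects bh_lock_sock_nested() for reentrant (e.g. tunneled) delivery. A sketch of the calling convention, with my_lookup as a hypothetical ref-taking lookup:

    /* Sketch: sk arrives with a reference held by the lookup;
     * sk_receive_skb() drops it whether the packet is consumed,
     * backlogged or discarded by the socket filter. */
    struct sock *sk = my_lookup(skb);          /* hypothetical, takes a ref */

    if (sk == NULL) {
            kfree_skb(skb);
            return NET_RX_DROP;
    }
    return sk_receive_skb(sk, skb, 0);
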
+
+struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
+{
+       struct dst_entry *dst = sk->sk_dst_cache;
+
+       if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+               sk->sk_dst_cache = NULL;
+               dst_release(dst);
+               return NULL;
+       }
+
+       return dst;
+}
+EXPORT_SYMBOL(__sk_dst_check);
+
+struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
+{
+       struct dst_entry *dst = sk_dst_get(sk);
+
+       if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
+               sk_dst_reset(sk);
+               dst_release(dst);
+               return NULL;
+       }
+
+       return dst;
+}
+EXPORT_SYMBOL(sk_dst_check);
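
The two dst helpers differ only in reference handling: __sk_dst_check() must run under the socket lock and returns the bare cached pointer, while sk_dst_check() takes its own reference via sk_dst_get(). A transmit path would revalidate roughly like this (sketch; the re-route step is protocol specific):

    /* Sketch: revalidate the cached route before building a packet. */
    struct dst_entry *dst = __sk_dst_check(sk, 0);

    if (dst == NULL) {
            /* The cached entry was obsoleted; redo the protocol's
             * route lookup and install it with __sk_dst_set(). */
    }
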
+
+static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
+{
+       int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+       struct net *net = sk->sk_net;
+       char devname[IFNAMSIZ];
+       int index;
+
+       /* Sorry... */
+       ret = -EPERM;
+       if (!capable(CAP_NET_RAW))
+               goto out;
+
+       ret = -EINVAL;
+       if (optlen < 0)
+               goto out;
+
+       /* Bind this socket to a particular device like "eth0",
+        * as specified in the passed interface name. If the
+        * name is "" or the option length is zero the socket
+        * is not bound.
+        */
+       if (optlen > IFNAMSIZ - 1)
+               optlen = IFNAMSIZ - 1;
+       memset(devname, 0, sizeof(devname));
+
+       ret = -EFAULT;
+       if (copy_from_user(devname, optval, optlen))
+               goto out;
+
+       if (devname[0] == '\0') {
+               index = 0;
+       } else {
+               struct net_device *dev = dev_get_by_name(net, devname);
+
+               ret = -ENODEV;
+               if (!dev)
+                       goto out;
+
+               index = dev->ifindex;
+               dev_put(dev);
+       }
+
+       lock_sock(sk);
+       sk->sk_bound_dev_if = index;
+       sk_dst_reset(sk);
+       release_sock(sk);
+
+       ret = 0;
+
+out:
+#endif
+
+       return ret;
+}
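
From userspace the new helper is reached via SO_BINDTODEVICE. It requires CAP_NET_RAW, and an empty name (or zero option length) removes the binding; a minimal sketch:

    /* Restrict traffic on fd to one interface; pass "" to unbind.
     * The interface name here is just an example. */
    const char ifname[] = "eth0";

    if (setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
                   ifname, sizeof(ifname)) < 0)
            perror("SO_BINDTODEVICE");
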
+
+static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+{
+       if (valbool)
+               sock_set_flag(sk, bit);
+       else
+               sock_reset_flag(sk, bit);
+}
+
 /*
  *     This is meant for all protocols to use and covers goings on
  *     at the socket level. Everything here is generic.
@@ -194,258 +441,246 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int optlen)
 {
        struct sock *sk=sock->sk;
-       struct sk_filter *filter;
        int val;
        int valbool;
        struct linger ling;
        int ret = 0;
-       
+
        /*
         *      Options without arguments
         */
 
 #ifdef SO_DONTLINGER           /* Compatibility item... */
-       switch (optname) {
-               case SO_DONTLINGER:
-                       sock_reset_flag(sk, SOCK_LINGER);
-                       return 0;
+       if (optname == SO_DONTLINGER) {
+               lock_sock(sk);
+               sock_reset_flag(sk, SOCK_LINGER);
+               release_sock(sk);
+               return 0;
        }
-#endif 
-               
-       if(optlen<sizeof(int))
-               return(-EINVAL);
-       
+#endif
+
+       if (optname == SO_BINDTODEVICE)
+               return sock_bindtodevice(sk, optval, optlen);
+
+       if (optlen < sizeof(int))
+               return -EINVAL;
+
        if (get_user(val, (int __user *)optval))
                return -EFAULT;
-       
-       valbool = val?1:0;
+
+       valbool = val ? 1 : 0;
 
        lock_sock(sk);
 
-       switch(optname) 
-       {
-               case SO_DEBUG:  
-                       if(val && !capable(CAP_NET_ADMIN))
-                       {
-                               ret = -EACCES;
-                       }
-                       else if (valbool)
-                               sock_set_flag(sk, SOCK_DBG);
-                       else
-                               sock_reset_flag(sk, SOCK_DBG);
-                       break;
-               case SO_REUSEADDR:
-                       sk->sk_reuse = valbool;
-                       break;
-               case SO_TYPE:
-               case SO_ERROR:
-                       ret = -ENOPROTOOPT;
-                       break;
-               case SO_DONTROUTE:
-                       if (valbool)
-                               sock_set_flag(sk, SOCK_LOCALROUTE);
-                       else
-                               sock_reset_flag(sk, SOCK_LOCALROUTE);
-                       break;
-               case SO_BROADCAST:
-                       sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
-                       break;
-               case SO_SNDBUF:
-                       /* Don't error on this BSD doesn't and if you think
-                          about it this is right. Otherwise apps have to
-                          play 'guess the biggest size' games. RCVBUF/SNDBUF
-                          are treated in BSD as hints */
-                          
-                       if (val > sysctl_wmem_max)
-                               val = sysctl_wmem_max;
-
-                       sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
-                       if ((val * 2) < SOCK_MIN_SNDBUF)
-                               sk->sk_sndbuf = SOCK_MIN_SNDBUF;
-                       else
-                               sk->sk_sndbuf = val * 2;
+       switch (optname) {
+       case SO_DEBUG:
+               if (val && !capable(CAP_NET_ADMIN)) {
+                       ret = -EACCES;
+               } else
+                       sock_valbool_flag(sk, SOCK_DBG, valbool);
+               break;
+       case SO_REUSEADDR:
+               sk->sk_reuse = valbool;
+               break;
+       case SO_TYPE:
+       case SO_ERROR:
+               ret = -ENOPROTOOPT;
+               break;
+       case SO_DONTROUTE:
+               sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
+               break;
+       case SO_BROADCAST:
+               sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
+               break;
+       case SO_SNDBUF:
+               /* Don't error on this; BSD doesn't, and if you think
+                  about it this is right. Otherwise apps have to
+                  play 'guess the biggest size' games. RCVBUF/SNDBUF
+                  are treated in BSD as hints */
+
+               if (val > sysctl_wmem_max)
+                       val = sysctl_wmem_max;
+set_sndbuf:
+               sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+               if ((val * 2) < SOCK_MIN_SNDBUF)
+                       sk->sk_sndbuf = SOCK_MIN_SNDBUF;
+               else
+                       sk->sk_sndbuf = val * 2;
+
+               /*
+                *      Wake up sending tasks if we
+                *      upped the value.
+                */
+               sk->sk_write_space(sk);
+               break;
 
-                       /*
-                        *      Wake up sending tasks if we
-                        *      upped the value.
-                        */
-                       sk->sk_write_space(sk);
+       case SO_SNDBUFFORCE:
+               if (!capable(CAP_NET_ADMIN)) {
+                       ret = -EPERM;
                        break;
-
-               case SO_RCVBUF:
-                       /* Don't error on this BSD doesn't and if you think
-                          about it this is right. Otherwise apps have to
-                          play 'guess the biggest size' games. RCVBUF/SNDBUF
-                          are treated in BSD as hints */
-                         
-                       if (val > sysctl_rmem_max)
-                               val = sysctl_rmem_max;
-
-                       sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
-                       /* FIXME: is this lower bound the right one? */
-                       if ((val * 2) < SOCK_MIN_RCVBUF)
-                               sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
-                       else
-                               sk->sk_rcvbuf = val * 2;
+               }
+               goto set_sndbuf;
+
+       case SO_RCVBUF:
+               /* Don't error on this; BSD doesn't, and if you think
+                  about it this is right. Otherwise apps have to
+                  play 'guess the biggest size' games. RCVBUF/SNDBUF
+                  are treated in BSD as hints */
+
+               if (val > sysctl_rmem_max)
+                       val = sysctl_rmem_max;
+set_rcvbuf:
+               sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+               /*
+                * We double it on the way in to account for
+                * "struct sk_buff" etc. overhead.   Applications
+                * assume that the SO_RCVBUF setting they make will
+                * allow that much actual data to be received on that
+                * socket.
+                *
+                * Applications are unaware that "struct sk_buff" and
+                * other overheads allocate from the receive buffer
+                * during socket buffer allocation.
+                *
+                * And after considering the possible alternatives,
+                * returning the value we actually used in getsockopt
+                * is the most desirable behavior.
+                */
+               if ((val * 2) < SOCK_MIN_RCVBUF)
+                       sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
+               else
+                       sk->sk_rcvbuf = val * 2;
+               break;
+
+       case SO_RCVBUFFORCE:
+               if (!capable(CAP_NET_ADMIN)) {
+                       ret = -EPERM;
                        break;
+               }
+               goto set_rcvbuf;
 
-               case SO_KEEPALIVE:
+       case SO_KEEPALIVE:
 #ifdef CONFIG_INET
-                       if (sk->sk_protocol == IPPROTO_TCP)
-                               tcp_set_keepalive(sk, valbool);
+               if (sk->sk_protocol == IPPROTO_TCP)
+                       tcp_set_keepalive(sk, valbool);
 #endif
-                       sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
+               sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
+               break;
+
+       case SO_OOBINLINE:
+               sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
+               break;
+
+       case SO_NO_CHECK:
+               sk->sk_no_check = valbool;
+               break;
+
+       case SO_PRIORITY:
+               if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
+                       sk->sk_priority = val;
+               else
+                       ret = -EPERM;
+               break;
+
+       case SO_LINGER:
+               if (optlen < sizeof(ling)) {
+                       ret = -EINVAL;  /* 1003.1g */
                        break;
-
-               case SO_OOBINLINE:
-                       sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
-                       break;
-
-               case SO_NO_CHECK:
-                       sk->sk_no_check = valbool;
-                       break;
-
-               case SO_PRIORITY:
-                       if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) 
-                               sk->sk_priority = val;
-                       else
-                               ret = -EPERM;
+               }
+               if (copy_from_user(&ling, optval, sizeof(ling))) {
+                       ret = -EFAULT;
                        break;
-
-               case SO_LINGER:
-                       if(optlen<sizeof(ling)) {
-                               ret = -EINVAL;  /* 1003.1g */
-                               break;
-                       }
-                       if (copy_from_user(&ling,optval,sizeof(ling))) {
-                               ret = -EFAULT;
-                               break;
-                       }
-                       if (!ling.l_onoff)
-                               sock_reset_flag(sk, SOCK_LINGER);
-                       else {
+               }
+               if (!ling.l_onoff)
+                       sock_reset_flag(sk, SOCK_LINGER);
+               else {
 #if (BITS_PER_LONG == 32)
-                               if (ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
-                                       sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
-                               else
+                       if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
+                               sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+                       else
 #endif
-                                       sk->sk_lingertime = ling.l_linger * HZ;
-                               sock_set_flag(sk, SOCK_LINGER);
-                       }
-                       break;
-
-               case SO_BSDCOMPAT:
-                       sock_warn_obsolete_bsdism("setsockopt");
-                       break;
-
-               case SO_PASSCRED:
-                       if (valbool)
-                               set_bit(SOCK_PASSCRED, &sock->flags);
+                               sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
+                       sock_set_flag(sk, SOCK_LINGER);
+               }
+               break;
+
+       case SO_BSDCOMPAT:
+               sock_warn_obsolete_bsdism("setsockopt");
+               break;
+
+       case SO_PASSCRED:
+               if (valbool)
+                       set_bit(SOCK_PASSCRED, &sock->flags);
+               else
+                       clear_bit(SOCK_PASSCRED, &sock->flags);
+               break;
+
+       case SO_TIMESTAMP:
+       case SO_TIMESTAMPNS:
+               if (valbool)  {
+                       if (optname == SO_TIMESTAMP)
+                               sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                        else
-                               clear_bit(SOCK_PASSCRED, &sock->flags);
-                       break;
+                               sock_set_flag(sk, SOCK_RCVTSTAMPNS);
+                       sock_set_flag(sk, SOCK_RCVTSTAMP);
+                       sock_enable_timestamp(sk);
+               } else {
+                       sock_reset_flag(sk, SOCK_RCVTSTAMP);
+                       sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+               }
+               break;
 
-               case SO_TIMESTAMP:
-                       if (valbool)  {
-                               sock_set_flag(sk, SOCK_RCVTSTAMP);
-                               sock_enable_timestamp(sk);
-                       } else
-                               sock_reset_flag(sk, SOCK_RCVTSTAMP);
-                       break;
+       case SO_RCVLOWAT:
+               if (val < 0)
+                       val = INT_MAX;
+               sk->sk_rcvlowat = val ? : 1;
+               break;
 
-               case SO_RCVLOWAT:
-                       if (val < 0)
-                               val = INT_MAX;
-                       sk->sk_rcvlowat = val ? : 1;
-                       break;
+       case SO_RCVTIMEO:
+               ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
+               break;
 
-               case SO_RCVTIMEO:
-                       ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
-                       break;
+       case SO_SNDTIMEO:
+               ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
+               break;
 
-               case SO_SNDTIMEO:
-                       ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
-                       break;
-
-#ifdef CONFIG_NETDEVICES
-               case SO_BINDTODEVICE:
-               {
-                       char devname[IFNAMSIZ]; 
+       case SO_ATTACH_FILTER:
+               ret = -EINVAL;
+               if (optlen == sizeof(struct sock_fprog)) {
+                       struct sock_fprog fprog;
 
-                       /* Sorry... */ 
-                       if (!capable(CAP_NET_RAW)) {
-                               ret = -EPERM;
+                       ret = -EFAULT;
+                       if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                break;
-                       }
-
-                       /* Bind this socket to a particular device like "eth0",
-                        * as specified in the passed interface name. If the
-                        * name is "" or the option length is zero the socket 
-                        * is not bound. 
-                        */ 
-
-                       if (!valbool) {
-                               sk->sk_bound_dev_if = 0;
-                       } else {
-                               if (optlen > IFNAMSIZ) 
-                                       optlen = IFNAMSIZ; 
-                               if (copy_from_user(devname, optval, optlen)) {
-                                       ret = -EFAULT;
-                                       break;
-                               }
-
-                               /* Remove any cached route for this socket. */
-                               sk_dst_reset(sk);
 
-                               if (devname[0] == '\0') {
-                                       sk->sk_bound_dev_if = 0;
-                               } else {
-                                       struct net_device *dev = dev_get_by_name(devname);
-                                       if (!dev) {
-                                               ret = -ENODEV;
-                                               break;
-                                       }
-                                       sk->sk_bound_dev_if = dev->ifindex;
-                                       dev_put(dev);
-                               }
-                       }
-                       break;
+                       ret = sk_attach_filter(&fprog, sk);
                }
-#endif
-
-
-               case SO_ATTACH_FILTER:
-                       ret = -EINVAL;
-                       if (optlen == sizeof(struct sock_fprog)) {
-                               struct sock_fprog fprog;
-
-                               ret = -EFAULT;
-                               if (copy_from_user(&fprog, optval, sizeof(fprog)))
-                                       break;
-
-                               ret = sk_attach_filter(&fprog, sk);
-                       }
-                       break;
-
-               case SO_DETACH_FILTER:
-                       spin_lock_bh(&sk->sk_lock.slock);
-                       filter = sk->sk_filter;
-                        if (filter) {
-                               sk->sk_filter = NULL;
-                               spin_unlock_bh(&sk->sk_lock.slock);
-                               sk_filter_release(sk, filter);
-                               break;
-                       }
-                       spin_unlock_bh(&sk->sk_lock.slock);
-                       ret = -ENONET;
-                       break;
+               break;
+
+       case SO_DETACH_FILTER:
+               ret = sk_detach_filter(sk);
+               break;
+
+       case SO_PASSSEC:
+               if (valbool)
+                       set_bit(SOCK_PASSSEC, &sock->flags);
+               else
+                       clear_bit(SOCK_PASSSEC, &sock->flags);
+               break;
+       case SO_MARK:
+               if (!capable(CAP_NET_ADMIN))
+                       ret = -EPERM;
+               else
+                       sk->sk_mark = val;
+               break;
 
                /* We implement the SO_SNDLOWAT etc to
                   not be settable (1003.1g 5.3) */
-               default:
-                       ret = -ENOPROTOOPT;
-                       break;
-       }
+       default:
+               ret = -ENOPROTOOPT;
+               break;
+       }
        release_sock(sk);
        return ret;
 }
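
The doubling that the SO_RCVBUF comment describes is observable from userspace, because getsockopt() reports the value the kernel actually installed. A sketch (assuming the request stays below net.core.rmem_max):

    int val = 65536;
    socklen_t len = sizeof(val);

    setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, sizeof(val));
    getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &val, &len);
    /* val reads back as 131072: the request was doubled to cover
     * struct sk_buff and other per-packet overhead. */
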
@@ -455,218 +690,291 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
                    char __user *optval, int __user *optlen)
 {
        struct sock *sk = sock->sk;
-       
-       union
-       {
-               int val;
-               struct linger ling;
+
+       union {
+               int val;
+               struct linger ling;
                struct timeval tm;
        } v;
-       
+
        unsigned int lv = sizeof(int);
        int len;
-       
-       if(get_user(len,optlen))
-               return -EFAULT;
-       if(len < 0)
+
+       if (get_user(len, optlen))
+               return -EFAULT;
+       if (len < 0)
                return -EINVAL;
-               
-       switch(optname) 
-       {
-               case SO_DEBUG:          
-                       v.val = sock_flag(sk, SOCK_DBG);
-                       break;
-               
-               case SO_DONTROUTE:
-                       v.val = sock_flag(sk, SOCK_LOCALROUTE);
-                       break;
-               
-               case SO_BROADCAST:
-                       v.val = !!sock_flag(sk, SOCK_BROADCAST);
-                       break;
 
-               case SO_SNDBUF:
-                       v.val = sk->sk_sndbuf;
-                       break;
-               
-               case SO_RCVBUF:
-                       v.val = sk->sk_rcvbuf;
-                       break;
+       switch (optname) {
+       case SO_DEBUG:
+               v.val = sock_flag(sk, SOCK_DBG);
+               break;
+
+       case SO_DONTROUTE:
+               v.val = sock_flag(sk, SOCK_LOCALROUTE);
+               break;
+
+       case SO_BROADCAST:
+               v.val = !!sock_flag(sk, SOCK_BROADCAST);
+               break;
+
+       case SO_SNDBUF:
+               v.val = sk->sk_sndbuf;
+               break;
+
+       case SO_RCVBUF:
+               v.val = sk->sk_rcvbuf;
+               break;
+
+       case SO_REUSEADDR:
+               v.val = sk->sk_reuse;
+               break;
+
+       case SO_KEEPALIVE:
+               v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
+               break;
+
+       case SO_TYPE:
+               v.val = sk->sk_type;
+               break;
+
+       case SO_ERROR:
+               v.val = -sock_error(sk);
+               if (v.val == 0)
+                       v.val = xchg(&sk->sk_err_soft, 0);
+               break;
+
+       case SO_OOBINLINE:
+               v.val = !!sock_flag(sk, SOCK_URGINLINE);
+               break;
+
+       case SO_NO_CHECK:
+               v.val = sk->sk_no_check;
+               break;
+
+       case SO_PRIORITY:
+               v.val = sk->sk_priority;
+               break;
+
+       case SO_LINGER:
+               lv              = sizeof(v.ling);
+               v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
+               v.ling.l_linger = sk->sk_lingertime / HZ;
+               break;
+
+       case SO_BSDCOMPAT:
+               sock_warn_obsolete_bsdism("getsockopt");
+               break;
+
+       case SO_TIMESTAMP:
+               v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
+                               !sock_flag(sk, SOCK_RCVTSTAMPNS);
+               break;
+
+       case SO_TIMESTAMPNS:
+               v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
+               break;
+
+       case SO_RCVTIMEO:
+               lv = sizeof(struct timeval);
+               if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
+                       v.tm.tv_sec = 0;
+                       v.tm.tv_usec = 0;
+               } else {
+                       v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
+                       v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
+               }
+               break;
+
+       case SO_SNDTIMEO:
+               lv = sizeof(struct timeval);
+               if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
+                       v.tm.tv_sec = 0;
+                       v.tm.tv_usec = 0;
+               } else {
+                       v.tm.tv_sec = sk->sk_sndtimeo / HZ;
+                       v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
+               }
+               break;
 
-               case SO_REUSEADDR:
-                       v.val = sk->sk_reuse;
-                       break;
+       case SO_RCVLOWAT:
+               v.val = sk->sk_rcvlowat;
+               break;
 
-               case SO_KEEPALIVE:
-                       v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
-                       break;
+       case SO_SNDLOWAT:
+               v.val = 1;
+               break;
 
-               case SO_TYPE:
-                       v.val = sk->sk_type;                            
-                       break;
+       case SO_PASSCRED:
+               v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
+               break;
 
-               case SO_ERROR:
-                       v.val = -sock_error(sk);
-                       if(v.val==0)
-                               v.val = xchg(&sk->sk_err_soft, 0);
-                       break;
+       case SO_PEERCRED:
+               if (len > sizeof(sk->sk_peercred))
+                       len = sizeof(sk->sk_peercred);
+               if (copy_to_user(optval, &sk->sk_peercred, len))
+                       return -EFAULT;
+               goto lenout;
 
-               case SO_OOBINLINE:
-                       v.val = !!sock_flag(sk, SOCK_URGINLINE);
-                       break;
-       
-               case SO_NO_CHECK:
-                       v.val = sk->sk_no_check;
-                       break;
+       case SO_PEERNAME:
+       {
+               char address[128];
+
+               if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
+                       return -ENOTCONN;
+               if (lv < len)
+                       return -EINVAL;
+               if (copy_to_user(optval, address, len))
+                       return -EFAULT;
+               goto lenout;
+       }
 
-               case SO_PRIORITY:
-                       v.val = sk->sk_priority;
-                       break;
-               
-               case SO_LINGER: 
-                       lv              = sizeof(v.ling);
-                       v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
-                       v.ling.l_linger = sk->sk_lingertime / HZ;
-                       break;
-                                       
-               case SO_BSDCOMPAT:
-                       sock_warn_obsolete_bsdism("getsockopt");
-                       break;
+       /* Dubious BSD thing... Probably nobody even uses it, but
+        * the UNIX standard wants it for whatever reason... -DaveM
+        */
+       case SO_ACCEPTCONN:
+               v.val = sk->sk_state == TCP_LISTEN;
+               break;
 
-               case SO_TIMESTAMP:
-                       v.val = sock_flag(sk, SOCK_RCVTSTAMP);
-                       break;
+       case SO_PASSSEC:
+               v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
+               break;
 
-               case SO_RCVTIMEO:
-                       lv=sizeof(struct timeval);
-                       if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
-                               v.tm.tv_sec = 0;
-                               v.tm.tv_usec = 0;
-                       } else {
-                               v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
-                               v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
-                       }
-                       break;
+       case SO_PEERSEC:
+               return security_socket_getpeersec_stream(sock, optval, optlen, len);
 
-               case SO_SNDTIMEO:
-                       lv=sizeof(struct timeval);
-                       if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
-                               v.tm.tv_sec = 0;
-                               v.tm.tv_usec = 0;
-                       } else {
-                               v.tm.tv_sec = sk->sk_sndtimeo / HZ;
-                               v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
-                       }
-                       break;
+       case SO_MARK:
+               v.val = sk->sk_mark;
+               break;
 
-               case SO_RCVLOWAT:
-                       v.val = sk->sk_rcvlowat;
-                       break;
+       default:
+               return -ENOPROTOOPT;
+       }
+
+       if (len > lv)
+               len = lv;
+       if (copy_to_user(optval, &v, len))
+               return -EFAULT;
+lenout:
+       if (put_user(len, optlen))
+               return -EFAULT;
+       return 0;
+}
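
The SO_ERROR branch above (which also drains sk_err_soft) is the classic way to collect the result of a non-blocking connect() once poll() reports the socket writable; a sketch:

    int err = 0;
    socklen_t len = sizeof(err);

    if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) == 0 && err != 0)
            fprintf(stderr, "connect: %s\n", strerror(err));
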
 
-               case SO_SNDLOWAT:
-                       v.val=1;
-                       break; 
+/*
+ * Initialize an sk_lock.
+ *
+ * (We also register the sk_lock with the lock validator.)
+ */
+static inline void sock_lock_init(struct sock *sk)
+{
+       sock_lock_init_class_and_name(sk,
+                       af_family_slock_key_strings[sk->sk_family],
+                       af_family_slock_keys + sk->sk_family,
+                       af_family_key_strings[sk->sk_family],
+                       af_family_keys + sk->sk_family);
+}
 
-               case SO_PASSCRED:
-                       v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
-                       break;
+static void sock_copy(struct sock *nsk, const struct sock *osk)
+{
+#ifdef CONFIG_SECURITY_NETWORK
+       void *sptr = nsk->sk_security;
+#endif
 
-               case SO_PEERCRED:
-                       if (len > sizeof(sk->sk_peercred))
-                               len = sizeof(sk->sk_peercred);
-                       if (copy_to_user(optval, &sk->sk_peercred, len))
-                               return -EFAULT;
-                       goto lenout;
-
-               case SO_PEERNAME:
-               {
-                       char address[128];
-
-                       if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
-                               return -ENOTCONN;
-                       if (lv < len)
-                               return -EINVAL;
-                       if (copy_to_user(optval, address, len))
-                               return -EFAULT;
-                       goto lenout;
-               }
+       memcpy(nsk, osk, osk->sk_prot->obj_size);
+#ifdef CONFIG_SECURITY_NETWORK
+       nsk->sk_security = sptr;
+       security_sk_clone(osk, nsk);
+#endif
+}
 
-               /* Dubious BSD thing... Probably nobody even uses it, but
-                * the UNIX standard wants it for whatever reason... -DaveM
-                */
-               case SO_ACCEPTCONN:
-                       v.val = sk->sk_state == TCP_LISTEN;
-                       break;
+static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
+               int family)
+{
+       struct sock *sk;
+       struct kmem_cache *slab;
+
+       slab = prot->slab;
+       if (slab != NULL)
+               sk = kmem_cache_alloc(slab, priority);
+       else
+               sk = kmalloc(prot->obj_size, priority);
 
-               case SO_PEERSEC:
-                       return security_socket_getpeersec(sock, optval, optlen, len);
+       if (sk != NULL) {
+               if (security_sk_alloc(sk, family, priority))
+                       goto out_free;
 
-               default:
-                       return(-ENOPROTOOPT);
+               if (!try_module_get(prot->owner))
+                       goto out_free_sec;
        }
-       if (len > lv)
-               len = lv;
-       if (copy_to_user(optval, &v, len))
-               return -EFAULT;
-lenout:
-       if (put_user(len, optlen))
-               return -EFAULT;
-       return 0;
+
+       return sk;
+
+out_free_sec:
+       security_sk_free(sk);
+out_free:
+       if (slab != NULL)
+               kmem_cache_free(slab, sk);
+       else
+               kfree(sk);
+       return NULL;
+}
+
+static void sk_prot_free(struct proto *prot, struct sock *sk)
+{
+       struct kmem_cache *slab;
+       struct module *owner;
+
+       owner = prot->owner;
+       slab = prot->slab;
+
+       security_sk_free(sk);
+       if (slab != NULL)
+               kmem_cache_free(slab, sk);
+       else
+               kfree(sk);
+       module_put(owner);
 }
 
 /**
  *     sk_alloc - All socket objects are allocated here
+ *     @net: the applicable net namespace
  *     @family: protocol family
  *     @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
  *     @prot: struct proto associated with this new sock instance
- *     @zero_it: if we should zero the newly allocated sock
  */
-struct sock *sk_alloc(int family, int priority, struct proto *prot, int zero_it)
+struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
+                     struct proto *prot)
 {
-       struct sock *sk = NULL;
-       kmem_cache_t *slab = prot->slab;
-
-       if (slab != NULL)
-               sk = kmem_cache_alloc(slab, priority);
-       else
-               sk = kmalloc(prot->obj_size, priority);
+       struct sock *sk;
 
+       sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
        if (sk) {
-               if (zero_it) {
-                       memset(sk, 0, prot->obj_size);
-                       sk->sk_family = family;
-                       /*
-                        * See comment in struct sock definition to understand
-                        * why we need sk_prot_creator -acme
-                        */
-                       sk->sk_prot = sk->sk_prot_creator = prot;
-                       sock_lock_init(sk);
-               }
-               
-               if (security_sk_alloc(sk, family, priority)) {
-                       if (slab != NULL)
-                               kmem_cache_free(slab, sk);
-                       else
-                               kfree(sk);
-                       sk = NULL;
-               } else
-                       __module_get(prot->owner);
+               sk->sk_family = family;
+               /*
+                * See comment in struct sock definition to understand
+                * why we need sk_prot_creator -acme
+                */
+               sk->sk_prot = sk->sk_prot_creator = prot;
+               sock_lock_init(sk);
+               sk->sk_net = get_net(net);
        }
+
        return sk;
 }
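
With zeroing now handled by __GFP_ZERO inside sk_prot_alloc(), a protocol's ->create() hook uses the trimmed four-argument sk_alloc() like this (sketch; my_proto is a placeholder struct proto, not part of this patch):

    /* Sketch of a pf->create() fragment under the new signature. */
    struct sock *sk = sk_alloc(net, PF_INET, GFP_KERNEL, &my_proto);

    if (sk == NULL)
            return -ENOBUFS;
    sock_init_data(sock, sk);
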
 
 void sk_free(struct sock *sk)
 {
        struct sk_filter *filter;
-       struct module *owner = sk->sk_prot_creator->owner;
 
        if (sk->sk_destruct)
                sk->sk_destruct(sk);
 
-       filter = sk->sk_filter;
+       filter = rcu_dereference(sk->sk_filter);
        if (filter) {
-               sk_filter_release(sk, filter);
-               sk->sk_filter = NULL;
+               sk_filter_uncharge(sk, filter);
+               rcu_assign_pointer(sk->sk_filter, NULL);
        }
 
        sock_disable_timestamp(sk);
@@ -675,14 +983,107 @@ void sk_free(struct sock *sk)
                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
                       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
 
-       security_sk_free(sk);
-       if (sk->sk_prot_creator->slab != NULL)
-               kmem_cache_free(sk->sk_prot_creator->slab, sk);
-       else
-               kfree(sk);
-       module_put(owner);
+       put_net(sk->sk_net);
+       sk_prot_free(sk->sk_prot_creator, sk);
+}
+
+struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+{
+       struct sock *newsk;
+
+       newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
+       if (newsk != NULL) {
+               struct sk_filter *filter;
+
+               sock_copy(newsk, sk);
+
+               /* SANITY */
+               get_net(newsk->sk_net);
+               sk_node_init(&newsk->sk_node);
+               sock_lock_init(newsk);
+               bh_lock_sock(newsk);
+               newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
+
+               atomic_set(&newsk->sk_rmem_alloc, 0);
+               atomic_set(&newsk->sk_wmem_alloc, 0);
+               atomic_set(&newsk->sk_omem_alloc, 0);
+               skb_queue_head_init(&newsk->sk_receive_queue);
+               skb_queue_head_init(&newsk->sk_write_queue);
+#ifdef CONFIG_NET_DMA
+               skb_queue_head_init(&newsk->sk_async_wait_queue);
+#endif
+
+               rwlock_init(&newsk->sk_dst_lock);
+               rwlock_init(&newsk->sk_callback_lock);
+               lockdep_set_class_and_name(&newsk->sk_callback_lock,
+                               af_callback_keys + newsk->sk_family,
+                               af_family_clock_key_strings[newsk->sk_family]);
+
+               newsk->sk_dst_cache     = NULL;
+               newsk->sk_wmem_queued   = 0;
+               newsk->sk_forward_alloc = 0;
+               newsk->sk_send_head     = NULL;
+               newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+
+               sock_reset_flag(newsk, SOCK_DONE);
+               skb_queue_head_init(&newsk->sk_error_queue);
+
+               filter = newsk->sk_filter;
+               if (filter != NULL)
+                       sk_filter_charge(newsk, filter);
+
+               if (unlikely(xfrm_sk_clone_policy(newsk))) {
+                       /* It is still raw copy of parent, so invalidate
+                        * destructor and make plain sk_free() */
+                       newsk->sk_destruct = NULL;
+                       sk_free(newsk);
+                       newsk = NULL;
+                       goto out;
+               }
+
+               newsk->sk_err      = 0;
+               newsk->sk_priority = 0;
+               atomic_set(&newsk->sk_refcnt, 2);
+
+               /*
+                * Increment the counter in the same struct proto as the master
+                * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
+                * is the same as sk->sk_prot->socks, as this field was copied
+                * with memcpy).
+                *
+                * This _changes_ the previous behaviour, where
+                * tcp_create_openreq_child always was incrementing the
+                * equivalent to tcp_prot->socks (inet_sock_nr), so this have
+                * to be taken into account in all callers. -acme
+                */
+               sk_refcnt_debug_inc(newsk);
+               newsk->sk_socket = NULL;
+               newsk->sk_sleep  = NULL;
+
+               if (newsk->sk_prot->sockets_allocated)
+                       atomic_inc(newsk->sk_prot->sockets_allocated);
+       }
+out:
+       return newsk;
 }
 
+EXPORT_SYMBOL_GPL(sk_clone);
+
+void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
+{
+       __sk_dst_set(sk, dst);
+       sk->sk_route_caps = dst->dev->features;
+       if (sk->sk_route_caps & NETIF_F_GSO)
+               sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
+       if (sk_can_gso(sk)) {
+               if (dst->header_len)
+                       sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+               else
+                       sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+       }
+}
+EXPORT_SYMBOL_GPL(sk_setup_caps);
+
 void __init sk_init(void)
 {
        if (num_physpages <= 4096) {
@@ -701,8 +1102,8 @@ void __init sk_init(void)
  */
 
 
-/* 
- * Write buffer destructor automatically called from kfree_skb. 
+/*
+ * Write buffer destructor automatically called from kfree_skb.
  */
 void sock_wfree(struct sk_buff *skb)
 {
@@ -715,14 +1116,16 @@ void sock_wfree(struct sk_buff *skb)
        sock_put(sk);
 }
 
-/* 
- * Read buffer destructor automatically called from kfree_skb. 
+/*
+ * Read buffer destructor automatically called from kfree_skb.
  */
 void sock_rfree(struct sk_buff *skb)
 {
        struct sock *sk = skb->sk;
 
+       skb_truesize_check(skb);
        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+       sk_mem_uncharge(skb->sk, skb->truesize);
 }
 
 
@@ -749,7 +1152,8 @@ unsigned long sock_i_ino(struct sock *sk)
 /*
  * Allocate a skb from the socket's send buffer.
  */
-struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int priority)
+struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
+                            gfp_t priority)
 {
        if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
                struct sk_buff * skb = alloc_skb(size, priority);
@@ -763,8 +1167,9 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force, int
 
 /*
  * Allocate a skb from the socket's receive buffer.
- */ 
-struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int priority)
+ */
+struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
+                            gfp_t priority)
 {
        if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
                struct sk_buff *skb = alloc_skb(size, priority);
@@ -776,16 +1181,16 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force, int
        return NULL;
 }
 
-/* 
+/*
  * Allocate a memory block from the socket's option memory buffer.
- */ 
-void *sock_kmalloc(struct sock *sk, int size, int priority)
+ */
+void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
 {
        if ((unsigned)size <= sysctl_optmem_max &&
            atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                void *mem;
                /* First do the add, to avoid the race if kmalloc
-                * might sleep.
+                * might sleep.
                 */
                atomic_add(size, &sk->sk_omem_alloc);
                mem = kmalloc(size, priority);
@@ -843,7 +1248,7 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
                                            int noblock, int *errcode)
 {
        struct sk_buff *skb;
-       unsigned int gfp_mask;
+       gfp_t gfp_mask;
        long timeo;
        int err;
 
@@ -862,7 +1267,7 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
                        goto failure;
 
                if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
-                       skb = alloc_skb(header_len, sk->sk_allocation);
+                       skb = alloc_skb(header_len, gfp_mask);
                        if (skb) {
                                int npages;
                                int i;
@@ -921,7 +1326,7 @@ failure:
        return NULL;
 }
 
-struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 
+struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
 {
        return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
@@ -931,13 +1336,13 @@ static void __lock_sock(struct sock *sk)
 {
        DEFINE_WAIT(wait);
 
-       for(;;) {
+       for (;;) {
                prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                        TASK_UNINTERRUPTIBLE);
                spin_unlock_bh(&sk->sk_lock.slock);
                schedule();
                spin_lock_bh(&sk->sk_lock.slock);
-               if(!sock_owned_by_user(sk))
+               if (!sock_owned_by_user(sk))
                        break;
        }
        finish_wait(&sk->sk_lock.wq, &wait);
@@ -969,7 +1374,7 @@ static void __release_sock(struct sock *sk)
                } while (skb != NULL);
 
                bh_lock_sock(sk);
-       } while((skb = sk->sk_backlog.head) != NULL);
+       } while ((skb = sk->sk_backlog.head) != NULL);
 }
 
 /**
@@ -997,6 +1402,103 @@ int sk_wait_data(struct sock *sk, long *timeo)
 
 EXPORT_SYMBOL(sk_wait_data);
 
+/**
+ *     __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ *     @sk: socket
+ *     @size: memory size to allocate
+ *     @kind: allocation type
+ *
+ *     If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ *     rmem allocation. This function assumes that protocols which have
+ *     memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+       struct proto *prot = sk->sk_prot;
+       int amt = sk_mem_pages(size);
+       int allocated;
+
+       sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+       allocated = atomic_add_return(amt, prot->memory_allocated);
+
+       /* Under limit. */
+       if (allocated <= prot->sysctl_mem[0]) {
+               if (prot->memory_pressure && *prot->memory_pressure)
+                       *prot->memory_pressure = 0;
+               return 1;
+       }
+
+       /* Under pressure. */
+       if (allocated > prot->sysctl_mem[1])
+               if (prot->enter_memory_pressure)
+                       prot->enter_memory_pressure();
+
+       /* Over hard limit. */
+       if (allocated > prot->sysctl_mem[2])
+               goto suppress_allocation;
+
+       /* guarantee minimum buffer size under pressure */
+       if (kind == SK_MEM_RECV) {
+               if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
+                       return 1;
+       } else { /* SK_MEM_SEND */
+               if (sk->sk_type == SOCK_STREAM) {
+                       if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
+                               return 1;
+               } else if (atomic_read(&sk->sk_wmem_alloc) <
+                          prot->sysctl_wmem[0])
+                               return 1;
+       }
+
+       if (prot->memory_pressure) {
+               if (!*prot->memory_pressure ||
+                   prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
+                   sk_mem_pages(sk->sk_wmem_queued +
+                                atomic_read(&sk->sk_rmem_alloc) +
+                                sk->sk_forward_alloc))
+                       return 1;
+       }
+
+suppress_allocation:
+
+       if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
+               sk_stream_moderate_sndbuf(sk);
+
+               /* Fail only if socket is _under_ its sndbuf.
+                * In this case we cannot block, so that we have to fail.
+                */
+               if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+                       return 1;
+       }
+
+       /* Alas. Undo changes. */
+       sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
+       atomic_sub(amt, prot->memory_allocated);
+       return 0;
+}
+
+EXPORT_SYMBOL(__sk_mem_schedule);
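
Accounting is done in units of SK_MEM_QUANTUM (one page). A worked example, assuming PAGE_SIZE == 4096:

    /* Charging a 6000 byte allocation:
     *
     *   amt = sk_mem_pages(6000) = 2           (rounds up to whole pages)
     *   sk->sk_forward_alloc += 2 * 4096 = 8192
     *   memory_allocated     += 2
     *
     * The 2192 bytes of slack remain in sk_forward_alloc and satisfy
     * later charges; __sk_mem_reclaim() returns whole pages and keeps
     * only the sub-page remainder (sk_forward_alloc & 4095). */
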
+
+/**
+ *     __sk_mem_reclaim - reclaim memory_allocated
+ *     @sk: socket
+ */
+void __sk_mem_reclaim(struct sock *sk)
+{
+       struct proto *prot = sk->sk_prot;
+
+       atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
+                  prot->memory_allocated);
+       sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+
+       if (prot->memory_pressure && *prot->memory_pressure &&
+           (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
+               *prot->memory_pressure = 0;
+}
+
+EXPORT_SYMBOL(__sk_mem_reclaim);
+
+
 /*
  * Set of default routines for initialising struct proto_ops when
  * the protocol does not support a particular function. In certain
@@ -1009,7 +1511,7 @@ int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
        return -EOPNOTSUPP;
 }
 
-int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 
+int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                    int len, int flags)
 {
        return -EOPNOTSUPP;
@@ -1025,7 +1527,7 @@ int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
        return -EOPNOTSUPP;
 }
 
-int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 
+int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                    int *len, int peer)
 {
        return -EOPNOTSUPP;
@@ -1111,7 +1613,7 @@ static void sock_def_error_report(struct sock *sk)
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
-       sk_wake_async(sk,0,POLL_ERR); 
+       sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
        read_unlock(&sk->sk_callback_lock);
 }
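
The bare 0/1/2/3 arguments to sk_wake_async() are replaced by named constants
throughout this file; judging from the one-to-one substitutions, the definitions
added elsewhere in the series (likely include/linux/net.h) amount to:

	enum {
		SOCK_WAKE_IO,		/* was 0 */
		SOCK_WAKE_WAITD,	/* was 1 */
		SOCK_WAKE_SPACE,	/* was 2 */
		SOCK_WAKE_URG,		/* was 3 */
	};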
 
@@ -1120,7 +1622,7 @@ static void sock_def_readable(struct sock *sk, int len)
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                wake_up_interruptible(sk->sk_sleep);
-       sk_wake_async(sk,1,POLL_IN);
+       sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
        read_unlock(&sk->sk_callback_lock);
 }
 
@@ -1131,13 +1633,13 @@ static void sock_def_write_space(struct sock *sk)
        /* Do not wake up a writer until he can make "significant"
         * progress.  --DaveM
         */
-       if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+       if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                        wake_up_interruptible(sk->sk_sleep);
 
                /* Should agree with poll, otherwise some programs break */
                if (sock_writeable(sk))
-                       sk_wake_async(sk, 2, POLL_OUT);
+                       sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
 
        read_unlock(&sk->sk_callback_lock);
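
The "should agree with poll" remark refers to sock_writeable(); for reference,
that helper in include/net/sock.h is essentially the mirror of the half-sndbuf
test above:

	static inline int sock_writeable(const struct sock *sk)
	{
		return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1);
	}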
@@ -1145,15 +1647,14 @@ static void sock_def_write_space(struct sock *sk)
 
 static void sock_def_destruct(struct sock *sk)
 {
-       if (sk->sk_protinfo)
-               kfree(sk->sk_protinfo);
+       kfree(sk->sk_protinfo);
 }
 
 void sk_send_sigurg(struct sock *sk)
 {
        if (sk->sk_socket && sk->sk_socket->file)
                if (send_sigurg(&sk->sk_socket->file->f_owner))
-                       sk_wake_async(sk, 3, POLL_PRI);
+                       sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
 }
 
 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
@@ -1178,11 +1679,14 @@ void sock_init_data(struct socket *sock, struct sock *sk)
        skb_queue_head_init(&sk->sk_receive_queue);
        skb_queue_head_init(&sk->sk_write_queue);
        skb_queue_head_init(&sk->sk_error_queue);
+#ifdef CONFIG_NET_DMA
+       skb_queue_head_init(&sk->sk_async_wait_queue);
+#endif
 
        sk->sk_send_head        =       NULL;
 
        init_timer(&sk->sk_timer);
-       
+
        sk->sk_allocation       =       GFP_KERNEL;
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
@@ -1191,8 +1695,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
        sock_set_flag(sk, SOCK_ZAPPED);
 
-       if(sock)
-       {
+       if (sock) {
                sk->sk_type     =       sock->type;
                sk->sk_sleep    =       &sock->wait;
                sock->sk        =       sk;
@@ -1201,6 +1704,9 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 
        rwlock_init(&sk->sk_dst_lock);
        rwlock_init(&sk->sk_callback_lock);
+       lockdep_set_class_and_name(&sk->sk_callback_lock,
+                       af_callback_keys + sk->sk_family,
+                       af_family_clock_key_strings[sk->sk_family]);
 
        sk->sk_state_change     =       sock_def_wakeup;
        sk->sk_data_ready       =       sock_def_readable;
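
The lockdep annotation above relies on a third key/string pair sitting next to
the sk_lock/slock tables at the top of the file; presumably something like:

	static struct lock_class_key af_callback_keys[AF_MAX];
	static const char *af_family_clock_key_strings[AF_MAX+1] = {
		"clock-AF_UNSPEC", "clock-AF_UNIX", "clock-AF_INET",
		/* ... one "clock-*" entry per family, mirroring the
		 * sk_lock/slock string tables ... */
		"clock-AF_RXRPC", "clock-AF_MAX"
	};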
@@ -1219,57 +1725,85 @@ void sock_init_data(struct socket *sock, struct sock *sk)
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
 
-       sk->sk_stamp.tv_sec     = -1L;
-       sk->sk_stamp.tv_usec    = -1L;
+       sk->sk_stamp = ktime_set(-1L, -1L);
 
        atomic_set(&sk->sk_refcnt, 1);
+       atomic_set(&sk->sk_drops, 0);
 }
 
-void fastcall lock_sock(struct sock *sk)
+void lock_sock_nested(struct sock *sk, int subclass)
 {
        might_sleep();
-       spin_lock_bh(&(sk->sk_lock.slock));
-       if (sk->sk_lock.owner)
+       spin_lock_bh(&sk->sk_lock.slock);
+       if (sk->sk_lock.owned)
                __lock_sock(sk);
-       sk->sk_lock.owner = (void *)1;
-       spin_unlock_bh(&(sk->sk_lock.slock));
+       sk->sk_lock.owned = 1;
+       spin_unlock(&sk->sk_lock.slock);
+       /*
+        * The sk_lock has mutex_lock() semantics here:
+        */
+       mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+       local_bh_enable();
 }
 
-EXPORT_SYMBOL(lock_sock);
+EXPORT_SYMBOL(lock_sock_nested);
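
lock_sock() itself survives for existing callers; the header presumably turns
it into a thin wrapper over the new entry point:

	static inline void lock_sock(struct sock *sk)
	{
		lock_sock_nested(sk, 0);
	}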
 
-void fastcall release_sock(struct sock *sk)
+void release_sock(struct sock *sk)
 {
-       spin_lock_bh(&(sk->sk_lock.slock));
+       /*
+        * The sk_lock has mutex_unlock() semantics:
+        */
+       mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+
+       spin_lock_bh(&sk->sk_lock.slock);
        if (sk->sk_backlog.tail)
                __release_sock(sk);
-       sk->sk_lock.owner = NULL;
-        if (waitqueue_active(&(sk->sk_lock.wq)))
-               wake_up(&(sk->sk_lock.wq));
-       spin_unlock_bh(&(sk->sk_lock.slock));
+       sk->sk_lock.owned = 0;
+       if (waitqueue_active(&sk->sk_lock.wq))
+               wake_up(&sk->sk_lock.wq);
+       spin_unlock_bh(&sk->sk_lock.slock);
 }
 EXPORT_SYMBOL(release_sock);
 
 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
-{ 
+{
+       struct timeval tv;
        if (!sock_flag(sk, SOCK_TIMESTAMP))
                sock_enable_timestamp(sk);
-       if (sk->sk_stamp.tv_sec == -1) 
+       tv = ktime_to_timeval(sk->sk_stamp);
+       if (tv.tv_sec == -1)
                return -ENOENT;
-       if (sk->sk_stamp.tv_sec == 0)
-               do_gettimeofday(&sk->sk_stamp);
-       return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
-               -EFAULT : 0; 
-} 
+       if (tv.tv_sec == 0) {
+               sk->sk_stamp = ktime_get_real();
+               tv = ktime_to_timeval(sk->sk_stamp);
+       }
+       return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
+}
 EXPORT_SYMBOL(sock_get_timestamp);
 
+int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
+{
+       struct timespec ts;
+       if (!sock_flag(sk, SOCK_TIMESTAMP))
+               sock_enable_timestamp(sk);
+       ts = ktime_to_timespec(sk->sk_stamp);
+       if (ts.tv_sec == -1)
+               return -ENOENT;
+       if (ts.tv_sec == 0) {
+               sk->sk_stamp = ktime_get_real();
+               ts = ktime_to_timespec(sk->sk_stamp);
+       }
+       return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL(sock_get_timestampns);
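
Both getters back the SIOCGSTAMP/SIOCGSTAMPNS ioctls; a protocol's ioctl handler
dispatches to them along these lines (my_ioctl is a hypothetical name, modelled
on the inet handler):

	static int my_ioctl(struct socket *sock, unsigned int cmd,
			    unsigned long arg)
	{
		struct sock *sk = sock->sk;

		switch (cmd) {
		case SIOCGSTAMP:
			return sock_get_timestamp(sk,
					(struct timeval __user *)arg);
		case SIOCGSTAMPNS:
			return sock_get_timestampns(sk,
					(struct timespec __user *)arg);
		default:
			return -ENOIOCTLCMD;
		}
	}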
+
 void sock_enable_timestamp(struct sock *sk)
-{      
-       if (!sock_flag(sk, SOCK_TIMESTAMP)) { 
+{
+       if (!sock_flag(sk, SOCK_TIMESTAMP)) {
                sock_set_flag(sk, SOCK_TIMESTAMP);
                net_enable_timestamp();
        }
 }
-EXPORT_SYMBOL(sock_enable_timestamp); 
+EXPORT_SYMBOL(sock_enable_timestamp);
 
 /*
 *     Get a socket option on a socket.
@@ -1288,6 +1822,20 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname,
 
 EXPORT_SYMBOL(sock_common_getsockopt);
 
+#ifdef CONFIG_COMPAT
+int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
+                                 char __user *optval, int __user *optlen)
+{
+       struct sock *sk = sock->sk;
+
+       if (sk->sk_prot->compat_getsockopt != NULL)
+               return sk->sk_prot->compat_getsockopt(sk, level, optname,
+                                                     optval, optlen);
+       return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_getsockopt);
+#endif
+
 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
                        struct msghdr *msg, size_t size, int flags)
 {
@@ -1317,6 +1865,20 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname,
 
 EXPORT_SYMBOL(sock_common_setsockopt);
 
+#ifdef CONFIG_COMPAT
+int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
+                                 char __user *optval, int optlen)
+{
+       struct sock *sk = sock->sk;
+
+       if (sk->sk_prot->compat_setsockopt != NULL)
+               return sk->sk_prot->compat_setsockopt(sk, level, optname,
+                                                     optval, optlen);
+       return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
+}
+EXPORT_SYMBOL(compat_sock_common_setsockopt);
+#endif
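
Protocol families that are happy with the generic behaviour simply point their
ops tables at these helpers; a proto_ops initializer would gain entries like
(illustrative):

	#ifdef CONFIG_COMPAT
		.compat_setsockopt = compat_sock_common_setsockopt,
		.compat_getsockopt = compat_sock_common_getsockopt,
	#endif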
+
 void sk_common_release(struct sock *sk)
 {
        if (sk->sk_prot->destroy)
@@ -1348,11 +1910,7 @@ void sk_common_release(struct sock *sk)
 
        xfrm_sk_free_policy(sk);
 
-#ifdef INET_REFCNT_DEBUG
-       if (atomic_read(&sk->sk_refcnt) != 1)
-               printk(KERN_DEBUG "Destruction of the socket %p delayed, c=%d\n",
-                      sk, atomic_read(&sk->sk_refcnt));
-#endif
+       sk_refcnt_debug_release(sk);
        sock_put(sk);
 }
 
@@ -1363,25 +1921,83 @@ static LIST_HEAD(proto_list);
 
 int proto_register(struct proto *prot, int alloc_slab)
 {
-       int rc = -ENOBUFS;
+       char *request_sock_slab_name = NULL;
+       char *timewait_sock_slab_name;
+
+       if (sock_prot_inuse_init(prot) != 0) {
+               printk(KERN_CRIT "%s: Can't alloc inuse counters!\n", prot->name);
+               goto out;
+       }
 
        if (alloc_slab) {
                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
-                                              SLAB_HWCACHE_ALIGN, NULL, NULL);
+                                              SLAB_HWCACHE_ALIGN, NULL);
 
                if (prot->slab == NULL) {
                        printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
                               prot->name);
-                       goto out;
+                       goto out_free_inuse;
+               }
+
+               if (prot->rsk_prot != NULL) {
+                       static const char mask[] = "request_sock_%s";
+
+                       request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+                       if (request_sock_slab_name == NULL)
+                               goto out_free_sock_slab;
+
+                       sprintf(request_sock_slab_name, mask, prot->name);
+                       prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
+                                                                prot->rsk_prot->obj_size, 0,
+                                                                SLAB_HWCACHE_ALIGN, NULL);
+
+                       if (prot->rsk_prot->slab == NULL) {
+                               printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
+                                      prot->name);
+                               goto out_free_request_sock_slab_name;
+                       }
+               }
+
+               if (prot->twsk_prot != NULL) {
+                       static const char mask[] = "tw_sock_%s";
+
+                       timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+
+                       if (timewait_sock_slab_name == NULL)
+                               goto out_free_request_sock_slab;
+
+                       sprintf(timewait_sock_slab_name, mask, prot->name);
+                       prot->twsk_prot->twsk_slab =
+                               kmem_cache_create(timewait_sock_slab_name,
+                                                 prot->twsk_prot->twsk_obj_size,
+                                                 0, SLAB_HWCACHE_ALIGN,
+                                                 NULL);
+                       if (prot->twsk_prot->twsk_slab == NULL)
+                               goto out_free_timewait_sock_slab_name;
                }
        }
 
        write_lock(&proto_list_lock);
        list_add(&prot->node, &proto_list);
        write_unlock(&proto_list_lock);
-       rc = 0;
+       return 0;
+
+out_free_timewait_sock_slab_name:
+       kfree(timewait_sock_slab_name);
+out_free_request_sock_slab:
+       if (prot->rsk_prot && prot->rsk_prot->slab) {
+               kmem_cache_destroy(prot->rsk_prot->slab);
+               prot->rsk_prot->slab = NULL;
+       }
+out_free_request_sock_slab_name:
+       kfree(request_sock_slab_name);
+out_free_sock_slab:
+       kmem_cache_destroy(prot->slab);
+       prot->slab = NULL;
+out_free_inuse:
+       sock_prot_inuse_free(prot);
 out:
-       return rc;
+       return -ENOBUFS;
 }
 
 EXPORT_SYMBOL(proto_register);
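
A typical caller checks the result at init time and propagates the error;
roughly (my_prot is hypothetical):

	static int __init my_proto_init(void)
	{
		int rc = proto_register(&my_prot, 1 /* alloc_slab */);

		if (rc != 0)	/* -ENOBUFS: a cache or counter failed */
			return rc;
		/* ... continue with family/ops registration ... */
		return 0;
	}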
@@ -1389,62 +2005,50 @@ EXPORT_SYMBOL(proto_register);
 void proto_unregister(struct proto *prot)
 {
        write_lock(&proto_list_lock);
+       list_del(&prot->node);
+       write_unlock(&proto_list_lock);
+
+       sock_prot_inuse_free(prot);
 
        if (prot->slab != NULL) {
                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }
 
-       list_del(&prot->node);
-       write_unlock(&proto_list_lock);
-}
-
-EXPORT_SYMBOL(proto_unregister);
+       if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
+               const char *name = kmem_cache_name(prot->rsk_prot->slab);
 
-#ifdef CONFIG_PROC_FS
-static inline struct proto *__proto_head(void)
-{
-       return list_entry(proto_list.next, struct proto, node);
-}
+               kmem_cache_destroy(prot->rsk_prot->slab);
+               kfree(name);
+               prot->rsk_prot->slab = NULL;
+       }
 
-static inline struct proto *proto_head(void)
-{
-       return list_empty(&proto_list) ? NULL : __proto_head();
-}
+       if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
+               const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
 
-static inline struct proto *proto_next(struct proto *proto)
-{
-       return proto->node.next == &proto_list ? NULL :
-               list_entry(proto->node.next, struct proto, node);
+               kmem_cache_destroy(prot->twsk_prot->twsk_slab);
+               kfree(name);
+               prot->twsk_prot->twsk_slab = NULL;
+       }
 }
 
-static inline struct proto *proto_get_idx(loff_t pos)
-{
-       struct proto *proto;
-       loff_t i = 0;
-
-       list_for_each_entry(proto, &proto_list, node)
-               if (i++ == pos)
-                       goto out;
-
-       proto = NULL;
-out:
-       return proto;
-}
+EXPORT_SYMBOL(proto_unregister);
 
+#ifdef CONFIG_PROC_FS
 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
+       __acquires(proto_list_lock)
 {
        read_lock(&proto_list_lock);
-       return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
+       return seq_list_start_head(&proto_list, *pos);
 }
 
 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-       ++*pos;
-       return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
+       return seq_list_next(v, &proto_list, pos);
 }
 
 static void proto_seq_stop(struct seq_file *seq, void *v)
+       __releases(proto_list_lock)
 {
        read_unlock(&proto_list_lock);
 }
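
The seq_list_*() helpers from fs/seq_file.c replace the hand-rolled iterator
deleted above. seq_list_start_head() hands back the list head itself at pos 0
as the header-row token, which is why proto_seq_show() below now tests
v == &proto_list instead of SEQ_START_TOKEN; its core is roughly:

	struct list_head *seq_list_start_head(struct list_head *head,
					      loff_t pos)
	{
		if (!pos)
			return head;	/* header-row token */
		return seq_list_start(head, pos - 1);
	}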
@@ -1489,7 +2093,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
 
 static int proto_seq_show(struct seq_file *seq, void *v)
 {
-       if (v == SEQ_START_TOKEN)
+       if (v == &proto_list)
                seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                           "protocol",
                           "size",
@@ -1501,11 +2105,11 @@ static int proto_seq_show(struct seq_file *seq, void *v)
                           "module",
                           "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
        else
-               proto_seq_printf(seq, v);
+               proto_seq_printf(seq, list_entry(v, struct proto, node));
        return 0;
 }
 
-static struct seq_operations proto_seq_ops = {
+static const struct seq_operations proto_seq_ops = {
        .start  = proto_seq_start,
        .next   = proto_seq_next,
        .stop   = proto_seq_stop,
@@ -1517,7 +2121,7 @@ static int proto_seq_open(struct inode *inode, struct file *file)
        return seq_open(file, &proto_seq_ops);
 }
 
-static struct file_operations proto_seq_fops = {
+static const struct file_operations proto_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = proto_seq_open,
        .read           = seq_read,
@@ -1528,7 +2132,7 @@ static struct file_operations proto_seq_fops = {
 static int __init proto_init(void)
 {
        /* register /proc/net/protocols */
-       return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
+       return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
 }
 
 subsys_initcall(proto_init);
@@ -1563,8 +2167,4 @@ EXPORT_SYMBOL(sock_wfree);
 EXPORT_SYMBOL(sock_wmalloc);
 EXPORT_SYMBOL(sock_i_uid);
 EXPORT_SYMBOL(sock_i_ino);
-#ifdef CONFIG_SYSCTL
 EXPORT_SYMBOL(sysctl_optmem_max);
-EXPORT_SYMBOL(sysctl_rmem_max);
-EXPORT_SYMBOL(sysctl_wmem_max);
-#endif