net/sunqe.c section fix

[safe/jmp/linux-2.6] / net / core / sock.c
diff --git a/net/core/sock.c b/net/core/sock.c

index 5d820c3..433715f 100644 (file)
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -34,7 +34,7 @@
   *             Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
   *                                     was buggy. Put a remove_sock() in the handler
   *                                     for memory when we hit 0. Also altered the timer
- *                                     code. The ACK stuff can wait and needs major 
+ *                                     code. The ACK stuff can wait and needs major
   *                                     TCP layer surgery.
   *             Alan Cox        :       Fixed TCP ack bug, removed remove sock
   *                                     and fixed timer/inet_bh race.
@@ -92,7 +92,6 @@
   */
  
  #include <linux/capability.h>
-#include <linux/config.h>
  #include <linux/errno.h>
  #include <linux/types.h>
  #include <linux/socket.h>
@@ -112,6 +111,7 @@
  #include <linux/poll.h>
  #include <linux/tcp.h>
  #include <linux/init.h>
+#include <linux/highmem.h>
  
  #include <asm/uaccess.h>
  #include <asm/system.h>
@@ -119,6 +119,7 @@
  #include <linux/netdevice.h>
  #include <net/protocol.h>
  #include <linux/skbuff.h>
+#include <net/net_namespace.h>
  #include <net/request_sock.h>
  #include <net/sock.h>
  #include <net/xfrm.h>
@@ -130,6 +131,69 @@
  #include <net/tcp.h>
  #endif
  
+/*
+ * Each address family might have different locking rules, so we have
+ * one slock key per address family:
+ */
+static struct lock_class_key af_family_keys[AF_MAX];
+static struct lock_class_key af_family_slock_keys[AF_MAX];
+
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+/*
+ * Make lock validator output more readable. (we pre-construct these
+ * strings build-time, so that runtime initialization of socket
+ * locks is fast):
+ */
+/*
+ * Readable "sk_lock-<family>" names, one per address-family slot
+ * (AF_MAX+1 entries, listed in ascending slot order).  Slots written as a
+ * bare number (21, 27, 28) have no AF_* name in this table.
+ * NOTE(review): presumably indexed by sk->sk_family when the socket lock
+ * class is set up -- the lookup is outside this hunk, confirm there.
+ */
+static const char *af_family_key_strings[AF_MAX+1] = {
+  "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
+  "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
+  "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
+  "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
+  "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
+  "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
+  "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
+  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
+  "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
+  "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
+  "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
+  "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
+};
+/*
+ * Readable "slock-<family>" names, one per address-family slot
+ * (AF_MAX+1 entries, listed in ascending slot order).  Slots written as a
+ * bare number (21, 27, 28) have no AF_* name in this table.
+ * NOTE(review): presumably indexed by sk->sk_family when the slock class
+ * is set up -- the lookup is outside this hunk, confirm there.
+ */
+static const char *af_family_slock_key_strings[AF_MAX+1] = {
+  "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
+  "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
+  "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
+  "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
+  "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
+  "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
+  "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
+  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
+  "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
+  "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
+  "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
+  "slock-AF_RXRPC" , "slock-AF_MAX"
+};
+/*
+ * Readable "clock-<family>" names, one per address-family slot
+ * (AF_MAX+1 entries, listed in ascending slot order).  Slots written as a
+ * bare number (21, 27, 28) have no AF_* name in this table.
+ * NOTE(review): presumably paired with af_callback_keys[] for
+ * sk_callback_lock -- the lookup is outside this hunk, confirm there.
+ */
+static const char *af_family_clock_key_strings[AF_MAX+1] = {
+  "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
+  "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
+  "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
+  "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
+  "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
+  "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
+  "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
+  "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
+  "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
+  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
+  "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
+  "clock-AF_RXRPC" , "clock-AF_MAX"
+};
+#endif
+
+/*
+ * sk_callback_lock locking rules are per-address-family,
+ * so split the lock classes by using a per-AF key:
+ */
+static struct lock_class_key af_callback_keys[AF_MAX];
+
  /* Take into consideration the size of the struct sk_buff overhead in the
   * determination of these values, since that is non-constant across
   * platforms.  This makes socket queueing behavior and performance
@@ -141,13 +205,13 @@
  #define SK_RMEM_MAX            (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
  
  /* Run time adjustable parameters. */
-__u32 sysctl_wmem_max = SK_WMEM_MAX;
-__u32 sysctl_rmem_max = SK_RMEM_MAX;
-__u32 sysctl_wmem_default = SK_WMEM_MAX;
-__u32 sysctl_rmem_default = SK_RMEM_MAX;
+__u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
+__u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
+__u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
  
  /* Maximal space eaten by iovec or ancilliary data plus some space */
-int sysctl_optmem_max = sizeof(unsigned long)*(2*UIO_MAXIOV + 512);
+int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
  
  static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
  {
@@ -157,7 +221,20 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
                 return -EINVAL;
         if (copy_from_user(&tv, optval, sizeof(tv)))
                 return -EFAULT;
-
+       if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
+               return -EDOM;
+
+       if (tv.tv_sec < 0) {
+               static int warned __read_mostly;
+
+               /* A negative tv_sec stores a zero timeout and reports success. */
+               *timeo_p = 0;
+               /*
+                * Complain at most ten times, and only when net_ratelimit()
+                * permits.  The braces are required: without them only the
+                * counter increment is conditional and the printk() runs on
+                * every call with a negative timeout.
+                */
+               if (warned < 10 && net_ratelimit()) {
+                       warned++;
+                       printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
+                              "tries to set negative timeout\n",
+                               current->comm, task_pid_nr(current));
+               }
+               return 0;
+       }
         *timeo_p = MAX_SCHEDULE_TIMEOUT;
         if (tv.tv_sec == 0 && tv.tv_usec == 0)
                 return 0;
@@ -170,8 +247,8 @@ static void sock_warn_obsolete_bsdism(const char *name)
  {
         static int warned;
         static char warncomm[TASK_COMM_LEN];
-       if (strcmp(warncomm, current->comm) && warned < 5) { 
-               strcpy(warncomm,  current->comm); 
+       if (strcmp(warncomm, current->comm) && warned < 5) {
+               strcpy(warncomm,  current->comm);
                 printk(KERN_WARNING "process `%s' is using obsolete "
                        "%s SO_BSDCOMPAT\n", warncomm, name);
                 warned++;
@@ -179,8 +256,8 @@ static void sock_warn_obsolete_bsdism(const char *name)
  }
  
  static void sock_disable_timestamp(struct sock *sk)
-{      
-       if (sock_flag(sk, SOCK_TIMESTAMP)) { 
+{
+       if (sock_flag(sk, SOCK_TIMESTAMP)) {
                 sock_reset_flag(sk, SOCK_TIMESTAMP);
                 net_disable_timestamp();
         }
@@ -201,14 +278,15 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
                 goto out;
         }
  
-       /* It would be deadlock, if sock_queue_rcv_skb is used
-          with socket lock! We assume that users of this
-          function are lock free.
-       */
-       err = sk_filter(sk, skb, 1);
+       err = sk_filter(sk, skb);
         if (err)
                 goto out;
  
+       if (!sk_rmem_schedule(sk, skb->truesize)) {
+               err = -ENOBUFS;
+               goto out;
+       }
+
         skb->dev = NULL;
         skb_set_owner_r(skb, sk);
  
@@ -228,19 +306,29 @@ out:
  }
  EXPORT_SYMBOL(sock_queue_rcv_skb);
  
-int sk_receive_skb(struct sock *sk, struct sk_buff *skb)
+int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
  {
         int rc = NET_RX_SUCCESS;
  
-       if (sk_filter(sk, skb, 0))
+       if (sk_filter(sk, skb))
                 goto discard_and_relse;
  
         skb->dev = NULL;
  
-       bh_lock_sock(sk);
-       if (!sock_owned_by_user(sk))
-               rc = sk->sk_backlog_rcv(sk, skb);
+       if (nested)
+               bh_lock_sock_nested(sk);
         else
+               bh_lock_sock(sk);
+       if (!sock_owned_by_user(sk)) {
+               /*
+                * trylock + unlock semantics:
+                */
+               mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
+
+               rc = sk->sk_backlog_rcv(sk, skb);
+
+               mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+       } else
                 sk_add_backlog(sk, skb);
         bh_unlock_sock(sk);
  out:
@@ -280,6 +368,70 @@ struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
  }
  EXPORT_SYMBOL(sk_dst_check);
  
+/*
+ * Handle SO_BINDTODEVICE: read an interface name of at most IFNAMSIZ - 1
+ * bytes from the user buffer @optval and record that device's ifindex in
+ * sk->sk_bound_dev_if; an empty name stores 0 instead.
+ *
+ * Returns 0 on success, -EPERM without CAP_NET_RAW, -EINVAL for a negative
+ * @optlen, -EFAULT when the name cannot be copied from user space, -ENODEV
+ * when dev_get_by_name() finds no device of that name, and -ENOPROTOOPT
+ * when built without CONFIG_NETDEVICES.
+ */
+static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
+{
+       int ret = -ENOPROTOOPT;
+#ifdef CONFIG_NETDEVICES
+       struct net *net = sk->sk_net;
+       char ifname[IFNAMSIZ];
+       int ifindex = 0;
+
+       /* Sorry... */
+       if (!capable(CAP_NET_RAW)) {
+               ret = -EPERM;
+               goto out;
+       }
+
+       if (optlen < 0) {
+               ret = -EINVAL;
+               goto out;
+       }
+
+       /* Bind this socket to a particular device like "eth0",
+        * as specified in the passed interface name. If the
+        * name is "" or the option length is zero the socket
+        * is not bound.
+        */
+       memset(ifname, 0, sizeof(ifname));
+       if (optlen >= IFNAMSIZ)
+               optlen = IFNAMSIZ - 1;
+
+       if (copy_from_user(ifname, optval, optlen)) {
+               ret = -EFAULT;
+               goto out;
+       }
+
+       /* An empty name leaves ifindex at 0; otherwise look the device up. */
+       if (ifname[0] != '\0') {
+               struct net_device *dev = dev_get_by_name(net, ifname);
+
+               if (!dev) {
+                       ret = -ENODEV;
+                       goto out;
+               }
+
+               ifindex = dev->ifindex;
+               dev_put(dev);
+       }
+
+       /* Update the binding and reset the socket's dst under the socket lock. */
+       lock_sock(sk);
+       sk->sk_bound_dev_if = ifindex;
+       sk_dst_reset(sk);
+       release_sock(sk);
+
+       ret = 0;
+
+out:
+#endif
+
+       return ret;
+}
+
+/* Set flag @bit on @sk when @valbool is non-zero, clear it otherwise. */
+static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+{
+       if (!valbool) {
+               sock_reset_flag(sk, bit);
+               return;
+       }
+
+       sock_set_flag(sk, bit);
+}
+
  /*
   *     This is meant for all protocols to use and covers goings on
   *     at the socket level. Everything here is generic.
@@ -289,12 +441,11 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
                     char __user *optval, int optlen)
  {
         struct sock *sk=sock->sk;
-       struct sk_filter *filter;
         int val;
         int valbool;
         struct linger ling;
         int ret = 0;
-       
+
         /*
          *      Options without arguments
          */
@@ -307,270 +458,229 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
                 return 0;
         }
  #endif
-       
-       if(optlen<sizeof(int))
-               return(-EINVAL);
-       
+
+       if (optname == SO_BINDTODEVICE)
+               return sock_bindtodevice(sk, optval, optlen);
+
+       if (optlen < sizeof(int))
+               return -EINVAL;
+
         if (get_user(val, (int __user *)optval))
                 return -EFAULT;
-       
-       valbool = val?1:0;
+
+       valbool = val?1:0;
  
         lock_sock(sk);
  
-       switch(optname) 
-       {
-               case SO_DEBUG:  
-                       if(val && !capable(CAP_NET_ADMIN))
-                       {
-                               ret = -EACCES;
-                       }
-                       else if (valbool)
-                               sock_set_flag(sk, SOCK_DBG);
-                       else
-                               sock_reset_flag(sk, SOCK_DBG);
-                       break;
-               case SO_REUSEADDR:
-                       sk->sk_reuse = valbool;
-                       break;
-               case SO_TYPE:
-               case SO_ERROR:
-                       ret = -ENOPROTOOPT;
-                       break;
-               case SO_DONTROUTE:
-                       if (valbool)
-                               sock_set_flag(sk, SOCK_LOCALROUTE);
-                       else
-                               sock_reset_flag(sk, SOCK_LOCALROUTE);
-                       break;
-               case SO_BROADCAST:
-                       sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
-                       break;
-               case SO_SNDBUF:
-                       /* Don't error on this BSD doesn't and if you think
-                          about it this is right. Otherwise apps have to
-                          play 'guess the biggest size' games. RCVBUF/SNDBUF
-                          are treated in BSD as hints */
-                          
-                       if (val > sysctl_wmem_max)
-                               val = sysctl_wmem_max;
+       switch(optname) {
+       case SO_DEBUG:
+               if (val && !capable(CAP_NET_ADMIN)) {
+                       ret = -EACCES;
+               } else
+                       sock_valbool_flag(sk, SOCK_DBG, valbool);
+               break;
+       case SO_REUSEADDR:
+               sk->sk_reuse = valbool;
+               break;
+       case SO_TYPE:
+       case SO_ERROR:
+               ret = -ENOPROTOOPT;
+               break;
+       case SO_DONTROUTE:
+               sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
+               break;
+       case SO_BROADCAST:
+               sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
+               break;
+       case SO_SNDBUF:
+               /* Don't error on this BSD doesn't and if you think
+                  about it this is right. Otherwise apps have to
+                  play 'guess the biggest size' games. RCVBUF/SNDBUF
+                  are treated in BSD as hints */
+
+               if (val > sysctl_wmem_max)
+                       val = sysctl_wmem_max;
  set_sndbuf:
-                       sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
-                       if ((val * 2) < SOCK_MIN_SNDBUF)
-                               sk->sk_sndbuf = SOCK_MIN_SNDBUF;
-                       else
-                               sk->sk_sndbuf = val * 2;
+               sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+               if ((val * 2) < SOCK_MIN_SNDBUF)
+                       sk->sk_sndbuf = SOCK_MIN_SNDBUF;
+               else
+                       sk->sk_sndbuf = val * 2;
  
-                       /*
-                        *      Wake up sending tasks if we
-                        *      upped the value.
-                        */
-                       sk->sk_write_space(sk);
+               /*
+                *      Wake up sending tasks if we
+                *      upped the value.
+                */
+               sk->sk_write_space(sk);
+               break;
+
+       case SO_SNDBUFFORCE:
+               if (!capable(CAP_NET_ADMIN)) {
+                       ret = -EPERM;
                         break;
+               }
+               goto set_sndbuf;
  
-               case SO_SNDBUFFORCE:
-                       if (!capable(CAP_NET_ADMIN)) {
-                               ret = -EPERM;
-                               break;
-                       }
-                       goto set_sndbuf;
-
-               case SO_RCVBUF:
-                       /* Don't error on this BSD doesn't and if you think
-                          about it this is right. Otherwise apps have to
-                          play 'guess the biggest size' games. RCVBUF/SNDBUF
-                          are treated in BSD as hints */
-                         
-                       if (val > sysctl_rmem_max)
-                               val = sysctl_rmem_max;
+       case SO_RCVBUF:
+               /* Don't error on this BSD doesn't and if you think
+                  about it this is right. Otherwise apps have to
+                  play 'guess the biggest size' games. RCVBUF/SNDBUF
+                  are treated in BSD as hints */
+
+               if (val > sysctl_rmem_max)
+                       val = sysctl_rmem_max;
  set_rcvbuf:
-                       sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
-                       /*
-                        * We double it on the way in to account for
-                        * "struct sk_buff" etc. overhead.   Applications
-                        * assume that the SO_RCVBUF setting they make will
-                        * allow that much actual data to be received on that
-                        * socket.
-                        *
-                        * Applications are unaware that "struct sk_buff" and
-                        * other overheads allocate from the receive buffer
-                        * during socket buffer allocation.
-                        *
-                        * And after considering the possible alternatives,
-                        * returning the value we actually used in getsockopt
-                        * is the most desirable behavior.
-                        */
-                       if ((val * 2) < SOCK_MIN_RCVBUF)
-                               sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
-                       else
-                               sk->sk_rcvbuf = val * 2;
+               sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+               /*
+                * We double it on the way in to account for
+                * "struct sk_buff" etc. overhead.   Applications
+                * assume that the SO_RCVBUF setting they make will
+                * allow that much actual data to be received on that
+                * socket.
+                *
+                * Applications are unaware that "struct sk_buff" and
+                * other overheads allocate from the receive buffer
+                * during socket buffer allocation.
+                *
+                * And after considering the possible alternatives,
+                * returning the value we actually used in getsockopt
+                * is the most desirable behavior.
+                */
+               if ((val * 2) < SOCK_MIN_RCVBUF)
+                       sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
+               else
+                       sk->sk_rcvbuf = val * 2;
+               break;
+
+       case SO_RCVBUFFORCE:
+               if (!capable(CAP_NET_ADMIN)) {
+                       ret = -EPERM;
                         break;
+               }
+               goto set_rcvbuf;
  
-               case SO_RCVBUFFORCE:
-                       if (!capable(CAP_NET_ADMIN)) {
-                               ret = -EPERM;
-                               break;
-                       }
-                       goto set_rcvbuf;
-
-               case SO_KEEPALIVE:
+       case SO_KEEPALIVE:
  #ifdef CONFIG_INET
-                       if (sk->sk_protocol == IPPROTO_TCP)
-                               tcp_set_keepalive(sk, valbool);
+               if (sk->sk_protocol == IPPROTO_TCP)
+                       tcp_set_keepalive(sk, valbool);
  #endif
-                       sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
+               sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
+               break;
+
+       case SO_OOBINLINE:
+               sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
+               break;
+
+       case SO_NO_CHECK:
+               sk->sk_no_check = valbool;
+               break;
+
+       case SO_PRIORITY:
+               if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
+                       sk->sk_priority = val;
+               else
+                       ret = -EPERM;
+               break;
+
+       case SO_LINGER:
+               if (optlen < sizeof(ling)) {
+                       ret = -EINVAL;  /* 1003.1g */
                         break;
-
-               case SO_OOBINLINE:
-                       sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
-                       break;
-
-               case SO_NO_CHECK:
-                       sk->sk_no_check = valbool;
-                       break;
-
-               case SO_PRIORITY:
-                       if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN)) 
-                               sk->sk_priority = val;
-                       else
-                               ret = -EPERM;
+               }
+               if (copy_from_user(&ling,optval,sizeof(ling))) {
+                       ret = -EFAULT;
                         break;
-
-               case SO_LINGER:
-                       if(optlen<sizeof(ling)) {
-                               ret = -EINVAL;  /* 1003.1g */
-                               break;
-                       }
-                       if (copy_from_user(&ling,optval,sizeof(ling))) {
-                               ret = -EFAULT;
-                               break;
-                       }
-                       if (!ling.l_onoff)
-                               sock_reset_flag(sk, SOCK_LINGER);
-                       else {
+               }
+               if (!ling.l_onoff)
+                       sock_reset_flag(sk, SOCK_LINGER);
+               else {
  #if (BITS_PER_LONG == 32)
-                               if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
-                                       sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
-                               else
+                       if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
+                               sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
+                       else
  #endif
-                                       sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
-                               sock_set_flag(sk, SOCK_LINGER);
-                       }
-                       break;
-
-               case SO_BSDCOMPAT:
-                       sock_warn_obsolete_bsdism("setsockopt");
-                       break;
-
-               case SO_PASSCRED:
-                       if (valbool)
-                               set_bit(SOCK_PASSCRED, &sock->flags);
+                               sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
+                       sock_set_flag(sk, SOCK_LINGER);
+               }
+               break;
+
+       case SO_BSDCOMPAT:
+               sock_warn_obsolete_bsdism("setsockopt");
+               break;
+
+       case SO_PASSCRED:
+               if (valbool)
+                       set_bit(SOCK_PASSCRED, &sock->flags);
+               else
+                       clear_bit(SOCK_PASSCRED, &sock->flags);
+               break;
+
+       case SO_TIMESTAMP:
+       case SO_TIMESTAMPNS:
+               if (valbool)  {
+                       if (optname == SO_TIMESTAMP)
+                               sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                         else
-                               clear_bit(SOCK_PASSCRED, &sock->flags);
-                       break;
-
-               case SO_TIMESTAMP:
-                       if (valbool)  {
-                               sock_set_flag(sk, SOCK_RCVTSTAMP);
-                               sock_enable_timestamp(sk);
-                       } else
-                               sock_reset_flag(sk, SOCK_RCVTSTAMP);
-                       break;
+                               sock_set_flag(sk, SOCK_RCVTSTAMPNS);
+                       sock_set_flag(sk, SOCK_RCVTSTAMP);
+                       sock_enable_timestamp(sk);
+               } else {
+                       sock_reset_flag(sk, SOCK_RCVTSTAMP);
+                       sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
+               }
+               break;
  
-               case SO_RCVLOWAT:
-                       if (val < 0)
-                               val = INT_MAX;
-                       sk->sk_rcvlowat = val ? : 1;
-                       break;
+       case SO_RCVLOWAT:
+               if (val < 0)
+                       val = INT_MAX;
+               sk->sk_rcvlowat = val ? : 1;
+               break;
  
-               case SO_RCVTIMEO:
-                       ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
-                       break;
+       case SO_RCVTIMEO:
+               ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
+               break;
  
-               case SO_SNDTIMEO:
-                       ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
-                       break;
+       case SO_SNDTIMEO:
+               ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
+               break;
  
-#ifdef CONFIG_NETDEVICES
-               case SO_BINDTODEVICE:
-               {
-                       char devname[IFNAMSIZ]; 
+       case SO_ATTACH_FILTER:
+               ret = -EINVAL;
+               if (optlen == sizeof(struct sock_fprog)) {
+                       struct sock_fprog fprog;
  
-                       /* Sorry... */ 
-                       if (!capable(CAP_NET_RAW)) {
-                               ret = -EPERM;
+                       ret = -EFAULT;
+                       if (copy_from_user(&fprog, optval, sizeof(fprog)))
                                 break;
-                       }
-
-                       /* Bind this socket to a particular device like "eth0",
-                        * as specified in the passed interface name. If the
-                        * name is "" or the option length is zero the socket 
-                        * is not bound. 
-                        */ 
-
-                       if (!valbool) {
-                               sk->sk_bound_dev_if = 0;
-                       } else {
-                               if (optlen > IFNAMSIZ - 1)
-                                       optlen = IFNAMSIZ - 1;
-                               memset(devname, 0, sizeof(devname));
-                               if (copy_from_user(devname, optval, optlen)) {
-                                       ret = -EFAULT;
-                                       break;
-                               }
  
-                               /* Remove any cached route for this socket. */
-                               sk_dst_reset(sk);
-
-                               if (devname[0] == '\0') {
-                                       sk->sk_bound_dev_if = 0;
-                               } else {
-                                       struct net_device *dev = dev_get_by_name(devname);
-                                       if (!dev) {
-                                               ret = -ENODEV;
-                                               break;
-                                       }
-                                       sk->sk_bound_dev_if = dev->ifindex;
-                                       dev_put(dev);
-                               }
-                       }
-                       break;
+                       ret = sk_attach_filter(&fprog, sk);
                 }
-#endif
-
-
-               case SO_ATTACH_FILTER:
-                       ret = -EINVAL;
-                       if (optlen == sizeof(struct sock_fprog)) {
-                               struct sock_fprog fprog;
-
-                               ret = -EFAULT;
-                               if (copy_from_user(&fprog, optval, sizeof(fprog)))
-                                       break;
-
-                               ret = sk_attach_filter(&fprog, sk);
-                       }
-                       break;
-
-               case SO_DETACH_FILTER:
-                       spin_lock_bh(&sk->sk_lock.slock);
-                       filter = sk->sk_filter;
-                        if (filter) {
-                               sk->sk_filter = NULL;
-                               spin_unlock_bh(&sk->sk_lock.slock);
-                               sk_filter_release(sk, filter);
-                               break;
-                       }
-                       spin_unlock_bh(&sk->sk_lock.slock);
-                       ret = -ENONET;
-                       break;
+               break;
+
+       case SO_DETACH_FILTER:
+               ret = sk_detach_filter(sk);
+               break;
+
+       case SO_PASSSEC:
+               if (valbool)
+                       set_bit(SOCK_PASSSEC, &sock->flags);
+               else
+                       clear_bit(SOCK_PASSSEC, &sock->flags);
+               break;
+       case SO_MARK:
+               if (!capable(CAP_NET_ADMIN))
+                       ret = -EPERM;
+               else {
+                       sk->sk_mark = val;
+               }
+               break;
  
                 /* We implement the SO_SNDLOWAT etc to
                    not be settable (1003.1g 5.3) */
-               default:
-                       ret = -ENOPROTOOPT;
-                       break;
-       }
+       default:
+               ret = -ENOPROTOOPT;
+               break;
+       }
         release_sock(sk);
         return ret;
  }
@@ -580,203 +690,228 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
                     char __user *optval, int __user *optlen)
  {
         struct sock *sk = sock->sk;
-       
-       union
-       {
-               int val;
-               struct linger ling;
+
+       union {
+               int val;
+               struct linger ling;
                 struct timeval tm;
         } v;
-       
+
         unsigned int lv = sizeof(int);
         int len;
-       
-       if(get_user(len,optlen))
-               return -EFAULT;
-       if(len < 0)
-               return -EINVAL;
-               
-       switch(optname) 
-       {
-               case SO_DEBUG:          
-                       v.val = sock_flag(sk, SOCK_DBG);
-                       break;
-               
-               case SO_DONTROUTE:
-                       v.val = sock_flag(sk, SOCK_LOCALROUTE);
-                       break;
-               
-               case SO_BROADCAST:
-                       v.val = !!sock_flag(sk, SOCK_BROADCAST);
-                       break;
-
-               case SO_SNDBUF:
-                       v.val = sk->sk_sndbuf;
-                       break;
-               
-               case SO_RCVBUF:
-                       v.val = sk->sk_rcvbuf;
-                       break;
-
-               case SO_REUSEADDR:
-                       v.val = sk->sk_reuse;
-                       break;
-
-               case SO_KEEPALIVE:
-                       v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
-                       break;
-
-               case SO_TYPE:
-                       v.val = sk->sk_type;                            
-                       break;
  
-               case SO_ERROR:
-                       v.val = -sock_error(sk);
-                       if(v.val==0)
-                               v.val = xchg(&sk->sk_err_soft, 0);
-                       break;
-
-               case SO_OOBINLINE:
-                       v.val = !!sock_flag(sk, SOCK_URGINLINE);
-                       break;
-       
-               case SO_NO_CHECK:
-                       v.val = sk->sk_no_check;
-                       break;
+       if (get_user(len, optlen))
+               return -EFAULT;
+       if (len < 0)
+               return -EINVAL;
  
-               case SO_PRIORITY:
-                       v.val = sk->sk_priority;
-                       break;
-               
-               case SO_LINGER: 
-                       lv              = sizeof(v.ling);
-                       v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
-                       v.ling.l_linger = sk->sk_lingertime / HZ;
-                       break;
-                                       
-               case SO_BSDCOMPAT:
-                       sock_warn_obsolete_bsdism("getsockopt");
-                       break;
+       switch(optname) {
+       case SO_DEBUG:
+               v.val = sock_flag(sk, SOCK_DBG);
+               break;
+
+       case SO_DONTROUTE:
+               v.val = sock_flag(sk, SOCK_LOCALROUTE);
+               break;
+
+       case SO_BROADCAST:
+               v.val = !!sock_flag(sk, SOCK_BROADCAST);
+               break;
+
+       case SO_SNDBUF:
+               v.val = sk->sk_sndbuf;
+               break;
+
+       case SO_RCVBUF:
+               v.val = sk->sk_rcvbuf;
+               break;
+
+       case SO_REUSEADDR:
+               v.val = sk->sk_reuse;
+               break;
+
+       case SO_KEEPALIVE:
+               v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
+               break;
+
+       case SO_TYPE:
+               v.val = sk->sk_type;
+               break;
+
+       case SO_ERROR:
+               v.val = -sock_error(sk);
+               if (v.val==0)
+                       v.val = xchg(&sk->sk_err_soft, 0);
+               break;
+
+       case SO_OOBINLINE:
+               v.val = !!sock_flag(sk, SOCK_URGINLINE);
+               break;
+
+       case SO_NO_CHECK:
+               v.val = sk->sk_no_check;
+               break;
+
+       case SO_PRIORITY:
+               v.val = sk->sk_priority;
+               break;
+
+       case SO_LINGER:
+               lv              = sizeof(v.ling);
+               v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
+               v.ling.l_linger = sk->sk_lingertime / HZ;
+               break;
+
+       case SO_BSDCOMPAT:
+               sock_warn_obsolete_bsdism("getsockopt");
+               break;
+
+       case SO_TIMESTAMP:
+               v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
+                               !sock_flag(sk, SOCK_RCVTSTAMPNS);
+               break;
+
+       case SO_TIMESTAMPNS:
+               v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
+               break;
+
+       case SO_RCVTIMEO:
+               lv=sizeof(struct timeval);
+               if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
+                       v.tm.tv_sec = 0;
+                       v.tm.tv_usec = 0;
+               } else {
+                       v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
+                       v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
+               }
+               break;
+
+       case SO_SNDTIMEO:
+               lv=sizeof(struct timeval);
+               if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
+                       v.tm.tv_sec = 0;
+                       v.tm.tv_usec = 0;
+               } else {
+                       v.tm.tv_sec = sk->sk_sndtimeo / HZ;
+                       v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
+               }
+               break;
  
-               case SO_TIMESTAMP:
-                       v.val = sock_flag(sk, SOCK_RCVTSTAMP);
-                       break;
+       case SO_RCVLOWAT:
+               v.val = sk->sk_rcvlowat;
+               break;
  
-               case SO_RCVTIMEO:
-                       lv=sizeof(struct timeval);
-                       if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
-                               v.tm.tv_sec = 0;
-                               v.tm.tv_usec = 0;
-                       } else {
-                               v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
-                               v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
-                       }
-                       break;
+       case SO_SNDLOWAT:
+               v.val=1;
+               break;
  
-               case SO_SNDTIMEO:
-                       lv=sizeof(struct timeval);
-                       if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
-                               v.tm.tv_sec = 0;
-                               v.tm.tv_usec = 0;
-                       } else {
-                               v.tm.tv_sec = sk->sk_sndtimeo / HZ;
-                               v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
-                       }
-                       break;
+       case SO_PASSCRED:
+               v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
+               break;
  
-               case SO_RCVLOWAT:
-                       v.val = sk->sk_rcvlowat;
-                       break;
+       case SO_PEERCRED:
+               if (len > sizeof(sk->sk_peercred))
+                       len = sizeof(sk->sk_peercred);
+               if (copy_to_user(optval, &sk->sk_peercred, len))
+                       return -EFAULT;
+               goto lenout;
  
-               case SO_SNDLOWAT:
-                       v.val=1;
-                       break; 
+       case SO_PEERNAME:
+       {
+               char address[128];
+
+               if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
+                       return -ENOTCONN;
+               if (lv < len)
+                       return -EINVAL;
+               if (copy_to_user(optval, address, len))
+                       return -EFAULT;
+               goto lenout;
+       }
  
-               case SO_PASSCRED:
-                       v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
-                       break;
+       /* Dubious BSD thing... Probably nobody even uses it, but
+        * the UNIX standard wants it for whatever reason... -DaveM
+        */
+       case SO_ACCEPTCONN:
+               v.val = sk->sk_state == TCP_LISTEN;
+               break;
  
-               case SO_PEERCRED:
-                       if (len > sizeof(sk->sk_peercred))
-                               len = sizeof(sk->sk_peercred);
-                       if (copy_to_user(optval, &sk->sk_peercred, len))
-                               return -EFAULT;
-                       goto lenout;
-
-               case SO_PEERNAME:
-               {
-                       char address[128];
-
-                       if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
-                               return -ENOTCONN;
-                       if (lv < len)
-                               return -EINVAL;
-                       if (copy_to_user(optval, address, len))
-                               return -EFAULT;
-                       goto lenout;
-               }
+       case SO_PASSSEC:
+               v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
+               break;
  
-               /* Dubious BSD thing... Probably nobody even uses it, but
-                * the UNIX standard wants it for whatever reason... -DaveM
-                */
-               case SO_ACCEPTCONN:
-                       v.val = sk->sk_state == TCP_LISTEN;
-                       break;
+       case SO_PEERSEC:
+               return security_socket_getpeersec_stream(sock, optval, optlen, len);
  
-               case SO_PEERSEC:
-                       return security_socket_getpeersec_stream(sock, optval, optlen, len);
+       case SO_MARK:
+               v.val = sk->sk_mark;
+               break;
  
-               default:
-                       return(-ENOPROTOOPT);
+       default:
+               return -ENOPROTOOPT;
         }
+
         if (len > lv)
                 len = lv;
         if (copy_to_user(optval, &v, len))
                 return -EFAULT;
  lenout:
-       if (put_user(len, optlen))
-               return -EFAULT;
-       return 0;
+       if (put_user(len, optlen))
+               return -EFAULT;
+       return 0;
  }
  
-/**
- *     sk_alloc - All socket objects are allocated here
- *     @family: protocol family
- *     @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
- *     @prot: struct proto associated with this new sock instance
- *     @zero_it: if we should zero the newly allocated sock
+/*
+ * Initialize an sk_lock.
+ *
+ * (We also register the sk_lock with the lock validator.)
   */
-struct sock *sk_alloc(int family, gfp_t priority,
-                     struct proto *prot, int zero_it)
+static inline void sock_lock_init(struct sock *sk)
  {
-       struct sock *sk = NULL;
-       kmem_cache_t *slab = prot->slab;
+       sock_lock_init_class_and_name(sk,
+                       af_family_slock_key_strings[sk->sk_family],
+                       af_family_slock_keys + sk->sk_family,
+                       af_family_key_strings[sk->sk_family],
+                       af_family_keys + sk->sk_family);
+}
  
+static void sock_copy(struct sock *nsk, const struct sock *osk)
+{
+#ifdef CONFIG_SECURITY_NETWORK
+       void *sptr = nsk->sk_security;
+#endif
+
+       memcpy(nsk, osk, osk->sk_prot->obj_size);
+#ifdef CONFIG_SECURITY_NETWORK
+       nsk->sk_security = sptr;
+       security_sk_clone(osk, nsk);
+#endif
+}
+
+static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
+               int family)
+{
+       struct sock *sk;
+       struct kmem_cache *slab;
+
+       slab = prot->slab;
         if (slab != NULL)
                 sk = kmem_cache_alloc(slab, priority);
         else
                 sk = kmalloc(prot->obj_size, priority);
  
-       if (sk) {
-               if (zero_it) {
-                       memset(sk, 0, prot->obj_size);
-                       sk->sk_family = family;
-                       /*
-                        * See comment in struct sock definition to understand
-                        * why we need sk_prot_creator -acme
-                        */
-                       sk->sk_prot = sk->sk_prot_creator = prot;
-                       sock_lock_init(sk);
-               }
-               
+       if (sk != NULL) {
                 if (security_sk_alloc(sk, family, priority))
                         goto out_free;
  
                 if (!try_module_get(prot->owner))
-                       goto out_free;
+                       goto out_free_sec;
         }
+
         return sk;
  
+out_free_sec:
+       security_sk_free(sk);
  out_free:
         if (slab != NULL)
                 kmem_cache_free(slab, sk);
@@ -785,18 +920,61 @@ out_free:
         return NULL;
  }
  
+static void sk_prot_free(struct proto *prot, struct sock *sk)
+{
+       struct kmem_cache *slab;
+       struct module *owner;
+
+       owner = prot->owner;
+       slab = prot->slab;
+
+       security_sk_free(sk);
+       if (slab != NULL)
+               kmem_cache_free(slab, sk);
+       else
+               kfree(sk);
+       module_put(owner);
+}
+
+/**
+ *     sk_alloc - All socket objects are allocated here
+ *     @net: the applicable net namespace
+ *     @family: protocol family
+ *     @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc); the sock
+ *             is always zeroed, %__GFP_ZERO is OR-ed in by sk_alloc()
+ *     @prot: struct proto associated with this new sock instance
+ */
+struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
+                     struct proto *prot)
+{
+       struct sock *sk;
+
+       sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
+       if (sk) {
+               sk->sk_family = family;
+               /*
+                * See comment in struct sock definition to understand
+                * why we need sk_prot_creator -acme
+                */
+               sk->sk_prot = sk->sk_prot_creator = prot;
+               sock_lock_init(sk);
+               sk->sk_net = get_net(net);
+       }
+
+       return sk;
+}
+
  void sk_free(struct sock *sk)
  {
         struct sk_filter *filter;
-       struct module *owner = sk->sk_prot_creator->owner;
  
         if (sk->sk_destruct)
                 sk->sk_destruct(sk);
  
-       filter = sk->sk_filter;
+       filter = rcu_dereference(sk->sk_filter);
         if (filter) {
-               sk_filter_release(sk, filter);
-               sk->sk_filter = NULL;
+               sk_filter_uncharge(sk, filter);
+               rcu_assign_pointer(sk->sk_filter, NULL);
         }
  
         sock_disable_timestamp(sk);
@@ -805,27 +983,26 @@ void sk_free(struct sock *sk)
                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
                        __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
  
-       security_sk_free(sk);
-       if (sk->sk_prot_creator->slab != NULL)
-               kmem_cache_free(sk->sk_prot_creator->slab, sk);
-       else
-               kfree(sk);
-       module_put(owner);
+       put_net(sk->sk_net);
+       sk_prot_free(sk->sk_prot_creator, sk);
  }
  
  struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
  {
-       struct sock *newsk = sk_alloc(sk->sk_family, priority, sk->sk_prot, 0);
+       struct sock *newsk;
  
+       newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
         if (newsk != NULL) {
                 struct sk_filter *filter;
  
-               memcpy(newsk, sk, sk->sk_prot->obj_size);
+               sock_copy(newsk, sk);
  
                 /* SANITY */
+               get_net(newsk->sk_net);
                 sk_node_init(&newsk->sk_node);
                 sock_lock_init(newsk);
                 bh_lock_sock(newsk);
+               newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
  
                 atomic_set(&newsk->sk_rmem_alloc, 0);
                 atomic_set(&newsk->sk_wmem_alloc, 0);
@@ -838,12 +1015,14 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
  
                 rwlock_init(&newsk->sk_dst_lock);
                 rwlock_init(&newsk->sk_callback_lock);
+               lockdep_set_class_and_name(&newsk->sk_callback_lock,
+                               af_callback_keys + newsk->sk_family,
+                               af_family_clock_key_strings[newsk->sk_family]);
  
                 newsk->sk_dst_cache     = NULL;
                 newsk->sk_wmem_queued   = 0;
                 newsk->sk_forward_alloc = 0;
                 newsk->sk_send_head     = NULL;
-               newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
  
                 sock_reset_flag(newsk, SOCK_DONE);
@@ -890,6 +1069,21 @@ out:
  
  EXPORT_SYMBOL_GPL(sk_clone);
  
+void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
+{
+       __sk_dst_set(sk, dst);
+       sk->sk_route_caps = dst->dev->features;
+       if (sk->sk_route_caps & NETIF_F_GSO)
+               sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
+       if (sk_can_gso(sk)) {
+               if (dst->header_len)
+                       sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+               else
+                       sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+       }
+}
+EXPORT_SYMBOL_GPL(sk_setup_caps);
+
  void __init sk_init(void)
  {
         if (num_physpages <= 4096) {
@@ -908,8 +1102,8 @@ void __init sk_init(void)
   */
  
  
-/* 
- * Write buffer destructor automatically called from kfree_skb. 
+/*
+ * Write buffer destructor automatically called from kfree_skb.
   */
  void sock_wfree(struct sk_buff *skb)
  {
@@ -922,14 +1116,16 @@ void sock_wfree(struct sk_buff *skb)
         sock_put(sk);
  }
  
-/* 
- * Read buffer destructor automatically called from kfree_skb. 
+/*
+ * Read buffer destructor automatically called from kfree_skb.
   */
  void sock_rfree(struct sk_buff *skb)
  {
         struct sock *sk = skb->sk;
  
+       skb_truesize_check(skb);
         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+       sk_mem_uncharge(skb->sk, skb->truesize);
  }
  
  
@@ -971,7 +1167,7 @@ struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
  
  /*
   * Allocate a skb from the socket's receive buffer.
- */ 
+ */
  struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
                              gfp_t priority)
  {
@@ -985,16 +1181,16 @@ struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
         return NULL;
  }
  
-/* 
+/*
   * Allocate a memory block from the socket's option memory buffer.
- */ 
+ */
  void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
  {
         if ((unsigned)size <= sysctl_optmem_max &&
             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
                 void *mem;
                 /* First do the add, to avoid the race if kmalloc
-                * might sleep.
+                * might sleep.
                  */
                 atomic_add(size, &sk->sk_omem_alloc);
                 mem = kmalloc(size, priority);
@@ -1071,7 +1267,7 @@ static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
                         goto failure;
  
                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
-                       skb = alloc_skb(header_len, sk->sk_allocation);
+                       skb = alloc_skb(header_len, gfp_mask);
                         if (skb) {
                                 int npages;
                                 int i;
@@ -1130,7 +1326,7 @@ failure:
         return NULL;
  }
  
-struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, 
+struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                     int noblock, int *errcode)
  {
         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
@@ -1140,13 +1336,13 @@ static void __lock_sock(struct sock *sk)
  {
         DEFINE_WAIT(wait);
  
-       for(;;) {
+       for (;;) {
                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
                                         TASK_UNINTERRUPTIBLE);
                 spin_unlock_bh(&sk->sk_lock.slock);
                 schedule();
                 spin_lock_bh(&sk->sk_lock.slock);
-               if(!sock_owned_by_user(sk))
+               if (!sock_owned_by_user(sk))
                         break;
         }
         finish_wait(&sk->sk_lock.wq, &wait);
@@ -1178,7 +1374,7 @@ static void __release_sock(struct sock *sk)
                 } while (skb != NULL);
  
                 bh_lock_sock(sk);
-       } while((skb = sk->sk_backlog.head) != NULL);
+       } while ((skb = sk->sk_backlog.head) != NULL);
  }
  
  /**
@@ -1206,6 +1402,103 @@ int sk_wait_data(struct sock *sk, long *timeo)
  
  EXPORT_SYMBOL(sk_wait_data);
  
+/**
+ *     __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ *     @sk: socket
+ *     @size: memory size to allocate
+ *     @kind: allocation type
+ *
+ *     If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ *     rmem allocation. This function assumes that protocols which have
+ *     memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+       struct proto *prot = sk->sk_prot;
+       int amt = sk_mem_pages(size);
+       int allocated;
+
+       sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+       allocated = atomic_add_return(amt, prot->memory_allocated);
+
+       /* Under limit. */
+       if (allocated <= prot->sysctl_mem[0]) {
+               if (prot->memory_pressure && *prot->memory_pressure)
+                       *prot->memory_pressure = 0;
+               return 1;
+       }
+
+       /* Under pressure. */
+       if (allocated > prot->sysctl_mem[1])
+               if (prot->enter_memory_pressure)
+                       prot->enter_memory_pressure();
+
+       /* Over hard limit. */
+       if (allocated > prot->sysctl_mem[2])
+               goto suppress_allocation;
+
+       /* guarantee minimum buffer size under pressure */
+       if (kind == SK_MEM_RECV) {
+               if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
+                       return 1;
+       } else { /* SK_MEM_SEND */
+               if (sk->sk_type == SOCK_STREAM) {
+                       if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
+                               return 1;
+               } else if (atomic_read(&sk->sk_wmem_alloc) <
+                          prot->sysctl_wmem[0])
+                               return 1;
+       }
+
+       if (prot->memory_pressure) {
+               if (!*prot->memory_pressure ||
+                   prot->sysctl_mem[2] > atomic_read(prot->sockets_allocated) *
+                   sk_mem_pages(sk->sk_wmem_queued +
+                                atomic_read(&sk->sk_rmem_alloc) +
+                                sk->sk_forward_alloc))
+                       return 1;
+       }
+
+suppress_allocation:
+
+       if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
+               sk_stream_moderate_sndbuf(sk);
+
+               /* Fail only if socket is _under_ its sndbuf.
+                * In this case we cannot block, so that we have to fail.
+                */
+               if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+                       return 1;
+       }
+
+       /* Alas. Undo changes. */
+       sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
+       atomic_sub(amt, prot->memory_allocated);
+       return 0;
+}
+
+EXPORT_SYMBOL(__sk_mem_schedule);
+
+/**
+ *     __sk_mem_reclaim - reclaim memory_allocated
+ *     @sk: socket
+ */
+void __sk_mem_reclaim(struct sock *sk)
+{
+       struct proto *prot = sk->sk_prot;
+
+       atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
+                  prot->memory_allocated);
+       sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+
+       if (prot->memory_pressure && *prot->memory_pressure &&
+           (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
+               *prot->memory_pressure = 0;
+}
+
+EXPORT_SYMBOL(__sk_mem_reclaim);
+
+
  /*
   * Set of default routines for initialising struct proto_ops when
   * the protocol does not support a particular function. In certain
@@ -1218,7 +1511,7 @@ int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
         return -EOPNOTSUPP;
  }
  
-int sock_no_connect(struct socket *sock, struct sockaddr *saddr, 
+int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
                     int len, int flags)
  {
         return -EOPNOTSUPP;
@@ -1234,7 +1527,7 @@ int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
         return -EOPNOTSUPP;
  }
  
-int sock_no_getname(struct socket *sock, struct sockaddr *saddr, 
+int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
                     int *len, int peer)
  {
         return -EOPNOTSUPP;
@@ -1320,7 +1613,7 @@ static void sock_def_error_report(struct sock *sk)
         read_lock(&sk->sk_callback_lock);
         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                 wake_up_interruptible(sk->sk_sleep);
-       sk_wake_async(sk,0,POLL_ERR); 
+       sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
         read_unlock(&sk->sk_callback_lock);
  }
  
@@ -1329,7 +1622,7 @@ static void sock_def_readable(struct sock *sk, int len)
         read_lock(&sk->sk_callback_lock);
         if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                 wake_up_interruptible(sk->sk_sleep);
-       sk_wake_async(sk,1,POLL_IN);
+       sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
         read_unlock(&sk->sk_callback_lock);
  }
  
@@ -1340,13 +1633,13 @@ static void sock_def_write_space(struct sock *sk)
         /* Do not wake up a writer until he can make "significant"
          * progress.  --DaveM
          */
-       if((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
+       if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
                 if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
                         wake_up_interruptible(sk->sk_sleep);
  
                 /* Should agree with poll, otherwise some programs break */
                 if (sock_writeable(sk))
-                       sk_wake_async(sk, 2, POLL_OUT);
+                       sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
         }
  
         read_unlock(&sk->sk_callback_lock);
@@ -1361,7 +1654,7 @@ void sk_send_sigurg(struct sock *sk)
  {
         if (sk->sk_socket && sk->sk_socket->file)
                 if (send_sigurg(&sk->sk_socket->file->f_owner))
-                       sk_wake_async(sk, 3, POLL_PRI);
+                       sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
  }
  
  void sk_reset_timer(struct sock *sk, struct timer_list* timer,
@@ -1393,7 +1686,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
         sk->sk_send_head        =       NULL;
  
         init_timer(&sk->sk_timer);
-       
+
         sk->sk_allocation       =       GFP_KERNEL;
         sk->sk_rcvbuf           =       sysctl_rmem_default;
         sk->sk_sndbuf           =       sysctl_wmem_default;
@@ -1402,8 +1695,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
  
         sock_set_flag(sk, SOCK_ZAPPED);
  
-       if(sock)
-       {
+       if (sock) {
                 sk->sk_type     =       sock->type;
                 sk->sk_sleep    =       &sock->wait;
                 sock->sk        =       sk;
@@ -1412,6 +1704,9 @@ void sock_init_data(struct socket *sock, struct sock *sk)
  
         rwlock_init(&sk->sk_dst_lock);
         rwlock_init(&sk->sk_callback_lock);
+       lockdep_set_class_and_name(&sk->sk_callback_lock,
+                       af_callback_keys + sk->sk_family,
+                       af_family_clock_key_strings[sk->sk_family]);
  
         sk->sk_state_change     =       sock_def_wakeup;
         sk->sk_data_ready       =       sock_def_readable;
@@ -1430,57 +1725,85 @@ void sock_init_data(struct socket *sock, struct sock *sk)
         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
  
-       sk->sk_stamp.tv_sec     = -1L;
-       sk->sk_stamp.tv_usec    = -1L;
+       sk->sk_stamp = ktime_set(-1L, -1L);
  
         atomic_set(&sk->sk_refcnt, 1);
+       atomic_set(&sk->sk_drops, 0);
  }
  
-void fastcall lock_sock(struct sock *sk)
+void fastcall lock_sock_nested(struct sock *sk, int subclass)
  {
         might_sleep();
-       spin_lock_bh(&(sk->sk_lock.slock));
-       if (sk->sk_lock.owner)
+       spin_lock_bh(&sk->sk_lock.slock);
+       if (sk->sk_lock.owned)
                 __lock_sock(sk);
-       sk->sk_lock.owner = (void *)1;
-       spin_unlock_bh(&(sk->sk_lock.slock));
+       sk->sk_lock.owned = 1;
+       spin_unlock(&sk->sk_lock.slock);
+       /*
+        * The sk_lock has mutex_lock() semantics here:
+        */
+       mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
+       local_bh_enable();
  }
  
-EXPORT_SYMBOL(lock_sock);
+EXPORT_SYMBOL(lock_sock_nested);
  
  void fastcall release_sock(struct sock *sk)
  {
-       spin_lock_bh(&(sk->sk_lock.slock));
+       /*
+        * The sk_lock has mutex_unlock() semantics:
+        */
+       mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
+
+       spin_lock_bh(&sk->sk_lock.slock);
         if (sk->sk_backlog.tail)
                 __release_sock(sk);
-       sk->sk_lock.owner = NULL;
-        if (waitqueue_active(&(sk->sk_lock.wq)))
-               wake_up(&(sk->sk_lock.wq));
-       spin_unlock_bh(&(sk->sk_lock.slock));
+       sk->sk_lock.owned = 0;
+       if (waitqueue_active(&sk->sk_lock.wq))
+               wake_up(&sk->sk_lock.wq);
+       spin_unlock_bh(&sk->sk_lock.slock);
  }
  EXPORT_SYMBOL(release_sock);
  
  int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
-{ 
+{
+       struct timeval tv;
         if (!sock_flag(sk, SOCK_TIMESTAMP))
                 sock_enable_timestamp(sk);
-       if (sk->sk_stamp.tv_sec == -1) 
+       tv = ktime_to_timeval(sk->sk_stamp);
+       if (tv.tv_sec == -1)
                 return -ENOENT;
-       if (sk->sk_stamp.tv_sec == 0)
-               do_gettimeofday(&sk->sk_stamp);
-       return copy_to_user(userstamp, &sk->sk_stamp, sizeof(struct timeval)) ?
-               -EFAULT : 0; 
-} 
+       if (tv.tv_sec == 0) {
+               sk->sk_stamp = ktime_get_real();
+               tv = ktime_to_timeval(sk->sk_stamp);
+       }
+       return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
+}
  EXPORT_SYMBOL(sock_get_timestamp);
  
+int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
+{
+       struct timespec ts;
+       if (!sock_flag(sk, SOCK_TIMESTAMP))
+               sock_enable_timestamp(sk);
+       ts = ktime_to_timespec(sk->sk_stamp);
+       if (ts.tv_sec == -1)
+               return -ENOENT;
+       if (ts.tv_sec == 0) {
+               sk->sk_stamp = ktime_get_real();
+               ts = ktime_to_timespec(sk->sk_stamp);
+       }
+       return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
+}
+EXPORT_SYMBOL(sock_get_timestampns);
+
  void sock_enable_timestamp(struct sock *sk)
-{      
-       if (!sock_flag(sk, SOCK_TIMESTAMP)) { 
+{
+       if (!sock_flag(sk, SOCK_TIMESTAMP)) {
                 sock_set_flag(sk, SOCK_TIMESTAMP);
                 net_enable_timestamp();
         }
  }
-EXPORT_SYMBOL(sock_enable_timestamp); 
  
  /*
   *     Get a socket option on an socket.
@@ -1505,7 +1828,7 @@ int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
  {
         struct sock *sk = sock->sk;
  
-       if (sk->sk_prot->compat_setsockopt != NULL)
+       if (sk->sk_prot->compat_getsockopt != NULL)
                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
                                                       optval, optlen);
         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
@@ -1600,16 +1923,20 @@ int proto_register(struct proto *prot, int alloc_slab)
  {
         char *request_sock_slab_name = NULL;
         char *timewait_sock_slab_name;
-       int rc = -ENOBUFS;
+
+       if (sock_prot_inuse_init(prot) != 0) {
+               printk(KERN_CRIT "%s: Can't alloc inuse counters!\n", prot->name);
+               goto out;
+       }
  
         if (alloc_slab) {
                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
-                                              SLAB_HWCACHE_ALIGN, NULL, NULL);
+                                              SLAB_HWCACHE_ALIGN, NULL);
  
                 if (prot->slab == NULL) {
                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
                                prot->name);
-                       goto out;
+                       goto out_free_inuse;
                 }
  
                 if (prot->rsk_prot != NULL) {
@@ -1622,7 +1949,7 @@ int proto_register(struct proto *prot, int alloc_slab)
                         sprintf(request_sock_slab_name, mask, prot->name);
                         prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
                                                                  prot->rsk_prot->obj_size, 0,
-                                                                SLAB_HWCACHE_ALIGN, NULL, NULL);
+                                                                SLAB_HWCACHE_ALIGN, NULL);
  
                         if (prot->rsk_prot->slab == NULL) {
                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
@@ -1644,7 +1971,7 @@ int proto_register(struct proto *prot, int alloc_slab)
                                 kmem_cache_create(timewait_sock_slab_name,
                                                   prot->twsk_prot->twsk_obj_size,
                                                   0, SLAB_HWCACHE_ALIGN,
-                                                 NULL, NULL);
+                                                 NULL);
                         if (prot->twsk_prot->twsk_slab == NULL)
                                 goto out_free_timewait_sock_slab_name;
                 }
@@ -1653,9 +1980,8 @@ int proto_register(struct proto *prot, int alloc_slab)
         write_lock(&proto_list_lock);
         list_add(&prot->node, &proto_list);
         write_unlock(&proto_list_lock);
-       rc = 0;
-out:
-       return rc;
+       return 0;
+
  out_free_timewait_sock_slab_name:
         kfree(timewait_sock_slab_name);
  out_free_request_sock_slab:
@@ -1668,7 +1994,10 @@ out_free_request_sock_slab_name:
  out_free_sock_slab:
         kmem_cache_destroy(prot->slab);
         prot->slab = NULL;
-       goto out;
+out_free_inuse:
+       sock_prot_inuse_free(prot);
+out:
+       return -ENOBUFS;
  }
  
  EXPORT_SYMBOL(proto_register);
@@ -1679,6 +2008,8 @@ void proto_unregister(struct proto *prot)
         list_del(&prot->node);
         write_unlock(&proto_list_lock);
  
+       sock_prot_inuse_free(prot);
+
         if (prot->slab != NULL) {
                 kmem_cache_destroy(prot->slab);
                 prot->slab = NULL;
@@ -1704,49 +2035,20 @@ void proto_unregister(struct proto *prot)
  EXPORT_SYMBOL(proto_unregister);
  
  #ifdef CONFIG_PROC_FS
-static inline struct proto *__proto_head(void)
-{
-       return list_entry(proto_list.next, struct proto, node);
-}
-
-static inline struct proto *proto_head(void)
-{
-       return list_empty(&proto_list) ? NULL : __proto_head();
-}
-
-static inline struct proto *proto_next(struct proto *proto)
-{
-       return proto->node.next == &proto_list ? NULL :
-               list_entry(proto->node.next, struct proto, node);
-}
-
-static inline struct proto *proto_get_idx(loff_t pos)
-{
-       struct proto *proto;
-       loff_t i = 0;
-
-       list_for_each_entry(proto, &proto_list, node)
-               if (i++ == pos)
-                       goto out;
-
-       proto = NULL;
-out:
-       return proto;
-}
-
  static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
+       __acquires(proto_list_lock)
  {
         read_lock(&proto_list_lock);
-       return *pos ? proto_get_idx(*pos - 1) : SEQ_START_TOKEN;
+       return seq_list_start_head(&proto_list, *pos);
  }
  
  static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  {
-       ++*pos;
-       return v == SEQ_START_TOKEN ? proto_head() : proto_next(v);
+       return seq_list_next(v, &proto_list, pos);
  }
  
  static void proto_seq_stop(struct seq_file *seq, void *v)
+       __releases(proto_list_lock)
  {
         read_unlock(&proto_list_lock);
  }
@@ -1791,7 +2093,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
  
  static int proto_seq_show(struct seq_file *seq, void *v)
  {
-       if (v == SEQ_START_TOKEN)
+       if (v == &proto_list)
                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
                            "protocol",
                            "size",
@@ -1803,11 +2105,11 @@ static int proto_seq_show(struct seq_file *seq, void *v)
                            "module",
                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
         else
-               proto_seq_printf(seq, v);
+               proto_seq_printf(seq, list_entry(v, struct proto, node));
         return 0;
  }
  
-static struct seq_operations proto_seq_ops = {
+static const struct seq_operations proto_seq_ops = {
         .start  = proto_seq_start,
         .next   = proto_seq_next,
         .stop   = proto_seq_stop,
@@ -1819,7 +2121,7 @@ static int proto_seq_open(struct inode *inode, struct file *file)
         return seq_open(file, &proto_seq_ops);
  }
  
-static struct file_operations proto_seq_fops = {
+static const struct file_operations proto_seq_fops = {
         .owner          = THIS_MODULE,
         .open           = proto_seq_open,
         .read           = seq_read,
@@ -1830,7 +2132,7 @@ static struct file_operations proto_seq_fops = {
  static int __init proto_init(void)
  {
         /* register /proc/net/protocols */
-       return proc_net_fops_create("protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
+       return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
  }
  
  subsys_initcall(proto_init);
@@ -1866,7 +2168,3 @@ EXPORT_SYMBOL(sock_wmalloc);
  EXPORT_SYMBOL(sock_i_uid);
  EXPORT_SYMBOL(sock_i_ino);
  EXPORT_SYMBOL(sysctl_optmem_max);
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(sysctl_rmem_max);
-EXPORT_SYMBOL(sysctl_wmem_max);
-#endif