wimax: oops: wimax_dev_add() is the only one that can initialize the state
[safe/jmp/linux-2.6] / net / core / sock.c
index eac7aa0..7dbf3ff 100644 (file)
@@ -7,8 +7,6 @@
  *             handler for protocols to use and generic option handler.
  *
  *
- * Version:    $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
- *
  * Authors:    Ross Biro
  *             Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *             Florian La Roche, <flla@stud.uni-sb.de>
 #include <net/net_namespace.h>
 #include <net/request_sock.h>
 #include <net/sock.h>
+#include <linux/net_tstamp.h>
 #include <net/xfrm.h>
 #include <linux/ipsec.h>
 
 static struct lock_class_key af_family_keys[AF_MAX];
 static struct lock_class_key af_family_slock_keys[AF_MAX];
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
 /*
  * Make lock validator output more readable. (we pre-construct these
  * strings build-time, so that runtime initialization of socket
@@ -152,11 +150,12 @@ static const char *af_family_key_strings[AF_MAX+1] = {
   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
-  "sk_lock-21"       , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
+  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"        ,
-  "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
+  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
+  "sk_lock-AF_MAX"
 };
 static const char *af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
@@ -166,11 +165,12 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
-  "slock-21"       , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
+  "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
-  "slock-AF_RXRPC" , "slock-AF_MAX"
+  "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
+  "slock-AF_MAX"
 };
 static const char *af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
@@ -180,13 +180,13 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
-  "clock-21"       , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
+  "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
-  "clock-27"       , "clock-28"          , "clock-29"          ,
+  "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
-  "clock-AF_RXRPC" , "clock-AF_MAX"
+  "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
+  "clock-AF_MAX"
 };
-#endif
 
 /*
  * sk_callback_lock locking rules are per-address-family,
@@ -228,11 +228,12 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
                static int warned __read_mostly;
 
                *timeo_p = 0;
-               if (warned < 10 && net_ratelimit())
+               if (warned < 10 && net_ratelimit()) {
                        warned++;
                        printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
                               "tries to set negative timeout\n",
                                current->comm, task_pid_nr(current));
+               }
                return 0;
        }
        *timeo_p = MAX_SCHEDULE_TIMEOUT;
@@ -255,11 +256,14 @@ static void sock_warn_obsolete_bsdism(const char *name)
        }
 }
 
-static void sock_disable_timestamp(struct sock *sk)
+static void sock_disable_timestamp(struct sock *sk, int flag)
 {
-       if (sock_flag(sk, SOCK_TIMESTAMP)) {
-               sock_reset_flag(sk, SOCK_TIMESTAMP);
-               net_disable_timestamp();
+       if (sock_flag(sk, flag)) {
+               sock_reset_flag(sk, flag);
+               if (!sock_flag(sk, SOCK_TIMESTAMP) &&
+                   !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
+                       net_disable_timestamp();        /* neither flag is set on sk any more */
+               }
        }
 }
 
@@ -269,7 +273,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
        int err = 0;
        int skb_len;
 
-       /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
+       /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
           number of warnings when compiling with -W --ANK
         */
        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
@@ -282,6 +286,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
        if (err)
                goto out;
 
+       if (!sk_rmem_schedule(sk, skb->truesize)) {
+               err = -ENOBUFS;
+               goto out;
+       }
+
        skb->dev = NULL;
        skb_set_owner_r(skb, sk);
 
@@ -320,7 +329,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
                 */
                mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 
-               rc = sk->sk_backlog_rcv(sk, skb);
+               rc = sk_backlog_rcv(sk, skb);
 
                mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
        } else
@@ -367,7 +376,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 {
        int ret = -ENOPROTOOPT;
 #ifdef CONFIG_NETDEVICES
-       struct net *net = sk->sk_net;
+       struct net *net = sock_net(sk);
        char devname[IFNAMSIZ];
        int index;
 
@@ -445,15 +454,6 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
         *      Options without arguments
         */
 
-#ifdef SO_DONTLINGER           /* Compatibility item... */
-       if (optname == SO_DONTLINGER) {
-               lock_sock(sk);
-               sock_reset_flag(sk, SOCK_LINGER);
-               release_sock(sk);
-               return 0;
-       }
-#endif
-
        if (optname == SO_BINDTODEVICE)
                return sock_bindtodevice(sk, optval, optlen);
 
@@ -618,13 +618,38 @@ set_rcvbuf:
                        else
                                sock_set_flag(sk, SOCK_RCVTSTAMPNS);
                        sock_set_flag(sk, SOCK_RCVTSTAMP);
-                       sock_enable_timestamp(sk);
+                       sock_enable_timestamp(sk, SOCK_TIMESTAMP);
                } else {
                        sock_reset_flag(sk, SOCK_RCVTSTAMP);
                        sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
                }
                break;
 
+       case SO_TIMESTAMPING:
+               if (val & ~SOF_TIMESTAMPING_MASK) {
+                       ret = -EINVAL;  /* errors are returned as negative errno */
+                       break;
+               }
+               sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
+                                 val & SOF_TIMESTAMPING_TX_HARDWARE);
+               sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
+                                 val & SOF_TIMESTAMPING_TX_SOFTWARE);
+               sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
+                                 val & SOF_TIMESTAMPING_RX_HARDWARE);
+               if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+                       sock_enable_timestamp(sk,
+                                             SOCK_TIMESTAMPING_RX_SOFTWARE);
+               else
+                       sock_disable_timestamp(sk,
+                                              SOCK_TIMESTAMPING_RX_SOFTWARE);
+               sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
+                                 val & SOF_TIMESTAMPING_SOFTWARE);
+               sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
+                                 val & SOF_TIMESTAMPING_SYS_HARDWARE);
+               sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
+                                 val & SOF_TIMESTAMPING_RAW_HARDWARE);
+               break;
+
        case SO_RCVLOWAT:
                if (val < 0)
                        val = INT_MAX;
@@ -662,6 +687,13 @@ set_rcvbuf:
                else
                        clear_bit(SOCK_PASSSEC, &sock->flags);
                break;
+       case SO_MARK:
+               if (!capable(CAP_NET_ADMIN))
+                       ret = -EPERM;
+               else {
+                       sk->sk_mark = val;
+               }
+               break;
 
                /* We implement the SO_SNDLOWAT etc to
                   not be settable (1003.1g 5.3) */
@@ -693,6 +725,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
        if (len < 0)
                return -EINVAL;
 
+       memset(&v, 0, sizeof(v));
+
        switch(optname) {
        case SO_DEBUG:
                v.val = sock_flag(sk, SOCK_DBG);
@@ -763,6 +797,24 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
                v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
                break;
 
+       case SO_TIMESTAMPING:
+               v.val = 0;
+               if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
+                       v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
+               if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
+                       v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
+               if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
+                       v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
+               if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+                       v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
+               if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
+                       v.val |= SOF_TIMESTAMPING_SOFTWARE;
+               if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
+                       v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
+               if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
+                       v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
+               break;
+
        case SO_RCVTIMEO:
                lv=sizeof(struct timeval);
                if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
@@ -831,6 +883,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
        case SO_PEERSEC:
                return security_socket_getpeersec_stream(sock, optval, optlen, len);
 
+       case SO_MARK:
+               v.val = sk->sk_mark;
+               break;
+
        default:
                return -ENOPROTOOPT;
        }
@@ -926,7 +982,6 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
  *     @family: protocol family
  *     @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
  *     @prot: struct proto associated with this new sock instance
- *     @zero_it: if we should zero the newly allocated sock
  */
 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                      struct proto *prot)
@@ -942,7 +997,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
                 */
                sk->sk_prot = sk->sk_prot_creator = prot;
                sock_lock_init(sk);
-               sk->sk_net = get_net(net);
+               sock_net_set(sk, get_net(net));
        }
 
        return sk;
@@ -961,16 +1016,37 @@ void sk_free(struct sock *sk)
                rcu_assign_pointer(sk->sk_filter, NULL);
        }
 
-       sock_disable_timestamp(sk);
+       sock_disable_timestamp(sk, SOCK_TIMESTAMP);
+       sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
 
        if (atomic_read(&sk->sk_omem_alloc))
                printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
-                      __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
+                      __func__, atomic_read(&sk->sk_omem_alloc));
 
-       put_net(sk->sk_net);
+       put_net(sock_net(sk));
        sk_prot_free(sk->sk_prot_creator, sk);
 }
 
+/*
+ * Last sock_put should drop reference to sk->sk_net. It has already
+ * been dropped in sk_change_net. Taking reference to stopping namespace
+ * is not an option.
+ * Take reference to a socket to remove it from hash _alive_ and after that
+ * destroy it in the context of init_net.
+ */
+void sk_release_kernel(struct sock *sk)
+{
+       if (sk == NULL || sk->sk_socket == NULL)
+               return;
+
+       sock_hold(sk);
+       sock_release(sk->sk_socket);
+       release_net(sock_net(sk));
+       sock_net_set(sk, get_net(&init_net));
+       sock_put(sk);
+}
+EXPORT_SYMBOL(sk_release_kernel);
+
 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 {
        struct sock *newsk;
@@ -982,7 +1058,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
                sock_copy(newsk, sk);
 
                /* SANITY */
-               get_net(newsk->sk_net);
+               get_net(sock_net(newsk));
                sk_node_init(&newsk->sk_node);
                sock_lock_init(newsk);
                bh_lock_sock(newsk);
@@ -1041,11 +1117,11 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);
-               newsk->sk_socket = NULL;
+               sk_set_socket(newsk, NULL);
                newsk->sk_sleep  = NULL;
 
                if (newsk->sk_prot->sockets_allocated)
-                       atomic_inc(newsk->sk_prot->sockets_allocated);
+                       percpu_counter_inc(newsk->sk_prot->sockets_allocated);
        }
 out:
        return newsk;
@@ -1060,10 +1136,12 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
        if (sk->sk_route_caps & NETIF_F_GSO)
                sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
        if (sk_can_gso(sk)) {
-               if (dst->header_len)
+               if (dst->header_len) {
                        sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
-               else
+               } else {
                        sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+                       sk->sk_gso_max_size = dst->dev->gso_max_size;
+               }
        }
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
@@ -1108,6 +1186,7 @@ void sock_rfree(struct sk_buff *skb)
        struct sock *sk = skb->sk;
 
        atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+       sk_mem_uncharge(skb->sk, skb->truesize);
 }
 
 
@@ -1224,10 +1303,9 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo)
  *     Generic send/receive buffer handlers
  */
 
-static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
-                                           unsigned long header_len,
-                                           unsigned long data_len,
-                                           int noblock, int *errcode)
+struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
+                                    unsigned long data_len, int noblock,
+                                    int *errcode)
 {
        struct sk_buff *skb;
        gfp_t gfp_mask;
@@ -1307,6 +1385,7 @@ failure:
        *errcode = err;
        return NULL;
 }
+EXPORT_SYMBOL(sock_alloc_send_pskb);
 
 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
                                    int noblock, int *errcode)
@@ -1342,7 +1421,7 @@ static void __release_sock(struct sock *sk)
                        struct sk_buff *next = skb->next;
 
                        skb->next = NULL;
-                       sk->sk_backlog_rcv(sk, skb);
+                       sk_backlog_rcv(sk, skb);
 
                        /*
                         * We are in process context here with softirqs
@@ -1384,6 +1463,107 @@ int sk_wait_data(struct sock *sk, long *timeo)
 
 EXPORT_SYMBOL(sk_wait_data);
 
+/**
+ *     __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ *     @sk: socket
+ *     @size: memory size to allocate
+ *     @kind: allocation type
+ *
+ *     If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ *     rmem allocation. This function assumes that protocols which have
+ *     memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+       struct proto *prot = sk->sk_prot;
+       int amt = sk_mem_pages(size);
+       int allocated;
+
+       sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;   /* charge up front */
+       allocated = atomic_add_return(amt, prot->memory_allocated);
+
+       /* Under limit. */
+       if (allocated <= prot->sysctl_mem[0]) {
+               if (prot->memory_pressure && *prot->memory_pressure)
+                       *prot->memory_pressure = 0;
+               return 1;
+       }
+
+       /* Under pressure. */
+       if (allocated > prot->sysctl_mem[1])
+               if (prot->enter_memory_pressure)
+                       prot->enter_memory_pressure(sk);
+
+       /* Over hard limit. */
+       if (allocated > prot->sysctl_mem[2])
+               goto suppress_allocation;
+
+       /* guarantee minimum buffer size under pressure */
+       if (kind == SK_MEM_RECV) {
+               if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
+                       return 1;
+       } else { /* SK_MEM_SEND */
+               if (sk->sk_type == SOCK_STREAM) {
+                       if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
+                               return 1;
+               } else if (atomic_read(&sk->sk_wmem_alloc) <
+                          prot->sysctl_wmem[0])
+                               return 1;
+       }
+
+       if (prot->memory_pressure) {
+               int alloc;
+
+               if (!*prot->memory_pressure)
+                       return 1;
+               alloc = percpu_counter_read_positive(prot->sockets_allocated);
+               if (prot->sysctl_mem[2] > alloc *
+                   sk_mem_pages(sk->sk_wmem_queued +
+                                atomic_read(&sk->sk_rmem_alloc) +
+                                sk->sk_forward_alloc))
+                       return 1;
+       }
+
+suppress_allocation:
+
+       if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
+               sk_stream_moderate_sndbuf(sk);
+
+               /* Fail only if socket is _under_ its sndbuf.
+                * In this case we cannot block, so that we have to fail.
+                */
+               if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+                       return 1;
+       }
+
+       /* Alas. Undo changes. */
+       sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
+       atomic_sub(amt, prot->memory_allocated);
+       return 0;       /* refused; the charge above was rolled back */
+}
+
+EXPORT_SYMBOL(__sk_mem_schedule);
+
+/**
+ *     __sk_mem_reclaim - reclaim memory_allocated
+ *     @sk: socket
+ */
+void __sk_mem_reclaim(struct sock *sk)
+{
+       struct proto *prot = sk->sk_prot;
+
+       atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
+                  prot->memory_allocated);
+       sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;     /* keep sub-quantum rest */
+
+       if (prot->memory_pressure && *prot->memory_pressure &&
+           (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
+               *prot->memory_pressure = 0;
+}
+
+EXPORT_SYMBOL(__sk_mem_reclaim);
+
+
 /*
  * Set of default routines for initialising struct proto_ops when
  * the protocol does not support a particular function. In certain
@@ -1497,8 +1677,8 @@ static void sock_def_error_report(struct sock *sk)
 {
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-               wake_up_interruptible(sk->sk_sleep);
-       sk_wake_async(sk,0,POLL_ERR);
+               wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
+       sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
        read_unlock(&sk->sk_callback_lock);
 }
 
@@ -1506,8 +1686,9 @@ static void sock_def_readable(struct sock *sk, int len)
 {
        read_lock(&sk->sk_callback_lock);
        if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-               wake_up_interruptible(sk->sk_sleep);
-       sk_wake_async(sk,1,POLL_IN);
+               wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
+                                               POLLRDNORM | POLLRDBAND);
+       sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
        read_unlock(&sk->sk_callback_lock);
 }
 
@@ -1520,11 +1701,12 @@ static void sock_def_write_space(struct sock *sk)
         */
        if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
                if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-                       wake_up_interruptible(sk->sk_sleep);
+                       wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
+                                               POLLWRNORM | POLLWRBAND);
 
                /* Should agree with poll, otherwise some programs break */
                if (sock_writeable(sk))
-                       sk_wake_async(sk, 2, POLL_OUT);
+                       sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
        }
 
        read_unlock(&sk->sk_callback_lock);
@@ -1539,7 +1721,7 @@ void sk_send_sigurg(struct sock *sk)
 {
        if (sk->sk_socket && sk->sk_socket->file)
                if (send_sigurg(&sk->sk_socket->file->f_owner))
-                       sk_wake_async(sk, 3, POLL_PRI);
+                       sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
 }
 
 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
@@ -1576,7 +1758,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
        sk->sk_rcvbuf           =       sysctl_rmem_default;
        sk->sk_sndbuf           =       sysctl_wmem_default;
        sk->sk_state            =       TCP_CLOSE;
-       sk->sk_socket           =       sock;
+       sk_set_socket(sk, sock);
 
        sock_set_flag(sk, SOCK_ZAPPED);
 
@@ -1610,13 +1792,13 @@ void sock_init_data(struct socket *sock, struct sock *sk)
        sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
        sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
 
-       sk->sk_stamp = ktime_set(-1L, -1L);
+       sk->sk_stamp = ktime_set(-1L, 0);
 
        atomic_set(&sk->sk_refcnt, 1);
        atomic_set(&sk->sk_drops, 0);
 }
 
-void fastcall lock_sock_nested(struct sock *sk, int subclass)
+void lock_sock_nested(struct sock *sk, int subclass)
 {
        might_sleep();
        spin_lock_bh(&sk->sk_lock.slock);
@@ -1633,7 +1815,7 @@ void fastcall lock_sock_nested(struct sock *sk, int subclass)
 
 EXPORT_SYMBOL(lock_sock_nested);
 
-void fastcall release_sock(struct sock *sk)
+void release_sock(struct sock *sk)
 {
        /*
         * The sk_lock has mutex_unlock() semantics:
@@ -1654,7 +1836,7 @@ int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
 {
        struct timeval tv;
        if (!sock_flag(sk, SOCK_TIMESTAMP))
-               sock_enable_timestamp(sk);
+               sock_enable_timestamp(sk, SOCK_TIMESTAMP);
        tv = ktime_to_timeval(sk->sk_stamp);
        if (tv.tv_sec == -1)
                return -ENOENT;
@@ -1670,7 +1852,7 @@ int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
 {
        struct timespec ts;
        if (!sock_flag(sk, SOCK_TIMESTAMP))
-               sock_enable_timestamp(sk);
+               sock_enable_timestamp(sk, SOCK_TIMESTAMP);
        ts = ktime_to_timespec(sk->sk_stamp);
        if (ts.tv_sec == -1)
                return -ENOENT;
@@ -1682,11 +1864,20 @@ int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
 }
 EXPORT_SYMBOL(sock_get_timestampns);
 
-void sock_enable_timestamp(struct sock *sk)
+void sock_enable_timestamp(struct sock *sk, int flag)
 {
-       if (!sock_flag(sk, SOCK_TIMESTAMP)) {
-               sock_set_flag(sk, SOCK_TIMESTAMP);
-               net_enable_timestamp();
+       if (!sock_flag(sk, flag)) {
+               sock_set_flag(sk, flag);
+               /*
+                * we just set one of the two flags which require net
+                * time stamping, but time stamping might have been on
+                * already because of the other one
+                */
+               if (!sock_flag(sk,
+                               flag == SOCK_TIMESTAMP ?
+                               SOCK_TIMESTAMPING_RX_SOFTWARE :
+                               SOCK_TIMESTAMP))
+                       net_enable_timestamp();
        }
 }
 
@@ -1804,35 +1995,130 @@ EXPORT_SYMBOL(sk_common_release);
 static DEFINE_RWLOCK(proto_list_lock);
 static LIST_HEAD(proto_list);
 
-int proto_register(struct proto *prot, int alloc_slab)
+#ifdef CONFIG_PROC_FS
+#define PROTO_INUSE_NR 64      /* should be enough for the first time */
+struct prot_inuse {
+       int val[PROTO_INUSE_NR];
+};
+
+static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
+
+#ifdef CONFIG_NET_NS
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
 {
-       char *request_sock_slab_name = NULL;
-       char *timewait_sock_slab_name;
+       int cpu = smp_processor_id();
+       per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
 
-       if (pcounter_alloc(&prot->inuse) != 0) {
-               printk(KERN_CRIT "%s: Can't alloc inuse counters!\n", prot->name);
-               goto out;
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+       int cpu, idx = prot->inuse_idx;
+       int res = 0;
+
+       for_each_possible_cpu(cpu)
+               res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
+
+       return res >= 0 ? res : 0;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+
+static int sock_inuse_init_net(struct net *net)
+{
+       net->core.inuse = alloc_percpu(struct prot_inuse);
+       return net->core.inuse ? 0 : -ENOMEM;
+}
+
+static void sock_inuse_exit_net(struct net *net)
+{
+       free_percpu(net->core.inuse);
+}
+
+static struct pernet_operations net_inuse_ops = {
+       .init = sock_inuse_init_net,
+       .exit = sock_inuse_exit_net,
+};
+
+static __init int net_inuse_init(void)
+{
+       if (register_pernet_subsys(&net_inuse_ops))
+               panic("Cannot initialize net inuse counters");
+
+       return 0;
+}
+
+core_initcall(net_inuse_init);
+#else
+static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
+
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+       __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+       int cpu, idx = prot->inuse_idx;
+       int res = 0;
+
+       for_each_possible_cpu(cpu)
+               res += per_cpu(prot_inuse, cpu).val[idx];
+
+       return res >= 0 ? res : 0;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+#endif
+
+static void assign_proto_idx(struct proto *prot)
+{
+       prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
+
+       if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
+               printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
+               return;         /* bit for the last slot is never set */
        }
 
+       set_bit(prot->inuse_idx, proto_inuse_idx);
+}
+
+static void release_proto_idx(struct proto *prot)
+{
+       if (prot->inuse_idx != PROTO_INUSE_NR - 1)
+               clear_bit(prot->inuse_idx, proto_inuse_idx);
+}
+#else
+static inline void assign_proto_idx(struct proto *prot)
+{
+}
+
+static inline void release_proto_idx(struct proto *prot)
+{
+}
+#endif
+
+int proto_register(struct proto *prot, int alloc_slab)
+{
        if (alloc_slab) {
                prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
-                                              SLAB_HWCACHE_ALIGN, NULL);
+                                       SLAB_HWCACHE_ALIGN | prot->slab_flags,
+                                       NULL);
 
                if (prot->slab == NULL) {
                        printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
                               prot->name);
-                       goto out_free_inuse;
+                       goto out;
                }
 
                if (prot->rsk_prot != NULL) {
                        static const char mask[] = "request_sock_%s";
 
-                       request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
-                       if (request_sock_slab_name == NULL)
+                       prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+                       if (prot->rsk_prot->slab_name == NULL)
                                goto out_free_sock_slab;
 
-                       sprintf(request_sock_slab_name, mask, prot->name);
-                       prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
+                       sprintf(prot->rsk_prot->slab_name, mask, prot->name);
+                       prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
                                                                 prot->rsk_prot->obj_size, 0,
                                                                 SLAB_HWCACHE_ALIGN, NULL);
 
@@ -1846,16 +2132,18 @@ int proto_register(struct proto *prot, int alloc_slab)
                if (prot->twsk_prot != NULL) {
                        static const char mask[] = "tw_sock_%s";
 
-                       timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+                       prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
 
-                       if (timewait_sock_slab_name == NULL)
+                       if (prot->twsk_prot->twsk_slab_name == NULL)
                                goto out_free_request_sock_slab;
 
-                       sprintf(timewait_sock_slab_name, mask, prot->name);
+                       sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
                        prot->twsk_prot->twsk_slab =
-                               kmem_cache_create(timewait_sock_slab_name,
+                               kmem_cache_create(prot->twsk_prot->twsk_slab_name,
                                                  prot->twsk_prot->twsk_obj_size,
-                                                 0, SLAB_HWCACHE_ALIGN,
+                                                 0,
+                                                 SLAB_HWCACHE_ALIGN |
+                                                       prot->slab_flags,
                                                  NULL);
                        if (prot->twsk_prot->twsk_slab == NULL)
                                goto out_free_timewait_sock_slab_name;
@@ -1864,23 +2152,22 @@ int proto_register(struct proto *prot, int alloc_slab)
 
        write_lock(&proto_list_lock);
        list_add(&prot->node, &proto_list);
+       assign_proto_idx(prot);
        write_unlock(&proto_list_lock);
        return 0;
 
 out_free_timewait_sock_slab_name:
-       kfree(timewait_sock_slab_name);
+       kfree(prot->twsk_prot->twsk_slab_name);
 out_free_request_sock_slab:
        if (prot->rsk_prot && prot->rsk_prot->slab) {
                kmem_cache_destroy(prot->rsk_prot->slab);
                prot->rsk_prot->slab = NULL;
        }
 out_free_request_sock_slab_name:
-       kfree(request_sock_slab_name);
+       kfree(prot->rsk_prot->slab_name);
 out_free_sock_slab:
        kmem_cache_destroy(prot->slab);
        prot->slab = NULL;
-out_free_inuse:
-       pcounter_free(&prot->inuse);
 out:
        return -ENOBUFS;
 }
@@ -1890,29 +2177,24 @@ EXPORT_SYMBOL(proto_register);
 void proto_unregister(struct proto *prot)
 {
        write_lock(&proto_list_lock);
+       release_proto_idx(prot);
        list_del(&prot->node);
        write_unlock(&proto_list_lock);
 
-       pcounter_free(&prot->inuse);
-
        if (prot->slab != NULL) {
                kmem_cache_destroy(prot->slab);
                prot->slab = NULL;
        }
 
        if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
-               const char *name = kmem_cache_name(prot->rsk_prot->slab);
-
                kmem_cache_destroy(prot->rsk_prot->slab);
-               kfree(name);
+               kfree(prot->rsk_prot->slab_name);
                prot->rsk_prot->slab = NULL;
        }
 
        if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
-               const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
-
                kmem_cache_destroy(prot->twsk_prot->twsk_slab);
-               kfree(name);
+               kfree(prot->twsk_prot->twsk_slab_name);
                prot->twsk_prot->twsk_slab = NULL;
        }
 }
@@ -1921,6 +2203,7 @@ EXPORT_SYMBOL(proto_unregister);
 
 #ifdef CONFIG_PROC_FS
 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
+       __acquires(proto_list_lock)
 {
        read_lock(&proto_list_lock);
        return seq_list_start_head(&proto_list, *pos);
@@ -1932,6 +2215,7 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void proto_seq_stop(struct seq_file *seq, void *v)
+       __releases(proto_list_lock)
 {
        read_unlock(&proto_list_lock);
 }
@@ -1947,7 +2231,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
                        "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
                   proto->name,
                   proto->obj_size,
-                  proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
+                  sock_prot_inuse_get(seq_file_net(seq), proto),
                   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
                   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
                   proto->max_header,
@@ -2001,7 +2285,8 @@ static const struct seq_operations proto_seq_ops = {
 
 static int proto_seq_open(struct inode *inode, struct file *file)
 {
-       return seq_open(file, &proto_seq_ops);
+       return seq_open_net(inode, file, &proto_seq_ops,
+                           sizeof(struct seq_net_private));
 }
 
 static const struct file_operations proto_seq_fops = {
@@ -2009,13 +2294,31 @@ static const struct file_operations proto_seq_fops = {
        .open           = proto_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
-       .release        = seq_release,
+       .release        = seq_release_net,
+};
+
+static __net_init int proto_init_net(struct net *net)
+{
+       if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
+               return -ENOMEM;
+
+       return 0;
+}
+
+static __net_exit void proto_exit_net(struct net *net)
+{
+       proc_net_remove(net, "protocols");
+}
+
+
+static __net_initdata struct pernet_operations proto_net_ops = {
+       .init = proto_init_net,
+       .exit = proto_exit_net,
 };
 
 static int __init proto_init(void)
 {
-       /* register /proc/net/protocols */
-       return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
+       return register_pernet_subsys(&proto_net_ops);
 }
 
 subsys_initcall(proto_init);