diff --git a/net/core/sock.c b/net/core/sock.c
index 0710138..7dbf3ff 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -7,8 +7,6 @@
  *		handler for protocols to use and generic option handler.
  *
  *
- * Version:	$Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
- *
  * Authors:	Ross Biro
  *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  *		Florian La Roche, <flla@stud.uni-sb.de>
@@ -122,6 +120,7 @@
 #include <net/net_namespace.h>
 #include <net/request_sock.h>
 #include <net/sock.h>
+#include <linux/net_tstamp.h>
 #include <net/xfrm.h>
 #include <linux/ipsec.h>
 
@@ -138,7 +137,6 @@ static struct lock_class_key af_family_keys[AF_MAX];
 static struct lock_class_key af_family_slock_keys[AF_MAX];
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
 /*
  * Make lock validator output more readable. (we pre-construct these
  * strings build-time, so that runtime initialization of socket
@@ -152,11 +150,12 @@ static const char *af_family_key_strings[AF_MAX+1] = {
   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET"   ,
   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"  , "sk_lock-AF_ATMSVC"   ,
-  "sk_lock-21"       , "sk_lock-AF_SNA"     , "sk_lock-AF_IRDA"     ,
+  "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"     , "sk_lock-AF_IRDA"     ,
   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC"      ,
-  "sk_lock-27"       , "sk_lock-28"         , "sk_lock-29"          ,
+  "sk_lock-27"       , "sk_lock-28"         , "sk_lock-AF_CAN"      ,
   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV"       ,
-  "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
+  "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"    , "sk_lock-AF_PHONET"   ,
+  "sk_lock-AF_MAX"
 };
 static const char *af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
@@ -166,11 +165,12 @@ static const char *af_family_slock_key_strings[AF_MAX+1] = {
   "slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
   "slock-AF_KEY"   , "slock-AF_NETLINK" , "slock-AF_PACKET"   ,
   "slock-AF_ASH"   , "slock-AF_ECONET"  , "slock-AF_ATMSVC"   ,
-  "slock-21"       , "slock-AF_SNA"     , "slock-AF_IRDA"     ,
+  "slock-AF_RDS"   , "slock-AF_SNA"     , "slock-AF_IRDA"     ,
   "slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC"      ,
-  "slock-27"       , "slock-28"         , "slock-29"          ,
+  "slock-27"       , "slock-28"         , "slock-AF_CAN"      ,
   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"    ,
-  "slock-AF_RXRPC" , "slock-AF_MAX"
+  "slock-AF_RXRPC" , "slock-AF_ISDN"    , "slock-AF_PHONET"   ,
+  "slock-AF_MAX"
 };
 static const char *af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
@@ -180,13 +180,13 @@ static const char *af_family_clock_key_strings[AF_MAX+1] = {
   "clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
   "clock-AF_KEY"   , "clock-AF_NETLINK" , "clock-AF_PACKET"   ,
   "clock-AF_ASH"   , "clock-AF_ECONET"  , "clock-AF_ATMSVC"   ,
-  "clock-21"       , "clock-AF_SNA"     , "clock-AF_IRDA"     ,
+  "clock-AF_RDS"   , "clock-AF_SNA"     , "clock-AF_IRDA"     ,
   "clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC"      ,
-  "clock-27"       , "clock-28"         , "clock-29"          ,
+  "clock-27"       , "clock-28"         , "clock-AF_CAN"      ,
   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"    ,
-  "clock-AF_RXRPC" , "clock-AF_MAX"
+  "clock-AF_RXRPC" , "clock-AF_ISDN"    , "clock-AF_PHONET"   ,
+  "clock-AF_MAX"
 };
-#endif
 
 /*
  * sk_callback_lock locking rules are per-address-family,
@@ -228,11 +228,12 @@ static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
 		static int warned __read_mostly;
 
 		*timeo_p = 0;
-		if (warned < 10 && net_ratelimit())
+		if (warned < 10 && net_ratelimit()) {
 			warned++;
 			printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
 			       "tries to set negative timeout\n",
-				current->comm, current->pid);
+				current->comm, task_pid_nr(current));
+		}
 		return 0;
 	}
 	*timeo_p = MAX_SCHEDULE_TIMEOUT;
@@ -255,11 +256,14 @@ static void sock_warn_obsolete_bsdism(const char *name)
 	}
 }
 
-static void sock_disable_timestamp(struct sock *sk)
+static void sock_disable_timestamp(struct sock *sk, int flag)
 {
-	if (sock_flag(sk, SOCK_TIMESTAMP)) {
-		sock_reset_flag(sk, SOCK_TIMESTAMP);
-		net_disable_timestamp();
+	if (sock_flag(sk, flag)) {
+		sock_reset_flag(sk, flag);
+		if (!sock_flag(sk, SOCK_TIMESTAMP) &&
+		    !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
+			net_disable_timestamp();
+		}
 	}
 }
 
@@ -269,7 +273,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	int err = 0;
 	int skb_len;
 
-	/* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
+	/* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
 	   number of warnings when compiling with -W --ANK
 	 */
 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
@@ -282,6 +286,11 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 	if (err)
 		goto out;
 
+	if (!sk_rmem_schedule(sk, skb->truesize)) {
+		err = -ENOBUFS;
+		goto out;
+	}
+
 	skb->dev = NULL;
 	skb_set_owner_r(skb, sk);
 
@@ -320,7 +329,7 @@ int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
 		 */
 		mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
 
-		rc = sk->sk_backlog_rcv(sk, skb);
+		rc = sk_backlog_rcv(sk, skb);
 
 		mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
 	} else
@@ -367,7 +376,7 @@ static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
 {
 	int ret = -ENOPROTOOPT;
 #ifdef CONFIG_NETDEVICES
-	struct net *net = sk->sk_net;
+	struct net *net = sock_net(sk);
 	char devname[IFNAMSIZ];
 	int index;
 
@@ -419,6 +428,14 @@ out:
 	return ret;
 }
 
+static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+{
+	if (valbool)
+		sock_set_flag(sk, bit);
+	else
+		sock_reset_flag(sk, bit);
+}
+
 /*
  *	This is meant for all protocols to use and covers goings on
  *	at the socket level. Everything here is generic.
@@ -437,15 +454,6 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 	 *	Options without arguments
 	 */
 
-#ifdef SO_DONTLINGER		/* Compatibility item... */
-	if (optname == SO_DONTLINGER) {
-		lock_sock(sk);
-		sock_reset_flag(sk, SOCK_LINGER);
-		release_sock(sk);
-		return 0;
-	}
-#endif
-
 	if (optname == SO_BINDTODEVICE)
 		return sock_bindtodevice(sk, optval, optlen);
 
@@ -463,11 +471,8 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 	case SO_DEBUG:
 		if (val && !capable(CAP_NET_ADMIN)) {
 			ret = -EACCES;
-		}
-		else if (valbool)
-			sock_set_flag(sk, SOCK_DBG);
-		else
-			sock_reset_flag(sk, SOCK_DBG);
+		} else
+			sock_valbool_flag(sk, SOCK_DBG, valbool);
 		break;
 	case SO_REUSEADDR:
 		sk->sk_reuse = valbool;
@@ -477,10 +482,7 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
 		ret = -ENOPROTOOPT;
 		break;
 	case SO_DONTROUTE:
-		if (valbool)
-			sock_set_flag(sk, SOCK_LOCALROUTE);
-		else
-			sock_reset_flag(sk, SOCK_LOCALROUTE);
+		sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
 		break;
 	case SO_BROADCAST:
 		sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
@@ -616,13 +618,38 @@ set_rcvbuf:
 			else
 				sock_set_flag(sk, SOCK_RCVTSTAMPNS);
 			sock_set_flag(sk, SOCK_RCVTSTAMP);
-			sock_enable_timestamp(sk);
+			sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 		} else {
 			sock_reset_flag(sk, SOCK_RCVTSTAMP);
 			sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
 		}
 		break;
 
+	case SO_TIMESTAMPING:
+		if (val & ~SOF_TIMESTAMPING_MASK) {
+			ret = -EINVAL;
+			break;
+		}
+		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
+				  val & SOF_TIMESTAMPING_TX_HARDWARE);
+		sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
+				  val & SOF_TIMESTAMPING_TX_SOFTWARE);
+		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
+				  val & SOF_TIMESTAMPING_RX_HARDWARE);
+		if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+			sock_enable_timestamp(sk,
+					      SOCK_TIMESTAMPING_RX_SOFTWARE);
+		else
+			sock_disable_timestamp(sk,
+					       SOCK_TIMESTAMPING_RX_SOFTWARE);
+		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
+				  val & SOF_TIMESTAMPING_SOFTWARE);
+		sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
+				  val & SOF_TIMESTAMPING_SYS_HARDWARE);
+		sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
+				  val & SOF_TIMESTAMPING_RAW_HARDWARE);
+		break;
+
 	case SO_RCVLOWAT:
 		if (val < 0)
 			val = INT_MAX;
@@ -660,6 +687,13 @@ set_rcvbuf:
 		else
 			clear_bit(SOCK_PASSSEC, &sock->flags);
 		break;
+	case SO_MARK:
+		if (!capable(CAP_NET_ADMIN))
+			ret = -EPERM;
+		else {
+			sk->sk_mark = val;
+		}
+		break;
 
 		/* We implement the SO_SNDLOWAT etc to not be settable (1003.1g 5.3) */
 
@@ -691,6 +725,8 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 	if (len < 0)
 		return -EINVAL;
 
+	memset(&v, 0, sizeof(v));
+
 	switch(optname) {
 	case SO_DEBUG:
 		v.val = sock_flag(sk, SOCK_DBG);
@@ -761,6 +797,24 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 		v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
 		break;
 
+	case SO_TIMESTAMPING:
+		v.val = 0;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
+			v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
+			v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
+			v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+			v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
+			v.val |= SOF_TIMESTAMPING_SOFTWARE;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
+			v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
+		if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
+			v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
+		break;
+
 	case SO_RCVTIMEO:
 		lv=sizeof(struct timeval);
 		if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
@@ -829,6 +883,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
 	case SO_PEERSEC:
 		return security_socket_getpeersec_stream(sock, optval, optlen, len);
 
+	case SO_MARK:
+		v.val = sk->sk_mark;
+		break;
+
 	default:
 		return -ENOPROTOOPT;
 	}
@@ -857,46 +915,43 @@ static inline void sock_lock_init(struct sock *sk)
 			af_family_keys + sk->sk_family);
 }
 
-/**
- *	sk_alloc - All socket objects are allocated here
- *	@net: the applicable net namespace
- *	@family: protocol family
- *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
- *	@prot: struct proto associated with this new sock instance
- *	@zero_it: if we should zero the newly allocated sock
- */
-struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
-		      struct proto *prot, int zero_it)
+static void sock_copy(struct sock *nsk, const struct sock *osk)
+{
+#ifdef CONFIG_SECURITY_NETWORK
+	void *sptr = nsk->sk_security;
+#endif
+
+	memcpy(nsk, osk, osk->sk_prot->obj_size);
+#ifdef CONFIG_SECURITY_NETWORK
+	nsk->sk_security = sptr;
+	security_sk_clone(osk, nsk);
+#endif
+}
+
+static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
+		int family)
 {
-	struct sock *sk = NULL;
-	struct kmem_cache *slab = prot->slab;
+	struct sock *sk;
+	struct kmem_cache *slab;
 
+	slab = prot->slab;
 	if (slab != NULL)
 		sk = kmem_cache_alloc(slab, priority);
 	else
 		sk = kmalloc(prot->obj_size, priority);
 
-	if (sk) {
-		if (zero_it) {
-			memset(sk, 0, prot->obj_size);
-			sk->sk_family = family;
-			/*
-			 * See comment in struct sock definition to understand
-			 * why we need sk_prot_creator -acme
-			 */
-			sk->sk_prot = sk->sk_prot_creator = prot;
-			sock_lock_init(sk);
-			sk->sk_net = get_net(net);
-		}
-
+	if (sk != NULL) {
 		if (security_sk_alloc(sk, family, priority))
 			goto out_free;
 
 		if (!try_module_get(prot->owner))
-			goto out_free;
+			goto out_free_sec;
 	}
 
 	return sk;
+
+out_free_sec:
+	security_sk_free(sk);
 out_free:
 	if (slab != NULL)
 		kmem_cache_free(slab, sk);
@@ -905,45 +960,105 @@ out_free:
 	return NULL;
 }
 
+static void sk_prot_free(struct proto *prot, struct sock *sk)
+{
+	struct kmem_cache *slab;
+	struct module *owner;
+
+	owner = prot->owner;
+	slab = prot->slab;
+
+	security_sk_free(sk);
+	if (slab != NULL)
+		kmem_cache_free(slab, sk);
+	else
+		kfree(sk);
+	module_put(owner);
+}
+
+/**
+ *	sk_alloc - All socket objects are allocated here
+ *	@net: the applicable net namespace
+ *	@family: protocol family
+ *	@priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *	@prot: struct proto associated with this new sock instance
+ */
+struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
+		      struct proto *prot)
+{
+	struct sock *sk;
+
+	sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
+	if (sk) {
+		sk->sk_family = family;
+		/*
+		 * See comment in struct sock definition to understand
+		 * why we need sk_prot_creator -acme
+		 */
+		sk->sk_prot = sk->sk_prot_creator = prot;
+		sock_lock_init(sk);
+		sock_net_set(sk, get_net(net));
+	}
+
+	return sk;
+}
+
 void sk_free(struct sock *sk)
 {
 	struct sk_filter *filter;
-	struct module *owner = sk->sk_prot_creator->owner;
 
 	if (sk->sk_destruct)
 		sk->sk_destruct(sk);
 
 	filter = rcu_dereference(sk->sk_filter);
 	if (filter) {
-		sk_filter_release(sk, filter);
+		sk_filter_uncharge(sk, filter);
 		rcu_assign_pointer(sk->sk_filter, NULL);
 	}
 
-	sock_disable_timestamp(sk);
+	sock_disable_timestamp(sk, SOCK_TIMESTAMP);
+	sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
 
 	if (atomic_read(&sk->sk_omem_alloc))
 		printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
-		       __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
+		       __func__, atomic_read(&sk->sk_omem_alloc));
 
-	security_sk_free(sk);
-	put_net(sk->sk_net);
-	if (sk->sk_prot_creator->slab != NULL)
-		kmem_cache_free(sk->sk_prot_creator->slab, sk);
-	else
-		kfree(sk);
-	module_put(owner);
+	put_net(sock_net(sk));
+	sk_prot_free(sk->sk_prot_creator, sk);
 }
 
+/*
+ * Last sock_put should drop a reference to sk->sk_net. It has already
+ * been dropped in sk_change_net. Taking a reference to a stopping
+ * namespace is not an option.
+ * Take a reference to a socket to remove it from the hash _alive_ and
+ * after that destroy it in the context of init_net.
+ */
+void sk_release_kernel(struct sock *sk)
+{
+	if (sk == NULL || sk->sk_socket == NULL)
+		return;
+
+	sock_hold(sk);
+	sock_release(sk->sk_socket);
+	release_net(sock_net(sk));
+	sock_net_set(sk, get_net(&init_net));
+	sock_put(sk);
+}
+EXPORT_SYMBOL(sk_release_kernel);
+
 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 {
-	struct sock *newsk = sk_alloc(sk->sk_net, sk->sk_family, priority, sk->sk_prot, 0);
+	struct sock *newsk;
 
+	newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
 	if (newsk != NULL) {
 		struct sk_filter *filter;
 
 		sock_copy(newsk, sk);
 
 		/* SANITY */
+		get_net(sock_net(newsk));
 		sk_node_init(&newsk->sk_node);
 		sock_lock_init(newsk);
 		bh_lock_sock(newsk);
@@ -1002,11 +1117,11 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
 		 * to be taken into account in all callers. -acme
 		 */
 		sk_refcnt_debug_inc(newsk);
-		newsk->sk_socket = NULL;
+		sk_set_socket(newsk, NULL);
 		newsk->sk_sleep	 = NULL;
 
 		if (newsk->sk_prot->sockets_allocated)
-			atomic_inc(newsk->sk_prot->sockets_allocated);
+			percpu_counter_inc(newsk->sk_prot->sockets_allocated);
 	}
 out:
 	return newsk;
@@ -1021,10 +1136,12 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 	if (sk->sk_route_caps & NETIF_F_GSO)
 		sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
 	if (sk_can_gso(sk)) {
-		if (dst->header_len)
+		if (dst->header_len) {
 			sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
-		else
+		} else {
 			sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+			sk->sk_gso_max_size = dst->dev->gso_max_size;
+		}
 	}
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
@@ -1069,6 +1186,7 @@ void sock_rfree(struct sk_buff *skb)
 	struct sock *sk = skb->sk;
 
 	atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+	sk_mem_uncharge(skb->sk, skb->truesize);
 }
 
 
@@ -1185,10 +1303,9 @@ static long sock_wait_for_wmem(struct sock * sk, long timeo)
  *	Generic send/receive buffer handlers
  */
 
-static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
-					    unsigned long header_len,
-					    unsigned long data_len,
-					    int noblock, int *errcode)
+struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
+				     unsigned long data_len, int noblock,
+				     int *errcode)
 {
 	struct sk_buff *skb;
 	gfp_t gfp_mask;
@@ -1268,6 +1385,7 @@ failure:
 	*errcode = err;
 	return NULL;
 }
+EXPORT_SYMBOL(sock_alloc_send_pskb);
 
 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
 				    int noblock, int *errcode)
@@ -1303,7 +1421,7 @@ static void __release_sock(struct sock *sk)
 			struct sk_buff *next = skb->next;
 
 			skb->next = NULL;
-			sk->sk_backlog_rcv(sk, skb);
+			sk_backlog_rcv(sk, skb);
 
 			/*
 			 * We are in process context here with softirqs
@@ -1345,6 +1463,107 @@ int sk_wait_data(struct sock *sk, long *timeo)
 
 EXPORT_SYMBOL(sk_wait_data);
 
+/**
+ *	__sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ *	@sk: socket
+ *	@size: memory size to allocate
+ *	@kind: allocation type
+ *
+ *	If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ *	rmem allocation. This function assumes that protocols which have
+ *	memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+	struct proto *prot = sk->sk_prot;
+	int amt = sk_mem_pages(size);
+	int allocated;
+
+	sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+	allocated = atomic_add_return(amt, prot->memory_allocated);
+
+	/* Under limit. */
+	if (allocated <= prot->sysctl_mem[0]) {
+		if (prot->memory_pressure && *prot->memory_pressure)
+			*prot->memory_pressure = 0;
+		return 1;
+	}
+
+	/* Under pressure. */
+	if (allocated > prot->sysctl_mem[1])
+		if (prot->enter_memory_pressure)
+			prot->enter_memory_pressure(sk);
+
+	/* Over hard limit. */
+	if (allocated > prot->sysctl_mem[2])
+		goto suppress_allocation;
+
+	/* guarantee minimum buffer size under pressure */
+	if (kind == SK_MEM_RECV) {
+		if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
+			return 1;
+	} else { /* SK_MEM_SEND */
+		if (sk->sk_type == SOCK_STREAM) {
+			if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
+				return 1;
+		} else if (atomic_read(&sk->sk_wmem_alloc) <
+			   prot->sysctl_wmem[0])
+				return 1;
+	}
+
+	if (prot->memory_pressure) {
+		int alloc;
+
+		if (!*prot->memory_pressure)
+			return 1;
+		alloc = percpu_counter_read_positive(prot->sockets_allocated);
+		if (prot->sysctl_mem[2] > alloc *
+		    sk_mem_pages(sk->sk_wmem_queued +
+				 atomic_read(&sk->sk_rmem_alloc) +
+				 sk->sk_forward_alloc))
+			return 1;
+	}
+
+suppress_allocation:
+
+	if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
+		sk_stream_moderate_sndbuf(sk);
+
+		/* Fail only if socket is _under_ its sndbuf.
+		 * In this case we cannot block, so that we have to fail.
+		 */
+		if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+			return 1;
+	}
+
+	/* Alas. Undo changes. */
+	sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
+	atomic_sub(amt, prot->memory_allocated);
+	return 0;
+}
+
+EXPORT_SYMBOL(__sk_mem_schedule);
+
+/**
+ *	__sk_mem_reclaim - reclaim memory_allocated
+ *	@sk: socket
+ */
+void __sk_mem_reclaim(struct sock *sk)
+{
+	struct proto *prot = sk->sk_prot;
+
+	atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
+		   prot->memory_allocated);
+	sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+
+	if (prot->memory_pressure && *prot->memory_pressure &&
+	    (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
+		*prot->memory_pressure = 0;
+}
+
+EXPORT_SYMBOL(__sk_mem_reclaim);
+
+
 /*
  * Set of default routines for initialising struct proto_ops when
  * the protocol does not support a particular function. In certain
@@ -1458,8 +1677,8 @@ static void sock_def_error_report(struct sock *sk)
 {
 	read_lock(&sk->sk_callback_lock);
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible(sk->sk_sleep);
-	sk_wake_async(sk,0,POLL_ERR);
+		wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
+	sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
 	read_unlock(&sk->sk_callback_lock);
 }
 
@@ -1467,8 +1686,9 @@ static void sock_def_readable(struct sock *sk, int len)
 {
 	read_lock(&sk->sk_callback_lock);
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible(sk->sk_sleep);
-	sk_wake_async(sk,1,POLL_IN);
+		wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
+						POLLRDNORM | POLLRDBAND);
+	sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
 }
 
@@ -1481,11 +1701,12 @@ static void sock_def_write_space(struct sock *sk)
 	 */
 	if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
 		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-			wake_up_interruptible(sk->sk_sleep);
+			wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
+						POLLWRNORM | POLLWRBAND);
 
 		/* Should agree with poll, otherwise some programs break */
 		if (sock_writeable(sk))
-			sk_wake_async(sk, 2, POLL_OUT);
+			sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
 	}
 
 	read_unlock(&sk->sk_callback_lock);
@@ -1500,7 +1721,7 @@ void sk_send_sigurg(struct sock *sk)
 {
 	if (sk->sk_socket && sk->sk_socket->file)
 		if (send_sigurg(&sk->sk_socket->file->f_owner))
-			sk_wake_async(sk, 3, POLL_PRI);
+			sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
 }
 
 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
@@ -1537,7 +1758,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_rcvbuf		=	sysctl_rmem_default;
 	sk->sk_sndbuf		=	sysctl_wmem_default;
 	sk->sk_state		=	TCP_CLOSE;
-	sk->sk_socket		=	sock;
+	sk_set_socket(sk, sock);
 
 	sock_set_flag(sk, SOCK_ZAPPED);
 
@@ -1571,12 +1792,13 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_rcvtimeo		=	MAX_SCHEDULE_TIMEOUT;
 	sk->sk_sndtimeo		=	MAX_SCHEDULE_TIMEOUT;
 
-	sk->sk_stamp = ktime_set(-1L, -1L);
+	sk->sk_stamp = ktime_set(-1L, 0);
 
 	atomic_set(&sk->sk_refcnt, 1);
+	atomic_set(&sk->sk_drops, 0);
 }
 
-void fastcall lock_sock_nested(struct sock *sk, int subclass)
+void lock_sock_nested(struct sock *sk, int subclass)
 {
 	might_sleep();
 	spin_lock_bh(&sk->sk_lock.slock);
@@ -1593,7 +1815,7 @@ void fastcall lock_sock_nested(struct sock *sk, int subclass)
 
 EXPORT_SYMBOL(lock_sock_nested);
 
-void fastcall release_sock(struct sock *sk)
+void release_sock(struct sock *sk)
 {
 	/*
 	 * The sk_lock has mutex_unlock() semantics:
@@ -1614,7 +1836,7 @@ int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
 {
 	struct timeval tv;
 	if (!sock_flag(sk, SOCK_TIMESTAMP))
-		sock_enable_timestamp(sk);
+		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 	tv = ktime_to_timeval(sk->sk_stamp);
 	if (tv.tv_sec == -1)
 		return -ENOENT;
@@ -1630,7 +1852,7 @@ int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
 {
 	struct timespec ts;
 	if (!sock_flag(sk, SOCK_TIMESTAMP))
-		sock_enable_timestamp(sk);
+		sock_enable_timestamp(sk, SOCK_TIMESTAMP);
 	ts = ktime_to_timespec(sk->sk_stamp);
 	if (ts.tv_sec == -1)
 		return -ENOENT;
@@ -1642,14 +1864,22 @@ int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
 }
 EXPORT_SYMBOL(sock_get_timestampns);
 
-void sock_enable_timestamp(struct sock *sk)
+void sock_enable_timestamp(struct sock *sk, int flag)
 {
-	if (!sock_flag(sk, SOCK_TIMESTAMP)) {
-		sock_set_flag(sk, SOCK_TIMESTAMP);
-		net_enable_timestamp();
+	if (!sock_flag(sk, flag)) {
+		sock_set_flag(sk, flag);
+		/*
+		 * we just set one of the two flags which require net
+		 * time stamping, but time stamping might have been on
+		 * already because of the other one
+		 */
+		if (!sock_flag(sk,
+				flag == SOCK_TIMESTAMP ?
+				SOCK_TIMESTAMPING_RX_SOFTWARE :
+				SOCK_TIMESTAMP))
+			net_enable_timestamp();
 	}
 }
-EXPORT_SYMBOL(sock_enable_timestamp);
 
 /*
  *	Get a socket option on a socket.
@@ -1765,15 +1995,114 @@ EXPORT_SYMBOL(sk_common_release);
 static DEFINE_RWLOCK(proto_list_lock);
 static LIST_HEAD(proto_list);
 
-int proto_register(struct proto *prot, int alloc_slab)
+#ifdef CONFIG_PROC_FS
+#define PROTO_INUSE_NR	64	/* should be enough for the first time */
+struct prot_inuse {
+	int val[PROTO_INUSE_NR];
+};
+
+static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
+
+#ifdef CONFIG_NET_NS
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+	int cpu = smp_processor_id();
+	per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+	int cpu, idx = prot->inuse_idx;
+	int res = 0;
+
+	for_each_possible_cpu(cpu)
+		res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
+
+	return res >= 0 ? res : 0;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+
+static int sock_inuse_init_net(struct net *net)
+{
+	net->core.inuse = alloc_percpu(struct prot_inuse);
+	return net->core.inuse ? 0 : -ENOMEM;
+}
+
+static void sock_inuse_exit_net(struct net *net)
+{
+	free_percpu(net->core.inuse);
+}
+
+static struct pernet_operations net_inuse_ops = {
+	.init = sock_inuse_init_net,
+	.exit = sock_inuse_exit_net,
+};
+
+static __init int net_inuse_init(void)
 {
-	char *request_sock_slab_name = NULL;
-	char *timewait_sock_slab_name;
-	int rc = -ENOBUFS;
+	if (register_pernet_subsys(&net_inuse_ops))
+		panic("Cannot initialize net inuse counters");
 
+	return 0;
+}
+
+core_initcall(net_inuse_init);
+#else
+static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
+
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+	__get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
+{
+	int cpu, idx = prot->inuse_idx;
+	int res = 0;
+
+	for_each_possible_cpu(cpu)
+		res += per_cpu(prot_inuse, cpu).val[idx];
+
+	return res >= 0 ? res : 0;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+#endif
+
+static void assign_proto_idx(struct proto *prot)
+{
+	prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
+
+	if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
+		printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
+		return;
+	}
+
+	set_bit(prot->inuse_idx, proto_inuse_idx);
+}
+
+static void release_proto_idx(struct proto *prot)
+{
+	if (prot->inuse_idx != PROTO_INUSE_NR - 1)
+		clear_bit(prot->inuse_idx, proto_inuse_idx);
+}
+#else
+static inline void assign_proto_idx(struct proto *prot)
+{
+}
+
+static inline void release_proto_idx(struct proto *prot)
+{
+}
+#endif
+
+int proto_register(struct proto *prot, int alloc_slab)
+{
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
-					SLAB_HWCACHE_ALIGN, NULL);
+					SLAB_HWCACHE_ALIGN | prot->slab_flags,
+					NULL);
 
 		if (prot->slab == NULL) {
 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
@@ -1784,12 +2113,12 @@ int proto_register(struct proto *prot, int alloc_slab)
 
 	if (prot->rsk_prot != NULL) {
 		static const char mask[] = "request_sock_%s";
 
-		request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
-		if (request_sock_slab_name == NULL)
+		prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+		if (prot->rsk_prot->slab_name == NULL)
 			goto out_free_sock_slab;
 
-		sprintf(request_sock_slab_name, mask, prot->name);
-		prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
+		sprintf(prot->rsk_prot->slab_name, mask, prot->name);
+		prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
 							 prot->rsk_prot->obj_size, 0,
 							 SLAB_HWCACHE_ALIGN, NULL);
 
@@ -1803,16 +2132,18 @@ int proto_register(struct proto *prot, int alloc_slab)
 
 	if (prot->twsk_prot != NULL) {
 		static const char mask[] = "tw_sock_%s";
 
-		timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+		prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
 
-		if (timewait_sock_slab_name == NULL)
+		if (prot->twsk_prot->twsk_slab_name == NULL)
 			goto out_free_request_sock_slab;
 
-		sprintf(timewait_sock_slab_name, mask, prot->name);
+		sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
 		prot->twsk_prot->twsk_slab =
-			kmem_cache_create(timewait_sock_slab_name,
+			kmem_cache_create(prot->twsk_prot->twsk_slab_name,
 					  prot->twsk_prot->twsk_obj_size,
-					  0, SLAB_HWCACHE_ALIGN,
+					  0,
+					  SLAB_HWCACHE_ALIGN |
+					  prot->slab_flags,
 					  NULL);
 		if (prot->twsk_prot->twsk_slab == NULL)
 			goto out_free_timewait_sock_slab_name;
@@ -1821,23 +2152,24 @@ int proto_register(struct proto *prot, int alloc_slab)
 
 	write_lock(&proto_list_lock);
 	list_add(&prot->node, &proto_list);
+	assign_proto_idx(prot);
 	write_unlock(&proto_list_lock);
-	rc = 0;
-out:
-	return rc;
+	return 0;
+
 out_free_timewait_sock_slab_name:
-	kfree(timewait_sock_slab_name);
+	kfree(prot->twsk_prot->twsk_slab_name);
 out_free_request_sock_slab:
 	if (prot->rsk_prot && prot->rsk_prot->slab) {
 		kmem_cache_destroy(prot->rsk_prot->slab);
 		prot->rsk_prot->slab = NULL;
 	}
out_free_request_sock_slab_name:
-	kfree(request_sock_slab_name);
+	kfree(prot->rsk_prot->slab_name);
 out_free_sock_slab:
 	kmem_cache_destroy(prot->slab);
 	prot->slab = NULL;
-	goto out;
+out:
+	return -ENOBUFS;
 }
 
 EXPORT_SYMBOL(proto_register);
@@ -1845,6 +2177,7 @@ EXPORT_SYMBOL(proto_register);
 void proto_unregister(struct proto *prot)
 {
 	write_lock(&proto_list_lock);
+	release_proto_idx(prot);
 	list_del(&prot->node);
 	write_unlock(&proto_list_lock);
 
@@ -1854,18 +2187,14 @@ void proto_unregister(struct proto *prot)
 	}
 
 	if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
-		const char *name = kmem_cache_name(prot->rsk_prot->slab);
-
 		kmem_cache_destroy(prot->rsk_prot->slab);
-		kfree(name);
+		kfree(prot->rsk_prot->slab_name);
 		prot->rsk_prot->slab = NULL;
 	}
 
 	if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
-		const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
-
 		kmem_cache_destroy(prot->twsk_prot->twsk_slab);
-		kfree(name);
+		kfree(prot->twsk_prot->twsk_slab_name);
 		prot->twsk_prot->twsk_slab = NULL;
 	}
 }
@@ -1874,6 +2203,7 @@ EXPORT_SYMBOL(proto_unregister);
 
 #ifdef CONFIG_PROC_FS
 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
+	__acquires(proto_list_lock)
 {
 	read_lock(&proto_list_lock);
 	return seq_list_start_head(&proto_list, *pos);
@@ -1885,6 +2215,7 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 }
 
 static void proto_seq_stop(struct seq_file *seq, void *v)
+	__releases(proto_list_lock)
 {
 	read_unlock(&proto_list_lock);
 }
@@ -1900,7 +2231,7 @@ static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
 		   "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
 		   proto->name,
 		   proto->obj_size,
-		   proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
+		   sock_prot_inuse_get(seq_file_net(seq), proto),
 		   proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
 		   proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
 		   proto->max_header,
@@ -1954,7 +2285,8 @@ static const struct seq_operations proto_seq_ops = {
 
 static int proto_seq_open(struct inode *inode, struct file *file)
 {
-	return seq_open(file, &proto_seq_ops);
+	return seq_open_net(inode, file, &proto_seq_ops,
+			    sizeof(struct seq_net_private));
 }
 
 static const struct file_operations proto_seq_fops = {
@@ -1962,13 +2294,31 @@ static const struct file_operations proto_seq_fops = {
 	.open		= proto_seq_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= seq_release_net,
+};
+
+static __net_init int proto_init_net(struct net *net)
+{
+	if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
+		return -ENOMEM;
+
+	return 0;
+}
+
+static __net_exit void proto_exit_net(struct net *net)
+{
+	proc_net_remove(net, "protocols");
+}
+
+
+static __net_initdata struct pernet_operations proto_net_ops = {
+	.init = proto_init_net,
+	.exit = proto_exit_net,
 };
 
 static int __init proto_init(void)
 {
-	/* register /proc/net/protocols */
-	return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
+	return register_pernet_subsys(&proto_net_ops);
 }
 
 subsys_initcall(proto_init);
@@ -2004,7 +2354,3 @@ EXPORT_SYMBOL(sock_wmalloc);
 EXPORT_SYMBOL(sock_i_uid);
 EXPORT_SYMBOL(sock_i_ino);
 EXPORT_SYMBOL(sysctl_optmem_max);
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(sysctl_rmem_max);
-EXPORT_SYMBOL(sysctl_wmem_max);
-#endif
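---
For illustration only, not part of the patch: a minimal userspace sketch of
the new SO_TIMESTAMPING socket option that sock_setsockopt() above handles.
It assumes a libc and kernel headers recent enough to expose SO_TIMESTAMPING
and the SOF_TIMESTAMPING_* flags from <linux/net_tstamp.h>; error handling is
kept to a minimum.

	#include <stdio.h>
	#include <sys/socket.h>
	#include <linux/net_tstamp.h>

	int main(void)
	{
		/* Request software receive timestamps plus software
		 * timestamp reporting. */
		int flags = SOF_TIMESTAMPING_RX_SOFTWARE |
			    SOF_TIMESTAMPING_SOFTWARE;
		int fd = socket(AF_INET, SOCK_DGRAM, 0);

		if (fd < 0) {
			perror("socket");
			return 1;
		}

		/* sock_setsockopt() rejects any bits outside
		 * SOF_TIMESTAMPING_MASK with -EINVAL. */
		if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING,
			       &flags, sizeof(flags)) < 0)
			perror("setsockopt(SO_TIMESTAMPING)");

		/* Timestamps are then delivered as SCM_TIMESTAMPING
		 * control messages alongside recvmsg() payloads. */
		return 0;
	}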