* handler for protocols to use and generic option handler.
*
*
- * Version: $Id: sock.c,v 1.117 2002/02/01 22:01:03 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Florian La Roche, <flla@stud.uni-sb.de>
#include <net/net_namespace.h>
#include <net/request_sock.h>
#include <net/sock.h>
+#include <linux/net_tstamp.h>
#include <net/xfrm.h>
#include <linux/ipsec.h>
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
* Make lock validator output more readable. (we pre-construct these
* strings build-time, so that runtime initialization of socket
"sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI" , "sk_lock-AF_SECURITY" ,
"sk_lock-AF_KEY" , "sk_lock-AF_NETLINK" , "sk_lock-AF_PACKET" ,
"sk_lock-AF_ASH" , "sk_lock-AF_ECONET" , "sk_lock-AF_ATMSVC" ,
- "sk_lock-21" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
+ "sk_lock-AF_RDS" , "sk_lock-AF_SNA" , "sk_lock-AF_IRDA" ,
"sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE" , "sk_lock-AF_LLC" ,
- "sk_lock-27" , "sk_lock-28" , "sk_lock-29" ,
+ "sk_lock-27" , "sk_lock-28" , "sk_lock-AF_CAN" ,
"sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
- "sk_lock-AF_RXRPC" , "sk_lock-AF_MAX"
+ "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
+ "sk_lock-AF_IEEE802154",
+ "sk_lock-AF_MAX"
};
static const char *af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
"slock-AF_DECnet", "slock-AF_NETBEUI" , "slock-AF_SECURITY" ,
"slock-AF_KEY" , "slock-AF_NETLINK" , "slock-AF_PACKET" ,
"slock-AF_ASH" , "slock-AF_ECONET" , "slock-AF_ATMSVC" ,
- "slock-21" , "slock-AF_SNA" , "slock-AF_IRDA" ,
+ "slock-AF_RDS" , "slock-AF_SNA" , "slock-AF_IRDA" ,
"slock-AF_PPPOX" , "slock-AF_WANPIPE" , "slock-AF_LLC" ,
- "slock-27" , "slock-28" , "slock-29" ,
+ "slock-27" , "slock-28" , "slock-AF_CAN" ,
"slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
- "slock-AF_RXRPC" , "slock-AF_MAX"
+ "slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
+ "slock-AF_IEEE802154",
+ "slock-AF_MAX"
};
static const char *af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
"clock-AF_DECnet", "clock-AF_NETBEUI" , "clock-AF_SECURITY" ,
"clock-AF_KEY" , "clock-AF_NETLINK" , "clock-AF_PACKET" ,
"clock-AF_ASH" , "clock-AF_ECONET" , "clock-AF_ATMSVC" ,
- "clock-21" , "clock-AF_SNA" , "clock-AF_IRDA" ,
+ "clock-AF_RDS" , "clock-AF_SNA" , "clock-AF_IRDA" ,
"clock-AF_PPPOX" , "clock-AF_WANPIPE" , "clock-AF_LLC" ,
- "clock-27" , "clock-28" , "clock-29" ,
+ "clock-27" , "clock-28" , "clock-AF_CAN" ,
"clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
- "clock-AF_RXRPC" , "clock-AF_MAX"
+ "clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
+ "clock-AF_IEEE802154",
+ "clock-AF_MAX"
};
-#endif
/*
* sk_callback_lock locking rules are per-address-family,
/* Maximal space eaten by iovec or ancilliary data plus some space */
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
+EXPORT_SYMBOL(sysctl_optmem_max);
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
static int warned __read_mostly;
*timeo_p = 0;
- if (warned < 10 && net_ratelimit())
+ if (warned < 10 && net_ratelimit()) {
warned++;
printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
"tries to set negative timeout\n",
current->comm, task_pid_nr(current));
+ }
return 0;
}
*timeo_p = MAX_SCHEDULE_TIMEOUT;
}
}
-static void sock_disable_timestamp(struct sock *sk)
+static void sock_disable_timestamp(struct sock *sk, int flag)
{
- if (sock_flag(sk, SOCK_TIMESTAMP)) {
- sock_reset_flag(sk, SOCK_TIMESTAMP);
- net_disable_timestamp();
+ if (sock_flag(sk, flag)) {
+ sock_reset_flag(sk, flag);
+ if (!sock_flag(sk, SOCK_TIMESTAMP) &&
+ !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
+ net_disable_timestamp();
+ }
}
}
int err = 0;
int skb_len;
- /* Cast skb->rcvbuf to unsigned... It's pointless, but reduces
+ /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
number of warnings when compiling with -W --ANK
*/
if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
if (err)
goto out;
+ if (!sk_rmem_schedule(sk, skb->truesize)) {
+ err = -ENOBUFS;
+ goto out;
+ }
+
skb->dev = NULL;
skb_set_owner_r(skb, sk);
*/
mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
- rc = sk->sk_backlog_rcv(sk, skb);
+ rc = sk_backlog_rcv(sk, skb);
mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
} else
{
int ret = -ENOPROTOOPT;
#ifdef CONFIG_NETDEVICES
- struct net *net = sk->sk_net;
+ struct net *net = sock_net(sk);
char devname[IFNAMSIZ];
int index;
return ret;
}
+static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+{
+ if (valbool)
+ sock_set_flag(sk, bit);
+ else
+ sock_reset_flag(sk, bit);
+}
+
/*
* This is meant for all protocols to use and covers goings on
* at the socket level. Everything here is generic.
int sock_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, int optlen)
{
- struct sock *sk=sock->sk;
+ struct sock *sk = sock->sk;
int val;
int valbool;
struct linger ling;
* Options without arguments
*/
-#ifdef SO_DONTLINGER /* Compatibility item... */
- if (optname == SO_DONTLINGER) {
- lock_sock(sk);
- sock_reset_flag(sk, SOCK_LINGER);
- release_sock(sk);
- return 0;
- }
-#endif
-
if (optname == SO_BINDTODEVICE)
return sock_bindtodevice(sk, optval, optlen);
if (get_user(val, (int __user *)optval))
return -EFAULT;
- valbool = val?1:0;
+ valbool = val ? 1 : 0;
lock_sock(sk);
- switch(optname) {
+ switch (optname) {
case SO_DEBUG:
- if (val && !capable(CAP_NET_ADMIN)) {
+ if (val && !capable(CAP_NET_ADMIN))
ret = -EACCES;
- }
- else if (valbool)
- sock_set_flag(sk, SOCK_DBG);
else
- sock_reset_flag(sk, SOCK_DBG);
+ sock_valbool_flag(sk, SOCK_DBG, valbool);
break;
case SO_REUSEADDR:
sk->sk_reuse = valbool;
ret = -ENOPROTOOPT;
break;
case SO_DONTROUTE:
- if (valbool)
- sock_set_flag(sk, SOCK_LOCALROUTE);
- else
- sock_reset_flag(sk, SOCK_LOCALROUTE);
+ sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
break;
case SO_BROADCAST:
sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
ret = -EINVAL; /* 1003.1g */
break;
}
- if (copy_from_user(&ling,optval,sizeof(ling))) {
+ if (copy_from_user(&ling, optval, sizeof(ling))) {
ret = -EFAULT;
break;
}
else
sock_set_flag(sk, SOCK_RCVTSTAMPNS);
sock_set_flag(sk, SOCK_RCVTSTAMP);
- sock_enable_timestamp(sk);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
} else {
sock_reset_flag(sk, SOCK_RCVTSTAMP);
sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
}
break;
+ case SO_TIMESTAMPING:
+ if (val & ~SOF_TIMESTAMPING_MASK) {
+ ret = EINVAL;
+ break;
+ }
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
+ val & SOF_TIMESTAMPING_TX_HARDWARE);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
+ val & SOF_TIMESTAMPING_TX_SOFTWARE);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
+ val & SOF_TIMESTAMPING_RX_HARDWARE);
+ if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
+ sock_enable_timestamp(sk,
+ SOCK_TIMESTAMPING_RX_SOFTWARE);
+ else
+ sock_disable_timestamp(sk,
+ SOCK_TIMESTAMPING_RX_SOFTWARE);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
+ val & SOF_TIMESTAMPING_SOFTWARE);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
+ val & SOF_TIMESTAMPING_SYS_HARDWARE);
+ sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
+ val & SOF_TIMESTAMPING_RAW_HARDWARE);
+ break;
+
case SO_RCVLOWAT:
if (val < 0)
val = INT_MAX;
else
clear_bit(SOCK_PASSSEC, &sock->flags);
break;
+ case SO_MARK:
+ if (!capable(CAP_NET_ADMIN))
+ ret = -EPERM;
+ else
+ sk->sk_mark = val;
+ break;
/* We implement the SO_SNDLOWAT etc to
not be settable (1003.1g 5.3) */
release_sock(sk);
return ret;
}
+EXPORT_SYMBOL(sock_setsockopt);
int sock_getsockopt(struct socket *sock, int level, int optname,
if (len < 0)
return -EINVAL;
- switch(optname) {
+ memset(&v, 0, sizeof(v));
+
+ switch (optname) {
case SO_DEBUG:
v.val = sock_flag(sk, SOCK_DBG);
break;
case SO_ERROR:
v.val = -sock_error(sk);
- if (v.val==0)
+ if (v.val == 0)
v.val = xchg(&sk->sk_err_soft, 0);
break;
v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
break;
+ case SO_TIMESTAMPING:
+ v.val = 0;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
+ v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
+ v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
+ v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+ v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
+ v.val |= SOF_TIMESTAMPING_SOFTWARE;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
+ v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
+ if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
+ v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
+ break;
+
case SO_RCVTIMEO:
- lv=sizeof(struct timeval);
+ lv = sizeof(struct timeval);
if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
v.tm.tv_sec = 0;
v.tm.tv_usec = 0;
break;
case SO_SNDTIMEO:
- lv=sizeof(struct timeval);
+ lv = sizeof(struct timeval);
if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
v.tm.tv_sec = 0;
v.tm.tv_usec = 0;
break;
case SO_SNDLOWAT:
- v.val=1;
+ v.val = 1;
break;
case SO_PASSCRED:
case SO_PEERSEC:
return security_socket_getpeersec_stream(sock, optval, optlen, len);
+ case SO_MARK:
+ v.val = sk->sk_mark;
+ break;
+
default:
return -ENOPROTOOPT;
}
* @family: protocol family
* @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
* @prot: struct proto associated with this new sock instance
- * @zero_it: if we should zero the newly allocated sock
*/
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
struct proto *prot)
*/
sk->sk_prot = sk->sk_prot_creator = prot;
sock_lock_init(sk);
- sk->sk_net = get_net(net);
+ sock_net_set(sk, get_net(net));
}
return sk;
}
+EXPORT_SYMBOL(sk_alloc);
-void sk_free(struct sock *sk)
+static void __sk_free(struct sock *sk)
{
struct sk_filter *filter;
rcu_assign_pointer(sk->sk_filter, NULL);
}
- sock_disable_timestamp(sk);
+ sock_disable_timestamp(sk, SOCK_TIMESTAMP);
+ sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
if (atomic_read(&sk->sk_omem_alloc))
printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
- __FUNCTION__, atomic_read(&sk->sk_omem_alloc));
+ __func__, atomic_read(&sk->sk_omem_alloc));
- put_net(sk->sk_net);
+ put_net(sock_net(sk));
sk_prot_free(sk->sk_prot_creator, sk);
}
+void sk_free(struct sock *sk)
+{
+ /*
+ * We substract one from sk_wmem_alloc and can know if
+ * some packets are still in some tx queue.
+ * If not null, sock_wfree() will call __sk_free(sk) later
+ */
+ if (atomic_dec_and_test(&sk->sk_wmem_alloc))
+ __sk_free(sk);
+}
+EXPORT_SYMBOL(sk_free);
+
+/*
+ * Last sock_put should drop referrence to sk->sk_net. It has already
+ * been dropped in sk_change_net. Taking referrence to stopping namespace
+ * is not an option.
+ * Take referrence to a socket to remove it from hash _alive_ and after that
+ * destroy it in the context of init_net.
+ */
+void sk_release_kernel(struct sock *sk)
+{
+ if (sk == NULL || sk->sk_socket == NULL)
+ return;
+
+ sock_hold(sk);
+ sock_release(sk->sk_socket);
+ release_net(sock_net(sk));
+ sock_net_set(sk, get_net(&init_net));
+ sock_put(sk);
+}
+EXPORT_SYMBOL(sk_release_kernel);
+
struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
{
struct sock *newsk;
sock_copy(newsk, sk);
/* SANITY */
- get_net(newsk->sk_net);
+ get_net(sock_net(newsk));
sk_node_init(&newsk->sk_node);
sock_lock_init(newsk);
bh_lock_sock(newsk);
newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
atomic_set(&newsk->sk_rmem_alloc, 0);
- atomic_set(&newsk->sk_wmem_alloc, 0);
+ /*
+ * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
+ */
+ atomic_set(&newsk->sk_wmem_alloc, 1);
atomic_set(&newsk->sk_omem_alloc, 0);
skb_queue_head_init(&newsk->sk_receive_queue);
skb_queue_head_init(&newsk->sk_write_queue);
* to be taken into account in all callers. -acme
*/
sk_refcnt_debug_inc(newsk);
- newsk->sk_socket = NULL;
+ sk_set_socket(newsk, NULL);
newsk->sk_sleep = NULL;
if (newsk->sk_prot->sockets_allocated)
- atomic_inc(newsk->sk_prot->sockets_allocated);
+ percpu_counter_inc(newsk->sk_prot->sockets_allocated);
}
out:
return newsk;
}
-
EXPORT_SYMBOL_GPL(sk_clone);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
if (sk_can_gso(sk)) {
- if (dst->header_len)
+ if (dst->header_len) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
- else
+ } else {
sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
+ sk->sk_gso_max_size = dst->dev->gso_max_size;
+ }
}
}
EXPORT_SYMBOL_GPL(sk_setup_caps);
void sock_wfree(struct sk_buff *skb)
{
struct sock *sk = skb->sk;
+ int res;
/* In case it might be waiting for more memory. */
- atomic_sub(skb->truesize, &sk->sk_wmem_alloc);
+ res = atomic_sub_return(skb->truesize, &sk->sk_wmem_alloc);
if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE))
sk->sk_write_space(sk);
- sock_put(sk);
+ /*
+ * if sk_wmem_alloc reached 0, we are last user and should
+ * free this sock, as sk_free() call could not do it.
+ */
+ if (res == 0)
+ __sk_free(sk);
}
+EXPORT_SYMBOL(sock_wfree);
/*
* Read buffer destructor automatically called from kfree_skb.
struct sock *sk = skb->sk;
atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
+ sk_mem_uncharge(skb->sk, skb->truesize);
}
+EXPORT_SYMBOL(sock_rfree);
int sock_i_uid(struct sock *sk)
read_unlock(&sk->sk_callback_lock);
return uid;
}
+EXPORT_SYMBOL(sock_i_uid);
unsigned long sock_i_ino(struct sock *sk)
{
read_unlock(&sk->sk_callback_lock);
return ino;
}
+EXPORT_SYMBOL(sock_i_ino);
/*
* Allocate a skb from the socket's send buffer.
gfp_t priority)
{
if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
- struct sk_buff * skb = alloc_skb(size, priority);
+ struct sk_buff *skb = alloc_skb(size, priority);
if (skb) {
skb_set_owner_w(skb, sk);
return skb;
}
return NULL;
}
+EXPORT_SYMBOL(sock_wmalloc);
/*
* Allocate a skb from the socket's receive buffer.
}
return NULL;
}
+EXPORT_SYMBOL(sock_kmalloc);
/*
* Free an option memory block.
kfree(mem);
atomic_sub(size, &sk->sk_omem_alloc);
}
+EXPORT_SYMBOL(sock_kfree_s);
/* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
I think, these locks should be removed for datagram sockets.
*/
-static long sock_wait_for_wmem(struct sock * sk, long timeo)
+static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
DEFINE_WAIT(wait);
* Generic send/receive buffer handlers
*/
-static struct sk_buff *sock_alloc_send_pskb(struct sock *sk,
- unsigned long header_len,
- unsigned long data_len,
- int noblock, int *errcode)
+struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
+ unsigned long data_len, int noblock,
+ int *errcode)
{
struct sk_buff *skb;
gfp_t gfp_mask;
*errcode = err;
return NULL;
}
+EXPORT_SYMBOL(sock_alloc_send_pskb);
struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
int noblock, int *errcode)
{
return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
}
+EXPORT_SYMBOL(sock_alloc_send_skb);
static void __lock_sock(struct sock *sk)
{
struct sk_buff *next = skb->next;
skb->next = NULL;
- sk->sk_backlog_rcv(sk, skb);
+ sk_backlog_rcv(sk, skb);
/*
* We are in process context here with softirqs
finish_wait(sk->sk_sleep, &wait);
return rc;
}
-
EXPORT_SYMBOL(sk_wait_data);
+/**
+ * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
+ * @sk: socket
+ * @size: memory size to allocate
+ * @kind: allocation type
+ *
+ * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
+ * rmem allocation. This function assumes that protocols which have
+ * memory_pressure use sk_wmem_queued as write buffer accounting.
+ */
+int __sk_mem_schedule(struct sock *sk, int size, int kind)
+{
+ struct proto *prot = sk->sk_prot;
+ int amt = sk_mem_pages(size);
+ int allocated;
+
+ sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
+ allocated = atomic_add_return(amt, prot->memory_allocated);
+
+ /* Under limit. */
+ if (allocated <= prot->sysctl_mem[0]) {
+ if (prot->memory_pressure && *prot->memory_pressure)
+ *prot->memory_pressure = 0;
+ return 1;
+ }
+
+ /* Under pressure. */
+ if (allocated > prot->sysctl_mem[1])
+ if (prot->enter_memory_pressure)
+ prot->enter_memory_pressure(sk);
+
+ /* Over hard limit. */
+ if (allocated > prot->sysctl_mem[2])
+ goto suppress_allocation;
+
+ /* guarantee minimum buffer size under pressure */
+ if (kind == SK_MEM_RECV) {
+ if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
+ return 1;
+ } else { /* SK_MEM_SEND */
+ if (sk->sk_type == SOCK_STREAM) {
+ if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
+ return 1;
+ } else if (atomic_read(&sk->sk_wmem_alloc) <
+ prot->sysctl_wmem[0])
+ return 1;
+ }
+
+ if (prot->memory_pressure) {
+ int alloc;
+
+ if (!*prot->memory_pressure)
+ return 1;
+ alloc = percpu_counter_read_positive(prot->sockets_allocated);
+ if (prot->sysctl_mem[2] > alloc *
+ sk_mem_pages(sk->sk_wmem_queued +
+ atomic_read(&sk->sk_rmem_alloc) +
+ sk->sk_forward_alloc))
+ return 1;
+ }
+
+suppress_allocation:
+
+ if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
+ sk_stream_moderate_sndbuf(sk);
+
+ /* Fail only if socket is _under_ its sndbuf.
+ * In this case we cannot block, so that we have to fail.
+ */
+ if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
+ return 1;
+ }
+
+ /* Alas. Undo changes. */
+ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
+ atomic_sub(amt, prot->memory_allocated);
+ return 0;
+}
+EXPORT_SYMBOL(__sk_mem_schedule);
+
+/**
+ * __sk_reclaim - reclaim memory_allocated
+ * @sk: socket
+ */
+void __sk_mem_reclaim(struct sock *sk)
+{
+ struct proto *prot = sk->sk_prot;
+
+ atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
+ prot->memory_allocated);
+ sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
+
+ if (prot->memory_pressure && *prot->memory_pressure &&
+ (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
+ *prot->memory_pressure = 0;
+}
+EXPORT_SYMBOL(__sk_mem_reclaim);
+
+
/*
* Set of default routines for initialising struct proto_ops when
* the protocol does not support a particular function. In certain
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_bind);
int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
int len, int flags)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_connect);
int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_socketpair);
int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_accept);
int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
int *len, int peer)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_getname);
-unsigned int sock_no_poll(struct file * file, struct socket *sock, poll_table *pt)
+unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
{
return 0;
}
+EXPORT_SYMBOL(sock_no_poll);
int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_ioctl);
int sock_no_listen(struct socket *sock, int backlog)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_listen);
int sock_no_shutdown(struct socket *sock, int how)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_shutdown);
int sock_no_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, int optlen)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_setsockopt);
int sock_no_getsockopt(struct socket *sock, int level, int optname,
char __user *optval, int __user *optlen)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_getsockopt);
int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
size_t len)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_sendmsg);
int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
size_t len, int flags)
{
return -EOPNOTSUPP;
}
+EXPORT_SYMBOL(sock_no_recvmsg);
int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
/* Mirror missing mmap method error code */
return -ENODEV;
}
+EXPORT_SYMBOL(sock_no_mmap);
ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
{
kunmap(page);
return res;
}
+EXPORT_SYMBOL(sock_no_sendpage);
/*
* Default Socket Callbacks
{
read_lock(&sk->sk_callback_lock);
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
- sk_wake_async(sk,0,POLL_ERR);
+ wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
+ sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
read_unlock(&sk->sk_callback_lock);
}
{
read_lock(&sk->sk_callback_lock);
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
- sk_wake_async(sk,1,POLL_IN);
+ wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
+ POLLRDNORM | POLLRDBAND);
+ sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
read_unlock(&sk->sk_callback_lock);
}
*/
if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
- wake_up_interruptible(sk->sk_sleep);
+ wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
+ POLLWRNORM | POLLWRBAND);
/* Should agree with poll, otherwise some programs break */
if (sock_writeable(sk))
- sk_wake_async(sk, 2, POLL_OUT);
+ sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
read_unlock(&sk->sk_callback_lock);
{
if (sk->sk_socket && sk->sk_socket->file)
if (send_sigurg(&sk->sk_socket->file->f_owner))
- sk_wake_async(sk, 3, POLL_PRI);
+ sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
}
+EXPORT_SYMBOL(sk_send_sigurg);
void sk_reset_timer(struct sock *sk, struct timer_list* timer,
unsigned long expires)
if (!mod_timer(timer, expires))
sock_hold(sk);
}
-
EXPORT_SYMBOL(sk_reset_timer);
void sk_stop_timer(struct sock *sk, struct timer_list* timer)
if (timer_pending(timer) && del_timer(timer))
__sock_put(sk);
}
-
EXPORT_SYMBOL(sk_stop_timer);
void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_rcvbuf = sysctl_rmem_default;
sk->sk_sndbuf = sysctl_wmem_default;
sk->sk_state = TCP_CLOSE;
- sk->sk_socket = sock;
+ sk_set_socket(sk, sock);
sock_set_flag(sk, SOCK_ZAPPED);
sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
sk->sk_sndtimeo = MAX_SCHEDULE_TIMEOUT;
- sk->sk_stamp = ktime_set(-1L, -1L);
+ sk->sk_stamp = ktime_set(-1L, 0);
atomic_set(&sk->sk_refcnt, 1);
+ atomic_set(&sk->sk_wmem_alloc, 1);
+ atomic_set(&sk->sk_drops, 0);
}
+EXPORT_SYMBOL(sock_init_data);
-void fastcall lock_sock_nested(struct sock *sk, int subclass)
+void lock_sock_nested(struct sock *sk, int subclass)
{
might_sleep();
spin_lock_bh(&sk->sk_lock.slock);
mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
local_bh_enable();
}
-
EXPORT_SYMBOL(lock_sock_nested);
-void fastcall release_sock(struct sock *sk)
+void release_sock(struct sock *sk)
{
/*
* The sk_lock has mutex_unlock() semantics:
{
struct timeval tv;
if (!sock_flag(sk, SOCK_TIMESTAMP))
- sock_enable_timestamp(sk);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
tv = ktime_to_timeval(sk->sk_stamp);
if (tv.tv_sec == -1)
return -ENOENT;
{
struct timespec ts;
if (!sock_flag(sk, SOCK_TIMESTAMP))
- sock_enable_timestamp(sk);
+ sock_enable_timestamp(sk, SOCK_TIMESTAMP);
ts = ktime_to_timespec(sk->sk_stamp);
if (ts.tv_sec == -1)
return -ENOENT;
}
EXPORT_SYMBOL(sock_get_timestampns);
-void sock_enable_timestamp(struct sock *sk)
+void sock_enable_timestamp(struct sock *sk, int flag)
{
- if (!sock_flag(sk, SOCK_TIMESTAMP)) {
- sock_set_flag(sk, SOCK_TIMESTAMP);
- net_enable_timestamp();
+ if (!sock_flag(sk, flag)) {
+ sock_set_flag(sk, flag);
+ /*
+ * we just set one of the two flags which require net
+ * time stamping, but time stamping might have been on
+ * already because of the other one
+ */
+ if (!sock_flag(sk,
+ flag == SOCK_TIMESTAMP ?
+ SOCK_TIMESTAMPING_RX_SOFTWARE :
+ SOCK_TIMESTAMP))
+ net_enable_timestamp();
}
}
return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
}
-
EXPORT_SYMBOL(sock_common_getsockopt);
#ifdef CONFIG_COMPAT
msg->msg_namelen = addr_len;
return err;
}
-
EXPORT_SYMBOL(sock_common_recvmsg);
/*
return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
}
-
EXPORT_SYMBOL(sock_common_setsockopt);
#ifdef CONFIG_COMPAT
sk_refcnt_debug_release(sk);
sock_put(sk);
}
-
EXPORT_SYMBOL(sk_common_release);
static DEFINE_RWLOCK(proto_list_lock);
static LIST_HEAD(proto_list);
-#ifdef CONFIG_SMP
-/*
- * Define default functions to keep track of inuse sockets per protocol
- * Note that often used protocols use dedicated functions to get a speed increase.
- * (see DEFINE_PROTO_INUSE/REF_PROTO_INUSE)
- */
-static void inuse_add(struct proto *prot, int inc)
+#ifdef CONFIG_PROC_FS
+#define PROTO_INUSE_NR 64 /* should be enough for the first time */
+struct prot_inuse {
+ int val[PROTO_INUSE_NR];
+};
+
+static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
+
+#ifdef CONFIG_NET_NS
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
{
- per_cpu_ptr(prot->inuse_ptr, smp_processor_id())[0] += inc;
+ int cpu = smp_processor_id();
+ per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
-static int inuse_get(const struct proto *prot)
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
- int res = 0, cpu;
+ int cpu, idx = prot->inuse_idx;
+ int res = 0;
+
for_each_possible_cpu(cpu)
- res += per_cpu_ptr(prot->inuse_ptr, cpu)[0];
- return res;
+ res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
+
+ return res >= 0 ? res : 0;
}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
-static int inuse_init(struct proto *prot)
+static int sock_inuse_init_net(struct net *net)
{
- if (!prot->inuse_getval || !prot->inuse_add) {
- prot->inuse_ptr = alloc_percpu(int);
- if (prot->inuse_ptr == NULL)
- return -ENOBUFS;
+ net->core.inuse = alloc_percpu(struct prot_inuse);
+ return net->core.inuse ? 0 : -ENOMEM;
+}
+
+static void sock_inuse_exit_net(struct net *net)
+{
+ free_percpu(net->core.inuse);
+}
+
+static struct pernet_operations net_inuse_ops = {
+ .init = sock_inuse_init_net,
+ .exit = sock_inuse_exit_net,
+};
+
+static __init int net_inuse_init(void)
+{
+ if (register_pernet_subsys(&net_inuse_ops))
+ panic("Cannot initialize net inuse counters");
- prot->inuse_getval = inuse_get;
- prot->inuse_add = inuse_add;
- }
return 0;
}
-static void inuse_fini(struct proto *prot)
+core_initcall(net_inuse_init);
+#else
+static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
+
+void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
+{
+ __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
+
+int sock_prot_inuse_get(struct net *net, struct proto *prot)
{
- if (prot->inuse_ptr != NULL) {
- free_percpu(prot->inuse_ptr);
- prot->inuse_ptr = NULL;
- prot->inuse_getval = NULL;
- prot->inuse_add = NULL;
+ int cpu, idx = prot->inuse_idx;
+ int res = 0;
+
+ for_each_possible_cpu(cpu)
+ res += per_cpu(prot_inuse, cpu).val[idx];
+
+ return res >= 0 ? res : 0;
+}
+EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
+#endif
+
+static void assign_proto_idx(struct proto *prot)
+{
+ prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
+
+ if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
+ printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
+ return;
}
+
+ set_bit(prot->inuse_idx, proto_inuse_idx);
+}
+
+static void release_proto_idx(struct proto *prot)
+{
+ if (prot->inuse_idx != PROTO_INUSE_NR - 1)
+ clear_bit(prot->inuse_idx, proto_inuse_idx);
}
#else
-static inline int inuse_init(struct proto *prot)
+static inline void assign_proto_idx(struct proto *prot)
{
- return 0;
}
-static inline void inuse_fini(struct proto *prot)
+static inline void release_proto_idx(struct proto *prot)
{
}
#endif
int proto_register(struct proto *prot, int alloc_slab)
{
- char *request_sock_slab_name = NULL;
- char *timewait_sock_slab_name;
-
- if (inuse_init(prot))
- goto out;
-
if (alloc_slab) {
prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
- SLAB_HWCACHE_ALIGN, NULL);
+ SLAB_HWCACHE_ALIGN | prot->slab_flags,
+ NULL);
if (prot->slab == NULL) {
printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
prot->name);
- goto out_free_inuse;
+ goto out;
}
if (prot->rsk_prot != NULL) {
static const char mask[] = "request_sock_%s";
- request_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
- if (request_sock_slab_name == NULL)
+ prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+ if (prot->rsk_prot->slab_name == NULL)
goto out_free_sock_slab;
- sprintf(request_sock_slab_name, mask, prot->name);
- prot->rsk_prot->slab = kmem_cache_create(request_sock_slab_name,
+ sprintf(prot->rsk_prot->slab_name, mask, prot->name);
+ prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
prot->rsk_prot->obj_size, 0,
SLAB_HWCACHE_ALIGN, NULL);
if (prot->twsk_prot != NULL) {
static const char mask[] = "tw_sock_%s";
- timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
+ prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
- if (timewait_sock_slab_name == NULL)
+ if (prot->twsk_prot->twsk_slab_name == NULL)
goto out_free_request_sock_slab;
- sprintf(timewait_sock_slab_name, mask, prot->name);
+ sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
prot->twsk_prot->twsk_slab =
- kmem_cache_create(timewait_sock_slab_name,
+ kmem_cache_create(prot->twsk_prot->twsk_slab_name,
prot->twsk_prot->twsk_obj_size,
- 0, SLAB_HWCACHE_ALIGN,
+ 0,
+ SLAB_HWCACHE_ALIGN |
+ prot->slab_flags,
NULL);
if (prot->twsk_prot->twsk_slab == NULL)
goto out_free_timewait_sock_slab_name;
write_lock(&proto_list_lock);
list_add(&prot->node, &proto_list);
+ assign_proto_idx(prot);
write_unlock(&proto_list_lock);
return 0;
out_free_timewait_sock_slab_name:
- kfree(timewait_sock_slab_name);
+ kfree(prot->twsk_prot->twsk_slab_name);
out_free_request_sock_slab:
if (prot->rsk_prot && prot->rsk_prot->slab) {
kmem_cache_destroy(prot->rsk_prot->slab);
prot->rsk_prot->slab = NULL;
}
out_free_request_sock_slab_name:
- kfree(request_sock_slab_name);
+ kfree(prot->rsk_prot->slab_name);
out_free_sock_slab:
kmem_cache_destroy(prot->slab);
prot->slab = NULL;
-out_free_inuse:
- inuse_fini(prot);
out:
return -ENOBUFS;
}
-
EXPORT_SYMBOL(proto_register);
void proto_unregister(struct proto *prot)
{
write_lock(&proto_list_lock);
+ release_proto_idx(prot);
list_del(&prot->node);
write_unlock(&proto_list_lock);
- inuse_fini(prot);
if (prot->slab != NULL) {
kmem_cache_destroy(prot->slab);
prot->slab = NULL;
}
if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
- const char *name = kmem_cache_name(prot->rsk_prot->slab);
-
kmem_cache_destroy(prot->rsk_prot->slab);
- kfree(name);
+ kfree(prot->rsk_prot->slab_name);
prot->rsk_prot->slab = NULL;
}
if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
- const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab);
-
kmem_cache_destroy(prot->twsk_prot->twsk_slab);
- kfree(name);
+ kfree(prot->twsk_prot->twsk_slab_name);
prot->twsk_prot->twsk_slab = NULL;
}
}
-
EXPORT_SYMBOL(proto_unregister);
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(proto_list_lock)
{
read_lock(&proto_list_lock);
return seq_list_start_head(&proto_list, *pos);
}
static void proto_seq_stop(struct seq_file *seq, void *v)
+ __releases(proto_list_lock)
{
read_unlock(&proto_list_lock);
}
"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
proto->name,
proto->obj_size,
- proto->sockets_allocated != NULL ? atomic_read(proto->sockets_allocated) : -1,
+ sock_prot_inuse_get(seq_file_net(seq), proto),
proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
proto->max_header,
static int proto_seq_open(struct inode *inode, struct file *file)
{
- return seq_open(file, &proto_seq_ops);
+ return seq_open_net(inode, file, &proto_seq_ops,
+ sizeof(struct seq_net_private));
}
static const struct file_operations proto_seq_fops = {
.open = proto_seq_open,
.read = seq_read,
.llseek = seq_lseek,
- .release = seq_release,
+ .release = seq_release_net,
+};
+
+static __net_init int proto_init_net(struct net *net)
+{
+ if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
+ return -ENOMEM;
+
+ return 0;
+}
+
+static __net_exit void proto_exit_net(struct net *net)
+{
+ proc_net_remove(net, "protocols");
+}
+
+
+static __net_initdata struct pernet_operations proto_net_ops = {
+ .init = proto_init_net,
+ .exit = proto_exit_net,
};
static int __init proto_init(void)
{
- /* register /proc/net/protocols */
- return proc_net_fops_create(&init_net, "protocols", S_IRUGO, &proto_seq_fops) == NULL ? -ENOBUFS : 0;
+ return register_pernet_subsys(&proto_net_ops);
}
subsys_initcall(proto_init);
#endif /* PROC_FS */
-
-EXPORT_SYMBOL(sk_alloc);
-EXPORT_SYMBOL(sk_free);
-EXPORT_SYMBOL(sk_send_sigurg);
-EXPORT_SYMBOL(sock_alloc_send_skb);
-EXPORT_SYMBOL(sock_init_data);
-EXPORT_SYMBOL(sock_kfree_s);
-EXPORT_SYMBOL(sock_kmalloc);
-EXPORT_SYMBOL(sock_no_accept);
-EXPORT_SYMBOL(sock_no_bind);
-EXPORT_SYMBOL(sock_no_connect);
-EXPORT_SYMBOL(sock_no_getname);
-EXPORT_SYMBOL(sock_no_getsockopt);
-EXPORT_SYMBOL(sock_no_ioctl);
-EXPORT_SYMBOL(sock_no_listen);
-EXPORT_SYMBOL(sock_no_mmap);
-EXPORT_SYMBOL(sock_no_poll);
-EXPORT_SYMBOL(sock_no_recvmsg);
-EXPORT_SYMBOL(sock_no_sendmsg);
-EXPORT_SYMBOL(sock_no_sendpage);
-EXPORT_SYMBOL(sock_no_setsockopt);
-EXPORT_SYMBOL(sock_no_shutdown);
-EXPORT_SYMBOL(sock_no_socketpair);
-EXPORT_SYMBOL(sock_rfree);
-EXPORT_SYMBOL(sock_setsockopt);
-EXPORT_SYMBOL(sock_wfree);
-EXPORT_SYMBOL(sock_wmalloc);
-EXPORT_SYMBOL(sock_i_uid);
-EXPORT_SYMBOL(sock_i_ino);
-EXPORT_SYMBOL(sysctl_optmem_max);
-#ifdef CONFIG_SYSCTL
-EXPORT_SYMBOL(sysctl_rmem_max);
-EXPORT_SYMBOL(sysctl_wmem_max);
-#endif