/*
* NETLINK Kernel-user communication protocol.
*
- * Authors: Alan Cox <alan@redhat.com>
+ * Authors: Alan Cox <alan@lxorguk.ukuu.org.uk>
* Alexey Kuznetsov <kuznet@ms2.inr.ac.ru>
*
* This program is free software; you can redistribute it and/or
struct module *module;
};
+/*
+ * Bookkeeping record used to free a listeners buffer after an RCU grace
+ * period: rcu_head is what gets queued with call_rcu(), and ptr is the
+ * buffer that the callback hands to kfree().
+ */
+struct listeners_rcu_head {
+ struct rcu_head rcu_head;
+ void *ptr;
+};
+
#define NETLINK_KERNEL_SOCKET 0x1
#define NETLINK_RECV_PKTINFO 0x2
+#define NETLINK_BROADCAST_SEND_ERROR 0x4
+#define NETLINK_RECV_NO_ENOBUFS 0x8
static inline struct netlink_sock *nlk_sk(struct sock *sk)
{
printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
return;
}
- BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
- BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
- BUG_TRAP(!nlk_sk(sk)->groups);
+
+ WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+ WARN_ON(nlk_sk(sk)->groups);
}
/* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on
* this, _but_ remember, it adds useless work on UP machines.
*/
-static void netlink_table_grab(void)
+void netlink_table_grab(void)
__acquires(nl_table_lock)
{
+ might_sleep();
+
write_lock_irq(&nl_table_lock);
if (atomic_read(&nl_table_users)) {
}
}
-static void netlink_table_ungrab(void)
+void netlink_table_ungrab(void)
__releases(nl_table_lock)
{
write_unlock_irq(&nl_table_lock);
return 0;
}
-static int netlink_create(struct net *net, struct socket *sock, int protocol)
+static int netlink_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct module *module = NULL;
struct mutex *cb_mutex;
return -EPROTONOSUPPORT;
netlink_lock_table();
-#ifdef CONFIG_KMOD
+#ifdef CONFIG_MODULES
if (!nl_table[protocol].registered) {
netlink_unlock_table();
request_module("net-pf-%d-proto-%d", PF_NETLINK, protocol);
if (err < 0)
goto out_module;
+ local_bh_disable();
+ sock_prot_inuse_add(net, &netlink_proto, 1);
+ local_bh_enable();
+
nlk = nlk_sk(sock->sk);
nlk->module = module;
out:
skb_queue_purge(&sk->sk_write_queue);
- if (nlk->pid && !nlk->subscriptions) {
+ if (nlk->pid) {
struct netlink_notify n = {
.net = sock_net(sk),
.protocol = sk->sk_protocol,
kfree(nlk->groups);
nlk->groups = NULL;
+ local_bh_disable();
+ sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1);
+ local_bh_enable();
sock_put(sk);
return 0;
}
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
- struct sockaddr_nl *nladdr = (struct sockaddr_nl *)addr;
+ DECLARE_SOCKADDR(struct sockaddr_nl *, nladdr, addr);
nladdr->nl_family = AF_NETLINK;
nladdr->nl_pad = 0;
+/*
+ * Account for a message that could not be handed to @sk (the broadcast
+ * path calls this when cloning or delivering the skb fails).
+ *
+ * Unless the socket has NETLINK_RECV_NO_ENOBUFS set, ENOBUFS is stored in
+ * sk_err and sk_error_report() is invoked; bit 0 of the netlink socket's
+ * state word latches the condition so the report is raised only once until
+ * the bit is cleared again. sk_drops is incremented in every case.
+ */
static void netlink_overrun(struct sock *sk)
{
- if (!test_and_set_bit(0, &nlk_sk(sk)->state)) {
- sk->sk_err = ENOBUFS;
- sk->sk_error_report(sk);
+ struct netlink_sock *nlk = nlk_sk(sk);
+
+ if (!(nlk->flags & NETLINK_RECV_NO_ENOBUFS)) {
+ if (!test_and_set_bit(0, &nlk_sk(sk)->state)) {
+ sk->sk_err = ENOBUFS;
+ sk->sk_error_report(sk);
+ }
}
+ atomic_inc(&sk->sk_drops);
}
static struct sock *netlink_getsockbypid(struct sock *ssk, u32 pid)
* 0: continue
* 1: repeat lookup - reference dropped while waiting for socket memory.
*/
-int netlink_attachskb(struct sock *sk, struct sk_buff *skb, int nonblock,
+int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
long *timeo, struct sock *ssk)
{
struct netlink_sock *nlk;
return netlink_unicast_kernel(sk, skb);
if (sk_filter(sk, skb)) {
- int err = skb->len;
+ err = skb->len;
kfree_skb(skb);
sock_put(sk);
return err;
}
- err = netlink_attachskb(sk, skb, nonblock, &timeo, ssk);
+ err = netlink_attachskb(sk, skb, &timeo, ssk);
if (err == 1)
goto retry;
if (err)
u32 pid;
u32 group;
int failure;
+ int delivery_failure;
int congested;
int delivered;
gfp_t allocation;
netlink_overrun(sk);
/* Clone failed. Notify ALL listeners. */
p->failure = 1;
+ if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
+ p->delivery_failure = 1;
} else if (sk_filter(sk, p->skb2)) {
kfree_skb(p->skb2);
p->skb2 = NULL;
} else if ((val = netlink_broadcast_deliver(sk, p->skb2)) < 0) {
netlink_overrun(sk);
+ if (nlk->flags & NETLINK_BROADCAST_SEND_ERROR)
+ p->delivery_failure = 1;
} else {
p->congested |= val;
p->delivered = 1;
info.pid = pid;
info.group = group;
info.failure = 0;
+ info.delivery_failure = 0;
info.congested = 0;
info.delivered = 0;
info.allocation = allocation;
netlink_unlock_table();
- if (info.skb2)
- kfree_skb(info.skb2);
+ kfree_skb(info.skb2);
+
+ if (info.delivery_failure)
+ return -ENOBUFS;
if (info.delivered) {
if (info.congested && (allocation & __GFP_WAIT))
yield();
return 0;
}
- if (info.failure)
- return -ENOBUFS;
return -ESRCH;
}
EXPORT_SYMBOL(netlink_broadcast);
if (sk == p->exclude_sk)
goto out;
- if (sock_net(sk) != sock_net(p->exclude_sk))
+ if (!net_eq(sock_net(sk), sock_net(p->exclude_sk)))
goto out;
if (nlk->pid == p->pid || p->group - 1 >= nlk->ngroups ||
return 0;
}
+/**
+ * netlink_set_err - report error to broadcast listeners
+ * @ssk: the kernel netlink socket, as returned by netlink_kernel_create()
+ * @pid: the PID of a process that we want to skip (if any)
+ * @group: the broadcast group that will notice the error
+ * @code: error code, must be negative (as usual in kernelspace)
+ */
void netlink_set_err(struct sock *ssk, u32 pid, u32 group, int code)
{
struct netlink_set_err_data info;
info.exclude_sk = ssk;
info.pid = pid;
info.group = group;
- info.code = code;
+ /* sk->sk_err wants a positive error value */
+ info.code = -code;
read_lock(&nl_table_lock);
read_unlock(&nl_table_lock);
}
+EXPORT_SYMBOL(netlink_set_err);
/* must be called with netlink table grabbed */
static void netlink_update_socket_mc(struct netlink_sock *nlk,
}
static int netlink_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+ char __user *optval, unsigned int optlen)
{
struct sock *sk = sock->sk;
struct netlink_sock *nlk = nlk_sk(sk);
err = 0;
break;
}
+ case NETLINK_BROADCAST_ERROR:
+ if (val)
+ nlk->flags |= NETLINK_BROADCAST_SEND_ERROR;
+ else
+ nlk->flags &= ~NETLINK_BROADCAST_SEND_ERROR;
+ err = 0;
+ break;
+ case NETLINK_NO_ENOBUFS:
+ if (val) {
+ nlk->flags |= NETLINK_RECV_NO_ENOBUFS;
+ clear_bit(0, &nlk->state);
+ wake_up_interruptible(&nlk->wait);
+ } else
+ nlk->flags &= ~NETLINK_RECV_NO_ENOBUFS;
+ err = 0;
+ break;
default:
err = -ENOPROTOOPT;
}
return -EFAULT;
err = 0;
break;
+ case NETLINK_BROADCAST_ERROR:
+ if (len < sizeof(int))
+ return -EINVAL;
+ len = sizeof(int);
+ val = nlk->flags & NETLINK_BROADCAST_SEND_ERROR ? 1 : 0;
+ if (put_user(len, optlen) ||
+ put_user(val, optval))
+ return -EFAULT;
+ err = 0;
+ break;
+ case NETLINK_NO_ENOBUFS:
+ if (len < sizeof(int))
+ return -EINVAL;
+ len = sizeof(int);
+ val = nlk->flags & NETLINK_RECV_NO_ENOBUFS ? 1 : 0;
+ if (put_user(len, optlen) ||
+ put_user(val, optval))
+ return -EFAULT;
+ err = 0;
+ break;
default:
err = -ENOPROTOOPT;
}
NETLINK_CB(skb).pid = nlk->pid;
NETLINK_CB(skb).dst_group = dst_group;
NETLINK_CB(skb).loginuid = audit_get_loginuid(current);
+ NETLINK_CB(skb).sessionid = audit_get_sessionid(current);
security_task_getsecid(current, &(NETLINK_CB(skb).sid));
memcpy(NETLINK_CREDS(skb), &siocb->scm->creds, sizeof(struct ucred));
struct netlink_sock *nlk = nlk_sk(sk);
int noblock = flags&MSG_DONTWAIT;
size_t copied;
- struct sk_buff *skb;
+ struct sk_buff *skb, *frag __maybe_unused = NULL;
int err;
if (flags&MSG_OOB)
if (skb == NULL)
goto out;
+#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
+ if (unlikely(skb_shinfo(skb)->frag_list)) {
+ bool need_compat = !!(flags & MSG_CMSG_COMPAT);
+
+ /*
+ * If this skb has a frag_list, then here that means that
+ * we will have to use the frag_list skb for compat tasks
+ * and the regular skb for non-compat tasks.
+ *
+ * The skb might (and likely will) be cloned, so we can't
+ * just reset frag_list and go on with things -- we need to
+ * keep that. For the compat case that's easy -- simply get
+ * a reference to the compat skb and free the regular one
+ * including the frag. For the non-compat case, we need to
+ * avoid sending the frag to the user -- so assign NULL but
+ * restore it below before freeing the skb.
+ */
+ if (need_compat) {
+ struct sk_buff *compskb = skb_shinfo(skb)->frag_list;
+ skb_get(compskb);
+ kfree_skb(skb);
+ skb = compskb;
+ } else {
+ frag = skb_shinfo(skb)->frag_list;
+ skb_shinfo(skb)->frag_list = NULL;
+ }
+ }
+#endif
+
msg->msg_namelen = 0;
copied = skb->len;
siocb->scm->creds = *NETLINK_CREDS(skb);
if (flags & MSG_TRUNC)
copied = skb->len;
+
+#ifdef CONFIG_COMPAT_NETLINK_MESSAGES
+ skb_shinfo(skb)->frag_list = frag;
+#endif
+
skb_free_datagram(sk, skb);
if (nlk->cb && atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2)
if (groups < 32)
groups = 32;
- listeners = kzalloc(NLGRPSZ(groups), GFP_KERNEL);
+ listeners = kzalloc(NLGRPSZ(groups) + sizeof(struct listeners_rcu_head),
+ GFP_KERNEL);
if (!listeners)
goto out_sock_release;
EXPORT_SYMBOL(netlink_kernel_release);
+/*
+ * RCU callback: recover the listeners_rcu_head that embeds @rcu_head and
+ * kfree() the buffer recorded in its ptr field.
+ */
+static void netlink_free_old_listeners(struct rcu_head *rcu_head)
+{
+	struct listeners_rcu_head *lrh =
+		container_of(rcu_head, struct listeners_rcu_head, rcu_head);
+
+	kfree(lrh->ptr);
+}
+
+/*
+ * __netlink_change_ngroups - set the number of multicast groups of a family
+ * @sk: socket whose sk_protocol selects the nl_table entry to update
+ * @groups: requested number of groups; values below 32 are raised to 32
+ *
+ * If the listeners bitmap needed for @groups is larger than the current
+ * one, a new zeroed bitmap is allocated (GFP_ATOMIC), the old contents are
+ * copied over, the new bitmap is published with rcu_assign_pointer() and
+ * the old one is freed after an RCU grace period.
+ *
+ * NOTE(review): netlink_change_ngroups() calls this between
+ * netlink_table_grab() and netlink_table_ungrab(); other callers presumably
+ * must hold the table the same way - confirm.
+ *
+ * Returns 0 on success or -ENOMEM if the new bitmap cannot be allocated
+ * (in which case nothing is changed).
+ */
+int __netlink_change_ngroups(struct sock *sk, unsigned int groups)
+{
+	unsigned long *listeners, *old;
+	struct listeners_rcu_head *old_rcu_head;
+	struct netlink_table *tbl = &nl_table[sk->sk_protocol];
+
+	if (groups < 32)
+		groups = 32;
+
+	if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
+		listeners = kzalloc(NLGRPSZ(groups) +
+				    sizeof(struct listeners_rcu_head),
+				    GFP_ATOMIC);
+		if (!listeners)
+			return -ENOMEM;
+		old = tbl->listeners;
+		memcpy(listeners, old, NLGRPSZ(tbl->groups));
+		rcu_assign_pointer(tbl->listeners, listeners);
+		/*
+		 * Free the old memory after an RCU grace period so we
+		 * don't leak it. We use call_rcu() here in order to be
+		 * able to call this function from atomic contexts.
+		 *
+		 * The rcu_head must live in the OLD block: every listeners
+		 * allocation reserves room for a struct listeners_rcu_head
+		 * behind the bitmap. tbl->listeners already points at the
+		 * new block here and tbl->groups is still the old count, so
+		 * indexing from tbl->listeners would scribble over live
+		 * bits of the new bitmap; index from 'old' instead.
+		 */
+		old_rcu_head = (void *)(old + NLGRPLONGS(tbl->groups));
+		old_rcu_head->ptr = old;
+		call_rcu(&old_rcu_head->rcu_head, netlink_free_old_listeners);
+	}
+	tbl->groups = groups;
+
+	return 0;
+}
+
/**
* netlink_change_ngroups - change number of multicast groups
*
*/
int netlink_change_ngroups(struct sock *sk, unsigned int groups)
{
- unsigned long *listeners, *old = NULL;
- struct netlink_table *tbl = &nl_table[sk->sk_protocol];
- int err = 0;
-
- if (groups < 32)
- groups = 32;
+ int err;
+ /* Locked wrapper: grab the table, let the __ helper resize, ungrab. */
netlink_table_grab();
- if (NLGRPSZ(tbl->groups) < NLGRPSZ(groups)) {
- listeners = kzalloc(NLGRPSZ(groups), GFP_ATOMIC);
- if (!listeners) {
- err = -ENOMEM;
- goto out_ungrab;
- }
- old = tbl->listeners;
- memcpy(listeners, old, NLGRPSZ(tbl->groups));
- rcu_assign_pointer(tbl->listeners, listeners);
- }
- tbl->groups = groups;
-
- out_ungrab:
+ err = __netlink_change_ngroups(sk, groups);
netlink_table_ungrab();
- synchronize_rcu();
- kfree(old);
+
+ /* err is the helper's result: 0, or -ENOMEM when allocation failed. */
return err;
}
-EXPORT_SYMBOL(netlink_change_ngroups);
+
+/*
+ * Walk the mc_list of the nl_table entry selected by @ksk's protocol and
+ * call netlink_update_socket_mc(..., @group, 0) on every socket found.
+ * Does no locking itself; netlink_update_socket_mc() must be called with
+ * the netlink table grabbed, so the caller has to provide that.
+ */
+void __netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
+{
+	struct netlink_table *tbl = &nl_table[ksk->sk_protocol];
+	struct hlist_node *node;
+	struct sock *sk;
+
+	sk_for_each_bound(sk, node, &tbl->mc_list) {
+		netlink_update_socket_mc(nlk_sk(sk), group, 0);
+	}
+}
/**
* netlink_clear_multicast_users - kick off multicast listeners
*/
void netlink_clear_multicast_users(struct sock *ksk, unsigned int group)
{
- struct sock *sk;
- struct hlist_node *node;
- struct netlink_table *tbl = &nl_table[ksk->sk_protocol];
-
+ /* Locked wrapper: the __ helper does the list walk with the table grabbed. */
netlink_table_grab();
-
- sk_for_each_bound(sk, node, &tbl->mc_list)
- netlink_update_socket_mc(nlk_sk(sk), group, 0);
-
+ __netlink_clear_multicast_users(ksk, group);
netlink_table_ungrab();
}
-EXPORT_SYMBOL(netlink_clear_multicast_users);
void netlink_set_nonroot(int protocol, unsigned int flags)
{
+/* Tear down a netlink_callback: release its skb, then free @cb itself. */
static void netlink_destroy_callback(struct netlink_callback *cb)
{
- if (cb->skb)
- kfree_skb(cb->skb);
+ /* NOTE(review): unguarded call assumes kfree_skb() accepts NULL - confirm */
+ kfree_skb(cb->skb);
kfree(cb);
}
}
int netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
- struct nlmsghdr *nlh,
+ const struct nlmsghdr *nlh,
int (*dump)(struct sk_buff *skb,
struct netlink_callback *),
int (*done)(struct netlink_callback *))
}
rep = __nlmsg_put(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
- NLMSG_ERROR, sizeof(struct nlmsgerr), 0);
+ NLMSG_ERROR, payload, 0);
errmsg = nlmsg_data(rep);
errmsg->error = err;
memcpy(&errmsg->msg, nlh, err ? nlh->nlmsg_len : sizeof(*nlh));
exclude_pid = pid;
}
- /* errors reported via destination sk->sk_err */
- nlmsg_multicast(sk, skb, exclude_pid, group, flags);
+ /* errors reported via destination sk->sk_err, but propagate
+ * delivery errors if NETLINK_BROADCAST_ERROR flag is set */
+ err = nlmsg_multicast(sk, skb, exclude_pid, group, flags);
}
- if (report)
- err = nlmsg_unicast(sk, skb, pid);
+ if (report) {
+ int err2;
+
+ err2 = nlmsg_unicast(sk, skb, pid);
+ if (!err || err == -ESRCH)
+ err = err2;
+ }
return err;
}
if (v == SEQ_START_TOKEN)
seq_puts(seq,
"sk Eth Pid Groups "
- "Rmem Wmem Dump Locks\n");
+ "Rmem Wmem Dump Locks Drops\n");
else {
struct sock *s = v;
struct netlink_sock *nlk = nlk_sk(s);
- seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %d\n",
+ seq_printf(seq, "%p %-3d %-6d %08x %-8d %-8d %p %-8d %-8d\n",
s,
s->sk_protocol,
nlk->pid,
nlk->groups ? (u32)nlk->groups[0] : 0,
- atomic_read(&s->sk_rmem_alloc),
- atomic_read(&s->sk_wmem_alloc),
+ sk_rmem_alloc_get(s),
+ sk_wmem_alloc_get(s),
nlk->cb,
- atomic_read(&s->sk_refcnt)
+ atomic_read(&s->sk_refcnt),
+ atomic_read(&s->sk_drops)
);
}
.sendpage = sock_no_sendpage,
};
-static struct net_proto_family netlink_family_ops = {
+static const struct net_proto_family netlink_family_ops = {
.family = PF_NETLINK,
.create = netlink_create,
.owner = THIS_MODULE, /* for consistency 8) */
if (!nl_table)
goto panic;
- if (num_physpages >= (128 * 1024))
- limit = num_physpages >> (21 - PAGE_SHIFT);
+ if (totalram_pages >= (128 * 1024))
+ limit = totalram_pages >> (21 - PAGE_SHIFT);
else
- limit = num_physpages >> (23 - PAGE_SHIFT);
+ limit = totalram_pages >> (23 - PAGE_SHIFT);
order = get_bitmask_order(limit) - 1 + PAGE_SHIFT;
limit = (1UL << order) / sizeof(struct hlist_head);