X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Funix%2Faf_unix.c;h=63ed69ffad99a94c102dfd70f702f17d539d5468;hb=8b808bf29bdafe9270cb283ea093bb87f5a3be19;hp=bc4c44552c1f4679b95e7ddccac506d6d81b2188;hpb=c752f0739f09b803aed191c4765a3b6650a08653;p=safe%2Fjmp%2Flinux-2.6

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index bc4c445..63ed69f 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -83,7 +83,6 @@
  */
 
 #include
-#include
 #include
 #include
 #include
@@ -104,6 +103,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -112,30 +112,46 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
 #include
 
-int sysctl_unix_max_dgram_qlen = 10;
-
-struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
-DEFINE_RWLOCK(unix_table_lock);
+static struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1];
+static DEFINE_SPINLOCK(unix_table_lock);
 static atomic_t unix_nr_socks = ATOMIC_INIT(0);
 
 #define unix_sockets_unbound	(&unix_socket_table[UNIX_HASH_SIZE])
 
 #define UNIX_ABSTRACT(sk)	(unix_sk(sk)->addr->hash != UNIX_HASH_SIZE)
 
+#ifdef CONFIG_SECURITY_NETWORK
+static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{
+	memcpy(UNIXSID(skb), &scm->secid, sizeof(u32));
+}
+
+static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{
+	scm->secid = *UNIXSID(skb);
+}
+#else
+static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{ }
+
+static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
+{ }
+#endif /* CONFIG_SECURITY_NETWORK */
+
 /*
  *  SMP locking strategy:
- *    hash table is protected with rwlock unix_table_lock
+ *    hash table is protected with spinlock unix_table_lock
  *    each socket state is protected by separate rwlock.
  */
 
-static inline unsigned unix_hash_fold(unsigned hash)
+static inline unsigned unix_hash_fold(__wsum n)
 {
+	unsigned hash = (__force unsigned)n;
 	hash ^= hash>>16;
 	hash ^= hash>>8;
 	return hash&(UNIX_HASH_SIZE-1);
@@ -157,11 +173,11 @@ static struct sock *unix_peer_get(struct sock *s)
 {
 	struct sock *peer;
 
-	unix_state_rlock(s);
+	unix_state_lock(s);
 	peer = unix_peer(s);
 	if (peer)
 		sock_hold(peer);
-	unix_state_runlock(s);
+	unix_state_unlock(s);
 	return peer;
 }
 
@@ -177,7 +193,7 @@ static inline void unix_release_addr(struct unix_address *addr)
  *		- if started by not zero, should be NULL terminated (FS object)
  *		- if started by zero, it is abstract name.
*/ - + static int unix_mkname(struct sockaddr_un * sunaddr, int len, unsigned *hashp) { if (len <= sizeof(short) || len > sizeof(*sunaddr)) @@ -214,19 +230,20 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) static inline void unix_remove_socket(struct sock *sk) { - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); __unix_remove_socket(sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) { - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); __unix_insert_socket(list, sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } -static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, +static struct sock *__unix_find_socket_byname(struct net *net, + struct sockaddr_un *sunname, int len, int type, unsigned hash) { struct sock *s; @@ -235,6 +252,9 @@ static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, sk_for_each(s, node, &unix_socket_table[hash ^ type]) { struct unix_sock *u = unix_sk(s); + if (!net_eq(sock_net(s), net)) + continue; + if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) goto found; @@ -244,30 +264,34 @@ found: return s; } -static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname, +static inline struct sock *unix_find_socket_byname(struct net *net, + struct sockaddr_un *sunname, int len, int type, unsigned hash) { struct sock *s; - read_lock(&unix_table_lock); - s = __unix_find_socket_byname(sunname, len, type, hash); + spin_lock(&unix_table_lock); + s = __unix_find_socket_byname(net, sunname, len, type, hash); if (s) sock_hold(s); - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); return s; } -static struct sock *unix_find_socket_byinode(struct inode *i) +static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) { struct sock *s; struct hlist_node *node; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); sk_for_each(s, node, &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->dentry; + if (!net_eq(sock_net(s), net)) + continue; + if(dentry && dentry->d_inode == i) { sock_hold(s); @@ -276,7 +300,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) } s = NULL; found: - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); return s; } @@ -290,8 +314,8 @@ static void unix_write_space(struct sock *sk) read_lock(&sk->sk_callback_lock); if (unix_writable(sk)) { if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) - wake_up_interruptible(sk->sk_sleep); - sk_wake_async(sk, 2, POLL_OUT); + wake_up_interruptible_sync(sk->sk_sleep); + sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT); } read_unlock(&sk->sk_callback_lock); } @@ -352,7 +376,7 @@ static int unix_release_sock (struct sock *sk, int embrion) unix_remove_socket(sk); /* Clear state */ - unix_state_wlock(sk); + unix_state_lock(sk); sock_orphan(sk); sk->sk_shutdown = SHUTDOWN_MASK; dentry = u->dentry; @@ -361,7 +385,7 @@ static int unix_release_sock (struct sock *sk, int embrion) u->mnt = NULL; state = sk->sk_state; sk->sk_state = TCP_CLOSE; - unix_state_wunlock(sk); + unix_state_unlock(sk); wake_up_interruptible_all(&u->peer_wait); @@ -369,15 +393,15 @@ static int unix_release_sock (struct sock *sk, int embrion) if (skpair!=NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { - unix_state_wlock(skpair); + unix_state_lock(skpair); /* No more writes */ skpair->sk_shutdown 
= SHUTDOWN_MASK; if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) skpair->sk_err = ECONNRESET; - unix_state_wunlock(skpair); + unix_state_unlock(skpair); skpair->sk_state_change(skpair); read_lock(&skpair->sk_callback_lock); - sk_wake_async(skpair,1,POLL_HUP); + sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP); read_unlock(&skpair->sk_callback_lock); } sock_put(skpair); /* It may now die */ @@ -413,8 +437,8 @@ static int unix_release_sock (struct sock *sk, int embrion) * What the above comment does talk about? --ANK(980817) */ - if (atomic_read(&unix_tot_inflight)) - unix_gc(); /* Garbage collect fds */ + if (unix_tot_inflight) + unix_gc(); /* Garbage collect fds */ return 0; } @@ -431,7 +455,7 @@ static int unix_listen(struct socket *sock, int backlog) err = -EINVAL; if (!u->addr) goto out; /* No listens on an unbound socket */ - unix_state_wlock(sk); + unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) goto out_unlock; if (backlog > sk->sk_max_ack_backlog) @@ -439,13 +463,13 @@ static int unix_listen(struct socket *sock, int backlog) sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; /* set credentials so connect can copy them */ - sk->sk_peercred.pid = current->tgid; + sk->sk_peercred.pid = task_tgid_vnr(current); sk->sk_peercred.uid = current->euid; sk->sk_peercred.gid = current->egid; err = 0; out_unlock: - unix_state_wunlock(sk); + unix_state_unlock(sk); out: return err; } @@ -473,7 +497,7 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *, static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, struct msghdr *, size_t); -static struct proto_ops unix_stream_ops = { +static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -494,7 +518,7 @@ static struct proto_ops unix_stream_ops = { .sendpage = sock_no_sendpage, }; -static struct proto_ops unix_dgram_ops = { +static const struct proto_ops unix_dgram_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -515,7 +539,7 @@ static struct proto_ops unix_dgram_ops = { .sendpage = sock_no_sendpage, }; -static struct proto_ops unix_seqpacket_ops = { +static const struct proto_ops unix_seqpacket_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -542,38 +566,50 @@ static struct proto unix_proto = { .obj_size = sizeof(struct unix_sock), }; -static struct sock * unix_create1(struct socket *sock) +/* + * AF_UNIX sockets do not interact with hardware, hence they + * dont trigger interrupts - so it's safe for them to have + * bh-unsafe locking for their sk_receive_queue.lock. 
Split off + * this special lock-class by reinitializing the spinlock key: + */ +static struct lock_class_key af_unix_sk_receive_queue_lock_key; + +static struct sock * unix_create1(struct net *net, struct socket *sock) { struct sock *sk = NULL; struct unix_sock *u; - if (atomic_read(&unix_nr_socks) >= 2*files_stat.max_files) + atomic_inc(&unix_nr_socks); + if (atomic_read(&unix_nr_socks) > 2 * get_max_files()) goto out; - sk = sk_alloc(PF_UNIX, GFP_KERNEL, &unix_proto, 1); + sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_proto); if (!sk) goto out; - atomic_inc(&unix_nr_socks); - sock_init_data(sock,sk); + lockdep_set_class(&sk->sk_receive_queue.lock, + &af_unix_sk_receive_queue_lock_key); sk->sk_write_space = unix_write_space; - sk->sk_max_ack_backlog = sysctl_unix_max_dgram_qlen; + sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen; sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->dentry = NULL; u->mnt = NULL; - rwlock_init(&u->lock); - atomic_set(&u->inflight, sock ? 0 : -1); - init_MUTEX(&u->readsem); /* single task reading lock */ + spin_lock_init(&u->lock); + atomic_set(&u->inflight, 0); + INIT_LIST_HEAD(&u->link); + mutex_init(&u->readlock); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); unix_insert_socket(unix_sockets_unbound, sk); out: + if (sk == NULL) + atomic_dec(&unix_nr_socks); return sk; } -static int unix_create(struct socket *sock, int protocol) +static int unix_create(struct net *net, struct socket *sock, int protocol) { if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; @@ -600,7 +636,7 @@ static int unix_create(struct socket *sock, int protocol) return -ESOCKTNOSUPPORT; } - return unix_create1(sock) ? 0 : -ENOMEM; + return unix_create1(net, sock) ? 0 : -ENOMEM; } static int unix_release(struct socket *sock) @@ -618,23 +654,23 @@ static int unix_release(struct socket *sock) static int unix_autobind(struct socket *sock) { struct sock *sk = sock->sk; + struct net *net = sock_net(sk); struct unix_sock *u = unix_sk(sk); static u32 ordernum = 1; struct unix_address * addr; int err; - down(&u->readsem); + mutex_lock(&u->readlock); err = 0; if (u->addr) goto out; err = -ENOMEM; - addr = kmalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL); + addr = kzalloc(sizeof(*addr) + sizeof(short) + 16, GFP_KERNEL); if (!addr) goto out; - memset(addr, 0, sizeof(*addr) + sizeof(short) + 16); addr->name->sun_family = AF_UNIX; atomic_set(&addr->refcnt, 1); @@ -642,12 +678,12 @@ retry: addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0)); - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; - if (__unix_find_socket_byname(addr->name, addr->len, sock->type, + if (__unix_find_socket_byname(net, addr->name, addr->len, sock->type, addr->hash)) { - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); /* Sanity yield. It is unusual case, but yet... 
*/ if (!(ordernum&0xFF)) yield(); @@ -658,39 +694,40 @@ retry: __unix_remove_socket(sk); u->addr = addr; __unix_insert_socket(&unix_socket_table[addr->hash], sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); err = 0; -out: up(&u->readsem); +out: mutex_unlock(&u->readlock); return err; } -static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, +static struct sock *unix_find_other(struct net *net, + struct sockaddr_un *sunname, int len, int type, unsigned hash, int *error) { struct sock *u; struct nameidata nd; int err = 0; - + if (sunname->sun_path[0]) { err = path_lookup(sunname->sun_path, LOOKUP_FOLLOW, &nd); if (err) goto fail; - err = permission(nd.dentry->d_inode,MAY_WRITE, &nd); + err = vfs_permission(&nd, MAY_WRITE); if (err) goto put_fail; err = -ECONNREFUSED; - if (!S_ISSOCK(nd.dentry->d_inode->i_mode)) + if (!S_ISSOCK(nd.path.dentry->d_inode->i_mode)) goto put_fail; - u=unix_find_socket_byinode(nd.dentry->d_inode); + u = unix_find_socket_byinode(net, nd.path.dentry->d_inode); if (!u) goto put_fail; if (u->sk_type == type) - touch_atime(nd.mnt, nd.dentry); + touch_atime(nd.path.mnt, nd.path.dentry); - path_release(&nd); + path_put(&nd.path); err=-EPROTOTYPE; if (u->sk_type != type) { @@ -699,7 +736,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, } } else { err = -ECONNREFUSED; - u=unix_find_socket_byname(sunname, len, type, hash); + u=unix_find_socket_byname(net, sunname, len, type, hash); if (u) { struct dentry *dentry; dentry = unix_sk(u)->dentry; @@ -711,7 +748,7 @@ static struct sock *unix_find_other(struct sockaddr_un *sunname, int len, return u; put_fail: - path_release(&nd); + path_put(&nd.path); fail: *error=err; return NULL; @@ -721,6 +758,7 @@ fail: static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; + struct net *net = sock_net(sk); struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; struct dentry * dentry = NULL; @@ -744,7 +782,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; addr_len = err; - down(&u->readsem); + mutex_lock(&u->readlock); err = -EINVAL; if (u->addr) @@ -781,21 +819,21 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ mode = S_IFSOCK | (SOCK_INODE(sock)->i_mode & ~current->fs->umask); - err = vfs_mknod(nd.dentry->d_inode, dentry, mode, 0); + err = vfs_mknod(nd.path.dentry->d_inode, dentry, mode, 0); if (err) goto out_mknod_dput; - up(&nd.dentry->d_inode->i_sem); - dput(nd.dentry); - nd.dentry = dentry; + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + dput(nd.path.dentry); + nd.path.dentry = dentry; addr->hash = UNIX_HASH_SIZE; } - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); if (!sunaddr->sun_path[0]) { err = -EADDRINUSE; - if (__unix_find_socket_byname(sunaddr, addr_len, + if (__unix_find_socket_byname(net, sunaddr, addr_len, sk->sk_type, hash)) { unix_release_addr(addr); goto out_unlock; @@ -804,8 +842,8 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) list = &unix_socket_table[addr->hash]; } else { list = &unix_socket_table[dentry->d_inode->i_ino & (UNIX_HASH_SIZE-1)]; - u->dentry = nd.dentry; - u->mnt = nd.mnt; + u->dentry = nd.path.dentry; + u->mnt = nd.path.mnt; } err = 0; @@ -814,17 +852,17 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) __unix_insert_socket(list, sk); out_unlock: - 
write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); out_up: - up(&u->readsem); + mutex_unlock(&u->readlock); out: return err; out_mknod_dput: dput(dentry); out_mknod_unlock: - up(&nd.dentry->d_inode->i_sem); - path_release(&nd); + mutex_unlock(&nd.path.dentry->d_inode->i_mutex); + path_put(&nd.path); out_mknod_parent: if (err==-EEXIST) err=-EADDRINUSE; @@ -832,10 +870,36 @@ out_mknod_parent: goto out_up; } +static void unix_state_double_lock(struct sock *sk1, struct sock *sk2) +{ + if (unlikely(sk1 == sk2) || !sk2) { + unix_state_lock(sk1); + return; + } + if (sk1 < sk2) { + unix_state_lock(sk1); + unix_state_lock_nested(sk2); + } else { + unix_state_lock(sk2); + unix_state_lock_nested(sk1); + } +} + +static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) +{ + if (unlikely(sk1 == sk2) || !sk2) { + unix_state_unlock(sk1); + return; + } + unix_state_unlock(sk1); + unix_state_unlock(sk2); +} + static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { struct sock *sk = sock->sk; + struct net *net = sock_net(sk); struct sockaddr_un *sunaddr=(struct sockaddr_un*)addr; struct sock *other; unsigned hash; @@ -851,11 +915,19 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, !unix_sk(sk)->addr && (err = unix_autobind(sock)) != 0) goto out; - other=unix_find_other(sunaddr, alen, sock->type, hash, &err); +restart: + other=unix_find_other(net, sunaddr, alen, sock->type, hash, &err); if (!other) goto out; - unix_state_wlock(sk); + unix_state_double_lock(sk, other); + + /* Apparently VFS overslept socket death. Retry. */ + if (sock_flag(other, SOCK_DEAD)) { + unix_state_double_unlock(sk, other); + sock_put(other); + goto restart; + } err = -EPERM; if (!unix_may_send(sk, other)) @@ -870,7 +942,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, * 1003.1g breaking connected state with AF_UNSPEC */ other = NULL; - unix_state_wlock(sk); + unix_state_double_lock(sk, other); } /* @@ -879,19 +951,19 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (unix_peer(sk)) { struct sock *old_peer = unix_peer(sk); unix_peer(sk)=other; - unix_state_wunlock(sk); + unix_state_double_unlock(sk, other); if (other != old_peer) unix_dgram_disconnected(sk, old_peer); sock_put(old_peer); } else { unix_peer(sk)=other; - unix_state_wunlock(sk); + unix_state_double_unlock(sk, other); } - return 0; + return 0; out_unlock: - unix_state_wunlock(sk); + unix_state_double_unlock(sk, other); sock_put(other); out: return err; @@ -910,7 +982,7 @@ static long unix_wait_for_peer(struct sock *other, long timeo) (skb_queue_len(&other->sk_receive_queue) > other->sk_max_ack_backlog); - unix_state_runlock(other); + unix_state_unlock(other); if (sched) timeo = schedule_timeout(timeo); @@ -924,6 +996,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, { struct sockaddr_un *sunaddr=(struct sockaddr_un *)uaddr; struct sock *sk = sock->sk; + struct net *net = sock_net(sk); struct unix_sock *u = unix_sk(sk), *newu, *otheru; struct sock *newsk = NULL; struct sock *other = NULL; @@ -952,7 +1025,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, err = -ENOMEM; /* create new sock for complete connection */ - newsk = unix_create1(NULL); + newsk = unix_create1(sock_net(sk), NULL); if (newsk == NULL) goto out; @@ -963,16 +1036,16 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, restart: /* Find listening sock. 
*/ - other = unix_find_other(sunaddr, addr_len, sk->sk_type, hash, &err); + other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, hash, &err); if (!other) goto out; /* Latch state of peer */ - unix_state_rlock(other); + unix_state_lock(other); /* Apparently VFS overslept socket death. Retry. */ if (sock_flag(other, SOCK_DEAD)) { - unix_state_runlock(other); + unix_state_unlock(other); sock_put(other); goto restart; } @@ -994,7 +1067,7 @@ restart: goto out; sock_put(other); goto restart; - } + } /* Latch our state. @@ -1022,18 +1095,18 @@ restart: goto out_unlock; } - unix_state_wlock(sk); + unix_state_lock_nested(sk); if (sk->sk_state != st) { - unix_state_wunlock(sk); - unix_state_runlock(other); + unix_state_unlock(sk); + unix_state_unlock(other); sock_put(other); goto restart; } err = security_unix_stream_connect(sock, other->sk_socket, newsk); if (err) { - unix_state_wunlock(sk); + unix_state_unlock(sk); goto out_unlock; } @@ -1043,7 +1116,7 @@ restart: unix_peer(newsk) = sk; newsk->sk_state = TCP_ESTABLISHED; newsk->sk_type = sk->sk_type; - newsk->sk_peercred.pid = current->tgid; + newsk->sk_peercred.pid = task_tgid_vnr(current); newsk->sk_peercred.uid = current->euid; newsk->sk_peercred.gid = current->egid; newu = unix_sk(newsk); @@ -1063,28 +1136,27 @@ restart: /* Set credentials */ sk->sk_peercred = other->sk_peercred; - sock_hold(newsk); - unix_peer(sk) = newsk; sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; + sock_hold(newsk); + + smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */ + unix_peer(sk) = newsk; - unix_state_wunlock(sk); + unix_state_unlock(sk); /* take ten and and send info to listening sock */ spin_lock(&other->sk_receive_queue.lock); __skb_queue_tail(&other->sk_receive_queue, skb); - /* Undo artificially decreased inflight after embrion - * is installed to listening socket. 
*/ - atomic_inc(&newu->inflight); spin_unlock(&other->sk_receive_queue.lock); - unix_state_runlock(other); + unix_state_unlock(other); other->sk_data_ready(other, 0); sock_put(other); return 0; out_unlock: if (other) - unix_state_runlock(other); + unix_state_unlock(other); out: if (skb) @@ -1105,7 +1177,7 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) sock_hold(skb); unix_peer(ska)=skb; unix_peer(skb)=ska; - ska->sk_peercred.pid = skb->sk_peercred.pid = current->tgid; + ska->sk_peercred.pid = skb->sk_peercred.pid = task_tgid_vnr(current); ska->sk_peercred.uid = skb->sk_peercred.uid = current->euid; ska->sk_peercred.gid = skb->sk_peercred.gid = current->egid; @@ -1150,10 +1222,10 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags) wake_up_interruptible(&unix_sk(sk)->peer_wait); /* attach accepted sock to socket */ - unix_state_wlock(tsk); + unix_state_lock(tsk); newsock->state = SS_CONNECTED; sock_graft(tsk, newsock); - unix_state_wunlock(tsk); + unix_state_unlock(tsk); return 0; out: @@ -1180,7 +1252,7 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_ } u = unix_sk(sk); - unix_state_rlock(sk); + unix_state_lock(sk); if (!u->addr) { sunaddr->sun_family = AF_UNIX; sunaddr->sun_path[0] = 0; @@ -1191,7 +1263,7 @@ static int unix_getname(struct socket *sock, struct sockaddr *uaddr, int *uaddr_ *uaddr_len = addr->len; memcpy(sunaddr, addr->name, *uaddr_len); } - unix_state_runlock(sk); + unix_state_unlock(sk); sock_put(sk); out: return err; @@ -1240,6 +1312,7 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, { struct sock_iocb *siocb = kiocb_to_siocb(kiocb); struct sock *sk = sock->sk; + struct net *net = sock_net(sk); struct unix_sock *u = unix_sk(sk); struct sockaddr_un *sunaddr=msg->msg_name; struct sock *other = NULL; @@ -1288,8 +1361,9 @@ static int unix_dgram_sendmsg(struct kiocb *kiocb, struct socket *sock, memcpy(UNIXCREDS(skb), &siocb->scm->creds, sizeof(struct ucred)); if (siocb->scm->fp) unix_attach_fds(siocb->scm, skb); + unix_get_secdata(siocb->scm, skb); - skb->h.raw = skb->data; + skb_reset_transport_header(skb); err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len); if (err) goto out_free; @@ -1302,13 +1376,13 @@ restart: if (sunaddr == NULL) goto out_free; - other = unix_find_other(sunaddr, namelen, sk->sk_type, + other = unix_find_other(net, sunaddr, namelen, sk->sk_type, hash, &err); if (other==NULL) goto out_free; } - unix_state_rlock(other); + unix_state_lock(other); err = -EPERM; if (!unix_may_send(sk, other)) goto out_unlock; @@ -1318,20 +1392,20 @@ restart: * Check with 1003.1g - what should * datagram error */ - unix_state_runlock(other); + unix_state_unlock(other); sock_put(other); err = 0; - unix_state_wlock(sk); + unix_state_lock(sk); if (unix_peer(sk) == other) { unix_peer(sk)=NULL; - unix_state_wunlock(sk); + unix_state_unlock(sk); unix_dgram_disconnected(sk, other); sock_put(other); err = -ECONNREFUSED; } else { - unix_state_wunlock(sk); + unix_state_unlock(sk); } other = NULL; @@ -1368,14 +1442,14 @@ restart: } skb_queue_tail(&other->sk_receive_queue, skb); - unix_state_runlock(other); + unix_state_unlock(other); other->sk_data_ready(other, len); sock_put(other); scm_destroy(siocb->scm); return len; out_unlock: - unix_state_runlock(other); + unix_state_unlock(other); out_free: kfree_skb(skb); out: @@ -1385,7 +1459,7 @@ out: return err; } - + static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, struct msghdr *msg, size_t len) 
{ @@ -1414,7 +1488,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, } else { sunaddr = NULL; err = -ENOTCONN; - other = unix_peer_get(sk); + other = unix_peer(sk); if (!other) goto out_err; } @@ -1425,23 +1499,23 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, while(sent < len) { /* - * Optimisation for the fact that under 0.01% of X messages typically - * need breaking up. + * Optimisation for the fact that under 0.01% of X + * messages typically need breaking up. */ - size=len-sent; + size = len-sent; /* Keep two messages in the pipe so it schedules better */ - if (size > sk->sk_sndbuf / 2 - 64) - size = sk->sk_sndbuf / 2 - 64; + if (size > ((sk->sk_sndbuf >> 1) - 64)) + size = (sk->sk_sndbuf >> 1) - 64; if (size > SKB_MAX_ALLOC) size = SKB_MAX_ALLOC; - + /* * Grab a buffer */ - + skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err); if (skb==NULL) @@ -1465,18 +1539,17 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, goto out_err; } - unix_state_rlock(other); + unix_state_lock(other); if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) goto pipe_err_free; skb_queue_tail(&other->sk_receive_queue, skb); - unix_state_runlock(other); + unix_state_unlock(other); other->sk_data_ready(other, size); sent+=size; } - sock_put(other); scm_destroy(siocb->scm); siocb->scm = NULL; @@ -1484,15 +1557,13 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, return sent; pipe_err_free: - unix_state_runlock(other); + unix_state_unlock(other); kfree_skb(skb); pipe_err: if (sent==0 && !(msg->msg_flags&MSG_NOSIGNAL)) send_sig(SIGPIPE,current,0); err = -EPIPE; out_err: - if (other) - sock_put(other); scm_destroy(siocb->scm); siocb->scm = NULL; return sent ? : err; @@ -1503,7 +1574,7 @@ static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock, { int err; struct sock *sk = sock->sk; - + err = sock_error(sk); if (err) return err; @@ -1516,7 +1587,7 @@ static int unix_seqpacket_sendmsg(struct kiocb *kiocb, struct socket *sock, return unix_dgram_sendmsg(kiocb, sock, msg, len); } - + static void unix_copy_addr(struct msghdr *msg, struct sock *sk) { struct unix_sock *u = unix_sk(sk); @@ -1546,13 +1617,20 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, msg->msg_namelen = 0; - down(&u->readsem); + mutex_lock(&u->readlock); skb = skb_recv_datagram(sk, flags, noblock, &err); - if (!skb) + if (!skb) { + unix_state_lock(sk); + /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ + if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && + (sk->sk_shutdown & RCV_SHUTDOWN)) + err = 0; + unix_state_unlock(sk); goto out_unlock; + } - wake_up_interruptible(&u->peer_wait); + wake_up_interruptible_sync(&u->peer_wait); if (msg->msg_name) unix_copy_addr(msg, skb->sk); @@ -1571,13 +1649,14 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, memset(&tmp_scm, 0, sizeof(tmp_scm)); } siocb->scm->creds = *UNIXCREDS(skb); + unix_set_secdata(siocb->scm, skb); if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp) unix_detach_fds(siocb->scm, skb); } - else + else { /* It is questionable: on PEEK we could: - do not return fds - good, but too simple 8) @@ -1585,11 +1664,11 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, apparently wrong) - clone fds (I chose it for now, it is the most universal solution) - - POSIX 1003.1g does not actually define this clearly - at all. 
POSIX 1003.1g doesn't define a lot of things - clearly however! - + + POSIX 1003.1g does not actually define this clearly + at all. POSIX 1003.1g doesn't define a lot of things + clearly however! + */ if (UNIXCB(skb).fp) siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); @@ -1601,7 +1680,7 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, out_free: skb_free_datagram(sk,skb); out_unlock: - up(&u->readsem); + mutex_unlock(&u->readlock); out: return err; } @@ -1609,12 +1688,12 @@ out: /* * Sleep until data has arrive. But check for races.. */ - + static long unix_stream_data_wait(struct sock * sk, long timeo) { DEFINE_WAIT(wait); - unix_state_rlock(sk); + unix_state_lock(sk); for (;;) { prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); @@ -1627,14 +1706,14 @@ static long unix_stream_data_wait(struct sock * sk, long timeo) break; set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - unix_state_runlock(sk); + unix_state_unlock(sk); timeo = schedule_timeout(timeo); - unix_state_rlock(sk); + unix_state_lock(sk); clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); } finish_wait(sk->sk_sleep, &wait); - unix_state_runlock(sk); + unix_state_unlock(sk); return timeo; } @@ -1677,31 +1756,34 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, memset(&tmp_scm, 0, sizeof(tmp_scm)); } - down(&u->readsem); + mutex_lock(&u->readlock); do { int chunk; struct sk_buff *skb; + unix_state_lock(sk); skb = skb_dequeue(&sk->sk_receive_queue); if (skb==NULL) { if (copied >= target) - break; + goto unlock; /* * POSIX 1003.1g mandates this order. */ - + if ((err = sock_error(sk)) != 0) - break; + goto unlock; if (sk->sk_shutdown & RCV_SHUTDOWN) - break; + goto unlock; + + unix_state_unlock(sk); err = -EAGAIN; if (!timeo) break; - up(&u->readsem); + mutex_unlock(&u->readlock); timeo = unix_stream_data_wait(sk, timeo); @@ -1709,9 +1791,13 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, err = sock_intr_errno(timeo); goto out; } - down(&u->readsem); + mutex_lock(&u->readlock); continue; + unlock: + unix_state_unlock(sk); + break; } + unix_state_unlock(sk); if (check_creds) { /* Never glue messages from different writers */ @@ -1775,7 +1861,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, } } while (size); - up(&u->readsem); + mutex_unlock(&u->readlock); scm_recv(sock, msg, siocb->scm, flags); out: return copied ? 
: err; @@ -1789,12 +1875,12 @@ static int unix_shutdown(struct socket *sock, int mode) mode = (mode+1)&(RCV_SHUTDOWN|SEND_SHUTDOWN); if (mode) { - unix_state_wlock(sk); + unix_state_lock(sk); sk->sk_shutdown |= mode; other=unix_peer(sk); if (other) sock_hold(other); - unix_state_wunlock(sk); + unix_state_unlock(sk); sk->sk_state_change(sk); if (other && @@ -1806,15 +1892,15 @@ static int unix_shutdown(struct socket *sock, int mode) peer_mode |= SEND_SHUTDOWN; if (mode&SEND_SHUTDOWN) peer_mode |= RCV_SHUTDOWN; - unix_state_wlock(other); + unix_state_lock(other); other->sk_shutdown |= peer_mode; - unix_state_wunlock(other); + unix_state_unlock(other); other->sk_state_change(other); read_lock(&other->sk_callback_lock); if (peer_mode == SHUTDOWN_MASK) - sk_wake_async(other,1,POLL_HUP); + sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP); else if (peer_mode & RCV_SHUTDOWN) - sk_wake_async(other,1,POLL_IN); + sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN); read_unlock(&other->sk_callback_lock); } if (other) @@ -1860,7 +1946,7 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } default: - err = dev_ioctl(cmd, (void __user *)arg); + err = -ENOIOCTLCMD; break; } return err; @@ -1879,6 +1965,8 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl mask |= POLLERR; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; + if (sk->sk_shutdown & RCV_SHUTDOWN) + mask |= POLLRDHUP; /* readable? */ if (!skb_queue_empty(&sk->sk_receive_queue) || @@ -1901,13 +1989,43 @@ static unsigned int unix_poll(struct file * file, struct socket *sock, poll_tabl #ifdef CONFIG_PROC_FS -static struct sock *unix_seq_idx(int *iter, loff_t pos) +static struct sock *first_unix_socket(int *i) +{ + for (*i = 0; *i <= UNIX_HASH_SIZE; (*i)++) { + if (!hlist_empty(&unix_socket_table[*i])) + return __sk_head(&unix_socket_table[*i]); + } + return NULL; +} + +static struct sock *next_unix_socket(int *i, struct sock *s) { + struct sock *next = sk_next(s); + /* More in this chain? */ + if (next) + return next; + /* Look for next non-empty chain. */ + for ((*i)++; *i <= UNIX_HASH_SIZE; (*i)++) { + if (!hlist_empty(&unix_socket_table[*i])) + return __sk_head(&unix_socket_table[*i]); + } + return NULL; +} + +struct unix_iter_state { + struct seq_net_private p; + int i; +}; +static struct sock *unix_seq_idx(struct seq_file *seq, loff_t pos) +{ + struct unix_iter_state *iter = seq->private; loff_t off = 0; struct sock *s; - for (s = first_unix_socket(iter); s; s = next_unix_socket(iter, s)) { - if (off == pos) + for (s = first_unix_socket(&iter->i); s; s = next_unix_socket(&iter->i, s)) { + if (sock_net(s) != seq_file_net(seq)) + continue; + if (off == pos) return s; ++off; } @@ -1916,35 +2034,43 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos) static void *unix_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(unix_table_lock) { - read_lock(&unix_table_lock); - return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1); + spin_lock(&unix_table_lock); + return *pos ? 
unix_seq_idx(seq, *pos - 1) : SEQ_START_TOKEN; } static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { + struct unix_iter_state *iter = seq->private; + struct sock *sk = v; ++*pos; - if (v == (void *)1) - return first_unix_socket(seq->private); - return next_unix_socket(seq->private, v); + if (v == SEQ_START_TOKEN) + sk = first_unix_socket(&iter->i); + else + sk = next_unix_socket(&iter->i, sk); + while (sk && (sock_net(sk) != seq_file_net(seq))) + sk = next_unix_socket(&iter->i, sk); + return sk; } static void unix_seq_stop(struct seq_file *seq, void *v) + __releases(unix_table_lock) { - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static int unix_seq_show(struct seq_file *seq, void *v) { - - if (v == (void *)1) + + if (v == SEQ_START_TOKEN) seq_puts(seq, "Num RefCount Protocol Flags Type St " "Inode Path\n"); else { struct sock *s = v; struct unix_sock *u = unix_sk(s); - unix_state_rlock(s); + unix_state_lock(s); seq_printf(seq, "%p: %08X %08X %08X %04X %02X %5lu", s, @@ -1972,14 +2098,14 @@ static int unix_seq_show(struct seq_file *seq, void *v) for ( ; i < len; i++) seq_putc(seq, u->addr->name->sun_path[i]); } - unix_state_runlock(s); + unix_state_unlock(s); seq_putc(seq, '\n'); } return 0; } -static struct seq_operations unix_seq_ops = { +static const struct seq_operations unix_seq_ops = { .start = unix_seq_start, .next = unix_seq_next, .stop = unix_seq_stop, @@ -1989,33 +2115,16 @@ static struct seq_operations unix_seq_ops = { static int unix_seq_open(struct inode *inode, struct file *file) { - struct seq_file *seq; - int rc = -ENOMEM; - int *iter = kmalloc(sizeof(int), GFP_KERNEL); - - if (!iter) - goto out; - - rc = seq_open(file, &unix_seq_ops); - if (rc) - goto out_kfree; - - seq = file->private_data; - seq->private = iter; - *iter = 0; -out: - return rc; -out_kfree: - kfree(iter); - goto out; + return seq_open_net(inode, file, &unix_seq_ops, + sizeof(struct unix_iter_state)); } -static struct file_operations unix_seq_fops = { +static const struct file_operations unix_seq_fops = { .owner = THIS_MODULE, .open = unix_seq_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = seq_release_net, }; #endif @@ -2026,36 +2135,53 @@ static struct net_proto_family unix_family_ops = { .owner = THIS_MODULE, }; -#ifdef CONFIG_SYSCTL -extern void unix_sysctl_register(void); -extern void unix_sysctl_unregister(void); -#else -static inline void unix_sysctl_register(void) {} -static inline void unix_sysctl_unregister(void) {} + +static int unix_net_init(struct net *net) +{ + int error = -ENOMEM; + + net->unx.sysctl_max_dgram_qlen = 10; + if (unix_sysctl_register(net)) + goto out; + +#ifdef CONFIG_PROC_FS + if (!proc_net_fops_create(net, "unix", 0, &unix_seq_fops)) { + unix_sysctl_unregister(net); + goto out; + } #endif + error = 0; +out: + return 0; +} + +static void unix_net_exit(struct net *net) +{ + unix_sysctl_unregister(net); + proc_net_remove(net, "unix"); +} + +static struct pernet_operations unix_net_ops = { + .init = unix_net_init, + .exit = unix_net_exit, +}; static int __init af_unix_init(void) { int rc = -1; struct sk_buff *dummy_skb; - if (sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)) { - printk(KERN_CRIT "%s: panic\n", __FUNCTION__); - goto out; - } + BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof(dummy_skb->cb)); rc = proto_register(&unix_proto, 1); - if (rc != 0) { - printk(KERN_CRIT "%s: Cannot create unix_sock SLAB cache!\n", - __FUNCTION__); + if (rc != 0) { + printk(KERN_CRIT "%s: Cannot 
create unix_sock SLAB cache!\n", + __func__); goto out; } sock_register(&unix_family_ops); -#ifdef CONFIG_PROC_FS - proc_net_fops_create("unix", 0, &unix_seq_fops); -#endif - unix_sysctl_register(); + register_pernet_subsys(&unix_net_ops); out: return rc; } @@ -2063,12 +2189,15 @@ out: static void __exit af_unix_exit(void) { sock_unregister(PF_UNIX); - unix_sysctl_unregister(); - proc_net_remove("unix"); proto_unregister(&unix_proto); + unregister_pernet_subsys(&unix_net_ops); } -module_init(af_unix_init); +/* Earlier than device_initcall() so that other drivers invoking + request_module() don't end up in a loop when modprobe tries + to use a UNIX socket. But later than subsys_initcall() because + we depend on stuff initialised there */ +fs_initcall(af_unix_init); module_exit(af_unix_exit); MODULE_LICENSE("GPL");
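
A note on the name hashing touched in the early hunks: unix_hash_fold() now takes the __wsum that csum_partial() computes over the socket name and folds it into one of UNIX_HASH_SIZE buckets. The standalone C sketch below mirrors only that fold outside the kernel; toy_csum() is an invented stand-in for csum_partial(), the abstract name is made up, and UNIX_HASH_SIZE is assumed to be 256 as in include/net/af_unix.h of this era, so treat it as an illustration of the bucketing, not kernel code.

/* Userspace sketch of the unix_hash_fold() bucketing; not kernel code. */
#include <stdio.h>
#include <stddef.h>

#define UNIX_HASH_SIZE 256	/* assumed value from include/net/af_unix.h */

/* Invented stand-in for csum_partial(), just so the example compiles. */
static unsigned toy_csum(const char *p, size_t len)
{
	unsigned sum = 0;
	while (len--)
		sum += (unsigned char)*p++;
	return sum;
}

/* Same fold as the patched unix_hash_fold(): xor the high bytes down,
 * then mask to a bucket index. */
static unsigned unix_hash_fold(unsigned sum)
{
	sum ^= sum >> 16;
	sum ^= sum >> 8;
	return sum & (UNIX_HASH_SIZE - 1);
}

int main(void)
{
	const char name[] = "\0autobind-00001";	/* abstract-style name, made up */
	unsigned h = unix_hash_fold(toy_csum(name, sizeof(name)));

	printf("bucket %u of %u\n", h, UNIX_HASH_SIZE);
	return 0;
}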
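
The dgram-connect hunks replace the single unix_state_wlock(sk) with unix_state_double_lock()/unix_state_double_unlock(), which always take the two socket locks in address order (the second via unix_state_lock_nested() so lockdep accepts the ordering). Below is a minimal userspace analogue of that ordering idiom, using pthread mutexes instead of the kernel's per-socket locks; peer_t, lock_pair() and unlock_pair() are invented names for the illustration, not kernel or libc APIs.

/* Userspace sketch of the address-ordered double-lock idiom; not kernel code. */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

typedef struct peer {
	pthread_mutex_t lock;
	int state;
} peer_t;

/* Lock a pair of peers without ABBA deadlock: self-pair or missing peer
 * takes one lock; otherwise the lower address is always locked first,
 * matching what unix_state_double_lock() does for two struct sock *. */
static void lock_pair(peer_t *a, peer_t *b)
{
	if (a == b || b == NULL) {
		pthread_mutex_lock(&a->lock);
		return;
	}
	if ((uintptr_t)a < (uintptr_t)b) {
		pthread_mutex_lock(&a->lock);
		pthread_mutex_lock(&b->lock);
	} else {
		pthread_mutex_lock(&b->lock);
		pthread_mutex_lock(&a->lock);
	}
}

static void unlock_pair(peer_t *a, peer_t *b)
{
	pthread_mutex_unlock(&a->lock);
	if (a != b && b != NULL)
		pthread_mutex_unlock(&b->lock);
}

int main(void)
{
	peer_t x = { PTHREAD_MUTEX_INITIALIZER, 0 };
	peer_t y = { PTHREAD_MUTEX_INITIALIZER, 0 };

	/* Two threads connecting x->y and y->x would both end up taking the
	 * locks in the same order, which is the whole point of the idiom. */
	lock_pair(&x, &y);
	x.state = y.state = 1;
	unlock_pair(&x, &y);

	printf("states: %d %d\n", x.state, y.state);
	return 0;
}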