X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Feventpoll.c;h=085c5c063420d3dca2153f70ca5d3dfaf48a1962;hb=444528b3e614f7f2391488d9bca8e0b872db909b;hp=8a23a91e1377cf7fb623accc49ab289f12d89499;hpb=5071f97ec6d74f006072de0ce89b67c8792fe5a1;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 8a23a91..085c5c0 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -71,24 +71,6 @@ * a better scalability. */ -#define DEBUG_EPOLL 0 - -#if DEBUG_EPOLL > 0 -#define DPRINTK(x) printk x -#define DNPRINTK(n, x) do { if ((n) <= DEBUG_EPOLL) printk x; } while (0) -#else /* #if DEBUG_EPOLL > 0 */ -#define DPRINTK(x) (void) 0 -#define DNPRINTK(n, x) (void) 0 -#endif /* #if DEBUG_EPOLL > 0 */ - -#define DEBUG_EPI 0 - -#if DEBUG_EPI != 0 -#define EPI_SLAB_DEBUG (SLAB_DEBUG_FREE | SLAB_RED_ZONE /* | SLAB_POISON */) -#else /* #if DEBUG_EPI != 0 */ -#define EPI_SLAB_DEBUG 0 -#endif /* #if DEBUG_EPI != 0 */ - /* Epoll private bits inside the event mask */ #define EP_PRIVATE_BITS (EPOLLONESHOT | EPOLLET) @@ -115,8 +97,8 @@ struct epoll_filefd { */ struct nested_call_node { struct list_head llink; - struct task_struct *task; void *cookie; + void *ctx; }; /* @@ -210,7 +192,7 @@ struct eppoll_entry { struct list_head llink; /* The "base" pointer is set to the container "struct epitem" */ - void *base; + struct epitem *base; /* * Wait queue item that will be linked to the target file wait @@ -335,17 +317,17 @@ static void ep_nested_calls_init(struct nested_calls *ncalls) * @nproc: Nested call core function pointer. * @priv: Opaque data to be passed to the @nproc callback. * @cookie: Cookie to be used to identify this nested call. + * @ctx: This instance context. * * Returns: Returns the code returned by the @nproc callback, or -1 if * the maximum recursion limit has been exceeded. */ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, int (*nproc)(void *, void *, int), void *priv, - void *cookie) + void *cookie, void *ctx) { int error, call_nests = 0; unsigned long flags; - struct task_struct *this_task = current; struct list_head *lsthead = &ncalls->tasks_call_list; struct nested_call_node *tncur; struct nested_call_node tnode; @@ -358,20 +340,19 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, * very much limited. */ list_for_each_entry(tncur, lsthead, llink) { - if (tncur->task == this_task && + if (tncur->ctx == ctx && (tncur->cookie == cookie || ++call_nests > max_nests)) { /* * Ops ... loop detected or maximum nest level reached. * We abort this wake by breaking the cycle itself. 
*/ - spin_unlock_irqrestore(&ncalls->lock, flags); - - return -1; + error = -1; + goto out_unlock; } } /* Add the current task and cookie to the list */ - tnode.task = this_task; + tnode.ctx = ctx; tnode.cookie = cookie; list_add(&tnode.llink, lsthead); @@ -383,14 +364,34 @@ static int ep_call_nested(struct nested_calls *ncalls, int max_nests, /* Remove the current task from the list */ spin_lock_irqsave(&ncalls->lock, flags); list_del(&tnode.llink); +out_unlock: spin_unlock_irqrestore(&ncalls->lock, flags); return error; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, + unsigned long events, int subclass) +{ + unsigned long flags; + + spin_lock_irqsave_nested(&wqueue->lock, flags, subclass); + wake_up_locked_poll(wqueue, events); + spin_unlock_irqrestore(&wqueue->lock, flags); +} +#else +static inline void ep_wake_up_nested(wait_queue_head_t *wqueue, + unsigned long events, int subclass) +{ + wake_up_poll(wqueue, events); +} +#endif + static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) { - wake_up_nested((wait_queue_head_t *) cookie, 1 + call_nests); + ep_wake_up_nested((wait_queue_head_t *) cookie, POLLIN, + 1 + call_nests); return 0; } @@ -406,32 +407,30 @@ static int ep_poll_wakeup_proc(void *priv, void *cookie, int call_nests) */ static void ep_poll_safewake(wait_queue_head_t *wq) { + int this_cpu = get_cpu(); + ep_call_nested(&poll_safewake_ncalls, EP_MAX_NESTS, - ep_poll_wakeup_proc, NULL, wq); + ep_poll_wakeup_proc, NULL, wq, (void *) (long) this_cpu); + + put_cpu(); } /* - * This function unregister poll callbacks from the associated file descriptor. - * Since this must be called without holding "ep->lock" the atomic exchange trick - * will protect us from multiple unregister. + * This function unregisters poll callbacks from the associated file + * descriptor. Must be called with "mtx" held (or "epmutex" if called from + * ep_free). */ static void ep_unregister_pollwait(struct eventpoll *ep, struct epitem *epi) { - int nwait; struct list_head *lsthead = &epi->pwqlist; struct eppoll_entry *pwq; - /* This is called without locks, so we need the atomic exchange */ - nwait = xchg(&epi->nwait, 0); - - if (nwait) { - while (!list_empty(lsthead)) { - pwq = list_first_entry(lsthead, struct eppoll_entry, llink); + while (!list_empty(lsthead)) { + pwq = list_first_entry(lsthead, struct eppoll_entry, llink); - list_del_init(&pwq->llink); - remove_wait_queue(pwq->whead, &pwq->wait); - kmem_cache_free(pwq_cache, pwq); - } + list_del(&pwq->llink); + remove_wait_queue(pwq->whead, &pwq->wait); + kmem_cache_free(pwq_cache, pwq); } } @@ -454,13 +453,11 @@ static int ep_scan_ready_list(struct eventpoll *ep, int error, pwake = 0; unsigned long flags; struct epitem *epi, *nepi; - struct list_head txlist; - - INIT_LIST_HEAD(&txlist); + LIST_HEAD(txlist); /* * We need to lock this because we could be hit by - * eventpoll_release_file() and epoll_ctl(EPOLL_CTL_DEL). + * eventpoll_release_file() and epoll_ctl(). */ mutex_lock(&ep->mtx); @@ -473,8 +470,7 @@ static int ep_scan_ready_list(struct eventpoll *ep, * in a lockless way. 
*/ spin_lock_irqsave(&ep->lock, flags); - list_splice(&ep->rdllist, &txlist); - INIT_LIST_HEAD(&ep->rdllist); + list_splice_init(&ep->rdllist, &txlist); ep->ovflist = NULL; spin_unlock_irqrestore(&ep->lock, flags); @@ -514,8 +510,8 @@ static int ep_scan_ready_list(struct eventpoll *ep, if (!list_empty(&ep->rdllist)) { /* - * Wake up (if active) both the eventpoll wait list and the ->poll() - * wait list (delayed after we release the lock). + * Wake up (if active) both the eventpoll wait list and + * the ->poll() wait list (delayed after we release the lock). */ if (waitqueue_active(&ep->wq)) wake_up_locked(&ep->wq); @@ -570,9 +566,6 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) atomic_dec(&ep->user->epoll_watches); - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_remove(%p, %p)\n", - current, ep, file)); - return 0; } @@ -628,11 +621,11 @@ static int ep_eventpoll_release(struct inode *inode, struct file *file) if (ep) ep_free(ep); - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: close() ep=%p\n", current, ep)); return 0; } -static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) +static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv) { struct epitem *epi, *tmp; @@ -640,13 +633,14 @@ static int ep_read_events_proc(struct eventpoll *ep, struct list_head *head, voi if (epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & epi->event.events) return POLLIN | POLLRDNORM; - else + else { /* * Item has been dropped into the ready list by the poll * callback, but it's not actually ready, as far as * caller requested events goes. We can remove it here. */ list_del_init(&epi->rdllink); + } } return 0; @@ -672,9 +666,9 @@ static unsigned int ep_eventpoll_poll(struct file *file, poll_table *wait) * could re-enter here. */ pollflags = ep_call_nested(&poll_readywalk_ncalls, EP_MAX_NESTS, - ep_poll_readyevents_proc, ep, ep); + ep_poll_readyevents_proc, ep, ep, current); - return pollflags != -1 ? pollflags: 0; + return pollflags != -1 ? pollflags : 0; } /* File callbacks that implement the eventpoll file behaviour */ @@ -751,8 +745,6 @@ static int ep_alloc(struct eventpoll **pep) *pep = ep; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_alloc() ep=%p\n", - current, ep)); return 0; free_uid: @@ -786,9 +778,6 @@ static struct epitem *ep_find(struct eventpoll *ep, struct file *file, int fd) } } - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_find(%p) -> %p\n", - current, file, epir)); - return epir; } @@ -804,9 +793,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n", - current, epi->ffd.file, epi, ep)); - spin_lock_irqsave(&ep->lock, flags); /* @@ -819,6 +805,15 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k goto out_unlock; /* + * Check the events coming with the callback. At this stage, not + * every device reports the events in the "key" parameter of the + * callback. We need to be able to handle both cases here, hence the + * test for "key" != NULL before the event match test. + */ + if (key && !((unsigned long) key & epi->event.events)) + goto out_unlock; + + /* * If we are trasfering events to userspace, we can hold no locks * (because we're accessing user memory, and because of linux f_op->poll() * semantics). 
All the events that happens during that period of time are @@ -872,9 +867,10 @@ static void ep_ptable_queue_proc(struct file *file, wait_queue_head_t *whead, add_wait_queue(whead, &pwq->wait); list_add_tail(&pwq->llink, &epi->pwqlist); epi->nwait++; - } else + } else { /* We have to signal that an error occurred */ epi->nwait = -1; + } } static void ep_rbtree_insert(struct eventpoll *ep, struct epitem *epi) @@ -978,9 +974,6 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, if (pwake) ep_poll_safewake(&ep->poll_wait); - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_insert(%p, %p, %d)\n", - current, ep, tfile, fd)); - return 0; error_unregister: @@ -1010,15 +1003,14 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even { int pwake = 0; unsigned int revents; - unsigned long flags; /* - * Set the new event interest mask before calling f_op->poll(), otherwise - * a potential race might occur. In fact if we do this operation inside - * the lock, an event might happen between the f_op->poll() call and the - * new event set registering. + * Set the new event interest mask before calling f_op->poll(); + * otherwise we might miss an event that happens between the + * f_op->poll() call and the new event set registering. */ epi->event.events = event->events; + epi->event.data = event->data; /* protected by mtx */ /* * Get current event bits. We can safely use the file* here because @@ -1026,16 +1018,12 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even */ revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL); - spin_lock_irqsave(&ep->lock, flags); - - /* Copy the data member from inside the lock */ - epi->event.data = event->data; - /* * If the item is "hot" and it is not registered inside the ready * list, push it inside. */ if (revents & event->events) { + spin_lock_irq(&ep->lock); if (!ep_is_linked(&epi->rdllink)) { list_add_tail(&epi->rdllink, &ep->rdllist); @@ -1045,8 +1033,8 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even if (waitqueue_active(&ep->poll_wait)) pwake++; } + spin_unlock_irq(&ep->lock); } - spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -1055,62 +1043,67 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, struct epoll_even return 0; } -static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, void *priv) +static int ep_send_events_proc(struct eventpoll *ep, struct list_head *head, + void *priv) { struct ep_send_events_data *esed = priv; int eventcnt; - unsigned int revents; + unsigned int revents; struct epitem *epi; struct epoll_event __user *uevent; - /* + /* * We can loop without lock because we are passed a task private list. * Items cannot vanish during the loop because ep_scan_ready_list() is * holding "mtx" during this call. - */ + */ for (eventcnt = 0, uevent = esed->events; !list_empty(head) && eventcnt < esed->maxevents;) { epi = list_first_entry(head, struct epitem, rdllink); list_del_init(&epi->rdllink); - revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & - epi->event.events; + revents = epi->ffd.file->f_op->poll(epi->ffd.file, NULL) & + epi->event.events; - /* + /* * If the event mask intersect the caller-requested one, * deliver the event to userspace. Again, ep_scan_ready_list() * is holding "mtx", so no operations coming from userspace * can change the item. 
- */ - if (revents) { + */ + if (revents) { if (__put_user(revents, &uevent->events) || - __put_user(epi->event.data, &uevent->data)) - return eventcnt ? eventcnt: -EFAULT; - eventcnt++; + __put_user(epi->event.data, &uevent->data)) { + list_add(&epi->rdllink, head); + return eventcnt ? eventcnt : -EFAULT; + } + eventcnt++; uevent++; - if (epi->event.events & EPOLLONESHOT) - epi->event.events &= EP_PRIVATE_BITS; - else if (!(epi->event.events & EPOLLET)) - /* - * If this file has been added with Level Trigger - * mode, we need to insert back inside the ready - * list, so that the next call to epoll_wait() - * will check again the events availability. - * At this point, noone can insert into ep->rdllist - * besides us. The epoll_ctl() callers are locked - * out by ep_scan_ready_list() holding "mtx" and - * the poll callback will queue them in ep->ovflist. - */ - list_add_tail(&epi->rdllink, &ep->rdllist); - } - } + if (epi->event.events & EPOLLONESHOT) + epi->event.events &= EP_PRIVATE_BITS; + else if (!(epi->event.events & EPOLLET)) { + /* + * If this file has been added with Level + * Trigger mode, we need to insert back inside + * the ready list, so that the next call to + * epoll_wait() will check again the events + * availability. At this point, noone can insert + * into ep->rdllist besides us. The epoll_ctl() + * callers are locked out by + * ep_scan_ready_list() holding "mtx" and the + * poll callback will queue them in ep->ovflist. + */ + list_add_tail(&epi->rdllink, &ep->rdllist); + } + } + } return eventcnt; } -static int ep_send_events(struct eventpoll *ep, struct epoll_event __user *events, - int maxevents) +static int ep_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) { struct ep_send_events_data esed; @@ -1200,20 +1193,14 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) /* Check the EPOLL_* constant for consistency. */ BUILD_BUG_ON(EPOLL_CLOEXEC != O_CLOEXEC); - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", - current, flags)); - - error = -EINVAL; if (flags & ~EPOLL_CLOEXEC) - goto error_return; - + return -EINVAL; /* * Create the internal data structure ("struct eventpoll"). */ error = ep_alloc(&ep); if (error < 0) - goto error_return; - + return error; /* * Creates all the items needed to setup an eventpoll file. That is, * a file structure and a free file descriptor. 
@@ -1223,16 +1210,12 @@ SYSCALL_DEFINE1(epoll_create1, int, flags) if (error < 0) ep_free(ep); -error_return: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", - current, flags, error)); - return error; } SYSCALL_DEFINE1(epoll_create, int, size) { - if (size < 0) + if (size <= 0) return -EINVAL; return sys_epoll_create1(0); @@ -1252,9 +1235,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, struct epitem *epi; struct epoll_event epds; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p)\n", - current, epfd, op, fd, event)); - error = -EFAULT; if (ep_op_has_event(op) && copy_from_user(&epds, event, sizeof(struct epoll_event))) @@ -1305,7 +1285,6 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, case EPOLL_CTL_ADD: if (!epi) { epds.events |= POLLERR | POLLHUP; - error = ep_insert(ep, &epds, tfile, fd); } else error = -EEXIST; @@ -1331,8 +1310,6 @@ error_tgt_fput: error_fput: fput(file); error_return: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_ctl(%d, %d, %d, %p) = %d\n", - current, epfd, op, fd, event, error)); return error; } @@ -1348,9 +1325,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, struct file *file; struct eventpoll *ep; - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d)\n", - current, epfd, events, maxevents, timeout)); - /* The maximum number of event must be greater than zero */ if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) return -EINVAL; @@ -1387,8 +1361,6 @@ SYSCALL_DEFINE4(epoll_wait, int, epfd, struct epoll_event __user *, events, error_fput: fput(file); error_return: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_wait(%d, %p, %d, %d) = %d\n", - current, epfd, events, maxevents, timeout, error)); return error; } @@ -1460,13 +1432,11 @@ static int __init eventpoll_init(void) /* Allocates slab cache used to allocate "struct epitem" items */ epi_cache = kmem_cache_create("eventpoll_epi", sizeof(struct epitem), - 0, SLAB_HWCACHE_ALIGN|EPI_SLAB_DEBUG|SLAB_PANIC, - NULL); + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); /* Allocates slab cache used to allocate "struct eppoll_entry" */ pwq_cache = kmem_cache_create("eventpoll_pwq", - sizeof(struct eppoll_entry), 0, - EPI_SLAB_DEBUG|SLAB_PANIC, NULL); + sizeof(struct eppoll_entry), 0, SLAB_PANIC, NULL); return 0; }
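
A minimal user-space sketch of the behaviour the hunks above touch: epoll_create() now rejects a non-positive size, a level-triggered item is re-inserted into the ready list so the next epoll_wait() checks it again, and an EPOLLONESHOT item is disarmed after its first delivery. The program below is not part of the patch; the pipe, the zero timeouts and the file name are arbitrary choices for the demo, and it assumes a kernel with this change applied.

/*
 * Hypothetical demo, assuming a Linux kernel carrying this patch.
 * Build with: cc -o epoll_demo epoll_demo.c
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <unistd.h>

int main(void)
{
	int epfd, pipefd[2];
	struct epoll_event ev, out;

	if (pipe(pipefd))
		return 1;

	/* After this patch, epoll_create() rejects size <= 0 up front. */
	if (epoll_create(0) < 0 && errno == EINVAL)
		printf("epoll_create(0) -> EINVAL\n");

	epfd = epoll_create1(0);
	if (epfd < 0)
		return 1;

	/* Level-triggered by default: neither EPOLLET nor EPOLLONESHOT. */
	memset(&ev, 0, sizeof(ev));
	ev.events = EPOLLIN;
	ev.data.fd = pipefd[0];
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, pipefd[0], &ev))
		return 1;

	if (write(pipefd[1], "x", 1) != 1)
		return 1;

	/*
	 * Both calls report one ready fd: ep_send_events_proc() puts a
	 * level-triggered item back on ep->rdllist after copying it out,
	 * and the byte is still unread in the pipe.
	 */
	printf("first wait:  %d\n", epoll_wait(epfd, &out, 1, 0));
	printf("second wait: %d\n", epoll_wait(epfd, &out, 1, 0));

	/* Re-arm as one-shot: one more delivery, then the item is disarmed. */
	ev.events = EPOLLIN | EPOLLONESHOT;
	if (epoll_ctl(epfd, EPOLL_CTL_MOD, pipefd[0], &ev))
		return 1;
	printf("oneshot wait 1: %d\n", epoll_wait(epfd, &out, 1, 0));
	printf("oneshot wait 2: %d\n", epoll_wait(epfd, &out, 1, 0));

	close(epfd);
	close(pipefd[0]);
	close(pipefd[1]);
	return 0;
}

On a patched kernel the two plain waits both report 1 (the level-triggered item is re-queued), the first one-shot wait reports 1, and the second reports 0 because ep_send_events_proc() masked the item's events down to EP_PRIVATE_BITS after delivery.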