X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=fs%2Fselect.c;h=0fe0e1469df31f386845dd443fdfe89a01543d92;hb=dd4dc82d4c129babca9b48b219cff0add5ff4132;hp=f6dceb56793fbe3f20b2a8313f9196bcb9e56ac6;hpb=8ff3e8e85fa6c312051134b3953e397feb639f51;p=safe%2Fjmp%2Flinux-2.6 diff --git a/fs/select.c b/fs/select.c index f6dceb5..0fe0e14 100644 --- a/fs/select.c +++ b/fs/select.c @@ -28,6 +28,60 @@ #include + +/* + * Estimate expected accuracy in ns from a timeval. + * + * After quite a bit of churning around, we've settled on + * a simple thing of taking 0.1% of the timeout as the + * slack, with a cap of 100 msec. + * "nice" tasks get a 0.5% slack instead. + * + * Consider this comment an open invitation to come up with even + * better solutions.. + */ + +static long __estimate_accuracy(struct timespec *tv) +{ + long slack; + int divfactor = 1000; + + if (task_nice(current) > 0) + divfactor = divfactor / 5; + + slack = tv->tv_nsec / divfactor; + slack += tv->tv_sec * (NSEC_PER_SEC/divfactor); + + if (slack > 100 * NSEC_PER_MSEC) + slack = 100 * NSEC_PER_MSEC; + + if (slack < 0) + slack = 0; + return slack; +} + +static long estimate_accuracy(struct timespec *tv) +{ + unsigned long ret; + struct timespec now; + + /* + * Realtime tasks get a slack of 0 for obvious reasons. + */ + + if (rt_task(current)) + return 0; + + ktime_get_ts(&now); + now = timespec_sub(*tv, now); + ret = __estimate_accuracy(&now); + if (ret < current->timer_slack_ns) + return current->timer_slack_ns; + return ret; +} + + + struct poll_table_page { struct poll_table_page * next; struct poll_table_entry * entry; @@ -55,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); + pwq->polling_task = current; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } - EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) @@ -88,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq) free_page((unsigned long) old); } } - EXPORT_SYMBOL(poll_freewait); -static struct poll_table_entry *poll_get_entry(poll_table *_p) +static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { - struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt); struct poll_table_page *table = p->table; if (p->inline_index < N_INLINE_POLL_ENTRIES) @@ -105,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p) new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; - __set_current_state(TASK_RUNNING); return NULL; } new_table->entry = new_table->entries; @@ -117,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p) return table->entry++; } +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_wqueues *pwq = wait->private; + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); + + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expect write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in poll_schedule_timeout. + */ + smp_wmb(); + pwq->triggered = 1; + + /* + * Perform the default wake up operation using a dummy + * waitqueue. + * + * TODO: This is hacky but there currently is no interface to + * pass in @sync. @sync is scheduled to be removed and once + * that happens, wake_up_process() can be used directly. + */ + return default_wake_function(&dummy_wait, mode, sync, key); +} + /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { - struct poll_table_entry *entry = poll_get_entry(p); + struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); + struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; get_file(filp); entry->filp = filp; entry->wait_address = wait_address; - init_waitqueue_entry(&entry->wait, current); + init_waitqueue_func_entry(&entry->wait, pollwake); + entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } +int poll_schedule_timeout(struct poll_wqueues *pwq, int state, + ktime_t *expires, unsigned long slack) +{ + int rc = -EINTR; + + set_current_state(state); + if (!pwq->triggered) + rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); + __set_current_state(TASK_RUNNING); + + /* + * Prepare for the next iteration. + * + * The following set_mb() serves two purposes. First, it's + * the counterpart rmb of the wmb in pollwake() such that data + * written before wake up is always visible after wake up. + * Second, the full barrier guarantees that triggered clearing + * doesn't pass event check of the next iteration. Note that + * this problem doesn't exist for the first iteration as + * add_wait_queue() has full barrier semantics. + */ + set_mb(pwq->triggered, 0); + + return rc; +} +EXPORT_SYMBOL(poll_schedule_timeout); + /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec variable for the final timeout @@ -262,6 +368,7 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) struct poll_wqueues table; poll_table *wait; int retval, i, timed_out = 0; + unsigned long slack = 0; rcu_read_lock(); retval = max_select_fd(n, fds); @@ -278,12 +385,13 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) timed_out = 1; } + if (end_time && !timed_out) + slack = estimate_accuracy(end_time); + retval = 0; for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; - set_current_state(TASK_INTERRUPTIBLE); - inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; @@ -353,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) to = &expire; } - if (!schedule_hrtimeout(to, HRTIMER_MODE_ABS)) + if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, + to, slack)) timed_out = 1; } - __set_current_state(TASK_RUNNING); poll_freewait(&table); @@ -449,8 +557,8 @@ out_nofds: return ret; } -asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timeval __user *tvp) +SYSCALL_DEFINE5(select, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timeval __user *, tvp) { struct timespec end_time, *to = NULL; struct timeval tv; @@ -461,8 +569,9 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, return -EFAULT; to = &end_time; - if (poll_select_set_timeout(to, tv.tv_sec, - tv.tv_usec * NSEC_PER_USEC)) + if (poll_select_set_timeout(to, + tv.tv_sec + (tv.tv_usec / USEC_PER_SEC), + (tv.tv_usec % USEC_PER_SEC) * NSEC_PER_USEC)) return -EINVAL; } @@ -473,9 +582,9 @@ asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp, } #ifdef HAVE_SET_RESTORE_SIGMASK -asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec __user *tsp, - const sigset_t __user *sigmask, size_t sigsetsize) +static long do_pselect(int n, fd_set __user *inp, fd_set __user *outp, + fd_set __user *exp, struct timespec __user *tsp, + const sigset_t __user *sigmask, size_t sigsetsize) { sigset_t ksigmask, sigsaved; struct timespec ts, end_time, *to = NULL; @@ -501,7 +610,7 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, sigprocmask(SIG_SETMASK, &ksigmask, &sigsaved); } - ret = core_sys_select(n, inp, outp, exp, &end_time); + ret = core_sys_select(n, inp, outp, exp, to); ret = poll_select_copy_remaining(&end_time, tsp, 0, ret); if (ret == -ERESTARTNOHAND) { @@ -527,8 +636,9 @@ asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, * which has a pointer to the sigset_t itself followed by a size_t containing * the sigset size. */ -asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp, - fd_set __user *exp, struct timespec __user *tsp, void __user *sig) +SYSCALL_DEFINE6(pselect6, int, n, fd_set __user *, inp, fd_set __user *, outp, + fd_set __user *, exp, struct timespec __user *, tsp, + void __user *, sig) { size_t sigsetsize = 0; sigset_t __user *up = NULL; @@ -541,7 +651,7 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp, return -EFAULT; } - return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize); + return do_pselect(n, inp, outp, exp, tsp, up, sigsetsize); } #endif /* HAVE_SET_RESTORE_SIGMASK */ @@ -593,6 +703,7 @@ static int do_poll(unsigned int nfds, struct poll_list *list, poll_table* pt = &wait->pt; ktime_t expire, *to = NULL; int timed_out = 0, count = 0; + unsigned long slack = 0; /* Optimise the no-wait case */ if (end_time && !end_time->tv_sec && !end_time->tv_nsec) { @@ -600,10 +711,12 @@ static int do_poll(unsigned int nfds, struct poll_list *list, timed_out = 1; } + if (end_time && !timed_out) + slack = estimate_accuracy(end_time); + for (;;) { struct poll_list *walk; - set_current_state(TASK_INTERRUPTIBLE); for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; @@ -646,10 +759,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list, to = &expire; } - if (!schedule_hrtimeout(to, HRTIMER_MODE_ABS)) + if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } - __set_current_state(TASK_RUNNING); return count; } @@ -743,8 +855,8 @@ static long do_restart_poll(struct restart_block *restart_block) return ret; } -asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, - long timeout_msecs) +SYSCALL_DEFINE3(poll, struct pollfd __user *, ufds, unsigned int, nfds, + long, timeout_msecs) { struct timespec end_time, *to = NULL; int ret; @@ -778,9 +890,9 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, } #ifdef HAVE_SET_RESTORE_SIGMASK -asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, - struct timespec __user *tsp, const sigset_t __user *sigmask, - size_t sigsetsize) +SYSCALL_DEFINE5(ppoll, struct pollfd __user *, ufds, unsigned int, nfds, + struct timespec __user *, tsp, const sigset_t __user *, sigmask, + size_t, sigsetsize) { sigset_t ksigmask, sigsaved; struct timespec ts, end_time, *to = NULL;