diff --git a/fs/file.c b/fs/file.c
index f5926ce..f313314 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -6,257 +6,490 @@
  * Manage the dynamic fd arrays in the process files_struct.
  */
 
+#include <linux/module.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/time.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/file.h>
+#include <linux/fdtable.h>
 #include <linux/bitops.h>
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/workqueue.h>
 
+struct fdtable_defer {
+	spinlock_t lock;
+	struct work_struct wq;
+	struct fdtable *next;
+};
+
+int sysctl_nr_open __read_mostly = 1024*1024;
+int sysctl_nr_open_min = BITS_PER_LONG;
+int sysctl_nr_open_max = 1024 * 1024; /* raised later */
 
 /*
- * Allocate an fd array, using kmalloc or vmalloc.
- * Note: the array isn't cleared at allocation time.
+ * We use this list to defer free fdtables that have vmalloced
+ * sets/arrays. By keeping a per-cpu list, we avoid having to embed
+ * the work_struct in fdtable itself which avoids a 64 byte (i386) increase in
+ * this per-task structure.
  */
-struct file ** alloc_fd_array(int num)
-{
-	struct file **new_fds;
-	int size = num * sizeof(struct file *);
+static DEFINE_PER_CPU(struct fdtable_defer, fdtable_defer_list);
 
+static inline void * alloc_fdmem(unsigned int size)
+{
 	if (size <= PAGE_SIZE)
-		new_fds = (struct file **) kmalloc(size, GFP_KERNEL);
-	else
-		new_fds = (struct file **) vmalloc(size);
-	return new_fds;
+		return kmalloc(size, GFP_KERNEL);
+	else
+		return vmalloc(size);
 }
 
-void free_fd_array(struct file **array, int num)
+static inline void free_fdarr(struct fdtable *fdt)
 {
-	int size = num * sizeof(struct file *);
+	if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *)))
+		kfree(fdt->fd);
+	else
+		vfree(fdt->fd);
+}
 
-	if (!array) {
-		printk (KERN_ERR "free_fd_array: array = 0 (num = %d)\n", num);
-		return;
+static inline void free_fdset(struct fdtable *fdt)
+{
+	if (fdt->max_fds <= (PAGE_SIZE * BITS_PER_BYTE / 2))
+		kfree(fdt->open_fds);
+	else
+		vfree(fdt->open_fds);
+}
+
+static void free_fdtable_work(struct work_struct *work)
+{
+	struct fdtable_defer *f =
+		container_of(work, struct fdtable_defer, wq);
+	struct fdtable *fdt;
+
+	spin_lock_bh(&f->lock);
+	fdt = f->next;
+	f->next = NULL;
+	spin_unlock_bh(&f->lock);
+	while(fdt) {
+		struct fdtable *next = fdt->next;
+		vfree(fdt->fd);
+		free_fdset(fdt);
+		kfree(fdt);
+		fdt = next;
 	}
+}
+
+void free_fdtable_rcu(struct rcu_head *rcu)
+{
+	struct fdtable *fdt = container_of(rcu, struct fdtable, rcu);
+	struct fdtable_defer *fddef;
 
-	if (num <= NR_OPEN_DEFAULT) /* Don't free the embedded fd array! */
+	BUG_ON(!fdt);
+
+	if (fdt->max_fds <= NR_OPEN_DEFAULT) {
+		/*
+		 * This fdtable is embedded in the files structure and that
+		 * structure itself is getting destroyed.
+		 */
+		kmem_cache_free(files_cachep,
+				container_of(fdt, struct files_struct, fdtab));
 		return;
-	else if (size <= PAGE_SIZE)
-		kfree(array);
-	else
-		vfree(array);
+	}
+	if (fdt->max_fds <= (PAGE_SIZE / sizeof(struct file *))) {
+		kfree(fdt->fd);
+		kfree(fdt->open_fds);
+		kfree(fdt);
+	} else {
+		fddef = &get_cpu_var(fdtable_defer_list);
+		spin_lock(&fddef->lock);
+		fdt->next = fddef->next;
+		fddef->next = fdt;
+		/* vmallocs are handled from the workqueue context */
+		schedule_work(&fddef->wq);
+		spin_unlock(&fddef->lock);
+		put_cpu_var(fdtable_defer_list);
+	}
 }
 
 /*
- * Expand the fd array in the files_struct.  Called with the files
- * spinlock held for write.
+ * Expand the fdset in the files_struct. Called with the files spinlock + * held for write. */ +static void copy_fdtable(struct fdtable *nfdt, struct fdtable *ofdt) +{ + unsigned int cpy, set; -static int expand_fd_array(struct files_struct *files, int nr) - __releases(files->file_lock) - __acquires(files->file_lock) + BUG_ON(nfdt->max_fds < ofdt->max_fds); + + cpy = ofdt->max_fds * sizeof(struct file *); + set = (nfdt->max_fds - ofdt->max_fds) * sizeof(struct file *); + memcpy(nfdt->fd, ofdt->fd, cpy); + memset((char *)(nfdt->fd) + cpy, 0, set); + + cpy = ofdt->max_fds / BITS_PER_BYTE; + set = (nfdt->max_fds - ofdt->max_fds) / BITS_PER_BYTE; + memcpy(nfdt->open_fds, ofdt->open_fds, cpy); + memset((char *)(nfdt->open_fds) + cpy, 0, set); + memcpy(nfdt->close_on_exec, ofdt->close_on_exec, cpy); + memset((char *)(nfdt->close_on_exec) + cpy, 0, set); +} + +static struct fdtable * alloc_fdtable(unsigned int nr) { - struct file **new_fds; - int error, nfds; struct fdtable *fdt; + char *data; - - error = -EMFILE; - fdt = files_fdtable(files); - if (fdt->max_fds >= NR_OPEN || nr >= NR_OPEN) + /* + * Figure out how many fds we actually want to support in this fdtable. + * Allocation steps are keyed to the size of the fdarray, since it + * grows far faster than any of the other dynamic data. We try to fit + * the fdarray into comfortable page-tuned chunks: starting at 1024B + * and growing in powers of two from there on. + */ + nr /= (1024 / sizeof(struct file *)); + nr = roundup_pow_of_two(nr + 1); + nr *= (1024 / sizeof(struct file *)); + /* + * Note that this can drive nr *below* what we had passed if sysctl_nr_open + * had been set lower between the check in expand_files() and here. Deal + * with that in caller, it's cheaper that way. + * + * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise + * bitmaps handling below becomes unpleasant, to put it mildly... + */ + if (unlikely(nr > sysctl_nr_open)) + nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; + + fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); + if (!fdt) goto out; + fdt->max_fds = nr; + data = alloc_fdmem(nr * sizeof(struct file *)); + if (!data) + goto out_fdt; + fdt->fd = (struct file **)data; + data = alloc_fdmem(max_t(unsigned int, + 2 * nr / BITS_PER_BYTE, L1_CACHE_BYTES)); + if (!data) + goto out_arr; + fdt->open_fds = (fd_set *)data; + data += nr / BITS_PER_BYTE; + fdt->close_on_exec = (fd_set *)data; + INIT_RCU_HEAD(&fdt->rcu); + fdt->next = NULL; - nfds = fdt->max_fds; - spin_unlock(&files->file_lock); + return fdt; - /* - * Expand to the max in easy steps, and keep expanding it until - * we have enough for the requested fd array size. - */ +out_arr: + free_fdarr(fdt); +out_fdt: + kfree(fdt); +out: + return NULL; +} - do { -#if NR_OPEN_DEFAULT < 256 - if (nfds < 256) - nfds = 256; - else -#endif - if (nfds < (PAGE_SIZE / sizeof(struct file *))) - nfds = PAGE_SIZE / sizeof(struct file *); - else { - nfds = nfds * 2; - if (nfds > NR_OPEN) - nfds = NR_OPEN; - } - } while (nfds <= nr); +/* + * Expand the file descriptor table. + * This function will allocate a new fdtable and both fd array and fdset, of + * the given size. + * Return <0 error code on error; 1 on successful completion. + * The files->file_lock should be held on entry, and will be held on exit. 
+ */ +static int expand_fdtable(struct files_struct *files, int nr) + __releases(files->file_lock) + __acquires(files->file_lock) +{ + struct fdtable *new_fdt, *cur_fdt; - error = -ENOMEM; - new_fds = alloc_fd_array(nfds); + spin_unlock(&files->file_lock); + new_fdt = alloc_fdtable(nr); spin_lock(&files->file_lock); - if (!new_fds) - goto out; - - /* Copy the existing array and install the new pointer */ - fdt = files_fdtable(files); - - if (nfds > fdt->max_fds) { - struct file **old_fds; - int i; - - old_fds = xchg(&fdt->fd, new_fds); - i = xchg(&fdt->max_fds, nfds); - - /* Don't copy/clear the array if we are creating a new - fd array for fork() */ - if (i) { - memcpy(new_fds, old_fds, i * sizeof(struct file *)); - /* clear the remainder of the array */ - memset(&new_fds[i], 0, - (nfds-i) * sizeof(struct file *)); - - spin_unlock(&files->file_lock); - free_fd_array(old_fds, i); - spin_lock(&files->file_lock); - } + if (!new_fdt) + return -ENOMEM; + /* + * extremely unlikely race - sysctl_nr_open decreased between the check in + * caller and alloc_fdtable(). Cheaper to catch it here... + */ + if (unlikely(new_fdt->max_fds <= nr)) { + free_fdarr(new_fdt); + free_fdset(new_fdt); + kfree(new_fdt); + return -EMFILE; + } + /* + * Check again since another task may have expanded the fd table while + * we dropped the lock + */ + cur_fdt = files_fdtable(files); + if (nr >= cur_fdt->max_fds) { + /* Continue as planned */ + copy_fdtable(new_fdt, cur_fdt); + rcu_assign_pointer(files->fdt, new_fdt); + if (cur_fdt->max_fds > NR_OPEN_DEFAULT) + free_fdtable(cur_fdt); } else { - /* Somebody expanded the array while we slept ... */ - spin_unlock(&files->file_lock); - free_fd_array(new_fds, nfds); - spin_lock(&files->file_lock); + /* Somebody else expanded, so undo our attempt */ + free_fdarr(new_fdt); + free_fdset(new_fdt); + kfree(new_fdt); } - error = 0; -out: - return error; + return 1; } /* - * Allocate an fdset array, using kmalloc or vmalloc. - * Note: the array isn't cleared at allocation time. + * Expand files. + * This function will expand the file structures, if the requested size exceeds + * the current capacity and there is room for expansion. + * Return <0 error code on error; 0 when nothing done; 1 when files were + * expanded and execution may have blocked. + * The files->file_lock should be held on entry, and will be held on exit. */ -fd_set * alloc_fdset(int num) +int expand_files(struct files_struct *files, int nr) { - fd_set *new_fdset; - int size = num / 8; + struct fdtable *fdt; - if (size <= PAGE_SIZE) - new_fdset = (fd_set *) kmalloc(size, GFP_KERNEL); - else - new_fdset = (fd_set *) vmalloc(size); - return new_fdset; + fdt = files_fdtable(files); + + /* + * N.B. For clone tasks sharing a files structure, this test + * will limit the total number of files that can be opened. + */ + if (nr >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) + return -EMFILE; + + /* Do we need to expand? */ + if (nr < fdt->max_fds) + return 0; + + /* Can we expand? 
*/ + if (nr >= sysctl_nr_open) + return -EMFILE; + + /* All good, so we try */ + return expand_fdtable(files, nr); } -void free_fdset(fd_set *array, int num) +static int count_open_files(struct fdtable *fdt) { - int size = num / 8; + int size = fdt->max_fds; + int i; - if (num <= __FD_SETSIZE) /* Don't free an embedded fdset */ - return; - else if (size <= PAGE_SIZE) - kfree(array); - else - vfree(array); + /* Find the last open fd */ + for (i = size/(8*sizeof(long)); i > 0; ) { + if (fdt->open_fds->fds_bits[--i]) + break; + } + i = (i+1) * 8 * sizeof(long); + return i; } /* - * Expand the fdset in the files_struct. Called with the files spinlock - * held for write. + * Allocate a new files structure and copy contents from the + * passed in files structure. + * errorp will be valid only when the returned files_struct is NULL. */ -static int expand_fdset(struct files_struct *files, int nr) - __releases(file->file_lock) - __acquires(file->file_lock) +struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) { - fd_set *new_openset = NULL, *new_execset = NULL; - int error, nfds = 0; - struct fdtable *fdt; + struct files_struct *newf; + struct file **old_fds, **new_fds; + int open_files, size, i; + struct fdtable *old_fdt, *new_fdt; - error = -EMFILE; - fdt = files_fdtable(files); - if (fdt->max_fdset >= NR_OPEN || nr >= NR_OPEN) + *errorp = -ENOMEM; + newf = kmem_cache_alloc(files_cachep, GFP_KERNEL); + if (!newf) goto out; - nfds = fdt->max_fdset; - spin_unlock(&files->file_lock); + atomic_set(&newf->count, 1); + + spin_lock_init(&newf->file_lock); + newf->next_fd = 0; + new_fdt = &newf->fdtab; + new_fdt->max_fds = NR_OPEN_DEFAULT; + new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; + new_fdt->open_fds = (fd_set *)&newf->open_fds_init; + new_fdt->fd = &newf->fd_array[0]; + INIT_RCU_HEAD(&new_fdt->rcu); + new_fdt->next = NULL; + + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + open_files = count_open_files(old_fdt); + + /* + * Check whether we need to allocate a larger fd array and fd set. + */ + while (unlikely(open_files > new_fdt->max_fds)) { + spin_unlock(&oldf->file_lock); - /* Expand to the max in easy steps */ - do { - if (nfds < (PAGE_SIZE * 8)) - nfds = PAGE_SIZE * 8; - else { - nfds = nfds * 2; - if (nfds > NR_OPEN) - nfds = NR_OPEN; + if (new_fdt != &newf->fdtab) { + free_fdarr(new_fdt); + free_fdset(new_fdt); + kfree(new_fdt); } - } while (nfds <= nr); - error = -ENOMEM; - new_openset = alloc_fdset(nfds); - new_execset = alloc_fdset(nfds); - spin_lock(&files->file_lock); - if (!new_openset || !new_execset) - goto out; + new_fdt = alloc_fdtable(open_files - 1); + if (!new_fdt) { + *errorp = -ENOMEM; + goto out_release; + } - error = 0; - - /* Copy the existing tables and install the new pointers */ - fdt = files_fdtable(files); - if (nfds > fdt->max_fdset) { - int i = fdt->max_fdset / (sizeof(unsigned long) * 8); - int count = (nfds - fdt->max_fdset) / 8; - - /* - * Don't copy the entire array if the current fdset is - * not yet initialised. + /* beyond sysctl_nr_open; nothing to do */ + if (unlikely(new_fdt->max_fds < open_files)) { + free_fdarr(new_fdt); + free_fdset(new_fdt); + kfree(new_fdt); + *errorp = -EMFILE; + goto out_release; + } + + /* + * Reacquire the oldf lock and a pointer to its fd table + * who knows it may have a new bigger fd table. We need + * the latest pointer. 
*/ - if (i) { - memcpy (new_openset, fdt->open_fds, fdt->max_fdset/8); - memcpy (new_execset, fdt->close_on_exec, fdt->max_fdset/8); - memset (&new_openset->fds_bits[i], 0, count); - memset (&new_execset->fds_bits[i], 0, count); + spin_lock(&oldf->file_lock); + old_fdt = files_fdtable(oldf); + open_files = count_open_files(old_fdt); + } + + old_fds = old_fdt->fd; + new_fds = new_fdt->fd; + + memcpy(new_fdt->open_fds->fds_bits, + old_fdt->open_fds->fds_bits, open_files/8); + memcpy(new_fdt->close_on_exec->fds_bits, + old_fdt->close_on_exec->fds_bits, open_files/8); + + for (i = open_files; i != 0; i--) { + struct file *f = *old_fds++; + if (f) { + get_file(f); + } else { + /* + * The fd may be claimed in the fd bitmap but not yet + * instantiated in the files array if a sibling thread + * is partway through open(). So make sure that this + * fd is available to the new process. + */ + FD_CLR(open_files - i, new_fdt->open_fds); } - - nfds = xchg(&fdt->max_fdset, nfds); - new_openset = xchg(&fdt->open_fds, new_openset); - new_execset = xchg(&fdt->close_on_exec, new_execset); - spin_unlock(&files->file_lock); - free_fdset (new_openset, nfds); - free_fdset (new_execset, nfds); - spin_lock(&files->file_lock); - return 0; - } - /* Somebody expanded the array while we slept ... */ + rcu_assign_pointer(*new_fds++, f); + } + spin_unlock(&oldf->file_lock); + /* compute the remainder to be cleared */ + size = (new_fdt->max_fds - open_files) * sizeof(struct file *); + + /* This is long word aligned thus could use a optimized version */ + memset(new_fds, 0, size); + + if (new_fdt->max_fds > open_files) { + int left = (new_fdt->max_fds-open_files)/8; + int start = open_files / (8 * sizeof(unsigned long)); + + memset(&new_fdt->open_fds->fds_bits[start], 0, left); + memset(&new_fdt->close_on_exec->fds_bits[start], 0, left); + } + + rcu_assign_pointer(newf->fdt, new_fdt); + + return newf; + +out_release: + kmem_cache_free(files_cachep, newf); out: - spin_unlock(&files->file_lock); - if (new_openset) - free_fdset(new_openset, nfds); - if (new_execset) - free_fdset(new_execset, nfds); - spin_lock(&files->file_lock); - return error; + return NULL; +} + +static void __devinit fdtable_defer_list_init(int cpu) +{ + struct fdtable_defer *fddef = &per_cpu(fdtable_defer_list, cpu); + spin_lock_init(&fddef->lock); + INIT_WORK(&fddef->wq, free_fdtable_work); + fddef->next = NULL; +} + +void __init files_defer_init(void) +{ + int i; + for_each_possible_cpu(i) + fdtable_defer_list_init(i); + sysctl_nr_open_max = min((size_t)INT_MAX, ~(size_t)0/sizeof(void *)) & + -BITS_PER_LONG; } +struct files_struct init_files = { + .count = ATOMIC_INIT(1), + .fdt = &init_files.fdtab, + .fdtab = { + .max_fds = NR_OPEN_DEFAULT, + .fd = &init_files.fd_array[0], + .close_on_exec = (fd_set *)&init_files.close_on_exec_init, + .open_fds = (fd_set *)&init_files.open_fds_init, + .rcu = RCU_HEAD_INIT, + }, + .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), +}; + /* - * Expand files. - * Return <0 on error; 0 nothing done; 1 files expanded, we may have blocked. - * Should be called with the files->file_lock spinlock held for write. + * allocate a file descriptor, mark it busy. 
*/ -int expand_files(struct files_struct *files, int nr) +int alloc_fd(unsigned start, unsigned flags) { - int err, expand = 0; + struct files_struct *files = current->files; + unsigned int fd; + int error; struct fdtable *fdt; + spin_lock(&files->file_lock); +repeat: fdt = files_fdtable(files); - if (nr >= fdt->max_fdset) { - expand = 1; - if ((err = expand_fdset(files, nr))) - goto out; - } - if (nr >= fdt->max_fds) { - expand = 1; - if ((err = expand_fd_array(files, nr))) - goto out; + fd = start; + if (fd < files->next_fd) + fd = files->next_fd; + + if (fd < fdt->max_fds) + fd = find_next_zero_bit(fdt->open_fds->fds_bits, + fdt->max_fds, fd); + + error = expand_files(files, fd); + if (error < 0) + goto out; + + /* + * If we needed to expand the fs array we + * might have blocked - try again. + */ + if (error) + goto repeat; + + if (start <= files->next_fd) + files->next_fd = fd + 1; + + FD_SET(fd, fdt->open_fds); + if (flags & O_CLOEXEC) + FD_SET(fd, fdt->close_on_exec); + else + FD_CLR(fd, fdt->close_on_exec); + error = fd; +#if 1 + /* Sanity check */ + if (rcu_dereference(fdt->fd[fd]) != NULL) { + printk(KERN_WARNING "alloc_fd: slot %d not NULL!\n", fd); + rcu_assign_pointer(fdt->fd[fd], NULL); } - err = expand; +#endif + out: - return err; + spin_unlock(&files->file_lock); + return error; +} + +int get_unused_fd(void) +{ + return alloc_fd(0, 0); } +EXPORT_SYMBOL(get_unused_fd);
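
For context only -- the following is not part of the commit above. It is a minimal sketch of how in-kernel code of this era typically consumes the allocator that fs/file.c provides: get_unused_fd(), put_unused_fd() and fd_install() are the real APIs, while the helper name and its error handling are hypothetical.

#include <linux/file.h>	/* get_unused_fd(), put_unused_fd(), fd_install() */
#include <linux/fs.h>	/* struct file */

/*
 * Hypothetical illustration, not part of fs/file.c: reserve a descriptor
 * in current->files, then either publish a struct file in it or hand the
 * slot back.
 */
static int example_install_file(struct file *filp)
{
	int fd = get_unused_fd();	/* marks the slot busy via alloc_fd(0, 0) */

	if (fd < 0)
		return fd;		/* typically -EMFILE or -ENOMEM */

	if (!filp) {
		put_unused_fd(fd);	/* give the reserved slot back */
		return -EINVAL;
	}

	fd_install(fd, filp);		/* publish filp in fdt->fd[fd] */
	return fd;
}

A caller that needs close-on-exec set atomically at allocation time would use alloc_fd(0, O_CLOEXEC) instead, which is exactly the O_CLOEXEC handling added to alloc_fd() above.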