X-Git-Url: http://ftp.safe.ca/?p=safe%2Fjmp%2Flinux-2.6;a=blobdiff_plain;f=kernel%2Facct.c;h=a6605ca921b6a6a9538ac645dadee177ef34f9b9;hp=b756f527497ea8c201d533df64ff053da0084d96;hb=e071041be037eca208b62b84469a06bdfc692bea;hpb=417ef531415c070926b071b75fd1c1ac4b6e2f7e diff --git a/kernel/acct.c b/kernel/acct.c index b756f52..a6605ca 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -43,10 +43,10 @@ * a struct file opened for write. Fixed. 2/6/2000, AV. */ -#include #include #include #include +#include #include #include #include @@ -54,9 +54,11 @@ #include #include #include +#include #include #include #include /* sector_div */ +#include /* * These constants control the amount of freespace that suspend and @@ -73,35 +75,39 @@ int acct_parm[3] = {4, 2, 30}; /* * External references and all of the globals. */ -static void do_acct_process(long, struct file *); +static void do_acct_process(struct bsd_acct_struct *acct, + struct pid_namespace *ns, struct file *); /* * This structure is used so that all the data protected by lock * can be placed in the same cache line as the lock. This primes * the cache line to have the data after getting the lock. */ -struct acct_glbs { - spinlock_t lock; +struct bsd_acct_struct { volatile int active; volatile int needcheck; struct file *file; + struct pid_namespace *ns; struct timer_list timer; + struct list_head list; }; -static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED}; +static DEFINE_SPINLOCK(acct_lock); +static LIST_HEAD(acct_list); /* * Called whenever the timer says to check the free space. */ -static void acct_timeout(unsigned long unused) +static void acct_timeout(unsigned long x) { - acct_globals.needcheck = 1; + struct bsd_acct_struct *acct = (struct bsd_acct_struct *)x; + acct->needcheck = 1; } /* * Check the amount of free space and suspend/resume accordingly. 
*/ -static int check_free_space(struct file *file) +static int check_free_space(struct bsd_acct_struct *acct, struct file *file) { struct kstatfs sbuf; int res; @@ -109,14 +115,14 @@ static int check_free_space(struct file *file) sector_t resume; sector_t suspend; - spin_lock(&acct_globals.lock); - res = acct_globals.active; - if (!file || !acct_globals.needcheck) + spin_lock(&acct_lock); + res = acct->active; + if (!file || !acct->needcheck) goto out; - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); /* May block */ - if (vfs_statfs(file->f_dentry->d_inode->i_sb, &sbuf)) + if (vfs_statfs(file->f_path.dentry, &sbuf)) return res; suspend = sbuf.f_blocks * SUSPEND; resume = sbuf.f_blocks * RESUME; @@ -132,35 +138,35 @@ static int check_free_space(struct file *file) act = 0; /* - * If some joker switched acct_globals.file under us we'ld better be + * If some joker switched acct->file under us we'ld better be * silent and _not_ touch anything. */ - spin_lock(&acct_globals.lock); - if (file != acct_globals.file) { + spin_lock(&acct_lock); + if (file != acct->file) { if (act) res = act>0; goto out; } - if (acct_globals.active) { + if (acct->active) { if (act < 0) { - acct_globals.active = 0; + acct->active = 0; printk(KERN_INFO "Process accounting paused\n"); } } else { if (act > 0) { - acct_globals.active = 1; + acct->active = 1; printk(KERN_INFO "Process accounting resumed\n"); } } - del_timer(&acct_globals.timer); - acct_globals.needcheck = 0; - acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct_globals.timer); - res = acct_globals.active; + del_timer(&acct->timer); + acct->needcheck = 0; + acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; + add_timer(&acct->timer); + res = acct->active; out: - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); return res; } @@ -168,37 +174,100 @@ out: * Close the old accounting file (if currently open) and then replace * it with file (if non-NULL). 
* - * NOTE: acct_globals.lock MUST be held on entry and exit. + * NOTE: acct_lock MUST be held on entry and exit. */ -static void acct_file_reopen(struct file *file) +static void acct_file_reopen(struct bsd_acct_struct *acct, struct file *file, + struct pid_namespace *ns) { struct file *old_acct = NULL; - - if (acct_globals.file) { - old_acct = acct_globals.file; - del_timer(&acct_globals.timer); - acct_globals.active = 0; - acct_globals.needcheck = 0; - acct_globals.file = NULL; + struct pid_namespace *old_ns = NULL; + + if (acct->file) { + old_acct = acct->file; + old_ns = acct->ns; + del_timer(&acct->timer); + acct->active = 0; + acct->needcheck = 0; + acct->file = NULL; + acct->ns = NULL; + list_del(&acct->list); } if (file) { - acct_globals.file = file; - acct_globals.needcheck = 0; - acct_globals.active = 1; + acct->file = file; + acct->ns = ns; + acct->needcheck = 0; + acct->active = 1; + list_add(&acct->list, &acct_list); /* It's been deleted if it was used before so this is safe */ - init_timer(&acct_globals.timer); - acct_globals.timer.function = acct_timeout; - acct_globals.timer.expires = jiffies + ACCT_TIMEOUT*HZ; - add_timer(&acct_globals.timer); + setup_timer(&acct->timer, acct_timeout, (unsigned long)acct); + acct->timer.expires = jiffies + ACCT_TIMEOUT*HZ; + add_timer(&acct->timer); } if (old_acct) { - spin_unlock(&acct_globals.lock); - do_acct_process(0, old_acct); + mnt_unpin(old_acct->f_path.mnt); + spin_unlock(&acct_lock); + do_acct_process(acct, old_ns, old_acct); filp_close(old_acct, NULL); - spin_lock(&acct_globals.lock); + spin_lock(&acct_lock); } } +static int acct_on(char *name) +{ + struct file *file; + struct vfsmount *mnt; + int error; + struct pid_namespace *ns; + struct bsd_acct_struct *acct = NULL; + + /* Difference from BSD - they don't do O_APPEND */ + file = filp_open(name, O_WRONLY|O_APPEND|O_LARGEFILE, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (!S_ISREG(file->f_path.dentry->d_inode->i_mode)) { + filp_close(file, 
NULL); + return -EACCES; + } + + if (!file->f_op->write) { + filp_close(file, NULL); + return -EIO; + } + + ns = task_active_pid_ns(current); + if (ns->bacct == NULL) { + acct = kzalloc(sizeof(struct bsd_acct_struct), GFP_KERNEL); + if (acct == NULL) { + filp_close(file, NULL); + return -ENOMEM; + } + } + + error = security_acct(file); + if (error) { + kfree(acct); + filp_close(file, NULL); + return error; + } + + spin_lock(&acct_lock); + if (ns->bacct == NULL) { + ns->bacct = acct; + acct = NULL; + } + + mnt = file->f_path.mnt; + mnt_pin(mnt); + acct_file_reopen(ns->bacct, file, ns); + spin_unlock(&acct_lock); + + mntput(mnt); /* it's pinned, now give up active reference */ + kfree(acct); + + return 0; +} + /** * sys_acct - enable/disable process accounting * @name: file name for accounting records or NULL to shutdown accounting @@ -210,49 +279,55 @@ static void acct_file_reopen(struct file *file) * should be written. If the filename is NULL, accounting will be * shutdown. */ -asmlinkage long sys_acct(const char __user *name) +SYSCALL_DEFINE1(acct, const char __user *, name) { - struct file *file = NULL; - char *tmp; int error; if (!capable(CAP_SYS_PACCT)) return -EPERM; if (name) { - tmp = getname(name); - if (IS_ERR(tmp)) { + char *tmp = getname(name); + if (IS_ERR(tmp)) return (PTR_ERR(tmp)); - } - /* Difference from BSD - they don't do O_APPEND */ - file = filp_open(tmp, O_WRONLY|O_APPEND|O_LARGEFILE, 0); + error = acct_on(tmp); putname(tmp); - if (IS_ERR(file)) { - return (PTR_ERR(file)); - } - if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { - filp_close(file, NULL); - return (-EACCES); - } + } else { + struct bsd_acct_struct *acct; - if (!file->f_op->write) { - filp_close(file, NULL); - return (-EIO); - } - } + acct = task_active_pid_ns(current)->bacct; + if (acct == NULL) + return 0; - error = security_acct(file); - if (error) { - if (file) - filp_close(file, NULL); - return error; + error = security_acct(NULL); + if (!error) { + spin_lock(&acct_lock); + 
acct_file_reopen(acct, NULL, NULL); + spin_unlock(&acct_lock); + } } + return error; +} - spin_lock(&acct_globals.lock); - acct_file_reopen(file); - spin_unlock(&acct_globals.lock); - - return (0); +/** + * acct_auto_close - turn off a filesystem's accounting if it is on + * @m: vfsmount being shut down + * + * If the accounting is turned on for a file in the subtree pointed to + * to by m, turn accounting off. Done when m is about to die. + */ +void acct_auto_close_mnt(struct vfsmount *m) +{ + struct bsd_acct_struct *acct; + + spin_lock(&acct_lock); +restart: + list_for_each_entry(acct, &acct_list, list) + if (acct->file && acct->file->f_path.mnt == m) { + acct_file_reopen(acct, NULL, NULL); + goto restart; + } + spin_unlock(&acct_lock); } /** @@ -264,12 +339,31 @@ asmlinkage long sys_acct(const char __user *name) */ void acct_auto_close(struct super_block *sb) { - spin_lock(&acct_globals.lock); - if (acct_globals.file && - acct_globals.file->f_dentry->d_inode->i_sb == sb) { - acct_file_reopen((struct file *)NULL); + struct bsd_acct_struct *acct; + + spin_lock(&acct_lock); +restart: + list_for_each_entry(acct, &acct_list, list) + if (acct->file && acct->file->f_path.mnt->mnt_sb == sb) { + acct_file_reopen(acct, NULL, NULL); + goto restart; + } + spin_unlock(&acct_lock); +} + +void acct_exit_ns(struct pid_namespace *ns) +{ + struct bsd_acct_struct *acct; + + spin_lock(&acct_lock); + acct = ns->bacct; + if (acct != NULL) { + if (acct->file != NULL) + acct_file_reopen(acct, NULL, NULL); + + kfree(acct); } - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); } /* @@ -296,16 +390,16 @@ static comp_t encode_comp_t(unsigned long value) } /* - * If we need to round up, do it (and handle overflow correctly). - */ + * If we need to round up, do it (and handle overflow correctly). + */ if (rnd && (++value > MAXFRACT)) { value >>= EXPSIZE; exp++; } /* - * Clean it up and polish it off. - */ + * Clean it up and polish it off. 
+ */ exp <<= MANTSIZE; /* Shift the exponent into place */ exp += value; /* and add on the mantissa. */ return exp; @@ -328,30 +422,30 @@ static comp_t encode_comp_t(unsigned long value) static comp2_t encode_comp2_t(u64 value) { - int exp, rnd; - - exp = (value > (MAXFRACT2>>1)); - rnd = 0; - while (value > MAXFRACT2) { - rnd = value & 1; - value >>= 1; - exp++; - } - - /* - * If we need to round up, do it (and handle overflow correctly). - */ - if (rnd && (++value > MAXFRACT2)) { - value >>= 1; - exp++; - } - - if (exp > MAXEXP2) { - /* Overflow. Return largest representable number instead. */ - return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1; - } else { - return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1)); - } + int exp, rnd; + + exp = (value > (MAXFRACT2>>1)); + rnd = 0; + while (value > MAXFRACT2) { + rnd = value & 1; + value >>= 1; + exp++; + } + + /* + * If we need to round up, do it (and handle overflow correctly). + */ + if (rnd && (++value > MAXFRACT2)) { + value >>= 1; + exp++; + } + + if (exp > MAXEXP2) { + /* Overflow. Return largest representable number instead. */ + return (1ul << (MANTSIZE2+EXPSIZE2-1)) - 1; + } else { + return (value & (MAXFRACT2>>1)) | (exp << (MANTSIZE2-1)); + } } #endif @@ -380,28 +474,34 @@ static u32 encode_float(u64 value) * The acct_process() call is the workhorse of the process * accounting system. The struct acct is built here and then written * into the accounting file. This function should only be called from - * do_exit(). + * do_exit() or when switching to a different output file. */ /* * do_acct_process does all actual work. Caller holds the reference to file. 
*/ -static void do_acct_process(long exitcode, struct file *file) +static void do_acct_process(struct bsd_acct_struct *acct, + struct pid_namespace *ns, struct file *file) { + struct pacct_struct *pacct = &current->signal->pacct; acct_t ac; mm_segment_t fs; - unsigned long vsize; unsigned long flim; u64 elapsed; u64 run_time; struct timespec uptime; + struct tty_struct *tty; + const struct cred *orig_cred; + + /* Perform file operations on behalf of whoever enabled accounting */ + orig_cred = override_creds(file->f_cred); /* * First check to see if there is enough free_space to continue * the process accounting system. */ - if (!check_free_space(file)) - return; + if (!check_free_space(acct, file)) + goto out; /* * Fill the accounting struct with the needed info as recorded @@ -415,8 +515,8 @@ static void do_acct_process(long exitcode, struct file *file) /* calculate run_time in nsec*/ do_posix_clock_monotonic_gettime(&uptime); run_time = (u64)uptime.tv_sec*NSEC_PER_SEC + uptime.tv_nsec; - run_time -= (u64)current->start_time.tv_sec*NSEC_PER_SEC - + current->start_time.tv_nsec; + run_time -= (u64)current->group_leader->start_time.tv_sec * NSEC_PER_SEC + + current->group_leader->start_time.tv_nsec; /* convert nsec -> AHZ */ elapsed = nsec_to_AHZ(run_time); #if ACCT_VERSION==3 @@ -434,139 +534,151 @@ static void do_acct_process(long exitcode, struct file *file) } #endif do_div(elapsed, AHZ); - ac.ac_btime = xtime.tv_sec - elapsed; - ac.ac_utime = encode_comp_t(jiffies_to_AHZ( - current->signal->utime + - current->group_leader->utime)); - ac.ac_stime = encode_comp_t(jiffies_to_AHZ( - current->signal->stime + - current->group_leader->stime)); + ac.ac_btime = get_seconds() - elapsed; /* we really need to bite the bullet and change layout */ - ac.ac_uid = current->uid; - ac.ac_gid = current->gid; + ac.ac_uid = orig_cred->uid; + ac.ac_gid = orig_cred->gid; #if ACCT_VERSION==2 ac.ac_ahz = AHZ; #endif #if ACCT_VERSION==1 || ACCT_VERSION==2 /* backward-compatible 16 bit fields */ 
- ac.ac_uid16 = current->uid; - ac.ac_gid16 = current->gid; + ac.ac_uid16 = ac.ac_uid; + ac.ac_gid16 = ac.ac_gid; #endif #if ACCT_VERSION==3 - ac.ac_pid = current->tgid; - ac.ac_ppid = current->parent->tgid; + ac.ac_pid = task_tgid_nr_ns(current, ns); + rcu_read_lock(); + ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns); + rcu_read_unlock(); #endif - read_lock(&tasklist_lock); /* pin current->signal */ - ac.ac_tty = current->signal->tty ? - old_encode_dev(tty_devnum(current->signal->tty)) : 0; - read_unlock(&tasklist_lock); - - ac.ac_flag = 0; - if (current->flags & PF_FORKNOEXEC) - ac.ac_flag |= AFORK; - if (current->flags & PF_SUPERPRIV) - ac.ac_flag |= ASU; - if (current->flags & PF_DUMPCORE) - ac.ac_flag |= ACORE; - if (current->flags & PF_SIGNALED) - ac.ac_flag |= AXSIG; - - vsize = 0; - if (current->mm) { - struct vm_area_struct *vma; - down_read(&current->mm->mmap_sem); - vma = current->mm->mmap; - while (vma) { - vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - up_read(&current->mm->mmap_sem); - } - vsize = vsize / 1024; - ac.ac_mem = encode_comp_t(vsize); + spin_lock_irq(&current->sighand->siglock); + tty = current->signal->tty; /* Safe as we hold the siglock */ + ac.ac_tty = tty ? 
old_encode_dev(tty_devnum(tty)) : 0; + ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); + ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); + ac.ac_flag = pacct->ac_flag; + ac.ac_mem = encode_comp_t(pacct->ac_mem); + ac.ac_minflt = encode_comp_t(pacct->ac_minflt); + ac.ac_majflt = encode_comp_t(pacct->ac_majflt); + ac.ac_exitcode = pacct->ac_exitcode; + spin_unlock_irq(&current->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); - ac.ac_minflt = encode_comp_t(current->signal->min_flt + - current->group_leader->min_flt); - ac.ac_majflt = encode_comp_t(current->signal->maj_flt + - current->group_leader->maj_flt); ac.ac_swaps = encode_comp_t(0); - ac.ac_exitcode = exitcode; /* - * Kernel segment override to datasegment and write it - * to the accounting file. - */ + * Kernel segment override to datasegment and write it + * to the accounting file. + */ fs = get_fs(); set_fs(KERNEL_DS); /* - * Accounting records are not subject to resource limits. - */ + * Accounting records are not subject to resource limits. 
+ */ flim = current->signal->rlim[RLIMIT_FSIZE].rlim_cur; current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; file->f_op->write(file, (char *)&ac, sizeof(acct_t), &file->f_pos); current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim; set_fs(fs); +out: + revert_creds(orig_cred); } /** - * acct_process - now just a wrapper around do_acct_process + * acct_init_pacct - initialize a new pacct_struct + * @pacct: per-process accounting info struct to initialize + */ +void acct_init_pacct(struct pacct_struct *pacct) +{ + memset(pacct, 0, sizeof(struct pacct_struct)); + pacct->ac_utime = pacct->ac_stime = cputime_zero; +} + +/** + * acct_collect - collect accounting information into pacct_struct * @exitcode: task exit code - * - * handles process accounting for an exiting task + * @group_dead: not 0, if this thread is the last one in the process. */ -void acct_process(long exitcode) +void acct_collect(long exitcode, int group_dead) +{ + struct pacct_struct *pacct = &current->signal->pacct; + unsigned long vsize = 0; + + if (group_dead && current->mm) { + struct vm_area_struct *vma; + down_read(&current->mm->mmap_sem); + vma = current->mm->mmap; + while (vma) { + vsize += vma->vm_end - vma->vm_start; + vma = vma->vm_next; + } + up_read(&current->mm->mmap_sem); + } + + spin_lock_irq(&current->sighand->siglock); + if (group_dead) + pacct->ac_mem = vsize / 1024; + if (thread_group_leader(current)) { + pacct->ac_exitcode = exitcode; + if (current->flags & PF_FORKNOEXEC) + pacct->ac_flag |= AFORK; + } + if (current->flags & PF_SUPERPRIV) + pacct->ac_flag |= ASU; + if (current->flags & PF_DUMPCORE) + pacct->ac_flag |= ACORE; + if (current->flags & PF_SIGNALED) + pacct->ac_flag |= AXSIG; + pacct->ac_utime = cputime_add(pacct->ac_utime, current->utime); + pacct->ac_stime = cputime_add(pacct->ac_stime, current->stime); + pacct->ac_minflt += current->min_flt; + pacct->ac_majflt += current->maj_flt; + spin_unlock_irq(&current->sighand->siglock); +} + +static void acct_process_in_ns(struct pid_namespace *ns) 
{ struct file *file = NULL; + struct bsd_acct_struct *acct; + acct = ns->bacct; /* * accelerate the common fastpath: */ - if (!acct_globals.file) + if (!acct || !acct->file) return; - spin_lock(&acct_globals.lock); - file = acct_globals.file; + spin_lock(&acct_lock); + file = acct->file; if (unlikely(!file)) { - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); return; } get_file(file); - spin_unlock(&acct_globals.lock); + spin_unlock(&acct_lock); - do_acct_process(exitcode, file); + do_acct_process(acct, ns, file); fput(file); } - /** - * acct_update_integrals - update mm integral fields in task_struct - * @tsk: task_struct for accounting + * acct_process - now just a wrapper around acct_process_in_ns, + * which in turn is a wrapper around do_acct_process. + * + * handles process accounting for an exiting task */ -void acct_update_integrals(struct task_struct *tsk) +void acct_process(void) { - if (likely(tsk->mm)) { - long delta = tsk->stime - tsk->acct_stimexpd; - - if (delta == 0) - return; - tsk->acct_stimexpd = tsk->stime; - tsk->acct_rss_mem1 += delta * get_mm_counter(tsk->mm, rss); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - } -} + struct pid_namespace *ns; -/** - * acct_clear_integrals - clear the mm integral fields in task_struct - * @tsk: task_struct whose accounting fields are cleared - */ -void acct_clear_integrals(struct task_struct *tsk) -{ - if (tsk) { - tsk->acct_stimexpd = 0; - tsk->acct_rss_mem1 = 0; - tsk->acct_vm_mem1 = 0; - } + /* + * This loop is safe lockless, since current is still + * alive and holds its namespace, which in turn holds + * its parent. + */ + for (ns = task_active_pid_ns(current); ns != NULL; ns = ns->parent) + acct_process_in_ns(ns); }