X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Foom_kill.c;h=ea2147dabba60febfb381b8fb788c9a0cbf9ed30;hb=92f7ba70eecf4da8264a767b181cc2090f62d4ad;hp=a0a01902f551dd45b19385b64435ec6708db249d;hpb=fbdd12676c83df77480f00ebd32fc98fbe3bf836;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/oom_kill.c b/mm/oom_kill.c index a0a0190..ea2147d 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -31,9 +31,26 @@ int sysctl_panic_on_oom; int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks; -static DEFINE_SPINLOCK(zone_scan_mutex); +static DEFINE_SPINLOCK(zone_scan_lock); /* #define DEBUG */ +/* + * Is all threads of the target process nodes overlap ours? + */ +static int has_intersects_mems_allowed(struct task_struct *tsk) +{ + struct task_struct *t; + + t = tsk; + do { + if (cpuset_mems_allowed_intersects(current, t)) + return 1; + t = next_thread(t); + } while (t != tsk); + + return 0; +} + /** * badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate @@ -55,9 +72,16 @@ static DEFINE_SPINLOCK(zone_scan_mutex); unsigned long badness(struct task_struct *p, unsigned long uptime) { - unsigned long points, cpu_time, run_time, s; + unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; + int oom_adj = p->signal->oom_adj; + struct task_cputime task_time; + unsigned long utime; + unsigned long stime; + + if (oom_adj == OOM_DISABLE) + return 0; task_lock(p); mm = p->mm; @@ -79,7 +103,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) /* * swapoff can easily use up all memory, so kill those first. */ - if (p->flags & PF_SWAPOFF) + if (p->flags & PF_OOM_ORIGIN) return ULONG_MAX; /* @@ -102,20 +126,21 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * of seconds. There is no particular reason for this other than * that it turned out to work very well in practice. */ - cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime)) - >> (SHIFT_HZ + 3); + thread_group_cputime(p, &task_time); + utime = cputime_to_jiffies(task_time.utime); + stime = cputime_to_jiffies(task_time.stime); + cpu_time = (utime + stime) >> (SHIFT_HZ + 3); + if (uptime >= p->start_time.tv_sec) run_time = (uptime - p->start_time.tv_sec) >> 10; else run_time = 0; - s = int_sqrt(cpu_time); - if (s) - points /= s; - s = int_sqrt(int_sqrt(run_time)); - if (s) - points /= s; + if (cpu_time) + points /= int_sqrt(cpu_time); + if (run_time) + points /= int_sqrt(int_sqrt(run_time)); /* * Niced processes are most likely less important, so double @@ -128,8 +153,8 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * Superuser processes are usually more important, so we make it * less likely that we kill those. */ - if (has_capability(p, CAP_SYS_ADMIN) || - has_capability(p, CAP_SYS_RESOURCE)) + if (has_capability_noaudit(p, CAP_SYS_ADMIN) || + has_capability_noaudit(p, CAP_SYS_RESOURCE)) points /= 4; /* @@ -138,7 +163,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * tend to only have this flag set on applications they think * of as important. */ - if (has_capability(p, CAP_SYS_RAWIO)) + if (has_capability_noaudit(p, CAP_SYS_RAWIO)) points /= 4; /* @@ -146,19 +171,19 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * because p may have allocated or otherwise mapped memory on * this node before. However it will be less likely. */ - if (!cpuset_mems_allowed_intersects(current, p)) + if (!has_intersects_mems_allowed(p)) points /= 8; /* - * Adjust the score by oomkilladj. + * Adjust the score by oom_adj. */ - if (p->oomkilladj) { - if (p->oomkilladj > 0) { + if (oom_adj) { + if (oom_adj > 0) { if (!points) points = 1; - points <<= p->oomkilladj; + points <<= oom_adj; } else - points >>= -(p->oomkilladj); + points >>= -(oom_adj); } #ifdef DEBUG @@ -202,13 +227,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, static struct task_struct *select_bad_process(unsigned long *ppoints, struct mem_cgroup *mem) { - struct task_struct *g, *p; + struct task_struct *p; struct task_struct *chosen = NULL; struct timespec uptime; *ppoints = 0; do_posix_clock_monotonic_gettime(&uptime); - do_each_thread(g, p) { + for_each_process(p) { unsigned long points; /* @@ -253,7 +278,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } - if (p->oomkilladj == OOM_DISABLE) + if (p->signal->oom_adj == OOM_DISABLE) continue; points = badness(p, uptime.tv_sec); @@ -261,7 +286,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, chosen = p; *ppoints = points; } - } while_each_thread(g, p); + } return chosen; } @@ -286,21 +311,27 @@ static void dump_tasks(const struct mem_cgroup *mem) printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " "name\n"); do_each_thread(g, p) { - /* - * total_vm and rss sizes do not exist for tasks with a - * detached mm so there's no need to report them. - */ - if (!p->mm) - continue; + struct mm_struct *mm; + if (mem && !task_in_mem_cgroup(p, mem)) continue; if (!thread_group_leader(p)) continue; task_lock(p); + mm = p->mm; + if (!mm) { + /* + * total_vm and rss sizes do not exist for tasks with no + * mm so there's no need to report them; they can't be + * oom killed anyway. + */ + task_unlock(p); + continue; + } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", - p->pid, p->uid, p->tgid, p->mm->total_vm, - get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj, + p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, + get_mm_rss(mm), (int)task_cpu(p), p->signal->oom_adj, p->comm); task_unlock(p); } while_each_thread(g, p); @@ -342,11 +373,6 @@ static void __oom_kill_task(struct task_struct *p, int verbose) static int oom_kill_task(struct task_struct *p) { - struct mm_struct *mm; - struct task_struct *g, *q; - - mm = p->mm; - /* WARNING: mm may not be dereferenced since we did not obtain its * value from get_task_mm(p). This is OK since all we need to do is * compare mm to q->mm below. @@ -355,30 +381,11 @@ static int oom_kill_task(struct task_struct *p) * change to NULL at any time since we do not hold task_lock(p). * However, this is of no concern to us. */ - - if (mm == NULL) + if (!p->mm || p->signal->oom_adj == OOM_DISABLE) return 1; - /* - * Don't kill the process if any threads are set to OOM_DISABLE - */ - do_each_thread(g, q) { - if (q->mm == mm && q->oomkilladj == OOM_DISABLE) - return 1; - } while_each_thread(g, q); - __oom_kill_task(p, 1); - /* - * kill all processes that share the ->mm (i.e. all threads), - * but are in a different thread group. Don't let them have access - * to memory reserves though, otherwise we might deplete all memory. - */ - do_each_thread(g, q) { - if (q->mm == mm && !same_thread_group(q, p)) - force_sig(SIGKILL, q); - } while_each_thread(g, q); - return 0; } @@ -390,9 +397,14 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, if (printk_ratelimit()) { printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", - current->comm, gfp_mask, order, current->oomkilladj); + "gfp_mask=0x%x, order=%d, oom_adj=%d\n", + current->comm, gfp_mask, order, + current->signal->oom_adj); + task_lock(current); + cpuset_print_task_mems_allowed(current); + task_unlock(current); dump_stack(); + mem_cgroup_print_oom_info(mem, current); show_mem(); if (sysctl_oom_dump_tasks) dump_tasks(mem); @@ -426,7 +438,6 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) unsigned long points = 0; struct task_struct *p; - cgroup_lock(); read_lock(&tasklist_lock); retry: p = select_bad_process(&points, mem); @@ -441,7 +452,6 @@ retry: goto retry; out: read_unlock(&tasklist_lock); - cgroup_unlock(); } #endif @@ -470,7 +480,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) struct zone *zone; int ret = 1; - spin_lock(&zone_scan_mutex); + spin_lock(&zone_scan_lock); for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { if (zone_is_oom_locked(zone)) { ret = 0; @@ -480,7 +490,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { /* - * Lock each zone in the zonelist under zone_scan_mutex so a + * Lock each zone in the zonelist under zone_scan_lock so a * parallel invocation of try_set_zone_oom() doesn't succeed * when it shouldn't. */ @@ -488,7 +498,7 @@ int try_set_zone_oom(struct zonelist *zonelist, gfp_t gfp_mask) } out: - spin_unlock(&zone_scan_mutex); + spin_unlock(&zone_scan_lock); return ret; } @@ -502,11 +512,80 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) struct zoneref *z; struct zone *zone; - spin_lock(&zone_scan_mutex); + spin_lock(&zone_scan_lock); for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) { zone_clear_flag(zone, ZONE_OOM_LOCKED); } - spin_unlock(&zone_scan_mutex); + spin_unlock(&zone_scan_lock); +} + +/* + * Must be called with tasklist_lock held for read. + */ +static void __out_of_memory(gfp_t gfp_mask, int order) +{ + struct task_struct *p; + unsigned long points; + + if (sysctl_oom_kill_allocating_task) + if (!oom_kill_process(current, gfp_mask, order, 0, NULL, + "Out of memory (oom_kill_allocating_task)")) + return; +retry: + /* + * Rambo mode: Shoot down a process and hope it solves whatever + * issues we may have. + */ + p = select_bad_process(&points, NULL); + + if (PTR_ERR(p) == -1UL) + return; + + /* Found nothing?!?! Either we hang forever, or we panic. */ + if (!p) { + read_unlock(&tasklist_lock); + panic("Out of memory and no killable processes...\n"); + } + + if (oom_kill_process(p, gfp_mask, order, points, NULL, + "Out of memory")) + goto retry; +} + +/* + * pagefault handler calls into here because it is out of memory but + * doesn't know exactly how or why. + */ +void pagefault_out_of_memory(void) +{ + unsigned long freed = 0; + + blocking_notifier_call_chain(&oom_notify_list, 0, &freed); + if (freed > 0) + /* Got some memory back in the last second. */ + return; + + /* + * If this is from memcg, oom-killer is already invoked. + * and not worth to go system-wide-oom. + */ + if (mem_cgroup_oom_called(current)) + goto rest_and_return; + + if (sysctl_panic_on_oom) + panic("out of memory from page fault. panic_on_oom is selected.\n"); + + read_lock(&tasklist_lock); + __out_of_memory(0, 0); /* unknown gfp_mask and order */ + read_unlock(&tasklist_lock); + + /* + * Give "p" a good chance of killing itself before we + * retry to allocate memory. + */ +rest_and_return: + if (!test_thread_flag(TIF_MEMDIE)) + schedule_timeout_uninterruptible(1); } /** @@ -522,8 +601,6 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask) */ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) { - struct task_struct *p; - unsigned long points = 0; unsigned long freed = 0; enum oom_constraint constraint; @@ -544,7 +621,7 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) switch (constraint) { case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, gfp_mask, order, points, NULL, + oom_kill_process(current, gfp_mask, order, 0, NULL, "No available memory (MPOL_BIND)"); break; @@ -553,35 +630,10 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) panic("out of memory. panic_on_oom is selected\n"); /* Fall-through */ case CONSTRAINT_CPUSET: - if (sysctl_oom_kill_allocating_task) { - oom_kill_process(current, gfp_mask, order, points, NULL, - "Out of memory (oom_kill_allocating_task)"); - break; - } -retry: - /* - * Rambo mode: Shoot down a process and hope it solves whatever - * issues we may have. - */ - p = select_bad_process(&points, NULL); - - if (PTR_ERR(p) == -1UL) - goto out; - - /* Found nothing?!?! Either we hang forever, or we panic. */ - if (!p) { - read_unlock(&tasklist_lock); - panic("Out of memory and no killable processes...\n"); - } - - if (oom_kill_process(p, gfp_mask, order, points, NULL, - "Out of memory")) - goto retry; - + __out_of_memory(gfp_mask, order); break; } -out: read_unlock(&tasklist_lock); /*