X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Foom_kill.c;h=beb592fe9389ffa9fe34c8c92ca82ada55abd960;hb=6a85fa3adddd3a74bd5b94c4b72668d307b88377;hp=12cd4735dc2935afda3b60f0753b0ec5ce212b2d;hpb=5081dde33f7a61d28d9b185cc386f12cb837c7a4;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 12cd473..beb592f 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -15,7 +15,9 @@ * kernel subsystems and hints as to where to find out what things do. */ +#include #include +#include #include #include #include @@ -23,14 +25,19 @@ #include #include #include +#include int sysctl_panic_on_oom; +int sysctl_oom_kill_allocating_task; +int sysctl_oom_dump_tasks; +static DEFINE_SPINLOCK(zone_scan_mutex); /* #define DEBUG */ /** * badness - calculate a numeric value for how bad this task has been * @p: task struct of which task we should calculate * @uptime: current uptime in seconds + * @mem: target memory controller * * The formula used is relatively simple and documented inline in the * function. The main rationale is that we want to select a good task @@ -46,7 +53,8 @@ int sysctl_panic_on_oom; * of least surprise ... (be careful when you change it) */ -unsigned long badness(struct task_struct *p, unsigned long uptime) +unsigned long badness(struct task_struct *p, unsigned long uptime, + struct mem_cgroup *mem) { unsigned long points, cpu_time, run_time, s; struct mm_struct *mm; @@ -60,12 +68,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) } /* - * swapoff can easily use up all memory, so kill those first. - */ - if (p->flags & PF_SWAPOFF) - return ULONG_MAX; - - /* * The memory size of the process is the basis for the badness. */ points = mm->total_vm; @@ -76,6 +78,12 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); /* + * swapoff can easily use up all memory, so kill those first. + */ + if (p->flags & PF_SWAPOFF) + return ULONG_MAX; + + /* * Processes which fork a lot of child processes are likely * a good choice. We add half the vmsize of the children if they * have an own mm. This prevents forking servers to flood the @@ -121,8 +129,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * Superuser processes are usually more important, so we make it * less likely that we kill those. */ - if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_ADMIN) || - p->uid == 0 || p->euid == 0) + if (__capable(p, CAP_SYS_ADMIN) || __capable(p, CAP_SYS_RESOURCE)) points /= 4; /* @@ -131,7 +138,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * tend to only have this flag set on applications they think * of as important. */ - if (cap_t(p->cap_effective) & CAP_TO_MASK(CAP_SYS_RAWIO)) + if (__capable(p, CAP_SYS_RAWIO)) points /= 4; /* @@ -139,46 +146,41 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) * because p may have allocated or otherwise mapped memory on * this node before. However it will be less likely. */ - if (!cpuset_excl_nodes_overlap(p)) + if (!cpuset_mems_allowed_intersects(current, p)) points /= 8; /* * Adjust the score by oomkilladj. */ if (p->oomkilladj) { - if (p->oomkilladj > 0) + if (p->oomkilladj > 0) { + if (!points) + points = 1; points <<= p->oomkilladj; - else + } else points >>= -(p->oomkilladj); } #ifdef DEBUG - printk(KERN_DEBUG "OOMkill: task %d (%s) got %d points\n", + printk(KERN_DEBUG "OOMkill: task %d (%s) got %lu points\n", p->pid, p->comm, points); #endif return points; } /* - * Types of limitations to the nodes from which allocations may occur - */ -#define CONSTRAINT_NONE 1 -#define CONSTRAINT_MEMORY_POLICY 2 -#define CONSTRAINT_CPUSET 3 - -/* * Determine the type of allocation constraint. */ -static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) +static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist, + gfp_t gfp_mask) { #ifdef CONFIG_NUMA struct zone **z; - nodemask_t nodes = node_online_map; + nodemask_t nodes = node_states[N_HIGH_MEMORY]; for (z = zonelist->zones; *z; z++) - if (cpuset_zone_allowed(*z, gfp_mask)) - node_clear((*z)->zone_pgdat->node_id, - nodes); + if (cpuset_zone_allowed_softwall(*z, gfp_mask)) + node_clear(zone_to_nid(*z), nodes); else return CONSTRAINT_CPUSET; @@ -195,7 +197,8 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask) * * (not docbooked, we don't want this one cluttering up the manual) */ -static struct task_struct *select_bad_process(unsigned long *ppoints) +static struct task_struct *select_bad_process(unsigned long *ppoints, + struct mem_cgroup *mem) { struct task_struct *g, *p; struct task_struct *chosen = NULL; @@ -205,14 +208,30 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) do_posix_clock_monotonic_gettime(&uptime); do_each_thread(g, p) { unsigned long points; - int releasing; - /* skip kernel threads */ + /* + * skip kernel threads and tasks which have already released + * their mm. + */ if (!p->mm) continue; - /* skip the init task with pid == 1 */ - if (p->pid == 1) + /* skip the init task */ + if (is_global_init(p)) continue; + if (mem && !task_in_mem_cgroup(p, mem)) + continue; + + /* + * This task already has access to memory reserves and is + * being killed. Don't allow any other task access to the + * memory reserve. + * + * Note: this may have a chance of deadlock if it gets + * blocked waiting for another task which itself is waiting + * for memory. Is there a better alternative? + */ + if (test_tsk_thread_flag(p, TIF_MEMDIE)) + return ERR_PTR(-1UL); /* * This is in the process of releasing memory so wait for it @@ -222,72 +241,102 @@ static struct task_struct *select_bad_process(unsigned long *ppoints) * go ahead if it is exiting: this will simply set TIF_MEMDIE, * which will allow it to gain access to memory reserves in * the process of exiting and releasing its resources. - * Otherwise we could get an OOM deadlock. + * Otherwise we could get an easy OOM deadlock. */ - releasing = test_tsk_thread_flag(p, TIF_MEMDIE) || - p->flags & PF_EXITING; - if (releasing) { - /* PF_DEAD tasks have already released their mm */ - if (p->flags & PF_DEAD) - continue; - if (p->flags & PF_EXITING && p == current) { - chosen = p; - *ppoints = ULONG_MAX; - break; - } - return ERR_PTR(-1UL); + if (p->flags & PF_EXITING) { + if (p != current) + return ERR_PTR(-1UL); + + chosen = p; + *ppoints = ULONG_MAX; } + if (p->oomkilladj == OOM_DISABLE) continue; - points = badness(p, uptime.tv_sec); + points = badness(p, uptime.tv_sec, mem); if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; } } while_each_thread(g, p); + return chosen; } /** - * We must be careful though to never send SIGKILL a process with - * CAP_SYS_RAW_IO set, send SIGTERM instead (but it's unlikely that - * we select a process with CAP_SYS_RAW_IO set). + * dump_tasks - dump current memory state of all system tasks + * @mem: target memory controller + * + * Dumps the current memory state of all system tasks, excluding kernel threads. + * State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj + * score, and name. + * + * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are + * shown. + * + * Call with tasklist_lock read-locked. */ -static void __oom_kill_task(struct task_struct *p, const char *message) +static void dump_tasks(const struct mem_cgroup *mem) { - if (p->pid == 1) { + struct task_struct *g, *p; + + printk(KERN_INFO "[ pid ] uid tgid total_vm rss cpu oom_adj " + "name\n"); + do_each_thread(g, p) { + /* + * total_vm and rss sizes do not exist for tasks with a + * detached mm so there's no need to report them. + */ + if (!p->mm) + continue; + if (mem && !task_in_mem_cgroup(p, mem)) + continue; + + task_lock(p); + printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", + p->pid, p->uid, p->tgid, p->mm->total_vm, + get_mm_rss(p->mm), (int)task_cpu(p), p->oomkilladj, + p->comm); + task_unlock(p); + } while_each_thread(g, p); +} + +/* + * Send SIGKILL to the selected process irrespective of CAP_SYS_RAW_IO + * flag though it's unlikely that we select a process with CAP_SYS_RAW_IO + * set. + */ +static void __oom_kill_task(struct task_struct *p, int verbose) +{ + if (is_global_init(p)) { WARN_ON(1); printk(KERN_WARNING "tried to kill init!\n"); return; } - task_lock(p); - if (!p->mm || p->mm == &init_mm) { + if (!p->mm) { WARN_ON(1); printk(KERN_WARNING "tried to kill an mm-less task!\n"); - task_unlock(p); return; } - task_unlock(p); - if (message) { - printk(KERN_ERR "%s: Killed process %d (%s).\n", - message, p->pid, p->comm); - } + if (verbose) + printk(KERN_ERR "Killed process %d (%s)\n", + task_pid_nr(p), p->comm); /* * We give our sacrificial lamb high priority and access to * all the memory it needs. That way it should be able to * exit() and clear out its resources quickly... */ - p->time_slice = HZ; + p->rt.time_slice = HZ; set_tsk_thread_flag(p, TIF_MEMDIE); force_sig(SIGKILL, p); } -static int oom_kill_task(struct task_struct *p, const char *message) +static int oom_kill_task(struct task_struct *p) { struct mm_struct *mm; struct task_struct *g, *q; @@ -303,49 +352,94 @@ static int oom_kill_task(struct task_struct *p, const char *message) * However, this is of no concern to us. */ - if (mm == NULL || mm == &init_mm) + if (mm == NULL) return 1; - __oom_kill_task(p, message); + /* + * Don't kill the process if any threads are set to OOM_DISABLE + */ + do_each_thread(g, q) { + if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + return 1; + } while_each_thread(g, q); + + __oom_kill_task(p, 1); + /* * kill all processes that share the ->mm (i.e. all threads), - * but are in a different thread group + * but are in a different thread group. Don't let them have access + * to memory reserves though, otherwise we might deplete all memory. */ - do_each_thread(g, q) - if (q->mm == mm && q->tgid != p->tgid) - __oom_kill_task(q, message); - while_each_thread(g, q); + do_each_thread(g, q) { + if (q->mm == mm && !same_thread_group(q, p)) + force_sig(SIGKILL, q); + } while_each_thread(g, q); return 0; } -static int oom_kill_process(struct task_struct *p, unsigned long points, - const char *message) +static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, + unsigned long points, struct mem_cgroup *mem, + const char *message) { struct task_struct *c; - struct list_head *tsk; + + if (printk_ratelimit()) { + printk(KERN_WARNING "%s invoked oom-killer: " + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); + dump_stack(); + show_mem(); + if (sysctl_oom_dump_tasks) + dump_tasks(mem); + } /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly */ if (p->flags & PF_EXITING) { - __oom_kill_task(p, NULL); + __oom_kill_task(p, 0); return 0; } - printk(KERN_ERR "Out of Memory: Kill process %d (%s) score %li" - " and children.\n", p->pid, p->comm, points); + printk(KERN_ERR "%s: kill process %d (%s) score %li or a child\n", + message, task_pid_nr(p), p->comm, points); + /* Try to kill a child first */ - list_for_each(tsk, &p->children) { - c = list_entry(tsk, struct task_struct, sibling); + list_for_each_entry(c, &p->children, sibling) { if (c->mm == p->mm) continue; - if (!oom_kill_task(c, message)) + if (!oom_kill_task(c)) return 0; } - return oom_kill_task(p, message); + return oom_kill_task(p); +} + +#ifdef CONFIG_CGROUP_MEM_RES_CTLR +void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask) +{ + unsigned long points = 0; + struct task_struct *p; + + cgroup_lock(); + read_lock(&tasklist_lock); +retry: + p = select_bad_process(&points, mem); + if (PTR_ERR(p) == -1UL) + goto out; + + if (!p) + p = current; + + if (oom_kill_process(p, gfp_mask, 0, points, mem, + "Memory cgroup out of memory")) + goto retry; +out: + read_unlock(&tasklist_lock); + cgroup_unlock(); } +#endif static BLOCKING_NOTIFIER_HEAD(oom_notify_list); @@ -361,8 +455,62 @@ int unregister_oom_notifier(struct notifier_block *nb) } EXPORT_SYMBOL_GPL(unregister_oom_notifier); +/* + * Try to acquire the OOM killer lock for the zones in zonelist. Returns zero + * if a parallel OOM killing is already taking place that includes a zone in + * the zonelist. Otherwise, locks all zones in the zonelist and returns 1. + */ +int try_set_zone_oom(struct zonelist *zonelist) +{ + struct zone **z; + int ret = 1; + + z = zonelist->zones; + + spin_lock(&zone_scan_mutex); + do { + if (zone_is_oom_locked(*z)) { + ret = 0; + goto out; + } + } while (*(++z) != NULL); + + /* + * Lock each zone in the zonelist under zone_scan_mutex so a parallel + * invocation of try_set_zone_oom() doesn't succeed when it shouldn't. + */ + z = zonelist->zones; + do { + zone_set_flag(*z, ZONE_OOM_LOCKED); + } while (*(++z) != NULL); +out: + spin_unlock(&zone_scan_mutex); + return ret; +} + +/* + * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed + * allocation attempts with zonelists containing them may now recall the OOM + * killer, if necessary. + */ +void clear_zonelist_oom(struct zonelist *zonelist) +{ + struct zone **z; + + z = zonelist->zones; + + spin_lock(&zone_scan_mutex); + do { + zone_clear_flag(*z, ZONE_OOM_LOCKED); + } while (*(++z) != NULL); + spin_unlock(&zone_scan_mutex); +} + /** * out_of_memory - kill the "best" process when we run out of memory + * @zonelist: zonelist pointer + * @gfp_mask: memory allocation flags + * @order: amount of memory being requested as a power of 2 * * If we run out of memory, we have the choice between either * killing a random task (bad), letting the system crash (worse) @@ -374,46 +522,45 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, int order) struct task_struct *p; unsigned long points = 0; unsigned long freed = 0; + enum oom_constraint constraint; blocking_notifier_call_chain(&oom_notify_list, 0, &freed); if (freed > 0) /* Got some memory back in the last second. */ return; - if (printk_ratelimit()) { - printk("oom-killer: gfp_mask=0x%x, order=%d\n", - gfp_mask, order); - dump_stack(); - show_mem(); - } - - cpuset_lock(); - read_lock(&tasklist_lock); + if (sysctl_panic_on_oom == 2) + panic("out of memory. Compulsory panic_on_oom is selected.\n"); /* * Check if there were limitations on the allocation (only relevant for * NUMA) that may require different handling. */ - switch (constrained_alloc(zonelist, gfp_mask)) { + constraint = constrained_alloc(zonelist, gfp_mask); + read_lock(&tasklist_lock); + + switch (constraint) { case CONSTRAINT_MEMORY_POLICY: - oom_kill_process(current, points, + oom_kill_process(current, gfp_mask, order, points, NULL, "No available memory (MPOL_BIND)"); break; - case CONSTRAINT_CPUSET: - oom_kill_process(current, points, - "No available memory in cpuset"); - break; - case CONSTRAINT_NONE: if (sysctl_panic_on_oom) panic("out of memory. panic_on_oom is selected\n"); + /* Fall-through */ + case CONSTRAINT_CPUSET: + if (sysctl_oom_kill_allocating_task) { + oom_kill_process(current, gfp_mask, order, points, NULL, + "Out of memory (oom_kill_allocating_task)"); + break; + } retry: /* * Rambo mode: Shoot down a process and hope it solves whatever * issues we may have. */ - p = select_bad_process(&points); + p = select_bad_process(&points, NULL); if (PTR_ERR(p) == -1UL) goto out; @@ -421,11 +568,11 @@ retry: /* Found nothing?!?! Either we hang forever, or we panic. */ if (!p) { read_unlock(&tasklist_lock); - cpuset_unlock(); panic("Out of memory and no killable processes...\n"); } - if (oom_kill_process(p, points, "Out of memory")) + if (oom_kill_process(p, gfp_mask, order, points, NULL, + "Out of memory")) goto retry; break; @@ -433,7 +580,6 @@ retry: out: read_unlock(&tasklist_lock); - cpuset_unlock(); /* * Give "p" a good chance of killing itself before we