oom: make oom_score a per-process value
author: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Tue, 22 Sep 2009 00:03:14 +0000 (17:03 -0700)
committer: Linus Torvalds <torvalds@linux-foundation.org>
Tue, 22 Sep 2009 14:17:39 +0000 (07:17 -0700)
The oom-killer kills a process, not a task, so oom_score should be calculated
per-process too.  This makes the scoring more consistent and speeds up
select_bad_process().

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Documentation/filesystems/proc.txt
fs/proc/base.c
mm/oom_kill.c

index ae7f8bb..75988ba 100644 (file)
@@ -1205,7 +1205,7 @@ The following heuristics are then applied:
  * if the task was reniced, its score doubles
  * superuser or direct hardware access tasks (CAP_SYS_ADMIN, CAP_SYS_RESOURCE
        or CAP_SYS_RAWIO) have their score divided by 4
- * if oom condition happened in one cpuset and checked task does not belong
+ * if oom condition happened in one cpuset and checked process does not belong
        to it, its score is divided by 8
  * the resulting score is multiplied by two to the power of oom_adj, i.e.
        points <<= oom_adj when it is positive and
index 81cfff8..71a3425 100644 (file)
@@ -447,7 +447,7 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
 
        do_posix_clock_monotonic_gettime(&uptime);
        read_lock(&tasklist_lock);
-       points = badness(task, uptime.tv_sec);
+       points = badness(task->group_leader, uptime.tv_sec);
        read_unlock(&tasklist_lock);
        return sprintf(buffer, "%lu\n", points);
 }
index 630b77f..3726922 100644 (file)
@@ -34,6 +34,23 @@ int sysctl_oom_dump_tasks;
 static DEFINE_SPINLOCK(zone_scan_lock);
 /* #define DEBUG */
 
+/*
+ * Does any thread of the target process have allowed nodes overlapping ours?
+ */
+static int has_intersects_mems_allowed(struct task_struct *tsk)
+{
+       struct task_struct *t;
+
+       t = tsk;
+       do {
+               if (cpuset_mems_allowed_intersects(current, t))
+                       return 1;       /* one intersecting thread is enough */
+               t = next_thread(t);
+       } while (t != tsk);     /* stop once the walk is back at tsk */
+
+       return 0;
+}
+
 /**
  * badness - calculate a numeric value for how bad this task has been
  * @p: task struct of which task we should calculate
@@ -59,6 +76,9 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
        struct mm_struct *mm;
        struct task_struct *child;
        int oom_adj = p->signal->oom_adj;
+       struct task_cputime task_time;
+       unsigned long utime;
+       unsigned long stime;
 
        if (oom_adj == OOM_DISABLE)
                return 0;
@@ -106,8 +126,11 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
          * of seconds. There is no particular reason for this other than
          * that it turned out to work very well in practice.
         */
-       cpu_time = (cputime_to_jiffies(p->utime) + cputime_to_jiffies(p->stime))
-               >> (SHIFT_HZ + 3);
+       thread_group_cputime(p, &task_time);
+       utime = cputime_to_jiffies(task_time.utime);
+       stime = cputime_to_jiffies(task_time.stime);
+       cpu_time = (utime + stime) >> (SHIFT_HZ + 3);
+
 
        if (uptime >= p->start_time.tv_sec)
                run_time = (uptime - p->start_time.tv_sec) >> 10;
@@ -148,7 +171,7 @@ unsigned long badness(struct task_struct *p, unsigned long uptime)
         * because p may have allocated or otherwise mapped memory on
         * this node before. However it will be less likely.
         */
-       if (!cpuset_mems_allowed_intersects(current, p))
+       if (!has_intersects_mems_allowed(p))
                points /= 8;
 
        /*
@@ -204,13 +227,13 @@ static inline enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 static struct task_struct *select_bad_process(unsigned long *ppoints,
                                                struct mem_cgroup *mem)
 {
-       struct task_struct *g, *p;
+       struct task_struct *p;
        struct task_struct *chosen = NULL;
        struct timespec uptime;
        *ppoints = 0;
 
        do_posix_clock_monotonic_gettime(&uptime);
-       do_each_thread(g, p) {
+       for_each_process(p) {
                unsigned long points;
 
                /*
@@ -263,7 +286,7 @@ static struct task_struct *select_bad_process(unsigned long *ppoints,
                        chosen = p;
                        *ppoints = points;
                }
-       } while_each_thread(g, p);
+       }
 
        return chosen;
 }