X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Frcutree.c;h=0536125b0497ab0d5f66bac3dc81ecec26fbd0ea;hb=608221fdf9a2170962295dcfbea53dc5c50d1a74;hp=81af59b8dd8865f4cd0e1291427d04ebadbac740;hpb=162cc2794df37662beb7f97ddd1dd5bffaf85e9a;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 81af59b..0536125 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -63,6 +63,9 @@ .gpnum = -300, \ .completed = -300, \ .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .orphan_cbs_list = NULL, \ + .orphan_cbs_tail = &name.orphan_cbs_list, \ + .orphan_qlen = 0, \ .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ @@ -122,6 +125,10 @@ static int blimit = 10; /* Maximum callbacks per softirq. */ static int qhimark = 10000; /* If this many pending, ignore blimit. */ static int qlowmark = 100; /* Once only this many pending, use blimit. */ +module_param(blimit, int, 0); +module_param(qhimark, int, 0); +module_param(qlowmark, int, 0); + static void force_quiescent_state(struct rcu_state *rsp, int relaxed); static int rcu_pending(int cpu); @@ -458,8 +465,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp) long delta; unsigned long flags; struct rcu_node *rnp = rcu_get_root(rsp); - struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; /* Only let one CPU complain about others per time interval. */ @@ -470,18 +475,24 @@ static void print_other_cpu_stall(struct rcu_state *rsp) return; } rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; + + /* + * Now rat on any tasks that got kicked up to the root rcu_node + * due to CPU offlining. + */ + rcu_print_task_stall(rnp); spin_unlock_irqrestore(&rnp->lock, flags); /* OK, time to rat on our buddy... */ printk(KERN_ERR "INFO: RCU detected CPU stalls:"); - for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_for_each_leaf_node(rsp, rnp) { rcu_print_task_stall(rnp); - if (rnp_cur->qsmask == 0) + if (rnp->qsmask == 0) continue; - for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++) - if (rnp_cur->qsmask & (1UL << cpu)) - printk(" %d", rnp_cur->grplo + cpu); + for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) + if (rnp->qsmask & (1UL << cpu)) + printk(" %d", rnp->grplo + cpu); } printk(" (detected by %d, t=%ld jiffies)\n", smp_processor_id(), (long)(jiffies - rsp->gp_start)); @@ -645,7 +656,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * one corresponding to this CPU, due to the fact that we have * irqs disabled. */ - for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) { + rcu_for_each_node_breadth_first(rsp, rnp) { spin_lock(&rnp->lock); /* irqs already disabled. */ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -830,17 +841,63 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) #ifdef CONFIG_HOTPLUG_CPU /* + * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the + * specified flavor of RCU. The callbacks will be adopted by the next + * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever + * comes first. Because this is invoked from the CPU_DYING notifier, + * irqs are already disabled. + */ +static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +{ + int i; + struct rcu_data *rdp = rsp->rda[smp_processor_id()]; + + if (rdp->nxtlist == NULL) + return; /* irqs disabled, so comparison is stable. */ + spin_lock(&rsp->onofflock); /* irqs already disabled. */ + *rsp->orphan_cbs_tail = rdp->nxtlist; + rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; + rdp->nxtlist = NULL; + for (i = 0; i < RCU_NEXT_SIZE; i++) + rdp->nxttail[i] = &rdp->nxtlist; + rsp->orphan_qlen += rdp->qlen; + rdp->qlen = 0; + spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ +} + +/* + * Adopt previously orphaned RCU callbacks. + */ +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ + unsigned long flags; + struct rcu_data *rdp; + + spin_lock_irqsave(&rsp->onofflock, flags); + rdp = rsp->rda[smp_processor_id()]; + if (rsp->orphan_cbs_list == NULL) { + spin_unlock_irqrestore(&rsp->onofflock, flags); + return; + } + *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; + rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail; + rdp->qlen += rsp->orphan_qlen; + rsp->orphan_cbs_list = NULL; + rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; + rsp->orphan_qlen = 0; + spin_unlock_irqrestore(&rsp->onofflock, flags); +} + +/* * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy * and move all callbacks from the outgoing CPU to the current one. */ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) { - int i; unsigned long flags; long lastcomp; unsigned long mask; struct rcu_data *rdp = rsp->rda[cpu]; - struct rcu_data *rdp_me; struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ @@ -856,39 +913,29 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } - rcu_preempt_offline_tasks(rsp, rnp, rdp); + + /* + * If there was a task blocking the current grace period, + * and if all CPUs have checked in, we need to propagate + * the quiescent state up the rcu_node hierarchy. But that + * is inconvenient at the moment due to deadlock issues if + * this should end the current grace period. So set the + * offlined CPU's bit in ->qsmask in order to force the + * next force_quiescent_state() invocation to clean up this + * mess in a deadlock-free manner. + */ + if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask) + rnp->qsmask |= mask; + mask = rnp->grpmask; spin_unlock(&rnp->lock); /* irqs remain disabled. */ rnp = rnp->parent; } while (rnp != NULL); lastcomp = rsp->completed; - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + spin_unlock_irqrestore(&rsp->onofflock, flags); - /* - * Move callbacks from the outgoing CPU to the running CPU. - * Note that the outgoing CPU is now quiescent, so it is now - * (uncharacteristically) safe to access its rcu_data structure. - * Note also that we must carefully retain the order of the - * outgoing CPU's callbacks in order for rcu_barrier() to work - * correctly. Finally, note that we start all the callbacks - * afresh, even those that have passed through a grace period - * and are therefore ready to invoke. The theory is that hotplug - * events are rare, and that if they are frequent enough to - * indefinitely delay callbacks, you have far worse things to - * be worrying about. - */ - rdp_me = rsp->rda[smp_processor_id()]; - if (rdp->nxtlist != NULL) { - *rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist; - rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL]; - rdp->nxtlist = NULL; - for (i = 0; i < RCU_NEXT_SIZE; i++) - rdp->nxttail[i] = &rdp->nxtlist; - rdp_me->qlen += rdp->qlen; - rdp->qlen = 0; - } - local_irq_restore(flags); + rcu_adopt_orphan_cbs(rsp); } /* @@ -906,6 +953,14 @@ static void rcu_offline_cpu(int cpu) #else /* #ifdef CONFIG_HOTPLUG_CPU */ +static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) +{ +} + +static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) +{ +} + static void rcu_offline_cpu(int cpu) { } @@ -916,7 +971,7 @@ static void rcu_offline_cpu(int cpu) * Invoke any RCU callbacks that have made it to the end of their grace * period. Thottle as specified by rdp->blimit. */ -static void rcu_do_batch(struct rcu_data *rdp) +static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) { unsigned long flags; struct rcu_head *next, *list, **tail; @@ -969,6 +1024,13 @@ static void rcu_do_batch(struct rcu_data *rdp) if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) rdp->blimit = blimit; + /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ + if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; + } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) + rdp->qlen_last_fqs_check = rdp->qlen; + local_irq_restore(flags); /* Re-raise the RCU softirq if there are callbacks remaining. */ @@ -1038,33 +1100,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, int cpu; unsigned long flags; unsigned long mask; - struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1]; - struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES]; + struct rcu_node *rnp; - for (; rnp_cur < rnp_end; rnp_cur++) { + rcu_for_each_leaf_node(rsp, rnp) { mask = 0; - spin_lock_irqsave(&rnp_cur->lock, flags); + spin_lock_irqsave(&rnp->lock, flags); if (rsp->completed != lastcomp) { - spin_unlock_irqrestore(&rnp_cur->lock, flags); + spin_unlock_irqrestore(&rnp->lock, flags); return 1; } - if (rnp_cur->qsmask == 0) { - spin_unlock_irqrestore(&rnp_cur->lock, flags); + if (rnp->qsmask == 0) { + spin_unlock_irqrestore(&rnp->lock, flags); continue; } - cpu = rnp_cur->grplo; + cpu = rnp->grplo; bit = 1; - for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) { - if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu])) + for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { + if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) mask |= bit; } if (mask != 0 && rsp->completed == lastcomp) { - /* cpu_quiet_msk() releases rnp_cur->lock. */ - cpu_quiet_msk(mask, rsp, rnp_cur, flags); + /* cpu_quiet_msk() releases rnp->lock. */ + cpu_quiet_msk(mask, rsp, rnp, flags); continue; } - spin_unlock_irqrestore(&rnp_cur->lock, flags); + spin_unlock_irqrestore(&rnp->lock, flags); } return 0; } @@ -1183,7 +1244,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) } /* If there are callbacks ready, invoke them. */ - rcu_do_batch(rdp); + rcu_do_batch(rsp, rdp); } /* @@ -1247,10 +1308,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ } - /* Force the grace period if too many callbacks or too long waiting. */ - if (unlikely(++rdp->qlen > qhimark)) { + /* + * Force the grace period if too many callbacks or too long waiting. + * Enforce hysteresis, and don't invoke force_quiescent_state() + * if some other CPU has recently done so. Also, don't bother + * invoking force_quiescent_state() if the newly enqueued callback + * is the only one waiting for a grace period to complete. + */ + if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { rdp->blimit = LONG_MAX; - force_quiescent_state(rsp, 0); + if (rsp->n_force_qs == rdp->n_force_qs_snap && + *rdp->nxttail[RCU_DONE_TAIL] != head) + force_quiescent_state(rsp, 0); + rdp->n_force_qs_snap = rsp->n_force_qs; + rdp->qlen_last_fqs_check = rdp->qlen; } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) force_quiescent_state(rsp, 1); local_irq_restore(flags); @@ -1356,6 +1427,82 @@ int rcu_needs_cpu(int cpu) rcu_preempt_needs_cpu(cpu); } +static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; +static atomic_t rcu_barrier_cpu_count; +static DEFINE_MUTEX(rcu_barrier_mutex); +static struct completion rcu_barrier_completion; + +static void rcu_barrier_callback(struct rcu_head *notused) +{ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); +} + +/* + * Called with preemption disabled, and from cross-cpu IRQ context. + */ +static void rcu_barrier_func(void *type) +{ + int cpu = smp_processor_id(); + struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu); + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head)); + + atomic_inc(&rcu_barrier_cpu_count); + call_rcu_func = type; + call_rcu_func(head, rcu_barrier_callback); +} + +/* + * Orchestrate the specified type of RCU barrier, waiting for all + * RCU callbacks of the specified type to complete. + */ +static void _rcu_barrier(struct rcu_state *rsp, + void (*call_rcu_func)(struct rcu_head *head, + void (*func)(struct rcu_head *head))) +{ + BUG_ON(in_interrupt()); + /* Take mutex to serialize concurrent rcu_barrier() requests. */ + mutex_lock(&rcu_barrier_mutex); + init_completion(&rcu_barrier_completion); + /* + * Initialize rcu_barrier_cpu_count to 1, then invoke + * rcu_barrier_func() on each CPU, so that each CPU also has + * incremented rcu_barrier_cpu_count. Only then is it safe to + * decrement rcu_barrier_cpu_count -- otherwise the first CPU + * might complete its grace period before all of the other CPUs + * did their increment, causing this function to return too + * early. + */ + atomic_set(&rcu_barrier_cpu_count, 1); + preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */ + rcu_adopt_orphan_cbs(rsp); + on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1); + preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */ + if (atomic_dec_and_test(&rcu_barrier_cpu_count)) + complete(&rcu_barrier_completion); + wait_for_completion(&rcu_barrier_completion); + mutex_unlock(&rcu_barrier_mutex); +} + +/** + * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. + */ +void rcu_barrier_bh(void) +{ + _rcu_barrier(&rcu_bh_state, call_rcu_bh); +} +EXPORT_SYMBOL_GPL(rcu_barrier_bh); + +/** + * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. + */ +void rcu_barrier_sched(void) +{ + _rcu_barrier(&rcu_sched_state, call_rcu_sched); +} +EXPORT_SYMBOL_GPL(rcu_barrier_sched); + /* * Do boot-time initialization of a CPU's per-CPU RCU data. */ @@ -1406,6 +1553,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->beenonline = 1; /* We have now been online. */ rdp->preemptable = preemptable; rdp->passed_quiesc_completed = lastcomp - 1; + rdp->qlen_last_fqs_check = 0; + rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; spin_unlock(&rnp->lock); /* irqs remain disabled. */ @@ -1452,6 +1601,22 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self, case CPU_UP_PREPARE_FROZEN: rcu_online_cpu(cpu); break; + case CPU_DYING: + case CPU_DYING_FROZEN: + /* + * preempt_disable() in _rcu_barrier() prevents stop_machine(), + * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);" + * returns, all online cpus have queued rcu_barrier_func(). + * The dying CPU clears its cpu_online_mask bit and + * moves all of its RCU callbacks to ->orphan_cbs_list + * in the context of stop_machine(), so subsequent calls + * to _rcu_barrier() will adopt these callbacks and only + * then queue rcu_barrier_func() on all remaining CPUs. + */ + rcu_send_cbs_to_orphanage(&rcu_bh_state); + rcu_send_cbs_to_orphanage(&rcu_sched_state); + rcu_preempt_send_cbs_to_orphanage(); + break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: @@ -1514,7 +1679,8 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { - spin_lock_init(&rnp->lock); + if (rnp != rcu_get_root(rsp)) + spin_lock_init(&rnp->lock); rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; @@ -1537,6 +1703,7 @@ static void __init rcu_init_one(struct rcu_state *rsp) INIT_LIST_HEAD(&rnp->blocked_tasks[1]); } } + spin_lock_init(&rcu_get_root(rsp)->lock); } /* @@ -1546,6 +1713,10 @@ static void __init rcu_init_one(struct rcu_state *rsp) */ #define RCU_INIT_FLAVOR(rsp, rcu_data) \ do { \ + int i; \ + int j; \ + struct rcu_node *rnp; \ + \ rcu_init_one(rsp); \ rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \ j = 0; \ @@ -1560,10 +1731,6 @@ do { \ void __init __rcu_init(void) { - int i; /* All used by RCU_INIT_FLAVOR(). */ - int j; - struct rcu_node *rnp; - rcu_bootup_announce(); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n"); @@ -1575,7 +1742,3 @@ void __init __rcu_init(void) } #include "rcutree_plugin.h" - -module_param(blimit, int, 0); -module_param(qhimark, int, 0); -module_param(qlowmark, int, 0);