SAFE public projects git trees. - safe/jmp/linux-2.6/blob - kernel/rcuclassic.c

   1 /*
   2  * Read-Copy Update mechanism for mutual exclusion
   3  *
   4  * This program is free software; you can redistribute it and/or modify
   5  * it under the terms of the GNU General Public License as published by
   6  * the Free Software Foundation; either version 2 of the License, or
   7  * (at your option) any later version.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write to the Free Software
  16  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
  17  *
  18  * Copyright IBM Corporation, 2001
  19  *
  20  * Authors: Dipankar Sarma <dipankar@in.ibm.com>
  21  *          Manfred Spraul <manfred@colorfullife.com>
  22  *
  23  * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
  24  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
  25  * Papers:
  26  * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
  27  * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
  28  *
  29  * For detailed explanation of Read-Copy Update mechanism see -
  30  *              Documentation/RCU
  31  *
  32  */
  33 #include <linux/types.h>
  34 #include <linux/kernel.h>
  35 #include <linux/init.h>
  36 #include <linux/spinlock.h>
  37 #include <linux/smp.h>
  38 #include <linux/rcupdate.h>
  39 #include <linux/interrupt.h>
  40 #include <linux/sched.h>
  41 #include <asm/atomic.h>
  42 #include <linux/bitops.h>
  43 #include <linux/module.h>
  44 #include <linux/completion.h>
  45 #include <linux/moduleparam.h>
  46 #include <linux/percpu.h>
  47 #include <linux/notifier.h>
  48 #include <linux/cpu.h>
  49 #include <linux/mutex.h>
  50
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/*
 * Lockdep map named "rcu_read_lock", keyed by rcu_lock_key.  It is only
 * built when CONFIG_DEBUG_LOCK_ALLOC is set, and it is exported (GPL-only)
 * so that code outside this file can reference it.
 */
static struct lock_class_key rcu_lock_key;
struct lockdep_map rcu_lock_map =
        STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
EXPORT_SYMBOL_GPL(rcu_lock_map);
#endif
  57
  58
/*
 * Definitions of the two rcupdate control blocks: rcu_ctrlblk for classic
 * RCU and rcu_bh_ctrlblk for the "bh" flavour.  Both start out with
 * cur == completed == pending (no batch running, none requested), an
 * unlocked spinlock and an empty cpumask.
 *
 * NOTE(review): the -300 starting value is presumably chosen so that the
 * batch counters cross zero shortly after boot -- confirm.
 */
static struct rcu_ctrlblk rcu_ctrlblk = {
        .cur = -300,
        .completed = -300,
        .pending = -300,
        .lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
        .cpumask = CPU_MASK_NONE,
};
static struct rcu_ctrlblk rcu_bh_ctrlblk = {
        .cur = -300,
        .completed = -300,
        .pending = -300,
        .lock = __SPIN_LOCK_UNLOCKED(&rcu_bh_ctrlblk.lock),
        .cpumask = CPU_MASK_NONE,
};
  74
/* Per-CPU callback-queue state for the classic and "bh" RCU flavours. */
DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };

/* Default cap on the number of callbacks rcu_do_batch() invokes per pass. */
static int blimit = 10;
/* Queue length above which call_rcu*() lifts the cap and forces a quiescent state. */
static int qhimark = 10000;
/* Queue length at or below which rcu_do_batch() restores the default cap. */
static int qlowmark = 100;
  81
  82 #ifdef CONFIG_SMP
  83 static void force_quiescent_state(struct rcu_data *rdp,
  84                         struct rcu_ctrlblk *rcp)
  85 {
  86         int cpu;
  87         cpumask_t cpumask;
  88         set_need_resched();
  89         if (unlikely(!rcp->signaled)) {
  90                 rcp->signaled = 1;
  91                 /*
  92                  * Don't send IPI to itself. With irqs disabled,
  93                  * rdp->cpu is the current cpu.
  94                  *
  95                  * cpu_online_map is updated by the _cpu_down()
  96                  * using stop_machine_run(). Since we're in irqs disabled
  97                  * section, stop_machine_run() is not exectuting, hence
  98                  * the cpu_online_map is stable.
  99                  *
 100                  * However,  a cpu might have been offlined _just_ before
 101                  * we disabled irqs while entering here.
 102                  * And rcu subsystem might not yet have handled the CPU_DEAD
 103                  * notification, leading to the offlined cpu's bit
 104                  * being set in the rcp->cpumask.
 105                  *
 106                  * Hence cpumask = (rcp->cpumask & cpu_online_map) to prevent
 107                  * sending smp_reschedule() to an offlined CPU.
 108                  */
 109                 cpus_and(cpumask, rcp->cpumask, cpu_online_map);
 110                 cpu_clear(rdp->cpu, cpumask);
 111                 for_each_cpu_mask(cpu, cpumask)
 112                         smp_send_reschedule(cpu);
 113         }
 114 }
 115 #else
 116 static inline void force_quiescent_state(struct rcu_data *rdp,
 117                         struct rcu_ctrlblk *rcp)
 118 {
 119         set_need_resched();
 120 }
 121 #endif
 122
 123 /**
 124  * call_rcu - Queue an RCU callback for invocation after a grace period.
 125  * @head: structure to be used for queueing the RCU updates.
 126  * @func: actual update function to be invoked after the grace period
 127  *
 128  * The update function will be invoked some time after a full grace
 129  * period elapses, in other words after all currently executing RCU
 130  * read-side critical sections have completed.  RCU read-side critical
 131  * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
 132  * and may be nested.
 133  */
 134 void call_rcu(struct rcu_head *head,
 135                                 void (*func)(struct rcu_head *rcu))
 136 {
 137         unsigned long flags;
 138         struct rcu_data *rdp;
 139
 140         head->func = func;
 141         head->next = NULL;
 142         local_irq_save(flags);
 143         rdp = &__get_cpu_var(rcu_data);
 144         *rdp->nxttail = head;
 145         rdp->nxttail = &head->next;
 146         if (unlikely(++rdp->qlen > qhimark)) {
 147                 rdp->blimit = INT_MAX;
 148                 force_quiescent_state(rdp, &rcu_ctrlblk);
 149         }
 150         local_irq_restore(flags);
 151 }
 152 EXPORT_SYMBOL_GPL(call_rcu);
 153
 154 /**
 155  * call_rcu_bh - Queue an RCU for invocation after a quicker grace period.
 156  * @head: structure to be used for queueing the RCU updates.
 157  * @func: actual update function to be invoked after the grace period
 158  *
 159  * The update function will be invoked some time after a full grace
 160  * period elapses, in other words after all currently executing RCU
 161  * read-side critical sections have completed. call_rcu_bh() assumes
 162  * that the read-side critical sections end on completion of a softirq
 163  * handler. This means that read-side critical sections in process
 164  * context must not be interrupted by softirqs. This interface is to be
 165  * used when most of the read-side critical sections are in softirq context.
 166  * RCU read-side critical sections are delimited by rcu_read_lock() and
 167  * rcu_read_unlock(), * if in interrupt context or rcu_read_lock_bh()
 168  * and rcu_read_unlock_bh(), if in process context. These may be nested.
 169  */
 170 void call_rcu_bh(struct rcu_head *head,
 171                                 void (*func)(struct rcu_head *rcu))
 172 {
 173         unsigned long flags;
 174         struct rcu_data *rdp;
 175
 176         head->func = func;
 177         head->next = NULL;
 178         local_irq_save(flags);
 179         rdp = &__get_cpu_var(rcu_bh_data);
 180         *rdp->nxttail = head;
 181         rdp->nxttail = &head->next;
 182
 183         if (unlikely(++rdp->qlen > qhimark)) {
 184                 rdp->blimit = INT_MAX;
 185                 force_quiescent_state(rdp, &rcu_bh_ctrlblk);
 186         }
 187
 188         local_irq_restore(flags);
 189 }
 190 EXPORT_SYMBOL_GPL(call_rcu_bh);
 191
/*
 * Return the number of the most recently completed classic-RCU batch
 * (rcu_ctrlblk.completed).  The value is read without taking the control
 * block lock.  Useful for debug and statistics.
 */
long rcu_batches_completed(void)
{
        return rcu_ctrlblk.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed);
 201
/*
 * Return the number of the most recently completed RCU-bh batch
 * (rcu_bh_ctrlblk.completed).  The value is read without taking the control
 * block lock.  Useful for debug and statistics.
 */
long rcu_batches_completed_bh(void)
{
        return rcu_bh_ctrlblk.completed;
}
EXPORT_SYMBOL_GPL(rcu_batches_completed_bh);
 211
/*
 * Raise RCU_SOFTIRQ so that rcu_process_callbacks() runs on this CPU,
 * then issue a full memory barrier.
 */
static inline void raise_rcu_softirq(void)
{
        raise_softirq(RCU_SOFTIRQ);
        /*
         * The smp_mb() here is required to ensure that this cpu's
         * __rcu_process_callbacks() reads the most recently updated
         * value of rcp->cur.
         */
        smp_mb();
}
 223
 224 /*
 225  * Invoke the completed RCU callbacks. They are expected to be in
 226  * a per-cpu list.
 227  */
 228 static void rcu_do_batch(struct rcu_data *rdp)
 229 {
 230         struct rcu_head *next, *list;
 231         int count = 0;
 232
 233         list = rdp->donelist;
 234         while (list) {
 235                 next = list->next;
 236                 prefetch(next);
 237                 list->func(list);
 238                 list = next;
 239                 if (++count >= rdp->blimit)
 240                         break;
 241         }
 242         rdp->donelist = list;
 243
 244         local_irq_disable();
 245         rdp->qlen -= count;
 246         local_irq_enable();
 247         if (rdp->blimit == INT_MAX && rdp->qlen <= qlowmark)
 248                 rdp->blimit = blimit;
 249
 250         if (!rdp->donelist)
 251                 rdp->donetail = &rdp->donelist;
 252         else
 253                 raise_rcu_softirq();
 254 }
 255
/*
 * Grace period handling:
 * The grace period handling consists of two steps:
 * - A new grace period is started.
 *   This is done by rcu_start_batch. The start is not broadcast to
 *   all cpus; they must pick it up by comparing rcp->cur with
 *   rdp->quiescbatch. All cpus are recorded in the
 *   rcu_ctrlblk.cpumask bitmap.
 * - All cpus must go through a quiescent state.
 *   Since the start of the grace period is not broadcast, at least two
 *   calls to rcu_check_quiescent_state are required:
 *   The first call just notices that a new grace period is running. The
 *   following calls check whether there was a quiescent state since the
 *   beginning of the grace period. If so, the cpu is cleared from
 *   rcu_ctrlblk.cpumask. If the bitmap is then empty, the grace period is
 *   completed.
 *   rcu_check_quiescent_state calls cpu_quiet, which in turn calls
 *   rcu_start_batch to start the next grace period (if necessary).
 */
 274 /*
 275  * Register a new batch of callbacks, and start it up if there is currently no
 276  * active batch and the batch to be registered has not already occurred.
 277  * Caller must hold rcu_ctrlblk.lock.
 278  */
 279 static void rcu_start_batch(struct rcu_ctrlblk *rcp)
 280 {
 281         if (rcp->cur != rcp->pending &&
 282                         rcp->completed == rcp->cur) {
 283                 rcp->cur++;
 284
 285                 /*
 286                  * Accessing nohz_cpu_mask before incrementing rcp->cur needs a
 287                  * Barrier  Otherwise it can cause tickless idle CPUs to be
 288                  * included in rcp->cpumask, which will extend graceperiods
 289                  * unnecessarily.
 290                  */
 291                 smp_mb();
 292                 cpus_andnot(rcp->cpumask, cpu_online_map, nohz_cpu_mask);
 293
 294                 rcp->signaled = 0;
 295         }
 296 }
 297
 298 /*
 299  * cpu went through a quiescent state since the beginning of the grace period.
 300  * Clear it from the cpu mask and complete the grace period if it was the last
 301  * cpu. Start another grace period if someone has further entries pending
 302  */
 303 static void cpu_quiet(int cpu, struct rcu_ctrlblk *rcp)
 304 {
 305         cpu_clear(cpu, rcp->cpumask);
 306         if (cpus_empty(rcp->cpumask)) {
 307                 /* batch completed ! */
 308                 rcp->completed = rcp->cur;
 309                 rcu_start_batch(rcp);
 310         }
 311 }
 312
 313 /*
 314  * Check if the cpu has gone through a quiescent state (say context
 315  * switch). If so and if it already hasn't done so in this RCU
 316  * quiescent cycle, then indicate that it has done so.
 317  */
 318 static void rcu_check_quiescent_state(struct rcu_ctrlblk *rcp,
 319                                         struct rcu_data *rdp)
 320 {
 321         if (rdp->quiescbatch != rcp->cur) {
 322                 /* start new grace period: */
 323                 rdp->qs_pending = 1;
 324                 rdp->passed_quiesc = 0;
 325                 rdp->quiescbatch = rcp->cur;
 326                 return;
 327         }
 328
 329         /* Grace period already completed for this cpu?
 330          * qs_pending is checked instead of the actual bitmap to avoid
 331          * cacheline trashing.
 332          */
 333         if (!rdp->qs_pending)
 334                 return;
 335
 336         /*
 337          * Was there a quiescent state since the beginning of the grace
 338          * period? If no, then exit and wait for the next call.
 339          */
 340         if (!rdp->passed_quiesc)
 341                 return;
 342         rdp->qs_pending = 0;
 343
 344         spin_lock(&rcp->lock);
 345         /*
 346          * rdp->quiescbatch/rcp->cur and the cpu bitmap can come out of sync
 347          * during cpu startup. Ignore the quiescent state.
 348          */
 349         if (likely(rdp->quiescbatch == rcp->cur))
 350                 cpu_quiet(rdp->cpu, rcp);
 351
 352         spin_unlock(&rcp->lock);
 353 }
 354
 355
 356 #ifdef CONFIG_HOTPLUG_CPU
 357
 358 /* warning! helper for rcu_offline_cpu. do not use elsewhere without reviewing
 359  * locking requirements, the list it's pulling from has to belong to a cpu
 360  * which is dead and hence not processing interrupts.
 361  */
 362 static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
 363                                 struct rcu_head **tail)
 364 {
 365         local_irq_disable();
 366         *this_rdp->nxttail = list;
 367         if (list)
 368                 this_rdp->nxttail = tail;
 369         local_irq_enable();
 370 }
 371
/*
 * Take over the RCU state of an offlined cpu for one RCU flavour.
 *
 * @this_rdp: rcu_data of the cpu that adopts the callbacks
 * @rcp:      control block of the flavour
 * @rdp:      rcu_data of the cpu that went offline
 */
static void __rcu_offline_cpu(struct rcu_data *this_rdp,
                                struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
{
        /* If the cpu going offline owns the grace period
         * we can block indefinitely waiting for it, so flush
         * it here by reporting a quiescent state on its behalf.
         */
        spin_lock_bh(&rcp->lock);
        if (rcp->cur != rcp->completed)
                cpu_quiet(rdp->cpu, rcp);
        spin_unlock_bh(&rcp->lock);
        /*
         * Splice the dead cpu's done, current and next lists, in that
         * order, onto the tail of this cpu's next list.
         */
        rcu_move_batch(this_rdp, rdp->donelist, rdp->donetail);
        rcu_move_batch(this_rdp, rdp->curlist, rdp->curtail);
        rcu_move_batch(this_rdp, rdp->nxtlist, rdp->nxttail);

        /* Account for the adopted callbacks in this cpu's queue length. */
        local_irq_disable();
        this_rdp->qlen += rdp->qlen;
        local_irq_enable();
}
 391
 392 static void rcu_offline_cpu(int cpu)
 393 {
 394         struct rcu_data *this_rdp = &get_cpu_var(rcu_data);
 395         struct rcu_data *this_bh_rdp = &get_cpu_var(rcu_bh_data);
 396
 397         __rcu_offline_cpu(this_rdp, &rcu_ctrlblk,
 398                                         &per_cpu(rcu_data, cpu));
 399         __rcu_offline_cpu(this_bh_rdp, &rcu_bh_ctrlblk,
 400                                         &per_cpu(rcu_bh_data, cpu));
 401         put_cpu_var(rcu_data);
 402         put_cpu_var(rcu_bh_data);
 403 }
 404
 405 #else
 406
/* Without CONFIG_HOTPLUG_CPU there is nothing to do when a cpu goes away. */
static void rcu_offline_cpu(int cpu)
{
}
 410
 411 #endif
 412
 413 /*
 414  * This does the RCU processing work from softirq context.
 415  */
 416 static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp,
 417                                         struct rcu_data *rdp)
 418 {
 419         if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch)) {
 420                 *rdp->donetail = rdp->curlist;
 421                 rdp->donetail = rdp->curtail;
 422                 rdp->curlist = NULL;
 423                 rdp->curtail = &rdp->curlist;
 424         }
 425
 426         if (rdp->nxtlist && !rdp->curlist) {
 427                 local_irq_disable();
 428                 rdp->curlist = rdp->nxtlist;
 429                 rdp->curtail = rdp->nxttail;
 430                 rdp->nxtlist = NULL;
 431                 rdp->nxttail = &rdp->nxtlist;
 432                 local_irq_enable();
 433
 434                 /*
 435                  * start the next batch of callbacks
 436                  */
 437
 438                 /* determine batch number */
 439                 rdp->batch = rcp->cur + 1;
 440
 441                 if (rcu_batch_after(rdp->batch, rcp->pending)) {
 442                         /* and start it/schedule start if it's a new batch */
 443                         spin_lock(&rcp->lock);
 444                         if (rcu_batch_after(rdp->batch, rcp->pending)) {
 445                                 rcp->pending = rdp->batch;
 446                                 rcu_start_batch(rcp);
 447                         }
 448                         spin_unlock(&rcp->lock);
 449                 }
 450         }
 451
 452         rcu_check_quiescent_state(rcp, rdp);
 453         if (rdp->donelist)
 454                 rcu_do_batch(rdp);
 455 }
 456
 457 static void rcu_process_callbacks(struct softirq_action *unused)
 458 {
 459         __rcu_process_callbacks(&rcu_ctrlblk, &__get_cpu_var(rcu_data));
 460         __rcu_process_callbacks(&rcu_bh_ctrlblk, &__get_cpu_var(rcu_bh_data));
 461 }
 462
 463 static int __rcu_pending(struct rcu_ctrlblk *rcp, struct rcu_data *rdp)
 464 {
 465         /* This cpu has pending rcu entries and the grace period
 466          * for them has completed.
 467          */
 468         if (rdp->curlist && !rcu_batch_before(rcp->completed, rdp->batch))
 469                 return 1;
 470
 471         /* This cpu has no pending entries, but there are new entries */
 472         if (!rdp->curlist && rdp->nxtlist)
 473                 return 1;
 474
 475         /* This cpu has finished callbacks to invoke */
 476         if (rdp->donelist)
 477                 return 1;
 478
 479         /* The rcu core waits for a quiescent state from the cpu */
 480         if (rdp->quiescbatch != rcp->cur || rdp->qs_pending)
 481                 return 1;
 482
 483         /* nothing to do */
 484         return 0;
 485 }
 486
 487 /*
 488  * Check to see if there is any immediate RCU-related work to be done
 489  * by the current CPU, returning 1 if so.  This function is part of the
 490  * RCU implementation; it is -not- an exported member of the RCU API.
 491  */
 492 int rcu_pending(int cpu)
 493 {
 494         return __rcu_pending(&rcu_ctrlblk, &per_cpu(rcu_data, cpu)) ||
 495                 __rcu_pending(&rcu_bh_ctrlblk, &per_cpu(rcu_bh_data, cpu));
 496 }
 497
 498 /*
 499  * Check to see if any future RCU-related work will need to be done
 500  * by the current CPU, even if none need be done immediately, returning
 501  * 1 if so.  This function is part of the RCU implementation; it is -not-
 502  * an exported member of the RCU API.
 503  */
 504 int rcu_needs_cpu(int cpu)
 505 {
 506         struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 507         struct rcu_data *rdp_bh = &per_cpu(rcu_bh_data, cpu);
 508
 509         return (!!rdp->curlist || !!rdp_bh->curlist || rcu_pending(cpu));
 510 }
 511
 512 void rcu_check_callbacks(int cpu, int user)
 513 {
 514         if (user ||
 515             (idle_cpu(cpu) && !in_softirq() &&
 516                                 hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
 517
 518                 /*
 519                  * Get here if this CPU took its interrupt from user
 520                  * mode or from the idle loop, and if this is not a
 521                  * nested interrupt.  In this case, the CPU is in
 522                  * a quiescent state, so count it.
 523                  *
 524                  * Also do a memory barrier.  This is needed to handle
 525                  * the case where writes from a preempt-disable section
 526                  * of code get reordered into schedule() by this CPU's
 527                  * write buffer.  The memory barrier makes sure that
 528                  * the rcu_qsctr_inc() and rcu_bh_qsctr_inc() are see
 529                  * by other CPUs to happen after any such write.
 530                  */
 531
 532                 smp_mb();  /* See above block comment. */
 533                 rcu_qsctr_inc(cpu);
 534                 rcu_bh_qsctr_inc(cpu);
 535
 536         } else if (!in_softirq()) {
 537
 538                 /*
 539                  * Get here if this CPU did not take its interrupt from
 540                  * softirq, in other words, if it is not interrupting
 541                  * a rcu_bh read-side critical section.  This is an _bh
 542                  * critical section, so count it.  The memory barrier
 543                  * is needed for the same reason as is the above one.
 544                  */
 545
 546                 smp_mb();  /* See above block comment. */
 547                 rcu_bh_qsctr_inc(cpu);
 548         }
 549         raise_rcu_softirq();
 550 }
 551
 552 static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
 553                                                 struct rcu_data *rdp)
 554 {
 555         memset(rdp, 0, sizeof(*rdp));
 556         rdp->curtail = &rdp->curlist;
 557         rdp->nxttail = &rdp->nxtlist;
 558         rdp->donetail = &rdp->donelist;
 559         rdp->quiescbatch = rcp->completed;
 560         rdp->qs_pending = 0;
 561         rdp->cpu = cpu;
 562         rdp->blimit = blimit;
 563 }
 564
 565 static void __cpuinit rcu_online_cpu(int cpu)
 566 {
 567         struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
 568         struct rcu_data *bh_rdp = &per_cpu(rcu_bh_data, cpu);
 569
 570         rcu_init_percpu_data(cpu, &rcu_ctrlblk, rdp);
 571         rcu_init_percpu_data(cpu, &rcu_bh_ctrlblk, bh_rdp);
 572         open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 573 }
 574
 575 static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 576                                 unsigned long action, void *hcpu)
 577 {
 578         long cpu = (long)hcpu;
 579
 580         switch (action) {
 581         case CPU_UP_PREPARE:
 582         case CPU_UP_PREPARE_FROZEN:
 583                 rcu_online_cpu(cpu);
 584                 break;
 585         case CPU_DEAD:
 586         case CPU_DEAD_FROZEN:
 587                 rcu_offline_cpu(cpu);
 588                 break;
 589         default:
 590                 break;
 591         }
 592         return NOTIFY_OK;
 593 }
 594
/* Notifier block that routes cpu hotplug events to rcu_cpu_notify(). */
static struct notifier_block __cpuinitdata rcu_nb = {
        .notifier_call  = rcu_cpu_notify,
};
 598
 599 /*
 600  * Initializes rcu mechanism.  Assumed to be called early.
 601  * That is before local timer(SMP) or jiffie timer (uniproc) is setup.
 602  * Note that rcu_qsctr and friends are implicitly
 603  * initialized due to the choice of ``0'' for RCU_CTR_INVALID.
 604  */
 605 void __init __rcu_init(void)
 606 {
 607         rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
 608                         (void *)(long)smp_processor_id());
 609         /* Register notifier for non-boot CPUs */
 610         register_cpu_notifier(&rcu_nb);
 611 }
 612
/*
 * Expose the batch-limit and queue-length watermarks as int module
 * parameters, each registered with a permission argument of 0.
 */
module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);