Linux 2.6.32-rc7
[safe/jmp/linux-2.6] / drivers / misc / sgi-xp / xpc_main.c
1 /*
2  * This file is subject to the terms and conditions of the GNU General Public
3  * License.  See the file "COPYING" in the main directory of this archive
4  * for more details.
5  *
6  * Copyright (c) 2004-2009 Silicon Graphics, Inc.  All Rights Reserved.
7  */
8
9 /*
10  * Cross Partition Communication (XPC) support - standard version.
11  *
12  *      XPC provides a message passing capability that crosses partition
13  *      boundaries. This module is made up of two parts:
14  *
15  *          partition   This part detects the presence/absence of other
16  *                      partitions. It provides a heartbeat and monitors
17  *                      the heartbeats of other partitions.
18  *
19  *          channel     This part manages the channels and sends/receives
20  *                      messages across them to/from other partitions.
21  *
22  *      There are a couple of additional functions residing in XP, which
23  *      provide an interface to XPC for its users.
24  *
25  *
26  *      Caveats:
27  *
28  *        . Currently on sn2, we have no way to determine which nasid an IRQ
29  *          came from. Thus, xpc_send_IRQ_sn2() does a remote amo write
30  *          followed by an IPI. The amo indicates where data is to be pulled
31  *          from, so after the IPI arrives, the remote partition checks the amo
32  *          word. The IPI can actually arrive before the amo however, so other
33  *          code must periodically check for this case. Also, remote amo
34  *          operations do not reliably time out. Thus we do a remote PIO read
35  *          solely to know whether the remote partition is down and whether we
36  *          should stop sending IPIs to it. This remote PIO read operation is
37  *          set up in a special nofault region so SAL knows to ignore (and
38  *          cleanup) any errors due to the remote amo write, PIO read, and/or
39  *          PIO write operations.
40  *
41  *          If/when new hardware solves this IPI problem, we should abandon
42  *          the current approach.
43  *
44  */
45
46 #include <linux/module.h>
47 #include <linux/sysctl.h>
48 #include <linux/device.h>
49 #include <linux/delay.h>
50 #include <linux/reboot.h>
51 #include <linux/kdebug.h>
52 #include <linux/kthread.h>
53 #include "xpc.h"
54
55 /* define two XPC debug device structures to be used with dev_dbg() et al */
56
57 struct device_driver xpc_dbg_name = {
58         .name = "xpc"
59 };
60
61 struct device xpc_part_dbg_subname = {
62         .init_name = "",        /* set to "part" at xpc_init() time */
63         .driver = &xpc_dbg_name
64 };
65
66 struct device xpc_chan_dbg_subname = {
67         .init_name = "",        /* set to "chan" at xpc_init() time */
68         .driver = &xpc_dbg_name
69 };
70
71 struct device *xpc_part = &xpc_part_dbg_subname;
72 struct device *xpc_chan = &xpc_chan_dbg_subname;
73
74 static int xpc_kdebug_ignore;
75
76 /* systune related variables for /proc/sys directories */
77
78 static int xpc_hb_interval = XPC_HB_DEFAULT_INTERVAL;
79 static int xpc_hb_min_interval = 1;
80 static int xpc_hb_max_interval = 10;
81
82 static int xpc_hb_check_interval = XPC_HB_CHECK_DEFAULT_INTERVAL;
83 static int xpc_hb_check_min_interval = 10;
84 static int xpc_hb_check_max_interval = 120;
85
86 int xpc_disengage_timelimit = XPC_DISENGAGE_DEFAULT_TIMELIMIT;
87 static int xpc_disengage_min_timelimit; /* = 0 */
88 static int xpc_disengage_max_timelimit = 120;
89
90 static ctl_table xpc_sys_xpc_hb_dir[] = {
91         {
92          .ctl_name = CTL_UNNUMBERED,
93          .procname = "hb_interval",
94          .data = &xpc_hb_interval,
95          .maxlen = sizeof(int),
96          .mode = 0644,
97          .proc_handler = &proc_dointvec_minmax,
98          .strategy = &sysctl_intvec,
99          .extra1 = &xpc_hb_min_interval,
100          .extra2 = &xpc_hb_max_interval},
101         {
102          .ctl_name = CTL_UNNUMBERED,
103          .procname = "hb_check_interval",
104          .data = &xpc_hb_check_interval,
105          .maxlen = sizeof(int),
106          .mode = 0644,
107          .proc_handler = &proc_dointvec_minmax,
108          .strategy = &sysctl_intvec,
109          .extra1 = &xpc_hb_check_min_interval,
110          .extra2 = &xpc_hb_check_max_interval},
111         {}
112 };
113 static ctl_table xpc_sys_xpc_dir[] = {
114         {
115          .ctl_name = CTL_UNNUMBERED,
116          .procname = "hb",
117          .mode = 0555,
118          .child = xpc_sys_xpc_hb_dir},
119         {
120          .ctl_name = CTL_UNNUMBERED,
121          .procname = "disengage_timelimit",
122          .data = &xpc_disengage_timelimit,
123          .maxlen = sizeof(int),
124          .mode = 0644,
125          .proc_handler = &proc_dointvec_minmax,
126          .strategy = &sysctl_intvec,
127          .extra1 = &xpc_disengage_min_timelimit,
128          .extra2 = &xpc_disengage_max_timelimit},
129         {}
130 };
131 static ctl_table xpc_sys_dir[] = {
132         {
133          .ctl_name = CTL_UNNUMBERED,
134          .procname = "xpc",
135          .mode = 0555,
136          .child = xpc_sys_xpc_dir},
137         {}
138 };
139 static struct ctl_table_header *xpc_sysctl;
140
141 /* non-zero if any remote partition disengage was timed out */
142 int xpc_disengage_timedout;
143
144 /* #of activate IRQs received and not yet processed */
145 int xpc_activate_IRQ_rcvd;
146 DEFINE_SPINLOCK(xpc_activate_IRQ_rcvd_lock);
147
148 /* IRQ handler notifies this wait queue on receipt of an IRQ */
149 DECLARE_WAIT_QUEUE_HEAD(xpc_activate_IRQ_wq);
150
151 static unsigned long xpc_hb_check_timeout;
152 static struct timer_list xpc_hb_timer;
153
154 /* notification that the xpc_hb_checker thread has exited */
155 static DECLARE_COMPLETION(xpc_hb_checker_exited);
156
157 /* notification that the xpc_discovery thread has exited */
158 static DECLARE_COMPLETION(xpc_discovery_exited);
159
160 static void xpc_kthread_waitmsgs(struct xpc_partition *, struct xpc_channel *);
161
162 static int xpc_system_reboot(struct notifier_block *, unsigned long, void *);
163 static struct notifier_block xpc_reboot_notifier = {
164         .notifier_call = xpc_system_reboot,
165 };
166
167 static int xpc_system_die(struct notifier_block *, unsigned long, void *);
168 static struct notifier_block xpc_die_notifier = {
169         .notifier_call = xpc_system_die,
170 };
171
172 struct xpc_arch_operations xpc_arch_ops;
173
174 /*
175  * Timer function to enforce the timelimit on the partition disengage.
176  */
177 static void
178 xpc_timeout_partition_disengage(unsigned long data)
179 {
180         struct xpc_partition *part = (struct xpc_partition *)data;
181
182         DBUG_ON(time_is_after_jiffies(part->disengage_timeout));
183
184         (void)xpc_partition_disengaged(part);
185
186         DBUG_ON(part->disengage_timeout != 0);
187         DBUG_ON(xpc_arch_ops.partition_engaged(XPC_PARTID(part)));
188 }
189
190 /*
191  * Timer to produce the heartbeat.  The timer structures function is
192  * already set when this is initially called.  A tunable is used to
193  * specify when the next timeout should occur.
194  */
195 static void
196 xpc_hb_beater(unsigned long dummy)
197 {
198         xpc_arch_ops.increment_heartbeat();
199
200         if (time_is_before_eq_jiffies(xpc_hb_check_timeout))
201                 wake_up_interruptible(&xpc_activate_IRQ_wq);
202
203         xpc_hb_timer.expires = jiffies + (xpc_hb_interval * HZ);
204         add_timer(&xpc_hb_timer);
205 }
206
207 static void
208 xpc_start_hb_beater(void)
209 {
210         xpc_arch_ops.heartbeat_init();
211         init_timer(&xpc_hb_timer);
212         xpc_hb_timer.function = xpc_hb_beater;
213         xpc_hb_beater(0);
214 }
215
216 static void
217 xpc_stop_hb_beater(void)
218 {
219         del_timer_sync(&xpc_hb_timer);
220         xpc_arch_ops.heartbeat_exit();
221 }
222
223 /*
224  * At periodic intervals, scan through all active partitions and ensure
225  * their heartbeat is still active.  If not, the partition is deactivated.
226  */
227 static void
228 xpc_check_remote_hb(void)
229 {
230         struct xpc_partition *part;
231         short partid;
232         enum xp_retval ret;
233
234         for (partid = 0; partid < xp_max_npartitions; partid++) {
235
236                 if (xpc_exiting)
237                         break;
238
239                 if (partid == xp_partition_id)
240                         continue;
241
242                 part = &xpc_partitions[partid];
243
244                 if (part->act_state == XPC_P_AS_INACTIVE ||
245                     part->act_state == XPC_P_AS_DEACTIVATING) {
246                         continue;
247                 }
248
249                 ret = xpc_arch_ops.get_remote_heartbeat(part);
250                 if (ret != xpSuccess)
251                         XPC_DEACTIVATE_PARTITION(part, ret);
252         }
253 }
254
255 /*
256  * This thread is responsible for nearly all of the partition
257  * activation/deactivation.
258  */
259 static int
260 xpc_hb_checker(void *ignore)
261 {
262         int force_IRQ = 0;
263
264         /* this thread was marked active by xpc_hb_init() */
265
266         set_cpus_allowed_ptr(current, cpumask_of(XPC_HB_CHECK_CPU));
267
268         /* set our heartbeating to other partitions into motion */
269         xpc_hb_check_timeout = jiffies + (xpc_hb_check_interval * HZ);
270         xpc_start_hb_beater();
271
272         while (!xpc_exiting) {
273
274                 dev_dbg(xpc_part, "woke up with %d ticks rem; %d IRQs have "
275                         "been received\n",
276                         (int)(xpc_hb_check_timeout - jiffies),
277                         xpc_activate_IRQ_rcvd);
278
279                 /* checking of remote heartbeats is skewed by IRQ handling */
280                 if (time_is_before_eq_jiffies(xpc_hb_check_timeout)) {
281                         xpc_hb_check_timeout = jiffies +
282                             (xpc_hb_check_interval * HZ);
283
284                         dev_dbg(xpc_part, "checking remote heartbeats\n");
285                         xpc_check_remote_hb();
286
287                         /*
288                          * On sn2 we need to periodically recheck to ensure no
289                          * IRQ/amo pairs have been missed.
290                          */
291                         if (is_shub())
292                                 force_IRQ = 1;
293                 }
294
295                 /* check for outstanding IRQs */
296                 if (xpc_activate_IRQ_rcvd > 0 || force_IRQ != 0) {
297                         force_IRQ = 0;
298                         dev_dbg(xpc_part, "processing activate IRQs "
299                                 "received\n");
300                         xpc_arch_ops.process_activate_IRQ_rcvd();
301                 }
302
303                 /* wait for IRQ or timeout */
304                 (void)wait_event_interruptible(xpc_activate_IRQ_wq,
305                                                (time_is_before_eq_jiffies(
306                                                 xpc_hb_check_timeout) ||
307                                                 xpc_activate_IRQ_rcvd > 0 ||
308                                                 xpc_exiting));
309         }
310
311         xpc_stop_hb_beater();
312
313         dev_dbg(xpc_part, "heartbeat checker is exiting\n");
314
315         /* mark this thread as having exited */
316         complete(&xpc_hb_checker_exited);
317         return 0;
318 }
319
320 /*
321  * This thread will attempt to discover other partitions to activate
322  * based on info provided by SAL. This new thread is short lived and
323  * will exit once discovery is complete.
324  */
325 static int
326 xpc_initiate_discovery(void *ignore)
327 {
328         xpc_discovery();
329
330         dev_dbg(xpc_part, "discovery thread is exiting\n");
331
332         /* mark this thread as having exited */
333         complete(&xpc_discovery_exited);
334         return 0;
335 }
336
337 /*
338  * The first kthread assigned to a newly activated partition is the one
339  * created by XPC HB with which it calls xpc_activating(). XPC hangs on to
340  * that kthread until the partition is brought down, at which time that kthread
341  * returns back to XPC HB. (The return of that kthread will signify to XPC HB
342  * that XPC has dismantled all communication infrastructure for the associated
343  * partition.) This kthread becomes the channel manager for that partition.
344  *
345  * Each active partition has a channel manager, who, besides connecting and
346  * disconnecting channels, will ensure that each of the partition's connected
347  * channels has the required number of assigned kthreads to get the work done.
348  */
349 static void
350 xpc_channel_mgr(struct xpc_partition *part)
351 {
352         while (part->act_state != XPC_P_AS_DEACTIVATING ||
353                atomic_read(&part->nchannels_active) > 0 ||
354                !xpc_partition_disengaged(part)) {
355
356                 xpc_process_sent_chctl_flags(part);
357
358                 /*
359                  * Wait until we've been requested to activate kthreads or
360                  * all of the channel's message queues have been torn down or
361                  * a signal is pending.
362                  *
363                  * The channel_mgr_requests is set to 1 after being awakened,
364                  * This is done to prevent the channel mgr from making one pass
365                  * through the loop for each request, since he will
366                  * be servicing all the requests in one pass. The reason it's
367                  * set to 1 instead of 0 is so that other kthreads will know
368                  * that the channel mgr is running and won't bother trying to
369                  * wake him up.
370                  */
371                 atomic_dec(&part->channel_mgr_requests);
372                 (void)wait_event_interruptible(part->channel_mgr_wq,
373                                 (atomic_read(&part->channel_mgr_requests) > 0 ||
374                                  part->chctl.all_flags != 0 ||
375                                  (part->act_state == XPC_P_AS_DEACTIVATING &&
376                                  atomic_read(&part->nchannels_active) == 0 &&
377                                  xpc_partition_disengaged(part))));
378                 atomic_set(&part->channel_mgr_requests, 1);
379         }
380 }
381
382 /*
383  * Guarantee that the kzalloc'd memory is cacheline aligned.
384  */
385 void *
386 xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base)
387 {
388         /* see if kzalloc will give us cachline aligned memory by default */
389         *base = kzalloc(size, flags);
390         if (*base == NULL)
391                 return NULL;
392
393         if ((u64)*base == L1_CACHE_ALIGN((u64)*base))
394                 return *base;
395
396         kfree(*base);
397
398         /* nope, we'll have to do it ourselves */
399         *base = kzalloc(size + L1_CACHE_BYTES, flags);
400         if (*base == NULL)
401                 return NULL;
402
403         return (void *)L1_CACHE_ALIGN((u64)*base);
404 }
405
406 /*
407  * Setup the channel structures necessary to support XPartition Communication
408  * between the specified remote partition and the local one.
409  */
410 static enum xp_retval
411 xpc_setup_ch_structures(struct xpc_partition *part)
412 {
413         enum xp_retval ret;
414         int ch_number;
415         struct xpc_channel *ch;
416         short partid = XPC_PARTID(part);
417
418         /*
419          * Allocate all of the channel structures as a contiguous chunk of
420          * memory.
421          */
422         DBUG_ON(part->channels != NULL);
423         part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_MAX_NCHANNELS,
424                                  GFP_KERNEL);
425         if (part->channels == NULL) {
426                 dev_err(xpc_chan, "can't get memory for channels\n");
427                 return xpNoMemory;
428         }
429
430         /* allocate the remote open and close args */
431
432         part->remote_openclose_args =
433             xpc_kzalloc_cacheline_aligned(XPC_OPENCLOSE_ARGS_SIZE,
434                                           GFP_KERNEL, &part->
435                                           remote_openclose_args_base);
436         if (part->remote_openclose_args == NULL) {
437                 dev_err(xpc_chan, "can't get memory for remote connect args\n");
438                 ret = xpNoMemory;
439                 goto out_1;
440         }
441
442         part->chctl.all_flags = 0;
443         spin_lock_init(&part->chctl_lock);
444
445         atomic_set(&part->channel_mgr_requests, 1);
446         init_waitqueue_head(&part->channel_mgr_wq);
447
448         part->nchannels = XPC_MAX_NCHANNELS;
449
450         atomic_set(&part->nchannels_active, 0);
451         atomic_set(&part->nchannels_engaged, 0);
452
453         for (ch_number = 0; ch_number < part->nchannels; ch_number++) {
454                 ch = &part->channels[ch_number];
455
456                 ch->partid = partid;
457                 ch->number = ch_number;
458                 ch->flags = XPC_C_DISCONNECTED;
459
460                 atomic_set(&ch->kthreads_assigned, 0);
461                 atomic_set(&ch->kthreads_idle, 0);
462                 atomic_set(&ch->kthreads_active, 0);
463
464                 atomic_set(&ch->references, 0);
465                 atomic_set(&ch->n_to_notify, 0);
466
467                 spin_lock_init(&ch->lock);
468                 init_completion(&ch->wdisconnect_wait);
469
470                 atomic_set(&ch->n_on_msg_allocate_wq, 0);
471                 init_waitqueue_head(&ch->msg_allocate_wq);
472                 init_waitqueue_head(&ch->idle_wq);
473         }
474
475         ret = xpc_arch_ops.setup_ch_structures(part);
476         if (ret != xpSuccess)
477                 goto out_2;
478
479         /*
480          * With the setting of the partition setup_state to XPC_P_SS_SETUP,
481          * we're declaring that this partition is ready to go.
482          */
483         part->setup_state = XPC_P_SS_SETUP;
484
485         return xpSuccess;
486
487         /* setup of ch structures failed */
488 out_2:
489         kfree(part->remote_openclose_args_base);
490         part->remote_openclose_args = NULL;
491 out_1:
492         kfree(part->channels);
493         part->channels = NULL;
494         return ret;
495 }
496
497 /*
498  * Teardown the channel structures necessary to support XPartition Communication
499  * between the specified remote partition and the local one.
500  */
501 static void
502 xpc_teardown_ch_structures(struct xpc_partition *part)
503 {
504         DBUG_ON(atomic_read(&part->nchannels_engaged) != 0);
505         DBUG_ON(atomic_read(&part->nchannels_active) != 0);
506
507         /*
508          * Make this partition inaccessible to local processes by marking it
509          * as no longer setup. Then wait before proceeding with the teardown
510          * until all existing references cease.
511          */
512         DBUG_ON(part->setup_state != XPC_P_SS_SETUP);
513         part->setup_state = XPC_P_SS_WTEARDOWN;
514
515         wait_event(part->teardown_wq, (atomic_read(&part->references) == 0));
516
517         /* now we can begin tearing down the infrastructure */
518
519         xpc_arch_ops.teardown_ch_structures(part);
520
521         kfree(part->remote_openclose_args_base);
522         part->remote_openclose_args = NULL;
523         kfree(part->channels);
524         part->channels = NULL;
525
526         part->setup_state = XPC_P_SS_TORNDOWN;
527 }
528
529 /*
530  * When XPC HB determines that a partition has come up, it will create a new
531  * kthread and that kthread will call this function to attempt to set up the
532  * basic infrastructure used for Cross Partition Communication with the newly
533  * upped partition.
534  *
535  * The kthread that was created by XPC HB and which setup the XPC
536  * infrastructure will remain assigned to the partition becoming the channel
537  * manager for that partition until the partition is deactivating, at which
538  * time the kthread will teardown the XPC infrastructure and then exit.
539  */
540 static int
541 xpc_activating(void *__partid)
542 {
543         short partid = (u64)__partid;
544         struct xpc_partition *part = &xpc_partitions[partid];
545         unsigned long irq_flags;
546
547         DBUG_ON(partid < 0 || partid >= xp_max_npartitions);
548
549         spin_lock_irqsave(&part->act_lock, irq_flags);
550
551         if (part->act_state == XPC_P_AS_DEACTIVATING) {
552                 part->act_state = XPC_P_AS_INACTIVE;
553                 spin_unlock_irqrestore(&part->act_lock, irq_flags);
554                 part->remote_rp_pa = 0;
555                 return 0;
556         }
557
558         /* indicate the thread is activating */
559         DBUG_ON(part->act_state != XPC_P_AS_ACTIVATION_REQ);
560         part->act_state = XPC_P_AS_ACTIVATING;
561
562         XPC_SET_REASON(part, 0, 0);
563         spin_unlock_irqrestore(&part->act_lock, irq_flags);
564
565         dev_dbg(xpc_part, "activating partition %d\n", partid);
566
567         xpc_arch_ops.allow_hb(partid);
568
569         if (xpc_setup_ch_structures(part) == xpSuccess) {
570                 (void)xpc_part_ref(part);       /* this will always succeed */
571
572                 if (xpc_arch_ops.make_first_contact(part) == xpSuccess) {
573                         xpc_mark_partition_active(part);
574                         xpc_channel_mgr(part);
575                         /* won't return until partition is deactivating */
576                 }
577
578                 xpc_part_deref(part);
579                 xpc_teardown_ch_structures(part);
580         }
581
582         xpc_arch_ops.disallow_hb(partid);
583         xpc_mark_partition_inactive(part);
584
585         if (part->reason == xpReactivating) {
586                 /* interrupting ourselves results in activating partition */
587                 xpc_arch_ops.request_partition_reactivation(part);
588         }
589
590         return 0;
591 }
592
593 void
594 xpc_activate_partition(struct xpc_partition *part)
595 {
596         short partid = XPC_PARTID(part);
597         unsigned long irq_flags;
598         struct task_struct *kthread;
599
600         spin_lock_irqsave(&part->act_lock, irq_flags);
601
602         DBUG_ON(part->act_state != XPC_P_AS_INACTIVE);
603
604         part->act_state = XPC_P_AS_ACTIVATION_REQ;
605         XPC_SET_REASON(part, xpCloneKThread, __LINE__);
606
607         spin_unlock_irqrestore(&part->act_lock, irq_flags);
608
609         kthread = kthread_run(xpc_activating, (void *)((u64)partid), "xpc%02d",
610                               partid);
611         if (IS_ERR(kthread)) {
612                 spin_lock_irqsave(&part->act_lock, irq_flags);
613                 part->act_state = XPC_P_AS_INACTIVE;
614                 XPC_SET_REASON(part, xpCloneKThreadFailed, __LINE__);
615                 spin_unlock_irqrestore(&part->act_lock, irq_flags);
616         }
617 }
618
619 void
620 xpc_activate_kthreads(struct xpc_channel *ch, int needed)
621 {
622         int idle = atomic_read(&ch->kthreads_idle);
623         int assigned = atomic_read(&ch->kthreads_assigned);
624         int wakeup;
625
626         DBUG_ON(needed <= 0);
627
628         if (idle > 0) {
629                 wakeup = (needed > idle) ? idle : needed;
630                 needed -= wakeup;
631
632                 dev_dbg(xpc_chan, "wakeup %d idle kthreads, partid=%d, "
633                         "channel=%d\n", wakeup, ch->partid, ch->number);
634
635                 /* only wakeup the requested number of kthreads */
636                 wake_up_nr(&ch->idle_wq, wakeup);
637         }
638
639         if (needed <= 0)
640                 return;
641
642         if (needed + assigned > ch->kthreads_assigned_limit) {
643                 needed = ch->kthreads_assigned_limit - assigned;
644                 if (needed <= 0)
645                         return;
646         }
647
648         dev_dbg(xpc_chan, "create %d new kthreads, partid=%d, channel=%d\n",
649                 needed, ch->partid, ch->number);
650
651         xpc_create_kthreads(ch, needed, 0);
652 }
653
654 /*
655  * This function is where XPC's kthreads wait for messages to deliver.
656  */
657 static void
658 xpc_kthread_waitmsgs(struct xpc_partition *part, struct xpc_channel *ch)
659 {
660         int (*n_of_deliverable_payloads) (struct xpc_channel *) =
661                 xpc_arch_ops.n_of_deliverable_payloads;
662
663         do {
664                 /* deliver messages to their intended recipients */
665
666                 while (n_of_deliverable_payloads(ch) > 0 &&
667                        !(ch->flags & XPC_C_DISCONNECTING)) {
668                         xpc_deliver_payload(ch);
669                 }
670
671                 if (atomic_inc_return(&ch->kthreads_idle) >
672                     ch->kthreads_idle_limit) {
673                         /* too many idle kthreads on this channel */
674                         atomic_dec(&ch->kthreads_idle);
675                         break;
676                 }
677
678                 dev_dbg(xpc_chan, "idle kthread calling "
679                         "wait_event_interruptible_exclusive()\n");
680
681                 (void)wait_event_interruptible_exclusive(ch->idle_wq,
682                                 (n_of_deliverable_payloads(ch) > 0 ||
683                                  (ch->flags & XPC_C_DISCONNECTING)));
684
685                 atomic_dec(&ch->kthreads_idle);
686
687         } while (!(ch->flags & XPC_C_DISCONNECTING));
688 }
689
690 static int
691 xpc_kthread_start(void *args)
692 {
693         short partid = XPC_UNPACK_ARG1(args);
694         u16 ch_number = XPC_UNPACK_ARG2(args);
695         struct xpc_partition *part = &xpc_partitions[partid];
696         struct xpc_channel *ch;
697         int n_needed;
698         unsigned long irq_flags;
699         int (*n_of_deliverable_payloads) (struct xpc_channel *) =
700                 xpc_arch_ops.n_of_deliverable_payloads;
701
702         dev_dbg(xpc_chan, "kthread starting, partid=%d, channel=%d\n",
703                 partid, ch_number);
704
705         ch = &part->channels[ch_number];
706
707         if (!(ch->flags & XPC_C_DISCONNECTING)) {
708
709                 /* let registerer know that connection has been established */
710
711                 spin_lock_irqsave(&ch->lock, irq_flags);
712                 if (!(ch->flags & XPC_C_CONNECTEDCALLOUT)) {
713                         ch->flags |= XPC_C_CONNECTEDCALLOUT;
714                         spin_unlock_irqrestore(&ch->lock, irq_flags);
715
716                         xpc_connected_callout(ch);
717
718                         spin_lock_irqsave(&ch->lock, irq_flags);
719                         ch->flags |= XPC_C_CONNECTEDCALLOUT_MADE;
720                         spin_unlock_irqrestore(&ch->lock, irq_flags);
721
722                         /*
723                          * It is possible that while the callout was being
724                          * made that the remote partition sent some messages.
725                          * If that is the case, we may need to activate
726                          * additional kthreads to help deliver them. We only
727                          * need one less than total #of messages to deliver.
728                          */
729                         n_needed = n_of_deliverable_payloads(ch) - 1;
730                         if (n_needed > 0 && !(ch->flags & XPC_C_DISCONNECTING))
731                                 xpc_activate_kthreads(ch, n_needed);
732
733                 } else {
734                         spin_unlock_irqrestore(&ch->lock, irq_flags);
735                 }
736
737                 xpc_kthread_waitmsgs(part, ch);
738         }
739
740         /* let registerer know that connection is disconnecting */
741
742         spin_lock_irqsave(&ch->lock, irq_flags);
743         if ((ch->flags & XPC_C_CONNECTEDCALLOUT_MADE) &&
744             !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) {
745                 ch->flags |= XPC_C_DISCONNECTINGCALLOUT;
746                 spin_unlock_irqrestore(&ch->lock, irq_flags);
747
748                 xpc_disconnect_callout(ch, xpDisconnecting);
749
750                 spin_lock_irqsave(&ch->lock, irq_flags);
751                 ch->flags |= XPC_C_DISCONNECTINGCALLOUT_MADE;
752         }
753         spin_unlock_irqrestore(&ch->lock, irq_flags);
754
755         if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
756             atomic_dec_return(&part->nchannels_engaged) == 0) {
757                 xpc_arch_ops.indicate_partition_disengaged(part);
758         }
759
760         xpc_msgqueue_deref(ch);
761
762         dev_dbg(xpc_chan, "kthread exiting, partid=%d, channel=%d\n",
763                 partid, ch_number);
764
765         xpc_part_deref(part);
766         return 0;
767 }
768
769 /*
770  * For each partition that XPC has established communications with, there is
771  * a minimum of one kernel thread assigned to perform any operation that
772  * may potentially sleep or block (basically the callouts to the asynchronous
773  * functions registered via xpc_connect()).
774  *
775  * Additional kthreads are created and destroyed by XPC as the workload
776  * demands.
777  *
778  * A kthread is assigned to one of the active channels that exists for a given
779  * partition.
780  */
781 void
782 xpc_create_kthreads(struct xpc_channel *ch, int needed,
783                     int ignore_disconnecting)
784 {
785         unsigned long irq_flags;
786         u64 args = XPC_PACK_ARGS(ch->partid, ch->number);
787         struct xpc_partition *part = &xpc_partitions[ch->partid];
788         struct task_struct *kthread;
789         void (*indicate_partition_disengaged) (struct xpc_partition *) =
790                 xpc_arch_ops.indicate_partition_disengaged;
791
792         while (needed-- > 0) {
793
794                 /*
795                  * The following is done on behalf of the newly created
796                  * kthread. That kthread is responsible for doing the
797                  * counterpart to the following before it exits.
798                  */
799                 if (ignore_disconnecting) {
800                         if (!atomic_inc_not_zero(&ch->kthreads_assigned)) {
801                                 /* kthreads assigned had gone to zero */
802                                 BUG_ON(!(ch->flags &
803                                          XPC_C_DISCONNECTINGCALLOUT_MADE));
804                                 break;
805                         }
806
807                 } else if (ch->flags & XPC_C_DISCONNECTING) {
808                         break;
809
810                 } else if (atomic_inc_return(&ch->kthreads_assigned) == 1 &&
811                            atomic_inc_return(&part->nchannels_engaged) == 1) {
812                         xpc_arch_ops.indicate_partition_engaged(part);
813                 }
814                 (void)xpc_part_ref(part);
815                 xpc_msgqueue_ref(ch);
816
817                 kthread = kthread_run(xpc_kthread_start, (void *)args,
818                                       "xpc%02dc%d", ch->partid, ch->number);
819                 if (IS_ERR(kthread)) {
820                         /* the fork failed */
821
822                         /*
823                          * NOTE: if (ignore_disconnecting &&
824                          * !(ch->flags & XPC_C_DISCONNECTINGCALLOUT)) is true,
825                          * then we'll deadlock if all other kthreads assigned
826                          * to this channel are blocked in the channel's
827                          * registerer, because the only thing that will unblock
828                          * them is the xpDisconnecting callout that this
829                          * failed kthread_run() would have made.
830                          */
831
832                         if (atomic_dec_return(&ch->kthreads_assigned) == 0 &&
833                             atomic_dec_return(&part->nchannels_engaged) == 0) {
834                                 indicate_partition_disengaged(part);
835                         }
836                         xpc_msgqueue_deref(ch);
837                         xpc_part_deref(part);
838
839                         if (atomic_read(&ch->kthreads_assigned) <
840                             ch->kthreads_idle_limit) {
841                                 /*
842                                  * Flag this as an error only if we have an
843                                  * insufficient #of kthreads for the channel
844                                  * to function.
845                                  */
846                                 spin_lock_irqsave(&ch->lock, irq_flags);
847                                 XPC_DISCONNECT_CHANNEL(ch, xpLackOfResources,
848                                                        &irq_flags);
849                                 spin_unlock_irqrestore(&ch->lock, irq_flags);
850                         }
851                         break;
852                 }
853         }
854 }
855
856 void
857 xpc_disconnect_wait(int ch_number)
858 {
859         unsigned long irq_flags;
860         short partid;
861         struct xpc_partition *part;
862         struct xpc_channel *ch;
863         int wakeup_channel_mgr;
864
865         /* now wait for all callouts to the caller's function to cease */
866         for (partid = 0; partid < xp_max_npartitions; partid++) {
867                 part = &xpc_partitions[partid];
868
869                 if (!xpc_part_ref(part))
870                         continue;
871
872                 ch = &part->channels[ch_number];
873
874                 if (!(ch->flags & XPC_C_WDISCONNECT)) {
875                         xpc_part_deref(part);
876                         continue;
877                 }
878
879                 wait_for_completion(&ch->wdisconnect_wait);
880
881                 spin_lock_irqsave(&ch->lock, irq_flags);
882                 DBUG_ON(!(ch->flags & XPC_C_DISCONNECTED));
883                 wakeup_channel_mgr = 0;
884
885                 if (ch->delayed_chctl_flags) {
886                         if (part->act_state != XPC_P_AS_DEACTIVATING) {
887                                 spin_lock(&part->chctl_lock);
888                                 part->chctl.flags[ch->number] |=
889                                     ch->delayed_chctl_flags;
890                                 spin_unlock(&part->chctl_lock);
891                                 wakeup_channel_mgr = 1;
892                         }
893                         ch->delayed_chctl_flags = 0;
894                 }
895
896                 ch->flags &= ~XPC_C_WDISCONNECT;
897                 spin_unlock_irqrestore(&ch->lock, irq_flags);
898
899                 if (wakeup_channel_mgr)
900                         xpc_wakeup_channel_mgr(part);
901
902                 xpc_part_deref(part);
903         }
904 }
905
906 static int
907 xpc_setup_partitions(void)
908 {
909         short partid;
910         struct xpc_partition *part;
911
912         xpc_partitions = kzalloc(sizeof(struct xpc_partition) *
913                                  xp_max_npartitions, GFP_KERNEL);
914         if (xpc_partitions == NULL) {
915                 dev_err(xpc_part, "can't get memory for partition structure\n");
916                 return -ENOMEM;
917         }
918
919         /*
920          * The first few fields of each entry of xpc_partitions[] need to
921          * be initialized now so that calls to xpc_connect() and
922          * xpc_disconnect() can be made prior to the activation of any remote
923          * partition. NOTE THAT NONE OF THE OTHER FIELDS BELONGING TO THESE
924          * ENTRIES ARE MEANINGFUL UNTIL AFTER AN ENTRY'S CORRESPONDING
925          * PARTITION HAS BEEN ACTIVATED.
926          */
927         for (partid = 0; partid < xp_max_npartitions; partid++) {
928                 part = &xpc_partitions[partid];
929
930                 DBUG_ON((u64)part != L1_CACHE_ALIGN((u64)part));
931
932                 part->activate_IRQ_rcvd = 0;
933                 spin_lock_init(&part->act_lock);
934                 part->act_state = XPC_P_AS_INACTIVE;
935                 XPC_SET_REASON(part, 0, 0);
936
937                 init_timer(&part->disengage_timer);
938                 part->disengage_timer.function =
939                     xpc_timeout_partition_disengage;
940                 part->disengage_timer.data = (unsigned long)part;
941
942                 part->setup_state = XPC_P_SS_UNSET;
943                 init_waitqueue_head(&part->teardown_wq);
944                 atomic_set(&part->references, 0);
945         }
946
947         return xpc_arch_ops.setup_partitions();
948 }
949
950 static void
951 xpc_teardown_partitions(void)
952 {
953         xpc_arch_ops.teardown_partitions();
954         kfree(xpc_partitions);
955 }
956
957 static void
958 xpc_do_exit(enum xp_retval reason)
959 {
960         short partid;
961         int active_part_count, printed_waiting_msg = 0;
962         struct xpc_partition *part;
963         unsigned long printmsg_time, disengage_timeout = 0;
964
965         /* a 'rmmod XPC' and a 'reboot' cannot both end up here together */
966         DBUG_ON(xpc_exiting == 1);
967
968         /*
969          * Let the heartbeat checker thread and the discovery thread
970          * (if one is running) know that they should exit. Also wake up
971          * the heartbeat checker thread in case it's sleeping.
972          */
973         xpc_exiting = 1;
974         wake_up_interruptible(&xpc_activate_IRQ_wq);
975
976         /* wait for the discovery thread to exit */
977         wait_for_completion(&xpc_discovery_exited);
978
979         /* wait for the heartbeat checker thread to exit */
980         wait_for_completion(&xpc_hb_checker_exited);
981
982         /* sleep for a 1/3 of a second or so */
983         (void)msleep_interruptible(300);
984
985         /* wait for all partitions to become inactive */
986
987         printmsg_time = jiffies + (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
988         xpc_disengage_timedout = 0;
989
990         do {
991                 active_part_count = 0;
992
993                 for (partid = 0; partid < xp_max_npartitions; partid++) {
994                         part = &xpc_partitions[partid];
995
996                         if (xpc_partition_disengaged(part) &&
997                             part->act_state == XPC_P_AS_INACTIVE) {
998                                 continue;
999                         }
1000
1001                         active_part_count++;
1002
1003                         XPC_DEACTIVATE_PARTITION(part, reason);
1004
1005                         if (part->disengage_timeout > disengage_timeout)
1006                                 disengage_timeout = part->disengage_timeout;
1007                 }
1008
1009                 if (xpc_arch_ops.any_partition_engaged()) {
1010                         if (time_is_before_jiffies(printmsg_time)) {
1011                                 dev_info(xpc_part, "waiting for remote "
1012                                          "partitions to deactivate, timeout in "
1013                                          "%ld seconds\n", (disengage_timeout -
1014                                          jiffies) / HZ);
1015                                 printmsg_time = jiffies +
1016                                     (XPC_DEACTIVATE_PRINTMSG_INTERVAL * HZ);
1017                                 printed_waiting_msg = 1;
1018                         }
1019
1020                 } else if (active_part_count > 0) {
1021                         if (printed_waiting_msg) {
1022                                 dev_info(xpc_part, "waiting for local partition"
1023                                          " to deactivate\n");
1024                                 printed_waiting_msg = 0;
1025                         }
1026
1027                 } else {
1028                         if (!xpc_disengage_timedout) {
1029                                 dev_info(xpc_part, "all partitions have "
1030                                          "deactivated\n");
1031                         }
1032                         break;
1033                 }
1034
1035                 /* sleep for a 1/3 of a second or so */
1036                 (void)msleep_interruptible(300);
1037
1038         } while (1);
1039
1040         DBUG_ON(xpc_arch_ops.any_partition_engaged());
1041
1042         xpc_teardown_rsvd_page();
1043
1044         if (reason == xpUnloading) {
1045                 (void)unregister_die_notifier(&xpc_die_notifier);
1046                 (void)unregister_reboot_notifier(&xpc_reboot_notifier);
1047         }
1048
1049         /* clear the interface to XPC's functions */
1050         xpc_clear_interface();
1051
1052         if (xpc_sysctl)
1053                 unregister_sysctl_table(xpc_sysctl);
1054
1055         xpc_teardown_partitions();
1056
1057         if (is_shub())
1058                 xpc_exit_sn2();
1059         else if (is_uv())
1060                 xpc_exit_uv();
1061 }
1062
1063 /*
1064  * This function is called when the system is being rebooted.
1065  */
1066 static int
1067 xpc_system_reboot(struct notifier_block *nb, unsigned long event, void *unused)
1068 {
1069         enum xp_retval reason;
1070
1071         switch (event) {
1072         case SYS_RESTART:
1073                 reason = xpSystemReboot;
1074                 break;
1075         case SYS_HALT:
1076                 reason = xpSystemHalt;
1077                 break;
1078         case SYS_POWER_OFF:
1079                 reason = xpSystemPoweroff;
1080                 break;
1081         default:
1082                 reason = xpSystemGoingDown;
1083         }
1084
1085         xpc_do_exit(reason);
1086         return NOTIFY_DONE;
1087 }
1088
1089 /*
1090  * Notify other partitions to deactivate from us by first disengaging from all
1091  * references to our memory.
1092  */
1093 static void
1094 xpc_die_deactivate(void)
1095 {
1096         struct xpc_partition *part;
1097         short partid;
1098         int any_engaged;
1099         long keep_waiting;
1100         long wait_to_print;
1101
1102         /* keep xpc_hb_checker thread from doing anything (just in case) */
1103         xpc_exiting = 1;
1104
1105         xpc_arch_ops.disallow_all_hbs();   /*indicate we're deactivated */
1106
1107         for (partid = 0; partid < xp_max_npartitions; partid++) {
1108                 part = &xpc_partitions[partid];
1109
1110                 if (xpc_arch_ops.partition_engaged(partid) ||
1111                     part->act_state != XPC_P_AS_INACTIVE) {
1112                         xpc_arch_ops.request_partition_deactivation(part);
1113                         xpc_arch_ops.indicate_partition_disengaged(part);
1114                 }
1115         }
1116
1117         /*
1118          * Though we requested that all other partitions deactivate from us,
1119          * we only wait until they've all disengaged or we've reached the
1120          * defined timelimit.
1121          *
1122          * Given that one iteration through the following while-loop takes
1123          * approximately 200 microseconds, calculate the #of loops to take
1124          * before bailing and the #of loops before printing a waiting message.
1125          */
1126         keep_waiting = xpc_disengage_timelimit * 1000 * 5;
1127         wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL * 1000 * 5;
1128
1129         while (1) {
1130                 any_engaged = xpc_arch_ops.any_partition_engaged();
1131                 if (!any_engaged) {
1132                         dev_info(xpc_part, "all partitions have deactivated\n");
1133                         break;
1134                 }
1135
1136                 if (!keep_waiting--) {
1137                         for (partid = 0; partid < xp_max_npartitions;
1138                              partid++) {
1139                                 if (xpc_arch_ops.partition_engaged(partid)) {
1140                                         dev_info(xpc_part, "deactivate from "
1141                                                  "remote partition %d timed "
1142                                                  "out\n", partid);
1143                                 }
1144                         }
1145                         break;
1146                 }
1147
1148                 if (!wait_to_print--) {
1149                         dev_info(xpc_part, "waiting for remote partitions to "
1150                                  "deactivate, timeout in %ld seconds\n",
1151                                  keep_waiting / (1000 * 5));
1152                         wait_to_print = XPC_DEACTIVATE_PRINTMSG_INTERVAL *
1153                             1000 * 5;
1154                 }
1155
1156                 udelay(200);
1157         }
1158 }
1159
1160 /*
1161  * This function is called when the system is being restarted or halted due
1162  * to some sort of system failure. If this is the case we need to notify the
1163  * other partitions to disengage from all references to our memory.
1164  * This function can also be called when our heartbeater could be offlined
1165  * for a time. In this case we need to notify other partitions to not worry
1166  * about the lack of a heartbeat.
1167  */
1168 static int
1169 xpc_system_die(struct notifier_block *nb, unsigned long event, void *unused)
1170 {
1171 #ifdef CONFIG_IA64              /* !!! temporary kludge */
1172         switch (event) {
1173         case DIE_MACHINE_RESTART:
1174         case DIE_MACHINE_HALT:
1175                 xpc_die_deactivate();
1176                 break;
1177
1178         case DIE_KDEBUG_ENTER:
1179                 /* Should lack of heartbeat be ignored by other partitions? */
1180                 if (!xpc_kdebug_ignore)
1181                         break;
1182
1183                 /* fall through */
1184         case DIE_MCA_MONARCH_ENTER:
1185         case DIE_INIT_MONARCH_ENTER:
1186                 xpc_arch_ops.offline_heartbeat();
1187                 break;
1188
1189         case DIE_KDEBUG_LEAVE:
1190                 /* Is lack of heartbeat being ignored by other partitions? */
1191                 if (!xpc_kdebug_ignore)
1192                         break;
1193
1194                 /* fall through */
1195         case DIE_MCA_MONARCH_LEAVE:
1196         case DIE_INIT_MONARCH_LEAVE:
1197                 xpc_arch_ops.online_heartbeat();
1198                 break;
1199         }
1200 #else
1201         xpc_die_deactivate();
1202 #endif
1203
1204         return NOTIFY_DONE;
1205 }
1206
1207 int __init
1208 xpc_init(void)
1209 {
1210         int ret;
1211         struct task_struct *kthread;
1212
1213         dev_set_name(xpc_part, "part");
1214         dev_set_name(xpc_chan, "chan");
1215
1216         if (is_shub()) {
1217                 /*
1218                  * The ia64-sn2 architecture supports at most 64 partitions.
1219                  * And the inability to unregister remote amos restricts us
1220                  * further to only support exactly 64 partitions on this
1221                  * architecture, no less.
1222                  */
1223                 if (xp_max_npartitions != 64) {
1224                         dev_err(xpc_part, "max #of partitions not set to 64\n");
1225                         ret = -EINVAL;
1226                 } else {
1227                         ret = xpc_init_sn2();
1228                 }
1229
1230         } else if (is_uv()) {
1231                 ret = xpc_init_uv();
1232
1233         } else {
1234                 ret = -ENODEV;
1235         }
1236
1237         if (ret != 0)
1238                 return ret;
1239
1240         ret = xpc_setup_partitions();
1241         if (ret != 0) {
1242                 dev_err(xpc_part, "can't get memory for partition structure\n");
1243                 goto out_1;
1244         }
1245
1246         xpc_sysctl = register_sysctl_table(xpc_sys_dir);
1247
1248         /*
1249          * Fill the partition reserved page with the information needed by
1250          * other partitions to discover we are alive and establish initial
1251          * communications.
1252          */
1253         ret = xpc_setup_rsvd_page();
1254         if (ret != 0) {
1255                 dev_err(xpc_part, "can't setup our reserved page\n");
1256                 goto out_2;
1257         }
1258
1259         /* add ourselves to the reboot_notifier_list */
1260         ret = register_reboot_notifier(&xpc_reboot_notifier);
1261         if (ret != 0)
1262                 dev_warn(xpc_part, "can't register reboot notifier\n");
1263
1264         /* add ourselves to the die_notifier list */
1265         ret = register_die_notifier(&xpc_die_notifier);
1266         if (ret != 0)
1267                 dev_warn(xpc_part, "can't register die notifier\n");
1268
1269         /*
1270          * The real work-horse behind xpc.  This processes incoming
1271          * interrupts and monitors remote heartbeats.
1272          */
1273         kthread = kthread_run(xpc_hb_checker, NULL, XPC_HB_CHECK_THREAD_NAME);
1274         if (IS_ERR(kthread)) {
1275                 dev_err(xpc_part, "failed while forking hb check thread\n");
1276                 ret = -EBUSY;
1277                 goto out_3;
1278         }
1279
1280         /*
1281          * Startup a thread that will attempt to discover other partitions to
1282          * activate based on info provided by SAL. This new thread is short
1283          * lived and will exit once discovery is complete.
1284          */
1285         kthread = kthread_run(xpc_initiate_discovery, NULL,
1286                               XPC_DISCOVERY_THREAD_NAME);
1287         if (IS_ERR(kthread)) {
1288                 dev_err(xpc_part, "failed while forking discovery thread\n");
1289
1290                 /* mark this new thread as a non-starter */
1291                 complete(&xpc_discovery_exited);
1292
1293                 xpc_do_exit(xpUnloading);
1294                 return -EBUSY;
1295         }
1296
1297         /* set the interface to point at XPC's functions */
1298         xpc_set_interface(xpc_initiate_connect, xpc_initiate_disconnect,
1299                           xpc_initiate_send, xpc_initiate_send_notify,
1300                           xpc_initiate_received, xpc_initiate_partid_to_nasids);
1301
1302         return 0;
1303
1304         /* initialization was not successful */
1305 out_3:
1306         xpc_teardown_rsvd_page();
1307
1308         (void)unregister_die_notifier(&xpc_die_notifier);
1309         (void)unregister_reboot_notifier(&xpc_reboot_notifier);
1310 out_2:
1311         if (xpc_sysctl)
1312                 unregister_sysctl_table(xpc_sysctl);
1313
1314         xpc_teardown_partitions();
1315 out_1:
1316         if (is_shub())
1317                 xpc_exit_sn2();
1318         else if (is_uv())
1319                 xpc_exit_uv();
1320         return ret;
1321 }
1322
1323 module_init(xpc_init);
1324
1325 void __exit
1326 xpc_exit(void)
1327 {
1328         xpc_do_exit(xpUnloading);
1329 }
1330
1331 module_exit(xpc_exit);
1332
/* Module identification: author, one-line description and license. */
1333 MODULE_AUTHOR("Silicon Graphics, Inc.");
1334 MODULE_DESCRIPTION("Cross Partition Communication (XPC) support");
1335 MODULE_LICENSE("GPL");
1336
/*
 * Module parameters. All four are ints and are registered with a
 * permission argument of 0.
 */

/* heartbeat increment period, in seconds */
1337 module_param(xpc_hb_interval, int, 0);
1338 MODULE_PARM_DESC(xpc_hb_interval, "Number of seconds between "
1339                  "heartbeat increments.");
1340
/* heartbeat check period, in seconds */
1341 module_param(xpc_hb_check_interval, int, 0);
1342 MODULE_PARM_DESC(xpc_hb_check_interval, "Number of seconds between "
1343                  "heartbeat checks.");
1344
/* how long to wait for a disengage to complete, in seconds */
1345 module_param(xpc_disengage_timelimit, int, 0);
1346 MODULE_PARM_DESC(xpc_disengage_timelimit, "Number of seconds to wait "
1347                  "for disengage to complete.");
1348
/* non-zero: heartbeat is taken offline while in kdebug (see xpc_system_die) */
1349 module_param(xpc_kdebug_ignore, int, 0);
1350 MODULE_PARM_DESC(xpc_kdebug_ignore, "Should lack of heartbeat be ignored by "
1351                  "other partitions when dropping into kdebug.");