SGI Altix mmtimer: allow larger number of timers per node
authorDimitri Sivanich <sivanich@sgi.com>
Wed, 30 Apr 2008 07:53:35 +0000 (00:53 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Wed, 30 Apr 2008 15:29:41 +0000 (08:29 -0700)
The purpose of this patch to the SGI Altix specific mmtimer (posix timer)
driver is to allow a virtually infinite number of timers to be set per
node.

Timers will now be kept on a sorted per-node list and a single node-based
hardware comparator is used to trigger the next timer.

[akpm@linux-foundation.org: mark things static]
[akpm@linux-foundation.org: fix warning]
Signed-off-by: Dimitri Sivanich <sivanich@sgi.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
drivers/char/mmtimer.c

index e60a74c..d83db5d 100644 (file)
@@ -74,9 +74,8 @@ static const struct file_operations mmtimer_fops = {
  * We only have comparison registers RTC1-4 currently available per
  * node.  RTC0 is used by SAL.
  */
-#define NUM_COMPARATORS 3
 /* Check for an RTC interrupt pending */
-static int inline mmtimer_int_pending(int comparator)
+static int mmtimer_int_pending(int comparator)
 {
        if (HUB_L((unsigned long *)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED)) &
                        SH_EVENT_OCCURRED_RTC1_INT_MASK << comparator)
@@ -84,15 +83,16 @@ static int inline mmtimer_int_pending(int comparator)
        else
                return 0;
 }
+
 /* Clear the RTC interrupt pending bit */
-static void inline mmtimer_clr_int_pending(int comparator)
+static void mmtimer_clr_int_pending(int comparator)
 {
        HUB_S((u64 *)LOCAL_MMR_ADDR(SH_EVENT_OCCURRED_ALIAS),
                SH_EVENT_OCCURRED_RTC1_INT_MASK << comparator);
 }
 
 /* Setup timer on comparator RTC1 */
-static void inline mmtimer_setup_int_0(u64 expires)
+static void mmtimer_setup_int_0(int cpu, u64 expires)
 {
        u64 val;
 
@@ -106,7 +106,7 @@ static void inline mmtimer_setup_int_0(u64 expires)
        mmtimer_clr_int_pending(0);
 
        val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC1_INT_CONFIG_IDX_SHFT) |
-               ((u64)cpu_physical_id(smp_processor_id()) <<
+               ((u64)cpu_physical_id(cpu) <<
                        SH_RTC1_INT_CONFIG_PID_SHFT);
 
        /* Set configuration */
@@ -122,7 +122,7 @@ static void inline mmtimer_setup_int_0(u64 expires)
 }
 
 /* Setup timer on comparator RTC2 */
-static void inline mmtimer_setup_int_1(u64 expires)
+static void mmtimer_setup_int_1(int cpu, u64 expires)
 {
        u64 val;
 
@@ -133,7 +133,7 @@ static void inline mmtimer_setup_int_1(u64 expires)
        mmtimer_clr_int_pending(1);
 
        val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC2_INT_CONFIG_IDX_SHFT) |
-               ((u64)cpu_physical_id(smp_processor_id()) <<
+               ((u64)cpu_physical_id(cpu) <<
                        SH_RTC2_INT_CONFIG_PID_SHFT);
 
        HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC2_INT_CONFIG), val);
@@ -144,7 +144,7 @@ static void inline mmtimer_setup_int_1(u64 expires)
 }
 
 /* Setup timer on comparator RTC3 */
-static void inline mmtimer_setup_int_2(u64 expires)
+static void mmtimer_setup_int_2(int cpu, u64 expires)
 {
        u64 val;
 
@@ -155,7 +155,7 @@ static void inline mmtimer_setup_int_2(u64 expires)
        mmtimer_clr_int_pending(2);
 
        val = ((u64)SGI_MMTIMER_VECTOR << SH_RTC3_INT_CONFIG_IDX_SHFT) |
-               ((u64)cpu_physical_id(smp_processor_id()) <<
+               ((u64)cpu_physical_id(cpu) <<
                        SH_RTC3_INT_CONFIG_PID_SHFT);
 
        HUB_S((u64 *)LOCAL_MMR_ADDR(SH_RTC3_INT_CONFIG), val);
@@ -170,22 +170,22 @@ static void inline mmtimer_setup_int_2(u64 expires)
  * in order to insure that the setup succeeds in a deterministic time frame.
  * It will check if the interrupt setup succeeded.
  */
-static int inline mmtimer_setup(int comparator, unsigned long expires)
+static int mmtimer_setup(int cpu, int comparator, unsigned long expires)
 {
 
        switch (comparator) {
        case 0:
-               mmtimer_setup_int_0(expires);
+               mmtimer_setup_int_0(cpu, expires);
                break;
        case 1:
-               mmtimer_setup_int_1(expires);
+               mmtimer_setup_int_1(cpu, expires);
                break;
        case 2:
-               mmtimer_setup_int_2(expires);
+               mmtimer_setup_int_2(cpu, expires);
                break;
        }
        /* We might've missed our expiration time */
-       if (rtc_time() < expires)
+       if (rtc_time() <= expires)
                return 1;
 
        /*
@@ -195,7 +195,7 @@ static int inline mmtimer_setup(int comparator, unsigned long expires)
        return mmtimer_int_pending(comparator);
 }
 
-static int inline mmtimer_disable_int(long nasid, int comparator)
+static int mmtimer_disable_int(long nasid, int comparator)
 {
        switch (comparator) {
        case 0:
@@ -216,18 +216,124 @@ static int inline mmtimer_disable_int(long nasid, int comparator)
        return 0;
 }
 
-#define TIMER_OFF 0xbadcabLL
+#define COMPARATOR     1               /* The comparator to use */
 
-/* There is one of these for each comparator */
-typedef struct mmtimer {
-       spinlock_t lock ____cacheline_aligned;
+#define TIMER_OFF      0xbadcabLL      /* Timer is not setup */
+#define TIMER_SET      0               /* Comparator is set for this timer */
+
+/* There is one of these for each timer */
+struct mmtimer {
+       struct rb_node list;
        struct k_itimer *timer;
-       int i;
        int cpu;
+};
+
+struct mmtimer_node {
+       spinlock_t lock ____cacheline_aligned;
+       struct rb_root timer_head;
+       struct rb_node *next;
        struct tasklet_struct tasklet;
-} mmtimer_t;
+};
+static struct mmtimer_node *timers;
+
+
+/*
+ * Add a new mmtimer struct to the node's mmtimer list.
+ * This function assumes the struct mmtimer_node is locked.
+ */
+static void mmtimer_add_list(struct mmtimer *n)
+{
+       int nodeid = n->timer->it.mmtimer.node;
+       unsigned long expires = n->timer->it.mmtimer.expires;
+       struct rb_node **link = &timers[nodeid].timer_head.rb_node;
+       struct rb_node *parent = NULL;
+       struct mmtimer *x;
+
+       /*
+        * Find the right place in the rbtree:
+        */
+       while (*link) {
+               parent = *link;
+               x = rb_entry(parent, struct mmtimer, list);
+
+               if (expires < x->timer->it.mmtimer.expires)
+                       link = &(*link)->rb_left;
+               else
+                       link = &(*link)->rb_right;
+       }
+
+       /*
+        * Insert the timer to the rbtree and check whether it
+        * replaces the first pending timer
+        */
+       rb_link_node(&n->list, parent, link);
+       rb_insert_color(&n->list, &timers[nodeid].timer_head);
+
+       if (!timers[nodeid].next || expires < rb_entry(timers[nodeid].next,
+                       struct mmtimer, list)->timer->it.mmtimer.expires)
+               timers[nodeid].next = &n->list;
+}
+
+/*
+ * Set the comparator for the next timer.
+ * This function assumes the struct mmtimer_node is locked.
+ */
+static void mmtimer_set_next_timer(int nodeid)
+{
+       struct mmtimer_node *n = &timers[nodeid];
+       struct mmtimer *x;
+       struct k_itimer *t;
+       int o;
+
+restart:
+       if (n->next == NULL)
+               return;
 
-static mmtimer_t ** timers;
+       x = rb_entry(n->next, struct mmtimer, list);
+       t = x->timer;
+       if (!t->it.mmtimer.incr) {
+               /* Not an interval timer */
+               if (!mmtimer_setup(x->cpu, COMPARATOR,
+                                       t->it.mmtimer.expires)) {
+                       /* Late setup, fire now */
+                       tasklet_schedule(&n->tasklet);
+               }
+               return;
+       }
+
+       /* Interval timer */
+       o = 0;
+       while (!mmtimer_setup(x->cpu, COMPARATOR, t->it.mmtimer.expires)) {
+               unsigned long e, e1;
+               struct rb_node *next;
+               t->it.mmtimer.expires += t->it.mmtimer.incr << o;
+               t->it_overrun += 1 << o;
+               o++;
+               if (o > 20) {
+                       printk(KERN_ALERT "mmtimer: cannot reschedule timer\n");
+                       t->it.mmtimer.clock = TIMER_OFF;
+                       n->next = rb_next(&x->list);
+                       rb_erase(&x->list, &n->timer_head);
+                       kfree(x);
+                       goto restart;
+               }
+
+               e = t->it.mmtimer.expires;
+               next = rb_next(&x->list);
+
+               if (next == NULL)
+                       continue;
+
+               e1 = rb_entry(next, struct mmtimer, list)->
+                       timer->it.mmtimer.expires;
+               if (e > e1) {
+                       n->next = next;
+                       rb_erase(&x->list, &n->timer_head);
+                       mmtimer_add_list(x);
+                       goto restart;
+               }
+       }
+}
 
 /**
  * mmtimer_ioctl - ioctl interface for /dev/mmtimer
@@ -390,35 +496,6 @@ static int sgi_clock_set(clockid_t clockid, struct timespec *tp)
        return 0;
 }
 
-/*
- * Schedule the next periodic interrupt. This function will attempt
- * to schedule a periodic interrupt later if necessary. If the scheduling
- * of an interrupt fails then the time to skip is lengthened
- * exponentially in order to ensure that the next interrupt
- * can be properly scheduled..
- */
-static int inline reschedule_periodic_timer(mmtimer_t *x)
-{
-       int n;
-       struct k_itimer *t = x->timer;
-
-       t->it.mmtimer.clock = x->i;
-       t->it_overrun--;
-
-       n = 0;
-       do {
-
-               t->it.mmtimer.expires += t->it.mmtimer.incr << n;
-               t->it_overrun += 1 << n;
-               n++;
-               if (n > 20)
-                       return 1;
-
-       } while (!mmtimer_setup(x->i, t->it.mmtimer.expires));
-
-       return 0;
-}
-
 /**
  * mmtimer_interrupt - timer interrupt handler
  * @irq: irq received
@@ -435,71 +512,75 @@ static int inline reschedule_periodic_timer(mmtimer_t *x)
 static irqreturn_t
 mmtimer_interrupt(int irq, void *dev_id)
 {
-       int i;
        unsigned long expires = 0;
        int result = IRQ_NONE;
        unsigned indx = cpu_to_node(smp_processor_id());
+       struct mmtimer *base;
 
-       /*
-        * Do this once for each comparison register
-        */
-       for (i = 0; i < NUM_COMPARATORS; i++) {
-               mmtimer_t *base = timers[indx] + i;
-               /* Make sure this doesn't get reused before tasklet_sched */
-               spin_lock(&base->lock);
-               if (base->cpu == smp_processor_id()) {
-                       if (base->timer)
-                               expires = base->timer->it.mmtimer.expires;
-                       /* expires test won't work with shared irqs */
-                       if ((mmtimer_int_pending(i) > 0) ||
-                               (expires && (expires < rtc_time()))) {
-                               mmtimer_clr_int_pending(i);
-                               tasklet_schedule(&base->tasklet);
-                               result = IRQ_HANDLED;
-                       }
+       spin_lock(&timers[indx].lock);
+       base = rb_entry(timers[indx].next, struct mmtimer, list);
+       if (base == NULL) {
+               spin_unlock(&timers[indx].lock);
+               return result;
+       }
+
+       if (base->cpu == smp_processor_id()) {
+               if (base->timer)
+                       expires = base->timer->it.mmtimer.expires;
+               /* expires test won't work with shared irqs */
+               if ((mmtimer_int_pending(COMPARATOR) > 0) ||
+                       (expires && (expires <= rtc_time()))) {
+                       mmtimer_clr_int_pending(COMPARATOR);
+                       tasklet_schedule(&timers[indx].tasklet);
+                       result = IRQ_HANDLED;
                }
-               spin_unlock(&base->lock);
-               expires = 0;
        }
+       spin_unlock(&timers[indx].lock);
        return result;
 }
 
-void mmtimer_tasklet(unsigned long data) {
-       mmtimer_t *x = (mmtimer_t *)data;
-       struct k_itimer *t = x->timer;
+static void mmtimer_tasklet(unsigned long data)
+{
+       int nodeid = data;
+       struct mmtimer_node *mn = &timers[nodeid];
+       struct mmtimer *x = rb_entry(mn->next, struct mmtimer, list);
+       struct k_itimer *t;
        unsigned long flags;
 
-       if (t == NULL)
-               return;
-
        /* Send signal and deal with periodic signals */
-       spin_lock_irqsave(&t->it_lock, flags);
-       spin_lock(&x->lock);
-       /* If timer was deleted between interrupt and here, leave */
-       if (t != x->timer)
+       spin_lock_irqsave(&mn->lock, flags);
+       if (!mn->next)
                goto out;
-       t->it_overrun = 0;
 
-       if (posix_timer_event(t, 0) != 0) {
+       x = rb_entry(mn->next, struct mmtimer, list);
+       t = x->timer;
+
+       if (t->it.mmtimer.clock == TIMER_OFF)
+               goto out;
+
+       t->it_overrun = 0;
 
-               // printk(KERN_WARNING "mmtimer: cannot deliver signal.\n");
+       mn->next = rb_next(&x->list);
+       rb_erase(&x->list, &mn->timer_head);
 
+       if (posix_timer_event(t, 0) != 0)
                t->it_overrun++;
-       }
+
        if(t->it.mmtimer.incr) {
-               /* Periodic timer */
-               if (reschedule_periodic_timer(x)) {
-                       printk(KERN_WARNING "mmtimer: unable to reschedule\n");
-                       x->timer = NULL;
-               }
+               t->it.mmtimer.expires += t->it.mmtimer.incr;
+               mmtimer_add_list(x);
        } else {
                /* Ensure we don't false trigger in mmtimer_interrupt */
+               t->it.mmtimer.clock = TIMER_OFF;
                t->it.mmtimer.expires = 0;
+               kfree(x);
        }
+       /* Set comparator for next timer, if there is one */
+       mmtimer_set_next_timer(nodeid);
+
        t->it_overrun_last = t->it_overrun;
 out:
-       spin_unlock(&x->lock);
-       spin_unlock_irqrestore(&t->it_lock, flags);
+       spin_unlock_irqrestore(&mn->lock, flags);
 }
 
 static int sgi_timer_create(struct k_itimer *timer)
@@ -516,19 +597,50 @@ static int sgi_timer_create(struct k_itimer *timer)
  */
 static int sgi_timer_del(struct k_itimer *timr)
 {
-       int i = timr->it.mmtimer.clock;
        cnodeid_t nodeid = timr->it.mmtimer.node;
-       mmtimer_t *t = timers[nodeid] + i;
        unsigned long irqflags;
 
-       if (i != TIMER_OFF) {
-               spin_lock_irqsave(&t->lock, irqflags);
-               mmtimer_disable_int(cnodeid_to_nasid(nodeid),i);
-               t->timer = NULL;
+       spin_lock_irqsave(&timers[nodeid].lock, irqflags);
+       if (timr->it.mmtimer.clock != TIMER_OFF) {
+               unsigned long expires = timr->it.mmtimer.expires;
+               struct rb_node *n = timers[nodeid].timer_head.rb_node;
+               struct mmtimer *uninitialized_var(t);
+               int r = 0;
+
                timr->it.mmtimer.clock = TIMER_OFF;
                timr->it.mmtimer.expires = 0;
-               spin_unlock_irqrestore(&t->lock, irqflags);
+
+               while (n) {
+                       t = rb_entry(n, struct mmtimer, list);
+                       if (t->timer == timr)
+                               break;
+
+                       if (expires < t->timer->it.mmtimer.expires)
+                               n = n->rb_left;
+                       else
+                               n = n->rb_right;
+               }
+
+               if (!n) {
+                       spin_unlock_irqrestore(&timers[nodeid].lock, irqflags);
+                       return 0;
+               }
+
+               if (timers[nodeid].next == n) {
+                       timers[nodeid].next = rb_next(n);
+                       r = 1;
+               }
+
+               rb_erase(n, &timers[nodeid].timer_head);
+               kfree(t);
+
+               if (r) {
+                       mmtimer_disable_int(cnodeid_to_nasid(nodeid),
+                               COMPARATOR);
+                       mmtimer_set_next_timer(nodeid);
+               }
        }
+       spin_unlock_irqrestore(&timers[nodeid].lock, irqflags);
        return 0;
 }
 
@@ -557,12 +669,11 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
        struct itimerspec * new_setting,
        struct itimerspec * old_setting)
 {
-
-       int i;
        unsigned long when, period, irqflags;
        int err = 0;
        cnodeid_t nodeid;
-       mmtimer_t *base;
+       struct mmtimer *base;
+       struct rb_node *n;
 
        if (old_setting)
                sgi_timer_get(timr, old_setting);
@@ -575,6 +686,10 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
                /* Clear timer */
                return 0;
 
+       base = kmalloc(sizeof(struct mmtimer), GFP_KERNEL);
+       if (base == NULL)
+               return -ENOMEM;
+
        if (flags & TIMER_ABSTIME) {
                struct timespec n;
                unsigned long now;
@@ -604,47 +719,38 @@ static int sgi_timer_set(struct k_itimer *timr, int flags,
        preempt_disable();
 
        nodeid =  cpu_to_node(smp_processor_id());
-retry:
-       /* Don't use an allocated timer, or a deleted one that's pending */
-       for(i = 0; i< NUM_COMPARATORS; i++) {
-               base = timers[nodeid] + i;
-               if (!base->timer && !base->tasklet.state) {
-                       break;
-               }
-       }
-
-       if (i == NUM_COMPARATORS) {
-               preempt_enable();
-               return -EBUSY;
-       }
 
-       spin_lock_irqsave(&base->lock, irqflags);
+       /* Lock the node timer structure */
+       spin_lock_irqsave(&timers[nodeid].lock, irqflags);
 
-       if (base->timer || base->tasklet.state != 0) {
-               spin_unlock_irqrestore(&base->lock, irqflags);
-               goto retry;
-       }
        base->timer = timr;
        base->cpu = smp_processor_id();
 
-       timr->it.mmtimer.clock = i;
+       timr->it.mmtimer.clock = TIMER_SET;
        timr->it.mmtimer.node = nodeid;
        timr->it.mmtimer.incr = period;
        timr->it.mmtimer.expires = when;
 
-       if (period == 0) {
-               if (!mmtimer_setup(i, when)) {
-                       mmtimer_disable_int(-1, i);
-                       posix_timer_event(timr, 0);
-                       timr->it.mmtimer.expires = 0;
-               }
-       } else {
-               timr->it.mmtimer.expires -= period;
-               if (reschedule_periodic_timer(base))
-                       err = -EINVAL;
+       n = timers[nodeid].next;
+
+       /* Add the new struct mmtimer to node's timer list */
+       mmtimer_add_list(base);
+
+       if (timers[nodeid].next == n) {
+               /* No need to reprogram comparator for now */
+               spin_unlock_irqrestore(&timers[nodeid].lock, irqflags);
+               preempt_enable();
+               return err;
        }
 
-       spin_unlock_irqrestore(&base->lock, irqflags);
+       /* We need to reprogram the comparator */
+       if (n)
+               mmtimer_disable_int(cnodeid_to_nasid(nodeid), COMPARATOR);
+
+       mmtimer_set_next_timer(nodeid);
+
+       /* Unlock the node timer structure */
+       spin_unlock_irqrestore(&timers[nodeid].lock, irqflags);
 
        preempt_enable();
 
@@ -669,7 +775,6 @@ static struct k_clock sgi_clock = {
  */
 static int __init mmtimer_init(void)
 {
-       unsigned i;
        cnodeid_t node, maxn = -1;
 
        if (!ia64_platform_is("sn2"))
@@ -706,31 +811,18 @@ static int __init mmtimer_init(void)
        maxn++;
 
        /* Allocate list of node ptrs to mmtimer_t's */
-       timers = kzalloc(sizeof(mmtimer_t *)*maxn, GFP_KERNEL);
+       timers = kzalloc(sizeof(struct mmtimer_node)*maxn, GFP_KERNEL);
        if (timers == NULL) {
                printk(KERN_ERR "%s: failed to allocate memory for device\n",
                                MMTIMER_NAME);
                goto out3;
        }
 
-       /* Allocate mmtimer_t's for each online node */
+       /* Initialize struct mmtimer's for each online node */
        for_each_online_node(node) {
-               timers[node] = kmalloc_node(sizeof(mmtimer_t)*NUM_COMPARATORS, GFP_KERNEL, node);
-               if (timers[node] == NULL) {
-                       printk(KERN_ERR "%s: failed to allocate memory for device\n",
-                               MMTIMER_NAME);
-                       goto out4;
-               }
-               for (i=0; i< NUM_COMPARATORS; i++) {
-                       mmtimer_t * base = timers[node] + i;
-
-                       spin_lock_init(&base->lock);
-                       base->timer = NULL;
-                       base->cpu = 0;
-                       base->i = i;
-                       tasklet_init(&base->tasklet, mmtimer_tasklet,
-                               (unsigned long) (base));
-               }
+               spin_lock_init(&timers[node].lock);
+               tasklet_init(&timers[node].tasklet, mmtimer_tasklet,
+                       (unsigned long) node);
        }
 
        sgi_clock_period = sgi_clock.res = NSEC_PER_SEC / sn_rtc_cycles_per_second;
@@ -741,11 +833,8 @@ static int __init mmtimer_init(void)
 
        return 0;
 
-out4:
-       for_each_online_node(node) {
-               kfree(timers[node]);
-       }
 out3:
+       kfree(timers);
        misc_deregister(&mmtimer_miscdev);
 out2:
        free_irq(SGI_MMTIMER_VECTOR, NULL);
@@ -754,4 +843,3 @@ out1:
 }
 
 module_init(mmtimer_init);
-