X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fcore%2Fdst.c;h=57bc4d5b8d084c053cded6ebddfe66bf6255d9f5;hb=56baa667973e53d6d38af2ad3731d558566d818b;hp=836ec66069254752c7ca2b9db0a597419dd758a2;hpb=54e6ecb23951b195d02433a741c7f7cb0b796c78;p=safe%2Fjmp%2Flinux-2.6

diff --git a/net/core/dst.c b/net/core/dst.c
index 836ec66..57bc4d5 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -9,60 +9,83 @@
 #include <linux/errno.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/workqueue.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/netdevice.h>
-#include <linux/sched.h>
 #include <linux/skbuff.h>
 #include <linux/string.h>
 #include <linux/types.h>
+#include <net/net_namespace.h>
 
 #include <net/dst.h>
 
-/* Locking strategy:
- * 1) Garbage collection state of dead destination cache
- *    entries is protected by dst_lock.
- * 2) GC is run only from BH context, and is the only remover
- *    of entries.
- * 3) Entries are added to the garbage list from both BH
- *    and non-BH context, so local BH disabling is needed.
- * 4) All operations modify state, so a spinlock is used.
+/*
+ * Theory of operations:
+ * 1) We use a list, protected by a spinlock, to add
+ *    new entries from both BH and non-BH context.
+ * 2) In order to keep spinlock held for a small delay,
+ *    we use a second list where are stored long lived
+ *    entries, that are handled by the garbage collect thread
+ *    fired by a workqueue.
+ * 3) This list is guarded by a mutex,
+ *    so that the gc_task and dst_dev_event() can be synchronized.
  */
-static struct dst_entry 	*dst_garbage_list;
-#if RT_CACHE_DEBUG >= 2 
+#if RT_CACHE_DEBUG >= 2
 static atomic_t			 dst_total = ATOMIC_INIT(0);
 #endif
-static DEFINE_SPINLOCK(dst_lock);
 
-static unsigned long dst_gc_timer_expires;
-static unsigned long dst_gc_timer_inc = DST_GC_MAX;
-static void dst_run_gc(unsigned long);
+/*
+ * We want to keep lock & list close together
+ * to dirty as few cache lines as possible in __dst_free().
+ * As this is not a very strong hint, we dont force an alignment on SMP.
+ */
+static struct {
+	spinlock_t		lock;
+	struct dst_entry 	*list;
+	unsigned long		timer_inc;
+	unsigned long		timer_expires;
+} dst_garbage = {
+	.lock = __SPIN_LOCK_UNLOCKED(dst_garbage.lock),
+	.timer_inc = DST_GC_MAX,
+};
+static void dst_gc_task(struct work_struct *work);
 static void ___dst_free(struct dst_entry * dst);
 
-static DEFINE_TIMER(dst_gc_timer, dst_run_gc, DST_GC_MIN, 0);
+static DECLARE_DELAYED_WORK(dst_gc_work, dst_gc_task);
 
-static void dst_run_gc(unsigned long dummy)
+static DEFINE_MUTEX(dst_gc_mutex);
+/*
+ * long lived entries are maintained in this list, guarded by dst_gc_mutex
+ */
+static struct dst_entry         *dst_busy_list;
+
+static void dst_gc_task(struct work_struct *work)
 {
 	int    delayed = 0;
-	int    work_performed;
-	struct dst_entry * dst, **dstp;
+	int    work_performed = 0;
+	unsigned long expires = ~0L;
+	struct dst_entry *dst, *next, head;
+	struct dst_entry *last = &head;
+#if RT_CACHE_DEBUG >= 2
+	ktime_t time_start = ktime_get();
+	struct timespec elapsed;
+#endif
 
-	if (!spin_trylock(&dst_lock)) {
-		mod_timer(&dst_gc_timer, jiffies + HZ/10);
-		return;
-	}
+	mutex_lock(&dst_gc_mutex);
+	next = dst_busy_list;
 
-	del_timer(&dst_gc_timer);
-	dstp = &dst_garbage_list;
-	work_performed = 0;
-	while ((dst = *dstp) != NULL) {
-		if (atomic_read(&dst->__refcnt)) {
-			dstp = &dst->next;
+loop:
+	while ((dst = next) != NULL) {
+		next = dst->next;
+		prefetch(&next->next);
+		if (likely(atomic_read(&dst->__refcnt))) {
+			last->next = dst;
+			last = dst;
 			delayed++;
 			continue;
 		}
-		*dstp = dst->next;
-		work_performed = 1;
+		work_performed++;
 
 		dst = dst_destroy(dst);
 		if (dst) {
@@ -78,64 +101,82 @@ static void dst_run_gc(unsigned long dummy)
 				continue;
 
 			___dst_free(dst);
-			dst->next = *dstp;
-			*dstp = dst;
-			dstp = &dst->next;
+			dst->next = next;
+			next = dst;
 		}
 	}
-	if (!dst_garbage_list) {
-		dst_gc_timer_inc = DST_GC_MAX;
-		goto out;
+
+	spin_lock_bh(&dst_garbage.lock);
+	next = dst_garbage.list;
+	if (next) {
+		dst_garbage.list = NULL;
+		spin_unlock_bh(&dst_garbage.lock);
+		goto loop;
 	}
-	if (!work_performed) {
-		if ((dst_gc_timer_expires += dst_gc_timer_inc) > DST_GC_MAX)
-			dst_gc_timer_expires = DST_GC_MAX;
-		dst_gc_timer_inc += DST_GC_INC;
-	} else {
-		dst_gc_timer_inc = DST_GC_INC;
-		dst_gc_timer_expires = DST_GC_MIN;
+	last->next = NULL;
+	dst_busy_list = head.next;
+	if (!dst_busy_list)
+		dst_garbage.timer_inc = DST_GC_MAX;
+	else {
+		/*
+		 * if we freed less than 1/10 of delayed entries,
+		 * we can sleep longer.
+		 */
+		if (work_performed <= delayed/10) {
+			dst_garbage.timer_expires += dst_garbage.timer_inc;
+			if (dst_garbage.timer_expires > DST_GC_MAX)
+				dst_garbage.timer_expires = DST_GC_MAX;
+			dst_garbage.timer_inc += DST_GC_INC;
+		} else {
+			dst_garbage.timer_inc = DST_GC_INC;
+			dst_garbage.timer_expires = DST_GC_MIN;
+		}
+		expires = dst_garbage.timer_expires;
+		/*
+		 * if the next desired timer is more than 4 seconds in the future
+		 * then round the timer to whole seconds
+		 */
+		if (expires > 4*HZ)
+			expires = round_jiffies_relative(expires);
+		schedule_delayed_work(&dst_gc_work, expires);
 	}
+
+	spin_unlock_bh(&dst_garbage.lock);
+	mutex_unlock(&dst_gc_mutex);
 #if RT_CACHE_DEBUG >= 2
-	printk("dst_total: %d/%d %ld\n",
-	       atomic_read(&dst_total), delayed,  dst_gc_timer_expires);
+	elapsed = ktime_to_timespec(ktime_sub(ktime_get(), time_start));
+	printk(KERN_DEBUG "dst_total: %d delayed: %d work_perf: %d"
+		" expires: %lu elapsed: %lu us\n",
+		atomic_read(&dst_total), delayed, work_performed,
+		expires,
+		elapsed.tv_sec * USEC_PER_SEC + elapsed.tv_nsec / NSEC_PER_USEC);
 #endif
-	mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
-
-out:
-	spin_unlock(&dst_lock);
 }
 
-static int dst_discard_in(struct sk_buff *skb)
-{
-	kfree_skb(skb);
-	return 0;
-}
-
-static int dst_discard_out(struct sk_buff *skb)
+int dst_discard(struct sk_buff *skb)
 {
 	kfree_skb(skb);
 	return 0;
 }
+EXPORT_SYMBOL(dst_discard);
 
 void * dst_alloc(struct dst_ops * ops)
 {
 	struct dst_entry * dst;
 
 	if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) {
-		if (ops->gc())
+		if (ops->gc(ops))
 			return NULL;
 	}
-	dst = kmem_cache_alloc(ops->kmem_cachep, GFP_ATOMIC);
+	dst = kmem_cache_zalloc(ops->kmem_cachep, GFP_ATOMIC);
 	if (!dst)
 		return NULL;
-	memset(dst, 0, ops->entry_size);
 	atomic_set(&dst->__refcnt, 0);
 	dst->ops = ops;
 	dst->lastuse = jiffies;
 	dst->path = dst;
-	dst->input = dst_discard_in;
-	dst->output = dst_discard_out;
-#if RT_CACHE_DEBUG >= 2 
+	dst->input = dst->output = dst_discard;
+#if RT_CACHE_DEBUG >= 2
 	atomic_inc(&dst_total);
 #endif
 	atomic_inc(&ops->entries);
@@ -148,24 +189,24 @@ static void ___dst_free(struct dst_entry * dst)
 	   protocol module is unloaded.
 	 */
 	if (dst->dev == NULL || !(dst->dev->flags&IFF_UP)) {
-		dst->input = dst_discard_in;
-		dst->output = dst_discard_out;
+		dst->input = dst->output = dst_discard;
 	}
 	dst->obsolete = 2;
 }
 
 void __dst_free(struct dst_entry * dst)
 {
-	spin_lock_bh(&dst_lock);
+	spin_lock_bh(&dst_garbage.lock);
 	___dst_free(dst);
-	dst->next = dst_garbage_list;
-	dst_garbage_list = dst;
-	if (dst_gc_timer_inc > DST_GC_INC) {
-		dst_gc_timer_inc = DST_GC_INC;
-		dst_gc_timer_expires = DST_GC_MIN;
-		mod_timer(&dst_gc_timer, jiffies + dst_gc_timer_expires);
+	dst->next = dst_garbage.list;
+	dst_garbage.list = dst;
+	if (dst_garbage.timer_inc > DST_GC_INC) {
+		dst_garbage.timer_inc = DST_GC_INC;
+		dst_garbage.timer_expires = DST_GC_MIN;
+		cancel_delayed_work(&dst_gc_work);
+		schedule_delayed_work(&dst_gc_work, dst_garbage.timer_expires);
 	}
-	spin_unlock_bh(&dst_lock);
+	spin_unlock_bh(&dst_garbage.lock);
 }
 
 struct dst_entry *dst_destroy(struct dst_entry * dst)
@@ -196,7 +237,7 @@ again:
 		dst->ops->destroy(dst);
 	if (dst->dev)
 		dev_put(dst->dev);
-#if RT_CACHE_DEBUG >= 2 
+#if RT_CACHE_DEBUG >= 2
 	atomic_dec(&dst_total);
 #endif
 	kmem_cache_free(dst->ops->kmem_cachep, dst);
@@ -219,6 +260,18 @@ again:
 	return NULL;
 }
 
+void dst_release(struct dst_entry *dst)
+{
+	if (dst) {
+               int newrefcnt;
+
+		smp_mb__before_atomic_dec();
+               newrefcnt = atomic_dec_return(&dst->__refcnt);
+               WARN_ON(newrefcnt < 0);
+	}
+}
+EXPORT_SYMBOL(dst_release);
+
 /* Dirty hack. We did it in 2.2 (in __dst_free),
  * we have _very_ good reasons not to repeat
  * this mistake in 2.3, but we have no choice
@@ -237,16 +290,15 @@ static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 		return;
 
 	if (!unregister) {
-		dst->input = dst_discard_in;
-		dst->output = dst_discard_out;
+		dst->input = dst->output = dst_discard;
 	} else {
-		dst->dev = &loopback_dev;
-		dev_hold(&loopback_dev);
+		dst->dev = dev_net(dst->dev)->loopback_dev;
+		dev_hold(dst->dev);
 		dev_put(dev);
 		if (dst->neighbour && dst->neighbour->dev == dev) {
-			dst->neighbour->dev = &loopback_dev;
+			dst->neighbour->dev = dst->dev;
+			dev_hold(dst->dev);
 			dev_put(dev);
-			dev_hold(&loopback_dev);
 		}
 	}
 }
@@ -254,16 +306,30 @@ static inline void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 static int dst_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
 {
 	struct net_device *dev = ptr;
-	struct dst_entry *dst;
+	struct dst_entry *dst, *last = NULL;
 
 	switch (event) {
 	case NETDEV_UNREGISTER:
 	case NETDEV_DOWN:
-		spin_lock_bh(&dst_lock);
-		for (dst = dst_garbage_list; dst; dst = dst->next) {
+		mutex_lock(&dst_gc_mutex);
+		for (dst = dst_busy_list; dst; dst = dst->next) {
+			last = dst;
+			dst_ifdown(dst, dev, event != NETDEV_DOWN);
+		}
+
+		spin_lock_bh(&dst_garbage.lock);
+		dst = dst_garbage.list;
+		dst_garbage.list = NULL;
+		spin_unlock_bh(&dst_garbage.lock);
+
+		if (last)
+			last->next = dst;
+		else
+			dst_busy_list = dst;
+		for (; dst; dst = dst->next) {
 			dst_ifdown(dst, dev, event != NETDEV_DOWN);
 		}
-		spin_unlock_bh(&dst_lock);
+		mutex_unlock(&dst_gc_mutex);
 		break;
 	}
 	return NOTIFY_DONE;