[PATCH] knfsd: move tempsock aging to a timer
author Greg Banks <gnb@melbourne.sgi.com>
Mon, 2 Oct 2006 09:17:54 +0000 (02:17 -0700)
committer Linus Torvalds <torvalds@g5.osdl.org>
Mon, 2 Oct 2006 14:57:19 +0000 (07:57 -0700)
Following are 11 patches from Greg Banks which combine to make knfsd more
Numa-aware.  They reduce hitting on 'global' data structures, and create some
data-structures that can be node-local.

knfsd threads are bound to a particular node, and the thread to handle a new
request is chosen from the threads that are attached to the node that received
the interrupt.

The distribution of threads across nodes can be controlled by a new file in
the 'nfsd' filesystem, though the default approach of an even spread is
probably fine for most sites.

Some (old) numbers that show the efficacy of these patches: N == number of
NICs == number of CPUs == number of clients.  Number of NUMA nodes == N/2

N     Throughput, MiB/s     CPU usage, % (max=N*100)
      Before    After       Before    After
---   ------    -----       ------    -----
4     312       435         350       228
6     500       656         501       418
8     562       804         690       589

This patch:

Move the aging of RPC/TCP connection sockets from the main svc_recv() loop to
a timer which uses a mark-and-sweep algorithm every 6 minutes.  This reduces
the amount of work that needs to be done in the main RPC loop and the length
of time we need to hold the (effectively global) svc_serv->sv_lock.

[akpm@osdl.org: cleanup]
Signed-off-by: Greg Banks <gnb@melbourne.sgi.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
include/linux/sunrpc/svc.h
include/linux/sunrpc/svcsock.h
net/sunrpc/svc.c
net/sunrpc/svcsock.c

index cb341f9..5eabded 100644 (file)
@@ -40,6 +40,7 @@ struct svc_serv {
        struct list_head        sv_permsocks;   /* all permanent sockets */
        struct list_head        sv_tempsocks;   /* all temporary sockets */
        int                     sv_tmpcnt;      /* count of temporary sockets */
+       struct timer_list       sv_temptimer;   /* timer for aging temporary sockets */
 
        char *                  sv_name;        /* service name */
 
index d5f15e8..846aee9 100644 (file)
@@ -31,6 +31,8 @@ struct svc_sock {
 #define        SK_DEAD         6                       /* socket closed */
 #define        SK_CHNGBUF      7                       /* need to change snd/rcv buffer sizes */
 #define        SK_DEFERRED     8                       /* request on sk_deferred */
+#define        SK_OLD          9                       /* used for temp socket aging mark+sweep */
+#define        SK_DETACHED     10                      /* detached from tempsocks list */
 
        int                     sk_reserved;    /* space on outq that is reserved */
 
index eee45a5..0c2c522 100644 (file)
@@ -59,6 +59,7 @@ svc_create(struct svc_program *prog, unsigned int bufsize,
        INIT_LIST_HEAD(&serv->sv_sockets);
        INIT_LIST_HEAD(&serv->sv_tempsocks);
        INIT_LIST_HEAD(&serv->sv_permsocks);
+       init_timer(&serv->sv_temptimer);
        spin_lock_init(&serv->sv_lock);
 
        /* Remove any stale portmap registrations */
@@ -87,6 +88,8 @@ svc_destroy(struct svc_serv *serv)
        } else
                printk("svc_destroy: no threads for serv=%p!\n", serv);
 
+       del_timer_sync(&serv->sv_temptimer);
+
        while (!list_empty(&serv->sv_tempsocks)) {
                svsk = list_entry(serv->sv_tempsocks.next,
                                  struct svc_sock,
index bc9bd18..9ba1a07 100644 (file)
@@ -74,6 +74,13 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_sock *svsk);
 static int svc_deferred_recv(struct svc_rqst *rqstp);
 static struct cache_deferred_req *svc_defer(struct cache_req *req);
 
+/* apparently the "standard" is that clients close
+ * idle connections after 5 minutes, servers after
+ * 6 minutes
+ *   http://www.connectathon.org/talks96/nfstcp.pdf
+ */
+static int svc_conn_age_period = 6*60;
+
 /*
  * Queue up an idle server thread.  Must have serv->sv_lock held.
  * Note: this is really a stack rather than a queue, so that we only
@@ -1220,24 +1227,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout)
                return -EINTR;
 
        spin_lock_bh(&serv->sv_lock);
-       if (!list_empty(&serv->sv_tempsocks)) {
-               svsk = list_entry(serv->sv_tempsocks.next,
-                                 struct svc_sock, sk_list);
-               /* apparently the "standard" is that clients close
-                * idle connections after 5 minutes, servers after
-                * 6 minutes
-                *   http://www.connectathon.org/talks96/nfstcp.pdf 
-                */
-               if (get_seconds() - svsk->sk_lastrecv < 6*60
-                   || test_bit(SK_BUSY, &svsk->sk_flags))
-                       svsk = NULL;
-       }
-       if (svsk) {
-               set_bit(SK_BUSY, &svsk->sk_flags);
-               set_bit(SK_CLOSE, &svsk->sk_flags);
-               rqstp->rq_sock = svsk;
-               svsk->sk_inuse++;
-       } else if ((svsk = svc_sock_dequeue(serv)) != NULL) {
+       if ((svsk = svc_sock_dequeue(serv)) != NULL) {
                rqstp->rq_sock = svsk;
                svsk->sk_inuse++;
                rqstp->rq_reserved = serv->sv_bufsz;    
@@ -1282,13 +1272,7 @@ svc_recv(struct svc_rqst *rqstp, long timeout)
                return -EAGAIN;
        }
        svsk->sk_lastrecv = get_seconds();
-       if (test_bit(SK_TEMP, &svsk->sk_flags)) {
-               /* push active sockets to end of list */
-               spin_lock_bh(&serv->sv_lock);
-               if (!list_empty(&svsk->sk_list))
-                       list_move_tail(&svsk->sk_list, &serv->sv_tempsocks);
-               spin_unlock_bh(&serv->sv_lock);
-       }
+       clear_bit(SK_OLD, &svsk->sk_flags);
 
        rqstp->rq_secure  = ntohs(rqstp->rq_addr.sin_port) < 1024;
        rqstp->rq_chandle.defer = svc_defer;
@@ -1348,6 +1332,58 @@ svc_send(struct svc_rqst *rqstp)
 }
 
 /*
+ * Timer function to close old temporary sockets, using
+ * a mark-and-sweep algorithm.
+ */
+static void
+svc_age_temp_sockets(unsigned long closure)
+{
+       struct svc_serv *serv = (struct svc_serv *)closure;
+       struct svc_sock *svsk;
+       struct list_head *le, *next;
+       LIST_HEAD(to_be_aged);
+
+       dprintk("svc_age_temp_sockets\n");
+
+       if (!spin_trylock_bh(&serv->sv_lock)) {
+               /* busy, try again 1 sec later */
+               dprintk("svc_age_temp_sockets: busy\n");
+               mod_timer(&serv->sv_temptimer, jiffies + HZ);
+               return;
+       }
+
+       list_for_each_safe(le, next, &serv->sv_tempsocks) {
+               svsk = list_entry(le, struct svc_sock, sk_list);
+
+               if (!test_and_set_bit(SK_OLD, &svsk->sk_flags))
+                       continue;
+               if (svsk->sk_inuse || test_bit(SK_BUSY, &svsk->sk_flags))
+                       continue;
+               svsk->sk_inuse++;
+               list_move(le, &to_be_aged);
+               set_bit(SK_CLOSE, &svsk->sk_flags);
+               set_bit(SK_DETACHED, &svsk->sk_flags);
+       }
+       spin_unlock_bh(&serv->sv_lock);
+
+       while (!list_empty(&to_be_aged)) {
+               le = to_be_aged.next;
+               /* fiddling the sk_list node is safe 'cos we're SK_DETACHED */
+               list_del_init(le);
+               svsk = list_entry(le, struct svc_sock, sk_list);
+
+               dprintk("queuing svsk %p for closing, %lu seconds old\n",
+                       svsk, get_seconds() - svsk->sk_lastrecv);
+
+               /* a thread will dequeue and close it soon */
+               svc_sock_enqueue(svsk);
+               svc_sock_put(svsk);
+       }
+
+       mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
+}
+
+/*
  * Initialize socket for RPC use and create svc_sock struct
  * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
  */
@@ -1400,6 +1436,13 @@ svc_setup_socket(struct svc_serv *serv, struct socket *sock,
                set_bit(SK_TEMP, &svsk->sk_flags);
                list_add(&svsk->sk_list, &serv->sv_tempsocks);
                serv->sv_tmpcnt++;
+               if (serv->sv_temptimer.function == NULL) {
+                       /* setup timer to age temp sockets */
+                       setup_timer(&serv->sv_temptimer, svc_age_temp_sockets,
+                                       (unsigned long)serv);
+                       mod_timer(&serv->sv_temptimer,
+                                       jiffies + svc_conn_age_period * HZ);
+               }
        } else {
                clear_bit(SK_TEMP, &svsk->sk_flags);
                list_add(&svsk->sk_list, &serv->sv_permsocks);
@@ -1513,7 +1556,8 @@ svc_delete_socket(struct svc_sock *svsk)
 
        spin_lock_bh(&serv->sv_lock);
 
-       list_del_init(&svsk->sk_list);
+       if (!test_and_set_bit(SK_DETACHED, &svsk->sk_flags))
+               list_del_init(&svsk->sk_list);
        list_del_init(&svsk->sk_ready);
        if (!test_and_set_bit(SK_DEAD, &svsk->sk_flags))
                if (test_bit(SK_TEMP, &svsk->sk_flags))