[GFS2] Add nanosecond timestamp feature
[safe/jmp/linux-2.6] / net / xfrm / xfrm_state.c
index 535d43c..dfacb9c 100644 (file)
 #include <linux/pfkeyv2.h>
 #include <linux/ipsec.h>
 #include <linux/module.h>
-#include <linux/bootmem.h>
-#include <linux/vmalloc.h>
 #include <linux/cache.h>
 #include <asm/uaccess.h>
+#include <linux/audit.h>
+#include <linux/cache.h>
+
+#include "xfrm_hash.h"
 
 struct sock *xfrm_nl;
 EXPORT_SYMBOL(xfrm_nl);
 
-u32 sysctl_xfrm_aevent_etime = XFRM_AE_ETIME;
+u32 sysctl_xfrm_aevent_etime __read_mostly = XFRM_AE_ETIME;
 EXPORT_SYMBOL(sysctl_xfrm_aevent_etime);
 
-u32 sysctl_xfrm_aevent_rseqth = XFRM_AE_SEQT_SIZE;
+u32 sysctl_xfrm_aevent_rseqth __read_mostly = XFRM_AE_SEQT_SIZE;
 EXPORT_SYMBOL(sysctl_xfrm_aevent_rseqth);
 
+u32 sysctl_xfrm_acq_expires __read_mostly = 30;
+
 /* Each xfrm_state may be linked to two tables:
 
    1. Hash table by (spi,daddr,ah/esp) to find SA by SPI. (input,ctl)
-   2. Hash table by daddr to find what SAs exist for given
+   2. Hash table by (daddr,family,reqid) to find what SAs exist for given
       destination/tunnel endpoint. (output)
  */
 
@@ -55,127 +59,25 @@ static unsigned int xfrm_state_hashmax __read_mostly = 1 * 1024 * 1024;
 static unsigned int xfrm_state_num;
 static unsigned int xfrm_state_genid;
 
-static inline unsigned int __xfrm4_dst_hash(xfrm_address_t *addr, unsigned int hmask)
-{
-       unsigned int h;
-       h = ntohl(addr->a4);
-       h = (h ^ (h>>16)) & hmask;
-       return h;
-}
-
-static inline unsigned int __xfrm6_dst_hash(xfrm_address_t *addr, unsigned int hmask)
-{
-       unsigned int h;
-       h = ntohl(addr->a6[2]^addr->a6[3]);
-       h = (h ^ (h>>16)) & hmask;
-       return h;
-}
-
-static inline unsigned int __xfrm4_src_hash(xfrm_address_t *addr, unsigned int hmask)
-{
-       return __xfrm4_dst_hash(addr, hmask);
-}
-
-static inline unsigned int __xfrm6_src_hash(xfrm_address_t *addr, unsigned int hmask)
-{
-       return __xfrm6_dst_hash(addr, hmask);
-}
-
-static inline unsigned __xfrm_src_hash(xfrm_address_t *addr, unsigned short family,  unsigned int hmask)
-{
-       switch (family) {
-       case AF_INET:
-               return __xfrm4_src_hash(addr, hmask);
-       case AF_INET6:
-               return __xfrm6_src_hash(addr, hmask);
-       }
-       return 0;
-}
-
-static inline unsigned xfrm_src_hash(xfrm_address_t *addr, unsigned short family)
-{
-       return __xfrm_src_hash(addr, family, xfrm_state_hmask);
-}
-
-static inline unsigned int __xfrm_dst_hash(xfrm_address_t *addr, unsigned short family, unsigned int hmask)
-{
-       switch (family) {
-       case AF_INET:
-               return __xfrm4_dst_hash(addr, hmask);
-       case AF_INET6:
-               return __xfrm6_dst_hash(addr, hmask);
-       }
-       return 0;
-}
-
-static inline unsigned int xfrm_dst_hash(xfrm_address_t *addr, unsigned short family)
-{
-       return __xfrm_dst_hash(addr, family, xfrm_state_hmask);
-}
-
-static inline unsigned int __xfrm4_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto,
-                                       unsigned int hmask)
-{
-       unsigned int h;
-       h = ntohl(addr->a4^spi^proto);
-       h = (h ^ (h>>10) ^ (h>>20)) & hmask;
-       return h;
-}
-
-static inline unsigned int __xfrm6_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto,
-                                           unsigned int hmask)
+static inline unsigned int xfrm_dst_hash(xfrm_address_t *daddr,
+                                        xfrm_address_t *saddr,
+                                        u32 reqid,
+                                        unsigned short family)
 {
-       unsigned int h;
-       h = ntohl(addr->a6[2]^addr->a6[3]^spi^proto);
-       h = (h ^ (h>>10) ^ (h>>20)) & hmask;
-       return h;
+       return __xfrm_dst_hash(daddr, saddr, reqid, family, xfrm_state_hmask);
 }
 
-static inline
-unsigned __xfrm_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto, unsigned short family,
-                        unsigned int hmask)
+static inline unsigned int xfrm_src_hash(xfrm_address_t *daddr,
+                                        xfrm_address_t *saddr,
+                                        unsigned short family)
 {
-       switch (family) {
-       case AF_INET:
-               return __xfrm4_spi_hash(addr, spi, proto, hmask);
-       case AF_INET6:
-               return __xfrm6_spi_hash(addr, spi, proto, hmask);
-       }
-       return 0;       /*XXX*/
+       return __xfrm_src_hash(daddr, saddr, family, xfrm_state_hmask);
 }
 
 static inline unsigned int
-xfrm_spi_hash(xfrm_address_t *addr, u32 spi, u8 proto, unsigned short family)
-{
-       return __xfrm_spi_hash(addr, spi, proto, family, xfrm_state_hmask);
-}
-
-static struct hlist_head *xfrm_state_hash_alloc(unsigned int sz)
-{
-       struct hlist_head *n;
-
-       if (sz <= PAGE_SIZE)
-               n = kmalloc(sz, GFP_KERNEL);
-       else if (hashdist)
-               n = __vmalloc(sz, GFP_KERNEL, PAGE_KERNEL);
-       else
-               n = (struct hlist_head *)
-                       __get_free_pages(GFP_KERNEL, get_order(sz));
-
-       if (n)
-               memset(n, 0, sz);
-
-       return n;
-}
-
-static void xfrm_state_hash_free(struct hlist_head *n, unsigned int sz)
+xfrm_spi_hash(xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family)
 {
-       if (sz <= PAGE_SIZE)
-               kfree(n);
-       else if (hashdist)
-               vfree(n);
-       else
-               free_pages((unsigned long)n, get_order(sz));
+       return __xfrm_spi_hash(daddr, spi, proto, family, xfrm_state_hmask);
 }
 
 static void xfrm_hash_transfer(struct hlist_head *list,
@@ -190,16 +92,22 @@ static void xfrm_hash_transfer(struct hlist_head *list,
        hlist_for_each_entry_safe(x, entry, tmp, list, bydst) {
                unsigned int h;
 
-               h = __xfrm_dst_hash(&x->id.daddr, x->props.family, nhashmask);
+               h = __xfrm_dst_hash(&x->id.daddr, &x->props.saddr,
+                                   x->props.reqid, x->props.family,
+                                   nhashmask);
                hlist_add_head(&x->bydst, ndsttable+h);
 
-               h = __xfrm_src_hash(&x->props.saddr, x->props.family,
+               h = __xfrm_src_hash(&x->id.daddr, &x->props.saddr,
+                                   x->props.family,
                                    nhashmask);
                hlist_add_head(&x->bysrc, nsrctable+h);
 
-               h = __xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto,
-                                   x->props.family, nhashmask);
-               hlist_add_head(&x->byspi, nspitable+h);
+               if (x->id.spi) {
+                       h = __xfrm_spi_hash(&x->id.daddr, x->id.spi,
+                                           x->id.proto, x->props.family,
+                                           nhashmask);
+                       hlist_add_head(&x->byspi, nspitable+h);
+               }
        }
 }
 
@@ -211,7 +119,7 @@ static unsigned long xfrm_hash_new_size(void)
 
 static DEFINE_MUTEX(hash_resize_mutex);
 
-static void xfrm_hash_resize(void *__unused)
+static void xfrm_hash_resize(struct work_struct *__unused)
 {
        struct hlist_head *ndst, *nsrc, *nspi, *odst, *osrc, *ospi;
        unsigned long nsize, osize;
@@ -221,18 +129,18 @@ static void xfrm_hash_resize(void *__unused)
        mutex_lock(&hash_resize_mutex);
 
        nsize = xfrm_hash_new_size();
-       ndst = xfrm_state_hash_alloc(nsize);
+       ndst = xfrm_hash_alloc(nsize);
        if (!ndst)
                goto out_unlock;
-       nsrc = xfrm_state_hash_alloc(nsize);
+       nsrc = xfrm_hash_alloc(nsize);
        if (!nsrc) {
-               xfrm_state_hash_free(ndst, nsize);
+               xfrm_hash_free(ndst, nsize);
                goto out_unlock;
        }
-       nspi = xfrm_state_hash_alloc(nsize);
+       nspi = xfrm_hash_alloc(nsize);
        if (!nspi) {
-               xfrm_state_hash_free(ndst, nsize);
-               xfrm_state_hash_free(nsrc, nsize);
+               xfrm_hash_free(ndst, nsize);
+               xfrm_hash_free(nsrc, nsize);
                goto out_unlock;
        }
 
@@ -256,15 +164,15 @@ static void xfrm_hash_resize(void *__unused)
        spin_unlock_bh(&xfrm_state_lock);
 
        osize = (ohashmask + 1) * sizeof(struct hlist_head);
-       xfrm_state_hash_free(odst, osize);
-       xfrm_state_hash_free(osrc, osize);
-       xfrm_state_hash_free(ospi, osize);
+       xfrm_hash_free(odst, osize);
+       xfrm_hash_free(osrc, osize);
+       xfrm_hash_free(ospi, osize);
 
 out_unlock:
        mutex_unlock(&hash_resize_mutex);
 }
 
-static DECLARE_WORK(xfrm_hash_work, xfrm_hash_resize, NULL);
+static DECLARE_WORK(xfrm_hash_work, xfrm_hash_resize);
 
 DECLARE_WAIT_QUEUE_HEAD(km_waitq);
 EXPORT_SYMBOL(km_waitq);
@@ -276,22 +184,15 @@ static struct work_struct xfrm_state_gc_work;
 static HLIST_HEAD(xfrm_state_gc_list);
 static DEFINE_SPINLOCK(xfrm_state_gc_lock);
 
-static int xfrm_state_gc_flush_bundles;
-
 int __xfrm_state_delete(struct xfrm_state *x);
 
-static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family);
-static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo);
-
 int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol);
 void km_state_expired(struct xfrm_state *x, int hard, u32 pid);
 
 static void xfrm_state_gc_destroy(struct xfrm_state *x)
 {
-       if (del_timer(&x->timer))
-               BUG();
-       if (del_timer(&x->rtimer))
-               BUG();
+       del_timer_sync(&x->timer);
+       del_timer_sync(&x->rtimer);
        kfree(x->aalg);
        kfree(x->ealg);
        kfree(x->calg);
@@ -307,17 +208,12 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x)
        kfree(x);
 }
 
-static void xfrm_state_gc_task(void *data)
+static void xfrm_state_gc_task(struct work_struct *data)
 {
        struct xfrm_state *x;
        struct hlist_node *entry, *tmp;
        struct hlist_head gc_list;
 
-       if (xfrm_state_gc_flush_bundles) {
-               xfrm_state_gc_flush_bundles = 0;
-               xfrm_flush_bundles();
-       }
-
        spin_lock_bh(&xfrm_state_gc_lock);
        gc_list.first = xfrm_state_gc_list.first;
        INIT_HLIST_HEAD(&xfrm_state_gc_list);
@@ -334,15 +230,16 @@ static inline unsigned long make_jiffies(long secs)
        if (secs >= (MAX_SCHEDULE_TIMEOUT-1)/HZ)
                return MAX_SCHEDULE_TIMEOUT-1;
        else
-               return secs*HZ;
+               return secs*HZ;
 }
 
 static void xfrm_timer_handler(unsigned long data)
 {
        struct xfrm_state *x = (struct xfrm_state*)data;
-       unsigned long now = (unsigned long)xtime.tv_sec;
+       unsigned long now = get_seconds();
        long next = LONG_MAX;
        int warn = 0;
+       int err = 0;
 
        spin_lock(&x->lock);
        if (x->km.state == XFRM_STATE_DEAD)
@@ -388,9 +285,9 @@ static void xfrm_timer_handler(unsigned long data)
        if (warn)
                km_state_expired(x, 0, 0);
 resched:
-       if (next != LONG_MAX &&
-           !mod_timer(&x->timer, jiffies + make_jiffies(next)))
-               xfrm_state_hold(x);
+       if (next != LONG_MAX)
+               mod_timer(&x->timer, jiffies + make_jiffies(next));
+
        goto out;
 
 expired:
@@ -400,12 +297,16 @@ expired:
                next = 2;
                goto resched;
        }
-       if (!__xfrm_state_delete(x) && x->id.spi)
+
+       err = __xfrm_state_delete(x);
+       if (!err && x->id.spi)
                km_state_expired(x, 1, 0);
 
+       xfrm_audit_log(audit_get_loginuid(current->audit_context), 0,
+                      AUDIT_MAC_IPSEC_DELSA, err ? 0 : 1, NULL, x);
+
 out:
        spin_unlock(&x->lock);
-       xfrm_state_put(x);
 }
 
 static void xfrm_replay_timer_handler(unsigned long data);
@@ -428,7 +329,7 @@ struct xfrm_state *xfrm_state_alloc(void)
                init_timer(&x->rtimer);
                x->rtimer.function = xfrm_replay_timer_handler;
                x->rtimer.data     = (unsigned long)x;
-               x->curlft.add_time = (unsigned long)xtime.tv_sec;
+               x->curlft.add_time = get_seconds();
                x->lft.soft_byte_limit = XFRM_INF;
                x->lft.soft_packet_limit = XFRM_INF;
                x->lft.hard_byte_limit = XFRM_INF;
@@ -460,29 +361,11 @@ int __xfrm_state_delete(struct xfrm_state *x)
                x->km.state = XFRM_STATE_DEAD;
                spin_lock(&xfrm_state_lock);
                hlist_del(&x->bydst);
-               __xfrm_state_put(x);
                hlist_del(&x->bysrc);
-               __xfrm_state_put(x);
-               if (x->id.spi) {
+               if (x->id.spi)
                        hlist_del(&x->byspi);
-                       __xfrm_state_put(x);
-               }
                xfrm_state_num--;
                spin_unlock(&xfrm_state_lock);
-               if (del_timer(&x->timer))
-                       __xfrm_state_put(x);
-               if (del_timer(&x->rtimer))
-                       __xfrm_state_put(x);
-
-               /* The number two in this test is the reference
-                * mentioned in the comment below plus the reference
-                * our caller holds.  A larger value means that
-                * there are DSTs attached to this xfrm_state.
-                */
-               if (atomic_read(&x->refcnt) > 2) {
-                       xfrm_state_gc_flush_bundles = 1;
-                       schedule_work(&xfrm_state_gc_work);
-               }
 
                /* All xfrm_state objects are created by xfrm_state_alloc.
                 * The xfrm_state_alloc call gives a reference, and that
@@ -508,12 +391,49 @@ int xfrm_state_delete(struct xfrm_state *x)
 }
 EXPORT_SYMBOL(xfrm_state_delete);
 
-void xfrm_state_flush(u8 proto)
+#ifdef CONFIG_SECURITY_NETWORK_XFRM
+static inline int
+xfrm_state_flush_secctx_check(u8 proto, struct xfrm_audit *audit_info)
 {
-       int i;
+       int i, err = 0;
+
+       for (i = 0; i <= xfrm_state_hmask; i++) {
+               struct hlist_node *entry;
+               struct xfrm_state *x;
+
+               hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
+                       if (xfrm_id_proto_match(x->id.proto, proto) &&
+                          (err = security_xfrm_state_delete(x)) != 0) {
+                               xfrm_audit_log(audit_info->loginuid,
+                                              audit_info->secid,
+                                              AUDIT_MAC_IPSEC_DELSA,
+                                               0, NULL, x);
+
+                               return err;
+                       }
+               }
+       }
+
+       return err;
+}
+#else
+static inline int
+xfrm_state_flush_secctx_check(u8 proto, struct xfrm_audit *audit_info)
+{
+       return 0;
+}
+#endif
+
+int xfrm_state_flush(u8 proto, struct xfrm_audit *audit_info)
+{
+       int i, err = 0;
 
        spin_lock_bh(&xfrm_state_lock);
-       for (i = 0; i < xfrm_state_hmask; i++) {
+       err = xfrm_state_flush_secctx_check(proto, audit_info);
+       if (err)
+               goto out;
+
+       for (i = 0; i <= xfrm_state_hmask; i++) {
                struct hlist_node *entry;
                struct xfrm_state *x;
 restart:
@@ -523,7 +443,11 @@ restart:
                                xfrm_state_hold(x);
                                spin_unlock_bh(&xfrm_state_lock);
 
-                               xfrm_state_delete(x);
+                               err = xfrm_state_delete(x);
+                               xfrm_audit_log(audit_info->loginuid,
+                                              audit_info->secid,
+                                              AUDIT_MAC_IPSEC_DELSA,
+                                              err ? 0 : 1, NULL, x);
                                xfrm_state_put(x);
 
                                spin_lock_bh(&xfrm_state_lock);
@@ -531,11 +455,25 @@ restart:
                        }
                }
        }
+       err = 0;
+
+out:
        spin_unlock_bh(&xfrm_state_lock);
        wake_up(&km_waitq);
+       return err;
 }
 EXPORT_SYMBOL(xfrm_state_flush);
 
+void xfrm_sad_getinfo(struct xfrmk_sadinfo *si)
+{
+       spin_lock_bh(&xfrm_state_lock);
+       si->sadcnt = xfrm_state_num;
+       si->sadhcnt = xfrm_state_hmask;
+       si->sadhmcnt = xfrm_state_hashmax;
+       spin_unlock_bh(&xfrm_state_lock);
+}
+EXPORT_SYMBOL(xfrm_sad_getinfo);
+
 static int
 xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl,
                  struct xfrm_tmpl *tmpl,
@@ -550,7 +488,7 @@ xfrm_init_tempsel(struct xfrm_state *x, struct flowi *fl,
        return 0;
 }
 
-static struct xfrm_state *__xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto, unsigned short family)
+static struct xfrm_state *__xfrm_state_lookup(xfrm_address_t *daddr, __be32 spi, u8 proto, unsigned short family)
 {
        unsigned int h = xfrm_spi_hash(daddr, spi, proto, family);
        struct xfrm_state *x;
@@ -573,7 +511,7 @@ static struct xfrm_state *__xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8
                                             x->id.daddr.a6))
                                continue;
                        break;
-               };
+               }
 
                xfrm_state_hold(x);
                return x;
@@ -584,7 +522,7 @@ static struct xfrm_state *__xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8
 
 static struct xfrm_state *__xfrm_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr, u8 proto, unsigned short family)
 {
-       unsigned int h = xfrm_src_hash(saddr, family);
+       unsigned int h = xfrm_src_hash(daddr, saddr, family);
        struct xfrm_state *x;
        struct hlist_node *entry;
 
@@ -608,7 +546,7 @@ static struct xfrm_state *__xfrm_state_lookup_byaddr(xfrm_address_t *daddr, xfrm
                                             x->props.saddr.a6))
                                continue;
                        break;
-               };
+               }
 
                xfrm_state_hold(x);
                return x;
@@ -629,19 +567,27 @@ __xfrm_state_locate(struct xfrm_state *x, int use_spi, int family)
                                                  x->id.proto, family);
 }
 
+static void xfrm_hash_grow_check(int have_hash_collision)
+{
+       if (have_hash_collision &&
+           (xfrm_state_hmask + 1) < xfrm_state_hashmax &&
+           xfrm_state_num > xfrm_state_hmask)
+               schedule_work(&xfrm_hash_work);
+}
+
 struct xfrm_state *
-xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, 
+xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
                struct flowi *fl, struct xfrm_tmpl *tmpl,
                struct xfrm_policy *pol, int *err,
                unsigned short family)
 {
-       unsigned int h = xfrm_dst_hash(daddr, family);
+       unsigned int h = xfrm_dst_hash(daddr, saddr, tmpl->reqid, family);
        struct hlist_node *entry;
        struct xfrm_state *x, *x0;
        int acquire_in_progress = 0;
        int error = 0;
        struct xfrm_state *best = NULL;
-       
+
        spin_lock_bh(&xfrm_state_lock);
        hlist_for_each_entry(x, entry, xfrm_state_bydst+h, bydst) {
                if (x->props.family == family &&
@@ -677,7 +623,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
                                acquire_in_progress = 1;
                        } else if (x->km.state == XFRM_STATE_ERROR ||
                                   x->km.state == XFRM_STATE_EXPIRED) {
-                               if (xfrm_selector_match(&x->sel, fl, family) &&
+                               if (xfrm_selector_match(&x->sel, fl, family) &&
                                    security_xfrm_state_pol_flow_match(x, pol, fl))
                                        error = -ESRCH;
                        }
@@ -713,19 +659,17 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr,
                if (km_query(x, tmpl, pol) == 0) {
                        x->km.state = XFRM_STATE_ACQ;
                        hlist_add_head(&x->bydst, xfrm_state_bydst+h);
-                       xfrm_state_hold(x);
-                       h = xfrm_src_hash(saddr, family);
+                       h = xfrm_src_hash(daddr, saddr, family);
                        hlist_add_head(&x->bysrc, xfrm_state_bysrc+h);
-                       xfrm_state_hold(x);
                        if (x->id.spi) {
                                h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, family);
                                hlist_add_head(&x->byspi, xfrm_state_byspi+h);
-                               xfrm_state_hold(x);
                        }
-                       x->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
-                       xfrm_state_hold(x);
-                       x->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
+                       x->lft.hard_add_expires_seconds = sysctl_xfrm_acq_expires;
+                       x->timer.expires = jiffies + sysctl_xfrm_acq_expires*HZ;
                        add_timer(&x->timer);
+                       xfrm_state_num++;
+                       xfrm_hash_grow_check(x->bydst.next != NULL);
                } else {
                        x->km.state = XFRM_STATE_DEAD;
                        xfrm_state_put(x);
@@ -744,57 +688,67 @@ out:
 
 static void __xfrm_state_insert(struct xfrm_state *x)
 {
-       unsigned int h = xfrm_dst_hash(&x->id.daddr, x->props.family);
+       unsigned int h;
 
        x->genid = ++xfrm_state_genid;
 
+       h = xfrm_dst_hash(&x->id.daddr, &x->props.saddr,
+                         x->props.reqid, x->props.family);
        hlist_add_head(&x->bydst, xfrm_state_bydst+h);
-       xfrm_state_hold(x);
-
-       h = xfrm_src_hash(&x->props.saddr, x->props.family);
 
+       h = xfrm_src_hash(&x->id.daddr, &x->props.saddr, x->props.family);
        hlist_add_head(&x->bysrc, xfrm_state_bysrc+h);
-       xfrm_state_hold(x);
 
-       if (xfrm_id_proto_match(x->id.proto, IPSEC_PROTO_ANY)) {
+       if (x->id.spi) {
                h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto,
                                  x->props.family);
 
                hlist_add_head(&x->byspi, xfrm_state_byspi+h);
-               xfrm_state_hold(x);
        }
 
-       if (!mod_timer(&x->timer, jiffies + HZ))
-               xfrm_state_hold(x);
-
-       if (x->replay_maxage &&
-           !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
-               xfrm_state_hold(x);
+       mod_timer(&x->timer, jiffies + HZ);
+       if (x->replay_maxage)
+               mod_timer(&x->rtimer, jiffies + x->replay_maxage);
 
        wake_up(&km_waitq);
 
        xfrm_state_num++;
 
-       if (x->bydst.next != NULL &&
-           (xfrm_state_hmask + 1) < xfrm_state_hashmax &&
-           xfrm_state_num > xfrm_state_hmask)
-               schedule_work(&xfrm_hash_work);
+       xfrm_hash_grow_check(x->bydst.next != NULL);
+}
+
+/* xfrm_state_lock is held */
+static void __xfrm_state_bump_genids(struct xfrm_state *xnew)
+{
+       unsigned short family = xnew->props.family;
+       u32 reqid = xnew->props.reqid;
+       struct xfrm_state *x;
+       struct hlist_node *entry;
+       unsigned int h;
+
+       h = xfrm_dst_hash(&xnew->id.daddr, &xnew->props.saddr, reqid, family);
+       hlist_for_each_entry(x, entry, xfrm_state_bydst+h, bydst) {
+               if (x->props.family     == family &&
+                   x->props.reqid      == reqid &&
+                   !xfrm_addr_cmp(&x->id.daddr, &xnew->id.daddr, family) &&
+                   !xfrm_addr_cmp(&x->props.saddr, &xnew->props.saddr, family))
+                       x->genid = xfrm_state_genid;
+       }
 }
 
 void xfrm_state_insert(struct xfrm_state *x)
 {
        spin_lock_bh(&xfrm_state_lock);
+       __xfrm_state_bump_genids(x);
        __xfrm_state_insert(x);
        spin_unlock_bh(&xfrm_state_lock);
-
-       xfrm_flush_all_bundles();
 }
 EXPORT_SYMBOL(xfrm_state_insert);
 
 /* xfrm_state_lock is held */
 static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 reqid, u8 proto, xfrm_address_t *daddr, xfrm_address_t *saddr, int create)
 {
-       unsigned int h = xfrm_dst_hash(daddr, family);
+       unsigned int h = xfrm_dst_hash(daddr, saddr, reqid, family);
        struct hlist_node *entry;
        struct xfrm_state *x;
 
@@ -803,7 +757,8 @@ static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 re
                    x->props.mode   != mode ||
                    x->props.family != family ||
                    x->km.state     != XFRM_STATE_ACQ ||
-                   x->id.spi       != 0)
+                   x->id.spi       != 0 ||
+                   x->id.proto     != proto)
                        continue;
 
                switch (family) {
@@ -820,7 +775,7 @@ static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 re
                                             (struct in6_addr *)saddr))
                                continue;
                        break;
-               };
+               }
 
                xfrm_state_hold(x);
                return x;
@@ -853,23 +808,25 @@ static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 re
                        ipv6_addr_copy((struct in6_addr *)x->id.daddr.a6,
                                       (struct in6_addr *)daddr);
                        break;
-               };
+               }
 
                x->km.state = XFRM_STATE_ACQ;
                x->id.proto = proto;
                x->props.family = family;
                x->props.mode = mode;
                x->props.reqid = reqid;
-               x->lft.hard_add_expires_seconds = XFRM_ACQ_EXPIRES;
+               x->lft.hard_add_expires_seconds = sysctl_xfrm_acq_expires;
                xfrm_state_hold(x);
-               x->timer.expires = jiffies + XFRM_ACQ_EXPIRES*HZ;
+               x->timer.expires = jiffies + sysctl_xfrm_acq_expires*HZ;
                add_timer(&x->timer);
-               xfrm_state_hold(x);
                hlist_add_head(&x->bydst, xfrm_state_bydst+h);
-               h = xfrm_src_hash(saddr, family);
-               xfrm_state_hold(x);
+               h = xfrm_src_hash(daddr, saddr, family);
                hlist_add_head(&x->bysrc, xfrm_state_bysrc+h);
                wake_up(&km_waitq);
+
+               xfrm_state_num++;
+
+               xfrm_hash_grow_check(x->bydst.next != NULL);
        }
 
        return x;
@@ -898,7 +855,8 @@ int xfrm_state_add(struct xfrm_state *x)
 
        if (use_spi && x->km.seq) {
                x1 = __xfrm_find_acq_byseq(x->km.seq);
-               if (x1 && xfrm_addr_cmp(&x1->id.daddr, &x->id.daddr, family)) {
+               if (x1 && ((x1->id.proto != x->id.proto) ||
+                   xfrm_addr_cmp(&x1->id.daddr, &x->id.daddr, family))) {
                        xfrm_state_put(x1);
                        x1 = NULL;
                }
@@ -909,15 +867,13 @@ int xfrm_state_add(struct xfrm_state *x)
                                     x->id.proto,
                                     &x->id.daddr, &x->props.saddr, 0);
 
+       __xfrm_state_bump_genids(x);
        __xfrm_state_insert(x);
        err = 0;
 
 out:
        spin_unlock_bh(&xfrm_state_lock);
 
-       if (!err)
-               xfrm_flush_all_bundles();
-
        if (x1) {
                xfrm_state_delete(x1);
                xfrm_state_put(x1);
@@ -927,6 +883,160 @@ out:
 }
 EXPORT_SYMBOL(xfrm_state_add);
 
+#ifdef CONFIG_XFRM_MIGRATE
+struct xfrm_state *xfrm_state_clone(struct xfrm_state *orig, int *errp)
+{
+       int err = -ENOMEM;
+       struct xfrm_state *x = xfrm_state_alloc();
+       if (!x)
+               goto error;
+
+       memcpy(&x->id, &orig->id, sizeof(x->id));
+       memcpy(&x->sel, &orig->sel, sizeof(x->sel));
+       memcpy(&x->lft, &orig->lft, sizeof(x->lft));
+       x->props.mode = orig->props.mode;
+       x->props.replay_window = orig->props.replay_window;
+       x->props.reqid = orig->props.reqid;
+       x->props.family = orig->props.family;
+       x->props.saddr = orig->props.saddr;
+
+       if (orig->aalg) {
+               x->aalg = xfrm_algo_clone(orig->aalg);
+               if (!x->aalg)
+                       goto error;
+       }
+       x->props.aalgo = orig->props.aalgo;
+
+       if (orig->ealg) {
+               x->ealg = xfrm_algo_clone(orig->ealg);
+               if (!x->ealg)
+                       goto error;
+       }
+       x->props.ealgo = orig->props.ealgo;
+
+       if (orig->calg) {
+               x->calg = xfrm_algo_clone(orig->calg);
+               if (!x->calg)
+                       goto error;
+       }
+       x->props.calgo = orig->props.calgo;
+
+       if (orig->encap) {
+               x->encap = kmemdup(orig->encap, sizeof(*x->encap), GFP_KERNEL);
+               if (!x->encap)
+                       goto error;
+       }
+
+       if (orig->coaddr) {
+               x->coaddr = kmemdup(orig->coaddr, sizeof(*x->coaddr),
+                                   GFP_KERNEL);
+               if (!x->coaddr)
+                       goto error;
+       }
+
+       err = xfrm_init_state(x);
+       if (err)
+               goto error;
+
+       x->props.flags = orig->props.flags;
+
+       x->curlft.add_time = orig->curlft.add_time;
+       x->km.state = orig->km.state;
+       x->km.seq = orig->km.seq;
+
+       return x;
+
+ error:
+       if (errp)
+               *errp = err;
+       if (x) {
+               kfree(x->aalg);
+               kfree(x->ealg);
+               kfree(x->calg);
+               kfree(x->encap);
+               kfree(x->coaddr);
+       }
+       kfree(x);
+       return NULL;
+}
+EXPORT_SYMBOL(xfrm_state_clone);
+
+/* xfrm_state_lock is held */
+struct xfrm_state * xfrm_migrate_state_find(struct xfrm_migrate *m)
+{
+       unsigned int h;
+       struct xfrm_state *x;
+       struct hlist_node *entry;
+
+       if (m->reqid) {
+               h = xfrm_dst_hash(&m->old_daddr, &m->old_saddr,
+                                 m->reqid, m->old_family);
+               hlist_for_each_entry(x, entry, xfrm_state_bydst+h, bydst) {
+                       if (x->props.mode != m->mode ||
+                           x->id.proto != m->proto)
+                               continue;
+                       if (m->reqid && x->props.reqid != m->reqid)
+                               continue;
+                       if (xfrm_addr_cmp(&x->id.daddr, &m->old_daddr,
+                                         m->old_family) ||
+                           xfrm_addr_cmp(&x->props.saddr, &m->old_saddr,
+                                         m->old_family))
+                               continue;
+                       xfrm_state_hold(x);
+                       return x;
+               }
+       } else {
+               h = xfrm_src_hash(&m->old_daddr, &m->old_saddr,
+                                 m->old_family);
+               hlist_for_each_entry(x, entry, xfrm_state_bysrc+h, bysrc) {
+                       if (x->props.mode != m->mode ||
+                           x->id.proto != m->proto)
+                               continue;
+                       if (xfrm_addr_cmp(&x->id.daddr, &m->old_daddr,
+                                         m->old_family) ||
+                           xfrm_addr_cmp(&x->props.saddr, &m->old_saddr,
+                                         m->old_family))
+                               continue;
+                       xfrm_state_hold(x);
+                       return x;
+               }
+       }
+
+       return NULL;
+}
+EXPORT_SYMBOL(xfrm_migrate_state_find);
+
+struct xfrm_state * xfrm_state_migrate(struct xfrm_state *x,
+                                      struct xfrm_migrate *m)
+{
+       struct xfrm_state *xc;
+       int err;
+
+       xc = xfrm_state_clone(x, &err);
+       if (!xc)
+               return NULL;
+
+       memcpy(&xc->id.daddr, &m->new_daddr, sizeof(xc->id.daddr));
+       memcpy(&xc->props.saddr, &m->new_saddr, sizeof(xc->props.saddr));
+
+       /* add state */
+       if (!xfrm_addr_cmp(&x->id.daddr, &m->new_daddr, m->new_family)) {
+               /* a care is needed when the destination address of the
+                  state is to be updated as it is a part of triplet */
+               xfrm_state_insert(xc);
+       } else {
+               if ((err = xfrm_state_add(xc)) < 0)
+                       goto error;
+       }
+
+       return xc;
+error:
+       kfree(xc);
+       return NULL;
+}
+EXPORT_SYMBOL(xfrm_state_migrate);
+#endif
+
 int xfrm_state_update(struct xfrm_state *x)
 {
        struct xfrm_state *x1;
@@ -977,8 +1087,7 @@ out:
                memcpy(&x1->lft, &x->lft, sizeof(x1->lft));
                x1->km.dying = 0;
 
-               if (!mod_timer(&x1->timer, jiffies + HZ))
-                       xfrm_state_hold(x1);
+               mod_timer(&x1->timer, jiffies + HZ);
                if (x1->curlft.use_time)
                        xfrm_state_check_expire(x1);
 
@@ -995,7 +1104,7 @@ EXPORT_SYMBOL(xfrm_state_update);
 int xfrm_state_check_expire(struct xfrm_state *x)
 {
        if (!x->curlft.use_time)
-               x->curlft.use_time = (unsigned long)xtime.tv_sec;
+               x->curlft.use_time = get_seconds();
 
        if (x->km.state != XFRM_STATE_VALID)
                return -EINVAL;
@@ -1003,8 +1112,7 @@ int xfrm_state_check_expire(struct xfrm_state *x)
        if (x->curlft.bytes >= x->lft.hard_byte_limit ||
            x->curlft.packets >= x->lft.hard_packet_limit) {
                x->km.state = XFRM_STATE_EXPIRED;
-               if (!mod_timer(&x->timer, jiffies))
-                       xfrm_state_hold(x);
+               mod_timer(&x->timer, jiffies);
                return -EINVAL;
        }
 
@@ -1042,7 +1150,7 @@ err:
 EXPORT_SYMBOL(xfrm_state_check);
 
 struct xfrm_state *
-xfrm_state_lookup(xfrm_address_t *daddr, u32 spi, u8 proto,
+xfrm_state_lookup(xfrm_address_t *daddr, __be32 spi, u8 proto,
                  unsigned short family)
 {
        struct xfrm_state *x;
@@ -1068,8 +1176,8 @@ xfrm_state_lookup_byaddr(xfrm_address_t *daddr, xfrm_address_t *saddr,
 EXPORT_SYMBOL(xfrm_state_lookup_byaddr);
 
 struct xfrm_state *
-xfrm_find_acq(u8 mode, u32 reqid, u8 proto, 
-             xfrm_address_t *daddr, xfrm_address_t *saddr, 
+xfrm_find_acq(u8 mode, u32 reqid, u8 proto,
+             xfrm_address_t *daddr, xfrm_address_t *saddr,
              int create, unsigned short family)
 {
        struct xfrm_state *x;
@@ -1166,7 +1274,7 @@ u32 xfrm_get_acqseq(void)
 EXPORT_SYMBOL(xfrm_get_acqseq);
 
 void
-xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi)
+xfrm_alloc_spi(struct xfrm_state *x, __be32 minspi, __be32 maxspi)
 {
        unsigned int h;
        struct xfrm_state *x0;
@@ -1183,10 +1291,10 @@ xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi)
                x->id.spi = minspi;
        } else {
                u32 spi = 0;
-               minspi = ntohl(minspi);
-               maxspi = ntohl(maxspi);
-               for (h=0; h<maxspi-minspi+1; h++) {
-                       spi = minspi + net_random()%(maxspi-minspi+1);
+               u32 low = ntohl(minspi);
+               u32 high = ntohl(maxspi);
+               for (h=0; h<high-low+1; h++) {
+                       spi = low + net_random()%(high-low+1);
                        x0 = xfrm_state_lookup(&x->id.daddr, htonl(spi), x->id.proto, x->props.family);
                        if (x0 == NULL) {
                                x->id.spi = htonl(spi);
@@ -1199,7 +1307,6 @@ xfrm_alloc_spi(struct xfrm_state *x, u32 minspi, u32 maxspi)
                spin_lock_bh(&xfrm_state_lock);
                h = xfrm_spi_hash(&x->id.daddr, x->id.spi, x->id.proto, x->props.family);
                hlist_add_head(&x->byspi, xfrm_state_byspi+h);
-               xfrm_state_hold(x);
                spin_unlock_bh(&xfrm_state_lock);
                wake_up(&km_waitq);
        }
@@ -1210,7 +1317,7 @@ int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*),
                    void *data)
 {
        int i;
-       struct xfrm_state *x;
+       struct xfrm_state *x, *last = NULL;
        struct hlist_node *entry;
        int count = 0;
        int err = 0;
@@ -1218,24 +1325,22 @@ int xfrm_state_walk(u8 proto, int (*func)(struct xfrm_state *, int, void*),
        spin_lock_bh(&xfrm_state_lock);
        for (i = 0; i <= xfrm_state_hmask; i++) {
                hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
-                       if (xfrm_id_proto_match(x->id.proto, proto))
-                               count++;
+                       if (!xfrm_id_proto_match(x->id.proto, proto))
+                               continue;
+                       if (last) {
+                               err = func(last, count, data);
+                               if (err)
+                                       goto out;
+                       }
+                       last = x;
+                       count++;
                }
        }
        if (count == 0) {
                err = -ENOENT;
                goto out;
        }
-
-       for (i = 0; i <= xfrm_state_hmask; i++) {
-               hlist_for_each_entry(x, entry, xfrm_state_bydst+i, bydst) {
-                       if (!xfrm_id_proto_match(x->id.proto, proto))
-                               continue;
-                       err = func(x, --count, data);
-                       if (err)
-                               goto out;
-               }
-       }
+       err = func(last, 0, data);
 out:
        spin_unlock_bh(&xfrm_state_lock);
        return err;
@@ -1286,10 +1391,8 @@ void xfrm_replay_notify(struct xfrm_state *x, int event)
        km_state_notify(x, &c);
 
        if (x->replay_maxage &&
-           !mod_timer(&x->rtimer, jiffies + x->replay_maxage)) {
-               xfrm_state_hold(x);
+           !mod_timer(&x->rtimer, jiffies + x->replay_maxage))
                x->xflags &= ~XFRM_TIME_DEFER;
-       }
 }
 EXPORT_SYMBOL(xfrm_replay_notify);
 
@@ -1307,14 +1410,12 @@ static void xfrm_replay_timer_handler(unsigned long data)
        }
 
        spin_unlock(&x->lock);
-       xfrm_state_put(x);
 }
 
-int xfrm_replay_check(struct xfrm_state *x, u32 seq)
+int xfrm_replay_check(struct xfrm_state *x, __be32 net_seq)
 {
        u32 diff;
-
-       seq = ntohl(seq);
+       u32 seq = ntohl(net_seq);
 
        if (unlikely(seq == 0))
                return -EINVAL;
@@ -1323,7 +1424,8 @@ int xfrm_replay_check(struct xfrm_state *x, u32 seq)
                return 0;
 
        diff = x->replay.seq - seq;
-       if (diff >= x->props.replay_window) {
+       if (diff >= min_t(unsigned int, x->props.replay_window,
+                         sizeof(x->replay.bitmap) * 8)) {
                x->stats.replay_window++;
                return -EINVAL;
        }
@@ -1336,11 +1438,10 @@ int xfrm_replay_check(struct xfrm_state *x, u32 seq)
 }
 EXPORT_SYMBOL(xfrm_replay_check);
 
-void xfrm_replay_advance(struct xfrm_state *x, u32 seq)
+void xfrm_replay_advance(struct xfrm_state *x, __be32 net_seq)
 {
        u32 diff;
-
-       seq = ntohl(seq);
+       u32 seq = ntohl(net_seq);
 
        if (seq > x->replay.seq) {
                diff = seq - x->replay.seq;
@@ -1420,7 +1521,7 @@ int km_query(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_policy *pol)
 }
 EXPORT_SYMBOL(km_query);
 
-int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, u16 sport)
+int km_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, __be16 sport)
 {
        int err = -EINVAL;
        struct xfrm_mgr *km;
@@ -1451,6 +1552,26 @@ void km_policy_expired(struct xfrm_policy *pol, int dir, int hard, u32 pid)
 }
 EXPORT_SYMBOL(km_policy_expired);
 
+int km_migrate(struct xfrm_selector *sel, u8 dir, u8 type,
+              struct xfrm_migrate *m, int num_migrate)
+{
+       int err = -EINVAL;
+       int ret;
+       struct xfrm_mgr *km;
+
+       read_lock(&xfrm_km_lock);
+       list_for_each_entry(km, &xfrm_km_list, list) {
+               if (km->migrate) {
+                       ret = km->migrate(sel, dir, type, m, num_migrate);
+                       if (!ret)
+                               err = ret;
+               }
+       }
+       read_unlock(&xfrm_km_lock);
+       return err;
+}
+EXPORT_SYMBOL(km_migrate);
+
 int km_report(u8 proto, struct xfrm_selector *sel, xfrm_address_t *addr)
 {
        int err = -EINVAL;
@@ -1564,7 +1685,7 @@ int xfrm_state_unregister_afinfo(struct xfrm_state_afinfo *afinfo)
 }
 EXPORT_SYMBOL(xfrm_state_unregister_afinfo);
 
-static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family)
+struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family)
 {
        struct xfrm_state_afinfo *afinfo;
        if (unlikely(family >= NPROTO))
@@ -1576,11 +1697,14 @@ static struct xfrm_state_afinfo *xfrm_state_get_afinfo(unsigned short family)
        return afinfo;
 }
 
-static void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
+void xfrm_state_put_afinfo(struct xfrm_state_afinfo *afinfo)
 {
        read_unlock(&xfrm_state_afinfo_lock);
 }
 
+EXPORT_SYMBOL(xfrm_state_get_afinfo);
+EXPORT_SYMBOL(xfrm_state_put_afinfo);
+
 /* Temporarily located here until net/xfrm/xfrm_tunnel.c is created */
 void xfrm_state_delete_tunnel(struct xfrm_state *x)
 {
@@ -1596,37 +1720,17 @@ void xfrm_state_delete_tunnel(struct xfrm_state *x)
 }
 EXPORT_SYMBOL(xfrm_state_delete_tunnel);
 
-/*
- * This function is NOT optimal.  For example, with ESP it will give an
- * MTU that's usually two bytes short of being optimal.  However, it will
- * usually give an answer that's a multiple of 4 provided the input is
- * also a multiple of 4.
- */
 int xfrm_state_mtu(struct xfrm_state *x, int mtu)
 {
-       int res = mtu;
-
-       res -= x->props.header_len;
-
-       for (;;) {
-               int m = res;
-
-               if (m < 68)
-                       return 68;
-
-               spin_lock_bh(&x->lock);
-               if (x->km.state == XFRM_STATE_VALID &&
-                   x->type && x->type->get_max_size)
-                       m = x->type->get_max_size(x, m);
-               else
-                       m += x->props.header_len;
-               spin_unlock_bh(&x->lock);
-
-               if (m <= mtu)
-                       break;
-               res -= (m - mtu);
-       }
+       int res;
 
+       spin_lock_bh(&x->lock);
+       if (x->km.state == XFRM_STATE_VALID &&
+           x->type && x->type->get_mtu)
+               res = x->type->get_mtu(x, mtu);
+       else
+               res = mtu - x->props.header_len;
+       spin_unlock_bh(&x->lock);
        return res;
 }
 
@@ -1670,20 +1774,20 @@ error:
 }
 
 EXPORT_SYMBOL(xfrm_init_state);
+
 void __init xfrm_state_init(void)
 {
        unsigned int sz;
 
        sz = sizeof(struct hlist_head) * 8;
 
-       xfrm_state_bydst = xfrm_state_hash_alloc(sz);
-       xfrm_state_bysrc = xfrm_state_hash_alloc(sz);
-       xfrm_state_byspi = xfrm_state_hash_alloc(sz);
+       xfrm_state_bydst = xfrm_hash_alloc(sz);
+       xfrm_state_bysrc = xfrm_hash_alloc(sz);
+       xfrm_state_byspi = xfrm_hash_alloc(sz);
        if (!xfrm_state_bydst || !xfrm_state_bysrc || !xfrm_state_byspi)
                panic("XFRM: Cannot allocate bydst/bysrc/byspi hashes.");
        xfrm_state_hmask = ((sz / sizeof(struct hlist_head)) - 1);
 
-       INIT_WORK(&xfrm_state_gc_work, xfrm_state_gc_task, NULL);
+       INIT_WORK(&xfrm_state_gc_work, xfrm_state_gc_task);
 }