net: reduce structures when XFRM=n
[safe/jmp/linux-2.6] / net / ipv4 / route.c
index 79c1e74..ffb2c57 100644 (file)
@@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly  = 10 * 60 * HZ;
 static int ip_rt_min_pmtu __read_mostly                = 512 + 20 + 20;
 static int ip_rt_min_advmss __read_mostly      = 256;
 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
+static int rt_chain_length_max __read_mostly   = 20;
 
 static void rt_worker_func(struct work_struct *work);
 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
@@ -145,6 +146,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void             ipv4_link_failure(struct sk_buff *skb);
 static void             ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 static int rt_garbage_collect(struct dst_ops *ops);
+static void rt_emergency_hash_rebuild(struct net *net);
 
 
 static struct dst_ops ipv4_dst_ops = {
@@ -201,6 +203,7 @@ const __u8 ip_tos2prio[16] = {
 struct rt_hash_bucket {
        struct rtable   *chain;
 };
+
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
        defined(CONFIG_PROVE_LOCKING)
 /*
@@ -282,6 +285,8 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
        struct rtable *r = NULL;
 
        for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
+               if (!rt_hash_table[st->bucket].chain)
+                       continue;
                rcu_read_lock_bh();
                r = rcu_dereference(rt_hash_table[st->bucket].chain);
                while (r) {
@@ -299,11 +304,14 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
                                          struct rtable *r)
 {
        struct rt_cache_iter_state *st = seq->private;
+
        r = r->u.dst.rt_next;
        while (!r) {
                rcu_read_unlock_bh();
-               if (--st->bucket < 0)
-                       break;
+               do {
+                       if (--st->bucket < 0)
+                               return NULL;
+               } while (!rt_hash_table[st->bucket].chain);
                rcu_read_lock_bh();
                r = rt_hash_table[st->bucket].chain;
        }
@@ -669,6 +677,20 @@ static inline u32 rt_score(struct rtable *rt)
        return score;
 }
 
+static inline bool rt_caching(const struct net *net)
+{
+       return net->ipv4.current_rt_cache_rebuild_count <=
+               net->ipv4.sysctl_rt_cache_rebuild_count;
+}
+
+static inline bool compare_hash_inputs(const struct flowi *fl1,
+                                       const struct flowi *fl2)
+{
+       return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
+               (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
+               (fl1->iif ^ fl2->iif)) == 0);
+}
+
 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 {
        return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
@@ -748,11 +770,24 @@ static void rt_do_flush(int process_context)
        }
 }
 
+/*
+ * While freeing expired entries, we compute average chain length
+ * and standard deviation, using fixed-point arithmetic.
+ * This is used to estimate rt_chain_length_max:
+ *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
+ * We use 3 bits for fractional part, and 29 (or 61) for magnitude.
+ */
+
+#define FRACT_BITS 3
+#define ONE (1UL << FRACT_BITS)
+
 static void rt_check_expire(void)
 {
        static unsigned int rover;
        unsigned int i = rover, goal;
        struct rtable *rth, **rthp;
+       unsigned long length = 0, samples = 0;
+       unsigned long sum = 0, sum2 = 0;
        u64 mult;
 
        mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
@@ -761,6 +796,7 @@ static void rt_check_expire(void)
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask)
                goal = rt_hash_mask + 1;
+       length = 0;
        for (; goal > 0; goal--) {
                unsigned long tmo = ip_rt_gc_timeout;
 
@@ -770,6 +806,8 @@ static void rt_check_expire(void)
                if (need_resched())
                        cond_resched();
 
+               samples++;
+               length = 0;     /* chain length is per bucket */
                if (*rthp == NULL)
                        continue;
                spin_lock_bh(rt_hash_lock_addr(i));
@@ -784,11 +822,29 @@ static void rt_check_expire(void)
                                if (time_before_eq(jiffies, rth->u.dst.expires)) {
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
+                                       /*
+                                        * Only bump our length if the hash
+                                        * inputs on entries n and n+1 are not
+                                        * the same, we only count entries on
+                                        * a chain with equal hash inputs once
+                                        * so that entries for different QOS
+                                        * levels, and other non-hash input
+                                        * attributes don't unfairly skew
+                                        * the length computation
+                                        */
+                                       if ((*rthp == NULL) ||
+                                           !compare_hash_inputs(&(*rthp)->fl,
+                                                                &rth->fl))
+                                               length += ONE;
                                        continue;
                                }
                        } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                tmo >>= 1;
                                rthp = &rth->u.dst.rt_next;
+                               if ((*rthp == NULL) ||
+                                   !compare_hash_inputs(&(*rthp)->fl,
+                                                        &rth->fl))
+                                       length += ONE;
                                continue;
                        }
 
@@ -797,6 +853,15 @@ static void rt_check_expire(void)
                        rt_free(rth);
                }
                spin_unlock_bh(rt_hash_lock_addr(i));
+               sum += length;
+               sum2 += length*length;
+       }
+       if (samples) {
+               unsigned long avg = sum / samples;
+               unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+               rt_chain_length_max = max_t(unsigned long,
+                                       ip_rt_gc_elasticity,
+                                       (avg + 4*sd) >> FRACT_BITS);
        }
        rover = i;
 }
@@ -846,6 +911,26 @@ static void rt_secret_rebuild(unsigned long __net)
        mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
 }
 
+static void rt_secret_rebuild_oneshot(struct net *net)
+{
+       del_timer_sync(&net->ipv4.rt_secret_timer);
+       rt_cache_invalidate(net);
+       if (ip_rt_secret_interval) {
+               net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
+               add_timer(&net->ipv4.rt_secret_timer);
+       }
+}
+
+static void rt_emergency_hash_rebuild(struct net *net)
+{
+       if (net_ratelimit()) {
+               printk(KERN_WARNING "Route hash chain too long!\n");
+               printk(KERN_WARNING "Adjust your secret_interval!\n");
+       }
+
+       rt_secret_rebuild_oneshot(net);
+}
+
 /*
    Short description of GC goals.
 
@@ -984,6 +1069,7 @@ out:       return 0;
 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 {
        struct rtable   *rth, **rthp;
+       struct rtable   *rthi;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
@@ -997,7 +1083,13 @@ restart:
        candp = NULL;
        now = jiffies;
 
+       if (!rt_caching(dev_net(rt->u.dst.dev))) {
+               rt_drop(rt);
+               return 0;
+       }
+
        rthp = &rt_hash_table[hash].chain;
+       rthi = NULL;
 
        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((rth = *rthp) != NULL) {
@@ -1043,6 +1135,17 @@ restart:
                chain_length++;
 
                rthp = &rth->u.dst.rt_next;
+
+               /*
+                * check to see if the next entry in the chain
+                * contains the same hash input values as rt.  If it does,
+                * this is where we will insert into the list, instead of
+                * at the head.  This groups entries that differ by aspects not
+                * relevant to the hash function together, which we use to adjust
+                * our chain length
+                */
+               if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
+                       rthi = rth;
        }
 
        if (cand) {
@@ -1056,6 +1159,16 @@ restart:
                        *candp = cand->u.dst.rt_next;
                        rt_free(cand);
                }
+       } else {
+               if (chain_length > rt_chain_length_max) {
+                       struct net *net = dev_net(rt->u.dst.dev);
+                       int num = ++net->ipv4.current_rt_cache_rebuild_count;
+                       if (!rt_caching(dev_net(rt->u.dst.dev))) {
+                               printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
+                                       rt->u.dst.dev->name, num);
+                       }
+                       rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
+               }
        }
 
        /* Try to bind route to arp only if it is output
@@ -1093,7 +1206,11 @@ restart:
                }
        }
 
-       rt->u.dst.rt_next = rt_hash_table[hash].chain;
+       if (rthi)
+               rt->u.dst.rt_next = rthi->u.dst.rt_next;
+       else
+               rt->u.dst.rt_next = rt_hash_table[hash].chain;
+
 #if RT_CACHE_DEBUG >= 2
        if (rt->u.dst.rt_next) {
                struct rtable *trt;
@@ -1104,7 +1221,16 @@ restart:
                printk("\n");
        }
 #endif
-       rt_hash_table[hash].chain = rt;
+       /*
+        * Since lookup is lockfree, we must make sure
+        * previous writes to rt are committed to memory
+        * before making rt visible to other CPUs.
+        */
+       if (rthi)
+               rcu_assign_pointer(rthi->u.dst.rt_next, rt);
+       else
+               rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+
        spin_unlock_bh(rt_hash_lock_addr(hash));
        *rp = rt;
        return 0;
@@ -1207,6 +1333,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
            || ipv4_is_zeronet(new_gw))
                goto reject_redirect;
 
+       if (!rt_caching(net))
+               goto reject_redirect;
+
        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
@@ -1270,7 +1399,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
                                rt->u.dst.path          = &rt->u.dst;
                                rt->u.dst.neighbour     = NULL;
                                rt->u.dst.hh            = NULL;
+#ifdef CONFIG_XFRM
                                rt->u.dst.xfrm          = NULL;
+#endif
                                rt->rt_genid            = rt_genid(net);
                                rt->rt_flags            |= RTCF_REDIRECTED;
 
@@ -1429,7 +1560,8 @@ static int ip_error(struct sk_buff *skb)
                        break;
                case ENETUNREACH:
                        code = ICMP_NET_UNREACH;
-                       IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
+                       IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
+                                       IPSTATS_MIB_INNOROUTES);
                        break;
                case EACCES:
                        code = ICMP_PKT_FILTERED;
@@ -1501,21 +1633,21 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
                                    rth->fl.iif != 0 ||
                                    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
                                    !net_eq(dev_net(rth->u.dst.dev), net) ||
-                                   !rt_is_expired(rth))
+                                   rt_is_expired(rth))
                                        continue;
 
                                if (new_mtu < 68 || new_mtu >= old_mtu) {
 
                                        /* BSD 4.2 compatibility hack :-( */
                                        if (mtu == 0 &&
-                                           old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
+                                           old_mtu >= dst_mtu(&rth->u.dst) &&
                                            old_mtu >= 68 + (iph->ihl << 2))
                                                old_mtu -= iph->ihl << 2;
 
                                        mtu = guess_mtu(old_mtu);
                                }
-                               if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
-                                       if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
+                               if (mtu <= dst_mtu(&rth->u.dst)) {
+                                       if (mtu < dst_mtu(&rth->u.dst)) {
                                                dst_confirm(&rth->u.dst);
                                                if (mtu < ip_rt_min_pmtu) {
                                                        mtu = ip_rt_min_pmtu;
@@ -1537,7 +1669,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
 
 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
-       if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
+       if (dst_mtu(dst) > mtu && mtu >= 68 &&
            !(dst_metric_locked(dst, RTAX_MTU))) {
                if (mtu < ip_rt_min_pmtu) {
                        mtu = ip_rt_min_pmtu;
@@ -1666,7 +1798,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 
        if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
                rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
-       if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
+       if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
                rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
        if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
                rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
@@ -2119,6 +2251,10 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        struct net *net;
 
        net = dev_net(dev);
+
+       if (!rt_caching(net))
+               goto skip_cache;
+
        tos &= IPTOS_RT_MASK;
        hash = rt_hash(daddr, saddr, iif, rt_genid(net));
 
@@ -2143,6 +2279,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
        }
        rcu_read_unlock();
 
+skip_cache:
        /* Multicast recognition logic is moved from route cache to here.
           The problem was that too many Ethernet cards have broken/missing
           hardware multicast filters :-( As result the host on multicasting
@@ -2355,11 +2492,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
                    ipv4_is_zeronet(oldflp->fl4_src))
                        goto out;
 
-               /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
-               dev_out = ip_dev_find(net, oldflp->fl4_src);
-               if (dev_out == NULL)
-                       goto out;
-
                /* I removed check for oif == dev_out->oif here.
                   It was wrong for two reasons:
                   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
@@ -2371,6 +2503,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
                if (oldflp->oif == 0
                    && (ipv4_is_multicast(oldflp->fl4_dst) ||
                        oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+                       /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+                       dev_out = ip_dev_find(net, oldflp->fl4_src);
+                       if (dev_out == NULL)
+                               goto out;
+
                        /* Special hack: user can direct multicasts
                           and limited broadcast via necessary interface
                           without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
@@ -2389,9 +2526,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
                        fl.oif = dev_out->ifindex;
                        goto make_route;
                }
-               if (dev_out)
+
+               if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
+                       /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+                       dev_out = ip_dev_find(net, oldflp->fl4_src);
+                       if (dev_out == NULL)
+                               goto out;
                        dev_put(dev_out);
-               dev_out = NULL;
+                       dev_out = NULL;
+               }
        }
 
 
@@ -2522,6 +2665,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
        unsigned hash;
        struct rtable *rth;
 
+       if (!rt_caching(net))
+               goto slow_output;
+
        hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
 
        rcu_read_lock_bh();
@@ -2546,6 +2692,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
        }
        rcu_read_unlock_bh();
 
+slow_output:
        return ip_route_output_slow(net, rp, flp);
 }
 
@@ -2839,7 +2986,9 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
        if (s_h < 0)
                s_h = 0;
        s_idx = idx = cb->args[1];
-       for (h = s_h; h <= rt_hash_mask; h++) {
+       for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
+               if (!rt_hash_table[h].chain)
+                       continue;
                rcu_read_lock_bh();
                for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
                     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
@@ -2858,7 +3007,6 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
                        dst_release(xchg(&skb->dst, NULL));
                }
                rcu_read_unlock_bh();
-               s_idx = 0;
        }
 
 done:
@@ -2895,8 +3043,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
 }
 
 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
-                                               int __user *name,
-                                               int nlen,
                                                void __user *oldval,
                                                size_t __user *oldlenp,
                                                void __user *newval,
@@ -2913,7 +3059,66 @@ static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
        return 0;
 }
 
-ctl_table ipv4_route_table[] = {
+static void rt_secret_reschedule(int old)
+{
+       struct net *net;
+       int new = ip_rt_secret_interval;
+       int diff = new - old;
+
+       if (!diff)
+               return;
+
+       rtnl_lock();
+       for_each_net(net) {
+               int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
+
+               if (!new)
+                       continue;
+
+               if (deleted) {
+                       long time = net->ipv4.rt_secret_timer.expires - jiffies;
+
+                       if (time <= 0 || (time += diff) <= 0)
+                               time = 0;
+
+                       net->ipv4.rt_secret_timer.expires = time;
+               } else
+                       net->ipv4.rt_secret_timer.expires = new;
+
+               net->ipv4.rt_secret_timer.expires += jiffies;
+               add_timer(&net->ipv4.rt_secret_timer);
+       }
+       rtnl_unlock();
+}
+
+static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
+                                         struct file *filp,
+                                         void __user *buffer, size_t *lenp,
+                                         loff_t *ppos)
+{
+       int old = ip_rt_secret_interval;
+       int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
+
+       rt_secret_reschedule(old);
+
+       return ret;
+}
+
+static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
+                                                  void __user *oldval,
+                                                  size_t __user *oldlenp,
+                                                  void __user *newval,
+                                                  size_t newlen)
+{
+       int old = ip_rt_secret_interval;
+       int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
+
+       rt_secret_reschedule(old);
+
+       return ret;
+}
+
+static ctl_table ipv4_route_table[] = {
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
                .procname       = "gc_thresh",
@@ -3047,20 +3252,29 @@ ctl_table ipv4_route_table[] = {
                .data           = &ip_rt_secret_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-               .strategy       = &sysctl_jiffies,
+               .proc_handler   = &ipv4_sysctl_rt_secret_interval,
+               .strategy       = &ipv4_sysctl_rt_secret_interval_strategy,
        },
        { .ctl_name = 0 }
 };
 
-static __net_initdata struct ctl_path ipv4_route_path[] = {
+static struct ctl_table empty[1];
+
+static struct ctl_table ipv4_skeleton[] =
+{
+       { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
+         .mode = 0555, .child = ipv4_route_table},
+       { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
+         .mode = 0555, .child = empty},
+       { }
+};
+
+static __net_initdata struct ctl_path ipv4_path[] = {
        { .procname = "net", .ctl_name = CTL_NET, },
        { .procname = "ipv4", .ctl_name = NET_IPV4, },
-       { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
        { },
 };
 
-
 static struct ctl_table ipv4_route_flush_table[] = {
        {
                .ctl_name       = NET_IPV4_ROUTE_FLUSH,
@@ -3073,6 +3287,13 @@ static struct ctl_table ipv4_route_flush_table[] = {
        { .ctl_name = 0 },
 };
 
+static __net_initdata struct ctl_path ipv4_route_path[] = {
+       { .procname = "net", .ctl_name = CTL_NET, },
+       { .procname = "ipv4", .ctl_name = NET_IPV4, },
+       { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
+       { },
+};
+
 static __net_init int sysctl_route_net_init(struct net *net)
 {
        struct ctl_table *tbl;
@@ -3125,10 +3346,12 @@ static __net_init int rt_secret_timer_init(struct net *net)
        net->ipv4.rt_secret_timer.data = (unsigned long)net;
        init_timer_deferrable(&net->ipv4.rt_secret_timer);
 
-       net->ipv4.rt_secret_timer.expires =
-               jiffies + net_random() % ip_rt_secret_interval +
-               ip_rt_secret_interval;
-       add_timer(&net->ipv4.rt_secret_timer);
+       if (ip_rt_secret_interval) {
+               net->ipv4.rt_secret_timer.expires =
+                       jiffies + net_random() % ip_rt_secret_interval +
+                       ip_rt_secret_interval;
+               add_timer(&net->ipv4.rt_secret_timer);
+       }
        return 0;
 }
 
@@ -3215,6 +3438,17 @@ int __init ip_rt_init(void)
        return rc;
 }
 
+#ifdef CONFIG_SYSCTL
+/*
+ * We really need to sanitize the damn ipv4 init order, then all
+ * this nonsense will go away.
+ */
+void __init ip_static_sysctl_init(void)
+{
+       register_sysctl_paths(ipv4_path, ipv4_skeleton);
+}
+#endif
+
 EXPORT_SYMBOL(__ip_select_ident);
 EXPORT_SYMBOL(ip_route_input);
 EXPORT_SYMBOL(ip_route_output_key);