net: fix length computation in rt_check_expire()

[safe/jmp/linux-2.6] / net / ipv4 / route.c
diff --git a/net/ipv4/route.c b/net/ipv4/route.c

index 79c1e74..869cf1c 100644 (file)
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly  = 10 * 60 * HZ;
  static int ip_rt_min_pmtu __read_mostly                = 512 + 20 + 20;
  static int ip_rt_min_advmss __read_mostly      = 256;
  static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
+static int rt_chain_length_max __read_mostly   = 20;
  
  static void rt_worker_func(struct work_struct *work);
  static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
@@ -145,11 +146,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
  static void             ipv4_link_failure(struct sk_buff *skb);
  static void             ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
  static int rt_garbage_collect(struct dst_ops *ops);
+static void rt_emergency_hash_rebuild(struct net *net);
  
  
  static struct dst_ops ipv4_dst_ops = {
         .family =               AF_INET,
-       .protocol =             __constant_htons(ETH_P_IP),
+       .protocol =             cpu_to_be16(ETH_P_IP),
         .gc =                   rt_garbage_collect,
         .check =                ipv4_dst_check,
         .destroy =              ipv4_dst_destroy,
@@ -158,7 +160,6 @@ static struct dst_ops ipv4_dst_ops = {
         .link_failure =         ipv4_link_failure,
         .update_pmtu =          ip_rt_update_pmtu,
         .local_out =            __ip_local_out,
-       .entry_size =           sizeof(struct rtable),
         .entries =              ATOMIC_INIT(0),
  };
  
@@ -201,6 +202,7 @@ const __u8 ip_tos2prio[16] = {
  struct rt_hash_bucket {
         struct rtable   *chain;
  };
+
  #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
         defined(CONFIG_PROVE_LOCKING)
  /*
@@ -282,6 +284,8 @@ static struct rtable *rt_cache_get_first(struct seq_file *seq)
         struct rtable *r = NULL;
  
         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
+               if (!rt_hash_table[st->bucket].chain)
+                       continue;
                 rcu_read_lock_bh();
                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
                 while (r) {
@@ -299,11 +303,14 @@ static struct rtable *__rt_cache_get_next(struct seq_file *seq,
                                           struct rtable *r)
  {
         struct rt_cache_iter_state *st = seq->private;
+
         r = r->u.dst.rt_next;
         while (!r) {
                 rcu_read_unlock_bh();
-               if (--st->bucket < 0)
-                       break;
+               do {
+                       if (--st->bucket < 0)
+                               return NULL;
+               } while (!rt_hash_table[st->bucket].chain);
                 rcu_read_lock_bh();
                 r = rt_hash_table[st->bucket].chain;
         }
@@ -422,7 +429,7 @@ static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
         if (*pos == 0)
                 return SEQ_START_TOKEN;
  
-       for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
+       for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                 if (!cpu_possible(cpu))
                         continue;
                 *pos = cpu+1;
@@ -435,7 +442,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
  {
         int cpu;
  
-       for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+       for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                 if (!cpu_possible(cpu))
                         continue;
                 *pos = cpu+1;
@@ -669,6 +676,20 @@ static inline u32 rt_score(struct rtable *rt)
         return score;
  }
  
+/*
+ * Tell whether route caching is still in effect for @net.
+ *
+ * Returns true as long as the namespace's running count of cache
+ * rebuilds has not gone past its sysctl_rt_cache_rebuild_count limit.
+ */
+static inline bool rt_caching(const struct net *net)
+{
+       return !(net->ipv4.current_rt_cache_rebuild_count >
+                net->ipv4.sysctl_rt_cache_rebuild_count);
+}
+
+/*
+ * Compare two flows on destination address, source address and input
+ * interface only.  Returns true when all three fields are equal.
+ *
+ * The __force casts sit on the address XORs, as in compare_keys(), so
+ * that the addresses are plain u32 before being OR-ed with the iif
+ * difference; casting the final boolean instead served no purpose.
+ */
+static inline bool compare_hash_inputs(const struct flowi *fl1,
+                                       const struct flowi *fl2)
+{
+       return ((__force u32)(fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
+               (__force u32)(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
+               (fl1->iif ^ fl2->iif)) == 0;
+}
+
  static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
  {
         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
@@ -748,11 +769,24 @@ static void rt_do_flush(int process_context)
         }
  }
  
+/*
+ * While freeing expired entries, we compute average chain length
+ * and standard deviation, using fixed-point arithmetic.
+ * This gives an estimate for rt_chain_length_max:
+ *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
+ * We use 3 bits for the fractional part, and 29 (or 61) for magnitude.
+ */
+
+#define FRACT_BITS 3
+#define ONE (1UL << FRACT_BITS)
+
  static void rt_check_expire(void)
  {
         static unsigned int rover;
         unsigned int i = rover, goal;
         struct rtable *rth, **rthp;
+       unsigned long samples = 0;
+       unsigned long sum = 0, sum2 = 0;
         u64 mult;
  
         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
@@ -763,6 +797,7 @@ static void rt_check_expire(void)
                 goal = rt_hash_mask + 1;
         for (; goal > 0; goal--) {
                 unsigned long tmo = ip_rt_gc_timeout;
+               unsigned long length;
  
                 i = (i + 1) & rt_hash_mask;
                 rthp = &rt_hash_table[i].chain;
@@ -770,8 +805,11 @@ static void rt_check_expire(void)
                 if (need_resched())
                         cond_resched();
  
+               samples++;
+
                 if (*rthp == NULL)
                         continue;
+               length = 0;
                 spin_lock_bh(rt_hash_lock_addr(i));
                 while ((rth = *rthp) != NULL) {
                         if (rt_is_expired(rth)) {
@@ -784,11 +822,29 @@ static void rt_check_expire(void)
                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
                                         tmo >>= 1;
                                         rthp = &rth->u.dst.rt_next;
+                                       /*
+                                        * Only bump our length if the hash
+                                        * inputs on entries n and n+1 are not
+                                        * the same, we only count entries on
+                                        * a chain with equal hash inputs once
+                                        * so that entries for different QOS
+                                        * levels, and other non-hash input
+                                        * attributes don't unfairly skew
+                                        * the length computation
+                                        */
+                                       if ((*rthp == NULL) ||
+                                           !compare_hash_inputs(&(*rthp)->fl,
+                                                                &rth->fl))
+                                               length += ONE;
                                         continue;
                                 }
                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
                                 tmo >>= 1;
                                 rthp = &rth->u.dst.rt_next;
+                               if ((*rthp == NULL) ||
+                                   !compare_hash_inputs(&(*rthp)->fl,
+                                                        &rth->fl))
+                                       length += ONE;
                                 continue;
                         }
  
@@ -797,6 +853,15 @@ static void rt_check_expire(void)
                         rt_free(rth);
                 }
                 spin_unlock_bh(rt_hash_lock_addr(i));
+               sum += length;
+               sum2 += length*length;
+       }
+       if (samples) {
+               unsigned long avg = sum / samples;
+               unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+               rt_chain_length_max = max_t(unsigned long,
+                                       ip_rt_gc_elasticity,
+                                       (avg + 4*sd) >> FRACT_BITS);
         }
         rover = i;
  }
@@ -846,6 +911,26 @@ static void rt_secret_rebuild(unsigned long __net)
         mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
  }
  
+/*
+ * Stop the secret timer of @net, call rt_cache_invalidate() on it and,
+ * when a secret interval is configured, start the timer again so that
+ * it fires one full interval from now.
+ */
+static void rt_secret_rebuild_oneshot(struct net *net)
+{
+       del_timer_sync(&net->ipv4.rt_secret_timer);
+       rt_cache_invalidate(net);
+       /*
+        * Schedule relative to jiffies, as rt_secret_rebuild() does.  The
+        * ->expires value left behind by the deleted timer is stale and
+        * is not a reliable base for computing the next expiry.
+        */
+       if (ip_rt_secret_interval)
+               mod_timer(&net->ipv4.rt_secret_timer,
+                         jiffies + ip_rt_secret_interval);
+}
+
+/*
+ * Log a (rate-limited) warning about an over-long hash chain, then
+ * trigger a one-shot secret rebuild for @net.  The rebuild happens
+ * whether or not the warning was printed.
+ */
+static void rt_emergency_hash_rebuild(struct net *net)
+{
+       const bool may_log = net_ratelimit();
+
+       if (may_log) {
+               printk(KERN_WARNING "Route hash chain too long!\n");
+               printk(KERN_WARNING "Adjust your secret_interval!\n");
+       }
+       rt_secret_rebuild_oneshot(net);
+}
+
  /*
     Short description of GC goals.
  
@@ -984,6 +1069,7 @@ out:       return 0;
  static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
  {
         struct rtable   *rth, **rthp;
+       struct rtable   *rthi;
         unsigned long   now;
         struct rtable *cand, **candp;
         u32             min_score;
@@ -997,7 +1083,13 @@ restart:
         candp = NULL;
         now = jiffies;
  
+       if (!rt_caching(dev_net(rt->u.dst.dev))) {
+               rt_drop(rt);
+               return 0;
+       }
+
         rthp = &rt_hash_table[hash].chain;
+       rthi = NULL;
  
         spin_lock_bh(rt_hash_lock_addr(hash));
         while ((rth = *rthp) != NULL) {
@@ -1043,6 +1135,17 @@ restart:
                 chain_length++;
  
                 rthp = &rth->u.dst.rt_next;
+
+               /*
+                * Check to see if the next entry in the chain
+                * contains the same hash input values as rt.  If it does,
+                * this is where we will insert into the list, instead of
+                * at the head.  This groups entries that differ by aspects not
+                * relevant to the hash function together, which we use to adjust
+                * our chain length.
+                */
+               if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
+                       rthi = rth;
         }
  
         if (cand) {
@@ -1056,6 +1159,16 @@ restart:
                         *candp = cand->u.dst.rt_next;
                         rt_free(cand);
                 }
+       } else {
+               if (chain_length > rt_chain_length_max) {
+                       struct net *net = dev_net(rt->u.dst.dev);
+                       int num = ++net->ipv4.current_rt_cache_rebuild_count;
+                       if (!rt_caching(dev_net(rt->u.dst.dev))) {
+                               printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
+                                       rt->u.dst.dev->name, num);
+                       }
+                       rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
+               }
         }
  
         /* Try to bind route to arp only if it is output
@@ -1093,18 +1206,30 @@ restart:
                 }
         }
  
-       rt->u.dst.rt_next = rt_hash_table[hash].chain;
+       if (rthi)
+               rt->u.dst.rt_next = rthi->u.dst.rt_next;
+       else
+               rt->u.dst.rt_next = rt_hash_table[hash].chain;
+
  #if RT_CACHE_DEBUG >= 2
         if (rt->u.dst.rt_next) {
                 struct rtable *trt;
-               printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
-                      NIPQUAD(rt->rt_dst));
+               printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
-                       printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
+                       printk(" . %pI4", &trt->rt_dst);
                 printk("\n");
         }
  #endif
-       rt_hash_table[hash].chain = rt;
+       /*
+        * Since lookup is lockfree, we must make sure
+        * previous writes to rt are committed to memory
+        * before making rt visible to other CPUs.
+        */
+       if (rthi)
+               rcu_assign_pointer(rthi->u.dst.rt_next, rt);
+       else
+               rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+
         spin_unlock_bh(rt_hash_lock_addr(hash));
         *rp = rt;
         return 0;
@@ -1207,6 +1332,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
             || ipv4_is_zeronet(new_gw))
                 goto reject_redirect;
  
+       if (!rt_caching(net))
+               goto reject_redirect;
+
         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                         goto reject_redirect;
@@ -1257,7 +1385,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
  
                                 /* Copy all the information. */
                                 *rt = *rth;
-                               INIT_RCU_HEAD(&rt->u.dst.rcu_head);
                                 rt->u.dst.__use         = 1;
                                 atomic_set(&rt->u.dst.__refcnt, 1);
                                 rt->u.dst.child         = NULL;
@@ -1270,7 +1397,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
                                 rt->u.dst.path          = &rt->u.dst;
                                 rt->u.dst.neighbour     = NULL;
                                 rt->u.dst.hh            = NULL;
+#ifdef CONFIG_XFRM
                                 rt->u.dst.xfrm          = NULL;
+#endif
                                 rt->rt_genid            = rt_genid(net);
                                 rt->rt_flags            |= RTCF_REDIRECTED;
  
@@ -1314,11 +1443,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
  reject_redirect:
  #ifdef CONFIG_IP_ROUTE_VERBOSE
         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
-               printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
-                       NIPQUAD_FMT " ignored.\n"
-                       "  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
-                      NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
-                      NIPQUAD(saddr), NIPQUAD(daddr));
+               printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
+                       "  Advised path = %pI4 -> %pI4\n",
+                      &old_gw, dev->name, &new_gw,
+                      &saddr, &daddr);
  #endif
         in_dev_put(in_dev);
  }
@@ -1338,9 +1466,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
                                                 rt->fl.oif,
                                                 rt_genid(dev_net(dst->dev)));
  #if RT_CACHE_DEBUG >= 1
-                       printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
-                                         NIPQUAD_FMT "/%02x dropped\n",
-                               NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
+                       printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
+                               &rt->rt_dst, rt->fl.fl4_tos);
  #endif
                         rt_del(hash, rt);
                         ret = NULL;
@@ -1404,10 +1531,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                     net_ratelimit())
-                       printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
-                               "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
-                               NIPQUAD(rt->rt_src), rt->rt_iif,
-                               NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
+                       printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
+                               &rt->rt_src, rt->rt_iif,
+                               &rt->rt_dst, &rt->rt_gateway);
  #endif
         }
  out:
@@ -1429,7 +1555,8 @@ static int ip_error(struct sk_buff *skb)
                         break;
                 case ENETUNREACH:
                         code = ICMP_NET_UNREACH;
-                       IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
+                       IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
+                                       IPSTATS_MIB_INNOROUTES);
                         break;
                 case EACCES:
                         code = ICMP_PKT_FILTERED;
@@ -1501,21 +1628,21 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
                                     rth->fl.iif != 0 ||
                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
-                                   !rt_is_expired(rth))
+                                   rt_is_expired(rth))
                                         continue;
  
                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
  
                                         /* BSD 4.2 compatibility hack :-( */
                                         if (mtu == 0 &&
-                                           old_mtu >= dst_metric(&rth->u.dst, RTAX_MTU) &&
+                                           old_mtu >= dst_mtu(&rth->u.dst) &&
                                             old_mtu >= 68 + (iph->ihl << 2))
                                                 old_mtu -= iph->ihl << 2;
  
                                         mtu = guess_mtu(old_mtu);
                                 }
-                               if (mtu <= dst_metric(&rth->u.dst, RTAX_MTU)) {
-                                       if (mtu < dst_metric(&rth->u.dst, RTAX_MTU)) {
+                               if (mtu <= dst_mtu(&rth->u.dst)) {
+                                       if (mtu < dst_mtu(&rth->u.dst)) {
                                                 dst_confirm(&rth->u.dst);
                                                 if (mtu < ip_rt_min_pmtu) {
                                                         mtu = ip_rt_min_pmtu;
@@ -1537,7 +1664,7 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
  
  static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
  {
-       if (dst_metric(dst, RTAX_MTU) > mtu && mtu >= 68 &&
+       if (dst_mtu(dst) > mtu && mtu >= 68 &&
             !(dst_metric_locked(dst, RTAX_MTU))) {
                 if (mtu < ip_rt_min_pmtu) {
                         mtu = ip_rt_min_pmtu;
@@ -1599,8 +1726,8 @@ static void ipv4_link_failure(struct sk_buff *skb)
  
  static int ip_rt_bug(struct sk_buff *skb)
  {
-       printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
-               NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
+       printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
+               &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
         kfree_skb(skb);
         return 0;
@@ -1666,7 +1793,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
  
         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
-       if (dst_metric(&rt->u.dst, RTAX_MTU) > IP_MAX_MTU)
+       if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
@@ -1777,9 +1904,8 @@ static void ip_handle_martian_source(struct net_device *dev,
                  *      RFC1812 recommendation, if source is martian,
                  *      the only hint is MAC header.
                  */
-               printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
-                       NIPQUAD_FMT", on dev %s\n",
-                       NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
+               printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
+                       &daddr, &saddr, dev->name);
                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
                         int i;
                         const unsigned char *p = skb_mac_header(skb);
@@ -2088,9 +2214,8 @@ martian_destination:
         RT_CACHE_STAT_INC(in_martian_dst);
  #ifdef CONFIG_IP_ROUTE_VERBOSE
         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
-               printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
-                       NIPQUAD_FMT ", dev %s\n",
-                       NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
+               printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
+                       &daddr, &saddr, dev->name);
  #endif
  
  e_hostunreach:
@@ -2119,6 +2244,10 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
         struct net *net;
  
         net = dev_net(dev);
+
+       if (!rt_caching(net))
+               goto skip_cache;
+
         tos &= IPTOS_RT_MASK;
         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
  
@@ -2143,6 +2272,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
         }
         rcu_read_unlock();
  
+skip_cache:
         /* Multicast recognition logic is moved from route cache to here.
            The problem was that too many Ethernet cards have broken/missing
            hardware multicast filters :-( As result the host on multicasting
@@ -2355,11 +2485,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
                     ipv4_is_zeronet(oldflp->fl4_src))
                         goto out;
  
-               /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
-               dev_out = ip_dev_find(net, oldflp->fl4_src);
-               if (dev_out == NULL)
-                       goto out;
-
                 /* I removed check for oif == dev_out->oif here.
                    It was wrong for two reasons:
                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
@@ -2371,6 +2496,11 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
                 if (oldflp->oif == 0
                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+                       /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+                       dev_out = ip_dev_find(net, oldflp->fl4_src);
+                       if (dev_out == NULL)
+                               goto out;
+
                         /* Special hack: user can direct multicasts
                            and limited broadcast via necessary interface
                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
@@ -2389,9 +2519,15 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp,
                         fl.oif = dev_out->ifindex;
                         goto make_route;
                 }
-               if (dev_out)
+
+               if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
+                       /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+                       dev_out = ip_dev_find(net, oldflp->fl4_src);
+                       if (dev_out == NULL)
+                               goto out;
                         dev_put(dev_out);
-               dev_out = NULL;
+                       dev_out = NULL;
+               }
         }
  
  
@@ -2522,6 +2658,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
         unsigned hash;
         struct rtable *rth;
  
+       if (!rt_caching(net))
+               goto slow_output;
+
         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
  
         rcu_read_lock_bh();
@@ -2546,6 +2685,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp,
         }
         rcu_read_unlock_bh();
  
+slow_output:
         return ip_route_output_slow(net, rp, flp);
  }
  
@@ -2557,11 +2697,10 @@ static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
  
  static struct dst_ops ipv4_dst_blackhole_ops = {
         .family                 =       AF_INET,
-       .protocol               =       __constant_htons(ETH_P_IP),
+       .protocol               =       cpu_to_be16(ETH_P_IP),
         .destroy                =       ipv4_dst_destroy,
         .check                  =       ipv4_dst_check,
         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
-       .entry_size             =       sizeof(struct rtable),
         .entries                =       ATOMIC_INIT(0),
  };
  
@@ -2623,7 +2762,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
                         flp->fl4_src = (*rp)->rt_src;
                 if (!flp->fl4_dst)
                         flp->fl4_dst = (*rp)->rt_dst;
-               err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
+               err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
                                     flags ? XFRM_LOOKUP_WAIT : 0);
                 if (err == -EREMOTE)
                         err = ipv4_dst_blackhole(net, rp, flp);
@@ -2641,7 +2780,8 @@ int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
         return ip_route_output_flow(net, rp, flp, NULL, 0);
  }
  
-static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+static int rt_fill_info(struct net *net,
+                       struct sk_buff *skb, u32 pid, u32 seq, int event,
                         int nowait, unsigned int flags)
  {
         struct rtable *rt = skb->rtable;
@@ -2706,8 +2846,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
                 __be32 dst = rt->rt_dst;
  
                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
-                   IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
-                       int err = ipmr_get_route(skb, r, nowait);
+                   IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+                       int err = ipmr_get_route(net, skb, r, nowait);
                         if (err <= 0) {
                                 if (!nowait) {
                                         if (err == 0)
@@ -2812,7 +2952,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
         if (rtm->rtm_flags & RTM_F_NOTIFY)
                 rt->rt_flags |= RTCF_NOTIFY;
  
-       err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+       err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
                            RTM_NEWROUTE, 0, 0);
         if (err <= 0)
                 goto errout_free;
@@ -2839,7 +2979,9 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
         if (s_h < 0)
                 s_h = 0;
         s_idx = idx = cb->args[1];
-       for (h = s_h; h <= rt_hash_mask; h++) {
+       for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
+               if (!rt_hash_table[h].chain)
+                       continue;
                 rcu_read_lock_bh();
                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
@@ -2848,7 +2990,7 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
                         if (rt_is_expired(rt))
                                 continue;
                         skb->dst = dst_clone(&rt->u.dst);
-                       if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
+                       if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
                                          1, NLM_F_MULTI) <= 0) {
                                 dst_release(xchg(&skb->dst, NULL));
@@ -2858,7 +3000,6 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
                         dst_release(xchg(&skb->dst, NULL));
                 }
                 rcu_read_unlock_bh();
-               s_idx = 0;
         }
  
  done:
@@ -2895,8 +3036,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
  }
  
  static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
-                                               int __user *name,
-                                               int nlen,
                                                 void __user *oldval,
                                                 size_t __user *oldlenp,
                                                 void __user *newval,
@@ -2913,14 +3052,73 @@ static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
         return 0;
  }
  
-ctl_table ipv4_route_table[] = {
+/*
+ * Re-arm the secret timer of every namespace after ip_rt_secret_interval
+ * changed.  @old is the interval that was in effect before the change.
+ *
+ * Nothing is done when the interval did not change.  Otherwise, under
+ * rtnl_lock(), each namespace's timer is deleted and:
+ *  - left stopped when the new interval is zero;
+ *  - if it was pending, restarted with its remaining time shifted by the
+ *    difference between new and old interval (never less than zero);
+ *  - if it was not pending, restarted with the full new interval.
+ */
+static void rt_secret_reschedule(int old)
+{
+       const int new = ip_rt_secret_interval;
+       const int diff = new - old;
+       struct net *net;
+
+       if (diff == 0)
+               return;
+
+       rtnl_lock();
+       for_each_net(net) {
+               const int was_pending =
+                       del_timer_sync(&net->ipv4.rt_secret_timer);
+               long delay;
+
+               /* Interval switched off: keep the timer stopped. */
+               if (new == 0)
+                       continue;
+
+               if (!was_pending) {
+                       delay = new;
+               } else {
+                       /* Remaining time, adjusted by the interval change. */
+                       delay = net->ipv4.rt_secret_timer.expires - jiffies;
+                       if (delay > 0)
+                               delay += diff;
+                       if (delay <= 0)
+                               delay = 0;
+               }
+
+               net->ipv4.rt_secret_timer.expires = delay + jiffies;
+               add_timer(&net->ipv4.rt_secret_timer);
+       }
+       rtnl_unlock();
+}
+
+/*
+ * sysctl handler built on proc_dointvec_jiffies(): remember the value of
+ * ip_rt_secret_interval, let proc_dointvec_jiffies() process the
+ * request, then pass the remembered value to rt_secret_reschedule().
+ * Returns whatever proc_dointvec_jiffies() returned.
+ */
+static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
+                                         struct file *filp,
+                                         void __user *buffer, size_t *lenp,
+                                         loff_t *ppos)
+{
+       const int prev_interval = ip_rt_secret_interval;
+       int rc;
+
+       rc = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
+       rt_secret_reschedule(prev_interval);
+
+       return rc;
+}
+
+/*
+ * sysctl strategy routine built on sysctl_jiffies(): remember the value
+ * of ip_rt_secret_interval, let sysctl_jiffies() process the request,
+ * then pass the remembered value to rt_secret_reschedule().
+ * Returns whatever sysctl_jiffies() returned.
+ */
+static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
+                                                  void __user *oldval,
+                                                  size_t __user *oldlenp,
+                                                  void __user *newval,
+                                                  size_t newlen)
+{
+       const int prev_interval = ip_rt_secret_interval;
+       int rc;
+
+       rc = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
+       rt_secret_reschedule(prev_interval);
+
+       return rc;
+}
+
+static ctl_table ipv4_route_table[] = {
         {
                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
                 .procname       = "gc_thresh",
                 .data           = &ipv4_dst_ops.gc_thresh,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
+               .proc_handler   = proc_dointvec,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
@@ -2928,7 +3126,7 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_max_size,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
+               .proc_handler   = proc_dointvec,
         },
         {
                 /*  Deprecated. Use gc_min_interval_ms */
@@ -2938,8 +3136,8 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_gc_min_interval,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-               .strategy       = &sysctl_jiffies,
+               .proc_handler   = proc_dointvec_jiffies,
+               .strategy       = sysctl_jiffies,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
@@ -2947,8 +3145,8 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_gc_min_interval,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec_ms_jiffies,
-               .strategy       = &sysctl_ms_jiffies,
+               .proc_handler   = proc_dointvec_ms_jiffies,
+               .strategy       = sysctl_ms_jiffies,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
@@ -2956,8 +3154,8 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_gc_timeout,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-               .strategy       = &sysctl_jiffies,
+               .proc_handler   = proc_dointvec_jiffies,
+               .strategy       = sysctl_jiffies,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
@@ -2965,8 +3163,8 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_gc_interval,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-               .strategy       = &sysctl_jiffies,
+               .proc_handler   = proc_dointvec_jiffies,
+               .strategy       = sysctl_jiffies,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
@@ -2974,7 +3172,7 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_redirect_load,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
+               .proc_handler   = proc_dointvec,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
@@ -2982,7 +3180,7 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_redirect_number,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
+               .proc_handler   = proc_dointvec,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
@@ -2990,7 +3188,7 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_redirect_silence,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
+               .proc_handler   = proc_dointvec,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
@@ -2998,7 +3196,7 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_error_cost,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
+               .proc_handler   = proc_dointvec,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
@@ -3006,7 +3204,7 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_error_burst,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
+               .proc_handler   = proc_dointvec,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
@@ -3014,7 +3212,7 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_gc_elasticity,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
+               .proc_handler   = proc_dointvec,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
@@ -3022,8 +3220,8 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_mtu_expires,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-               .strategy       = &sysctl_jiffies,
+               .proc_handler   = proc_dointvec_jiffies,
+               .strategy       = sysctl_jiffies,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
@@ -3031,7 +3229,7 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_min_pmtu,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
+               .proc_handler   = proc_dointvec,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
@@ -3039,7 +3237,7 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_min_advmss,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
+               .proc_handler   = proc_dointvec,
         },
         {
                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
@@ -3047,32 +3245,48 @@ ctl_table ipv4_route_table[] = {
                 .data           = &ip_rt_secret_interval,
                 .maxlen         = sizeof(int),
                 .mode           = 0644,
-               .proc_handler   = &proc_dointvec_jiffies,
-               .strategy       = &sysctl_jiffies,
+               .proc_handler   = ipv4_sysctl_rt_secret_interval,
+               .strategy       = ipv4_sysctl_rt_secret_interval_strategy,
         },
         { .ctl_name = 0 }
  };
  
-static __net_initdata struct ctl_path ipv4_route_path[] = {
+/*
+ * File-scope static with no initializer, so its single entry is all
+ * zeroes: the same shape as the "{ }" terminator of ipv4_skeleton.
+ */
+static struct ctl_table empty[1];
+
+/*
+ * Static directory entries "route" and "neigh", both mode 0555.
+ * "route" has ipv4_route_table as its child; "neigh" has the
+ * zeroed "empty" table.
+ */
+static struct ctl_table ipv4_skeleton[] = {
+       {
+               .ctl_name       = NET_IPV4_ROUTE,
+               .procname       = "route",
+               .mode           = 0555,
+               .child          = ipv4_route_table,
+       },
+       {
+               .ctl_name       = NET_IPV4_NEIGH,
+               .procname       = "neigh",
+               .mode           = 0555,
+               .child          = empty,
+       },
+       { }
+};
+
+/*
+ * Sysctl path "net" -> "ipv4"; ip_static_sysctl_init() passes it to
+ * register_sysctl_paths() together with ipv4_skeleton.
+ */
+static __net_initdata struct ctl_path ipv4_path[] = {
         { .procname = "net", .ctl_name = CTL_NET, },
         { .procname = "ipv4", .ctl_name = NET_IPV4, },
-       { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
         { },
  };
  
-
  static struct ctl_table ipv4_route_flush_table[] = {
+       /*
+        * Single "flush" entry, mode 0200 (owner write-only), served by
+        * ipv4_sysctl_rtcache_flush and its _strategy counterpart.
+        */
         {
                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
                 .procname       = "flush",
                 .maxlen         = sizeof(int),
                 .mode           = 0200,
-               .proc_handler   = &ipv4_sysctl_rtcache_flush,
-               .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
+               .proc_handler   = ipv4_sysctl_rtcache_flush,
+               .strategy       = ipv4_sysctl_rtcache_flush_strategy,
         },
         { .ctl_name = 0 },
  };
  
+/* Sysctl path "net" -> "ipv4" -> "route". */
+static __net_initdata struct ctl_path ipv4_route_path[] = {
+       {
+               .ctl_name       = CTL_NET,
+               .procname       = "net",
+       },
+       {
+               .ctl_name       = NET_IPV4,
+               .procname       = "ipv4",
+       },
+       {
+               .ctl_name       = NET_IPV4_ROUTE,
+               .procname       = "route",
+       },
+       { },
+};
+
  static __net_init int sysctl_route_net_init(struct net *net)
  {
         struct ctl_table *tbl;
@@ -3125,10 +3339,12 @@ static __net_init int rt_secret_timer_init(struct net *net)
         net->ipv4.rt_secret_timer.data = (unsigned long)net;
         init_timer_deferrable(&net->ipv4.rt_secret_timer);
  
-       net->ipv4.rt_secret_timer.expires =
-               jiffies + net_random() % ip_rt_secret_interval +
-               ip_rt_secret_interval;
-       add_timer(&net->ipv4.rt_secret_timer);
+       if (ip_rt_secret_interval) {
+               net->ipv4.rt_secret_timer.expires =
+                       jiffies + net_random() % ip_rt_secret_interval +
+                       ip_rt_secret_interval;
+               add_timer(&net->ipv4.rt_secret_timer);
+       }
         return 0;
  }
  
@@ -3162,7 +3378,7 @@ int __init ip_rt_init(void)
         int rc = 0;
  
  #ifdef CONFIG_NET_CLS_ROUTE
-       ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
+       ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
         if (!ip_rt_acct)
                 panic("IP: failed to allocate ip_rt_acct\n");
  #endif
@@ -3182,7 +3398,7 @@ int __init ip_rt_init(void)
                                         0,
                                         &rt_hash_log,
                                         &rt_hash_mask,
-                                       0);
+                                       rhash_entries ? 0 : 512 * 1024);
         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
         rt_hash_lock_init();
  
@@ -3215,6 +3431,17 @@ int __init ip_rt_init(void)
         return rc;
  }
  
+#ifdef CONFIG_SYSCTL
+/*
+ * Register the static ipv4_skeleton table (the "route" and "neigh"
+ * directories) under the ipv4_path sysctl path.  Only built when
+ * CONFIG_SYSCTL is defined.  Any value returned by
+ * register_sysctl_paths() is ignored.
+ *
+ * We really need to sanitize the damn ipv4 init order, then all
+ * this nonsense will go away.
+ */
+void __init ip_static_sysctl_init(void)
+{
+       register_sysctl_paths(ipv4_path, ipv4_skeleton);
+}
+#endif
+
  EXPORT_SYMBOL(__ip_select_ident);
  EXPORT_SYMBOL(ip_route_input);
  EXPORT_SYMBOL(ip_route_output_key);