sysctl: remove "struct file *" argument of ->proc_handler
[safe/jmp/linux-2.6] / net / ipv4 / route.c
index bf89540..bb41992 100644 (file)
@@ -131,8 +131,8 @@ static int ip_rt_min_advmss __read_mostly   = 256;
 static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
 static int rt_chain_length_max __read_mostly   = 20;
 
-static void rt_worker_func(struct work_struct *work);
-static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
+static struct delayed_work expires_work;
+static unsigned long expires_ljiffies;
 
 /*
  *     Interface to generic destination cache.
@@ -151,7 +151,7 @@ static void rt_emergency_hash_rebuild(struct net *net);
 
 static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
-       .protocol =             __constant_htons(ETH_P_IP),
+       .protocol =             cpu_to_be16(ETH_P_IP),
        .gc =                   rt_garbage_collect,
        .check =                ipv4_dst_check,
        .destroy =              ipv4_dst_destroy,
@@ -784,20 +784,23 @@ static void rt_check_expire(void)
 {
        static unsigned int rover;
        unsigned int i = rover, goal;
-       struct rtable *rth, **rthp;
-       unsigned long length = 0, samples = 0;
+       struct rtable *rth, *aux, **rthp;
+       unsigned long samples = 0;
        unsigned long sum = 0, sum2 = 0;
+       unsigned long delta;
        u64 mult;
 
-       mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
+       delta = jiffies - expires_ljiffies;
+       expires_ljiffies = jiffies;
+       mult = ((u64)delta) << rt_hash_log;
        if (ip_rt_gc_timeout > 1)
                do_div(mult, ip_rt_gc_timeout);
        goal = (unsigned int)mult;
        if (goal > rt_hash_mask)
                goal = rt_hash_mask + 1;
-       length = 0;
        for (; goal > 0; goal--) {
                unsigned long tmo = ip_rt_gc_timeout;
+               unsigned long length;
 
                i = (i + 1) & rt_hash_mask;
                rthp = &rt_hash_table[i].chain;
@@ -809,8 +812,10 @@ static void rt_check_expire(void)
 
                if (*rthp == NULL)
                        continue;
+               length = 0;
                spin_lock_bh(rt_hash_lock_addr(i));
                while ((rth = *rthp) != NULL) {
+                       prefetch(rth->u.dst.rt_next);
                        if (rt_is_expired(rth)) {
                                *rthp = rth->u.dst.rt_next;
                                rt_free(rth);
@@ -819,33 +824,30 @@ static void rt_check_expire(void)
                        if (rth->u.dst.expires) {
                                /* Entry is expired even if it is in use */
                                if (time_before_eq(jiffies, rth->u.dst.expires)) {
+nofree:
                                        tmo >>= 1;
                                        rthp = &rth->u.dst.rt_next;
                                        /*
-                                        * Only bump our length if the hash
-                                        * inputs on entries n and n+1 are not
-                                        * the same, we only count entries on
+                                        * We only count entries on
                                         * a chain with equal hash inputs once
                                         * so that entries for different QOS
                                         * levels, and other non-hash input
                                         * attributes don't unfairly skew
                                         * the length computation
                                         */
-                                       if ((*rthp == NULL) ||
-                                           !compare_hash_inputs(&(*rthp)->fl,
-                                                                &rth->fl))
-                                               length += ONE;
+                                       for (aux = rt_hash_table[i].chain;;) {
+                                               if (aux == rth) {
+                                                       length += ONE;
+                                                       break;
+                                               }
+                                               if (compare_hash_inputs(&aux->fl, &rth->fl))
+                                                       break;
+                                               aux = aux->u.dst.rt_next;
+                                       }
                                        continue;
                                }
-                       } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-                               tmo >>= 1;
-                               rthp = &rth->u.dst.rt_next;
-                               if ((*rthp == NULL) ||
-                                   !compare_hash_inputs(&(*rthp)->fl,
-                                                        &rth->fl))
-                                       length += ONE;
-                               continue;
-                       }
+                       } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+                               goto nofree;
 
                        /* Cleanup aged off entries. */
                        *rthp = rth->u.dst.rt_next;
@@ -1065,10 +1067,10 @@ work_done:
 out:   return 0;
 }
 
-static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
+static int rt_intern_hash(unsigned hash, struct rtable *rt,
+                         struct rtable **rp, struct sk_buff *skb)
 {
        struct rtable   *rth, **rthp;
-       struct rtable   *rthi;
        unsigned long   now;
        struct rtable *cand, **candp;
        u32             min_score;
@@ -1083,12 +1085,38 @@ restart:
        now = jiffies;
 
        if (!rt_caching(dev_net(rt->u.dst.dev))) {
-               rt_drop(rt);
-               return 0;
+               /*
+                * If we're not caching, just tell the caller we
+                * were successful and don't touch the route.  The
+                * caller holds the sole reference to the cache entry, and
+                * it will be released when the caller is done with it.
+                * If we drop it here, the callers have no way to resolve routes
+                * when we're not caching.  Instead, just point *rp at rt, so
+                * the caller gets a single use out of the route.
+                * Note that we do rt_free on this new route entry, so that
+                * once its refcount hits zero, we are still able to reap it
+                * (Thanks Alexey).
+                * Note also that rt_free uses call_rcu.  We don't actually
+                * need rcu protection here, this is just our path to get
+                * on the route gc list.
+                */
+
+               if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
+                       int err = arp_bind_neighbour(&rt->u.dst);
+                       if (err) {
+                               if (net_ratelimit())
+                                       printk(KERN_WARNING
+                                           "Neighbour table failure & not caching routes.\n");
+                               rt_drop(rt);
+                               return err;
+                       }
+               }
+
+               rt_free(rt);
+               goto skip_hashing;
        }
 
        rthp = &rt_hash_table[hash].chain;
-       rthi = NULL;
 
        spin_lock_bh(rt_hash_lock_addr(hash));
        while ((rth = *rthp) != NULL) {
@@ -1117,7 +1145,10 @@ restart:
                        spin_unlock_bh(rt_hash_lock_addr(hash));
 
                        rt_drop(rt);
-                       *rp = rth;
+                       if (rp)
+                               *rp = rth;
+                       else
+                               skb_dst_set(skb, &rth->u.dst);
                        return 0;
                }
 
@@ -1134,17 +1165,6 @@ restart:
                chain_length++;
 
                rthp = &rth->u.dst.rt_next;
-
-               /*
-                * check to see if the next entry in the chain
-                * contains the same hash input values as rt.  If it does
-                * This is where we will insert into the list, instead of
-                * at the head.  This groups entries that differ by aspects not
-                * relvant to the hash function together, which we use to adjust
-                * our chain length
-                */
-               if (*rthp && compare_hash_inputs(&(*rthp)->fl, &rt->fl))
-                       rthi = rth;
        }
 
        if (cand) {
@@ -1205,15 +1225,13 @@ restart:
                }
        }
 
-       if (rthi)
-               rt->u.dst.rt_next = rthi->u.dst.rt_next;
-       else
-               rt->u.dst.rt_next = rt_hash_table[hash].chain;
+       rt->u.dst.rt_next = rt_hash_table[hash].chain;
 
 #if RT_CACHE_DEBUG >= 2
        if (rt->u.dst.rt_next) {
                struct rtable *trt;
-               printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
+               printk(KERN_DEBUG "rt_cache @%02x: %pI4",
+                      hash, &rt->rt_dst);
                for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
                        printk(" . %pI4", &trt->rt_dst);
                printk("\n");
@@ -1224,13 +1242,15 @@ restart:
         * previous writes to rt are comitted to memory
         * before making rt visible to other CPUS.
         */
-       if (rthi)
-               rcu_assign_pointer(rthi->u.dst.rt_next, rt);
-       else
-               rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+       rcu_assign_pointer(rt_hash_table[hash].chain, rt);
 
        spin_unlock_bh(rt_hash_lock_addr(hash));
-       *rp = rt;
+
+skip_hashing:
+       if (rp)
+               *rp = rt;
+       else
+               skb_dst_set(skb, &rt->u.dst);
        return 0;
 }
 
@@ -1427,7 +1447,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
                                                        &netevent);
 
                                rt_del(hash, rth);
-                               if (!rt_intern_hash(hash, rt, &rt))
+                               if (!rt_intern_hash(hash, rt, &rt, NULL))
                                        ip_rt_put(rt);
                                goto do_next;
                        }
@@ -1493,14 +1513,18 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 
 void ip_rt_send_redirect(struct sk_buff *skb)
 {
-       struct rtable *rt = skb->rtable;
-       struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
+       struct rtable *rt = skb_rtable(skb);
+       struct in_device *in_dev;
+       int log_martians;
 
-       if (!in_dev)
+       rcu_read_lock();
+       in_dev = __in_dev_get_rcu(rt->u.dst.dev);
+       if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
+               rcu_read_unlock();
                return;
-
-       if (!IN_DEV_TX_REDIRECTS(in_dev))
-               goto out;
+       }
+       log_martians = IN_DEV_LOG_MARTIANS(in_dev);
+       rcu_read_unlock();
 
        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
@@ -1513,7 +1537,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
         */
        if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
                rt->u.dst.rate_last = jiffies;
-               goto out;
+               return;
        }
 
        /* Check for load limit; set rate_last to the latest sent
@@ -1527,7 +1551,7 @@ void ip_rt_send_redirect(struct sk_buff *skb)
                rt->u.dst.rate_last = jiffies;
                ++rt->u.dst.rate_tokens;
 #ifdef CONFIG_IP_ROUTE_VERBOSE
-               if (IN_DEV_LOG_MARTIANS(in_dev) &&
+               if (log_martians &&
                    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
                    net_ratelimit())
                        printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
@@ -1535,13 +1559,11 @@ void ip_rt_send_redirect(struct sk_buff *skb)
                                &rt->rt_dst, &rt->rt_gateway);
 #endif
        }
-out:
-       in_dev_put(in_dev);
 }
 
 static int ip_error(struct sk_buff *skb)
 {
-       struct rtable *rt = skb->rtable;
+       struct rtable *rt = skb_rtable(skb);
        unsigned long now;
        int code;
 
@@ -1718,7 +1740,7 @@ static void ipv4_link_failure(struct sk_buff *skb)
 
        icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 
-       rt = skb->rtable;
+       rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->u.dst, 0);
 }
@@ -1878,7 +1900,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 
        in_dev_put(in_dev);
        hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
-       return rt_intern_hash(hash, rth, &skb->rtable);
+       return rt_intern_hash(hash, rth, NULL, skb);
 
 e_nobufs:
        in_dev_put(in_dev);
@@ -2039,7 +2061,7 @@ static int ip_mkroute_input(struct sk_buff *skb,
        /* put it into the cache */
        hash = rt_hash(daddr, saddr, fl->iif,
                       rt_genid(dev_net(rth->u.dst.dev)));
-       return rt_intern_hash(hash, rth, &skb->rtable);
+       return rt_intern_hash(hash, rth, NULL, skb);
 }
 
 /*
@@ -2195,7 +2217,7 @@ local_input:
        }
        rth->rt_type    = res.type;
        hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
-       err = rt_intern_hash(hash, rth, &skb->rtable);
+       err = rt_intern_hash(hash, rth, NULL, skb);
        goto done;
 
 no_route:
@@ -2264,7 +2286,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
                        dst_use(&rth->u.dst, jiffies);
                        RT_CACHE_STAT_INC(in_hit);
                        rcu_read_unlock();
-                       skb->rtable = rth;
+                       skb_dst_set(skb, &rth->u.dst);
                        return 0;
                }
                RT_CACHE_STAT_INC(in_hlist_search);
@@ -2440,7 +2462,7 @@ static int ip_mkroute_output(struct rtable **rp,
        if (err == 0) {
                hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
                               rt_genid(dev_net(dev_out)));
-               err = rt_intern_hash(hash, rth, rp);
+               err = rt_intern_hash(hash, rth, rp, NULL);
        }
 
        return err;
@@ -2696,7 +2718,7 @@ static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 
 static struct dst_ops ipv4_dst_blackhole_ops = {
        .family                 =       AF_INET,
-       .protocol               =       __constant_htons(ETH_P_IP),
+       .protocol               =       cpu_to_be16(ETH_P_IP),
        .destroy                =       ipv4_dst_destroy,
        .check                  =       ipv4_dst_check,
        .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
@@ -2779,10 +2801,11 @@ int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
        return ip_route_output_flow(net, rp, flp, NULL, 0);
 }
 
-static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+static int rt_fill_info(struct net *net,
+                       struct sk_buff *skb, u32 pid, u32 seq, int event,
                        int nowait, unsigned int flags)
 {
-       struct rtable *rt = skb->rtable;
+       struct rtable *rt = skb_rtable(skb);
        struct rtmsg *r;
        struct nlmsghdr *nlh;
        long expires;
@@ -2844,8 +2867,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
                __be32 dst = rt->rt_dst;
 
                if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
-                   IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
-                       int err = ipmr_get_route(skb, r, nowait);
+                   IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+                       int err = ipmr_get_route(net, skb, r, nowait);
                        if (err <= 0) {
                                if (!nowait) {
                                        if (err == 0)
@@ -2926,7 +2949,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
                err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
                local_bh_enable();
 
-               rt = skb->rtable;
+               rt = skb_rtable(skb);
                if (err == 0 && rt->u.dst.error)
                        err = -rt->u.dst.error;
        } else {
@@ -2946,11 +2969,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
        if (err)
                goto errout_free;
 
-       skb->rtable = rt;
+       skb_dst_set(skb, &rt->u.dst);
        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;
 
-       err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+       err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
                           RTM_NEWROUTE, 0, 0);
        if (err <= 0)
                goto errout_free;
@@ -2987,15 +3010,15 @@ int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
                                continue;
                        if (rt_is_expired(rt))
                                continue;
-                       skb->dst = dst_clone(&rt->u.dst);
-                       if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
+                       skb_dst_set(skb, dst_clone(&rt->u.dst));
+                       if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE,
                                         1, NLM_F_MULTI) <= 0) {
-                               dst_release(xchg(&skb->dst, NULL));
+                               skb_dst_drop(skb);
                                rcu_read_unlock_bh();
                                goto done;
                        }
-                       dst_release(xchg(&skb->dst, NULL));
+                       skb_dst_drop(skb);
                }
                rcu_read_unlock_bh();
        }
@@ -3013,7 +3036,7 @@ void ip_rt_multicast_event(struct in_device *in_dev)
 
 #ifdef CONFIG_SYSCTL
 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
-                                       struct file *filp, void __user *buffer,
+                                       void __user *buffer,
                                        size_t *lenp, loff_t *ppos)
 {
        if (write) {
@@ -3023,7 +3046,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
 
                memcpy(&ctl, __ctl, sizeof(ctl));
                ctl.data = &flush_delay;
-               proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
+               proc_dointvec(&ctl, write, buffer, lenp, ppos);
 
                net = (struct net *)__ctl->extra1;
                rt_cache_flush(net, flush_delay);
@@ -3083,12 +3106,11 @@ static void rt_secret_reschedule(int old)
 }
 
 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
-                                         struct file *filp,
                                          void __user *buffer, size_t *lenp,
                                          loff_t *ppos)
 {
        int old = ip_rt_secret_interval;
-       int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
+       int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
 
        rt_secret_reschedule(old);
 
@@ -3391,12 +3413,12 @@ int __init ip_rt_init(void)
                alloc_large_system_hash("IP route cache",
                                        sizeof(struct rt_hash_bucket),
                                        rhash_entries,
-                                       (num_physpages >= 128 * 1024) ?
+                                       (totalram_pages >= 128 * 1024) ?
                                        15 : 17,
                                        0,
                                        &rt_hash_log,
                                        &rt_hash_mask,
-                                       0);
+                                       rhash_entries ? 0 : 512 * 1024);
        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
        rt_hash_lock_init();
 
@@ -3409,6 +3431,8 @@ int __init ip_rt_init(void)
        /* All the timers, started at system startup tend
           to synchronize. Perturb it a bit.
         */
+       INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
+       expires_ljiffies = jiffies;
        schedule_delayed_work(&expires_work,
                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
 
@@ -3419,7 +3443,7 @@ int __init ip_rt_init(void)
                printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
        xfrm_init();
-       xfrm4_init();
+       xfrm4_init(ip_rt_max_size);
 #endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);