X-Git-Url: http://ftp.safe.ca/?p=safe%2Fjmp%2Flinux-2.6;a=blobdiff_plain;f=net%2Fipv4%2Froute.c;h=9889fbd964872abdaf3c2781811aec7a555cb7c5;hp=8d23cc7efbadd4efdaa58b3a9a53eb598a97595f;hb=a661c4199b300503dae694efa21f1354f763b97c;hpb=00269b54edbf25f3bb0dccb558ae23a6fc77ed86 diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8d23cc7..9889fbd 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -129,9 +129,10 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ; +static int rt_chain_length_max __read_mostly = 20; -static void rt_worker_func(struct work_struct *work); -static DECLARE_DELAYED_WORK(expires_work, rt_worker_func); +static struct delayed_work expires_work; +static unsigned long expires_ljiffies; /* * Interface to generic destination cache. @@ -145,11 +146,12 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst); static void ipv4_link_failure(struct sk_buff *skb); static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu); static int rt_garbage_collect(struct dst_ops *ops); +static void rt_emergency_hash_rebuild(struct net *net); static struct dst_ops ipv4_dst_ops = { .family = AF_INET, - .protocol = __constant_htons(ETH_P_IP), + .protocol = cpu_to_be16(ETH_P_IP), .gc = rt_garbage_collect, .check = ipv4_dst_check, .destroy = ipv4_dst_destroy, @@ -158,7 +160,6 @@ static struct dst_ops ipv4_dst_ops = { .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .local_out = __ip_local_out, - .entry_size = sizeof(struct rtable), .entries = ATOMIC_INIT(0), }; @@ -201,6 +202,7 @@ const __u8 ip_tos2prio[16] = { struct rt_hash_bucket { struct rtable *chain; }; + #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \ defined(CONFIG_PROVE_LOCKING) /* @@ -427,7 +429,7 @@ static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos) if (*pos == 0) return SEQ_START_TOKEN; - for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) { + for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; @@ -440,7 +442,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos) { int cpu; - for (cpu = *pos; cpu < NR_CPUS; ++cpu) { + for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) { if (!cpu_possible(cpu)) continue; *pos = cpu+1; @@ -511,43 +513,42 @@ static const struct file_operations rt_cpu_seq_fops = { }; #ifdef CONFIG_NET_CLS_ROUTE -static int ip_rt_acct_read(char *buffer, char **start, off_t offset, - int length, int *eof, void *data) -{ - unsigned int i; - - if ((offset & 3) || (length & 3)) - return -EIO; - - if (offset >= sizeof(struct ip_rt_acct) * 256) { - *eof = 1; - return 0; - } - - if (offset + length >= sizeof(struct ip_rt_acct) * 256) { - length = sizeof(struct ip_rt_acct) * 256 - offset; - *eof = 1; +static int rt_acct_proc_show(struct seq_file *m, void *v) +{ + struct ip_rt_acct *dst, *src; + unsigned int i, j; + + dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL); + if (!dst) + return -ENOMEM; + + for_each_possible_cpu(i) { + src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i); + for (j = 0; j < 256; j++) { + dst[j].o_bytes += src[j].o_bytes; + dst[j].o_packets += src[j].o_packets; + dst[j].i_bytes += src[j].i_bytes; + dst[j].i_packets += src[j].i_packets; + } } - offset /= sizeof(u32); - - if (length > 0) { - u32 *dst = (u32 *) buffer; - - *start = buffer; - memset(dst, 0, length); - - for_each_possible_cpu(i) { - unsigned int j; - u32 *src; + seq_write(m, dst, 256 * sizeof(struct ip_rt_acct)); + kfree(dst); + return 0; +} - src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset; - for (j = 0; j < length/4; j++) - dst[j] += src[j]; - } - } - return length; +static int rt_acct_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, rt_acct_proc_show, NULL); } + +static const struct file_operations rt_acct_proc_fops = { + .owner = THIS_MODULE, + .open = rt_acct_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; #endif static int __net_init ip_rt_do_proc_init(struct net *net) @@ -565,8 +566,7 @@ static int __net_init ip_rt_do_proc_init(struct net *net) goto err2; #ifdef CONFIG_NET_CLS_ROUTE - pde = create_proc_read_entry("rt_acct", 0, net->proc_net, - ip_rt_acct_read, NULL); + pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops); if (!pde) goto err3; #endif @@ -674,6 +674,20 @@ static inline u32 rt_score(struct rtable *rt) return score; } +static inline bool rt_caching(const struct net *net) +{ + return net->ipv4.current_rt_cache_rebuild_count <= + net->ipv4.sysctl_rt_cache_rebuild_count; +} + +static inline bool compare_hash_inputs(const struct flowi *fl1, + const struct flowi *fl2) +{ + return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | + (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) | + (fl1->iif ^ fl2->iif)) == 0); +} + static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) { return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) | @@ -687,7 +701,7 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) static inline int compare_netns(struct rtable *rt1, struct rtable *rt2) { - return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev); + return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev)); } static inline int rt_is_expired(struct rtable *rth) @@ -753,14 +767,30 @@ static void rt_do_flush(int process_context) } } +/* + * While freeing expired entries, we compute average chain length + * and standard deviation, using fixed-point arithmetic. + * This to have an estimation of rt_chain_length_max + * rt_chain_length_max = max(elasticity, AVG + 4*SD) + * We use 3 bits for frational part, and 29 (or 61) for magnitude. + */ + +#define FRACT_BITS 3 +#define ONE (1UL << FRACT_BITS) + static void rt_check_expire(void) { static unsigned int rover; unsigned int i = rover, goal; - struct rtable *rth, **rthp; + struct rtable *rth, *aux, **rthp; + unsigned long samples = 0; + unsigned long sum = 0, sum2 = 0; + unsigned long delta; u64 mult; - mult = ((u64)ip_rt_gc_interval) << rt_hash_log; + delta = jiffies - expires_ljiffies; + expires_ljiffies = jiffies; + mult = ((u64)delta) << rt_hash_log; if (ip_rt_gc_timeout > 1) do_div(mult, ip_rt_gc_timeout); goal = (unsigned int)mult; @@ -768,6 +798,7 @@ static void rt_check_expire(void) goal = rt_hash_mask + 1; for (; goal > 0; goal--) { unsigned long tmo = ip_rt_gc_timeout; + unsigned long length; i = (i + 1) & rt_hash_mask; rthp = &rt_hash_table[i].chain; @@ -775,10 +806,14 @@ static void rt_check_expire(void) if (need_resched()) cond_resched(); + samples++; + if (*rthp == NULL) continue; + length = 0; spin_lock_bh(rt_hash_lock_addr(i)); while ((rth = *rthp) != NULL) { + prefetch(rth->u.dst.rt_next); if (rt_is_expired(rth)) { *rthp = rth->u.dst.rt_next; rt_free(rth); @@ -787,21 +822,45 @@ static void rt_check_expire(void) if (rth->u.dst.expires) { /* Entry is expired even if it is in use */ if (time_before_eq(jiffies, rth->u.dst.expires)) { +nofree: tmo >>= 1; rthp = &rth->u.dst.rt_next; + /* + * We only count entries on + * a chain with equal hash inputs once + * so that entries for different QOS + * levels, and other non-hash input + * attributes don't unfairly skew + * the length computation + */ + for (aux = rt_hash_table[i].chain;;) { + if (aux == rth) { + length += ONE; + break; + } + if (compare_hash_inputs(&aux->fl, &rth->fl)) + break; + aux = aux->u.dst.rt_next; + } continue; } - } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) { - tmo >>= 1; - rthp = &rth->u.dst.rt_next; - continue; - } + } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) + goto nofree; /* Cleanup aged off entries. */ *rthp = rth->u.dst.rt_next; rt_free(rth); } spin_unlock_bh(rt_hash_lock_addr(i)); + sum += length; + sum2 += length*length; + } + if (samples) { + unsigned long avg = sum / samples; + unsigned long sd = int_sqrt(sum2 / samples - avg*avg); + rt_chain_length_max = max_t(unsigned long, + ip_rt_gc_elasticity, + (avg + 4*sd) >> FRACT_BITS); } rover = i; } @@ -851,6 +910,26 @@ static void rt_secret_rebuild(unsigned long __net) mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval); } +static void rt_secret_rebuild_oneshot(struct net *net) +{ + del_timer_sync(&net->ipv4.rt_secret_timer); + rt_cache_invalidate(net); + if (ip_rt_secret_interval) { + net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval; + add_timer(&net->ipv4.rt_secret_timer); + } +} + +static void rt_emergency_hash_rebuild(struct net *net) +{ + if (net_ratelimit()) { + printk(KERN_WARNING "Route hash chain too long!\n"); + printk(KERN_WARNING "Adjust your secret_interval!\n"); + } + + rt_secret_rebuild_oneshot(net); +} + /* Short description of GC goals. @@ -986,7 +1065,8 @@ work_done: out: return 0; } -static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) +static int rt_intern_hash(unsigned hash, struct rtable *rt, + struct rtable **rp, struct sk_buff *skb) { struct rtable *rth, **rthp; unsigned long now; @@ -1002,6 +1082,38 @@ restart: candp = NULL; now = jiffies; + if (!rt_caching(dev_net(rt->u.dst.dev))) { + /* + * If we're not caching, just tell the caller we + * were successful and don't touch the route. The + * caller hold the sole reference to the cache entry, and + * it will be released when the caller is done with it. + * If we drop it here, the callers have no way to resolve routes + * when we're not caching. Instead, just point *rp at rt, so + * the caller gets a single use out of the route + * Note that we do rt_free on this new route entry, so that + * once its refcount hits zero, we are still able to reap it + * (Thanks Alexey) + * Note also the rt_free uses call_rcu. We don't actually + * need rcu protection here, this is just our path to get + * on the route gc list. + */ + + if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { + int err = arp_bind_neighbour(&rt->u.dst); + if (err) { + if (net_ratelimit()) + printk(KERN_WARNING + "Neighbour table failure & not caching routes.\n"); + rt_drop(rt); + return err; + } + } + + rt_free(rt); + goto skip_hashing; + } + rthp = &rt_hash_table[hash].chain; spin_lock_bh(rt_hash_lock_addr(hash)); @@ -1031,7 +1143,10 @@ restart: spin_unlock_bh(rt_hash_lock_addr(hash)); rt_drop(rt); - *rp = rth; + if (rp) + *rp = rth; + else + skb_dst_set(skb, &rth->u.dst); return 0; } @@ -1061,6 +1176,16 @@ restart: *candp = cand->u.dst.rt_next; rt_free(cand); } + } else { + if (chain_length > rt_chain_length_max) { + struct net *net = dev_net(rt->u.dst.dev); + int num = ++net->ipv4.current_rt_cache_rebuild_count; + if (!rt_caching(dev_net(rt->u.dst.dev))) { + printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n", + rt->u.dst.dev->name, num); + } + rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev)); + } } /* Try to bind route to arp only if it is output @@ -1099,13 +1224,14 @@ restart: } rt->u.dst.rt_next = rt_hash_table[hash].chain; + #if RT_CACHE_DEBUG >= 2 if (rt->u.dst.rt_next) { struct rtable *trt; - printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash, - NIPQUAD(rt->rt_dst)); + printk(KERN_DEBUG "rt_cache @%02x: %pI4", + hash, &rt->rt_dst); for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next) - printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst)); + printk(" . %pI4", &trt->rt_dst); printk("\n"); } #endif @@ -1115,8 +1241,14 @@ restart: * before making rt visible to other CPUS. */ rcu_assign_pointer(rt_hash_table[hash].chain, rt); + spin_unlock_bh(rt_hash_lock_addr(hash)); - *rp = rt; + +skip_hashing: + if (rp) + *rp = rt; + else + skb_dst_set(skb, &rt->u.dst); return 0; } @@ -1212,9 +1344,12 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, return; net = dev_net(dev); - if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) - || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) - || ipv4_is_zeronet(new_gw)) + if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) || + ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) || + ipv4_is_zeronet(new_gw)) + goto reject_redirect; + + if (!rt_caching(net)) goto reject_redirect; if (!IN_DEV_SHARED_MEDIA(in_dev)) { @@ -1267,7 +1402,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, /* Copy all the information. */ *rt = *rth; - INIT_RCU_HEAD(&rt->u.dst.rcu_head); rt->u.dst.__use = 1; atomic_set(&rt->u.dst.__refcnt, 1); rt->u.dst.child = NULL; @@ -1280,7 +1414,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, rt->u.dst.path = &rt->u.dst; rt->u.dst.neighbour = NULL; rt->u.dst.hh = NULL; +#ifdef CONFIG_XFRM rt->u.dst.xfrm = NULL; +#endif rt->rt_genid = rt_genid(net); rt->rt_flags |= RTCF_REDIRECTED; @@ -1309,7 +1445,7 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, &netevent); rt_del(hash, rth); - if (!rt_intern_hash(hash, rt, &rt)) + if (!rt_intern_hash(hash, rt, &rt, NULL)) ip_rt_put(rt); goto do_next; } @@ -1324,11 +1460,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, reject_redirect: #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about " - NIPQUAD_FMT " ignored.\n" - " Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n", - NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw), - NIPQUAD(saddr), NIPQUAD(daddr)); + printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n" + " Advised path = %pI4 -> %pI4\n", + &old_gw, dev->name, &new_gw, + &saddr, &daddr); #endif in_dev_put(in_dev); } @@ -1348,9 +1483,8 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) rt->fl.oif, rt_genid(dev_net(dst->dev))); #if RT_CACHE_DEBUG >= 1 - printk(KERN_DEBUG "ipv4_negative_advice: redirect to " - NIPQUAD_FMT "/%02x dropped\n", - NIPQUAD(rt->rt_dst), rt->fl.fl4_tos); + printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n", + &rt->rt_dst, rt->fl.fl4_tos); #endif rt_del(hash, rt); ret = NULL; @@ -1377,14 +1511,18 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst) void ip_rt_send_redirect(struct sk_buff *skb) { - struct rtable *rt = skb->rtable; - struct in_device *in_dev = in_dev_get(rt->u.dst.dev); + struct rtable *rt = skb_rtable(skb); + struct in_device *in_dev; + int log_martians; - if (!in_dev) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(rt->u.dst.dev); + if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) { + rcu_read_unlock(); return; - - if (!IN_DEV_TX_REDIRECTS(in_dev)) - goto out; + } + log_martians = IN_DEV_LOG_MARTIANS(in_dev); + rcu_read_unlock(); /* No redirected packets during ip_rt_redirect_silence; * reset the algorithm. @@ -1397,7 +1535,7 @@ void ip_rt_send_redirect(struct sk_buff *skb) */ if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) { rt->u.dst.rate_last = jiffies; - goto out; + return; } /* Check for load limit; set rate_last to the latest sent @@ -1411,22 +1549,19 @@ void ip_rt_send_redirect(struct sk_buff *skb) rt->u.dst.rate_last = jiffies; ++rt->u.dst.rate_tokens; #ifdef CONFIG_IP_ROUTE_VERBOSE - if (IN_DEV_LOG_MARTIANS(in_dev) && + if (log_martians && rt->u.dst.rate_tokens == ip_rt_redirect_number && net_ratelimit()) - printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores " - "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n", - NIPQUAD(rt->rt_src), rt->rt_iif, - NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway)); + printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n", + &rt->rt_src, rt->rt_iif, + &rt->rt_dst, &rt->rt_gateway); #endif } -out: - in_dev_put(in_dev); } static int ip_error(struct sk_buff *skb) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); unsigned long now; int code; @@ -1491,9 +1626,6 @@ unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph, __be32 daddr = iph->daddr; unsigned short est_mtu = 0; - if (ipv4_config.no_pmtu_disc) - return 0; - for (k = 0; k < 2; k++) { for (i = 0; i < 2; i++) { unsigned hash = rt_hash(daddr, skeys[i], ikeys[k], @@ -1603,15 +1735,15 @@ static void ipv4_link_failure(struct sk_buff *skb) icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); - rt = skb->rtable; + rt = skb_rtable(skb); if (rt) dst_set_expires(&rt->u.dst, 0); } static int ip_rt_bug(struct sk_buff *skb) { - printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n", - NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr), + printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n", + &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr, skb->dev ? skb->dev->name : "?"); kfree_skb(skb); return 0; @@ -1717,7 +1849,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto e_inval; spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); } else if (fib_validate_source(saddr, 0, tos, 0, - dev, &spec_dst, &itag) < 0) + dev, &spec_dst, &itag, 0) < 0) goto e_inval; rth = dst_alloc(&ipv4_dst_ops); @@ -1763,7 +1895,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, in_dev_put(in_dev); hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev))); - return rt_intern_hash(hash, rth, &skb->rtable); + return rt_intern_hash(hash, rth, NULL, skb); e_nobufs: in_dev_put(in_dev); @@ -1788,9 +1920,8 @@ static void ip_handle_martian_source(struct net_device *dev, * RFC1812 recommendation, if source is martian, * the only hint is MAC header. */ - printk(KERN_WARNING "martian source " NIPQUAD_FMT " from " - NIPQUAD_FMT", on dev %s\n", - NIPQUAD(daddr), NIPQUAD(saddr), dev->name); + printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n", + &daddr, &saddr, dev->name); if (dev->hard_header_len && skb_mac_header_was_set(skb)) { int i; const unsigned char *p = skb_mac_header(skb); @@ -1831,7 +1962,7 @@ static int __mkroute_input(struct sk_buff *skb, err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res), - in_dev->dev, &spec_dst, &itag); + in_dev->dev, &spec_dst, &itag, skb->mark); if (err < 0) { ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr, saddr); @@ -1925,7 +2056,7 @@ static int ip_mkroute_input(struct sk_buff *skb, /* put it into the cache */ hash = rt_hash(daddr, saddr, fl->iif, rt_genid(dev_net(rth->u.dst.dev))); - return rt_intern_hash(hash, rth, &skb->rtable); + return rt_intern_hash(hash, rth, NULL, skb); } /* @@ -2005,7 +2136,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, int result; result = fib_validate_source(saddr, daddr, tos, net->loopback_dev->ifindex, - dev, &spec_dst, &itag); + dev, &spec_dst, &itag, skb->mark); if (result < 0) goto martian_source; if (result) @@ -2034,7 +2165,7 @@ brd_input: spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK); else { err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst, - &itag); + &itag, skb->mark); if (err < 0) goto martian_source; if (err) @@ -2081,7 +2212,7 @@ local_input: } rth->rt_type = res.type; hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); - err = rt_intern_hash(hash, rth, &skb->rtable); + err = rt_intern_hash(hash, rth, NULL, skb); goto done; no_route: @@ -2099,9 +2230,8 @@ martian_destination: RT_CACHE_STAT_INC(in_martian_dst); #ifdef CONFIG_IP_ROUTE_VERBOSE if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) - printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from " - NIPQUAD_FMT ", dev %s\n", - NIPQUAD(daddr), NIPQUAD(saddr), dev->name); + printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n", + &daddr, &saddr, dev->name); #endif e_hostunreach: @@ -2130,6 +2260,10 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, struct net *net; net = dev_net(dev); + + if (!rt_caching(net)) + goto skip_cache; + tos &= IPTOS_RT_MASK; hash = rt_hash(daddr, saddr, iif, rt_genid(net)); @@ -2147,13 +2281,14 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, dst_use(&rth->u.dst, jiffies); RT_CACHE_STAT_INC(in_hit); rcu_read_unlock(); - skb->rtable = rth; + skb_dst_set(skb, &rth->u.dst); return 0; } RT_CACHE_STAT_INC(in_hlist_search); } rcu_read_unlock(); +skip_cache: /* Multicast recognition logic is moved from route cache to here. The problem was that too many Ethernet cards have broken/missing hardware multicast filters :-( As result the host on multicasting @@ -2174,10 +2309,11 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr, ip_hdr(skb)->protocol); if (our #ifdef CONFIG_IP_MROUTE - || (!ipv4_is_local_multicast(daddr) && - IN_DEV_MFORWARD(in_dev)) + || + (!ipv4_is_local_multicast(daddr) && + IN_DEV_MFORWARD(in_dev)) #endif - ) { + ) { rcu_read_unlock(); return ip_route_input_mc(skb, daddr, saddr, tos, dev, our); @@ -2322,7 +2458,7 @@ static int ip_mkroute_output(struct rtable **rp, if (err == 0) { hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif, rt_genid(dev_net(dev_out))); - err = rt_intern_hash(hash, rth, rp); + err = rt_intern_hash(hash, rth, rp, NULL); } return err; @@ -2374,9 +2510,9 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, of another iface. --ANK */ - if (oldflp->oif == 0 - && (ipv4_is_multicast(oldflp->fl4_dst) || - oldflp->fl4_dst == htonl(0xFFFFFFFF))) { + if (oldflp->oif == 0 && + (ipv4_is_multicast(oldflp->fl4_dst) || + oldflp->fl4_dst == htonl(0xFFFFFFFF))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = ip_dev_find(net, oldflp->fl4_src); if (dev_out == NULL) @@ -2539,6 +2675,9 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, unsigned hash; struct rtable *rth; + if (!rt_caching(net)) + goto slow_output; + hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net)); rcu_read_lock_bh(); @@ -2563,6 +2702,7 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, } rcu_read_unlock_bh(); +slow_output: return ip_route_output_slow(net, rp, flp); } @@ -2574,11 +2714,10 @@ static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) static struct dst_ops ipv4_dst_blackhole_ops = { .family = AF_INET, - .protocol = __constant_htons(ETH_P_IP), + .protocol = cpu_to_be16(ETH_P_IP), .destroy = ipv4_dst_destroy, .check = ipv4_dst_check, .update_pmtu = ipv4_rt_blackhole_update_pmtu, - .entry_size = sizeof(struct rtable), .entries = ATOMIC_INIT(0), }; @@ -2640,7 +2779,7 @@ int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, flp->fl4_src = (*rp)->rt_src; if (!flp->fl4_dst) flp->fl4_dst = (*rp)->rt_dst; - err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, + err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk, flags ? XFRM_LOOKUP_WAIT : 0); if (err == -EREMOTE) err = ipv4_dst_blackhole(net, rp, flp); @@ -2658,10 +2797,11 @@ int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp) return ip_route_output_flow(net, rp, flp, NULL, 0); } -static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, +static int rt_fill_info(struct net *net, + struct sk_buff *skb, u32 pid, u32 seq, int event, int nowait, unsigned int flags) { - struct rtable *rt = skb->rtable; + struct rtable *rt = skb_rtable(skb); struct rtmsg *r; struct nlmsghdr *nlh; long expires; @@ -2711,7 +2851,7 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, error = rt->u.dst.error; expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0; if (rt->peer) { - id = rt->peer->ip_id_count; + id = atomic_read(&rt->peer->ip_id_count) & 0xffff; if (rt->peer->tcp_ts_stamp) { ts = rt->peer->tcp_ts; tsage = get_seconds() - rt->peer->tcp_ts_stamp; @@ -2723,8 +2863,8 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event, __be32 dst = rt->rt_dst; if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) && - IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) { - int err = ipmr_get_route(skb, r, nowait); + IPV4_DEVCONF_ALL(net, MC_FORWARDING)) { + int err = ipmr_get_route(net, skb, r, nowait); if (err <= 0) { if (!nowait) { if (err == 0) @@ -2805,7 +2945,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev); local_bh_enable(); - rt = skb->rtable; + rt = skb_rtable(skb); if (err == 0 && rt->u.dst.error) err = -rt->u.dst.error; } else { @@ -2825,11 +2965,11 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void if (err) goto errout_free; - skb->rtable = rt; + skb_dst_set(skb, &rt->u.dst); if (rtm->rtm_flags & RTM_F_NOTIFY) rt->rt_flags |= RTCF_NOTIFY; - err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, + err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq, RTM_NEWROUTE, 0, 0); if (err <= 0) goto errout_free; @@ -2866,15 +3006,15 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb) continue; if (rt_is_expired(rt)) continue; - skb->dst = dst_clone(&rt->u.dst); - if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid, + skb_dst_set(skb, dst_clone(&rt->u.dst)); + if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, RTM_NEWROUTE, 1, NLM_F_MULTI) <= 0) { - dst_release(xchg(&skb->dst, NULL)); + skb_dst_drop(skb); rcu_read_unlock_bh(); goto done; } - dst_release(xchg(&skb->dst, NULL)); + skb_dst_drop(skb); } rcu_read_unlock_bh(); } @@ -2892,7 +3032,7 @@ void ip_rt_multicast_event(struct in_device *in_dev) #ifdef CONFIG_SYSCTL static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, - struct file *filp, void __user *buffer, + void __user *buffer, size_t *lenp, loff_t *ppos) { if (write) { @@ -2902,7 +3042,7 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, memcpy(&ctl, __ctl, sizeof(ctl)); ctl.data = &flush_delay; - proc_dointvec(&ctl, write, filp, buffer, lenp, ppos); + proc_dointvec(&ctl, write, buffer, lenp, ppos); net = (struct net *)__ctl->extra1; rt_cache_flush(net, flush_delay); @@ -2913,8 +3053,6 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write, } static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table, - int __user *name, - int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, @@ -2964,12 +3102,11 @@ static void rt_secret_reschedule(int old) } static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, - struct file *filp, void __user *buffer, size_t *lenp, loff_t *ppos) { int old = ip_rt_secret_interval; - int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos); + int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos); rt_secret_reschedule(old); @@ -2977,16 +3114,13 @@ static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write, } static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table, - int __user *name, - int nlen, void __user *oldval, size_t __user *oldlenp, void __user *newval, size_t newlen) { int old = ip_rt_secret_interval; - int ret = sysctl_jiffies(table, name, nlen, oldval, oldlenp, newval, - newlen); + int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen); rt_secret_reschedule(old); @@ -3000,7 +3134,7 @@ static ctl_table ipv4_route_table[] = { .data = &ipv4_dst_ops.gc_thresh, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_IPV4_ROUTE_MAX_SIZE, @@ -3008,7 +3142,7 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_max_size, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { /* Deprecated. Use gc_min_interval_ms */ @@ -3018,8 +3152,8 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, + .strategy = sysctl_jiffies, }, { .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS, @@ -3027,8 +3161,8 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_gc_min_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_ms_jiffies, - .strategy = &sysctl_ms_jiffies, + .proc_handler = proc_dointvec_ms_jiffies, + .strategy = sysctl_ms_jiffies, }, { .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT, @@ -3036,8 +3170,8 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_gc_timeout, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, + .strategy = sysctl_jiffies, }, { .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL, @@ -3045,8 +3179,8 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_gc_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, + .strategy = sysctl_jiffies, }, { .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD, @@ -3054,7 +3188,7 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_redirect_load, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER, @@ -3062,7 +3196,7 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_redirect_number, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE, @@ -3070,7 +3204,7 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_redirect_silence, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_IPV4_ROUTE_ERROR_COST, @@ -3078,7 +3212,7 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_error_cost, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_IPV4_ROUTE_ERROR_BURST, @@ -3086,7 +3220,7 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_error_burst, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY, @@ -3094,7 +3228,7 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_gc_elasticity, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES, @@ -3102,8 +3236,8 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_mtu_expires, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, - .strategy = &sysctl_jiffies, + .proc_handler = proc_dointvec_jiffies, + .strategy = sysctl_jiffies, }, { .ctl_name = NET_IPV4_ROUTE_MIN_PMTU, @@ -3111,7 +3245,7 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_min_pmtu, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS, @@ -3119,7 +3253,7 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_min_advmss, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL, @@ -3127,8 +3261,8 @@ static ctl_table ipv4_route_table[] = { .data = &ip_rt_secret_interval, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &ipv4_sysctl_rt_secret_interval, - .strategy = &ipv4_sysctl_rt_secret_interval_strategy, + .proc_handler = ipv4_sysctl_rt_secret_interval, + .strategy = ipv4_sysctl_rt_secret_interval_strategy, }, { .ctl_name = 0 } }; @@ -3156,8 +3290,8 @@ static struct ctl_table ipv4_route_flush_table[] = { .procname = "flush", .maxlen = sizeof(int), .mode = 0200, - .proc_handler = &ipv4_sysctl_rtcache_flush, - .strategy = &ipv4_sysctl_rtcache_flush_strategy, + .proc_handler = ipv4_sysctl_rtcache_flush, + .strategy = ipv4_sysctl_rtcache_flush_strategy, }, { .ctl_name = 0 }, }; @@ -3174,7 +3308,7 @@ static __net_init int sysctl_route_net_init(struct net *net) struct ctl_table *tbl; tbl = ipv4_route_flush_table; - if (net != &init_net) { + if (!net_eq(net, &init_net)) { tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL); if (tbl == NULL) goto err_dup; @@ -3260,7 +3394,7 @@ int __init ip_rt_init(void) int rc = 0; #ifdef CONFIG_NET_CLS_ROUTE - ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct)); + ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct)); if (!ip_rt_acct) panic("IP: failed to allocate ip_rt_acct\n"); #endif @@ -3275,12 +3409,12 @@ int __init ip_rt_init(void) alloc_large_system_hash("IP route cache", sizeof(struct rt_hash_bucket), rhash_entries, - (num_physpages >= 128 * 1024) ? + (totalram_pages >= 128 * 1024) ? 15 : 17, 0, &rt_hash_log, &rt_hash_mask, - 0); + rhash_entries ? 0 : 512 * 1024); memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket)); rt_hash_lock_init(); @@ -3293,6 +3427,8 @@ int __init ip_rt_init(void) /* All the timers, started at system startup tend to synchronize. Perturb it a bit. */ + INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func); + expires_ljiffies = jiffies; schedule_delayed_work(&expires_work, net_random() % ip_rt_gc_interval + ip_rt_gc_interval); @@ -3303,7 +3439,7 @@ int __init ip_rt_init(void) printk(KERN_ERR "Unable to create route proc files\n"); #ifdef CONFIG_XFRM xfrm_init(); - xfrm4_init(); + xfrm4_init(ip_rt_max_size); #endif rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);