X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fipv4%2Froute.c;h=28205e5bfa9b703a30baed0afe83c24575080574;hb=0975ecba3b670df7c488a5e0e6fe9f1f370a8ad8;hp=1b70ffd1261580008dc499ef1b286ff7ee8edd53;hpb=bb72845e699d3c84e5f861b51db686107a51dea5;p=safe%2Fjmp%2Flinux-2.6

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 1b70ffd..28205e5 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -5,8 +5,6 @@
  *
  *		ROUTE - implementation of the IP router.
  *
- * Version:	$Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
- *
  * Authors:	Ross Biro
  *		Fred N. van Kempen,
  *		Alan Cox,
@@ -117,31 +115,24 @@
 
 #define RT_GC_TIMEOUT (300*HZ)
 
-static int ip_rt_min_delay = 2 * HZ;
-static int ip_rt_max_delay = 10 * HZ;
 static int ip_rt_max_size;
-static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
-static int ip_rt_gc_interval = 60 * HZ;
-static int ip_rt_gc_min_interval = HZ / 2;
-static int ip_rt_redirect_number = 9;
-static int ip_rt_redirect_load = HZ / 50;
-static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
-static int ip_rt_error_cost = HZ;
-static int ip_rt_error_burst = 5 * HZ;
-static int ip_rt_gc_elasticity = 8;
-static int ip_rt_mtu_expires = 10 * 60 * HZ;
-static int ip_rt_min_pmtu = 512 + 20 + 20;
-static int ip_rt_min_advmss = 256;
-static int ip_rt_secret_interval = 10 * 60 * HZ;
-static int ip_rt_flush_expected;
-static unsigned long rt_deadline;
-
-#define RTprint(a...) printk(KERN_DEBUG a)
-
-static struct timer_list rt_flush_timer;
+static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
+static int ip_rt_gc_interval __read_mostly = 60 * HZ;
+static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
+static int ip_rt_redirect_number __read_mostly = 9;
+static int ip_rt_redirect_load __read_mostly = HZ / 50;
+static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
+static int ip_rt_error_cost __read_mostly = HZ;
+static int ip_rt_error_burst __read_mostly = 5 * HZ;
+static int ip_rt_gc_elasticity __read_mostly = 8;
+static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
+static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
+static int ip_rt_min_advmss __read_mostly = 256;
+static int ip_rt_secret_interval __read_mostly = 10 * 60 * HZ;
+static int rt_chain_length_max __read_mostly = 20;
+
 static void rt_worker_func(struct work_struct *work);
 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
-static struct timer_list rt_secret_timer;
 
 /*
  *	Interface to generic destination cache.
@@ -154,12 +145,13 @@ static void ipv4_dst_ifdown(struct dst_entry *dst,
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 static void ipv4_link_failure(struct sk_buff *skb);
 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
-static int rt_garbage_collect(void);
+static int rt_garbage_collect(struct dst_ops *ops);
+static void rt_emergency_hash_rebuild(struct net *net);
 
 
 static struct dst_ops ipv4_dst_ops = {
 	.family =		AF_INET,
-	.protocol =		__constant_htons(ETH_P_IP),
+	.protocol =		cpu_to_be16(ETH_P_IP),
 	.gc =			rt_garbage_collect,
 	.check =		ipv4_dst_check,
 	.destroy =		ipv4_dst_destroy,
@@ -167,8 +159,8 @@ static struct dst_ops ipv4_dst_ops = {
 	.negative_advice =	ipv4_negative_advice,
 	.link_failure =		ipv4_link_failure,
 	.update_pmtu =		ip_rt_update_pmtu,
-	.local_out =		ip_local_out,
-	.entry_size =		sizeof(struct rtable),
+	.local_out =		__ip_local_out,
+	.entries =		ATOMIC_INIT(0),
 };
 
 #define ECN_OR_COST(class) TC_PRIO_##class
 
@@ -210,6 +202,7 @@ const __u8 ip_tos2prio[16] = {
 struct rt_hash_bucket {
 	struct rtable	*chain;
 };
+
 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 	defined(CONFIG_PROVE_LOCKING)
 /*
@@ -256,63 +249,87 @@ static inline void rt_hash_lock_init(void)
 }
 #endif
 
-static struct rt_hash_bucket *rt_hash_table;
-static unsigned rt_hash_mask;
-static unsigned int rt_hash_log;
-static unsigned int rt_hash_rnd;
+static struct rt_hash_bucket *rt_hash_table __read_mostly;
+static unsigned rt_hash_mask __read_mostly;
+static unsigned int rt_hash_log __read_mostly;
 
 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 #define RT_CACHE_STAT_INC(field) \
	(__raw_get_cpu_var(rt_cache_stat).field++)
 
-static int rt_intern_hash(unsigned hash, struct rtable *rth,
-				struct rtable **res);
-
-static unsigned int rt_hash_code(u32 daddr, u32 saddr)
+static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
+				   int genid)
 {
-	return (jhash_2words(daddr, saddr, rt_hash_rnd)
-		& rt_hash_mask);
+	return jhash_3words((__force u32)(__be32)(daddr),
+			    (__force u32)(__be32)(saddr),
+			    idx, genid)
+		& rt_hash_mask;
 }
 
-#define rt_hash(daddr, saddr, idx) \
-	rt_hash_code((__force u32)(__be32)(daddr),\
-		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
+static inline int rt_genid(struct net *net)
+{
+	return atomic_read(&net->ipv4.rt_genid);
+}
 
 #ifdef CONFIG_PROC_FS
 struct rt_cache_iter_state {
+	struct seq_net_private p;
 	int bucket;
+	int genid;
 };
 
 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 {
-	struct rtable *r = NULL;
 	struct rt_cache_iter_state *st = seq->private;
+	struct rtable *r = NULL;
 
 	for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
+		if (!rt_hash_table[st->bucket].chain)
+			continue;
 		rcu_read_lock_bh();
-		r = rt_hash_table[st->bucket].chain;
-		if (r)
-			break;
+		r = rcu_dereference(rt_hash_table[st->bucket].chain);
+		while (r) {
+			if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
+			    r->rt_genid == st->genid)
+				return r;
+			r = rcu_dereference(r->u.dst.rt_next);
+		}
 		rcu_read_unlock_bh();
 	}
-	return rcu_dereference(r);
+	return r;
 }
 
-static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
+static struct rtable *__rt_cache_get_next(struct seq_file *seq,
+					  struct rtable *r)
 {
 	struct rt_cache_iter_state *st = seq->private;
 
 	r = r->u.dst.rt_next;
 	while (!r) {
 		rcu_read_unlock_bh();
-		if (--st->bucket < 0)
-			break;
+		do {
+			if (--st->bucket < 0)
+				return NULL;
+		} while (!rt_hash_table[st->bucket].chain);
 		rcu_read_lock_bh();
 		r = rt_hash_table[st->bucket].chain;
 	}
 	return rcu_dereference(r);
 }
 
+static struct rtable *rt_cache_get_next(struct seq_file *seq,
+					struct rtable *r)
+{
+	struct rt_cache_iter_state *st = seq->private;
+	while ((r = __rt_cache_get_next(seq, r)) != NULL) {
+		if (dev_net(r->u.dst.dev) != seq_file_net(seq))
+			continue;
+		if (r->rt_genid == st->genid)
+			break;
+	}
+	return r;
+}
+
 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 {
 	struct rtable *r = rt_cache_get_first(seq);
@@ -325,12 +342,16 @@ static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 
 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 {
-	return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
+	struct rt_cache_iter_state *st = seq->private;
+
+	if (*pos)
+		return rt_cache_get_idx(seq, *pos - 1);
+	st->genid = rt_genid(seq_file_net(seq));
+	return SEQ_START_TOKEN;
 }
 
 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
-	struct rtable *r = NULL;
+	struct rtable *r;
 
 	if (v == SEQ_START_TOKEN)
 		r = rt_cache_get_first(seq);
@@ -355,10 +376,10 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			   "HHUptod\tSpecDst");
 	else {
 		struct rtable *r = v;
-		char temp[256];
+		int len;
 
-		sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
-			"%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
+		seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
+			   "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
 			r->u.dst.dev ? r->u.dst.dev->name : "*",
 			(unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 			r->rt_flags, atomic_read(&r->u.dst.__refcnt),
@@ -372,8 +393,9 @@ static int rt_cache_seq_show(struct seq_file *seq, void *v)
 			r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 			r->u.dst.hh ? (r->u.dst.hh->hh_output == dev_queue_xmit) : 0,
-			r->rt_spec_dst);
-		seq_printf(seq, "%-127s\n", temp);
+			r->rt_spec_dst, &len);
+
+		seq_printf(seq, "%*s\n", 127 - len, "");
 	}
 	return 0;
 }
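
For reference, a minimal userspace sketch of the padding idiom the hunk above switches to: "%n" records how many characters have been printed so far, and "%*s" with an empty string pads the record out to a fixed width, replacing the old 256-byte temporary buffer. This is a hypothetical demo, not part of the patch:

#include <stdio.h>

int main(void)
{
	int len;

	/* print the payload and record its printed length via %n */
	printf("%s\t%08X%n", "eth0", 0xC0A80001, &len);
	/* pad with spaces so every record is exactly 127 columns wide */
	printf("%*s\n", 127 - len, "");
	return 0;
}
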
@@ -387,7 +409,7 @@ static const struct seq_operations rt_cache_seq_ops = {
 
 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 {
-	return seq_open_private(file, &rt_cache_seq_ops,
+	return seq_open_net(inode, file, &rt_cache_seq_ops,
 			sizeof(struct rt_cache_iter_state));
 }
 
@@ -396,7 +418,7 @@ static const struct file_operations rt_cache_seq_fops = {
 	.open	 = rt_cache_seq_open,
 	.read	 = seq_read,
 	.llseek	 = seq_lseek,
-	.release = seq_release_private,
+	.release = seq_release_net,
 };
 
 
@@ -407,7 +429,7 @@ static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 	if (*pos == 0)
 		return SEQ_START_TOKEN;
 
-	for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
+	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 		if (!cpu_possible(cpu))
 			continue;
 		*pos = cpu+1;
@@ -420,7 +442,7 @@ static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
 	int cpu;
 
-	for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
+	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 		if (!cpu_possible(cpu))
 			continue;
 		*pos = cpu+1;
@@ -530,7 +552,7 @@ static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 }
 #endif
 
-static __init int ip_rt_proc_init(struct net *net)
+static int __net_init ip_rt_do_proc_init(struct net *net)
 {
 	struct proc_dir_entry *pde;
 
@@ -539,12 +561,11 @@ static __init int ip_rt_proc_init(struct net *net)
 	if (!pde)
 		goto err1;
 
-	pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
+	pde = proc_create("rt_cache", S_IRUGO,
+			  net->proc_net_stat, &rt_cpu_seq_fops);
 	if (!pde)
 		goto err2;
 
-	pde->proc_fops = &rt_cpu_seq_fops;
-
 #ifdef CONFIG_NET_CLS_ROUTE
 	pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
 			ip_rt_acct_read, NULL);
@@ -562,25 +583,43 @@ err2:
 err1:
	return -ENOMEM;
 }
+
+static void __net_exit ip_rt_do_proc_exit(struct net *net)
+{
+	remove_proc_entry("rt_cache", net->proc_net_stat);
+	remove_proc_entry("rt_cache", net->proc_net);
+	remove_proc_entry("rt_acct", net->proc_net);
+}
+
+static struct pernet_operations ip_rt_proc_ops __net_initdata = {
+	.init = ip_rt_do_proc_init,
+	.exit = ip_rt_do_proc_exit,
+};
+
+static int __init ip_rt_proc_init(void)
+{
+	return register_pernet_subsys(&ip_rt_proc_ops);
+}
+
 #else
-static inline int ip_rt_proc_init(struct net *net)
+static inline int ip_rt_proc_init(void)
 {
 	return 0;
 }
 #endif /* CONFIG_PROC_FS */
 
-static __inline__ void rt_free(struct rtable *rt)
+static inline void rt_free(struct rtable *rt)
 {
 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 }
 
-static __inline__ void rt_drop(struct rtable *rt)
+static inline void rt_drop(struct rtable *rt)
 {
 	ip_rt_put(rt);
 	call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 }
 
-static __inline__ int rt_fast_clean(struct rtable *rth)
+static inline int rt_fast_clean(struct rtable *rth)
 {
 	/* Kill broadcast/multicast entries very aggresively, if they
	   collide in hash table with more useful entries */
@@ -588,7 +627,7 @@ static __inline__ int rt_fast_clean(struct rtable *rth)
 		rth->fl.iif && rth->u.dst.rt_next;
 }
 
-static __inline__ int rt_valuable(struct rtable *rth)
+static inline int rt_valuable(struct rtable *rth)
 {
 	return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 		rth->u.dst.expires;
@@ -637,6 +676,20 @@ static inline u32 rt_score(struct rtable *rt)
 	return score;
 }
 
+static inline bool rt_caching(const struct net *net)
+{
+	return net->ipv4.current_rt_cache_rebuild_count <=
+		net->ipv4.sysctl_rt_cache_rebuild_count;
+}
+
+static inline bool compare_hash_inputs(const struct flowi *fl1,
+					const struct flowi *fl2)
+{
+	return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
+		(fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
+		(fl1->iif ^ fl2->iif)) == 0);
+}
+
 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 {
 	return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
@@ -648,6 +701,16 @@ static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 		(fl1->iif ^ fl2->iif)) == 0;
 }
 
+static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
+{
+	return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
+}
+
+static inline int rt_is_expired(struct rtable *rth)
+{
+	return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
+}
+
 /*
  * Perform a full scan of hash table and free all entries.
  * Can be called by a softirq or a process.
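
A standalone sketch of the branch-free comparison style used by compare_hash_inputs() and compare_keys() above: XOR of equal fields is 0, so OR-ing the XORs together and testing the result once replaces a chain of short-circuit comparisons. Types are simplified here (plain integers instead of struct flowi); this is an illustration, not kernel code:

#include <stdint.h>
#include <stdio.h>

struct key { uint32_t daddr, saddr; int iif; };

static int keys_equal(const struct key *a, const struct key *b)
{
	/* one test instead of three conditional branches */
	return ((a->daddr ^ b->daddr) |
		(a->saddr ^ b->saddr) |
		(uint32_t)(a->iif ^ b->iif)) == 0;
}

int main(void)
{
	struct key k1 = { 0x0a000001, 0x0a000002, 2 };
	struct key k2 = k1;

	printf("equal: %d\n", keys_equal(&k1, &k2)); /* prints 1 */
	return 0;
}
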
@@ -657,6 +720,7 @@ static void rt_do_flush(int process_context)
 {
 	unsigned int i;
 	struct rtable *rth, *next;
+	struct rtable * tail;
 
 	for (i = 0; i <= rt_hash_mask; i++) {
 		if (process_context && need_resched())
@@ -666,22 +730,63 @@ static void rt_do_flush(int process_context)
 			continue;
 
 		spin_lock_bh(rt_hash_lock_addr(i));
+#ifdef CONFIG_NET_NS
+		{
+		struct rtable ** prev, * p;
+
+		rth = rt_hash_table[i].chain;
+
+		/* defer releasing the head of the list after spin_unlock */
+		for (tail = rth; tail; tail = tail->u.dst.rt_next)
+			if (!rt_is_expired(tail))
+				break;
+		if (rth != tail)
+			rt_hash_table[i].chain = tail;
+
+		/* call rt_free on entries after the tail requiring flush */
+		prev = &rt_hash_table[i].chain;
+		for (p = *prev; p; p = next) {
+			next = p->u.dst.rt_next;
+			if (!rt_is_expired(p)) {
+				prev = &p->u.dst.rt_next;
+			} else {
+				*prev = next;
+				rt_free(p);
+			}
+		}
+		}
+#else
 		rth = rt_hash_table[i].chain;
 		rt_hash_table[i].chain = NULL;
+		tail = NULL;
+#endif
 		spin_unlock_bh(rt_hash_lock_addr(i));
 
-		for (; rth; rth = next) {
+		for (; rth != tail; rth = next) {
 			next = rth->u.dst.rt_next;
 			rt_free(rth);
 		}
 	}
 }
 
+/*
+ * While freeing expired entries, we compute average chain length
+ * and standard deviation, using fixed-point arithmetic.
+ * This to have an estimation of rt_chain_length_max
+ *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
+ * We use 3 bits for frational part, and 29 (or 61) for magnitude.
+ */
+
+#define FRACT_BITS 3
+#define ONE (1UL << FRACT_BITS)
+
 static void rt_check_expire(void)
 {
 	static unsigned int rover;
 	unsigned int i = rover, goal;
-	struct rtable *rth, **rthp;
+	struct rtable *rth, *aux, **rthp;
+	unsigned long samples = 0;
+	unsigned long sum = 0, sum2 = 0;
 	u64 mult;
 
 	mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
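
A userspace sketch of the fixed-point estimate the comment above describes: chain lengths are accumulated scaled by ONE = 1 << FRACT_BITS, and rt_chain_length_max becomes max(elasticity, avg + 4*sd) once the fractional bits are shifted back out. Here sqrt() from libm stands in for the kernel's int_sqrt(), and the sample values are made up:

#include <math.h>
#include <stdio.h>

#define FRACT_BITS 3
#define ONE (1UL << FRACT_BITS)

int main(void)
{
	unsigned long lengths[] = { 2, 3, 5, 4, 2, 6 }; /* sampled chain lengths */
	unsigned long samples = sizeof(lengths) / sizeof(lengths[0]);
	unsigned long sum = 0, sum2 = 0, elasticity = 8;
	unsigned long i, avg, sd, max_chain;

	for (i = 0; i < samples; i++) {
		unsigned long l = lengths[i] * ONE; /* scale to fixed point */
		sum += l;
		sum2 += l * l;
	}
	avg = sum / samples;
	sd = (unsigned long)sqrt((double)(sum2 / samples - avg * avg));
	max_chain = (avg + 4 * sd) >> FRACT_BITS;
	if (max_chain < elasticity)
		max_chain = elasticity;
	printf("rt_chain_length_max estimate: %lu\n", max_chain);
	return 0;
}
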
@@ -692,6 +797,7 @@ static void rt_check_expire(void)
 		goal = rt_hash_mask + 1;
 	for (; goal > 0; goal--) {
 		unsigned long tmo = ip_rt_gc_timeout;
+		unsigned long length;
 
 		i = (i + 1) & rt_hash_mask;
 		rthp = &rt_hash_table[i].chain;
@@ -699,112 +805,128 @@ static void rt_check_expire(void)
 		if (need_resched())
 			cond_resched();
 
+		samples++;
+
 		if (*rthp == NULL)
 			continue;
+		length = 0;
 		spin_lock_bh(rt_hash_lock_addr(i));
 		while ((rth = *rthp) != NULL) {
+			prefetch(rth->u.dst.rt_next);
+			if (rt_is_expired(rth)) {
+				*rthp = rth->u.dst.rt_next;
+				rt_free(rth);
+				continue;
+			}
 			if (rth->u.dst.expires) {
 				/* Entry is expired even if it is in use */
 				if (time_before_eq(jiffies, rth->u.dst.expires)) {
+nofree:
 					tmo >>= 1;
 					rthp = &rth->u.dst.rt_next;
+					/*
+					 * We only count entries on
+					 * a chain with equal hash inputs once
+					 * so that entries for different QOS
+					 * levels, and other non-hash input
+					 * attributes don't unfairly skew
+					 * the length computation
+					 */
+					for (aux = rt_hash_table[i].chain;;) {
+						if (aux == rth) {
+							length += ONE;
+							break;
+						}
+						if (compare_hash_inputs(&aux->fl, &rth->fl))
+							break;
+						aux = aux->u.dst.rt_next;
+					}
 					continue;
 				}
-			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
-				tmo >>= 1;
-				rthp = &rth->u.dst.rt_next;
-				continue;
-			}
+			} else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
+				goto nofree;
 
 			/* Cleanup aged off entries. */
 			*rthp = rth->u.dst.rt_next;
 			rt_free(rth);
 		}
 		spin_unlock_bh(rt_hash_lock_addr(i));
+		sum += length;
+		sum2 += length*length;
+	}
+	if (samples) {
+		unsigned long avg = sum / samples;
+		unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
+		rt_chain_length_max = max_t(unsigned long,
+					ip_rt_gc_elasticity,
+					(avg + 4*sd) >> FRACT_BITS);
 	}
 	rover = i;
 }
 
 /*
  * rt_worker_func() is run in process context.
- * If a whole flush was scheduled, it is done.
- * Else, we call rt_check_expire() to scan part of the hash table
+ * we call rt_check_expire() to scan part of the hash table
  */
 static void rt_worker_func(struct work_struct *work)
 {
-	if (ip_rt_flush_expected) {
-		ip_rt_flush_expected = 0;
-		rt_do_flush(1);
-	} else
-		rt_check_expire();
+	rt_check_expire();
 	schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 }
 
-/* This can run from both BH and non-BH contexts, the latter
- * in the case of a forced flush event.
+/*
+ * Pertubation of rt_genid by a small quantity [1..256]
+ * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
+ * many times (2^24) without giving recent rt_genid.
+ * Jenkins hash is strong enough that litle changes of rt_genid are OK.
  */
-static void rt_run_flush(unsigned long process_context)
+static void rt_cache_invalidate(struct net *net)
 {
-	rt_deadline = 0;
+	unsigned char shuffle;
 
-	get_random_bytes(&rt_hash_rnd, 4);
-
-	rt_do_flush(process_context);
+	get_random_bytes(&shuffle, sizeof(shuffle));
+	atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
 }
 
-static DEFINE_SPINLOCK(rt_flush_lock);
-
-void rt_cache_flush(int delay)
+/*
+ * delay < 0  : invalidate cache (fast : entries will be deleted later)
+ * delay >= 0 : invalidate & flush cache (can be long)
+ */
+void rt_cache_flush(struct net *net, int delay)
 {
-	unsigned long now = jiffies;
-	int user_mode = !in_softirq();
-
-	if (delay < 0)
-		delay = ip_rt_min_delay;
-
-	spin_lock_bh(&rt_flush_lock);
-
-	if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
-		long tmo = (long)(rt_deadline - now);
-
-		/* If flush timer is already running
-		   and flush request is not immediate (delay > 0):
-
-		   if deadline is not achieved, prolongate timer to "delay",
-		   otherwise fire it at deadline time.
-		 */
-
-		if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
-			tmo = 0;
+	rt_cache_invalidate(net);
+	if (delay >= 0)
+		rt_do_flush(!in_softirq());
+}
 
-		if (delay > tmo)
-			delay = tmo;
-	}
+/*
+ * We change rt_genid and let gc do the cleanup
+ */
+static void rt_secret_rebuild(unsigned long __net)
+{
+	struct net *net = (struct net *)__net;
+	rt_cache_invalidate(net);
+	mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
+}
 
-	if (delay <= 0) {
-		spin_unlock_bh(&rt_flush_lock);
-		rt_run_flush(user_mode);
-		return;
+static void rt_secret_rebuild_oneshot(struct net *net)
+{
+	del_timer_sync(&net->ipv4.rt_secret_timer);
+	rt_cache_invalidate(net);
+	if (ip_rt_secret_interval) {
+		net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
+		add_timer(&net->ipv4.rt_secret_timer);
 	}
-
-	if (rt_deadline == 0)
-		rt_deadline = now + ip_rt_max_delay;
-
-	mod_timer(&rt_flush_timer, now+delay);
-	spin_unlock_bh(&rt_flush_lock);
 }
 
-/*
- * We change rt_hash_rnd and ask next rt_worker_func() invocation
- * to perform a flush in process context
- */
-static void rt_secret_rebuild(unsigned long dummy)
+static void rt_emergency_hash_rebuild(struct net *net)
 {
-	get_random_bytes(&rt_hash_rnd, 4);
-	ip_rt_flush_expected = 1;
-	cancel_delayed_work(&expires_work);
-	schedule_delayed_work(&expires_work, HZ/10);
-	mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
+	if (net_ratelimit()) {
+		printk(KERN_WARNING "Route hash chain too long!\n");
+		printk(KERN_WARNING "Adjust your secret_interval!\n");
+	}
+
+	rt_secret_rebuild_oneshot(net);
 }
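
A minimal sketch of the generation-counter trick rt_cache_invalidate() relies on: rather than walking every hash chain, a flush just bumps the genid, and entries stamped with an older genid stop matching and are reaped lazily — the same comparison rt_is_expired() performs. This is a hypothetical demo, not kernel code:

#include <stdio.h>

static int cache_genid;	/* stands in for net->ipv4.rt_genid */

struct entry {
	int genid;	/* stamped when the entry is inserted */
	/* ... cached route data ... */
};

static void insert(struct entry *e)
{
	e->genid = cache_genid;
}

static int is_expired(const struct entry *e)
{
	return e->genid != cache_genid;
}

int main(void)
{
	struct entry e;

	insert(&e);
	printf("expired before flush: %d\n", is_expired(&e)); /* 0 */
	cache_genid++;	/* "flush" in O(1); no chain walk needed */
	printf("expired after flush:  %d\n", is_expired(&e)); /* 1 */
	return 0;
}
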
 
 /*
@@ -820,7 +942,7 @@ static void rt_secret_rebuild(unsigned long dummy)
    and when load increases it reduces to limit cache size.
  */
 
-static int rt_garbage_collect(void)
+static int rt_garbage_collect(struct dst_ops *ops)
 {
 	static unsigned long expire = RT_GC_TIMEOUT;
 	static unsigned long last_gc;
@@ -851,14 +973,14 @@ static int rt_garbage_collect(void)
 			equilibrium = ipv4_dst_ops.gc_thresh;
 		goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 		if (goal > 0) {
-			equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
+			equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 			goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 		}
 	} else {
 		/* We are in dangerous area. Try to reduce cache really
 		 * aggressively.
 		 */
-		goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
+		goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
 		equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 	}
 
@@ -880,7 +1002,8 @@ static int rt_garbage_collect(void)
 			rthp = &rt_hash_table[k].chain;
 			spin_lock_bh(rt_hash_lock_addr(k));
 			while ((rth = *rthp) != NULL) {
-				if (!rt_may_expire(rth, tmo, expire)) {
+				if (!rt_is_expired(rth) &&
+					!rt_may_expire(rth, tmo, expire)) {
 					tmo >>= 1;
 					rthp = &rth->u.dst.rt_next;
 					continue;
@@ -957,11 +1080,21 @@ restart:
 	candp = NULL;
 	now = jiffies;
 
+	if (!rt_caching(dev_net(rt->u.dst.dev))) {
+		rt_drop(rt);
+		return 0;
+	}
+
 	rthp = &rt_hash_table[hash].chain;
 
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	while ((rth = *rthp) != NULL) {
-		if (compare_keys(&rth->fl, &rt->fl)) {
+		if (rt_is_expired(rth)) {
+			*rthp = rth->u.dst.rt_next;
+			rt_free(rth);
+			continue;
+		}
+		if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
 			/* Put it first */
 			*rthp = rth->u.dst.rt_next;
 			/*
@@ -1011,6 +1144,16 @@ restart:
 			*candp = cand->u.dst.rt_next;
 			rt_free(cand);
 		}
+	} else {
+		if (chain_length > rt_chain_length_max) {
+			struct net *net = dev_net(rt->u.dst.dev);
+			int num = ++net->ipv4.current_rt_cache_rebuild_count;
+			if (!rt_caching(dev_net(rt->u.dst.dev))) {
+				printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
+					rt->u.dst.dev->name, num);
+			}
+			rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
+		}
 	}
 
 	/* Try to bind route to arp only if it is output
@@ -1035,7 +1178,7 @@ restart:
 			int saved_int = ip_rt_gc_min_interval;
 			ip_rt_gc_elasticity	= 1;
 			ip_rt_gc_min_interval	= 0;
-			rt_garbage_collect();
+			rt_garbage_collect(&ipv4_dst_ops);
 			ip_rt_gc_min_interval	= saved_int;
 			ip_rt_gc_elasticity	= saved_elasticity;
 			goto restart;
@@ -1049,17 +1192,23 @@ restart:
 	}
 
 	rt->u.dst.rt_next = rt_hash_table[hash].chain;
+
 #if RT_CACHE_DEBUG >= 2
 	if (rt->u.dst.rt_next) {
 		struct rtable *trt;
-		printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
-		       NIPQUAD(rt->rt_dst));
+		printk(KERN_DEBUG "rt_cache @%02x: %pI4", hash, &rt->rt_dst);
 		for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
-			printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
+			printk(" . %pI4", &trt->rt_dst);
 		printk("\n");
 	}
 #endif
-	rt_hash_table[hash].chain = rt;
+	/*
+	 * Since lookup is lockfree, we must make sure
+	 * previous writes to rt are comitted to memory
+	 * before making rt visible to other CPUS.
+	 */
+	rcu_assign_pointer(rt_hash_table[hash].chain, rt);
+
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 	*rp = rt;
 	return 0;
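
A conceptual sketch of why the hunk above publishes the new entry with rcu_assign_pointer(): lookups traverse the chain locklessly, so all writes initialising the rtable must be committed to memory before the head pointer makes it reachable. In C11 terms that is a release store; here C11 atomics stand in for the kernel's RCU primitives, and none of this is kernel code:

#include <stdatomic.h>

struct rtable_sketch {
	int data;
	struct rtable_sketch *next;
};

static _Atomic(struct rtable_sketch *) chain_head;

static void publish(struct rtable_sketch *rt, int data)
{
	rt->data = data;	/* fully initialise the entry first... */
	rt->next = atomic_load_explicit(&chain_head, memory_order_relaxed);
	/* ...then release-store the head so the initialisation is ordered
	 * before the pointer becomes visible to lock-free readers -- the
	 * analogue of rcu_assign_pointer() above.
	 */
	atomic_store_explicit(&chain_head, rt, memory_order_release);
}

int main(void)
{
	static struct rtable_sketch rt;

	publish(&rt, 42);
	return 0;
}
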
@@ -1126,17 +1275,19 @@ void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
 
 static void rt_del(unsigned hash, struct rtable *rt)
 {
-	struct rtable **rthp;
+	struct rtable **rthp, *aux;
 
+	rthp = &rt_hash_table[hash].chain;
 	spin_lock_bh(rt_hash_lock_addr(hash));
 	ip_rt_put(rt);
-	for (rthp = &rt_hash_table[hash].chain; *rthp;
-	     rthp = &(*rthp)->u.dst.rt_next)
-		if (*rthp == rt) {
-			*rthp = rt->u.dst.rt_next;
-			rt_free(rt);
-			break;
+	while ((aux = *rthp) != NULL) {
+		if (aux == rt || rt_is_expired(aux)) {
+			*rthp = aux->u.dst.rt_next;
+			rt_free(aux);
+			continue;
 		}
+		rthp = &aux->u.dst.rt_next;
+	}
 	spin_unlock_bh(rt_hash_lock_addr(hash));
 }
 
@@ -1149,12 +1300,18 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 	__be32 skeys[2] = { saddr, 0 };
 	int ikeys[2] = { dev->ifindex, 0 };
 	struct netevent_redirect netevent;
+	struct net *net;
 
 	if (!in_dev)
 		return;
 
+	net = dev_net(dev);
 	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
-	    || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
+	    || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
+	    || ipv4_is_zeronet(new_gw))
+		goto reject_redirect;
+
+	if (!rt_caching(net))
 		goto reject_redirect;
 
 	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
@@ -1163,13 +1320,14 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 			goto reject_redirect;
 	} else {
-		if (inet_addr_type(new_gw) != RTN_UNICAST)
+		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 			goto reject_redirect;
 	}
 
 	for (i = 0; i < 2; i++) {
 		for (k = 0; k < 2; k++) {
-			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
+			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
+						rt_genid(net));
 
 			rthp=&rt_hash_table[hash].chain;
 
@@ -1180,7 +1338,9 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				if (rth->fl.fl4_dst != daddr ||
 				    rth->fl.fl4_src != skeys[i] ||
 				    rth->fl.oif != ikeys[k] ||
-				    rth->fl.iif != 0) {
+				    rth->fl.iif != 0 ||
+				    rt_is_expired(rth) ||
+				    !net_eq(dev_net(rth->u.dst.dev), net)) {
 					rthp = &rth->u.dst.rt_next;
 					continue;
 				}
@@ -1204,7 +1364,6 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 
 				/* Copy all the information. */
 				*rt = *rth;
-				INIT_RCU_HEAD(&rt->u.dst.rcu_head);
 				rt->u.dst.__use		= 1;
 				atomic_set(&rt->u.dst.__refcnt, 1);
 				rt->u.dst.child		= NULL;
@@ -1217,8 +1376,10 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
 				rt->u.dst.path		= &rt->u.dst;
 				rt->u.dst.neighbour	= NULL;
 				rt->u.dst.hh		= NULL;
+#ifdef CONFIG_XFRM
 				rt->u.dst.xfrm		= NULL;
-
+#endif
+				rt->rt_genid		= rt_genid(net);
 				rt->rt_flags		|= RTCF_REDIRECTED;
 
 				/* Gateway is different ...
 				 */
@@ -1261,18 +1422,17 @@
 
 reject_redirect:
 #ifdef CONFIG_IP_ROUTE_VERBOSE
 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
-		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
-			"%u.%u.%u.%u ignored.\n"
-			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
-		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
-		       NIPQUAD(saddr), NIPQUAD(daddr));
+		printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
+			"  Advised path = %pI4 -> %pI4\n",
+		       &old_gw, dev->name, &new_gw,
+		       &saddr, &daddr);
#endif
 	in_dev_put(in_dev);
 }
 
 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 {
-	struct rtable *rt = (struct rtable*)dst;
+	struct rtable *rt = (struct rtable *)dst;
 	struct dst_entry *ret = dst;
 
 	if (rt) {
@@ -1282,11 +1442,11 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 			   rt->u.dst.expires) {
 			unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
-						rt->fl.oif);
+						rt->fl.oif,
+						rt_genid(dev_net(dst->dev)));
 #if RT_CACHE_DEBUG >= 1
-			printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
-					  "%u.%u.%u.%u/%02x dropped\n",
-				NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
+			printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
+				&rt->rt_dst, rt->fl.fl4_tos);
 #endif
 			rt_del(hash, rt);
 			ret = NULL;
@@ -1313,7 +1473,7 @@ static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 
 void ip_rt_send_redirect(struct sk_buff *skb)
 {
-	struct rtable *rt = (struct rtable*)skb->dst;
+	struct rtable *rt = skb->rtable;
 	struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
 
 	if (!in_dev)
@@ -1350,10 +1510,9 @@ void ip_rt_send_redirect(struct sk_buff *skb)
 		if (IN_DEV_LOG_MARTIANS(in_dev) &&
 		    rt->u.dst.rate_tokens == ip_rt_redirect_number &&
 		    net_ratelimit())
-			printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
-				"redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
-				NIPQUAD(rt->rt_src), rt->rt_iif,
-				NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
+			printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
+				&rt->rt_src, rt->rt_iif,
+				&rt->rt_dst, &rt->rt_gateway);
 #endif
 	}
 out:
@@ -1362,7 +1521,7 @@ out:
 
 static int ip_error(struct sk_buff *skb)
 {
-	struct rtable *rt = (struct rtable*)skb->dst;
+	struct rtable *rt = skb->rtable;
 	unsigned long now;
 	int code;
 
@@ -1375,7 +1534,8 @@ static int ip_error(struct sk_buff *skb)
 			break;
 		case ENETUNREACH:
 			code = ICMP_NET_UNREACH;
-			IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
+			IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
+					IPSTATS_MIB_INNOROUTES);
 			break;
 		case EACCES:
 			code = ICMP_PKT_FILTERED;
@@ -1404,7 +1564,7 @@ out:	kfree_skb(skb);
 
 static const unsigned short mtu_plateau[] = {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
 
-static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
+static inline unsigned short guess_mtu(unsigned short old_mtu)
 {
 	int i;
 
@@ -1414,11 +1574,14 @@ static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
 	return 68;
 }
 
-unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
+unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
+				 unsigned short new_mtu,
+				 struct net_device *dev)
 {
-	int i;
+	int i, k;
 	unsigned short old_mtu = ntohs(iph->tot_len);
 	struct rtable *rth;
+	int ikeys[2] = { dev->ifindex, 0 };
 	__be32 skeys[2] = { iph->saddr, 0, };
 	__be32 daddr = iph->daddr;
 	unsigned short est_mtu = 0;
 
@@ -1426,32 +1589,39 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
 	if (ipv4_config.no_pmtu_disc)
 		return 0;
 
-	for (i = 0; i < 2; i++) {
-		unsigned hash = rt_hash(daddr, skeys[i], 0);
+	for (k = 0; k < 2; k++) {
+		for (i = 0; i < 2; i++) {
+			unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
+						rt_genid(net));
 
-		rcu_read_lock();
-		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
-		     rth = rcu_dereference(rth->u.dst.rt_next)) {
-			if (rth->fl.fl4_dst == daddr &&
-			    rth->fl.fl4_src == skeys[i] &&
-			    rth->rt_dst == daddr &&
-			    rth->rt_src == iph->saddr &&
-			    rth->fl.iif == 0 &&
-			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
+			rcu_read_lock();
+			for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
+			     rth = rcu_dereference(rth->u.dst.rt_next)) {
 				unsigned short mtu = new_mtu;
 
+				if (rth->fl.fl4_dst != daddr ||
+				    rth->fl.fl4_src != skeys[i] ||
+				    rth->rt_dst != daddr ||
+				    rth->rt_src != iph->saddr ||
+				    rth->fl.oif != ikeys[k] ||
+				    rth->fl.iif != 0 ||
+				    dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
+				    !net_eq(dev_net(rth->u.dst.dev), net) ||
+				    rt_is_expired(rth))
+					continue;
+
 				if (new_mtu < 68 || new_mtu >= old_mtu) {
 
 					/* BSD 4.2 compatibility hack :-( */
 					if (mtu == 0 &&
-					    old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
+					    old_mtu >= dst_mtu(&rth->u.dst) &&
 					    old_mtu >= 68 + (iph->ihl << 2))
 						old_mtu -= iph->ihl << 2;
 
 					mtu = guess_mtu(old_mtu);
 				}
-				if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
-					if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
+				if (mtu <= dst_mtu(&rth->u.dst)) {
+					if (mtu < dst_mtu(&rth->u.dst)) {
 						dst_confirm(&rth->u.dst);
 						if (mtu < ip_rt_min_pmtu) {
 							mtu = ip_rt_min_pmtu;
@@ -1465,15 +1635,15 @@ unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
 					est_mtu = mtu;
 				}
 			}
+			rcu_read_unlock();
 		}
-		rcu_read_unlock();
 	}
 	return est_mtu ? : new_mtu;
 }
 
 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
 {
-	if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
+	if (dst_mtu(dst) > mtu && mtu >= 68 &&
 	    !(dst_metric_locked(dst, RTAX_MTU))) {
 		if (mtu < ip_rt_min_pmtu) {
 			mtu = ip_rt_min_pmtu;
@@ -1512,9 +1682,9 @@ static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
 {
 	struct rtable *rt = (struct rtable *) dst;
 	struct in_device *idev = rt->idev;
-	if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
+	if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
 		struct in_device *loopback_idev =
-			in_dev_get(dev->nd_net->loopback_dev);
+			in_dev_get(dev_net(dev)->loopback_dev);
 		if (loopback_idev) {
 			rt->idev = loopback_idev;
 			in_dev_put(idev);
@@ -1528,15 +1698,15 @@ static void ipv4_link_failure(struct sk_buff *skb)
 
 	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
 
-	rt = (struct rtable *) skb->dst;
+	rt = skb->rtable;
 	if (rt)
 		dst_set_expires(&rt->u.dst, 0);
 }
 
 static int ip_rt_bug(struct sk_buff *skb)
 {
-	printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
-		NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
+	printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
+		&ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
 		skb->dev ? skb->dev->name : "?");
 	kfree_skb(skb);
 	return 0;
@@ -1558,7 +1728,7 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt)
 
 	if (rt->fl.iif == 0)
 		src = rt->rt_src;
-	else if (fib_lookup(&rt->fl, &res) == 0) {
+	else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
 		src = FIB_RES_PREFSRC(res);
 		fib_res_put(&res);
 	} else
@@ -1589,7 +1759,7 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 		       sizeof(rt->u.dst.metrics));
 		if (fi->fib_mtu == 0) {
 			rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
-			if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
+			if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
 			    rt->rt_gateway != rt->rt_dst &&
 			    rt->u.dst.dev->mtu > 576)
 				rt->u.dst.metrics[RTAX_MTU-1] = 576;
@@ -1600,14 +1770,14 @@ static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
 	} else
 		rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
 
-	if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
+	if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
 		rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
-	if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
+	if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
 		rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
-	if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
+	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
 		rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
 				       ip_rt_min_advmss);
-	if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
+	if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
 		rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
 
 #ifdef CONFIG_NET_CLS_ROUTE
@@ -1633,12 +1803,12 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (in_dev == NULL)
 		return -EINVAL;
 
-	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
-	    skb->protocol != htons(ETH_P_IP))
+	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+	    ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
 		goto e_inval;
 
-	if (ZERONET(saddr)) {
-		if (!LOCAL_MCAST(daddr))
+	if (ipv4_is_zeronet(saddr)) {
+		if (!ipv4_is_local_multicast(daddr))
 			goto e_inval;
 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
 	} else if (fib_validate_source(saddr, 0, tos, 0,
@@ -1672,22 +1842,23 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	rth->fl.oif	= 0;
 	rth->rt_gateway	= daddr;
 	rth->rt_spec_dst= spec_dst;
-	rth->rt_type	= RTN_MULTICAST;
+	rth->rt_genid	= rt_genid(dev_net(dev));
 	rth->rt_flags	= RTCF_MULTICAST;
+	rth->rt_type	= RTN_MULTICAST;
 	if (our) {
 		rth->u.dst.input= ip_local_deliver;
 		rth->rt_flags |= RTCF_LOCAL;
 	}
 
 #ifdef CONFIG_IP_MROUTE
-	if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
+	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
 		rth->u.dst.input = ip_mr_input;
 #endif
 	RT_CACHE_STAT_INC(in_slow_mc);
 
 	in_dev_put(in_dev);
-	hash = rt_hash(daddr, saddr, dev->ifindex);
-	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
+	hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
+	return rt_intern_hash(hash, rth, &skb->rtable);
 
 e_nobufs:
 	in_dev_put(in_dev);
@@ -1712,9 +1883,8 @@ static void ip_handle_martian_source(struct net_device *dev,
 	 *	RFC1812 recommendation, if source is martian,
 	 *	the only hint is MAC header.
 	 */
-	printk(KERN_WARNING "martian source %u.%u.%u.%u from "
-		"%u.%u.%u.%u, on dev %s\n",
-		NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
+	printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
+		&daddr, &saddr, dev->name);
 	if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
 		int i;
 		const unsigned char *p = skb_mac_header(skb);
@@ -1730,11 +1900,11 @@ static void ip_handle_martian_source(struct net_device *dev,
 #endif
 }
 
-static inline int __mkroute_input(struct sk_buff *skb,
-				  struct fib_result* res,
-				  struct in_device *in_dev,
-				  __be32 daddr, __be32 saddr, u32 tos,
-				  struct rtable **result)
+static int __mkroute_input(struct sk_buff *skb,
+			   struct fib_result *res,
+			   struct in_device *in_dev,
+			   __be32 daddr, __be32 saddr, u32 tos,
+			   struct rtable **result)
 {
 	struct rtable *rth;
@@ -1767,7 +1937,7 @@ static inline int __mkroute_input(struct sk_buff *skb,
 	if (err)
 		flags |= RTCF_DIRECTSRC;
 
-	if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
+	if (out_dev == in_dev && err &&
 	    (IN_DEV_SHARED_MEDIA(out_dev) ||
 	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
 		flags |= RTCF_DOREDIRECT;
@@ -1776,7 +1946,7 @@ static inline int __mkroute_input(struct sk_buff *skb,
 		/* Not IP (i.e. ARP). Do not create route, if it is
 		 * invalid for proxy arp. DNAT routes are always valid.
 		 */
-		if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
+		if (out_dev == in_dev) {
 			err = -EINVAL;
 			goto cleanup;
 		}
@@ -1812,6 +1982,7 @@ static inline int __mkroute_input(struct sk_buff *skb,
 
 	rth->u.dst.input = ip_forward;
 	rth->u.dst.output = ip_output;
+	rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
 
 	rt_set_nexthop(rth, res, itag);
 
@@ -1825,11 +1996,11 @@ static inline int __mkroute_input(struct sk_buff *skb,
 	return err;
 }
 
-static inline int ip_mkroute_input(struct sk_buff *skb,
-				   struct fib_result* res,
-				   const struct flowi *fl,
-				   struct in_device *in_dev,
-				   __be32 daddr, __be32 saddr, u32 tos)
+static int ip_mkroute_input(struct sk_buff *skb,
+			    struct fib_result *res,
+			    const struct flowi *fl,
+			    struct in_device *in_dev,
+			    __be32 daddr, __be32 saddr, u32 tos)
 {
 	struct rtable* rth = NULL;
 	int err;
@@ -1846,8 +2017,9 @@ static inline int ip_mkroute_input(struct sk_buff *skb,
 		return err;
 
 	/* put it into the cache */
-	hash = rt_hash(daddr, saddr, fl->iif);
-	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+	hash = rt_hash(daddr, saddr, fl->iif,
+		       rt_genid(dev_net(rth->u.dst.dev)));
+	return rt_intern_hash(hash, rth, &skb->rtable);
 }
 
 /*
@@ -1880,6 +2052,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	__be32		spec_dst;
 	int		err = -EINVAL;
 	int		free_res = 0;
+	struct net    * net = dev_net(dev);
 
 	/* IP on this device is disabled. */
 
@@ -1890,7 +2063,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	   by fib_lookup.
 	 */
 
-	if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
+	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
+	    ipv4_is_loopback(saddr))
 		goto martian_source;
 
 	if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
@@ -1899,16 +2073,17 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	/* Accept zero addresses only to limited broadcast;
 	 * I even do not know to fix it or not. Waiting for complains :-)
 	 */
-	if (ZERONET(saddr))
+	if (ipv4_is_zeronet(saddr))
 		goto martian_source;
 
-	if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
+	if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
+	    ipv4_is_loopback(daddr))
 		goto martian_destination;
 
 	/*
 	 *	Now we are ready to route packet.
 	 */
-	if ((err = fib_lookup(&fl, &res)) != 0) {
+	if ((err = fib_lookup(net, &fl, &res)) != 0) {
 		if (!IN_DEV_FORWARD(in_dev))
 			goto e_hostunreach;
 		goto no_route;
@@ -1923,7 +2098,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	if (res.type == RTN_LOCAL) {
 		int result;
 		result = fib_validate_source(saddr, daddr, tos,
-					     init_net.loopback_dev->ifindex,
+					     net->loopback_dev->ifindex,
 					     dev, &spec_dst, &itag);
 		if (result < 0)
 			goto martian_source;
@@ -1949,7 +2124,7 @@ brd_input:
 	if (skb->protocol != htons(ETH_P_IP))
 		goto e_inval;
 
-	if (ZERONET(saddr))
+	if (ipv4_is_zeronet(saddr))
 		spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
 	else {
 		err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
@@ -1969,6 +2144,7 @@ local_input:
 		goto e_nobufs;
 
 	rth->u.dst.output= ip_rt_bug;
+	rth->rt_genid = rt_genid(net);
 
 	atomic_set(&rth->u.dst.__refcnt, 1);
 	rth->u.dst.flags= DST_HOST;
@@ -1985,7 +2161,7 @@ local_input:
 #endif
 	rth->rt_iif	=
 	rth->fl.iif	= dev->ifindex;
-	rth->u.dst.dev	= init_net.loopback_dev;
+	rth->u.dst.dev	= net->loopback_dev;
 	dev_hold(rth->u.dst.dev);
 	rth->idev	= in_dev_get(rth->u.dst.dev);
 	rth->rt_gateway	= daddr;
@@ -1998,8 +2174,8 @@ local_input:
 		rth->rt_flags	&= ~RTCF_LOCAL;
 	}
 	rth->rt_type	= res.type;
-	hash = rt_hash(daddr, saddr, fl.iif);
-	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
+	hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
+	err = rt_intern_hash(hash, rth, &skb->rtable);
 	goto done;
 
 no_route:
@@ -2017,9 +2193,8 @@ martian_destination:
 	RT_CACHE_STAT_INC(in_martian_dst);
 #ifdef CONFIG_IP_ROUTE_VERBOSE
 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
-		printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
-			"%u.%u.%u.%u, dev %s\n",
-			NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
+		printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
+			&daddr, &saddr, dev->name);
 #endif
 
 e_hostunreach:
@@ -2045,29 +2220,38 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	struct rtable * rth;
 	unsigned	hash;
 	int iif = dev->ifindex;
+	struct net *net;
+
+	net = dev_net(dev);
+
+	if (!rt_caching(net))
+		goto skip_cache;
 
 	tos &= IPTOS_RT_MASK;
-	hash = rt_hash(daddr, saddr, iif);
+	hash = rt_hash(daddr, saddr, iif, rt_genid(net));
 
 	rcu_read_lock();
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 	     rth = rcu_dereference(rth->u.dst.rt_next)) {
-		if (rth->fl.fl4_dst == daddr &&
-		    rth->fl.fl4_src == saddr &&
-		    rth->fl.iif == iif &&
-		    rth->fl.oif == 0 &&
+		if (((rth->fl.fl4_dst ^ daddr) |
+		     (rth->fl.fl4_src ^ saddr) |
+		     (rth->fl.iif ^ iif) |
+		     rth->fl.oif |
+		     (rth->fl.fl4_tos ^ tos)) == 0 &&
 		    rth->fl.mark == skb->mark &&
-		    rth->fl.fl4_tos == tos) {
+		    net_eq(dev_net(rth->u.dst.dev), net) &&
+		    !rt_is_expired(rth)) {
 			dst_use(&rth->u.dst, jiffies);
 			RT_CACHE_STAT_INC(in_hit);
 			rcu_read_unlock();
-			skb->dst = (struct dst_entry*)rth;
+			skb->rtable = rth;
 			return 0;
 		}
 		RT_CACHE_STAT_INC(in_hlist_search);
 	}
 	rcu_read_unlock();
 
+skip_cache:
 	/* Multicast recognition logic is moved from route cache to here.
 	   The problem was that too many Ethernet cards have broken/missing
 	   hardware multicast filters :-( As result the host on multicasting
@@ -2079,7 +2263,7 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	   Note, that multicast routers are not affected, because
 	   route cache entry is created eventually.
 	 */
-	if (MULTICAST(daddr)) {
+	if (ipv4_is_multicast(daddr)) {
 		struct in_device *in_dev;
 
 		rcu_read_lock();
@@ -2088,7 +2272,8 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 				ip_hdr(skb)->protocol);
 			if (our
 #ifdef CONFIG_IP_MROUTE
-			    || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
+			    || (!ipv4_is_local_multicast(daddr) &&
+				IN_DEV_MFORWARD(in_dev))
 #endif
 			    ) {
 				rcu_read_unlock();
@@ -2102,26 +2287,26 @@ int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
 	return ip_route_input_slow(skb, daddr, saddr, tos, dev);
 }
 
-static inline int __mkroute_output(struct rtable **result,
-				   struct fib_result* res,
-				   const struct flowi *fl,
-				   const struct flowi *oldflp,
-				   struct net_device *dev_out,
-				   unsigned flags)
+static int __mkroute_output(struct rtable **result,
+			    struct fib_result *res,
+			    const struct flowi *fl,
+			    const struct flowi *oldflp,
+			    struct net_device *dev_out,
+			    unsigned flags)
 {
 	struct rtable *rth;
 	struct in_device *in_dev;
 	u32 tos = RT_FL_TOS(oldflp);
 	int err = 0;
 
-	if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
+	if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
 		return -EINVAL;
 
 	if (fl->fl4_dst == htonl(0xFFFFFFFF))
 		res->type = RTN_BROADCAST;
-	else if (MULTICAST(fl->fl4_dst))
+	else if (ipv4_is_multicast(fl->fl4_dst))
 		res->type = RTN_MULTICAST;
-	else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
+	else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
 		return -EINVAL;
 
 	if (dev_out->flags & IFF_LOOPBACK)
@@ -2184,6 +2369,7 @@ static inline int __mkroute_output(struct rtable **result,
 	rth->rt_spec_dst= fl->fl4_src;
 
 	rth->u.dst.output=ip_output;
+	rth->rt_genid = rt_genid(dev_net(dev_out));
 
 	RT_CACHE_STAT_INC(out_slow_tot);
 
@@ -2201,7 +2387,7 @@ static inline int __mkroute_output(struct rtable **result,
 #ifdef CONFIG_IP_MROUTE
 	if (res->type == RTN_MULTICAST) {
 		if (IN_DEV_MFORWARD(in_dev) &&
-		    !LOCAL_MCAST(oldflp->fl4_dst)) {
+		    !ipv4_is_local_multicast(oldflp->fl4_dst)) {
 			rth->u.dst.input = ip_mr_input;
 			rth->u.dst.output = ip_mc_output;
 		}
@@ -2221,18 +2407,19 @@ static inline int __mkroute_output(struct rtable **result,
 	return err;
 }
 
-static inline int ip_mkroute_output(struct rtable **rp,
-				    struct fib_result* res,
-				    const struct flowi *fl,
-				    const struct flowi *oldflp,
-				    struct net_device *dev_out,
-				    unsigned flags)
+static int ip_mkroute_output(struct rtable **rp,
+			     struct fib_result *res,
+			     const struct flowi *fl,
+			     const struct flowi *oldflp,
+			     struct net_device *dev_out,
+			     unsigned flags)
 {
 	struct rtable *rth = NULL;
 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
 	unsigned hash;
 	if (err == 0) {
-		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
+		hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
+			       rt_genid(dev_net(dev_out)));
 		err = rt_intern_hash(hash, rth, rp);
 	}
 
@@ -2243,7 +2430,8 @@ static inline int ip_mkroute_output(struct rtable **rp,
  * Major route resolver routine.
  */
 
-static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
+static int ip_route_output_slow(struct net *net, struct rtable **rp,
+				const struct flowi *oldflp)
 {
 	u32 tos	= RT_FL_TOS(oldflp);
 	struct flowi fl = { .nl_u = { .ip4_u =
@@ -2255,7 +2443,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
 						  RT_SCOPE_UNIVERSE),
 				      } },
 			    .mark = oldflp->mark,
-			    .iif = init_net.loopback_dev->ifindex,
+			    .iif = net->loopback_dev->ifindex,
 			    .oif = oldflp->oif };
 	struct fib_result res;
 	unsigned flags = 0;
@@ -2271,26 +2459,27 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
 
 	if (oldflp->fl4_src) {
 		err = -EINVAL;
-		if (MULTICAST(oldflp->fl4_src) ||
-		    BADCLASS(oldflp->fl4_src) ||
-		    ZERONET(oldflp->fl4_src))
-			goto out;
-
-		/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
-		dev_out = ip_dev_find(oldflp->fl4_src);
-		if (dev_out == NULL)
+		if (ipv4_is_multicast(oldflp->fl4_src) ||
+		    ipv4_is_lbcast(oldflp->fl4_src) ||
+		    ipv4_is_zeronet(oldflp->fl4_src))
 			goto out;
 
 		/* I removed check for oif == dev_out->oif here.
 		   It was wrong for two reasons:
-		   1. ip_dev_find(saddr) can return wrong iface, if saddr is
-		      assigned to multiple interfaces.
+		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
+		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */
 
 		if (oldflp->oif == 0
-		    && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+		    && (ipv4_is_multicast(oldflp->fl4_dst) ||
+			oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
+			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+			dev_out = ip_dev_find(net, oldflp->fl4_src);
+			if (dev_out == NULL)
+				goto out;
+
 			/* Special hack: user can direct multicasts
 			   and limited broadcast via necessary interface
 			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
@@ -2309,14 +2498,20 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
 			fl.oif = dev_out->ifindex;
 			goto make_route;
 		}
-		if (dev_out)
+
+		if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
+			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
+			dev_out = ip_dev_find(net, oldflp->fl4_src);
+			if (dev_out == NULL)
+				goto out;
 			dev_put(dev_out);
-		dev_out = NULL;
+			dev_out = NULL;
+		}
 	}
 
 
 	if (oldflp->oif) {
-		dev_out = dev_get_by_index(&init_net, oldflp->oif);
+		dev_out = dev_get_by_index(net, oldflp->oif);
 		err = -ENODEV;
 		if (dev_out == NULL)
 			goto out;
@@ -2327,14 +2522,15 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
 			goto out;	/* Wrong error code */
 		}
 
-		if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
+		if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
+		    oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
 			if (!fl.fl4_src)
 				fl.fl4_src = inet_select_addr(dev_out, 0,
 							      RT_SCOPE_LINK);
 			goto make_route;
 		}
 		if (!fl.fl4_src) {
-			if (MULTICAST(oldflp->fl4_dst))
+			if (ipv4_is_multicast(oldflp->fl4_dst))
 				fl.fl4_src = inet_select_addr(dev_out, 0,
 							      fl.fl4_scope);
 			else if (!oldflp->fl4_dst)
@@ -2349,15 +2545,15 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
 		fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
 		if (dev_out)
 			dev_put(dev_out);
-		dev_out = init_net.loopback_dev;
+		dev_out = net->loopback_dev;
 		dev_hold(dev_out);
-		fl.oif = init_net.loopback_dev->ifindex;
+		fl.oif = net->loopback_dev->ifindex;
 		res.type = RTN_LOCAL;
 		flags |= RTCF_LOCAL;
 		goto make_route;
 	}
 
-	if (fib_lookup(&fl, &res)) {
+	if (fib_lookup(net, &fl, &res)) {
 		res.fi = NULL;
 		if (oldflp->oif) {
 			/* Apparently, routing tables are wrong. Assume,
@@ -2396,7 +2592,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
 		fl.fl4_src = fl.fl4_dst;
 		if (dev_out)
 			dev_put(dev_out);
-		dev_out = init_net.loopback_dev;
+		dev_out = net->loopback_dev;
 		dev_hold(dev_out);
 		fl.oif = dev_out->ifindex;
 		if (res.fi)
@@ -2412,7 +2608,7 @@ static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
 	else
 #endif
 	if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
-		fib_select_default(&fl, &res);
+		fib_select_default(net, &fl, &res);
 
 	if (!fl.fl4_src)
 		fl.fl4_src = FIB_RES_PREFSRC(res);
@@ -2435,12 +2631,16 @@ make_route:
 out:	return err;
 }
 
-int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
+int __ip_route_output_key(struct net *net, struct rtable **rp,
+			  const struct flowi *flp)
 {
 	unsigned hash;
 	struct rtable *rth;
 
-	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
+	if (!rt_caching(net))
+		goto slow_output;
+
+	hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
 
 	rcu_read_lock_bh();
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
@@ -2451,7 +2651,9 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
 		    rth->fl.oif == flp->oif &&
 		    rth->fl.mark == flp->mark &&
 		    !((rth->fl.fl4_tos ^ flp->fl4_tos) &
-			    (IPTOS_RT_MASK | RTO_ONLINK))) {
+			    (IPTOS_RT_MASK | RTO_ONLINK)) &&
+		    net_eq(dev_net(rth->u.dst.dev), net) &&
+		    !rt_is_expired(rth)) {
 			dst_use(&rth->u.dst, jiffies);
 			RT_CACHE_STAT_INC(out_hit);
 			rcu_read_unlock_bh();
@@ -2462,7 +2664,8 @@ int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
 	}
 	rcu_read_unlock_bh();
 
-	return ip_route_output_slow(rp, flp);
+slow_output:
+	return ip_route_output_slow(net, rp, flp);
 }
 
 EXPORT_SYMBOL_GPL(__ip_route_output_key);
 
@@ -2473,15 +2676,15 @@ static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
 
 static struct dst_ops ipv4_dst_blackhole_ops = {
 	.family			=	AF_INET,
-	.protocol		=	__constant_htons(ETH_P_IP),
+	.protocol		=	cpu_to_be16(ETH_P_IP),
 	.destroy		=	ipv4_dst_destroy,
 	.check			=	ipv4_dst_check,
 	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
-	.entry_size		=	sizeof(struct rtable),
+	.entries		=	ATOMIC_INIT(0),
 };
 
-
-static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
+static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
 {
 	struct rtable *ort = *rp;
 	struct rtable *rt = (struct rtable *)
@@ -2505,6 +2708,7 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock
 		rt->idev = ort->idev;
 		if (rt->idev)
 			in_dev_hold(rt->idev);
+		rt->rt_genid = rt_genid(net);
 		rt->rt_flags = ort->rt_flags;
 		rt->rt_type = ort->rt_type;
 		rt->rt_dst = ort->rt_dst;
@@ -2524,11 +2728,12 @@ static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock
 	return (rt ? 0 : -ENOMEM);
 }
 
-int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
+int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
+			 struct sock *sk, int flags)
 {
 	int err;
 
-	if ((err = __ip_route_output_key(rp, flp)) != 0)
+	if ((err = __ip_route_output_key(net, rp, flp)) != 0)
 		return err;
 
 	if (flp->proto) {
@@ -2536,10 +2741,10 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk,
 			flp->fl4_src = (*rp)->rt_src;
 		if (!flp->fl4_dst)
 			flp->fl4_dst = (*rp)->rt_dst;
-		err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
+		err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
 				    flags ? XFRM_LOOKUP_WAIT : 0);
 		if (err == -EREMOTE)
-			err = ipv4_dst_blackhole(rp, flp, sk);
+			err = ipv4_dst_blackhole(net, rp, flp);
 
 		return err;
 	}
@@ -2549,15 +2754,16 @@ int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk,
 
 EXPORT_SYMBOL_GPL(ip_route_output_flow);
 
-int ip_route_output_key(struct rtable **rp, struct flowi *flp)
+int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
 {
-	return ip_route_output_flow(rp, flp, NULL, 0);
+	return ip_route_output_flow(net, rp, flp, NULL, 0);
 }
 
-static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
+static int rt_fill_info(struct net *net,
+			struct sk_buff *skb, u32 pid, u32 seq, int event,
 			int nowait, unsigned int flags)
 {
-	struct rtable *rt = (struct rtable*)skb->dst;
+	struct rtable *rt = skb->rtable;
 	struct rtmsg *r;
 	struct nlmsghdr *nlh;
 	long expires;
@@ -2618,9 +2824,9 @@ static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
 #ifdef CONFIG_IP_MROUTE
 		__be32 dst = rt->rt_dst;
 
-		if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
-		    IPV4_DEVCONF_ALL(MC_FORWARDING)) {
-			int err = ipmr_get_route(skb, r, nowait);
+		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
+		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
+			int err = ipmr_get_route(net, skb, r, nowait);
 			if (err <= 0) {
 				if (!nowait) {
 					if (err == 0)
@@ -2650,7 +2856,7 @@ nla_put_failure:
 
 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
 {
-	struct net *net = in_skb->sk->sk_net;
+	struct net *net = sock_net(in_skb->sk);
 	struct rtmsg *rtm;
 	struct nlattr *tb[RTA_MAX+1];
 	struct rtable *rt = NULL;
@@ -2660,9 +2866,6 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	int err;
 	struct sk_buff *skb;
 
-	if (net != &init_net)
-		return -EINVAL;
-
 	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
 	if (err < 0)
		goto errout;
@@ -2692,7 +2895,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 	if (iif) {
 		struct net_device *dev;
 
-		dev = __dev_get_by_index(&init_net, iif);
+		dev = __dev_get_by_index(net, iif);
 		if (dev == NULL) {
 			err = -ENODEV;
 			goto errout_free;
@@ -2704,7 +2907,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 		err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
 		local_bh_enable();
 
-		rt = (struct rtable*) skb->dst;
+		rt = skb->rtable;
 		if (err == 0 && rt->u.dst.error)
 			err = -rt->u.dst.error;
 	} else {
@@ -2718,22 +2921,22 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void
 			},
 			.oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
 		};
-		err = ip_route_output_key(&rt, &fl);
+		err = ip_route_output_key(net, &rt, &fl);
 	}
 
 	if (err)
 		goto errout_free;
 
-	skb->dst = &rt->u.dst;
+	skb->rtable = rt;
 	if (rtm->rtm_flags & RTM_F_NOTIFY)
 		rt->rt_flags |= RTCF_NOTIFY;
 
-	err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
-				RTM_NEWROUTE, 0, 0);
+	err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
+			   RTM_NEWROUTE, 0, 0);
 	if (err <= 0)
 		goto errout_free;
 
-	err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
+	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
 errout:
 	return err;
 
@@ -2747,19 +2950,26 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
 	struct rtable *rt;
 	int h, s_h;
 	int idx, s_idx;
+	struct net *net;
+
+	net = sock_net(skb->sk);
 
 	s_h = cb->args[0];
 	if (s_h < 0)
 		s_h = 0;
 	s_idx = idx = cb->args[1];
-	for (h = s_h; h <= rt_hash_mask; h++) {
+	for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
+		if (!rt_hash_table[h].chain)
+			continue;
 		rcu_read_lock_bh();
 		for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
 		     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
-			if (idx < s_idx)
+			if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
+				continue;
+			if (rt_is_expired(rt))
 				continue;
 			skb->dst = dst_clone(&rt->u.dst);
-			if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
+			if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
 					 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
 					 1, NLM_F_MULTI) <= 0) {
 				dst_release(xchg(&skb->dst, NULL));
@@ -2769,7 +2979,6 @@ int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
 			dst_release(xchg(&skb->dst, NULL));
 		}
 		rcu_read_unlock_bh();
-		s_idx = 0;
 	}
 
 done:
@@ -2780,19 +2989,25 @@ done:
 
 void ip_rt_multicast_event(struct in_device *in_dev)
 {
-	rt_cache_flush(0);
+	rt_cache_flush(dev_net(in_dev->dev), 0);
 }
 
 #ifdef CONFIG_SYSCTL
-static int flush_delay;
-
-static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
+static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
					struct file *filp, void __user *buffer,
					size_t *lenp, loff_t *ppos)
 {
 	if (write) {
-		proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
-		rt_cache_flush(flush_delay);
+		int flush_delay;
+		ctl_table ctl;
+		struct net *net;
+
+		memcpy(&ctl, __ctl, sizeof(ctl));
+		ctl.data = &flush_delay;
+		proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);
+
+		net = (struct net *)__ctl->extra1;
+		rt_cache_flush(net, flush_delay);
 		return 0;
 	}
 
@@ -2800,57 +3015,89 @@ static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
 }
 
 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
-						int __user *name,
-						int nlen,
						void __user *oldval,
						size_t __user *oldlenp,
						void __user *newval,
						size_t newlen)
 {
 	int delay;
+	struct net *net;
 	if (newlen != sizeof(int))
 		return -EINVAL;
 	if (get_user(delay, (int __user *)newval))
 		return -EFAULT;
-	rt_cache_flush(delay);
+	net = (struct net *)table->extra1;
-ctl_table ipv4_route_table[] = {
-	{
-		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
-		.procname	= "flush",
-		.data		= &flush_delay,
-		.maxlen		= sizeof(int),
-		.mode		= 0200,
-		.proc_handler	= &ipv4_sysctl_rtcache_flush,
-		.strategy	= &ipv4_sysctl_rtcache_flush_strategy,
-	},
-	{
-		.ctl_name	= NET_IPV4_ROUTE_MIN_DELAY,
-		.procname	= "min_delay",
-		.data		= &ip_rt_min_delay,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
-	},
-	{
-		.ctl_name	= NET_IPV4_ROUTE_MAX_DELAY,
-		.procname	= "max_delay",
-		.data		= &ip_rt_max_delay,
-		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
-	},
+static void rt_secret_reschedule(int old)
+{
+	struct net *net;
+	int new = ip_rt_secret_interval;
+	int diff = new - old;
+
+	if (!diff)
+		return;
+
+	rtnl_lock();
+	for_each_net(net) {
+		int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
+
+		if (!new)
+			continue;
+
+		if (deleted) {
+			long time = net->ipv4.rt_secret_timer.expires - jiffies;
+
+			if (time <= 0 || (time += diff) <= 0)
+				time = 0;
+
+			net->ipv4.rt_secret_timer.expires = time;
+		} else
+			net->ipv4.rt_secret_timer.expires = new;
+
+		net->ipv4.rt_secret_timer.expires += jiffies;
+		add_timer(&net->ipv4.rt_secret_timer);
+	}
+	rtnl_unlock();
+}
+
+static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
+					  struct file *filp,
+					  void __user *buffer, size_t *lenp,
+					  loff_t *ppos)
+{
+	int old = ip_rt_secret_interval;
+	int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);
+
+	rt_secret_reschedule(old);
+
+	return ret;
+}
+
+static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
+						   void __user *oldval,
+						   size_t __user *oldlenp,
+						   void __user *newval,
+						   size_t newlen)
+{
+	int old = ip_rt_secret_interval;
+	int ret = sysctl_jiffies(table, oldval, oldlenp, newval, newlen);
+
+	rt_secret_reschedule(old);
+
+	return ret;
+}
+
+static ctl_table ipv4_route_table[] = {
 	{
 		.ctl_name	= NET_IPV4_ROUTE_GC_THRESH,
 		.procname	= "gc_thresh",
 		.data		= &ipv4_dst_ops.gc_thresh,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_MAX_SIZE,
@@ -2858,7 +3105,7 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_max_size,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		/* Deprecated. Use gc_min_interval_ms */
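
rt_secret_reschedule() above shifts any already-pending timer by the change in the interval (diff = new - old) and clamps at zero, so shrinking the interval can pull a rebuild forward but never schedule it in the past. The clamp is the subtle part; this small stand-alone C check (a hypothetical helper mirroring the expression above, not kernel code) exercises it:

	#include <assert.h>

	/* Mirrors "if (time <= 0 || (time += diff) <= 0) time = 0;" above. */
	static long adjust_remaining(long time, long diff)
	{
		if (time <= 0 || (time += diff) <= 0)
			time = 0;	/* overdue, or shortened past "now": fire immediately */
		return time;
	}

	int main(void)
	{
		assert(adjust_remaining(100,   50) == 150);	/* interval grew: fire later */
		assert(adjust_remaining(100,  -40) ==  60);	/* interval shrank: fire sooner */
		assert(adjust_remaining(100, -200) ==   0);	/* shrank past now: fire at once */
		assert(adjust_remaining(-5,    50) ==   0);	/* already due stays due */
		return 0;
	}

Note the kernel function adds jiffies back in afterwards, and a namespace whose timer was deleted while the new interval is 0 is simply left with the timer off.
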
@@ -2868,8 +3115,8 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_gc_min_interval,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
+		.strategy	= sysctl_jiffies,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
@@ -2877,8 +3124,8 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_gc_min_interval,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_ms_jiffies,
-		.strategy	= &sysctl_ms_jiffies,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+		.strategy	= sysctl_ms_jiffies,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_GC_TIMEOUT,
@@ -2886,8 +3133,8 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_gc_timeout,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
+		.strategy	= sysctl_jiffies,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_GC_INTERVAL,
@@ -2895,8 +3142,8 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_gc_interval,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
+		.strategy	= sysctl_jiffies,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_LOAD,
@@ -2904,7 +3151,7 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_redirect_load,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_NUMBER,
@@ -2912,7 +3159,7 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_redirect_number,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_REDIRECT_SILENCE,
@@ -2920,7 +3167,7 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_redirect_silence,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_ERROR_COST,
@@ -2928,7 +3175,7 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_error_cost,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_ERROR_BURST,
@@ -2936,7 +3183,7 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_error_burst,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_GC_ELASTICITY,
@@ -2944,7 +3191,7 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_gc_elasticity,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_MTU_EXPIRES,
@@ -2952,8 +3199,8 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_mtu_expires,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
+		.proc_handler	= proc_dointvec_jiffies,
+		.strategy	= sysctl_jiffies,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_MIN_PMTU,
@@ -2961,7 +3208,7 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_min_pmtu,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_MIN_ADVMSS,
@@ -2969,7 +3216,7 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_min_advmss,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= proc_dointvec,
 	},
 	{
 		.ctl_name	= NET_IPV4_ROUTE_SECRET_INTERVAL,
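
Every entry in this run keeps the same shape: ->proc_handler serves /proc/sys text I/O while ->strategy serves the binary sysctl(2) path, with the *_jiffies variants converting between the seconds userspace sees and the jiffies stored in the variable. A representative entry of that form (hypothetical knob name, same 2.6.x ctl_table layout) would be:

	static int example_timeout __read_mostly = 5 * HZ;	/* stored in jiffies */

	static ctl_table example_table[] = {
		{
			.ctl_name	= CTL_UNNUMBERED,	/* no binary-sysctl number assigned */
			.procname	= "example_timeout",	/* name under /proc/sys */
			.data		= &example_timeout,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_jiffies,	/* text path: seconds <-> jiffies */
			.strategy	= sysctl_jiffies,		/* binary path: seconds <-> jiffies */
		},
		{ .ctl_name = 0 }	/* terminator */
	};

The change from &proc_dointvec to proc_dointvec throughout is purely cosmetic: a function name already decays to a function pointer, so the & was redundant.
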
@@ -2977,13 +3224,120 @@ ctl_table ipv4_route_table[] = {
 		.data		= &ip_rt_secret_interval,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_jiffies,
-		.strategy	= &sysctl_jiffies,
+		.proc_handler	= ipv4_sysctl_rt_secret_interval,
+		.strategy	= ipv4_sysctl_rt_secret_interval_strategy,
 	},
 	{ .ctl_name = 0 }
 };
+
+static struct ctl_table empty[1];
+
+static struct ctl_table ipv4_skeleton[] =
+{
+	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE,
+	  .mode = 0555, .child = ipv4_route_table},
+	{ .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
+	  .mode = 0555, .child = empty},
+	{ }
+};
+
+static __net_initdata struct ctl_path ipv4_path[] = {
+	{ .procname = "net", .ctl_name = CTL_NET, },
+	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
+	{ },
+};
+
+static struct ctl_table ipv4_route_flush_table[] = {
+	{
+		.ctl_name	= NET_IPV4_ROUTE_FLUSH,
+		.procname	= "flush",
+		.maxlen		= sizeof(int),
+		.mode		= 0200,
+		.proc_handler	= ipv4_sysctl_rtcache_flush,
+		.strategy	= ipv4_sysctl_rtcache_flush_strategy,
+	},
+	{ .ctl_name = 0 },
+};
+
+static __net_initdata struct ctl_path ipv4_route_path[] = {
+	{ .procname = "net", .ctl_name = CTL_NET, },
+	{ .procname = "ipv4", .ctl_name = NET_IPV4, },
+	{ .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
+	{ },
+};
+
+static __net_init int sysctl_route_net_init(struct net *net)
+{
+	struct ctl_table *tbl;
+
+	tbl = ipv4_route_flush_table;
+	if (net != &init_net) {
+		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
+		if (tbl == NULL)
+			goto err_dup;
+	}
+	tbl[0].extra1 = net;
+
+	net->ipv4.route_hdr =
+		register_net_sysctl_table(net, ipv4_route_path, tbl);
+	if (net->ipv4.route_hdr == NULL)
+		goto err_reg;
+	return 0;
+
+err_reg:
+	if (tbl != ipv4_route_flush_table)
+		kfree(tbl);
+err_dup:
+	return -ENOMEM;
+}
+
+static __net_exit void sysctl_route_net_exit(struct net *net)
+{
+	struct ctl_table *tbl;
+
+	tbl = net->ipv4.route_hdr->ctl_table_arg;
+	unregister_net_sysctl_table(net->ipv4.route_hdr);
+	BUG_ON(tbl == ipv4_route_flush_table);
+	kfree(tbl);
+}
+
+static __net_initdata struct pernet_operations sysctl_route_ops = {
+	.init = sysctl_route_net_init,
+	.exit = sysctl_route_net_exit,
+};
 #endif
+
+static __net_init int rt_secret_timer_init(struct net *net)
+{
+	atomic_set(&net->ipv4.rt_genid,
+			(int) ((num_physpages ^ (num_physpages>>8)) ^
+			(jiffies ^ (jiffies >> 7))));
+
+	net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
+	net->ipv4.rt_secret_timer.data = (unsigned long)net;
+	init_timer_deferrable(&net->ipv4.rt_secret_timer);
+
+	if (ip_rt_secret_interval) {
+		net->ipv4.rt_secret_timer.expires =
+			jiffies + net_random() % ip_rt_secret_interval +
+			ip_rt_secret_interval;
+		add_timer(&net->ipv4.rt_secret_timer);
+	}
+	return 0;
+}
+
+static __net_exit void rt_secret_timer_exit(struct net *net)
+{
+	del_timer_sync(&net->ipv4.rt_secret_timer);
+}
+
+static __net_initdata struct pernet_operations rt_secret_timer_ops = {
+	.init = rt_secret_timer_init,
+	.exit = rt_secret_timer_exit,
+};
+
+
 #ifdef CONFIG_NET_CLS_ROUTE
 struct ip_rt_acct *ip_rt_acct __read_mostly;
 #endif /* CONFIG_NET_CLS_ROUTE */
@@ -3002,11 +3356,8 @@ int __init ip_rt_init(void)
 {
 	int rc = 0;
 
-	rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
-			     (jiffies ^ (jiffies >> 7)));
-
 #ifdef CONFIG_NET_CLS_ROUTE
-	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
+	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
 	if (!ip_rt_acct)
 		panic("IP: failed to allocate ip_rt_acct\n");
 #endif
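
Both sysctl_route_ops and rt_secret_timer_ops follow the standard pernet_operations contract: ->init runs for init_net at registration time and again for every namespace created afterwards, and ->exit runs as each namespace is dismantled. The skeleton, with hypothetical names, is:

	static __net_init int example_net_init(struct net *net)
	{
		/* set up this namespace's private state (timers, tables, ...) */
		return 0;		/* non-zero aborts namespace creation */
	}

	static __net_exit void example_net_exit(struct net *net)
	{
		/* tear down exactly what example_net_init() set up */
	}

	static struct pernet_operations example_net_ops = {
		.init = example_net_init,
		.exit = example_net_exit,
	};

	static int __init example_init(void)
	{
		return register_pernet_subsys(&example_net_ops);
	}

This is also why sysctl_route_net_init() dups the flush table for every namespace except init_net: each duplicate carries its own ->extra1, and the BUG_ON in the exit path guards against ever freeing the static original.
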
@@ -3026,7 +3377,7 @@ int __init ip_rt_init(void)
 					0,
 					&rt_hash_log,
 					&rt_hash_mask,
-					0);
+					rhash_entries ? 0 : 512 * 1024);
 	memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
 	rt_hash_lock_init();
@@ -3036,20 +3387,16 @@ int __init ip_rt_init(void)
 	devinet_init();
 	ip_fib_init();
 
-	setup_timer(&rt_flush_timer, rt_run_flush, 0);
-	setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
-
 	/* All the timers, started at system startup tend
 	   to synchronize. Perturb it a bit.
 	 */
 	schedule_delayed_work(&expires_work,
		net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
 
-	rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
-		ip_rt_secret_interval;
-	add_timer(&rt_secret_timer);
+	if (register_pernet_subsys(&rt_secret_timer_ops))
+		printk(KERN_ERR "Unable to setup rt_secret_timer\n");
 
-	if (ip_rt_proc_init(&init_net))
+	if (ip_rt_proc_init())
 		printk(KERN_ERR "Unable to create route proc files\n");
 #ifdef CONFIG_XFRM
 	xfrm_init();
@@ -3057,9 +3404,23 @@ int __init ip_rt_init(void)
 #endif
 	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
 
+#ifdef CONFIG_SYSCTL
+	register_pernet_subsys(&sysctl_route_ops);
+#endif
 	return rc;
 }
 
+#ifdef CONFIG_SYSCTL
+/*
+ * We really need to sanitize the damn ipv4 init order, then all
+ * this nonsense will go away.
+ */
+void __init ip_static_sysctl_init(void)
+{
+	register_sysctl_paths(ipv4_path, ipv4_skeleton);
+}
+#endif
+
 EXPORT_SYMBOL(__ip_select_ident);
 EXPORT_SYMBOL(ip_route_input);
 EXPORT_SYMBOL(ip_route_output_key);
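
One easy-to-miss change in the last hunk: alloc_large_system_hash()'s limit argument goes from 0 (unbounded) to `rhash_entries ? 0 : 512 * 1024`, so unless the administrator passes rhash_entries= on the kernel command line, the automatically sized routing-cache hash stops scaling with total memory at 512K buckets. Since ip_rt_init() derives its pressure knobs from the bucket count, the cap bounds those too; a quick stand-alone check of the derived defaults (the multipliers come from ip_rt_init(), the rest is this sketch's arithmetic):

	#include <stdio.h>

	int main(void)
	{
		unsigned long buckets   = 512 * 1024;	/* default cap with no rhash_entries= */
		unsigned long gc_thresh = buckets;	/* ip_rt_init: rt_hash_mask + 1 */
		unsigned long max_size  = buckets * 16;	/* ip_rt_init: (rt_hash_mask + 1) * 16 */

		printf("buckets=%lu gc_thresh=%lu max_size=%lu entries\n",
		       buckets, gc_thresh, max_size);
		return 0;
	}
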