2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
8 * Version: $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
13 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
17 * Alan Cox : Verify area fixes.
18 * Alan Cox : cli() protects routing changes
19 * Rui Oliveira : ICMP routing table updates
20 * (rco@di.uminho.pt) Routing table insertion and update
21 * Linus Torvalds : Rewrote bits to be sensible
22 * Alan Cox : Added BSD route gw semantics
23 * Alan Cox : Super /proc >4K
24 * Alan Cox : MTU in route table
25 * Alan Cox : MSS actually. Also added the window
27 * Sam Lantinga : Fixed route matching in rt_del()
28 * Alan Cox : Routing cache support.
29 * Alan Cox : Removed compatibility cruft.
30 * Alan Cox : RTF_REJECT support.
31 * Alan Cox : TCP irtt support.
32 * Jonathan Naylor : Added Metric support.
33 * Miquel van Smoorenburg : BSD API fixes.
34 * Miquel van Smoorenburg : Metrics.
35 * Alan Cox : Use __u32 properly
36 * Alan Cox : Aligned routing errors more closely with BSD
37 * our system is still very different.
38 * Alan Cox : Faster /proc handling
39 * Alexey Kuznetsov : Massive rework to support tree based routing,
40 * routing caches and better behaviour.
42 * Olaf Erb : irtt wasn't being copied right.
43 * Bjorn Ekwall : Kerneld route support.
44 * Alan Cox : Multicast fixed (I hope)
45 * Pavel Krauz : Limited broadcast fixed
46 * Mike McLagan : Routing by source
47 * Alexey Kuznetsov : End of old history. Split to fib.c and
48 * route.c and rewritten from scratch.
49 * Andi Kleen : Load-limit warning messages.
50 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
51 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
52 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
53 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
54 * Marc Boucher : routing by fwmark
55 * Robert Olsson : Added rt_cache statistics
56 * Arnaldo C. Melo : Convert proc stuff to seq_file
57 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
58 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
59 * Ilia Sotnikov : Removed TOS from hash calculations
61 * This program is free software; you can redistribute it and/or
62 * modify it under the terms of the GNU General Public License
63 * as published by the Free Software Foundation; either version
64 * 2 of the License, or (at your option) any later version.
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
110 #include <linux/sysctl.h>
113 #define RT_FL_TOS(oldflp) \
114 ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
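/*
 * Note added for clarity: RT_FL_TOS() keeps not only the routable TOS bits
 * (IPTOS_RT_MASK) but also the RTO_ONLINK flag that callers pass in via
 * fl4_tos; ip_route_output_slow() below strips RTO_ONLINK back out, turning
 * it into RT_SCOPE_LINK and masking the TOS with IPTOS_RT_MASK before the
 * FIB lookup.
 */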
116 #define IP_MAX_MTU 0xFFF0
118 #define RT_GC_TIMEOUT (300*HZ)
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval = 60 * HZ;
123 static int ip_rt_gc_min_interval = HZ / 2;
124 static int ip_rt_redirect_number = 9;
125 static int ip_rt_redirect_load = HZ / 50;
126 static int ip_rt_redirect_silence = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost = HZ;
128 static int ip_rt_error_burst = 5 * HZ;
129 static int ip_rt_gc_elasticity = 8;
130 static int ip_rt_mtu_expires = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu = 512 + 20 + 20;
132 static int ip_rt_min_advmss = 256;
133 static int ip_rt_secret_interval = 10 * 60 * HZ;
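/*
 * Illustrative note (not from the original source), assuming HZ == 1000:
 * the defaults above correspond roughly to
 *
 *	ip_rt_gc_min_interval	= HZ / 2		-> 0.5 s between GC attempts
 *	ip_rt_redirect_load	= HZ / 50		-> 20 ms base redirect spacing
 *	ip_rt_redirect_silence	= (HZ / 50) << 10	-> ~20.5 s of silence resets
 *							   the redirect back-off
 *	ip_rt_mtu_expires	= 10 * 60 * HZ		-> learned PMTU values live 10 min
 *	ip_rt_secret_interval	= 10 * 60 * HZ		-> hash secret rotated every 10 min
 *	ip_rt_min_pmtu		= 512 + 20 + 20		-> 552 bytes (payload + IP + TCP)
 *
 * Most of these are compiled-in defaults only and can be changed at run time
 * through the net.ipv4.route.* sysctls.
 */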
135 #define RTprint(a...) printk(KERN_DEBUG a)
137 static void rt_worker_func(struct work_struct *work);
138 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
139 static struct timer_list rt_secret_timer;
142 * Interface to generic destination cache.
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void ipv4_dst_destroy(struct dst_entry *dst);
147 static void ipv4_dst_ifdown(struct dst_entry *dst,
148 struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void ipv4_link_failure(struct sk_buff *skb);
151 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
155 static struct dst_ops ipv4_dst_ops = {
157 .protocol = __constant_htons(ETH_P_IP),
158 .gc = rt_garbage_collect,
159 .check = ipv4_dst_check,
160 .destroy = ipv4_dst_destroy,
161 .ifdown = ipv4_dst_ifdown,
162 .negative_advice = ipv4_negative_advice,
163 .link_failure = ipv4_link_failure,
164 .update_pmtu = ip_rt_update_pmtu,
165 .local_out = ip_local_out,
166 .entry_size = sizeof(struct rtable),
167 .entries = ATOMIC_INIT(0),
170 #define ECN_OR_COST(class) TC_PRIO_##class
172 const __u8 ip_tos2prio[16] = {
176 ECN_OR_COST(BESTEFFORT),
182 ECN_OR_COST(INTERACTIVE),
184 ECN_OR_COST(INTERACTIVE),
185 TC_PRIO_INTERACTIVE_BULK,
186 ECN_OR_COST(INTERACTIVE_BULK),
187 TC_PRIO_INTERACTIVE_BULK,
188 ECN_OR_COST(INTERACTIVE_BULK)
196 /* The locking scheme is rather straightforward:
198 * 1) Read-Copy Update protects the buckets of the central route hash.
199 * 2) Only writers remove entries, and they hold the lock
200 * as they look at rtable reference counts.
201 * 3) Only readers acquire references to rtable entries,
202 * they do so with atomic increments and with the
206 struct rt_hash_bucket {
207 struct rtable *chain;
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210 defined(CONFIG_PROVE_LOCKING)
212 * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
213 * The size of this table is a power of two and depends on the number of CPUs.
214 * (on lockdep we have a quite big spinlock_t, so keep the size down there)
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ 256
220 # define RT_HASH_LOCK_SZ 4096
222 # define RT_HASH_LOCK_SZ 2048
224 # define RT_HASH_LOCK_SZ 1024
226 # define RT_HASH_LOCK_SZ 512
228 # define RT_HASH_LOCK_SZ 256
232 static spinlock_t *rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
235 static __init void rt_hash_lock_init(void)
239 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
242 panic("IP: failed to allocate rt_hash_locks\n");
244 for (i = 0; i < RT_HASH_LOCK_SZ; i++)
245 spin_lock_init(&rt_hash_locks[i]);
248 # define rt_hash_lock_addr(slot) NULL
250 static inline void rt_hash_lock_init(void)
255 static struct rt_hash_bucket *rt_hash_table;
256 static unsigned rt_hash_mask;
257 static unsigned int rt_hash_log;
258 static atomic_t rt_genid;
260 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
261 #define RT_CACHE_STAT_INC(field) \
262 (__raw_get_cpu_var(rt_cache_stat).field++)
264 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
266 return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
270 #define rt_hash(daddr, saddr, idx) \
271 rt_hash_code((__force u32)(__be32)(daddr),\
272 (__force u32)(__be32)(saddr) ^ ((idx) << 5))
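/*
 * Usage note added for clarity: the input path hashes with
 * rt_hash(daddr, saddr, dev->ifindex) and the output path with
 * rt_hash(fl4_dst, fl4_src, oif); see ip_route_input() and
 * __ip_route_output_key() below.  Because rt_genid seeds jhash_2words(),
 * bumping it in rt_cache_invalidate() spreads new entries over different
 * buckets, while stale entries fail the rt_genid checks in the lookup
 * loops and are reaped lazily.
 */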
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
280 static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
282 struct rtable *r = NULL;
284 for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
286 r = rcu_dereference(rt_hash_table[st->bucket].chain);
288 if (r->rt_genid == st->genid)
290 r = rcu_dereference(r->u.dst.rt_next);
292 rcu_read_unlock_bh();
297 static struct rtable *__rt_cache_get_next(struct rt_cache_iter_state *st,
300 r = r->u.dst.rt_next;
302 rcu_read_unlock_bh();
303 if (--st->bucket < 0)
306 r = rt_hash_table[st->bucket].chain;
308 return rcu_dereference(r);
311 static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st,
314 while ((r = __rt_cache_get_next(st, r)) != NULL) {
315 if (r->rt_genid == st->genid)
321 static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
323 struct rtable *r = rt_cache_get_first(st);
326 while (pos && (r = rt_cache_get_next(st, r)))
328 return pos ? NULL : r;
331 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
333 struct rt_cache_iter_state *st = seq->private;
336 return rt_cache_get_idx(st, *pos - 1);
337 st->genid = atomic_read(&rt_genid);
338 return SEQ_START_TOKEN;
341 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
344 struct rt_cache_iter_state *st = seq->private;
346 if (v == SEQ_START_TOKEN)
347 r = rt_cache_get_first(st);
349 r = rt_cache_get_next(st, v);
354 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
356 if (v && v != SEQ_START_TOKEN)
357 rcu_read_unlock_bh();
360 static int rt_cache_seq_show(struct seq_file *seq, void *v)
362 if (v == SEQ_START_TOKEN)
363 seq_printf(seq, "%-127s\n",
364 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
365 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
368 struct rtable *r = v;
371 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
372 "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
373 r->u.dst.dev ? r->u.dst.dev->name : "*",
374 (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
375 r->rt_flags, atomic_read(&r->u.dst.__refcnt),
376 r->u.dst.__use, 0, (unsigned long)r->rt_src,
377 (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
378 (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
379 dst_metric(&r->u.dst, RTAX_WINDOW),
380 (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
381 dst_metric(&r->u.dst, RTAX_RTTVAR)),
383 r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
384 r->u.dst.hh ? (r->u.dst.hh->hh_output ==
387 seq_printf(seq, "%-127s\n", temp);
392 static const struct seq_operations rt_cache_seq_ops = {
393 .start = rt_cache_seq_start,
394 .next = rt_cache_seq_next,
395 .stop = rt_cache_seq_stop,
396 .show = rt_cache_seq_show,
399 static int rt_cache_seq_open(struct inode *inode, struct file *file)
401 return seq_open_private(file, &rt_cache_seq_ops,
402 sizeof(struct rt_cache_iter_state));
405 static const struct file_operations rt_cache_seq_fops = {
406 .owner = THIS_MODULE,
407 .open = rt_cache_seq_open,
410 .release = seq_release_private,
414 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
419 return SEQ_START_TOKEN;
421 for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
422 if (!cpu_possible(cpu))
425 return &per_cpu(rt_cache_stat, cpu);
430 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
434 for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
435 if (!cpu_possible(cpu))
438 return &per_cpu(rt_cache_stat, cpu);
444 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
449 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
451 struct rt_cache_stat *st = v;
453 if (v == SEQ_START_TOKEN) {
454 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
458 seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
459 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
460 atomic_read(&ipv4_dst_ops.entries),
483 static const struct seq_operations rt_cpu_seq_ops = {
484 .start = rt_cpu_seq_start,
485 .next = rt_cpu_seq_next,
486 .stop = rt_cpu_seq_stop,
487 .show = rt_cpu_seq_show,
491 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
493 return seq_open(file, &rt_cpu_seq_ops);
496 static const struct file_operations rt_cpu_seq_fops = {
497 .owner = THIS_MODULE,
498 .open = rt_cpu_seq_open,
501 .release = seq_release,
504 #ifdef CONFIG_NET_CLS_ROUTE
505 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
506 int length, int *eof, void *data)
510 if ((offset & 3) || (length & 3))
513 if (offset >= sizeof(struct ip_rt_acct) * 256) {
518 if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
519 length = sizeof(struct ip_rt_acct) * 256 - offset;
523 offset /= sizeof(u32);
526 u32 *dst = (u32 *) buffer;
529 memset(dst, 0, length);
531 for_each_possible_cpu(i) {
535 src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
536 for (j = 0; j < length/4; j++)
544 static __init int ip_rt_proc_init(struct net *net)
546 struct proc_dir_entry *pde;
548 pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
553 pde = proc_create("rt_cache", S_IRUGO,
554 net->proc_net_stat, &rt_cpu_seq_fops);
558 #ifdef CONFIG_NET_CLS_ROUTE
559 pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
560 ip_rt_acct_read, NULL);
566 #ifdef CONFIG_NET_CLS_ROUTE
568 remove_proc_entry("rt_cache", net->proc_net_stat);
571 remove_proc_entry("rt_cache", net->proc_net);
576 static inline int ip_rt_proc_init(struct net *net)
580 #endif /* CONFIG_PROC_FS */
582 static __inline__ void rt_free(struct rtable *rt)
584 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
587 static __inline__ void rt_drop(struct rtable *rt)
590 call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
593 static __inline__ int rt_fast_clean(struct rtable *rth)
595 /* Kill broadcast/multicast entries very aggressively, if they
596 collide in the hash table with more useful entries */
597 return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
598 rth->fl.iif && rth->u.dst.rt_next;
601 static __inline__ int rt_valuable(struct rtable *rth)
603 return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
607 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
612 if (atomic_read(&rth->u.dst.__refcnt))
616 if (rth->u.dst.expires &&
617 time_after_eq(jiffies, rth->u.dst.expires))
620 age = jiffies - rth->u.dst.lastuse;
622 if ((age <= tmo1 && !rt_fast_clean(rth)) ||
623 (age <= tmo2 && rt_valuable(rth)))
629 /* Bits of score are:
631 * 30: not quite useless
632 * 29..0: usage counter
634 static inline u32 rt_score(struct rtable *rt)
636 u32 score = jiffies - rt->u.dst.lastuse;
638 score = ~score & ~(3<<30);
644 !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
650 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
652 return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
653 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
654 (fl1->mark ^ fl2->mark) |
655 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
656 *(u16 *)&fl2->nl_u.ip4_u.tos) |
657 (fl1->oif ^ fl2->oif) |
658 (fl1->iif ^ fl2->iif)) == 0;
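/*
 * Added comment: compare_keys() is a branch-free equality test - it ORs
 * together the XORs of daddr, saddr, mark, the 16-bit word starting at the
 * TOS field, oif and iif, so the result is zero only when every field of
 * the two flow keys matches.
 */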
661 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
663 return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
667 * Perform a full scan of the hash table and free all entries.
668 * Can be called by a softirq or a process.
669 * In the latter case, we want to be rescheduled if necessary.
671 static void rt_do_flush(int process_context)
674 struct rtable *rth, *next;
676 for (i = 0; i <= rt_hash_mask; i++) {
677 if (process_context && need_resched())
679 rth = rt_hash_table[i].chain;
683 spin_lock_bh(rt_hash_lock_addr(i));
684 rth = rt_hash_table[i].chain;
685 rt_hash_table[i].chain = NULL;
686 spin_unlock_bh(rt_hash_lock_addr(i));
688 for (; rth; rth = next) {
689 next = rth->u.dst.rt_next;
695 static void rt_check_expire(void)
697 static unsigned int rover;
698 unsigned int i = rover, goal;
699 struct rtable *rth, **rthp;
702 mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
703 if (ip_rt_gc_timeout > 1)
704 do_div(mult, ip_rt_gc_timeout);
705 goal = (unsigned int)mult;
706 if (goal > rt_hash_mask)
707 goal = rt_hash_mask + 1;
708 for (; goal > 0; goal--) {
709 unsigned long tmo = ip_rt_gc_timeout;
711 i = (i + 1) & rt_hash_mask;
712 rthp = &rt_hash_table[i].chain;
719 spin_lock_bh(rt_hash_lock_addr(i));
720 while ((rth = *rthp) != NULL) {
721 if (rth->rt_genid != atomic_read(&rt_genid)) {
722 *rthp = rth->u.dst.rt_next;
726 if (rth->u.dst.expires) {
727 /* Entry is expired even if it is in use */
728 if (time_before_eq(jiffies, rth->u.dst.expires)) {
730 rthp = &rth->u.dst.rt_next;
733 } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
735 rthp = &rth->u.dst.rt_next;
739 /* Cleanup aged off entries. */
740 *rthp = rth->u.dst.rt_next;
743 spin_unlock_bh(rt_hash_lock_addr(i));
749 * rt_worker_func() is run in process context.
750 * we call rt_check_expire() to scan part of the hash table
752 static void rt_worker_func(struct work_struct *work)
755 schedule_delayed_work(&expires_work, ip_rt_gc_interval);
759 * Perturbation of rt_genid by a small quantity [1..256].
760 * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
761 * many times (2^24) without repeating a recent rt_genid.
762 * The Jenkins hash is strong enough that little changes of rt_genid are OK.
764 static void rt_cache_invalidate(void)
766 unsigned char shuffle;
768 get_random_bytes(&shuffle, sizeof(shuffle));
769 atomic_add(shuffle + 1U, &rt_genid);
773 * delay < 0 : invalidate cache (fast : entries will be deleted later)
774 * delay >= 0 : invalidate & flush cache (can be long)
776 void rt_cache_flush(int delay)
778 rt_cache_invalidate();
780 rt_do_flush(!in_softirq());
784 * We change rt_genid and let gc do the cleanup
786 static void rt_secret_rebuild(unsigned long dummy)
788 rt_cache_invalidate();
789 mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
793 Short description of GC goals.
795 We want to build an algorithm which will keep the routing cache
796 at some equilibrium point, where the number of aged-off entries
797 is kept approximately equal to the number of newly generated ones.
799 The current expiration strength is the variable "expire".
800 We try to adjust it dynamically, so that when the network
801 is idle "expire" is large enough to keep plenty of warm entries,
802 and when load increases it is reduced to limit the cache size.
805 static int rt_garbage_collect(struct dst_ops *ops)
807 static unsigned long expire = RT_GC_TIMEOUT;
808 static unsigned long last_gc;
810 static int equilibrium;
811 struct rtable *rth, **rthp;
812 unsigned long now = jiffies;
816 * Garbage collection is pretty expensive,
817 * do not run it too frequently.
820 RT_CACHE_STAT_INC(gc_total);
822 if (now - last_gc < ip_rt_gc_min_interval &&
823 atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
824 RT_CACHE_STAT_INC(gc_ignored);
828 /* Calculate number of entries, which we want to expire now. */
829 goal = atomic_read(&ipv4_dst_ops.entries) -
830 (ip_rt_gc_elasticity << rt_hash_log);
832 if (equilibrium < ipv4_dst_ops.gc_thresh)
833 equilibrium = ipv4_dst_ops.gc_thresh;
834 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
836 equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
837 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
840 /* We are in a dangerous area. Try to reduce the cache really
843 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
844 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
847 if (now - last_gc >= ip_rt_gc_min_interval)
858 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
859 unsigned long tmo = expire;
861 k = (k + 1) & rt_hash_mask;
862 rthp = &rt_hash_table[k].chain;
863 spin_lock_bh(rt_hash_lock_addr(k));
864 while ((rth = *rthp) != NULL) {
865 if (rth->rt_genid == atomic_read(&rt_genid) &&
866 !rt_may_expire(rth, tmo, expire)) {
868 rthp = &rth->u.dst.rt_next;
871 *rthp = rth->u.dst.rt_next;
875 spin_unlock_bh(rt_hash_lock_addr(k));
884 /* The goal is not achieved. We stop the process if:
886 - expire has been reduced to zero; otherwise, expire is halved.
887 - the table is not full.
888 - we are called from interrupt context.
889 - the jiffies check is just a fallback/debug loop breaker.
890 We will not spin here for a long time in any case.
893 RT_CACHE_STAT_INC(gc_goal_miss);
899 #if RT_CACHE_DEBUG >= 2
900 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
901 atomic_read(&ipv4_dst_ops.entries), goal, i);
904 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
906 } while (!in_softirq() && time_before_eq(jiffies, now));
908 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
911 printk(KERN_WARNING "dst cache overflow\n");
912 RT_CACHE_STAT_INC(gc_dst_overflow);
916 expire += ip_rt_gc_min_interval;
917 if (expire > ip_rt_gc_timeout ||
918 atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
919 expire = ip_rt_gc_timeout;
920 #if RT_CACHE_DEBUG >= 2
921 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
922 atomic_read(&ipv4_dst_ops.entries), goal, rover);
927 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
929 struct rtable *rth, **rthp;
931 struct rtable *cand, **candp;
934 int attempts = !in_softirq();
943 rthp = &rt_hash_table[hash].chain;
945 spin_lock_bh(rt_hash_lock_addr(hash));
946 while ((rth = *rthp) != NULL) {
947 if (rth->rt_genid != atomic_read(&rt_genid)) {
948 *rthp = rth->u.dst.rt_next;
952 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
954 *rthp = rth->u.dst.rt_next;
956 * Since lookup is lockfree, the deletion
957 * must be visible to another weakly ordered CPU before
958 * the insertion at the start of the hash chain.
960 rcu_assign_pointer(rth->u.dst.rt_next,
961 rt_hash_table[hash].chain);
963 * Since lookup is lockfree, the update writes
964 * must be ordered for consistency on SMP.
966 rcu_assign_pointer(rt_hash_table[hash].chain, rth);
968 dst_use(&rth->u.dst, now);
969 spin_unlock_bh(rt_hash_lock_addr(hash));
976 if (!atomic_read(&rth->u.dst.__refcnt)) {
977 u32 score = rt_score(rth);
979 if (score <= min_score) {
988 rthp = &rth->u.dst.rt_next;
992 /* ip_rt_gc_elasticity used to be the average chain
993 * length; when it is exceeded, gc becomes really aggressive.
995 * The second limit is less certain. At the moment it allows
996 * only 2 entries per bucket. We will see.
998 if (chain_length > ip_rt_gc_elasticity) {
999 *candp = cand->u.dst.rt_next;
1004 /* Try to bind the route to an ARP neighbour only if it is an output
1005 route or on the unicast forwarding path.
1007 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1008 int err = arp_bind_neighbour(&rt->u.dst);
1010 spin_unlock_bh(rt_hash_lock_addr(hash));
1012 if (err != -ENOBUFS) {
1017 /* The neighbour tables are full and nothing
1018 can be released. Try to shrink the route cache;
1019 it most likely holds some neighbour records.
1021 if (attempts-- > 0) {
1022 int saved_elasticity = ip_rt_gc_elasticity;
1023 int saved_int = ip_rt_gc_min_interval;
1024 ip_rt_gc_elasticity = 1;
1025 ip_rt_gc_min_interval = 0;
1026 rt_garbage_collect(&ipv4_dst_ops);
1027 ip_rt_gc_min_interval = saved_int;
1028 ip_rt_gc_elasticity = saved_elasticity;
1032 if (net_ratelimit())
1033 printk(KERN_WARNING "Neighbour table overflow.\n");
1039 rt->u.dst.rt_next = rt_hash_table[hash].chain;
1040 #if RT_CACHE_DEBUG >= 2
1041 if (rt->u.dst.rt_next) {
1043 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1044 NIPQUAD(rt->rt_dst));
1045 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1046 printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1050 rt_hash_table[hash].chain = rt;
1051 spin_unlock_bh(rt_hash_lock_addr(hash));
1056 void rt_bind_peer(struct rtable *rt, int create)
1058 static DEFINE_SPINLOCK(rt_peer_lock);
1059 struct inet_peer *peer;
1061 peer = inet_getpeer(rt->rt_dst, create);
1063 spin_lock_bh(&rt_peer_lock);
1064 if (rt->peer == NULL) {
1068 spin_unlock_bh(&rt_peer_lock);
1074 * Peer allocation may fail only in serious out-of-memory conditions. However,
1075 * we can still generate some output.
1076 * Random ID selection looks a bit dangerous because we have no chance of
1077 * selecting an ID that is unique within a reasonable period of time.
1078 * But a broken packet identifier may be better than no packet at all.
1080 static void ip_select_fb_ident(struct iphdr *iph)
1082 static DEFINE_SPINLOCK(ip_fb_id_lock);
1083 static u32 ip_fallback_id;
1086 spin_lock_bh(&ip_fb_id_lock);
1087 salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1088 iph->id = htons(salt & 0xFFFF);
1089 ip_fallback_id = salt;
1090 spin_unlock_bh(&ip_fb_id_lock);
1093 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1095 struct rtable *rt = (struct rtable *) dst;
1098 if (rt->peer == NULL)
1099 rt_bind_peer(rt, 1);
1101 /* If the peer is attached to the destination, it is never detached,
1102 so we need not grab a lock to dereference it.
1105 iph->id = htons(inet_getid(rt->peer, more));
1109 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1110 __builtin_return_address(0));
1112 ip_select_fb_ident(iph);
1115 static void rt_del(unsigned hash, struct rtable *rt)
1117 struct rtable **rthp, *aux;
1119 rthp = &rt_hash_table[hash].chain;
1120 spin_lock_bh(rt_hash_lock_addr(hash));
1122 while ((aux = *rthp) != NULL) {
1123 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1124 *rthp = aux->u.dst.rt_next;
1128 rthp = &aux->u.dst.rt_next;
1130 spin_unlock_bh(rt_hash_lock_addr(hash));
1133 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1134 __be32 saddr, struct net_device *dev)
1137 struct in_device *in_dev = in_dev_get(dev);
1138 struct rtable *rth, **rthp;
1139 __be32 skeys[2] = { saddr, 0 };
1140 int ikeys[2] = { dev->ifindex, 0 };
1141 struct netevent_redirect netevent;
1148 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1149 || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1150 || ipv4_is_zeronet(new_gw))
1151 goto reject_redirect;
1153 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1154 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1155 goto reject_redirect;
1156 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1157 goto reject_redirect;
1159 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1160 goto reject_redirect;
1163 for (i = 0; i < 2; i++) {
1164 for (k = 0; k < 2; k++) {
1165 unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1167 rthp=&rt_hash_table[hash].chain;
1170 while ((rth = rcu_dereference(*rthp)) != NULL) {
1173 if (rth->fl.fl4_dst != daddr ||
1174 rth->fl.fl4_src != skeys[i] ||
1175 rth->fl.oif != ikeys[k] ||
1177 rth->rt_genid != atomic_read(&rt_genid) ||
1178 rth->u.dst.dev->nd_net != net) {
1179 rthp = &rth->u.dst.rt_next;
1183 if (rth->rt_dst != daddr ||
1184 rth->rt_src != saddr ||
1186 rth->rt_gateway != old_gw ||
1187 rth->u.dst.dev != dev)
1190 dst_hold(&rth->u.dst);
1193 rt = dst_alloc(&ipv4_dst_ops);
1200 /* Copy all the information. */
1202 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1203 rt->u.dst.__use = 1;
1204 atomic_set(&rt->u.dst.__refcnt, 1);
1205 rt->u.dst.child = NULL;
1207 dev_hold(rt->u.dst.dev);
1209 in_dev_hold(rt->idev);
1210 rt->u.dst.obsolete = 0;
1211 rt->u.dst.lastuse = jiffies;
1212 rt->u.dst.path = &rt->u.dst;
1213 rt->u.dst.neighbour = NULL;
1214 rt->u.dst.hh = NULL;
1215 rt->u.dst.xfrm = NULL;
1216 rt->rt_genid = atomic_read(&rt_genid);
1217 rt->rt_flags |= RTCF_REDIRECTED;
1219 /* Gateway is different ... */
1220 rt->rt_gateway = new_gw;
1222 /* Redirect received -> path was valid */
1223 dst_confirm(&rth->u.dst);
1226 atomic_inc(&rt->peer->refcnt);
1228 if (arp_bind_neighbour(&rt->u.dst) ||
1229 !(rt->u.dst.neighbour->nud_state &
1231 if (rt->u.dst.neighbour)
1232 neigh_event_send(rt->u.dst.neighbour, NULL);
1238 netevent.old = &rth->u.dst;
1239 netevent.new = &rt->u.dst;
1240 call_netevent_notifiers(NETEVENT_REDIRECT,
1244 if (!rt_intern_hash(hash, rt, &rt))
1257 #ifdef CONFIG_IP_ROUTE_VERBOSE
1258 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1259 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1260 "%u.%u.%u.%u ignored.\n"
1261 " Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1262 NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1263 NIPQUAD(saddr), NIPQUAD(daddr));
1268 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1270 struct rtable *rt = (struct rtable*)dst;
1271 struct dst_entry *ret = dst;
1274 if (dst->obsolete) {
1277 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1278 rt->u.dst.expires) {
1279 unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1281 #if RT_CACHE_DEBUG >= 1
1282 printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1283 "%u.%u.%u.%u/%02x dropped\n",
1284 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1295 * 1. The first ip_rt_redirect_number redirects are sent
1296 * with exponential backoff, then we stop sending them at all,
1297 * assuming that the host ignores our redirects.
1298 * 2. If we did not see packets requiring redirects
1299 * during ip_rt_redirect_silence, we assume that the host
1300 * has forgotten the redirected route, and we start sending redirects again.
1302 * This algorithm is much cheaper and more intelligent than dumb load limiting
1305 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1306 * and "frag. need" (breaks PMTU discovery) in icmp.c.
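/*
 * Worked example (illustrative, assuming the default tunables and HZ == 1000):
 * after a redirect is sent, the next one is suppressed until
 * rate_last + (ip_rt_redirect_load << rate_tokens), i.e. the gaps grow as
 * 40 ms, 80 ms, 160 ms, ... doubling with every redirect actually sent.
 * Once rate_tokens reaches ip_rt_redirect_number (9), redirects stop
 * completely and only resume after ~20 s (ip_rt_redirect_silence) without
 * a packet that would have triggered one.
 */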
1309 void ip_rt_send_redirect(struct sk_buff *skb)
1311 struct rtable *rt = (struct rtable*)skb->dst;
1312 struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1317 if (!IN_DEV_TX_REDIRECTS(in_dev))
1320 /* No redirected packets during ip_rt_redirect_silence;
1321 * reset the algorithm.
1323 if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1324 rt->u.dst.rate_tokens = 0;
1326 /* Too many ignored redirects; do not send anything.
1327 * Set u.dst.rate_last to the last seen redirected packet.
1329 if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1330 rt->u.dst.rate_last = jiffies;
1334 /* Check for load limit; set rate_last to the latest sent
1337 if (rt->u.dst.rate_tokens == 0 ||
1339 (rt->u.dst.rate_last +
1340 (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1341 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1342 rt->u.dst.rate_last = jiffies;
1343 ++rt->u.dst.rate_tokens;
1344 #ifdef CONFIG_IP_ROUTE_VERBOSE
1345 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1346 rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1348 printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1349 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1350 NIPQUAD(rt->rt_src), rt->rt_iif,
1351 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1358 static int ip_error(struct sk_buff *skb)
1360 struct rtable *rt = (struct rtable*)skb->dst;
1364 switch (rt->u.dst.error) {
1369 code = ICMP_HOST_UNREACH;
1372 code = ICMP_NET_UNREACH;
1373 IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1376 code = ICMP_PKT_FILTERED;
1381 rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1382 if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1383 rt->u.dst.rate_tokens = ip_rt_error_burst;
1384 rt->u.dst.rate_last = now;
1385 if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1386 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1387 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1390 out: kfree_skb(skb);
1395 * The last two values are not from the RFC but
1396 * are needed for AMPRnet AX.25 paths.
1399 static const unsigned short mtu_plateau[] =
1400 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
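/*
 * Worked example added for clarity: guess_mtu() below walks this table and
 * returns the first plateau strictly below the old MTU, following the
 * RFC 1191 plateau strategy for routers that do not report a next-hop MTU.
 * E.g. an old MTU of 1500 maps to 1492 and 1006 maps to 576; values at or
 * below the last plateau fall through to the function's fallback return.
 */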
1402 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1406 for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1407 if (old_mtu > mtu_plateau[i])
1408 return mtu_plateau[i];
1412 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1413 unsigned short new_mtu)
1416 unsigned short old_mtu = ntohs(iph->tot_len);
1418 __be32 skeys[2] = { iph->saddr, 0, };
1419 __be32 daddr = iph->daddr;
1420 unsigned short est_mtu = 0;
1422 if (ipv4_config.no_pmtu_disc)
1425 for (i = 0; i < 2; i++) {
1426 unsigned hash = rt_hash(daddr, skeys[i], 0);
1429 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1430 rth = rcu_dereference(rth->u.dst.rt_next)) {
1431 if (rth->fl.fl4_dst == daddr &&
1432 rth->fl.fl4_src == skeys[i] &&
1433 rth->rt_dst == daddr &&
1434 rth->rt_src == iph->saddr &&
1436 !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1437 rth->u.dst.dev->nd_net == net &&
1438 rth->rt_genid == atomic_read(&rt_genid)) {
1439 unsigned short mtu = new_mtu;
1441 if (new_mtu < 68 || new_mtu >= old_mtu) {
1443 /* BSD 4.2 compatibility hack :-( */
1445 old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1446 old_mtu >= 68 + (iph->ihl << 2))
1447 old_mtu -= iph->ihl << 2;
1449 mtu = guess_mtu(old_mtu);
1451 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1452 if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1453 dst_confirm(&rth->u.dst);
1454 if (mtu < ip_rt_min_pmtu) {
1455 mtu = ip_rt_min_pmtu;
1456 rth->u.dst.metrics[RTAX_LOCK-1] |=
1459 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1460 dst_set_expires(&rth->u.dst,
1469 return est_mtu ? : new_mtu;
1472 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1474 if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1475 !(dst_metric_locked(dst, RTAX_MTU))) {
1476 if (mtu < ip_rt_min_pmtu) {
1477 mtu = ip_rt_min_pmtu;
1478 dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1480 dst->metrics[RTAX_MTU-1] = mtu;
1481 dst_set_expires(dst, ip_rt_mtu_expires);
1482 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1486 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1491 static void ipv4_dst_destroy(struct dst_entry *dst)
1493 struct rtable *rt = (struct rtable *) dst;
1494 struct inet_peer *peer = rt->peer;
1495 struct in_device *idev = rt->idev;
1508 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1511 struct rtable *rt = (struct rtable *) dst;
1512 struct in_device *idev = rt->idev;
1513 if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1514 struct in_device *loopback_idev =
1515 in_dev_get(dev->nd_net->loopback_dev);
1516 if (loopback_idev) {
1517 rt->idev = loopback_idev;
1523 static void ipv4_link_failure(struct sk_buff *skb)
1527 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1529 rt = (struct rtable *) skb->dst;
1531 dst_set_expires(&rt->u.dst, 0);
1534 static int ip_rt_bug(struct sk_buff *skb)
1536 printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1537 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1538 skb->dev ? skb->dev->name : "?");
1544 We do not cache the source address of the outgoing interface,
1545 because it is used only by the IP RR, TS and SRR options,
1546 so it is out of the fast path.
1548 BTW remember: "addr" is allowed to be unaligned
1552 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1555 struct fib_result res;
1557 if (rt->fl.iif == 0)
1559 else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1560 src = FIB_RES_PREFSRC(res);
1563 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1565 memcpy(addr, &src, 4);
1568 #ifdef CONFIG_NET_CLS_ROUTE
1569 static void set_class_tag(struct rtable *rt, u32 tag)
1571 if (!(rt->u.dst.tclassid & 0xFFFF))
1572 rt->u.dst.tclassid |= tag & 0xFFFF;
1573 if (!(rt->u.dst.tclassid & 0xFFFF0000))
1574 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1578 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1580 struct fib_info *fi = res->fi;
1583 if (FIB_RES_GW(*res) &&
1584 FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1585 rt->rt_gateway = FIB_RES_GW(*res);
1586 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1587 sizeof(rt->u.dst.metrics));
1588 if (fi->fib_mtu == 0) {
1589 rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1590 if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1591 rt->rt_gateway != rt->rt_dst &&
1592 rt->u.dst.dev->mtu > 576)
1593 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1595 #ifdef CONFIG_NET_CLS_ROUTE
1596 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1599 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1601 if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1602 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1603 if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1604 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1605 if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1606 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1608 if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1609 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1611 #ifdef CONFIG_NET_CLS_ROUTE
1612 #ifdef CONFIG_IP_MULTIPLE_TABLES
1613 set_class_tag(rt, fib_rules_tclass(res));
1615 set_class_tag(rt, itag);
1617 rt->rt_type = res->type;
1620 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1621 u8 tos, struct net_device *dev, int our)
1626 struct in_device *in_dev = in_dev_get(dev);
1629 /* Primary sanity checks. */
1634 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1635 ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1638 if (ipv4_is_zeronet(saddr)) {
1639 if (!ipv4_is_local_multicast(daddr))
1641 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1642 } else if (fib_validate_source(saddr, 0, tos, 0,
1643 dev, &spec_dst, &itag) < 0)
1646 rth = dst_alloc(&ipv4_dst_ops);
1650 rth->u.dst.output= ip_rt_bug;
1652 atomic_set(&rth->u.dst.__refcnt, 1);
1653 rth->u.dst.flags= DST_HOST;
1654 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1655 rth->u.dst.flags |= DST_NOPOLICY;
1656 rth->fl.fl4_dst = daddr;
1657 rth->rt_dst = daddr;
1658 rth->fl.fl4_tos = tos;
1659 rth->fl.mark = skb->mark;
1660 rth->fl.fl4_src = saddr;
1661 rth->rt_src = saddr;
1662 #ifdef CONFIG_NET_CLS_ROUTE
1663 rth->u.dst.tclassid = itag;
1666 rth->fl.iif = dev->ifindex;
1667 rth->u.dst.dev = init_net.loopback_dev;
1668 dev_hold(rth->u.dst.dev);
1669 rth->idev = in_dev_get(rth->u.dst.dev);
1671 rth->rt_gateway = daddr;
1672 rth->rt_spec_dst= spec_dst;
1673 rth->rt_genid = atomic_read(&rt_genid);
1674 rth->rt_flags = RTCF_MULTICAST;
1675 rth->rt_type = RTN_MULTICAST;
1677 rth->u.dst.input= ip_local_deliver;
1678 rth->rt_flags |= RTCF_LOCAL;
1681 #ifdef CONFIG_IP_MROUTE
1682 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1683 rth->u.dst.input = ip_mr_input;
1685 RT_CACHE_STAT_INC(in_slow_mc);
1688 hash = rt_hash(daddr, saddr, dev->ifindex);
1689 return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1701 static void ip_handle_martian_source(struct net_device *dev,
1702 struct in_device *in_dev,
1703 struct sk_buff *skb,
1707 RT_CACHE_STAT_INC(in_martian_src);
1708 #ifdef CONFIG_IP_ROUTE_VERBOSE
1709 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1711 * RFC1812 recommendation: if the source is martian,
1712 * the only hint is the MAC header.
1714 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1715 "%u.%u.%u.%u, on dev %s\n",
1716 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1717 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1719 const unsigned char *p = skb_mac_header(skb);
1720 printk(KERN_WARNING "ll header: ");
1721 for (i = 0; i < dev->hard_header_len; i++, p++) {
1723 if (i < (dev->hard_header_len - 1))
1732 static inline int __mkroute_input(struct sk_buff *skb,
1733 struct fib_result* res,
1734 struct in_device *in_dev,
1735 __be32 daddr, __be32 saddr, u32 tos,
1736 struct rtable **result)
1741 struct in_device *out_dev;
1746 /* get a working reference to the output device */
1747 out_dev = in_dev_get(FIB_RES_DEV(*res));
1748 if (out_dev == NULL) {
1749 if (net_ratelimit())
1750 printk(KERN_CRIT "Bug in ip_route_input" \
1751 "_slow(). Please, report\n");
1756 err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1757 in_dev->dev, &spec_dst, &itag);
1759 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1767 flags |= RTCF_DIRECTSRC;
1769 if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1770 (IN_DEV_SHARED_MEDIA(out_dev) ||
1771 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1772 flags |= RTCF_DOREDIRECT;
1774 if (skb->protocol != htons(ETH_P_IP)) {
1775 /* Not IP (i.e. ARP). Do not create route, if it is
1776 * invalid for proxy arp. DNAT routes are always valid.
1778 if (out_dev == in_dev) {
1785 rth = dst_alloc(&ipv4_dst_ops);
1791 atomic_set(&rth->u.dst.__refcnt, 1);
1792 rth->u.dst.flags= DST_HOST;
1793 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1794 rth->u.dst.flags |= DST_NOPOLICY;
1795 if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1796 rth->u.dst.flags |= DST_NOXFRM;
1797 rth->fl.fl4_dst = daddr;
1798 rth->rt_dst = daddr;
1799 rth->fl.fl4_tos = tos;
1800 rth->fl.mark = skb->mark;
1801 rth->fl.fl4_src = saddr;
1802 rth->rt_src = saddr;
1803 rth->rt_gateway = daddr;
1805 rth->fl.iif = in_dev->dev->ifindex;
1806 rth->u.dst.dev = (out_dev)->dev;
1807 dev_hold(rth->u.dst.dev);
1808 rth->idev = in_dev_get(rth->u.dst.dev);
1810 rth->rt_spec_dst= spec_dst;
1812 rth->u.dst.input = ip_forward;
1813 rth->u.dst.output = ip_output;
1814 rth->rt_genid = atomic_read(&rt_genid);
1816 rt_set_nexthop(rth, res, itag);
1818 rth->rt_flags = flags;
1823 /* release the working reference to the output device */
1824 in_dev_put(out_dev);
1828 static inline int ip_mkroute_input(struct sk_buff *skb,
1829 struct fib_result* res,
1830 const struct flowi *fl,
1831 struct in_device *in_dev,
1832 __be32 daddr, __be32 saddr, u32 tos)
1834 struct rtable* rth = NULL;
1838 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1839 if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1840 fib_select_multipath(fl, res);
1843 /* create a routing cache entry */
1844 err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1848 /* put it into the cache */
1849 hash = rt_hash(daddr, saddr, fl->iif);
1850 return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1854 * NOTE. We drop all packets that have a local source
1855 * address, because every properly looped-back packet
1856 * must have the correct destination already attached by the output routine.
1858 * This approach solves two big problems:
1859 * 1. Non-simplex devices are handled properly.
1860 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1863 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1864 u8 tos, struct net_device *dev)
1866 struct fib_result res;
1867 struct in_device *in_dev = in_dev_get(dev);
1868 struct flowi fl = { .nl_u = { .ip4_u =
1872 .scope = RT_SCOPE_UNIVERSE,
1875 .iif = dev->ifindex };
1878 struct rtable * rth;
1883 struct net * net = dev->nd_net;
1885 /* IP on this device is disabled. */
1890 /* Check for the most weird martians, which cannot be detected
1894 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1895 ipv4_is_loopback(saddr))
1896 goto martian_source;
1898 if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1901 /* Accept zero addresses only to limited broadcast;
1902 * I do not even know whether to fix this or not. Waiting for complaints :-)
1904 if (ipv4_is_zeronet(saddr))
1905 goto martian_source;
1907 if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1908 ipv4_is_loopback(daddr))
1909 goto martian_destination;
1912 * Now we are ready to route packet.
1914 if ((err = fib_lookup(net, &fl, &res)) != 0) {
1915 if (!IN_DEV_FORWARD(in_dev))
1921 RT_CACHE_STAT_INC(in_slow_tot);
1923 if (res.type == RTN_BROADCAST)
1926 if (res.type == RTN_LOCAL) {
1928 result = fib_validate_source(saddr, daddr, tos,
1929 net->loopback_dev->ifindex,
1930 dev, &spec_dst, &itag);
1932 goto martian_source;
1934 flags |= RTCF_DIRECTSRC;
1939 if (!IN_DEV_FORWARD(in_dev))
1941 if (res.type != RTN_UNICAST)
1942 goto martian_destination;
1944 err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1952 if (skb->protocol != htons(ETH_P_IP))
1955 if (ipv4_is_zeronet(saddr))
1956 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1958 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1961 goto martian_source;
1963 flags |= RTCF_DIRECTSRC;
1965 flags |= RTCF_BROADCAST;
1966 res.type = RTN_BROADCAST;
1967 RT_CACHE_STAT_INC(in_brd);
1970 rth = dst_alloc(&ipv4_dst_ops);
1974 rth->u.dst.output= ip_rt_bug;
1975 rth->rt_genid = atomic_read(&rt_genid);
1977 atomic_set(&rth->u.dst.__refcnt, 1);
1978 rth->u.dst.flags= DST_HOST;
1979 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1980 rth->u.dst.flags |= DST_NOPOLICY;
1981 rth->fl.fl4_dst = daddr;
1982 rth->rt_dst = daddr;
1983 rth->fl.fl4_tos = tos;
1984 rth->fl.mark = skb->mark;
1985 rth->fl.fl4_src = saddr;
1986 rth->rt_src = saddr;
1987 #ifdef CONFIG_NET_CLS_ROUTE
1988 rth->u.dst.tclassid = itag;
1991 rth->fl.iif = dev->ifindex;
1992 rth->u.dst.dev = net->loopback_dev;
1993 dev_hold(rth->u.dst.dev);
1994 rth->idev = in_dev_get(rth->u.dst.dev);
1995 rth->rt_gateway = daddr;
1996 rth->rt_spec_dst= spec_dst;
1997 rth->u.dst.input= ip_local_deliver;
1998 rth->rt_flags = flags|RTCF_LOCAL;
1999 if (res.type == RTN_UNREACHABLE) {
2000 rth->u.dst.input= ip_error;
2001 rth->u.dst.error= -err;
2002 rth->rt_flags &= ~RTCF_LOCAL;
2004 rth->rt_type = res.type;
2005 hash = rt_hash(daddr, saddr, fl.iif);
2006 err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2010 RT_CACHE_STAT_INC(in_no_route);
2011 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2012 res.type = RTN_UNREACHABLE;
2018 * Do not cache martian addresses: they should be logged (RFC1812)
2020 martian_destination:
2021 RT_CACHE_STAT_INC(in_martian_dst);
2022 #ifdef CONFIG_IP_ROUTE_VERBOSE
2023 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2024 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2025 "%u.%u.%u.%u, dev %s\n",
2026 NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2030 err = -EHOSTUNREACH;
2042 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2046 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2047 u8 tos, struct net_device *dev)
2049 struct rtable * rth;
2051 int iif = dev->ifindex;
2055 tos &= IPTOS_RT_MASK;
2056 hash = rt_hash(daddr, saddr, iif);
2059 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2060 rth = rcu_dereference(rth->u.dst.rt_next)) {
2061 if (rth->fl.fl4_dst == daddr &&
2062 rth->fl.fl4_src == saddr &&
2063 rth->fl.iif == iif &&
2065 rth->fl.mark == skb->mark &&
2066 rth->fl.fl4_tos == tos &&
2067 rth->u.dst.dev->nd_net == net &&
2068 rth->rt_genid == atomic_read(&rt_genid)) {
2069 dst_use(&rth->u.dst, jiffies);
2070 RT_CACHE_STAT_INC(in_hit);
2072 skb->dst = (struct dst_entry*)rth;
2075 RT_CACHE_STAT_INC(in_hlist_search);
2079 /* Multicast recognition logic has been moved from the route cache to here.
2080 The problem was that too many Ethernet cards have broken/missing
2081 hardware multicast filters :-( As a result, a host on a multicast
2082 network acquires a lot of useless route cache entries, sort of
2083 SDR messages from all over the world. Now we try to get rid of them.
2084 Really, provided the software IP multicast filter is organized
2085 reasonably (at least, hashed), it does not result in a slowdown
2086 compared with route cache reject entries.
2087 Note that multicast routers are not affected, because a
2088 route cache entry is created for them eventually.
2090 if (ipv4_is_multicast(daddr)) {
2091 struct in_device *in_dev;
2094 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2095 int our = ip_check_mc(in_dev, daddr, saddr,
2096 ip_hdr(skb)->protocol);
2098 #ifdef CONFIG_IP_MROUTE
2099 || (!ipv4_is_local_multicast(daddr) &&
2100 IN_DEV_MFORWARD(in_dev))
2104 return ip_route_input_mc(skb, daddr, saddr,
2111 return ip_route_input_slow(skb, daddr, saddr, tos, dev);
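/*
 * Usage sketch (illustrative, not part of the original file): the receive
 * path typically resolves a route roughly like this from ip_rcv_finish():
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *
 *	if (ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;
 *	// skb->dst now points at the rtable; dst_input(skb) will dispatch to
 *	// ip_local_deliver, ip_forward, ip_mr_input, ... as set up above.
 */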
2114 static inline int __mkroute_output(struct rtable **result,
2115 struct fib_result* res,
2116 const struct flowi *fl,
2117 const struct flowi *oldflp,
2118 struct net_device *dev_out,
2122 struct in_device *in_dev;
2123 u32 tos = RT_FL_TOS(oldflp);
2126 if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2129 if (fl->fl4_dst == htonl(0xFFFFFFFF))
2130 res->type = RTN_BROADCAST;
2131 else if (ipv4_is_multicast(fl->fl4_dst))
2132 res->type = RTN_MULTICAST;
2133 else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2136 if (dev_out->flags & IFF_LOOPBACK)
2137 flags |= RTCF_LOCAL;
2139 /* get work reference to inet device */
2140 in_dev = in_dev_get(dev_out);
2144 if (res->type == RTN_BROADCAST) {
2145 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2147 fib_info_put(res->fi);
2150 } else if (res->type == RTN_MULTICAST) {
2151 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2152 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2154 flags &= ~RTCF_LOCAL;
2155 /* If a multicast route does not exist, use
2156 the default one, but do not gateway in this case.
2159 if (res->fi && res->prefixlen < 4) {
2160 fib_info_put(res->fi);
2166 rth = dst_alloc(&ipv4_dst_ops);
2172 atomic_set(&rth->u.dst.__refcnt, 1);
2173 rth->u.dst.flags= DST_HOST;
2174 if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2175 rth->u.dst.flags |= DST_NOXFRM;
2176 if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2177 rth->u.dst.flags |= DST_NOPOLICY;
2179 rth->fl.fl4_dst = oldflp->fl4_dst;
2180 rth->fl.fl4_tos = tos;
2181 rth->fl.fl4_src = oldflp->fl4_src;
2182 rth->fl.oif = oldflp->oif;
2183 rth->fl.mark = oldflp->mark;
2184 rth->rt_dst = fl->fl4_dst;
2185 rth->rt_src = fl->fl4_src;
2186 rth->rt_iif = oldflp->oif ? : dev_out->ifindex;
2187 /* get references to the devices that are to be held by the routing
2189 rth->u.dst.dev = dev_out;
2191 rth->idev = in_dev_get(dev_out);
2192 rth->rt_gateway = fl->fl4_dst;
2193 rth->rt_spec_dst= fl->fl4_src;
2195 rth->u.dst.output=ip_output;
2196 rth->rt_genid = atomic_read(&rt_genid);
2198 RT_CACHE_STAT_INC(out_slow_tot);
2200 if (flags & RTCF_LOCAL) {
2201 rth->u.dst.input = ip_local_deliver;
2202 rth->rt_spec_dst = fl->fl4_dst;
2204 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2205 rth->rt_spec_dst = fl->fl4_src;
2206 if (flags & RTCF_LOCAL &&
2207 !(dev_out->flags & IFF_LOOPBACK)) {
2208 rth->u.dst.output = ip_mc_output;
2209 RT_CACHE_STAT_INC(out_slow_mc);
2211 #ifdef CONFIG_IP_MROUTE
2212 if (res->type == RTN_MULTICAST) {
2213 if (IN_DEV_MFORWARD(in_dev) &&
2214 !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2215 rth->u.dst.input = ip_mr_input;
2216 rth->u.dst.output = ip_mc_output;
2222 rt_set_nexthop(rth, res, 0);
2224 rth->rt_flags = flags;
2228 /* release work reference to inet device */
2234 static inline int ip_mkroute_output(struct rtable **rp,
2235 struct fib_result* res,
2236 const struct flowi *fl,
2237 const struct flowi *oldflp,
2238 struct net_device *dev_out,
2241 struct rtable *rth = NULL;
2242 int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2245 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2246 err = rt_intern_hash(hash, rth, rp);
2253 * Major route resolver routine.
2256 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2257 const struct flowi *oldflp)
2259 u32 tos = RT_FL_TOS(oldflp);
2260 struct flowi fl = { .nl_u = { .ip4_u =
2261 { .daddr = oldflp->fl4_dst,
2262 .saddr = oldflp->fl4_src,
2263 .tos = tos & IPTOS_RT_MASK,
2264 .scope = ((tos & RTO_ONLINK) ?
2268 .mark = oldflp->mark,
2269 .iif = net->loopback_dev->ifindex,
2270 .oif = oldflp->oif };
2271 struct fib_result res;
2273 struct net_device *dev_out = NULL;
2279 #ifdef CONFIG_IP_MULTIPLE_TABLES
2283 if (oldflp->fl4_src) {
2285 if (ipv4_is_multicast(oldflp->fl4_src) ||
2286 ipv4_is_lbcast(oldflp->fl4_src) ||
2287 ipv4_is_zeronet(oldflp->fl4_src))
2290 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2291 dev_out = ip_dev_find(net, oldflp->fl4_src);
2292 if (dev_out == NULL)
2295 /* I removed check for oif == dev_out->oif here.
2296 It was wrong for two reasons:
2297 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2298 is assigned to multiple interfaces.
2299 2. Moreover, we are allowed to send packets with saddr
2300 of another iface. --ANK
2303 if (oldflp->oif == 0
2304 && (ipv4_is_multicast(oldflp->fl4_dst) ||
2305 oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2306 /* Special hack: user can direct multicasts
2307 and limited broadcast via necessary interface
2308 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2309 This hack is not just for fun, it allows
2310 vic,vat and friends to work.
2311 They bind socket to loopback, set ttl to zero
2312 and expect that it will work.
2313 From the viewpoint of routing cache they are broken,
2314 because we are not allowed to build multicast path
2315 with loopback source addr (look, routing cache
2316 cannot know, that ttl is zero, so that packet
2317 will not leave this host and route is valid).
2318 Luckily, this hack is a good workaround.
2321 fl.oif = dev_out->ifindex;
2331 dev_out = dev_get_by_index(net, oldflp->oif);
2333 if (dev_out == NULL)
2336 /* RACE: Check return value of inet_select_addr instead. */
2337 if (__in_dev_get_rtnl(dev_out) == NULL) {
2339 goto out; /* Wrong error code */
2342 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2343 oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2345 fl.fl4_src = inet_select_addr(dev_out, 0,
2350 if (ipv4_is_multicast(oldflp->fl4_dst))
2351 fl.fl4_src = inet_select_addr(dev_out, 0,
2353 else if (!oldflp->fl4_dst)
2354 fl.fl4_src = inet_select_addr(dev_out, 0,
2360 fl.fl4_dst = fl.fl4_src;
2362 fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2365 dev_out = net->loopback_dev;
2367 fl.oif = net->loopback_dev->ifindex;
2368 res.type = RTN_LOCAL;
2369 flags |= RTCF_LOCAL;
2373 if (fib_lookup(net, &fl, &res)) {
2376 /* Apparently, the routing tables are wrong. Assume
2377 that the destination is on-link.
2380 Because we are allowed to send to an iface
2381 even if it has NO routes and NO assigned
2382 addresses. When oif is specified, the routing
2383 tables are looked up with only one purpose:
2384 to check whether the destination is gatewayed rather than
2385 direct. Moreover, if MSG_DONTROUTE is set,
2386 we send the packet, ignoring both the routing tables
2387 and the ifaddr state. --ANK
2390 We could make it even if oif is unknown,
2391 likely IPv6, but we do not.
2394 if (fl.fl4_src == 0)
2395 fl.fl4_src = inet_select_addr(dev_out, 0,
2397 res.type = RTN_UNICAST;
2407 if (res.type == RTN_LOCAL) {
2409 fl.fl4_src = fl.fl4_dst;
2412 dev_out = net->loopback_dev;
2414 fl.oif = dev_out->ifindex;
2416 fib_info_put(res.fi);
2418 flags |= RTCF_LOCAL;
2422 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2423 if (res.fi->fib_nhs > 1 && fl.oif == 0)
2424 fib_select_multipath(&fl, &res);
2427 if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2428 fib_select_default(net, &fl, &res);
2431 fl.fl4_src = FIB_RES_PREFSRC(res);
2435 dev_out = FIB_RES_DEV(res);
2437 fl.oif = dev_out->ifindex;
2441 err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2451 int __ip_route_output_key(struct net *net, struct rtable **rp,
2452 const struct flowi *flp)
2457 hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2460 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2461 rth = rcu_dereference(rth->u.dst.rt_next)) {
2462 if (rth->fl.fl4_dst == flp->fl4_dst &&
2463 rth->fl.fl4_src == flp->fl4_src &&
2465 rth->fl.oif == flp->oif &&
2466 rth->fl.mark == flp->mark &&
2467 !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2468 (IPTOS_RT_MASK | RTO_ONLINK)) &&
2469 rth->u.dst.dev->nd_net == net &&
2470 rth->rt_genid == atomic_read(&rt_genid)) {
2471 dst_use(&rth->u.dst, jiffies);
2472 RT_CACHE_STAT_INC(out_hit);
2473 rcu_read_unlock_bh();
2477 RT_CACHE_STAT_INC(out_hlist_search);
2479 rcu_read_unlock_bh();
2481 return ip_route_output_slow(net, rp, flp);
2484 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2486 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2490 static struct dst_ops ipv4_dst_blackhole_ops = {
2492 .protocol = __constant_htons(ETH_P_IP),
2493 .destroy = ipv4_dst_destroy,
2494 .check = ipv4_dst_check,
2495 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2496 .entry_size = sizeof(struct rtable),
2497 .entries = ATOMIC_INIT(0),
2501 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2503 struct rtable *ort = *rp;
2504 struct rtable *rt = (struct rtable *)
2505 dst_alloc(&ipv4_dst_blackhole_ops);
2508 struct dst_entry *new = &rt->u.dst;
2510 atomic_set(&new->__refcnt, 1);
2512 new->input = dst_discard;
2513 new->output = dst_discard;
2514 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2516 new->dev = ort->u.dst.dev;
2522 rt->idev = ort->idev;
2524 in_dev_hold(rt->idev);
2525 rt->rt_genid = atomic_read(&rt_genid);
2526 rt->rt_flags = ort->rt_flags;
2527 rt->rt_type = ort->rt_type;
2528 rt->rt_dst = ort->rt_dst;
2529 rt->rt_src = ort->rt_src;
2530 rt->rt_iif = ort->rt_iif;
2531 rt->rt_gateway = ort->rt_gateway;
2532 rt->rt_spec_dst = ort->rt_spec_dst;
2533 rt->peer = ort->peer;
2535 atomic_inc(&rt->peer->refcnt);
2540 dst_release(&(*rp)->u.dst);
2542 return (rt ? 0 : -ENOMEM);
2545 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2546 struct sock *sk, int flags)
2550 if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2555 flp->fl4_src = (*rp)->rt_src;
2557 flp->fl4_dst = (*rp)->rt_dst;
2558 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2559 flags ? XFRM_LOOKUP_WAIT : 0);
2560 if (err == -EREMOTE)
2561 err = ipv4_dst_blackhole(rp, flp, sk);
2569 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2571 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2573 return ip_route_output_flow(net, rp, flp, NULL, 0);
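/*
 * A minimal usage sketch for the lookup API above (hypothetical caller,
 * kept under #if 0 so it is never built): fill in a struct flowi key,
 * call ip_route_output_key() and drop the reference with ip_rt_put()
 * when the route is no longer needed.
 */
#if 0
static int example_output_lookup(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = daddr,
						 .saddr = saddr } } };
	struct rtable *rt;
	int err;

	err = ip_route_output_key(net, &rt, &fl);
	if (err)
		return err;		/* e.g. -ENETUNREACH: no route */

	/* ... transmit via rt->u.dst ... */

	ip_rt_put(rt);
	return 0;
}
#endif

/*
 * rt_fill_info() renders one cached rtable as an RTM_NEWROUTE netlink
 * message for the requester identified by pid/seq.
 */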
2576 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2577 int nowait, unsigned int flags)
2579 struct rtable *rt = (struct rtable*)skb->dst;
2581 struct nlmsghdr *nlh;
2583 u32 id = 0, ts = 0, tsage = 0, error;
2585 nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2589 r = nlmsg_data(nlh);
2590 r->rtm_family = AF_INET;
2591 r->rtm_dst_len = 32;
2593 r->rtm_tos = rt->fl.fl4_tos;
2594 r->rtm_table = RT_TABLE_MAIN;
2595 NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2596 r->rtm_type = rt->rt_type;
2597 r->rtm_scope = RT_SCOPE_UNIVERSE;
2598 r->rtm_protocol = RTPROT_UNSPEC;
2599 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2600 if (rt->rt_flags & RTCF_NOTIFY)
2601 r->rtm_flags |= RTM_F_NOTIFY;
2603 NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2605 if (rt->fl.fl4_src) {
2606 r->rtm_src_len = 32;
2607 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2610 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2611 #ifdef CONFIG_NET_CLS_ROUTE
2612 if (rt->u.dst.tclassid)
2613 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2616 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2617 else if (rt->rt_src != rt->fl.fl4_src)
2618 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2620 if (rt->rt_dst != rt->rt_gateway)
2621 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2623 if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2624 goto nla_put_failure;
2626 error = rt->u.dst.error;
2627 expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2629 id = rt->peer->ip_id_count;
2630 if (rt->peer->tcp_ts_stamp) {
2631 ts = rt->peer->tcp_ts;
2632 tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2637 #ifdef CONFIG_IP_MROUTE
2638 __be32 dst = rt->rt_dst;
2640 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2641 IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2642 int err = ipmr_get_route(skb, r, nowait);
2647 goto nla_put_failure;
2649 if (err == -EMSGSIZE)
2650 goto nla_put_failure;
2656 NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2659 if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2660 expires, error) < 0)
2661 goto nla_put_failure;
2663 return nlmsg_end(skb, nlh);
2666 nlmsg_cancel(skb, nlh);
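/*
 * inet_rtm_getroute() services RTM_GETROUTE requests (what "ip route
 * get" sends): it builds a dummy skb, then either replays the input
 * path with ip_route_input() when RTA_IIF is given or performs an
 * output lookup, and answers with rt_fill_info() via rtnl_unicast().
 */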
2670 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2672 struct net *net = in_skb->sk->sk_net;
2674 struct nlattr *tb[RTA_MAX+1];
2675 struct rtable *rt = NULL;
2680 struct sk_buff *skb;
2682 if (net != &init_net)
2685 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2689 rtm = nlmsg_data(nlh);
2691 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2697 /* Reserve room for dummy headers; this skb can pass
2698    through a good chunk of the routing engine.
2700 skb_reset_mac_header(skb);
2701 skb_reset_network_header(skb);
2703 /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2704 ip_hdr(skb)->protocol = IPPROTO_ICMP;
2705 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2707 src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2708 dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2709 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2712 struct net_device *dev;
2714 dev = __dev_get_by_index(&init_net, iif);
2720 skb->protocol = htons(ETH_P_IP);
2723 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2726 rt = (struct rtable*) skb->dst;
2727 if (err == 0 && rt->u.dst.error)
2728 err = -rt->u.dst.error;
2735 .tos = rtm->rtm_tos,
2738 .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2740 err = ip_route_output_key(&init_net, &rt, &fl);
2746 skb->dst = &rt->u.dst;
2747 if (rtm->rtm_flags & RTM_F_NOTIFY)
2748 rt->rt_flags |= RTCF_NOTIFY;
2750 err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2751 RTM_NEWROUTE, 0, 0);
2755 err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
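/*
 * ip_rt_dump() walks every bucket of the route cache under
 * rcu_read_lock_bh() and emits one RTM_NEWROUTE message per entry of
 * the current generation, resuming a multi-part dump from the bucket
 * and index saved in cb->args[].
 */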
2764 int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
2773 s_idx = idx = cb->args[1];
2774 for (h = s_h; h <= rt_hash_mask; h++) {
2776 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2777 rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2780 if (rt->rt_genid != atomic_read(&rt_genid))
2782 skb->dst = dst_clone(&rt->u.dst);
2783 if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2784 cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2785 1, NLM_F_MULTI) <= 0) {
2786 dst_release(xchg(&skb->dst, NULL));
2787 rcu_read_unlock_bh();
2790 dst_release(xchg(&skb->dst, NULL));
2792 rcu_read_unlock_bh();
2802 void ip_rt_multicast_event(struct in_device *in_dev)
2807 #ifdef CONFIG_SYSCTL
2808 static int flush_delay;
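/*
 * Writing an integer to /proc/sys/net/ipv4/route/flush flushes the
 * routing cache; the written value is handed to rt_cache_flush() as
 * the flush delay.
 */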
2810 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2811 struct file *filp, void __user *buffer,
2812 size_t *lenp, loff_t *ppos)
2815 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2816 rt_cache_flush(flush_delay);
2823 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2826 void __user *oldval,
2827 size_t __user *oldlenp,
2828 void __user *newval,
2832 if (newlen != sizeof(int))
2834 if (get_user(delay, (int __user *)newval))
2836 rt_cache_flush(delay);
2840 ctl_table ipv4_route_table[] = {
2842 .ctl_name = NET_IPV4_ROUTE_FLUSH,
2843 .procname = "flush",
2844 .data = &flush_delay,
2845 .maxlen = sizeof(int),
2847 .proc_handler = &ipv4_sysctl_rtcache_flush,
2848 .strategy = &ipv4_sysctl_rtcache_flush_strategy,
2851 .ctl_name = NET_IPV4_ROUTE_GC_THRESH,
2852 .procname = "gc_thresh",
2853 .data = &ipv4_dst_ops.gc_thresh,
2854 .maxlen = sizeof(int),
2856 .proc_handler = &proc_dointvec,
2859 .ctl_name = NET_IPV4_ROUTE_MAX_SIZE,
2860 .procname = "max_size",
2861 .data = &ip_rt_max_size,
2862 .maxlen = sizeof(int),
2864 .proc_handler = &proc_dointvec,
2867 /* Deprecated. Use gc_min_interval_ms */
2869 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2870 .procname = "gc_min_interval",
2871 .data = &ip_rt_gc_min_interval,
2872 .maxlen = sizeof(int),
2874 .proc_handler = &proc_dointvec_jiffies,
2875 .strategy = &sysctl_jiffies,
2878 .ctl_name = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2879 .procname = "gc_min_interval_ms",
2880 .data = &ip_rt_gc_min_interval,
2881 .maxlen = sizeof(int),
2883 .proc_handler = &proc_dointvec_ms_jiffies,
2884 .strategy = &sysctl_ms_jiffies,
2887 .ctl_name = NET_IPV4_ROUTE_GC_TIMEOUT,
2888 .procname = "gc_timeout",
2889 .data = &ip_rt_gc_timeout,
2890 .maxlen = sizeof(int),
2892 .proc_handler = &proc_dointvec_jiffies,
2893 .strategy = &sysctl_jiffies,
2896 .ctl_name = NET_IPV4_ROUTE_GC_INTERVAL,
2897 .procname = "gc_interval",
2898 .data = &ip_rt_gc_interval,
2899 .maxlen = sizeof(int),
2901 .proc_handler = &proc_dointvec_jiffies,
2902 .strategy = &sysctl_jiffies,
2905 .ctl_name = NET_IPV4_ROUTE_REDIRECT_LOAD,
2906 .procname = "redirect_load",
2907 .data = &ip_rt_redirect_load,
2908 .maxlen = sizeof(int),
2910 .proc_handler = &proc_dointvec,
2913 .ctl_name = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2914 .procname = "redirect_number",
2915 .data = &ip_rt_redirect_number,
2916 .maxlen = sizeof(int),
2918 .proc_handler = &proc_dointvec,
2921 .ctl_name = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2922 .procname = "redirect_silence",
2923 .data = &ip_rt_redirect_silence,
2924 .maxlen = sizeof(int),
2926 .proc_handler = &proc_dointvec,
2929 .ctl_name = NET_IPV4_ROUTE_ERROR_COST,
2930 .procname = "error_cost",
2931 .data = &ip_rt_error_cost,
2932 .maxlen = sizeof(int),
2934 .proc_handler = &proc_dointvec,
2937 .ctl_name = NET_IPV4_ROUTE_ERROR_BURST,
2938 .procname = "error_burst",
2939 .data = &ip_rt_error_burst,
2940 .maxlen = sizeof(int),
2942 .proc_handler = &proc_dointvec,
2945 .ctl_name = NET_IPV4_ROUTE_GC_ELASTICITY,
2946 .procname = "gc_elasticity",
2947 .data = &ip_rt_gc_elasticity,
2948 .maxlen = sizeof(int),
2950 .proc_handler = &proc_dointvec,
2953 .ctl_name = NET_IPV4_ROUTE_MTU_EXPIRES,
2954 .procname = "mtu_expires",
2955 .data = &ip_rt_mtu_expires,
2956 .maxlen = sizeof(int),
2958 .proc_handler = &proc_dointvec_jiffies,
2959 .strategy = &sysctl_jiffies,
2962 .ctl_name = NET_IPV4_ROUTE_MIN_PMTU,
2963 .procname = "min_pmtu",
2964 .data = &ip_rt_min_pmtu,
2965 .maxlen = sizeof(int),
2967 .proc_handler = &proc_dointvec,
2970 .ctl_name = NET_IPV4_ROUTE_MIN_ADVMSS,
2971 .procname = "min_adv_mss",
2972 .data = &ip_rt_min_advmss,
2973 .maxlen = sizeof(int),
2975 .proc_handler = &proc_dointvec,
2978 .ctl_name = NET_IPV4_ROUTE_SECRET_INTERVAL,
2979 .procname = "secret_interval",
2980 .data = &ip_rt_secret_interval,
2981 .maxlen = sizeof(int),
2983 .proc_handler = &proc_dointvec_jiffies,
2984 .strategy = &sysctl_jiffies,
2990 #ifdef CONFIG_NET_CLS_ROUTE
2991 struct ip_rt_acct *ip_rt_acct __read_mostly;
2992 #endif /* CONFIG_NET_CLS_ROUTE */
2994 static __initdata unsigned long rhash_entries;
2995 static int __init set_rhash_entries(char *str)
2999 rhash_entries = simple_strtoul(str, &str, 0);
3002 __setup("rhash_entries=", set_rhash_entries);
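/*
 * ip_rt_init(): boot-time setup of the routing cache.  Seeds rt_genid,
 * allocates the per-CPU accounting area (CONFIG_NET_CLS_ROUTE), creates
 * the "ip_dst_cache" slab, sizes the hash table from available memory
 * (or the "rhash_entries=" boot parameter handled above), derives the
 * GC threshold and maximum cache size from the table size, starts the
 * periodic expiry work and the secret-rebuild timer, and registers the
 * /proc files and the RTM_GETROUTE handler.
 */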
3004 int __init ip_rt_init(void)
3008 atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3009 (jiffies ^ (jiffies >> 7))));
3011 #ifdef CONFIG_NET_CLS_ROUTE
3012 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3014 panic("IP: failed to allocate ip_rt_acct\n");
3017 ipv4_dst_ops.kmem_cachep =
3018 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3019 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3021 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3023 rt_hash_table = (struct rt_hash_bucket *)
3024 alloc_large_system_hash("IP route cache",
3025 sizeof(struct rt_hash_bucket),
3027 (num_physpages >= 128 * 1024) ?
3033 memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3034 rt_hash_lock_init();
3036 ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3037 ip_rt_max_size = (rt_hash_mask + 1) * 16;
3042 setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3044 /* All the timers started at system startup tend
3045    to synchronize. Perturb them a bit.
3047 schedule_delayed_work(&expires_work,
3048 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3050 rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3051 ip_rt_secret_interval;
3052 add_timer(&rt_secret_timer);
3054 if (ip_rt_proc_init(&init_net))
3055 printk(KERN_ERR "Unable to create route proc files\n");
3060 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3065 EXPORT_SYMBOL(__ip_select_ident);
3066 EXPORT_SYMBOL(ip_route_input);
3067 EXPORT_SYMBOL(ip_route_output_key);