[NETNS]: Add namespace parameter to ip_route_output_key.
net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD;
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_min_delay              = 2 * HZ;
121 static int ip_rt_max_delay              = 10 * HZ;
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval            = 60 * HZ;
125 static int ip_rt_gc_min_interval        = HZ / 2;
126 static int ip_rt_redirect_number        = 9;
127 static int ip_rt_redirect_load          = HZ / 50;
128 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost             = HZ;
130 static int ip_rt_error_burst            = 5 * HZ;
131 static int ip_rt_gc_elasticity          = 8;
132 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu               = 512 + 20 + 20;
134 static int ip_rt_min_advmss             = 256;
135 static int ip_rt_secret_interval        = 10 * 60 * HZ;
136 static int ip_rt_flush_expected;
137 static unsigned long rt_deadline;
138
139 #define RTprint(a...)   printk(KERN_DEBUG a)
140
141 static struct timer_list rt_flush_timer;
142 static void rt_worker_func(struct work_struct *work);
143 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
144 static struct timer_list rt_secret_timer;
145
146 /*
147  *      Interface to generic destination cache.
148  */
149
150 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151 static void              ipv4_dst_destroy(struct dst_entry *dst);
152 static void              ipv4_dst_ifdown(struct dst_entry *dst,
153                                          struct net_device *dev, int how);
154 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155 static void              ipv4_link_failure(struct sk_buff *skb);
156 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157 static int rt_garbage_collect(struct dst_ops *ops);
158
159
160 static struct dst_ops ipv4_dst_ops = {
161         .family =               AF_INET,
162         .protocol =             __constant_htons(ETH_P_IP),
163         .gc =                   rt_garbage_collect,
164         .check =                ipv4_dst_check,
165         .destroy =              ipv4_dst_destroy,
166         .ifdown =               ipv4_dst_ifdown,
167         .negative_advice =      ipv4_negative_advice,
168         .link_failure =         ipv4_link_failure,
169         .update_pmtu =          ip_rt_update_pmtu,
170         .local_out =            ip_local_out,
171         .entry_size =           sizeof(struct rtable),
172 };
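/*
 * Editorial sketch (assumption, not part of the original file): these
 * hooks are reached indirectly through the generic dst layer, never by
 * name.  A caller holding a struct dst_entry would dispatch roughly
 * like this:
 *
 *	if (dst->obsolete && dst->ops->check(dst, cookie) == NULL)
 *		... route is stale, re-resolve ...
 *	dst->ops->update_pmtu(dst, new_mtu);	(e.g. on ICMP frag-needed)
 *
 * so ipv4_dst_check() and ip_rt_update_pmtu() above only ever run via
 * this ops table.
 */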
173
174 #define ECN_OR_COST(class)      TC_PRIO_##class
175
176 const __u8 ip_tos2prio[16] = {
177         TC_PRIO_BESTEFFORT,
178         ECN_OR_COST(FILLER),
179         TC_PRIO_BESTEFFORT,
180         ECN_OR_COST(BESTEFFORT),
181         TC_PRIO_BULK,
182         ECN_OR_COST(BULK),
183         TC_PRIO_BULK,
184         ECN_OR_COST(BULK),
185         TC_PRIO_INTERACTIVE,
186         ECN_OR_COST(INTERACTIVE),
187         TC_PRIO_INTERACTIVE,
188         ECN_OR_COST(INTERACTIVE),
189         TC_PRIO_INTERACTIVE_BULK,
190         ECN_OR_COST(INTERACTIVE_BULK),
191         TC_PRIO_INTERACTIVE_BULK,
192         ECN_OR_COST(INTERACTIVE_BULK)
193 };
194
195
196 /*
197  * Route cache.
198  */
199
200 /* The locking scheme is rather straightforward:
201  *
202  * 1) Read-Copy Update protects the buckets of the central route hash.
203  * 2) Only writers remove entries, and they hold the lock
204  *    as they look at rtable reference counts.
205  * 3) Only readers acquire references to rtable entries,
206  *    they do so with atomic increments and with the
207  *    lock held.
208  */
209
210 struct rt_hash_bucket {
211         struct rtable   *chain;
212 };
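/*
 * Editorial sketch of the reader side described above (illustrative
 * only; the real lookups later in this file also compare full keys and
 * do per-entry accounting, and flp is a hypothetical flow pointer).
 * A lockless lookup walks one chain under RCU and takes a reference
 * with an atomic increment:
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next)) {
 *		if (compare_keys(&rth->fl, flp)) {
 *			dst_hold(&rth->u.dst);
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 */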
213 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
214         defined(CONFIG_PROVE_LOCKING)
215 /*
216  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
217  * The size of this table is a power of two and depends on the number of CPUs.
218  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
219  */
220 #ifdef CONFIG_LOCKDEP
221 # define RT_HASH_LOCK_SZ        256
222 #else
223 # if NR_CPUS >= 32
224 #  define RT_HASH_LOCK_SZ       4096
225 # elif NR_CPUS >= 16
226 #  define RT_HASH_LOCK_SZ       2048
227 # elif NR_CPUS >= 8
228 #  define RT_HASH_LOCK_SZ       1024
229 # elif NR_CPUS >= 4
230 #  define RT_HASH_LOCK_SZ       512
231 # else
232 #  define RT_HASH_LOCK_SZ       256
233 # endif
234 #endif
235
236 static spinlock_t       *rt_hash_locks;
237 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
238
239 static __init void rt_hash_lock_init(void)
240 {
241         int i;
242
243         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
244                         GFP_KERNEL);
245         if (!rt_hash_locks)
246                 panic("IP: failed to allocate rt_hash_locks\n");
247
248         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
249                 spin_lock_init(&rt_hash_locks[i]);
250 }
251 #else
252 # define rt_hash_lock_addr(slot) NULL
253
254 static inline void rt_hash_lock_init(void)
255 {
256 }
257 #endif
258
259 static struct rt_hash_bucket    *rt_hash_table;
260 static unsigned                 rt_hash_mask;
261 static unsigned int             rt_hash_log;
262 static unsigned int             rt_hash_rnd;
263
264 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
265 #define RT_CACHE_STAT_INC(field) \
266         (__raw_get_cpu_var(rt_cache_stat).field++)
267
268 static int rt_intern_hash(unsigned hash, struct rtable *rth,
269                                 struct rtable **res);
270
271 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
272 {
273         return (jhash_2words(daddr, saddr, rt_hash_rnd)
274                 & rt_hash_mask);
275 }
276
277 #define rt_hash(daddr, saddr, idx) \
278         rt_hash_code((__force u32)(__be32)(daddr),\
279                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
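/*
 * Editorial example (illustrative; flp is a hypothetical flow pointer):
 * the lookup and insert paths below derive a bucket index from the
 * destination, source and interface index of a flow, e.g.
 *
 *	unsigned hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
 *	struct rtable *head = rt_hash_table[hash].chain;
 *
 * The interface index is mixed into the source word, so routes that
 * differ only in oif land in different buckets.
 */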
280
281 #ifdef CONFIG_PROC_FS
282 struct rt_cache_iter_state {
283         int bucket;
284 };
285
286 static struct rtable *rt_cache_get_first(struct seq_file *seq)
287 {
288         struct rtable *r = NULL;
289         struct rt_cache_iter_state *st = seq->private;
290
291         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
292                 rcu_read_lock_bh();
293                 r = rt_hash_table[st->bucket].chain;
294                 if (r)
295                         break;
296                 rcu_read_unlock_bh();
297         }
298         return rcu_dereference(r);
299 }
300
301 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
302 {
303         struct rt_cache_iter_state *st = seq->private;
304
305         r = r->u.dst.rt_next;
306         while (!r) {
307                 rcu_read_unlock_bh();
308                 if (--st->bucket < 0)
309                         break;
310                 rcu_read_lock_bh();
311                 r = rt_hash_table[st->bucket].chain;
312         }
313         return rcu_dereference(r);
314 }
315
316 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
317 {
318         struct rtable *r = rt_cache_get_first(seq);
319
320         if (r)
321                 while (pos && (r = rt_cache_get_next(seq, r)))
322                         --pos;
323         return pos ? NULL : r;
324 }
325
326 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
327 {
328         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
329 }
330
331 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
332 {
333         struct rtable *r = NULL;
334
335         if (v == SEQ_START_TOKEN)
336                 r = rt_cache_get_first(seq);
337         else
338                 r = rt_cache_get_next(seq, v);
339         ++*pos;
340         return r;
341 }
342
343 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
344 {
345         if (v && v != SEQ_START_TOKEN)
346                 rcu_read_unlock_bh();
347 }
348
349 static int rt_cache_seq_show(struct seq_file *seq, void *v)
350 {
351         if (v == SEQ_START_TOKEN)
352                 seq_printf(seq, "%-127s\n",
353                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
354                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
355                            "HHUptod\tSpecDst");
356         else {
357                 struct rtable *r = v;
358                 char temp[256];
359
360                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
361                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
362                         r->u.dst.dev ? r->u.dst.dev->name : "*",
363                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
364                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
365                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
366                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
367                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
368                         dst_metric(&r->u.dst, RTAX_WINDOW),
369                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
370                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
371                         r->fl.fl4_tos,
372                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
373                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
374                                        dev_queue_xmit) : 0,
375                         r->rt_spec_dst);
376                 seq_printf(seq, "%-127s\n", temp);
377         }
378         return 0;
379 }
380
381 static const struct seq_operations rt_cache_seq_ops = {
382         .start  = rt_cache_seq_start,
383         .next   = rt_cache_seq_next,
384         .stop   = rt_cache_seq_stop,
385         .show   = rt_cache_seq_show,
386 };
387
388 static int rt_cache_seq_open(struct inode *inode, struct file *file)
389 {
390         return seq_open_private(file, &rt_cache_seq_ops,
391                         sizeof(struct rt_cache_iter_state));
392 }
393
394 static const struct file_operations rt_cache_seq_fops = {
395         .owner   = THIS_MODULE,
396         .open    = rt_cache_seq_open,
397         .read    = seq_read,
398         .llseek  = seq_lseek,
399         .release = seq_release_private,
400 };
401
402
403 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
404 {
405         int cpu;
406
407         if (*pos == 0)
408                 return SEQ_START_TOKEN;
409
410         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
411                 if (!cpu_possible(cpu))
412                         continue;
413                 *pos = cpu+1;
414                 return &per_cpu(rt_cache_stat, cpu);
415         }
416         return NULL;
417 }
418
419 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
420 {
421         int cpu;
422
423         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
424                 if (!cpu_possible(cpu))
425                         continue;
426                 *pos = cpu+1;
427                 return &per_cpu(rt_cache_stat, cpu);
428         }
429         return NULL;
430
431 }
432
433 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
434 {
435
436 }
437
438 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
439 {
440         struct rt_cache_stat *st = v;
441
442         if (v == SEQ_START_TOKEN) {
443                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
444                 return 0;
445         }
446
447         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
448                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
449                    atomic_read(&ipv4_dst_ops.entries),
450                    st->in_hit,
451                    st->in_slow_tot,
452                    st->in_slow_mc,
453                    st->in_no_route,
454                    st->in_brd,
455                    st->in_martian_dst,
456                    st->in_martian_src,
457
458                    st->out_hit,
459                    st->out_slow_tot,
460                    st->out_slow_mc,
461
462                    st->gc_total,
463                    st->gc_ignored,
464                    st->gc_goal_miss,
465                    st->gc_dst_overflow,
466                    st->in_hlist_search,
467                    st->out_hlist_search
468                 );
469         return 0;
470 }
471
472 static const struct seq_operations rt_cpu_seq_ops = {
473         .start  = rt_cpu_seq_start,
474         .next   = rt_cpu_seq_next,
475         .stop   = rt_cpu_seq_stop,
476         .show   = rt_cpu_seq_show,
477 };
478
479
480 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
481 {
482         return seq_open(file, &rt_cpu_seq_ops);
483 }
484
485 static const struct file_operations rt_cpu_seq_fops = {
486         .owner   = THIS_MODULE,
487         .open    = rt_cpu_seq_open,
488         .read    = seq_read,
489         .llseek  = seq_lseek,
490         .release = seq_release,
491 };
492
493 #ifdef CONFIG_NET_CLS_ROUTE
494 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
495                            int length, int *eof, void *data)
496 {
497         unsigned int i;
498
499         if ((offset & 3) || (length & 3))
500                 return -EIO;
501
502         if (offset >= sizeof(struct ip_rt_acct) * 256) {
503                 *eof = 1;
504                 return 0;
505         }
506
507         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
508                 length = sizeof(struct ip_rt_acct) * 256 - offset;
509                 *eof = 1;
510         }
511
512         offset /= sizeof(u32);
513
514         if (length > 0) {
515                 u32 *dst = (u32 *) buffer;
516
517                 *start = buffer;
518                 memset(dst, 0, length);
519
520                 for_each_possible_cpu(i) {
521                         unsigned int j;
522                         u32 *src;
523
524                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
525                         for (j = 0; j < length/4; j++)
526                                 dst[j] += src[j];
527                 }
528         }
529         return length;
530 }
531 #endif
532
533 static __init int ip_rt_proc_init(struct net *net)
534 {
535         struct proc_dir_entry *pde;
536
537         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
538                         &rt_cache_seq_fops);
539         if (!pde)
540                 goto err1;
541
542         pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
543         if (!pde)
544                 goto err2;
545
546         pde->proc_fops = &rt_cpu_seq_fops;
547
548 #ifdef CONFIG_NET_CLS_ROUTE
549         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
550                         ip_rt_acct_read, NULL);
551         if (!pde)
552                 goto err3;
553 #endif
554         return 0;
555
556 #ifdef CONFIG_NET_CLS_ROUTE
557 err3:
558         remove_proc_entry("rt_cache", net->proc_net_stat);
559 #endif
560 err2:
561         remove_proc_entry("rt_cache", net->proc_net);
562 err1:
563         return -ENOMEM;
564 }
565 #else
566 static inline int ip_rt_proc_init(struct net *net)
567 {
568         return 0;
569 }
570 #endif /* CONFIG_PROC_FS */
571
572 static __inline__ void rt_free(struct rtable *rt)
573 {
574         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
575 }
576
577 static __inline__ void rt_drop(struct rtable *rt)
578 {
579         ip_rt_put(rt);
580         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
581 }
582
583 static __inline__ int rt_fast_clean(struct rtable *rth)
584 {
585         /* Kill broadcast/multicast entries very aggressively, if they
586            collide in the hash table with more useful entries */
587         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
588                 rth->fl.iif && rth->u.dst.rt_next;
589 }
590
591 static __inline__ int rt_valuable(struct rtable *rth)
592 {
593         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
594                 rth->u.dst.expires;
595 }
596
597 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
598 {
599         unsigned long age;
600         int ret = 0;
601
602         if (atomic_read(&rth->u.dst.__refcnt))
603                 goto out;
604
605         ret = 1;
606         if (rth->u.dst.expires &&
607             time_after_eq(jiffies, rth->u.dst.expires))
608                 goto out;
609
610         age = jiffies - rth->u.dst.lastuse;
611         ret = 0;
612         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
613             (age <= tmo2 && rt_valuable(rth)))
614                 goto out;
615         ret = 1;
616 out:    return ret;
617 }
618
619 /* Bits of score are:
620  * 31: very valuable
621  * 30: not quite useless
622  * 29..0: usage counter
623  */
624 static inline u32 rt_score(struct rtable *rt)
625 {
626         u32 score = jiffies - rt->u.dst.lastuse;
627
628         score = ~score & ~(3<<30);
629
630         if (rt_valuable(rt))
631                 score |= (1<<31);
632
633         if (!rt->fl.iif ||
634             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
635                 score |= (1<<30);
636
637         return score;
638 }
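/*
 * Editorial worked example (illustrative numbers): an unreferenced
 * output-route entry (fl.iif == 0), last used 100 jiffies ago, with no
 * expiry and no REDIRECTED/NOTIFY flags, scores
 *
 *	score = (~100UL & ~(3 << 30)) | (1 << 30)
 *
 * i.e. bit 30 set ("not quite useless"), bit 31 clear, and an age part
 * that shrinks as the entry grows older.  rt_intern_hash() below keeps
 * the entry with the *lowest* score as its eviction candidate, so old,
 * idle, flag-less entries go first.
 */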
639
640 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
641 {
642         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
643                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
644                 (fl1->mark ^ fl2->mark) |
645                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
646                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
647                 (fl1->oif ^ fl2->oif) |
648                 (fl1->iif ^ fl2->iif)) == 0;
649 }
650
651 /*
652  * Perform a full scan of the hash table and free all entries.
653  * Can be called by a softirq or a process.
654  * In the latter case, we want to reschedule if necessary.
655  */
656 static void rt_do_flush(int process_context)
657 {
658         unsigned int i;
659         struct rtable *rth, *next;
660
661         for (i = 0; i <= rt_hash_mask; i++) {
662                 if (process_context && need_resched())
663                         cond_resched();
664                 rth = rt_hash_table[i].chain;
665                 if (!rth)
666                         continue;
667
668                 spin_lock_bh(rt_hash_lock_addr(i));
669                 rth = rt_hash_table[i].chain;
670                 rt_hash_table[i].chain = NULL;
671                 spin_unlock_bh(rt_hash_lock_addr(i));
672
673                 for (; rth; rth = next) {
674                         next = rth->u.dst.rt_next;
675                         rt_free(rth);
676                 }
677         }
678 }
679
680 static void rt_check_expire(void)
681 {
682         static unsigned int rover;
683         unsigned int i = rover, goal;
684         struct rtable *rth, **rthp;
685         u64 mult;
686
687         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
688         if (ip_rt_gc_timeout > 1)
689                 do_div(mult, ip_rt_gc_timeout);
690         goal = (unsigned int)mult;
691         if (goal > rt_hash_mask)
692                 goal = rt_hash_mask + 1;
693         for (; goal > 0; goal--) {
694                 unsigned long tmo = ip_rt_gc_timeout;
695
696                 i = (i + 1) & rt_hash_mask;
697                 rthp = &rt_hash_table[i].chain;
698
699                 if (need_resched())
700                         cond_resched();
701
702                 if (*rthp == NULL)
703                         continue;
704                 spin_lock_bh(rt_hash_lock_addr(i));
705                 while ((rth = *rthp) != NULL) {
706                         if (rth->u.dst.expires) {
707                                 /* Entry is expired even if it is in use */
708                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
709                                         tmo >>= 1;
710                                         rthp = &rth->u.dst.rt_next;
711                                         continue;
712                                 }
713                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
714                                 tmo >>= 1;
715                                 rthp = &rth->u.dst.rt_next;
716                                 continue;
717                         }
718
719                         /* Cleanup aged off entries. */
720                         *rthp = rth->u.dst.rt_next;
721                         rt_free(rth);
722                 }
723                 spin_unlock_bh(rt_hash_lock_addr(i));
724         }
725         rover = i;
726 }
727
728 /*
729  * rt_worker_func() is run in process context.
730  * If a whole flush was scheduled, it is done.
731  * Otherwise, we call rt_check_expire() to scan part of the hash table.
732  */
733 static void rt_worker_func(struct work_struct *work)
734 {
735         if (ip_rt_flush_expected) {
736                 ip_rt_flush_expected = 0;
737                 rt_do_flush(1);
738         } else
739                 rt_check_expire();
740         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
741 }
742
743 /* This can run from both BH and non-BH contexts, the latter
744  * in the case of a forced flush event.
745  */
746 static void rt_run_flush(unsigned long process_context)
747 {
748         rt_deadline = 0;
749
750         get_random_bytes(&rt_hash_rnd, 4);
751
752         rt_do_flush(process_context);
753 }
754
755 static DEFINE_SPINLOCK(rt_flush_lock);
756
757 void rt_cache_flush(int delay)
758 {
759         unsigned long now = jiffies;
760         int user_mode = !in_softirq();
761
762         if (delay < 0)
763                 delay = ip_rt_min_delay;
764
765         spin_lock_bh(&rt_flush_lock);
766
767         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
768                 long tmo = (long)(rt_deadline - now);
769
770                 /* If flush timer is already running
771                    and flush request is not immediate (delay > 0):
772
773                    if the deadline is not yet reached, prolong the timer to "delay",
774                    otherwise fire it at the deadline.
775                  */
776
777                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
778                         tmo = 0;
779
780                 if (delay > tmo)
781                         delay = tmo;
782         }
783
784         if (delay <= 0) {
785                 spin_unlock_bh(&rt_flush_lock);
786                 rt_run_flush(user_mode);
787                 return;
788         }
789
790         if (rt_deadline == 0)
791                 rt_deadline = now + ip_rt_max_delay;
792
793         mod_timer(&rt_flush_timer, now+delay);
794         spin_unlock_bh(&rt_flush_lock);
795 }
796
797 /*
798  * We change rt_hash_rnd and ask next rt_worker_func() invocation
799  * to perform a flush in process context
800  */
801 static void rt_secret_rebuild(unsigned long dummy)
802 {
803         get_random_bytes(&rt_hash_rnd, 4);
804         ip_rt_flush_expected = 1;
805         cancel_delayed_work(&expires_work);
806         schedule_delayed_work(&expires_work, HZ/10);
807         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
808 }
809
810 /*
811    Short description of GC goals.
812
813    We want to build an algorithm which keeps the routing cache
814    at some equilibrium point, where the number of aged-off entries
815    stays approximately equal to the number of newly generated ones.
816
817    The current expiration strength is the variable "expire".
818    We try to adjust it dynamically, so that when the network
819    is idle, expire is large enough to keep plenty of warm entries,
820    and when load increases it shrinks to limit the cache size.
821  */
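/*
 * Editorial worked example (illustrative numbers): with rt_hash_log == 10
 * (1024 buckets) and the default ip_rt_gc_elasticity == 8,
 * rt_garbage_collect() below only starts trimming once the cache holds
 * more than 8 << 10 == 8192 entries; at 9000 cached routes the initial
 * goal is 9000 - 8192 == 808 entries to expire in this pass.
 */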
822
823 static int rt_garbage_collect(struct dst_ops *ops)
824 {
825         static unsigned long expire = RT_GC_TIMEOUT;
826         static unsigned long last_gc;
827         static int rover;
828         static int equilibrium;
829         struct rtable *rth, **rthp;
830         unsigned long now = jiffies;
831         int goal;
832
833         /*
834          * Garbage collection is pretty expensive,
835          * do not make it too frequently.
836          */
837
838         RT_CACHE_STAT_INC(gc_total);
839
840         if (now - last_gc < ip_rt_gc_min_interval &&
841             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
842                 RT_CACHE_STAT_INC(gc_ignored);
843                 goto out;
844         }
845
846         /* Calculate number of entries, which we want to expire now. */
847         goal = atomic_read(&ipv4_dst_ops.entries) -
848                 (ip_rt_gc_elasticity << rt_hash_log);
849         if (goal <= 0) {
850                 if (equilibrium < ipv4_dst_ops.gc_thresh)
851                         equilibrium = ipv4_dst_ops.gc_thresh;
852                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
853                 if (goal > 0) {
854                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
855                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
856                 }
857         } else {
858                 /* We are in a dangerous area. Try to reduce the cache really
859                  * aggressively.
860                  */
861                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
862                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
863         }
864
865         if (now - last_gc >= ip_rt_gc_min_interval)
866                 last_gc = now;
867
868         if (goal <= 0) {
869                 equilibrium += goal;
870                 goto work_done;
871         }
872
873         do {
874                 int i, k;
875
876                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
877                         unsigned long tmo = expire;
878
879                         k = (k + 1) & rt_hash_mask;
880                         rthp = &rt_hash_table[k].chain;
881                         spin_lock_bh(rt_hash_lock_addr(k));
882                         while ((rth = *rthp) != NULL) {
883                                 if (!rt_may_expire(rth, tmo, expire)) {
884                                         tmo >>= 1;
885                                         rthp = &rth->u.dst.rt_next;
886                                         continue;
887                                 }
888                                 *rthp = rth->u.dst.rt_next;
889                                 rt_free(rth);
890                                 goal--;
891                         }
892                         spin_unlock_bh(rt_hash_lock_addr(k));
893                         if (goal <= 0)
894                                 break;
895                 }
896                 rover = k;
897
898                 if (goal <= 0)
899                         goto work_done;
900
901                 /* Goal is not achieved. We stop the process if:
902
903                    - expire has been reduced to zero; otherwise, expire is halved.
904                    - the table is not full.
905                    - we are called from interrupt context.
906                    - the jiffies check is just a fallback/debug loop breaker.
907                      We will not spin here for a long time in any case.
908                  */
909
910                 RT_CACHE_STAT_INC(gc_goal_miss);
911
912                 if (expire == 0)
913                         break;
914
915                 expire >>= 1;
916 #if RT_CACHE_DEBUG >= 2
917                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
918                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
919 #endif
920
921                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
922                         goto out;
923         } while (!in_softirq() && time_before_eq(jiffies, now));
924
925         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
926                 goto out;
927         if (net_ratelimit())
928                 printk(KERN_WARNING "dst cache overflow\n");
929         RT_CACHE_STAT_INC(gc_dst_overflow);
930         return 1;
931
932 work_done:
933         expire += ip_rt_gc_min_interval;
934         if (expire > ip_rt_gc_timeout ||
935             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
936                 expire = ip_rt_gc_timeout;
937 #if RT_CACHE_DEBUG >= 2
938         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
939                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
940 #endif
941 out:    return 0;
942 }
943
944 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
945 {
946         struct rtable   *rth, **rthp;
947         unsigned long   now;
948         struct rtable *cand, **candp;
949         u32             min_score;
950         int             chain_length;
951         int attempts = !in_softirq();
952
953 restart:
954         chain_length = 0;
955         min_score = ~(u32)0;
956         cand = NULL;
957         candp = NULL;
958         now = jiffies;
959
960         rthp = &rt_hash_table[hash].chain;
961
962         spin_lock_bh(rt_hash_lock_addr(hash));
963         while ((rth = *rthp) != NULL) {
964                 if (compare_keys(&rth->fl, &rt->fl)) {
965                         /* Put it first */
966                         *rthp = rth->u.dst.rt_next;
967                         /*
968                          * Since lookup is lockfree, the deletion
969                          * must be visible to another weakly ordered CPU before
970                          * the insertion at the start of the hash chain.
971                          */
972                         rcu_assign_pointer(rth->u.dst.rt_next,
973                                            rt_hash_table[hash].chain);
974                         /*
975                          * Since lookup is lockfree, the update writes
976                          * must be ordered for consistency on SMP.
977                          */
978                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
979
980                         dst_use(&rth->u.dst, now);
981                         spin_unlock_bh(rt_hash_lock_addr(hash));
982
983                         rt_drop(rt);
984                         *rp = rth;
985                         return 0;
986                 }
987
988                 if (!atomic_read(&rth->u.dst.__refcnt)) {
989                         u32 score = rt_score(rth);
990
991                         if (score <= min_score) {
992                                 cand = rth;
993                                 candp = rthp;
994                                 min_score = score;
995                         }
996                 }
997
998                 chain_length++;
999
1000                 rthp = &rth->u.dst.rt_next;
1001         }
1002
1003         if (cand) {
1004                 /* ip_rt_gc_elasticity used to be the average chain
1005                  * length; when exceeded, gc becomes really aggressive.
1006                  *
1007                  * The second limit is less certain. At the moment it allows
1008                  * only 2 entries per bucket. We will see.
1009                  */
1010                 if (chain_length > ip_rt_gc_elasticity) {
1011                         *candp = cand->u.dst.rt_next;
1012                         rt_free(cand);
1013                 }
1014         }
1015
1016         /* Try to bind the route to arp only if it is an output
1017            route or on the unicast forwarding path.
1018          */
1019         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1020                 int err = arp_bind_neighbour(&rt->u.dst);
1021                 if (err) {
1022                         spin_unlock_bh(rt_hash_lock_addr(hash));
1023
1024                         if (err != -ENOBUFS) {
1025                                 rt_drop(rt);
1026                                 return err;
1027                         }
1028
1029                         /* Neighbour tables are full and nothing
1030                            can be released. Try to shrink route cache,
1031                            it is most likely it holds some neighbour records.
1032                          */
1033                         if (attempts-- > 0) {
1034                                 int saved_elasticity = ip_rt_gc_elasticity;
1035                                 int saved_int = ip_rt_gc_min_interval;
1036                                 ip_rt_gc_elasticity     = 1;
1037                                 ip_rt_gc_min_interval   = 0;
1038                                 rt_garbage_collect(&ipv4_dst_ops);
1039                                 ip_rt_gc_min_interval   = saved_int;
1040                                 ip_rt_gc_elasticity     = saved_elasticity;
1041                                 goto restart;
1042                         }
1043
1044                         if (net_ratelimit())
1045                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1046                         rt_drop(rt);
1047                         return -ENOBUFS;
1048                 }
1049         }
1050
1051         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1052 #if RT_CACHE_DEBUG >= 2
1053         if (rt->u.dst.rt_next) {
1054                 struct rtable *trt;
1055                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1056                        NIPQUAD(rt->rt_dst));
1057                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1058                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1059                 printk("\n");
1060         }
1061 #endif
1062         rt_hash_table[hash].chain = rt;
1063         spin_unlock_bh(rt_hash_lock_addr(hash));
1064         *rp = rt;
1065         return 0;
1066 }
1067
1068 void rt_bind_peer(struct rtable *rt, int create)
1069 {
1070         static DEFINE_SPINLOCK(rt_peer_lock);
1071         struct inet_peer *peer;
1072
1073         peer = inet_getpeer(rt->rt_dst, create);
1074
1075         spin_lock_bh(&rt_peer_lock);
1076         if (rt->peer == NULL) {
1077                 rt->peer = peer;
1078                 peer = NULL;
1079         }
1080         spin_unlock_bh(&rt_peer_lock);
1081         if (peer)
1082                 inet_putpeer(peer);
1083 }
1084
1085 /*
1086  * Peer allocation may fail only in serious out-of-memory conditions.  However
1087  * we can still generate some output.
1088  * Random ID selection looks a bit dangerous because we have no chance of
1089  * selecting an ID that stays unique over a reasonable period of time.
1090  * But a broken packet identifier may be better than no packet at all.
1091  */
1092 static void ip_select_fb_ident(struct iphdr *iph)
1093 {
1094         static DEFINE_SPINLOCK(ip_fb_id_lock);
1095         static u32 ip_fallback_id;
1096         u32 salt;
1097
1098         spin_lock_bh(&ip_fb_id_lock);
1099         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1100         iph->id = htons(salt & 0xFFFF);
1101         ip_fallback_id = salt;
1102         spin_unlock_bh(&ip_fb_id_lock);
1103 }
1104
1105 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1106 {
1107         struct rtable *rt = (struct rtable *) dst;
1108
1109         if (rt) {
1110                 if (rt->peer == NULL)
1111                         rt_bind_peer(rt, 1);
1112
1113                 /* If a peer is attached to the destination, it is never detached,
1114                    so we need not grab a lock to dereference it.
1115                  */
1116                 if (rt->peer) {
1117                         iph->id = htons(inet_getid(rt->peer, more));
1118                         return;
1119                 }
1120         } else
1121                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1122                        __builtin_return_address(0));
1123
1124         ip_select_fb_ident(iph);
1125 }
1126
1127 static void rt_del(unsigned hash, struct rtable *rt)
1128 {
1129         struct rtable **rthp;
1130
1131         spin_lock_bh(rt_hash_lock_addr(hash));
1132         ip_rt_put(rt);
1133         for (rthp = &rt_hash_table[hash].chain; *rthp;
1134              rthp = &(*rthp)->u.dst.rt_next)
1135                 if (*rthp == rt) {
1136                         *rthp = rt->u.dst.rt_next;
1137                         rt_free(rt);
1138                         break;
1139                 }
1140         spin_unlock_bh(rt_hash_lock_addr(hash));
1141 }
1142
1143 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1144                     __be32 saddr, struct net_device *dev)
1145 {
1146         int i, k;
1147         struct in_device *in_dev = in_dev_get(dev);
1148         struct rtable *rth, **rthp;
1149         __be32  skeys[2] = { saddr, 0 };
1150         int  ikeys[2] = { dev->ifindex, 0 };
1151         struct netevent_redirect netevent;
1152
1153         if (!in_dev)
1154                 return;
1155
1156         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1157             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1158             || ipv4_is_zeronet(new_gw))
1159                 goto reject_redirect;
1160
1161         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1162                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1163                         goto reject_redirect;
1164                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1165                         goto reject_redirect;
1166         } else {
1167                 if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
1168                         goto reject_redirect;
1169         }
1170
1171         for (i = 0; i < 2; i++) {
1172                 for (k = 0; k < 2; k++) {
1173                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1174
1175                         rthp=&rt_hash_table[hash].chain;
1176
1177                         rcu_read_lock();
1178                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1179                                 struct rtable *rt;
1180
1181                                 if (rth->fl.fl4_dst != daddr ||
1182                                     rth->fl.fl4_src != skeys[i] ||
1183                                     rth->fl.oif != ikeys[k] ||
1184                                     rth->fl.iif != 0) {
1185                                         rthp = &rth->u.dst.rt_next;
1186                                         continue;
1187                                 }
1188
1189                                 if (rth->rt_dst != daddr ||
1190                                     rth->rt_src != saddr ||
1191                                     rth->u.dst.error ||
1192                                     rth->rt_gateway != old_gw ||
1193                                     rth->u.dst.dev != dev)
1194                                         break;
1195
1196                                 dst_hold(&rth->u.dst);
1197                                 rcu_read_unlock();
1198
1199                                 rt = dst_alloc(&ipv4_dst_ops);
1200                                 if (rt == NULL) {
1201                                         ip_rt_put(rth);
1202                                         in_dev_put(in_dev);
1203                                         return;
1204                                 }
1205
1206                                 /* Copy all the information. */
1207                                 *rt = *rth;
1208                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1209                                 rt->u.dst.__use         = 1;
1210                                 atomic_set(&rt->u.dst.__refcnt, 1);
1211                                 rt->u.dst.child         = NULL;
1212                                 if (rt->u.dst.dev)
1213                                         dev_hold(rt->u.dst.dev);
1214                                 if (rt->idev)
1215                                         in_dev_hold(rt->idev);
1216                                 rt->u.dst.obsolete      = 0;
1217                                 rt->u.dst.lastuse       = jiffies;
1218                                 rt->u.dst.path          = &rt->u.dst;
1219                                 rt->u.dst.neighbour     = NULL;
1220                                 rt->u.dst.hh            = NULL;
1221                                 rt->u.dst.xfrm          = NULL;
1222
1223                                 rt->rt_flags            |= RTCF_REDIRECTED;
1224
1225                                 /* Gateway is different ... */
1226                                 rt->rt_gateway          = new_gw;
1227
1228                                 /* Redirect received -> path was valid */
1229                                 dst_confirm(&rth->u.dst);
1230
1231                                 if (rt->peer)
1232                                         atomic_inc(&rt->peer->refcnt);
1233
1234                                 if (arp_bind_neighbour(&rt->u.dst) ||
1235                                     !(rt->u.dst.neighbour->nud_state &
1236                                             NUD_VALID)) {
1237                                         if (rt->u.dst.neighbour)
1238                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1239                                         ip_rt_put(rth);
1240                                         rt_drop(rt);
1241                                         goto do_next;
1242                                 }
1243
1244                                 netevent.old = &rth->u.dst;
1245                                 netevent.new = &rt->u.dst;
1246                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1247                                                         &netevent);
1248
1249                                 rt_del(hash, rth);
1250                                 if (!rt_intern_hash(hash, rt, &rt))
1251                                         ip_rt_put(rt);
1252                                 goto do_next;
1253                         }
1254                         rcu_read_unlock();
1255                 do_next:
1256                         ;
1257                 }
1258         }
1259         in_dev_put(in_dev);
1260         return;
1261
1262 reject_redirect:
1263 #ifdef CONFIG_IP_ROUTE_VERBOSE
1264         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1265                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1266                         "%u.%u.%u.%u ignored.\n"
1267                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1268                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1269                        NIPQUAD(saddr), NIPQUAD(daddr));
1270 #endif
1271         in_dev_put(in_dev);
1272 }
1273
1274 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1275 {
1276         struct rtable *rt = (struct rtable*)dst;
1277         struct dst_entry *ret = dst;
1278
1279         if (rt) {
1280                 if (dst->obsolete) {
1281                         ip_rt_put(rt);
1282                         ret = NULL;
1283                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1284                            rt->u.dst.expires) {
1285                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1286                                                 rt->fl.oif);
1287 #if RT_CACHE_DEBUG >= 1
1288                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1289                                           "%u.%u.%u.%u/%02x dropped\n",
1290                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1291 #endif
1292                         rt_del(hash, rt);
1293                         ret = NULL;
1294                 }
1295         }
1296         return ret;
1297 }
1298
1299 /*
1300  * Algorithm:
1301  *      1. The first ip_rt_redirect_number redirects are sent
1302  *         with exponential backoff, then we stop sending them at all,
1303  *         assuming that the host ignores our redirects.
1304  *      2. If we did not see packets requiring redirects
1305  *         during ip_rt_redirect_silence, we assume that the host
1306  *         forgot the redirected route and start sending redirects again.
1307  *
1308  * This algorithm is much cheaper and more intelligent than dumb load limiting
1309  * in icmp.c.
1310  *
1311  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1312  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1313  */
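/*
 * Editorial worked example (illustrative, assuming HZ == 1000): with the
 * defaults above, ip_rt_redirect_load == HZ / 50 == 20 jiffies, so after
 * the n-th redirect the next one is sent no earlier than
 * rate_last + (20 << n) jiffies: 40ms, 80ms, 160ms, ...  Once
 * ip_rt_redirect_number == 9 redirects have been ignored we go quiet,
 * until ip_rt_redirect_silence == (20 << 10) == 20480 jiffies (~20s)
 * pass without another packet that would have triggered a redirect.
 */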
1314
1315 void ip_rt_send_redirect(struct sk_buff *skb)
1316 {
1317         struct rtable *rt = (struct rtable*)skb->dst;
1318         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1319
1320         if (!in_dev)
1321                 return;
1322
1323         if (!IN_DEV_TX_REDIRECTS(in_dev))
1324                 goto out;
1325
1326         /* No redirected packets during ip_rt_redirect_silence;
1327          * reset the algorithm.
1328          */
1329         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1330                 rt->u.dst.rate_tokens = 0;
1331
1332         /* Too many ignored redirects; do not send anything,
1333          * just set u.dst.rate_last to the last seen redirected packet.
1334          */
1335         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1336                 rt->u.dst.rate_last = jiffies;
1337                 goto out;
1338         }
1339
1340         /* Check for load limit; set rate_last to the latest sent
1341          * redirect.
1342          */
1343         if (rt->u.dst.rate_tokens == 0 ||
1344             time_after(jiffies,
1345                        (rt->u.dst.rate_last +
1346                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1347                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1348                 rt->u.dst.rate_last = jiffies;
1349                 ++rt->u.dst.rate_tokens;
1350 #ifdef CONFIG_IP_ROUTE_VERBOSE
1351                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1352                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1353                     net_ratelimit())
1354                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1355                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1356                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1357                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1358 #endif
1359         }
1360 out:
1361         in_dev_put(in_dev);
1362 }
1363
1364 static int ip_error(struct sk_buff *skb)
1365 {
1366         struct rtable *rt = (struct rtable*)skb->dst;
1367         unsigned long now;
1368         int code;
1369
1370         switch (rt->u.dst.error) {
1371                 case EINVAL:
1372                 default:
1373                         goto out;
1374                 case EHOSTUNREACH:
1375                         code = ICMP_HOST_UNREACH;
1376                         break;
1377                 case ENETUNREACH:
1378                         code = ICMP_NET_UNREACH;
1379                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1380                         break;
1381                 case EACCES:
1382                         code = ICMP_PKT_FILTERED;
1383                         break;
1384         }
1385
1386         now = jiffies;
1387         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1388         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1389                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1390         rt->u.dst.rate_last = now;
1391         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1392                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1393                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1394         }
1395
1396 out:    kfree_skb(skb);
1397         return 0;
1398 }
1399
1400 /*
1401  *      The last two values are not from the RFC but
1402  *      are needed for AMPRnet AX.25 paths.
1403  */
1404
1405 static const unsigned short mtu_plateau[] =
1406 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1407
1408 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1409 {
1410         int i;
1411
1412         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1413                 if (old_mtu > mtu_plateau[i])
1414                         return mtu_plateau[i];
1415         return 68;
1416 }
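/*
 * Editorial example (illustrative): guess_mtu(1500) returns 1492 (the
 * first plateau strictly below 1500), guess_mtu(576) returns 296, and
 * any old_mtu at or below 128 falls through to the 68-byte IPv4
 * minimum.
 */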
1417
1418 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1419 {
1420         int i;
1421         unsigned short old_mtu = ntohs(iph->tot_len);
1422         struct rtable *rth;
1423         __be32  skeys[2] = { iph->saddr, 0, };
1424         __be32  daddr = iph->daddr;
1425         unsigned short est_mtu = 0;
1426
1427         if (ipv4_config.no_pmtu_disc)
1428                 return 0;
1429
1430         for (i = 0; i < 2; i++) {
1431                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1432
1433                 rcu_read_lock();
1434                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1435                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1436                         if (rth->fl.fl4_dst == daddr &&
1437                             rth->fl.fl4_src == skeys[i] &&
1438                             rth->rt_dst  == daddr &&
1439                             rth->rt_src  == iph->saddr &&
1440                             rth->fl.iif == 0 &&
1441                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1442                                 unsigned short mtu = new_mtu;
1443
1444                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1445
1446                                         /* BSD 4.2 compatibility hack :-( */
1447                                         if (mtu == 0 &&
1448                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1449                                             old_mtu >= 68 + (iph->ihl << 2))
1450                                                 old_mtu -= iph->ihl << 2;
1451
1452                                         mtu = guess_mtu(old_mtu);
1453                                 }
1454                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1455                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1456                                                 dst_confirm(&rth->u.dst);
1457                                                 if (mtu < ip_rt_min_pmtu) {
1458                                                         mtu = ip_rt_min_pmtu;
1459                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1460                                                                 (1 << RTAX_MTU);
1461                                                 }
1462                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1463                                                 dst_set_expires(&rth->u.dst,
1464                                                         ip_rt_mtu_expires);
1465                                         }
1466                                         est_mtu = mtu;
1467                                 }
1468                         }
1469                 }
1470                 rcu_read_unlock();
1471         }
1472         return est_mtu ? : new_mtu;
1473 }
1474
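/*
 * Lower the cached MTU of a single dst when a smaller path MTU is learnt
 * locally: clamp at ip_rt_min_pmtu (locking the metric when clamped),
 * give the entry an expiry and raise a NETEVENT_PMTU_UPDATE notification.
 */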
1475 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1476 {
1477         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1478             !(dst_metric_locked(dst, RTAX_MTU))) {
1479                 if (mtu < ip_rt_min_pmtu) {
1480                         mtu = ip_rt_min_pmtu;
1481                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1482                 }
1483                 dst->metrics[RTAX_MTU-1] = mtu;
1484                 dst_set_expires(dst, ip_rt_mtu_expires);
1485                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1486         }
1487 }
1488
1489 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1490 {
1491         return NULL;
1492 }
1493
1494 static void ipv4_dst_destroy(struct dst_entry *dst)
1495 {
1496         struct rtable *rt = (struct rtable *) dst;
1497         struct inet_peer *peer = rt->peer;
1498         struct in_device *idev = rt->idev;
1499
1500         if (peer) {
1501                 rt->peer = NULL;
1502                 inet_putpeer(peer);
1503         }
1504
1505         if (idev) {
1506                 rt->idev = NULL;
1507                 in_dev_put(idev);
1508         }
1509 }
1510
1511 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1512                             int how)
1513 {
1514         struct rtable *rt = (struct rtable *) dst;
1515         struct in_device *idev = rt->idev;
1516         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1517                 struct in_device *loopback_idev =
1518                         in_dev_get(dev->nd_net->loopback_dev);
1519                 if (loopback_idev) {
1520                         rt->idev = loopback_idev;
1521                         in_dev_put(idev);
1522                 }
1523         }
1524 }
1525
1526 static void ipv4_link_failure(struct sk_buff *skb)
1527 {
1528         struct rtable *rt;
1529
1530         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1531
1532         rt = (struct rtable *) skb->dst;
1533         if (rt)
1534                 dst_set_expires(&rt->u.dst, 0);
1535 }
1536
1537 static int ip_rt_bug(struct sk_buff *skb)
1538 {
1539         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1540                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1541                 skb->dev ? skb->dev->name : "?");
1542         kfree_skb(skb);
1543         return 0;
1544 }
1545
1546 /*
1547    We do not cache the source address of the outgoing interface,
1548    because it is used only by the IP RR, TS and SRR options,
1549    so it is out of the fast path.
1550
1551    BTW remember: "addr" is allowed to be unaligned
1552    in IP options!
1553  */
1554
1555 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1556 {
1557         __be32 src;
1558         struct fib_result res;
1559
1560         if (rt->fl.iif == 0)
1561                 src = rt->rt_src;
1562         else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1563                 src = FIB_RES_PREFSRC(res);
1564                 fib_res_put(&res);
1565         } else
1566                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1567                                         RT_SCOPE_UNIVERSE);
1568         memcpy(addr, &src, 4);
1569 }
1570
1571 #ifdef CONFIG_NET_CLS_ROUTE
1572 static void set_class_tag(struct rtable *rt, u32 tag)
1573 {
1574         if (!(rt->u.dst.tclassid & 0xFFFF))
1575                 rt->u.dst.tclassid |= tag & 0xFFFF;
1576         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1577                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1578 }
1579 #endif
1580
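/*
 * Fill in the nexthop-derived fields of a freshly built cache entry:
 * gateway, metrics copied from the FIB info (with the MTU defaulting to
 * the output device's), hop limit and advmss defaults, the routing
 * classid when CONFIG_NET_CLS_ROUTE is enabled, and the route type.
 */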
1581 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1582 {
1583         struct fib_info *fi = res->fi;
1584
1585         if (fi) {
1586                 if (FIB_RES_GW(*res) &&
1587                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1588                         rt->rt_gateway = FIB_RES_GW(*res);
1589                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1590                        sizeof(rt->u.dst.metrics));
1591                 if (fi->fib_mtu == 0) {
1592                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1593                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1594                             rt->rt_gateway != rt->rt_dst &&
1595                             rt->u.dst.dev->mtu > 576)
1596                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1597                 }
1598 #ifdef CONFIG_NET_CLS_ROUTE
1599                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1600 #endif
1601         } else
1602                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1603
1604         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1605                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1606         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1607                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1608         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1609                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1610                                        ip_rt_min_advmss);
1611         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1612                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1613
1614 #ifdef CONFIG_NET_CLS_ROUTE
1615 #ifdef CONFIG_IP_MULTIPLE_TABLES
1616         set_class_tag(rt, fib_rules_tclass(res));
1617 #endif
1618         set_class_tag(rt, itag);
1619 #endif
1620         rt->rt_type = res->type;
1621 }
1622
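/*
 * Input path for packets addressed to a multicast group: validate the
 * source, build a cache entry whose input handler is ip_local_deliver
 * when this host is a member (or ip_mr_input when multicast forwarding
 * is enabled) and hash it on (daddr, saddr, ifindex).
 */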
1623 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1624                                 u8 tos, struct net_device *dev, int our)
1625 {
1626         unsigned hash;
1627         struct rtable *rth;
1628         __be32 spec_dst;
1629         struct in_device *in_dev = in_dev_get(dev);
1630         u32 itag = 0;
1631
1632         /* Primary sanity checks. */
1633
1634         if (in_dev == NULL)
1635                 return -EINVAL;
1636
1637         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1638             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1639                 goto e_inval;
1640
1641         if (ipv4_is_zeronet(saddr)) {
1642                 if (!ipv4_is_local_multicast(daddr))
1643                         goto e_inval;
1644                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1645         } else if (fib_validate_source(saddr, 0, tos, 0,
1646                                         dev, &spec_dst, &itag) < 0)
1647                 goto e_inval;
1648
1649         rth = dst_alloc(&ipv4_dst_ops);
1650         if (!rth)
1651                 goto e_nobufs;
1652
1653         rth->u.dst.output= ip_rt_bug;
1654
1655         atomic_set(&rth->u.dst.__refcnt, 1);
1656         rth->u.dst.flags= DST_HOST;
1657         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1658                 rth->u.dst.flags |= DST_NOPOLICY;
1659         rth->fl.fl4_dst = daddr;
1660         rth->rt_dst     = daddr;
1661         rth->fl.fl4_tos = tos;
1662         rth->fl.mark    = skb->mark;
1663         rth->fl.fl4_src = saddr;
1664         rth->rt_src     = saddr;
1665 #ifdef CONFIG_NET_CLS_ROUTE
1666         rth->u.dst.tclassid = itag;
1667 #endif
1668         rth->rt_iif     =
1669         rth->fl.iif     = dev->ifindex;
1670         rth->u.dst.dev  = init_net.loopback_dev;
1671         dev_hold(rth->u.dst.dev);
1672         rth->idev       = in_dev_get(rth->u.dst.dev);
1673         rth->fl.oif     = 0;
1674         rth->rt_gateway = daddr;
1675         rth->rt_spec_dst= spec_dst;
1676         rth->rt_type    = RTN_MULTICAST;
1677         rth->rt_flags   = RTCF_MULTICAST;
1678         if (our) {
1679                 rth->u.dst.input= ip_local_deliver;
1680                 rth->rt_flags |= RTCF_LOCAL;
1681         }
1682
1683 #ifdef CONFIG_IP_MROUTE
1684         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1685                 rth->u.dst.input = ip_mr_input;
1686 #endif
1687         RT_CACHE_STAT_INC(in_slow_mc);
1688
1689         in_dev_put(in_dev);
1690         hash = rt_hash(daddr, saddr, dev->ifindex);
1691         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1692
1693 e_nobufs:
1694         in_dev_put(in_dev);
1695         return -ENOBUFS;
1696
1697 e_inval:
1698         in_dev_put(in_dev);
1699         return -EINVAL;
1700 }
1701
1702
1703 static void ip_handle_martian_source(struct net_device *dev,
1704                                      struct in_device *in_dev,
1705                                      struct sk_buff *skb,
1706                                      __be32 daddr,
1707                                      __be32 saddr)
1708 {
1709         RT_CACHE_STAT_INC(in_martian_src);
1710 #ifdef CONFIG_IP_ROUTE_VERBOSE
1711         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1712                 /*
1713                  *      RFC1812 recommendation: if the source is martian,
1714                  *      the only hint is the MAC header.
1715                  */
1716                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1717                         "%u.%u.%u.%u, on dev %s\n",
1718                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1719                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1720                         int i;
1721                         const unsigned char *p = skb_mac_header(skb);
1722                         printk(KERN_WARNING "ll header: ");
1723                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1724                                 printk("%02x", *p);
1725                                 if (i < (dev->hard_header_len - 1))
1726                                         printk(":");
1727                         }
1728                         printk("\n");
1729                 }
1730         }
1731 #endif
1732 }
1733
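/*
 * Build the cache entry for a forwarded packet: take a reference on the
 * output device named by the FIB result, validate the source address
 * (logging martians), decide whether an ICMP redirect should be sent
 * (RTCF_DOREDIRECT) and populate an rtable whose handlers are
 * ip_forward / ip_output.
 */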
1734 static inline int __mkroute_input(struct sk_buff *skb,
1735                                   struct fib_result* res,
1736                                   struct in_device *in_dev,
1737                                   __be32 daddr, __be32 saddr, u32 tos,
1738                                   struct rtable **result)
1739 {
1740
1741         struct rtable *rth;
1742         int err;
1743         struct in_device *out_dev;
1744         unsigned flags = 0;
1745         __be32 spec_dst;
1746         u32 itag;
1747
1748         /* get a working reference to the output device */
1749         out_dev = in_dev_get(FIB_RES_DEV(*res));
1750         if (out_dev == NULL) {
1751                 if (net_ratelimit())
1752                         printk(KERN_CRIT "Bug in ip_route_input" \
1753                                "_slow(). Please, report\n");
1754                 return -EINVAL;
1755         }
1756
1757
1758         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1759                                   in_dev->dev, &spec_dst, &itag);
1760         if (err < 0) {
1761                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1762                                          saddr);
1763
1764                 err = -EINVAL;
1765                 goto cleanup;
1766         }
1767
1768         if (err)
1769                 flags |= RTCF_DIRECTSRC;
1770
1771         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1772             (IN_DEV_SHARED_MEDIA(out_dev) ||
1773              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1774                 flags |= RTCF_DOREDIRECT;
1775
1776         if (skb->protocol != htons(ETH_P_IP)) {
1777                 /* Not IP (i.e. ARP). Do not create a route if it is
1778                  * invalid for proxy arp. DNAT routes are always valid.
1779                  */
1780                 if (out_dev == in_dev) {
1781                         err = -EINVAL;
1782                         goto cleanup;
1783                 }
1784         }
1785
1786
1787         rth = dst_alloc(&ipv4_dst_ops);
1788         if (!rth) {
1789                 err = -ENOBUFS;
1790                 goto cleanup;
1791         }
1792
1793         atomic_set(&rth->u.dst.__refcnt, 1);
1794         rth->u.dst.flags= DST_HOST;
1795         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1796                 rth->u.dst.flags |= DST_NOPOLICY;
1797         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1798                 rth->u.dst.flags |= DST_NOXFRM;
1799         rth->fl.fl4_dst = daddr;
1800         rth->rt_dst     = daddr;
1801         rth->fl.fl4_tos = tos;
1802         rth->fl.mark    = skb->mark;
1803         rth->fl.fl4_src = saddr;
1804         rth->rt_src     = saddr;
1805         rth->rt_gateway = daddr;
1806         rth->rt_iif     =
1807                 rth->fl.iif     = in_dev->dev->ifindex;
1808         rth->u.dst.dev  = (out_dev)->dev;
1809         dev_hold(rth->u.dst.dev);
1810         rth->idev       = in_dev_get(rth->u.dst.dev);
1811         rth->fl.oif     = 0;
1812         rth->rt_spec_dst= spec_dst;
1813
1814         rth->u.dst.input = ip_forward;
1815         rth->u.dst.output = ip_output;
1816
1817         rt_set_nexthop(rth, res, itag);
1818
1819         rth->rt_flags = flags;
1820
1821         *result = rth;
1822         err = 0;
1823  cleanup:
1824         /* release the working reference to the output device */
1825         in_dev_put(out_dev);
1826         return err;
1827 }
1828
1829 static inline int ip_mkroute_input(struct sk_buff *skb,
1830                                    struct fib_result* res,
1831                                    const struct flowi *fl,
1832                                    struct in_device *in_dev,
1833                                    __be32 daddr, __be32 saddr, u32 tos)
1834 {
1835         struct rtable* rth = NULL;
1836         int err;
1837         unsigned hash;
1838
1839 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1840         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1841                 fib_select_multipath(fl, res);
1842 #endif
1843
1844         /* create a routing cache entry */
1845         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1846         if (err)
1847                 return err;
1848
1849         /* put it into the cache */
1850         hash = rt_hash(daddr, saddr, fl->iif);
1851         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1852 }
1853
1854 /*
1855  *      NOTE. We drop all packets that have local source
1856  *      addresses, because every properly looped-back packet
1857  *      must already have the correct destination attached by the output routine.
1858  *
1859  *      This approach solves two big problems:
1860  *      1. Non-simplex devices are handled properly.
1861  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1862  */
1863
1864 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1865                                u8 tos, struct net_device *dev)
1866 {
1867         struct fib_result res;
1868         struct in_device *in_dev = in_dev_get(dev);
1869         struct flowi fl = { .nl_u = { .ip4_u =
1870                                       { .daddr = daddr,
1871                                         .saddr = saddr,
1872                                         .tos = tos,
1873                                         .scope = RT_SCOPE_UNIVERSE,
1874                                       } },
1875                             .mark = skb->mark,
1876                             .iif = dev->ifindex };
1877         unsigned        flags = 0;
1878         u32             itag = 0;
1879         struct rtable * rth;
1880         unsigned        hash;
1881         __be32          spec_dst;
1882         int             err = -EINVAL;
1883         int             free_res = 0;
1884         struct net    * net = dev->nd_net;
1885
1886         /* IP on this device is disabled. */
1887
1888         if (!in_dev)
1889                 goto out;
1890
1891         /* Check for the weirdest martians, which cannot be detected
1892            by fib_lookup.
1893          */
1894
1895         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1896             ipv4_is_loopback(saddr))
1897                 goto martian_source;
1898
1899         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1900                 goto brd_input;
1901
1902         /* Accept zero addresses only for limited broadcast;
1903          * I do not even know whether to fix it or not. Waiting for complaints :-)
1904          */
1905         if (ipv4_is_zeronet(saddr))
1906                 goto martian_source;
1907
1908         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1909             ipv4_is_loopback(daddr))
1910                 goto martian_destination;
1911
1912         /*
1913          *      Now we are ready to route packet.
1914          */
1915         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1916                 if (!IN_DEV_FORWARD(in_dev))
1917                         goto e_hostunreach;
1918                 goto no_route;
1919         }
1920         free_res = 1;
1921
1922         RT_CACHE_STAT_INC(in_slow_tot);
1923
1924         if (res.type == RTN_BROADCAST)
1925                 goto brd_input;
1926
1927         if (res.type == RTN_LOCAL) {
1928                 int result;
1929                 result = fib_validate_source(saddr, daddr, tos,
1930                                              net->loopback_dev->ifindex,
1931                                              dev, &spec_dst, &itag);
1932                 if (result < 0)
1933                         goto martian_source;
1934                 if (result)
1935                         flags |= RTCF_DIRECTSRC;
1936                 spec_dst = daddr;
1937                 goto local_input;
1938         }
1939
1940         if (!IN_DEV_FORWARD(in_dev))
1941                 goto e_hostunreach;
1942         if (res.type != RTN_UNICAST)
1943                 goto martian_destination;
1944
1945         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1946 done:
1947         in_dev_put(in_dev);
1948         if (free_res)
1949                 fib_res_put(&res);
1950 out:    return err;
1951
1952 brd_input:
1953         if (skb->protocol != htons(ETH_P_IP))
1954                 goto e_inval;
1955
1956         if (ipv4_is_zeronet(saddr))
1957                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1958         else {
1959                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1960                                           &itag);
1961                 if (err < 0)
1962                         goto martian_source;
1963                 if (err)
1964                         flags |= RTCF_DIRECTSRC;
1965         }
1966         flags |= RTCF_BROADCAST;
1967         res.type = RTN_BROADCAST;
1968         RT_CACHE_STAT_INC(in_brd);
1969
1970 local_input:
1971         rth = dst_alloc(&ipv4_dst_ops);
1972         if (!rth)
1973                 goto e_nobufs;
1974
1975         rth->u.dst.output= ip_rt_bug;
1976
1977         atomic_set(&rth->u.dst.__refcnt, 1);
1978         rth->u.dst.flags= DST_HOST;
1979         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1980                 rth->u.dst.flags |= DST_NOPOLICY;
1981         rth->fl.fl4_dst = daddr;
1982         rth->rt_dst     = daddr;
1983         rth->fl.fl4_tos = tos;
1984         rth->fl.mark    = skb->mark;
1985         rth->fl.fl4_src = saddr;
1986         rth->rt_src     = saddr;
1987 #ifdef CONFIG_NET_CLS_ROUTE
1988         rth->u.dst.tclassid = itag;
1989 #endif
1990         rth->rt_iif     =
1991         rth->fl.iif     = dev->ifindex;
1992         rth->u.dst.dev  = net->loopback_dev;
1993         dev_hold(rth->u.dst.dev);
1994         rth->idev       = in_dev_get(rth->u.dst.dev);
1995         rth->rt_gateway = daddr;
1996         rth->rt_spec_dst= spec_dst;
1997         rth->u.dst.input= ip_local_deliver;
1998         rth->rt_flags   = flags|RTCF_LOCAL;
1999         if (res.type == RTN_UNREACHABLE) {
2000                 rth->u.dst.input= ip_error;
2001                 rth->u.dst.error= -err;
2002                 rth->rt_flags   &= ~RTCF_LOCAL;
2003         }
2004         rth->rt_type    = res.type;
2005         hash = rt_hash(daddr, saddr, fl.iif);
2006         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2007         goto done;
2008
2009 no_route:
2010         RT_CACHE_STAT_INC(in_no_route);
2011         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2012         res.type = RTN_UNREACHABLE;
2013         if (err == -ESRCH)
2014                 err = -ENETUNREACH;
2015         goto local_input;
2016
2017         /*
2018          *      Do not cache martian addresses: they should be logged (RFC1812)
2019          */
2020 martian_destination:
2021         RT_CACHE_STAT_INC(in_martian_dst);
2022 #ifdef CONFIG_IP_ROUTE_VERBOSE
2023         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2024                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2025                         "%u.%u.%u.%u, dev %s\n",
2026                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027 #endif
2028
2029 e_hostunreach:
2030         err = -EHOSTUNREACH;
2031         goto done;
2032
2033 e_inval:
2034         err = -EINVAL;
2035         goto done;
2036
2037 e_nobufs:
2038         err = -ENOBUFS;
2039         goto done;
2040
2041 martian_source:
2042         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2043         goto e_inval;
2044 }
2045
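/*
 * Route cache lookup for received packets: hash on (daddr, saddr, iif)
 * and walk the chain under RCU.  On a miss, multicast destinations are
 * recognised here (see the comment below) and everything else falls
 * through to ip_route_input_slow().
 */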
2046 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2047                    u8 tos, struct net_device *dev)
2048 {
2049         struct rtable * rth;
2050         unsigned        hash;
2051         int iif = dev->ifindex;
2052
2053         tos &= IPTOS_RT_MASK;
2054         hash = rt_hash(daddr, saddr, iif);
2055
2056         rcu_read_lock();
2057         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2058              rth = rcu_dereference(rth->u.dst.rt_next)) {
2059                 if (rth->fl.fl4_dst == daddr &&
2060                     rth->fl.fl4_src == saddr &&
2061                     rth->fl.iif == iif &&
2062                     rth->fl.oif == 0 &&
2063                     rth->fl.mark == skb->mark &&
2064                     rth->fl.fl4_tos == tos) {
2065                         dst_use(&rth->u.dst, jiffies);
2066                         RT_CACHE_STAT_INC(in_hit);
2067                         rcu_read_unlock();
2068                         skb->dst = (struct dst_entry*)rth;
2069                         return 0;
2070                 }
2071                 RT_CACHE_STAT_INC(in_hlist_search);
2072         }
2073         rcu_read_unlock();
2074
2075         /* Multicast recognition logic is moved from the route cache to here.
2076            The problem was that too many Ethernet cards have broken/missing
2077            hardware multicast filters :-( As a result, a host on a multicast
2078            network acquires a lot of useless route cache entries, e.g. from
2079            SDR messages from all over the world. Now we try to get rid of them.
2080            Really, provided the software IP multicast filter is organized
2081            reasonably (at least, hashed), it does not result in a slowdown
2082            compared with route cache reject entries.
2083            Note that multicast routers are not affected, because
2084            a route cache entry is created eventually.
2085          */
2086         if (ipv4_is_multicast(daddr)) {
2087                 struct in_device *in_dev;
2088
2089                 rcu_read_lock();
2090                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2091                         int our = ip_check_mc(in_dev, daddr, saddr,
2092                                 ip_hdr(skb)->protocol);
2093                         if (our
2094 #ifdef CONFIG_IP_MROUTE
2095                             || (!ipv4_is_local_multicast(daddr) &&
2096                                 IN_DEV_MFORWARD(in_dev))
2097 #endif
2098                             ) {
2099                                 rcu_read_unlock();
2100                                 return ip_route_input_mc(skb, daddr, saddr,
2101                                                          tos, dev, our);
2102                         }
2103                 }
2104                 rcu_read_unlock();
2105                 return -EINVAL;
2106         }
2107         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2108 }
2109
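/*
 * Output-path counterpart of __mkroute_input(): classify the destination
 * (broadcast/multicast/unicast), allocate the rtable, fill in the flow
 * and device references and pick the handlers: ip_output, ip_mc_output
 * for broadcast/multicast that also needs local delivery, and
 * ip_local_deliver as input for RTCF_LOCAL routes.
 */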
2110 static inline int __mkroute_output(struct rtable **result,
2111                                    struct fib_result* res,
2112                                    const struct flowi *fl,
2113                                    const struct flowi *oldflp,
2114                                    struct net_device *dev_out,
2115                                    unsigned flags)
2116 {
2117         struct rtable *rth;
2118         struct in_device *in_dev;
2119         u32 tos = RT_FL_TOS(oldflp);
2120         int err = 0;
2121
2122         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2123                 return -EINVAL;
2124
2125         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2126                 res->type = RTN_BROADCAST;
2127         else if (ipv4_is_multicast(fl->fl4_dst))
2128                 res->type = RTN_MULTICAST;
2129         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2130                 return -EINVAL;
2131
2132         if (dev_out->flags & IFF_LOOPBACK)
2133                 flags |= RTCF_LOCAL;
2134
2135         /* get a working reference to the inet device */
2136         in_dev = in_dev_get(dev_out);
2137         if (!in_dev)
2138                 return -EINVAL;
2139
2140         if (res->type == RTN_BROADCAST) {
2141                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2142                 if (res->fi) {
2143                         fib_info_put(res->fi);
2144                         res->fi = NULL;
2145                 }
2146         } else if (res->type == RTN_MULTICAST) {
2147                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2148                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2149                                  oldflp->proto))
2150                         flags &= ~RTCF_LOCAL;
2151                 /* If a multicast route does not exist, use the
2152                    default one, but do not gateway in this case.
2153                    Yes, it is a hack.
2154                  */
2155                 if (res->fi && res->prefixlen < 4) {
2156                         fib_info_put(res->fi);
2157                         res->fi = NULL;
2158                 }
2159         }
2160
2161
2162         rth = dst_alloc(&ipv4_dst_ops);
2163         if (!rth) {
2164                 err = -ENOBUFS;
2165                 goto cleanup;
2166         }
2167
2168         atomic_set(&rth->u.dst.__refcnt, 1);
2169         rth->u.dst.flags= DST_HOST;
2170         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2171                 rth->u.dst.flags |= DST_NOXFRM;
2172         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2173                 rth->u.dst.flags |= DST_NOPOLICY;
2174
2175         rth->fl.fl4_dst = oldflp->fl4_dst;
2176         rth->fl.fl4_tos = tos;
2177         rth->fl.fl4_src = oldflp->fl4_src;
2178         rth->fl.oif     = oldflp->oif;
2179         rth->fl.mark    = oldflp->mark;
2180         rth->rt_dst     = fl->fl4_dst;
2181         rth->rt_src     = fl->fl4_src;
2182         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2183         /* get references to the devices that are to be held by the routing
2184            cache entry */
2185         rth->u.dst.dev  = dev_out;
2186         dev_hold(dev_out);
2187         rth->idev       = in_dev_get(dev_out);
2188         rth->rt_gateway = fl->fl4_dst;
2189         rth->rt_spec_dst= fl->fl4_src;
2190
2191         rth->u.dst.output=ip_output;
2192
2193         RT_CACHE_STAT_INC(out_slow_tot);
2194
2195         if (flags & RTCF_LOCAL) {
2196                 rth->u.dst.input = ip_local_deliver;
2197                 rth->rt_spec_dst = fl->fl4_dst;
2198         }
2199         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2200                 rth->rt_spec_dst = fl->fl4_src;
2201                 if (flags & RTCF_LOCAL &&
2202                     !(dev_out->flags & IFF_LOOPBACK)) {
2203                         rth->u.dst.output = ip_mc_output;
2204                         RT_CACHE_STAT_INC(out_slow_mc);
2205                 }
2206 #ifdef CONFIG_IP_MROUTE
2207                 if (res->type == RTN_MULTICAST) {
2208                         if (IN_DEV_MFORWARD(in_dev) &&
2209                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2210                                 rth->u.dst.input = ip_mr_input;
2211                                 rth->u.dst.output = ip_mc_output;
2212                         }
2213                 }
2214 #endif
2215         }
2216
2217         rt_set_nexthop(rth, res, 0);
2218
2219         rth->rt_flags = flags;
2220
2221         *result = rth;
2222  cleanup:
2223         /* release the working reference to the inet device */
2224         in_dev_put(in_dev);
2225
2226         return err;
2227 }
2228
2229 static inline int ip_mkroute_output(struct rtable **rp,
2230                                     struct fib_result* res,
2231                                     const struct flowi *fl,
2232                                     const struct flowi *oldflp,
2233                                     struct net_device *dev_out,
2234                                     unsigned flags)
2235 {
2236         struct rtable *rth = NULL;
2237         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2238         unsigned hash;
2239         if (err == 0) {
2240                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2241                 err = rt_intern_hash(hash, rth, rp);
2242         }
2243
2244         return err;
2245 }
2246
2247 /*
2248  * Major route resolver routine.
2249  */
2250
2251 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2252                                 const struct flowi *oldflp)
2253 {
2254         u32 tos = RT_FL_TOS(oldflp);
2255         struct flowi fl = { .nl_u = { .ip4_u =
2256                                       { .daddr = oldflp->fl4_dst,
2257                                         .saddr = oldflp->fl4_src,
2258                                         .tos = tos & IPTOS_RT_MASK,
2259                                         .scope = ((tos & RTO_ONLINK) ?
2260                                                   RT_SCOPE_LINK :
2261                                                   RT_SCOPE_UNIVERSE),
2262                                       } },
2263                             .mark = oldflp->mark,
2264                             .iif = net->loopback_dev->ifindex,
2265                             .oif = oldflp->oif };
2266         struct fib_result res;
2267         unsigned flags = 0;
2268         struct net_device *dev_out = NULL;
2269         int free_res = 0;
2270         int err;
2271
2272
2273         res.fi          = NULL;
2274 #ifdef CONFIG_IP_MULTIPLE_TABLES
2275         res.r           = NULL;
2276 #endif
2277
2278         if (oldflp->fl4_src) {
2279                 err = -EINVAL;
2280                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2281                     ipv4_is_lbcast(oldflp->fl4_src) ||
2282                     ipv4_is_zeronet(oldflp->fl4_src))
2283                         goto out;
2284
2285                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2286                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2287                 if (dev_out == NULL)
2288                         goto out;
2289
2290                 /* I removed check for oif == dev_out->oif here.
2291                    It was wrong for two reasons:
2292                    1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2293                       is assigned to multiple interfaces.
2294                    2. Moreover, we are allowed to send packets with saddr
2295                       of another iface. --ANK
2296                  */
2297
2298                 if (oldflp->oif == 0
2299                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2300                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2301                         /* Special hack: the user can direct multicasts
2302                            and limited broadcast via the necessary interface
2303                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2304                            This hack is not just for fun, it allows
2305                            vic, vat and friends to work.
2306                            They bind the socket to loopback, set the ttl to zero
2307                            and expect that it will work.
2308                            From the viewpoint of the routing cache they are broken,
2309                            because we are not allowed to build a multicast path
2310                            with a loopback source addr (look, the routing cache
2311                            cannot know that the ttl is zero, so the packet
2312                            will not leave this host and the route is valid).
2313                            Luckily, this hack is a good workaround.
2314                          */
2315
2316                         fl.oif = dev_out->ifindex;
2317                         goto make_route;
2318                 }
2319                 if (dev_out)
2320                         dev_put(dev_out);
2321                 dev_out = NULL;
2322         }
2323
2324
2325         if (oldflp->oif) {
2326                 dev_out = dev_get_by_index(net, oldflp->oif);
2327                 err = -ENODEV;
2328                 if (dev_out == NULL)
2329                         goto out;
2330
2331                 /* RACE: Check return value of inet_select_addr instead. */
2332                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2333                         dev_put(dev_out);
2334                         goto out;       /* Wrong error code */
2335                 }
2336
2337                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2338                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2339                         if (!fl.fl4_src)
2340                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2341                                                               RT_SCOPE_LINK);
2342                         goto make_route;
2343                 }
2344                 if (!fl.fl4_src) {
2345                         if (ipv4_is_multicast(oldflp->fl4_dst))
2346                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2347                                                               fl.fl4_scope);
2348                         else if (!oldflp->fl4_dst)
2349                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2350                                                               RT_SCOPE_HOST);
2351                 }
2352         }
2353
2354         if (!fl.fl4_dst) {
2355                 fl.fl4_dst = fl.fl4_src;
2356                 if (!fl.fl4_dst)
2357                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2358                 if (dev_out)
2359                         dev_put(dev_out);
2360                 dev_out = net->loopback_dev;
2361                 dev_hold(dev_out);
2362                 fl.oif = net->loopback_dev->ifindex;
2363                 res.type = RTN_LOCAL;
2364                 flags |= RTCF_LOCAL;
2365                 goto make_route;
2366         }
2367
2368         if (fib_lookup(net, &fl, &res)) {
2369                 res.fi = NULL;
2370                 if (oldflp->oif) {
2371                         /* Apparently, the routing tables are wrong. Assume
2372                            that the destination is on link.
2373
2374                            WHY? DW.
2375                            Because we are allowed to send to an iface
2376                            even if it has NO routes and NO assigned
2377                            addresses. When oif is specified, the routing
2378                            tables are looked up with only one purpose:
2379                            to catch whether the destination is gatewayed rather
2380                            than direct. Moreover, if MSG_DONTROUTE is set,
2381                            we send the packet, ignoring both the routing tables
2382                            and the ifaddr state. --ANK
2383
2384
2385                            We could do this even if oif is unknown
2386                            (IPv6 likely does), but we do not.
2387                          */
2388
2389                         if (fl.fl4_src == 0)
2390                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2391                                                               RT_SCOPE_LINK);
2392                         res.type = RTN_UNICAST;
2393                         goto make_route;
2394                 }
2395                 if (dev_out)
2396                         dev_put(dev_out);
2397                 err = -ENETUNREACH;
2398                 goto out;
2399         }
2400         free_res = 1;
2401
2402         if (res.type == RTN_LOCAL) {
2403                 if (!fl.fl4_src)
2404                         fl.fl4_src = fl.fl4_dst;
2405                 if (dev_out)
2406                         dev_put(dev_out);
2407                 dev_out = net->loopback_dev;
2408                 dev_hold(dev_out);
2409                 fl.oif = dev_out->ifindex;
2410                 if (res.fi)
2411                         fib_info_put(res.fi);
2412                 res.fi = NULL;
2413                 flags |= RTCF_LOCAL;
2414                 goto make_route;
2415         }
2416
2417 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2418         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2419                 fib_select_multipath(&fl, &res);
2420         else
2421 #endif
2422         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2423                 fib_select_default(net, &fl, &res);
2424
2425         if (!fl.fl4_src)
2426                 fl.fl4_src = FIB_RES_PREFSRC(res);
2427
2428         if (dev_out)
2429                 dev_put(dev_out);
2430         dev_out = FIB_RES_DEV(res);
2431         dev_hold(dev_out);
2432         fl.oif = dev_out->ifindex;
2433
2434
2435 make_route:
2436         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2437
2438
2439         if (free_res)
2440                 fib_res_put(&res);
2441         if (dev_out)
2442                 dev_put(dev_out);
2443 out:    return err;
2444 }
2445
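/*
 * Output route lookup: search the cache for an output entry (iif == 0)
 * matching destination, source, oif, mark and TOS (compared under
 * IPTOS_RT_MASK | RTO_ONLINK); fall back to ip_route_output_slow()
 * when nothing is cached.
 */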
2446 int __ip_route_output_key(struct net *net, struct rtable **rp,
2447                           const struct flowi *flp)
2448 {
2449         unsigned hash;
2450         struct rtable *rth;
2451
2452         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2453
2454         rcu_read_lock_bh();
2455         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2456                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2457                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2458                     rth->fl.fl4_src == flp->fl4_src &&
2459                     rth->fl.iif == 0 &&
2460                     rth->fl.oif == flp->oif &&
2461                     rth->fl.mark == flp->mark &&
2462                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2463                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2464                         dst_use(&rth->u.dst, jiffies);
2465                         RT_CACHE_STAT_INC(out_hit);
2466                         rcu_read_unlock_bh();
2467                         *rp = rth;
2468                         return 0;
2469                 }
2470                 RT_CACHE_STAT_INC(out_hlist_search);
2471         }
2472         rcu_read_unlock_bh();
2473
2474         return ip_route_output_slow(net, rp, flp);
2475 }
2476
2477 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2478
2479 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2480 {
2481 }
2482
2483 static struct dst_ops ipv4_dst_blackhole_ops = {
2484         .family                 =       AF_INET,
2485         .protocol               =       __constant_htons(ETH_P_IP),
2486         .destroy                =       ipv4_dst_destroy,
2487         .check                  =       ipv4_dst_check,
2488         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2489         .entry_size             =       sizeof(struct rtable),
2490 };
2491
2492
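/*
 * Replace *rp with a "blackhole" copy of the route: the addressing
 * information is preserved but input and output are dst_discard, so
 * packets using it are silently dropped.  Used below when
 * __xfrm_lookup() returns -EREMOTE.
 */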
2493 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2494 {
2495         struct rtable *ort = *rp;
2496         struct rtable *rt = (struct rtable *)
2497                 dst_alloc(&ipv4_dst_blackhole_ops);
2498
2499         if (rt) {
2500                 struct dst_entry *new = &rt->u.dst;
2501
2502                 atomic_set(&new->__refcnt, 1);
2503                 new->__use = 1;
2504                 new->input = dst_discard;
2505                 new->output = dst_discard;
2506                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2507
2508                 new->dev = ort->u.dst.dev;
2509                 if (new->dev)
2510                         dev_hold(new->dev);
2511
2512                 rt->fl = ort->fl;
2513
2514                 rt->idev = ort->idev;
2515                 if (rt->idev)
2516                         in_dev_hold(rt->idev);
2517                 rt->rt_flags = ort->rt_flags;
2518                 rt->rt_type = ort->rt_type;
2519                 rt->rt_dst = ort->rt_dst;
2520                 rt->rt_src = ort->rt_src;
2521                 rt->rt_iif = ort->rt_iif;
2522                 rt->rt_gateway = ort->rt_gateway;
2523                 rt->rt_spec_dst = ort->rt_spec_dst;
2524                 rt->peer = ort->peer;
2525                 if (rt->peer)
2526                         atomic_inc(&rt->peer->refcnt);
2527
2528                 dst_free(new);
2529         }
2530
2531         dst_release(&(*rp)->u.dst);
2532         *rp = rt;
2533         return (rt ? 0 : -ENOMEM);
2534 }
2535
2536 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2537                          struct sock *sk, int flags)
2538 {
2539         int err;
2540
2541         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2542                 return err;
2543
2544         if (flp->proto) {
2545                 if (!flp->fl4_src)
2546                         flp->fl4_src = (*rp)->rt_src;
2547                 if (!flp->fl4_dst)
2548                         flp->fl4_dst = (*rp)->rt_dst;
2549                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2550                                     flags ? XFRM_LOOKUP_WAIT : 0);
2551                 if (err == -EREMOTE)
2552                         err = ipv4_dst_blackhole(rp, flp, sk);
2553
2554                 return err;
2555         }
2556
2557         return 0;
2558 }
2559
2560 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2561
2562 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2563 {
2564         return ip_route_output_flow(net, rp, flp, NULL, 0);
2565 }
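/*
 * Typical caller usage (illustrative sketch only; dst_ip is a placeholder
 * for the caller's destination address):
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dst_ip } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&init_net, &rt, &fl))
 *		return -EHOSTUNREACH;
 *	...
 *	ip_rt_put(rt);
 */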
2566
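/*
 * Serialise a cached route into an RTM_NEWROUTE netlink message: rtmsg
 * header, destination/source/oif/gateway attributes, metrics and cache
 * info (peer id, TCP timestamps, expiry, error), plus the incoming
 * interface or multicast forwarding state for input routes.
 */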
2567 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2568                         int nowait, unsigned int flags)
2569 {
2570         struct rtable *rt = (struct rtable*)skb->dst;
2571         struct rtmsg *r;
2572         struct nlmsghdr *nlh;
2573         long expires;
2574         u32 id = 0, ts = 0, tsage = 0, error;
2575
2576         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2577         if (nlh == NULL)
2578                 return -EMSGSIZE;
2579
2580         r = nlmsg_data(nlh);
2581         r->rtm_family    = AF_INET;
2582         r->rtm_dst_len  = 32;
2583         r->rtm_src_len  = 0;
2584         r->rtm_tos      = rt->fl.fl4_tos;
2585         r->rtm_table    = RT_TABLE_MAIN;
2586         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2587         r->rtm_type     = rt->rt_type;
2588         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2589         r->rtm_protocol = RTPROT_UNSPEC;
2590         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2591         if (rt->rt_flags & RTCF_NOTIFY)
2592                 r->rtm_flags |= RTM_F_NOTIFY;
2593
2594         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2595
2596         if (rt->fl.fl4_src) {
2597                 r->rtm_src_len = 32;
2598                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2599         }
2600         if (rt->u.dst.dev)
2601                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2602 #ifdef CONFIG_NET_CLS_ROUTE
2603         if (rt->u.dst.tclassid)
2604                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2605 #endif
2606         if (rt->fl.iif)
2607                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2608         else if (rt->rt_src != rt->fl.fl4_src)
2609                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2610
2611         if (rt->rt_dst != rt->rt_gateway)
2612                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2613
2614         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2615                 goto nla_put_failure;
2616
2617         error = rt->u.dst.error;
2618         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2619         if (rt->peer) {
2620                 id = rt->peer->ip_id_count;
2621                 if (rt->peer->tcp_ts_stamp) {
2622                         ts = rt->peer->tcp_ts;
2623                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2624                 }
2625         }
2626
2627         if (rt->fl.iif) {
2628 #ifdef CONFIG_IP_MROUTE
2629                 __be32 dst = rt->rt_dst;
2630
2631                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2632                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2633                         int err = ipmr_get_route(skb, r, nowait);
2634                         if (err <= 0) {
2635                                 if (!nowait) {
2636                                         if (err == 0)
2637                                                 return 0;
2638                                         goto nla_put_failure;
2639                                 } else {
2640                                         if (err == -EMSGSIZE)
2641                                                 goto nla_put_failure;
2642                                         error = err;
2643                                 }
2644                         }
2645                 } else
2646 #endif
2647                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2648         }
2649
2650         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2651                                expires, error) < 0)
2652                 goto nla_put_failure;
2653
2654         return nlmsg_end(skb, nlh);
2655
2656 nla_put_failure:
2657         nlmsg_cancel(skb, nlh);
2658         return -EMSGSIZE;
2659 }
2660
2661 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2662 {
2663         struct net *net = in_skb->sk->sk_net;
2664         struct rtmsg *rtm;
2665         struct nlattr *tb[RTA_MAX+1];
2666         struct rtable *rt = NULL;
2667         __be32 dst = 0;
2668         __be32 src = 0;
2669         u32 iif;
2670         int err;
2671         struct sk_buff *skb;
2672
2673         if (net != &init_net)
2674                 return -EINVAL;
2675
2676         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2677         if (err < 0)
2678                 goto errout;
2679
2680         rtm = nlmsg_data(nlh);
2681
2682         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2683         if (skb == NULL) {
2684                 err = -ENOBUFS;
2685                 goto errout;
2686         }
2687
2688         /* Reserve room for dummy headers; this skb can pass
2689            through a good chunk of the routing engine.
2690          */
2691         skb_reset_mac_header(skb);
2692         skb_reset_network_header(skb);
2693
2694         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2695         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2696         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2697
2698         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2699         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2700         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2701
2702         if (iif) {
2703                 struct net_device *dev;
2704
2705                 dev = __dev_get_by_index(&init_net, iif);
2706                 if (dev == NULL) {
2707                         err = -ENODEV;
2708                         goto errout_free;
2709                 }
2710
2711                 skb->protocol   = htons(ETH_P_IP);
2712                 skb->dev        = dev;
2713                 local_bh_disable();
2714                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2715                 local_bh_enable();
2716
2717                 rt = (struct rtable*) skb->dst;
2718                 if (err == 0 && rt->u.dst.error)
2719                         err = -rt->u.dst.error;
2720         } else {
2721                 struct flowi fl = {
2722                         .nl_u = {
2723                                 .ip4_u = {
2724                                         .daddr = dst,
2725                                         .saddr = src,
2726                                         .tos = rtm->rtm_tos,
2727                                 },
2728                         },
2729                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2730                 };
2731                 err = ip_route_output_key(&init_net, &rt, &fl);
2732         }
2733
2734         if (err)
2735                 goto errout_free;
2736
2737         skb->dst = &rt->u.dst;
2738         if (rtm->rtm_flags & RTM_F_NOTIFY)
2739                 rt->rt_flags |= RTCF_NOTIFY;
2740
2741         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2742                                 RTM_NEWROUTE, 0, 0);
2743         if (err <= 0)
2744                 goto errout_free;
2745
2746         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2747 errout:
2748         return err;
2749
2750 errout_free:
2751         kfree_skb(skb);
2752         goto errout;
2753 }
2754
2755 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2756 {
2757         struct rtable *rt;
2758         int h, s_h;
2759         int idx, s_idx;
2760
2761         s_h = cb->args[0];
2762         if (s_h < 0)
2763                 s_h = 0;
2764         s_idx = idx = cb->args[1];
2765         for (h = s_h; h <= rt_hash_mask; h++) {
2766                 rcu_read_lock_bh();
2767                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2768                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2769                         if (idx < s_idx)
2770                                 continue;
2771                         skb->dst = dst_clone(&rt->u.dst);
2772                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2773                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2774                                          1, NLM_F_MULTI) <= 0) {
2775                                 dst_release(xchg(&skb->dst, NULL));
2776                                 rcu_read_unlock_bh();
2777                                 goto done;
2778                         }
2779                         dst_release(xchg(&skb->dst, NULL));
2780                 }
2781                 rcu_read_unlock_bh();
2782                 s_idx = 0;
2783         }
2784
2785 done:
2786         cb->args[0] = h;
2787         cb->args[1] = idx;
2788         return skb->len;
2789 }
2790
2791 void ip_rt_multicast_event(struct in_device *in_dev)
2792 {
2793         rt_cache_flush(0);
2794 }
2795
2796 #ifdef CONFIG_SYSCTL
2797 static int flush_delay;
2798
2799 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2800                                         struct file *filp, void __user *buffer,
2801                                         size_t *lenp, loff_t *ppos)
2802 {
2803         if (write) {
2804                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2805                 rt_cache_flush(flush_delay);
2806                 return 0;
2807         }
2808
2809         return -EINVAL;
2810 }
2811
2812 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2813                                                 int __user *name,
2814                                                 int nlen,
2815                                                 void __user *oldval,
2816                                                 size_t __user *oldlenp,
2817                                                 void __user *newval,
2818                                                 size_t newlen)
2819 {
2820         int delay;
2821         if (newlen != sizeof(int))
2822                 return -EINVAL;
2823         if (get_user(delay, (int __user *)newval))
2824                 return -EFAULT;
2825         rt_cache_flush(delay);
2826         return 0;
2827 }
2828
2829 ctl_table ipv4_route_table[] = {
2830         {
2831                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2832                 .procname       = "flush",
2833                 .data           = &flush_delay,
2834                 .maxlen         = sizeof(int),
2835                 .mode           = 0200,
2836                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2837                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2838         },
2839         {
2840                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2841                 .procname       = "min_delay",
2842                 .data           = &ip_rt_min_delay,
2843                 .maxlen         = sizeof(int),
2844                 .mode           = 0644,
2845                 .proc_handler   = &proc_dointvec_jiffies,
2846                 .strategy       = &sysctl_jiffies,
2847         },
2848         {
2849                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2850                 .procname       = "max_delay",
2851                 .data           = &ip_rt_max_delay,
2852                 .maxlen         = sizeof(int),
2853                 .mode           = 0644,
2854                 .proc_handler   = &proc_dointvec_jiffies,
2855                 .strategy       = &sysctl_jiffies,
2856         },
2857         {
2858                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2859                 .procname       = "gc_thresh",
2860                 .data           = &ipv4_dst_ops.gc_thresh,
2861                 .maxlen         = sizeof(int),
2862                 .mode           = 0644,
2863                 .proc_handler   = &proc_dointvec,
2864         },
2865         {
2866                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2867                 .procname       = "max_size",
2868                 .data           = &ip_rt_max_size,
2869                 .maxlen         = sizeof(int),
2870                 .mode           = 0644,
2871                 .proc_handler   = &proc_dointvec,
2872         },
2873         {
2874                 /*  Deprecated. Use gc_min_interval_ms */
2875
2876                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2877                 .procname       = "gc_min_interval",
2878                 .data           = &ip_rt_gc_min_interval,
2879                 .maxlen         = sizeof(int),
2880                 .mode           = 0644,
2881                 .proc_handler   = &proc_dointvec_jiffies,
2882                 .strategy       = &sysctl_jiffies,
2883         },
2884         {
2885                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2886                 .procname       = "gc_min_interval_ms",
2887                 .data           = &ip_rt_gc_min_interval,
2888                 .maxlen         = sizeof(int),
2889                 .mode           = 0644,
2890                 .proc_handler   = &proc_dointvec_ms_jiffies,
2891                 .strategy       = &sysctl_ms_jiffies,
2892         },
2893         {
2894                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2895                 .procname       = "gc_timeout",
2896                 .data           = &ip_rt_gc_timeout,
2897                 .maxlen         = sizeof(int),
2898                 .mode           = 0644,
2899                 .proc_handler   = &proc_dointvec_jiffies,
2900                 .strategy       = &sysctl_jiffies,
2901         },
2902         {
2903                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2904                 .procname       = "gc_interval",
2905                 .data           = &ip_rt_gc_interval,
2906                 .maxlen         = sizeof(int),
2907                 .mode           = 0644,
2908                 .proc_handler   = &proc_dointvec_jiffies,
2909                 .strategy       = &sysctl_jiffies,
2910         },
2911         {
2912                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2913                 .procname       = "redirect_load",
2914                 .data           = &ip_rt_redirect_load,
2915                 .maxlen         = sizeof(int),
2916                 .mode           = 0644,
2917                 .proc_handler   = &proc_dointvec,
2918         },
2919         {
2920                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2921                 .procname       = "redirect_number",
2922                 .data           = &ip_rt_redirect_number,
2923                 .maxlen         = sizeof(int),
2924                 .mode           = 0644,
2925                 .proc_handler   = &proc_dointvec,
2926         },
2927         {
2928                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2929                 .procname       = "redirect_silence",
2930                 .data           = &ip_rt_redirect_silence,
2931                 .maxlen         = sizeof(int),
2932                 .mode           = 0644,
2933                 .proc_handler   = &proc_dointvec,
2934         },
2935         {
2936                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2937                 .procname       = "error_cost",
2938                 .data           = &ip_rt_error_cost,
2939                 .maxlen         = sizeof(int),
2940                 .mode           = 0644,
2941                 .proc_handler   = &proc_dointvec,
2942         },
2943         {
2944                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2945                 .procname       = "error_burst",
2946                 .data           = &ip_rt_error_burst,
2947                 .maxlen         = sizeof(int),
2948                 .mode           = 0644,
2949                 .proc_handler   = &proc_dointvec,
2950         },
2951         {
2952                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2953                 .procname       = "gc_elasticity",
2954                 .data           = &ip_rt_gc_elasticity,
2955                 .maxlen         = sizeof(int),
2956                 .mode           = 0644,
2957                 .proc_handler   = &proc_dointvec,
2958         },
2959         {
2960                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2961                 .procname       = "mtu_expires",
2962                 .data           = &ip_rt_mtu_expires,
2963                 .maxlen         = sizeof(int),
2964                 .mode           = 0644,
2965                 .proc_handler   = &proc_dointvec_jiffies,
2966                 .strategy       = &sysctl_jiffies,
2967         },
2968         {
2969                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2970                 .procname       = "min_pmtu",
2971                 .data           = &ip_rt_min_pmtu,
2972                 .maxlen         = sizeof(int),
2973                 .mode           = 0644,
2974                 .proc_handler   = &proc_dointvec,
2975         },
2976         {
2977                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2978                 .procname       = "min_adv_mss",
2979                 .data           = &ip_rt_min_advmss,
2980                 .maxlen         = sizeof(int),
2981                 .mode           = 0644,
2982                 .proc_handler   = &proc_dointvec,
2983         },
2984         {
2985                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2986                 .procname       = "secret_interval",
2987                 .data           = &ip_rt_secret_interval,
2988                 .maxlen         = sizeof(int),
2989                 .mode           = 0644,
2990                 .proc_handler   = &proc_dointvec_jiffies,
2991                 .strategy       = &sysctl_jiffies,
2992         },
2993         { .ctl_name = 0 }
2994 };
2995 #endif
2996
2997 #ifdef CONFIG_NET_CLS_ROUTE
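/*
 * Per-CPU accounting area for the route classifier: 256 entries per
 * CPU, one per routing realm, allocated in ip_rt_init() below.
 */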
2998 struct ip_rt_acct *ip_rt_acct __read_mostly;
2999 #endif /* CONFIG_NET_CLS_ROUTE */
3000
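/*
 * "rhash_entries=N" on the kernel command line (e.g. rhash_entries=262144)
 * requests an explicit route cache hash size; when left at zero the table
 * is sized from available memory in ip_rt_init().
 */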
3001 static __initdata unsigned long rhash_entries;
3002 static int __init set_rhash_entries(char *str)
3003 {
3004         if (!str)
3005                 return 0;
3006         rhash_entries = simple_strtoul(str, &str, 0);
3007         return 1;
3008 }
3009 __setup("rhash_entries=", set_rhash_entries);
3010
3011 int __init ip_rt_init(void)
3012 {
3013         int rc = 0;
3014
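        /*
         * Seed the route cache hash with a value derived from the amount
         * of physical memory and the current jiffies count, so the chain
         * a given flow hashes to is not trivially predictable.
         */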
3015         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3016                              (jiffies ^ (jiffies >> 7)));
3017
3018 #ifdef CONFIG_NET_CLS_ROUTE
3019         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3020         if (!ip_rt_acct)
3021                 panic("IP: failed to allocate ip_rt_acct\n");
3022 #endif
3023
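        /*
         * Slab cache backing struct rtable; SLAB_PANIC stops the boot if
         * it cannot be created.  The blackhole dst_ops share the same
         * cache rather than keeping a slab of their own.
         */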
3024         ipv4_dst_ops.kmem_cachep =
3025                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3026                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3027
3028         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3029
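        /*
         * The hash table itself is sized from available memory unless
         * rhash_entries= overrode it; the resulting log2 size and bucket
         * mask come back through rt_hash_log and rt_hash_mask.
         */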
3030         rt_hash_table = (struct rt_hash_bucket *)
3031                 alloc_large_system_hash("IP route cache",
3032                                         sizeof(struct rt_hash_bucket),
3033                                         rhash_entries,
3034                                         (num_physpages >= 128 * 1024) ?
3035                                         15 : 17,
3036                                         0,
3037                                         &rt_hash_log,
3038                                         &rt_hash_mask,
3039                                         0);
3040         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3041         rt_hash_lock_init();
3042
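        /*
         * The GC threshold defaults to the bucket count, and the cache is
         * hard-capped at 16 entries per bucket.
         */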
3043         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3044         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3045
3046         devinet_init();
3047         ip_fib_init();
3048
3049         setup_timer(&rt_flush_timer, rt_run_flush, 0);
3050         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3051
3052         /* All the timers, started at system startup, tend
3053            to synchronize. Perturb them a bit.
3054          */
3055         schedule_delayed_work(&expires_work,
3056                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3057
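        /*
         * The secret rebuild timer is likewise started at a random point
         * between one and two intervals from now, so it does not line up
         * with other boot-time timers.
         */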
3058         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3059                 ip_rt_secret_interval;
3060         add_timer(&rt_secret_timer);
3061
3062         if (ip_rt_proc_init(&init_net))
3063                 printk(KERN_ERR "Unable to create route proc files\n");
3064 #ifdef CONFIG_XFRM
3065         xfrm_init();
3066         xfrm4_init();
3067 #endif
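        /* Answer RTM_GETROUTE queries (e.g. "ip route get") over rtnetlink. */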
3068         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3069
3070         return rc;
3071 }
3072
3073 EXPORT_SYMBOL(__ip_select_ident);
3074 EXPORT_SYMBOL(ip_route_input);
3075 EXPORT_SYMBOL(ip_route_output_key);