tcp: fix ICMP-RTO war
[safe/jmp/linux-2.6] net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110
111 #define RT_FL_TOS(oldflp) \
112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113
114 #define IP_MAX_MTU      0xFFF0
115
116 #define RT_GC_TIMEOUT (300*HZ)
117
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
122 static int ip_rt_redirect_number __read_mostly  = 9;
123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly       = HZ;
126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly    = 8;
128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly       = 256;
131 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
132 static int rt_chain_length_max __read_mostly    = 20;
133
134 static struct delayed_work expires_work;
135 static unsigned long expires_ljiffies;
136
137 /*
138  *      Interface to generic destination cache.
139  */
140
141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
142 static void              ipv4_dst_destroy(struct dst_entry *dst);
143 static void              ipv4_dst_ifdown(struct dst_entry *dst,
144                                          struct net_device *dev, int how);
145 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
146 static void              ipv4_link_failure(struct sk_buff *skb);
147 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
148 static int rt_garbage_collect(struct dst_ops *ops);
149 static void rt_emergency_hash_rebuild(struct net *net);
150
151
152 static struct dst_ops ipv4_dst_ops = {
153         .family =               AF_INET,
154         .protocol =             cpu_to_be16(ETH_P_IP),
155         .gc =                   rt_garbage_collect,
156         .check =                ipv4_dst_check,
157         .destroy =              ipv4_dst_destroy,
158         .ifdown =               ipv4_dst_ifdown,
159         .negative_advice =      ipv4_negative_advice,
160         .link_failure =         ipv4_link_failure,
161         .update_pmtu =          ip_rt_update_pmtu,
162         .local_out =            __ip_local_out,
163         .entries =              ATOMIC_INIT(0),
164 };
165
166 #define ECN_OR_COST(class)      TC_PRIO_##class
167
168 const __u8 ip_tos2prio[16] = {
169         TC_PRIO_BESTEFFORT,
170         ECN_OR_COST(FILLER),
171         TC_PRIO_BESTEFFORT,
172         ECN_OR_COST(BESTEFFORT),
173         TC_PRIO_BULK,
174         ECN_OR_COST(BULK),
175         TC_PRIO_BULK,
176         ECN_OR_COST(BULK),
177         TC_PRIO_INTERACTIVE,
178         ECN_OR_COST(INTERACTIVE),
179         TC_PRIO_INTERACTIVE,
180         ECN_OR_COST(INTERACTIVE),
181         TC_PRIO_INTERACTIVE_BULK,
182         ECN_OR_COST(INTERACTIVE_BULK),
183         TC_PRIO_INTERACTIVE_BULK,
184         ECN_OR_COST(INTERACTIVE_BULK)
185 };
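
/*
 * How this table is consumed, as a rough sketch (the in-tree helper this
 * mirrors is assumed to be rt_tos2priority() in include/net/route.h): the
 * four TOS bits of the IP header, shifted right by one, index directly into
 * ip_tos2prio[], e.g.
 *
 *	static inline char tos2priority_sketch(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * IPTOS_TOS() masks off the precedence bits and the low bit, so the index
 * always stays within the 16 entries above.
 */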
186
187
188 /*
189  * Route cache.
190  */
191
192 /* The locking scheme is rather straightforward:
193  *
194  * 1) Read-Copy Update protects the buckets of the central route hash.
195  * 2) Only writers remove entries, and they hold the lock
196  *    as they look at rtable reference counts.
197  * 3) Only readers acquire references to rtable entries,
198  *    they do so with atomic increments and with the
199  *    lock held.
200  */
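
/*
 * A minimal sketch of how those rules look in code elsewhere in this file
 * (illustration only; rt_hash_table, rt_hash_lock_addr() and rt_free() are
 * the helpers defined below):
 *
 *	// Reader: RCU-protected traversal, no bucket lock taken.
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next))
 *		;	// inspect rth
 *	rcu_read_unlock_bh();
 *
 *	// Writer: per-bucket spinlock while unlinking; rt_free() defers the
 *	// actual release through call_rcu_bh() so in-flight readers finish.
 *	spin_lock_bh(rt_hash_lock_addr(hash));
 *	*rthp = rth->u.dst.rt_next;
 *	rt_free(rth);
 *	spin_unlock_bh(rt_hash_lock_addr(hash));
 */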
201
202 struct rt_hash_bucket {
203         struct rtable   *chain;
204 };
205
206 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
207         defined(CONFIG_PROVE_LOCKING)
208 /*
209  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
210  * The size of this table is a power of two and depends on the number of CPUs.
211  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
212  */
213 #ifdef CONFIG_LOCKDEP
214 # define RT_HASH_LOCK_SZ        256
215 #else
216 # if NR_CPUS >= 32
217 #  define RT_HASH_LOCK_SZ       4096
218 # elif NR_CPUS >= 16
219 #  define RT_HASH_LOCK_SZ       2048
220 # elif NR_CPUS >= 8
221 #  define RT_HASH_LOCK_SZ       1024
222 # elif NR_CPUS >= 4
223 #  define RT_HASH_LOCK_SZ       512
224 # else
225 #  define RT_HASH_LOCK_SZ       256
226 # endif
227 #endif
228
229 static spinlock_t       *rt_hash_locks;
230 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
231
232 static __init void rt_hash_lock_init(void)
233 {
234         int i;
235
236         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
237                         GFP_KERNEL);
238         if (!rt_hash_locks)
239                 panic("IP: failed to allocate rt_hash_locks\n");
240
241         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
242                 spin_lock_init(&rt_hash_locks[i]);
243 }
244 #else
245 # define rt_hash_lock_addr(slot) NULL
246
247 static inline void rt_hash_lock_init(void)
248 {
249 }
250 #endif
251
252 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
253 static unsigned                 rt_hash_mask __read_mostly;
254 static unsigned int             rt_hash_log  __read_mostly;
255
256 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
257 #define RT_CACHE_STAT_INC(field) \
258         (__raw_get_cpu_var(rt_cache_stat).field++)
259
260 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
261                 int genid)
262 {
263         return jhash_3words((__force u32)(__be32)(daddr),
264                             (__force u32)(__be32)(saddr),
265                             idx, genid)
266                 & rt_hash_mask;
267 }
268
269 static inline int rt_genid(struct net *net)
270 {
271         return atomic_read(&net->ipv4.rt_genid);
272 }
273
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276         struct seq_net_private p;
277         int bucket;
278         int genid;
279 };
280
281 static struct rtable *rt_cache_get_first(struct seq_file *seq)
282 {
283         struct rt_cache_iter_state *st = seq->private;
284         struct rtable *r = NULL;
285
286         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
287                 if (!rt_hash_table[st->bucket].chain)
288                         continue;
289                 rcu_read_lock_bh();
290                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
291                 while (r) {
292                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
293                             r->rt_genid == st->genid)
294                                 return r;
295                         r = rcu_dereference(r->u.dst.rt_next);
296                 }
297                 rcu_read_unlock_bh();
298         }
299         return r;
300 }
301
302 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
303                                           struct rtable *r)
304 {
305         struct rt_cache_iter_state *st = seq->private;
306
307         r = r->u.dst.rt_next;
308         while (!r) {
309                 rcu_read_unlock_bh();
310                 do {
311                         if (--st->bucket < 0)
312                                 return NULL;
313                 } while (!rt_hash_table[st->bucket].chain);
314                 rcu_read_lock_bh();
315                 r = rt_hash_table[st->bucket].chain;
316         }
317         return rcu_dereference(r);
318 }
319
320 static struct rtable *rt_cache_get_next(struct seq_file *seq,
321                                         struct rtable *r)
322 {
323         struct rt_cache_iter_state *st = seq->private;
324         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
325                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
326                         continue;
327                 if (r->rt_genid == st->genid)
328                         break;
329         }
330         return r;
331 }
332
333 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
334 {
335         struct rtable *r = rt_cache_get_first(seq);
336
337         if (r)
338                 while (pos && (r = rt_cache_get_next(seq, r)))
339                         --pos;
340         return pos ? NULL : r;
341 }
342
343 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
344 {
345         struct rt_cache_iter_state *st = seq->private;
346         if (*pos)
347                 return rt_cache_get_idx(seq, *pos - 1);
348         st->genid = rt_genid(seq_file_net(seq));
349         return SEQ_START_TOKEN;
350 }
351
352 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
353 {
354         struct rtable *r;
355
356         if (v == SEQ_START_TOKEN)
357                 r = rt_cache_get_first(seq);
358         else
359                 r = rt_cache_get_next(seq, v);
360         ++*pos;
361         return r;
362 }
363
364 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
365 {
366         if (v && v != SEQ_START_TOKEN)
367                 rcu_read_unlock_bh();
368 }
369
370 static int rt_cache_seq_show(struct seq_file *seq, void *v)
371 {
372         if (v == SEQ_START_TOKEN)
373                 seq_printf(seq, "%-127s\n",
374                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
375                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
376                            "HHUptod\tSpecDst");
377         else {
378                 struct rtable *r = v;
379                 int len;
380
381                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
382                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
383                         r->u.dst.dev ? r->u.dst.dev->name : "*",
384                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
385                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
386                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
387                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
388                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
389                         dst_metric(&r->u.dst, RTAX_WINDOW),
390                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
391                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
392                         r->fl.fl4_tos,
393                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
394                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
395                                        dev_queue_xmit) : 0,
396                         r->rt_spec_dst, &len);
397
398                 seq_printf(seq, "%*s\n", 127 - len, "");
399         }
400         return 0;
401 }
402
403 static const struct seq_operations rt_cache_seq_ops = {
404         .start  = rt_cache_seq_start,
405         .next   = rt_cache_seq_next,
406         .stop   = rt_cache_seq_stop,
407         .show   = rt_cache_seq_show,
408 };
409
410 static int rt_cache_seq_open(struct inode *inode, struct file *file)
411 {
412         return seq_open_net(inode, file, &rt_cache_seq_ops,
413                         sizeof(struct rt_cache_iter_state));
414 }
415
416 static const struct file_operations rt_cache_seq_fops = {
417         .owner   = THIS_MODULE,
418         .open    = rt_cache_seq_open,
419         .read    = seq_read,
420         .llseek  = seq_lseek,
421         .release = seq_release_net,
422 };
423
424
425 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
426 {
427         int cpu;
428
429         if (*pos == 0)
430                 return SEQ_START_TOKEN;
431
432         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
433                 if (!cpu_possible(cpu))
434                         continue;
435                 *pos = cpu+1;
436                 return &per_cpu(rt_cache_stat, cpu);
437         }
438         return NULL;
439 }
440
441 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
442 {
443         int cpu;
444
445         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
446                 if (!cpu_possible(cpu))
447                         continue;
448                 *pos = cpu+1;
449                 return &per_cpu(rt_cache_stat, cpu);
450         }
451         return NULL;
452
453 }
454
455 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
456 {
457
458 }
459
460 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
461 {
462         struct rt_cache_stat *st = v;
463
464         if (v == SEQ_START_TOKEN) {
465                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
466                 return 0;
467         }
468
469         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
470                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
471                    atomic_read(&ipv4_dst_ops.entries),
472                    st->in_hit,
473                    st->in_slow_tot,
474                    st->in_slow_mc,
475                    st->in_no_route,
476                    st->in_brd,
477                    st->in_martian_dst,
478                    st->in_martian_src,
479
480                    st->out_hit,
481                    st->out_slow_tot,
482                    st->out_slow_mc,
483
484                    st->gc_total,
485                    st->gc_ignored,
486                    st->gc_goal_miss,
487                    st->gc_dst_overflow,
488                    st->in_hlist_search,
489                    st->out_hlist_search
490                 );
491         return 0;
492 }
493
494 static const struct seq_operations rt_cpu_seq_ops = {
495         .start  = rt_cpu_seq_start,
496         .next   = rt_cpu_seq_next,
497         .stop   = rt_cpu_seq_stop,
498         .show   = rt_cpu_seq_show,
499 };
500
501
502 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
503 {
504         return seq_open(file, &rt_cpu_seq_ops);
505 }
506
507 static const struct file_operations rt_cpu_seq_fops = {
508         .owner   = THIS_MODULE,
509         .open    = rt_cpu_seq_open,
510         .read    = seq_read,
511         .llseek  = seq_lseek,
512         .release = seq_release,
513 };
514
515 #ifdef CONFIG_NET_CLS_ROUTE
516 static int rt_acct_proc_show(struct seq_file *m, void *v)
517 {
518         struct ip_rt_acct *dst, *src;
519         unsigned int i, j;
520
521         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
522         if (!dst)
523                 return -ENOMEM;
524
525         for_each_possible_cpu(i) {
526                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
527                 for (j = 0; j < 256; j++) {
528                         dst[j].o_bytes   += src[j].o_bytes;
529                         dst[j].o_packets += src[j].o_packets;
530                         dst[j].i_bytes   += src[j].i_bytes;
531                         dst[j].i_packets += src[j].i_packets;
532                 }
533         }
534
535         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
536         kfree(dst);
537         return 0;
538 }
539
540 static int rt_acct_proc_open(struct inode *inode, struct file *file)
541 {
542         return single_open(file, rt_acct_proc_show, NULL);
543 }
544
545 static const struct file_operations rt_acct_proc_fops = {
546         .owner          = THIS_MODULE,
547         .open           = rt_acct_proc_open,
548         .read           = seq_read,
549         .llseek         = seq_lseek,
550         .release        = single_release,
551 };
552 #endif
553
554 static int __net_init ip_rt_do_proc_init(struct net *net)
555 {
556         struct proc_dir_entry *pde;
557
558         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
559                         &rt_cache_seq_fops);
560         if (!pde)
561                 goto err1;
562
563         pde = proc_create("rt_cache", S_IRUGO,
564                           net->proc_net_stat, &rt_cpu_seq_fops);
565         if (!pde)
566                 goto err2;
567
568 #ifdef CONFIG_NET_CLS_ROUTE
569         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
570         if (!pde)
571                 goto err3;
572 #endif
573         return 0;
574
575 #ifdef CONFIG_NET_CLS_ROUTE
576 err3:
577         remove_proc_entry("rt_cache", net->proc_net_stat);
578 #endif
579 err2:
580         remove_proc_entry("rt_cache", net->proc_net);
581 err1:
582         return -ENOMEM;
583 }
584
585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
586 {
587         remove_proc_entry("rt_cache", net->proc_net_stat);
588         remove_proc_entry("rt_cache", net->proc_net);
589 #ifdef CONFIG_NET_CLS_ROUTE
590         remove_proc_entry("rt_acct", net->proc_net);
591 #endif
592 }
593
594 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
595         .init = ip_rt_do_proc_init,
596         .exit = ip_rt_do_proc_exit,
597 };
598
599 static int __init ip_rt_proc_init(void)
600 {
601         return register_pernet_subsys(&ip_rt_proc_ops);
602 }
603
604 #else
605 static inline int ip_rt_proc_init(void)
606 {
607         return 0;
608 }
609 #endif /* CONFIG_PROC_FS */
610
611 static inline void rt_free(struct rtable *rt)
612 {
613         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
614 }
615
616 static inline void rt_drop(struct rtable *rt)
617 {
618         ip_rt_put(rt);
619         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
620 }
621
622 static inline int rt_fast_clean(struct rtable *rth)
623 {
624         /* Kill broadcast/multicast entries very aggressively, if they
625            collide in the hash table with more useful entries */
626         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
627                 rth->fl.iif && rth->u.dst.rt_next;
628 }
629
630 static inline int rt_valuable(struct rtable *rth)
631 {
632         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
633                 rth->u.dst.expires;
634 }
635
636 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
637 {
638         unsigned long age;
639         int ret = 0;
640
641         if (atomic_read(&rth->u.dst.__refcnt))
642                 goto out;
643
644         ret = 1;
645         if (rth->u.dst.expires &&
646             time_after_eq(jiffies, rth->u.dst.expires))
647                 goto out;
648
649         age = jiffies - rth->u.dst.lastuse;
650         ret = 0;
651         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
652             (age <= tmo2 && rt_valuable(rth)))
653                 goto out;
654         ret = 1;
655 out:    return ret;
656 }
657
658 /* Bits of score are:
659  * 31: very valuable
660  * 30: not quite useless
661  * 29..0: usage counter
662  */
663 static inline u32 rt_score(struct rtable *rt)
664 {
665         u32 score = jiffies - rt->u.dst.lastuse;
666
667         score = ~score & ~(3<<30);
668
669         if (rt_valuable(rt))
670                 score |= (1<<31);
671
672         if (!rt->fl.iif ||
673             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
674                 score |= (1<<30);
675
676         return score;
677 }
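
/*
 * Worked example for the score layout above (a sketch, not extra policy):
 * take two unreferenced entries in one bucket, one idle for 10 jiffies and
 * one idle for 1000 jiffies, neither redirected nor carrying an expiry.
 * Their high-order bits are identical, so the comparison falls through to
 * the ~age usage counter and the entry idle for 1000 jiffies scores lower;
 * rt_intern_hash() below remembers the minimal score seen while walking a
 * chain and evicts that entry once the chain grows past ip_rt_gc_elasticity.
 * An output route, or any route that is not broadcast/multicast/local,
 * additionally sets bit 30 and so outranks those entries regardless of age.
 */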
678
679 static inline bool rt_caching(const struct net *net)
680 {
681         return net->ipv4.current_rt_cache_rebuild_count <=
682                 net->ipv4.sysctl_rt_cache_rebuild_count;
683 }
684
685 static inline bool compare_hash_inputs(const struct flowi *fl1,
686                                         const struct flowi *fl2)
687 {
688         return (__force u32)(((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
689                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr) |
690                 (fl1->iif ^ fl2->iif)) == 0);
691 }
692
693 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
694 {
695         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
696                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
697                 (fl1->mark ^ fl2->mark) |
698                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
699                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
700                 (fl1->oif ^ fl2->oif) |
701                 (fl1->iif ^ fl2->iif)) == 0;
702 }
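
/*
 * Both helpers above use the same branch-free trick (sketch): x ^ y is zero
 * only when x == y, and OR-ing the per-field XORs is therefore zero exactly
 * when every field matches, i.e. compare_keys() behaves like
 *
 *	fl1->fl4_dst == fl2->fl4_dst && fl1->fl4_src == fl2->fl4_src &&
 *	fl1->mark == fl2->mark && fl1->oif == fl2->oif &&
 *	fl1->iif == fl2->iif && ...
 *
 * but compiles to straight-line code.  The u16 load of fl4_tos is assumed
 * to cover both the tos byte and the adjacent scope byte of the flow key
 * in a single comparison.
 */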
703
704 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
705 {
706         return net_eq(dev_net(rt1->u.dst.dev), dev_net(rt2->u.dst.dev));
707 }
708
709 static inline int rt_is_expired(struct rtable *rth)
710 {
711         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
712 }
713
714 /*
715  * Perform a full scan of the hash table and free all entries.
716  * Can be called by a softirq or a process.
717  * In the latter case, we want to reschedule if necessary.
718  */
719 static void rt_do_flush(int process_context)
720 {
721         unsigned int i;
722         struct rtable *rth, *next;
723         struct rtable * tail;
724
725         for (i = 0; i <= rt_hash_mask; i++) {
726                 if (process_context && need_resched())
727                         cond_resched();
728                 rth = rt_hash_table[i].chain;
729                 if (!rth)
730                         continue;
731
732                 spin_lock_bh(rt_hash_lock_addr(i));
733 #ifdef CONFIG_NET_NS
734                 {
735                 struct rtable ** prev, * p;
736
737                 rth = rt_hash_table[i].chain;
738
739                 /* defer releasing the head of the list after spin_unlock */
740                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
741                         if (!rt_is_expired(tail))
742                                 break;
743                 if (rth != tail)
744                         rt_hash_table[i].chain = tail;
745
746                 /* call rt_free on entries after the tail requiring flush */
747                 prev = &rt_hash_table[i].chain;
748                 for (p = *prev; p; p = next) {
749                         next = p->u.dst.rt_next;
750                         if (!rt_is_expired(p)) {
751                                 prev = &p->u.dst.rt_next;
752                         } else {
753                                 *prev = next;
754                                 rt_free(p);
755                         }
756                 }
757                 }
758 #else
759                 rth = rt_hash_table[i].chain;
760                 rt_hash_table[i].chain = NULL;
761                 tail = NULL;
762 #endif
763                 spin_unlock_bh(rt_hash_lock_addr(i));
764
765                 for (; rth != tail; rth = next) {
766                         next = rth->u.dst.rt_next;
767                         rt_free(rth);
768                 }
769         }
770 }
771
772 /*
773  * While freeing expired entries, we compute average chain length
774  * and standard deviation, using fixed-point arithmetic.
775  * This gives an estimation of rt_chain_length_max:
776  *  rt_chain_length_max = max(elasticity, AVG + 4*SD)
777  * We use 3 bits for the fractional part, and 29 (or 61) for the magnitude.
778  */
779
780 #define FRACT_BITS 3
781 #define ONE (1UL << FRACT_BITS)
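
/*
 * Worked example of the fixed-point bookkeeping (sketch): with FRACT_BITS
 * of 3, each entry counted in rt_check_expire() adds ONE == 8 to "length",
 * so a bucket holding 5 distinct entries contributes 40.  "sum" and "sum2"
 * accumulate these scaled lengths, avg = sum / samples and
 * sd = int_sqrt(sum2 / samples - avg * avg) stay in the same scale, and the
 * final (avg + 4 * sd) >> FRACT_BITS converts back to whole entries before
 * the comparison with ip_rt_gc_elasticity.
 */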
782
783 static void rt_check_expire(void)
784 {
785         static unsigned int rover;
786         unsigned int i = rover, goal;
787         struct rtable *rth, *aux, **rthp;
788         unsigned long samples = 0;
789         unsigned long sum = 0, sum2 = 0;
790         unsigned long delta;
791         u64 mult;
792
793         delta = jiffies - expires_ljiffies;
794         expires_ljiffies = jiffies;
795         mult = ((u64)delta) << rt_hash_log;
796         if (ip_rt_gc_timeout > 1)
797                 do_div(mult, ip_rt_gc_timeout);
798         goal = (unsigned int)mult;
799         if (goal > rt_hash_mask)
800                 goal = rt_hash_mask + 1;
801         for (; goal > 0; goal--) {
802                 unsigned long tmo = ip_rt_gc_timeout;
803                 unsigned long length;
804
805                 i = (i + 1) & rt_hash_mask;
806                 rthp = &rt_hash_table[i].chain;
807
808                 if (need_resched())
809                         cond_resched();
810
811                 samples++;
812
813                 if (*rthp == NULL)
814                         continue;
815                 length = 0;
816                 spin_lock_bh(rt_hash_lock_addr(i));
817                 while ((rth = *rthp) != NULL) {
818                         prefetch(rth->u.dst.rt_next);
819                         if (rt_is_expired(rth)) {
820                                 *rthp = rth->u.dst.rt_next;
821                                 rt_free(rth);
822                                 continue;
823                         }
824                         if (rth->u.dst.expires) {
825                                 /* Entry is expired even if it is in use */
826                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
827 nofree:
828                                         tmo >>= 1;
829                                         rthp = &rth->u.dst.rt_next;
830                                         /*
831                                          * We only count entries on
832                                          * a chain with equal hash inputs once
833                                          * so that entries for different QOS
834                                          * levels, and other non-hash input
835                                          * attributes don't unfairly skew
836                                          * the length computation
837                                          */
838                                         for (aux = rt_hash_table[i].chain;;) {
839                                                 if (aux == rth) {
840                                                         length += ONE;
841                                                         break;
842                                                 }
843                                                 if (compare_hash_inputs(&aux->fl, &rth->fl))
844                                                         break;
845                                                 aux = aux->u.dst.rt_next;
846                                         }
847                                         continue;
848                                 }
849                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout))
850                                 goto nofree;
851
852                         /* Cleanup aged off entries. */
853                         *rthp = rth->u.dst.rt_next;
854                         rt_free(rth);
855                 }
856                 spin_unlock_bh(rt_hash_lock_addr(i));
857                 sum += length;
858                 sum2 += length*length;
859         }
860         if (samples) {
861                 unsigned long avg = sum / samples;
862                 unsigned long sd = int_sqrt(sum2 / samples - avg*avg);
863                 rt_chain_length_max = max_t(unsigned long,
864                                         ip_rt_gc_elasticity,
865                                         (avg + 4*sd) >> FRACT_BITS);
866         }
867         rover = i;
868 }
869
870 /*
871  * rt_worker_func() is run in process context.
872  * We call rt_check_expire() to scan part of the hash table.
873  */
874 static void rt_worker_func(struct work_struct *work)
875 {
876         rt_check_expire();
877         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
878 }
879
880 /*
881  * Perturbation of rt_genid by a small quantity [1..256].
882  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
883  * many times (2^24) without reusing a recent rt_genid.
884  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
885  */
886 static void rt_cache_invalidate(struct net *net)
887 {
888         unsigned char shuffle;
889
890         get_random_bytes(&shuffle, sizeof(shuffle));
891         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
892 }
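
/*
 * A short illustration of why the bump above is enough (sketch): lookups
 * mix the generation id into the bucket hash and rt_is_expired() compares
 * the id stored in each entry with the current one,
 *
 *	hash = rt_hash(daddr, saddr, ifindex, rt_genid(net));
 *	...
 *	if (rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev)))
 *		continue;	// stale entry, never matched again
 *
 * so old entries simply stop matching and are reaped lazily by
 * rt_do_flush()/rt_check_expire() instead of being walked synchronously.
 */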
893
894 /*
895  * delay < 0  : invalidate cache (fast : entries will be deleted later)
896  * delay >= 0 : invalidate & flush cache (can be long)
897  */
898 void rt_cache_flush(struct net *net, int delay)
899 {
900         rt_cache_invalidate(net);
901         if (delay >= 0)
902                 rt_do_flush(!in_softirq());
903 }
904
905 /* Flush previous cache invalidated entries from the cache */
906 void rt_cache_flush_batch(void)
907 {
908         rt_do_flush(!in_softirq());
909 }
910
911 /*
912  * We change rt_genid and let gc do the cleanup
913  */
914 static void rt_secret_rebuild(unsigned long __net)
915 {
916         struct net *net = (struct net *)__net;
917         rt_cache_invalidate(net);
918         mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
919 }
920
921 static void rt_secret_rebuild_oneshot(struct net *net)
922 {
923         del_timer_sync(&net->ipv4.rt_secret_timer);
924         rt_cache_invalidate(net);
925         if (ip_rt_secret_interval) {
926                 net->ipv4.rt_secret_timer.expires += ip_rt_secret_interval;
927                 add_timer(&net->ipv4.rt_secret_timer);
928         }
929 }
930
931 static void rt_emergency_hash_rebuild(struct net *net)
932 {
933         if (net_ratelimit()) {
934                 printk(KERN_WARNING "Route hash chain too long!\n");
935                 printk(KERN_WARNING "Adjust your secret_interval!\n");
936         }
937
938         rt_secret_rebuild_oneshot(net);
939 }
940
941 /*
942    Short description of GC goals.
943
944    We want to build an algorithm which keeps the routing cache
945    at some equilibrium point, where the number of aged-off entries
946    stays approximately equal to the number of newly generated ones.
947
948    The current expiration strength is the variable "expire".
949    We try to adjust it dynamically, so that when networking
950    is idle "expire" is large enough to keep enough warm entries,
951    and when load increases it shrinks to limit the cache size.
952  */
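
/*
 * Worked example of the goal computation used below (sketch): with the
 * default ip_rt_gc_elasticity of 8, the cache is allowed roughly eight
 * entries per hash bucket before GC has real work to do,
 *
 *	goal = entries - (8 << rt_hash_log);
 *
 * so a table of 2^15 buckets tolerates about 256K entries.  Only above that
 * does "goal" go positive and eviction start in earnest; otherwise the pass
 * mostly adjusts "equilibrium" and the expire strength.
 */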
953
954 static int rt_garbage_collect(struct dst_ops *ops)
955 {
956         static unsigned long expire = RT_GC_TIMEOUT;
957         static unsigned long last_gc;
958         static int rover;
959         static int equilibrium;
960         struct rtable *rth, **rthp;
961         unsigned long now = jiffies;
962         int goal;
963
964         /*
965          * Garbage collection is pretty expensive,
966          * do not make it too frequently.
967          */
968
969         RT_CACHE_STAT_INC(gc_total);
970
971         if (now - last_gc < ip_rt_gc_min_interval &&
972             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
973                 RT_CACHE_STAT_INC(gc_ignored);
974                 goto out;
975         }
976
977         /* Calculate number of entries, which we want to expire now. */
978         goal = atomic_read(&ipv4_dst_ops.entries) -
979                 (ip_rt_gc_elasticity << rt_hash_log);
980         if (goal <= 0) {
981                 if (equilibrium < ipv4_dst_ops.gc_thresh)
982                         equilibrium = ipv4_dst_ops.gc_thresh;
983                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
984                 if (goal > 0) {
985                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
986                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
987                 }
988         } else {
989                 /* We are in dangerous area. Try to reduce cache really
990                  * aggressively.
991                  */
992                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
993                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
994         }
995
996         if (now - last_gc >= ip_rt_gc_min_interval)
997                 last_gc = now;
998
999         if (goal <= 0) {
1000                 equilibrium += goal;
1001                 goto work_done;
1002         }
1003
1004         do {
1005                 int i, k;
1006
1007                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
1008                         unsigned long tmo = expire;
1009
1010                         k = (k + 1) & rt_hash_mask;
1011                         rthp = &rt_hash_table[k].chain;
1012                         spin_lock_bh(rt_hash_lock_addr(k));
1013                         while ((rth = *rthp) != NULL) {
1014                                 if (!rt_is_expired(rth) &&
1015                                         !rt_may_expire(rth, tmo, expire)) {
1016                                         tmo >>= 1;
1017                                         rthp = &rth->u.dst.rt_next;
1018                                         continue;
1019                                 }
1020                                 *rthp = rth->u.dst.rt_next;
1021                                 rt_free(rth);
1022                                 goal--;
1023                         }
1024                         spin_unlock_bh(rt_hash_lock_addr(k));
1025                         if (goal <= 0)
1026                                 break;
1027                 }
1028                 rover = k;
1029
1030                 if (goal <= 0)
1031                         goto work_done;
1032
1033                 /* Goal is not achieved. We stop the process if:
1034
1035                    - expire has been reduced to zero; otherwise, expire is halved.
1036                    - the table is not full.
1037                    - we are called from interrupt.
1038                    - the jiffies check is just a fallback/debug loop breaker.
1039                      We will not spin here for a long time in any case.
1040                  */
1041
1042                 RT_CACHE_STAT_INC(gc_goal_miss);
1043
1044                 if (expire == 0)
1045                         break;
1046
1047                 expire >>= 1;
1048 #if RT_CACHE_DEBUG >= 2
1049                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
1050                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
1051 #endif
1052
1053                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1054                         goto out;
1055         } while (!in_softirq() && time_before_eq(jiffies, now));
1056
1057         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
1058                 goto out;
1059         if (net_ratelimit())
1060                 printk(KERN_WARNING "dst cache overflow\n");
1061         RT_CACHE_STAT_INC(gc_dst_overflow);
1062         return 1;
1063
1064 work_done:
1065         expire += ip_rt_gc_min_interval;
1066         if (expire > ip_rt_gc_timeout ||
1067             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
1068                 expire = ip_rt_gc_timeout;
1069 #if RT_CACHE_DEBUG >= 2
1070         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
1071                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
1072 #endif
1073 out:    return 0;
1074 }
1075
1076 static int rt_intern_hash(unsigned hash, struct rtable *rt,
1077                           struct rtable **rp, struct sk_buff *skb)
1078 {
1079         struct rtable   *rth, **rthp;
1080         unsigned long   now;
1081         struct rtable *cand, **candp;
1082         u32             min_score;
1083         int             chain_length;
1084         int attempts = !in_softirq();
1085
1086 restart:
1087         chain_length = 0;
1088         min_score = ~(u32)0;
1089         cand = NULL;
1090         candp = NULL;
1091         now = jiffies;
1092
1093         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1094                 /*
1095                  * If we're not caching, just tell the caller we
1096                  * were successful and don't touch the route.  The
1097                  * caller holds the sole reference to the cache entry, and
1098                  * it will be released when the caller is done with it.
1099                  * If we drop it here, the callers have no way to resolve routes
1100                  * when we're not caching.  Instead, just point *rp at rt, so
1101                  * the caller gets a single use out of the route.
1102                  * Note that we do rt_free on this new route entry, so that
1103                  * once its refcount hits zero, we are still able to reap it
1104                  * (thanks Alexey).
1105                  * Note also that rt_free uses call_rcu.  We don't actually
1106                  * need rcu protection here, this is just our path to get
1107                  * on the route gc list.
1108                  */
1109
1110                 if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1111                         int err = arp_bind_neighbour(&rt->u.dst);
1112                         if (err) {
1113                                 if (net_ratelimit())
1114                                         printk(KERN_WARNING
1115                                             "Neighbour table failure & not caching routes.\n");
1116                                 rt_drop(rt);
1117                                 return err;
1118                         }
1119                 }
1120
1121                 rt_free(rt);
1122                 goto skip_hashing;
1123         }
1124
1125         rthp = &rt_hash_table[hash].chain;
1126
1127         spin_lock_bh(rt_hash_lock_addr(hash));
1128         while ((rth = *rthp) != NULL) {
1129                 if (rt_is_expired(rth)) {
1130                         *rthp = rth->u.dst.rt_next;
1131                         rt_free(rth);
1132                         continue;
1133                 }
1134                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1135                         /* Put it first */
1136                         *rthp = rth->u.dst.rt_next;
1137                         /*
1138                          * Since lookup is lockfree, the deletion
1139                          * must be visible to another weakly ordered CPU before
1140                          * the insertion at the start of the hash chain.
1141                          */
1142                         rcu_assign_pointer(rth->u.dst.rt_next,
1143                                            rt_hash_table[hash].chain);
1144                         /*
1145                          * Since lookup is lockfree, the update writes
1146                          * must be ordered for consistency on SMP.
1147                          */
1148                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1149
1150                         dst_use(&rth->u.dst, now);
1151                         spin_unlock_bh(rt_hash_lock_addr(hash));
1152
1153                         rt_drop(rt);
1154                         if (rp)
1155                                 *rp = rth;
1156                         else
1157                                 skb_dst_set(skb, &rth->u.dst);
1158                         return 0;
1159                 }
1160
1161                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1162                         u32 score = rt_score(rth);
1163
1164                         if (score <= min_score) {
1165                                 cand = rth;
1166                                 candp = rthp;
1167                                 min_score = score;
1168                         }
1169                 }
1170
1171                 chain_length++;
1172
1173                 rthp = &rth->u.dst.rt_next;
1174         }
1175
1176         if (cand) {
1177                 /* ip_rt_gc_elasticity used to be the average chain length;
1178                  * when it is exceeded, gc becomes really aggressive.
1179                  *
1180                  * The second limit is less certain. At the moment it allows
1181                  * only 2 entries per bucket. We will see.
1182                  */
1183                 if (chain_length > ip_rt_gc_elasticity) {
1184                         *candp = cand->u.dst.rt_next;
1185                         rt_free(cand);
1186                 }
1187         } else {
1188                 if (chain_length > rt_chain_length_max) {
1189                         struct net *net = dev_net(rt->u.dst.dev);
1190                         int num = ++net->ipv4.current_rt_cache_rebuild_count;
1191                         if (!rt_caching(dev_net(rt->u.dst.dev))) {
1192                                 printk(KERN_WARNING "%s: %d rebuilds is over limit, route caching disabled\n",
1193                                         rt->u.dst.dev->name, num);
1194                         }
1195                         rt_emergency_hash_rebuild(dev_net(rt->u.dst.dev));
1196                 }
1197         }
1198
1199         /* Try to bind the route to arp only if it is an output
1200            route or on the unicast forwarding path.
1201          */
1202         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1203                 int err = arp_bind_neighbour(&rt->u.dst);
1204                 if (err) {
1205                         spin_unlock_bh(rt_hash_lock_addr(hash));
1206
1207                         if (err != -ENOBUFS) {
1208                                 rt_drop(rt);
1209                                 return err;
1210                         }
1211
1212                         /* Neighbour tables are full and nothing
1213                            can be released. Try to shrink the route cache;
1214                            it most likely holds some neighbour records.
1215                          */
1216                         if (attempts-- > 0) {
1217                                 int saved_elasticity = ip_rt_gc_elasticity;
1218                                 int saved_int = ip_rt_gc_min_interval;
1219                                 ip_rt_gc_elasticity     = 1;
1220                                 ip_rt_gc_min_interval   = 0;
1221                                 rt_garbage_collect(&ipv4_dst_ops);
1222                                 ip_rt_gc_min_interval   = saved_int;
1223                                 ip_rt_gc_elasticity     = saved_elasticity;
1224                                 goto restart;
1225                         }
1226
1227                         if (net_ratelimit())
1228                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1229                         rt_drop(rt);
1230                         return -ENOBUFS;
1231                 }
1232         }
1233
1234         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1235
1236 #if RT_CACHE_DEBUG >= 2
1237         if (rt->u.dst.rt_next) {
1238                 struct rtable *trt;
1239                 printk(KERN_DEBUG "rt_cache @%02x: %pI4",
1240                        hash, &rt->rt_dst);
1241                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1242                         printk(" . %pI4", &trt->rt_dst);
1243                 printk("\n");
1244         }
1245 #endif
1246         /*
1247          * Since lookup is lockfree, we must make sure
1248          * previous writes to rt are committed to memory
1249          * before making rt visible to other CPUs.
1250          */
1251         rcu_assign_pointer(rt_hash_table[hash].chain, rt);
1252
1253         spin_unlock_bh(rt_hash_lock_addr(hash));
1254
1255 skip_hashing:
1256         if (rp)
1257                 *rp = rt;
1258         else
1259                 skb_dst_set(skb, &rt->u.dst);
1260         return 0;
1261 }
1262
1263 void rt_bind_peer(struct rtable *rt, int create)
1264 {
1265         static DEFINE_SPINLOCK(rt_peer_lock);
1266         struct inet_peer *peer;
1267
1268         peer = inet_getpeer(rt->rt_dst, create);
1269
1270         spin_lock_bh(&rt_peer_lock);
1271         if (rt->peer == NULL) {
1272                 rt->peer = peer;
1273                 peer = NULL;
1274         }
1275         spin_unlock_bh(&rt_peer_lock);
1276         if (peer)
1277                 inet_putpeer(peer);
1278 }
1279
1280 /*
1281  * Peer allocation may fail only in serious out-of-memory conditions.  However
1282  * we can still generate some output.
1283  * Random ID selection looks a bit dangerous because we have no chance of
1284  * selecting an ID that is unique over a reasonable period of time.
1285  * But a broken packet identifier may be better than no packet at all.
1286  */
1287 static void ip_select_fb_ident(struct iphdr *iph)
1288 {
1289         static DEFINE_SPINLOCK(ip_fb_id_lock);
1290         static u32 ip_fallback_id;
1291         u32 salt;
1292
1293         spin_lock_bh(&ip_fb_id_lock);
1294         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1295         iph->id = htons(salt & 0xFFFF);
1296         ip_fallback_id = salt;
1297         spin_unlock_bh(&ip_fb_id_lock);
1298 }
1299
1300 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1301 {
1302         struct rtable *rt = (struct rtable *) dst;
1303
1304         if (rt) {
1305                 if (rt->peer == NULL)
1306                         rt_bind_peer(rt, 1);
1307
1308                 /* If a peer is attached to a destination, it is never detached,
1309                    so we need not grab a lock to dereference it.
1310                  */
1311                 if (rt->peer) {
1312                         iph->id = htons(inet_getid(rt->peer, more));
1313                         return;
1314                 }
1315         } else
1316                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1317                        __builtin_return_address(0));
1318
1319         ip_select_fb_ident(iph);
1320 }
1321
1322 static void rt_del(unsigned hash, struct rtable *rt)
1323 {
1324         struct rtable **rthp, *aux;
1325
1326         rthp = &rt_hash_table[hash].chain;
1327         spin_lock_bh(rt_hash_lock_addr(hash));
1328         ip_rt_put(rt);
1329         while ((aux = *rthp) != NULL) {
1330                 if (aux == rt || rt_is_expired(aux)) {
1331                         *rthp = aux->u.dst.rt_next;
1332                         rt_free(aux);
1333                         continue;
1334                 }
1335                 rthp = &aux->u.dst.rt_next;
1336         }
1337         spin_unlock_bh(rt_hash_lock_addr(hash));
1338 }
1339
1340 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1341                     __be32 saddr, struct net_device *dev)
1342 {
1343         int i, k;
1344         struct in_device *in_dev = in_dev_get(dev);
1345         struct rtable *rth, **rthp;
1346         __be32  skeys[2] = { saddr, 0 };
1347         int  ikeys[2] = { dev->ifindex, 0 };
1348         struct netevent_redirect netevent;
1349         struct net *net;
1350
1351         if (!in_dev)
1352                 return;
1353
1354         net = dev_net(dev);
1355         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
1356             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
1357             ipv4_is_zeronet(new_gw))
1358                 goto reject_redirect;
1359
1360         if (!rt_caching(net))
1361                 goto reject_redirect;
1362
1363         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1364                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1365                         goto reject_redirect;
1366                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1367                         goto reject_redirect;
1368         } else {
1369                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1370                         goto reject_redirect;
1371         }
1372
1373         for (i = 0; i < 2; i++) {
1374                 for (k = 0; k < 2; k++) {
1375                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1376                                                 rt_genid(net));
1377
1378                         rthp=&rt_hash_table[hash].chain;
1379
1380                         rcu_read_lock();
1381                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1382                                 struct rtable *rt;
1383
1384                                 if (rth->fl.fl4_dst != daddr ||
1385                                     rth->fl.fl4_src != skeys[i] ||
1386                                     rth->fl.oif != ikeys[k] ||
1387                                     rth->fl.iif != 0 ||
1388                                     rt_is_expired(rth) ||
1389                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1390                                         rthp = &rth->u.dst.rt_next;
1391                                         continue;
1392                                 }
1393
1394                                 if (rth->rt_dst != daddr ||
1395                                     rth->rt_src != saddr ||
1396                                     rth->u.dst.error ||
1397                                     rth->rt_gateway != old_gw ||
1398                                     rth->u.dst.dev != dev)
1399                                         break;
1400
1401                                 dst_hold(&rth->u.dst);
1402                                 rcu_read_unlock();
1403
1404                                 rt = dst_alloc(&ipv4_dst_ops);
1405                                 if (rt == NULL) {
1406                                         ip_rt_put(rth);
1407                                         in_dev_put(in_dev);
1408                                         return;
1409                                 }
1410
1411                                 /* Copy all the information. */
1412                                 *rt = *rth;
1413                                 rt->u.dst.__use         = 1;
1414                                 atomic_set(&rt->u.dst.__refcnt, 1);
1415                                 rt->u.dst.child         = NULL;
1416                                 if (rt->u.dst.dev)
1417                                         dev_hold(rt->u.dst.dev);
1418                                 if (rt->idev)
1419                                         in_dev_hold(rt->idev);
1420                                 rt->u.dst.obsolete      = 0;
1421                                 rt->u.dst.lastuse       = jiffies;
1422                                 rt->u.dst.path          = &rt->u.dst;
1423                                 rt->u.dst.neighbour     = NULL;
1424                                 rt->u.dst.hh            = NULL;
1425 #ifdef CONFIG_XFRM
1426                                 rt->u.dst.xfrm          = NULL;
1427 #endif
1428                                 rt->rt_genid            = rt_genid(net);
1429                                 rt->rt_flags            |= RTCF_REDIRECTED;
1430
1431                                 /* Gateway is different ... */
1432                                 rt->rt_gateway          = new_gw;
1433
1434                                 /* Redirect received -> path was valid */
1435                                 dst_confirm(&rth->u.dst);
1436
1437                                 if (rt->peer)
1438                                         atomic_inc(&rt->peer->refcnt);
1439
1440                                 if (arp_bind_neighbour(&rt->u.dst) ||
1441                                     !(rt->u.dst.neighbour->nud_state &
1442                                             NUD_VALID)) {
1443                                         if (rt->u.dst.neighbour)
1444                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1445                                         ip_rt_put(rth);
1446                                         rt_drop(rt);
1447                                         goto do_next;
1448                                 }
1449
1450                                 netevent.old = &rth->u.dst;
1451                                 netevent.new = &rt->u.dst;
1452                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1453                                                         &netevent);
1454
1455                                 rt_del(hash, rth);
1456                                 if (!rt_intern_hash(hash, rt, &rt, NULL))
1457                                         ip_rt_put(rt);
1458                                 goto do_next;
1459                         }
1460                         rcu_read_unlock();
1461                 do_next:
1462                         ;
1463                 }
1464         }
1465         in_dev_put(in_dev);
1466         return;
1467
1468 reject_redirect:
1469 #ifdef CONFIG_IP_ROUTE_VERBOSE
1470         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1471                 printk(KERN_INFO "Redirect from %pI4 on %s about %pI4 ignored.\n"
1472                         "  Advised path = %pI4 -> %pI4\n",
1473                        &old_gw, dev->name, &new_gw,
1474                        &saddr, &daddr);
1475 #endif
1476         in_dev_put(in_dev);
1477 }
1478
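/* dst_ops->negative_advice hook, called when an upper layer (e.g. TCP)
 * suspects the route has gone stale: obsolete entries are simply released,
 * while redirected or expiring entries are unhashed so the next lookup
 * re-resolves them.
 */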
1479 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1480 {
1481         struct rtable *rt = (struct rtable *)dst;
1482         struct dst_entry *ret = dst;
1483
1484         if (rt) {
1485                 if (dst->obsolete) {
1486                         ip_rt_put(rt);
1487                         ret = NULL;
1488                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1489                            rt->u.dst.expires) {
1490                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1491                                                 rt->fl.oif,
1492                                                 rt_genid(dev_net(dst->dev)));
1493 #if RT_CACHE_DEBUG >= 1
1494                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to %pI4/%02x dropped\n",
1495                                 &rt->rt_dst, rt->fl.fl4_tos);
1496 #endif
1497                         rt_del(hash, rt);
1498                         ret = NULL;
1499                 }
1500         }
1501         return ret;
1502 }
1503
1504 /*
1505  * Algorithm:
1506  *      1. The first ip_rt_redirect_number redirects are sent
1507  *         with exponential backoff, then we stop sending them at all,
1508  *         assuming that the host ignores our redirects.
1509  *      2. If we did not see packets requiring redirects
1510  *         during ip_rt_redirect_silence, we assume that the host
1511  *         forgot the redirected route and start sending redirects again.
1512  *
1513  * This algorithm is much cheaper and more intelligent than dumb load limiting
1514  * in icmp.c.
1515  *
1516  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1517  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1518  */
1519
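/* With this scheme the delay enforced between the Nth and the (N+1)th
 * redirect to the same peer is ip_rt_redirect_load << N jiffies, i.e. it
 * doubles for every redirect the peer keeps ignoring, until
 * ip_rt_redirect_number redirects have been sent and we give up.
 */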
1520 void ip_rt_send_redirect(struct sk_buff *skb)
1521 {
1522         struct rtable *rt = skb_rtable(skb);
1523         struct in_device *in_dev;
1524         int log_martians;
1525
1526         rcu_read_lock();
1527         in_dev = __in_dev_get_rcu(rt->u.dst.dev);
1528         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
1529                 rcu_read_unlock();
1530                 return;
1531         }
1532         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
1533         rcu_read_unlock();
1534
1535         /* No redirected packets during ip_rt_redirect_silence;
1536          * reset the algorithm.
1537          */
1538         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1539                 rt->u.dst.rate_tokens = 0;
1540
1541         /* Too many ignored redirects; do not send anything.
1542          * Set u.dst.rate_last to the last seen redirected packet.
1543          */
1544         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1545                 rt->u.dst.rate_last = jiffies;
1546                 return;
1547         }
1548
1549         /* Check for load limit; set rate_last to the latest sent
1550          * redirect.
1551          */
1552         if (rt->u.dst.rate_tokens == 0 ||
1553             time_after(jiffies,
1554                        (rt->u.dst.rate_last +
1555                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1556                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1557                 rt->u.dst.rate_last = jiffies;
1558                 ++rt->u.dst.rate_tokens;
1559 #ifdef CONFIG_IP_ROUTE_VERBOSE
1560                 if (log_martians &&
1561                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1562                     net_ratelimit())
1563                         printk(KERN_WARNING "host %pI4/if%d ignores redirects for %pI4 to %pI4.\n",
1564                                 &rt->rt_src, rt->rt_iif,
1565                                 &rt->rt_dst, &rt->rt_gateway);
1566 #endif
1567         }
1568 }
1569
1570 static int ip_error(struct sk_buff *skb)
1571 {
1572         struct rtable *rt = skb_rtable(skb);
1573         unsigned long now;
1574         int code;
1575
1576         switch (rt->u.dst.error) {
1577                 case EINVAL:
1578                 default:
1579                         goto out;
1580                 case EHOSTUNREACH:
1581                         code = ICMP_HOST_UNREACH;
1582                         break;
1583                 case ENETUNREACH:
1584                         code = ICMP_NET_UNREACH;
1585                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1586                                         IPSTATS_MIB_INNOROUTES);
1587                         break;
1588                 case EACCES:
1589                         code = ICMP_PKT_FILTERED;
1590                         break;
1591         }
1592
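        /* Token bucket limiter for ICMP errors: one token accrues per jiffy
         * since the last event, capped at ip_rt_error_burst, and each
         * ICMP_DEST_UNREACH we emit costs ip_rt_error_cost tokens.
         */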
1593         now = jiffies;
1594         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1595         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1596                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1597         rt->u.dst.rate_last = now;
1598         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1599                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1600                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1601         }
1602
1603 out:    kfree_skb(skb);
1604         return 0;
1605 }
1606
1607 /*
1608  *      The last two values are not from the RFC but
1609  *      are needed for AMPRnet AX.25 paths.
1610  */
1611
1612 static const unsigned short mtu_plateau[] =
1613 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1614
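/* Plateau-table search in the spirit of RFC 1191: return the largest plateau
 * strictly below old_mtu, falling back to 68, the conventional IPv4 minimum.
 */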
1615 static inline unsigned short guess_mtu(unsigned short old_mtu)
1616 {
1617         int i;
1618
1619         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1620                 if (old_mtu > mtu_plateau[i])
1621                         return mtu_plateau[i];
1622         return 68;
1623 }
1624
1625 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1626                                  unsigned short new_mtu,
1627                                  struct net_device *dev)
1628 {
1629         int i, k;
1630         unsigned short old_mtu = ntohs(iph->tot_len);
1631         struct rtable *rth;
1632         int  ikeys[2] = { dev->ifindex, 0 };
1633         __be32  skeys[2] = { iph->saddr, 0, };
1634         __be32  daddr = iph->daddr;
1635         unsigned short est_mtu = 0;
1636
1637         for (k = 0; k < 2; k++) {
1638                 for (i = 0; i < 2; i++) {
1639                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1640                                                 rt_genid(net));
1641
1642                         rcu_read_lock();
1643                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1644                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1645                                 unsigned short mtu = new_mtu;
1646
1647                                 if (rth->fl.fl4_dst != daddr ||
1648                                     rth->fl.fl4_src != skeys[i] ||
1649                                     rth->rt_dst != daddr ||
1650                                     rth->rt_src != iph->saddr ||
1651                                     rth->fl.oif != ikeys[k] ||
1652                                     rth->fl.iif != 0 ||
1653                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1654                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1655                                     rt_is_expired(rth))
1656                                         continue;
1657
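                                /* An implausible next-hop MTU (0 from old BSD
                                 * stacks, or one not actually smaller than the
                                 * offending datagram) means the ICMP carried no
                                 * usable value; estimate it from the datagram
                                 * length via the plateau table instead.
                                 */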
1658                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1659
1660                                         /* BSD 4.2 compatibility hack :-( */
1661                                         if (mtu == 0 &&
1662                                             old_mtu >= dst_mtu(&rth->u.dst) &&
1663                                             old_mtu >= 68 + (iph->ihl << 2))
1664                                                 old_mtu -= iph->ihl << 2;
1665
1666                                         mtu = guess_mtu(old_mtu);
1667                                 }
1668                                 if (mtu <= dst_mtu(&rth->u.dst)) {
1669                                         if (mtu < dst_mtu(&rth->u.dst)) {
1670                                                 dst_confirm(&rth->u.dst);
1671                                                 if (mtu < ip_rt_min_pmtu) {
1672                                                         mtu = ip_rt_min_pmtu;
1673                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1674                                                                 (1 << RTAX_MTU);
1675                                                 }
1676                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1677                                                 dst_set_expires(&rth->u.dst,
1678                                                         ip_rt_mtu_expires);
1679                                         }
1680                                         est_mtu = mtu;
1681                                 }
1682                         }
1683                         rcu_read_unlock();
1684                 }
1685         }
1686         return est_mtu ? : new_mtu;
1687 }
1688
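/* dst_ops->update_pmtu hook: shrink the cached path MTU, clamping it to
 * ip_rt_min_pmtu (and locking RTAX_MTU at that floor), arm the
 * ip_rt_mtu_expires timer and notify netevent listeners.
 */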
1689 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1690 {
1691         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1692             !(dst_metric_locked(dst, RTAX_MTU))) {
1693                 if (mtu < ip_rt_min_pmtu) {
1694                         mtu = ip_rt_min_pmtu;
1695                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1696                 }
1697                 dst->metrics[RTAX_MTU-1] = mtu;
1698                 dst_set_expires(dst, ip_rt_mtu_expires);
1699                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1700         }
1701 }
1702
1703 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1704 {
1705         return NULL;
1706 }
1707
1708 static void ipv4_dst_destroy(struct dst_entry *dst)
1709 {
1710         struct rtable *rt = (struct rtable *) dst;
1711         struct inet_peer *peer = rt->peer;
1712         struct in_device *idev = rt->idev;
1713
1714         if (peer) {
1715                 rt->peer = NULL;
1716                 inet_putpeer(peer);
1717         }
1718
1719         if (idev) {
1720                 rt->idev = NULL;
1721                 in_dev_put(idev);
1722         }
1723 }
1724
1725 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1726                             int how)
1727 {
1728         struct rtable *rt = (struct rtable *) dst;
1729         struct in_device *idev = rt->idev;
1730         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1731                 struct in_device *loopback_idev =
1732                         in_dev_get(dev_net(dev)->loopback_dev);
1733                 if (loopback_idev) {
1734                         rt->idev = loopback_idev;
1735                         in_dev_put(idev);
1736                 }
1737         }
1738 }
1739
1740 static void ipv4_link_failure(struct sk_buff *skb)
1741 {
1742         struct rtable *rt;
1743
1744         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1745
1746         rt = skb_rtable(skb);
1747         if (rt)
1748                 dst_set_expires(&rt->u.dst, 0);
1749 }
1750
1751 static int ip_rt_bug(struct sk_buff *skb)
1752 {
1753         printk(KERN_DEBUG "ip_rt_bug: %pI4 -> %pI4, %s\n",
1754                 &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1755                 skb->dev ? skb->dev->name : "?");
1756         kfree_skb(skb);
1757         return 0;
1758 }
1759
1760 /*
1761    We do not cache the source address of the outgoing interface,
1762    because it is used only by the IP RR, TS and SRR options,
1763    so it is out of the fast path.
1764
1765    BTW remember: "addr" may be unaligned
1766    in IP options!
1767  */
1768
1769 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1770 {
1771         __be32 src;
1772         struct fib_result res;
1773
1774         if (rt->fl.iif == 0)
1775                 src = rt->rt_src;
1776         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1777                 src = FIB_RES_PREFSRC(res);
1778                 fib_res_put(&res);
1779         } else
1780                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1781                                         RT_SCOPE_UNIVERSE);
1782         memcpy(addr, &src, 4);
1783 }
1784
1785 #ifdef CONFIG_NET_CLS_ROUTE
1786 static void set_class_tag(struct rtable *rt, u32 tag)
1787 {
1788         if (!(rt->u.dst.tclassid & 0xFFFF))
1789                 rt->u.dst.tclassid |= tag & 0xFFFF;
1790         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1791                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1792 }
1793 #endif
1794
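/* Fill in the nexthop-derived fields of a fresh cache entry from the FIB
 * result: gateway, metrics and (when enabled) the routing classid.  Missing
 * metrics get sane defaults, e.g. ADVMSS becomes dev->mtu minus 40 bytes of
 * IPv4 + TCP header, floored at ip_rt_min_advmss.
 */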
1795 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1796 {
1797         struct fib_info *fi = res->fi;
1798
1799         if (fi) {
1800                 if (FIB_RES_GW(*res) &&
1801                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1802                         rt->rt_gateway = FIB_RES_GW(*res);
1803                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1804                        sizeof(rt->u.dst.metrics));
1805                 if (fi->fib_mtu == 0) {
1806                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1807                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1808                             rt->rt_gateway != rt->rt_dst &&
1809                             rt->u.dst.dev->mtu > 576)
1810                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1811                 }
1812 #ifdef CONFIG_NET_CLS_ROUTE
1813                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1814 #endif
1815         } else
1816                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1817
1818         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1819                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1820         if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1821                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1822         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1823                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1824                                        ip_rt_min_advmss);
1825         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1826                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1827
1828 #ifdef CONFIG_NET_CLS_ROUTE
1829 #ifdef CONFIG_IP_MULTIPLE_TABLES
1830         set_class_tag(rt, fib_rules_tclass(res));
1831 #endif
1832         set_class_tag(rt, itag);
1833 #endif
1834         rt->rt_type = res->type;
1835 }
1836
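/* Input route for multicast destinations: sanity-check the source, build a
 * cache entry that is delivered locally (and/or handed to ip_mr_input() when
 * multicast forwarding is enabled) and intern it into the hash.
 */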
1837 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1838                                 u8 tos, struct net_device *dev, int our)
1839 {
1840         unsigned hash;
1841         struct rtable *rth;
1842         __be32 spec_dst;
1843         struct in_device *in_dev = in_dev_get(dev);
1844         u32 itag = 0;
1845
1846         /* Primary sanity checks. */
1847
1848         if (in_dev == NULL)
1849                 return -EINVAL;
1850
1851         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1852             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1853                 goto e_inval;
1854
1855         if (ipv4_is_zeronet(saddr)) {
1856                 if (!ipv4_is_local_multicast(daddr))
1857                         goto e_inval;
1858                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1859         } else if (fib_validate_source(saddr, 0, tos, 0,
1860                                         dev, &spec_dst, &itag, 0) < 0)
1861                 goto e_inval;
1862
1863         rth = dst_alloc(&ipv4_dst_ops);
1864         if (!rth)
1865                 goto e_nobufs;
1866
1867         rth->u.dst.output= ip_rt_bug;
1868
1869         atomic_set(&rth->u.dst.__refcnt, 1);
1870         rth->u.dst.flags= DST_HOST;
1871         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1872                 rth->u.dst.flags |= DST_NOPOLICY;
1873         rth->fl.fl4_dst = daddr;
1874         rth->rt_dst     = daddr;
1875         rth->fl.fl4_tos = tos;
1876         rth->fl.mark    = skb->mark;
1877         rth->fl.fl4_src = saddr;
1878         rth->rt_src     = saddr;
1879 #ifdef CONFIG_NET_CLS_ROUTE
1880         rth->u.dst.tclassid = itag;
1881 #endif
1882         rth->rt_iif     =
1883         rth->fl.iif     = dev->ifindex;
1884         rth->u.dst.dev  = init_net.loopback_dev;
1885         dev_hold(rth->u.dst.dev);
1886         rth->idev       = in_dev_get(rth->u.dst.dev);
1887         rth->fl.oif     = 0;
1888         rth->rt_gateway = daddr;
1889         rth->rt_spec_dst= spec_dst;
1890         rth->rt_genid   = rt_genid(dev_net(dev));
1891         rth->rt_flags   = RTCF_MULTICAST;
1892         rth->rt_type    = RTN_MULTICAST;
1893         if (our) {
1894                 rth->u.dst.input= ip_local_deliver;
1895                 rth->rt_flags |= RTCF_LOCAL;
1896         }
1897
1898 #ifdef CONFIG_IP_MROUTE
1899         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1900                 rth->u.dst.input = ip_mr_input;
1901 #endif
1902         RT_CACHE_STAT_INC(in_slow_mc);
1903
1904         in_dev_put(in_dev);
1905         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1906         return rt_intern_hash(hash, rth, NULL, skb);
1907
1908 e_nobufs:
1909         in_dev_put(in_dev);
1910         return -ENOBUFS;
1911
1912 e_inval:
1913         in_dev_put(in_dev);
1914         return -EINVAL;
1915 }
1916
1917
1918 static void ip_handle_martian_source(struct net_device *dev,
1919                                      struct in_device *in_dev,
1920                                      struct sk_buff *skb,
1921                                      __be32 daddr,
1922                                      __be32 saddr)
1923 {
1924         RT_CACHE_STAT_INC(in_martian_src);
1925 #ifdef CONFIG_IP_ROUTE_VERBOSE
1926         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1927                 /*
1928                  *      RFC1812 recommendation: if the source is martian,
1929                  *      the only hint we have is the MAC header.
1930                  */
1931                 printk(KERN_WARNING "martian source %pI4 from %pI4, on dev %s\n",
1932                         &daddr, &saddr, dev->name);
1933                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1934                         int i;
1935                         const unsigned char *p = skb_mac_header(skb);
1936                         printk(KERN_WARNING "ll header: ");
1937                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1938                                 printk("%02x", *p);
1939                                 if (i < (dev->hard_header_len - 1))
1940                                         printk(":");
1941                         }
1942                         printk("\n");
1943                 }
1944         }
1945 #endif
1946 }
1947
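/* Build a forwarding cache entry for an input route: validate the source
 * against the FIB (reverse path), decide whether an ICMP redirect should be
 * advertised (RTCF_DOREDIRECT) and wire up ip_forward()/ip_output() as the
 * input/output handlers.
 */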
1948 static int __mkroute_input(struct sk_buff *skb,
1949                            struct fib_result *res,
1950                            struct in_device *in_dev,
1951                            __be32 daddr, __be32 saddr, u32 tos,
1952                            struct rtable **result)
1953 {
1954
1955         struct rtable *rth;
1956         int err;
1957         struct in_device *out_dev;
1958         unsigned flags = 0;
1959         __be32 spec_dst;
1960         u32 itag;
1961
1962         /* get a working reference to the output device */
1963         out_dev = in_dev_get(FIB_RES_DEV(*res));
1964         if (out_dev == NULL) {
1965                 if (net_ratelimit())
1966                         printk(KERN_CRIT "Bug in ip_route_input" \
1967                                "_slow(). Please, report\n");
1968                 return -EINVAL;
1969         }
1970
1971
1972         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1973                                   in_dev->dev, &spec_dst, &itag, skb->mark);
1974         if (err < 0) {
1975                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1976                                          saddr);
1977
1978                 err = -EINVAL;
1979                 goto cleanup;
1980         }
1981
1982         if (err)
1983                 flags |= RTCF_DIRECTSRC;
1984
1985         if (out_dev == in_dev && err &&
1986             (IN_DEV_SHARED_MEDIA(out_dev) ||
1987              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1988                 flags |= RTCF_DOREDIRECT;
1989
1990         if (skb->protocol != htons(ETH_P_IP)) {
1991                 /* Not IP (e.g. ARP). Do not create a route if it is
1992                  * invalid for proxy ARP. DNAT routes are always valid.
1993                  */
1994                 if (out_dev == in_dev) {
1995                         err = -EINVAL;
1996                         goto cleanup;
1997                 }
1998         }
1999
2000
2001         rth = dst_alloc(&ipv4_dst_ops);
2002         if (!rth) {
2003                 err = -ENOBUFS;
2004                 goto cleanup;
2005         }
2006
2007         atomic_set(&rth->u.dst.__refcnt, 1);
2008         rth->u.dst.flags= DST_HOST;
2009         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2010                 rth->u.dst.flags |= DST_NOPOLICY;
2011         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
2012                 rth->u.dst.flags |= DST_NOXFRM;
2013         rth->fl.fl4_dst = daddr;
2014         rth->rt_dst     = daddr;
2015         rth->fl.fl4_tos = tos;
2016         rth->fl.mark    = skb->mark;
2017         rth->fl.fl4_src = saddr;
2018         rth->rt_src     = saddr;
2019         rth->rt_gateway = daddr;
2020         rth->rt_iif     =
2021                 rth->fl.iif     = in_dev->dev->ifindex;
2022         rth->u.dst.dev  = (out_dev)->dev;
2023         dev_hold(rth->u.dst.dev);
2024         rth->idev       = in_dev_get(rth->u.dst.dev);
2025         rth->fl.oif     = 0;
2026         rth->rt_spec_dst= spec_dst;
2027
2028         rth->u.dst.input = ip_forward;
2029         rth->u.dst.output = ip_output;
2030         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
2031
2032         rt_set_nexthop(rth, res, itag);
2033
2034         rth->rt_flags = flags;
2035
2036         *result = rth;
2037         err = 0;
2038  cleanup:
2039         /* release the working reference to the output device */
2040         in_dev_put(out_dev);
2041         return err;
2042 }
2043
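/* Wrapper around __mkroute_input(): pick a nexthop for multipath routes,
 * then intern the new entry under the (daddr, saddr, iif) hash.
 */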
2044 static int ip_mkroute_input(struct sk_buff *skb,
2045                             struct fib_result *res,
2046                             const struct flowi *fl,
2047                             struct in_device *in_dev,
2048                             __be32 daddr, __be32 saddr, u32 tos)
2049 {
2050         struct rtable* rth = NULL;
2051         int err;
2052         unsigned hash;
2053
2054 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2055         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
2056                 fib_select_multipath(fl, res);
2057 #endif
2058
2059         /* create a routing cache entry */
2060         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
2061         if (err)
2062                 return err;
2063
2064         /* put it into the cache */
2065         hash = rt_hash(daddr, saddr, fl->iif,
2066                        rt_genid(dev_net(rth->u.dst.dev)));
2067         return rt_intern_hash(hash, rth, NULL, skb);
2068 }
2069
2070 /*
2071  *      NOTE. We drop all packets that have local source
2072  *      addresses, because every properly looped-back packet
2073  *      must already have the correct destination attached by the output routine.
2074  *
2075  *      This approach solves two big problems:
2076  *      1. Non-simplex devices are handled properly.
2077  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2078  */
2079
2080 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2081                                u8 tos, struct net_device *dev)
2082 {
2083         struct fib_result res;
2084         struct in_device *in_dev = in_dev_get(dev);
2085         struct flowi fl = { .nl_u = { .ip4_u =
2086                                       { .daddr = daddr,
2087                                         .saddr = saddr,
2088                                         .tos = tos,
2089                                         .scope = RT_SCOPE_UNIVERSE,
2090                                       } },
2091                             .mark = skb->mark,
2092                             .iif = dev->ifindex };
2093         unsigned        flags = 0;
2094         u32             itag = 0;
2095         struct rtable * rth;
2096         unsigned        hash;
2097         __be32          spec_dst;
2098         int             err = -EINVAL;
2099         int             free_res = 0;
2100         struct net    * net = dev_net(dev);
2101
2102         /* IP on this device is disabled. */
2103
2104         if (!in_dev)
2105                 goto out;
2106
2107         /* Check for the most weird martians, which cannot be detected
2108            by fib_lookup.
2109          */
2110
2111         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
2112             ipv4_is_loopback(saddr))
2113                 goto martian_source;
2114
2115         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
2116                 goto brd_input;
2117
2118         /* Accept zero addresses only for limited broadcast;
2119          * I am not even sure whether to fix this or not. Waiting for complaints :-)
2120          */
2121         if (ipv4_is_zeronet(saddr))
2122                 goto martian_source;
2123
2124         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
2125             ipv4_is_loopback(daddr))
2126                 goto martian_destination;
2127
2128         /*
2129          *      Now we are ready to route the packet.
2130          */
2131         if ((err = fib_lookup(net, &fl, &res)) != 0) {
2132                 if (!IN_DEV_FORWARD(in_dev))
2133                         goto e_hostunreach;
2134                 goto no_route;
2135         }
2136         free_res = 1;
2137
2138         RT_CACHE_STAT_INC(in_slow_tot);
2139
2140         if (res.type == RTN_BROADCAST)
2141                 goto brd_input;
2142
2143         if (res.type == RTN_LOCAL) {
2144                 int result;
2145                 result = fib_validate_source(saddr, daddr, tos,
2146                                              net->loopback_dev->ifindex,
2147                                              dev, &spec_dst, &itag, skb->mark);
2148                 if (result < 0)
2149                         goto martian_source;
2150                 if (result)
2151                         flags |= RTCF_DIRECTSRC;
2152                 spec_dst = daddr;
2153                 goto local_input;
2154         }
2155
2156         if (!IN_DEV_FORWARD(in_dev))
2157                 goto e_hostunreach;
2158         if (res.type != RTN_UNICAST)
2159                 goto martian_destination;
2160
2161         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2162 done:
2163         in_dev_put(in_dev);
2164         if (free_res)
2165                 fib_res_put(&res);
2166 out:    return err;
2167
2168 brd_input:
2169         if (skb->protocol != htons(ETH_P_IP))
2170                 goto e_inval;
2171
2172         if (ipv4_is_zeronet(saddr))
2173                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2174         else {
2175                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2176                                           &itag, skb->mark);
2177                 if (err < 0)
2178                         goto martian_source;
2179                 if (err)
2180                         flags |= RTCF_DIRECTSRC;
2181         }
2182         flags |= RTCF_BROADCAST;
2183         res.type = RTN_BROADCAST;
2184         RT_CACHE_STAT_INC(in_brd);
2185
2186 local_input:
2187         rth = dst_alloc(&ipv4_dst_ops);
2188         if (!rth)
2189                 goto e_nobufs;
2190
2191         rth->u.dst.output= ip_rt_bug;
2192         rth->rt_genid = rt_genid(net);
2193
2194         atomic_set(&rth->u.dst.__refcnt, 1);
2195         rth->u.dst.flags= DST_HOST;
2196         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2197                 rth->u.dst.flags |= DST_NOPOLICY;
2198         rth->fl.fl4_dst = daddr;
2199         rth->rt_dst     = daddr;
2200         rth->fl.fl4_tos = tos;
2201         rth->fl.mark    = skb->mark;
2202         rth->fl.fl4_src = saddr;
2203         rth->rt_src     = saddr;
2204 #ifdef CONFIG_NET_CLS_ROUTE
2205         rth->u.dst.tclassid = itag;
2206 #endif
2207         rth->rt_iif     =
2208         rth->fl.iif     = dev->ifindex;
2209         rth->u.dst.dev  = net->loopback_dev;
2210         dev_hold(rth->u.dst.dev);
2211         rth->idev       = in_dev_get(rth->u.dst.dev);
2212         rth->rt_gateway = daddr;
2213         rth->rt_spec_dst= spec_dst;
2214         rth->u.dst.input= ip_local_deliver;
2215         rth->rt_flags   = flags|RTCF_LOCAL;
2216         if (res.type == RTN_UNREACHABLE) {
2217                 rth->u.dst.input= ip_error;
2218                 rth->u.dst.error= -err;
2219                 rth->rt_flags   &= ~RTCF_LOCAL;
2220         }
2221         rth->rt_type    = res.type;
2222         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2223         err = rt_intern_hash(hash, rth, NULL, skb);
2224         goto done;
2225
2226 no_route:
2227         RT_CACHE_STAT_INC(in_no_route);
2228         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2229         res.type = RTN_UNREACHABLE;
2230         if (err == -ESRCH)
2231                 err = -ENETUNREACH;
2232         goto local_input;
2233
2234         /*
2235          *      Do not cache martian addresses: they should be logged (RFC1812)
2236          */
2237 martian_destination:
2238         RT_CACHE_STAT_INC(in_martian_dst);
2239 #ifdef CONFIG_IP_ROUTE_VERBOSE
2240         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2241                 printk(KERN_WARNING "martian destination %pI4 from %pI4, dev %s\n",
2242                         &daddr, &saddr, dev->name);
2243 #endif
2244
2245 e_hostunreach:
2246         err = -EHOSTUNREACH;
2247         goto done;
2248
2249 e_inval:
2250         err = -EINVAL;
2251         goto done;
2252
2253 e_nobufs:
2254         err = -ENOBUFS;
2255         goto done;
2256
2257 martian_source:
2258         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2259         goto e_inval;
2260 }
2261
2262 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2263                    u8 tos, struct net_device *dev)
2264 {
2265         struct rtable * rth;
2266         unsigned        hash;
2267         int iif = dev->ifindex;
2268         struct net *net;
2269
2270         net = dev_net(dev);
2271
2272         if (!rt_caching(net))
2273                 goto skip_cache;
2274
2275         tos &= IPTOS_RT_MASK;
2276         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2277
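        /* Walk the hash chain under RCU.  The key comparison XORs each field
         * with the lookup key and ORs the results together, so a single test
         * against zero checks daddr, saddr, iif, oif (which must be 0 for an
         * input route) and TOS at once.
         */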
2278         rcu_read_lock();
2279         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2280              rth = rcu_dereference(rth->u.dst.rt_next)) {
2281                 if (((rth->fl.fl4_dst ^ daddr) |
2282                      (rth->fl.fl4_src ^ saddr) |
2283                      (rth->fl.iif ^ iif) |
2284                      rth->fl.oif |
2285                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2286                     rth->fl.mark == skb->mark &&
2287                     net_eq(dev_net(rth->u.dst.dev), net) &&
2288                     !rt_is_expired(rth)) {
2289                         dst_use(&rth->u.dst, jiffies);
2290                         RT_CACHE_STAT_INC(in_hit);
2291                         rcu_read_unlock();
2292                         skb_dst_set(skb, &rth->u.dst);
2293                         return 0;
2294                 }
2295                 RT_CACHE_STAT_INC(in_hlist_search);
2296         }
2297         rcu_read_unlock();
2298
2299 skip_cache:
2300         /* Multicast recognition logic has been moved from the route cache to here.
2301            The problem was that too many Ethernet cards have broken/missing
2302            hardware multicast filters :-( As a result, a host on a multicast
2303            network acquires a lot of useless route cache entries, e.g. for
2304            SDR messages from all over the world. Now we try to get rid of them.
2305            Really, provided the software IP multicast filter is organized
2306            reasonably (at least hashed), this does not result in a slowdown
2307            compared with route cache reject entries.
2308            Note that multicast routers are not affected, because
2309            a route cache entry is created for them eventually.
2310          */
2311         if (ipv4_is_multicast(daddr)) {
2312                 struct in_device *in_dev;
2313
2314                 rcu_read_lock();
2315                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2316                         int our = ip_check_mc(in_dev, daddr, saddr,
2317                                 ip_hdr(skb)->protocol);
2318                         if (our
2319 #ifdef CONFIG_IP_MROUTE
2320                                 ||
2321                             (!ipv4_is_local_multicast(daddr) &&
2322                              IN_DEV_MFORWARD(in_dev))
2323 #endif
2324                            ) {
2325                                 rcu_read_unlock();
2326                                 return ip_route_input_mc(skb, daddr, saddr,
2327                                                          tos, dev, our);
2328                         }
2329                 }
2330                 rcu_read_unlock();
2331                 return -EINVAL;
2332         }
2333         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2334 }
2335
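/* Build an output cache entry for a resolved FIB result: classify the
 * destination (broadcast/multicast/local), take references on the output
 * device and wire up ip_output(), ip_mc_output() or ip_local_deliver() as
 * appropriate.
 */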
2336 static int __mkroute_output(struct rtable **result,
2337                             struct fib_result *res,
2338                             const struct flowi *fl,
2339                             const struct flowi *oldflp,
2340                             struct net_device *dev_out,
2341                             unsigned flags)
2342 {
2343         struct rtable *rth;
2344         struct in_device *in_dev;
2345         u32 tos = RT_FL_TOS(oldflp);
2346         int err = 0;
2347
2348         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2349                 return -EINVAL;
2350
2351         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2352                 res->type = RTN_BROADCAST;
2353         else if (ipv4_is_multicast(fl->fl4_dst))
2354                 res->type = RTN_MULTICAST;
2355         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2356                 return -EINVAL;
2357
2358         if (dev_out->flags & IFF_LOOPBACK)
2359                 flags |= RTCF_LOCAL;
2360
2361         /* get a working reference to the inet device */
2362         in_dev = in_dev_get(dev_out);
2363         if (!in_dev)
2364                 return -EINVAL;
2365
2366         if (res->type == RTN_BROADCAST) {
2367                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2368                 if (res->fi) {
2369                         fib_info_put(res->fi);
2370                         res->fi = NULL;
2371                 }
2372         } else if (res->type == RTN_MULTICAST) {
2373                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2374                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2375                                  oldflp->proto))
2376                         flags &= ~RTCF_LOCAL;
2377                 /* If a multicast route does not exist, use
2378                    the default one, but do not gateway in this case.
2379                    Yes, it is a hack.
2380                  */
2381                 if (res->fi && res->prefixlen < 4) {
2382                         fib_info_put(res->fi);
2383                         res->fi = NULL;
2384                 }
2385         }
2386
2387
2388         rth = dst_alloc(&ipv4_dst_ops);
2389         if (!rth) {
2390                 err = -ENOBUFS;
2391                 goto cleanup;
2392         }
2393
2394         atomic_set(&rth->u.dst.__refcnt, 1);
2395         rth->u.dst.flags= DST_HOST;
2396         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2397                 rth->u.dst.flags |= DST_NOXFRM;
2398         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2399                 rth->u.dst.flags |= DST_NOPOLICY;
2400
2401         rth->fl.fl4_dst = oldflp->fl4_dst;
2402         rth->fl.fl4_tos = tos;
2403         rth->fl.fl4_src = oldflp->fl4_src;
2404         rth->fl.oif     = oldflp->oif;
2405         rth->fl.mark    = oldflp->mark;
2406         rth->rt_dst     = fl->fl4_dst;
2407         rth->rt_src     = fl->fl4_src;
2408         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2409         /* get references to the devices that are to be held by the routing
2410            cache entry */
2411         rth->u.dst.dev  = dev_out;
2412         dev_hold(dev_out);
2413         rth->idev       = in_dev_get(dev_out);
2414         rth->rt_gateway = fl->fl4_dst;
2415         rth->rt_spec_dst= fl->fl4_src;
2416
2417         rth->u.dst.output=ip_output;
2418         rth->rt_genid = rt_genid(dev_net(dev_out));
2419
2420         RT_CACHE_STAT_INC(out_slow_tot);
2421
2422         if (flags & RTCF_LOCAL) {
2423                 rth->u.dst.input = ip_local_deliver;
2424                 rth->rt_spec_dst = fl->fl4_dst;
2425         }
2426         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2427                 rth->rt_spec_dst = fl->fl4_src;
2428                 if (flags & RTCF_LOCAL &&
2429                     !(dev_out->flags & IFF_LOOPBACK)) {
2430                         rth->u.dst.output = ip_mc_output;
2431                         RT_CACHE_STAT_INC(out_slow_mc);
2432                 }
2433 #ifdef CONFIG_IP_MROUTE
2434                 if (res->type == RTN_MULTICAST) {
2435                         if (IN_DEV_MFORWARD(in_dev) &&
2436                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2437                                 rth->u.dst.input = ip_mr_input;
2438                                 rth->u.dst.output = ip_mc_output;
2439                         }
2440                 }
2441 #endif
2442         }
2443
2444         rt_set_nexthop(rth, res, 0);
2445
2446         rth->rt_flags = flags;
2447
2448         *result = rth;
2449  cleanup:
2450         /* release the working reference to the inet device */
2451         in_dev_put(in_dev);
2452
2453         return err;
2454 }
2455
2456 static int ip_mkroute_output(struct rtable **rp,
2457                              struct fib_result *res,
2458                              const struct flowi *fl,
2459                              const struct flowi *oldflp,
2460                              struct net_device *dev_out,
2461                              unsigned flags)
2462 {
2463         struct rtable *rth = NULL;
2464         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2465         unsigned hash;
2466         if (err == 0) {
2467                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2468                                rt_genid(dev_net(dev_out)));
2469                 err = rt_intern_hash(hash, rth, rp, NULL);
2470         }
2471
2472         return err;
2473 }
2474
2475 /*
2476  * Major route resolver routine.
2477  */
2478
2479 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2480                                 const struct flowi *oldflp)
2481 {
2482         u32 tos = RT_FL_TOS(oldflp);
2483         struct flowi fl = { .nl_u = { .ip4_u =
2484                                       { .daddr = oldflp->fl4_dst,
2485                                         .saddr = oldflp->fl4_src,
2486                                         .tos = tos & IPTOS_RT_MASK,
2487                                         .scope = ((tos & RTO_ONLINK) ?
2488                                                   RT_SCOPE_LINK :
2489                                                   RT_SCOPE_UNIVERSE),
2490                                       } },
2491                             .mark = oldflp->mark,
2492                             .iif = net->loopback_dev->ifindex,
2493                             .oif = oldflp->oif };
2494         struct fib_result res;
2495         unsigned flags = 0;
2496         struct net_device *dev_out = NULL;
2497         int free_res = 0;
2498         int err;
2499
2500
2501         res.fi          = NULL;
2502 #ifdef CONFIG_IP_MULTIPLE_TABLES
2503         res.r           = NULL;
2504 #endif
2505
2506         if (oldflp->fl4_src) {
2507                 err = -EINVAL;
2508                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2509                     ipv4_is_lbcast(oldflp->fl4_src) ||
2510                     ipv4_is_zeronet(oldflp->fl4_src))
2511                         goto out;
2512
2513                 /* I removed the check for oif == dev_out->oif here.
2514                    It was wrong for two reasons:
2515                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2516                       is assigned to multiple interfaces.
2517                    2. Moreover, we are allowed to send packets with the saddr
2518                       of another iface. --ANK
2519                  */
2520
2521                 if (oldflp->oif == 0 &&
2522                     (ipv4_is_multicast(oldflp->fl4_dst) ||
2523                      oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2524                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2525                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2526                         if (dev_out == NULL)
2527                                 goto out;
2528
2529                         /* Special hack: the user can direct multicasts
2530                            and limited broadcast via the necessary interface
2531                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2532                            This hack is not just for fun, it allows
2533                            vic, vat and friends to work.
2534                            They bind a socket to loopback, set ttl to zero
2535                            and expect that it will work.
2536                            From the viewpoint of the routing cache they are broken,
2537                            because we are not allowed to build a multicast path
2538                            with a loopback source addr (the routing cache
2539                            cannot know that ttl is zero, so the packet
2540                            will not leave this host and the route is valid).
2541                            Luckily, this hack is a good workaround.
2542                          */
2543
2544                         fl.oif = dev_out->ifindex;
2545                         goto make_route;
2546                 }
2547
2548                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2549                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2550                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2551                         if (dev_out == NULL)
2552                                 goto out;
2553                         dev_put(dev_out);
2554                         dev_out = NULL;
2555                 }
2556         }
2557
2558
2559         if (oldflp->oif) {
2560                 dev_out = dev_get_by_index(net, oldflp->oif);
2561                 err = -ENODEV;
2562                 if (dev_out == NULL)
2563                         goto out;
2564
2565                 /* RACE: Check return value of inet_select_addr instead. */
2566                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2567                         dev_put(dev_out);
2568                         goto out;       /* Wrong error code */
2569                 }
2570
2571                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2572                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2573                         if (!fl.fl4_src)
2574                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2575                                                               RT_SCOPE_LINK);
2576                         goto make_route;
2577                 }
2578                 if (!fl.fl4_src) {
2579                         if (ipv4_is_multicast(oldflp->fl4_dst))
2580                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2581                                                               fl.fl4_scope);
2582                         else if (!oldflp->fl4_dst)
2583                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2584                                                               RT_SCOPE_HOST);
2585                 }
2586         }
2587
2588         if (!fl.fl4_dst) {
2589                 fl.fl4_dst = fl.fl4_src;
2590                 if (!fl.fl4_dst)
2591                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2592                 if (dev_out)
2593                         dev_put(dev_out);
2594                 dev_out = net->loopback_dev;
2595                 dev_hold(dev_out);
2596                 fl.oif = net->loopback_dev->ifindex;
2597                 res.type = RTN_LOCAL;
2598                 flags |= RTCF_LOCAL;
2599                 goto make_route;
2600         }
2601
2602         if (fib_lookup(net, &fl, &res)) {
2603                 res.fi = NULL;
2604                 if (oldflp->oif) {
2605                         /* Apparently, the routing tables are wrong. Assume
2606                            that the destination is on-link.
2607
2608                            WHY? DW.
2609                            Because we are allowed to send to an iface
2610                            even if it has NO routes and NO assigned
2611                            addresses. When oif is specified, the routing
2612                            tables are looked up with only one purpose:
2613                            to check whether the destination is gatewayed, rather than
2614                            direct. Moreover, if MSG_DONTROUTE is set,
2615                            we send the packet, ignoring both routing tables
2616                            and ifaddr state. --ANK
2617
2618
2619                            We could do this even when oif is unknown,
2620                            as IPv6 likely does, but we do not.
2621                          */
2622
2623                         if (fl.fl4_src == 0)
2624                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2625                                                               RT_SCOPE_LINK);
2626                         res.type = RTN_UNICAST;
2627                         goto make_route;
2628                 }
2629                 if (dev_out)
2630                         dev_put(dev_out);
2631                 err = -ENETUNREACH;
2632                 goto out;
2633         }
2634         free_res = 1;
2635
2636         if (res.type == RTN_LOCAL) {
2637                 if (!fl.fl4_src)
2638                         fl.fl4_src = fl.fl4_dst;
2639                 if (dev_out)
2640                         dev_put(dev_out);
2641                 dev_out = net->loopback_dev;
2642                 dev_hold(dev_out);
2643                 fl.oif = dev_out->ifindex;
2644                 if (res.fi)
2645                         fib_info_put(res.fi);
2646                 res.fi = NULL;
2647                 flags |= RTCF_LOCAL;
2648                 goto make_route;
2649         }
2650
2651 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2652         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2653                 fib_select_multipath(&fl, &res);
2654         else
2655 #endif
2656         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2657                 fib_select_default(net, &fl, &res);
2658
2659         if (!fl.fl4_src)
2660                 fl.fl4_src = FIB_RES_PREFSRC(res);
2661
2662         if (dev_out)
2663                 dev_put(dev_out);
2664         dev_out = FIB_RES_DEV(res);
2665         dev_hold(dev_out);
2666         fl.oif = dev_out->ifindex;
2667
2668
2669 make_route:
2670         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2671
2672
2673         if (free_res)
2674                 fib_res_put(&res);
2675         if (dev_out)
2676                 dev_put(dev_out);
2677 out:    return err;
2678 }
2679
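/* Fast path for output routes: probe the cache for an exact match on the flow
 * key and fall back to ip_route_output_slow() on a miss or when caching is
 * disabled.
 */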
2680 int __ip_route_output_key(struct net *net, struct rtable **rp,
2681                           const struct flowi *flp)
2682 {
2683         unsigned hash;
2684         struct rtable *rth;
2685
2686         if (!rt_caching(net))
2687                 goto slow_output;
2688
2689         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2690
2691         rcu_read_lock_bh();
2692         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2693                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2694                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2695                     rth->fl.fl4_src == flp->fl4_src &&
2696                     rth->fl.iif == 0 &&
2697                     rth->fl.oif == flp->oif &&
2698                     rth->fl.mark == flp->mark &&
2699                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2700                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2701                     net_eq(dev_net(rth->u.dst.dev), net) &&
2702                     !rt_is_expired(rth)) {
2703                         dst_use(&rth->u.dst, jiffies);
2704                         RT_CACHE_STAT_INC(out_hit);
2705                         rcu_read_unlock_bh();
2706                         *rp = rth;
2707                         return 0;
2708                 }
2709                 RT_CACHE_STAT_INC(out_hlist_search);
2710         }
2711         rcu_read_unlock_bh();
2712
2713 slow_output:
2714         return ip_route_output_slow(net, rp, flp);
2715 }
2716
2717 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2718
2719 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2720 {
2721 }
2722
2723 static struct dst_ops ipv4_dst_blackhole_ops = {
2724         .family                 =       AF_INET,
2725         .protocol               =       cpu_to_be16(ETH_P_IP),
2726         .destroy                =       ipv4_dst_destroy,
2727         .check                  =       ipv4_dst_check,
2728         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2729         .entries                =       ATOMIC_INIT(0),
2730 };
2731
2732
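/* Clone an existing route into a "blackhole" entry whose input and output
 * handlers simply discard packets; ip_route_output_flow() substitutes it when
 * __xfrm_lookup() returns -EREMOTE.
 */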
2733 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2734 {
2735         struct rtable *ort = *rp;
2736         struct rtable *rt = (struct rtable *)
2737                 dst_alloc(&ipv4_dst_blackhole_ops);
2738
2739         if (rt) {
2740                 struct dst_entry *new = &rt->u.dst;
2741
2742                 atomic_set(&new->__refcnt, 1);
2743                 new->__use = 1;
2744                 new->input = dst_discard;
2745                 new->output = dst_discard;
2746                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2747
2748                 new->dev = ort->u.dst.dev;
2749                 if (new->dev)
2750                         dev_hold(new->dev);
2751
2752                 rt->fl = ort->fl;
2753
2754                 rt->idev = ort->idev;
2755                 if (rt->idev)
2756                         in_dev_hold(rt->idev);
2757                 rt->rt_genid = rt_genid(net);
2758                 rt->rt_flags = ort->rt_flags;
2759                 rt->rt_type = ort->rt_type;
2760                 rt->rt_dst = ort->rt_dst;
2761                 rt->rt_src = ort->rt_src;
2762                 rt->rt_iif = ort->rt_iif;
2763                 rt->rt_gateway = ort->rt_gateway;
2764                 rt->rt_spec_dst = ort->rt_spec_dst;
2765                 rt->peer = ort->peer;
2766                 if (rt->peer)
2767                         atomic_inc(&rt->peer->refcnt);
2768
2769                 dst_free(new);
2770         }
2771
2772         dst_release(&(*rp)->u.dst);
2773         *rp = rt;
2774         return (rt ? 0 : -ENOMEM);
2775 }
2776
2777 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2778                          struct sock *sk, int flags)
2779 {
2780         int err;
2781
2782         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2783                 return err;
2784
2785         if (flp->proto) {
2786                 if (!flp->fl4_src)
2787                         flp->fl4_src = (*rp)->rt_src;
2788                 if (!flp->fl4_dst)
2789                         flp->fl4_dst = (*rp)->rt_dst;
2790                 err = __xfrm_lookup(net, (struct dst_entry **)rp, flp, sk,
2791                                     flags ? XFRM_LOOKUP_WAIT : 0);
2792                 if (err == -EREMOTE)
2793                         err = ipv4_dst_blackhole(net, rp, flp);
2794
2795                 return err;
2796         }
2797
2798         return 0;
2799 }
2800
2801 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2802
2803 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2804 {
2805         return ip_route_output_flow(net, rp, flp, NULL, 0);
2806 }
2807
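/* Encode a cached route as a netlink message: rtmsg header plus RTA_DST,
 * RTA_SRC, RTA_OIF, RTA_GATEWAY, ... attributes, the metrics and the cache
 * info (id, timestamps, expiry, error).
 */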
2808 static int rt_fill_info(struct net *net,
2809                         struct sk_buff *skb, u32 pid, u32 seq, int event,
2810                         int nowait, unsigned int flags)
2811 {
2812         struct rtable *rt = skb_rtable(skb);
2813         struct rtmsg *r;
2814         struct nlmsghdr *nlh;
2815         long expires;
2816         u32 id = 0, ts = 0, tsage = 0, error;
2817
2818         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2819         if (nlh == NULL)
2820                 return -EMSGSIZE;
2821
2822         r = nlmsg_data(nlh);
2823         r->rtm_family    = AF_INET;
2824         r->rtm_dst_len  = 32;
2825         r->rtm_src_len  = 0;
2826         r->rtm_tos      = rt->fl.fl4_tos;
2827         r->rtm_table    = RT_TABLE_MAIN;
2828         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2829         r->rtm_type     = rt->rt_type;
2830         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2831         r->rtm_protocol = RTPROT_UNSPEC;
2832         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2833         if (rt->rt_flags & RTCF_NOTIFY)
2834                 r->rtm_flags |= RTM_F_NOTIFY;
2835
2836         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2837
2838         if (rt->fl.fl4_src) {
2839                 r->rtm_src_len = 32;
2840                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2841         }
2842         if (rt->u.dst.dev)
2843                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2844 #ifdef CONFIG_NET_CLS_ROUTE
2845         if (rt->u.dst.tclassid)
2846                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2847 #endif
2848         if (rt->fl.iif)
2849                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2850         else if (rt->rt_src != rt->fl.fl4_src)
2851                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2852
2853         if (rt->rt_dst != rt->rt_gateway)
2854                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2855
2856         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2857                 goto nla_put_failure;
2858
2859         error = rt->u.dst.error;
2860         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2861         if (rt->peer) {
2862                 id = atomic_read(&rt->peer->ip_id_count) & 0xffff;
2863                 if (rt->peer->tcp_ts_stamp) {
2864                         ts = rt->peer->tcp_ts;
2865                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2866                 }
2867         }
2868
2869         if (rt->fl.iif) {
2870 #ifdef CONFIG_IP_MROUTE
2871                 __be32 dst = rt->rt_dst;
2872
2873                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2874                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2875                         int err = ipmr_get_route(net, skb, r, nowait);
2876                         if (err <= 0) {
2877                                 if (!nowait) {
2878                                         if (err == 0)
2879                                                 return 0;
2880                                         goto nla_put_failure;
2881                                 } else {
2882                                         if (err == -EMSGSIZE)
2883                                                 goto nla_put_failure;
2884                                         error = err;
2885                                 }
2886                         }
2887                 } else
2888 #endif
2889                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2890         }
2891
2892         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2893                                expires, error) < 0)
2894                 goto nla_put_failure;
2895
2896         return nlmsg_end(skb, nlh);
2897
2898 nla_put_failure:
2899         nlmsg_cancel(skb, nlh);
2900         return -EMSGSIZE;
2901 }
2902
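     /*
      * inet_rtm_getroute - RTM_GETROUTE handler.  Builds a dummy skb with
      * just enough of an IP header, performs an input route lookup when
      * RTA_IIF is supplied or an output lookup via ip_route_output_key()
      * otherwise, fills the result in with rt_fill_info() and unicasts it
      * back to the requester.
      */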
2903 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2904 {
2905         struct net *net = sock_net(in_skb->sk);
2906         struct rtmsg *rtm;
2907         struct nlattr *tb[RTA_MAX+1];
2908         struct rtable *rt = NULL;
2909         __be32 dst = 0;
2910         __be32 src = 0;
2911         u32 iif;
2912         int err;
2913         struct sk_buff *skb;
2914
2915         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2916         if (err < 0)
2917                 goto errout;
2918
2919         rtm = nlmsg_data(nlh);
2920
2921         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2922         if (skb == NULL) {
2923                 err = -ENOBUFS;
2924                 goto errout;
2925         }
2926
2927         /* Reserve room for dummy headers; this skb can pass
2928            through a good chunk of the routing engine.
2929          */
2930         skb_reset_mac_header(skb);
2931         skb_reset_network_header(skb);
2932
2933         /* Bugfix: need to give ip_route_input() enough of an IP header so it does not choke. */
2934         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2935         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2936
2937         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2938         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2939         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2940
2941         if (iif) {
2942                 struct net_device *dev;
2943
2944                 dev = __dev_get_by_index(net, iif);
2945                 if (dev == NULL) {
2946                         err = -ENODEV;
2947                         goto errout_free;
2948                 }
2949
2950                 skb->protocol   = htons(ETH_P_IP);
2951                 skb->dev        = dev;
2952                 local_bh_disable();
2953                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2954                 local_bh_enable();
2955
2956                 rt = skb_rtable(skb);
2957                 if (err == 0 && rt->u.dst.error)
2958                         err = -rt->u.dst.error;
2959         } else {
2960                 struct flowi fl = {
2961                         .nl_u = {
2962                                 .ip4_u = {
2963                                         .daddr = dst,
2964                                         .saddr = src,
2965                                         .tos = rtm->rtm_tos,
2966                                 },
2967                         },
2968                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2969                 };
2970                 err = ip_route_output_key(net, &rt, &fl);
2971         }
2972
2973         if (err)
2974                 goto errout_free;
2975
2976         skb_dst_set(skb, &rt->u.dst);
2977         if (rtm->rtm_flags & RTM_F_NOTIFY)
2978                 rt->rt_flags |= RTCF_NOTIFY;
2979
2980         err = rt_fill_info(net, skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2981                            RTM_NEWROUTE, 0, 0);
2982         if (err <= 0)
2983                 goto errout_free;
2984
2985         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2986 errout:
2987         return err;
2988
2989 errout_free:
2990         kfree_skb(skb);
2991         goto errout;
2992 }
2993
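     /*
      * ip_rt_dump - netlink dump of the route cache.  Walks every hash
      * chain under rcu_read_lock_bh(), skipping entries that belong to
      * other namespaces or have expired; the current bucket and chain index
      * are kept in cb->args[] so the dump can resume once the skb is full.
      */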
2994 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2995 {
2996         struct rtable *rt;
2997         int h, s_h;
2998         int idx, s_idx;
2999         struct net *net;
3000
3001         net = sock_net(skb->sk);
3002
3003         s_h = cb->args[0];
3004         if (s_h < 0)
3005                 s_h = 0;
3006         s_idx = idx = cb->args[1];
3007         for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
3008                 if (!rt_hash_table[h].chain)
3009                         continue;
3010                 rcu_read_lock_bh();
3011                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
3012                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
3013                         if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
3014                                 continue;
3015                         if (rt_is_expired(rt))
3016                                 continue;
3017                         skb_dst_set(skb, dst_clone(&rt->u.dst));
3018                         if (rt_fill_info(net, skb, NETLINK_CB(cb->skb).pid,
3019                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
3020                                          1, NLM_F_MULTI) <= 0) {
3021                                 skb_dst_drop(skb);
3022                                 rcu_read_unlock_bh();
3023                                 goto done;
3024                         }
3025                         skb_dst_drop(skb);
3026                 }
3027                 rcu_read_unlock_bh();
3028         }
3029
3030 done:
3031         cb->args[0] = h;
3032         cb->args[1] = idx;
3033         return skb->len;
3034 }
3035
3036 void ip_rt_multicast_event(struct in_device *in_dev)
3037 {
3038         rt_cache_flush(dev_net(in_dev->dev), 0);
3039 }
3040
3041 #ifdef CONFIG_SYSCTL
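     /*
      * Write-only handler behind /proc/sys/net/ipv4/route/flush: the value
      * written is passed as the delay to rt_cache_flush() for the namespace
      * stashed in ->extra1 by sysctl_route_net_init().  Reads fail with
      * -EINVAL.
      */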
3042 static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
3043                                         void __user *buffer,
3044                                         size_t *lenp, loff_t *ppos)
3045 {
3046         if (write) {
3047                 int flush_delay;
3048                 ctl_table ctl;
3049                 struct net *net;
3050
3051                 memcpy(&ctl, __ctl, sizeof(ctl));
3052                 ctl.data = &flush_delay;
3053                 proc_dointvec(&ctl, write, buffer, lenp, ppos);
3054
3055                 net = (struct net *)__ctl->extra1;
3056                 rt_cache_flush(net, flush_delay);
3057                 return 0;
3058         }
3059
3060         return -EINVAL;
3061 }
3062
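     /*
      * rt_secret_reschedule - called when ip_rt_secret_interval changes.
      * For every network namespace the secret-rebuild timer is stopped and,
      * if the new interval is non-zero, re-armed: a timer that was still
      * pending keeps its remaining time adjusted by the delta, otherwise
      * the full new interval is used.
      */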
3063 static void rt_secret_reschedule(int old)
3064 {
3065         struct net *net;
3066         int new = ip_rt_secret_interval;
3067         int diff = new - old;
3068
3069         if (!diff)
3070                 return;
3071
3072         rtnl_lock();
3073         for_each_net(net) {
3074                 int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);
3075
3076                 if (!new)
3077                         continue;
3078
3079                 if (deleted) {
3080                         long time = net->ipv4.rt_secret_timer.expires - jiffies;
3081
3082                         if (time <= 0 || (time += diff) <= 0)
3083                                 time = 0;
3084
3085                         net->ipv4.rt_secret_timer.expires = time;
3086                 } else
3087                         net->ipv4.rt_secret_timer.expires = new;
3088
3089                 net->ipv4.rt_secret_timer.expires += jiffies;
3090                 add_timer(&net->ipv4.rt_secret_timer);
3091         }
3092         rtnl_unlock();
3093 }
3094
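     /*
      * Handler for the secret_interval sysctl: update the value through
      * proc_dointvec_jiffies() and reschedule the per-namespace timers
      * relative to the old interval.
      */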
3095 static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
3096                                           void __user *buffer, size_t *lenp,
3097                                           loff_t *ppos)
3098 {
3099         int old = ip_rt_secret_interval;
3100         int ret = proc_dointvec_jiffies(ctl, write, buffer, lenp, ppos);
3101
3102         rt_secret_reschedule(old);
3103
3104         return ret;
3105 }
3106
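     /*
      * Global (not per-namespace) route cache tunables, exposed under
      * /proc/sys/net/ipv4/route/ through the ipv4_skeleton table below.
      */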
3107 static ctl_table ipv4_route_table[] = {
3108         {
3109                 .procname       = "gc_thresh",
3110                 .data           = &ipv4_dst_ops.gc_thresh,
3111                 .maxlen         = sizeof(int),
3112                 .mode           = 0644,
3113                 .proc_handler   = proc_dointvec,
3114         },
3115         {
3116                 .procname       = "max_size",
3117                 .data           = &ip_rt_max_size,
3118                 .maxlen         = sizeof(int),
3119                 .mode           = 0644,
3120                 .proc_handler   = proc_dointvec,
3121         },
3122         {
3123                 /*  Deprecated. Use gc_min_interval_ms */
3124
3125                 .procname       = "gc_min_interval",
3126                 .data           = &ip_rt_gc_min_interval,
3127                 .maxlen         = sizeof(int),
3128                 .mode           = 0644,
3129                 .proc_handler   = proc_dointvec_jiffies,
3130         },
3131         {
3132                 .procname       = "gc_min_interval_ms",
3133                 .data           = &ip_rt_gc_min_interval,
3134                 .maxlen         = sizeof(int),
3135                 .mode           = 0644,
3136                 .proc_handler   = proc_dointvec_ms_jiffies,
3137         },
3138         {
3139                 .procname       = "gc_timeout",
3140                 .data           = &ip_rt_gc_timeout,
3141                 .maxlen         = sizeof(int),
3142                 .mode           = 0644,
3143                 .proc_handler   = proc_dointvec_jiffies,
3144         },
3145         {
3146                 .procname       = "gc_interval",
3147                 .data           = &ip_rt_gc_interval,
3148                 .maxlen         = sizeof(int),
3149                 .mode           = 0644,
3150                 .proc_handler   = proc_dointvec_jiffies,
3151         },
3152         {
3153                 .procname       = "redirect_load",
3154                 .data           = &ip_rt_redirect_load,
3155                 .maxlen         = sizeof(int),
3156                 .mode           = 0644,
3157                 .proc_handler   = proc_dointvec,
3158         },
3159         {
3160                 .procname       = "redirect_number",
3161                 .data           = &ip_rt_redirect_number,
3162                 .maxlen         = sizeof(int),
3163                 .mode           = 0644,
3164                 .proc_handler   = proc_dointvec,
3165         },
3166         {
3167                 .procname       = "redirect_silence",
3168                 .data           = &ip_rt_redirect_silence,
3169                 .maxlen         = sizeof(int),
3170                 .mode           = 0644,
3171                 .proc_handler   = proc_dointvec,
3172         },
3173         {
3174                 .procname       = "error_cost",
3175                 .data           = &ip_rt_error_cost,
3176                 .maxlen         = sizeof(int),
3177                 .mode           = 0644,
3178                 .proc_handler   = proc_dointvec,
3179         },
3180         {
3181                 .procname       = "error_burst",
3182                 .data           = &ip_rt_error_burst,
3183                 .maxlen         = sizeof(int),
3184                 .mode           = 0644,
3185                 .proc_handler   = proc_dointvec,
3186         },
3187         {
3188                 .procname       = "gc_elasticity",
3189                 .data           = &ip_rt_gc_elasticity,
3190                 .maxlen         = sizeof(int),
3191                 .mode           = 0644,
3192                 .proc_handler   = proc_dointvec,
3193         },
3194         {
3195                 .procname       = "mtu_expires",
3196                 .data           = &ip_rt_mtu_expires,
3197                 .maxlen         = sizeof(int),
3198                 .mode           = 0644,
3199                 .proc_handler   = proc_dointvec_jiffies,
3200         },
3201         {
3202                 .procname       = "min_pmtu",
3203                 .data           = &ip_rt_min_pmtu,
3204                 .maxlen         = sizeof(int),
3205                 .mode           = 0644,
3206                 .proc_handler   = proc_dointvec,
3207         },
3208         {
3209                 .procname       = "min_adv_mss",
3210                 .data           = &ip_rt_min_advmss,
3211                 .maxlen         = sizeof(int),
3212                 .mode           = 0644,
3213                 .proc_handler   = proc_dointvec,
3214         },
3215         {
3216                 .procname       = "secret_interval",
3217                 .data           = &ip_rt_secret_interval,
3218                 .maxlen         = sizeof(int),
3219                 .mode           = 0644,
3220                 .proc_handler   = ipv4_sysctl_rt_secret_interval,
3221         },
3222         { }
3223 };
3224
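     /*
      * Skeleton for the net.ipv4.route and net.ipv4.neigh sysctl
      * directories, registered early from ip_static_sysctl_init() so the
      * directories exist independently of the rest of the ipv4 init order.
      */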
3225 static struct ctl_table empty[1];
3226
3227 static struct ctl_table ipv4_skeleton[] =
3228 {
3229         { .procname = "route", 
3230           .mode = 0555, .child = ipv4_route_table},
3231         { .procname = "neigh", 
3232           .mode = 0555, .child = empty},
3233         { }
3234 };
3235
3236 static __net_initdata struct ctl_path ipv4_path[] = {
3237         { .procname = "net", },
3238         { .procname = "ipv4", },
3239         { },
3240 };
3241
3242 static struct ctl_table ipv4_route_flush_table[] = {
3243         {
3244                 .procname       = "flush",
3245                 .maxlen         = sizeof(int),
3246                 .mode           = 0200,
3247                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3248         },
3249         { },
3250 };
3251
3252 static __net_initdata struct ctl_path ipv4_route_path[] = {
3253         { .procname = "net", },
3254         { .procname = "ipv4", },
3255         { .procname = "route", },
3256         { },
3257 };
3258
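     /*
      * Per-namespace registration of the "flush" sysctl.  Namespaces other
      * than init_net get their own copy of the table so that ->extra1 can
      * point at the right struct net.
      */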
3259 static __net_init int sysctl_route_net_init(struct net *net)
3260 {
3261         struct ctl_table *tbl;
3262
3263         tbl = ipv4_route_flush_table;
3264         if (!net_eq(net, &init_net)) {
3265                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3266                 if (tbl == NULL)
3267                         goto err_dup;
3268         }
3269         tbl[0].extra1 = net;
3270
3271         net->ipv4.route_hdr =
3272                 register_net_sysctl_table(net, ipv4_route_path, tbl);
3273         if (net->ipv4.route_hdr == NULL)
3274                 goto err_reg;
3275         return 0;
3276
3277 err_reg:
3278         if (tbl != ipv4_route_flush_table)
3279                 kfree(tbl);
3280 err_dup:
3281         return -ENOMEM;
3282 }
3283
3284 static __net_exit void sysctl_route_net_exit(struct net *net)
3285 {
3286         struct ctl_table *tbl;
3287
3288         tbl = net->ipv4.route_hdr->ctl_table_arg;
3289         unregister_net_sysctl_table(net->ipv4.route_hdr);
3290         BUG_ON(tbl == ipv4_route_flush_table);
3291         kfree(tbl);
3292 }
3293
3294 static __net_initdata struct pernet_operations sysctl_route_ops = {
3295         .init = sysctl_route_net_init,
3296         .exit = sysctl_route_net_exit,
3297 };
3298 #endif
3299
3300
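     /*
      * Per-namespace setup: seed rt_genid from num_physpages and jiffies,
      * and arm the deferrable secret-rebuild timer if ip_rt_secret_interval
      * is non-zero.
      */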
3301 static __net_init int rt_secret_timer_init(struct net *net)
3302 {
3303         atomic_set(&net->ipv4.rt_genid,
3304                         (int) ((num_physpages ^ (num_physpages>>8)) ^
3305                         (jiffies ^ (jiffies >> 7))));
3306
3307         net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
3308         net->ipv4.rt_secret_timer.data = (unsigned long)net;
3309         init_timer_deferrable(&net->ipv4.rt_secret_timer);
3310
3311         if (ip_rt_secret_interval) {
3312                 net->ipv4.rt_secret_timer.expires =
3313                         jiffies + net_random() % ip_rt_secret_interval +
3314                         ip_rt_secret_interval;
3315                 add_timer(&net->ipv4.rt_secret_timer);
3316         }
3317         return 0;
3318 }
3319
3320 static __net_exit void rt_secret_timer_exit(struct net *net)
3321 {
3322         del_timer_sync(&net->ipv4.rt_secret_timer);
3323 }
3324
3325 static __net_initdata struct pernet_operations rt_secret_timer_ops = {
3326         .init = rt_secret_timer_init,
3327         .exit = rt_secret_timer_exit,
3328 };
3329
3330
3331 #ifdef CONFIG_NET_CLS_ROUTE
3332 struct ip_rt_acct *ip_rt_acct __read_mostly;
3333 #endif /* CONFIG_NET_CLS_ROUTE */
3334
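     /* "rhash_entries=" on the kernel command line overrides the route cache hash size. */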
3335 static __initdata unsigned long rhash_entries;
3336 static int __init set_rhash_entries(char *str)
3337 {
3338         if (!str)
3339                 return 0;
3340         rhash_entries = simple_strtoul(str, &str, 0);
3341         return 1;
3342 }
3343 __setup("rhash_entries=", set_rhash_entries);
3344
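     /*
      * ip_rt_init - boot-time initialization: create the dst slab, allocate
      * the route cache hash table (sized from rhash_entries or available
      * memory), derive gc_thresh and ip_rt_max_size from the table size,
      * start the expire worker and per-namespace secret timers, and
      * register the proc files, sysctls and the RTM_GETROUTE handler.
      */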
3345 int __init ip_rt_init(void)
3346 {
3347         int rc = 0;
3348
3349 #ifdef CONFIG_NET_CLS_ROUTE
3350         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3351         if (!ip_rt_acct)
3352                 panic("IP: failed to allocate ip_rt_acct\n");
3353 #endif
3354
3355         ipv4_dst_ops.kmem_cachep =
3356                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3357                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3358
3359         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3360
3361         rt_hash_table = (struct rt_hash_bucket *)
3362                 alloc_large_system_hash("IP route cache",
3363                                         sizeof(struct rt_hash_bucket),
3364                                         rhash_entries,
3365                                         (totalram_pages >= 128 * 1024) ?
3366                                         15 : 17,
3367                                         0,
3368                                         &rt_hash_log,
3369                                         &rt_hash_mask,
3370                                         rhash_entries ? 0 : 512 * 1024);
3371         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3372         rt_hash_lock_init();
3373
3374         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3375         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3376
3377         devinet_init();
3378         ip_fib_init();
3379
3380         /* All the timers started at system startup tend
3381            to synchronize. Perturb them a bit.
3382          */
3383         INIT_DELAYED_WORK_DEFERRABLE(&expires_work, rt_worker_func);
3384         expires_ljiffies = jiffies;
3385         schedule_delayed_work(&expires_work,
3386                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3387
3388         if (register_pernet_subsys(&rt_secret_timer_ops))
3389                 printk(KERN_ERR "Unable to setup rt_secret_timer\n");
3390
3391         if (ip_rt_proc_init())
3392                 printk(KERN_ERR "Unable to create route proc files\n");
3393 #ifdef CONFIG_XFRM
3394         xfrm_init();
3395         xfrm4_init(ip_rt_max_size);
3396 #endif
3397         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3398
3399 #ifdef CONFIG_SYSCTL
3400         register_pernet_subsys(&sysctl_route_ops);
3401 #endif
3402         return rc;
3403 }
3404
3405 #ifdef CONFIG_SYSCTL
3406 /*
3407  * We really need to sanitize the damn ipv4 init order, then all
3408  * this nonsense will go away.
3409  */
3410 void __init ip_static_sysctl_init(void)
3411 {
3412         register_sysctl_paths(ipv4_path, ipv4_skeleton);
3413 }
3414 #endif
3415
3416 EXPORT_SYMBOL(__ip_select_ident);
3417 EXPORT_SYMBOL(ip_route_input);
3418 EXPORT_SYMBOL(ip_route_output_key);