ipv4: Loosen source address check on IPv4 output
net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD;
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #include <linux/module.h>
66 #include <asm/uaccess.h>
67 #include <asm/system.h>
68 #include <linux/bitops.h>
69 #include <linux/types.h>
70 #include <linux/kernel.h>
71 #include <linux/mm.h>
72 #include <linux/bootmem.h>
73 #include <linux/string.h>
74 #include <linux/socket.h>
75 #include <linux/sockios.h>
76 #include <linux/errno.h>
77 #include <linux/in.h>
78 #include <linux/inet.h>
79 #include <linux/netdevice.h>
80 #include <linux/proc_fs.h>
81 #include <linux/init.h>
82 #include <linux/workqueue.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/jhash.h>
91 #include <linux/rcupdate.h>
92 #include <linux/times.h>
93 #include <net/dst.h>
94 #include <net/net_namespace.h>
95 #include <net/protocol.h>
96 #include <net/ip.h>
97 #include <net/route.h>
98 #include <net/inetpeer.h>
99 #include <net/sock.h>
100 #include <net/ip_fib.h>
101 #include <net/arp.h>
102 #include <net/tcp.h>
103 #include <net/icmp.h>
104 #include <net/xfrm.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110
111 #define RT_FL_TOS(oldflp) \
112     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
113
114 #define IP_MAX_MTU      0xFFF0
115
116 #define RT_GC_TIMEOUT (300*HZ)
117
118 static int ip_rt_max_size;
119 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
120 static int ip_rt_gc_interval __read_mostly      = 60 * HZ;
121 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
122 static int ip_rt_redirect_number __read_mostly  = 9;
123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly       = HZ;
126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
127 static int ip_rt_gc_elasticity __read_mostly    = 8;
128 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
129 static int ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
130 static int ip_rt_min_advmss __read_mostly       = 256;
131 static int ip_rt_secret_interval __read_mostly  = 10 * 60 * HZ;
132
133 static void rt_worker_func(struct work_struct *work);
134 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
135
136 /*
137  *      Interface to generic destination cache.
138  */
139
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static void              ipv4_dst_destroy(struct dst_entry *dst);
142 static void              ipv4_dst_ifdown(struct dst_entry *dst,
143                                          struct net_device *dev, int how);
144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
145 static void              ipv4_link_failure(struct sk_buff *skb);
146 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
147 static int rt_garbage_collect(struct dst_ops *ops);
148
149
150 static struct dst_ops ipv4_dst_ops = {
151         .family =               AF_INET,
152         .protocol =             __constant_htons(ETH_P_IP),
153         .gc =                   rt_garbage_collect,
154         .check =                ipv4_dst_check,
155         .destroy =              ipv4_dst_destroy,
156         .ifdown =               ipv4_dst_ifdown,
157         .negative_advice =      ipv4_negative_advice,
158         .link_failure =         ipv4_link_failure,
159         .update_pmtu =          ip_rt_update_pmtu,
160         .local_out =            __ip_local_out,
161         .entry_size =           sizeof(struct rtable),
162         .entries =              ATOMIC_INIT(0),
163 };
164
165 #define ECN_OR_COST(class)      TC_PRIO_##class
166
167 const __u8 ip_tos2prio[16] = {
168         TC_PRIO_BESTEFFORT,
169         ECN_OR_COST(FILLER),
170         TC_PRIO_BESTEFFORT,
171         ECN_OR_COST(BESTEFFORT),
172         TC_PRIO_BULK,
173         ECN_OR_COST(BULK),
174         TC_PRIO_BULK,
175         ECN_OR_COST(BULK),
176         TC_PRIO_INTERACTIVE,
177         ECN_OR_COST(INTERACTIVE),
178         TC_PRIO_INTERACTIVE,
179         ECN_OR_COST(INTERACTIVE),
180         TC_PRIO_INTERACTIVE_BULK,
181         ECN_OR_COST(INTERACTIVE_BULK),
182         TC_PRIO_INTERACTIVE_BULK,
183         ECN_OR_COST(INTERACTIVE_BULK)
184 };
185
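/*
 * Illustrative sketch, not part of the original file: ip_tos2prio is
 * indexed by the four TOS bits with the low bit shifted away, which is
 * how rt_tos2priority() in include/net/route.h uses this table. The
 * helper name below is hypothetical.
 */
static inline char example_tos2priority(u8 tos)
{
        /* IPTOS_TOS() masks down to the TOS field; >> 1 drops the LSB */
        return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}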
186
187 /*
188  * Route cache.
189  */
190
191 /* The locking scheme is rather straightforward:
192  *
193  * 1) Read-Copy Update protects the buckets of the central route hash.
194  * 2) Only writers remove entries, and they hold the lock
195  *    as they look at rtable reference counts.
196  * 3) Only readers acquire references to rtable entries,
197  *    they do so with atomic increments and with the
198  *    lock held.
199  */
200
201 struct rt_hash_bucket {
202         struct rtable   *chain;
203 };
204 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
205         defined(CONFIG_PROVE_LOCKING)
206 /*
207  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
208  * The size of this table is a power of two and depends on the number of CPUs.
209  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
210  */
211 #ifdef CONFIG_LOCKDEP
212 # define RT_HASH_LOCK_SZ        256
213 #else
214 # if NR_CPUS >= 32
215 #  define RT_HASH_LOCK_SZ       4096
216 # elif NR_CPUS >= 16
217 #  define RT_HASH_LOCK_SZ       2048
218 # elif NR_CPUS >= 8
219 #  define RT_HASH_LOCK_SZ       1024
220 # elif NR_CPUS >= 4
221 #  define RT_HASH_LOCK_SZ       512
222 # else
223 #  define RT_HASH_LOCK_SZ       256
224 # endif
225 #endif
226
227 static spinlock_t       *rt_hash_locks;
228 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
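/*
 * Worked example (illustrative): with RT_HASH_LOCK_SZ == 256, hash
 * buckets 0x0134 and 0x2234 both map to lock slot 0x34, so unrelated
 * buckets may share a lock, but the lock table stays small and
 * cache-warm.
 */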
229
230 static __init void rt_hash_lock_init(void)
231 {
232         int i;
233
234         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
235                         GFP_KERNEL);
236         if (!rt_hash_locks)
237                 panic("IP: failed to allocate rt_hash_locks\n");
238
239         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
240                 spin_lock_init(&rt_hash_locks[i]);
241 }
242 #else
243 # define rt_hash_lock_addr(slot) NULL
244
245 static inline void rt_hash_lock_init(void)
246 {
247 }
248 #endif
249
250 static struct rt_hash_bucket    *rt_hash_table __read_mostly;
251 static unsigned                 rt_hash_mask __read_mostly;
252 static unsigned int             rt_hash_log  __read_mostly;
253
254 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
255 #define RT_CACHE_STAT_INC(field) \
256         (__raw_get_cpu_var(rt_cache_stat).field++)
257
258 static inline unsigned int rt_hash(__be32 daddr, __be32 saddr, int idx,
259                 int genid)
260 {
261         return jhash_3words((__force u32)(__be32)(daddr),
262                             (__force u32)(__be32)(saddr),
263                             idx, genid)
264                 & rt_hash_mask;
265 }
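/*
 * A minimal reader-side sketch, not part of the original file, showing
 * the lock-free lookup discipline described in the locking-scheme
 * comment above: walk the chain under rcu_read_lock_bh() and take a
 * reference with an atomic increment only. The function name is
 * hypothetical and error handling is elided.
 */
static struct rtable *example_rt_cache_lookup(__be32 daddr, __be32 saddr,
                                              int oif, int genid)
{
        unsigned hash = rt_hash(daddr, saddr, oif, genid);
        struct rtable *rth;

        rcu_read_lock_bh();
        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
             rth = rcu_dereference(rth->u.dst.rt_next)) {
                if (rth->fl.fl4_dst == daddr &&
                    rth->fl.fl4_src == saddr &&
                    rth->fl.oif == oif &&
                    rth->rt_genid == genid) {
                        /* Readers only ever take atomic references */
                        dst_use(&rth->u.dst, jiffies);
                        break;
                }
        }
        rcu_read_unlock_bh();
        return rth;
}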
266
267 static inline int rt_genid(struct net *net)
268 {
269         return atomic_read(&net->ipv4.rt_genid);
270 }
271
272 #ifdef CONFIG_PROC_FS
273 struct rt_cache_iter_state {
274         struct seq_net_private p;
275         int bucket;
276         int genid;
277 };
278
279 static struct rtable *rt_cache_get_first(struct seq_file *seq)
280 {
281         struct rt_cache_iter_state *st = seq->private;
282         struct rtable *r = NULL;
283
284         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285                 if (!rt_hash_table[st->bucket].chain)
286                         continue;
287                 rcu_read_lock_bh();
288                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
289                 while (r) {
290                         if (dev_net(r->u.dst.dev) == seq_file_net(seq) &&
291                             r->rt_genid == st->genid)
292                                 return r;
293                         r = rcu_dereference(r->u.dst.rt_next);
294                 }
295                 rcu_read_unlock_bh();
296         }
297         return r;
298 }
299
300 static struct rtable *__rt_cache_get_next(struct seq_file *seq,
301                                           struct rtable *r)
302 {
303         struct rt_cache_iter_state *st = seq->private;
304
305         r = r->u.dst.rt_next;
306         while (!r) {
307                 rcu_read_unlock_bh();
308                 do {
309                         if (--st->bucket < 0)
310                                 return NULL;
311                 } while (!rt_hash_table[st->bucket].chain);
312                 rcu_read_lock_bh();
313                 r = rt_hash_table[st->bucket].chain;
314         }
315         return rcu_dereference(r);
316 }
317
318 static struct rtable *rt_cache_get_next(struct seq_file *seq,
319                                         struct rtable *r)
320 {
321         struct rt_cache_iter_state *st = seq->private;
322         while ((r = __rt_cache_get_next(seq, r)) != NULL) {
323                 if (dev_net(r->u.dst.dev) != seq_file_net(seq))
324                         continue;
325                 if (r->rt_genid == st->genid)
326                         break;
327         }
328         return r;
329 }
330
331 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
332 {
333         struct rtable *r = rt_cache_get_first(seq);
334
335         if (r)
336                 while (pos && (r = rt_cache_get_next(seq, r)))
337                         --pos;
338         return pos ? NULL : r;
339 }
340
341 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
342 {
343         struct rt_cache_iter_state *st = seq->private;
344         if (*pos)
345                 return rt_cache_get_idx(seq, *pos - 1);
346         st->genid = rt_genid(seq_file_net(seq));
347         return SEQ_START_TOKEN;
348 }
349
350 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
351 {
352         struct rtable *r;
353
354         if (v == SEQ_START_TOKEN)
355                 r = rt_cache_get_first(seq);
356         else
357                 r = rt_cache_get_next(seq, v);
358         ++*pos;
359         return r;
360 }
361
362 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
363 {
364         if (v && v != SEQ_START_TOKEN)
365                 rcu_read_unlock_bh();
366 }
367
368 static int rt_cache_seq_show(struct seq_file *seq, void *v)
369 {
370         if (v == SEQ_START_TOKEN)
371                 seq_printf(seq, "%-127s\n",
372                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
373                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
374                            "HHUptod\tSpecDst");
375         else {
376                 struct rtable *r = v;
377                 int len;
378
379                 seq_printf(seq, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
380                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X%n",
381                         r->u.dst.dev ? r->u.dst.dev->name : "*",
382                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
383                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
384                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
385                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
386                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
387                         dst_metric(&r->u.dst, RTAX_WINDOW),
388                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
389                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
390                         r->fl.fl4_tos,
391                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
392                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
393                                        dev_queue_xmit) : 0,
394                         r->rt_spec_dst, &len);
395
396                 seq_printf(seq, "%*s\n", 127 - len, "");
397         }
398         return 0;
399 }
400
401 static const struct seq_operations rt_cache_seq_ops = {
402         .start  = rt_cache_seq_start,
403         .next   = rt_cache_seq_next,
404         .stop   = rt_cache_seq_stop,
405         .show   = rt_cache_seq_show,
406 };
407
408 static int rt_cache_seq_open(struct inode *inode, struct file *file)
409 {
410         return seq_open_net(inode, file, &rt_cache_seq_ops,
411                         sizeof(struct rt_cache_iter_state));
412 }
413
414 static const struct file_operations rt_cache_seq_fops = {
415         .owner   = THIS_MODULE,
416         .open    = rt_cache_seq_open,
417         .read    = seq_read,
418         .llseek  = seq_lseek,
419         .release = seq_release_net,
420 };
421
422
423 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
424 {
425         int cpu;
426
427         if (*pos == 0)
428                 return SEQ_START_TOKEN;
429
430         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
431                 if (!cpu_possible(cpu))
432                         continue;
433                 *pos = cpu+1;
434                 return &per_cpu(rt_cache_stat, cpu);
435         }
436         return NULL;
437 }
438
439 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
440 {
441         int cpu;
442
443         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
444                 if (!cpu_possible(cpu))
445                         continue;
446                 *pos = cpu+1;
447                 return &per_cpu(rt_cache_stat, cpu);
448         }
449         return NULL;
450
451 }
452
453 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
454 {
455
456 }
457
458 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
459 {
460         struct rt_cache_stat *st = v;
461
462         if (v == SEQ_START_TOKEN) {
463                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
464                 return 0;
465         }
466
467         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
468                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
469                    atomic_read(&ipv4_dst_ops.entries),
470                    st->in_hit,
471                    st->in_slow_tot,
472                    st->in_slow_mc,
473                    st->in_no_route,
474                    st->in_brd,
475                    st->in_martian_dst,
476                    st->in_martian_src,
477
478                    st->out_hit,
479                    st->out_slow_tot,
480                    st->out_slow_mc,
481
482                    st->gc_total,
483                    st->gc_ignored,
484                    st->gc_goal_miss,
485                    st->gc_dst_overflow,
486                    st->in_hlist_search,
487                    st->out_hlist_search
488                 );
489         return 0;
490 }
491
492 static const struct seq_operations rt_cpu_seq_ops = {
493         .start  = rt_cpu_seq_start,
494         .next   = rt_cpu_seq_next,
495         .stop   = rt_cpu_seq_stop,
496         .show   = rt_cpu_seq_show,
497 };
498
499
500 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
501 {
502         return seq_open(file, &rt_cpu_seq_ops);
503 }
504
505 static const struct file_operations rt_cpu_seq_fops = {
506         .owner   = THIS_MODULE,
507         .open    = rt_cpu_seq_open,
508         .read    = seq_read,
509         .llseek  = seq_lseek,
510         .release = seq_release,
511 };
512
513 #ifdef CONFIG_NET_CLS_ROUTE
514 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
515                            int length, int *eof, void *data)
516 {
517         unsigned int i;
518
519         if ((offset & 3) || (length & 3))
520                 return -EIO;
521
522         if (offset >= sizeof(struct ip_rt_acct) * 256) {
523                 *eof = 1;
524                 return 0;
525         }
526
527         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
528                 length = sizeof(struct ip_rt_acct) * 256 - offset;
529                 *eof = 1;
530         }
531
532         offset /= sizeof(u32);
533
534         if (length > 0) {
535                 u32 *dst = (u32 *) buffer;
536
537                 *start = buffer;
538                 memset(dst, 0, length);
539
540                 for_each_possible_cpu(i) {
541                         unsigned int j;
542                         u32 *src;
543
544                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
545                         for (j = 0; j < length/4; j++)
546                                 dst[j] += src[j];
547                 }
548         }
549         return length;
550 }
551 #endif
552
553 static int __net_init ip_rt_do_proc_init(struct net *net)
554 {
555         struct proc_dir_entry *pde;
556
557         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
558                         &rt_cache_seq_fops);
559         if (!pde)
560                 goto err1;
561
562         pde = proc_create("rt_cache", S_IRUGO,
563                           net->proc_net_stat, &rt_cpu_seq_fops);
564         if (!pde)
565                 goto err2;
566
567 #ifdef CONFIG_NET_CLS_ROUTE
568         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
569                         ip_rt_acct_read, NULL);
570         if (!pde)
571                 goto err3;
572 #endif
573         return 0;
574
575 #ifdef CONFIG_NET_CLS_ROUTE
576 err3:
577         remove_proc_entry("rt_cache", net->proc_net_stat);
578 #endif
579 err2:
580         remove_proc_entry("rt_cache", net->proc_net);
581 err1:
582         return -ENOMEM;
583 }
584
585 static void __net_exit ip_rt_do_proc_exit(struct net *net)
586 {
587         remove_proc_entry("rt_cache", net->proc_net_stat);
588         remove_proc_entry("rt_cache", net->proc_net);
589         remove_proc_entry("rt_acct", net->proc_net);
590 }
591
592 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
593         .init = ip_rt_do_proc_init,
594         .exit = ip_rt_do_proc_exit,
595 };
596
597 static int __init ip_rt_proc_init(void)
598 {
599         return register_pernet_subsys(&ip_rt_proc_ops);
600 }
601
602 #else
603 static inline int ip_rt_proc_init(void)
604 {
605         return 0;
606 }
607 #endif /* CONFIG_PROC_FS */
608
609 static inline void rt_free(struct rtable *rt)
610 {
611         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
612 }
613
614 static inline void rt_drop(struct rtable *rt)
615 {
616         ip_rt_put(rt);
617         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
618 }
619
620 static inline int rt_fast_clean(struct rtable *rth)
621 {
622         /* Kill broadcast/multicast entries very aggressively, if they
623            collide in hash table with more useful entries */
624         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
625                 rth->fl.iif && rth->u.dst.rt_next;
626 }
627
628 static inline int rt_valuable(struct rtable *rth)
629 {
630         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
631                 rth->u.dst.expires;
632 }
633
634 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
635 {
636         unsigned long age;
637         int ret = 0;
638
639         if (atomic_read(&rth->u.dst.__refcnt))
640                 goto out;
641
642         ret = 1;
643         if (rth->u.dst.expires &&
644             time_after_eq(jiffies, rth->u.dst.expires))
645                 goto out;
646
647         age = jiffies - rth->u.dst.lastuse;
648         ret = 0;
649         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
650             (age <= tmo2 && rt_valuable(rth)))
651                 goto out;
652         ret = 1;
653 out:    return ret;
654 }
655
656 /* Bits of score are:
657  * 31: very valuable
658  * 30: not quite useless
659  * 29..0: usage counter
660  */
661 static inline u32 rt_score(struct rtable *rt)
662 {
663         u32 score = jiffies - rt->u.dst.lastuse;
664
665         score = ~score & ~(3<<30);
666
667         if (rt_valuable(rt))
668                 score |= (1<<31);
669
670         if (!rt->fl.iif ||
671             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
672                 score |= (1<<30);
673
674         return score;
675 }
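/*
 * Worked example (illustrative): a just-used output unicast route has
 * age ~0, so its score starts near 0x3fffffff; bit 30 is set because
 * it is an output route, and bit 31 if it is valuable. An idle input
 * broadcast entry keeps both top bits clear, so rt_intern_hash evicts
 * it first when a bucket grows too long.
 */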
676
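/*
 * compare_keys() folds every field comparison into a single OR of XORs,
 * so the common miss case costs no extra branches. Note (an assumption
 * from the flowi struct layout): the u16 load at &tos also covers the
 * adjacent scope byte, comparing both in one go.
 */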
677 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
678 {
679         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
680                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
681                 (fl1->mark ^ fl2->mark) |
682                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
683                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
684                 (fl1->oif ^ fl2->oif) |
685                 (fl1->iif ^ fl2->iif)) == 0;
686 }
687
688 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
689 {
690         return dev_net(rt1->u.dst.dev) == dev_net(rt2->u.dst.dev);
691 }
692
693 static inline int rt_is_expired(struct rtable *rth)
694 {
695         return rth->rt_genid != rt_genid(dev_net(rth->u.dst.dev));
696 }
697
698 /*
699  * Perform a full scan of the hash table and free all entries.
700  * Can be called by a softirq or a process.
701  * In the latter case, we want to reschedule if necessary.
702  */
703 static void rt_do_flush(int process_context)
704 {
705         unsigned int i;
706         struct rtable *rth, *next;
707         struct rtable * tail;
708
709         for (i = 0; i <= rt_hash_mask; i++) {
710                 if (process_context && need_resched())
711                         cond_resched();
712                 rth = rt_hash_table[i].chain;
713                 if (!rth)
714                         continue;
715
716                 spin_lock_bh(rt_hash_lock_addr(i));
717 #ifdef CONFIG_NET_NS
718                 {
719                 struct rtable ** prev, * p;
720
721                 rth = rt_hash_table[i].chain;
722
723                 /* Defer releasing entries at the head of the list until after the spin_unlock */
724                 for (tail = rth; tail; tail = tail->u.dst.rt_next)
725                         if (!rt_is_expired(tail))
726                                 break;
727                 if (rth != tail)
728                         rt_hash_table[i].chain = tail;
729
730                 /* call rt_free on entries after the tail requiring flush */
731                 prev = &rt_hash_table[i].chain;
732                 for (p = *prev; p; p = next) {
733                         next = p->u.dst.rt_next;
734                         if (!rt_is_expired(p)) {
735                                 prev = &p->u.dst.rt_next;
736                         } else {
737                                 *prev = next;
738                                 rt_free(p);
739                         }
740                 }
741                 }
742 #else
743                 rth = rt_hash_table[i].chain;
744                 rt_hash_table[i].chain = NULL;
745                 tail = NULL;
746 #endif
747                 spin_unlock_bh(rt_hash_lock_addr(i));
748
749                 for (; rth != tail; rth = next) {
750                         next = rth->u.dst.rt_next;
751                         rt_free(rth);
752                 }
753         }
754 }
755
756 static void rt_check_expire(void)
757 {
758         static unsigned int rover;
759         unsigned int i = rover, goal;
760         struct rtable *rth, **rthp;
761         u64 mult;
762
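        /*
         * Scan goal: buckets * gc_interval / gc_timeout, so that on
         * average the whole table is swept once per ip_rt_gc_timeout
         * (e.g. 2^17 buckets * 60 s / 300 s ~= 26214 buckets per run).
         */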
763         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
764         if (ip_rt_gc_timeout > 1)
765                 do_div(mult, ip_rt_gc_timeout);
766         goal = (unsigned int)mult;
767         if (goal > rt_hash_mask)
768                 goal = rt_hash_mask + 1;
769         for (; goal > 0; goal--) {
770                 unsigned long tmo = ip_rt_gc_timeout;
771
772                 i = (i + 1) & rt_hash_mask;
773                 rthp = &rt_hash_table[i].chain;
774
775                 if (need_resched())
776                         cond_resched();
777
778                 if (*rthp == NULL)
779                         continue;
780                 spin_lock_bh(rt_hash_lock_addr(i));
781                 while ((rth = *rthp) != NULL) {
782                         if (rt_is_expired(rth)) {
783                                 *rthp = rth->u.dst.rt_next;
784                                 rt_free(rth);
785                                 continue;
786                         }
787                         if (rth->u.dst.expires) {
788                                 /* Entry is expired even if it is in use */
789                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
790                                         tmo >>= 1;
791                                         rthp = &rth->u.dst.rt_next;
792                                         continue;
793                                 }
794                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
795                                 tmo >>= 1;
796                                 rthp = &rth->u.dst.rt_next;
797                                 continue;
798                         }
799
800                         /* Clean up aged-off entries. */
801                         *rthp = rth->u.dst.rt_next;
802                         rt_free(rth);
803                 }
804                 spin_unlock_bh(rt_hash_lock_addr(i));
805         }
806         rover = i;
807 }
808
809 /*
810  * rt_worker_func() is run in process context.
811  * We call rt_check_expire() to scan part of the hash table.
812  */
813 static void rt_worker_func(struct work_struct *work)
814 {
815         rt_check_expire();
816         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
817 }
818
819 /*
820  * Perturbation of rt_genid by a small quantity [1..256].
821  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
822  * many times (2^24) without repeating a recent rt_genid.
823  * Jenkins hash is strong enough that little changes of rt_genid are OK.
824  */
825 static void rt_cache_invalidate(struct net *net)
826 {
827         unsigned char shuffle;
828
829         get_random_bytes(&shuffle, sizeof(shuffle));
830         atomic_add(shuffle + 1U, &net->ipv4.rt_genid);
831 }
832
833 /*
834  * delay < 0  : invalidate cache (fast : entries will be deleted later)
835  * delay >= 0 : invalidate & flush cache (can be long)
836  */
837 void rt_cache_flush(struct net *net, int delay)
838 {
839         rt_cache_invalidate(net);
840         if (delay >= 0)
841                 rt_do_flush(!in_softirq());
842 }
843
844 /*
845  * We change rt_genid and let gc do the cleanup
846  */
847 static void rt_secret_rebuild(unsigned long __net)
848 {
849         struct net *net = (struct net *)__net;
850         rt_cache_invalidate(net);
851         mod_timer(&net->ipv4.rt_secret_timer, jiffies + ip_rt_secret_interval);
852 }
853
854 /*
855    Short description of GC goals.
856
857    We want to build an algorithm which keeps the routing cache
858    at some equilibrium point, where the number of aged-off entries
859    is approximately equal to the number of newly generated ones.
860
861    The current expiration strength is the variable "expire".
862    We try to adjust it dynamically, so that when the network is
863    idle, expire is large enough to keep enough warm entries, and
864    when load increases, it shrinks to limit the cache size.
865  */
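/*
 * Worked example (illustrative, assuming rt_hash_log == 17 and the
 * default ip_rt_gc_elasticity of 8): the soft cap below is
 * 8 << 17 == 1048576 entries, so with 1200000 cached entries the first
 * pass sets goal = 1200000 - 1048576 = 151424 entries to expire.
 */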
866
867 static int rt_garbage_collect(struct dst_ops *ops)
868 {
869         static unsigned long expire = RT_GC_TIMEOUT;
870         static unsigned long last_gc;
871         static int rover;
872         static int equilibrium;
873         struct rtable *rth, **rthp;
874         unsigned long now = jiffies;
875         int goal;
876
877         /*
878          * Garbage collection is pretty expensive,
879          * do not make it too frequently.
880          */
881
882         RT_CACHE_STAT_INC(gc_total);
883
884         if (now - last_gc < ip_rt_gc_min_interval &&
885             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
886                 RT_CACHE_STAT_INC(gc_ignored);
887                 goto out;
888         }
889
890         /* Calculate the number of entries we want to expire now. */
891         goal = atomic_read(&ipv4_dst_ops.entries) -
892                 (ip_rt_gc_elasticity << rt_hash_log);
893         if (goal <= 0) {
894                 if (equilibrium < ipv4_dst_ops.gc_thresh)
895                         equilibrium = ipv4_dst_ops.gc_thresh;
896                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
897                 if (goal > 0) {
898                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
899                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
900                 }
901         } else {
902                 /* We are in a dangerous area. Try to reduce the cache
903                  * really aggressively.
904                  */
905                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
906                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
907         }
908
909         if (now - last_gc >= ip_rt_gc_min_interval)
910                 last_gc = now;
911
912         if (goal <= 0) {
913                 equilibrium += goal;
914                 goto work_done;
915         }
916
917         do {
918                 int i, k;
919
920                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
921                         unsigned long tmo = expire;
922
923                         k = (k + 1) & rt_hash_mask;
924                         rthp = &rt_hash_table[k].chain;
925                         spin_lock_bh(rt_hash_lock_addr(k));
926                         while ((rth = *rthp) != NULL) {
927                                 if (!rt_is_expired(rth) &&
928                                         !rt_may_expire(rth, tmo, expire)) {
929                                         tmo >>= 1;
930                                         rthp = &rth->u.dst.rt_next;
931                                         continue;
932                                 }
933                                 *rthp = rth->u.dst.rt_next;
934                                 rt_free(rth);
935                                 goal--;
936                         }
937                         spin_unlock_bh(rt_hash_lock_addr(k));
938                         if (goal <= 0)
939                                 break;
940                 }
941                 rover = k;
942
943                 if (goal <= 0)
944                         goto work_done;
945
946                 /* Goal is not achieved. We stop the process if:
947
948                    - expire is reduced to zero; otherwise, expire is halved.
949                    - the table is not full.
950                    - we are called from interrupt context.
951                    - the jiffies check is just a fallback/debug loop breaker;
952                      we will not spin here for a long time in any case.
953                  */
954
955                 RT_CACHE_STAT_INC(gc_goal_miss);
956
957                 if (expire == 0)
958                         break;
959
960                 expire >>= 1;
961 #if RT_CACHE_DEBUG >= 2
962                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
963                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
964 #endif
965
966                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
967                         goto out;
968         } while (!in_softirq() && time_before_eq(jiffies, now));
969
970         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
971                 goto out;
972         if (net_ratelimit())
973                 printk(KERN_WARNING "dst cache overflow\n");
974         RT_CACHE_STAT_INC(gc_dst_overflow);
975         return 1;
976
977 work_done:
978         expire += ip_rt_gc_min_interval;
979         if (expire > ip_rt_gc_timeout ||
980             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
981                 expire = ip_rt_gc_timeout;
982 #if RT_CACHE_DEBUG >= 2
983         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
984                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
985 #endif
986 out:    return 0;
987 }
988
989 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
990 {
991         struct rtable   *rth, **rthp;
992         unsigned long   now;
993         struct rtable *cand, **candp;
994         u32             min_score;
995         int             chain_length;
996         int attempts = !in_softirq();
997
998 restart:
999         chain_length = 0;
1000         min_score = ~(u32)0;
1001         cand = NULL;
1002         candp = NULL;
1003         now = jiffies;
1004
1005         rthp = &rt_hash_table[hash].chain;
1006
1007         spin_lock_bh(rt_hash_lock_addr(hash));
1008         while ((rth = *rthp) != NULL) {
1009                 if (rt_is_expired(rth)) {
1010                         *rthp = rth->u.dst.rt_next;
1011                         rt_free(rth);
1012                         continue;
1013                 }
1014                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
1015                         /* Put it first */
1016                         *rthp = rth->u.dst.rt_next;
1017                         /*
1018                          * Since lookup is lockfree, the deletion
1019                          * must be visible to another weakly ordered CPU before
1020                          * the insertion at the start of the hash chain.
1021                          */
1022                         rcu_assign_pointer(rth->u.dst.rt_next,
1023                                            rt_hash_table[hash].chain);
1024                         /*
1025                          * Since lookup is lockfree, the update writes
1026                          * must be ordered for consistency on SMP.
1027                          */
1028                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
1029
1030                         dst_use(&rth->u.dst, now);
1031                         spin_unlock_bh(rt_hash_lock_addr(hash));
1032
1033                         rt_drop(rt);
1034                         *rp = rth;
1035                         return 0;
1036                 }
1037
1038                 if (!atomic_read(&rth->u.dst.__refcnt)) {
1039                         u32 score = rt_score(rth);
1040
1041                         if (score <= min_score) {
1042                                 cand = rth;
1043                                 candp = rthp;
1044                                 min_score = score;
1045                         }
1046                 }
1047
1048                 chain_length++;
1049
1050                 rthp = &rth->u.dst.rt_next;
1051         }
1052
1053         if (cand) {
1054                 /* ip_rt_gc_elasticity used to be the average chain length;
1055                  * when exceeded, GC becomes really aggressive.
1056                  *
1057                  * The second limit is less certain. At the moment it allows
1058                  * only 2 entries per bucket. We will see.
1059                  */
1060                 if (chain_length > ip_rt_gc_elasticity) {
1061                         *candp = cand->u.dst.rt_next;
1062                         rt_free(cand);
1063                 }
1064         }
1065
1066         /* Try to bind the route to an ARP neighbour only if it is an
1067            output route or on the unicast forwarding path.
1068          */
1069         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1070                 int err = arp_bind_neighbour(&rt->u.dst);
1071                 if (err) {
1072                         spin_unlock_bh(rt_hash_lock_addr(hash));
1073
1074                         if (err != -ENOBUFS) {
1075                                 rt_drop(rt);
1076                                 return err;
1077                         }
1078
1079                         /* Neighbour tables are full and nothing
1080                            can be released. Try to shrink the route cache;
1081                            most likely it holds some neighbour records.
1082                          */
1083                         if (attempts-- > 0) {
1084                                 int saved_elasticity = ip_rt_gc_elasticity;
1085                                 int saved_int = ip_rt_gc_min_interval;
1086                                 ip_rt_gc_elasticity     = 1;
1087                                 ip_rt_gc_min_interval   = 0;
1088                                 rt_garbage_collect(&ipv4_dst_ops);
1089                                 ip_rt_gc_min_interval   = saved_int;
1090                                 ip_rt_gc_elasticity     = saved_elasticity;
1091                                 goto restart;
1092                         }
1093
1094                         if (net_ratelimit())
1095                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1096                         rt_drop(rt);
1097                         return -ENOBUFS;
1098                 }
1099         }
1100
1101         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1102 #if RT_CACHE_DEBUG >= 2
1103         if (rt->u.dst.rt_next) {
1104                 struct rtable *trt;
1105                 printk(KERN_DEBUG "rt_cache @%02x: " NIPQUAD_FMT, hash,
1106                        NIPQUAD(rt->rt_dst));
1107                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1108                         printk(" . " NIPQUAD_FMT, NIPQUAD(trt->rt_dst));
1109                 printk("\n");
1110         }
1111 #endif
1112         rt_hash_table[hash].chain = rt;
1113         spin_unlock_bh(rt_hash_lock_addr(hash));
1114         *rp = rt;
1115         return 0;
1116 }
1117
1118 void rt_bind_peer(struct rtable *rt, int create)
1119 {
1120         static DEFINE_SPINLOCK(rt_peer_lock);
1121         struct inet_peer *peer;
1122
1123         peer = inet_getpeer(rt->rt_dst, create);
1124
1125         spin_lock_bh(&rt_peer_lock);
1126         if (rt->peer == NULL) {
1127                 rt->peer = peer;
1128                 peer = NULL;
1129         }
1130         spin_unlock_bh(&rt_peer_lock);
1131         if (peer)
1132                 inet_putpeer(peer);
1133 }
1134
1135 /*
1136  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1137  * we can still generate some output.
1138  * Random ID selection looks a bit dangerous because we have no chance of
1139  * selecting an ID that stays unique for a reasonable period of time.
1140  * But a broken packet identifier may be better than no packet at all.
1141  */
1142 static void ip_select_fb_ident(struct iphdr *iph)
1143 {
1144         static DEFINE_SPINLOCK(ip_fb_id_lock);
1145         static u32 ip_fallback_id;
1146         u32 salt;
1147
1148         spin_lock_bh(&ip_fb_id_lock);
1149         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1150         iph->id = htons(salt & 0xFFFF);
1151         ip_fallback_id = salt;
1152         spin_unlock_bh(&ip_fb_id_lock);
1153 }
1154
1155 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1156 {
1157         struct rtable *rt = (struct rtable *) dst;
1158
1159         if (rt) {
1160                 if (rt->peer == NULL)
1161                         rt_bind_peer(rt, 1);
1162
1163                 /* If peer is attached to destination, it is never detached,
1164                    so we need not grab a lock to dereference it.
1165                  */
1166                 if (rt->peer) {
1167                         iph->id = htons(inet_getid(rt->peer, more));
1168                         return;
1169                 }
1170         } else
1171                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1172                        __builtin_return_address(0));
1173
1174         ip_select_fb_ident(iph);
1175 }
1176
1177 static void rt_del(unsigned hash, struct rtable *rt)
1178 {
1179         struct rtable **rthp, *aux;
1180
1181         rthp = &rt_hash_table[hash].chain;
1182         spin_lock_bh(rt_hash_lock_addr(hash));
1183         ip_rt_put(rt);
1184         while ((aux = *rthp) != NULL) {
1185                 if (aux == rt || rt_is_expired(aux)) {
1186                         *rthp = aux->u.dst.rt_next;
1187                         rt_free(aux);
1188                         continue;
1189                 }
1190                 rthp = &aux->u.dst.rt_next;
1191         }
1192         spin_unlock_bh(rt_hash_lock_addr(hash));
1193 }
1194
1195 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1196                     __be32 saddr, struct net_device *dev)
1197 {
1198         int i, k;
1199         struct in_device *in_dev = in_dev_get(dev);
1200         struct rtable *rth, **rthp;
1201         __be32  skeys[2] = { saddr, 0 };
1202         int  ikeys[2] = { dev->ifindex, 0 };
1203         struct netevent_redirect netevent;
1204         struct net *net;
1205
1206         if (!in_dev)
1207                 return;
1208
1209         net = dev_net(dev);
1210         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1211             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1212             || ipv4_is_zeronet(new_gw))
1213                 goto reject_redirect;
1214
1215         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1216                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1217                         goto reject_redirect;
1218                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1219                         goto reject_redirect;
1220         } else {
1221                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1222                         goto reject_redirect;
1223         }
1224
1225         for (i = 0; i < 2; i++) {
1226                 for (k = 0; k < 2; k++) {
1227                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1228                                                 rt_genid(net));
1229
1230                         rthp=&rt_hash_table[hash].chain;
1231
1232                         rcu_read_lock();
1233                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1234                                 struct rtable *rt;
1235
1236                                 if (rth->fl.fl4_dst != daddr ||
1237                                     rth->fl.fl4_src != skeys[i] ||
1238                                     rth->fl.oif != ikeys[k] ||
1239                                     rth->fl.iif != 0 ||
1240                                     rt_is_expired(rth) ||
1241                                     !net_eq(dev_net(rth->u.dst.dev), net)) {
1242                                         rthp = &rth->u.dst.rt_next;
1243                                         continue;
1244                                 }
1245
1246                                 if (rth->rt_dst != daddr ||
1247                                     rth->rt_src != saddr ||
1248                                     rth->u.dst.error ||
1249                                     rth->rt_gateway != old_gw ||
1250                                     rth->u.dst.dev != dev)
1251                                         break;
1252
1253                                 dst_hold(&rth->u.dst);
1254                                 rcu_read_unlock();
1255
1256                                 rt = dst_alloc(&ipv4_dst_ops);
1257                                 if (rt == NULL) {
1258                                         ip_rt_put(rth);
1259                                         in_dev_put(in_dev);
1260                                         return;
1261                                 }
1262
1263                                 /* Copy all the information. */
1264                                 *rt = *rth;
1265                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1266                                 rt->u.dst.__use         = 1;
1267                                 atomic_set(&rt->u.dst.__refcnt, 1);
1268                                 rt->u.dst.child         = NULL;
1269                                 if (rt->u.dst.dev)
1270                                         dev_hold(rt->u.dst.dev);
1271                                 if (rt->idev)
1272                                         in_dev_hold(rt->idev);
1273                                 rt->u.dst.obsolete      = 0;
1274                                 rt->u.dst.lastuse       = jiffies;
1275                                 rt->u.dst.path          = &rt->u.dst;
1276                                 rt->u.dst.neighbour     = NULL;
1277                                 rt->u.dst.hh            = NULL;
1278                                 rt->u.dst.xfrm          = NULL;
1279                                 rt->rt_genid            = rt_genid(net);
1280                                 rt->rt_flags            |= RTCF_REDIRECTED;
1281
1282                                 /* Gateway is different ... */
1283                                 rt->rt_gateway          = new_gw;
1284
1285                                 /* Redirect received -> path was valid */
1286                                 dst_confirm(&rth->u.dst);
1287
1288                                 if (rt->peer)
1289                                         atomic_inc(&rt->peer->refcnt);
1290
1291                                 if (arp_bind_neighbour(&rt->u.dst) ||
1292                                     !(rt->u.dst.neighbour->nud_state &
1293                                             NUD_VALID)) {
1294                                         if (rt->u.dst.neighbour)
1295                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1296                                         ip_rt_put(rth);
1297                                         rt_drop(rt);
1298                                         goto do_next;
1299                                 }
1300
1301                                 netevent.old = &rth->u.dst;
1302                                 netevent.new = &rt->u.dst;
1303                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1304                                                         &netevent);
1305
1306                                 rt_del(hash, rth);
1307                                 if (!rt_intern_hash(hash, rt, &rt))
1308                                         ip_rt_put(rt);
1309                                 goto do_next;
1310                         }
1311                         rcu_read_unlock();
1312                 do_next:
1313                         ;
1314                 }
1315         }
1316         in_dev_put(in_dev);
1317         return;
1318
1319 reject_redirect:
1320 #ifdef CONFIG_IP_ROUTE_VERBOSE
1321         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1322                 printk(KERN_INFO "Redirect from " NIPQUAD_FMT " on %s about "
1323                         NIPQUAD_FMT " ignored.\n"
1324                         "  Advised path = " NIPQUAD_FMT " -> " NIPQUAD_FMT "\n",
1325                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1326                        NIPQUAD(saddr), NIPQUAD(daddr));
1327 #endif
1328         in_dev_put(in_dev);
1329 }
1330
1331 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1332 {
1333         struct rtable *rt = (struct rtable *)dst;
1334         struct dst_entry *ret = dst;
1335
1336         if (rt) {
1337                 if (dst->obsolete) {
1338                         ip_rt_put(rt);
1339                         ret = NULL;
1340                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1341                            rt->u.dst.expires) {
1342                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1343                                                 rt->fl.oif,
1344                                                 rt_genid(dev_net(dst->dev)));
1345 #if RT_CACHE_DEBUG >= 1
1346                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1347                                           NIPQUAD_FMT "/%02x dropped\n",
1348                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1349 #endif
1350                         rt_del(hash, rt);
1351                         ret = NULL;
1352                 }
1353         }
1354         return ret;
1355 }
1356
1357 /*
1358  * Algorithm:
1359  *      1. The first ip_rt_redirect_number redirects are sent
1360  *         with exponential backoff, then we stop sending them at all,
1361  *         assuming that the host ignores our redirects.
1362  *      2. If we did not see packets requiring redirects
1363  *         during ip_rt_redirect_silence, we assume that the host
1364  *         has forgotten the redirected route, and we start sending redirects again.
1365  *
1366  * This algorithm is much cheaper and more intelligent than dumb load limiting
1367  * in icmp.c.
1368  *
1369  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1370  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1371  */
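/*
 * Worked example with the defaults above (ip_rt_redirect_load == HZ/50,
 * ip_rt_redirect_number == 9, ip_rt_redirect_silence == (HZ/50) << 10):
 * after the first redirect, each subsequent one is delayed by
 * (HZ/50) << rate_tokens, i.e. 40 ms, 80 ms, ... up to ~5.1 s; after 9
 * unanswered redirects we go silent until ~20.5 s have passed without
 * triggering packets.
 */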
1372
1373 void ip_rt_send_redirect(struct sk_buff *skb)
1374 {
1375         struct rtable *rt = skb->rtable;
1376         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1377
1378         if (!in_dev)
1379                 return;
1380
1381         if (!IN_DEV_TX_REDIRECTS(in_dev))
1382                 goto out;
1383
1384         /* No redirected packets during ip_rt_redirect_silence;
1385          * reset the algorithm.
1386          */
1387         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1388                 rt->u.dst.rate_tokens = 0;
1389
1390         /* Too many ignored redirects; do not send anything.
1391          * Set u.dst.rate_last to the last seen redirected packet.
1392          */
1393         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1394                 rt->u.dst.rate_last = jiffies;
1395                 goto out;
1396         }
1397
1398         /* Check for load limit; set rate_last to the latest sent
1399          * redirect.
1400          */
1401         if (rt->u.dst.rate_tokens == 0 ||
1402             time_after(jiffies,
1403                        (rt->u.dst.rate_last +
1404                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1405                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1406                 rt->u.dst.rate_last = jiffies;
1407                 ++rt->u.dst.rate_tokens;
1408 #ifdef CONFIG_IP_ROUTE_VERBOSE
1409                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1410                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1411                     net_ratelimit())
1412                         printk(KERN_WARNING "host " NIPQUAD_FMT "/if%d ignores "
1413                                 "redirects for " NIPQUAD_FMT " to " NIPQUAD_FMT ".\n",
1414                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1415                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1416 #endif
1417         }
1418 out:
1419         in_dev_put(in_dev);
1420 }
1421
1422 static int ip_error(struct sk_buff *skb)
1423 {
1424         struct rtable *rt = skb->rtable;
1425         unsigned long now;
1426         int code;
1427
1428         switch (rt->u.dst.error) {
1429                 case EINVAL:
1430                 default:
1431                         goto out;
1432                 case EHOSTUNREACH:
1433                         code = ICMP_HOST_UNREACH;
1434                         break;
1435                 case ENETUNREACH:
1436                         code = ICMP_NET_UNREACH;
1437                         IP_INC_STATS_BH(dev_net(rt->u.dst.dev),
1438                                         IPSTATS_MIB_INNOROUTES);
1439                         break;
1440                 case EACCES:
1441                         code = ICMP_PKT_FILTERED;
1442                         break;
1443         }
1444
1445         now = jiffies;
1446         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1447         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1448                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1449         rt->u.dst.rate_last = now;
1450         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1451                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1452                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1453         }
1454
1455 out:    kfree_skb(skb);
1456         return 0;
1457 }
1458
1459 /*
1460  *      The last two values are not from the RFC but
1461  *      are needed for AMPRnet AX.25 paths.
1462  */
1463
1464 static const unsigned short mtu_plateau[] =
1465 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1466
1467 static inline unsigned short guess_mtu(unsigned short old_mtu)
1468 {
1469         int i;
1470
1471         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1472                 if (old_mtu > mtu_plateau[i])
1473                         return mtu_plateau[i];
1474         return 68;
1475 }
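
/* Illustrative behaviour (added note, not in the original source):
 * guess_mtu() returns the first plateau strictly below the old MTU, e.g.
 *
 *	guess_mtu(1500) == 1492		(Ethernet behind a PPPoE-like link)
 *	guess_mtu(576)  == 296
 *	guess_mtu(100)  == 68		(the IPv4 minimum)
 */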
1476
1477 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1478                                  unsigned short new_mtu,
1479                                  struct net_device *dev)
1480 {
1481         int i, k;
1482         unsigned short old_mtu = ntohs(iph->tot_len);
1483         struct rtable *rth;
1484         int  ikeys[2] = { dev->ifindex, 0 };
1485         __be32  skeys[2] = { iph->saddr, 0, };
1486         __be32  daddr = iph->daddr;
1487         unsigned short est_mtu = 0;
1488
1489         if (ipv4_config.no_pmtu_disc)
1490                 return 0;
1491
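	/* Descriptive note (added): the offending packet may have been
	 * routed through a cache entry keyed with a wildcard source or
	 * output interface, so probe all four combinations of
	 * {saddr, 0} x {ifindex, 0} hash chains below.
	 */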
1492         for (k = 0; k < 2; k++) {
1493                 for (i = 0; i < 2; i++) {
1494                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k],
1495                                                 rt_genid(net));
1496
1497                         rcu_read_lock();
1498                         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1499                              rth = rcu_dereference(rth->u.dst.rt_next)) {
1500                                 unsigned short mtu = new_mtu;
1501
1502                                 if (rth->fl.fl4_dst != daddr ||
1503                                     rth->fl.fl4_src != skeys[i] ||
1504                                     rth->rt_dst != daddr ||
1505                                     rth->rt_src != iph->saddr ||
1506                                     rth->fl.oif != ikeys[k] ||
1507                                     rth->fl.iif != 0 ||
1508                                     dst_metric_locked(&rth->u.dst, RTAX_MTU) ||
1509                                     !net_eq(dev_net(rth->u.dst.dev), net) ||
1510                                     rt_is_expired(rth))
1511                                         continue;
1512
1513                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1514
1515                                         /* BSD 4.2 compatibility hack :-( */
1516                                         if (mtu == 0 &&
1517                                             old_mtu >= dst_mtu(&rth->u.dst) &&
1518                                             old_mtu >= 68 + (iph->ihl << 2))
1519                                                 old_mtu -= iph->ihl << 2;
1520
1521                                         mtu = guess_mtu(old_mtu);
1522                                 }
1523                                 if (mtu <= dst_mtu(&rth->u.dst)) {
1524                                         if (mtu < dst_mtu(&rth->u.dst)) {
1525                                                 dst_confirm(&rth->u.dst);
1526                                                 if (mtu < ip_rt_min_pmtu) {
1527                                                         mtu = ip_rt_min_pmtu;
1528                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1529                                                                 (1 << RTAX_MTU);
1530                                                 }
1531                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1532                                                 dst_set_expires(&rth->u.dst,
1533                                                         ip_rt_mtu_expires);
1534                                         }
1535                                         est_mtu = mtu;
1536                                 }
1537                         }
1538                         rcu_read_unlock();
1539                 }
1540         }
1541         return est_mtu ? : new_mtu;
1542 }
1543
1544 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1545 {
1546         if (dst_mtu(dst) > mtu && mtu >= 68 &&
1547             !(dst_metric_locked(dst, RTAX_MTU))) {
1548                 if (mtu < ip_rt_min_pmtu) {
1549                         mtu = ip_rt_min_pmtu;
1550                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1551                 }
1552                 dst->metrics[RTAX_MTU-1] = mtu;
1553                 dst_set_expires(dst, ip_rt_mtu_expires);
1554                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1555         }
1556 }
1557
1558 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1559 {
1560         return NULL;
1561 }
1562
1563 static void ipv4_dst_destroy(struct dst_entry *dst)
1564 {
1565         struct rtable *rt = (struct rtable *) dst;
1566         struct inet_peer *peer = rt->peer;
1567         struct in_device *idev = rt->idev;
1568
1569         if (peer) {
1570                 rt->peer = NULL;
1571                 inet_putpeer(peer);
1572         }
1573
1574         if (idev) {
1575                 rt->idev = NULL;
1576                 in_dev_put(idev);
1577         }
1578 }
1579
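/* Note (added): on device shutdown/unregister, migrate the cached
 * in_device reference to the loopback device so that lingering dst
 * entries do not pin the dying device.
 */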
1580 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1581                             int how)
1582 {
1583         struct rtable *rt = (struct rtable *) dst;
1584         struct in_device *idev = rt->idev;
1585         if (dev != dev_net(dev)->loopback_dev && idev && idev->dev == dev) {
1586                 struct in_device *loopback_idev =
1587                         in_dev_get(dev_net(dev)->loopback_dev);
1588                 if (loopback_idev) {
1589                         rt->idev = loopback_idev;
1590                         in_dev_put(idev);
1591                 }
1592         }
1593 }
1594
1595 static void ipv4_link_failure(struct sk_buff *skb)
1596 {
1597         struct rtable *rt;
1598
1599         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1600
1601         rt = skb->rtable;
1602         if (rt)
1603                 dst_set_expires(&rt->u.dst, 0);
1604 }
1605
1606 static int ip_rt_bug(struct sk_buff *skb)
1607 {
1608         printk(KERN_DEBUG "ip_rt_bug: " NIPQUAD_FMT " -> " NIPQUAD_FMT ", %s\n",
1609                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1610                 skb->dev ? skb->dev->name : "?");
1611         kfree_skb(skb);
1612         return 0;
1613 }
1614
1615 /*
1616    We do not cache the source address of the outgoing interface,
1617    because it is used only by the IP RR, TS and SRR options,
1618    so it is out of the fast path.
1619 
1620    BTW remember: "addr" is not guaranteed to be aligned
1621    in IP options!
1622  */
1623
1624 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1625 {
1626         __be32 src;
1627         struct fib_result res;
1628
1629         if (rt->fl.iif == 0)
1630                 src = rt->rt_src;
1631         else if (fib_lookup(dev_net(rt->u.dst.dev), &rt->fl, &res) == 0) {
1632                 src = FIB_RES_PREFSRC(res);
1633                 fib_res_put(&res);
1634         } else
1635                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1636                                         RT_SCOPE_UNIVERSE);
1637         memcpy(addr, &src, 4);
1638 }
1639
1640 #ifdef CONFIG_NET_CLS_ROUTE
1641 static void set_class_tag(struct rtable *rt, u32 tag)
1642 {
1643         if (!(rt->u.dst.tclassid & 0xFFFF))
1644                 rt->u.dst.tclassid |= tag & 0xFFFF;
1645         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1646                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1647 }
1648 #endif
1649
1650 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1651 {
1652         struct fib_info *fi = res->fi;
1653
1654         if (fi) {
1655                 if (FIB_RES_GW(*res) &&
1656                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1657                         rt->rt_gateway = FIB_RES_GW(*res);
1658                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1659                        sizeof(rt->u.dst.metrics));
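		/* Note (added, best-effort reading): with no explicit MTU
		 * metric we inherit the device MTU, but when the metric is
		 * locked (PMTU discovery off) and the route goes via a
		 * gateway, fall back to the classical 576-byte default.
		 */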
1660                 if (fi->fib_mtu == 0) {
1661                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1662                         if (dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1663                             rt->rt_gateway != rt->rt_dst &&
1664                             rt->u.dst.dev->mtu > 576)
1665                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1666                 }
1667 #ifdef CONFIG_NET_CLS_ROUTE
1668                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1669 #endif
1670         } else
1671                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1672
1673         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1674                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1675         if (dst_mtu(&rt->u.dst) > IP_MAX_MTU)
1676                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1677         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) == 0)
1678                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1679                                        ip_rt_min_advmss);
1680         if (dst_metric(&rt->u.dst, RTAX_ADVMSS) > 65535 - 40)
1681                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1682
1683 #ifdef CONFIG_NET_CLS_ROUTE
1684 #ifdef CONFIG_IP_MULTIPLE_TABLES
1685         set_class_tag(rt, fib_rules_tclass(res));
1686 #endif
1687         set_class_tag(rt, itag);
1688 #endif
1689         rt->rt_type = res->type;
1690 }
1691
1692 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1693                                 u8 tos, struct net_device *dev, int our)
1694 {
1695         unsigned hash;
1696         struct rtable *rth;
1697         __be32 spec_dst;
1698         struct in_device *in_dev = in_dev_get(dev);
1699         u32 itag = 0;
1700
1701         /* Primary sanity checks. */
1702
1703         if (in_dev == NULL)
1704                 return -EINVAL;
1705
1706         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1707             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1708                 goto e_inval;
1709
1710         if (ipv4_is_zeronet(saddr)) {
1711                 if (!ipv4_is_local_multicast(daddr))
1712                         goto e_inval;
1713                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1714         } else if (fib_validate_source(saddr, 0, tos, 0,
1715                                         dev, &spec_dst, &itag) < 0)
1716                 goto e_inval;
1717
1718         rth = dst_alloc(&ipv4_dst_ops);
1719         if (!rth)
1720                 goto e_nobufs;
1721
1722         rth->u.dst.output= ip_rt_bug;
1723
1724         atomic_set(&rth->u.dst.__refcnt, 1);
1725         rth->u.dst.flags= DST_HOST;
1726         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1727                 rth->u.dst.flags |= DST_NOPOLICY;
1728         rth->fl.fl4_dst = daddr;
1729         rth->rt_dst     = daddr;
1730         rth->fl.fl4_tos = tos;
1731         rth->fl.mark    = skb->mark;
1732         rth->fl.fl4_src = saddr;
1733         rth->rt_src     = saddr;
1734 #ifdef CONFIG_NET_CLS_ROUTE
1735         rth->u.dst.tclassid = itag;
1736 #endif
1737         rth->rt_iif     =
1738         rth->fl.iif     = dev->ifindex;
1739         rth->u.dst.dev  = init_net.loopback_dev;
1740         dev_hold(rth->u.dst.dev);
1741         rth->idev       = in_dev_get(rth->u.dst.dev);
1742         rth->fl.oif     = 0;
1743         rth->rt_gateway = daddr;
1744         rth->rt_spec_dst= spec_dst;
1745         rth->rt_genid   = rt_genid(dev_net(dev));
1746         rth->rt_flags   = RTCF_MULTICAST;
1747         rth->rt_type    = RTN_MULTICAST;
1748         if (our) {
1749                 rth->u.dst.input= ip_local_deliver;
1750                 rth->rt_flags |= RTCF_LOCAL;
1751         }
1752
1753 #ifdef CONFIG_IP_MROUTE
1754         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1755                 rth->u.dst.input = ip_mr_input;
1756 #endif
1757         RT_CACHE_STAT_INC(in_slow_mc);
1758
1759         in_dev_put(in_dev);
1760         hash = rt_hash(daddr, saddr, dev->ifindex, rt_genid(dev_net(dev)));
1761         return rt_intern_hash(hash, rth, &skb->rtable);
1762
1763 e_nobufs:
1764         in_dev_put(in_dev);
1765         return -ENOBUFS;
1766
1767 e_inval:
1768         in_dev_put(in_dev);
1769         return -EINVAL;
1770 }
1771
1772
1773 static void ip_handle_martian_source(struct net_device *dev,
1774                                      struct in_device *in_dev,
1775                                      struct sk_buff *skb,
1776                                      __be32 daddr,
1777                                      __be32 saddr)
1778 {
1779         RT_CACHE_STAT_INC(in_martian_src);
1780 #ifdef CONFIG_IP_ROUTE_VERBOSE
1781         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1782                 /*
1783                  *      RFC1812 recommendation: if the source is martian,
1784                  *      the only hint is the MAC header.
1785                  */
1786                 printk(KERN_WARNING "martian source " NIPQUAD_FMT " from "
1787                         NIPQUAD_FMT", on dev %s\n",
1788                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1789                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1790                         int i;
1791                         const unsigned char *p = skb_mac_header(skb);
1792                         printk(KERN_WARNING "ll header: ");
1793                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1794                                 printk("%02x", *p);
1795                                 if (i < (dev->hard_header_len - 1))
1796                                         printk(":");
1797                         }
1798                         printk("\n");
1799                 }
1800         }
1801 #endif
1802 }
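
/* For illustration only (added; addresses invented): the warning above
 * renders as
 *
 *	martian source 10.1.2.3 from 192.0.2.7, on dev eth0
 *	ll header: 00:11:22:33:44:55:66:77:88:99:aa:bb:08:00
 *
 * i.e. the destination first, then the offending source, then up to
 * hard_header_len bytes of the link-level header.
 */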
1803
1804 static int __mkroute_input(struct sk_buff *skb,
1805                            struct fib_result *res,
1806                            struct in_device *in_dev,
1807                            __be32 daddr, __be32 saddr, u32 tos,
1808                            struct rtable **result)
1809 {
1810
1811         struct rtable *rth;
1812         int err;
1813         struct in_device *out_dev;
1814         unsigned flags = 0;
1815         __be32 spec_dst;
1816         u32 itag;
1817
1818         /* get a working reference to the output device */
1819         out_dev = in_dev_get(FIB_RES_DEV(*res));
1820         if (out_dev == NULL) {
1821                 if (net_ratelimit())
1822                         printk(KERN_CRIT "Bug in ip_route_input" \
1823                                "_slow(). Please, report\n");
1824                 return -EINVAL;
1825         }
1826
1827
1828         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1829                                   in_dev->dev, &spec_dst, &itag);
1830         if (err < 0) {
1831                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1832                                          saddr);
1833
1834                 err = -EINVAL;
1835                 goto cleanup;
1836         }
1837
1838         if (err)
1839                 flags |= RTCF_DIRECTSRC;
1840
1841         if (out_dev == in_dev && err &&
1842             (IN_DEV_SHARED_MEDIA(out_dev) ||
1843              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1844                 flags |= RTCF_DOREDIRECT;
1845
1846         if (skb->protocol != htons(ETH_P_IP)) {
1847                 /* Not IP (i.e. ARP). Do not create a route if it is
1848                  * invalid for proxy arp. DNAT routes are always valid.
1849                  */
1850                 if (out_dev == in_dev) {
1851                         err = -EINVAL;
1852                         goto cleanup;
1853                 }
1854         }
1855
1856
1857         rth = dst_alloc(&ipv4_dst_ops);
1858         if (!rth) {
1859                 err = -ENOBUFS;
1860                 goto cleanup;
1861         }
1862
1863         atomic_set(&rth->u.dst.__refcnt, 1);
1864         rth->u.dst.flags= DST_HOST;
1865         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1866                 rth->u.dst.flags |= DST_NOPOLICY;
1867         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1868                 rth->u.dst.flags |= DST_NOXFRM;
1869         rth->fl.fl4_dst = daddr;
1870         rth->rt_dst     = daddr;
1871         rth->fl.fl4_tos = tos;
1872         rth->fl.mark    = skb->mark;
1873         rth->fl.fl4_src = saddr;
1874         rth->rt_src     = saddr;
1875         rth->rt_gateway = daddr;
1876         rth->rt_iif     =
1877                 rth->fl.iif     = in_dev->dev->ifindex;
1878         rth->u.dst.dev  = (out_dev)->dev;
1879         dev_hold(rth->u.dst.dev);
1880         rth->idev       = in_dev_get(rth->u.dst.dev);
1881         rth->fl.oif     = 0;
1882         rth->rt_spec_dst= spec_dst;
1883
1884         rth->u.dst.input = ip_forward;
1885         rth->u.dst.output = ip_output;
1886         rth->rt_genid = rt_genid(dev_net(rth->u.dst.dev));
1887
1888         rt_set_nexthop(rth, res, itag);
1889
1890         rth->rt_flags = flags;
1891
1892         *result = rth;
1893         err = 0;
1894  cleanup:
1895         /* release the working reference to the output device */
1896         in_dev_put(out_dev);
1897         return err;
1898 }
1899
1900 static int ip_mkroute_input(struct sk_buff *skb,
1901                             struct fib_result *res,
1902                             const struct flowi *fl,
1903                             struct in_device *in_dev,
1904                             __be32 daddr, __be32 saddr, u32 tos)
1905 {
1906         struct rtable* rth = NULL;
1907         int err;
1908         unsigned hash;
1909
1910 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1911         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1912                 fib_select_multipath(fl, res);
1913 #endif
1914
1915         /* create a routing cache entry */
1916         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1917         if (err)
1918                 return err;
1919
1920         /* put it into the cache */
1921         hash = rt_hash(daddr, saddr, fl->iif,
1922                        rt_genid(dev_net(rth->u.dst.dev)));
1923         return rt_intern_hash(hash, rth, &skb->rtable);
1924 }
1925
1926 /*
1927  *      NOTE. We drop all packets that have a local source
1928  *      address, because every properly looped-back packet
1929  *      must already have the correct destination attached by the output routine.
1930  *
1931  *      This approach solves two big problems:
1932  *      1. Non-simplex devices are handled properly.
1933  *      2. IP spoofing attempts are filtered with 100% guarantee.
1934  */
1935
1936 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1937                                u8 tos, struct net_device *dev)
1938 {
1939         struct fib_result res;
1940         struct in_device *in_dev = in_dev_get(dev);
1941         struct flowi fl = { .nl_u = { .ip4_u =
1942                                       { .daddr = daddr,
1943                                         .saddr = saddr,
1944                                         .tos = tos,
1945                                         .scope = RT_SCOPE_UNIVERSE,
1946                                       } },
1947                             .mark = skb->mark,
1948                             .iif = dev->ifindex };
1949         unsigned        flags = 0;
1950         u32             itag = 0;
1951         struct rtable * rth;
1952         unsigned        hash;
1953         __be32          spec_dst;
1954         int             err = -EINVAL;
1955         int             free_res = 0;
1956         struct net    * net = dev_net(dev);
1957
1958         /* IP on this device is disabled. */
1959
1960         if (!in_dev)
1961                 goto out;
1962
1963         /* Check for the most weird martians, which may not be detected
1964            by fib_lookup.
1965          */
1966
1967         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1968             ipv4_is_loopback(saddr))
1969                 goto martian_source;
1970
1971         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1972                 goto brd_input;
1973
1974         /* Accept zero addresses only for limited broadcast;
1975          * I do not even know whether to fix this or not. Waiting for complaints :-)
1976          */
1977         if (ipv4_is_zeronet(saddr))
1978                 goto martian_source;
1979
1980         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1981             ipv4_is_loopback(daddr))
1982                 goto martian_destination;
1983
1984         /*
1985          *      Now we are ready to route packet.
1986          */
1987         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1988                 if (!IN_DEV_FORWARD(in_dev))
1989                         goto e_hostunreach;
1990                 goto no_route;
1991         }
1992         free_res = 1;
1993
1994         RT_CACHE_STAT_INC(in_slow_tot);
1995
1996         if (res.type == RTN_BROADCAST)
1997                 goto brd_input;
1998
1999         if (res.type == RTN_LOCAL) {
2000                 int result;
2001                 result = fib_validate_source(saddr, daddr, tos,
2002                                              net->loopback_dev->ifindex,
2003                                              dev, &spec_dst, &itag);
2004                 if (result < 0)
2005                         goto martian_source;
2006                 if (result)
2007                         flags |= RTCF_DIRECTSRC;
2008                 spec_dst = daddr;
2009                 goto local_input;
2010         }
2011
2012         if (!IN_DEV_FORWARD(in_dev))
2013                 goto e_hostunreach;
2014         if (res.type != RTN_UNICAST)
2015                 goto martian_destination;
2016
2017         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
2018 done:
2019         in_dev_put(in_dev);
2020         if (free_res)
2021                 fib_res_put(&res);
2022 out:    return err;
2023
2024 brd_input:
2025         if (skb->protocol != htons(ETH_P_IP))
2026                 goto e_inval;
2027
2028         if (ipv4_is_zeronet(saddr))
2029                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
2030         else {
2031                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
2032                                           &itag);
2033                 if (err < 0)
2034                         goto martian_source;
2035                 if (err)
2036                         flags |= RTCF_DIRECTSRC;
2037         }
2038         flags |= RTCF_BROADCAST;
2039         res.type = RTN_BROADCAST;
2040         RT_CACHE_STAT_INC(in_brd);
2041
2042 local_input:
2043         rth = dst_alloc(&ipv4_dst_ops);
2044         if (!rth)
2045                 goto e_nobufs;
2046
2047         rth->u.dst.output= ip_rt_bug;
2048         rth->rt_genid = rt_genid(net);
2049
2050         atomic_set(&rth->u.dst.__refcnt, 1);
2051         rth->u.dst.flags= DST_HOST;
2052         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2053                 rth->u.dst.flags |= DST_NOPOLICY;
2054         rth->fl.fl4_dst = daddr;
2055         rth->rt_dst     = daddr;
2056         rth->fl.fl4_tos = tos;
2057         rth->fl.mark    = skb->mark;
2058         rth->fl.fl4_src = saddr;
2059         rth->rt_src     = saddr;
2060 #ifdef CONFIG_NET_CLS_ROUTE
2061         rth->u.dst.tclassid = itag;
2062 #endif
2063         rth->rt_iif     =
2064         rth->fl.iif     = dev->ifindex;
2065         rth->u.dst.dev  = net->loopback_dev;
2066         dev_hold(rth->u.dst.dev);
2067         rth->idev       = in_dev_get(rth->u.dst.dev);
2068         rth->rt_gateway = daddr;
2069         rth->rt_spec_dst= spec_dst;
2070         rth->u.dst.input= ip_local_deliver;
2071         rth->rt_flags   = flags|RTCF_LOCAL;
2072         if (res.type == RTN_UNREACHABLE) {
2073                 rth->u.dst.input= ip_error;
2074                 rth->u.dst.error= -err;
2075                 rth->rt_flags   &= ~RTCF_LOCAL;
2076         }
2077         rth->rt_type    = res.type;
2078         hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net));
2079         err = rt_intern_hash(hash, rth, &skb->rtable);
2080         goto done;
2081
2082 no_route:
2083         RT_CACHE_STAT_INC(in_no_route);
2084         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2085         res.type = RTN_UNREACHABLE;
2086         if (err == -ESRCH)
2087                 err = -ENETUNREACH;
2088         goto local_input;
2089
2090         /*
2091          *      Do not cache martian addresses: they should be logged (RFC1812)
2092          */
2093 martian_destination:
2094         RT_CACHE_STAT_INC(in_martian_dst);
2095 #ifdef CONFIG_IP_ROUTE_VERBOSE
2096         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2097                 printk(KERN_WARNING "martian destination " NIPQUAD_FMT " from "
2098                         NIPQUAD_FMT ", dev %s\n",
2099                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2100 #endif
2101
2102 e_hostunreach:
2103         err = -EHOSTUNREACH;
2104         goto done;
2105
2106 e_inval:
2107         err = -EINVAL;
2108         goto done;
2109
2110 e_nobufs:
2111         err = -ENOBUFS;
2112         goto done;
2113
2114 martian_source:
2115         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2116         goto e_inval;
2117 }
2118
2119 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2120                    u8 tos, struct net_device *dev)
2121 {
2122         struct rtable * rth;
2123         unsigned        hash;
2124         int iif = dev->ifindex;
2125         struct net *net;
2126
2127         net = dev_net(dev);
2128         tos &= IPTOS_RT_MASK;
2129         hash = rt_hash(daddr, saddr, iif, rt_genid(net));
2130
2131         rcu_read_lock();
2132         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2133              rth = rcu_dereference(rth->u.dst.rt_next)) {
2134                 if (((rth->fl.fl4_dst ^ daddr) |
2135                      (rth->fl.fl4_src ^ saddr) |
2136                      (rth->fl.iif ^ iif) |
2137                      rth->fl.oif |
2138                      (rth->fl.fl4_tos ^ tos)) == 0 &&
2139                     rth->fl.mark == skb->mark &&
2140                     net_eq(dev_net(rth->u.dst.dev), net) &&
2141                     !rt_is_expired(rth)) {
2142                         dst_use(&rth->u.dst, jiffies);
2143                         RT_CACHE_STAT_INC(in_hit);
2144                         rcu_read_unlock();
2145                         skb->rtable = rth;
2146                         return 0;
2147                 }
2148                 RT_CACHE_STAT_INC(in_hlist_search);
2149         }
2150         rcu_read_unlock();
2151
2152         /* Multicast recognition logic is moved from the route cache to here.
2153            The problem was that too many Ethernet cards have broken/missing
2154            hardware multicast filters :-( As a result, a host on a multicast
2155            network acquires a lot of useless route cache entries, e.g. from
2156            SDR messages from all over the world. Now we try to get rid of them.
2157            Really, provided the software IP multicast filter is organized
2158            reasonably (at least hashed), it does not result in a slowdown
2159            compared with route cache reject entries.
2160            Note that multicast routers are not affected, because a
2161            route cache entry is eventually created.
2162          */
2163         if (ipv4_is_multicast(daddr)) {
2164                 struct in_device *in_dev;
2165
2166                 rcu_read_lock();
2167                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2168                         int our = ip_check_mc(in_dev, daddr, saddr,
2169                                 ip_hdr(skb)->protocol);
2170                         if (our
2171 #ifdef CONFIG_IP_MROUTE
2172                             || (!ipv4_is_local_multicast(daddr) &&
2173                                 IN_DEV_MFORWARD(in_dev))
2174 #endif
2175                             ) {
2176                                 rcu_read_unlock();
2177                                 return ip_route_input_mc(skb, daddr, saddr,
2178                                                          tos, dev, our);
2179                         }
2180                 }
2181                 rcu_read_unlock();
2182                 return -EINVAL;
2183         }
2184         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2185 }
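
/* Usage sketch (added; assumed caller, not from this file): ip_rcv_finish()
 * does roughly
 *
 *	if (skb->rtable == NULL &&
 *	    ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, skb->dev))
 *		goto drop;
 *
 * so a cache hit or a freshly interned entry ends up in skb->rtable.
 */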
2186
2187 static int __mkroute_output(struct rtable **result,
2188                             struct fib_result *res,
2189                             const struct flowi *fl,
2190                             const struct flowi *oldflp,
2191                             struct net_device *dev_out,
2192                             unsigned flags)
2193 {
2194         struct rtable *rth;
2195         struct in_device *in_dev;
2196         u32 tos = RT_FL_TOS(oldflp);
2197         int err = 0;
2198
2199         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2200                 return -EINVAL;
2201
2202         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2203                 res->type = RTN_BROADCAST;
2204         else if (ipv4_is_multicast(fl->fl4_dst))
2205                 res->type = RTN_MULTICAST;
2206         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2207                 return -EINVAL;
2208
2209         if (dev_out->flags & IFF_LOOPBACK)
2210                 flags |= RTCF_LOCAL;
2211
2212         /* get a working reference to the inet device */
2213         in_dev = in_dev_get(dev_out);
2214         if (!in_dev)
2215                 return -EINVAL;
2216
2217         if (res->type == RTN_BROADCAST) {
2218                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2219                 if (res->fi) {
2220                         fib_info_put(res->fi);
2221                         res->fi = NULL;
2222                 }
2223         } else if (res->type == RTN_MULTICAST) {
2224                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2225                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2226                                  oldflp->proto))
2227                         flags &= ~RTCF_LOCAL;
2228                 /* If a multicast route does not exist, use
2229                    the default one, but do not use a gateway in this case.
2230                    Yes, it is a hack.
2231                  */
2232                 if (res->fi && res->prefixlen < 4) {
2233                         fib_info_put(res->fi);
2234                         res->fi = NULL;
2235                 }
2236         }
2237
2238
2239         rth = dst_alloc(&ipv4_dst_ops);
2240         if (!rth) {
2241                 err = -ENOBUFS;
2242                 goto cleanup;
2243         }
2244
2245         atomic_set(&rth->u.dst.__refcnt, 1);
2246         rth->u.dst.flags= DST_HOST;
2247         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2248                 rth->u.dst.flags |= DST_NOXFRM;
2249         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2250                 rth->u.dst.flags |= DST_NOPOLICY;
2251
2252         rth->fl.fl4_dst = oldflp->fl4_dst;
2253         rth->fl.fl4_tos = tos;
2254         rth->fl.fl4_src = oldflp->fl4_src;
2255         rth->fl.oif     = oldflp->oif;
2256         rth->fl.mark    = oldflp->mark;
2257         rth->rt_dst     = fl->fl4_dst;
2258         rth->rt_src     = fl->fl4_src;
2259         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2260         /* get references to the devices that are to be held by the routing
2261            cache entry */
2262         rth->u.dst.dev  = dev_out;
2263         dev_hold(dev_out);
2264         rth->idev       = in_dev_get(dev_out);
2265         rth->rt_gateway = fl->fl4_dst;
2266         rth->rt_spec_dst= fl->fl4_src;
2267
2268         rth->u.dst.output=ip_output;
2269         rth->rt_genid = rt_genid(dev_net(dev_out));
2270
2271         RT_CACHE_STAT_INC(out_slow_tot);
2272
2273         if (flags & RTCF_LOCAL) {
2274                 rth->u.dst.input = ip_local_deliver;
2275                 rth->rt_spec_dst = fl->fl4_dst;
2276         }
2277         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2278                 rth->rt_spec_dst = fl->fl4_src;
2279                 if (flags & RTCF_LOCAL &&
2280                     !(dev_out->flags & IFF_LOOPBACK)) {
2281                         rth->u.dst.output = ip_mc_output;
2282                         RT_CACHE_STAT_INC(out_slow_mc);
2283                 }
2284 #ifdef CONFIG_IP_MROUTE
2285                 if (res->type == RTN_MULTICAST) {
2286                         if (IN_DEV_MFORWARD(in_dev) &&
2287                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2288                                 rth->u.dst.input = ip_mr_input;
2289                                 rth->u.dst.output = ip_mc_output;
2290                         }
2291                 }
2292 #endif
2293         }
2294
2295         rt_set_nexthop(rth, res, 0);
2296
2297         rth->rt_flags = flags;
2298
2299         *result = rth;
2300  cleanup:
2301         /* release the working reference to the inet device */
2302         in_dev_put(in_dev);
2303
2304         return err;
2305 }
2306
2307 static int ip_mkroute_output(struct rtable **rp,
2308                              struct fib_result *res,
2309                              const struct flowi *fl,
2310                              const struct flowi *oldflp,
2311                              struct net_device *dev_out,
2312                              unsigned flags)
2313 {
2314         struct rtable *rth = NULL;
2315         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2316         unsigned hash;
2317         if (err == 0) {
2318                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif,
2319                                rt_genid(dev_net(dev_out)));
2320                 err = rt_intern_hash(hash, rth, rp);
2321         }
2322
2323         return err;
2324 }
2325
2326 /*
2327  * Major route resolver routine.
2328  */
2329
2330 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2331                                 const struct flowi *oldflp)
2332 {
2333         u32 tos = RT_FL_TOS(oldflp);
2334         struct flowi fl = { .nl_u = { .ip4_u =
2335                                       { .daddr = oldflp->fl4_dst,
2336                                         .saddr = oldflp->fl4_src,
2337                                         .tos = tos & IPTOS_RT_MASK,
2338                                         .scope = ((tos & RTO_ONLINK) ?
2339                                                   RT_SCOPE_LINK :
2340                                                   RT_SCOPE_UNIVERSE),
2341                                       } },
2342                             .mark = oldflp->mark,
2343                             .iif = net->loopback_dev->ifindex,
2344                             .oif = oldflp->oif };
2345         struct fib_result res;
2346         unsigned flags = 0;
2347         struct net_device *dev_out = NULL;
2348         int free_res = 0;
2349         int err;
2350
2351
2352         res.fi          = NULL;
2353 #ifdef CONFIG_IP_MULTIPLE_TABLES
2354         res.r           = NULL;
2355 #endif
2356
2357         if (oldflp->fl4_src) {
2358                 err = -EINVAL;
2359                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2360                     ipv4_is_lbcast(oldflp->fl4_src) ||
2361                     ipv4_is_zeronet(oldflp->fl4_src))
2362                         goto out;
2363
2364                 /* I removed the check for oif == dev_out->oif here.
2365                    It was wrong for two reasons:
2366                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2367                       is assigned to multiple interfaces.
2368                    2. Moreover, we are allowed to send packets with the saddr
2369                       of another iface. --ANK
2370                  */
2371
2372                 if (oldflp->oif == 0
2373                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2374                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2375                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2376                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2377                         if (dev_out == NULL)
2378                                 goto out;
2379
2380                         /* Special hack: the user can direct multicasts
2381                            and limited broadcast via the necessary interface
2382                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2383                            This hack is not just for fun, it allows
2384                            vic, vat and friends to work.
2385                            They bind the socket to loopback, set ttl to zero
2386                            and expect that it will work.
2387                            From the viewpoint of the routing cache they are broken,
2388                            because we are not allowed to build a multicast path
2389                            with a loopback source addr (look, the routing cache
2390                            cannot know that ttl is zero, so the packet
2391                            will not leave this host and the route is valid).
2392                            Luckily, this hack is a good workaround.
2393                          */
2394
2395                         fl.oif = dev_out->ifindex;
2396                         goto make_route;
2397                 }
2398
2399                 if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) {
2400                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2401                         dev_out = ip_dev_find(net, oldflp->fl4_src);
2402                         if (dev_out == NULL)
2403                                 goto out;
2404                         dev_put(dev_out);
2405                         dev_out = NULL;
2406                 }
2407         }
2408
2409
2410         if (oldflp->oif) {
2411                 dev_out = dev_get_by_index(net, oldflp->oif);
2412                 err = -ENODEV;
2413                 if (dev_out == NULL)
2414                         goto out;
2415
2416                 /* RACE: Check return value of inet_select_addr instead. */
2417                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2418                         dev_put(dev_out);
2419                         goto out;       /* Wrong error code */
2420                 }
2421
2422                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2423                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2424                         if (!fl.fl4_src)
2425                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2426                                                               RT_SCOPE_LINK);
2427                         goto make_route;
2428                 }
2429                 if (!fl.fl4_src) {
2430                         if (ipv4_is_multicast(oldflp->fl4_dst))
2431                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2432                                                               fl.fl4_scope);
2433                         else if (!oldflp->fl4_dst)
2434                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2435                                                               RT_SCOPE_HOST);
2436                 }
2437         }
2438
2439         if (!fl.fl4_dst) {
2440                 fl.fl4_dst = fl.fl4_src;
2441                 if (!fl.fl4_dst)
2442                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2443                 if (dev_out)
2444                         dev_put(dev_out);
2445                 dev_out = net->loopback_dev;
2446                 dev_hold(dev_out);
2447                 fl.oif = net->loopback_dev->ifindex;
2448                 res.type = RTN_LOCAL;
2449                 flags |= RTCF_LOCAL;
2450                 goto make_route;
2451         }
2452
2453         if (fib_lookup(net, &fl, &res)) {
2454                 res.fi = NULL;
2455                 if (oldflp->oif) {
2456                         /* Apparently, the routing tables are wrong. Assume
2457                            that the destination is on-link.
2458 
2459                            WHY? DW.
2460                            Because we are allowed to send to an iface
2461                            even if it has NO routes and NO assigned
2462                            addresses. When oif is specified, the routing
2463                            tables are looked up with only one purpose:
2464                            to check whether the destination is gatewayed rather
2465                            than direct. Moreover, if MSG_DONTROUTE is set,
2466                            we send the packet, ignoring both routing tables
2467                            and ifaddr state. --ANK
2468 
2469 
2470                            We could do this even when oif is unknown,
2471                            as IPv6 likely does, but we do not.
2472                          */
2473
2474                         if (fl.fl4_src == 0)
2475                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2476                                                               RT_SCOPE_LINK);
2477                         res.type = RTN_UNICAST;
2478                         goto make_route;
2479                 }
2480                 if (dev_out)
2481                         dev_put(dev_out);
2482                 err = -ENETUNREACH;
2483                 goto out;
2484         }
2485         free_res = 1;
2486
2487         if (res.type == RTN_LOCAL) {
2488                 if (!fl.fl4_src)
2489                         fl.fl4_src = fl.fl4_dst;
2490                 if (dev_out)
2491                         dev_put(dev_out);
2492                 dev_out = net->loopback_dev;
2493                 dev_hold(dev_out);
2494                 fl.oif = dev_out->ifindex;
2495                 if (res.fi)
2496                         fib_info_put(res.fi);
2497                 res.fi = NULL;
2498                 flags |= RTCF_LOCAL;
2499                 goto make_route;
2500         }
2501
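	/* Note (added): with several next hops, pick one via multipath
	 * selection; otherwise, for a plain 0/0 unicast route with no
	 * forced output interface, let fib_select_default() choose among
	 * the configured default gateways.
	 */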
2502 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2503         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2504                 fib_select_multipath(&fl, &res);
2505         else
2506 #endif
2507         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2508                 fib_select_default(net, &fl, &res);
2509
2510         if (!fl.fl4_src)
2511                 fl.fl4_src = FIB_RES_PREFSRC(res);
2512
2513         if (dev_out)
2514                 dev_put(dev_out);
2515         dev_out = FIB_RES_DEV(res);
2516         dev_hold(dev_out);
2517         fl.oif = dev_out->ifindex;
2518
2519
2520 make_route:
2521         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2522
2523
2524         if (free_res)
2525                 fib_res_put(&res);
2526         if (dev_out)
2527                 dev_put(dev_out);
2528 out:    return err;
2529 }
2530
2531 int __ip_route_output_key(struct net *net, struct rtable **rp,
2532                           const struct flowi *flp)
2533 {
2534         unsigned hash;
2535         struct rtable *rth;
2536
2537         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif, rt_genid(net));
2538
2539         rcu_read_lock_bh();
2540         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2541                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2542                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2543                     rth->fl.fl4_src == flp->fl4_src &&
2544                     rth->fl.iif == 0 &&
2545                     rth->fl.oif == flp->oif &&
2546                     rth->fl.mark == flp->mark &&
2547                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2548                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2549                     net_eq(dev_net(rth->u.dst.dev), net) &&
2550                     !rt_is_expired(rth)) {
2551                         dst_use(&rth->u.dst, jiffies);
2552                         RT_CACHE_STAT_INC(out_hit);
2553                         rcu_read_unlock_bh();
2554                         *rp = rth;
2555                         return 0;
2556                 }
2557                 RT_CACHE_STAT_INC(out_hlist_search);
2558         }
2559         rcu_read_unlock_bh();
2560
2561         return ip_route_output_slow(net, rp, flp);
2562 }
2563
2564 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2565
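/* Note (added): blackhole entries are substituted when the XFRM lookup
 * returns -EREMOTE (see ip_route_output_flow() below); they discard all
 * traffic via dst_discard and deliberately ignore PMTU updates.
 */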
2566 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2567 {
2568 }
2569
2570 static struct dst_ops ipv4_dst_blackhole_ops = {
2571         .family                 =       AF_INET,
2572         .protocol               =       __constant_htons(ETH_P_IP),
2573         .destroy                =       ipv4_dst_destroy,
2574         .check                  =       ipv4_dst_check,
2575         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2576         .entry_size             =       sizeof(struct rtable),
2577         .entries                =       ATOMIC_INIT(0),
2578 };
2579
2580
2581 static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi *flp)
2582 {
2583         struct rtable *ort = *rp;
2584         struct rtable *rt = (struct rtable *)
2585                 dst_alloc(&ipv4_dst_blackhole_ops);
2586
2587         if (rt) {
2588                 struct dst_entry *new = &rt->u.dst;
2589
2590                 atomic_set(&new->__refcnt, 1);
2591                 new->__use = 1;
2592                 new->input = dst_discard;
2593                 new->output = dst_discard;
2594                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2595
2596                 new->dev = ort->u.dst.dev;
2597                 if (new->dev)
2598                         dev_hold(new->dev);
2599
2600                 rt->fl = ort->fl;
2601
2602                 rt->idev = ort->idev;
2603                 if (rt->idev)
2604                         in_dev_hold(rt->idev);
2605                 rt->rt_genid = rt_genid(net);
2606                 rt->rt_flags = ort->rt_flags;
2607                 rt->rt_type = ort->rt_type;
2608                 rt->rt_dst = ort->rt_dst;
2609                 rt->rt_src = ort->rt_src;
2610                 rt->rt_iif = ort->rt_iif;
2611                 rt->rt_gateway = ort->rt_gateway;
2612                 rt->rt_spec_dst = ort->rt_spec_dst;
2613                 rt->peer = ort->peer;
2614                 if (rt->peer)
2615                         atomic_inc(&rt->peer->refcnt);
2616
2617                 dst_free(new);
2618         }
2619
2620         dst_release(&(*rp)->u.dst);
2621         *rp = rt;
2622         return (rt ? 0 : -ENOMEM);
2623 }
2624
2625 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2626                          struct sock *sk, int flags)
2627 {
2628         int err;
2629
2630         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2631                 return err;
2632
2633         if (flp->proto) {
2634                 if (!flp->fl4_src)
2635                         flp->fl4_src = (*rp)->rt_src;
2636                 if (!flp->fl4_dst)
2637                         flp->fl4_dst = (*rp)->rt_dst;
2638                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2639                                     flags ? XFRM_LOOKUP_WAIT : 0);
2640                 if (err == -EREMOTE)
2641                         err = ipv4_dst_blackhole(net, rp, flp);
2642
2643                 return err;
2644         }
2645
2646         return 0;
2647 }
2648
2649 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2650
2651 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2652 {
2653         return ip_route_output_flow(net, rp, flp, NULL, 0);
2654 }
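
/* Usage sketch (added; illustrative only, error handling trimmed): an
 * in-kernel caller resolving an output route might do
 *
 *	struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dip } } };
 *	struct rtable *rt;
 *
 *	if (ip_route_output_key(&init_net, &rt, &fl))
 *		return -EHOSTUNREACH;
 *	...
 *	ip_rt_put(rt);
 *
 * where "dip" is the destination address and ip_rt_put() drops the
 * reference taken by the lookup.
 */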
2655
2656 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2657                         int nowait, unsigned int flags)
2658 {
2659         struct rtable *rt = skb->rtable;
2660         struct rtmsg *r;
2661         struct nlmsghdr *nlh;
2662         long expires;
2663         u32 id = 0, ts = 0, tsage = 0, error;
2664
2665         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2666         if (nlh == NULL)
2667                 return -EMSGSIZE;
2668
2669         r = nlmsg_data(nlh);
2670         r->rtm_family    = AF_INET;
2671         r->rtm_dst_len  = 32;
2672         r->rtm_src_len  = 0;
2673         r->rtm_tos      = rt->fl.fl4_tos;
2674         r->rtm_table    = RT_TABLE_MAIN;
2675         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2676         r->rtm_type     = rt->rt_type;
2677         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2678         r->rtm_protocol = RTPROT_UNSPEC;
2679         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2680         if (rt->rt_flags & RTCF_NOTIFY)
2681                 r->rtm_flags |= RTM_F_NOTIFY;
2682
2683         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2684
2685         if (rt->fl.fl4_src) {
2686                 r->rtm_src_len = 32;
2687                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2688         }
2689         if (rt->u.dst.dev)
2690                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2691 #ifdef CONFIG_NET_CLS_ROUTE
2692         if (rt->u.dst.tclassid)
2693                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2694 #endif
2695         if (rt->fl.iif)
2696                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2697         else if (rt->rt_src != rt->fl.fl4_src)
2698                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2699
2700         if (rt->rt_dst != rt->rt_gateway)
2701                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2702
2703         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2704                 goto nla_put_failure;
2705
2706         error = rt->u.dst.error;
2707         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2708         if (rt->peer) {
2709                 id = rt->peer->ip_id_count;
2710                 if (rt->peer->tcp_ts_stamp) {
2711                         ts = rt->peer->tcp_ts;
2712                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2713                 }
2714         }
2715
2716         if (rt->fl.iif) {
2717 #ifdef CONFIG_IP_MROUTE
2718                 __be32 dst = rt->rt_dst;
2719
2720                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2721                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2722                         int err = ipmr_get_route(skb, r, nowait);
2723                         if (err <= 0) {
2724                                 if (!nowait) {
2725                                         if (err == 0)
2726                                                 return 0;
2727                                         goto nla_put_failure;
2728                                 } else {
2729                                         if (err == -EMSGSIZE)
2730                                                 goto nla_put_failure;
2731                                         error = err;
2732                                 }
2733                         }
2734                 } else
2735 #endif
2736                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2737         }
2738
2739         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2740                                expires, error) < 0)
2741                 goto nla_put_failure;
2742
2743         return nlmsg_end(skb, nlh);
2744
2745 nla_put_failure:
2746         nlmsg_cancel(skb, nlh);
2747         return -EMSGSIZE;
2748 }
2749
2750 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2751 {
2752         struct net *net = sock_net(in_skb->sk);
2753         struct rtmsg *rtm;
2754         struct nlattr *tb[RTA_MAX+1];
2755         struct rtable *rt = NULL;
2756         __be32 dst = 0;
2757         __be32 src = 0;
2758         u32 iif;
2759         int err;
2760         struct sk_buff *skb;
2761
2762         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2763         if (err < 0)
2764                 goto errout;
2765
2766         rtm = nlmsg_data(nlh);
2767
2768         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2769         if (skb == NULL) {
2770                 err = -ENOBUFS;
2771                 goto errout;
2772         }
2773
2774         /* Reserve room for dummy headers; this skb can pass
2775            through a good chunk of the routing engine.
2776          */
2777         skb_reset_mac_header(skb);
2778         skb_reset_network_header(skb);
2779
2780         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2781         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2782         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2783
2784         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2785         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2786         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2787
2788         if (iif) {
2789                 struct net_device *dev;
2790
2791                 dev = __dev_get_by_index(net, iif);
2792                 if (dev == NULL) {
2793                         err = -ENODEV;
2794                         goto errout_free;
2795                 }
2796
2797                 skb->protocol   = htons(ETH_P_IP);
2798                 skb->dev        = dev;
2799                 local_bh_disable();
2800                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2801                 local_bh_enable();
2802
2803                 rt = skb->rtable;
2804                 if (err == 0 && rt->u.dst.error)
2805                         err = -rt->u.dst.error;
2806         } else {
2807                 struct flowi fl = {
2808                         .nl_u = {
2809                                 .ip4_u = {
2810                                         .daddr = dst,
2811                                         .saddr = src,
2812                                         .tos = rtm->rtm_tos,
2813                                 },
2814                         },
2815                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2816                 };
2817                 err = ip_route_output_key(net, &rt, &fl);
2818         }
2819
2820         if (err)
2821                 goto errout_free;
2822
2823         skb->rtable = rt;
2824         if (rtm->rtm_flags & RTM_F_NOTIFY)
2825                 rt->rt_flags |= RTCF_NOTIFY;
2826
2827         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2828                            RTM_NEWROUTE, 0, 0);
2829         if (err <= 0)
2830                 goto errout_free;
2831
2832         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2833 errout:
2834         return err;
2835
2836 errout_free:
2837         kfree_skb(skb);
2838         goto errout;
2839 }

int ip_rt_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
        struct rtable *rt;
        int h, s_h;
        int idx, s_idx;
        struct net *net;

        net = sock_net(skb->sk);

        s_h = cb->args[0];
        if (s_h < 0)
                s_h = 0;
        s_idx = idx = cb->args[1];
        for (h = s_h; h <= rt_hash_mask; h++, s_idx = 0) {
                if (!rt_hash_table[h].chain)
                        continue;
                rcu_read_lock_bh();
                for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
                     rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
                        if (!net_eq(dev_net(rt->u.dst.dev), net) || idx < s_idx)
                                continue;
                        if (rt_is_expired(rt))
                                continue;
                        skb->dst = dst_clone(&rt->u.dst);
                        if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
                                         cb->nlh->nlmsg_seq, RTM_NEWROUTE,
                                         1, NLM_F_MULTI) <= 0) {
                                dst_release(xchg(&skb->dst, NULL));
                                rcu_read_unlock_bh();
                                goto done;
                        }
                        dst_release(xchg(&skb->dst, NULL));
                }
                rcu_read_unlock_bh();
        }

done:
        cb->args[0] = h;
        cb->args[1] = idx;
        return skb->len;
}
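
/*
 * ip_rt_dump() walks every cache hash chain under rcu_read_lock_bh() and
 * emits one RTM_NEWROUTE message per live, same-namespace entry.  When
 * the dump skb fills up, rt_fill_info() returns <= 0 and the current
 * (h, idx) position is stashed in cb->args[]; the next dump callback for
 * this request resumes at chain s_h, skipping the s_idx entries that were
 * already sent.
 */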

void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(dev_net(in_dev->dev), 0);
}
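
/*
 * Multicast group membership changes can invalidate cached routing
 * decisions for local multicast traffic, so the simplest correct response
 * is an immediate flush of the namespace's whole cache.
 */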

#ifdef CONFIG_SYSCTL
static int ipv4_sysctl_rtcache_flush(ctl_table *__ctl, int write,
                                        struct file *filp, void __user *buffer,
                                        size_t *lenp, loff_t *ppos)
{
        if (write) {
                int flush_delay;
                ctl_table ctl;
                struct net *net;

                memcpy(&ctl, __ctl, sizeof(ctl));
                ctl.data = &flush_delay;
                proc_dointvec(&ctl, write, filp, buffer, lenp, ppos);

                net = (struct net *)__ctl->extra1;
                rt_cache_flush(net, flush_delay);
                return 0;
        }

        return -EINVAL;
}
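
/*
 * This handler backs the write-only /proc/sys/net/ipv4/route/flush file
 * (registered per namespace via ipv4_route_flush_table below).  The
 * written integer is parsed into the local flush_delay through a stack
 * copy of the ctl_table, so the shared table never points at transient
 * data, e.g.:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush	(flush right away)
 *	echo 5 > /proc/sys/net/ipv4/route/flush	(request a delayed flush)
 */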

static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
                                                int __user *name,
                                                int nlen,
                                                void __user *oldval,
                                                size_t __user *oldlenp,
                                                void __user *newval,
                                                size_t newlen)
{
        int delay;
        struct net *net;

        if (newlen != sizeof(int))
                return -EINVAL;
        if (get_user(delay, (int __user *)newval))
                return -EFAULT;
        net = (struct net *)table->extra1;
        rt_cache_flush(net, delay);
        return 0;
}

static void rt_secret_reschedule(int old)
{
        struct net *net;
        int new = ip_rt_secret_interval;
        int diff = new - old;

        if (!diff)
                return;

        rtnl_lock();
        for_each_net(net) {
                int deleted = del_timer_sync(&net->ipv4.rt_secret_timer);

                if (!new)
                        continue;

                if (deleted) {
                        long time = net->ipv4.rt_secret_timer.expires - jiffies;

                        if (time <= 0 || (time += diff) <= 0)
                                time = 0;

                        net->ipv4.rt_secret_timer.expires = time;
                } else {
                        net->ipv4.rt_secret_timer.expires = new;
                }

                net->ipv4.rt_secret_timer.expires += jiffies;
                add_timer(&net->ipv4.rt_secret_timer);
        }
        rtnl_unlock();
}
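
/*
 * rt_secret_reschedule() reconciles every namespace's rekey timer with a
 * freshly written ip_rt_secret_interval: a still-pending timer is shifted
 * by diff = new - old (clamped at "fire now"), an inactive one is
 * restarted a full new interval from now, and new == 0 leaves all timers
 * stopped.  For example, with an old interval of 600s, a timer due in
 * 100s and a new interval of 300s, diff is -300s, 100s - 300s <= 0, and
 * the timer is re-armed to fire immediately.
 */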

static int ipv4_sysctl_rt_secret_interval(ctl_table *ctl, int write,
                                          struct file *filp,
                                          void __user *buffer, size_t *lenp,
                                          loff_t *ppos)
{
        int old = ip_rt_secret_interval;
        int ret = proc_dointvec_jiffies(ctl, write, filp, buffer, lenp, ppos);

        rt_secret_reschedule(old);

        return ret;
}

static int ipv4_sysctl_rt_secret_interval_strategy(ctl_table *table,
                                                   int __user *name,
                                                   int nlen,
                                                   void __user *oldval,
                                                   size_t __user *oldlenp,
                                                   void __user *newval,
                                                   size_t newlen)
{
        int old = ip_rt_secret_interval;
        int ret = sysctl_jiffies(table, name, nlen, oldval, oldlenp, newval,
                                 newlen);

        rt_secret_reschedule(old);

        return ret;
}

static ctl_table ipv4_route_table[] = {
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                /* Deprecated. Use gc_min_interval_ms */
                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_ms_jiffies,
                .strategy       = &sysctl_ms_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
                .procname       = "secret_interval",
                .data           = &ip_rt_secret_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &ipv4_sysctl_rt_secret_interval,
                .strategy       = &ipv4_sysctl_rt_secret_interval_strategy,
        },
        { .ctl_name = 0 }
};
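
/*
 * These tunables are global (shared by all namespaces) and appear under
 * /proc/sys/net/ipv4/route/.  Entries using the *_jiffies handlers store
 * jiffies internally but present seconds (or milliseconds for the _ms
 * variants) to userspace, so e.g.
 *
 *	echo 300 > /proc/sys/net/ipv4/route/gc_timeout
 *
 * leaves ip_rt_gc_timeout holding 300 * HZ.
 */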

static struct ctl_table empty[1];

static struct ctl_table ipv4_skeleton[] = {
        { .procname = "route", .ctl_name = NET_IPV4_ROUTE,
          .mode = 0555, .child = ipv4_route_table},
        { .procname = "neigh", .ctl_name = NET_IPV4_NEIGH,
          .mode = 0555, .child = empty},
        { }
};

static __net_initdata struct ctl_path ipv4_path[] = {
        { .procname = "net", .ctl_name = CTL_NET, },
        { .procname = "ipv4", .ctl_name = NET_IPV4, },
        { },
};

static struct ctl_table ipv4_route_flush_table[] = {
        {
                .ctl_name       = NET_IPV4_ROUTE_FLUSH,
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = &ipv4_sysctl_rtcache_flush,
                .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
        },
        { .ctl_name = 0 },
};
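
/*
 * Unlike the table above, "flush" must exist once per network namespace
 * with extra1 identifying the owning struct net, which is why it sits in
 * its own table that sysctl_route_net_init() below can duplicate.
 */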

static __net_initdata struct ctl_path ipv4_route_path[] = {
        { .procname = "net", .ctl_name = CTL_NET, },
        { .procname = "ipv4", .ctl_name = NET_IPV4, },
        { .procname = "route", .ctl_name = NET_IPV4_ROUTE, },
        { },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
        struct ctl_table *tbl;

        tbl = ipv4_route_flush_table;
        if (net != &init_net) {
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (tbl == NULL)
                        goto err_dup;
        }
        tbl[0].extra1 = net;

        net->ipv4.route_hdr =
                register_net_sysctl_table(net, ipv4_route_path, tbl);
        if (net->ipv4.route_hdr == NULL)
                goto err_reg;
        return 0;

err_reg:
        if (tbl != ipv4_route_flush_table)
                kfree(tbl);
err_dup:
        return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        BUG_ON(tbl == ipv4_route_flush_table);
        kfree(tbl);
}
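
/*
 * Standard pernet pattern: init_net registers ipv4_route_flush_table
 * itself (extra1 simply set to &init_net), while any other namespace gets
 * a kmemdup()'d copy so its extra1 can point at its own struct net.  The
 * exit hook recovers the copy through ctl_table_arg to free it; the
 * BUG_ON() above asserts it never sees the static table, since init_net
 * is not torn down.
 */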

static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif

static __net_init int rt_secret_timer_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid,
                        (int) ((num_physpages ^ (num_physpages>>8)) ^
                        (jiffies ^ (jiffies >> 7))));

        net->ipv4.rt_secret_timer.function = rt_secret_rebuild;
        net->ipv4.rt_secret_timer.data = (unsigned long)net;
        init_timer_deferrable(&net->ipv4.rt_secret_timer);

        if (ip_rt_secret_interval) {
                net->ipv4.rt_secret_timer.expires =
                        jiffies + net_random() % ip_rt_secret_interval +
                        ip_rt_secret_interval;
                add_timer(&net->ipv4.rt_secret_timer);
        }
        return 0;
}
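
/*
 * The first rebuild is scheduled at a random point in
 * [interval, 2 * interval) so namespaces brought up together do not all
 * rekey their hash secrets simultaneously, and the per-net genid is
 * seeded from num_physpages and jiffies rather than starting at zero.
 */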

static __net_exit void rt_secret_timer_exit(struct net *net)
{
        del_timer_sync(&net->ipv4.rt_secret_timer);
}

static __net_initdata struct pernet_operations rt_secret_timer_ops = {
        .init = rt_secret_timer_init,
        .exit = rt_secret_timer_exit,
};

#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
        if (!str)
                return 0;
        rhash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("rhash_entries=", set_rhash_entries);
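
/*
 * Booting with "rhash_entries=N" overrides the automatic sizing of the
 * route cache hash table in ip_rt_init() below, e.g. rhash_entries=65536
 * asks alloc_large_system_hash() for a 64K-bucket table instead of one
 * scaled to the machine's memory.
 */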

int __init ip_rt_init(void)
{
        int rc = 0;

#ifdef CONFIG_NET_CLS_ROUTE
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        rt_hash_table = (struct rt_hash_bucket *)
                alloc_large_system_hash("IP route cache",
                                        sizeof(struct rt_hash_bucket),
                                        rhash_entries,
                                        (num_physpages >= 128 * 1024) ?
                                        15 : 17,
                                        0,
                                        &rt_hash_log,
                                        &rt_hash_mask,
                                        0);
        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
        rt_hash_lock_init();

        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
        ip_rt_max_size = (rt_hash_mask + 1) * 16;

        devinet_init();
        ip_fib_init();

        /* All the timers, started at system startup, tend
           to synchronize. Perturb it a bit.
         */
        schedule_delayed_work(&expires_work,
                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

        if (register_pernet_subsys(&rt_secret_timer_ops))
                printk(KERN_ERR "Unable to setup rt_secret_timer\n");

        if (ip_rt_proc_init())
                printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        return rc;
}
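
/*
 * Sizing above: absent rhash_entries=, alloc_large_system_hash() allots
 * roughly one bucket per 2^15 bytes of low memory on machines with at
 * least 128 * 1024 physical pages (512MB with 4KB pages) and one per
 * 2^17 bytes otherwise; gc_thresh then defaults to one cached route per
 * bucket and ip_rt_max_size to sixteen per bucket.
 */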

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_sysctl_paths(ipv4_path, ipv4_skeleton);
}
#endif

EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);