[IPV4]: sk parameter is unused in ipv4_dst_blackhole.
[safe/jmp/linux-2.6] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval            = 60 * HZ;
123 static int ip_rt_gc_min_interval        = HZ / 2;
124 static int ip_rt_redirect_number        = 9;
125 static int ip_rt_redirect_load          = HZ / 50;
126 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost             = HZ;
128 static int ip_rt_error_burst            = 5 * HZ;
129 static int ip_rt_gc_elasticity          = 8;
130 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu               = 512 + 20 + 20;
132 static int ip_rt_min_advmss             = 256;
133 static int ip_rt_secret_interval        = 10 * 60 * HZ;
134
135 #define RTprint(a...)   printk(KERN_DEBUG a)
136
137 static void rt_worker_func(struct work_struct *work);
138 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
139 static struct timer_list rt_secret_timer;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void              ipv4_dst_destroy(struct dst_entry *dst);
147 static void              ipv4_dst_ifdown(struct dst_entry *dst,
148                                          struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154
155 static struct dst_ops ipv4_dst_ops = {
156         .family =               AF_INET,
157         .protocol =             __constant_htons(ETH_P_IP),
158         .gc =                   rt_garbage_collect,
159         .check =                ipv4_dst_check,
160         .destroy =              ipv4_dst_destroy,
161         .ifdown =               ipv4_dst_ifdown,
162         .negative_advice =      ipv4_negative_advice,
163         .link_failure =         ipv4_link_failure,
164         .update_pmtu =          ip_rt_update_pmtu,
165         .local_out =            ip_local_out,
166         .entry_size =           sizeof(struct rtable),
167         .entries =              ATOMIC_INIT(0),
168 };
169
170 #define ECN_OR_COST(class)      TC_PRIO_##class
171
172 const __u8 ip_tos2prio[16] = {
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(FILLER),
175         TC_PRIO_BESTEFFORT,
176         ECN_OR_COST(BESTEFFORT),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_BULK,
180         ECN_OR_COST(BULK),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE,
184         ECN_OR_COST(INTERACTIVE),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK),
187         TC_PRIO_INTERACTIVE_BULK,
188         ECN_OR_COST(INTERACTIVE_BULK)
189 };
190
191
192 /*
193  * Route cache.
194  */
195
196 /* The locking scheme is rather straight forward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries,
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
205
206 struct rt_hash_bucket {
207         struct rtable   *chain;
208 };
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210         defined(CONFIG_PROVE_LOCKING)
211 /*
212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
213  * The size of this table is a power of two and depends on the number of CPUS.
214  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
215  */
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ        256
218 #else
219 # if NR_CPUS >= 32
220 #  define RT_HASH_LOCK_SZ       4096
221 # elif NR_CPUS >= 16
222 #  define RT_HASH_LOCK_SZ       2048
223 # elif NR_CPUS >= 8
224 #  define RT_HASH_LOCK_SZ       1024
225 # elif NR_CPUS >= 4
226 #  define RT_HASH_LOCK_SZ       512
227 # else
228 #  define RT_HASH_LOCK_SZ       256
229 # endif
230 #endif
231
232 static spinlock_t       *rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
234
235 static __init void rt_hash_lock_init(void)
236 {
237         int i;
238
239         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
240                         GFP_KERNEL);
241         if (!rt_hash_locks)
242                 panic("IP: failed to allocate rt_hash_locks\n");
243
244         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
245                 spin_lock_init(&rt_hash_locks[i]);
246 }
247 #else
248 # define rt_hash_lock_addr(slot) NULL
249
250 static inline void rt_hash_lock_init(void)
251 {
252 }
253 #endif
254
255 static struct rt_hash_bucket    *rt_hash_table;
256 static unsigned                 rt_hash_mask;
257 static unsigned int             rt_hash_log;
258 static atomic_t                 rt_genid;
259
260 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
261 #define RT_CACHE_STAT_INC(field) \
262         (__raw_get_cpu_var(rt_cache_stat).field++)
263
264 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
265 {
266         return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
267                 & rt_hash_mask;
268 }
269
270 #define rt_hash(daddr, saddr, idx) \
271         rt_hash_code((__force u32)(__be32)(daddr),\
272                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
273
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276         struct seq_net_private p;
277         int bucket;
278         int genid;
279 };
280
281 static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
282 {
283         struct rtable *r = NULL;
284
285         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
286                 rcu_read_lock_bh();
287                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
288                 while (r) {
289                         if (r->u.dst.dev->nd_net == st->p.net &&
290                             r->rt_genid == st->genid)
291                                 return r;
292                         r = rcu_dereference(r->u.dst.rt_next);
293                 }
294                 rcu_read_unlock_bh();
295         }
296         return r;
297 }
298
299 static struct rtable *__rt_cache_get_next(struct rt_cache_iter_state *st,
300                                           struct rtable *r)
301 {
302         r = r->u.dst.rt_next;
303         while (!r) {
304                 rcu_read_unlock_bh();
305                 if (--st->bucket < 0)
306                         break;
307                 rcu_read_lock_bh();
308                 r = rt_hash_table[st->bucket].chain;
309         }
310         return rcu_dereference(r);
311 }
312
313 static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st,
314                                         struct rtable *r)
315 {
316         while ((r = __rt_cache_get_next(st, r)) != NULL) {
317                 if (r->u.dst.dev->nd_net != st->p.net)
318                         continue;
319                 if (r->rt_genid == st->genid)
320                         break;
321         }
322         return r;
323 }
324
325 static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
326 {
327         struct rtable *r = rt_cache_get_first(st);
328
329         if (r)
330                 while (pos && (r = rt_cache_get_next(st, r)))
331                         --pos;
332         return pos ? NULL : r;
333 }
334
335 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
336 {
337         struct rt_cache_iter_state *st = seq->private;
338
339         if (*pos)
340                 return rt_cache_get_idx(st, *pos - 1);
341         st->genid = atomic_read(&rt_genid);
342         return SEQ_START_TOKEN;
343 }
344
345 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
346 {
347         struct rtable *r;
348         struct rt_cache_iter_state *st = seq->private;
349
350         if (v == SEQ_START_TOKEN)
351                 r = rt_cache_get_first(st);
352         else
353                 r = rt_cache_get_next(st, v);
354         ++*pos;
355         return r;
356 }
357
358 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
359 {
360         if (v && v != SEQ_START_TOKEN)
361                 rcu_read_unlock_bh();
362 }
363
364 static int rt_cache_seq_show(struct seq_file *seq, void *v)
365 {
366         if (v == SEQ_START_TOKEN)
367                 seq_printf(seq, "%-127s\n",
368                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
369                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
370                            "HHUptod\tSpecDst");
371         else {
372                 struct rtable *r = v;
373                 char temp[256];
374
375                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
376                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
377                         r->u.dst.dev ? r->u.dst.dev->name : "*",
378                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
379                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
380                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
381                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
382                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
383                         dst_metric(&r->u.dst, RTAX_WINDOW),
384                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
385                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
386                         r->fl.fl4_tos,
387                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
388                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
389                                        dev_queue_xmit) : 0,
390                         r->rt_spec_dst);
391                 seq_printf(seq, "%-127s\n", temp);
392         }
393         return 0;
394 }
395
396 static const struct seq_operations rt_cache_seq_ops = {
397         .start  = rt_cache_seq_start,
398         .next   = rt_cache_seq_next,
399         .stop   = rt_cache_seq_stop,
400         .show   = rt_cache_seq_show,
401 };
402
403 static int rt_cache_seq_open(struct inode *inode, struct file *file)
404 {
405         return seq_open_net(inode, file, &rt_cache_seq_ops,
406                         sizeof(struct rt_cache_iter_state));
407 }
408
409 static const struct file_operations rt_cache_seq_fops = {
410         .owner   = THIS_MODULE,
411         .open    = rt_cache_seq_open,
412         .read    = seq_read,
413         .llseek  = seq_lseek,
414         .release = seq_release_net,
415 };
416
417
418 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
419 {
420         int cpu;
421
422         if (*pos == 0)
423                 return SEQ_START_TOKEN;
424
425         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
426                 if (!cpu_possible(cpu))
427                         continue;
428                 *pos = cpu+1;
429                 return &per_cpu(rt_cache_stat, cpu);
430         }
431         return NULL;
432 }
433
434 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
435 {
436         int cpu;
437
438         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
439                 if (!cpu_possible(cpu))
440                         continue;
441                 *pos = cpu+1;
442                 return &per_cpu(rt_cache_stat, cpu);
443         }
444         return NULL;
445
446 }
447
448 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
449 {
450
451 }
452
453 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
454 {
455         struct rt_cache_stat *st = v;
456
457         if (v == SEQ_START_TOKEN) {
458                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
459                 return 0;
460         }
461
462         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
463                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
464                    atomic_read(&ipv4_dst_ops.entries),
465                    st->in_hit,
466                    st->in_slow_tot,
467                    st->in_slow_mc,
468                    st->in_no_route,
469                    st->in_brd,
470                    st->in_martian_dst,
471                    st->in_martian_src,
472
473                    st->out_hit,
474                    st->out_slow_tot,
475                    st->out_slow_mc,
476
477                    st->gc_total,
478                    st->gc_ignored,
479                    st->gc_goal_miss,
480                    st->gc_dst_overflow,
481                    st->in_hlist_search,
482                    st->out_hlist_search
483                 );
484         return 0;
485 }
486
487 static const struct seq_operations rt_cpu_seq_ops = {
488         .start  = rt_cpu_seq_start,
489         .next   = rt_cpu_seq_next,
490         .stop   = rt_cpu_seq_stop,
491         .show   = rt_cpu_seq_show,
492 };
493
494
495 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
496 {
497         return seq_open(file, &rt_cpu_seq_ops);
498 }
499
500 static const struct file_operations rt_cpu_seq_fops = {
501         .owner   = THIS_MODULE,
502         .open    = rt_cpu_seq_open,
503         .read    = seq_read,
504         .llseek  = seq_lseek,
505         .release = seq_release,
506 };
507
508 #ifdef CONFIG_NET_CLS_ROUTE
509 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
510                            int length, int *eof, void *data)
511 {
512         unsigned int i;
513
514         if ((offset & 3) || (length & 3))
515                 return -EIO;
516
517         if (offset >= sizeof(struct ip_rt_acct) * 256) {
518                 *eof = 1;
519                 return 0;
520         }
521
522         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
523                 length = sizeof(struct ip_rt_acct) * 256 - offset;
524                 *eof = 1;
525         }
526
527         offset /= sizeof(u32);
528
529         if (length > 0) {
530                 u32 *dst = (u32 *) buffer;
531
532                 *start = buffer;
533                 memset(dst, 0, length);
534
535                 for_each_possible_cpu(i) {
536                         unsigned int j;
537                         u32 *src;
538
539                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
540                         for (j = 0; j < length/4; j++)
541                                 dst[j] += src[j];
542                 }
543         }
544         return length;
545 }
546 #endif
547
548 static int __net_init ip_rt_do_proc_init(struct net *net)
549 {
550         struct proc_dir_entry *pde;
551
552         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
553                         &rt_cache_seq_fops);
554         if (!pde)
555                 goto err1;
556
557         pde = proc_create("rt_cache", S_IRUGO,
558                           net->proc_net_stat, &rt_cpu_seq_fops);
559         if (!pde)
560                 goto err2;
561
562 #ifdef CONFIG_NET_CLS_ROUTE
563         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
564                         ip_rt_acct_read, NULL);
565         if (!pde)
566                 goto err3;
567 #endif
568         return 0;
569
570 #ifdef CONFIG_NET_CLS_ROUTE
571 err3:
572         remove_proc_entry("rt_cache", net->proc_net_stat);
573 #endif
574 err2:
575         remove_proc_entry("rt_cache", net->proc_net);
576 err1:
577         return -ENOMEM;
578 }
579
580 static void __net_exit ip_rt_do_proc_exit(struct net *net)
581 {
582         remove_proc_entry("rt_cache", net->proc_net_stat);
583         remove_proc_entry("rt_cache", net->proc_net);
584         remove_proc_entry("rt_acct", net->proc_net);
585 }
586
587 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
588         .init = ip_rt_do_proc_init,
589         .exit = ip_rt_do_proc_exit,
590 };
591
592 static int __init ip_rt_proc_init(void)
593 {
594         return register_pernet_subsys(&ip_rt_proc_ops);
595 }
596
597 #else
598 static inline int ip_rt_proc_init(void)
599 {
600         return 0;
601 }
602 #endif /* CONFIG_PROC_FS */
603
604 static __inline__ void rt_free(struct rtable *rt)
605 {
606         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
607 }
608
609 static __inline__ void rt_drop(struct rtable *rt)
610 {
611         ip_rt_put(rt);
612         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
613 }
614
615 static __inline__ int rt_fast_clean(struct rtable *rth)
616 {
617         /* Kill broadcast/multicast entries very aggresively, if they
618            collide in hash table with more useful entries */
619         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
620                 rth->fl.iif && rth->u.dst.rt_next;
621 }
622
623 static __inline__ int rt_valuable(struct rtable *rth)
624 {
625         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
626                 rth->u.dst.expires;
627 }
628
629 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
630 {
631         unsigned long age;
632         int ret = 0;
633
634         if (atomic_read(&rth->u.dst.__refcnt))
635                 goto out;
636
637         ret = 1;
638         if (rth->u.dst.expires &&
639             time_after_eq(jiffies, rth->u.dst.expires))
640                 goto out;
641
642         age = jiffies - rth->u.dst.lastuse;
643         ret = 0;
644         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
645             (age <= tmo2 && rt_valuable(rth)))
646                 goto out;
647         ret = 1;
648 out:    return ret;
649 }
650
651 /* Bits of score are:
652  * 31: very valuable
653  * 30: not quite useless
654  * 29..0: usage counter
655  */
656 static inline u32 rt_score(struct rtable *rt)
657 {
658         u32 score = jiffies - rt->u.dst.lastuse;
659
660         score = ~score & ~(3<<30);
661
662         if (rt_valuable(rt))
663                 score |= (1<<31);
664
665         if (!rt->fl.iif ||
666             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
667                 score |= (1<<30);
668
669         return score;
670 }
671
672 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
673 {
674         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
675                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
676                 (fl1->mark ^ fl2->mark) |
677                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
678                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
679                 (fl1->oif ^ fl2->oif) |
680                 (fl1->iif ^ fl2->iif)) == 0;
681 }
682
683 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
684 {
685         return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
686 }
687
688 /*
689  * Perform a full scan of hash table and free all entries.
690  * Can be called by a softirq or a process.
691  * In the later case, we want to be reschedule if necessary
692  */
693 static void rt_do_flush(int process_context)
694 {
695         unsigned int i;
696         struct rtable *rth, *next;
697
698         for (i = 0; i <= rt_hash_mask; i++) {
699                 if (process_context && need_resched())
700                         cond_resched();
701                 rth = rt_hash_table[i].chain;
702                 if (!rth)
703                         continue;
704
705                 spin_lock_bh(rt_hash_lock_addr(i));
706                 rth = rt_hash_table[i].chain;
707                 rt_hash_table[i].chain = NULL;
708                 spin_unlock_bh(rt_hash_lock_addr(i));
709
710                 for (; rth; rth = next) {
711                         next = rth->u.dst.rt_next;
712                         rt_free(rth);
713                 }
714         }
715 }
716
717 static void rt_check_expire(void)
718 {
719         static unsigned int rover;
720         unsigned int i = rover, goal;
721         struct rtable *rth, **rthp;
722         u64 mult;
723
724         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
725         if (ip_rt_gc_timeout > 1)
726                 do_div(mult, ip_rt_gc_timeout);
727         goal = (unsigned int)mult;
728         if (goal > rt_hash_mask)
729                 goal = rt_hash_mask + 1;
730         for (; goal > 0; goal--) {
731                 unsigned long tmo = ip_rt_gc_timeout;
732
733                 i = (i + 1) & rt_hash_mask;
734                 rthp = &rt_hash_table[i].chain;
735
736                 if (need_resched())
737                         cond_resched();
738
739                 if (*rthp == NULL)
740                         continue;
741                 spin_lock_bh(rt_hash_lock_addr(i));
742                 while ((rth = *rthp) != NULL) {
743                         if (rth->rt_genid != atomic_read(&rt_genid)) {
744                                 *rthp = rth->u.dst.rt_next;
745                                 rt_free(rth);
746                                 continue;
747                         }
748                         if (rth->u.dst.expires) {
749                                 /* Entry is expired even if it is in use */
750                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
751                                         tmo >>= 1;
752                                         rthp = &rth->u.dst.rt_next;
753                                         continue;
754                                 }
755                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
756                                 tmo >>= 1;
757                                 rthp = &rth->u.dst.rt_next;
758                                 continue;
759                         }
760
761                         /* Cleanup aged off entries. */
762                         *rthp = rth->u.dst.rt_next;
763                         rt_free(rth);
764                 }
765                 spin_unlock_bh(rt_hash_lock_addr(i));
766         }
767         rover = i;
768 }
769
770 /*
771  * rt_worker_func() is run in process context.
772  * we call rt_check_expire() to scan part of the hash table
773  */
774 static void rt_worker_func(struct work_struct *work)
775 {
776         rt_check_expire();
777         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
778 }
779
780 /*
781  * Pertubation of rt_genid by a small quantity [1..256]
782  * Using 8 bits of shuffling ensure we can call rt_cache_invalidate()
783  * many times (2^24) without giving recent rt_genid.
784  * Jenkins hash is strong enough that litle changes of rt_genid are OK.
785  */
786 static void rt_cache_invalidate(void)
787 {
788         unsigned char shuffle;
789
790         get_random_bytes(&shuffle, sizeof(shuffle));
791         atomic_add(shuffle + 1U, &rt_genid);
792 }
793
794 /*
795  * delay < 0  : invalidate cache (fast : entries will be deleted later)
796  * delay >= 0 : invalidate & flush cache (can be long)
797  */
798 void rt_cache_flush(int delay)
799 {
800         rt_cache_invalidate();
801         if (delay >= 0)
802                 rt_do_flush(!in_softirq());
803 }
804
805 /*
806  * We change rt_genid and let gc do the cleanup
807  */
808 static void rt_secret_rebuild(unsigned long dummy)
809 {
810         rt_cache_invalidate();
811         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
812 }
813
814 /*
815    Short description of GC goals.
816
817    We want to build algorithm, which will keep routing cache
818    at some equilibrium point, when number of aged off entries
819    is kept approximately equal to newly generated ones.
820
821    Current expiration strength is variable "expire".
822    We try to adjust it dynamically, so that if networking
823    is idle expires is large enough to keep enough of warm entries,
824    and when load increases it reduces to limit cache size.
825  */
826
827 static int rt_garbage_collect(struct dst_ops *ops)
828 {
829         static unsigned long expire = RT_GC_TIMEOUT;
830         static unsigned long last_gc;
831         static int rover;
832         static int equilibrium;
833         struct rtable *rth, **rthp;
834         unsigned long now = jiffies;
835         int goal;
836
837         /*
838          * Garbage collection is pretty expensive,
839          * do not make it too frequently.
840          */
841
842         RT_CACHE_STAT_INC(gc_total);
843
844         if (now - last_gc < ip_rt_gc_min_interval &&
845             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
846                 RT_CACHE_STAT_INC(gc_ignored);
847                 goto out;
848         }
849
850         /* Calculate number of entries, which we want to expire now. */
851         goal = atomic_read(&ipv4_dst_ops.entries) -
852                 (ip_rt_gc_elasticity << rt_hash_log);
853         if (goal <= 0) {
854                 if (equilibrium < ipv4_dst_ops.gc_thresh)
855                         equilibrium = ipv4_dst_ops.gc_thresh;
856                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
857                 if (goal > 0) {
858                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
859                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
860                 }
861         } else {
862                 /* We are in dangerous area. Try to reduce cache really
863                  * aggressively.
864                  */
865                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
866                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
867         }
868
869         if (now - last_gc >= ip_rt_gc_min_interval)
870                 last_gc = now;
871
872         if (goal <= 0) {
873                 equilibrium += goal;
874                 goto work_done;
875         }
876
877         do {
878                 int i, k;
879
880                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
881                         unsigned long tmo = expire;
882
883                         k = (k + 1) & rt_hash_mask;
884                         rthp = &rt_hash_table[k].chain;
885                         spin_lock_bh(rt_hash_lock_addr(k));
886                         while ((rth = *rthp) != NULL) {
887                                 if (rth->rt_genid == atomic_read(&rt_genid) &&
888                                         !rt_may_expire(rth, tmo, expire)) {
889                                         tmo >>= 1;
890                                         rthp = &rth->u.dst.rt_next;
891                                         continue;
892                                 }
893                                 *rthp = rth->u.dst.rt_next;
894                                 rt_free(rth);
895                                 goal--;
896                         }
897                         spin_unlock_bh(rt_hash_lock_addr(k));
898                         if (goal <= 0)
899                                 break;
900                 }
901                 rover = k;
902
903                 if (goal <= 0)
904                         goto work_done;
905
906                 /* Goal is not achieved. We stop process if:
907
908                    - if expire reduced to zero. Otherwise, expire is halfed.
909                    - if table is not full.
910                    - if we are called from interrupt.
911                    - jiffies check is just fallback/debug loop breaker.
912                      We will not spin here for long time in any case.
913                  */
914
915                 RT_CACHE_STAT_INC(gc_goal_miss);
916
917                 if (expire == 0)
918                         break;
919
920                 expire >>= 1;
921 #if RT_CACHE_DEBUG >= 2
922                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
923                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
924 #endif
925
926                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
927                         goto out;
928         } while (!in_softirq() && time_before_eq(jiffies, now));
929
930         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
931                 goto out;
932         if (net_ratelimit())
933                 printk(KERN_WARNING "dst cache overflow\n");
934         RT_CACHE_STAT_INC(gc_dst_overflow);
935         return 1;
936
937 work_done:
938         expire += ip_rt_gc_min_interval;
939         if (expire > ip_rt_gc_timeout ||
940             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
941                 expire = ip_rt_gc_timeout;
942 #if RT_CACHE_DEBUG >= 2
943         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
944                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
945 #endif
946 out:    return 0;
947 }
948
949 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
950 {
951         struct rtable   *rth, **rthp;
952         unsigned long   now;
953         struct rtable *cand, **candp;
954         u32             min_score;
955         int             chain_length;
956         int attempts = !in_softirq();
957
958 restart:
959         chain_length = 0;
960         min_score = ~(u32)0;
961         cand = NULL;
962         candp = NULL;
963         now = jiffies;
964
965         rthp = &rt_hash_table[hash].chain;
966
967         spin_lock_bh(rt_hash_lock_addr(hash));
968         while ((rth = *rthp) != NULL) {
969                 if (rth->rt_genid != atomic_read(&rt_genid)) {
970                         *rthp = rth->u.dst.rt_next;
971                         rt_free(rth);
972                         continue;
973                 }
974                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
975                         /* Put it first */
976                         *rthp = rth->u.dst.rt_next;
977                         /*
978                          * Since lookup is lockfree, the deletion
979                          * must be visible to another weakly ordered CPU before
980                          * the insertion at the start of the hash chain.
981                          */
982                         rcu_assign_pointer(rth->u.dst.rt_next,
983                                            rt_hash_table[hash].chain);
984                         /*
985                          * Since lookup is lockfree, the update writes
986                          * must be ordered for consistency on SMP.
987                          */
988                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
989
990                         dst_use(&rth->u.dst, now);
991                         spin_unlock_bh(rt_hash_lock_addr(hash));
992
993                         rt_drop(rt);
994                         *rp = rth;
995                         return 0;
996                 }
997
998                 if (!atomic_read(&rth->u.dst.__refcnt)) {
999                         u32 score = rt_score(rth);
1000
1001                         if (score <= min_score) {
1002                                 cand = rth;
1003                                 candp = rthp;
1004                                 min_score = score;
1005                         }
1006                 }
1007
1008                 chain_length++;
1009
1010                 rthp = &rth->u.dst.rt_next;
1011         }
1012
1013         if (cand) {
1014                 /* ip_rt_gc_elasticity used to be average length of chain
1015                  * length, when exceeded gc becomes really aggressive.
1016                  *
1017                  * The second limit is less certain. At the moment it allows
1018                  * only 2 entries per bucket. We will see.
1019                  */
1020                 if (chain_length > ip_rt_gc_elasticity) {
1021                         *candp = cand->u.dst.rt_next;
1022                         rt_free(cand);
1023                 }
1024         }
1025
1026         /* Try to bind route to arp only if it is output
1027            route or unicast forwarding path.
1028          */
1029         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1030                 int err = arp_bind_neighbour(&rt->u.dst);
1031                 if (err) {
1032                         spin_unlock_bh(rt_hash_lock_addr(hash));
1033
1034                         if (err != -ENOBUFS) {
1035                                 rt_drop(rt);
1036                                 return err;
1037                         }
1038
1039                         /* Neighbour tables are full and nothing
1040                            can be released. Try to shrink route cache,
1041                            it is most likely it holds some neighbour records.
1042                          */
1043                         if (attempts-- > 0) {
1044                                 int saved_elasticity = ip_rt_gc_elasticity;
1045                                 int saved_int = ip_rt_gc_min_interval;
1046                                 ip_rt_gc_elasticity     = 1;
1047                                 ip_rt_gc_min_interval   = 0;
1048                                 rt_garbage_collect(&ipv4_dst_ops);
1049                                 ip_rt_gc_min_interval   = saved_int;
1050                                 ip_rt_gc_elasticity     = saved_elasticity;
1051                                 goto restart;
1052                         }
1053
1054                         if (net_ratelimit())
1055                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1056                         rt_drop(rt);
1057                         return -ENOBUFS;
1058                 }
1059         }
1060
1061         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1062 #if RT_CACHE_DEBUG >= 2
1063         if (rt->u.dst.rt_next) {
1064                 struct rtable *trt;
1065                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1066                        NIPQUAD(rt->rt_dst));
1067                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1068                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1069                 printk("\n");
1070         }
1071 #endif
1072         rt_hash_table[hash].chain = rt;
1073         spin_unlock_bh(rt_hash_lock_addr(hash));
1074         *rp = rt;
1075         return 0;
1076 }
1077
1078 void rt_bind_peer(struct rtable *rt, int create)
1079 {
1080         static DEFINE_SPINLOCK(rt_peer_lock);
1081         struct inet_peer *peer;
1082
1083         peer = inet_getpeer(rt->rt_dst, create);
1084
1085         spin_lock_bh(&rt_peer_lock);
1086         if (rt->peer == NULL) {
1087                 rt->peer = peer;
1088                 peer = NULL;
1089         }
1090         spin_unlock_bh(&rt_peer_lock);
1091         if (peer)
1092                 inet_putpeer(peer);
1093 }
1094
1095 /*
1096  * Peer allocation may fail only in serious out-of-memory conditions.  However
1097  * we still can generate some output.
1098  * Random ID selection looks a bit dangerous because we have no chances to
1099  * select ID being unique in a reasonable period of time.
1100  * But broken packet identifier may be better than no packet at all.
1101  */
1102 static void ip_select_fb_ident(struct iphdr *iph)
1103 {
1104         static DEFINE_SPINLOCK(ip_fb_id_lock);
1105         static u32 ip_fallback_id;
1106         u32 salt;
1107
1108         spin_lock_bh(&ip_fb_id_lock);
1109         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1110         iph->id = htons(salt & 0xFFFF);
1111         ip_fallback_id = salt;
1112         spin_unlock_bh(&ip_fb_id_lock);
1113 }
1114
1115 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1116 {
1117         struct rtable *rt = (struct rtable *) dst;
1118
1119         if (rt) {
1120                 if (rt->peer == NULL)
1121                         rt_bind_peer(rt, 1);
1122
1123                 /* If peer is attached to destination, it is never detached,
1124                    so that we need not to grab a lock to dereference it.
1125                  */
1126                 if (rt->peer) {
1127                         iph->id = htons(inet_getid(rt->peer, more));
1128                         return;
1129                 }
1130         } else
1131                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1132                        __builtin_return_address(0));
1133
1134         ip_select_fb_ident(iph);
1135 }
1136
1137 static void rt_del(unsigned hash, struct rtable *rt)
1138 {
1139         struct rtable **rthp, *aux;
1140
1141         rthp = &rt_hash_table[hash].chain;
1142         spin_lock_bh(rt_hash_lock_addr(hash));
1143         ip_rt_put(rt);
1144         while ((aux = *rthp) != NULL) {
1145                 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1146                         *rthp = aux->u.dst.rt_next;
1147                         rt_free(aux);
1148                         continue;
1149                 }
1150                 rthp = &aux->u.dst.rt_next;
1151         }
1152         spin_unlock_bh(rt_hash_lock_addr(hash));
1153 }
1154
1155 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1156                     __be32 saddr, struct net_device *dev)
1157 {
1158         int i, k;
1159         struct in_device *in_dev = in_dev_get(dev);
1160         struct rtable *rth, **rthp;
1161         __be32  skeys[2] = { saddr, 0 };
1162         int  ikeys[2] = { dev->ifindex, 0 };
1163         struct netevent_redirect netevent;
1164         struct net *net;
1165
1166         if (!in_dev)
1167                 return;
1168
1169         net = dev->nd_net;
1170         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1171             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1172             || ipv4_is_zeronet(new_gw))
1173                 goto reject_redirect;
1174
1175         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1176                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1177                         goto reject_redirect;
1178                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1179                         goto reject_redirect;
1180         } else {
1181                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1182                         goto reject_redirect;
1183         }
1184
1185         for (i = 0; i < 2; i++) {
1186                 for (k = 0; k < 2; k++) {
1187                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1188
1189                         rthp=&rt_hash_table[hash].chain;
1190
1191                         rcu_read_lock();
1192                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1193                                 struct rtable *rt;
1194
1195                                 if (rth->fl.fl4_dst != daddr ||
1196                                     rth->fl.fl4_src != skeys[i] ||
1197                                     rth->fl.oif != ikeys[k] ||
1198                                     rth->fl.iif != 0 ||
1199                                     rth->rt_genid != atomic_read(&rt_genid) ||
1200                                     rth->u.dst.dev->nd_net != net) {
1201                                         rthp = &rth->u.dst.rt_next;
1202                                         continue;
1203                                 }
1204
1205                                 if (rth->rt_dst != daddr ||
1206                                     rth->rt_src != saddr ||
1207                                     rth->u.dst.error ||
1208                                     rth->rt_gateway != old_gw ||
1209                                     rth->u.dst.dev != dev)
1210                                         break;
1211
1212                                 dst_hold(&rth->u.dst);
1213                                 rcu_read_unlock();
1214
1215                                 rt = dst_alloc(&ipv4_dst_ops);
1216                                 if (rt == NULL) {
1217                                         ip_rt_put(rth);
1218                                         in_dev_put(in_dev);
1219                                         return;
1220                                 }
1221
1222                                 /* Copy all the information. */
1223                                 *rt = *rth;
1224                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1225                                 rt->u.dst.__use         = 1;
1226                                 atomic_set(&rt->u.dst.__refcnt, 1);
1227                                 rt->u.dst.child         = NULL;
1228                                 if (rt->u.dst.dev)
1229                                         dev_hold(rt->u.dst.dev);
1230                                 if (rt->idev)
1231                                         in_dev_hold(rt->idev);
1232                                 rt->u.dst.obsolete      = 0;
1233                                 rt->u.dst.lastuse       = jiffies;
1234                                 rt->u.dst.path          = &rt->u.dst;
1235                                 rt->u.dst.neighbour     = NULL;
1236                                 rt->u.dst.hh            = NULL;
1237                                 rt->u.dst.xfrm          = NULL;
1238                                 rt->rt_genid            = atomic_read(&rt_genid);
1239                                 rt->rt_flags            |= RTCF_REDIRECTED;
1240
1241                                 /* Gateway is different ... */
1242                                 rt->rt_gateway          = new_gw;
1243
1244                                 /* Redirect received -> path was valid */
1245                                 dst_confirm(&rth->u.dst);
1246
1247                                 if (rt->peer)
1248                                         atomic_inc(&rt->peer->refcnt);
1249
1250                                 if (arp_bind_neighbour(&rt->u.dst) ||
1251                                     !(rt->u.dst.neighbour->nud_state &
1252                                             NUD_VALID)) {
1253                                         if (rt->u.dst.neighbour)
1254                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1255                                         ip_rt_put(rth);
1256                                         rt_drop(rt);
1257                                         goto do_next;
1258                                 }
1259
1260                                 netevent.old = &rth->u.dst;
1261                                 netevent.new = &rt->u.dst;
1262                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1263                                                         &netevent);
1264
1265                                 rt_del(hash, rth);
1266                                 if (!rt_intern_hash(hash, rt, &rt))
1267                                         ip_rt_put(rt);
1268                                 goto do_next;
1269                         }
1270                         rcu_read_unlock();
1271                 do_next:
1272                         ;
1273                 }
1274         }
1275         in_dev_put(in_dev);
1276         return;
1277
1278 reject_redirect:
1279 #ifdef CONFIG_IP_ROUTE_VERBOSE
1280         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1281                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1282                         "%u.%u.%u.%u ignored.\n"
1283                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1284                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1285                        NIPQUAD(saddr), NIPQUAD(daddr));
1286 #endif
1287         in_dev_put(in_dev);
1288 }
1289
1290 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1291 {
1292         struct rtable *rt = (struct rtable *)dst;
1293         struct dst_entry *ret = dst;
1294
1295         if (rt) {
1296                 if (dst->obsolete) {
1297                         ip_rt_put(rt);
1298                         ret = NULL;
1299                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1300                            rt->u.dst.expires) {
1301                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1302                                                 rt->fl.oif);
1303 #if RT_CACHE_DEBUG >= 1
1304                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1305                                           "%u.%u.%u.%u/%02x dropped\n",
1306                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1307 #endif
1308                         rt_del(hash, rt);
1309                         ret = NULL;
1310                 }
1311         }
1312         return ret;
1313 }
1314
1315 /*
1316  * Algorithm:
1317  *      1. The first ip_rt_redirect_number redirects are sent
1318  *         with exponential backoff, then we stop sending them at all,
1319  *         assuming that the host ignores our redirects.
1320  *      2. If we did not see packets requiring redirects
1321  *         during ip_rt_redirect_silence, we assume that the host
1322  *         forgot redirected route and start to send redirects again.
1323  *
1324  * This algorithm is much cheaper and more intelligent than dumb load limiting
1325  * in icmp.c.
1326  *
1327  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1328  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1329  */
1330
1331 void ip_rt_send_redirect(struct sk_buff *skb)
1332 {
1333         struct rtable *rt = skb->rtable;
1334         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1335
1336         if (!in_dev)
1337                 return;
1338
1339         if (!IN_DEV_TX_REDIRECTS(in_dev))
1340                 goto out;
1341
1342         /* No redirected packets during ip_rt_redirect_silence;
1343          * reset the algorithm.
1344          */
1345         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1346                 rt->u.dst.rate_tokens = 0;
1347
1348         /* Too many ignored redirects; do not send anything
1349          * set u.dst.rate_last to the last seen redirected packet.
1350          */
1351         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1352                 rt->u.dst.rate_last = jiffies;
1353                 goto out;
1354         }
1355
1356         /* Check for load limit; set rate_last to the latest sent
1357          * redirect.
1358          */
1359         if (rt->u.dst.rate_tokens == 0 ||
1360             time_after(jiffies,
1361                        (rt->u.dst.rate_last +
1362                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1363                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1364                 rt->u.dst.rate_last = jiffies;
1365                 ++rt->u.dst.rate_tokens;
1366 #ifdef CONFIG_IP_ROUTE_VERBOSE
1367                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1368                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1369                     net_ratelimit())
1370                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1371                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1372                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1373                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1374 #endif
1375         }
1376 out:
1377         in_dev_put(in_dev);
1378 }
1379
1380 static int ip_error(struct sk_buff *skb)
1381 {
1382         struct rtable *rt = skb->rtable;
1383         unsigned long now;
1384         int code;
1385
1386         switch (rt->u.dst.error) {
1387                 case EINVAL:
1388                 default:
1389                         goto out;
1390                 case EHOSTUNREACH:
1391                         code = ICMP_HOST_UNREACH;
1392                         break;
1393                 case ENETUNREACH:
1394                         code = ICMP_NET_UNREACH;
1395                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1396                         break;
1397                 case EACCES:
1398                         code = ICMP_PKT_FILTERED;
1399                         break;
1400         }
1401
1402         now = jiffies;
1403         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1404         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1405                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1406         rt->u.dst.rate_last = now;
1407         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1408                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1409                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1410         }
1411
1412 out:    kfree_skb(skb);
1413         return 0;
1414 }
1415
1416 /*
1417  *      The last two values are not from the RFC but
1418  *      are needed for AMPRnet AX.25 paths.
1419  */
1420
1421 static const unsigned short mtu_plateau[] =
1422 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1423
1424 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1425 {
1426         int i;
1427
1428         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1429                 if (old_mtu > mtu_plateau[i])
1430                         return mtu_plateau[i];
1431         return 68;
1432 }
1433
1434 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1435                                  unsigned short new_mtu)
1436 {
1437         int i;
1438         unsigned short old_mtu = ntohs(iph->tot_len);
1439         struct rtable *rth;
1440         __be32  skeys[2] = { iph->saddr, 0, };
1441         __be32  daddr = iph->daddr;
1442         unsigned short est_mtu = 0;
1443
1444         if (ipv4_config.no_pmtu_disc)
1445                 return 0;
1446
1447         for (i = 0; i < 2; i++) {
1448                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1449
1450                 rcu_read_lock();
1451                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1452                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1453                         if (rth->fl.fl4_dst == daddr &&
1454                             rth->fl.fl4_src == skeys[i] &&
1455                             rth->rt_dst  == daddr &&
1456                             rth->rt_src  == iph->saddr &&
1457                             rth->fl.iif == 0 &&
1458                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1459                             rth->u.dst.dev->nd_net == net &&
1460                             rth->rt_genid == atomic_read(&rt_genid)) {
1461                                 unsigned short mtu = new_mtu;
1462
1463                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1464
1465                                         /* BSD 4.2 compatibility hack :-( */
1466                                         if (mtu == 0 &&
1467                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1468                                             old_mtu >= 68 + (iph->ihl << 2))
1469                                                 old_mtu -= iph->ihl << 2;
1470
1471                                         mtu = guess_mtu(old_mtu);
1472                                 }
1473                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1474                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1475                                                 dst_confirm(&rth->u.dst);
1476                                                 if (mtu < ip_rt_min_pmtu) {
1477                                                         mtu = ip_rt_min_pmtu;
1478                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1479                                                                 (1 << RTAX_MTU);
1480                                                 }
1481                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1482                                                 dst_set_expires(&rth->u.dst,
1483                                                         ip_rt_mtu_expires);
1484                                         }
1485                                         est_mtu = mtu;
1486                                 }
1487                         }
1488                 }
1489                 rcu_read_unlock();
1490         }
1491         return est_mtu ? : new_mtu;
1492 }
1493
1494 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1495 {
1496         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1497             !(dst_metric_locked(dst, RTAX_MTU))) {
1498                 if (mtu < ip_rt_min_pmtu) {
1499                         mtu = ip_rt_min_pmtu;
1500                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1501                 }
1502                 dst->metrics[RTAX_MTU-1] = mtu;
1503                 dst_set_expires(dst, ip_rt_mtu_expires);
1504                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1505         }
1506 }
1507
1508 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1509 {
1510         return NULL;
1511 }
1512
1513 static void ipv4_dst_destroy(struct dst_entry *dst)
1514 {
1515         struct rtable *rt = (struct rtable *) dst;
1516         struct inet_peer *peer = rt->peer;
1517         struct in_device *idev = rt->idev;
1518
1519         if (peer) {
1520                 rt->peer = NULL;
1521                 inet_putpeer(peer);
1522         }
1523
1524         if (idev) {
1525                 rt->idev = NULL;
1526                 in_dev_put(idev);
1527         }
1528 }
1529
1530 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1531                             int how)
1532 {
1533         struct rtable *rt = (struct rtable *) dst;
1534         struct in_device *idev = rt->idev;
1535         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1536                 struct in_device *loopback_idev =
1537                         in_dev_get(dev->nd_net->loopback_dev);
1538                 if (loopback_idev) {
1539                         rt->idev = loopback_idev;
1540                         in_dev_put(idev);
1541                 }
1542         }
1543 }
1544
1545 static void ipv4_link_failure(struct sk_buff *skb)
1546 {
1547         struct rtable *rt;
1548
1549         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1550
1551         rt = skb->rtable;
1552         if (rt)
1553                 dst_set_expires(&rt->u.dst, 0);
1554 }
1555
1556 static int ip_rt_bug(struct sk_buff *skb)
1557 {
1558         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1559                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1560                 skb->dev ? skb->dev->name : "?");
1561         kfree_skb(skb);
1562         return 0;
1563 }
1564
1565 /*
1566    We do not cache source address of outgoing interface,
1567    because it is used only by IP RR, TS and SRR options,
1568    so that it out of fast path.
1569
1570    BTW remember: "addr" is allowed to be not aligned
1571    in IP options!
1572  */
1573
1574 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1575 {
1576         __be32 src;
1577         struct fib_result res;
1578
1579         if (rt->fl.iif == 0)
1580                 src = rt->rt_src;
1581         else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1582                 src = FIB_RES_PREFSRC(res);
1583                 fib_res_put(&res);
1584         } else
1585                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1586                                         RT_SCOPE_UNIVERSE);
1587         memcpy(addr, &src, 4);
1588 }
1589
1590 #ifdef CONFIG_NET_CLS_ROUTE
1591 static void set_class_tag(struct rtable *rt, u32 tag)
1592 {
1593         if (!(rt->u.dst.tclassid & 0xFFFF))
1594                 rt->u.dst.tclassid |= tag & 0xFFFF;
1595         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1596                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1597 }
1598 #endif
1599
1600 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1601 {
1602         struct fib_info *fi = res->fi;
1603
1604         if (fi) {
1605                 if (FIB_RES_GW(*res) &&
1606                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1607                         rt->rt_gateway = FIB_RES_GW(*res);
1608                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1609                        sizeof(rt->u.dst.metrics));
1610                 if (fi->fib_mtu == 0) {
1611                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1612                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1613                             rt->rt_gateway != rt->rt_dst &&
1614                             rt->u.dst.dev->mtu > 576)
1615                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1616                 }
1617 #ifdef CONFIG_NET_CLS_ROUTE
1618                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1619 #endif
1620         } else
1621                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1622
1623         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1624                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1625         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1626                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1627         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1628                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1629                                        ip_rt_min_advmss);
1630         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1631                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1632
1633 #ifdef CONFIG_NET_CLS_ROUTE
1634 #ifdef CONFIG_IP_MULTIPLE_TABLES
1635         set_class_tag(rt, fib_rules_tclass(res));
1636 #endif
1637         set_class_tag(rt, itag);
1638 #endif
1639         rt->rt_type = res->type;
1640 }
1641
1642 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1643                                 u8 tos, struct net_device *dev, int our)
1644 {
1645         unsigned hash;
1646         struct rtable *rth;
1647         __be32 spec_dst;
1648         struct in_device *in_dev = in_dev_get(dev);
1649         u32 itag = 0;
1650
1651         /* Primary sanity checks. */
1652
1653         if (in_dev == NULL)
1654                 return -EINVAL;
1655
1656         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1657             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1658                 goto e_inval;
1659
1660         if (ipv4_is_zeronet(saddr)) {
1661                 if (!ipv4_is_local_multicast(daddr))
1662                         goto e_inval;
1663                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1664         } else if (fib_validate_source(saddr, 0, tos, 0,
1665                                         dev, &spec_dst, &itag) < 0)
1666                 goto e_inval;
1667
1668         rth = dst_alloc(&ipv4_dst_ops);
1669         if (!rth)
1670                 goto e_nobufs;
1671
1672         rth->u.dst.output= ip_rt_bug;
1673
1674         atomic_set(&rth->u.dst.__refcnt, 1);
1675         rth->u.dst.flags= DST_HOST;
1676         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1677                 rth->u.dst.flags |= DST_NOPOLICY;
1678         rth->fl.fl4_dst = daddr;
1679         rth->rt_dst     = daddr;
1680         rth->fl.fl4_tos = tos;
1681         rth->fl.mark    = skb->mark;
1682         rth->fl.fl4_src = saddr;
1683         rth->rt_src     = saddr;
1684 #ifdef CONFIG_NET_CLS_ROUTE
1685         rth->u.dst.tclassid = itag;
1686 #endif
1687         rth->rt_iif     =
1688         rth->fl.iif     = dev->ifindex;
1689         rth->u.dst.dev  = init_net.loopback_dev;
1690         dev_hold(rth->u.dst.dev);
1691         rth->idev       = in_dev_get(rth->u.dst.dev);
1692         rth->fl.oif     = 0;
1693         rth->rt_gateway = daddr;
1694         rth->rt_spec_dst= spec_dst;
1695         rth->rt_genid   = atomic_read(&rt_genid);
1696         rth->rt_flags   = RTCF_MULTICAST;
1697         rth->rt_type    = RTN_MULTICAST;
1698         if (our) {
1699                 rth->u.dst.input= ip_local_deliver;
1700                 rth->rt_flags |= RTCF_LOCAL;
1701         }
1702
1703 #ifdef CONFIG_IP_MROUTE
1704         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1705                 rth->u.dst.input = ip_mr_input;
1706 #endif
1707         RT_CACHE_STAT_INC(in_slow_mc);
1708
1709         in_dev_put(in_dev);
1710         hash = rt_hash(daddr, saddr, dev->ifindex);
1711         return rt_intern_hash(hash, rth, &skb->rtable);
1712
1713 e_nobufs:
1714         in_dev_put(in_dev);
1715         return -ENOBUFS;
1716
1717 e_inval:
1718         in_dev_put(in_dev);
1719         return -EINVAL;
1720 }
1721
1722
1723 static void ip_handle_martian_source(struct net_device *dev,
1724                                      struct in_device *in_dev,
1725                                      struct sk_buff *skb,
1726                                      __be32 daddr,
1727                                      __be32 saddr)
1728 {
1729         RT_CACHE_STAT_INC(in_martian_src);
1730 #ifdef CONFIG_IP_ROUTE_VERBOSE
1731         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1732                 /*
1733                  *      RFC1812 recommendation, if source is martian,
1734                  *      the only hint is MAC header.
1735                  */
1736                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1737                         "%u.%u.%u.%u, on dev %s\n",
1738                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1739                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1740                         int i;
1741                         const unsigned char *p = skb_mac_header(skb);
1742                         printk(KERN_WARNING "ll header: ");
1743                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1744                                 printk("%02x", *p);
1745                                 if (i < (dev->hard_header_len - 1))
1746                                         printk(":");
1747                         }
1748                         printk("\n");
1749                 }
1750         }
1751 #endif
1752 }
1753
1754 static inline int __mkroute_input(struct sk_buff *skb,
1755                                   struct fib_result* res,
1756                                   struct in_device *in_dev,
1757                                   __be32 daddr, __be32 saddr, u32 tos,
1758                                   struct rtable **result)
1759 {
1760
1761         struct rtable *rth;
1762         int err;
1763         struct in_device *out_dev;
1764         unsigned flags = 0;
1765         __be32 spec_dst;
1766         u32 itag;
1767
1768         /* get a working reference to the output device */
1769         out_dev = in_dev_get(FIB_RES_DEV(*res));
1770         if (out_dev == NULL) {
1771                 if (net_ratelimit())
1772                         printk(KERN_CRIT "Bug in ip_route_input" \
1773                                "_slow(). Please, report\n");
1774                 return -EINVAL;
1775         }
1776
1777
1778         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1779                                   in_dev->dev, &spec_dst, &itag);
1780         if (err < 0) {
1781                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1782                                          saddr);
1783
1784                 err = -EINVAL;
1785                 goto cleanup;
1786         }
1787
1788         if (err)
1789                 flags |= RTCF_DIRECTSRC;
1790
1791         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1792             (IN_DEV_SHARED_MEDIA(out_dev) ||
1793              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1794                 flags |= RTCF_DOREDIRECT;
1795
1796         if (skb->protocol != htons(ETH_P_IP)) {
1797                 /* Not IP (i.e. ARP). Do not create route, if it is
1798                  * invalid for proxy arp. DNAT routes are always valid.
1799                  */
1800                 if (out_dev == in_dev) {
1801                         err = -EINVAL;
1802                         goto cleanup;
1803                 }
1804         }
1805
1806
1807         rth = dst_alloc(&ipv4_dst_ops);
1808         if (!rth) {
1809                 err = -ENOBUFS;
1810                 goto cleanup;
1811         }
1812
1813         atomic_set(&rth->u.dst.__refcnt, 1);
1814         rth->u.dst.flags= DST_HOST;
1815         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1816                 rth->u.dst.flags |= DST_NOPOLICY;
1817         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1818                 rth->u.dst.flags |= DST_NOXFRM;
1819         rth->fl.fl4_dst = daddr;
1820         rth->rt_dst     = daddr;
1821         rth->fl.fl4_tos = tos;
1822         rth->fl.mark    = skb->mark;
1823         rth->fl.fl4_src = saddr;
1824         rth->rt_src     = saddr;
1825         rth->rt_gateway = daddr;
1826         rth->rt_iif     =
1827                 rth->fl.iif     = in_dev->dev->ifindex;
1828         rth->u.dst.dev  = (out_dev)->dev;
1829         dev_hold(rth->u.dst.dev);
1830         rth->idev       = in_dev_get(rth->u.dst.dev);
1831         rth->fl.oif     = 0;
1832         rth->rt_spec_dst= spec_dst;
1833
1834         rth->u.dst.input = ip_forward;
1835         rth->u.dst.output = ip_output;
1836         rth->rt_genid = atomic_read(&rt_genid);
1837
1838         rt_set_nexthop(rth, res, itag);
1839
1840         rth->rt_flags = flags;
1841
1842         *result = rth;
1843         err = 0;
1844  cleanup:
1845         /* release the working reference to the output device */
1846         in_dev_put(out_dev);
1847         return err;
1848 }
1849
1850 static inline int ip_mkroute_input(struct sk_buff *skb,
1851                                    struct fib_result* res,
1852                                    const struct flowi *fl,
1853                                    struct in_device *in_dev,
1854                                    __be32 daddr, __be32 saddr, u32 tos)
1855 {
1856         struct rtable* rth = NULL;
1857         int err;
1858         unsigned hash;
1859
1860 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1861         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1862                 fib_select_multipath(fl, res);
1863 #endif
1864
1865         /* create a routing cache entry */
1866         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1867         if (err)
1868                 return err;
1869
1870         /* put it into the cache */
1871         hash = rt_hash(daddr, saddr, fl->iif);
1872         return rt_intern_hash(hash, rth, &skb->rtable);
1873 }
1874
1875 /*
1876  *      NOTE. We drop all the packets that has local source
1877  *      addresses, because every properly looped back packet
1878  *      must have correct destination already attached by output routine.
1879  *
1880  *      Such approach solves two big problems:
1881  *      1. Not simplex devices are handled properly.
1882  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1883  */
1884
1885 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1886                                u8 tos, struct net_device *dev)
1887 {
1888         struct fib_result res;
1889         struct in_device *in_dev = in_dev_get(dev);
1890         struct flowi fl = { .nl_u = { .ip4_u =
1891                                       { .daddr = daddr,
1892                                         .saddr = saddr,
1893                                         .tos = tos,
1894                                         .scope = RT_SCOPE_UNIVERSE,
1895                                       } },
1896                             .mark = skb->mark,
1897                             .iif = dev->ifindex };
1898         unsigned        flags = 0;
1899         u32             itag = 0;
1900         struct rtable * rth;
1901         unsigned        hash;
1902         __be32          spec_dst;
1903         int             err = -EINVAL;
1904         int             free_res = 0;
1905         struct net    * net = dev->nd_net;
1906
1907         /* IP on this device is disabled. */
1908
1909         if (!in_dev)
1910                 goto out;
1911
1912         /* Check for the most weird martians, which can be not detected
1913            by fib_lookup.
1914          */
1915
1916         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1917             ipv4_is_loopback(saddr))
1918                 goto martian_source;
1919
1920         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1921                 goto brd_input;
1922
1923         /* Accept zero addresses only to limited broadcast;
1924          * I even do not know to fix it or not. Waiting for complains :-)
1925          */
1926         if (ipv4_is_zeronet(saddr))
1927                 goto martian_source;
1928
1929         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1930             ipv4_is_loopback(daddr))
1931                 goto martian_destination;
1932
1933         /*
1934          *      Now we are ready to route packet.
1935          */
1936         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1937                 if (!IN_DEV_FORWARD(in_dev))
1938                         goto e_hostunreach;
1939                 goto no_route;
1940         }
1941         free_res = 1;
1942
1943         RT_CACHE_STAT_INC(in_slow_tot);
1944
1945         if (res.type == RTN_BROADCAST)
1946                 goto brd_input;
1947
1948         if (res.type == RTN_LOCAL) {
1949                 int result;
1950                 result = fib_validate_source(saddr, daddr, tos,
1951                                              net->loopback_dev->ifindex,
1952                                              dev, &spec_dst, &itag);
1953                 if (result < 0)
1954                         goto martian_source;
1955                 if (result)
1956                         flags |= RTCF_DIRECTSRC;
1957                 spec_dst = daddr;
1958                 goto local_input;
1959         }
1960
1961         if (!IN_DEV_FORWARD(in_dev))
1962                 goto e_hostunreach;
1963         if (res.type != RTN_UNICAST)
1964                 goto martian_destination;
1965
1966         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1967 done:
1968         in_dev_put(in_dev);
1969         if (free_res)
1970                 fib_res_put(&res);
1971 out:    return err;
1972
1973 brd_input:
1974         if (skb->protocol != htons(ETH_P_IP))
1975                 goto e_inval;
1976
1977         if (ipv4_is_zeronet(saddr))
1978                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1979         else {
1980                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1981                                           &itag);
1982                 if (err < 0)
1983                         goto martian_source;
1984                 if (err)
1985                         flags |= RTCF_DIRECTSRC;
1986         }
1987         flags |= RTCF_BROADCAST;
1988         res.type = RTN_BROADCAST;
1989         RT_CACHE_STAT_INC(in_brd);
1990
1991 local_input:
1992         rth = dst_alloc(&ipv4_dst_ops);
1993         if (!rth)
1994                 goto e_nobufs;
1995
1996         rth->u.dst.output= ip_rt_bug;
1997         rth->rt_genid = atomic_read(&rt_genid);
1998
1999         atomic_set(&rth->u.dst.__refcnt, 1);
2000         rth->u.dst.flags= DST_HOST;
2001         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2002                 rth->u.dst.flags |= DST_NOPOLICY;
2003         rth->fl.fl4_dst = daddr;
2004         rth->rt_dst     = daddr;
2005         rth->fl.fl4_tos = tos;
2006         rth->fl.mark    = skb->mark;
2007         rth->fl.fl4_src = saddr;
2008         rth->rt_src     = saddr;
2009 #ifdef CONFIG_NET_CLS_ROUTE
2010         rth->u.dst.tclassid = itag;
2011 #endif
2012         rth->rt_iif     =
2013         rth->fl.iif     = dev->ifindex;
2014         rth->u.dst.dev  = net->loopback_dev;
2015         dev_hold(rth->u.dst.dev);
2016         rth->idev       = in_dev_get(rth->u.dst.dev);
2017         rth->rt_gateway = daddr;
2018         rth->rt_spec_dst= spec_dst;
2019         rth->u.dst.input= ip_local_deliver;
2020         rth->rt_flags   = flags|RTCF_LOCAL;
2021         if (res.type == RTN_UNREACHABLE) {
2022                 rth->u.dst.input= ip_error;
2023                 rth->u.dst.error= -err;
2024                 rth->rt_flags   &= ~RTCF_LOCAL;
2025         }
2026         rth->rt_type    = res.type;
2027         hash = rt_hash(daddr, saddr, fl.iif);
2028         err = rt_intern_hash(hash, rth, &skb->rtable);
2029         goto done;
2030
2031 no_route:
2032         RT_CACHE_STAT_INC(in_no_route);
2033         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2034         res.type = RTN_UNREACHABLE;
2035         if (err == -ESRCH)
2036                 err = -ENETUNREACH;
2037         goto local_input;
2038
2039         /*
2040          *      Do not cache martian addresses: they should be logged (RFC1812)
2041          */
2042 martian_destination:
2043         RT_CACHE_STAT_INC(in_martian_dst);
2044 #ifdef CONFIG_IP_ROUTE_VERBOSE
2045         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2046                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2047                         "%u.%u.%u.%u, dev %s\n",
2048                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2049 #endif
2050
2051 e_hostunreach:
2052         err = -EHOSTUNREACH;
2053         goto done;
2054
2055 e_inval:
2056         err = -EINVAL;
2057         goto done;
2058
2059 e_nobufs:
2060         err = -ENOBUFS;
2061         goto done;
2062
2063 martian_source:
2064         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2065         goto e_inval;
2066 }
2067
2068 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2069                    u8 tos, struct net_device *dev)
2070 {
2071         struct rtable * rth;
2072         unsigned        hash;
2073         int iif = dev->ifindex;
2074         struct net *net;
2075
2076         net = dev->nd_net;
2077         tos &= IPTOS_RT_MASK;
2078         hash = rt_hash(daddr, saddr, iif);
2079
2080         rcu_read_lock();
2081         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2082              rth = rcu_dereference(rth->u.dst.rt_next)) {
2083                 if (rth->fl.fl4_dst == daddr &&
2084                     rth->fl.fl4_src == saddr &&
2085                     rth->fl.iif == iif &&
2086                     rth->fl.oif == 0 &&
2087                     rth->fl.mark == skb->mark &&
2088                     rth->fl.fl4_tos == tos &&
2089                     rth->u.dst.dev->nd_net == net &&
2090                     rth->rt_genid == atomic_read(&rt_genid)) {
2091                         dst_use(&rth->u.dst, jiffies);
2092                         RT_CACHE_STAT_INC(in_hit);
2093                         rcu_read_unlock();
2094                         skb->rtable = rth;
2095                         return 0;
2096                 }
2097                 RT_CACHE_STAT_INC(in_hlist_search);
2098         }
2099         rcu_read_unlock();
2100
2101         /* Multicast recognition logic is moved from route cache to here.
2102            The problem was that too many Ethernet cards have broken/missing
2103            hardware multicast filters :-( As result the host on multicasting
2104            network acquires a lot of useless route cache entries, sort of
2105            SDR messages from all the world. Now we try to get rid of them.
2106            Really, provided software IP multicast filter is organized
2107            reasonably (at least, hashed), it does not result in a slowdown
2108            comparing with route cache reject entries.
2109            Note, that multicast routers are not affected, because
2110            route cache entry is created eventually.
2111          */
2112         if (ipv4_is_multicast(daddr)) {
2113                 struct in_device *in_dev;
2114
2115                 rcu_read_lock();
2116                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2117                         int our = ip_check_mc(in_dev, daddr, saddr,
2118                                 ip_hdr(skb)->protocol);
2119                         if (our
2120 #ifdef CONFIG_IP_MROUTE
2121                             || (!ipv4_is_local_multicast(daddr) &&
2122                                 IN_DEV_MFORWARD(in_dev))
2123 #endif
2124                             ) {
2125                                 rcu_read_unlock();
2126                                 return ip_route_input_mc(skb, daddr, saddr,
2127                                                          tos, dev, our);
2128                         }
2129                 }
2130                 rcu_read_unlock();
2131                 return -EINVAL;
2132         }
2133         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2134 }
2135
2136 static inline int __mkroute_output(struct rtable **result,
2137                                    struct fib_result* res,
2138                                    const struct flowi *fl,
2139                                    const struct flowi *oldflp,
2140                                    struct net_device *dev_out,
2141                                    unsigned flags)
2142 {
2143         struct rtable *rth;
2144         struct in_device *in_dev;
2145         u32 tos = RT_FL_TOS(oldflp);
2146         int err = 0;
2147
2148         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2149                 return -EINVAL;
2150
2151         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2152                 res->type = RTN_BROADCAST;
2153         else if (ipv4_is_multicast(fl->fl4_dst))
2154                 res->type = RTN_MULTICAST;
2155         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2156                 return -EINVAL;
2157
2158         if (dev_out->flags & IFF_LOOPBACK)
2159                 flags |= RTCF_LOCAL;
2160
2161         /* get work reference to inet device */
2162         in_dev = in_dev_get(dev_out);
2163         if (!in_dev)
2164                 return -EINVAL;
2165
2166         if (res->type == RTN_BROADCAST) {
2167                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2168                 if (res->fi) {
2169                         fib_info_put(res->fi);
2170                         res->fi = NULL;
2171                 }
2172         } else if (res->type == RTN_MULTICAST) {
2173                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2174                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2175                                  oldflp->proto))
2176                         flags &= ~RTCF_LOCAL;
2177                 /* If multicast route do not exist use
2178                    default one, but do not gateway in this case.
2179                    Yes, it is hack.
2180                  */
2181                 if (res->fi && res->prefixlen < 4) {
2182                         fib_info_put(res->fi);
2183                         res->fi = NULL;
2184                 }
2185         }
2186
2187
2188         rth = dst_alloc(&ipv4_dst_ops);
2189         if (!rth) {
2190                 err = -ENOBUFS;
2191                 goto cleanup;
2192         }
2193
2194         atomic_set(&rth->u.dst.__refcnt, 1);
2195         rth->u.dst.flags= DST_HOST;
2196         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2197                 rth->u.dst.flags |= DST_NOXFRM;
2198         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2199                 rth->u.dst.flags |= DST_NOPOLICY;
2200
2201         rth->fl.fl4_dst = oldflp->fl4_dst;
2202         rth->fl.fl4_tos = tos;
2203         rth->fl.fl4_src = oldflp->fl4_src;
2204         rth->fl.oif     = oldflp->oif;
2205         rth->fl.mark    = oldflp->mark;
2206         rth->rt_dst     = fl->fl4_dst;
2207         rth->rt_src     = fl->fl4_src;
2208         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2209         /* get references to the devices that are to be hold by the routing
2210            cache entry */
2211         rth->u.dst.dev  = dev_out;
2212         dev_hold(dev_out);
2213         rth->idev       = in_dev_get(dev_out);
2214         rth->rt_gateway = fl->fl4_dst;
2215         rth->rt_spec_dst= fl->fl4_src;
2216
2217         rth->u.dst.output=ip_output;
2218         rth->rt_genid = atomic_read(&rt_genid);
2219
2220         RT_CACHE_STAT_INC(out_slow_tot);
2221
2222         if (flags & RTCF_LOCAL) {
2223                 rth->u.dst.input = ip_local_deliver;
2224                 rth->rt_spec_dst = fl->fl4_dst;
2225         }
2226         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2227                 rth->rt_spec_dst = fl->fl4_src;
2228                 if (flags & RTCF_LOCAL &&
2229                     !(dev_out->flags & IFF_LOOPBACK)) {
2230                         rth->u.dst.output = ip_mc_output;
2231                         RT_CACHE_STAT_INC(out_slow_mc);
2232                 }
2233 #ifdef CONFIG_IP_MROUTE
2234                 if (res->type == RTN_MULTICAST) {
2235                         if (IN_DEV_MFORWARD(in_dev) &&
2236                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2237                                 rth->u.dst.input = ip_mr_input;
2238                                 rth->u.dst.output = ip_mc_output;
2239                         }
2240                 }
2241 #endif
2242         }
2243
2244         rt_set_nexthop(rth, res, 0);
2245
2246         rth->rt_flags = flags;
2247
2248         *result = rth;
2249  cleanup:
2250         /* release work reference to inet device */
2251         in_dev_put(in_dev);
2252
2253         return err;
2254 }
2255
2256 static inline int ip_mkroute_output(struct rtable **rp,
2257                                     struct fib_result* res,
2258                                     const struct flowi *fl,
2259                                     const struct flowi *oldflp,
2260                                     struct net_device *dev_out,
2261                                     unsigned flags)
2262 {
2263         struct rtable *rth = NULL;
2264         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2265         unsigned hash;
2266         if (err == 0) {
2267                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2268                 err = rt_intern_hash(hash, rth, rp);
2269         }
2270
2271         return err;
2272 }
2273
2274 /*
2275  * Major route resolver routine.
2276  */
2277
2278 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2279                                 const struct flowi *oldflp)
2280 {
2281         u32 tos = RT_FL_TOS(oldflp);
2282         struct flowi fl = { .nl_u = { .ip4_u =
2283                                       { .daddr = oldflp->fl4_dst,
2284                                         .saddr = oldflp->fl4_src,
2285                                         .tos = tos & IPTOS_RT_MASK,
2286                                         .scope = ((tos & RTO_ONLINK) ?
2287                                                   RT_SCOPE_LINK :
2288                                                   RT_SCOPE_UNIVERSE),
2289                                       } },
2290                             .mark = oldflp->mark,
2291                             .iif = net->loopback_dev->ifindex,
2292                             .oif = oldflp->oif };
2293         struct fib_result res;
2294         unsigned flags = 0;
2295         struct net_device *dev_out = NULL;
2296         int free_res = 0;
2297         int err;
2298
2299
2300         res.fi          = NULL;
2301 #ifdef CONFIG_IP_MULTIPLE_TABLES
2302         res.r           = NULL;
2303 #endif
2304
2305         if (oldflp->fl4_src) {
2306                 err = -EINVAL;
2307                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2308                     ipv4_is_lbcast(oldflp->fl4_src) ||
2309                     ipv4_is_zeronet(oldflp->fl4_src))
2310                         goto out;
2311
2312                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2313                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2314                 if (dev_out == NULL)
2315                         goto out;
2316
2317                 /* I removed check for oif == dev_out->oif here.
2318                    It was wrong for two reasons:
2319                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2320                       is assigned to multiple interfaces.
2321                    2. Moreover, we are allowed to send packets with saddr
2322                       of another iface. --ANK
2323                  */
2324
2325                 if (oldflp->oif == 0
2326                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2327                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2328                         /* Special hack: user can direct multicasts
2329                            and limited broadcast via necessary interface
2330                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2331                            This hack is not just for fun, it allows
2332                            vic,vat and friends to work.
2333                            They bind socket to loopback, set ttl to zero
2334                            and expect that it will work.
2335                            From the viewpoint of routing cache they are broken,
2336                            because we are not allowed to build multicast path
2337                            with loopback source addr (look, routing cache
2338                            cannot know, that ttl is zero, so that packet
2339                            will not leave this host and route is valid).
2340                            Luckily, this hack is good workaround.
2341                          */
2342
2343                         fl.oif = dev_out->ifindex;
2344                         goto make_route;
2345                 }
2346                 if (dev_out)
2347                         dev_put(dev_out);
2348                 dev_out = NULL;
2349         }
2350
2351
2352         if (oldflp->oif) {
2353                 dev_out = dev_get_by_index(net, oldflp->oif);
2354                 err = -ENODEV;
2355                 if (dev_out == NULL)
2356                         goto out;
2357
2358                 /* RACE: Check return value of inet_select_addr instead. */
2359                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2360                         dev_put(dev_out);
2361                         goto out;       /* Wrong error code */
2362                 }
2363
2364                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2365                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2366                         if (!fl.fl4_src)
2367                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2368                                                               RT_SCOPE_LINK);
2369                         goto make_route;
2370                 }
2371                 if (!fl.fl4_src) {
2372                         if (ipv4_is_multicast(oldflp->fl4_dst))
2373                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2374                                                               fl.fl4_scope);
2375                         else if (!oldflp->fl4_dst)
2376                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2377                                                               RT_SCOPE_HOST);
2378                 }
2379         }
2380
2381         if (!fl.fl4_dst) {
2382                 fl.fl4_dst = fl.fl4_src;
2383                 if (!fl.fl4_dst)
2384                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2385                 if (dev_out)
2386                         dev_put(dev_out);
2387                 dev_out = net->loopback_dev;
2388                 dev_hold(dev_out);
2389                 fl.oif = net->loopback_dev->ifindex;
2390                 res.type = RTN_LOCAL;
2391                 flags |= RTCF_LOCAL;
2392                 goto make_route;
2393         }
2394
2395         if (fib_lookup(net, &fl, &res)) {
2396                 res.fi = NULL;
2397                 if (oldflp->oif) {
2398                         /* Apparently, routing tables are wrong. Assume,
2399                            that the destination is on link.
2400
2401                            WHY? DW.
2402                            Because we are allowed to send to iface
2403                            even if it has NO routes and NO assigned
2404                            addresses. When oif is specified, routing
2405                            tables are looked up with only one purpose:
2406                            to catch if destination is gatewayed, rather than
2407                            direct. Moreover, if MSG_DONTROUTE is set,
2408                            we send packet, ignoring both routing tables
2409                            and ifaddr state. --ANK
2410
2411
2412                            We could make it even if oif is unknown,
2413                            likely IPv6, but we do not.
2414                          */
2415
2416                         if (fl.fl4_src == 0)
2417                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2418                                                               RT_SCOPE_LINK);
2419                         res.type = RTN_UNICAST;
2420                         goto make_route;
2421                 }
2422                 if (dev_out)
2423                         dev_put(dev_out);
2424                 err = -ENETUNREACH;
2425                 goto out;
2426         }
2427         free_res = 1;
2428
2429         if (res.type == RTN_LOCAL) {
2430                 if (!fl.fl4_src)
2431                         fl.fl4_src = fl.fl4_dst;
2432                 if (dev_out)
2433                         dev_put(dev_out);
2434                 dev_out = net->loopback_dev;
2435                 dev_hold(dev_out);
2436                 fl.oif = dev_out->ifindex;
2437                 if (res.fi)
2438                         fib_info_put(res.fi);
2439                 res.fi = NULL;
2440                 flags |= RTCF_LOCAL;
2441                 goto make_route;
2442         }
2443
2444 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2445         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2446                 fib_select_multipath(&fl, &res);
2447         else
2448 #endif
2449         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2450                 fib_select_default(net, &fl, &res);
2451
2452         if (!fl.fl4_src)
2453                 fl.fl4_src = FIB_RES_PREFSRC(res);
2454
2455         if (dev_out)
2456                 dev_put(dev_out);
2457         dev_out = FIB_RES_DEV(res);
2458         dev_hold(dev_out);
2459         fl.oif = dev_out->ifindex;
2460
2461
2462 make_route:
2463         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2464
2465
2466         if (free_res)
2467                 fib_res_put(&res);
2468         if (dev_out)
2469                 dev_put(dev_out);
2470 out:    return err;
2471 }
2472
2473 int __ip_route_output_key(struct net *net, struct rtable **rp,
2474                           const struct flowi *flp)
2475 {
2476         unsigned hash;
2477         struct rtable *rth;
2478
2479         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2480
2481         rcu_read_lock_bh();
2482         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2483                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2484                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2485                     rth->fl.fl4_src == flp->fl4_src &&
2486                     rth->fl.iif == 0 &&
2487                     rth->fl.oif == flp->oif &&
2488                     rth->fl.mark == flp->mark &&
2489                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2490                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2491                     rth->u.dst.dev->nd_net == net &&
2492                     rth->rt_genid == atomic_read(&rt_genid)) {
2493                         dst_use(&rth->u.dst, jiffies);
2494                         RT_CACHE_STAT_INC(out_hit);
2495                         rcu_read_unlock_bh();
2496                         *rp = rth;
2497                         return 0;
2498                 }
2499                 RT_CACHE_STAT_INC(out_hlist_search);
2500         }
2501         rcu_read_unlock_bh();
2502
2503         return ip_route_output_slow(net, rp, flp);
2504 }
2505
2506 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2507
2508 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2509 {
2510 }
2511
2512 static struct dst_ops ipv4_dst_blackhole_ops = {
2513         .family                 =       AF_INET,
2514         .protocol               =       __constant_htons(ETH_P_IP),
2515         .destroy                =       ipv4_dst_destroy,
2516         .check                  =       ipv4_dst_check,
2517         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2518         .entry_size             =       sizeof(struct rtable),
2519         .entries                =       ATOMIC_INIT(0),
2520 };
2521
2522
2523 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp)
2524 {
2525         struct rtable *ort = *rp;
2526         struct rtable *rt = (struct rtable *)
2527                 dst_alloc(&ipv4_dst_blackhole_ops);
2528
2529         if (rt) {
2530                 struct dst_entry *new = &rt->u.dst;
2531
2532                 atomic_set(&new->__refcnt, 1);
2533                 new->__use = 1;
2534                 new->input = dst_discard;
2535                 new->output = dst_discard;
2536                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2537
2538                 new->dev = ort->u.dst.dev;
2539                 if (new->dev)
2540                         dev_hold(new->dev);
2541
2542                 rt->fl = ort->fl;
2543
2544                 rt->idev = ort->idev;
2545                 if (rt->idev)
2546                         in_dev_hold(rt->idev);
2547                 rt->rt_genid = atomic_read(&rt_genid);
2548                 rt->rt_flags = ort->rt_flags;
2549                 rt->rt_type = ort->rt_type;
2550                 rt->rt_dst = ort->rt_dst;
2551                 rt->rt_src = ort->rt_src;
2552                 rt->rt_iif = ort->rt_iif;
2553                 rt->rt_gateway = ort->rt_gateway;
2554                 rt->rt_spec_dst = ort->rt_spec_dst;
2555                 rt->peer = ort->peer;
2556                 if (rt->peer)
2557                         atomic_inc(&rt->peer->refcnt);
2558
2559                 dst_free(new);
2560         }
2561
2562         dst_release(&(*rp)->u.dst);
2563         *rp = rt;
2564         return (rt ? 0 : -ENOMEM);
2565 }
2566
2567 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2568                          struct sock *sk, int flags)
2569 {
2570         int err;
2571
2572         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2573                 return err;
2574
2575         if (flp->proto) {
2576                 if (!flp->fl4_src)
2577                         flp->fl4_src = (*rp)->rt_src;
2578                 if (!flp->fl4_dst)
2579                         flp->fl4_dst = (*rp)->rt_dst;
2580                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2581                                     flags ? XFRM_LOOKUP_WAIT : 0);
2582                 if (err == -EREMOTE)
2583                         err = ipv4_dst_blackhole(rp, flp);
2584
2585                 return err;
2586         }
2587
2588         return 0;
2589 }
2590
2591 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2592
2593 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2594 {
2595         return ip_route_output_flow(net, rp, flp, NULL, 0);
2596 }
2597
2598 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2599                         int nowait, unsigned int flags)
2600 {
2601         struct rtable *rt = skb->rtable;
2602         struct rtmsg *r;
2603         struct nlmsghdr *nlh;
2604         long expires;
2605         u32 id = 0, ts = 0, tsage = 0, error;
2606
2607         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2608         if (nlh == NULL)
2609                 return -EMSGSIZE;
2610
2611         r = nlmsg_data(nlh);
2612         r->rtm_family    = AF_INET;
2613         r->rtm_dst_len  = 32;
2614         r->rtm_src_len  = 0;
2615         r->rtm_tos      = rt->fl.fl4_tos;
2616         r->rtm_table    = RT_TABLE_MAIN;
2617         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2618         r->rtm_type     = rt->rt_type;
2619         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2620         r->rtm_protocol = RTPROT_UNSPEC;
2621         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2622         if (rt->rt_flags & RTCF_NOTIFY)
2623                 r->rtm_flags |= RTM_F_NOTIFY;
2624
2625         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2626
2627         if (rt->fl.fl4_src) {
2628                 r->rtm_src_len = 32;
2629                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2630         }
2631         if (rt->u.dst.dev)
2632                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2633 #ifdef CONFIG_NET_CLS_ROUTE
2634         if (rt->u.dst.tclassid)
2635                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2636 #endif
2637         if (rt->fl.iif)
2638                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2639         else if (rt->rt_src != rt->fl.fl4_src)
2640                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2641
2642         if (rt->rt_dst != rt->rt_gateway)
2643                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2644
2645         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2646                 goto nla_put_failure;
2647
2648         error = rt->u.dst.error;
2649         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2650         if (rt->peer) {
2651                 id = rt->peer->ip_id_count;
2652                 if (rt->peer->tcp_ts_stamp) {
2653                         ts = rt->peer->tcp_ts;
2654                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2655                 }
2656         }
2657
2658         if (rt->fl.iif) {
2659 #ifdef CONFIG_IP_MROUTE
2660                 __be32 dst = rt->rt_dst;
2661
2662                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2663                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2664                         int err = ipmr_get_route(skb, r, nowait);
2665                         if (err <= 0) {
2666                                 if (!nowait) {
2667                                         if (err == 0)
2668                                                 return 0;
2669                                         goto nla_put_failure;
2670                                 } else {
2671                                         if (err == -EMSGSIZE)
2672                                                 goto nla_put_failure;
2673                                         error = err;
2674                                 }
2675                         }
2676                 } else
2677 #endif
2678                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2679         }
2680
2681         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2682                                expires, error) < 0)
2683                 goto nla_put_failure;
2684
2685         return nlmsg_end(skb, nlh);
2686
2687 nla_put_failure:
2688         nlmsg_cancel(skb, nlh);
2689         return -EMSGSIZE;
2690 }
2691
2692 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2693 {
2694         struct net *net = in_skb->sk->sk_net;
2695         struct rtmsg *rtm;
2696         struct nlattr *tb[RTA_MAX+1];
2697         struct rtable *rt = NULL;
2698         __be32 dst = 0;
2699         __be32 src = 0;
2700         u32 iif;
2701         int err;
2702         struct sk_buff *skb;
2703
2704         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2705         if (err < 0)
2706                 goto errout;
2707
2708         rtm = nlmsg_data(nlh);
2709
2710         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2711         if (skb == NULL) {
2712                 err = -ENOBUFS;
2713                 goto errout;
2714         }
2715
2716         /* Reserve room for dummy headers, this skb can pass
2717            through good chunk of routing engine.
2718          */
2719         skb_reset_mac_header(skb);
2720         skb_reset_network_header(skb);
2721
2722         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2723         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2724         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2725
2726         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2727         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2728         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2729
2730         if (iif) {
2731                 struct net_device *dev;
2732
2733                 dev = __dev_get_by_index(net, iif);
2734                 if (dev == NULL) {
2735                         err = -ENODEV;
2736                         goto errout_free;
2737                 }
2738
2739                 skb->protocol   = htons(ETH_P_IP);
2740                 skb->dev        = dev;
2741                 local_bh_disable();
2742                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2743                 local_bh_enable();
2744
2745                 rt = skb->rtable;
2746                 if (err == 0 && rt->u.dst.error)
2747                         err = -rt->u.dst.error;
2748         } else {
2749                 struct flowi fl = {
2750                         .nl_u = {
2751                                 .ip4_u = {
2752                                         .daddr = dst,
2753                                         .saddr = src,
2754                                         .tos = rtm->rtm_tos,
2755                                 },
2756                         },
2757                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2758                 };
2759                 err = ip_route_output_key(net, &rt, &fl);
2760         }
2761
2762         if (err)
2763                 goto errout_free;
2764
2765         skb->rtable = rt;
2766         if (rtm->rtm_flags & RTM_F_NOTIFY)
2767                 rt->rt_flags |= RTCF_NOTIFY;
2768
2769         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2770                            RTM_NEWROUTE, 0, 0);
2771         if (err <= 0)
2772                 goto errout_free;
2773
2774         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2775 errout:
2776         return err;
2777
2778 errout_free:
2779         kfree_skb(skb);
2780         goto errout;
2781 }
2782
2783 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2784 {
2785         struct rtable *rt;
2786         int h, s_h;
2787         int idx, s_idx;
2788         struct net *net;
2789
2790         net = skb->sk->sk_net;
2791
2792         s_h = cb->args[0];
2793         if (s_h < 0)
2794                 s_h = 0;
2795         s_idx = idx = cb->args[1];
2796         for (h = s_h; h <= rt_hash_mask; h++) {
2797                 rcu_read_lock_bh();
2798                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2799                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2800                         if (rt->u.dst.dev->nd_net != net || idx < s_idx)
2801                                 continue;
2802                         if (rt->rt_genid != atomic_read(&rt_genid))
2803                                 continue;
2804                         skb->dst = dst_clone(&rt->u.dst);
2805                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2806                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2807                                          1, NLM_F_MULTI) <= 0) {
2808                                 dst_release(xchg(&skb->dst, NULL));
2809                                 rcu_read_unlock_bh();
2810                                 goto done;
2811                         }
2812                         dst_release(xchg(&skb->dst, NULL));
2813                 }
2814                 rcu_read_unlock_bh();
2815                 s_idx = 0;
2816         }
2817
2818 done:
2819         cb->args[0] = h;
2820         cb->args[1] = idx;
2821         return skb->len;
2822 }
2823
2824 void ip_rt_multicast_event(struct in_device *in_dev)
2825 {
2826         rt_cache_flush(0);
2827 }
2828
2829 #ifdef CONFIG_SYSCTL
2830 static int flush_delay;
2831
2832 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2833                                         struct file *filp, void __user *buffer,
2834                                         size_t *lenp, loff_t *ppos)
2835 {
2836         if (write) {
2837                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2838                 rt_cache_flush(flush_delay);
2839                 return 0;
2840         }
2841
2842         return -EINVAL;
2843 }
2844
2845 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2846                                                 int __user *name,
2847                                                 int nlen,
2848                                                 void __user *oldval,
2849                                                 size_t __user *oldlenp,
2850                                                 void __user *newval,
2851                                                 size_t newlen)
2852 {
2853         int delay;
2854         if (newlen != sizeof(int))
2855                 return -EINVAL;
2856         if (get_user(delay, (int __user *)newval))
2857                 return -EFAULT;
2858         rt_cache_flush(delay);
2859         return 0;
2860 }
2861
2862 ctl_table ipv4_route_table[] = {
2863         {
2864                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2865                 .procname       = "flush",
2866                 .data           = &flush_delay,
2867                 .maxlen         = sizeof(int),
2868                 .mode           = 0200,
2869                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2870                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2871         },
2872         {
2873                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2874                 .procname       = "gc_thresh",
2875                 .data           = &ipv4_dst_ops.gc_thresh,
2876                 .maxlen         = sizeof(int),
2877                 .mode           = 0644,
2878                 .proc_handler   = &proc_dointvec,
2879         },
2880         {
2881                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2882                 .procname       = "max_size",
2883                 .data           = &ip_rt_max_size,
2884                 .maxlen         = sizeof(int),
2885                 .mode           = 0644,
2886                 .proc_handler   = &proc_dointvec,
2887         },
2888         {
2889                 /*  Deprecated. Use gc_min_interval_ms */
2890
2891                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2892                 .procname       = "gc_min_interval",
2893                 .data           = &ip_rt_gc_min_interval,
2894                 .maxlen         = sizeof(int),
2895                 .mode           = 0644,
2896                 .proc_handler   = &proc_dointvec_jiffies,
2897                 .strategy       = &sysctl_jiffies,
2898         },
2899         {
2900                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2901                 .procname       = "gc_min_interval_ms",
2902                 .data           = &ip_rt_gc_min_interval,
2903                 .maxlen         = sizeof(int),
2904                 .mode           = 0644,
2905                 .proc_handler   = &proc_dointvec_ms_jiffies,
2906                 .strategy       = &sysctl_ms_jiffies,
2907         },
2908         {
2909                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2910                 .procname       = "gc_timeout",
2911                 .data           = &ip_rt_gc_timeout,
2912                 .maxlen         = sizeof(int),
2913                 .mode           = 0644,
2914                 .proc_handler   = &proc_dointvec_jiffies,
2915                 .strategy       = &sysctl_jiffies,
2916         },
2917         {
2918                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2919                 .procname       = "gc_interval",
2920                 .data           = &ip_rt_gc_interval,
2921                 .maxlen         = sizeof(int),
2922                 .mode           = 0644,
2923                 .proc_handler   = &proc_dointvec_jiffies,
2924                 .strategy       = &sysctl_jiffies,
2925         },
2926         {
2927                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2928                 .procname       = "redirect_load",
2929                 .data           = &ip_rt_redirect_load,
2930                 .maxlen         = sizeof(int),
2931                 .mode           = 0644,
2932                 .proc_handler   = &proc_dointvec,
2933         },
2934         {
2935                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2936                 .procname       = "redirect_number",
2937                 .data           = &ip_rt_redirect_number,
2938                 .maxlen         = sizeof(int),
2939                 .mode           = 0644,
2940                 .proc_handler   = &proc_dointvec,
2941         },
2942         {
2943                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2944                 .procname       = "redirect_silence",
2945                 .data           = &ip_rt_redirect_silence,
2946                 .maxlen         = sizeof(int),
2947                 .mode           = 0644,
2948                 .proc_handler   = &proc_dointvec,
2949         },
2950         {
2951                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2952                 .procname       = "error_cost",
2953                 .data           = &ip_rt_error_cost,
2954                 .maxlen         = sizeof(int),
2955                 .mode           = 0644,
2956                 .proc_handler   = &proc_dointvec,
2957         },
2958         {
2959                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2960                 .procname       = "error_burst",
2961                 .data           = &ip_rt_error_burst,
2962                 .maxlen         = sizeof(int),
2963                 .mode           = 0644,
2964                 .proc_handler   = &proc_dointvec,
2965         },
2966         {
2967                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2968                 .procname       = "gc_elasticity",
2969                 .data           = &ip_rt_gc_elasticity,
2970                 .maxlen         = sizeof(int),
2971                 .mode           = 0644,
2972                 .proc_handler   = &proc_dointvec,
2973         },
2974         {
2975                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2976                 .procname       = "mtu_expires",
2977                 .data           = &ip_rt_mtu_expires,
2978                 .maxlen         = sizeof(int),
2979                 .mode           = 0644,
2980                 .proc_handler   = &proc_dointvec_jiffies,
2981                 .strategy       = &sysctl_jiffies,
2982         },
2983         {
2984                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2985                 .procname       = "min_pmtu",
2986                 .data           = &ip_rt_min_pmtu,
2987                 .maxlen         = sizeof(int),
2988                 .mode           = 0644,
2989                 .proc_handler   = &proc_dointvec,
2990         },
2991         {
2992                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2993                 .procname       = "min_adv_mss",
2994                 .data           = &ip_rt_min_advmss,
2995                 .maxlen         = sizeof(int),
2996                 .mode           = 0644,
2997                 .proc_handler   = &proc_dointvec,
2998         },
2999         {
3000                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
3001                 .procname       = "secret_interval",
3002                 .data           = &ip_rt_secret_interval,
3003                 .maxlen         = sizeof(int),
3004                 .mode           = 0644,
3005                 .proc_handler   = &proc_dointvec_jiffies,
3006                 .strategy       = &sysctl_jiffies,
3007         },
3008         { .ctl_name = 0 }
3009 };
3010 #endif
3011
3012 #ifdef CONFIG_NET_CLS_ROUTE
3013 struct ip_rt_acct *ip_rt_acct __read_mostly;
3014 #endif /* CONFIG_NET_CLS_ROUTE */
3015
3016 static __initdata unsigned long rhash_entries;
3017 static int __init set_rhash_entries(char *str)
3018 {
3019         if (!str)
3020                 return 0;
3021         rhash_entries = simple_strtoul(str, &str, 0);
3022         return 1;
3023 }
3024 __setup("rhash_entries=", set_rhash_entries);
3025
3026 int __init ip_rt_init(void)
3027 {
3028         int rc = 0;
3029
3030         atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3031                              (jiffies ^ (jiffies >> 7))));
3032
3033 #ifdef CONFIG_NET_CLS_ROUTE
3034         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3035         if (!ip_rt_acct)
3036                 panic("IP: failed to allocate ip_rt_acct\n");
3037 #endif
3038
3039         ipv4_dst_ops.kmem_cachep =
3040                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3041                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3042
3043         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3044
3045         rt_hash_table = (struct rt_hash_bucket *)
3046                 alloc_large_system_hash("IP route cache",
3047                                         sizeof(struct rt_hash_bucket),
3048                                         rhash_entries,
3049                                         (num_physpages >= 128 * 1024) ?
3050                                         15 : 17,
3051                                         0,
3052                                         &rt_hash_log,
3053                                         &rt_hash_mask,
3054                                         0);
3055         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3056         rt_hash_lock_init();
3057
3058         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3059         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3060
3061         devinet_init();
3062         ip_fib_init();
3063
3064         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3065
3066         /* All the timers, started at system startup tend
3067            to synchronize. Perturb it a bit.
3068          */
3069         schedule_delayed_work(&expires_work,
3070                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3071
3072         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3073                 ip_rt_secret_interval;
3074         add_timer(&rt_secret_timer);
3075
3076         if (ip_rt_proc_init())
3077                 printk(KERN_ERR "Unable to create route proc files\n");
3078 #ifdef CONFIG_XFRM
3079         xfrm_init();
3080         xfrm4_init();
3081 #endif
3082         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3083
3084         return rc;
3085 }
3086
3087 EXPORT_SYMBOL(__ip_select_ident);
3088 EXPORT_SYMBOL(ip_route_input);
3089 EXPORT_SYMBOL(ip_route_output_key);