[NETNS]: Add namespace parameter to ip_dev_find.
[safe/jmp/linux-2.6] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
/*
 * Extract the routing-relevant TOS bits (IPTOS_RT_MASK plus the RTO_ONLINK
 * flag) from the flow key that oldflp points to.
 *
 * The argument is parenthesized so that any pointer-valued expression,
 * not just a plain identifier, can be passed safely.
 */
#define RT_FL_TOS(oldflp) \
	((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_min_delay              = 2 * HZ;
121 static int ip_rt_max_delay              = 10 * HZ;
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval            = 60 * HZ;
125 static int ip_rt_gc_min_interval        = HZ / 2;
126 static int ip_rt_redirect_number        = 9;
127 static int ip_rt_redirect_load          = HZ / 50;
128 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost             = HZ;
130 static int ip_rt_error_burst            = 5 * HZ;
131 static int ip_rt_gc_elasticity          = 8;
132 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu               = 512 + 20 + 20;
134 static int ip_rt_min_advmss             = 256;
135 static int ip_rt_secret_interval        = 10 * 60 * HZ;
136 static int ip_rt_flush_expected;
137 static unsigned long rt_deadline;
138
139 #define RTprint(a...)   printk(KERN_DEBUG a)
140
141 static struct timer_list rt_flush_timer;
142 static void rt_worker_func(struct work_struct *work);
143 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
144 static struct timer_list rt_secret_timer;
145
146 /*
147  *      Interface to generic destination cache.
148  */
149
150 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151 static void              ipv4_dst_destroy(struct dst_entry *dst);
152 static void              ipv4_dst_ifdown(struct dst_entry *dst,
153                                          struct net_device *dev, int how);
154 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155 static void              ipv4_link_failure(struct sk_buff *skb);
156 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157 static int rt_garbage_collect(struct dst_ops *ops);
158
159
160 static struct dst_ops ipv4_dst_ops = {
161         .family =               AF_INET,
162         .protocol =             __constant_htons(ETH_P_IP),
163         .gc =                   rt_garbage_collect,
164         .check =                ipv4_dst_check,
165         .destroy =              ipv4_dst_destroy,
166         .ifdown =               ipv4_dst_ifdown,
167         .negative_advice =      ipv4_negative_advice,
168         .link_failure =         ipv4_link_failure,
169         .update_pmtu =          ip_rt_update_pmtu,
170         .local_out =            ip_local_out,
171         .entry_size =           sizeof(struct rtable),
172 };
173
174 #define ECN_OR_COST(class)      TC_PRIO_##class
175
176 const __u8 ip_tos2prio[16] = {
177         TC_PRIO_BESTEFFORT,
178         ECN_OR_COST(FILLER),
179         TC_PRIO_BESTEFFORT,
180         ECN_OR_COST(BESTEFFORT),
181         TC_PRIO_BULK,
182         ECN_OR_COST(BULK),
183         TC_PRIO_BULK,
184         ECN_OR_COST(BULK),
185         TC_PRIO_INTERACTIVE,
186         ECN_OR_COST(INTERACTIVE),
187         TC_PRIO_INTERACTIVE,
188         ECN_OR_COST(INTERACTIVE),
189         TC_PRIO_INTERACTIVE_BULK,
190         ECN_OR_COST(INTERACTIVE_BULK),
191         TC_PRIO_INTERACTIVE_BULK,
192         ECN_OR_COST(INTERACTIVE_BULK)
193 };
194
195
196 /*
197  * Route cache.
198  */
199
200 /* The locking scheme is rather straightforward:
201  *
202  * 1) Read-Copy Update protects the buckets of the central route hash.
203  * 2) Only writers remove entries, and they hold the lock
204  *    as they look at rtable reference counts.
205  * 3) Only readers acquire references to rtable entries,
206  *    they do so with atomic increments and with the
207  *    lock held.
208  */
209
/* One slot of the route hash table: head of a chain of rtable entries. */
struct rt_hash_bucket {
	struct rtable *chain;	/* entries are linked through u.dst.rt_next */
};
#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
	defined(CONFIG_PROVE_LOCKING)
/*
 * Rather than one spinlock per rt_hash_bucket, a shared table of spinlocks
 * is used; a bucket's lock is selected by masking its slot number.
 * The table size is a power of two chosen from the number of CPUs.
 * (With lockdep spinlock_t is quite big, so the table is kept small there.)
 */
#ifdef CONFIG_LOCKDEP
# define RT_HASH_LOCK_SZ	256
#else
# if NR_CPUS >= 32
#  define RT_HASH_LOCK_SZ	4096
# elif NR_CPUS >= 16
#  define RT_HASH_LOCK_SZ	2048
# elif NR_CPUS >= 8
#  define RT_HASH_LOCK_SZ	1024
# elif NR_CPUS >= 4
#  define RT_HASH_LOCK_SZ	512
# else
#  define RT_HASH_LOCK_SZ	256
# endif
#endif

static spinlock_t	*rt_hash_locks;

/* Address of the spinlock that protects hash slot 'slot'. */
# define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]

/*
 * Allocate the spinlock table and initialise every lock in it.
 * Panics if the allocation fails.
 */
static __init void rt_hash_lock_init(void)
{
	int i;

	rt_hash_locks = kmalloc(sizeof(*rt_hash_locks) * RT_HASH_LOCK_SZ,
				GFP_KERNEL);
	if (rt_hash_locks == NULL)
		panic("IP: failed to allocate rt_hash_locks\n");

	/*
	 * Keep the index named 'i': spin_lock_init() may stringify its
	 * argument for lock debugging.
	 */
	for (i = 0; i < RT_HASH_LOCK_SZ; i++)
		spin_lock_init(&rt_hash_locks[i]);
}
#else
/* No lock table in this configuration: the lock address is simply NULL. */
# define rt_hash_lock_addr(slot) NULL

/* Nothing to set up when there is no lock table. */
static inline void rt_hash_lock_init(void)
{
}
#endif
258
259 static struct rt_hash_bucket    *rt_hash_table;
260 static unsigned                 rt_hash_mask;
261 static unsigned int             rt_hash_log;
262 static unsigned int             rt_hash_rnd;
263
264 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
265 #define RT_CACHE_STAT_INC(field) \
266         (__raw_get_cpu_var(rt_cache_stat).field++)
267
268 static int rt_intern_hash(unsigned hash, struct rtable *rth,
269                                 struct rtable **res);
270
271 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
272 {
273         return (jhash_2words(daddr, saddr, rt_hash_rnd)
274                 & rt_hash_mask);
275 }
276
/*
 * Slot number for a big-endian (daddr, saddr) pair and an index 'idx';
 * idx is shifted left by 5 and XORed into the source address before hashing.
 */
#define rt_hash(daddr, saddr, idx)				\
	rt_hash_code((__force u32)(__be32)(daddr),		\
		     (__force u32)(__be32)(saddr) ^ ((idx) << 5))
280
281 #ifdef CONFIG_PROC_FS
/* Private seq_file state for walking the route cache. */
struct rt_cache_iter_state {
	int bucket;	/* hash slot currently being walked; counts down to 0 */
};
285
286 static struct rtable *rt_cache_get_first(struct seq_file *seq)
287 {
288         struct rtable *r = NULL;
289         struct rt_cache_iter_state *st = seq->private;
290
291         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
292                 rcu_read_lock_bh();
293                 r = rt_hash_table[st->bucket].chain;
294                 if (r)
295                         break;
296                 rcu_read_unlock_bh();
297         }
298         return rcu_dereference(r);
299 }
300
301 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
302 {
303         struct rt_cache_iter_state *st = seq->private;
304
305         r = r->u.dst.rt_next;
306         while (!r) {
307                 rcu_read_unlock_bh();
308                 if (--st->bucket < 0)
309                         break;
310                 rcu_read_lock_bh();
311                 r = rt_hash_table[st->bucket].chain;
312         }
313         return rcu_dereference(r);
314 }
315
316 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
317 {
318         struct rtable *r = rt_cache_get_first(seq);
319
320         if (r)
321                 while (pos && (r = rt_cache_get_next(seq, r)))
322                         --pos;
323         return pos ? NULL : r;
324 }
325
326 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
327 {
328         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
329 }
330
331 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
332 {
333         struct rtable *r = NULL;
334
335         if (v == SEQ_START_TOKEN)
336                 r = rt_cache_get_first(seq);
337         else
338                 r = rt_cache_get_next(seq, v);
339         ++*pos;
340         return r;
341 }
342
343 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
344 {
345         if (v && v != SEQ_START_TOKEN)
346                 rcu_read_unlock_bh();
347 }
348
349 static int rt_cache_seq_show(struct seq_file *seq, void *v)
350 {
351         if (v == SEQ_START_TOKEN)
352                 seq_printf(seq, "%-127s\n",
353                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
354                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
355                            "HHUptod\tSpecDst");
356         else {
357                 struct rtable *r = v;
358                 char temp[256];
359
360                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
361                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
362                         r->u.dst.dev ? r->u.dst.dev->name : "*",
363                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
364                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
365                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
366                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
367                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
368                         dst_metric(&r->u.dst, RTAX_WINDOW),
369                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
370                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
371                         r->fl.fl4_tos,
372                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
373                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
374                                        dev_queue_xmit) : 0,
375                         r->rt_spec_dst);
376                 seq_printf(seq, "%-127s\n", temp);
377         }
378         return 0;
379 }
380
381 static const struct seq_operations rt_cache_seq_ops = {
382         .start  = rt_cache_seq_start,
383         .next   = rt_cache_seq_next,
384         .stop   = rt_cache_seq_stop,
385         .show   = rt_cache_seq_show,
386 };
387
388 static int rt_cache_seq_open(struct inode *inode, struct file *file)
389 {
390         return seq_open_private(file, &rt_cache_seq_ops,
391                         sizeof(struct rt_cache_iter_state));
392 }
393
394 static const struct file_operations rt_cache_seq_fops = {
395         .owner   = THIS_MODULE,
396         .open    = rt_cache_seq_open,
397         .read    = seq_read,
398         .llseek  = seq_lseek,
399         .release = seq_release_private,
400 };
401
402
403 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
404 {
405         int cpu;
406
407         if (*pos == 0)
408                 return SEQ_START_TOKEN;
409
410         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
411                 if (!cpu_possible(cpu))
412                         continue;
413                 *pos = cpu+1;
414                 return &per_cpu(rt_cache_stat, cpu);
415         }
416         return NULL;
417 }
418
419 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
420 {
421         int cpu;
422
423         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
424                 if (!cpu_possible(cpu))
425                         continue;
426                 *pos = cpu+1;
427                 return &per_cpu(rt_cache_stat, cpu);
428         }
429         return NULL;
430
431 }
432
/* seq_file ->stop for the statistics file: start/next take no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
437
438 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
439 {
440         struct rt_cache_stat *st = v;
441
442         if (v == SEQ_START_TOKEN) {
443                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
444                 return 0;
445         }
446
447         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
448                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
449                    atomic_read(&ipv4_dst_ops.entries),
450                    st->in_hit,
451                    st->in_slow_tot,
452                    st->in_slow_mc,
453                    st->in_no_route,
454                    st->in_brd,
455                    st->in_martian_dst,
456                    st->in_martian_src,
457
458                    st->out_hit,
459                    st->out_slow_tot,
460                    st->out_slow_mc,
461
462                    st->gc_total,
463                    st->gc_ignored,
464                    st->gc_goal_miss,
465                    st->gc_dst_overflow,
466                    st->in_hlist_search,
467                    st->out_hlist_search
468                 );
469         return 0;
470 }
471
472 static const struct seq_operations rt_cpu_seq_ops = {
473         .start  = rt_cpu_seq_start,
474         .next   = rt_cpu_seq_next,
475         .stop   = rt_cpu_seq_stop,
476         .show   = rt_cpu_seq_show,
477 };
478
479
480 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
481 {
482         return seq_open(file, &rt_cpu_seq_ops);
483 }
484
485 static const struct file_operations rt_cpu_seq_fops = {
486         .owner   = THIS_MODULE,
487         .open    = rt_cpu_seq_open,
488         .read    = seq_read,
489         .llseek  = seq_lseek,
490         .release = seq_release,
491 };
492
493 #ifdef CONFIG_NET_CLS_ROUTE
494 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
495                            int length, int *eof, void *data)
496 {
497         unsigned int i;
498
499         if ((offset & 3) || (length & 3))
500                 return -EIO;
501
502         if (offset >= sizeof(struct ip_rt_acct) * 256) {
503                 *eof = 1;
504                 return 0;
505         }
506
507         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
508                 length = sizeof(struct ip_rt_acct) * 256 - offset;
509                 *eof = 1;
510         }
511
512         offset /= sizeof(u32);
513
514         if (length > 0) {
515                 u32 *dst = (u32 *) buffer;
516
517                 *start = buffer;
518                 memset(dst, 0, length);
519
520                 for_each_possible_cpu(i) {
521                         unsigned int j;
522                         u32 *src;
523
524                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
525                         for (j = 0; j < length/4; j++)
526                                 dst[j] += src[j];
527                 }
528         }
529         return length;
530 }
531 #endif
532
533 static __init int ip_rt_proc_init(struct net *net)
534 {
535         struct proc_dir_entry *pde;
536
537         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
538                         &rt_cache_seq_fops);
539         if (!pde)
540                 goto err1;
541
542         pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
543         if (!pde)
544                 goto err2;
545
546         pde->proc_fops = &rt_cpu_seq_fops;
547
548 #ifdef CONFIG_NET_CLS_ROUTE
549         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
550                         ip_rt_acct_read, NULL);
551         if (!pde)
552                 goto err3;
553 #endif
554         return 0;
555
556 #ifdef CONFIG_NET_CLS_ROUTE
557 err3:
558         remove_proc_entry("rt_cache", net->proc_net_stat);
559 #endif
560 err2:
561         remove_proc_entry("rt_cache", net->proc_net);
562 err1:
563         return -ENOMEM;
564 }
565 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(struct net *net)
{
	return 0;
}
570 #endif /* CONFIG_PROC_FS */
571
572 static __inline__ void rt_free(struct rtable *rt)
573 {
574         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
575 }
576
/*
 * Release the caller's reference on rt with ip_rt_put(), then queue the
 * entry for RCU-deferred freeing exactly as rt_free() does.
 */
static __inline__ void rt_drop(struct rtable *rt)
{
	ip_rt_put(rt);
	rt_free(rt);
}
582
583 static __inline__ int rt_fast_clean(struct rtable *rth)
584 {
585         /* Kill broadcast/multicast entries very aggresively, if they
586            collide in hash table with more useful entries */
587         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
588                 rth->fl.iif && rth->u.dst.rt_next;
589 }
590
591 static __inline__ int rt_valuable(struct rtable *rth)
592 {
593         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
594                 rth->u.dst.expires;
595 }
596
597 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
598 {
599         unsigned long age;
600         int ret = 0;
601
602         if (atomic_read(&rth->u.dst.__refcnt))
603                 goto out;
604
605         ret = 1;
606         if (rth->u.dst.expires &&
607             time_after_eq(jiffies, rth->u.dst.expires))
608                 goto out;
609
610         age = jiffies - rth->u.dst.lastuse;
611         ret = 0;
612         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
613             (age <= tmo2 && rt_valuable(rth)))
614                 goto out;
615         ret = 1;
616 out:    return ret;
617 }
618
619 /* Bits of score are:
620  * 31: very valuable
621  * 30: not quite useless
622  * 29..0: usage counter
623  */
624 static inline u32 rt_score(struct rtable *rt)
625 {
626         u32 score = jiffies - rt->u.dst.lastuse;
627
628         score = ~score & ~(3<<30);
629
630         if (rt_valuable(rt))
631                 score |= (1<<31);
632
633         if (!rt->fl.iif ||
634             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
635                 score |= (1<<30);
636
637         return score;
638 }
639
640 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
641 {
642         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
643                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
644                 (fl1->mark ^ fl2->mark) |
645                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
646                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
647                 (fl1->oif ^ fl2->oif) |
648                 (fl1->iif ^ fl2->iif)) == 0;
649 }
650
651 /*
652  * Perform a full scan of hash table and free all entries.
653  * Can be called by a softirq or a process.
654  * In the later case, we want to be reschedule if necessary
655  */
656 static void rt_do_flush(int process_context)
657 {
658         unsigned int i;
659         struct rtable *rth, *next;
660
661         for (i = 0; i <= rt_hash_mask; i++) {
662                 if (process_context && need_resched())
663                         cond_resched();
664                 rth = rt_hash_table[i].chain;
665                 if (!rth)
666                         continue;
667
668                 spin_lock_bh(rt_hash_lock_addr(i));
669                 rth = rt_hash_table[i].chain;
670                 rt_hash_table[i].chain = NULL;
671                 spin_unlock_bh(rt_hash_lock_addr(i));
672
673                 for (; rth; rth = next) {
674                         next = rth->u.dst.rt_next;
675                         rt_free(rth);
676                 }
677         }
678 }
679
680 static void rt_check_expire(void)
681 {
682         static unsigned int rover;
683         unsigned int i = rover, goal;
684         struct rtable *rth, **rthp;
685         u64 mult;
686
687         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
688         if (ip_rt_gc_timeout > 1)
689                 do_div(mult, ip_rt_gc_timeout);
690         goal = (unsigned int)mult;
691         if (goal > rt_hash_mask)
692                 goal = rt_hash_mask + 1;
693         for (; goal > 0; goal--) {
694                 unsigned long tmo = ip_rt_gc_timeout;
695
696                 i = (i + 1) & rt_hash_mask;
697                 rthp = &rt_hash_table[i].chain;
698
699                 if (need_resched())
700                         cond_resched();
701
702                 if (*rthp == NULL)
703                         continue;
704                 spin_lock_bh(rt_hash_lock_addr(i));
705                 while ((rth = *rthp) != NULL) {
706                         if (rth->u.dst.expires) {
707                                 /* Entry is expired even if it is in use */
708                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
709                                         tmo >>= 1;
710                                         rthp = &rth->u.dst.rt_next;
711                                         continue;
712                                 }
713                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
714                                 tmo >>= 1;
715                                 rthp = &rth->u.dst.rt_next;
716                                 continue;
717                         }
718
719                         /* Cleanup aged off entries. */
720                         *rthp = rth->u.dst.rt_next;
721                         rt_free(rth);
722                 }
723                 spin_unlock_bh(rt_hash_lock_addr(i));
724         }
725         rover = i;
726 }
727
728 /*
729  * rt_worker_func() is run in process context.
730  * If a whole flush was scheduled, it is done.
731  * Else, we call rt_check_expire() to scan part of the hash table
732  */
733 static void rt_worker_func(struct work_struct *work)
734 {
735         if (ip_rt_flush_expected) {
736                 ip_rt_flush_expected = 0;
737                 rt_do_flush(1);
738         } else
739                 rt_check_expire();
740         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
741 }
742
743 /* This can run from both BH and non-BH contexts, the latter
744  * in the case of a forced flush event.
745  */
746 static void rt_run_flush(unsigned long process_context)
747 {
748         rt_deadline = 0;
749
750         get_random_bytes(&rt_hash_rnd, 4);
751
752         rt_do_flush(process_context);
753 }
754
755 static DEFINE_SPINLOCK(rt_flush_lock);
756
757 void rt_cache_flush(int delay)
758 {
759         unsigned long now = jiffies;
760         int user_mode = !in_softirq();
761
762         if (delay < 0)
763                 delay = ip_rt_min_delay;
764
765         spin_lock_bh(&rt_flush_lock);
766
767         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
768                 long tmo = (long)(rt_deadline - now);
769
770                 /* If flush timer is already running
771                    and flush request is not immediate (delay > 0):
772
773                    if deadline is not achieved, prolongate timer to "delay",
774                    otherwise fire it at deadline time.
775                  */
776
777                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
778                         tmo = 0;
779
780                 if (delay > tmo)
781                         delay = tmo;
782         }
783
784         if (delay <= 0) {
785                 spin_unlock_bh(&rt_flush_lock);
786                 rt_run_flush(user_mode);
787                 return;
788         }
789
790         if (rt_deadline == 0)
791                 rt_deadline = now + ip_rt_max_delay;
792
793         mod_timer(&rt_flush_timer, now+delay);
794         spin_unlock_bh(&rt_flush_lock);
795 }
796
797 /*
798  * We change rt_hash_rnd and ask next rt_worker_func() invocation
799  * to perform a flush in process context
800  */
801 static void rt_secret_rebuild(unsigned long dummy)
802 {
803         get_random_bytes(&rt_hash_rnd, 4);
804         ip_rt_flush_expected = 1;
805         cancel_delayed_work(&expires_work);
806         schedule_delayed_work(&expires_work, HZ/10);
807         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
808 }
809
810 /*
811    Short description of GC goals.
812
813    We want to build an algorithm that keeps the routing cache
814    at some equilibrium point, where the number of aged-off entries
815    is kept approximately equal to the number of newly generated ones.
816
817    The current expiration strength is the variable "expire".
818    We try to adjust it dynamically, so that if networking
819    is idle, expire is large enough to keep enough warm entries,
820    and when load increases, it is reduced to limit the cache size.
821  */
822
823 static int rt_garbage_collect(struct dst_ops *ops)
824 {
825         static unsigned long expire = RT_GC_TIMEOUT;
826         static unsigned long last_gc;
827         static int rover;
828         static int equilibrium;
829         struct rtable *rth, **rthp;
830         unsigned long now = jiffies;
831         int goal;
832
833         /*
834          * Garbage collection is pretty expensive,
835          * do not make it too frequently.
836          */
837
838         RT_CACHE_STAT_INC(gc_total);
839
840         if (now - last_gc < ip_rt_gc_min_interval &&
841             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
842                 RT_CACHE_STAT_INC(gc_ignored);
843                 goto out;
844         }
845
846         /* Calculate number of entries, which we want to expire now. */
847         goal = atomic_read(&ipv4_dst_ops.entries) -
848                 (ip_rt_gc_elasticity << rt_hash_log);
849         if (goal <= 0) {
850                 if (equilibrium < ipv4_dst_ops.gc_thresh)
851                         equilibrium = ipv4_dst_ops.gc_thresh;
852                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
853                 if (goal > 0) {
854                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
855                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
856                 }
857         } else {
858                 /* We are in dangerous area. Try to reduce cache really
859                  * aggressively.
860                  */
861                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
862                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
863         }
864
865         if (now - last_gc >= ip_rt_gc_min_interval)
866                 last_gc = now;
867
868         if (goal <= 0) {
869                 equilibrium += goal;
870                 goto work_done;
871         }
872
873         do {
874                 int i, k;
875
876                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
877                         unsigned long tmo = expire;
878
879                         k = (k + 1) & rt_hash_mask;
880                         rthp = &rt_hash_table[k].chain;
881                         spin_lock_bh(rt_hash_lock_addr(k));
882                         while ((rth = *rthp) != NULL) {
883                                 if (!rt_may_expire(rth, tmo, expire)) {
884                                         tmo >>= 1;
885                                         rthp = &rth->u.dst.rt_next;
886                                         continue;
887                                 }
888                                 *rthp = rth->u.dst.rt_next;
889                                 rt_free(rth);
890                                 goal--;
891                         }
892                         spin_unlock_bh(rt_hash_lock_addr(k));
893                         if (goal <= 0)
894                                 break;
895                 }
896                 rover = k;
897
898                 if (goal <= 0)
899                         goto work_done;
900
901                 /* Goal is not achieved. We stop process if:
902
903                    - if expire reduced to zero. Otherwise, expire is halfed.
904                    - if table is not full.
905                    - if we are called from interrupt.
906                    - jiffies check is just fallback/debug loop breaker.
907                      We will not spin here for long time in any case.
908                  */
909
910                 RT_CACHE_STAT_INC(gc_goal_miss);
911
912                 if (expire == 0)
913                         break;
914
915                 expire >>= 1;
916 #if RT_CACHE_DEBUG >= 2
917                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
918                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
919 #endif
920
921                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
922                         goto out;
923         } while (!in_softirq() && time_before_eq(jiffies, now));
924
925         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
926                 goto out;
927         if (net_ratelimit())
928                 printk(KERN_WARNING "dst cache overflow\n");
929         RT_CACHE_STAT_INC(gc_dst_overflow);
930         return 1;
931
932 work_done:
933         expire += ip_rt_gc_min_interval;
934         if (expire > ip_rt_gc_timeout ||
935             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
936                 expire = ip_rt_gc_timeout;
937 #if RT_CACHE_DEBUG >= 2
938         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
939                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
940 #endif
941 out:    return 0;
942 }
943
944 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
945 {
946         struct rtable   *rth, **rthp;
947         unsigned long   now;
948         struct rtable *cand, **candp;
949         u32             min_score;
950         int             chain_length;
951         int attempts = !in_softirq();
952
953 restart:
954         chain_length = 0;
955         min_score = ~(u32)0;
956         cand = NULL;
957         candp = NULL;
958         now = jiffies;
959
960         rthp = &rt_hash_table[hash].chain;
961
962         spin_lock_bh(rt_hash_lock_addr(hash));
963         while ((rth = *rthp) != NULL) {
964                 if (compare_keys(&rth->fl, &rt->fl)) {
965                         /* Put it first */
966                         *rthp = rth->u.dst.rt_next;
967                         /*
968                          * Since lookup is lockfree, the deletion
969                          * must be visible to another weakly ordered CPU before
970                          * the insertion at the start of the hash chain.
971                          */
972                         rcu_assign_pointer(rth->u.dst.rt_next,
973                                            rt_hash_table[hash].chain);
974                         /*
975                          * Since lookup is lockfree, the update writes
976                          * must be ordered for consistency on SMP.
977                          */
978                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
979
980                         dst_use(&rth->u.dst, now);
981                         spin_unlock_bh(rt_hash_lock_addr(hash));
982
983                         rt_drop(rt);
984                         *rp = rth;
985                         return 0;
986                 }
987
988                 if (!atomic_read(&rth->u.dst.__refcnt)) {
989                         u32 score = rt_score(rth);
990
991                         if (score <= min_score) {
992                                 cand = rth;
993                                 candp = rthp;
994                                 min_score = score;
995                         }
996                 }
997
998                 chain_length++;
999
1000                 rthp = &rth->u.dst.rt_next;
1001         }
1002
1003         if (cand) {
1004                 /* ip_rt_gc_elasticity used to be average length of chain
1005                  * length, when exceeded gc becomes really aggressive.
1006                  *
1007                  * The second limit is less certain. At the moment it allows
1008                  * only 2 entries per bucket. We will see.
1009                  */
1010                 if (chain_length > ip_rt_gc_elasticity) {
1011                         *candp = cand->u.dst.rt_next;
1012                         rt_free(cand);
1013                 }
1014         }
1015
1016         /* Try to bind route to arp only if it is output
1017            route or unicast forwarding path.
1018          */
1019         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1020                 int err = arp_bind_neighbour(&rt->u.dst);
1021                 if (err) {
1022                         spin_unlock_bh(rt_hash_lock_addr(hash));
1023
1024                         if (err != -ENOBUFS) {
1025                                 rt_drop(rt);
1026                                 return err;
1027                         }
1028
1029                         /* Neighbour tables are full and nothing
1030                            can be released. Try to shrink route cache,
1031                            it is most likely it holds some neighbour records.
1032                          */
1033                         if (attempts-- > 0) {
1034                                 int saved_elasticity = ip_rt_gc_elasticity;
1035                                 int saved_int = ip_rt_gc_min_interval;
1036                                 ip_rt_gc_elasticity     = 1;
1037                                 ip_rt_gc_min_interval   = 0;
1038                                 rt_garbage_collect(&ipv4_dst_ops);
1039                                 ip_rt_gc_min_interval   = saved_int;
1040                                 ip_rt_gc_elasticity     = saved_elasticity;
1041                                 goto restart;
1042                         }
1043
1044                         if (net_ratelimit())
1045                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1046                         rt_drop(rt);
1047                         return -ENOBUFS;
1048                 }
1049         }
1050
1051         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1052 #if RT_CACHE_DEBUG >= 2
1053         if (rt->u.dst.rt_next) {
1054                 struct rtable *trt;
1055                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1056                        NIPQUAD(rt->rt_dst));
1057                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1058                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1059                 printk("\n");
1060         }
1061 #endif
1062         rt_hash_table[hash].chain = rt;
1063         spin_unlock_bh(rt_hash_lock_addr(hash));
1064         *rp = rt;
1065         return 0;
1066 }
1067
1068 void rt_bind_peer(struct rtable *rt, int create)
1069 {
1070         static DEFINE_SPINLOCK(rt_peer_lock);
1071         struct inet_peer *peer;
1072
1073         peer = inet_getpeer(rt->rt_dst, create);
1074
1075         spin_lock_bh(&rt_peer_lock);
1076         if (rt->peer == NULL) {
1077                 rt->peer = peer;
1078                 peer = NULL;
1079         }
1080         spin_unlock_bh(&rt_peer_lock);
1081         if (peer)
1082                 inet_putpeer(peer);
1083 }
1084
1085 /*
1086  * Peer allocation may fail only in serious out-of-memory conditions.  However
1087  * we still can generate some output.
1088  * Random ID selection looks a bit dangerous because we have no chances to
1089  * select ID being unique in a reasonable period of time.
1090  * But broken packet identifier may be better than no packet at all.
1091  */
1092 static void ip_select_fb_ident(struct iphdr *iph)
1093 {
1094         static DEFINE_SPINLOCK(ip_fb_id_lock);
1095         static u32 ip_fallback_id;
1096         u32 salt;
1097
1098         spin_lock_bh(&ip_fb_id_lock);
1099         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1100         iph->id = htons(salt & 0xFFFF);
1101         ip_fallback_id = salt;
1102         spin_unlock_bh(&ip_fb_id_lock);
1103 }
1104
1105 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1106 {
1107         struct rtable *rt = (struct rtable *) dst;
1108
1109         if (rt) {
1110                 if (rt->peer == NULL)
1111                         rt_bind_peer(rt, 1);
1112
1113                 /* If peer is attached to destination, it is never detached,
1114                    so that we need not to grab a lock to dereference it.
1115                  */
1116                 if (rt->peer) {
1117                         iph->id = htons(inet_getid(rt->peer, more));
1118                         return;
1119                 }
1120         } else
1121                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1122                        __builtin_return_address(0));
1123
1124         ip_select_fb_ident(iph);
1125 }
1126
1127 static void rt_del(unsigned hash, struct rtable *rt)
1128 {
1129         struct rtable **rthp;
1130
1131         spin_lock_bh(rt_hash_lock_addr(hash));
1132         ip_rt_put(rt);
1133         for (rthp = &rt_hash_table[hash].chain; *rthp;
1134              rthp = &(*rthp)->u.dst.rt_next)
1135                 if (*rthp == rt) {
1136                         *rthp = rt->u.dst.rt_next;
1137                         rt_free(rt);
1138                         break;
1139                 }
1140         spin_unlock_bh(rt_hash_lock_addr(hash));
1141 }
1142
1143 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1144                     __be32 saddr, struct net_device *dev)
1145 {
1146         int i, k;
1147         struct in_device *in_dev = in_dev_get(dev);
1148         struct rtable *rth, **rthp;
1149         __be32  skeys[2] = { saddr, 0 };
1150         int  ikeys[2] = { dev->ifindex, 0 };
1151         struct netevent_redirect netevent;
1152
1153         if (!in_dev)
1154                 return;
1155
1156         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1157             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1158             || ipv4_is_zeronet(new_gw))
1159                 goto reject_redirect;
1160
1161         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1162                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1163                         goto reject_redirect;
1164                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1165                         goto reject_redirect;
1166         } else {
1167                 if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
1168                         goto reject_redirect;
1169         }
1170
1171         for (i = 0; i < 2; i++) {
1172                 for (k = 0; k < 2; k++) {
1173                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1174
1175                         rthp=&rt_hash_table[hash].chain;
1176
1177                         rcu_read_lock();
1178                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1179                                 struct rtable *rt;
1180
1181                                 if (rth->fl.fl4_dst != daddr ||
1182                                     rth->fl.fl4_src != skeys[i] ||
1183                                     rth->fl.oif != ikeys[k] ||
1184                                     rth->fl.iif != 0) {
1185                                         rthp = &rth->u.dst.rt_next;
1186                                         continue;
1187                                 }
1188
1189                                 if (rth->rt_dst != daddr ||
1190                                     rth->rt_src != saddr ||
1191                                     rth->u.dst.error ||
1192                                     rth->rt_gateway != old_gw ||
1193                                     rth->u.dst.dev != dev)
1194                                         break;
1195
1196                                 dst_hold(&rth->u.dst);
1197                                 rcu_read_unlock();
1198
1199                                 rt = dst_alloc(&ipv4_dst_ops);
1200                                 if (rt == NULL) {
1201                                         ip_rt_put(rth);
1202                                         in_dev_put(in_dev);
1203                                         return;
1204                                 }
1205
1206                                 /* Copy all the information. */
1207                                 *rt = *rth;
1208                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1209                                 rt->u.dst.__use         = 1;
1210                                 atomic_set(&rt->u.dst.__refcnt, 1);
1211                                 rt->u.dst.child         = NULL;
1212                                 if (rt->u.dst.dev)
1213                                         dev_hold(rt->u.dst.dev);
1214                                 if (rt->idev)
1215                                         in_dev_hold(rt->idev);
1216                                 rt->u.dst.obsolete      = 0;
1217                                 rt->u.dst.lastuse       = jiffies;
1218                                 rt->u.dst.path          = &rt->u.dst;
1219                                 rt->u.dst.neighbour     = NULL;
1220                                 rt->u.dst.hh            = NULL;
1221                                 rt->u.dst.xfrm          = NULL;
1222
1223                                 rt->rt_flags            |= RTCF_REDIRECTED;
1224
1225                                 /* Gateway is different ... */
1226                                 rt->rt_gateway          = new_gw;
1227
1228                                 /* Redirect received -> path was valid */
1229                                 dst_confirm(&rth->u.dst);
1230
1231                                 if (rt->peer)
1232                                         atomic_inc(&rt->peer->refcnt);
1233
1234                                 if (arp_bind_neighbour(&rt->u.dst) ||
1235                                     !(rt->u.dst.neighbour->nud_state &
1236                                             NUD_VALID)) {
1237                                         if (rt->u.dst.neighbour)
1238                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1239                                         ip_rt_put(rth);
1240                                         rt_drop(rt);
1241                                         goto do_next;
1242                                 }
1243
1244                                 netevent.old = &rth->u.dst;
1245                                 netevent.new = &rt->u.dst;
1246                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1247                                                         &netevent);
1248
1249                                 rt_del(hash, rth);
1250                                 if (!rt_intern_hash(hash, rt, &rt))
1251                                         ip_rt_put(rt);
1252                                 goto do_next;
1253                         }
1254                         rcu_read_unlock();
1255                 do_next:
1256                         ;
1257                 }
1258         }
1259         in_dev_put(in_dev);
1260         return;
1261
1262 reject_redirect:
1263 #ifdef CONFIG_IP_ROUTE_VERBOSE
1264         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1265                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1266                         "%u.%u.%u.%u ignored.\n"
1267                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1268                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1269                        NIPQUAD(saddr), NIPQUAD(daddr));
1270 #endif
1271         in_dev_put(in_dev);
1272 }
1273
1274 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1275 {
1276         struct rtable *rt = (struct rtable*)dst;
1277         struct dst_entry *ret = dst;
1278
1279         if (rt) {
1280                 if (dst->obsolete) {
1281                         ip_rt_put(rt);
1282                         ret = NULL;
1283                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1284                            rt->u.dst.expires) {
1285                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1286                                                 rt->fl.oif);
1287 #if RT_CACHE_DEBUG >= 1
1288                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1289                                           "%u.%u.%u.%u/%02x dropped\n",
1290                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1291 #endif
1292                         rt_del(hash, rt);
1293                         ret = NULL;
1294                 }
1295         }
1296         return ret;
1297 }
1298
1299 /*
1300  * Algorithm:
1301  *      1. The first ip_rt_redirect_number redirects are sent
1302  *         with exponential backoff, then we stop sending them at all,
1303  *         assuming that the host ignores our redirects.
1304  *      2. If we did not see packets requiring redirects
1305  *         during ip_rt_redirect_silence, we assume that the host
1306  *         forgot redirected route and start to send redirects again.
1307  *
1308  * This algorithm is much cheaper and more intelligent than dumb load limiting
1309  * in icmp.c.
1310  *
1311  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1312  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1313  */
1314
1315 void ip_rt_send_redirect(struct sk_buff *skb)
1316 {
1317         struct rtable *rt = (struct rtable*)skb->dst;
1318         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1319
1320         if (!in_dev)
1321                 return;
1322
1323         if (!IN_DEV_TX_REDIRECTS(in_dev))
1324                 goto out;
1325
1326         /* No redirected packets during ip_rt_redirect_silence;
1327          * reset the algorithm.
1328          */
1329         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1330                 rt->u.dst.rate_tokens = 0;
1331
1332         /* Too many ignored redirects; do not send anything
1333          * set u.dst.rate_last to the last seen redirected packet.
1334          */
1335         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1336                 rt->u.dst.rate_last = jiffies;
1337                 goto out;
1338         }
1339
1340         /* Check for load limit; set rate_last to the latest sent
1341          * redirect.
1342          */
1343         if (rt->u.dst.rate_tokens == 0 ||
1344             time_after(jiffies,
1345                        (rt->u.dst.rate_last +
1346                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1347                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1348                 rt->u.dst.rate_last = jiffies;
1349                 ++rt->u.dst.rate_tokens;
1350 #ifdef CONFIG_IP_ROUTE_VERBOSE
1351                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1352                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1353                     net_ratelimit())
1354                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1355                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1356                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1357                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1358 #endif
1359         }
1360 out:
1361         in_dev_put(in_dev);
1362 }
1363
1364 static int ip_error(struct sk_buff *skb)
1365 {
1366         struct rtable *rt = (struct rtable*)skb->dst;
1367         unsigned long now;
1368         int code;
1369
1370         switch (rt->u.dst.error) {
1371                 case EINVAL:
1372                 default:
1373                         goto out;
1374                 case EHOSTUNREACH:
1375                         code = ICMP_HOST_UNREACH;
1376                         break;
1377                 case ENETUNREACH:
1378                         code = ICMP_NET_UNREACH;
1379                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1380                         break;
1381                 case EACCES:
1382                         code = ICMP_PKT_FILTERED;
1383                         break;
1384         }
1385
1386         now = jiffies;
1387         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1388         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1389                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1390         rt->u.dst.rate_last = now;
1391         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1392                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1393                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1394         }
1395
1396 out:    kfree_skb(skb);
1397         return 0;
1398 }
1399
/*
 *	MTU plateau table, in descending order.
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296,
	216, 128
};
1407
1408 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1409 {
1410         int i;
1411
1412         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1413                 if (old_mtu > mtu_plateau[i])
1414                         return mtu_plateau[i];
1415         return 68;
1416 }
1417
1418 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1419 {
1420         int i;
1421         unsigned short old_mtu = ntohs(iph->tot_len);
1422         struct rtable *rth;
1423         __be32  skeys[2] = { iph->saddr, 0, };
1424         __be32  daddr = iph->daddr;
1425         unsigned short est_mtu = 0;
1426
1427         if (ipv4_config.no_pmtu_disc)
1428                 return 0;
1429
1430         for (i = 0; i < 2; i++) {
1431                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1432
1433                 rcu_read_lock();
1434                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1435                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1436                         if (rth->fl.fl4_dst == daddr &&
1437                             rth->fl.fl4_src == skeys[i] &&
1438                             rth->rt_dst  == daddr &&
1439                             rth->rt_src  == iph->saddr &&
1440                             rth->fl.iif == 0 &&
1441                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1442                                 unsigned short mtu = new_mtu;
1443
1444                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1445
1446                                         /* BSD 4.2 compatibility hack :-( */
1447                                         if (mtu == 0 &&
1448                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1449                                             old_mtu >= 68 + (iph->ihl << 2))
1450                                                 old_mtu -= iph->ihl << 2;
1451
1452                                         mtu = guess_mtu(old_mtu);
1453                                 }
1454                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1455                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1456                                                 dst_confirm(&rth->u.dst);
1457                                                 if (mtu < ip_rt_min_pmtu) {
1458                                                         mtu = ip_rt_min_pmtu;
1459                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1460                                                                 (1 << RTAX_MTU);
1461                                                 }
1462                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1463                                                 dst_set_expires(&rth->u.dst,
1464                                                         ip_rt_mtu_expires);
1465                                         }
1466                                         est_mtu = mtu;
1467                                 }
1468                         }
1469                 }
1470                 rcu_read_unlock();
1471         }
1472         return est_mtu ? : new_mtu;
1473 }
1474
1475 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1476 {
1477         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1478             !(dst_metric_locked(dst, RTAX_MTU))) {
1479                 if (mtu < ip_rt_min_pmtu) {
1480                         mtu = ip_rt_min_pmtu;
1481                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1482                 }
1483                 dst->metrics[RTAX_MTU-1] = mtu;
1484                 dst_set_expires(dst, ip_rt_mtu_expires);
1485                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1486         }
1487 }
1488
/*
 * Unconditionally returns NULL; neither @dst nor @cookie is examined.
 * NOTE(review): presumably installed as the ->check hook of ipv4_dst_ops
 * so that callers re-validating a cached entry always fall back to a
 * fresh route lookup -- confirm against the dst_ops definition.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
1493
1494 static void ipv4_dst_destroy(struct dst_entry *dst)
1495 {
1496         struct rtable *rt = (struct rtable *) dst;
1497         struct inet_peer *peer = rt->peer;
1498         struct in_device *idev = rt->idev;
1499
1500         if (peer) {
1501                 rt->peer = NULL;
1502                 inet_putpeer(peer);
1503         }
1504
1505         if (idev) {
1506                 rt->idev = NULL;
1507                 in_dev_put(idev);
1508         }
1509 }
1510
1511 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1512                             int how)
1513 {
1514         struct rtable *rt = (struct rtable *) dst;
1515         struct in_device *idev = rt->idev;
1516         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1517                 struct in_device *loopback_idev =
1518                         in_dev_get(dev->nd_net->loopback_dev);
1519                 if (loopback_idev) {
1520                         rt->idev = loopback_idev;
1521                         in_dev_put(idev);
1522                 }
1523         }
1524 }
1525
1526 static void ipv4_link_failure(struct sk_buff *skb)
1527 {
1528         struct rtable *rt;
1529
1530         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1531
1532         rt = (struct rtable *) skb->dst;
1533         if (rt)
1534                 dst_set_expires(&rt->u.dst, 0);
1535 }
1536
1537 static int ip_rt_bug(struct sk_buff *skb)
1538 {
1539         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1540                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1541                 skb->dev ? skb->dev->name : "?");
1542         kfree_skb(skb);
1543         return 0;
1544 }
1545
1546 /*
1547    We do not cache source address of outgoing interface,
1548    because it is used only by IP RR, TS and SRR options,
1549    so that it out of fast path.
1550
1551    BTW remember: "addr" is allowed to be not aligned
1552    in IP options!
1553  */
1554
1555 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1556 {
1557         __be32 src;
1558         struct fib_result res;
1559
1560         if (rt->fl.iif == 0)
1561                 src = rt->rt_src;
1562         else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1563                 src = FIB_RES_PREFSRC(res);
1564                 fib_res_put(&res);
1565         } else
1566                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1567                                         RT_SCOPE_UNIVERSE);
1568         memcpy(addr, &src, 4);
1569 }
1570
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid one 16-bit half at a time: a half
 * is taken from @tag only while that half of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->u.dst;

	if ((dst->tclassid & 0xFFFF) == 0)
		dst->tclassid |= tag & 0xFFFF;

	if ((dst->tclassid & 0xFFFF0000) == 0)
		dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1580
1581 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1582 {
1583         struct fib_info *fi = res->fi;
1584
1585         if (fi) {
1586                 if (FIB_RES_GW(*res) &&
1587                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1588                         rt->rt_gateway = FIB_RES_GW(*res);
1589                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1590                        sizeof(rt->u.dst.metrics));
1591                 if (fi->fib_mtu == 0) {
1592                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1593                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1594                             rt->rt_gateway != rt->rt_dst &&
1595                             rt->u.dst.dev->mtu > 576)
1596                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1597                 }
1598 #ifdef CONFIG_NET_CLS_ROUTE
1599                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1600 #endif
1601         } else
1602                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1603
1604         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1605                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1606         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1607                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1608         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1609                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1610                                        ip_rt_min_advmss);
1611         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1612                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1613
1614 #ifdef CONFIG_NET_CLS_ROUTE
1615 #ifdef CONFIG_IP_MULTIPLE_TABLES
1616         set_class_tag(rt, fib_rules_tclass(res));
1617 #endif
1618         set_class_tag(rt, itag);
1619 #endif
1620         rt->rt_type = res->type;
1621 }
1622
1623 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1624                                 u8 tos, struct net_device *dev, int our)
1625 {
1626         unsigned hash;
1627         struct rtable *rth;
1628         __be32 spec_dst;
1629         struct in_device *in_dev = in_dev_get(dev);
1630         u32 itag = 0;
1631
1632         /* Primary sanity checks. */
1633
1634         if (in_dev == NULL)
1635                 return -EINVAL;
1636
1637         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1638             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1639                 goto e_inval;
1640
1641         if (ipv4_is_zeronet(saddr)) {
1642                 if (!ipv4_is_local_multicast(daddr))
1643                         goto e_inval;
1644                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1645         } else if (fib_validate_source(saddr, 0, tos, 0,
1646                                         dev, &spec_dst, &itag) < 0)
1647                 goto e_inval;
1648
1649         rth = dst_alloc(&ipv4_dst_ops);
1650         if (!rth)
1651                 goto e_nobufs;
1652
1653         rth->u.dst.output= ip_rt_bug;
1654
1655         atomic_set(&rth->u.dst.__refcnt, 1);
1656         rth->u.dst.flags= DST_HOST;
1657         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1658                 rth->u.dst.flags |= DST_NOPOLICY;
1659         rth->fl.fl4_dst = daddr;
1660         rth->rt_dst     = daddr;
1661         rth->fl.fl4_tos = tos;
1662         rth->fl.mark    = skb->mark;
1663         rth->fl.fl4_src = saddr;
1664         rth->rt_src     = saddr;
1665 #ifdef CONFIG_NET_CLS_ROUTE
1666         rth->u.dst.tclassid = itag;
1667 #endif
1668         rth->rt_iif     =
1669         rth->fl.iif     = dev->ifindex;
1670         rth->u.dst.dev  = init_net.loopback_dev;
1671         dev_hold(rth->u.dst.dev);
1672         rth->idev       = in_dev_get(rth->u.dst.dev);
1673         rth->fl.oif     = 0;
1674         rth->rt_gateway = daddr;
1675         rth->rt_spec_dst= spec_dst;
1676         rth->rt_type    = RTN_MULTICAST;
1677         rth->rt_flags   = RTCF_MULTICAST;
1678         if (our) {
1679                 rth->u.dst.input= ip_local_deliver;
1680                 rth->rt_flags |= RTCF_LOCAL;
1681         }
1682
1683 #ifdef CONFIG_IP_MROUTE
1684         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1685                 rth->u.dst.input = ip_mr_input;
1686 #endif
1687         RT_CACHE_STAT_INC(in_slow_mc);
1688
1689         in_dev_put(in_dev);
1690         hash = rt_hash(daddr, saddr, dev->ifindex);
1691         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1692
1693 e_nobufs:
1694         in_dev_put(in_dev);
1695         return -ENOBUFS;
1696
1697 e_inval:
1698         in_dev_put(in_dev);
1699         return -EINVAL;
1700 }
1701
1702
1703 static void ip_handle_martian_source(struct net_device *dev,
1704                                      struct in_device *in_dev,
1705                                      struct sk_buff *skb,
1706                                      __be32 daddr,
1707                                      __be32 saddr)
1708 {
1709         RT_CACHE_STAT_INC(in_martian_src);
1710 #ifdef CONFIG_IP_ROUTE_VERBOSE
1711         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1712                 /*
1713                  *      RFC1812 recommendation, if source is martian,
1714                  *      the only hint is MAC header.
1715                  */
1716                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1717                         "%u.%u.%u.%u, on dev %s\n",
1718                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1719                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1720                         int i;
1721                         const unsigned char *p = skb_mac_header(skb);
1722                         printk(KERN_WARNING "ll header: ");
1723                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1724                                 printk("%02x", *p);
1725                                 if (i < (dev->hard_header_len - 1))
1726                                         printk(":");
1727                         }
1728                         printk("\n");
1729                 }
1730         }
1731 #endif
1732 }
1733
1734 static inline int __mkroute_input(struct sk_buff *skb,
1735                                   struct fib_result* res,
1736                                   struct in_device *in_dev,
1737                                   __be32 daddr, __be32 saddr, u32 tos,
1738                                   struct rtable **result)
1739 {
1740
1741         struct rtable *rth;
1742         int err;
1743         struct in_device *out_dev;
1744         unsigned flags = 0;
1745         __be32 spec_dst;
1746         u32 itag;
1747
1748         /* get a working reference to the output device */
1749         out_dev = in_dev_get(FIB_RES_DEV(*res));
1750         if (out_dev == NULL) {
1751                 if (net_ratelimit())
1752                         printk(KERN_CRIT "Bug in ip_route_input" \
1753                                "_slow(). Please, report\n");
1754                 return -EINVAL;
1755         }
1756
1757
1758         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1759                                   in_dev->dev, &spec_dst, &itag);
1760         if (err < 0) {
1761                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1762                                          saddr);
1763
1764                 err = -EINVAL;
1765                 goto cleanup;
1766         }
1767
1768         if (err)
1769                 flags |= RTCF_DIRECTSRC;
1770
1771         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1772             (IN_DEV_SHARED_MEDIA(out_dev) ||
1773              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1774                 flags |= RTCF_DOREDIRECT;
1775
1776         if (skb->protocol != htons(ETH_P_IP)) {
1777                 /* Not IP (i.e. ARP). Do not create route, if it is
1778                  * invalid for proxy arp. DNAT routes are always valid.
1779                  */
1780                 if (out_dev == in_dev) {
1781                         err = -EINVAL;
1782                         goto cleanup;
1783                 }
1784         }
1785
1786
1787         rth = dst_alloc(&ipv4_dst_ops);
1788         if (!rth) {
1789                 err = -ENOBUFS;
1790                 goto cleanup;
1791         }
1792
1793         atomic_set(&rth->u.dst.__refcnt, 1);
1794         rth->u.dst.flags= DST_HOST;
1795         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1796                 rth->u.dst.flags |= DST_NOPOLICY;
1797         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1798                 rth->u.dst.flags |= DST_NOXFRM;
1799         rth->fl.fl4_dst = daddr;
1800         rth->rt_dst     = daddr;
1801         rth->fl.fl4_tos = tos;
1802         rth->fl.mark    = skb->mark;
1803         rth->fl.fl4_src = saddr;
1804         rth->rt_src     = saddr;
1805         rth->rt_gateway = daddr;
1806         rth->rt_iif     =
1807                 rth->fl.iif     = in_dev->dev->ifindex;
1808         rth->u.dst.dev  = (out_dev)->dev;
1809         dev_hold(rth->u.dst.dev);
1810         rth->idev       = in_dev_get(rth->u.dst.dev);
1811         rth->fl.oif     = 0;
1812         rth->rt_spec_dst= spec_dst;
1813
1814         rth->u.dst.input = ip_forward;
1815         rth->u.dst.output = ip_output;
1816
1817         rt_set_nexthop(rth, res, itag);
1818
1819         rth->rt_flags = flags;
1820
1821         *result = rth;
1822         err = 0;
1823  cleanup:
1824         /* release the working reference to the output device */
1825         in_dev_put(out_dev);
1826         return err;
1827 }
1828
1829 static inline int ip_mkroute_input(struct sk_buff *skb,
1830                                    struct fib_result* res,
1831                                    const struct flowi *fl,
1832                                    struct in_device *in_dev,
1833                                    __be32 daddr, __be32 saddr, u32 tos)
1834 {
1835         struct rtable* rth = NULL;
1836         int err;
1837         unsigned hash;
1838
1839 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1840         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1841                 fib_select_multipath(fl, res);
1842 #endif
1843
1844         /* create a routing cache entry */
1845         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1846         if (err)
1847                 return err;
1848
1849         /* put it into the cache */
1850         hash = rt_hash(daddr, saddr, fl->iif);
1851         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1852 }
1853
1854 /*
1855  *      NOTE. We drop all packets that have a local source
1856  *      address, because every properly looped back packet
1857  *      must already have the correct destination attached by the output routine.
1858  *
1859  *      This approach solves two big problems:
1860  *      1. Non-simplex devices are handled properly.
1861  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1862  */
1863
/*
 * ip_route_input_slow - resolve an input route without using the cache.
 *
 * Rejects martian source and destination addresses, looks the flow up
 * with fib_lookup() in the namespace of @dev, and then either passes a
 * unicast result to ip_mkroute_input() or builds a local, broadcast or
 * unreachable entry here and interns it with rt_intern_hash().
 *
 * Returns the result of ip_mkroute_input()/rt_intern_hash(), or
 * -EINVAL (no in_device, martian source, non-IP broadcast),
 * -EHOSTUNREACH (martian destination, forwarding disabled) or
 * -ENOBUFS (allocation failure).
 */
1864 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1865                                u8 tos, struct net_device *dev)
1866 {
1867         struct fib_result res;
1868         struct in_device *in_dev = in_dev_get(dev);
1869         struct flowi fl = { .nl_u = { .ip4_u =
1870                                       { .daddr = daddr,
1871                                         .saddr = saddr,
1872                                         .tos = tos,
1873                                         .scope = RT_SCOPE_UNIVERSE,
1874                                       } },
1875                             .mark = skb->mark,
1876                             .iif = dev->ifindex };
1877         unsigned        flags = 0;
1878         u32             itag = 0;
1879         struct rtable * rth;
1880         unsigned        hash;
1881         __be32          spec_dst;
1882         int             err = -EINVAL;
1883         int             free_res = 0;
1884         struct net    * net = dev->nd_net;       /* namespace of the receiving device */
1885
1886         /* IP on this device is disabled. */
1887
1888         if (!in_dev)
1889                 goto out;
1890
1891         /* Check for the most weird martians, which can be not detected
1892            by fib_lookup.
1893          */
1894
1895         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1896             ipv4_is_loopback(saddr))
1897                 goto martian_source;
1898
             /* Limited broadcast, or both addresses zero: handled as broadcast input. */
1899         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1900                 goto brd_input;
1901
1902         /* Accept zero addresses only to limited broadcast;
1903          * I even do not know to fix it or not. Waiting for complains :-)
1904          */
1905         if (ipv4_is_zeronet(saddr))
1906                 goto martian_source;
1907
1908         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1909             ipv4_is_loopback(daddr))
1910                 goto martian_destination;
1911
1912         /*
1913          *      Now we are ready to route packet.
1914          */
1915         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1916                 if (!IN_DEV_FORWARD(in_dev))
1917                         goto e_hostunreach;
1918                 goto no_route;
1919         }
1920         free_res = 1;   /* from here on "done" releases res with fib_res_put() */
1921
1922         RT_CACHE_STAT_INC(in_slow_tot);
1923
1924         if (res.type == RTN_BROADCAST)
1925                 goto brd_input;
1926
1927         if (res.type == RTN_LOCAL) {
1928                 int result;
1929                 result = fib_validate_source(saddr, daddr, tos,
1930                                              net->loopback_dev->ifindex,
1931                                              dev, &spec_dst, &itag);
1932                 if (result < 0)
1933                         goto martian_source;
1934                 if (result)
1935                         flags |= RTCF_DIRECTSRC;
1936                 spec_dst = daddr;       /* overrides whatever fib_validate_source() stored */
1937                 goto local_input;
1938         }
1939
1940         if (!IN_DEV_FORWARD(in_dev))
1941                 goto e_hostunreach;
1942         if (res.type != RTN_UNICAST)
1943                 goto martian_destination;
1944
1945         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
     /* Common exit: drop the in_dev reference and, if held, the FIB result. */
1946 done:
1947         in_dev_put(in_dev);
1948         if (free_res)
1949                 fib_res_put(&res);
1950 out:    return err;
1951
1952 brd_input:
1953         if (skb->protocol != htons(ETH_P_IP))
1954                 goto e_inval;
1955
1956         if (ipv4_is_zeronet(saddr))
1957                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1958         else {
1959                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1960                                           &itag);
1961                 if (err < 0)
1962                         goto martian_source;
1963                 if (err)
1964                         flags |= RTCF_DIRECTSRC;
1965         }
1966         flags |= RTCF_BROADCAST;
1967         res.type = RTN_BROADCAST;
1968         RT_CACHE_STAT_INC(in_brd);
1969
     /* brd_input falls through; RTN_LOCAL and no_route jump here. */
1970 local_input:
1971         rth = dst_alloc(&ipv4_dst_ops);
1972         if (!rth)
1973                 goto e_nobufs;
1974
1975         rth->u.dst.output= ip_rt_bug;
1976
1977         atomic_set(&rth->u.dst.__refcnt, 1);
1978         rth->u.dst.flags= DST_HOST;
1979         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1980                 rth->u.dst.flags |= DST_NOPOLICY;
1981         rth->fl.fl4_dst = daddr;
1982         rth->rt_dst     = daddr;
1983         rth->fl.fl4_tos = tos;
1984         rth->fl.mark    = skb->mark;
1985         rth->fl.fl4_src = saddr;
1986         rth->rt_src     = saddr;
1987 #ifdef CONFIG_NET_CLS_ROUTE
1988         rth->u.dst.tclassid = itag;
1989 #endif
1990         rth->rt_iif     =
1991         rth->fl.iif     = dev->ifindex;
1992         rth->u.dst.dev  = net->loopback_dev;
1993         dev_hold(rth->u.dst.dev);
1994         rth->idev       = in_dev_get(rth->u.dst.dev);
1995         rth->rt_gateway = daddr;
1996         rth->rt_spec_dst= spec_dst;
1997         rth->u.dst.input= ip_local_deliver;
1998         rth->rt_flags   = flags|RTCF_LOCAL;
             /* Set by no_route: the entry reports an error instead of delivering. */
1999         if (res.type == RTN_UNREACHABLE) {
2000                 rth->u.dst.input= ip_error;
2001                 rth->u.dst.error= -err;
2002                 rth->rt_flags   &= ~RTCF_LOCAL;
2003         }
2004         rth->rt_type    = res.type;
2005         hash = rt_hash(daddr, saddr, fl.iif);
2006         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2007         goto done;
2008
2009 no_route:
2010         RT_CACHE_STAT_INC(in_no_route);
2011         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2012         res.type = RTN_UNREACHABLE;
2013         if (err == -ESRCH)
2014                 err = -ENETUNREACH;
2015         goto local_input;
2016
2017         /*
2018          *      Do not cache martian addresses: they should be logged (RFC1812)
2019          */
2020 martian_destination:
2021         RT_CACHE_STAT_INC(in_martian_dst);
2022 #ifdef CONFIG_IP_ROUTE_VERBOSE
2023         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2024                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2025                         "%u.%u.%u.%u, dev %s\n",
2026                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2027 #endif
2028
     /* martian_destination falls through to here. */
2029 e_hostunreach:
2030         err = -EHOSTUNREACH;
2031         goto done;
2032
2033 e_inval:
2034         err = -EINVAL;
2035         goto done;
2036
2037 e_nobufs:
2038         err = -ENOBUFS;
2039         goto done;
2040
2041 martian_source:
2042         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2043         goto e_inval;
2044 }
2045
2046 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2047                    u8 tos, struct net_device *dev)
2048 {
2049         struct rtable * rth;
2050         unsigned        hash;
2051         int iif = dev->ifindex;
2052
2053         tos &= IPTOS_RT_MASK;
2054         hash = rt_hash(daddr, saddr, iif);
2055
2056         rcu_read_lock();
2057         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2058              rth = rcu_dereference(rth->u.dst.rt_next)) {
2059                 if (rth->fl.fl4_dst == daddr &&
2060                     rth->fl.fl4_src == saddr &&
2061                     rth->fl.iif == iif &&
2062                     rth->fl.oif == 0 &&
2063                     rth->fl.mark == skb->mark &&
2064                     rth->fl.fl4_tos == tos) {
2065                         dst_use(&rth->u.dst, jiffies);
2066                         RT_CACHE_STAT_INC(in_hit);
2067                         rcu_read_unlock();
2068                         skb->dst = (struct dst_entry*)rth;
2069                         return 0;
2070                 }
2071                 RT_CACHE_STAT_INC(in_hlist_search);
2072         }
2073         rcu_read_unlock();
2074
2075         /* Multicast recognition logic is moved from route cache to here.
2076            The problem was that too many Ethernet cards have broken/missing
2077            hardware multicast filters :-( As result the host on multicasting
2078            network acquires a lot of useless route cache entries, sort of
2079            SDR messages from all the world. Now we try to get rid of them.
2080            Really, provided software IP multicast filter is organized
2081            reasonably (at least, hashed), it does not result in a slowdown
2082            comparing with route cache reject entries.
2083            Note, that multicast routers are not affected, because
2084            route cache entry is created eventually.
2085          */
2086         if (ipv4_is_multicast(daddr)) {
2087                 struct in_device *in_dev;
2088
2089                 rcu_read_lock();
2090                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2091                         int our = ip_check_mc(in_dev, daddr, saddr,
2092                                 ip_hdr(skb)->protocol);
2093                         if (our
2094 #ifdef CONFIG_IP_MROUTE
2095                             || (!ipv4_is_local_multicast(daddr) &&
2096                                 IN_DEV_MFORWARD(in_dev))
2097 #endif
2098                             ) {
2099                                 rcu_read_unlock();
2100                                 return ip_route_input_mc(skb, daddr, saddr,
2101                                                          tos, dev, our);
2102                         }
2103                 }
2104                 rcu_read_unlock();
2105                 return -EINVAL;
2106         }
2107         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2108 }
2109
2110 static inline int __mkroute_output(struct rtable **result,
2111                                    struct fib_result* res,
2112                                    const struct flowi *fl,
2113                                    const struct flowi *oldflp,
2114                                    struct net_device *dev_out,
2115                                    unsigned flags)
2116 {
2117         struct rtable *rth;
2118         struct in_device *in_dev;
2119         u32 tos = RT_FL_TOS(oldflp);
2120         int err = 0;
2121
2122         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2123                 return -EINVAL;
2124
2125         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2126                 res->type = RTN_BROADCAST;
2127         else if (ipv4_is_multicast(fl->fl4_dst))
2128                 res->type = RTN_MULTICAST;
2129         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2130                 return -EINVAL;
2131
2132         if (dev_out->flags & IFF_LOOPBACK)
2133                 flags |= RTCF_LOCAL;
2134
2135         /* get work reference to inet device */
2136         in_dev = in_dev_get(dev_out);
2137         if (!in_dev)
2138                 return -EINVAL;
2139
2140         if (res->type == RTN_BROADCAST) {
2141                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2142                 if (res->fi) {
2143                         fib_info_put(res->fi);
2144                         res->fi = NULL;
2145                 }
2146         } else if (res->type == RTN_MULTICAST) {
2147                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2148                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2149                                  oldflp->proto))
2150                         flags &= ~RTCF_LOCAL;
2151                 /* If multicast route do not exist use
2152                    default one, but do not gateway in this case.
2153                    Yes, it is hack.
2154                  */
2155                 if (res->fi && res->prefixlen < 4) {
2156                         fib_info_put(res->fi);
2157                         res->fi = NULL;
2158                 }
2159         }
2160
2161
2162         rth = dst_alloc(&ipv4_dst_ops);
2163         if (!rth) {
2164                 err = -ENOBUFS;
2165                 goto cleanup;
2166         }
2167
2168         atomic_set(&rth->u.dst.__refcnt, 1);
2169         rth->u.dst.flags= DST_HOST;
2170         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2171                 rth->u.dst.flags |= DST_NOXFRM;
2172         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2173                 rth->u.dst.flags |= DST_NOPOLICY;
2174
2175         rth->fl.fl4_dst = oldflp->fl4_dst;
2176         rth->fl.fl4_tos = tos;
2177         rth->fl.fl4_src = oldflp->fl4_src;
2178         rth->fl.oif     = oldflp->oif;
2179         rth->fl.mark    = oldflp->mark;
2180         rth->rt_dst     = fl->fl4_dst;
2181         rth->rt_src     = fl->fl4_src;
2182         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2183         /* get references to the devices that are to be hold by the routing
2184            cache entry */
2185         rth->u.dst.dev  = dev_out;
2186         dev_hold(dev_out);
2187         rth->idev       = in_dev_get(dev_out);
2188         rth->rt_gateway = fl->fl4_dst;
2189         rth->rt_spec_dst= fl->fl4_src;
2190
2191         rth->u.dst.output=ip_output;
2192
2193         RT_CACHE_STAT_INC(out_slow_tot);
2194
2195         if (flags & RTCF_LOCAL) {
2196                 rth->u.dst.input = ip_local_deliver;
2197                 rth->rt_spec_dst = fl->fl4_dst;
2198         }
2199         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2200                 rth->rt_spec_dst = fl->fl4_src;
2201                 if (flags & RTCF_LOCAL &&
2202                     !(dev_out->flags & IFF_LOOPBACK)) {
2203                         rth->u.dst.output = ip_mc_output;
2204                         RT_CACHE_STAT_INC(out_slow_mc);
2205                 }
2206 #ifdef CONFIG_IP_MROUTE
2207                 if (res->type == RTN_MULTICAST) {
2208                         if (IN_DEV_MFORWARD(in_dev) &&
2209                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2210                                 rth->u.dst.input = ip_mr_input;
2211                                 rth->u.dst.output = ip_mc_output;
2212                         }
2213                 }
2214 #endif
2215         }
2216
2217         rt_set_nexthop(rth, res, 0);
2218
2219         rth->rt_flags = flags;
2220
2221         *result = rth;
2222  cleanup:
2223         /* release work reference to inet device */
2224         in_dev_put(in_dev);
2225
2226         return err;
2227 }
2228
2229 static inline int ip_mkroute_output(struct rtable **rp,
2230                                     struct fib_result* res,
2231                                     const struct flowi *fl,
2232                                     const struct flowi *oldflp,
2233                                     struct net_device *dev_out,
2234                                     unsigned flags)
2235 {
2236         struct rtable *rth = NULL;
2237         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2238         unsigned hash;
2239         if (err == 0) {
2240                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2241                 err = rt_intern_hash(hash, rth, rp);
2242         }
2243
2244         return err;
2245 }
2246
2247 /*
2248  * Major route resolver routine.
2249  */
2250
/*
 * ip_route_output_slow - resolve an output route for the key @oldflp
 * without using the cache.
 *
 * Works on a local copy (fl) of the caller's key, filling in source
 * address, destination and output interface as they become known,
 * selects the output device and passes everything to
 * ip_mkroute_output() together with @rp.  All device and FIB lookups
 * in this function are made against init_net.
 *
 * Returns the result of ip_mkroute_output(), or -EINVAL (unusable or
 * non-local source address), -ENODEV (unknown oif, or oif without an
 * in_device) or -ENETUNREACH (no route and no oif given).
 */
2251 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2252 {
2253         u32 tos = RT_FL_TOS(oldflp);
2254         struct flowi fl = { .nl_u = { .ip4_u =
2255                                       { .daddr = oldflp->fl4_dst,
2256                                         .saddr = oldflp->fl4_src,
2257                                         .tos = tos & IPTOS_RT_MASK,
2258                                         .scope = ((tos & RTO_ONLINK) ?
2259                                                   RT_SCOPE_LINK :
2260                                                   RT_SCOPE_UNIVERSE),
2261                                       } },
2262                             .mark = oldflp->mark,
2263                             .iif = init_net.loopback_dev->ifindex,
2264                             .oif = oldflp->oif };
2265         struct fib_result res;
2266         unsigned flags = 0;
2267         struct net_device *dev_out = NULL;
2268         int free_res = 0;
2269         int err;
2270
2271
             /* Some paths reach make_route without calling fib_lookup(),
                so clear the pointers in res up front. */
2272         res.fi          = NULL;
2273 #ifdef CONFIG_IP_MULTIPLE_TABLES
2274         res.r           = NULL;
2275 #endif
2276
2277         if (oldflp->fl4_src) {
2278                 err = -EINVAL;
2279                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2280                     ipv4_is_lbcast(oldflp->fl4_src) ||
2281                     ipv4_is_zeronet(oldflp->fl4_src))
2282                         goto out;
2283
2284                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2285                 dev_out = ip_dev_find(&init_net, oldflp->fl4_src);
2286                 if (dev_out == NULL)
2287                         goto out;       /* err is still -EINVAL */
2288
2289                 /* I removed check for oif == dev_out->oif here.
2290                    It was wrong for two reasons:
2291                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2292                       is assigned to multiple interfaces.
2293                    2. Moreover, we are allowed to send packets with saddr
2294                       of another iface. --ANK
2295                  */
2296
2297                 if (oldflp->oif == 0
2298                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2299                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2300                         /* Special hack: user can direct multicasts
2301                            and limited broadcast via necessary interface
2302                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2303                            This hack is not just for fun, it allows
2304                            vic,vat and friends to work.
2305                            They bind socket to loopback, set ttl to zero
2306                            and expect that it will work.
2307                            From the viewpoint of routing cache they are broken,
2308                            because we are not allowed to build multicast path
2309                            with loopback source addr (look, routing cache
2310                            cannot know, that ttl is zero, so that packet
2311                            will not leave this host and route is valid).
2312                            Luckily, this hack is good workaround.
2313                          */
2314
2315                         fl.oif = dev_out->ifindex;
2316                         goto make_route;
2317                 }
                     /* The device found for the source address is not kept. */
2318                 if (dev_out)
2319                         dev_put(dev_out);
2320                 dev_out = NULL;
2321         }
2322
2323
2324         if (oldflp->oif) {
2325                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2326                 err = -ENODEV;
2327                 if (dev_out == NULL)
2328                         goto out;
2329
2330                 /* RACE: Check return value of inet_select_addr instead. */
2331                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2332                         dev_put(dev_out);
2333                         goto out;       /* Wrong error code */
2334                 }
2335
2336                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2337                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2338                         if (!fl.fl4_src)
2339                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2340                                                               RT_SCOPE_LINK);
2341                         goto make_route;
2342                 }
2343                 if (!fl.fl4_src) {
2344                         if (ipv4_is_multicast(oldflp->fl4_dst))
2345                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2346                                                               fl.fl4_scope);
2347                         else if (!oldflp->fl4_dst)
2348                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2349                                                               RT_SCOPE_HOST);
2350                 }
2351         }
2352
             /* No destination: use the source (or 127.0.0.1) and route via loopback. */
2353         if (!fl.fl4_dst) {
2354                 fl.fl4_dst = fl.fl4_src;
2355                 if (!fl.fl4_dst)
2356                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2357                 if (dev_out)
2358                         dev_put(dev_out);
2359                 dev_out = init_net.loopback_dev;
2360                 dev_hold(dev_out);
2361                 fl.oif = init_net.loopback_dev->ifindex;
2362                 res.type = RTN_LOCAL;
2363                 flags |= RTCF_LOCAL;
2364                 goto make_route;
2365         }
2366
2367         if (fib_lookup(&init_net, &fl, &res)) {
2368                 res.fi = NULL;
2369                 if (oldflp->oif) {
2370                         /* Apparently, routing tables are wrong. Assume,
2371                            that the destination is on link.
2372
2373                            WHY? DW.
2374                            Because we are allowed to send to iface
2375                            even if it has NO routes and NO assigned
2376                            addresses. When oif is specified, routing
2377                            tables are looked up with only one purpose:
2378                            to catch if destination is gatewayed, rather than
2379                            direct. Moreover, if MSG_DONTROUTE is set,
2380                            we send packet, ignoring both routing tables
2381                            and ifaddr state. --ANK
2382
2383
2384                            We could make it even if oif is unknown,
2385                            likely IPv6, but we do not.
2386                          */
2387
2388                         if (fl.fl4_src == 0)
2389                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2390                                                               RT_SCOPE_LINK);
2391                         res.type = RTN_UNICAST;
2392                         goto make_route;
2393                 }
2394                 if (dev_out)
2395                         dev_put(dev_out);
2396                 err = -ENETUNREACH;
2397                 goto out;
2398         }
2399         free_res = 1;   /* res must be released with fib_res_put() before returning */
2400
             /* Destination is a local address: the route goes through loopback. */
2401         if (res.type == RTN_LOCAL) {
2402                 if (!fl.fl4_src)
2403                         fl.fl4_src = fl.fl4_dst;
2404                 if (dev_out)
2405                         dev_put(dev_out);
2406                 dev_out = init_net.loopback_dev;
2407                 dev_hold(dev_out);
2408                 fl.oif = dev_out->ifindex;
2409                 if (res.fi)
2410                         fib_info_put(res.fi);
2411                 res.fi = NULL;
2412                 flags |= RTCF_LOCAL;
2413                 goto make_route;
2414         }
2415
2416 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2417         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2418                 fib_select_multipath(&fl, &res);
2419         else
2420 #endif
2421         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2422                 fib_select_default(&init_net, &fl, &res);
2423
2424         if (!fl.fl4_src)
2425                 fl.fl4_src = FIB_RES_PREFSRC(res);
2426
             /* Switch to the device of the selected next hop. */
2427         if (dev_out)
2428                 dev_put(dev_out);
2429         dev_out = FIB_RES_DEV(res);
2430         dev_hold(dev_out);
2431         fl.oif = dev_out->ifindex;
2432
2433
     /* Every path that reaches this label holds a reference on dev_out. */
2434 make_route:
2435         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2436
2437
2438         if (free_res)
2439                 fib_res_put(&res);
2440         if (dev_out)
2441                 dev_put(dev_out);
2442 out:    return err;
2443 }
2444
2445 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2446 {
2447         unsigned hash;
2448         struct rtable *rth;
2449
2450         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2451
2452         rcu_read_lock_bh();
2453         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2454                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2455                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2456                     rth->fl.fl4_src == flp->fl4_src &&
2457                     rth->fl.iif == 0 &&
2458                     rth->fl.oif == flp->oif &&
2459                     rth->fl.mark == flp->mark &&
2460                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2461                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2462                         dst_use(&rth->u.dst, jiffies);
2463                         RT_CACHE_STAT_INC(out_hit);
2464                         rcu_read_unlock_bh();
2465                         *rp = rth;
2466                         return 0;
2467                 }
2468                 RT_CACHE_STAT_INC(out_hlist_search);
2469         }
2470         rcu_read_unlock_bh();
2471
2472         return ip_route_output_slow(rp, flp);
2473 }
2474
2475 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2476
2477 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2478 {
2479 }
2480
2481 static struct dst_ops ipv4_dst_blackhole_ops = {
2482         .family                 =       AF_INET,
2483         .protocol               =       __constant_htons(ETH_P_IP),
2484         .destroy                =       ipv4_dst_destroy,
2485         .check                  =       ipv4_dst_check,
2486         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2487         .entry_size             =       sizeof(struct rtable),
2488 };
2489
2490
2491 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2492 {
2493         struct rtable *ort = *rp;
2494         struct rtable *rt = (struct rtable *)
2495                 dst_alloc(&ipv4_dst_blackhole_ops);
2496
2497         if (rt) {
2498                 struct dst_entry *new = &rt->u.dst;
2499
2500                 atomic_set(&new->__refcnt, 1);
2501                 new->__use = 1;
2502                 new->input = dst_discard;
2503                 new->output = dst_discard;
2504                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2505
2506                 new->dev = ort->u.dst.dev;
2507                 if (new->dev)
2508                         dev_hold(new->dev);
2509
2510                 rt->fl = ort->fl;
2511
2512                 rt->idev = ort->idev;
2513                 if (rt->idev)
2514                         in_dev_hold(rt->idev);
2515                 rt->rt_flags = ort->rt_flags;
2516                 rt->rt_type = ort->rt_type;
2517                 rt->rt_dst = ort->rt_dst;
2518                 rt->rt_src = ort->rt_src;
2519                 rt->rt_iif = ort->rt_iif;
2520                 rt->rt_gateway = ort->rt_gateway;
2521                 rt->rt_spec_dst = ort->rt_spec_dst;
2522                 rt->peer = ort->peer;
2523                 if (rt->peer)
2524                         atomic_inc(&rt->peer->refcnt);
2525
2526                 dst_free(new);
2527         }
2528
2529         dst_release(&(*rp)->u.dst);
2530         *rp = rt;
2531         return (rt ? 0 : -ENOMEM);
2532 }
2533
2534 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2535 {
2536         int err;
2537
2538         if ((err = __ip_route_output_key(rp, flp)) != 0)
2539                 return err;
2540
2541         if (flp->proto) {
2542                 if (!flp->fl4_src)
2543                         flp->fl4_src = (*rp)->rt_src;
2544                 if (!flp->fl4_dst)
2545                         flp->fl4_dst = (*rp)->rt_dst;
2546                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2547                                     flags ? XFRM_LOOKUP_WAIT : 0);
2548                 if (err == -EREMOTE)
2549                         err = ipv4_dst_blackhole(rp, flp, sk);
2550
2551                 return err;
2552         }
2553
2554         return 0;
2555 }
2556
2557 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2558
2559 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2560 {
2561         return ip_route_output_flow(rp, flp, NULL, 0);
2562 }
2563
2564 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2565                         int nowait, unsigned int flags)
2566 {
2567         struct rtable *rt = (struct rtable*)skb->dst;
2568         struct rtmsg *r;
2569         struct nlmsghdr *nlh;
2570         long expires;
2571         u32 id = 0, ts = 0, tsage = 0, error;
2572
2573         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2574         if (nlh == NULL)
2575                 return -EMSGSIZE;
2576
2577         r = nlmsg_data(nlh);
2578         r->rtm_family    = AF_INET;
2579         r->rtm_dst_len  = 32;
2580         r->rtm_src_len  = 0;
2581         r->rtm_tos      = rt->fl.fl4_tos;
2582         r->rtm_table    = RT_TABLE_MAIN;
2583         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2584         r->rtm_type     = rt->rt_type;
2585         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2586         r->rtm_protocol = RTPROT_UNSPEC;
2587         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2588         if (rt->rt_flags & RTCF_NOTIFY)
2589                 r->rtm_flags |= RTM_F_NOTIFY;
2590
2591         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2592
2593         if (rt->fl.fl4_src) {
2594                 r->rtm_src_len = 32;
2595                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2596         }
2597         if (rt->u.dst.dev)
2598                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2599 #ifdef CONFIG_NET_CLS_ROUTE
2600         if (rt->u.dst.tclassid)
2601                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2602 #endif
2603         if (rt->fl.iif)
2604                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2605         else if (rt->rt_src != rt->fl.fl4_src)
2606                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2607
2608         if (rt->rt_dst != rt->rt_gateway)
2609                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2610
2611         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2612                 goto nla_put_failure;
2613
2614         error = rt->u.dst.error;
2615         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2616         if (rt->peer) {
2617                 id = rt->peer->ip_id_count;
2618                 if (rt->peer->tcp_ts_stamp) {
2619                         ts = rt->peer->tcp_ts;
2620                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2621                 }
2622         }
2623
2624         if (rt->fl.iif) {
2625 #ifdef CONFIG_IP_MROUTE
2626                 __be32 dst = rt->rt_dst;
2627
2628                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2629                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2630                         int err = ipmr_get_route(skb, r, nowait);
2631                         if (err <= 0) {
2632                                 if (!nowait) {
2633                                         if (err == 0)
2634                                                 return 0;
2635                                         goto nla_put_failure;
2636                                 } else {
2637                                         if (err == -EMSGSIZE)
2638                                                 goto nla_put_failure;
2639                                         error = err;
2640                                 }
2641                         }
2642                 } else
2643 #endif
2644                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2645         }
2646
2647         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2648                                expires, error) < 0)
2649                 goto nla_put_failure;
2650
2651         return nlmsg_end(skb, nlh);
2652
2653 nla_put_failure:
2654         nlmsg_cancel(skb, nlh);
2655         return -EMSGSIZE;
2656 }
2657
2658 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2659 {
2660         struct net *net = in_skb->sk->sk_net;
2661         struct rtmsg *rtm;
2662         struct nlattr *tb[RTA_MAX+1];
2663         struct rtable *rt = NULL;
2664         __be32 dst = 0;
2665         __be32 src = 0;
2666         u32 iif;
2667         int err;
2668         struct sk_buff *skb;
2669
2670         if (net != &init_net)
2671                 return -EINVAL;
2672
2673         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2674         if (err < 0)
2675                 goto errout;
2676
2677         rtm = nlmsg_data(nlh);
2678
2679         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2680         if (skb == NULL) {
2681                 err = -ENOBUFS;
2682                 goto errout;
2683         }
2684
2685         /* Reserve room for dummy headers, this skb can pass
2686            through good chunk of routing engine.
2687          */
2688         skb_reset_mac_header(skb);
2689         skb_reset_network_header(skb);
2690
2691         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2692         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2693         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2694
2695         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2696         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2697         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2698
2699         if (iif) {
2700                 struct net_device *dev;
2701
2702                 dev = __dev_get_by_index(&init_net, iif);
2703                 if (dev == NULL) {
2704                         err = -ENODEV;
2705                         goto errout_free;
2706                 }
2707
2708                 skb->protocol   = htons(ETH_P_IP);
2709                 skb->dev        = dev;
2710                 local_bh_disable();
2711                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2712                 local_bh_enable();
2713
2714                 rt = (struct rtable*) skb->dst;
2715                 if (err == 0 && rt->u.dst.error)
2716                         err = -rt->u.dst.error;
2717         } else {
2718                 struct flowi fl = {
2719                         .nl_u = {
2720                                 .ip4_u = {
2721                                         .daddr = dst,
2722                                         .saddr = src,
2723                                         .tos = rtm->rtm_tos,
2724                                 },
2725                         },
2726                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2727                 };
2728                 err = ip_route_output_key(&rt, &fl);
2729         }
2730
2731         if (err)
2732                 goto errout_free;
2733
2734         skb->dst = &rt->u.dst;
2735         if (rtm->rtm_flags & RTM_F_NOTIFY)
2736                 rt->rt_flags |= RTCF_NOTIFY;
2737
2738         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2739                                 RTM_NEWROUTE, 0, 0);
2740         if (err <= 0)
2741                 goto errout_free;
2742
2743         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2744 errout:
2745         return err;
2746
2747 errout_free:
2748         kfree_skb(skb);
2749         goto errout;
2750 }
2751
2752 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2753 {
2754         struct rtable *rt;
2755         int h, s_h;
2756         int idx, s_idx;
2757
2758         s_h = cb->args[0];
2759         if (s_h < 0)
2760                 s_h = 0;
2761         s_idx = idx = cb->args[1];
2762         for (h = s_h; h <= rt_hash_mask; h++) {
2763                 rcu_read_lock_bh();
2764                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2765                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2766                         if (idx < s_idx)
2767                                 continue;
2768                         skb->dst = dst_clone(&rt->u.dst);
2769                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2770                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2771                                          1, NLM_F_MULTI) <= 0) {
2772                                 dst_release(xchg(&skb->dst, NULL));
2773                                 rcu_read_unlock_bh();
2774                                 goto done;
2775                         }
2776                         dst_release(xchg(&skb->dst, NULL));
2777                 }
2778                 rcu_read_unlock_bh();
2779                 s_idx = 0;
2780         }
2781
2782 done:
2783         cb->args[0] = h;
2784         cb->args[1] = idx;
2785         return skb->len;
2786 }
2787
/*
 * Multicast event hook: requests a route cache flush with delay 0.
 * @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2792
2793 #ifdef CONFIG_SYSCTL
2794 static int flush_delay;
2795
2796 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2797                                         struct file *filp, void __user *buffer,
2798                                         size_t *lenp, loff_t *ppos)
2799 {
2800         if (write) {
2801                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2802                 rt_cache_flush(flush_delay);
2803                 return 0;
2804         }
2805
2806         return -EINVAL;
2807 }
2808
2809 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2810                                                 int __user *name,
2811                                                 int nlen,
2812                                                 void __user *oldval,
2813                                                 size_t __user *oldlenp,
2814                                                 void __user *newval,
2815                                                 size_t newlen)
2816 {
2817         int delay;
2818         if (newlen != sizeof(int))
2819                 return -EINVAL;
2820         if (get_user(delay, (int __user *)newval))
2821                 return -EFAULT;
2822         rt_cache_flush(delay);
2823         return 0;
2824 }
2825
2826 ctl_table ipv4_route_table[] = {
2827         {
2828                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2829                 .procname       = "flush",
2830                 .data           = &flush_delay,
2831                 .maxlen         = sizeof(int),
2832                 .mode           = 0200,
2833                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2834                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2835         },
2836         {
2837                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2838                 .procname       = "min_delay",
2839                 .data           = &ip_rt_min_delay,
2840                 .maxlen         = sizeof(int),
2841                 .mode           = 0644,
2842                 .proc_handler   = &proc_dointvec_jiffies,
2843                 .strategy       = &sysctl_jiffies,
2844         },
2845         {
2846                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2847                 .procname       = "max_delay",
2848                 .data           = &ip_rt_max_delay,
2849                 .maxlen         = sizeof(int),
2850                 .mode           = 0644,
2851                 .proc_handler   = &proc_dointvec_jiffies,
2852                 .strategy       = &sysctl_jiffies,
2853         },
2854         {
2855                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2856                 .procname       = "gc_thresh",
2857                 .data           = &ipv4_dst_ops.gc_thresh,
2858                 .maxlen         = sizeof(int),
2859                 .mode           = 0644,
2860                 .proc_handler   = &proc_dointvec,
2861         },
2862         {
2863                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2864                 .procname       = "max_size",
2865                 .data           = &ip_rt_max_size,
2866                 .maxlen         = sizeof(int),
2867                 .mode           = 0644,
2868                 .proc_handler   = &proc_dointvec,
2869         },
2870         {
2871                 /*  Deprecated. Use gc_min_interval_ms */
2872
2873                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2874                 .procname       = "gc_min_interval",
2875                 .data           = &ip_rt_gc_min_interval,
2876                 .maxlen         = sizeof(int),
2877                 .mode           = 0644,
2878                 .proc_handler   = &proc_dointvec_jiffies,
2879                 .strategy       = &sysctl_jiffies,
2880         },
2881         {
2882                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2883                 .procname       = "gc_min_interval_ms",
2884                 .data           = &ip_rt_gc_min_interval,
2885                 .maxlen         = sizeof(int),
2886                 .mode           = 0644,
2887                 .proc_handler   = &proc_dointvec_ms_jiffies,
2888                 .strategy       = &sysctl_ms_jiffies,
2889         },
2890         {
2891                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2892                 .procname       = "gc_timeout",
2893                 .data           = &ip_rt_gc_timeout,
2894                 .maxlen         = sizeof(int),
2895                 .mode           = 0644,
2896                 .proc_handler   = &proc_dointvec_jiffies,
2897                 .strategy       = &sysctl_jiffies,
2898         },
2899         {
2900                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2901                 .procname       = "gc_interval",
2902                 .data           = &ip_rt_gc_interval,
2903                 .maxlen         = sizeof(int),
2904                 .mode           = 0644,
2905                 .proc_handler   = &proc_dointvec_jiffies,
2906                 .strategy       = &sysctl_jiffies,
2907         },
2908         {
2909                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2910                 .procname       = "redirect_load",
2911                 .data           = &ip_rt_redirect_load,
2912                 .maxlen         = sizeof(int),
2913                 .mode           = 0644,
2914                 .proc_handler   = &proc_dointvec,
2915         },
2916         {
2917                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2918                 .procname       = "redirect_number",
2919                 .data           = &ip_rt_redirect_number,
2920                 .maxlen         = sizeof(int),
2921                 .mode           = 0644,
2922                 .proc_handler   = &proc_dointvec,
2923         },
2924         {
2925                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2926                 .procname       = "redirect_silence",
2927                 .data           = &ip_rt_redirect_silence,
2928                 .maxlen         = sizeof(int),
2929                 .mode           = 0644,
2930                 .proc_handler   = &proc_dointvec,
2931         },
2932         {
2933                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2934                 .procname       = "error_cost",
2935                 .data           = &ip_rt_error_cost,
2936                 .maxlen         = sizeof(int),
2937                 .mode           = 0644,
2938                 .proc_handler   = &proc_dointvec,
2939         },
2940         {
2941                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2942                 .procname       = "error_burst",
2943                 .data           = &ip_rt_error_burst,
2944                 .maxlen         = sizeof(int),
2945                 .mode           = 0644,
2946                 .proc_handler   = &proc_dointvec,
2947         },
2948         {
2949                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2950                 .procname       = "gc_elasticity",
2951                 .data           = &ip_rt_gc_elasticity,
2952                 .maxlen         = sizeof(int),
2953                 .mode           = 0644,
2954                 .proc_handler   = &proc_dointvec,
2955         },
2956         {
2957                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2958                 .procname       = "mtu_expires",
2959                 .data           = &ip_rt_mtu_expires,
2960                 .maxlen         = sizeof(int),
2961                 .mode           = 0644,
2962                 .proc_handler   = &proc_dointvec_jiffies,
2963                 .strategy       = &sysctl_jiffies,
2964         },
2965         {
2966                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2967                 .procname       = "min_pmtu",
2968                 .data           = &ip_rt_min_pmtu,
2969                 .maxlen         = sizeof(int),
2970                 .mode           = 0644,
2971                 .proc_handler   = &proc_dointvec,
2972         },
2973         {
2974                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2975                 .procname       = "min_adv_mss",
2976                 .data           = &ip_rt_min_advmss,
2977                 .maxlen         = sizeof(int),
2978                 .mode           = 0644,
2979                 .proc_handler   = &proc_dointvec,
2980         },
2981         {
2982                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2983                 .procname       = "secret_interval",
2984                 .data           = &ip_rt_secret_interval,
2985                 .maxlen         = sizeof(int),
2986                 .mode           = 0644,
2987                 .proc_handler   = &proc_dointvec_jiffies,
2988                 .strategy       = &sysctl_jiffies,
2989         },
2990         { .ctl_name = 0 }
2991 };
2992 #endif
2993
2994 #ifdef CONFIG_NET_CLS_ROUTE
2995 struct ip_rt_acct *ip_rt_acct __read_mostly;
2996 #endif /* CONFIG_NET_CLS_ROUTE */
2997
2998 static __initdata unsigned long rhash_entries;
2999 static int __init set_rhash_entries(char *str)
3000 {
3001         if (!str)
3002                 return 0;
3003         rhash_entries = simple_strtoul(str, &str, 0);
3004         return 1;
3005 }
3006 __setup("rhash_entries=", set_rhash_entries);
3007
3008 int __init ip_rt_init(void)
3009 {
3010         int rc = 0;
3011
3012         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3013                              (jiffies ^ (jiffies >> 7)));
3014
3015 #ifdef CONFIG_NET_CLS_ROUTE
3016         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3017         if (!ip_rt_acct)
3018                 panic("IP: failed to allocate ip_rt_acct\n");
3019 #endif
3020
3021         ipv4_dst_ops.kmem_cachep =
3022                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3023                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3024
3025         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3026
3027         rt_hash_table = (struct rt_hash_bucket *)
3028                 alloc_large_system_hash("IP route cache",
3029                                         sizeof(struct rt_hash_bucket),
3030                                         rhash_entries,
3031                                         (num_physpages >= 128 * 1024) ?
3032                                         15 : 17,
3033                                         0,
3034                                         &rt_hash_log,
3035                                         &rt_hash_mask,
3036                                         0);
3037         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3038         rt_hash_lock_init();
3039
3040         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3041         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3042
3043         devinet_init();
3044         ip_fib_init();
3045
3046         setup_timer(&rt_flush_timer, rt_run_flush, 0);
3047         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3048
3049         /* All the timers, started at system startup tend
3050            to synchronize. Perturb it a bit.
3051          */
3052         schedule_delayed_work(&expires_work,
3053                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3054
3055         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3056                 ip_rt_secret_interval;
3057         add_timer(&rt_secret_timer);
3058
3059         if (ip_rt_proc_init(&init_net))
3060                 printk(KERN_ERR "Unable to create route proc files\n");
3061 #ifdef CONFIG_XFRM
3062         xfrm_init();
3063         xfrm4_init();
3064 #endif
3065         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3066
3067         return rc;
3068 }
3069
3070 EXPORT_SYMBOL(__ip_select_ident);
3071 EXPORT_SYMBOL(ip_route_input);
3072 EXPORT_SYMBOL(ip_route_output_key);