[IPV4] ROUTE: Convert rt_hash_lock_init() macro into function
[safe/jmp/linux-2.6] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
/*
 * Extract the routing-relevant TOS bits (IPTOS_RT_MASK plus the RTO_ONLINK
 * flag) from the flow pointed to by @oldflp.  The argument is parenthesized
 * so that any pointer expression can be passed safely.
 */
#define RT_FL_TOS(oldflp) \
    ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_min_delay              = 2 * HZ;
121 static int ip_rt_max_delay              = 10 * HZ;
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval            = 60 * HZ;
125 static int ip_rt_gc_min_interval        = HZ / 2;
126 static int ip_rt_redirect_number        = 9;
127 static int ip_rt_redirect_load          = HZ / 50;
128 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost             = HZ;
130 static int ip_rt_error_burst            = 5 * HZ;
131 static int ip_rt_gc_elasticity          = 8;
132 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu               = 512 + 20 + 20;
134 static int ip_rt_min_advmss             = 256;
135 static int ip_rt_secret_interval        = 10 * 60 * HZ;
136 static int ip_rt_flush_expected;
137 static unsigned long rt_deadline;
138
139 #define RTprint(a...)   printk(KERN_DEBUG a)
140
141 static struct timer_list rt_flush_timer;
142 static void rt_worker_func(struct work_struct *work);
143 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
144 static struct timer_list rt_secret_timer;
145
146 /*
147  *      Interface to generic destination cache.
148  */
149
150 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151 static void              ipv4_dst_destroy(struct dst_entry *dst);
152 static void              ipv4_dst_ifdown(struct dst_entry *dst,
153                                          struct net_device *dev, int how);
154 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155 static void              ipv4_link_failure(struct sk_buff *skb);
156 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157 static int rt_garbage_collect(void);
158
159
160 static struct dst_ops ipv4_dst_ops = {
161         .family =               AF_INET,
162         .protocol =             __constant_htons(ETH_P_IP),
163         .gc =                   rt_garbage_collect,
164         .check =                ipv4_dst_check,
165         .destroy =              ipv4_dst_destroy,
166         .ifdown =               ipv4_dst_ifdown,
167         .negative_advice =      ipv4_negative_advice,
168         .link_failure =         ipv4_link_failure,
169         .update_pmtu =          ip_rt_update_pmtu,
170         .local_out =            ip_local_out,
171         .entry_size =           sizeof(struct rtable),
172 };
173
174 #define ECN_OR_COST(class)      TC_PRIO_##class
175
176 const __u8 ip_tos2prio[16] = {
177         TC_PRIO_BESTEFFORT,
178         ECN_OR_COST(FILLER),
179         TC_PRIO_BESTEFFORT,
180         ECN_OR_COST(BESTEFFORT),
181         TC_PRIO_BULK,
182         ECN_OR_COST(BULK),
183         TC_PRIO_BULK,
184         ECN_OR_COST(BULK),
185         TC_PRIO_INTERACTIVE,
186         ECN_OR_COST(INTERACTIVE),
187         TC_PRIO_INTERACTIVE,
188         ECN_OR_COST(INTERACTIVE),
189         TC_PRIO_INTERACTIVE_BULK,
190         ECN_OR_COST(INTERACTIVE_BULK),
191         TC_PRIO_INTERACTIVE_BULK,
192         ECN_OR_COST(INTERACTIVE_BULK)
193 };
194
195
196 /*
197  * Route cache.
198  */
199
200 /* The locking scheme is rather straight forward:
201  *
202  * 1) Read-Copy Update protects the buckets of the central route hash.
203  * 2) Only writers remove entries, and they hold the lock
204  *    as they look at rtable reference counts.
205  * 3) Only readers acquire references to rtable entries,
206  *    they do so with atomic increments and with the
207  *    lock held.
208  */
209
210 struct rt_hash_bucket {
211         struct rtable   *chain;
212 };
213 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
214         defined(CONFIG_PROVE_LOCKING)
215 /*
216  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
217  * The size of this table is a power of two and depends on the number of CPUS.
218  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
219  */
220 #ifdef CONFIG_LOCKDEP
221 # define RT_HASH_LOCK_SZ        256
222 #else
223 # if NR_CPUS >= 32
224 #  define RT_HASH_LOCK_SZ       4096
225 # elif NR_CPUS >= 16
226 #  define RT_HASH_LOCK_SZ       2048
227 # elif NR_CPUS >= 8
228 #  define RT_HASH_LOCK_SZ       1024
229 # elif NR_CPUS >= 4
230 #  define RT_HASH_LOCK_SZ       512
231 # else
232 #  define RT_HASH_LOCK_SZ       256
233 # endif
234 #endif
235
236 static spinlock_t       *rt_hash_locks;
/* Map a hash slot to its spinlock; the whole expansion is parenthesized. */
# define rt_hash_lock_addr(slot) (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])
238
239 static __init void rt_hash_lock_init(void)
240 {
241         int i;
242
243         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
244                         GFP_KERNEL);
245         if (!rt_hash_locks)
246                 panic("IP: failed to allocate rt_hash_locks\n");
247
248         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
249                 spin_lock_init(&rt_hash_locks[i]);
250 }
251 #else
252 # define rt_hash_lock_addr(slot) NULL
253
/* No lock table exists in this configuration, so there is nothing to set up. */
static inline void rt_hash_lock_init(void)
{
        /* intentionally empty */
}
257 #endif
258
259 static struct rt_hash_bucket    *rt_hash_table;
260 static unsigned                 rt_hash_mask;
261 static unsigned int             rt_hash_log;
262 static unsigned int             rt_hash_rnd;
263
264 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
265 #define RT_CACHE_STAT_INC(field) \
266         (__raw_get_cpu_var(rt_cache_stat).field++)
267
268 static int rt_intern_hash(unsigned hash, struct rtable *rth,
269                                 struct rtable **res);
270
271 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
272 {
273         return (jhash_2words(daddr, saddr, rt_hash_rnd)
274                 & rt_hash_mask);
275 }
276
277 #define rt_hash(daddr, saddr, idx) \
278         rt_hash_code((__force u32)(__be32)(daddr),\
279                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
280
281 #ifdef CONFIG_PROC_FS
282 struct rt_cache_iter_state {
283         int bucket;
284 };
285
286 static struct rtable *rt_cache_get_first(struct seq_file *seq)
287 {
288         struct rtable *r = NULL;
289         struct rt_cache_iter_state *st = seq->private;
290
291         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
292                 rcu_read_lock_bh();
293                 r = rt_hash_table[st->bucket].chain;
294                 if (r)
295                         break;
296                 rcu_read_unlock_bh();
297         }
298         return rcu_dereference(r);
299 }
300
301 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
302 {
303         struct rt_cache_iter_state *st = seq->private;
304
305         r = r->u.dst.rt_next;
306         while (!r) {
307                 rcu_read_unlock_bh();
308                 if (--st->bucket < 0)
309                         break;
310                 rcu_read_lock_bh();
311                 r = rt_hash_table[st->bucket].chain;
312         }
313         return rcu_dereference(r);
314 }
315
316 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
317 {
318         struct rtable *r = rt_cache_get_first(seq);
319
320         if (r)
321                 while (pos && (r = rt_cache_get_next(seq, r)))
322                         --pos;
323         return pos ? NULL : r;
324 }
325
326 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
327 {
328         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
329 }
330
331 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
332 {
333         struct rtable *r = NULL;
334
335         if (v == SEQ_START_TOKEN)
336                 r = rt_cache_get_first(seq);
337         else
338                 r = rt_cache_get_next(seq, v);
339         ++*pos;
340         return r;
341 }
342
343 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
344 {
345         if (v && v != SEQ_START_TOKEN)
346                 rcu_read_unlock_bh();
347 }
348
349 static int rt_cache_seq_show(struct seq_file *seq, void *v)
350 {
351         if (v == SEQ_START_TOKEN)
352                 seq_printf(seq, "%-127s\n",
353                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
354                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
355                            "HHUptod\tSpecDst");
356         else {
357                 struct rtable *r = v;
358                 char temp[256];
359
360                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
361                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
362                         r->u.dst.dev ? r->u.dst.dev->name : "*",
363                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
364                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
365                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
366                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
367                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
368                         dst_metric(&r->u.dst, RTAX_WINDOW),
369                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
370                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
371                         r->fl.fl4_tos,
372                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
373                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
374                                        dev_queue_xmit) : 0,
375                         r->rt_spec_dst);
376                 seq_printf(seq, "%-127s\n", temp);
377         }
378         return 0;
379 }
380
381 static const struct seq_operations rt_cache_seq_ops = {
382         .start  = rt_cache_seq_start,
383         .next   = rt_cache_seq_next,
384         .stop   = rt_cache_seq_stop,
385         .show   = rt_cache_seq_show,
386 };
387
388 static int rt_cache_seq_open(struct inode *inode, struct file *file)
389 {
390         return seq_open_private(file, &rt_cache_seq_ops,
391                         sizeof(struct rt_cache_iter_state));
392 }
393
394 static const struct file_operations rt_cache_seq_fops = {
395         .owner   = THIS_MODULE,
396         .open    = rt_cache_seq_open,
397         .read    = seq_read,
398         .llseek  = seq_lseek,
399         .release = seq_release_private,
400 };
401
402
403 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
404 {
405         int cpu;
406
407         if (*pos == 0)
408                 return SEQ_START_TOKEN;
409
410         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
411                 if (!cpu_possible(cpu))
412                         continue;
413                 *pos = cpu+1;
414                 return &per_cpu(rt_cache_stat, cpu);
415         }
416         return NULL;
417 }
418
419 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
420 {
421         int cpu;
422
423         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
424                 if (!cpu_possible(cpu))
425                         continue;
426                 *pos = cpu+1;
427                 return &per_cpu(rt_cache_stat, cpu);
428         }
429         return NULL;
430
431 }
432
/* seq_file ->stop for the per-CPU statistics: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
        /* intentionally empty */
}
437
438 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
439 {
440         struct rt_cache_stat *st = v;
441
442         if (v == SEQ_START_TOKEN) {
443                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
444                 return 0;
445         }
446
447         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
448                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
449                    atomic_read(&ipv4_dst_ops.entries),
450                    st->in_hit,
451                    st->in_slow_tot,
452                    st->in_slow_mc,
453                    st->in_no_route,
454                    st->in_brd,
455                    st->in_martian_dst,
456                    st->in_martian_src,
457
458                    st->out_hit,
459                    st->out_slow_tot,
460                    st->out_slow_mc,
461
462                    st->gc_total,
463                    st->gc_ignored,
464                    st->gc_goal_miss,
465                    st->gc_dst_overflow,
466                    st->in_hlist_search,
467                    st->out_hlist_search
468                 );
469         return 0;
470 }
471
472 static const struct seq_operations rt_cpu_seq_ops = {
473         .start  = rt_cpu_seq_start,
474         .next   = rt_cpu_seq_next,
475         .stop   = rt_cpu_seq_stop,
476         .show   = rt_cpu_seq_show,
477 };
478
479
480 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
481 {
482         return seq_open(file, &rt_cpu_seq_ops);
483 }
484
485 static const struct file_operations rt_cpu_seq_fops = {
486         .owner   = THIS_MODULE,
487         .open    = rt_cpu_seq_open,
488         .read    = seq_read,
489         .llseek  = seq_lseek,
490         .release = seq_release,
491 };
492
493 #ifdef CONFIG_NET_CLS_ROUTE
494 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
495                            int length, int *eof, void *data)
496 {
497         unsigned int i;
498
499         if ((offset & 3) || (length & 3))
500                 return -EIO;
501
502         if (offset >= sizeof(struct ip_rt_acct) * 256) {
503                 *eof = 1;
504                 return 0;
505         }
506
507         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
508                 length = sizeof(struct ip_rt_acct) * 256 - offset;
509                 *eof = 1;
510         }
511
512         offset /= sizeof(u32);
513
514         if (length > 0) {
515                 u32 *dst = (u32 *) buffer;
516
517                 *start = buffer;
518                 memset(dst, 0, length);
519
520                 for_each_possible_cpu(i) {
521                         unsigned int j;
522                         u32 *src;
523
524                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
525                         for (j = 0; j < length/4; j++)
526                                 dst[j] += src[j];
527                 }
528         }
529         return length;
530 }
531 #endif
532
533 static __init int ip_rt_proc_init(struct net *net)
534 {
535         struct proc_dir_entry *pde;
536
537         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
538                         &rt_cache_seq_fops);
539         if (!pde)
540                 goto err1;
541
542         pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
543         if (!pde)
544                 goto err2;
545
546         pde->proc_fops = &rt_cpu_seq_fops;
547
548 #ifdef CONFIG_NET_CLS_ROUTE
549         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
550                         ip_rt_acct_read, NULL);
551         if (!pde)
552                 goto err3;
553 #endif
554         return 0;
555
556 #ifdef CONFIG_NET_CLS_ROUTE
557 err3:
558         remove_proc_entry("rt_cache", net->proc_net_stat);
559 #endif
560 err2:
561         remove_proc_entry("rt_cache", net->proc_net);
562 err1:
563         return -ENOMEM;
564 }
565 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(struct net *net)
{
        return 0;
}
570 #endif /* CONFIG_PROC_FS */
571
572 static __inline__ void rt_free(struct rtable *rt)
573 {
574         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
575 }
576
/*
 * Drop the caller's reference on @rt with ip_rt_put() and then queue the
 * entry for RCU-deferred freeing.
 */
static __inline__ void rt_drop(struct rtable *rt)
{
        ip_rt_put(rt);
        rt_free(rt);
}
582
583 static __inline__ int rt_fast_clean(struct rtable *rth)
584 {
585         /* Kill broadcast/multicast entries very aggresively, if they
586            collide in hash table with more useful entries */
587         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
588                 rth->fl.iif && rth->u.dst.rt_next;
589 }
590
591 static __inline__ int rt_valuable(struct rtable *rth)
592 {
593         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
594                 rth->u.dst.expires;
595 }
596
597 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
598 {
599         unsigned long age;
600         int ret = 0;
601
602         if (atomic_read(&rth->u.dst.__refcnt))
603                 goto out;
604
605         ret = 1;
606         if (rth->u.dst.expires &&
607             time_after_eq(jiffies, rth->u.dst.expires))
608                 goto out;
609
610         age = jiffies - rth->u.dst.lastuse;
611         ret = 0;
612         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
613             (age <= tmo2 && rt_valuable(rth)))
614                 goto out;
615         ret = 1;
616 out:    return ret;
617 }
618
619 /* Bits of score are:
620  * 31: very valuable
621  * 30: not quite useless
622  * 29..0: usage counter
623  */
624 static inline u32 rt_score(struct rtable *rt)
625 {
626         u32 score = jiffies - rt->u.dst.lastuse;
627
628         score = ~score & ~(3<<30);
629
630         if (rt_valuable(rt))
631                 score |= (1<<31);
632
633         if (!rt->fl.iif ||
634             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
635                 score |= (1<<30);
636
637         return score;
638 }
639
640 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
641 {
642         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
643                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
644                 (fl1->mark ^ fl2->mark) |
645                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
646                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
647                 (fl1->oif ^ fl2->oif) |
648                 (fl1->iif ^ fl2->iif)) == 0;
649 }
650
/*
 * Perform a full scan of hash table and free all entries.
 * Can be called by a softirq or a process.
 * In the latter case, we want to reschedule if necessary.
 */
656 static void rt_do_flush(int process_context)
657 {
658         unsigned int i;
659         struct rtable *rth, *next;
660
661         for (i = 0; i <= rt_hash_mask; i++) {
662                 if (process_context && need_resched())
663                         cond_resched();
664                 rth = rt_hash_table[i].chain;
665                 if (!rth)
666                         continue;
667
668                 spin_lock_bh(rt_hash_lock_addr(i));
669                 rth = rt_hash_table[i].chain;
670                 rt_hash_table[i].chain = NULL;
671                 spin_unlock_bh(rt_hash_lock_addr(i));
672
673                 for (; rth; rth = next) {
674                         next = rth->u.dst.rt_next;
675                         rt_free(rth);
676                 }
677         }
678 }
679
680 static void rt_check_expire(void)
681 {
682         static unsigned int rover;
683         unsigned int i = rover, goal;
684         struct rtable *rth, **rthp;
685         u64 mult;
686
687         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
688         if (ip_rt_gc_timeout > 1)
689                 do_div(mult, ip_rt_gc_timeout);
690         goal = (unsigned int)mult;
691         if (goal > rt_hash_mask)
692                 goal = rt_hash_mask + 1;
693         for (; goal > 0; goal--) {
694                 unsigned long tmo = ip_rt_gc_timeout;
695
696                 i = (i + 1) & rt_hash_mask;
697                 rthp = &rt_hash_table[i].chain;
698
699                 if (need_resched())
700                         cond_resched();
701
702                 if (*rthp == NULL)
703                         continue;
704                 spin_lock_bh(rt_hash_lock_addr(i));
705                 while ((rth = *rthp) != NULL) {
706                         if (rth->u.dst.expires) {
707                                 /* Entry is expired even if it is in use */
708                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
709                                         tmo >>= 1;
710                                         rthp = &rth->u.dst.rt_next;
711                                         continue;
712                                 }
713                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
714                                 tmo >>= 1;
715                                 rthp = &rth->u.dst.rt_next;
716                                 continue;
717                         }
718
719                         /* Cleanup aged off entries. */
720                         *rthp = rth->u.dst.rt_next;
721                         rt_free(rth);
722                 }
723                 spin_unlock_bh(rt_hash_lock_addr(i));
724         }
725         rover = i;
726 }
727
728 /*
729  * rt_worker_func() is run in process context.
730  * If a whole flush was scheduled, it is done.
731  * Else, we call rt_check_expire() to scan part of the hash table
732  */
733 static void rt_worker_func(struct work_struct *work)
734 {
735         if (ip_rt_flush_expected) {
736                 ip_rt_flush_expected = 0;
737                 rt_do_flush(1);
738         } else
739                 rt_check_expire();
740         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
741 }
742
743 /* This can run from both BH and non-BH contexts, the latter
744  * in the case of a forced flush event.
745  */
746 static void rt_run_flush(unsigned long process_context)
747 {
748         rt_deadline = 0;
749
750         get_random_bytes(&rt_hash_rnd, 4);
751
752         rt_do_flush(process_context);
753 }
754
755 static DEFINE_SPINLOCK(rt_flush_lock);
756
757 void rt_cache_flush(int delay)
758 {
759         unsigned long now = jiffies;
760         int user_mode = !in_softirq();
761
762         if (delay < 0)
763                 delay = ip_rt_min_delay;
764
765         spin_lock_bh(&rt_flush_lock);
766
767         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
768                 long tmo = (long)(rt_deadline - now);
769
770                 /* If flush timer is already running
771                    and flush request is not immediate (delay > 0):
772
773                    if deadline is not achieved, prolongate timer to "delay",
774                    otherwise fire it at deadline time.
775                  */
776
777                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
778                         tmo = 0;
779
780                 if (delay > tmo)
781                         delay = tmo;
782         }
783
784         if (delay <= 0) {
785                 spin_unlock_bh(&rt_flush_lock);
786                 rt_run_flush(user_mode);
787                 return;
788         }
789
790         if (rt_deadline == 0)
791                 rt_deadline = now + ip_rt_max_delay;
792
793         mod_timer(&rt_flush_timer, now+delay);
794         spin_unlock_bh(&rt_flush_lock);
795 }
796
797 /*
798  * We change rt_hash_rnd and ask next rt_worker_func() invocation
799  * to perform a flush in process context
800  */
801 static void rt_secret_rebuild(unsigned long dummy)
802 {
803         get_random_bytes(&rt_hash_rnd, 4);
804         ip_rt_flush_expected = 1;
805         cancel_delayed_work(&expires_work);
806         schedule_delayed_work(&expires_work, HZ/10);
807         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
808 }
809
/*
   Short description of GC goals.

   We want to build an algorithm that keeps the routing cache
   at some equilibrium point, where the number of aged-off entries
   is kept approximately equal to the number of newly generated ones.

   The current expiration strength is the variable "expire".
   We try to adjust it dynamically, so that if networking
   is idle "expire" is large enough to keep enough warm entries,
   and when load increases it is reduced to limit the cache size.
 */
821  */
822
823 static int rt_garbage_collect(void)
824 {
825         static unsigned long expire = RT_GC_TIMEOUT;
826         static unsigned long last_gc;
827         static int rover;
828         static int equilibrium;
829         struct rtable *rth, **rthp;
830         unsigned long now = jiffies;
831         int goal;
832
833         /*
834          * Garbage collection is pretty expensive,
835          * do not make it too frequently.
836          */
837
838         RT_CACHE_STAT_INC(gc_total);
839
840         if (now - last_gc < ip_rt_gc_min_interval &&
841             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
842                 RT_CACHE_STAT_INC(gc_ignored);
843                 goto out;
844         }
845
846         /* Calculate number of entries, which we want to expire now. */
847         goal = atomic_read(&ipv4_dst_ops.entries) -
848                 (ip_rt_gc_elasticity << rt_hash_log);
849         if (goal <= 0) {
850                 if (equilibrium < ipv4_dst_ops.gc_thresh)
851                         equilibrium = ipv4_dst_ops.gc_thresh;
852                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
853                 if (goal > 0) {
854                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
855                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
856                 }
857         } else {
858                 /* We are in dangerous area. Try to reduce cache really
859                  * aggressively.
860                  */
861                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
862                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
863         }
864
865         if (now - last_gc >= ip_rt_gc_min_interval)
866                 last_gc = now;
867
868         if (goal <= 0) {
869                 equilibrium += goal;
870                 goto work_done;
871         }
872
873         do {
874                 int i, k;
875
876                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
877                         unsigned long tmo = expire;
878
879                         k = (k + 1) & rt_hash_mask;
880                         rthp = &rt_hash_table[k].chain;
881                         spin_lock_bh(rt_hash_lock_addr(k));
882                         while ((rth = *rthp) != NULL) {
883                                 if (!rt_may_expire(rth, tmo, expire)) {
884                                         tmo >>= 1;
885                                         rthp = &rth->u.dst.rt_next;
886                                         continue;
887                                 }
888                                 *rthp = rth->u.dst.rt_next;
889                                 rt_free(rth);
890                                 goal--;
891                         }
892                         spin_unlock_bh(rt_hash_lock_addr(k));
893                         if (goal <= 0)
894                                 break;
895                 }
896                 rover = k;
897
898                 if (goal <= 0)
899                         goto work_done;
900
901                 /* Goal is not achieved. We stop process if:
902
903                    - if expire reduced to zero. Otherwise, expire is halfed.
904                    - if table is not full.
905                    - if we are called from interrupt.
906                    - jiffies check is just fallback/debug loop breaker.
907                      We will not spin here for long time in any case.
908                  */
909
910                 RT_CACHE_STAT_INC(gc_goal_miss);
911
912                 if (expire == 0)
913                         break;
914
915                 expire >>= 1;
916 #if RT_CACHE_DEBUG >= 2
917                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
918                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
919 #endif
920
921                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
922                         goto out;
923         } while (!in_softirq() && time_before_eq(jiffies, now));
924
925         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
926                 goto out;
927         if (net_ratelimit())
928                 printk(KERN_WARNING "dst cache overflow\n");
929         RT_CACHE_STAT_INC(gc_dst_overflow);
930         return 1;
931
932 work_done:
933         expire += ip_rt_gc_min_interval;
934         if (expire > ip_rt_gc_timeout ||
935             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
936                 expire = ip_rt_gc_timeout;
937 #if RT_CACHE_DEBUG >= 2
938         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
939                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
940 #endif
941 out:    return 0;
942 }
943
944 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
945 {
946         struct rtable   *rth, **rthp;
947         unsigned long   now;
948         struct rtable *cand, **candp;
949         u32             min_score;
950         int             chain_length;
951         int attempts = !in_softirq();
952
953 restart:
954         chain_length = 0;
955         min_score = ~(u32)0;
956         cand = NULL;
957         candp = NULL;
958         now = jiffies;
959
960         rthp = &rt_hash_table[hash].chain;
961
962         spin_lock_bh(rt_hash_lock_addr(hash));
963         while ((rth = *rthp) != NULL) {
964                 if (compare_keys(&rth->fl, &rt->fl)) {
965                         /* Put it first */
966                         *rthp = rth->u.dst.rt_next;
967                         /*
968                          * Since lookup is lockfree, the deletion
969                          * must be visible to another weakly ordered CPU before
970                          * the insertion at the start of the hash chain.
971                          */
972                         rcu_assign_pointer(rth->u.dst.rt_next,
973                                            rt_hash_table[hash].chain);
974                         /*
975                          * Since lookup is lockfree, the update writes
976                          * must be ordered for consistency on SMP.
977                          */
978                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
979
980                         dst_use(&rth->u.dst, now);
981                         spin_unlock_bh(rt_hash_lock_addr(hash));
982
983                         rt_drop(rt);
984                         *rp = rth;
985                         return 0;
986                 }
987
988                 if (!atomic_read(&rth->u.dst.__refcnt)) {
989                         u32 score = rt_score(rth);
990
991                         if (score <= min_score) {
992                                 cand = rth;
993                                 candp = rthp;
994                                 min_score = score;
995                         }
996                 }
997
998                 chain_length++;
999
1000                 rthp = &rth->u.dst.rt_next;
1001         }
1002
1003         if (cand) {
1004                 /* ip_rt_gc_elasticity used to be average length of chain
1005                  * length, when exceeded gc becomes really aggressive.
1006                  *
1007                  * The second limit is less certain. At the moment it allows
1008                  * only 2 entries per bucket. We will see.
1009                  */
1010                 if (chain_length > ip_rt_gc_elasticity) {
1011                         *candp = cand->u.dst.rt_next;
1012                         rt_free(cand);
1013                 }
1014         }
1015
1016         /* Try to bind route to arp only if it is output
1017            route or unicast forwarding path.
1018          */
1019         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1020                 int err = arp_bind_neighbour(&rt->u.dst);
1021                 if (err) {
1022                         spin_unlock_bh(rt_hash_lock_addr(hash));
1023
1024                         if (err != -ENOBUFS) {
1025                                 rt_drop(rt);
1026                                 return err;
1027                         }
1028
1029                         /* Neighbour tables are full and nothing
1030                            can be released. Try to shrink route cache,
1031                            it is most likely it holds some neighbour records.
1032                          */
1033                         if (attempts-- > 0) {
1034                                 int saved_elasticity = ip_rt_gc_elasticity;
1035                                 int saved_int = ip_rt_gc_min_interval;
1036                                 ip_rt_gc_elasticity     = 1;
1037                                 ip_rt_gc_min_interval   = 0;
1038                                 rt_garbage_collect();
1039                                 ip_rt_gc_min_interval   = saved_int;
1040                                 ip_rt_gc_elasticity     = saved_elasticity;
1041                                 goto restart;
1042                         }
1043
1044                         if (net_ratelimit())
1045                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1046                         rt_drop(rt);
1047                         return -ENOBUFS;
1048                 }
1049         }
1050
1051         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1052 #if RT_CACHE_DEBUG >= 2
1053         if (rt->u.dst.rt_next) {
1054                 struct rtable *trt;
1055                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1056                        NIPQUAD(rt->rt_dst));
1057                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1058                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1059                 printk("\n");
1060         }
1061 #endif
1062         rt_hash_table[hash].chain = rt;
1063         spin_unlock_bh(rt_hash_lock_addr(hash));
1064         *rp = rt;
1065         return 0;
1066 }
1067
1068 void rt_bind_peer(struct rtable *rt, int create)
1069 {
1070         static DEFINE_SPINLOCK(rt_peer_lock);
1071         struct inet_peer *peer;
1072
1073         peer = inet_getpeer(rt->rt_dst, create);
1074
1075         spin_lock_bh(&rt_peer_lock);
1076         if (rt->peer == NULL) {
1077                 rt->peer = peer;
1078                 peer = NULL;
1079         }
1080         spin_unlock_bh(&rt_peer_lock);
1081         if (peer)
1082                 inet_putpeer(peer);
1083 }
1084
1085 /*
1086  * Peer allocation may fail only in serious out-of-memory conditions.  However
1087  * we still can generate some output.
1088  * Random ID selection looks a bit dangerous because we have no chances to
1089  * select ID being unique in a reasonable period of time.
1090  * But broken packet identifier may be better than no packet at all.
1091  */
1092 static void ip_select_fb_ident(struct iphdr *iph)
1093 {
1094         static DEFINE_SPINLOCK(ip_fb_id_lock);
1095         static u32 ip_fallback_id;
1096         u32 salt;
1097
1098         spin_lock_bh(&ip_fb_id_lock);
1099         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1100         iph->id = htons(salt & 0xFFFF);
1101         ip_fallback_id = salt;
1102         spin_unlock_bh(&ip_fb_id_lock);
1103 }
1104
1105 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1106 {
1107         struct rtable *rt = (struct rtable *) dst;
1108
1109         if (rt) {
1110                 if (rt->peer == NULL)
1111                         rt_bind_peer(rt, 1);
1112
1113                 /* If peer is attached to destination, it is never detached,
1114                    so that we need not to grab a lock to dereference it.
1115                  */
1116                 if (rt->peer) {
1117                         iph->id = htons(inet_getid(rt->peer, more));
1118                         return;
1119                 }
1120         } else
1121                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1122                        __builtin_return_address(0));
1123
1124         ip_select_fb_ident(iph);
1125 }
1126
1127 static void rt_del(unsigned hash, struct rtable *rt)
1128 {
1129         struct rtable **rthp;
1130
1131         spin_lock_bh(rt_hash_lock_addr(hash));
1132         ip_rt_put(rt);
1133         for (rthp = &rt_hash_table[hash].chain; *rthp;
1134              rthp = &(*rthp)->u.dst.rt_next)
1135                 if (*rthp == rt) {
1136                         *rthp = rt->u.dst.rt_next;
1137                         rt_free(rt);
1138                         break;
1139                 }
1140         spin_unlock_bh(rt_hash_lock_addr(hash));
1141 }
1142
1143 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1144                     __be32 saddr, struct net_device *dev)
1145 {
1146         int i, k;
1147         struct in_device *in_dev = in_dev_get(dev);
1148         struct rtable *rth, **rthp;
1149         __be32  skeys[2] = { saddr, 0 };
1150         int  ikeys[2] = { dev->ifindex, 0 };
1151         struct netevent_redirect netevent;
1152
1153         if (!in_dev)
1154                 return;
1155
1156         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1157             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1158                 goto reject_redirect;
1159
1160         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1161                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1162                         goto reject_redirect;
1163                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1164                         goto reject_redirect;
1165         } else {
1166                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1167                         goto reject_redirect;
1168         }
1169
1170         for (i = 0; i < 2; i++) {
1171                 for (k = 0; k < 2; k++) {
1172                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1173
1174                         rthp=&rt_hash_table[hash].chain;
1175
1176                         rcu_read_lock();
1177                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1178                                 struct rtable *rt;
1179
1180                                 if (rth->fl.fl4_dst != daddr ||
1181                                     rth->fl.fl4_src != skeys[i] ||
1182                                     rth->fl.oif != ikeys[k] ||
1183                                     rth->fl.iif != 0) {
1184                                         rthp = &rth->u.dst.rt_next;
1185                                         continue;
1186                                 }
1187
1188                                 if (rth->rt_dst != daddr ||
1189                                     rth->rt_src != saddr ||
1190                                     rth->u.dst.error ||
1191                                     rth->rt_gateway != old_gw ||
1192                                     rth->u.dst.dev != dev)
1193                                         break;
1194
1195                                 dst_hold(&rth->u.dst);
1196                                 rcu_read_unlock();
1197
1198                                 rt = dst_alloc(&ipv4_dst_ops);
1199                                 if (rt == NULL) {
1200                                         ip_rt_put(rth);
1201                                         in_dev_put(in_dev);
1202                                         return;
1203                                 }
1204
1205                                 /* Copy all the information. */
1206                                 *rt = *rth;
1207                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1208                                 rt->u.dst.__use         = 1;
1209                                 atomic_set(&rt->u.dst.__refcnt, 1);
1210                                 rt->u.dst.child         = NULL;
1211                                 if (rt->u.dst.dev)
1212                                         dev_hold(rt->u.dst.dev);
1213                                 if (rt->idev)
1214                                         in_dev_hold(rt->idev);
1215                                 rt->u.dst.obsolete      = 0;
1216                                 rt->u.dst.lastuse       = jiffies;
1217                                 rt->u.dst.path          = &rt->u.dst;
1218                                 rt->u.dst.neighbour     = NULL;
1219                                 rt->u.dst.hh            = NULL;
1220                                 rt->u.dst.xfrm          = NULL;
1221
1222                                 rt->rt_flags            |= RTCF_REDIRECTED;
1223
1224                                 /* Gateway is different ... */
1225                                 rt->rt_gateway          = new_gw;
1226
1227                                 /* Redirect received -> path was valid */
1228                                 dst_confirm(&rth->u.dst);
1229
1230                                 if (rt->peer)
1231                                         atomic_inc(&rt->peer->refcnt);
1232
1233                                 if (arp_bind_neighbour(&rt->u.dst) ||
1234                                     !(rt->u.dst.neighbour->nud_state &
1235                                             NUD_VALID)) {
1236                                         if (rt->u.dst.neighbour)
1237                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1238                                         ip_rt_put(rth);
1239                                         rt_drop(rt);
1240                                         goto do_next;
1241                                 }
1242
1243                                 netevent.old = &rth->u.dst;
1244                                 netevent.new = &rt->u.dst;
1245                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1246                                                         &netevent);
1247
1248                                 rt_del(hash, rth);
1249                                 if (!rt_intern_hash(hash, rt, &rt))
1250                                         ip_rt_put(rt);
1251                                 goto do_next;
1252                         }
1253                         rcu_read_unlock();
1254                 do_next:
1255                         ;
1256                 }
1257         }
1258         in_dev_put(in_dev);
1259         return;
1260
1261 reject_redirect:
1262 #ifdef CONFIG_IP_ROUTE_VERBOSE
1263         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1264                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1265                         "%u.%u.%u.%u ignored.\n"
1266                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1267                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1268                        NIPQUAD(saddr), NIPQUAD(daddr));
1269 #endif
1270         in_dev_put(in_dev);
1271 }
1272
1273 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1274 {
1275         struct rtable *rt = (struct rtable*)dst;
1276         struct dst_entry *ret = dst;
1277
1278         if (rt) {
1279                 if (dst->obsolete) {
1280                         ip_rt_put(rt);
1281                         ret = NULL;
1282                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1283                            rt->u.dst.expires) {
1284                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1285                                                 rt->fl.oif);
1286 #if RT_CACHE_DEBUG >= 1
1287                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1288                                           "%u.%u.%u.%u/%02x dropped\n",
1289                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1290 #endif
1291                         rt_del(hash, rt);
1292                         ret = NULL;
1293                 }
1294         }
1295         return ret;
1296 }
1297
1298 /*
1299  * Algorithm:
1300  *      1. The first ip_rt_redirect_number redirects are sent
1301  *         with exponential backoff, then we stop sending them at all,
1302  *         assuming that the host ignores our redirects.
1303  *      2. If we did not see packets requiring redirects
1304  *         during ip_rt_redirect_silence, we assume that the host
1305  *         forgot redirected route and start to send redirects again.
1306  *
1307  * This algorithm is much cheaper and more intelligent than dumb load limiting
1308  * in icmp.c.
1309  *
1310  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1311  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1312  */
1313
1314 void ip_rt_send_redirect(struct sk_buff *skb)
1315 {
1316         struct rtable *rt = (struct rtable*)skb->dst;
1317         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1318
1319         if (!in_dev)
1320                 return;
1321
1322         if (!IN_DEV_TX_REDIRECTS(in_dev))
1323                 goto out;
1324
1325         /* No redirected packets during ip_rt_redirect_silence;
1326          * reset the algorithm.
1327          */
1328         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1329                 rt->u.dst.rate_tokens = 0;
1330
1331         /* Too many ignored redirects; do not send anything
1332          * set u.dst.rate_last to the last seen redirected packet.
1333          */
1334         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1335                 rt->u.dst.rate_last = jiffies;
1336                 goto out;
1337         }
1338
1339         /* Check for load limit; set rate_last to the latest sent
1340          * redirect.
1341          */
1342         if (rt->u.dst.rate_tokens == 0 ||
1343             time_after(jiffies,
1344                        (rt->u.dst.rate_last +
1345                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1346                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1347                 rt->u.dst.rate_last = jiffies;
1348                 ++rt->u.dst.rate_tokens;
1349 #ifdef CONFIG_IP_ROUTE_VERBOSE
1350                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1351                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1352                     net_ratelimit())
1353                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1354                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1355                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1356                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1357 #endif
1358         }
1359 out:
1360         in_dev_put(in_dev);
1361 }
1362
1363 static int ip_error(struct sk_buff *skb)
1364 {
1365         struct rtable *rt = (struct rtable*)skb->dst;
1366         unsigned long now;
1367         int code;
1368
1369         switch (rt->u.dst.error) {
1370                 case EINVAL:
1371                 default:
1372                         goto out;
1373                 case EHOSTUNREACH:
1374                         code = ICMP_HOST_UNREACH;
1375                         break;
1376                 case ENETUNREACH:
1377                         code = ICMP_NET_UNREACH;
1378                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1379                         break;
1380                 case EACCES:
1381                         code = ICMP_PKT_FILTERED;
1382                         break;
1383         }
1384
1385         now = jiffies;
1386         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1387         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1388                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1389         rt->u.dst.rate_last = now;
1390         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1391                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1392                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1393         }
1394
1395 out:    kfree_skb(skb);
1396         return 0;
1397 }
1398
/*
 * MTU plateau table, in descending order.
 * The last two values are not from the RFC but
 * are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128
};

/*
 * Return the largest plateau strictly below @old_mtu, or 68 when @old_mtu
 * is not above the smallest plateau.
 */
static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
{
	const unsigned short *plateau = mtu_plateau;
	const unsigned short *end = mtu_plateau +
		sizeof(mtu_plateau) / sizeof(mtu_plateau[0]);

	while (plateau < end) {
		if (*plateau < old_mtu)
			return *plateau;
		plateau++;
	}
	return 68;
}
1416
1417 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1418 {
1419         int i;
1420         unsigned short old_mtu = ntohs(iph->tot_len);
1421         struct rtable *rth;
1422         __be32  skeys[2] = { iph->saddr, 0, };
1423         __be32  daddr = iph->daddr;
1424         unsigned short est_mtu = 0;
1425
1426         if (ipv4_config.no_pmtu_disc)
1427                 return 0;
1428
1429         for (i = 0; i < 2; i++) {
1430                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1431
1432                 rcu_read_lock();
1433                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1434                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1435                         if (rth->fl.fl4_dst == daddr &&
1436                             rth->fl.fl4_src == skeys[i] &&
1437                             rth->rt_dst  == daddr &&
1438                             rth->rt_src  == iph->saddr &&
1439                             rth->fl.iif == 0 &&
1440                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1441                                 unsigned short mtu = new_mtu;
1442
1443                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1444
1445                                         /* BSD 4.2 compatibility hack :-( */
1446                                         if (mtu == 0 &&
1447                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1448                                             old_mtu >= 68 + (iph->ihl << 2))
1449                                                 old_mtu -= iph->ihl << 2;
1450
1451                                         mtu = guess_mtu(old_mtu);
1452                                 }
1453                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1454                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1455                                                 dst_confirm(&rth->u.dst);
1456                                                 if (mtu < ip_rt_min_pmtu) {
1457                                                         mtu = ip_rt_min_pmtu;
1458                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1459                                                                 (1 << RTAX_MTU);
1460                                                 }
1461                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1462                                                 dst_set_expires(&rth->u.dst,
1463                                                         ip_rt_mtu_expires);
1464                                         }
1465                                         est_mtu = mtu;
1466                                 }
1467                         }
1468                 }
1469                 rcu_read_unlock();
1470         }
1471         return est_mtu ? : new_mtu;
1472 }
1473
1474 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1475 {
1476         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1477             !(dst_metric_locked(dst, RTAX_MTU))) {
1478                 if (mtu < ip_rt_min_pmtu) {
1479                         mtu = ip_rt_min_pmtu;
1480                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1481                 }
1482                 dst->metrics[RTAX_MTU-1] = mtu;
1483                 dst_set_expires(dst, ip_rt_mtu_expires);
1484                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1485         }
1486 }
1487
1488 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1489 {
1490         return NULL;
1491 }
1492
1493 static void ipv4_dst_destroy(struct dst_entry *dst)
1494 {
1495         struct rtable *rt = (struct rtable *) dst;
1496         struct inet_peer *peer = rt->peer;
1497         struct in_device *idev = rt->idev;
1498
1499         if (peer) {
1500                 rt->peer = NULL;
1501                 inet_putpeer(peer);
1502         }
1503
1504         if (idev) {
1505                 rt->idev = NULL;
1506                 in_dev_put(idev);
1507         }
1508 }
1509
1510 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1511                             int how)
1512 {
1513         struct rtable *rt = (struct rtable *) dst;
1514         struct in_device *idev = rt->idev;
1515         if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1516                 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1517                 if (loopback_idev) {
1518                         rt->idev = loopback_idev;
1519                         in_dev_put(idev);
1520                 }
1521         }
1522 }
1523
1524 static void ipv4_link_failure(struct sk_buff *skb)
1525 {
1526         struct rtable *rt;
1527
1528         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1529
1530         rt = (struct rtable *) skb->dst;
1531         if (rt)
1532                 dst_set_expires(&rt->u.dst, 0);
1533 }
1534
1535 static int ip_rt_bug(struct sk_buff *skb)
1536 {
1537         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1538                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1539                 skb->dev ? skb->dev->name : "?");
1540         kfree_skb(skb);
1541         return 0;
1542 }
1543
1544 /*
1545    We do not cache source address of outgoing interface,
1546    because it is used only by IP RR, TS and SRR options,
1547    so that it out of fast path.
1548
1549    BTW remember: "addr" is allowed to be not aligned
1550    in IP options!
1551  */
1552
1553 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1554 {
1555         __be32 src;
1556         struct fib_result res;
1557
1558         if (rt->fl.iif == 0)
1559                 src = rt->rt_src;
1560         else if (fib_lookup(&rt->fl, &res) == 0) {
1561                 src = FIB_RES_PREFSRC(res);
1562                 fib_res_put(&res);
1563         } else
1564                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1565                                         RT_SCOPE_UNIVERSE);
1566         memcpy(addr, &src, 4);
1567 }
1568
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and the high 16-bit halves
 * are handled independently: a half is taken from @tag only while that half
 * of tclassid is still zero.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->u.dst;

	if ((dst->tclassid & 0xFFFF) == 0)
		dst->tclassid |= tag & 0xFFFF;
	if ((dst->tclassid & 0xFFFF0000) == 0)
		dst->tclassid |= tag & 0xFFFF0000;
}
#endif
1578
1579 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1580 {
1581         struct fib_info *fi = res->fi;
1582
1583         if (fi) {
1584                 if (FIB_RES_GW(*res) &&
1585                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1586                         rt->rt_gateway = FIB_RES_GW(*res);
1587                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1588                        sizeof(rt->u.dst.metrics));
1589                 if (fi->fib_mtu == 0) {
1590                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1591                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1592                             rt->rt_gateway != rt->rt_dst &&
1593                             rt->u.dst.dev->mtu > 576)
1594                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1595                 }
1596 #ifdef CONFIG_NET_CLS_ROUTE
1597                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1598 #endif
1599         } else
1600                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1601
1602         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1603                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1604         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1605                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1606         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1607                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1608                                        ip_rt_min_advmss);
1609         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1610                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1611
1612 #ifdef CONFIG_NET_CLS_ROUTE
1613 #ifdef CONFIG_IP_MULTIPLE_TABLES
1614         set_class_tag(rt, fib_rules_tclass(res));
1615 #endif
1616         set_class_tag(rt, itag);
1617 #endif
1618         rt->rt_type = res->type;
1619 }
1620
1621 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1622                                 u8 tos, struct net_device *dev, int our)
1623 {
1624         unsigned hash;
1625         struct rtable *rth;
1626         __be32 spec_dst;
1627         struct in_device *in_dev = in_dev_get(dev);
1628         u32 itag = 0;
1629
1630         /* Primary sanity checks. */
1631
1632         if (in_dev == NULL)
1633                 return -EINVAL;
1634
1635         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1636             skb->protocol != htons(ETH_P_IP))
1637                 goto e_inval;
1638
1639         if (ZERONET(saddr)) {
1640                 if (!LOCAL_MCAST(daddr))
1641                         goto e_inval;
1642                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1643         } else if (fib_validate_source(saddr, 0, tos, 0,
1644                                         dev, &spec_dst, &itag) < 0)
1645                 goto e_inval;
1646
1647         rth = dst_alloc(&ipv4_dst_ops);
1648         if (!rth)
1649                 goto e_nobufs;
1650
1651         rth->u.dst.output= ip_rt_bug;
1652
1653         atomic_set(&rth->u.dst.__refcnt, 1);
1654         rth->u.dst.flags= DST_HOST;
1655         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1656                 rth->u.dst.flags |= DST_NOPOLICY;
1657         rth->fl.fl4_dst = daddr;
1658         rth->rt_dst     = daddr;
1659         rth->fl.fl4_tos = tos;
1660         rth->fl.mark    = skb->mark;
1661         rth->fl.fl4_src = saddr;
1662         rth->rt_src     = saddr;
1663 #ifdef CONFIG_NET_CLS_ROUTE
1664         rth->u.dst.tclassid = itag;
1665 #endif
1666         rth->rt_iif     =
1667         rth->fl.iif     = dev->ifindex;
1668         rth->u.dst.dev  = init_net.loopback_dev;
1669         dev_hold(rth->u.dst.dev);
1670         rth->idev       = in_dev_get(rth->u.dst.dev);
1671         rth->fl.oif     = 0;
1672         rth->rt_gateway = daddr;
1673         rth->rt_spec_dst= spec_dst;
1674         rth->rt_type    = RTN_MULTICAST;
1675         rth->rt_flags   = RTCF_MULTICAST;
1676         if (our) {
1677                 rth->u.dst.input= ip_local_deliver;
1678                 rth->rt_flags |= RTCF_LOCAL;
1679         }
1680
1681 #ifdef CONFIG_IP_MROUTE
1682         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1683                 rth->u.dst.input = ip_mr_input;
1684 #endif
1685         RT_CACHE_STAT_INC(in_slow_mc);
1686
1687         in_dev_put(in_dev);
1688         hash = rt_hash(daddr, saddr, dev->ifindex);
1689         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1690
1691 e_nobufs:
1692         in_dev_put(in_dev);
1693         return -ENOBUFS;
1694
1695 e_inval:
1696         in_dev_put(in_dev);
1697         return -EINVAL;
1698 }
1699
1700
1701 static void ip_handle_martian_source(struct net_device *dev,
1702                                      struct in_device *in_dev,
1703                                      struct sk_buff *skb,
1704                                      __be32 daddr,
1705                                      __be32 saddr)
1706 {
1707         RT_CACHE_STAT_INC(in_martian_src);
1708 #ifdef CONFIG_IP_ROUTE_VERBOSE
1709         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1710                 /*
1711                  *      RFC1812 recommendation, if source is martian,
1712                  *      the only hint is MAC header.
1713                  */
1714                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1715                         "%u.%u.%u.%u, on dev %s\n",
1716                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1717                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1718                         int i;
1719                         const unsigned char *p = skb_mac_header(skb);
1720                         printk(KERN_WARNING "ll header: ");
1721                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1722                                 printk("%02x", *p);
1723                                 if (i < (dev->hard_header_len - 1))
1724                                         printk(":");
1725                         }
1726                         printk("\n");
1727                 }
1728         }
1729 #endif
1730 }
1731
1732 static inline int __mkroute_input(struct sk_buff *skb,
1733                                   struct fib_result* res,
1734                                   struct in_device *in_dev,
1735                                   __be32 daddr, __be32 saddr, u32 tos,
1736                                   struct rtable **result)
1737 {
1738
1739         struct rtable *rth;
1740         int err;
1741         struct in_device *out_dev;
1742         unsigned flags = 0;
1743         __be32 spec_dst;
1744         u32 itag;
1745
1746         /* get a working reference to the output device */
1747         out_dev = in_dev_get(FIB_RES_DEV(*res));
1748         if (out_dev == NULL) {
1749                 if (net_ratelimit())
1750                         printk(KERN_CRIT "Bug in ip_route_input" \
1751                                "_slow(). Please, report\n");
1752                 return -EINVAL;
1753         }
1754
1755
1756         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1757                                   in_dev->dev, &spec_dst, &itag);
1758         if (err < 0) {
1759                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1760                                          saddr);
1761
1762                 err = -EINVAL;
1763                 goto cleanup;
1764         }
1765
1766         if (err)
1767                 flags |= RTCF_DIRECTSRC;
1768
1769         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1770             (IN_DEV_SHARED_MEDIA(out_dev) ||
1771              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1772                 flags |= RTCF_DOREDIRECT;
1773
1774         if (skb->protocol != htons(ETH_P_IP)) {
1775                 /* Not IP (i.e. ARP). Do not create route, if it is
1776                  * invalid for proxy arp. DNAT routes are always valid.
1777                  */
1778                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1779                         err = -EINVAL;
1780                         goto cleanup;
1781                 }
1782         }
1783
1784
1785         rth = dst_alloc(&ipv4_dst_ops);
1786         if (!rth) {
1787                 err = -ENOBUFS;
1788                 goto cleanup;
1789         }
1790
1791         atomic_set(&rth->u.dst.__refcnt, 1);
1792         rth->u.dst.flags= DST_HOST;
1793         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1794                 rth->u.dst.flags |= DST_NOPOLICY;
1795         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1796                 rth->u.dst.flags |= DST_NOXFRM;
1797         rth->fl.fl4_dst = daddr;
1798         rth->rt_dst     = daddr;
1799         rth->fl.fl4_tos = tos;
1800         rth->fl.mark    = skb->mark;
1801         rth->fl.fl4_src = saddr;
1802         rth->rt_src     = saddr;
1803         rth->rt_gateway = daddr;
1804         rth->rt_iif     =
1805                 rth->fl.iif     = in_dev->dev->ifindex;
1806         rth->u.dst.dev  = (out_dev)->dev;
1807         dev_hold(rth->u.dst.dev);
1808         rth->idev       = in_dev_get(rth->u.dst.dev);
1809         rth->fl.oif     = 0;
1810         rth->rt_spec_dst= spec_dst;
1811
1812         rth->u.dst.input = ip_forward;
1813         rth->u.dst.output = ip_output;
1814
1815         rt_set_nexthop(rth, res, itag);
1816
1817         rth->rt_flags = flags;
1818
1819         *result = rth;
1820         err = 0;
1821  cleanup:
1822         /* release the working reference to the output device */
1823         in_dev_put(out_dev);
1824         return err;
1825 }
1826
1827 static inline int ip_mkroute_input(struct sk_buff *skb,
1828                                    struct fib_result* res,
1829                                    const struct flowi *fl,
1830                                    struct in_device *in_dev,
1831                                    __be32 daddr, __be32 saddr, u32 tos)
1832 {
1833         struct rtable* rth = NULL;
1834         int err;
1835         unsigned hash;
1836
1837 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1838         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1839                 fib_select_multipath(fl, res);
1840 #endif
1841
1842         /* create a routing cache entry */
1843         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1844         if (err)
1845                 return err;
1846
1847         /* put it into the cache */
1848         hash = rt_hash(daddr, saddr, fl->iif);
1849         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1850 }
1851
1852 /*
1853  *      NOTE. We drop all packets that have local source
1854  *      addresses, because every properly looped-back packet
1855  *      must already have the correct destination attached by the output routine.
1856  *
1857  *      This approach solves two big problems:
1858  *      1. Non-simplex devices are handled properly.
1859  *      2. IP spoofing attempts are filtered with a 100% guarantee.
      *
      *      ip_route_input_slow() is the input-route slow path, called by
      *      ip_route_input() for non-multicast destinations after a cache
      *      miss.  It classifies the packet by address, consults
      *      fib_lookup(), and either passes forwarded routes to
      *      ip_mkroute_input() or builds a local/broadcast/unreachable
      *      entry itself and inserts it with rt_intern_hash().
      *
      *      Returns -EINVAL if the device has no in_device, for martian
      *      sources and for non-IP broadcasts; -EHOSTUNREACH for martian
      *      destinations or when forwarding is disabled on the device;
      *      -ENOBUFS if dst_alloc() fails; otherwise the result of
      *      ip_mkroute_input() or rt_intern_hash().
1860  */
1861
1862 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1863                                u8 tos, struct net_device *dev)
1864 {
1865         struct fib_result res;
1866         struct in_device *in_dev = in_dev_get(dev);
1867         struct flowi fl = { .nl_u = { .ip4_u =
1868                                       { .daddr = daddr,
1869                                         .saddr = saddr,
1870                                         .tos = tos,
1871                                         .scope = RT_SCOPE_UNIVERSE,
1872                                       } },
1873                             .mark = skb->mark,
1874                             .iif = dev->ifindex };
1875         unsigned        flags = 0;
1876         u32             itag = 0;
1877         struct rtable * rth;
1878         unsigned        hash;
1879         __be32          spec_dst;
1880         int             err = -EINVAL;
             /* set once fib_lookup() has filled res, so done: knows to release it */
1881         int             free_res = 0;
1882
1883         /* IP on this device is disabled. */
1884
1885         if (!in_dev)
1886                 goto out;
1887
1888         /* Check for the weirdest martians, which fib_lookup
1889            cannot detect.
1890          */
1891
1892         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1893                 goto martian_source;
1894
1895         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1896                 goto brd_input;
1897
1898         /* Accept zero addresses only for limited broadcast;
1899          * I do not even know whether to fix it or not. Waiting for complaints :-)
1900          */
1901         if (ZERONET(saddr))
1902                 goto martian_source;
1903
1904         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1905                 goto martian_destination;
1906
1907         /*
1908          *      Now we are ready to route the packet.
1909          */
1910         if ((err = fib_lookup(&fl, &res)) != 0) {
1911                 if (!IN_DEV_FORWARD(in_dev))
1912                         goto e_hostunreach;
1913                 goto no_route;
1914         }
1915         free_res = 1;
1916
1917         RT_CACHE_STAT_INC(in_slow_tot);
1918
1919         if (res.type == RTN_BROADCAST)
1920                 goto brd_input;
1921
1922         if (res.type == RTN_LOCAL) {
1923                 int result;
1924                 result = fib_validate_source(saddr, daddr, tos,
1925                                              init_net.loopback_dev->ifindex,
1926                                              dev, &spec_dst, &itag);
1927                 if (result < 0)
1928                         goto martian_source;
1929                 if (result)
1930                         flags |= RTCF_DIRECTSRC;
                     /* overrides the spec_dst written by fib_validate_source() */
1931                 spec_dst = daddr;
1932                 goto local_input;
1933         }
1934
             /* Everything below is forwarding, which must be enabled on the device. */
1935         if (!IN_DEV_FORWARD(in_dev))
1936                 goto e_hostunreach;
1937         if (res.type != RTN_UNICAST)
1938                 goto martian_destination;
1939
1940         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
     /* Common exit: drop the in_dev reference and release res if it was looked up. */
1941 done:
1942         in_dev_put(in_dev);
1943         if (free_res)
1944                 fib_res_put(&res);
1945 out:    return err;
1946
     /* Broadcast input: only ETH_P_IP packets get a route. */
1947 brd_input:
1948         if (skb->protocol != htons(ETH_P_IP))
1949                 goto e_inval;
1950
1951         if (ZERONET(saddr))
1952                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1953         else {
1954                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1955                                           &itag);
1956                 if (err < 0)
1957                         goto martian_source;
1958                 if (err)
1959                         flags |= RTCF_DIRECTSRC;
1960         }
1961         flags |= RTCF_BROADCAST;
1962         res.type = RTN_BROADCAST;
1963         RT_CACHE_STAT_INC(in_brd);
1964
     /*
      * Build a cache entry on the loopback device.  Shared by the local,
      * broadcast and unreachable (no_route) cases; res.type tells them apart.
      */
1965 local_input:
1966         rth = dst_alloc(&ipv4_dst_ops);
1967         if (!rth)
1968                 goto e_nobufs;
1969
1970         rth->u.dst.output= ip_rt_bug;
1971
1972         atomic_set(&rth->u.dst.__refcnt, 1);
1973         rth->u.dst.flags= DST_HOST;
1974         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1975                 rth->u.dst.flags |= DST_NOPOLICY;
1976         rth->fl.fl4_dst = daddr;
1977         rth->rt_dst     = daddr;
1978         rth->fl.fl4_tos = tos;
1979         rth->fl.mark    = skb->mark;
1980         rth->fl.fl4_src = saddr;
1981         rth->rt_src     = saddr;
1982 #ifdef CONFIG_NET_CLS_ROUTE
1983         rth->u.dst.tclassid = itag;
1984 #endif
1985         rth->rt_iif     =
1986         rth->fl.iif     = dev->ifindex;
1987         rth->u.dst.dev  = init_net.loopback_dev;
1988         dev_hold(rth->u.dst.dev);
1989         rth->idev       = in_dev_get(rth->u.dst.dev);
1990         rth->rt_gateway = daddr;
1991         rth->rt_spec_dst= spec_dst;
1992         rth->u.dst.input= ip_local_deliver;
1993         rth->rt_flags   = flags|RTCF_LOCAL;
             /* no_route case: the entry reports the lookup error via ip_error */
1994         if (res.type == RTN_UNREACHABLE) {
1995                 rth->u.dst.input= ip_error;
1996                 rth->u.dst.error= -err;
1997                 rth->rt_flags   &= ~RTCF_LOCAL;
1998         }
1999         rth->rt_type    = res.type;
2000         hash = rt_hash(daddr, saddr, fl.iif);
2001         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2002         goto done;
2003
     /*
      * fib_lookup() failed while forwarding is enabled: cache an
      * RTN_UNREACHABLE entry through local_input (free_res is still 0 here).
      */
2004 no_route:
2005         RT_CACHE_STAT_INC(in_no_route);
2006         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2007         res.type = RTN_UNREACHABLE;
2008         if (err == -ESRCH)
2009                 err = -ENETUNREACH;
2010         goto local_input;
2011
2012         /*
2013          *      Do not cache martian addresses: they should be logged (RFC1812)
2014          */
2015 martian_destination:
2016         RT_CACHE_STAT_INC(in_martian_dst);
2017 #ifdef CONFIG_IP_ROUTE_VERBOSE
2018         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2019                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2020                         "%u.%u.%u.%u, dev %s\n",
2021                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2022 #endif
2023
     /* martian_destination falls through to here */
2024 e_hostunreach:
2025         err = -EHOSTUNREACH;
2026         goto done;
2027
2028 e_inval:
2029         err = -EINVAL;
2030         goto done;
2031
2032 e_nobufs:
2033         err = -ENOBUFS;
2034         goto done;
2035
     /* Count (and optionally log) the martian source, then fail with -EINVAL. */
2036 martian_source:
2037         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2038         goto e_inval;
2039 }
2040
2041 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2042                    u8 tos, struct net_device *dev)
2043 {
2044         struct rtable * rth;
2045         unsigned        hash;
2046         int iif = dev->ifindex;
2047
2048         tos &= IPTOS_RT_MASK;
2049         hash = rt_hash(daddr, saddr, iif);
2050
2051         rcu_read_lock();
2052         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2053              rth = rcu_dereference(rth->u.dst.rt_next)) {
2054                 if (rth->fl.fl4_dst == daddr &&
2055                     rth->fl.fl4_src == saddr &&
2056                     rth->fl.iif == iif &&
2057                     rth->fl.oif == 0 &&
2058                     rth->fl.mark == skb->mark &&
2059                     rth->fl.fl4_tos == tos) {
2060                         dst_use(&rth->u.dst, jiffies);
2061                         RT_CACHE_STAT_INC(in_hit);
2062                         rcu_read_unlock();
2063                         skb->dst = (struct dst_entry*)rth;
2064                         return 0;
2065                 }
2066                 RT_CACHE_STAT_INC(in_hlist_search);
2067         }
2068         rcu_read_unlock();
2069
2070         /* Multicast recognition logic is moved from route cache to here.
2071            The problem was that too many Ethernet cards have broken/missing
2072            hardware multicast filters :-( As result the host on multicasting
2073            network acquires a lot of useless route cache entries, sort of
2074            SDR messages from all the world. Now we try to get rid of them.
2075            Really, provided software IP multicast filter is organized
2076            reasonably (at least, hashed), it does not result in a slowdown
2077            comparing with route cache reject entries.
2078            Note, that multicast routers are not affected, because
2079            route cache entry is created eventually.
2080          */
2081         if (MULTICAST(daddr)) {
2082                 struct in_device *in_dev;
2083
2084                 rcu_read_lock();
2085                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2086                         int our = ip_check_mc(in_dev, daddr, saddr,
2087                                 ip_hdr(skb)->protocol);
2088                         if (our
2089 #ifdef CONFIG_IP_MROUTE
2090                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2091 #endif
2092                             ) {
2093                                 rcu_read_unlock();
2094                                 return ip_route_input_mc(skb, daddr, saddr,
2095                                                          tos, dev, our);
2096                         }
2097                 }
2098                 rcu_read_unlock();
2099                 return -EINVAL;
2100         }
2101         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2102 }
2103
2104 static inline int __mkroute_output(struct rtable **result,
2105                                    struct fib_result* res,
2106                                    const struct flowi *fl,
2107                                    const struct flowi *oldflp,
2108                                    struct net_device *dev_out,
2109                                    unsigned flags)
2110 {
2111         struct rtable *rth;
2112         struct in_device *in_dev;
2113         u32 tos = RT_FL_TOS(oldflp);
2114         int err = 0;
2115
2116         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2117                 return -EINVAL;
2118
2119         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2120                 res->type = RTN_BROADCAST;
2121         else if (MULTICAST(fl->fl4_dst))
2122                 res->type = RTN_MULTICAST;
2123         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2124                 return -EINVAL;
2125
2126         if (dev_out->flags & IFF_LOOPBACK)
2127                 flags |= RTCF_LOCAL;
2128
2129         /* get work reference to inet device */
2130         in_dev = in_dev_get(dev_out);
2131         if (!in_dev)
2132                 return -EINVAL;
2133
2134         if (res->type == RTN_BROADCAST) {
2135                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2136                 if (res->fi) {
2137                         fib_info_put(res->fi);
2138                         res->fi = NULL;
2139                 }
2140         } else if (res->type == RTN_MULTICAST) {
2141                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2142                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2143                                  oldflp->proto))
2144                         flags &= ~RTCF_LOCAL;
2145                 /* If multicast route do not exist use
2146                    default one, but do not gateway in this case.
2147                    Yes, it is hack.
2148                  */
2149                 if (res->fi && res->prefixlen < 4) {
2150                         fib_info_put(res->fi);
2151                         res->fi = NULL;
2152                 }
2153         }
2154
2155
2156         rth = dst_alloc(&ipv4_dst_ops);
2157         if (!rth) {
2158                 err = -ENOBUFS;
2159                 goto cleanup;
2160         }
2161
2162         atomic_set(&rth->u.dst.__refcnt, 1);
2163         rth->u.dst.flags= DST_HOST;
2164         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2165                 rth->u.dst.flags |= DST_NOXFRM;
2166         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2167                 rth->u.dst.flags |= DST_NOPOLICY;
2168
2169         rth->fl.fl4_dst = oldflp->fl4_dst;
2170         rth->fl.fl4_tos = tos;
2171         rth->fl.fl4_src = oldflp->fl4_src;
2172         rth->fl.oif     = oldflp->oif;
2173         rth->fl.mark    = oldflp->mark;
2174         rth->rt_dst     = fl->fl4_dst;
2175         rth->rt_src     = fl->fl4_src;
2176         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2177         /* get references to the devices that are to be hold by the routing
2178            cache entry */
2179         rth->u.dst.dev  = dev_out;
2180         dev_hold(dev_out);
2181         rth->idev       = in_dev_get(dev_out);
2182         rth->rt_gateway = fl->fl4_dst;
2183         rth->rt_spec_dst= fl->fl4_src;
2184
2185         rth->u.dst.output=ip_output;
2186
2187         RT_CACHE_STAT_INC(out_slow_tot);
2188
2189         if (flags & RTCF_LOCAL) {
2190                 rth->u.dst.input = ip_local_deliver;
2191                 rth->rt_spec_dst = fl->fl4_dst;
2192         }
2193         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2194                 rth->rt_spec_dst = fl->fl4_src;
2195                 if (flags & RTCF_LOCAL &&
2196                     !(dev_out->flags & IFF_LOOPBACK)) {
2197                         rth->u.dst.output = ip_mc_output;
2198                         RT_CACHE_STAT_INC(out_slow_mc);
2199                 }
2200 #ifdef CONFIG_IP_MROUTE
2201                 if (res->type == RTN_MULTICAST) {
2202                         if (IN_DEV_MFORWARD(in_dev) &&
2203                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2204                                 rth->u.dst.input = ip_mr_input;
2205                                 rth->u.dst.output = ip_mc_output;
2206                         }
2207                 }
2208 #endif
2209         }
2210
2211         rt_set_nexthop(rth, res, 0);
2212
2213         rth->rt_flags = flags;
2214
2215         *result = rth;
2216  cleanup:
2217         /* release work reference to inet device */
2218         in_dev_put(in_dev);
2219
2220         return err;
2221 }
2222
2223 static inline int ip_mkroute_output(struct rtable **rp,
2224                                     struct fib_result* res,
2225                                     const struct flowi *fl,
2226                                     const struct flowi *oldflp,
2227                                     struct net_device *dev_out,
2228                                     unsigned flags)
2229 {
2230         struct rtable *rth = NULL;
2231         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2232         unsigned hash;
2233         if (err == 0) {
2234                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2235                 err = rt_intern_hash(hash, rth, rp);
2236         }
2237
2238         return err;
2239 }
2240
2241 /*
2242  * Major route resolver routine.
      *
      * Slow path for output routes, called by __ip_route_output_key() after a
      * cache miss.  Builds a working flow key (fl) from the caller's key
      * (oldflp), chooses the output device and source address, and finishes
      * in ip_mkroute_output(), which is given rp for the new entry.
      *
      * Returns -EINVAL if the requested source address is multicast,
      * BADCLASS or ZERONET, or ip_dev_find() finds no device for it;
      * -ENODEV if oldflp->oif names no device or that device has no
      * in_device; -ENETUNREACH if fib_lookup() fails and no oif was given;
      * otherwise the result of ip_mkroute_output().
2243  */
2244
2245 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2246 {
2247         u32 tos = RT_FL_TOS(oldflp);
2248         struct flowi fl = { .nl_u = { .ip4_u =
2249                                       { .daddr = oldflp->fl4_dst,
2250                                         .saddr = oldflp->fl4_src,
2251                                         .tos = tos & IPTOS_RT_MASK,
2252                                         .scope = ((tos & RTO_ONLINK) ?
2253                                                   RT_SCOPE_LINK :
2254                                                   RT_SCOPE_UNIVERSE),
2255                                       } },
2256                             .mark = oldflp->mark,
2257                             .iif = init_net.loopback_dev->ifindex,
2258                             .oif = oldflp->oif };
2259         struct fib_result res;
2260         unsigned flags = 0;
2261         struct net_device *dev_out = NULL;
             /* set once fib_lookup() has filled res, so the exit path releases it */
2262         int free_res = 0;
2263         int err;
2264
2265
2266         res.fi          = NULL;
2267 #ifdef CONFIG_IP_MULTIPLE_TABLES
2268         res.r           = NULL;
2269 #endif
2270
             /* Step 1: the caller asked for a specific source address. */
2271         if (oldflp->fl4_src) {
2272                 err = -EINVAL;
2273                 if (MULTICAST(oldflp->fl4_src) ||
2274                     BADCLASS(oldflp->fl4_src) ||
2275                     ZERONET(oldflp->fl4_src))
2276                         goto out;
2277
2278                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2279                 dev_out = ip_dev_find(oldflp->fl4_src);
2280                 if (dev_out == NULL)
2281                         goto out;
2282
2283                 /* I removed check for oif == dev_out->oif here.
2284                    It was wrong for two reasons:
2285                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2286                       assigned to multiple interfaces.
2287                    2. Moreover, we are allowed to send packets with saddr
2288                       of another iface. --ANK
2289                  */
2290
2291                 if (oldflp->oif == 0
2292                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2293                         /* Special hack: user can direct multicasts
2294                            and limited broadcast via necessary interface
2295                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2296                            This hack is not just for fun, it allows
2297                            vic,vat and friends to work.
2298                            They bind socket to loopback, set ttl to zero
2299                            and expect that it will work.
2300                            From the viewpoint of routing cache they are broken,
2301                            because we are not allowed to build multicast path
2302                            with loopback source addr (look, routing cache
2303                            cannot know, that ttl is zero, so that packet
2304                            will not leave this host and route is valid).
2305                            Luckily, this hack is good workaround.
2306                          */
2307
2308                         fl.oif = dev_out->ifindex;
2309                         goto make_route;
2310                 }
                     /* the device found for the source address is not kept past this point */
2311                 if (dev_out)
2312                         dev_put(dev_out);
2313                 dev_out = NULL;
2314         }
2315
2316
             /* Step 2: the caller asked for a specific output interface. */
2317         if (oldflp->oif) {
2318                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2319                 err = -ENODEV;
2320                 if (dev_out == NULL)
2321                         goto out;
2322
2323                 /* RACE: Check return value of inet_select_addr instead. */
2324                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2325                         dev_put(dev_out);
2326                         goto out;       /* Wrong error code */
2327                 }
2328
2329                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2330                         if (!fl.fl4_src)
2331                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2332                                                               RT_SCOPE_LINK);
2333                         goto make_route;
2334                 }
2335                 if (!fl.fl4_src) {
2336                         if (MULTICAST(oldflp->fl4_dst))
2337                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2338                                                               fl.fl4_scope);
2339                         else if (!oldflp->fl4_dst)
2340                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2341                                                               RT_SCOPE_HOST);
2342                 }
2343         }
2344
             /*
              * Step 3: no destination given.  Route to the source address (or
              * INADDR_LOOPBACK if that is unset too) through the loopback device.
              */
2345         if (!fl.fl4_dst) {
2346                 fl.fl4_dst = fl.fl4_src;
2347                 if (!fl.fl4_dst)
2348                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2349                 if (dev_out)
2350                         dev_put(dev_out);
2351                 dev_out = init_net.loopback_dev;
2352                 dev_hold(dev_out);
2353                 fl.oif = init_net.loopback_dev->ifindex;
2354                 res.type = RTN_LOCAL;
2355                 flags |= RTCF_LOCAL;
2356                 goto make_route;
2357         }
2358
             /* Step 4: consult the FIB. */
2359         if (fib_lookup(&fl, &res)) {
2360                 res.fi = NULL;
2361                 if (oldflp->oif) {
2362                         /* Apparently, routing tables are wrong. Assume,
2363                            that the destination is on link.
2364
2365                            WHY? DW.
2366                            Because we are allowed to send to iface
2367                            even if it has NO routes and NO assigned
2368                            addresses. When oif is specified, routing
2369                            tables are looked up with only one purpose:
2370                            to catch if destination is gatewayed, rather than
2371                            direct. Moreover, if MSG_DONTROUTE is set,
2372                            we send packet, ignoring both routing tables
2373                            and ifaddr state. --ANK
2374
2375
2376                            We could make it even if oif is unknown,
2377                            likely IPv6, but we do not.
2378                          */
2379
2380                         if (fl.fl4_src == 0)
2381                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2382                                                               RT_SCOPE_LINK);
2383                         res.type = RTN_UNICAST;
2384                         goto make_route;
2385                 }
2386                 if (dev_out)
2387                         dev_put(dev_out);
2388                 err = -ENETUNREACH;
2389                 goto out;
2390         }
2391         free_res = 1;
2392
             /* Local destination: switch to the loopback device and drop the fib_info. */
2393         if (res.type == RTN_LOCAL) {
2394                 if (!fl.fl4_src)
2395                         fl.fl4_src = fl.fl4_dst;
2396                 if (dev_out)
2397                         dev_put(dev_out);
2398                 dev_out = init_net.loopback_dev;
2399                 dev_hold(dev_out);
2400                 fl.oif = dev_out->ifindex;
2401                 if (res.fi)
2402                         fib_info_put(res.fi);
2403                 res.fi = NULL;
2404                 flags |= RTCF_LOCAL;
2405                 goto make_route;
2406         }
2407
             /*
              * With CONFIG_IP_ROUTE_MULTIPATH the "else" below binds to the
              * following if, so fib_select_default() is tried only when
              * multipath selection was not.
              */
2408 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2409         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2410                 fib_select_multipath(&fl, &res);
2411         else
2412 #endif
2413         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2414                 fib_select_default(&fl, &res);
2415
2416         if (!fl.fl4_src)
2417                 fl.fl4_src = FIB_RES_PREFSRC(res);
2418
             /* Replace any earlier device with the one from the FIB result. */
2419         if (dev_out)
2420                 dev_put(dev_out);
2421         dev_out = FIB_RES_DEV(res);
2422         dev_hold(dev_out);
2423         fl.oif = dev_out->ifindex;
2424
2425
     /*
      * Build and cache the route, then release res (if it was looked up) and
      * the dev_out reference.
      */
2426 make_route:
2427         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2428
2429
2430         if (free_res)
2431                 fib_res_put(&res);
2432         if (dev_out)
2433                 dev_put(dev_out);
2434 out:    return err;
2435 }
2436
2437 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2438 {
2439         unsigned hash;
2440         struct rtable *rth;
2441
2442         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2443
2444         rcu_read_lock_bh();
2445         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2446                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2447                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2448                     rth->fl.fl4_src == flp->fl4_src &&
2449                     rth->fl.iif == 0 &&
2450                     rth->fl.oif == flp->oif &&
2451                     rth->fl.mark == flp->mark &&
2452                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2453                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2454                         dst_use(&rth->u.dst, jiffies);
2455                         RT_CACHE_STAT_INC(out_hit);
2456                         rcu_read_unlock_bh();
2457                         *rp = rth;
2458                         return 0;
2459                 }
2460                 RT_CACHE_STAT_INC(out_hlist_search);
2461         }
2462         rcu_read_unlock_bh();
2463
2464         return ip_route_output_slow(rp, flp);
2465 }
2466
2467 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2468
/*
 * .update_pmtu hook of ipv4_dst_blackhole_ops.  The body is empty, so both
 * @dst and @mtu are ignored and the entry is left untouched.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
{
}
2472
2473 static struct dst_ops ipv4_dst_blackhole_ops = {
2474         .family                 =       AF_INET,
2475         .protocol               =       __constant_htons(ETH_P_IP),
2476         .destroy                =       ipv4_dst_destroy,
2477         .check                  =       ipv4_dst_check,
2478         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2479         .entry_size             =       sizeof(struct rtable),
2480 };
2481
2482
2483 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2484 {
2485         struct rtable *ort = *rp;
2486         struct rtable *rt = (struct rtable *)
2487                 dst_alloc(&ipv4_dst_blackhole_ops);
2488
2489         if (rt) {
2490                 struct dst_entry *new = &rt->u.dst;
2491
2492                 atomic_set(&new->__refcnt, 1);
2493                 new->__use = 1;
2494                 new->input = dst_discard;
2495                 new->output = dst_discard;
2496                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2497
2498                 new->dev = ort->u.dst.dev;
2499                 if (new->dev)
2500                         dev_hold(new->dev);
2501
2502                 rt->fl = ort->fl;
2503
2504                 rt->idev = ort->idev;
2505                 if (rt->idev)
2506                         in_dev_hold(rt->idev);
2507                 rt->rt_flags = ort->rt_flags;
2508                 rt->rt_type = ort->rt_type;
2509                 rt->rt_dst = ort->rt_dst;
2510                 rt->rt_src = ort->rt_src;
2511                 rt->rt_iif = ort->rt_iif;
2512                 rt->rt_gateway = ort->rt_gateway;
2513                 rt->rt_spec_dst = ort->rt_spec_dst;
2514                 rt->peer = ort->peer;
2515                 if (rt->peer)
2516                         atomic_inc(&rt->peer->refcnt);
2517
2518                 dst_free(new);
2519         }
2520
2521         dst_release(&(*rp)->u.dst);
2522         *rp = rt;
2523         return (rt ? 0 : -ENOMEM);
2524 }
2525
2526 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2527 {
2528         int err;
2529
2530         if ((err = __ip_route_output_key(rp, flp)) != 0)
2531                 return err;
2532
2533         if (flp->proto) {
2534                 if (!flp->fl4_src)
2535                         flp->fl4_src = (*rp)->rt_src;
2536                 if (!flp->fl4_dst)
2537                         flp->fl4_dst = (*rp)->rt_dst;
2538                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2539                 if (err == -EREMOTE)
2540                         err = ipv4_dst_blackhole(rp, flp, sk);
2541
2542                 return err;
2543         }
2544
2545         return 0;
2546 }
2547
2548 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2549
2550 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2551 {
2552         return ip_route_output_flow(rp, flp, NULL, 0);
2553 }
2554
2555 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2556                         int nowait, unsigned int flags)
2557 {
2558         struct rtable *rt = (struct rtable*)skb->dst;
2559         struct rtmsg *r;
2560         struct nlmsghdr *nlh;
2561         long expires;
2562         u32 id = 0, ts = 0, tsage = 0, error;
2563
2564         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2565         if (nlh == NULL)
2566                 return -EMSGSIZE;
2567
2568         r = nlmsg_data(nlh);
2569         r->rtm_family    = AF_INET;
2570         r->rtm_dst_len  = 32;
2571         r->rtm_src_len  = 0;
2572         r->rtm_tos      = rt->fl.fl4_tos;
2573         r->rtm_table    = RT_TABLE_MAIN;
2574         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2575         r->rtm_type     = rt->rt_type;
2576         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2577         r->rtm_protocol = RTPROT_UNSPEC;
2578         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2579         if (rt->rt_flags & RTCF_NOTIFY)
2580                 r->rtm_flags |= RTM_F_NOTIFY;
2581
2582         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2583
2584         if (rt->fl.fl4_src) {
2585                 r->rtm_src_len = 32;
2586                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2587         }
2588         if (rt->u.dst.dev)
2589                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2590 #ifdef CONFIG_NET_CLS_ROUTE
2591         if (rt->u.dst.tclassid)
2592                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2593 #endif
2594         if (rt->fl.iif)
2595                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2596         else if (rt->rt_src != rt->fl.fl4_src)
2597                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2598
2599         if (rt->rt_dst != rt->rt_gateway)
2600                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2601
2602         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2603                 goto nla_put_failure;
2604
2605         error = rt->u.dst.error;
2606         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2607         if (rt->peer) {
2608                 id = rt->peer->ip_id_count;
2609                 if (rt->peer->tcp_ts_stamp) {
2610                         ts = rt->peer->tcp_ts;
2611                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2612                 }
2613         }
2614
2615         if (rt->fl.iif) {
2616 #ifdef CONFIG_IP_MROUTE
2617                 __be32 dst = rt->rt_dst;
2618
2619                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2620                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2621                         int err = ipmr_get_route(skb, r, nowait);
2622                         if (err <= 0) {
2623                                 if (!nowait) {
2624                                         if (err == 0)
2625                                                 return 0;
2626                                         goto nla_put_failure;
2627                                 } else {
2628                                         if (err == -EMSGSIZE)
2629                                                 goto nla_put_failure;
2630                                         error = err;
2631                                 }
2632                         }
2633                 } else
2634 #endif
2635                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2636         }
2637
2638         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2639                                expires, error) < 0)
2640                 goto nla_put_failure;
2641
2642         return nlmsg_end(skb, nlh);
2643
2644 nla_put_failure:
2645         nlmsg_cancel(skb, nlh);
2646         return -EMSGSIZE;
2647 }
2648
2649 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2650 {
2651         struct net *net = in_skb->sk->sk_net;
2652         struct rtmsg *rtm;
2653         struct nlattr *tb[RTA_MAX+1];
2654         struct rtable *rt = NULL;
2655         __be32 dst = 0;
2656         __be32 src = 0;
2657         u32 iif;
2658         int err;
2659         struct sk_buff *skb;
2660
2661         if (net != &init_net)
2662                 return -EINVAL;
2663
2664         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2665         if (err < 0)
2666                 goto errout;
2667
2668         rtm = nlmsg_data(nlh);
2669
2670         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2671         if (skb == NULL) {
2672                 err = -ENOBUFS;
2673                 goto errout;
2674         }
2675
2676         /* Reserve room for dummy headers, this skb can pass
2677            through good chunk of routing engine.
2678          */
2679         skb_reset_mac_header(skb);
2680         skb_reset_network_header(skb);
2681
2682         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2683         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2684         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2685
2686         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2687         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2688         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2689
2690         if (iif) {
2691                 struct net_device *dev;
2692
2693                 dev = __dev_get_by_index(&init_net, iif);
2694                 if (dev == NULL) {
2695                         err = -ENODEV;
2696                         goto errout_free;
2697                 }
2698
2699                 skb->protocol   = htons(ETH_P_IP);
2700                 skb->dev        = dev;
2701                 local_bh_disable();
2702                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2703                 local_bh_enable();
2704
2705                 rt = (struct rtable*) skb->dst;
2706                 if (err == 0 && rt->u.dst.error)
2707                         err = -rt->u.dst.error;
2708         } else {
2709                 struct flowi fl = {
2710                         .nl_u = {
2711                                 .ip4_u = {
2712                                         .daddr = dst,
2713                                         .saddr = src,
2714                                         .tos = rtm->rtm_tos,
2715                                 },
2716                         },
2717                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2718                 };
2719                 err = ip_route_output_key(&rt, &fl);
2720         }
2721
2722         if (err)
2723                 goto errout_free;
2724
2725         skb->dst = &rt->u.dst;
2726         if (rtm->rtm_flags & RTM_F_NOTIFY)
2727                 rt->rt_flags |= RTCF_NOTIFY;
2728
2729         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2730                                 RTM_NEWROUTE, 0, 0);
2731         if (err <= 0)
2732                 goto errout_free;
2733
2734         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2735 errout:
2736         return err;
2737
2738 errout_free:
2739         kfree_skb(skb);
2740         goto errout;
2741 }
2742
2743 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2744 {
2745         struct rtable *rt;
2746         int h, s_h;
2747         int idx, s_idx;
2748
2749         s_h = cb->args[0];
2750         if (s_h < 0)
2751                 s_h = 0;
2752         s_idx = idx = cb->args[1];
2753         for (h = s_h; h <= rt_hash_mask; h++) {
2754                 rcu_read_lock_bh();
2755                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2756                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2757                         if (idx < s_idx)
2758                                 continue;
2759                         skb->dst = dst_clone(&rt->u.dst);
2760                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2761                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2762                                          1, NLM_F_MULTI) <= 0) {
2763                                 dst_release(xchg(&skb->dst, NULL));
2764                                 rcu_read_unlock_bh();
2765                                 goto done;
2766                         }
2767                         dst_release(xchg(&skb->dst, NULL));
2768                 }
2769                 rcu_read_unlock_bh();
2770                 s_idx = 0;
2771         }
2772
2773 done:
2774         cb->args[0] = h;
2775         cb->args[1] = idx;
2776         return skb->len;
2777 }
2778
/*
 * Multicast event hook: requests a route cache flush with a delay argument
 * of 0.  @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
        const int no_delay = 0;

        rt_cache_flush(no_delay);
}
2783
2784 #ifdef CONFIG_SYSCTL
2785 static int flush_delay;
2786
2787 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2788                                         struct file *filp, void __user *buffer,
2789                                         size_t *lenp, loff_t *ppos)
2790 {
2791         if (write) {
2792                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2793                 rt_cache_flush(flush_delay);
2794                 return 0;
2795         }
2796
2797         return -EINVAL;
2798 }
2799
2800 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2801                                                 int __user *name,
2802                                                 int nlen,
2803                                                 void __user *oldval,
2804                                                 size_t __user *oldlenp,
2805                                                 void __user *newval,
2806                                                 size_t newlen)
2807 {
2808         int delay;
2809         if (newlen != sizeof(int))
2810                 return -EINVAL;
2811         if (get_user(delay, (int __user *)newval))
2812                 return -EFAULT;
2813         rt_cache_flush(delay);
2814         return 0;
2815 }
2816
2817 ctl_table ipv4_route_table[] = {
2818         {
2819                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2820                 .procname       = "flush",
2821                 .data           = &flush_delay,
2822                 .maxlen         = sizeof(int),
2823                 .mode           = 0200,
2824                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2825                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2826         },
2827         {
2828                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2829                 .procname       = "min_delay",
2830                 .data           = &ip_rt_min_delay,
2831                 .maxlen         = sizeof(int),
2832                 .mode           = 0644,
2833                 .proc_handler   = &proc_dointvec_jiffies,
2834                 .strategy       = &sysctl_jiffies,
2835         },
2836         {
2837                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2838                 .procname       = "max_delay",
2839                 .data           = &ip_rt_max_delay,
2840                 .maxlen         = sizeof(int),
2841                 .mode           = 0644,
2842                 .proc_handler   = &proc_dointvec_jiffies,
2843                 .strategy       = &sysctl_jiffies,
2844         },
2845         {
2846                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2847                 .procname       = "gc_thresh",
2848                 .data           = &ipv4_dst_ops.gc_thresh,
2849                 .maxlen         = sizeof(int),
2850                 .mode           = 0644,
2851                 .proc_handler   = &proc_dointvec,
2852         },
2853         {
2854                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2855                 .procname       = "max_size",
2856                 .data           = &ip_rt_max_size,
2857                 .maxlen         = sizeof(int),
2858                 .mode           = 0644,
2859                 .proc_handler   = &proc_dointvec,
2860         },
2861         {
2862                 /*  Deprecated. Use gc_min_interval_ms */
2863
2864                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2865                 .procname       = "gc_min_interval",
2866                 .data           = &ip_rt_gc_min_interval,
2867                 .maxlen         = sizeof(int),
2868                 .mode           = 0644,
2869                 .proc_handler   = &proc_dointvec_jiffies,
2870                 .strategy       = &sysctl_jiffies,
2871         },
2872         {
2873                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2874                 .procname       = "gc_min_interval_ms",
2875                 .data           = &ip_rt_gc_min_interval,
2876                 .maxlen         = sizeof(int),
2877                 .mode           = 0644,
2878                 .proc_handler   = &proc_dointvec_ms_jiffies,
2879                 .strategy       = &sysctl_ms_jiffies,
2880         },
2881         {
2882                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2883                 .procname       = "gc_timeout",
2884                 .data           = &ip_rt_gc_timeout,
2885                 .maxlen         = sizeof(int),
2886                 .mode           = 0644,
2887                 .proc_handler   = &proc_dointvec_jiffies,
2888                 .strategy       = &sysctl_jiffies,
2889         },
2890         {
2891                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2892                 .procname       = "gc_interval",
2893                 .data           = &ip_rt_gc_interval,
2894                 .maxlen         = sizeof(int),
2895                 .mode           = 0644,
2896                 .proc_handler   = &proc_dointvec_jiffies,
2897                 .strategy       = &sysctl_jiffies,
2898         },
2899         {
2900                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2901                 .procname       = "redirect_load",
2902                 .data           = &ip_rt_redirect_load,
2903                 .maxlen         = sizeof(int),
2904                 .mode           = 0644,
2905                 .proc_handler   = &proc_dointvec,
2906         },
2907         {
2908                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2909                 .procname       = "redirect_number",
2910                 .data           = &ip_rt_redirect_number,
2911                 .maxlen         = sizeof(int),
2912                 .mode           = 0644,
2913                 .proc_handler   = &proc_dointvec,
2914         },
2915         {
2916                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2917                 .procname       = "redirect_silence",
2918                 .data           = &ip_rt_redirect_silence,
2919                 .maxlen         = sizeof(int),
2920                 .mode           = 0644,
2921                 .proc_handler   = &proc_dointvec,
2922         },
2923         {
2924                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2925                 .procname       = "error_cost",
2926                 .data           = &ip_rt_error_cost,
2927                 .maxlen         = sizeof(int),
2928                 .mode           = 0644,
2929                 .proc_handler   = &proc_dointvec,
2930         },
2931         {
2932                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2933                 .procname       = "error_burst",
2934                 .data           = &ip_rt_error_burst,
2935                 .maxlen         = sizeof(int),
2936                 .mode           = 0644,
2937                 .proc_handler   = &proc_dointvec,
2938         },
2939         {
2940                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2941                 .procname       = "gc_elasticity",
2942                 .data           = &ip_rt_gc_elasticity,
2943                 .maxlen         = sizeof(int),
2944                 .mode           = 0644,
2945                 .proc_handler   = &proc_dointvec,
2946         },
2947         {
2948                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2949                 .procname       = "mtu_expires",
2950                 .data           = &ip_rt_mtu_expires,
2951                 .maxlen         = sizeof(int),
2952                 .mode           = 0644,
2953                 .proc_handler   = &proc_dointvec_jiffies,
2954                 .strategy       = &sysctl_jiffies,
2955         },
2956         {
2957                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2958                 .procname       = "min_pmtu",
2959                 .data           = &ip_rt_min_pmtu,
2960                 .maxlen         = sizeof(int),
2961                 .mode           = 0644,
2962                 .proc_handler   = &proc_dointvec,
2963         },
2964         {
2965                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2966                 .procname       = "min_adv_mss",
2967                 .data           = &ip_rt_min_advmss,
2968                 .maxlen         = sizeof(int),
2969                 .mode           = 0644,
2970                 .proc_handler   = &proc_dointvec,
2971         },
2972         {
2973                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2974                 .procname       = "secret_interval",
2975                 .data           = &ip_rt_secret_interval,
2976                 .maxlen         = sizeof(int),
2977                 .mode           = 0644,
2978                 .proc_handler   = &proc_dointvec_jiffies,
2979                 .strategy       = &sysctl_jiffies,
2980         },
2981         { .ctl_name = 0 }
2982 };
2983 #endif
2984
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Accounting area, only built with CONFIG_NET_CLS_ROUTE.  ip_rt_init()
 * allocates it with __alloc_percpu() sized for 256 struct ip_rt_acct
 * entries and panics if that allocation fails.
 */
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */
2988
2989 static __initdata unsigned long rhash_entries;
2990 static int __init set_rhash_entries(char *str)
2991 {
2992         if (!str)
2993                 return 0;
2994         rhash_entries = simple_strtoul(str, &str, 0);
2995         return 1;
2996 }
2997 __setup("rhash_entries=", set_rhash_entries);
2998
2999 int __init ip_rt_init(void)
3000 {
3001         int rc = 0;
3002
3003         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3004                              (jiffies ^ (jiffies >> 7)));
3005
3006 #ifdef CONFIG_NET_CLS_ROUTE
3007         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3008         if (!ip_rt_acct)
3009                 panic("IP: failed to allocate ip_rt_acct\n");
3010 #endif
3011
3012         ipv4_dst_ops.kmem_cachep =
3013                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3014                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3015
3016         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3017
3018         rt_hash_table = (struct rt_hash_bucket *)
3019                 alloc_large_system_hash("IP route cache",
3020                                         sizeof(struct rt_hash_bucket),
3021                                         rhash_entries,
3022                                         (num_physpages >= 128 * 1024) ?
3023                                         15 : 17,
3024                                         0,
3025                                         &rt_hash_log,
3026                                         &rt_hash_mask,
3027                                         0);
3028         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3029         rt_hash_lock_init();
3030
3031         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3032         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3033
3034         devinet_init();
3035         ip_fib_init();
3036
3037         setup_timer(&rt_flush_timer, rt_run_flush, 0);
3038         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3039
3040         /* All the timers, started at system startup tend
3041            to synchronize. Perturb it a bit.
3042          */
3043         schedule_delayed_work(&expires_work,
3044                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3045
3046         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3047                 ip_rt_secret_interval;
3048         add_timer(&rt_secret_timer);
3049
3050         if (ip_rt_proc_init(&init_net))
3051                 printk(KERN_ERR "Unable to create route proc files\n");
3052 #ifdef CONFIG_XFRM
3053         xfrm_init();
3054         xfrm4_init();
3055 #endif
3056         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3057
3058         return rc;
3059 }
3060
3061 EXPORT_SYMBOL(__ip_select_ident);
3062 EXPORT_SYMBOL(ip_route_input);
3063 EXPORT_SYMBOL(ip_route_output_key);