[NET]: should explicitly initialize atomic_t field in struct dst_ops
[safe/jmp/linux-2.6] / net / ipv4 / route.c
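The fix this listing accompanies is visible in the ipv4_dst_ops initializer below: the atomic_t "entries" field of struct dst_ops is initialized explicitly with ATOMIC_INIT(0) rather than being left to implicit zero-initialization. A minimal sketch of the pattern, using an illustrative struct and identifiers (demo_ops, demo) that are not part of the kernel:

    #include <asm/atomic.h>          /* atomic_t and ATOMIC_INIT(), 2.6-era location */

    struct demo_ops {
            int      family;
            atomic_t entries;        /* counter, analogous to dst_ops.entries */
    };

    static struct demo_ops demo = {
            .family  = 2,                    /* AF_INET */
            .entries = ATOMIC_INIT(0),       /* explicit, as in ipv4_dst_ops below */
    };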
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD;
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_min_delay              = 2 * HZ;
121 static int ip_rt_max_delay              = 10 * HZ;
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval            = 60 * HZ;
125 static int ip_rt_gc_min_interval        = HZ / 2;
126 static int ip_rt_redirect_number        = 9;
127 static int ip_rt_redirect_load          = HZ / 50;
128 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost             = HZ;
130 static int ip_rt_error_burst            = 5 * HZ;
131 static int ip_rt_gc_elasticity          = 8;
132 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu               = 512 + 20 + 20;
134 static int ip_rt_min_advmss             = 256;
135 static int ip_rt_secret_interval        = 10 * 60 * HZ;
136 static int ip_rt_flush_expected;
137 static unsigned long rt_deadline;
138
139 #define RTprint(a...)   printk(KERN_DEBUG a)
140
141 static struct timer_list rt_flush_timer;
142 static void rt_worker_func(struct work_struct *work);
143 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
144 static struct timer_list rt_secret_timer;
145
146 /*
147  *      Interface to generic destination cache.
148  */
149
150 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151 static void              ipv4_dst_destroy(struct dst_entry *dst);
152 static void              ipv4_dst_ifdown(struct dst_entry *dst,
153                                          struct net_device *dev, int how);
154 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155 static void              ipv4_link_failure(struct sk_buff *skb);
156 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157 static int rt_garbage_collect(struct dst_ops *ops);
158
159
160 static struct dst_ops ipv4_dst_ops = {
161         .family =               AF_INET,
162         .protocol =             __constant_htons(ETH_P_IP),
163         .gc =                   rt_garbage_collect,
164         .check =                ipv4_dst_check,
165         .destroy =              ipv4_dst_destroy,
166         .ifdown =               ipv4_dst_ifdown,
167         .negative_advice =      ipv4_negative_advice,
168         .link_failure =         ipv4_link_failure,
169         .update_pmtu =          ip_rt_update_pmtu,
170         .local_out =            ip_local_out,
171         .entry_size =           sizeof(struct rtable),
172         .entries =              ATOMIC_INIT(0),
173 };
174
175 #define ECN_OR_COST(class)      TC_PRIO_##class
176
177 const __u8 ip_tos2prio[16] = {
178         TC_PRIO_BESTEFFORT,
179         ECN_OR_COST(FILLER),
180         TC_PRIO_BESTEFFORT,
181         ECN_OR_COST(BESTEFFORT),
182         TC_PRIO_BULK,
183         ECN_OR_COST(BULK),
184         TC_PRIO_BULK,
185         ECN_OR_COST(BULK),
186         TC_PRIO_INTERACTIVE,
187         ECN_OR_COST(INTERACTIVE),
188         TC_PRIO_INTERACTIVE,
189         ECN_OR_COST(INTERACTIVE),
190         TC_PRIO_INTERACTIVE_BULK,
191         ECN_OR_COST(INTERACTIVE_BULK),
192         TC_PRIO_INTERACTIVE_BULK,
193         ECN_OR_COST(INTERACTIVE_BULK)
194 };
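/*
 * Illustrative note (not part of route.c): in this era of the kernel the
 * table above is indexed by the four IP TOS bits shifted down by one,
 * e.g. via rt_tos2priority() in include/net/route.h:
 *
 *      prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * yielding an index in 0..15; even slots are plain TC_PRIO_* classes and
 * odd slots are the ECN_OR_COST() entries.
 */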
195
196
197 /*
198  * Route cache.
199  */
200
201 /* The locking scheme is rather straightforward:
202  *
203  * 1) Read-Copy Update protects the buckets of the central route hash.
204  * 2) Only writers remove entries, and they hold the lock
205  *    as they look at rtable reference counts.
206  * 3) Only readers acquire references to rtable entries,
207  *    they do so with atomic increments and with the
208  *    lock held.
209  */
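/*
 * Illustrative sketch (not part of route.c), assuming the rt_hash_table,
 * rtable and dst helpers defined later in this file: a reader walks a
 * bucket locklessly under rcu_read_lock_bh() and takes a reference with
 * an atomic increment, while writers (see rt_del() and rt_intern_hash()
 * below) unlink entries under the per-bucket spinlock.
 */
static struct rtable *example_rcu_lookup(unsigned hash, __be32 daddr)
{
        struct rtable *rth;

        rcu_read_lock_bh();
        for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
             rth = rcu_dereference(rth->u.dst.rt_next)) {
                if (rth->rt_dst == daddr) {
                        dst_hold(&rth->u.dst);  /* atomic refcount increment */
                        rcu_read_unlock_bh();
                        return rth;
                }
        }
        rcu_read_unlock_bh();
        return NULL;
}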
210
211 struct rt_hash_bucket {
212         struct rtable   *chain;
213 };
214 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
215         defined(CONFIG_PROVE_LOCKING)
216 /*
217  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
218  * The size of this table is a power of two and depends on the number of CPUs.
219  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
220  */
221 #ifdef CONFIG_LOCKDEP
222 # define RT_HASH_LOCK_SZ        256
223 #else
224 # if NR_CPUS >= 32
225 #  define RT_HASH_LOCK_SZ       4096
226 # elif NR_CPUS >= 16
227 #  define RT_HASH_LOCK_SZ       2048
228 # elif NR_CPUS >= 8
229 #  define RT_HASH_LOCK_SZ       1024
230 # elif NR_CPUS >= 4
231 #  define RT_HASH_LOCK_SZ       512
232 # else
233 #  define RT_HASH_LOCK_SZ       256
234 # endif
235 #endif
236
237 static spinlock_t       *rt_hash_locks;
238 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
239
240 static __init void rt_hash_lock_init(void)
241 {
242         int i;
243
244         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
245                         GFP_KERNEL);
246         if (!rt_hash_locks)
247                 panic("IP: failed to allocate rt_hash_locks\n");
248
249         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
250                 spin_lock_init(&rt_hash_locks[i]);
251 }
252 #else
253 # define rt_hash_lock_addr(slot) NULL
254
255 static inline void rt_hash_lock_init(void)
256 {
257 }
258 #endif
259
260 static struct rt_hash_bucket    *rt_hash_table;
261 static unsigned                 rt_hash_mask;
262 static unsigned int             rt_hash_log;
263 static unsigned int             rt_hash_rnd;
264
265 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
266 #define RT_CACHE_STAT_INC(field) \
267         (__raw_get_cpu_var(rt_cache_stat).field++)
268
269 static int rt_intern_hash(unsigned hash, struct rtable *rth,
270                                 struct rtable **res);
271
272 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
273 {
274         return (jhash_2words(daddr, saddr, rt_hash_rnd)
275                 & rt_hash_mask);
276 }
277
278 #define rt_hash(daddr, saddr, idx) \
279         rt_hash_code((__force u32)(__be32)(daddr),\
280                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
281
282 #ifdef CONFIG_PROC_FS
283 struct rt_cache_iter_state {
284         int bucket;
285 };
286
287 static struct rtable *rt_cache_get_first(struct seq_file *seq)
288 {
289         struct rtable *r = NULL;
290         struct rt_cache_iter_state *st = seq->private;
291
292         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
293                 rcu_read_lock_bh();
294                 r = rt_hash_table[st->bucket].chain;
295                 if (r)
296                         break;
297                 rcu_read_unlock_bh();
298         }
299         return rcu_dereference(r);
300 }
301
302 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
303 {
304         struct rt_cache_iter_state *st = seq->private;
305
306         r = r->u.dst.rt_next;
307         while (!r) {
308                 rcu_read_unlock_bh();
309                 if (--st->bucket < 0)
310                         break;
311                 rcu_read_lock_bh();
312                 r = rt_hash_table[st->bucket].chain;
313         }
314         return rcu_dereference(r);
315 }
316
317 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
318 {
319         struct rtable *r = rt_cache_get_first(seq);
320
321         if (r)
322                 while (pos && (r = rt_cache_get_next(seq, r)))
323                         --pos;
324         return pos ? NULL : r;
325 }
326
327 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
328 {
329         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
330 }
331
332 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
333 {
334         struct rtable *r = NULL;
335
336         if (v == SEQ_START_TOKEN)
337                 r = rt_cache_get_first(seq);
338         else
339                 r = rt_cache_get_next(seq, v);
340         ++*pos;
341         return r;
342 }
343
344 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
345 {
346         if (v && v != SEQ_START_TOKEN)
347                 rcu_read_unlock_bh();
348 }
349
350 static int rt_cache_seq_show(struct seq_file *seq, void *v)
351 {
352         if (v == SEQ_START_TOKEN)
353                 seq_printf(seq, "%-127s\n",
354                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
355                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
356                            "HHUptod\tSpecDst");
357         else {
358                 struct rtable *r = v;
359                 char temp[256];
360
361                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
362                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
363                         r->u.dst.dev ? r->u.dst.dev->name : "*",
364                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
365                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
366                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
367                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
368                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
369                         dst_metric(&r->u.dst, RTAX_WINDOW),
370                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
371                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
372                         r->fl.fl4_tos,
373                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
374                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
375                                        dev_queue_xmit) : 0,
376                         r->rt_spec_dst);
377                 seq_printf(seq, "%-127s\n", temp);
378         }
379         return 0;
380 }
381
382 static const struct seq_operations rt_cache_seq_ops = {
383         .start  = rt_cache_seq_start,
384         .next   = rt_cache_seq_next,
385         .stop   = rt_cache_seq_stop,
386         .show   = rt_cache_seq_show,
387 };
388
389 static int rt_cache_seq_open(struct inode *inode, struct file *file)
390 {
391         return seq_open_private(file, &rt_cache_seq_ops,
392                         sizeof(struct rt_cache_iter_state));
393 }
394
395 static const struct file_operations rt_cache_seq_fops = {
396         .owner   = THIS_MODULE,
397         .open    = rt_cache_seq_open,
398         .read    = seq_read,
399         .llseek  = seq_lseek,
400         .release = seq_release_private,
401 };
402
403
404 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
405 {
406         int cpu;
407
408         if (*pos == 0)
409                 return SEQ_START_TOKEN;
410
411         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
412                 if (!cpu_possible(cpu))
413                         continue;
414                 *pos = cpu+1;
415                 return &per_cpu(rt_cache_stat, cpu);
416         }
417         return NULL;
418 }
419
420 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
421 {
422         int cpu;
423
424         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
425                 if (!cpu_possible(cpu))
426                         continue;
427                 *pos = cpu+1;
428                 return &per_cpu(rt_cache_stat, cpu);
429         }
430         return NULL;
431
432 }
433
434 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
435 {
436
437 }
438
439 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
440 {
441         struct rt_cache_stat *st = v;
442
443         if (v == SEQ_START_TOKEN) {
444                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
445                 return 0;
446         }
447
448         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
449                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
450                    atomic_read(&ipv4_dst_ops.entries),
451                    st->in_hit,
452                    st->in_slow_tot,
453                    st->in_slow_mc,
454                    st->in_no_route,
455                    st->in_brd,
456                    st->in_martian_dst,
457                    st->in_martian_src,
458
459                    st->out_hit,
460                    st->out_slow_tot,
461                    st->out_slow_mc,
462
463                    st->gc_total,
464                    st->gc_ignored,
465                    st->gc_goal_miss,
466                    st->gc_dst_overflow,
467                    st->in_hlist_search,
468                    st->out_hlist_search
469                 );
470         return 0;
471 }
472
473 static const struct seq_operations rt_cpu_seq_ops = {
474         .start  = rt_cpu_seq_start,
475         .next   = rt_cpu_seq_next,
476         .stop   = rt_cpu_seq_stop,
477         .show   = rt_cpu_seq_show,
478 };
479
480
481 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
482 {
483         return seq_open(file, &rt_cpu_seq_ops);
484 }
485
486 static const struct file_operations rt_cpu_seq_fops = {
487         .owner   = THIS_MODULE,
488         .open    = rt_cpu_seq_open,
489         .read    = seq_read,
490         .llseek  = seq_lseek,
491         .release = seq_release,
492 };
493
494 #ifdef CONFIG_NET_CLS_ROUTE
495 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
496                            int length, int *eof, void *data)
497 {
498         unsigned int i;
499
500         if ((offset & 3) || (length & 3))
501                 return -EIO;
502
503         if (offset >= sizeof(struct ip_rt_acct) * 256) {
504                 *eof = 1;
505                 return 0;
506         }
507
508         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
509                 length = sizeof(struct ip_rt_acct) * 256 - offset;
510                 *eof = 1;
511         }
512
513         offset /= sizeof(u32);
514
515         if (length > 0) {
516                 u32 *dst = (u32 *) buffer;
517
518                 *start = buffer;
519                 memset(dst, 0, length);
520
521                 for_each_possible_cpu(i) {
522                         unsigned int j;
523                         u32 *src;
524
525                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
526                         for (j = 0; j < length/4; j++)
527                                 dst[j] += src[j];
528                 }
529         }
530         return length;
531 }
532 #endif
533
534 static __init int ip_rt_proc_init(struct net *net)
535 {
536         struct proc_dir_entry *pde;
537
538         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
539                         &rt_cache_seq_fops);
540         if (!pde)
541                 goto err1;
542
543         pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
544         if (!pde)
545                 goto err2;
546
547         pde->proc_fops = &rt_cpu_seq_fops;
548
549 #ifdef CONFIG_NET_CLS_ROUTE
550         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
551                         ip_rt_acct_read, NULL);
552         if (!pde)
553                 goto err3;
554 #endif
555         return 0;
556
557 #ifdef CONFIG_NET_CLS_ROUTE
558 err3:
559         remove_proc_entry("rt_cache", net->proc_net_stat);
560 #endif
561 err2:
562         remove_proc_entry("rt_cache", net->proc_net);
563 err1:
564         return -ENOMEM;
565 }
566 #else
567 static inline int ip_rt_proc_init(struct net *net)
568 {
569         return 0;
570 }
571 #endif /* CONFIG_PROC_FS */
572
573 static __inline__ void rt_free(struct rtable *rt)
574 {
575         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
576 }
577
578 static __inline__ void rt_drop(struct rtable *rt)
579 {
580         ip_rt_put(rt);
581         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
582 }
583
584 static __inline__ int rt_fast_clean(struct rtable *rth)
585 {
586         /* Kill broadcast/multicast entries very aggressively, if they
587            collide in the hash table with more useful entries */
588         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
589                 rth->fl.iif && rth->u.dst.rt_next;
590 }
591
592 static __inline__ int rt_valuable(struct rtable *rth)
593 {
594         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
595                 rth->u.dst.expires;
596 }
597
598 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
599 {
600         unsigned long age;
601         int ret = 0;
602
603         if (atomic_read(&rth->u.dst.__refcnt))
604                 goto out;
605
606         ret = 1;
607         if (rth->u.dst.expires &&
608             time_after_eq(jiffies, rth->u.dst.expires))
609                 goto out;
610
611         age = jiffies - rth->u.dst.lastuse;
612         ret = 0;
613         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
614             (age <= tmo2 && rt_valuable(rth)))
615                 goto out;
616         ret = 1;
617 out:    return ret;
618 }
619
620 /* Bits of score are:
621  * 31: very valuable
622  * 30: not quite useless
623  * 29..0: usage counter
624  */
625 static inline u32 rt_score(struct rtable *rt)
626 {
627         u32 score = jiffies - rt->u.dst.lastuse;
628
629         score = ~score & ~(3<<30);
630
631         if (rt_valuable(rt))
632                 score |= (1<<31);
633
634         if (!rt->fl.iif ||
635             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
636                 score |= (1<<30);
637
638         return score;
639 }
640
641 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
642 {
643         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
644                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
645                 (fl1->mark ^ fl2->mark) |
646                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
647                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
648                 (fl1->oif ^ fl2->oif) |
649                 (fl1->iif ^ fl2->iif)) == 0;
650 }
651
652 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
653 {
654         return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
655 }
656
657 /*
658  * Perform a full scan of the hash table and free all entries.
659  * Can be called from a softirq or from process context.
660  * In the latter case, we want to reschedule if necessary.
661  */
662 static void rt_do_flush(int process_context)
663 {
664         unsigned int i;
665         struct rtable *rth, *next;
666
667         for (i = 0; i <= rt_hash_mask; i++) {
668                 if (process_context && need_resched())
669                         cond_resched();
670                 rth = rt_hash_table[i].chain;
671                 if (!rth)
672                         continue;
673
674                 spin_lock_bh(rt_hash_lock_addr(i));
675                 rth = rt_hash_table[i].chain;
676                 rt_hash_table[i].chain = NULL;
677                 spin_unlock_bh(rt_hash_lock_addr(i));
678
679                 for (; rth; rth = next) {
680                         next = rth->u.dst.rt_next;
681                         rt_free(rth);
682                 }
683         }
684 }
685
686 static void rt_check_expire(void)
687 {
688         static unsigned int rover;
689         unsigned int i = rover, goal;
690         struct rtable *rth, **rthp;
691         u64 mult;
692
693         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
694         if (ip_rt_gc_timeout > 1)
695                 do_div(mult, ip_rt_gc_timeout);
696         goal = (unsigned int)mult;
697         if (goal > rt_hash_mask)
698                 goal = rt_hash_mask + 1;
699         for (; goal > 0; goal--) {
700                 unsigned long tmo = ip_rt_gc_timeout;
701
702                 i = (i + 1) & rt_hash_mask;
703                 rthp = &rt_hash_table[i].chain;
704
705                 if (need_resched())
706                         cond_resched();
707
708                 if (*rthp == NULL)
709                         continue;
710                 spin_lock_bh(rt_hash_lock_addr(i));
711                 while ((rth = *rthp) != NULL) {
712                         if (rth->u.dst.expires) {
713                                 /* Entry is expired even if it is in use */
714                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
715                                         tmo >>= 1;
716                                         rthp = &rth->u.dst.rt_next;
717                                         continue;
718                                 }
719                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
720                                 tmo >>= 1;
721                                 rthp = &rth->u.dst.rt_next;
722                                 continue;
723                         }
724
725                         /* Clean up aged-off entries. */
726                         *rthp = rth->u.dst.rt_next;
727                         rt_free(rth);
728                 }
729                 spin_unlock_bh(rt_hash_lock_addr(i));
730         }
731         rover = i;
732 }
733
734 /*
735  * rt_worker_func() is run in process context.
736  * If a full flush was scheduled, it is performed.
737  * Otherwise, rt_check_expire() is called to scan part of the hash table.
738  */
739 static void rt_worker_func(struct work_struct *work)
740 {
741         if (ip_rt_flush_expected) {
742                 ip_rt_flush_expected = 0;
743                 rt_do_flush(1);
744         } else
745                 rt_check_expire();
746         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
747 }
748
749 /* This can run from both BH and non-BH contexts, the latter
750  * in the case of a forced flush event.
751  */
752 static void rt_run_flush(unsigned long process_context)
753 {
754         rt_deadline = 0;
755
756         get_random_bytes(&rt_hash_rnd, 4);
757
758         rt_do_flush(process_context);
759 }
760
761 static DEFINE_SPINLOCK(rt_flush_lock);
762
763 void rt_cache_flush(int delay)
764 {
765         unsigned long now = jiffies;
766         int user_mode = !in_softirq();
767
768         if (delay < 0)
769                 delay = ip_rt_min_delay;
770
771         spin_lock_bh(&rt_flush_lock);
772
773         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
774                 long tmo = (long)(rt_deadline - now);
775
776                 /* If the flush timer is already running
777                    and the flush request is not immediate (delay > 0):
778
779                    if the deadline has not been reached, prolong the timer to "delay",
780                    otherwise fire it at the deadline.
781                  */
782
783                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
784                         tmo = 0;
785
786                 if (delay > tmo)
787                         delay = tmo;
788         }
789
790         if (delay <= 0) {
791                 spin_unlock_bh(&rt_flush_lock);
792                 rt_run_flush(user_mode);
793                 return;
794         }
795
796         if (rt_deadline == 0)
797                 rt_deadline = now + ip_rt_max_delay;
798
799         mod_timer(&rt_flush_timer, now+delay);
800         spin_unlock_bh(&rt_flush_lock);
801 }
802
803 /*
804  * We change rt_hash_rnd and ask the next rt_worker_func() invocation
805  * to perform a flush in process context
806  */
807 static void rt_secret_rebuild(unsigned long dummy)
808 {
809         get_random_bytes(&rt_hash_rnd, 4);
810         ip_rt_flush_expected = 1;
811         cancel_delayed_work(&expires_work);
812         schedule_delayed_work(&expires_work, HZ/10);
813         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
814 }
815
816 /*
817    Short description of GC goals.
818
819    We want an algorithm that keeps the routing cache at an
820    equilibrium point, where the number of aged-off entries is
821    approximately equal to the number of newly generated ones.
822
823    The current expiration strength is the variable "expire".
824    We try to adjust it dynamically, so that when the network is
825    idle, expire is large enough to keep enough warm entries, and
826    when load increases it shrinks to limit the cache size.
827  */
828
829 static int rt_garbage_collect(struct dst_ops *ops)
830 {
831         static unsigned long expire = RT_GC_TIMEOUT;
832         static unsigned long last_gc;
833         static int rover;
834         static int equilibrium;
835         struct rtable *rth, **rthp;
836         unsigned long now = jiffies;
837         int goal;
838
839         /*
840          * Garbage collection is pretty expensive,
841          * do not run it too frequently.
842          */
843
844         RT_CACHE_STAT_INC(gc_total);
845
846         if (now - last_gc < ip_rt_gc_min_interval &&
847             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
848                 RT_CACHE_STAT_INC(gc_ignored);
849                 goto out;
850         }
851
852         /* Calculate the number of entries we want to expire now. */
853         goal = atomic_read(&ipv4_dst_ops.entries) -
854                 (ip_rt_gc_elasticity << rt_hash_log);
855         if (goal <= 0) {
856                 if (equilibrium < ipv4_dst_ops.gc_thresh)
857                         equilibrium = ipv4_dst_ops.gc_thresh;
858                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
859                 if (goal > 0) {
860                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
861                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
862                 }
863         } else {
864                 /* We are in a dangerous area. Try to reduce the cache really
865                  * aggressively.
866                  */
867                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
868                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
869         }
870
871         if (now - last_gc >= ip_rt_gc_min_interval)
872                 last_gc = now;
873
874         if (goal <= 0) {
875                 equilibrium += goal;
876                 goto work_done;
877         }
878
879         do {
880                 int i, k;
881
882                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
883                         unsigned long tmo = expire;
884
885                         k = (k + 1) & rt_hash_mask;
886                         rthp = &rt_hash_table[k].chain;
887                         spin_lock_bh(rt_hash_lock_addr(k));
888                         while ((rth = *rthp) != NULL) {
889                                 if (!rt_may_expire(rth, tmo, expire)) {
890                                         tmo >>= 1;
891                                         rthp = &rth->u.dst.rt_next;
892                                         continue;
893                                 }
894                                 *rthp = rth->u.dst.rt_next;
895                                 rt_free(rth);
896                                 goal--;
897                         }
898                         spin_unlock_bh(rt_hash_lock_addr(k));
899                         if (goal <= 0)
900                                 break;
901                 }
902                 rover = k;
903
904                 if (goal <= 0)
905                         goto work_done;
906
907                 /* Goal is not achieved. We stop the process if:
908
909                    - expire is reduced to zero; otherwise expire is halved.
910                    - the table is not full.
911                    - we are called from interrupt context.
912                    - the jiffies check is just a fallback/debug loop breaker;
913                      we will not spin here for a long time in any case.
914                  */
915
916                 RT_CACHE_STAT_INC(gc_goal_miss);
917
918                 if (expire == 0)
919                         break;
920
921                 expire >>= 1;
922 #if RT_CACHE_DEBUG >= 2
923                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
924                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
925 #endif
926
927                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
928                         goto out;
929         } while (!in_softirq() && time_before_eq(jiffies, now));
930
931         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
932                 goto out;
933         if (net_ratelimit())
934                 printk(KERN_WARNING "dst cache overflow\n");
935         RT_CACHE_STAT_INC(gc_dst_overflow);
936         return 1;
937
938 work_done:
939         expire += ip_rt_gc_min_interval;
940         if (expire > ip_rt_gc_timeout ||
941             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
942                 expire = ip_rt_gc_timeout;
943 #if RT_CACHE_DEBUG >= 2
944         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
945                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
946 #endif
947 out:    return 0;
948 }
949
950 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
951 {
952         struct rtable   *rth, **rthp;
953         unsigned long   now;
954         struct rtable *cand, **candp;
955         u32             min_score;
956         int             chain_length;
957         int attempts = !in_softirq();
958
959 restart:
960         chain_length = 0;
961         min_score = ~(u32)0;
962         cand = NULL;
963         candp = NULL;
964         now = jiffies;
965
966         rthp = &rt_hash_table[hash].chain;
967
968         spin_lock_bh(rt_hash_lock_addr(hash));
969         while ((rth = *rthp) != NULL) {
970                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
971                         /* Put it first */
972                         *rthp = rth->u.dst.rt_next;
973                         /*
974                          * Since lookup is lockfree, the deletion
975                          * must be visible to another weakly ordered CPU before
976                          * the insertion at the start of the hash chain.
977                          */
978                         rcu_assign_pointer(rth->u.dst.rt_next,
979                                            rt_hash_table[hash].chain);
980                         /*
981                          * Since lookup is lockfree, the update writes
982                          * must be ordered for consistency on SMP.
983                          */
984                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
985
986                         dst_use(&rth->u.dst, now);
987                         spin_unlock_bh(rt_hash_lock_addr(hash));
988
989                         rt_drop(rt);
990                         *rp = rth;
991                         return 0;
992                 }
993
994                 if (!atomic_read(&rth->u.dst.__refcnt)) {
995                         u32 score = rt_score(rth);
996
997                         if (score <= min_score) {
998                                 cand = rth;
999                                 candp = rthp;
1000                                 min_score = score;
1001                         }
1002                 }
1003
1004                 chain_length++;
1005
1006                 rthp = &rth->u.dst.rt_next;
1007         }
1008
1009         if (cand) {
1010                 /* ip_rt_gc_elasticity used to be the average chain length;
1011                  * when exceeded, gc becomes really aggressive.
1012                  *
1013                  * The second limit is less certain. At the moment it allows
1014                  * only 2 entries per bucket. We will see.
1015                  */
1016                 if (chain_length > ip_rt_gc_elasticity) {
1017                         *candp = cand->u.dst.rt_next;
1018                         rt_free(cand);
1019                 }
1020         }
1021
1022         /* Try to bind the route to ARP only if it is an output
1023            route or on the unicast forwarding path.
1024          */
1025         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1026                 int err = arp_bind_neighbour(&rt->u.dst);
1027                 if (err) {
1028                         spin_unlock_bh(rt_hash_lock_addr(hash));
1029
1030                         if (err != -ENOBUFS) {
1031                                 rt_drop(rt);
1032                                 return err;
1033                         }
1034
1035                         /* Neighbour tables are full and nothing
1036                            can be released. Try to shrink the route cache;
1037                            it most likely holds some neighbour records.
1038                          */
1039                         if (attempts-- > 0) {
1040                                 int saved_elasticity = ip_rt_gc_elasticity;
1041                                 int saved_int = ip_rt_gc_min_interval;
1042                                 ip_rt_gc_elasticity     = 1;
1043                                 ip_rt_gc_min_interval   = 0;
1044                                 rt_garbage_collect(&ipv4_dst_ops);
1045                                 ip_rt_gc_min_interval   = saved_int;
1046                                 ip_rt_gc_elasticity     = saved_elasticity;
1047                                 goto restart;
1048                         }
1049
1050                         if (net_ratelimit())
1051                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1052                         rt_drop(rt);
1053                         return -ENOBUFS;
1054                 }
1055         }
1056
1057         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1058 #if RT_CACHE_DEBUG >= 2
1059         if (rt->u.dst.rt_next) {
1060                 struct rtable *trt;
1061                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1062                        NIPQUAD(rt->rt_dst));
1063                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1064                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1065                 printk("\n");
1066         }
1067 #endif
1068         rt_hash_table[hash].chain = rt;
1069         spin_unlock_bh(rt_hash_lock_addr(hash));
1070         *rp = rt;
1071         return 0;
1072 }
1073
1074 void rt_bind_peer(struct rtable *rt, int create)
1075 {
1076         static DEFINE_SPINLOCK(rt_peer_lock);
1077         struct inet_peer *peer;
1078
1079         peer = inet_getpeer(rt->rt_dst, create);
1080
1081         spin_lock_bh(&rt_peer_lock);
1082         if (rt->peer == NULL) {
1083                 rt->peer = peer;
1084                 peer = NULL;
1085         }
1086         spin_unlock_bh(&rt_peer_lock);
1087         if (peer)
1088                 inet_putpeer(peer);
1089 }
1090
1091 /*
1092  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1093  * we can still generate some output.
1094  * Random ID selection looks a bit dangerous because we have no chance of
1095  * selecting an ID that is unique within a reasonable period of time.
1096  * But a broken packet identifier may be better than no packet at all.
1097  */
1098 static void ip_select_fb_ident(struct iphdr *iph)
1099 {
1100         static DEFINE_SPINLOCK(ip_fb_id_lock);
1101         static u32 ip_fallback_id;
1102         u32 salt;
1103
1104         spin_lock_bh(&ip_fb_id_lock);
1105         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1106         iph->id = htons(salt & 0xFFFF);
1107         ip_fallback_id = salt;
1108         spin_unlock_bh(&ip_fb_id_lock);
1109 }
1110
1111 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1112 {
1113         struct rtable *rt = (struct rtable *) dst;
1114
1115         if (rt) {
1116                 if (rt->peer == NULL)
1117                         rt_bind_peer(rt, 1);
1118
1119                 /* If a peer is attached to the destination, it is never detached,
1120                    so we do not need to grab a lock to dereference it.
1121                  */
1122                 if (rt->peer) {
1123                         iph->id = htons(inet_getid(rt->peer, more));
1124                         return;
1125                 }
1126         } else
1127                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1128                        __builtin_return_address(0));
1129
1130         ip_select_fb_ident(iph);
1131 }
1132
1133 static void rt_del(unsigned hash, struct rtable *rt)
1134 {
1135         struct rtable **rthp;
1136
1137         spin_lock_bh(rt_hash_lock_addr(hash));
1138         ip_rt_put(rt);
1139         for (rthp = &rt_hash_table[hash].chain; *rthp;
1140              rthp = &(*rthp)->u.dst.rt_next)
1141                 if (*rthp == rt) {
1142                         *rthp = rt->u.dst.rt_next;
1143                         rt_free(rt);
1144                         break;
1145                 }
1146         spin_unlock_bh(rt_hash_lock_addr(hash));
1147 }
1148
1149 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1150                     __be32 saddr, struct net_device *dev)
1151 {
1152         int i, k;
1153         struct in_device *in_dev = in_dev_get(dev);
1154         struct rtable *rth, **rthp;
1155         __be32  skeys[2] = { saddr, 0 };
1156         int  ikeys[2] = { dev->ifindex, 0 };
1157         struct netevent_redirect netevent;
1158
1159         if (!in_dev)
1160                 return;
1161
1162         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1163             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1164             || ipv4_is_zeronet(new_gw))
1165                 goto reject_redirect;
1166
1167         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1168                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1169                         goto reject_redirect;
1170                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1171                         goto reject_redirect;
1172         } else {
1173                 if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
1174                         goto reject_redirect;
1175         }
1176
1177         for (i = 0; i < 2; i++) {
1178                 for (k = 0; k < 2; k++) {
1179                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1180
1181                         rthp=&rt_hash_table[hash].chain;
1182
1183                         rcu_read_lock();
1184                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1185                                 struct rtable *rt;
1186
1187                                 if (rth->fl.fl4_dst != daddr ||
1188                                     rth->fl.fl4_src != skeys[i] ||
1189                                     rth->fl.oif != ikeys[k] ||
1190                                     rth->fl.iif != 0) {
1191                                         rthp = &rth->u.dst.rt_next;
1192                                         continue;
1193                                 }
1194
1195                                 if (rth->rt_dst != daddr ||
1196                                     rth->rt_src != saddr ||
1197                                     rth->u.dst.error ||
1198                                     rth->rt_gateway != old_gw ||
1199                                     rth->u.dst.dev != dev)
1200                                         break;
1201
1202                                 dst_hold(&rth->u.dst);
1203                                 rcu_read_unlock();
1204
1205                                 rt = dst_alloc(&ipv4_dst_ops);
1206                                 if (rt == NULL) {
1207                                         ip_rt_put(rth);
1208                                         in_dev_put(in_dev);
1209                                         return;
1210                                 }
1211
1212                                 /* Copy all the information. */
1213                                 *rt = *rth;
1214                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1215                                 rt->u.dst.__use         = 1;
1216                                 atomic_set(&rt->u.dst.__refcnt, 1);
1217                                 rt->u.dst.child         = NULL;
1218                                 if (rt->u.dst.dev)
1219                                         dev_hold(rt->u.dst.dev);
1220                                 if (rt->idev)
1221                                         in_dev_hold(rt->idev);
1222                                 rt->u.dst.obsolete      = 0;
1223                                 rt->u.dst.lastuse       = jiffies;
1224                                 rt->u.dst.path          = &rt->u.dst;
1225                                 rt->u.dst.neighbour     = NULL;
1226                                 rt->u.dst.hh            = NULL;
1227                                 rt->u.dst.xfrm          = NULL;
1228
1229                                 rt->rt_flags            |= RTCF_REDIRECTED;
1230
1231                                 /* Gateway is different ... */
1232                                 rt->rt_gateway          = new_gw;
1233
1234                                 /* Redirect received -> path was valid */
1235                                 dst_confirm(&rth->u.dst);
1236
1237                                 if (rt->peer)
1238                                         atomic_inc(&rt->peer->refcnt);
1239
1240                                 if (arp_bind_neighbour(&rt->u.dst) ||
1241                                     !(rt->u.dst.neighbour->nud_state &
1242                                             NUD_VALID)) {
1243                                         if (rt->u.dst.neighbour)
1244                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1245                                         ip_rt_put(rth);
1246                                         rt_drop(rt);
1247                                         goto do_next;
1248                                 }
1249
1250                                 netevent.old = &rth->u.dst;
1251                                 netevent.new = &rt->u.dst;
1252                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1253                                                         &netevent);
1254
1255                                 rt_del(hash, rth);
1256                                 if (!rt_intern_hash(hash, rt, &rt))
1257                                         ip_rt_put(rt);
1258                                 goto do_next;
1259                         }
1260                         rcu_read_unlock();
1261                 do_next:
1262                         ;
1263                 }
1264         }
1265         in_dev_put(in_dev);
1266         return;
1267
1268 reject_redirect:
1269 #ifdef CONFIG_IP_ROUTE_VERBOSE
1270         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1271                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1272                         "%u.%u.%u.%u ignored.\n"
1273                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1274                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1275                        NIPQUAD(saddr), NIPQUAD(daddr));
1276 #endif
1277         in_dev_put(in_dev);
1278 }
1279
1280 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1281 {
1282         struct rtable *rt = (struct rtable*)dst;
1283         struct dst_entry *ret = dst;
1284
1285         if (rt) {
1286                 if (dst->obsolete) {
1287                         ip_rt_put(rt);
1288                         ret = NULL;
1289                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1290                            rt->u.dst.expires) {
1291                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1292                                                 rt->fl.oif);
1293 #if RT_CACHE_DEBUG >= 1
1294                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1295                                           "%u.%u.%u.%u/%02x dropped\n",
1296                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1297 #endif
1298                         rt_del(hash, rt);
1299                         ret = NULL;
1300                 }
1301         }
1302         return ret;
1303 }
1304
1305 /*
1306  * Algorithm:
1307  *      1. The first ip_rt_redirect_number redirects are sent
1308  *         with exponential backoff, then we stop sending them at all,
1309  *         assuming that the host ignores our redirects.
1310  *      2. If we did not see packets requiring redirects
1311  *         during ip_rt_redirect_silence, we assume that the host
1312  *         forgot the redirected route and start sending redirects again.
1313  *
1314  * This algorithm is much cheaper and more intelligent than dumb load limiting
1315  * in icmp.c.
1316  *
1317  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1318  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1319  */
1320
1321 void ip_rt_send_redirect(struct sk_buff *skb)
1322 {
1323         struct rtable *rt = (struct rtable*)skb->dst;
1324         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1325
1326         if (!in_dev)
1327                 return;
1328
1329         if (!IN_DEV_TX_REDIRECTS(in_dev))
1330                 goto out;
1331
1332         /* No redirected packets during ip_rt_redirect_silence;
1333          * reset the algorithm.
1334          */
1335         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1336                 rt->u.dst.rate_tokens = 0;
1337
1338         /* Too many ignored redirects; do not send anything.
1339          * Set u.dst.rate_last to the last seen redirected packet.
1340          */
1341         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1342                 rt->u.dst.rate_last = jiffies;
1343                 goto out;
1344         }
1345
1346         /* Check for load limit; set rate_last to the latest sent
1347          * redirect.
1348          */
1349         if (rt->u.dst.rate_tokens == 0 ||
1350             time_after(jiffies,
1351                        (rt->u.dst.rate_last +
1352                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1353                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1354                 rt->u.dst.rate_last = jiffies;
1355                 ++rt->u.dst.rate_tokens;
1356 #ifdef CONFIG_IP_ROUTE_VERBOSE
1357                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1358                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1359                     net_ratelimit())
1360                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1361                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1362                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1363                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1364 #endif
1365         }
1366 out:
1367         in_dev_put(in_dev);
1368 }
1369
1370 static int ip_error(struct sk_buff *skb)
1371 {
1372         struct rtable *rt = (struct rtable*)skb->dst;
1373         unsigned long now;
1374         int code;
1375
1376         switch (rt->u.dst.error) {
1377                 case EINVAL:
1378                 default:
1379                         goto out;
1380                 case EHOSTUNREACH:
1381                         code = ICMP_HOST_UNREACH;
1382                         break;
1383                 case ENETUNREACH:
1384                         code = ICMP_NET_UNREACH;
1385                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1386                         break;
1387                 case EACCES:
1388                         code = ICMP_PKT_FILTERED;
1389                         break;
1390         }
1391
1392         now = jiffies;
1393         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1394         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1395                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1396         rt->u.dst.rate_last = now;
1397         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1398                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1399                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1400         }
1401
1402 out:    kfree_skb(skb);
1403         return 0;
1404 }
1405
1406 /*
1407  *      The last two values are not from the RFC but
1408  *      are needed for AMPRnet AX.25 paths.
1409  */
1410
1411 static const unsigned short mtu_plateau[] =
1412 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1413
1414 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1415 {
1416         int i;
1417
1418         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1419                 if (old_mtu > mtu_plateau[i])
1420                         return mtu_plateau[i];
1421         return 68;
1422 }
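/*
 * Illustrative example (not part of route.c): guess_mtu(1500) walks the
 * plateau table and returns 1492, the largest plateau strictly below the
 * old MTU; values at or below the smallest plateau (128) fall through to
 * the 68-byte minimum.
 */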
1423
1424 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1425                                  unsigned short new_mtu)
1426 {
1427         int i;
1428         unsigned short old_mtu = ntohs(iph->tot_len);
1429         struct rtable *rth;
1430         __be32  skeys[2] = { iph->saddr, 0, };
1431         __be32  daddr = iph->daddr;
1432         unsigned short est_mtu = 0;
1433
1434         if (ipv4_config.no_pmtu_disc)
1435                 return 0;
1436
1437         for (i = 0; i < 2; i++) {
1438                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1439
1440                 rcu_read_lock();
1441                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1442                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1443                         if (rth->fl.fl4_dst == daddr &&
1444                             rth->fl.fl4_src == skeys[i] &&
1445                             rth->rt_dst  == daddr &&
1446                             rth->rt_src  == iph->saddr &&
1447                             rth->fl.iif == 0 &&
1448                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1449                             rth->u.dst.dev->nd_net == net) {
1450                                 unsigned short mtu = new_mtu;
1451
1452                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1453
1454                                         /* BSD 4.2 compatibility hack :-( */
1455                                         if (mtu == 0 &&
1456                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1457                                             old_mtu >= 68 + (iph->ihl << 2))
1458                                                 old_mtu -= iph->ihl << 2;
1459
1460                                         mtu = guess_mtu(old_mtu);
1461                                 }
1462                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1463                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1464                                                 dst_confirm(&rth->u.dst);
1465                                                 if (mtu < ip_rt_min_pmtu) {
1466                                                         mtu = ip_rt_min_pmtu;
1467                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1468                                                                 (1 << RTAX_MTU);
1469                                                 }
1470                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1471                                                 dst_set_expires(&rth->u.dst,
1472                                                         ip_rt_mtu_expires);
1473                                         }
1474                                         est_mtu = mtu;
1475                                 }
1476                         }
1477                 }
1478                 rcu_read_unlock();
1479         }
1480         return est_mtu ? : new_mtu;
1481 }
1482
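/*
 *	dst_ops->update_pmtu callback: lower the cached MTU metric of a single
 *	route (never below ip_rt_min_pmtu; the metric is locked if clamped),
 *	set an expiry and notify netevent listeners.
 */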
1483 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1484 {
1485         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1486             !(dst_metric_locked(dst, RTAX_MTU))) {
1487                 if (mtu < ip_rt_min_pmtu) {
1488                         mtu = ip_rt_min_pmtu;
1489                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1490                 }
1491                 dst->metrics[RTAX_MTU-1] = mtu;
1492                 dst_set_expires(dst, ip_rt_mtu_expires);
1493                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1494         }
1495 }
1496
1497 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1498 {
1499         return NULL;
1500 }
1501
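/*
 *	dst destructor: drop the inet_peer and in_device references held by
 *	the route cache entry.
 */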
1502 static void ipv4_dst_destroy(struct dst_entry *dst)
1503 {
1504         struct rtable *rt = (struct rtable *) dst;
1505         struct inet_peer *peer = rt->peer;
1506         struct in_device *idev = rt->idev;
1507
1508         if (peer) {
1509                 rt->peer = NULL;
1510                 inet_putpeer(peer);
1511         }
1512
1513         if (idev) {
1514                 rt->idev = NULL;
1515                 in_dev_put(idev);
1516         }
1517 }
1518
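/*
 *	Called when the device bound to a cache entry goes away: repoint
 *	rt->idev at the loopback device of the same namespace so the entry
 *	keeps holding a valid in_device reference.
 */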
1519 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1520                             int how)
1521 {
1522         struct rtable *rt = (struct rtable *) dst;
1523         struct in_device *idev = rt->idev;
1524         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1525                 struct in_device *loopback_idev =
1526                         in_dev_get(dev->nd_net->loopback_dev);
1527                 if (loopback_idev) {
1528                         rt->idev = loopback_idev;
1529                         in_dev_put(idev);
1530                 }
1531         }
1532 }
1533
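/*
 *	Link-layer resolution failed: report host unreachable to the sender
 *	and expire the cached route immediately.
 */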
1534 static void ipv4_link_failure(struct sk_buff *skb)
1535 {
1536         struct rtable *rt;
1537
1538         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1539
1540         rt = (struct rtable *) skb->dst;
1541         if (rt)
1542                 dst_set_expires(&rt->u.dst, 0);
1543 }
1544
1545 static int ip_rt_bug(struct sk_buff *skb)
1546 {
1547         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1548                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1549                 skb->dev ? skb->dev->name : "?");
1550         kfree_skb(skb);
1551         return 0;
1552 }
1553
1554 /*
1555    We do not cache the source address of the outgoing interface,
1556    because it is used only by the IP RR, TS and SRR options,
1557    so it is out of the fast path.
1558 
1559    BTW remember: "addr" is allowed to be unaligned
1560    in IP options!
1561  */
1562
1563 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1564 {
1565         __be32 src;
1566         struct fib_result res;
1567
1568         if (rt->fl.iif == 0)
1569                 src = rt->rt_src;
1570         else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1571                 src = FIB_RES_PREFSRC(res);
1572                 fib_res_put(&res);
1573         } else
1574                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1575                                         RT_SCOPE_UNIVERSE);
1576         memcpy(addr, &src, 4);
1577 }
1578
1579 #ifdef CONFIG_NET_CLS_ROUTE
1580 static void set_class_tag(struct rtable *rt, u32 tag)
1581 {
1582         if (!(rt->u.dst.tclassid & 0xFFFF))
1583                 rt->u.dst.tclassid |= tag & 0xFFFF;
1584         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1585                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1586 }
1587 #endif
1588
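/*
 *	Fill in the nexthop-dependent fields of a new cache entry from the
 *	FIB result: gateway, metrics (copied from the fib_info when present),
 *	classid, and sane defaults for MTU, hoplimit and advmss, clamped to
 *	IP_MAX_MTU and 65535 - 40 respectively.
 */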
1589 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1590 {
1591         struct fib_info *fi = res->fi;
1592
1593         if (fi) {
1594                 if (FIB_RES_GW(*res) &&
1595                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1596                         rt->rt_gateway = FIB_RES_GW(*res);
1597                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1598                        sizeof(rt->u.dst.metrics));
1599                 if (fi->fib_mtu == 0) {
1600                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1601                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1602                             rt->rt_gateway != rt->rt_dst &&
1603                             rt->u.dst.dev->mtu > 576)
1604                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1605                 }
1606 #ifdef CONFIG_NET_CLS_ROUTE
1607                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1608 #endif
1609         } else
1610                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1611
1612         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1613                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1614         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1615                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1616         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1617                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1618                                        ip_rt_min_advmss);
1619         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1620                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1621
1622 #ifdef CONFIG_NET_CLS_ROUTE
1623 #ifdef CONFIG_IP_MULTIPLE_TABLES
1624         set_class_tag(rt, fib_rules_tclass(res));
1625 #endif
1626         set_class_tag(rt, itag);
1627 #endif
1628         rt->rt_type = res->type;
1629 }
1630
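/*
 *	Build an input route for a multicast destination: validate the source
 *	address, allocate a cache entry that is delivered locally (and/or fed
 *	to the multicast forwarding engine) and insert it into the hash.
 */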
1631 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1632                                 u8 tos, struct net_device *dev, int our)
1633 {
1634         unsigned hash;
1635         struct rtable *rth;
1636         __be32 spec_dst;
1637         struct in_device *in_dev = in_dev_get(dev);
1638         u32 itag = 0;
1639
1640         /* Primary sanity checks. */
1641
1642         if (in_dev == NULL)
1643                 return -EINVAL;
1644
1645         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1646             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1647                 goto e_inval;
1648
1649         if (ipv4_is_zeronet(saddr)) {
1650                 if (!ipv4_is_local_multicast(daddr))
1651                         goto e_inval;
1652                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1653         } else if (fib_validate_source(saddr, 0, tos, 0,
1654                                         dev, &spec_dst, &itag) < 0)
1655                 goto e_inval;
1656
1657         rth = dst_alloc(&ipv4_dst_ops);
1658         if (!rth)
1659                 goto e_nobufs;
1660
1661         rth->u.dst.output= ip_rt_bug;
1662
1663         atomic_set(&rth->u.dst.__refcnt, 1);
1664         rth->u.dst.flags= DST_HOST;
1665         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1666                 rth->u.dst.flags |= DST_NOPOLICY;
1667         rth->fl.fl4_dst = daddr;
1668         rth->rt_dst     = daddr;
1669         rth->fl.fl4_tos = tos;
1670         rth->fl.mark    = skb->mark;
1671         rth->fl.fl4_src = saddr;
1672         rth->rt_src     = saddr;
1673 #ifdef CONFIG_NET_CLS_ROUTE
1674         rth->u.dst.tclassid = itag;
1675 #endif
1676         rth->rt_iif     =
1677         rth->fl.iif     = dev->ifindex;
1678         rth->u.dst.dev  = init_net.loopback_dev;
1679         dev_hold(rth->u.dst.dev);
1680         rth->idev       = in_dev_get(rth->u.dst.dev);
1681         rth->fl.oif     = 0;
1682         rth->rt_gateway = daddr;
1683         rth->rt_spec_dst= spec_dst;
1684         rth->rt_type    = RTN_MULTICAST;
1685         rth->rt_flags   = RTCF_MULTICAST;
1686         if (our) {
1687                 rth->u.dst.input= ip_local_deliver;
1688                 rth->rt_flags |= RTCF_LOCAL;
1689         }
1690
1691 #ifdef CONFIG_IP_MROUTE
1692         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1693                 rth->u.dst.input = ip_mr_input;
1694 #endif
1695         RT_CACHE_STAT_INC(in_slow_mc);
1696
1697         in_dev_put(in_dev);
1698         hash = rt_hash(daddr, saddr, dev->ifindex);
1699         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1700
1701 e_nobufs:
1702         in_dev_put(in_dev);
1703         return -ENOBUFS;
1704
1705 e_inval:
1706         in_dev_put(in_dev);
1707         return -EINVAL;
1708 }
1709
1710
1711 static void ip_handle_martian_source(struct net_device *dev,
1712                                      struct in_device *in_dev,
1713                                      struct sk_buff *skb,
1714                                      __be32 daddr,
1715                                      __be32 saddr)
1716 {
1717         RT_CACHE_STAT_INC(in_martian_src);
1718 #ifdef CONFIG_IP_ROUTE_VERBOSE
1719         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1720                 /*
1721                  *	RFC1812 recommendation: if the source is martian,
1722                  *	the only hint is the MAC header.
1723                  */
1724                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1725                         "%u.%u.%u.%u, on dev %s\n",
1726                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1727                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1728                         int i;
1729                         const unsigned char *p = skb_mac_header(skb);
1730                         printk(KERN_WARNING "ll header: ");
1731                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1732                                 printk("%02x", *p);
1733                                 if (i < (dev->hard_header_len - 1))
1734                                         printk(":");
1735                         }
1736                         printk("\n");
1737                 }
1738         }
1739 #endif
1740 }
1741
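/*
 *	Build a forwarding cache entry for a unicast input route: check the
 *	source address against the FIB, decide whether an ICMP redirect is in
 *	order (RTCF_DOREDIRECT), then allocate and fill the rtable with
 *	ip_forward/ip_output as its handlers.
 */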
1742 static inline int __mkroute_input(struct sk_buff *skb,
1743                                   struct fib_result* res,
1744                                   struct in_device *in_dev,
1745                                   __be32 daddr, __be32 saddr, u32 tos,
1746                                   struct rtable **result)
1747 {
1748
1749         struct rtable *rth;
1750         int err;
1751         struct in_device *out_dev;
1752         unsigned flags = 0;
1753         __be32 spec_dst;
1754         u32 itag;
1755
1756         /* get a working reference to the output device */
1757         out_dev = in_dev_get(FIB_RES_DEV(*res));
1758         if (out_dev == NULL) {
1759                 if (net_ratelimit())
1760                         printk(KERN_CRIT "Bug in ip_route_input" \
1761                                "_slow(). Please, report\n");
1762                 return -EINVAL;
1763         }
1764
1765
1766         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1767                                   in_dev->dev, &spec_dst, &itag);
1768         if (err < 0) {
1769                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1770                                          saddr);
1771
1772                 err = -EINVAL;
1773                 goto cleanup;
1774         }
1775
1776         if (err)
1777                 flags |= RTCF_DIRECTSRC;
1778
1779         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1780             (IN_DEV_SHARED_MEDIA(out_dev) ||
1781              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1782                 flags |= RTCF_DOREDIRECT;
1783
1784         if (skb->protocol != htons(ETH_P_IP)) {
1785                 /* Not IP (i.e. ARP). Do not create a route if it is
1786                  * invalid for proxy ARP. DNAT routes are always valid.
1787                  */
1788                 if (out_dev == in_dev) {
1789                         err = -EINVAL;
1790                         goto cleanup;
1791                 }
1792         }
1793
1794
1795         rth = dst_alloc(&ipv4_dst_ops);
1796         if (!rth) {
1797                 err = -ENOBUFS;
1798                 goto cleanup;
1799         }
1800
1801         atomic_set(&rth->u.dst.__refcnt, 1);
1802         rth->u.dst.flags= DST_HOST;
1803         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1804                 rth->u.dst.flags |= DST_NOPOLICY;
1805         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1806                 rth->u.dst.flags |= DST_NOXFRM;
1807         rth->fl.fl4_dst = daddr;
1808         rth->rt_dst     = daddr;
1809         rth->fl.fl4_tos = tos;
1810         rth->fl.mark    = skb->mark;
1811         rth->fl.fl4_src = saddr;
1812         rth->rt_src     = saddr;
1813         rth->rt_gateway = daddr;
1814         rth->rt_iif     =
1815                 rth->fl.iif     = in_dev->dev->ifindex;
1816         rth->u.dst.dev  = (out_dev)->dev;
1817         dev_hold(rth->u.dst.dev);
1818         rth->idev       = in_dev_get(rth->u.dst.dev);
1819         rth->fl.oif     = 0;
1820         rth->rt_spec_dst= spec_dst;
1821
1822         rth->u.dst.input = ip_forward;
1823         rth->u.dst.output = ip_output;
1824
1825         rt_set_nexthop(rth, res, itag);
1826
1827         rth->rt_flags = flags;
1828
1829         *result = rth;
1830         err = 0;
1831  cleanup:
1832         /* release the working reference to the output device */
1833         in_dev_put(out_dev);
1834         return err;
1835 }
1836
1837 static inline int ip_mkroute_input(struct sk_buff *skb,
1838                                    struct fib_result* res,
1839                                    const struct flowi *fl,
1840                                    struct in_device *in_dev,
1841                                    __be32 daddr, __be32 saddr, u32 tos)
1842 {
1843         struct rtable* rth = NULL;
1844         int err;
1845         unsigned hash;
1846
1847 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1848         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1849                 fib_select_multipath(fl, res);
1850 #endif
1851
1852         /* create a routing cache entry */
1853         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1854         if (err)
1855                 return err;
1856
1857         /* put it into the cache */
1858         hash = rt_hash(daddr, saddr, fl->iif);
1859         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1860 }
1861
1862 /*
1863  *	NOTE. We drop all packets that have a local source
1864  *	address, because every properly looped-back packet
1865  *	must already have the correct destination attached by the output routine.
1866  *
1867  *	This approach solves two big problems:
1868  *	1. Non-simplex devices are handled properly.
1869  *	2. IP spoofing attempts are filtered out with a 100% guarantee.
1870  */
1871
1872 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1873                                u8 tos, struct net_device *dev)
1874 {
1875         struct fib_result res;
1876         struct in_device *in_dev = in_dev_get(dev);
1877         struct flowi fl = { .nl_u = { .ip4_u =
1878                                       { .daddr = daddr,
1879                                         .saddr = saddr,
1880                                         .tos = tos,
1881                                         .scope = RT_SCOPE_UNIVERSE,
1882                                       } },
1883                             .mark = skb->mark,
1884                             .iif = dev->ifindex };
1885         unsigned        flags = 0;
1886         u32             itag = 0;
1887         struct rtable * rth;
1888         unsigned        hash;
1889         __be32          spec_dst;
1890         int             err = -EINVAL;
1891         int             free_res = 0;
1892         struct net    * net = dev->nd_net;
1893
1894         /* IP on this device is disabled. */
1895
1896         if (!in_dev)
1897                 goto out;
1898
1899         /* Check for the weirdest martians, which may not be detected
1900            by fib_lookup.
1901          */
1902
1903         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1904             ipv4_is_loopback(saddr))
1905                 goto martian_source;
1906
1907         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1908                 goto brd_input;
1909
1910         /* Accept zero addresses only to limited broadcast;
1911          * I do not even know whether to fix this or not. Waiting for complaints :-)
1912          */
1913         if (ipv4_is_zeronet(saddr))
1914                 goto martian_source;
1915
1916         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1917             ipv4_is_loopback(daddr))
1918                 goto martian_destination;
1919
1920         /*
1921          *      Now we are ready to route packet.
1922          */
1923         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1924                 if (!IN_DEV_FORWARD(in_dev))
1925                         goto e_hostunreach;
1926                 goto no_route;
1927         }
1928         free_res = 1;
1929
1930         RT_CACHE_STAT_INC(in_slow_tot);
1931
1932         if (res.type == RTN_BROADCAST)
1933                 goto brd_input;
1934
1935         if (res.type == RTN_LOCAL) {
1936                 int result;
1937                 result = fib_validate_source(saddr, daddr, tos,
1938                                              net->loopback_dev->ifindex,
1939                                              dev, &spec_dst, &itag);
1940                 if (result < 0)
1941                         goto martian_source;
1942                 if (result)
1943                         flags |= RTCF_DIRECTSRC;
1944                 spec_dst = daddr;
1945                 goto local_input;
1946         }
1947
1948         if (!IN_DEV_FORWARD(in_dev))
1949                 goto e_hostunreach;
1950         if (res.type != RTN_UNICAST)
1951                 goto martian_destination;
1952
1953         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1954 done:
1955         in_dev_put(in_dev);
1956         if (free_res)
1957                 fib_res_put(&res);
1958 out:    return err;
1959
1960 brd_input:
1961         if (skb->protocol != htons(ETH_P_IP))
1962                 goto e_inval;
1963
1964         if (ipv4_is_zeronet(saddr))
1965                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1966         else {
1967                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1968                                           &itag);
1969                 if (err < 0)
1970                         goto martian_source;
1971                 if (err)
1972                         flags |= RTCF_DIRECTSRC;
1973         }
1974         flags |= RTCF_BROADCAST;
1975         res.type = RTN_BROADCAST;
1976         RT_CACHE_STAT_INC(in_brd);
1977
1978 local_input:
1979         rth = dst_alloc(&ipv4_dst_ops);
1980         if (!rth)
1981                 goto e_nobufs;
1982
1983         rth->u.dst.output= ip_rt_bug;
1984
1985         atomic_set(&rth->u.dst.__refcnt, 1);
1986         rth->u.dst.flags= DST_HOST;
1987         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1988                 rth->u.dst.flags |= DST_NOPOLICY;
1989         rth->fl.fl4_dst = daddr;
1990         rth->rt_dst     = daddr;
1991         rth->fl.fl4_tos = tos;
1992         rth->fl.mark    = skb->mark;
1993         rth->fl.fl4_src = saddr;
1994         rth->rt_src     = saddr;
1995 #ifdef CONFIG_NET_CLS_ROUTE
1996         rth->u.dst.tclassid = itag;
1997 #endif
1998         rth->rt_iif     =
1999         rth->fl.iif     = dev->ifindex;
2000         rth->u.dst.dev  = net->loopback_dev;
2001         dev_hold(rth->u.dst.dev);
2002         rth->idev       = in_dev_get(rth->u.dst.dev);
2003         rth->rt_gateway = daddr;
2004         rth->rt_spec_dst= spec_dst;
2005         rth->u.dst.input= ip_local_deliver;
2006         rth->rt_flags   = flags|RTCF_LOCAL;
2007         if (res.type == RTN_UNREACHABLE) {
2008                 rth->u.dst.input= ip_error;
2009                 rth->u.dst.error= -err;
2010                 rth->rt_flags   &= ~RTCF_LOCAL;
2011         }
2012         rth->rt_type    = res.type;
2013         hash = rt_hash(daddr, saddr, fl.iif);
2014         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2015         goto done;
2016
2017 no_route:
2018         RT_CACHE_STAT_INC(in_no_route);
2019         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2020         res.type = RTN_UNREACHABLE;
2021         if (err == -ESRCH)
2022                 err = -ENETUNREACH;
2023         goto local_input;
2024
2025         /*
2026          *      Do not cache martian addresses: they should be logged (RFC1812)
2027          */
2028 martian_destination:
2029         RT_CACHE_STAT_INC(in_martian_dst);
2030 #ifdef CONFIG_IP_ROUTE_VERBOSE
2031         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2032                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2033                         "%u.%u.%u.%u, dev %s\n",
2034                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2035 #endif
2036
2037 e_hostunreach:
2038         err = -EHOSTUNREACH;
2039         goto done;
2040
2041 e_inval:
2042         err = -EINVAL;
2043         goto done;
2044
2045 e_nobufs:
2046         err = -ENOBUFS;
2047         goto done;
2048
2049 martian_source:
2050         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2051         goto e_inval;
2052 }
2053
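/*
 *	Input route lookup entry point: probe the route cache under RCU using
 *	(daddr, saddr, iif, tos, mark) as the key; on a miss, handle multicast
 *	destinations specially (see the comment below) or fall back to
 *	ip_route_input_slow.
 */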
2054 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2055                    u8 tos, struct net_device *dev)
2056 {
2057         struct rtable * rth;
2058         unsigned        hash;
2059         int iif = dev->ifindex;
2060         struct net *net;
2061
2062         net = skb->dev->nd_net;
2063         tos &= IPTOS_RT_MASK;
2064         hash = rt_hash(daddr, saddr, iif);
2065
2066         rcu_read_lock();
2067         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2068              rth = rcu_dereference(rth->u.dst.rt_next)) {
2069                 if (rth->fl.fl4_dst == daddr &&
2070                     rth->fl.fl4_src == saddr &&
2071                     rth->fl.iif == iif &&
2072                     rth->fl.oif == 0 &&
2073                     rth->fl.mark == skb->mark &&
2074                     rth->fl.fl4_tos == tos &&
2075                     rth->u.dst.dev->nd_net == net) {
2076                         dst_use(&rth->u.dst, jiffies);
2077                         RT_CACHE_STAT_INC(in_hit);
2078                         rcu_read_unlock();
2079                         skb->dst = (struct dst_entry*)rth;
2080                         return 0;
2081                 }
2082                 RT_CACHE_STAT_INC(in_hlist_search);
2083         }
2084         rcu_read_unlock();
2085
2086         /* Multicast recognition logic has been moved from the route cache
2087            to here.  The problem was that too many Ethernet cards have
2088            broken/missing hardware multicast filters :-( As a result, a host
2089            on a multicast network acquires a lot of useless route cache
2090            entries, e.g. from SDR messages from all over the world.  Now we
2091            try to get rid of them.  Provided the software IP multicast
2092            filter is organized reasonably (at least, hashed), this does not
2093            cause a slowdown compared with route cache reject entries.
2094            Note that multicast routers are not affected, because a route
2095            cache entry is created eventually.
2096          */
2097         if (ipv4_is_multicast(daddr)) {
2098                 struct in_device *in_dev;
2099
2100                 rcu_read_lock();
2101                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2102                         int our = ip_check_mc(in_dev, daddr, saddr,
2103                                 ip_hdr(skb)->protocol);
2104                         if (our
2105 #ifdef CONFIG_IP_MROUTE
2106                             || (!ipv4_is_local_multicast(daddr) &&
2107                                 IN_DEV_MFORWARD(in_dev))
2108 #endif
2109                             ) {
2110                                 rcu_read_unlock();
2111                                 return ip_route_input_mc(skb, daddr, saddr,
2112                                                          tos, dev, our);
2113                         }
2114                 }
2115                 rcu_read_unlock();
2116                 return -EINVAL;
2117         }
2118         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2119 }
2120
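/*
 *	Build an output route cache entry from the FIB result: classify the
 *	destination (local/broadcast/multicast/unicast), set the RTCF_* flags
 *	and input/output handlers accordingly, and fill in the nexthop and
 *	metrics via rt_set_nexthop.
 */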
2121 static inline int __mkroute_output(struct rtable **result,
2122                                    struct fib_result* res,
2123                                    const struct flowi *fl,
2124                                    const struct flowi *oldflp,
2125                                    struct net_device *dev_out,
2126                                    unsigned flags)
2127 {
2128         struct rtable *rth;
2129         struct in_device *in_dev;
2130         u32 tos = RT_FL_TOS(oldflp);
2131         int err = 0;
2132
2133         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2134                 return -EINVAL;
2135
2136         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2137                 res->type = RTN_BROADCAST;
2138         else if (ipv4_is_multicast(fl->fl4_dst))
2139                 res->type = RTN_MULTICAST;
2140         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2141                 return -EINVAL;
2142
2143         if (dev_out->flags & IFF_LOOPBACK)
2144                 flags |= RTCF_LOCAL;
2145
2146         /* get a working reference to the inet device */
2147         in_dev = in_dev_get(dev_out);
2148         if (!in_dev)
2149                 return -EINVAL;
2150
2151         if (res->type == RTN_BROADCAST) {
2152                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2153                 if (res->fi) {
2154                         fib_info_put(res->fi);
2155                         res->fi = NULL;
2156                 }
2157         } else if (res->type == RTN_MULTICAST) {
2158                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2159                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2160                                  oldflp->proto))
2161                         flags &= ~RTCF_LOCAL;
2162                 /* If a multicast route does not exist, use
2163                    the default one, but do not gateway in this case.
2164                    Yes, it is a hack.
2165                  */
2166                 if (res->fi && res->prefixlen < 4) {
2167                         fib_info_put(res->fi);
2168                         res->fi = NULL;
2169                 }
2170         }
2171
2172
2173         rth = dst_alloc(&ipv4_dst_ops);
2174         if (!rth) {
2175                 err = -ENOBUFS;
2176                 goto cleanup;
2177         }
2178
2179         atomic_set(&rth->u.dst.__refcnt, 1);
2180         rth->u.dst.flags= DST_HOST;
2181         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2182                 rth->u.dst.flags |= DST_NOXFRM;
2183         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2184                 rth->u.dst.flags |= DST_NOPOLICY;
2185
2186         rth->fl.fl4_dst = oldflp->fl4_dst;
2187         rth->fl.fl4_tos = tos;
2188         rth->fl.fl4_src = oldflp->fl4_src;
2189         rth->fl.oif     = oldflp->oif;
2190         rth->fl.mark    = oldflp->mark;
2191         rth->rt_dst     = fl->fl4_dst;
2192         rth->rt_src     = fl->fl4_src;
2193         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2194         /* get references to the devices that are to be held by the routing
2195            cache entry */
2196         rth->u.dst.dev  = dev_out;
2197         dev_hold(dev_out);
2198         rth->idev       = in_dev_get(dev_out);
2199         rth->rt_gateway = fl->fl4_dst;
2200         rth->rt_spec_dst= fl->fl4_src;
2201
2202         rth->u.dst.output=ip_output;
2203
2204         RT_CACHE_STAT_INC(out_slow_tot);
2205
2206         if (flags & RTCF_LOCAL) {
2207                 rth->u.dst.input = ip_local_deliver;
2208                 rth->rt_spec_dst = fl->fl4_dst;
2209         }
2210         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2211                 rth->rt_spec_dst = fl->fl4_src;
2212                 if (flags & RTCF_LOCAL &&
2213                     !(dev_out->flags & IFF_LOOPBACK)) {
2214                         rth->u.dst.output = ip_mc_output;
2215                         RT_CACHE_STAT_INC(out_slow_mc);
2216                 }
2217 #ifdef CONFIG_IP_MROUTE
2218                 if (res->type == RTN_MULTICAST) {
2219                         if (IN_DEV_MFORWARD(in_dev) &&
2220                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2221                                 rth->u.dst.input = ip_mr_input;
2222                                 rth->u.dst.output = ip_mc_output;
2223                         }
2224                 }
2225 #endif
2226         }
2227
2228         rt_set_nexthop(rth, res, 0);
2229
2230         rth->rt_flags = flags;
2231
2232         *result = rth;
2233  cleanup:
2234         /* release work reference to inet device */
2235         in_dev_put(in_dev);
2236
2237         return err;
2238 }
2239
2240 static inline int ip_mkroute_output(struct rtable **rp,
2241                                     struct fib_result* res,
2242                                     const struct flowi *fl,
2243                                     const struct flowi *oldflp,
2244                                     struct net_device *dev_out,
2245                                     unsigned flags)
2246 {
2247         struct rtable *rth = NULL;
2248         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2249         unsigned hash;
2250         if (err == 0) {
2251                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2252                 err = rt_intern_hash(hash, rth, rp);
2253         }
2254
2255         return err;
2256 }
2257
2258 /*
2259  * Major route resolver routine.
2260  */
2261
2262 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2263                                 const struct flowi *oldflp)
2264 {
2265         u32 tos = RT_FL_TOS(oldflp);
2266         struct flowi fl = { .nl_u = { .ip4_u =
2267                                       { .daddr = oldflp->fl4_dst,
2268                                         .saddr = oldflp->fl4_src,
2269                                         .tos = tos & IPTOS_RT_MASK,
2270                                         .scope = ((tos & RTO_ONLINK) ?
2271                                                   RT_SCOPE_LINK :
2272                                                   RT_SCOPE_UNIVERSE),
2273                                       } },
2274                             .mark = oldflp->mark,
2275                             .iif = net->loopback_dev->ifindex,
2276                             .oif = oldflp->oif };
2277         struct fib_result res;
2278         unsigned flags = 0;
2279         struct net_device *dev_out = NULL;
2280         int free_res = 0;
2281         int err;
2282
2283
2284         res.fi          = NULL;
2285 #ifdef CONFIG_IP_MULTIPLE_TABLES
2286         res.r           = NULL;
2287 #endif
2288
2289         if (oldflp->fl4_src) {
2290                 err = -EINVAL;
2291                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2292                     ipv4_is_lbcast(oldflp->fl4_src) ||
2293                     ipv4_is_zeronet(oldflp->fl4_src))
2294                         goto out;
2295
2296                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2297                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2298                 if (dev_out == NULL)
2299                         goto out;
2300
2301                 /* I removed check for oif == dev_out->oif here.
2302                    It was wrong for two reasons:
2303                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2304                       is assigned to multiple interfaces.
2305                    2. Moreover, we are allowed to send packets with saddr
2306                       of another iface. --ANK
2307                  */
2308
2309                 if (oldflp->oif == 0
2310                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2311                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2312                         /* Special hack: the user can direct multicasts
2313                            and limited broadcast via the necessary interface
2314                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2315                            This hack is not just for fun, it allows
2316                            vic, vat and friends to work.
2317                            They bind the socket to loopback, set the ttl to zero
2318                            and expect that it will work.
2319                            From the viewpoint of the routing cache they are broken,
2320                            because we are not allowed to build a multicast path
2321                            with a loopback source addr (the routing cache
2322                            cannot know that the ttl is zero, so the packet
2323                            will not leave this host and the route is valid).
2324                            Luckily, this hack is a good workaround.
2325                          */
2326
2327                         fl.oif = dev_out->ifindex;
2328                         goto make_route;
2329                 }
2330                 if (dev_out)
2331                         dev_put(dev_out);
2332                 dev_out = NULL;
2333         }
2334
2335
2336         if (oldflp->oif) {
2337                 dev_out = dev_get_by_index(net, oldflp->oif);
2338                 err = -ENODEV;
2339                 if (dev_out == NULL)
2340                         goto out;
2341
2342                 /* RACE: Check return value of inet_select_addr instead. */
2343                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2344                         dev_put(dev_out);
2345                         goto out;       /* Wrong error code */
2346                 }
2347
2348                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2349                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2350                         if (!fl.fl4_src)
2351                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2352                                                               RT_SCOPE_LINK);
2353                         goto make_route;
2354                 }
2355                 if (!fl.fl4_src) {
2356                         if (ipv4_is_multicast(oldflp->fl4_dst))
2357                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2358                                                               fl.fl4_scope);
2359                         else if (!oldflp->fl4_dst)
2360                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2361                                                               RT_SCOPE_HOST);
2362                 }
2363         }
2364
2365         if (!fl.fl4_dst) {
2366                 fl.fl4_dst = fl.fl4_src;
2367                 if (!fl.fl4_dst)
2368                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2369                 if (dev_out)
2370                         dev_put(dev_out);
2371                 dev_out = net->loopback_dev;
2372                 dev_hold(dev_out);
2373                 fl.oif = net->loopback_dev->ifindex;
2374                 res.type = RTN_LOCAL;
2375                 flags |= RTCF_LOCAL;
2376                 goto make_route;
2377         }
2378
2379         if (fib_lookup(net, &fl, &res)) {
2380                 res.fi = NULL;
2381                 if (oldflp->oif) {
2382                         /* Apparently, the routing tables are wrong.  Assume
2383                            that the destination is on-link.
2384 
2385                            WHY? DW.
2386                            Because we are allowed to send to an iface
2387                            even if it has NO routes and NO assigned
2388                            addresses.  When oif is specified, the routing
2389                            tables are looked up with only one purpose:
2390                            to catch whether the destination is gatewayed rather
2391                            than direct.  Moreover, if MSG_DONTROUTE is set,
2392                            we send the packet, ignoring both the routing tables
2393                            and the ifaddr state. --ANK
2394 
2395 
2396                            We could do this even if oif is unknown,
2397                            as IPv6 likely does, but we do not.
2398                          */
2399
2400                         if (fl.fl4_src == 0)
2401                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2402                                                               RT_SCOPE_LINK);
2403                         res.type = RTN_UNICAST;
2404                         goto make_route;
2405                 }
2406                 if (dev_out)
2407                         dev_put(dev_out);
2408                 err = -ENETUNREACH;
2409                 goto out;
2410         }
2411         free_res = 1;
2412
2413         if (res.type == RTN_LOCAL) {
2414                 if (!fl.fl4_src)
2415                         fl.fl4_src = fl.fl4_dst;
2416                 if (dev_out)
2417                         dev_put(dev_out);
2418                 dev_out = net->loopback_dev;
2419                 dev_hold(dev_out);
2420                 fl.oif = dev_out->ifindex;
2421                 if (res.fi)
2422                         fib_info_put(res.fi);
2423                 res.fi = NULL;
2424                 flags |= RTCF_LOCAL;
2425                 goto make_route;
2426         }
2427
2428 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2429         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2430                 fib_select_multipath(&fl, &res);
2431         else
2432 #endif
2433         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2434                 fib_select_default(net, &fl, &res);
2435
2436         if (!fl.fl4_src)
2437                 fl.fl4_src = FIB_RES_PREFSRC(res);
2438
2439         if (dev_out)
2440                 dev_put(dev_out);
2441         dev_out = FIB_RES_DEV(res);
2442         dev_hold(dev_out);
2443         fl.oif = dev_out->ifindex;
2444
2445
2446 make_route:
2447         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2448
2449
2450         if (free_res)
2451                 fib_res_put(&res);
2452         if (dev_out)
2453                 dev_put(dev_out);
2454 out:    return err;
2455 }
2456
2457 int __ip_route_output_key(struct net *net, struct rtable **rp,
2458                           const struct flowi *flp)
2459 {
2460         unsigned hash;
2461         struct rtable *rth;
2462
2463         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2464
2465         rcu_read_lock_bh();
2466         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2467                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2468                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2469                     rth->fl.fl4_src == flp->fl4_src &&
2470                     rth->fl.iif == 0 &&
2471                     rth->fl.oif == flp->oif &&
2472                     rth->fl.mark == flp->mark &&
2473                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2474                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2475                     rth->u.dst.dev->nd_net == net) {
2476                         dst_use(&rth->u.dst, jiffies);
2477                         RT_CACHE_STAT_INC(out_hit);
2478                         rcu_read_unlock_bh();
2479                         *rp = rth;
2480                         return 0;
2481                 }
2482                 RT_CACHE_STAT_INC(out_hlist_search);
2483         }
2484         rcu_read_unlock_bh();
2485
2486         return ip_route_output_slow(net, rp, flp);
2487 }
2488
2489 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2490
2491 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2492 {
2493 }
2494
2495 static struct dst_ops ipv4_dst_blackhole_ops = {
2496         .family                 =       AF_INET,
2497         .protocol               =       __constant_htons(ETH_P_IP),
2498         .destroy                =       ipv4_dst_destroy,
2499         .check                  =       ipv4_dst_check,
2500         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2501         .entry_size             =       sizeof(struct rtable),
2502         .entries                =       ATOMIC_INIT(0),
2503 };
2504
2505
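/*
 *	Replace *rp with a "blackhole" copy of itself: a route allocated from
 *	ipv4_dst_blackhole_ops whose input and output handlers simply discard
 *	packets.  Used by ip_route_output_flow when __xfrm_lookup returns
 *	-EREMOTE.
 */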
2506 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2507 {
2508         struct rtable *ort = *rp;
2509         struct rtable *rt = (struct rtable *)
2510                 dst_alloc(&ipv4_dst_blackhole_ops);
2511
2512         if (rt) {
2513                 struct dst_entry *new = &rt->u.dst;
2514
2515                 atomic_set(&new->__refcnt, 1);
2516                 new->__use = 1;
2517                 new->input = dst_discard;
2518                 new->output = dst_discard;
2519                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2520
2521                 new->dev = ort->u.dst.dev;
2522                 if (new->dev)
2523                         dev_hold(new->dev);
2524
2525                 rt->fl = ort->fl;
2526
2527                 rt->idev = ort->idev;
2528                 if (rt->idev)
2529                         in_dev_hold(rt->idev);
2530                 rt->rt_flags = ort->rt_flags;
2531                 rt->rt_type = ort->rt_type;
2532                 rt->rt_dst = ort->rt_dst;
2533                 rt->rt_src = ort->rt_src;
2534                 rt->rt_iif = ort->rt_iif;
2535                 rt->rt_gateway = ort->rt_gateway;
2536                 rt->rt_spec_dst = ort->rt_spec_dst;
2537                 rt->peer = ort->peer;
2538                 if (rt->peer)
2539                         atomic_inc(&rt->peer->refcnt);
2540
2541                 dst_free(new);
2542         }
2543
2544         dst_release(&(*rp)->u.dst);
2545         *rp = rt;
2546         return (rt ? 0 : -ENOMEM);
2547 }
2548
2549 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2550                          struct sock *sk, int flags)
2551 {
2552         int err;
2553
2554         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2555                 return err;
2556
2557         if (flp->proto) {
2558                 if (!flp->fl4_src)
2559                         flp->fl4_src = (*rp)->rt_src;
2560                 if (!flp->fl4_dst)
2561                         flp->fl4_dst = (*rp)->rt_dst;
2562                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2563                                     flags ? XFRM_LOOKUP_WAIT : 0);
2564                 if (err == -EREMOTE)
2565                         err = ipv4_dst_blackhole(rp, flp, sk);
2566
2567                 return err;
2568         }
2569
2570         return 0;
2571 }
2572
2573 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2574
2575 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2576 {
2577         return ip_route_output_flow(net, rp, flp, NULL, 0);
2578 }
2579
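/*
 *	Fill a netlink RTM_NEWROUTE message describing the route cache entry
 *	attached to skb->dst: addresses, oif, gateway, metrics, peer info and
 *	cache statistics.
 */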
2580 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2581                         int nowait, unsigned int flags)
2582 {
2583         struct rtable *rt = (struct rtable*)skb->dst;
2584         struct rtmsg *r;
2585         struct nlmsghdr *nlh;
2586         long expires;
2587         u32 id = 0, ts = 0, tsage = 0, error;
2588
2589         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2590         if (nlh == NULL)
2591                 return -EMSGSIZE;
2592
2593         r = nlmsg_data(nlh);
2594         r->rtm_family    = AF_INET;
2595         r->rtm_dst_len  = 32;
2596         r->rtm_src_len  = 0;
2597         r->rtm_tos      = rt->fl.fl4_tos;
2598         r->rtm_table    = RT_TABLE_MAIN;
2599         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2600         r->rtm_type     = rt->rt_type;
2601         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2602         r->rtm_protocol = RTPROT_UNSPEC;
2603         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2604         if (rt->rt_flags & RTCF_NOTIFY)
2605                 r->rtm_flags |= RTM_F_NOTIFY;
2606
2607         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2608
2609         if (rt->fl.fl4_src) {
2610                 r->rtm_src_len = 32;
2611                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2612         }
2613         if (rt->u.dst.dev)
2614                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2615 #ifdef CONFIG_NET_CLS_ROUTE
2616         if (rt->u.dst.tclassid)
2617                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2618 #endif
2619         if (rt->fl.iif)
2620                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2621         else if (rt->rt_src != rt->fl.fl4_src)
2622                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2623
2624         if (rt->rt_dst != rt->rt_gateway)
2625                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2626
2627         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2628                 goto nla_put_failure;
2629
2630         error = rt->u.dst.error;
2631         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2632         if (rt->peer) {
2633                 id = rt->peer->ip_id_count;
2634                 if (rt->peer->tcp_ts_stamp) {
2635                         ts = rt->peer->tcp_ts;
2636                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2637                 }
2638         }
2639
2640         if (rt->fl.iif) {
2641 #ifdef CONFIG_IP_MROUTE
2642                 __be32 dst = rt->rt_dst;
2643
2644                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2645                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2646                         int err = ipmr_get_route(skb, r, nowait);
2647                         if (err <= 0) {
2648                                 if (!nowait) {
2649                                         if (err == 0)
2650                                                 return 0;
2651                                         goto nla_put_failure;
2652                                 } else {
2653                                         if (err == -EMSGSIZE)
2654                                                 goto nla_put_failure;
2655                                         error = err;
2656                                 }
2657                         }
2658                 } else
2659 #endif
2660                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2661         }
2662
2663         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2664                                expires, error) < 0)
2665                 goto nla_put_failure;
2666
2667         return nlmsg_end(skb, nlh);
2668
2669 nla_put_failure:
2670         nlmsg_cancel(skb, nlh);
2671         return -EMSGSIZE;
2672 }
2673
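/*
 *	RTM_GETROUTE handler: build a dummy skb, resolve the requested route
 *	via ip_route_input() (when RTA_IIF is given) or ip_route_output_key(),
 *	and unicast the resulting RTM_NEWROUTE back to the requester.
 */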
2674 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2675 {
2676         struct net *net = in_skb->sk->sk_net;
2677         struct rtmsg *rtm;
2678         struct nlattr *tb[RTA_MAX+1];
2679         struct rtable *rt = NULL;
2680         __be32 dst = 0;
2681         __be32 src = 0;
2682         u32 iif;
2683         int err;
2684         struct sk_buff *skb;
2685
2686         if (net != &init_net)
2687                 return -EINVAL;
2688
2689         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2690         if (err < 0)
2691                 goto errout;
2692
2693         rtm = nlmsg_data(nlh);
2694
2695         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2696         if (skb == NULL) {
2697                 err = -ENOBUFS;
2698                 goto errout;
2699         }
2700
2701         /* Reserve room for dummy headers; this skb can pass
2702            through a good chunk of the routing engine.
2703          */
2704         skb_reset_mac_header(skb);
2705         skb_reset_network_header(skb);
2706
2707         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2708         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2709         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2710
2711         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2712         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2713         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2714
2715         if (iif) {
2716                 struct net_device *dev;
2717
2718                 dev = __dev_get_by_index(&init_net, iif);
2719                 if (dev == NULL) {
2720                         err = -ENODEV;
2721                         goto errout_free;
2722                 }
2723
2724                 skb->protocol   = htons(ETH_P_IP);
2725                 skb->dev        = dev;
2726                 local_bh_disable();
2727                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2728                 local_bh_enable();
2729
2730                 rt = (struct rtable*) skb->dst;
2731                 if (err == 0 && rt->u.dst.error)
2732                         err = -rt->u.dst.error;
2733         } else {
2734                 struct flowi fl = {
2735                         .nl_u = {
2736                                 .ip4_u = {
2737                                         .daddr = dst,
2738                                         .saddr = src,
2739                                         .tos = rtm->rtm_tos,
2740                                 },
2741                         },
2742                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2743                 };
2744                 err = ip_route_output_key(&init_net, &rt, &fl);
2745         }
2746
2747         if (err)
2748                 goto errout_free;
2749
2750         skb->dst = &rt->u.dst;
2751         if (rtm->rtm_flags & RTM_F_NOTIFY)
2752                 rt->rt_flags |= RTCF_NOTIFY;
2753
2754         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2755                                 RTM_NEWROUTE, 0, 0);
2756         if (err <= 0)
2757                 goto errout_free;
2758
2759         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2760 errout:
2761         return err;
2762
2763 errout_free:
2764         kfree_skb(skb);
2765         goto errout;
2766 }
2767
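/*
 *	Netlink dump callback: walk the route cache hash table and emit one
 *	RTM_NEWROUTE message per entry, resuming from the position saved in
 *	cb->args[] across calls.
 */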
2768 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2769 {
2770         struct rtable *rt;
2771         int h, s_h;
2772         int idx, s_idx;
2773
2774         s_h = cb->args[0];
2775         if (s_h < 0)
2776                 s_h = 0;
2777         s_idx = idx = cb->args[1];
2778         for (h = s_h; h <= rt_hash_mask; h++) {
2779                 rcu_read_lock_bh();
2780                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2781                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2782                         if (idx < s_idx)
2783                                 continue;
2784                         skb->dst = dst_clone(&rt->u.dst);
2785                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2786                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2787                                          1, NLM_F_MULTI) <= 0) {
2788                                 dst_release(xchg(&skb->dst, NULL));
2789                                 rcu_read_unlock_bh();
2790                                 goto done;
2791                         }
2792                         dst_release(xchg(&skb->dst, NULL));
2793                 }
2794                 rcu_read_unlock_bh();
2795                 s_idx = 0;
2796         }
2797
2798 done:
2799         cb->args[0] = h;
2800         cb->args[1] = idx;
2801         return skb->len;
2802 }
2803
2804 void ip_rt_multicast_event(struct in_device *in_dev)
2805 {
2806         rt_cache_flush(0);
2807 }
2808
2809 #ifdef CONFIG_SYSCTL
2810 static int flush_delay;
2811
2812 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2813                                         struct file *filp, void __user *buffer,
2814                                         size_t *lenp, loff_t *ppos)
2815 {
2816         if (write) {
2817                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2818                 rt_cache_flush(flush_delay);
2819                 return 0;
2820         }
2821
2822         return -EINVAL;
2823 }
2824
2825 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2826                                                 int __user *name,
2827                                                 int nlen,
2828                                                 void __user *oldval,
2829                                                 size_t __user *oldlenp,
2830                                                 void __user *newval,
2831                                                 size_t newlen)
2832 {
2833         int delay;
2834         if (newlen != sizeof(int))
2835                 return -EINVAL;
2836         if (get_user(delay, (int __user *)newval))
2837                 return -EFAULT;
2838         rt_cache_flush(delay);
2839         return 0;
2840 }
2841
ctl_table ipv4_route_table[] = {
        {
                .ctl_name       = NET_IPV4_ROUTE_FLUSH,
                .procname       = "flush",
                .data           = &flush_delay,
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = &ipv4_sysctl_rtcache_flush,
                .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
                .procname       = "min_delay",
                .data           = &ip_rt_min_delay,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
                .procname       = "max_delay",
                .data           = &ip_rt_max_delay,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                /* Deprecated. Use gc_min_interval_ms */
                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_ms_jiffies,
                .strategy       = &sysctl_ms_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
        {
                .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
                .procname       = "secret_interval",
                .data           = &ip_rt_secret_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = &proc_dointvec_jiffies,
                .strategy       = &sysctl_jiffies,
        },
        { .ctl_name = 0 }
};
#endif

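/* Per-CPU accounting area for the route classifier; allocated in ip_rt_init(). */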
#ifdef CONFIG_NET_CLS_ROUTE
struct ip_rt_acct *ip_rt_acct __read_mostly;
#endif /* CONFIG_NET_CLS_ROUTE */

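/*
 * "rhash_entries=" boot parameter: lets the administrator override the
 * automatically computed size of the routing cache hash table.
 */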
static __initdata unsigned long rhash_entries;
static int __init set_rhash_entries(char *str)
{
        if (!str)
                return 0;
        rhash_entries = simple_strtoul(str, &str, 0);
        return 1;
}
__setup("rhash_entries=", set_rhash_entries);

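/*
 * Initialize the IPv4 routing cache: seed the hash secret, create the dst
 * slab cache, allocate and size the hash table, bring up devinet and the
 * FIB, arm the flush and secret-rebuild timers, and register the /proc
 * files and the RTM_GETROUTE netlink handler.
 */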
int __init ip_rt_init(void)
{
        int rc = 0;

        rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
                             (jiffies ^ (jiffies >> 7)));

#ifdef CONFIG_NET_CLS_ROUTE
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        rt_hash_table = (struct rt_hash_bucket *)
                alloc_large_system_hash("IP route cache",
                                        sizeof(struct rt_hash_bucket),
                                        rhash_entries,
                                        (num_physpages >= 128 * 1024) ?
                                        15 : 17,
                                        0,
                                        &rt_hash_log,
                                        &rt_hash_mask,
                                        0);
        memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
        rt_hash_lock_init();

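        /* Scale the GC threshold and the hard cache limit to the table size. */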
        ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
        ip_rt_max_size = (rt_hash_mask + 1) * 16;

        devinet_init();
        ip_fib_init();

        setup_timer(&rt_flush_timer, rt_run_flush, 0);
        setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);

        /*
         * Timers that are all started at system boot tend to run in
         * lock-step; perturb the initial expiry times a bit.
         */
        schedule_delayed_work(&expires_work,
                net_random() % ip_rt_gc_interval + ip_rt_gc_interval);

        rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
                ip_rt_secret_interval;
        add_timer(&rt_secret_timer);

        if (ip_rt_proc_init(&init_net))
                printk(KERN_ERR "Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);

        return rc;
}

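/* Entry points used elsewhere in the stack and by loadable modules. */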
EXPORT_SYMBOL(__ip_select_ident);
EXPORT_SYMBOL(ip_route_input);
EXPORT_SYMBOL(ip_route_output_key);