[IPV4] ROUTE: Clean up proc files creation.
[safe/jmp/linux-2.6] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      though our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_min_delay              = 2 * HZ;
121 static int ip_rt_max_delay              = 10 * HZ;
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval            = 60 * HZ;
125 static int ip_rt_gc_min_interval        = HZ / 2;
126 static int ip_rt_redirect_number        = 9;
127 static int ip_rt_redirect_load          = HZ / 50;
128 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost             = HZ;
130 static int ip_rt_error_burst            = 5 * HZ;
131 static int ip_rt_gc_elasticity          = 8;
132 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu               = 512 + 20 + 20;
134 static int ip_rt_min_advmss             = 256;
135 static int ip_rt_secret_interval        = 10 * 60 * HZ;
136 static int ip_rt_flush_expected;
137 static unsigned long rt_deadline;
138
139 #define RTprint(a...)   printk(KERN_DEBUG a)
140
141 static struct timer_list rt_flush_timer;
142 static void rt_worker_func(struct work_struct *work);
143 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
144 static struct timer_list rt_secret_timer;
145
146 /*
147  *      Interface to generic destination cache.
148  */
149
150 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151 static void              ipv4_dst_destroy(struct dst_entry *dst);
152 static void              ipv4_dst_ifdown(struct dst_entry *dst,
153                                          struct net_device *dev, int how);
154 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155 static void              ipv4_link_failure(struct sk_buff *skb);
156 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157 static int rt_garbage_collect(void);
158
159
160 static struct dst_ops ipv4_dst_ops = {
161         .family =               AF_INET,
162         .protocol =             __constant_htons(ETH_P_IP),
163         .gc =                   rt_garbage_collect,
164         .check =                ipv4_dst_check,
165         .destroy =              ipv4_dst_destroy,
166         .ifdown =               ipv4_dst_ifdown,
167         .negative_advice =      ipv4_negative_advice,
168         .link_failure =         ipv4_link_failure,
169         .update_pmtu =          ip_rt_update_pmtu,
170         .local_out =            ip_local_out,
171         .entry_size =           sizeof(struct rtable),
172 };
173
174 #define ECN_OR_COST(class)      TC_PRIO_##class
175
176 const __u8 ip_tos2prio[16] = {
177         TC_PRIO_BESTEFFORT,
178         ECN_OR_COST(FILLER),
179         TC_PRIO_BESTEFFORT,
180         ECN_OR_COST(BESTEFFORT),
181         TC_PRIO_BULK,
182         ECN_OR_COST(BULK),
183         TC_PRIO_BULK,
184         ECN_OR_COST(BULK),
185         TC_PRIO_INTERACTIVE,
186         ECN_OR_COST(INTERACTIVE),
187         TC_PRIO_INTERACTIVE,
188         ECN_OR_COST(INTERACTIVE),
189         TC_PRIO_INTERACTIVE_BULK,
190         ECN_OR_COST(INTERACTIVE_BULK),
191         TC_PRIO_INTERACTIVE_BULK,
192         ECN_OR_COST(INTERACTIVE_BULK)
193 };
194
195
196 /*
197  * Route cache.
198  */
199
200 /* The locking scheme is rather straightforward:
201  *
202  * 1) Read-Copy Update protects the buckets of the central route hash.
203  * 2) Only writers remove entries, and they hold the lock
204  *    as they look at rtable reference counts.
205  * 3) Only readers acquire references to rtable entries,
206  *    they do so with atomic increments and with the
207  *    RCU lock held.
208  */
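/*
 * Illustrative sketch only (not compiled): under this scheme a lockless
 * reader walks a chain holding just the RCU BH read lock and takes a
 * reference with dst_hold(), roughly like
 *
 *	rcu_read_lock_bh();
 *	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *	     rth = rcu_dereference(rth->u.dst.rt_next)) {
 *		if (compare_keys(&rth->fl, &fl)) {	(fl is the lookup key)
 *			dst_hold(&rth->u.dst);
 *			break;
 *		}
 *	}
 *	rcu_read_unlock_bh();
 *
 * while writers take the per-bucket spinlock (rt_hash_lock_addr()) before
 * unlinking entries, as rt_intern_hash(), rt_del() and rt_do_flush() do.
 */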
209
210 struct rt_hash_bucket {
211         struct rtable   *chain;
212 };
213 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
214         defined(CONFIG_PROVE_LOCKING)
215 /*
216  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
217  * The size of this table is a power of two and depends on the number of CPUs.
218  * (with lockdep, spinlock_t is quite big, so keep the table size down there)
219  */
220 #ifdef CONFIG_LOCKDEP
221 # define RT_HASH_LOCK_SZ        256
222 #else
223 # if NR_CPUS >= 32
224 #  define RT_HASH_LOCK_SZ       4096
225 # elif NR_CPUS >= 16
226 #  define RT_HASH_LOCK_SZ       2048
227 # elif NR_CPUS >= 8
228 #  define RT_HASH_LOCK_SZ       1024
229 # elif NR_CPUS >= 4
230 #  define RT_HASH_LOCK_SZ       512
231 # else
232 #  define RT_HASH_LOCK_SZ       256
233 # endif
234 #endif
235
236 static spinlock_t       *rt_hash_locks;
237 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
238 # define rt_hash_lock_init()    { \
239                 int i; \
240                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
241                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
242                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
243                         spin_lock_init(&rt_hash_locks[i]); \
244                 }
245 #else
246 # define rt_hash_lock_addr(slot) NULL
247 # define rt_hash_lock_init()
248 #endif
249
250 static struct rt_hash_bucket    *rt_hash_table;
251 static unsigned                 rt_hash_mask;
252 static unsigned int             rt_hash_log;
253 static unsigned int             rt_hash_rnd;
254
255 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
256 #define RT_CACHE_STAT_INC(field) \
257         (__raw_get_cpu_var(rt_cache_stat).field++)
258
259 static int rt_intern_hash(unsigned hash, struct rtable *rth,
260                                 struct rtable **res);
261
262 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
263 {
264         return (jhash_2words(daddr, saddr, rt_hash_rnd)
265                 & rt_hash_mask);
266 }
267
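/*
 * Hash of a flow: the destination and source addresses are mixed with the
 * random seed rt_hash_rnd (re-randomized on cache flushes) and the relevant
 * interface index is folded into the source word (the incoming ifindex for
 * input routes, oif for output routes, or 0 when not applicable).
 */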
268 #define rt_hash(daddr, saddr, idx) \
269         rt_hash_code((__force u32)(__be32)(daddr),\
270                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
271
272 #ifdef CONFIG_PROC_FS
273 struct rt_cache_iter_state {
274         int bucket;
275 };
276
277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
278 {
279         struct rtable *r = NULL;
280         struct rt_cache_iter_state *st = seq->private;
281
282         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
283                 rcu_read_lock_bh();
284                 r = rt_hash_table[st->bucket].chain;
285                 if (r)
286                         break;
287                 rcu_read_unlock_bh();
288         }
289         return rcu_dereference(r);
290 }
291
292 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
293 {
294         struct rt_cache_iter_state *st = seq->private;
295
296         r = r->u.dst.rt_next;
297         while (!r) {
298                 rcu_read_unlock_bh();
299                 if (--st->bucket < 0)
300                         break;
301                 rcu_read_lock_bh();
302                 r = rt_hash_table[st->bucket].chain;
303         }
304         return rcu_dereference(r);
305 }
306
307 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
308 {
309         struct rtable *r = rt_cache_get_first(seq);
310
311         if (r)
312                 while (pos && (r = rt_cache_get_next(seq, r)))
313                         --pos;
314         return pos ? NULL : r;
315 }
316
317 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
318 {
319         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
320 }
321
322 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
323 {
324         struct rtable *r = NULL;
325
326         if (v == SEQ_START_TOKEN)
327                 r = rt_cache_get_first(seq);
328         else
329                 r = rt_cache_get_next(seq, v);
330         ++*pos;
331         return r;
332 }
333
334 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
335 {
336         if (v && v != SEQ_START_TOKEN)
337                 rcu_read_unlock_bh();
338 }
339
340 static int rt_cache_seq_show(struct seq_file *seq, void *v)
341 {
342         if (v == SEQ_START_TOKEN)
343                 seq_printf(seq, "%-127s\n",
344                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
345                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
346                            "HHUptod\tSpecDst");
347         else {
348                 struct rtable *r = v;
349                 char temp[256];
350
351                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
352                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
353                         r->u.dst.dev ? r->u.dst.dev->name : "*",
354                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
355                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
356                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
357                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
358                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
359                         dst_metric(&r->u.dst, RTAX_WINDOW),
360                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
361                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
362                         r->fl.fl4_tos,
363                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
364                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
365                                        dev_queue_xmit) : 0,
366                         r->rt_spec_dst);
367                 seq_printf(seq, "%-127s\n", temp);
368         }
369         return 0;
370 }
371
372 static const struct seq_operations rt_cache_seq_ops = {
373         .start  = rt_cache_seq_start,
374         .next   = rt_cache_seq_next,
375         .stop   = rt_cache_seq_stop,
376         .show   = rt_cache_seq_show,
377 };
378
379 static int rt_cache_seq_open(struct inode *inode, struct file *file)
380 {
381         return seq_open_private(file, &rt_cache_seq_ops,
382                         sizeof(struct rt_cache_iter_state));
383 }
384
385 static const struct file_operations rt_cache_seq_fops = {
386         .owner   = THIS_MODULE,
387         .open    = rt_cache_seq_open,
388         .read    = seq_read,
389         .llseek  = seq_lseek,
390         .release = seq_release_private,
391 };
392
393
394 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
395 {
396         int cpu;
397
398         if (*pos == 0)
399                 return SEQ_START_TOKEN;
400
401         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
402                 if (!cpu_possible(cpu))
403                         continue;
404                 *pos = cpu+1;
405                 return &per_cpu(rt_cache_stat, cpu);
406         }
407         return NULL;
408 }
409
410 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
411 {
412         int cpu;
413
414         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
415                 if (!cpu_possible(cpu))
416                         continue;
417                 *pos = cpu+1;
418                 return &per_cpu(rt_cache_stat, cpu);
419         }
420         return NULL;
421
422 }
423
424 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
425 {
426
427 }
428
429 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
430 {
431         struct rt_cache_stat *st = v;
432
433         if (v == SEQ_START_TOKEN) {
434                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
435                 return 0;
436         }
437
438         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
439                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
440                    atomic_read(&ipv4_dst_ops.entries),
441                    st->in_hit,
442                    st->in_slow_tot,
443                    st->in_slow_mc,
444                    st->in_no_route,
445                    st->in_brd,
446                    st->in_martian_dst,
447                    st->in_martian_src,
448
449                    st->out_hit,
450                    st->out_slow_tot,
451                    st->out_slow_mc,
452
453                    st->gc_total,
454                    st->gc_ignored,
455                    st->gc_goal_miss,
456                    st->gc_dst_overflow,
457                    st->in_hlist_search,
458                    st->out_hlist_search
459                 );
460         return 0;
461 }
462
463 static const struct seq_operations rt_cpu_seq_ops = {
464         .start  = rt_cpu_seq_start,
465         .next   = rt_cpu_seq_next,
466         .stop   = rt_cpu_seq_stop,
467         .show   = rt_cpu_seq_show,
468 };
469
470
471 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
472 {
473         return seq_open(file, &rt_cpu_seq_ops);
474 }
475
476 static const struct file_operations rt_cpu_seq_fops = {
477         .owner   = THIS_MODULE,
478         .open    = rt_cpu_seq_open,
479         .read    = seq_read,
480         .llseek  = seq_lseek,
481         .release = seq_release,
482 };
483
484 #ifdef CONFIG_NET_CLS_ROUTE
485 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
486                            int length, int *eof, void *data)
487 {
488         unsigned int i;
489
490         if ((offset & 3) || (length & 3))
491                 return -EIO;
492
493         if (offset >= sizeof(struct ip_rt_acct) * 256) {
494                 *eof = 1;
495                 return 0;
496         }
497
498         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
499                 length = sizeof(struct ip_rt_acct) * 256 - offset;
500                 *eof = 1;
501         }
502
503         offset /= sizeof(u32);
504
505         if (length > 0) {
506                 u32 *dst = (u32 *) buffer;
507
508                 *start = buffer;
509                 memset(dst, 0, length);
510
511                 for_each_possible_cpu(i) {
512                         unsigned int j;
513                         u32 *src;
514
515                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
516                         for (j = 0; j < length/4; j++)
517                                 dst[j] += src[j];
518                 }
519         }
520         return length;
521 }
522 #endif
523
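/*
 * Create the routing cache proc files: /proc/net/rt_cache (the cache dump,
 * rt_cache_seq_fops), /proc/net/stat/rt_cache (per-cpu statistics,
 * rt_cpu_seq_fops) and, when CONFIG_NET_CLS_ROUTE is set, /proc/net/rt_acct.
 * On failure, the entries created so far are removed again.
 */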
524 static __init int ip_rt_proc_init(struct net *net)
525 {
526         struct proc_dir_entry *pde;
527
528         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
529                         &rt_cache_seq_fops);
530         if (!pde)
531                 goto err1;
532
533         pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
534         if (!pde)
535                 goto err2;
536
537         pde->proc_fops = &rt_cpu_seq_fops;
538
539 #ifdef CONFIG_NET_CLS_ROUTE
540         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
541                         ip_rt_acct_read, NULL);
542         if (!pde)
543                 goto err3;
544 #endif
545         return 0;
546
547 #ifdef CONFIG_NET_CLS_ROUTE
548 err3:
549         remove_proc_entry("rt_cache", net->proc_net_stat);
550 #endif
551 err2:
552         remove_proc_entry("rt_cache", net->proc_net);
553 err1:
554         return -ENOMEM;
555 }
556 #else
557 static inline int ip_rt_proc_init(struct net *net)
558 {
559         return 0;
560 }
561 #endif /* CONFIG_PROC_FS */
562
563 static __inline__ void rt_free(struct rtable *rt)
564 {
565         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
566 }
567
568 static __inline__ void rt_drop(struct rtable *rt)
569 {
570         ip_rt_put(rt);
571         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
572 }
573
574 static __inline__ int rt_fast_clean(struct rtable *rth)
575 {
576         /* Kill broadcast/multicast entries very aggressively, if they
577            collide in the hash table with more useful entries */
578         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
579                 rth->fl.iif && rth->u.dst.rt_next;
580 }
581
582 static __inline__ int rt_valuable(struct rtable *rth)
583 {
584         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
585                 rth->u.dst.expires;
586 }
587
588 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
589 {
590         unsigned long age;
591         int ret = 0;
592
593         if (atomic_read(&rth->u.dst.__refcnt))
594                 goto out;
595
596         ret = 1;
597         if (rth->u.dst.expires &&
598             time_after_eq(jiffies, rth->u.dst.expires))
599                 goto out;
600
601         age = jiffies - rth->u.dst.lastuse;
602         ret = 0;
603         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
604             (age <= tmo2 && rt_valuable(rth)))
605                 goto out;
606         ret = 1;
607 out:    return ret;
608 }
609
610 /* Bits of score are:
611  * 31: very valuable
612  * 30: not quite useless
613  * 29..0: usage counter
614  */
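/*
 * So within each class the score shrinks with age (the low 30 bits are the
 * complement of jiffies - lastuse), valuable entries get bit 31, and output
 * routes or input routes that are not broadcast/multicast/local get bit 30.
 * rt_intern_hash() evicts the unreferenced entry with the lowest score when
 * a chain grows beyond ip_rt_gc_elasticity.
 */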
615 static inline u32 rt_score(struct rtable *rt)
616 {
617         u32 score = jiffies - rt->u.dst.lastuse;
618
619         score = ~score & ~(3<<30);
620
621         if (rt_valuable(rt))
622                 score |= (1<<31);
623
624         if (!rt->fl.iif ||
625             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
626                 score |= (1<<30);
627
628         return score;
629 }
630
631 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
632 {
633         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
634                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
635                 (fl1->mark ^ fl2->mark) |
636                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
637                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
638                 (fl1->oif ^ fl2->oif) |
639                 (fl1->iif ^ fl2->iif)) == 0;
640 }
641
642 /*
643  * Perform a full scan of the hash table and free all entries.
644  * Can be called from softirq or process context.
645  * In the latter case, we reschedule if necessary.
646  */
647 static void rt_do_flush(int process_context)
648 {
649         unsigned int i;
650         struct rtable *rth, *next;
651
652         for (i = 0; i <= rt_hash_mask; i++) {
653                 if (process_context && need_resched())
654                         cond_resched();
655                 rth = rt_hash_table[i].chain;
656                 if (!rth)
657                         continue;
658
659                 spin_lock_bh(rt_hash_lock_addr(i));
660                 rth = rt_hash_table[i].chain;
661                 rt_hash_table[i].chain = NULL;
662                 spin_unlock_bh(rt_hash_lock_addr(i));
663
664                 for (; rth; rth = next) {
665                         next = rth->u.dst.rt_next;
666                         rt_free(rth);
667                 }
668         }
669 }
670
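/*
 * rt_check_expire() scans a slice of the hash table per invocation:
 * roughly (ip_rt_gc_interval << rt_hash_log) / ip_rt_gc_timeout buckets,
 * resuming where the previous run left off (the static "rover").  Within a
 * chain the allowed idle time "tmo" is halved for every entry that is kept,
 * so long chains are pruned more aggressively.
 */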
671 static void rt_check_expire(void)
672 {
673         static unsigned int rover;
674         unsigned int i = rover, goal;
675         struct rtable *rth, **rthp;
676         u64 mult;
677
678         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
679         if (ip_rt_gc_timeout > 1)
680                 do_div(mult, ip_rt_gc_timeout);
681         goal = (unsigned int)mult;
682         if (goal > rt_hash_mask)
683                 goal = rt_hash_mask + 1;
684         for (; goal > 0; goal--) {
685                 unsigned long tmo = ip_rt_gc_timeout;
686
687                 i = (i + 1) & rt_hash_mask;
688                 rthp = &rt_hash_table[i].chain;
689
690                 if (need_resched())
691                         cond_resched();
692
693                 if (*rthp == NULL)
694                         continue;
695                 spin_lock_bh(rt_hash_lock_addr(i));
696                 while ((rth = *rthp) != NULL) {
697                         if (rth->u.dst.expires) {
698                                 /* Entry is expired even if it is in use */
699                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
700                                         tmo >>= 1;
701                                         rthp = &rth->u.dst.rt_next;
702                                         continue;
703                                 }
704                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
705                                 tmo >>= 1;
706                                 rthp = &rth->u.dst.rt_next;
707                                 continue;
708                         }
709
710                         /* Cleanup aged off entries. */
711                         *rthp = rth->u.dst.rt_next;
712                         rt_free(rth);
713                 }
714                 spin_unlock_bh(rt_hash_lock_addr(i));
715         }
716         rover = i;
717 }
718
719 /*
720  * rt_worker_func() is run in process context.
721  * If a full flush was scheduled, it is performed.
722  * Otherwise, we call rt_check_expire() to scan part of the hash table.
723  */
724 static void rt_worker_func(struct work_struct *work)
725 {
726         if (ip_rt_flush_expected) {
727                 ip_rt_flush_expected = 0;
728                 rt_do_flush(1);
729         } else
730                 rt_check_expire();
731         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
732 }
733
734 /* This can run from both BH and non-BH contexts, the latter
735  * in the case of a forced flush event.
736  */
737 static void rt_run_flush(unsigned long process_context)
738 {
739         rt_deadline = 0;
740
741         get_random_bytes(&rt_hash_rnd, 4);
742
743         rt_do_flush(process_context);
744 }
745
746 static DEFINE_SPINLOCK(rt_flush_lock);
747
748 void rt_cache_flush(int delay)
749 {
750         unsigned long now = jiffies;
751         int user_mode = !in_softirq();
752
753         if (delay < 0)
754                 delay = ip_rt_min_delay;
755
756         spin_lock_bh(&rt_flush_lock);
757
758         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
759                 long tmo = (long)(rt_deadline - now);
760
761                 /* If the flush timer is already running
762                    and the flush request is not immediate (delay > 0):
763
764                    if the deadline has not been reached, extend the timer to "delay",
765                    otherwise fire it at the deadline.
766                  */
767
768                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
769                         tmo = 0;
770
771                 if (delay > tmo)
772                         delay = tmo;
773         }
774
775         if (delay <= 0) {
776                 spin_unlock_bh(&rt_flush_lock);
777                 rt_run_flush(user_mode);
778                 return;
779         }
780
781         if (rt_deadline == 0)
782                 rt_deadline = now + ip_rt_max_delay;
783
784         mod_timer(&rt_flush_timer, now+delay);
785         spin_unlock_bh(&rt_flush_lock);
786 }
787
788 /*
789  * We change rt_hash_rnd and ask next rt_worker_func() invocation
790  * to perform a flush in process context
791  */
792 static void rt_secret_rebuild(unsigned long dummy)
793 {
794         get_random_bytes(&rt_hash_rnd, 4);
795         ip_rt_flush_expected = 1;
796         cancel_delayed_work(&expires_work);
797         schedule_delayed_work(&expires_work, HZ/10);
798         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
799 }
800
801 /*
802    Short description of GC goals.
803
804    We want to build an algorithm which keeps the routing cache at an
805    equilibrium point, where the number of aged-off entries stays
806    approximately equal to the number of newly generated ones.
807
808    The current expiration strength is the variable "expire".
809    We try to adjust it dynamically, so that when the network is idle
810    "expire" is large enough to keep enough warm entries, and when the
811    load increases it shrinks to limit the cache size.
812  */
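/*
 * Concretely, in rt_garbage_collect() below "expire" is halved on every
 * pass that misses its goal (300*HZ -> 150*HZ -> 75*HZ -> ... under load)
 * and grows back by ip_rt_gc_min_interval per successful call, capped at
 * ip_rt_gc_timeout.
 */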
813
814 static int rt_garbage_collect(void)
815 {
816         static unsigned long expire = RT_GC_TIMEOUT;
817         static unsigned long last_gc;
818         static int rover;
819         static int equilibrium;
820         struct rtable *rth, **rthp;
821         unsigned long now = jiffies;
822         int goal;
823
824         /*
825          * Garbage collection is pretty expensive,
826          * do not make it too frequently.
827          */
828
829         RT_CACHE_STAT_INC(gc_total);
830
831         if (now - last_gc < ip_rt_gc_min_interval &&
832             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
833                 RT_CACHE_STAT_INC(gc_ignored);
834                 goto out;
835         }
836
837         /* Calculate the number of entries we want to expire now. */
838         goal = atomic_read(&ipv4_dst_ops.entries) -
839                 (ip_rt_gc_elasticity << rt_hash_log);
840         if (goal <= 0) {
841                 if (equilibrium < ipv4_dst_ops.gc_thresh)
842                         equilibrium = ipv4_dst_ops.gc_thresh;
843                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
844                 if (goal > 0) {
845                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
846                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
847                 }
848         } else {
849                 /* We are in a dangerous area. Try to reduce the cache
850                  * really aggressively.
851                  */
852                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
853                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
854         }
855
856         if (now - last_gc >= ip_rt_gc_min_interval)
857                 last_gc = now;
858
859         if (goal <= 0) {
860                 equilibrium += goal;
861                 goto work_done;
862         }
863
864         do {
865                 int i, k;
866
867                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
868                         unsigned long tmo = expire;
869
870                         k = (k + 1) & rt_hash_mask;
871                         rthp = &rt_hash_table[k].chain;
872                         spin_lock_bh(rt_hash_lock_addr(k));
873                         while ((rth = *rthp) != NULL) {
874                                 if (!rt_may_expire(rth, tmo, expire)) {
875                                         tmo >>= 1;
876                                         rthp = &rth->u.dst.rt_next;
877                                         continue;
878                                 }
879                                 *rthp = rth->u.dst.rt_next;
880                                 rt_free(rth);
881                                 goal--;
882                         }
883                         spin_unlock_bh(rt_hash_lock_addr(k));
884                         if (goal <= 0)
885                                 break;
886                 }
887                 rover = k;
888
889                 if (goal <= 0)
890                         goto work_done;
891
892                 /* Goal is not achieved. We stop the process if:
893
894                    - expire has been reduced to zero (otherwise expire is halved),
895                    - the table is not full,
896                    - we are called from softirq/interrupt context,
897                    - the jiffies check is just a fallback/debug loop breaker;
898                      we will not spin here for a long time in any case.
899                  */
900
901                 RT_CACHE_STAT_INC(gc_goal_miss);
902
903                 if (expire == 0)
904                         break;
905
906                 expire >>= 1;
907 #if RT_CACHE_DEBUG >= 2
908                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
909                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
910 #endif
911
912                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
913                         goto out;
914         } while (!in_softirq() && time_before_eq(jiffies, now));
915
916         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
917                 goto out;
918         if (net_ratelimit())
919                 printk(KERN_WARNING "dst cache overflow\n");
920         RT_CACHE_STAT_INC(gc_dst_overflow);
921         return 1;
922
923 work_done:
924         expire += ip_rt_gc_min_interval;
925         if (expire > ip_rt_gc_timeout ||
926             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
927                 expire = ip_rt_gc_timeout;
928 #if RT_CACHE_DEBUG >= 2
929         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
930                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
931 #endif
932 out:    return 0;
933 }
934
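/*
 * rt_intern_hash() inserts "rt" into bucket "hash".  If an entry with the
 * same flow key is already present it is moved to the front of the chain,
 * reused and returned through *rp.  While walking the chain we remember the
 * unreferenced entry with the lowest rt_score(); if the chain has grown
 * beyond ip_rt_gc_elasticity that candidate is evicted.  Output routes and
 * unicast forwarding routes are bound to an ARP neighbour first, and on
 * -ENOBUFS the cache is shrunk and the insertion retried (once, and only
 * when not in softirq context).
 */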
935 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
936 {
937         struct rtable   *rth, **rthp;
938         unsigned long   now;
939         struct rtable *cand, **candp;
940         u32             min_score;
941         int             chain_length;
942         int attempts = !in_softirq();
943
944 restart:
945         chain_length = 0;
946         min_score = ~(u32)0;
947         cand = NULL;
948         candp = NULL;
949         now = jiffies;
950
951         rthp = &rt_hash_table[hash].chain;
952
953         spin_lock_bh(rt_hash_lock_addr(hash));
954         while ((rth = *rthp) != NULL) {
955                 if (compare_keys(&rth->fl, &rt->fl)) {
956                         /* Put it first */
957                         *rthp = rth->u.dst.rt_next;
958                         /*
959                          * Since lookup is lockfree, the deletion
960                          * must be visible to another weakly ordered CPU before
961                          * the insertion at the start of the hash chain.
962                          */
963                         rcu_assign_pointer(rth->u.dst.rt_next,
964                                            rt_hash_table[hash].chain);
965                         /*
966                          * Since lookup is lockfree, the update writes
967                          * must be ordered for consistency on SMP.
968                          */
969                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
970
971                         dst_use(&rth->u.dst, now);
972                         spin_unlock_bh(rt_hash_lock_addr(hash));
973
974                         rt_drop(rt);
975                         *rp = rth;
976                         return 0;
977                 }
978
979                 if (!atomic_read(&rth->u.dst.__refcnt)) {
980                         u32 score = rt_score(rth);
981
982                         if (score <= min_score) {
983                                 cand = rth;
984                                 candp = rthp;
985                                 min_score = score;
986                         }
987                 }
988
989                 chain_length++;
990
991                 rthp = &rth->u.dst.rt_next;
992         }
993
994         if (cand) {
995                 /* ip_rt_gc_elasticity used to be the average chain length;
996                  * when it is exceeded, gc becomes really aggressive.
997                  *
998                  * The second limit is less certain. At the moment it allows
999                  * only 2 entries per bucket. We will see.
1000                  */
1001                 if (chain_length > ip_rt_gc_elasticity) {
1002                         *candp = cand->u.dst.rt_next;
1003                         rt_free(cand);
1004                 }
1005         }
1006
1007         /* Try to bind the route to an ARP neighbour only if it is an
1008            output route or on the unicast forwarding path.
1009          */
1010         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1011                 int err = arp_bind_neighbour(&rt->u.dst);
1012                 if (err) {
1013                         spin_unlock_bh(rt_hash_lock_addr(hash));
1014
1015                         if (err != -ENOBUFS) {
1016                                 rt_drop(rt);
1017                                 return err;
1018                         }
1019
1020                         /* Neighbour tables are full and nothing
1021                            can be released. Try to shrink the route cache;
1022                            most likely it holds some neighbour records.
1023                          */
1024                         if (attempts-- > 0) {
1025                                 int saved_elasticity = ip_rt_gc_elasticity;
1026                                 int saved_int = ip_rt_gc_min_interval;
1027                                 ip_rt_gc_elasticity     = 1;
1028                                 ip_rt_gc_min_interval   = 0;
1029                                 rt_garbage_collect();
1030                                 ip_rt_gc_min_interval   = saved_int;
1031                                 ip_rt_gc_elasticity     = saved_elasticity;
1032                                 goto restart;
1033                         }
1034
1035                         if (net_ratelimit())
1036                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1037                         rt_drop(rt);
1038                         return -ENOBUFS;
1039                 }
1040         }
1041
1042         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1043 #if RT_CACHE_DEBUG >= 2
1044         if (rt->u.dst.rt_next) {
1045                 struct rtable *trt;
1046                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1047                        NIPQUAD(rt->rt_dst));
1048                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1049                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1050                 printk("\n");
1051         }
1052 #endif
1053         rt_hash_table[hash].chain = rt;
1054         spin_unlock_bh(rt_hash_lock_addr(hash));
1055         *rp = rt;
1056         return 0;
1057 }
1058
1059 void rt_bind_peer(struct rtable *rt, int create)
1060 {
1061         static DEFINE_SPINLOCK(rt_peer_lock);
1062         struct inet_peer *peer;
1063
1064         peer = inet_getpeer(rt->rt_dst, create);
1065
1066         spin_lock_bh(&rt_peer_lock);
1067         if (rt->peer == NULL) {
1068                 rt->peer = peer;
1069                 peer = NULL;
1070         }
1071         spin_unlock_bh(&rt_peer_lock);
1072         if (peer)
1073                 inet_putpeer(peer);
1074 }
1075
1076 /*
1077  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1078  * we can still generate some output.
1079  * Random ID selection looks a bit dangerous because we have no chance of
1080  * selecting an ID that is unique within a reasonable period of time.
1081  * But a broken packet identifier may be better than no packet at all.
1082  */
1083 static void ip_select_fb_ident(struct iphdr *iph)
1084 {
1085         static DEFINE_SPINLOCK(ip_fb_id_lock);
1086         static u32 ip_fallback_id;
1087         u32 salt;
1088
1089         spin_lock_bh(&ip_fb_id_lock);
1090         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1091         iph->id = htons(salt & 0xFFFF);
1092         ip_fallback_id = salt;
1093         spin_unlock_bh(&ip_fb_id_lock);
1094 }
1095
1096 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1097 {
1098         struct rtable *rt = (struct rtable *) dst;
1099
1100         if (rt) {
1101                 if (rt->peer == NULL)
1102                         rt_bind_peer(rt, 1);
1103
1104                 /* If a peer is attached to the destination, it is never detached,
1105                    so we need not grab a lock to dereference it.
1106                  */
1107                 if (rt->peer) {
1108                         iph->id = htons(inet_getid(rt->peer, more));
1109                         return;
1110                 }
1111         } else
1112                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1113                        __builtin_return_address(0));
1114
1115         ip_select_fb_ident(iph);
1116 }
1117
1118 static void rt_del(unsigned hash, struct rtable *rt)
1119 {
1120         struct rtable **rthp;
1121
1122         spin_lock_bh(rt_hash_lock_addr(hash));
1123         ip_rt_put(rt);
1124         for (rthp = &rt_hash_table[hash].chain; *rthp;
1125              rthp = &(*rthp)->u.dst.rt_next)
1126                 if (*rthp == rt) {
1127                         *rthp = rt->u.dst.rt_next;
1128                         rt_free(rt);
1129                         break;
1130                 }
1131         spin_unlock_bh(rt_hash_lock_addr(hash));
1132 }
1133
1134 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1135                     __be32 saddr, struct net_device *dev)
1136 {
1137         int i, k;
1138         struct in_device *in_dev = in_dev_get(dev);
1139         struct rtable *rth, **rthp;
1140         __be32  skeys[2] = { saddr, 0 };
1141         int  ikeys[2] = { dev->ifindex, 0 };
1142         struct netevent_redirect netevent;
1143
1144         if (!in_dev)
1145                 return;
1146
1147         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1148             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1149                 goto reject_redirect;
1150
1151         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1152                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1153                         goto reject_redirect;
1154                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1155                         goto reject_redirect;
1156         } else {
1157                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1158                         goto reject_redirect;
1159         }
1160
1161         for (i = 0; i < 2; i++) {
1162                 for (k = 0; k < 2; k++) {
1163                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1164
1165                         rthp=&rt_hash_table[hash].chain;
1166
1167                         rcu_read_lock();
1168                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1169                                 struct rtable *rt;
1170
1171                                 if (rth->fl.fl4_dst != daddr ||
1172                                     rth->fl.fl4_src != skeys[i] ||
1173                                     rth->fl.oif != ikeys[k] ||
1174                                     rth->fl.iif != 0) {
1175                                         rthp = &rth->u.dst.rt_next;
1176                                         continue;
1177                                 }
1178
1179                                 if (rth->rt_dst != daddr ||
1180                                     rth->rt_src != saddr ||
1181                                     rth->u.dst.error ||
1182                                     rth->rt_gateway != old_gw ||
1183                                     rth->u.dst.dev != dev)
1184                                         break;
1185
1186                                 dst_hold(&rth->u.dst);
1187                                 rcu_read_unlock();
1188
1189                                 rt = dst_alloc(&ipv4_dst_ops);
1190                                 if (rt == NULL) {
1191                                         ip_rt_put(rth);
1192                                         in_dev_put(in_dev);
1193                                         return;
1194                                 }
1195
1196                                 /* Copy all the information. */
1197                                 *rt = *rth;
1198                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1199                                 rt->u.dst.__use         = 1;
1200                                 atomic_set(&rt->u.dst.__refcnt, 1);
1201                                 rt->u.dst.child         = NULL;
1202                                 if (rt->u.dst.dev)
1203                                         dev_hold(rt->u.dst.dev);
1204                                 if (rt->idev)
1205                                         in_dev_hold(rt->idev);
1206                                 rt->u.dst.obsolete      = 0;
1207                                 rt->u.dst.lastuse       = jiffies;
1208                                 rt->u.dst.path          = &rt->u.dst;
1209                                 rt->u.dst.neighbour     = NULL;
1210                                 rt->u.dst.hh            = NULL;
1211                                 rt->u.dst.xfrm          = NULL;
1212
1213                                 rt->rt_flags            |= RTCF_REDIRECTED;
1214
1215                                 /* Gateway is different ... */
1216                                 rt->rt_gateway          = new_gw;
1217
1218                                 /* Redirect received -> path was valid */
1219                                 dst_confirm(&rth->u.dst);
1220
1221                                 if (rt->peer)
1222                                         atomic_inc(&rt->peer->refcnt);
1223
1224                                 if (arp_bind_neighbour(&rt->u.dst) ||
1225                                     !(rt->u.dst.neighbour->nud_state &
1226                                             NUD_VALID)) {
1227                                         if (rt->u.dst.neighbour)
1228                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1229                                         ip_rt_put(rth);
1230                                         rt_drop(rt);
1231                                         goto do_next;
1232                                 }
1233
1234                                 netevent.old = &rth->u.dst;
1235                                 netevent.new = &rt->u.dst;
1236                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1237                                                         &netevent);
1238
1239                                 rt_del(hash, rth);
1240                                 if (!rt_intern_hash(hash, rt, &rt))
1241                                         ip_rt_put(rt);
1242                                 goto do_next;
1243                         }
1244                         rcu_read_unlock();
1245                 do_next:
1246                         ;
1247                 }
1248         }
1249         in_dev_put(in_dev);
1250         return;
1251
1252 reject_redirect:
1253 #ifdef CONFIG_IP_ROUTE_VERBOSE
1254         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1255                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1256                         "%u.%u.%u.%u ignored.\n"
1257                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1258                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1259                        NIPQUAD(saddr), NIPQUAD(daddr));
1260 #endif
1261         in_dev_put(in_dev);
1262 }
1263
1264 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1265 {
1266         struct rtable *rt = (struct rtable*)dst;
1267         struct dst_entry *ret = dst;
1268
1269         if (rt) {
1270                 if (dst->obsolete) {
1271                         ip_rt_put(rt);
1272                         ret = NULL;
1273                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1274                            rt->u.dst.expires) {
1275                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1276                                                 rt->fl.oif);
1277 #if RT_CACHE_DEBUG >= 1
1278                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1279                                           "%u.%u.%u.%u/%02x dropped\n",
1280                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1281 #endif
1282                         rt_del(hash, rt);
1283                         ret = NULL;
1284                 }
1285         }
1286         return ret;
1287 }
1288
1289 /*
1290  * Algorithm:
1291  *      1. The first ip_rt_redirect_number redirects are sent
1292  *         with exponential backoff; then we stop sending them altogether,
1293  *         assuming that the host ignores our redirects.
1294  *      2. If we did not see packets requiring redirects
1295  *         during ip_rt_redirect_silence, we assume that the host has
1296  *         forgotten the redirected route and start sending redirects again.
1297  *
1298  * This algorithm is much cheaper and more intelligent than dumb load limiting
1299  * in icmp.c.
1300  *
1301  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1302  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1303  */
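/*
 * With the defaults above, the required gap between redirects to one host is
 * ip_rt_redirect_load << rate_tokens, i.e. it doubles from 40ms up to about
 * 5 seconds; after ip_rt_redirect_number (9) redirects we go silent until
 * ip_rt_redirect_silence (about 20 seconds) has elapsed.
 */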
1304
1305 void ip_rt_send_redirect(struct sk_buff *skb)
1306 {
1307         struct rtable *rt = (struct rtable*)skb->dst;
1308         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1309
1310         if (!in_dev)
1311                 return;
1312
1313         if (!IN_DEV_TX_REDIRECTS(in_dev))
1314                 goto out;
1315
1316         /* No redirected packets during ip_rt_redirect_silence;
1317          * reset the algorithm.
1318          */
1319         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1320                 rt->u.dst.rate_tokens = 0;
1321
1322         /* Too many ignored redirects; do not send anything and
1323          * set u.dst.rate_last to the last seen redirected packet.
1324          */
1325         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1326                 rt->u.dst.rate_last = jiffies;
1327                 goto out;
1328         }
1329
1330         /* Check for load limit; set rate_last to the latest sent
1331          * redirect.
1332          */
1333         if (rt->u.dst.rate_tokens == 0 ||
1334             time_after(jiffies,
1335                        (rt->u.dst.rate_last +
1336                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1337                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1338                 rt->u.dst.rate_last = jiffies;
1339                 ++rt->u.dst.rate_tokens;
1340 #ifdef CONFIG_IP_ROUTE_VERBOSE
1341                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1342                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1343                     net_ratelimit())
1344                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1345                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1346                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1347                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1348 #endif
1349         }
1350 out:
1351         in_dev_put(in_dev);
1352 }
1353
1354 static int ip_error(struct sk_buff *skb)
1355 {
1356         struct rtable *rt = (struct rtable*)skb->dst;
1357         unsigned long now;
1358         int code;
1359
1360         switch (rt->u.dst.error) {
1361                 case EINVAL:
1362                 default:
1363                         goto out;
1364                 case EHOSTUNREACH:
1365                         code = ICMP_HOST_UNREACH;
1366                         break;
1367                 case ENETUNREACH:
1368                         code = ICMP_NET_UNREACH;
1369                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1370                         break;
1371                 case EACCES:
1372                         code = ICMP_PKT_FILTERED;
1373                         break;
1374         }
1375
1376         now = jiffies;
1377         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1378         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1379                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1380         rt->u.dst.rate_last = now;
1381         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1382                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1383                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1384         }
1385
1386 out:    kfree_skb(skb);
1387         return 0;
1388 }
1389
1390 /*
1391  *      The last two values are not from the RFC but
1392  *      are needed for AMPRnet AX.25 paths.
1393  */
1394
1395 static const unsigned short mtu_plateau[] =
1396 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1397
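/*
 * guess_mtu() returns the largest plateau value strictly below old_mtu,
 * falling back to the IPv4 minimum of 68; e.g. an old MTU of 1500 maps to
 * 1492 and 576 maps to 296.
 */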
1398 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1399 {
1400         int i;
1401
1402         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1403                 if (old_mtu > mtu_plateau[i])
1404                         return mtu_plateau[i];
1405         return 68;
1406 }
1407
1408 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1409 {
1410         int i;
1411         unsigned short old_mtu = ntohs(iph->tot_len);
1412         struct rtable *rth;
1413         __be32  skeys[2] = { iph->saddr, 0, };
1414         __be32  daddr = iph->daddr;
1415         unsigned short est_mtu = 0;
1416
1417         if (ipv4_config.no_pmtu_disc)
1418                 return 0;
1419
1420         for (i = 0; i < 2; i++) {
1421                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1422
1423                 rcu_read_lock();
1424                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1425                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1426                         if (rth->fl.fl4_dst == daddr &&
1427                             rth->fl.fl4_src == skeys[i] &&
1428                             rth->rt_dst  == daddr &&
1429                             rth->rt_src  == iph->saddr &&
1430                             rth->fl.iif == 0 &&
1431                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1432                                 unsigned short mtu = new_mtu;
1433
1434                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1435
1436                                         /* BSD 4.2 compatibility hack :-( */
1437                                         if (mtu == 0 &&
1438                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1439                                             old_mtu >= 68 + (iph->ihl << 2))
1440                                                 old_mtu -= iph->ihl << 2;
1441
1442                                         mtu = guess_mtu(old_mtu);
1443                                 }
1444                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1445                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1446                                                 dst_confirm(&rth->u.dst);
1447                                                 if (mtu < ip_rt_min_pmtu) {
1448                                                         mtu = ip_rt_min_pmtu;
1449                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1450                                                                 (1 << RTAX_MTU);
1451                                                 }
1452                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1453                                                 dst_set_expires(&rth->u.dst,
1454                                                         ip_rt_mtu_expires);
1455                                         }
1456                                         est_mtu = mtu;
1457                                 }
1458                         }
1459                 }
1460                 rcu_read_unlock();
1461         }
1462         return est_mtu ? : new_mtu;
1463 }
1464
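/* ->update_pmtu handler: shrink the cached MTU metric (never below
 * ip_rt_min_pmtu, locking the metric when clamped), arm the
 * ip_rt_mtu_expires timer and notify NETEVENT_PMTU_UPDATE listeners.
 */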
1465 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1466 {
1467         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1468             !(dst_metric_locked(dst, RTAX_MTU))) {
1469                 if (mtu < ip_rt_min_pmtu) {
1470                         mtu = ip_rt_min_pmtu;
1471                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1472                 }
1473                 dst->metrics[RTAX_MTU-1] = mtu;
1474                 dst_set_expires(dst, ip_rt_mtu_expires);
1475                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1476         }
1477 }
1478
1479 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1480 {
1481         return NULL;
1482 }
1483
1484 static void ipv4_dst_destroy(struct dst_entry *dst)
1485 {
1486         struct rtable *rt = (struct rtable *) dst;
1487         struct inet_peer *peer = rt->peer;
1488         struct in_device *idev = rt->idev;
1489
1490         if (peer) {
1491                 rt->peer = NULL;
1492                 inet_putpeer(peer);
1493         }
1494
1495         if (idev) {
1496                 rt->idev = NULL;
1497                 in_dev_put(idev);
1498         }
1499 }
1500
1501 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1502                             int how)
1503 {
1504         struct rtable *rt = (struct rtable *) dst;
1505         struct in_device *idev = rt->idev;
1506         if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1507                 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1508                 if (loopback_idev) {
1509                         rt->idev = loopback_idev;
1510                         in_dev_put(idev);
1511                 }
1512         }
1513 }
1514
1515 static void ipv4_link_failure(struct sk_buff *skb)
1516 {
1517         struct rtable *rt;
1518
1519         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1520
1521         rt = (struct rtable *) skb->dst;
1522         if (rt)
1523                 dst_set_expires(&rt->u.dst, 0);
1524 }
1525
1526 static int ip_rt_bug(struct sk_buff *skb)
1527 {
1528         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1529                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1530                 skb->dev ? skb->dev->name : "?");
1531         kfree_skb(skb);
1532         return 0;
1533 }
1534
1535 /*
1536    We do not cache the source address of the outgoing interface,
1537    because it is used only by the IP RR, TS and SRR options,
1538    so it is out of the fast path.
1539
1540    BTW remember: "addr" may be unaligned
1541    in IP options!
1542  */
1543
1544 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1545 {
1546         __be32 src;
1547         struct fib_result res;
1548
1549         if (rt->fl.iif == 0)
1550                 src = rt->rt_src;
1551         else if (fib_lookup(&rt->fl, &res) == 0) {
1552                 src = FIB_RES_PREFSRC(res);
1553                 fib_res_put(&res);
1554         } else
1555                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1556                                         RT_SCOPE_UNIVERSE);
1557         memcpy(addr, &src, 4);
1558 }
1559
1560 #ifdef CONFIG_NET_CLS_ROUTE
1561 static void set_class_tag(struct rtable *rt, u32 tag)
1562 {
1563         if (!(rt->u.dst.tclassid & 0xFFFF))
1564                 rt->u.dst.tclassid |= tag & 0xFFFF;
1565         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1566                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1567 }
1568 #endif
1569
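/* Copy nexthop-derived data into a fresh cache entry: the gateway address,
 * the fib metrics (with MTU, hoplimit and advmss defaults filled in and
 * clamped), the CONFIG_NET_CLS_ROUTE class tag and the route type.
 */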
1570 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1571 {
1572         struct fib_info *fi = res->fi;
1573
1574         if (fi) {
1575                 if (FIB_RES_GW(*res) &&
1576                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1577                         rt->rt_gateway = FIB_RES_GW(*res);
1578                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1579                        sizeof(rt->u.dst.metrics));
1580                 if (fi->fib_mtu == 0) {
1581                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1582                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1583                             rt->rt_gateway != rt->rt_dst &&
1584                             rt->u.dst.dev->mtu > 576)
1585                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1586                 }
1587 #ifdef CONFIG_NET_CLS_ROUTE
1588                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1589 #endif
1590         } else
1591                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1592
1593         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1594                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1595         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1596                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1597         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1598                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1599                                        ip_rt_min_advmss);
1600         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1601                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1602
1603 #ifdef CONFIG_NET_CLS_ROUTE
1604 #ifdef CONFIG_IP_MULTIPLE_TABLES
1605         set_class_tag(rt, fib_rules_tclass(res));
1606 #endif
1607         set_class_tag(rt, itag);
1608 #endif
1609         rt->rt_type = res->type;
1610 }
1611
1612 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1613                                 u8 tos, struct net_device *dev, int our)
1614 {
1615         unsigned hash;
1616         struct rtable *rth;
1617         __be32 spec_dst;
1618         struct in_device *in_dev = in_dev_get(dev);
1619         u32 itag = 0;
1620
1621         /* Primary sanity checks. */
1622
1623         if (in_dev == NULL)
1624                 return -EINVAL;
1625
1626         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1627             skb->protocol != htons(ETH_P_IP))
1628                 goto e_inval;
1629
1630         if (ZERONET(saddr)) {
1631                 if (!LOCAL_MCAST(daddr))
1632                         goto e_inval;
1633                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1634         } else if (fib_validate_source(saddr, 0, tos, 0,
1635                                         dev, &spec_dst, &itag) < 0)
1636                 goto e_inval;
1637
1638         rth = dst_alloc(&ipv4_dst_ops);
1639         if (!rth)
1640                 goto e_nobufs;
1641
1642         rth->u.dst.output= ip_rt_bug;
1643
1644         atomic_set(&rth->u.dst.__refcnt, 1);
1645         rth->u.dst.flags= DST_HOST;
1646         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1647                 rth->u.dst.flags |= DST_NOPOLICY;
1648         rth->fl.fl4_dst = daddr;
1649         rth->rt_dst     = daddr;
1650         rth->fl.fl4_tos = tos;
1651         rth->fl.mark    = skb->mark;
1652         rth->fl.fl4_src = saddr;
1653         rth->rt_src     = saddr;
1654 #ifdef CONFIG_NET_CLS_ROUTE
1655         rth->u.dst.tclassid = itag;
1656 #endif
1657         rth->rt_iif     =
1658         rth->fl.iif     = dev->ifindex;
1659         rth->u.dst.dev  = init_net.loopback_dev;
1660         dev_hold(rth->u.dst.dev);
1661         rth->idev       = in_dev_get(rth->u.dst.dev);
1662         rth->fl.oif     = 0;
1663         rth->rt_gateway = daddr;
1664         rth->rt_spec_dst= spec_dst;
1665         rth->rt_type    = RTN_MULTICAST;
1666         rth->rt_flags   = RTCF_MULTICAST;
1667         if (our) {
1668                 rth->u.dst.input= ip_local_deliver;
1669                 rth->rt_flags |= RTCF_LOCAL;
1670         }
1671
1672 #ifdef CONFIG_IP_MROUTE
1673         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1674                 rth->u.dst.input = ip_mr_input;
1675 #endif
1676         RT_CACHE_STAT_INC(in_slow_mc);
1677
1678         in_dev_put(in_dev);
1679         hash = rt_hash(daddr, saddr, dev->ifindex);
1680         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1681
1682 e_nobufs:
1683         in_dev_put(in_dev);
1684         return -ENOBUFS;
1685
1686 e_inval:
1687         in_dev_put(in_dev);
1688         return -EINVAL;
1689 }
1690
1691
1692 static void ip_handle_martian_source(struct net_device *dev,
1693                                      struct in_device *in_dev,
1694                                      struct sk_buff *skb,
1695                                      __be32 daddr,
1696                                      __be32 saddr)
1697 {
1698         RT_CACHE_STAT_INC(in_martian_src);
1699 #ifdef CONFIG_IP_ROUTE_VERBOSE
1700         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1701                 /*
1702                  *      RFC1812 recommendation: if the source is martian,
1703                  *      the only hint is the MAC header.
1704                  */
1705                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1706                         "%u.%u.%u.%u, on dev %s\n",
1707                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1708                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1709                         int i;
1710                         const unsigned char *p = skb_mac_header(skb);
1711                         printk(KERN_WARNING "ll header: ");
1712                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1713                                 printk("%02x", *p);
1714                                 if (i < (dev->hard_header_len - 1))
1715                                         printk(":");
1716                         }
1717                         printk("\n");
1718                 }
1719         }
1720 #endif
1721 }
1722
1723 static inline int __mkroute_input(struct sk_buff *skb,
1724                                   struct fib_result* res,
1725                                   struct in_device *in_dev,
1726                                   __be32 daddr, __be32 saddr, u32 tos,
1727                                   struct rtable **result)
1728 {
1729
1730         struct rtable *rth;
1731         int err;
1732         struct in_device *out_dev;
1733         unsigned flags = 0;
1734         __be32 spec_dst;
1735         u32 itag;
1736
1737         /* get a working reference to the output device */
1738         out_dev = in_dev_get(FIB_RES_DEV(*res));
1739         if (out_dev == NULL) {
1740                 if (net_ratelimit())
1741                         printk(KERN_CRIT "Bug in ip_route_input" \
1742                                "_slow(). Please, report\n");
1743                 return -EINVAL;
1744         }
1745
1746
1747         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1748                                   in_dev->dev, &spec_dst, &itag);
1749         if (err < 0) {
1750                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1751                                          saddr);
1752
1753                 err = -EINVAL;
1754                 goto cleanup;
1755         }
1756
1757         if (err)
1758                 flags |= RTCF_DIRECTSRC;
1759
1760         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1761             (IN_DEV_SHARED_MEDIA(out_dev) ||
1762              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1763                 flags |= RTCF_DOREDIRECT;
1764
1765         if (skb->protocol != htons(ETH_P_IP)) {
1766                 /* Not IP (i.e. ARP). Do not create a route if it is
1767                  * invalid for proxy arp. DNAT routes are always valid.
1768                  */
1769                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1770                         err = -EINVAL;
1771                         goto cleanup;
1772                 }
1773         }
1774
1775
1776         rth = dst_alloc(&ipv4_dst_ops);
1777         if (!rth) {
1778                 err = -ENOBUFS;
1779                 goto cleanup;
1780         }
1781
1782         atomic_set(&rth->u.dst.__refcnt, 1);
1783         rth->u.dst.flags= DST_HOST;
1784         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1785                 rth->u.dst.flags |= DST_NOPOLICY;
1786         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1787                 rth->u.dst.flags |= DST_NOXFRM;
1788         rth->fl.fl4_dst = daddr;
1789         rth->rt_dst     = daddr;
1790         rth->fl.fl4_tos = tos;
1791         rth->fl.mark    = skb->mark;
1792         rth->fl.fl4_src = saddr;
1793         rth->rt_src     = saddr;
1794         rth->rt_gateway = daddr;
1795         rth->rt_iif     =
1796                 rth->fl.iif     = in_dev->dev->ifindex;
1797         rth->u.dst.dev  = (out_dev)->dev;
1798         dev_hold(rth->u.dst.dev);
1799         rth->idev       = in_dev_get(rth->u.dst.dev);
1800         rth->fl.oif     = 0;
1801         rth->rt_spec_dst= spec_dst;
1802
1803         rth->u.dst.input = ip_forward;
1804         rth->u.dst.output = ip_output;
1805
1806         rt_set_nexthop(rth, res, itag);
1807
1808         rth->rt_flags = flags;
1809
1810         *result = rth;
1811         err = 0;
1812  cleanup:
1813         /* release the working reference to the output device */
1814         in_dev_put(out_dev);
1815         return err;
1816 }
1817
1818 static inline int ip_mkroute_input(struct sk_buff *skb,
1819                                    struct fib_result* res,
1820                                    const struct flowi *fl,
1821                                    struct in_device *in_dev,
1822                                    __be32 daddr, __be32 saddr, u32 tos)
1823 {
1824         struct rtable* rth = NULL;
1825         int err;
1826         unsigned hash;
1827
1828 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1829         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1830                 fib_select_multipath(fl, res);
1831 #endif
1832
1833         /* create a routing cache entry */
1834         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1835         if (err)
1836                 return err;
1837
1838         /* put it into the cache */
1839         hash = rt_hash(daddr, saddr, fl->iif);
1840         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1841 }
1842
1843 /*
1844  *      NOTE. We drop all packets that have local source
1845  *      addresses, because every properly looped-back packet
1846  *      must already have the correct destination attached by the output routine.
1847  *
1848  *      This approach solves two big problems:
1849  *      1. Non-simplex devices are handled properly.
1850  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1851  */
1852
1853 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1854                                u8 tos, struct net_device *dev)
1855 {
1856         struct fib_result res;
1857         struct in_device *in_dev = in_dev_get(dev);
1858         struct flowi fl = { .nl_u = { .ip4_u =
1859                                       { .daddr = daddr,
1860                                         .saddr = saddr,
1861                                         .tos = tos,
1862                                         .scope = RT_SCOPE_UNIVERSE,
1863                                       } },
1864                             .mark = skb->mark,
1865                             .iif = dev->ifindex };
1866         unsigned        flags = 0;
1867         u32             itag = 0;
1868         struct rtable * rth;
1869         unsigned        hash;
1870         __be32          spec_dst;
1871         int             err = -EINVAL;
1872         int             free_res = 0;
1873
1874         /* IP on this device is disabled. */
1875
1876         if (!in_dev)
1877                 goto out;
1878
1879         /* Check for the most weird martians, which cannot be detected
1880            by fib_lookup.
1881          */
1882
1883         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1884                 goto martian_source;
1885
1886         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1887                 goto brd_input;
1888
1889         /* Accept zero addresses only for limited broadcast;
1890          * I do not even know whether to fix this or not. Waiting for complaints :-)
1891          */
1892         if (ZERONET(saddr))
1893                 goto martian_source;
1894
1895         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1896                 goto martian_destination;
1897
1898         /*
1899          *      Now we are ready to route the packet.
1900          */
1901         if ((err = fib_lookup(&fl, &res)) != 0) {
1902                 if (!IN_DEV_FORWARD(in_dev))
1903                         goto e_hostunreach;
1904                 goto no_route;
1905         }
1906         free_res = 1;
1907
1908         RT_CACHE_STAT_INC(in_slow_tot);
1909
1910         if (res.type == RTN_BROADCAST)
1911                 goto brd_input;
1912
1913         if (res.type == RTN_LOCAL) {
1914                 int result;
1915                 result = fib_validate_source(saddr, daddr, tos,
1916                                              init_net.loopback_dev->ifindex,
1917                                              dev, &spec_dst, &itag);
1918                 if (result < 0)
1919                         goto martian_source;
1920                 if (result)
1921                         flags |= RTCF_DIRECTSRC;
1922                 spec_dst = daddr;
1923                 goto local_input;
1924         }
1925
1926         if (!IN_DEV_FORWARD(in_dev))
1927                 goto e_hostunreach;
1928         if (res.type != RTN_UNICAST)
1929                 goto martian_destination;
1930
1931         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1932 done:
1933         in_dev_put(in_dev);
1934         if (free_res)
1935                 fib_res_put(&res);
1936 out:    return err;
1937
1938 brd_input:
1939         if (skb->protocol != htons(ETH_P_IP))
1940                 goto e_inval;
1941
1942         if (ZERONET(saddr))
1943                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1944         else {
1945                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1946                                           &itag);
1947                 if (err < 0)
1948                         goto martian_source;
1949                 if (err)
1950                         flags |= RTCF_DIRECTSRC;
1951         }
1952         flags |= RTCF_BROADCAST;
1953         res.type = RTN_BROADCAST;
1954         RT_CACHE_STAT_INC(in_brd);
1955
1956 local_input:
1957         rth = dst_alloc(&ipv4_dst_ops);
1958         if (!rth)
1959                 goto e_nobufs;
1960
1961         rth->u.dst.output= ip_rt_bug;
1962
1963         atomic_set(&rth->u.dst.__refcnt, 1);
1964         rth->u.dst.flags= DST_HOST;
1965         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1966                 rth->u.dst.flags |= DST_NOPOLICY;
1967         rth->fl.fl4_dst = daddr;
1968         rth->rt_dst     = daddr;
1969         rth->fl.fl4_tos = tos;
1970         rth->fl.mark    = skb->mark;
1971         rth->fl.fl4_src = saddr;
1972         rth->rt_src     = saddr;
1973 #ifdef CONFIG_NET_CLS_ROUTE
1974         rth->u.dst.tclassid = itag;
1975 #endif
1976         rth->rt_iif     =
1977         rth->fl.iif     = dev->ifindex;
1978         rth->u.dst.dev  = init_net.loopback_dev;
1979         dev_hold(rth->u.dst.dev);
1980         rth->idev       = in_dev_get(rth->u.dst.dev);
1981         rth->rt_gateway = daddr;
1982         rth->rt_spec_dst= spec_dst;
1983         rth->u.dst.input= ip_local_deliver;
1984         rth->rt_flags   = flags|RTCF_LOCAL;
1985         if (res.type == RTN_UNREACHABLE) {
1986                 rth->u.dst.input= ip_error;
1987                 rth->u.dst.error= -err;
1988                 rth->rt_flags   &= ~RTCF_LOCAL;
1989         }
1990         rth->rt_type    = res.type;
1991         hash = rt_hash(daddr, saddr, fl.iif);
1992         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1993         goto done;
1994
1995 no_route:
1996         RT_CACHE_STAT_INC(in_no_route);
1997         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1998         res.type = RTN_UNREACHABLE;
1999         if (err == -ESRCH)
2000                 err = -ENETUNREACH;
2001         goto local_input;
2002
2003         /*
2004          *      Do not cache martian addresses: they should be logged (RFC1812)
2005          */
2006 martian_destination:
2007         RT_CACHE_STAT_INC(in_martian_dst);
2008 #ifdef CONFIG_IP_ROUTE_VERBOSE
2009         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2010                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2011                         "%u.%u.%u.%u, dev %s\n",
2012                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2013 #endif
2014
2015 e_hostunreach:
2016         err = -EHOSTUNREACH;
2017         goto done;
2018
2019 e_inval:
2020         err = -EINVAL;
2021         goto done;
2022
2023 e_nobufs:
2024         err = -ENOBUFS;
2025         goto done;
2026
2027 martian_source:
2028         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2029         goto e_inval;
2030 }
2031
2032 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2033                    u8 tos, struct net_device *dev)
2034 {
2035         struct rtable * rth;
2036         unsigned        hash;
2037         int iif = dev->ifindex;
2038
2039         tos &= IPTOS_RT_MASK;
2040         hash = rt_hash(daddr, saddr, iif);
2041
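	/* Fast path: search the hash chain under RCU for an entry whose
	 * (daddr, saddr, iif, oif == 0, mark, tos) key matches; a hit is
	 * attached to skb->dst and no full route lookup is needed.
	 */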
2042         rcu_read_lock();
2043         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2044              rth = rcu_dereference(rth->u.dst.rt_next)) {
2045                 if (rth->fl.fl4_dst == daddr &&
2046                     rth->fl.fl4_src == saddr &&
2047                     rth->fl.iif == iif &&
2048                     rth->fl.oif == 0 &&
2049                     rth->fl.mark == skb->mark &&
2050                     rth->fl.fl4_tos == tos) {
2051                         dst_use(&rth->u.dst, jiffies);
2052                         RT_CACHE_STAT_INC(in_hit);
2053                         rcu_read_unlock();
2054                         skb->dst = (struct dst_entry*)rth;
2055                         return 0;
2056                 }
2057                 RT_CACHE_STAT_INC(in_hlist_search);
2058         }
2059         rcu_read_unlock();
2060
2061         /* Multicast recognition logic was moved from the route cache to here.
2062            The problem was that too many Ethernet cards have broken/missing
2063            hardware multicast filters :-( As a result, a host on a multicast
2064            network acquires a lot of useless route cache entries, e.g. from
2065            SDR messages from all over the world. Now we try to get rid of them.
2066            Really, provided the software IP multicast filter is organized
2067            reasonably (at least hashed), it does not result in a slowdown
2068            compared with route cache reject entries.
2069            Note that multicast routers are not affected, because a
2070            route cache entry is created eventually.
2071          */
2072         if (MULTICAST(daddr)) {
2073                 struct in_device *in_dev;
2074
2075                 rcu_read_lock();
2076                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2077                         int our = ip_check_mc(in_dev, daddr, saddr,
2078                                 ip_hdr(skb)->protocol);
2079                         if (our
2080 #ifdef CONFIG_IP_MROUTE
2081                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2082 #endif
2083                             ) {
2084                                 rcu_read_unlock();
2085                                 return ip_route_input_mc(skb, daddr, saddr,
2086                                                          tos, dev, our);
2087                         }
2088                 }
2089                 rcu_read_unlock();
2090                 return -EINVAL;
2091         }
2092         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2093 }
2094
2095 static inline int __mkroute_output(struct rtable **result,
2096                                    struct fib_result* res,
2097                                    const struct flowi *fl,
2098                                    const struct flowi *oldflp,
2099                                    struct net_device *dev_out,
2100                                    unsigned flags)
2101 {
2102         struct rtable *rth;
2103         struct in_device *in_dev;
2104         u32 tos = RT_FL_TOS(oldflp);
2105         int err = 0;
2106
2107         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2108                 return -EINVAL;
2109
2110         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2111                 res->type = RTN_BROADCAST;
2112         else if (MULTICAST(fl->fl4_dst))
2113                 res->type = RTN_MULTICAST;
2114         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2115                 return -EINVAL;
2116
2117         if (dev_out->flags & IFF_LOOPBACK)
2118                 flags |= RTCF_LOCAL;
2119
2120         /* get a working reference to the inet device */
2121         in_dev = in_dev_get(dev_out);
2122         if (!in_dev)
2123                 return -EINVAL;
2124
2125         if (res->type == RTN_BROADCAST) {
2126                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2127                 if (res->fi) {
2128                         fib_info_put(res->fi);
2129                         res->fi = NULL;
2130                 }
2131         } else if (res->type == RTN_MULTICAST) {
2132                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2133                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2134                                  oldflp->proto))
2135                         flags &= ~RTCF_LOCAL;
2136                 /* If a multicast route does not exist, use the
2137                    default one, but do not gateway in this case.
2138                    Yes, it is a hack.
2139                  */
2140                 if (res->fi && res->prefixlen < 4) {
2141                         fib_info_put(res->fi);
2142                         res->fi = NULL;
2143                 }
2144         }
2145
2146
2147         rth = dst_alloc(&ipv4_dst_ops);
2148         if (!rth) {
2149                 err = -ENOBUFS;
2150                 goto cleanup;
2151         }
2152
2153         atomic_set(&rth->u.dst.__refcnt, 1);
2154         rth->u.dst.flags= DST_HOST;
2155         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2156                 rth->u.dst.flags |= DST_NOXFRM;
2157         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2158                 rth->u.dst.flags |= DST_NOPOLICY;
2159
2160         rth->fl.fl4_dst = oldflp->fl4_dst;
2161         rth->fl.fl4_tos = tos;
2162         rth->fl.fl4_src = oldflp->fl4_src;
2163         rth->fl.oif     = oldflp->oif;
2164         rth->fl.mark    = oldflp->mark;
2165         rth->rt_dst     = fl->fl4_dst;
2166         rth->rt_src     = fl->fl4_src;
2167         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2168         /* get references to the devices that are to be held by the routing
2169            cache entry */
2170         rth->u.dst.dev  = dev_out;
2171         dev_hold(dev_out);
2172         rth->idev       = in_dev_get(dev_out);
2173         rth->rt_gateway = fl->fl4_dst;
2174         rth->rt_spec_dst= fl->fl4_src;
2175
2176         rth->u.dst.output=ip_output;
2177
2178         RT_CACHE_STAT_INC(out_slow_tot);
2179
2180         if (flags & RTCF_LOCAL) {
2181                 rth->u.dst.input = ip_local_deliver;
2182                 rth->rt_spec_dst = fl->fl4_dst;
2183         }
2184         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2185                 rth->rt_spec_dst = fl->fl4_src;
2186                 if (flags & RTCF_LOCAL &&
2187                     !(dev_out->flags & IFF_LOOPBACK)) {
2188                         rth->u.dst.output = ip_mc_output;
2189                         RT_CACHE_STAT_INC(out_slow_mc);
2190                 }
2191 #ifdef CONFIG_IP_MROUTE
2192                 if (res->type == RTN_MULTICAST) {
2193                         if (IN_DEV_MFORWARD(in_dev) &&
2194                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2195                                 rth->u.dst.input = ip_mr_input;
2196                                 rth->u.dst.output = ip_mc_output;
2197                         }
2198                 }
2199 #endif
2200         }
2201
2202         rt_set_nexthop(rth, res, 0);
2203
2204         rth->rt_flags = flags;
2205
2206         *result = rth;
2207  cleanup:
2208         /* release the working reference to the inet device */
2209         in_dev_put(in_dev);
2210
2211         return err;
2212 }
2213
2214 static inline int ip_mkroute_output(struct rtable **rp,
2215                                     struct fib_result* res,
2216                                     const struct flowi *fl,
2217                                     const struct flowi *oldflp,
2218                                     struct net_device *dev_out,
2219                                     unsigned flags)
2220 {
2221         struct rtable *rth = NULL;
2222         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2223         unsigned hash;
2224         if (err == 0) {
2225                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2226                 err = rt_intern_hash(hash, rth, rp);
2227         }
2228
2229         return err;
2230 }
2231
2232 /*
2233  * Major route resolver routine.
2234  */
2235
2236 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2237 {
2238         u32 tos = RT_FL_TOS(oldflp);
2239         struct flowi fl = { .nl_u = { .ip4_u =
2240                                       { .daddr = oldflp->fl4_dst,
2241                                         .saddr = oldflp->fl4_src,
2242                                         .tos = tos & IPTOS_RT_MASK,
2243                                         .scope = ((tos & RTO_ONLINK) ?
2244                                                   RT_SCOPE_LINK :
2245                                                   RT_SCOPE_UNIVERSE),
2246                                       } },
2247                             .mark = oldflp->mark,
2248                             .iif = init_net.loopback_dev->ifindex,
2249                             .oif = oldflp->oif };
2250         struct fib_result res;
2251         unsigned flags = 0;
2252         struct net_device *dev_out = NULL;
2253         int free_res = 0;
2254         int err;
2255
2256
2257         res.fi          = NULL;
2258 #ifdef CONFIG_IP_MULTIPLE_TABLES
2259         res.r           = NULL;
2260 #endif
2261
2262         if (oldflp->fl4_src) {
2263                 err = -EINVAL;
2264                 if (MULTICAST(oldflp->fl4_src) ||
2265                     BADCLASS(oldflp->fl4_src) ||
2266                     ZERONET(oldflp->fl4_src))
2267                         goto out;
2268
2269                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2270                 dev_out = ip_dev_find(oldflp->fl4_src);
2271                 if (dev_out == NULL)
2272                         goto out;
2273
2274                 /* I removed check for oif == dev_out->oif here.
2275                    It was wrong for two reasons:
2276                    1. ip_dev_find(saddr) can return the wrong iface if saddr is
2277                       assigned to multiple interfaces.
2278                    2. Moreover, we are allowed to send packets with saddr
2279                       of another iface. --ANK
2280                  */
2281
2282                 if (oldflp->oif == 0
2283                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2284                         /* Special hack: the user can direct multicasts
2285                            and limited broadcast via the necessary interface
2286                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2287                            This hack is not just for fun; it allows
2288                            vic, vat and friends to work.
2289                            They bind a socket to loopback, set the ttl to zero
2290                            and expect that it will work.
2291                            From the viewpoint of the routing cache they are broken,
2292                            because we are not allowed to build a multicast path
2293                            with a loopback source addr (look, the routing cache
2294                            cannot know that the ttl is zero, so the packet
2295                            will not leave this host and the route is valid).
2296                            Luckily, this hack is a good workaround.
2297                          */
2298
2299                         fl.oif = dev_out->ifindex;
2300                         goto make_route;
2301                 }
2302                 if (dev_out)
2303                         dev_put(dev_out);
2304                 dev_out = NULL;
2305         }
2306
2307
2308         if (oldflp->oif) {
2309                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2310                 err = -ENODEV;
2311                 if (dev_out == NULL)
2312                         goto out;
2313
2314                 /* RACE: Check return value of inet_select_addr instead. */
2315                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2316                         dev_put(dev_out);
2317                         goto out;       /* Wrong error code */
2318                 }
2319
2320                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2321                         if (!fl.fl4_src)
2322                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2323                                                               RT_SCOPE_LINK);
2324                         goto make_route;
2325                 }
2326                 if (!fl.fl4_src) {
2327                         if (MULTICAST(oldflp->fl4_dst))
2328                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2329                                                               fl.fl4_scope);
2330                         else if (!oldflp->fl4_dst)
2331                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2332                                                               RT_SCOPE_HOST);
2333                 }
2334         }
2335
2336         if (!fl.fl4_dst) {
2337                 fl.fl4_dst = fl.fl4_src;
2338                 if (!fl.fl4_dst)
2339                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2340                 if (dev_out)
2341                         dev_put(dev_out);
2342                 dev_out = init_net.loopback_dev;
2343                 dev_hold(dev_out);
2344                 fl.oif = init_net.loopback_dev->ifindex;
2345                 res.type = RTN_LOCAL;
2346                 flags |= RTCF_LOCAL;
2347                 goto make_route;
2348         }
2349
2350         if (fib_lookup(&fl, &res)) {
2351                 res.fi = NULL;
2352                 if (oldflp->oif) {
2353                         /* Apparently, the routing tables are wrong. Assume
2354                            that the destination is on-link.
2355
2356                            WHY? DW.
2357                            Because we are allowed to send to an iface
2358                            even if it has NO routes and NO assigned
2359                            addresses. When oif is specified, the routing
2360                            tables are looked up with only one purpose:
2361                            to check whether the destination is gatewayed rather
2362                            than direct. Moreover, if MSG_DONTROUTE is set,
2363                            we send the packet, ignoring both routing tables
2364                            and ifaddr state. --ANK
2365
2366
2367                            We could do this even if oif is unknown,
2368                            as IPv6 likely does, but we do not.
2369                          */
2370
2371                         if (fl.fl4_src == 0)
2372                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2373                                                               RT_SCOPE_LINK);
2374                         res.type = RTN_UNICAST;
2375                         goto make_route;
2376                 }
2377                 if (dev_out)
2378                         dev_put(dev_out);
2379                 err = -ENETUNREACH;
2380                 goto out;
2381         }
2382         free_res = 1;
2383
2384         if (res.type == RTN_LOCAL) {
2385                 if (!fl.fl4_src)
2386                         fl.fl4_src = fl.fl4_dst;
2387                 if (dev_out)
2388                         dev_put(dev_out);
2389                 dev_out = init_net.loopback_dev;
2390                 dev_hold(dev_out);
2391                 fl.oif = dev_out->ifindex;
2392                 if (res.fi)
2393                         fib_info_put(res.fi);
2394                 res.fi = NULL;
2395                 flags |= RTCF_LOCAL;
2396                 goto make_route;
2397         }
2398
2399 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2400         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2401                 fib_select_multipath(&fl, &res);
2402         else
2403 #endif
2404         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2405                 fib_select_default(&fl, &res);
2406
2407         if (!fl.fl4_src)
2408                 fl.fl4_src = FIB_RES_PREFSRC(res);
2409
2410         if (dev_out)
2411                 dev_put(dev_out);
2412         dev_out = FIB_RES_DEV(res);
2413         dev_hold(dev_out);
2414         fl.oif = dev_out->ifindex;
2415
2416
2417 make_route:
2418         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2419
2420
2421         if (free_res)
2422                 fib_res_put(&res);
2423         if (dev_out)
2424                 dev_put(dev_out);
2425 out:    return err;
2426 }
2427
2428 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2429 {
2430         unsigned hash;
2431         struct rtable *rth;
2432
2433         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2434
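	/* Output fast path: probe the cache for a matching (dst, src, oif,
	 * mark, tos) key with iif == 0; on a miss fall back to the full
	 * resolver in ip_route_output_slow().
	 */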
2435         rcu_read_lock_bh();
2436         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2437                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2438                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2439                     rth->fl.fl4_src == flp->fl4_src &&
2440                     rth->fl.iif == 0 &&
2441                     rth->fl.oif == flp->oif &&
2442                     rth->fl.mark == flp->mark &&
2443                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2444                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2445                         dst_use(&rth->u.dst, jiffies);
2446                         RT_CACHE_STAT_INC(out_hit);
2447                         rcu_read_unlock_bh();
2448                         *rp = rth;
2449                         return 0;
2450                 }
2451                 RT_CACHE_STAT_INC(out_hlist_search);
2452         }
2453         rcu_read_unlock_bh();
2454
2455         return ip_route_output_slow(rp, flp);
2456 }
2457
2458 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2459
2460 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2461 {
2462 }
2463
2464 static struct dst_ops ipv4_dst_blackhole_ops = {
2465         .family                 =       AF_INET,
2466         .protocol               =       __constant_htons(ETH_P_IP),
2467         .destroy                =       ipv4_dst_destroy,
2468         .check                  =       ipv4_dst_check,
2469         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2470         .entry_size             =       sizeof(struct rtable),
2471 };
2472
2473
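/* Clone *rp into a "blackhole" route: same flow key, metrics and peer, but
 * with input and output wired to dst_discard so traffic is silently dropped.
 * Used below when __xfrm_lookup() returns -EREMOTE.
 */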
2474 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2475 {
2476         struct rtable *ort = *rp;
2477         struct rtable *rt = (struct rtable *)
2478                 dst_alloc(&ipv4_dst_blackhole_ops);
2479
2480         if (rt) {
2481                 struct dst_entry *new = &rt->u.dst;
2482
2483                 atomic_set(&new->__refcnt, 1);
2484                 new->__use = 1;
2485                 new->input = dst_discard;
2486                 new->output = dst_discard;
2487                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2488
2489                 new->dev = ort->u.dst.dev;
2490                 if (new->dev)
2491                         dev_hold(new->dev);
2492
2493                 rt->fl = ort->fl;
2494
2495                 rt->idev = ort->idev;
2496                 if (rt->idev)
2497                         in_dev_hold(rt->idev);
2498                 rt->rt_flags = ort->rt_flags;
2499                 rt->rt_type = ort->rt_type;
2500                 rt->rt_dst = ort->rt_dst;
2501                 rt->rt_src = ort->rt_src;
2502                 rt->rt_iif = ort->rt_iif;
2503                 rt->rt_gateway = ort->rt_gateway;
2504                 rt->rt_spec_dst = ort->rt_spec_dst;
2505                 rt->peer = ort->peer;
2506                 if (rt->peer)
2507                         atomic_inc(&rt->peer->refcnt);
2508
2509                 dst_free(new);
2510         }
2511
2512         dst_release(&(*rp)->u.dst);
2513         *rp = rt;
2514         return (rt ? 0 : -ENOMEM);
2515 }
2516
2517 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2518 {
2519         int err;
2520
2521         if ((err = __ip_route_output_key(rp, flp)) != 0)
2522                 return err;
2523
2524         if (flp->proto) {
2525                 if (!flp->fl4_src)
2526                         flp->fl4_src = (*rp)->rt_src;
2527                 if (!flp->fl4_dst)
2528                         flp->fl4_dst = (*rp)->rt_dst;
2529                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2530                 if (err == -EREMOTE)
2531                         err = ipv4_dst_blackhole(rp, flp, sk);
2532
2533                 return err;
2534         }
2535
2536         return 0;
2537 }
2538
2539 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2540
2541 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2542 {
2543         return ip_route_output_flow(rp, flp, NULL, 0);
2544 }
2545
2546 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2547                         int nowait, unsigned int flags)
2548 {
2549         struct rtable *rt = (struct rtable*)skb->dst;
2550         struct rtmsg *r;
2551         struct nlmsghdr *nlh;
2552         long expires;
2553         u32 id = 0, ts = 0, tsage = 0, error;
2554
2555         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2556         if (nlh == NULL)
2557                 return -EMSGSIZE;
2558
2559         r = nlmsg_data(nlh);
2560         r->rtm_family    = AF_INET;
2561         r->rtm_dst_len  = 32;
2562         r->rtm_src_len  = 0;
2563         r->rtm_tos      = rt->fl.fl4_tos;
2564         r->rtm_table    = RT_TABLE_MAIN;
2565         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2566         r->rtm_type     = rt->rt_type;
2567         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2568         r->rtm_protocol = RTPROT_UNSPEC;
2569         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2570         if (rt->rt_flags & RTCF_NOTIFY)
2571                 r->rtm_flags |= RTM_F_NOTIFY;
2572
2573         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2574
2575         if (rt->fl.fl4_src) {
2576                 r->rtm_src_len = 32;
2577                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2578         }
2579         if (rt->u.dst.dev)
2580                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2581 #ifdef CONFIG_NET_CLS_ROUTE
2582         if (rt->u.dst.tclassid)
2583                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2584 #endif
2585         if (rt->fl.iif)
2586                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2587         else if (rt->rt_src != rt->fl.fl4_src)
2588                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2589
2590         if (rt->rt_dst != rt->rt_gateway)
2591                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2592
2593         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2594                 goto nla_put_failure;
2595
2596         error = rt->u.dst.error;
2597         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2598         if (rt->peer) {
2599                 id = rt->peer->ip_id_count;
2600                 if (rt->peer->tcp_ts_stamp) {
2601                         ts = rt->peer->tcp_ts;
2602                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2603                 }
2604         }
2605
2606         if (rt->fl.iif) {
2607 #ifdef CONFIG_IP_MROUTE
2608                 __be32 dst = rt->rt_dst;
2609
2610                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2611                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2612                         int err = ipmr_get_route(skb, r, nowait);
2613                         if (err <= 0) {
2614                                 if (!nowait) {
2615                                         if (err == 0)
2616                                                 return 0;
2617                                         goto nla_put_failure;
2618                                 } else {
2619                                         if (err == -EMSGSIZE)
2620                                                 goto nla_put_failure;
2621                                         error = err;
2622                                 }
2623                         }
2624                 } else
2625 #endif
2626                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2627         }
2628
2629         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2630                                expires, error) < 0)
2631                 goto nla_put_failure;
2632
2633         return nlmsg_end(skb, nlh);
2634
2635 nla_put_failure:
2636         nlmsg_cancel(skb, nlh);
2637         return -EMSGSIZE;
2638 }
2639
2640 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2641 {
2642         struct net *net = in_skb->sk->sk_net;
2643         struct rtmsg *rtm;
2644         struct nlattr *tb[RTA_MAX+1];
2645         struct rtable *rt = NULL;
2646         __be32 dst = 0;
2647         __be32 src = 0;
2648         u32 iif;
2649         int err;
2650         struct sk_buff *skb;
2651
2652         if (net != &init_net)
2653                 return -EINVAL;
2654
2655         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2656         if (err < 0)
2657                 goto errout;
2658
2659         rtm = nlmsg_data(nlh);
2660
2661         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2662         if (skb == NULL) {
2663                 err = -ENOBUFS;
2664                 goto errout;
2665         }
2666
2667         /* Reserve room for dummy headers; this skb can pass
2668            through a good chunk of the routing engine.
2669          */
2670         skb_reset_mac_header(skb);
2671         skb_reset_network_header(skb);
2672
2673         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2674         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2675         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2676
2677         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2678         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2679         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2680
2681         if (iif) {
2682                 struct net_device *dev;
2683
2684                 dev = __dev_get_by_index(&init_net, iif);
2685                 if (dev == NULL) {
2686                         err = -ENODEV;
2687                         goto errout_free;
2688                 }
2689
2690                 skb->protocol   = htons(ETH_P_IP);
2691                 skb->dev        = dev;
2692                 local_bh_disable();
2693                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2694                 local_bh_enable();
2695
2696                 rt = (struct rtable*) skb->dst;
2697                 if (err == 0 && rt->u.dst.error)
2698                         err = -rt->u.dst.error;
2699         } else {
2700                 struct flowi fl = {
2701                         .nl_u = {
2702                                 .ip4_u = {
2703                                         .daddr = dst,
2704                                         .saddr = src,
2705                                         .tos = rtm->rtm_tos,
2706                                 },
2707                         },
2708                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2709                 };
2710                 err = ip_route_output_key(&rt, &fl);
2711         }
2712
2713         if (err)
2714                 goto errout_free;
2715
2716         skb->dst = &rt->u.dst;
2717         if (rtm->rtm_flags & RTM_F_NOTIFY)
2718                 rt->rt_flags |= RTCF_NOTIFY;
2719
2720         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2721                                 RTM_NEWROUTE, 0, 0);
2722         if (err <= 0)
2723                 goto errout_free;
2724
2725         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2726 errout:
2727         return err;
2728
2729 errout_free:
2730         kfree_skb(skb);
2731         goto errout;
2732 }
2733
2734 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2735 {
2736         struct rtable *rt;
2737         int h, s_h;
2738         int idx, s_idx;
2739
2740         s_h = cb->args[0];
2741         if (s_h < 0)
2742                 s_h = 0;
2743         s_idx = idx = cb->args[1];
2744         for (h = s_h; h <= rt_hash_mask; h++) {
2745                 rcu_read_lock_bh();
2746                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2747                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2748                         if (idx < s_idx)
2749                                 continue;
2750                         skb->dst = dst_clone(&rt->u.dst);
2751                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2752                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2753                                          1, NLM_F_MULTI) <= 0) {
2754                                 dst_release(xchg(&skb->dst, NULL));
2755                                 rcu_read_unlock_bh();
2756                                 goto done;
2757                         }
2758                         dst_release(xchg(&skb->dst, NULL));
2759                 }
2760                 rcu_read_unlock_bh();
2761                 s_idx = 0;
2762         }
2763
2764 done:
2765         cb->args[0] = h;
2766         cb->args[1] = idx;
2767         return skb->len;
2768 }
2769
2770 void ip_rt_multicast_event(struct in_device *in_dev)
2771 {
2772         rt_cache_flush(0);
2773 }
2774
2775 #ifdef CONFIG_SYSCTL
2776 static int flush_delay;
2777
2778 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2779                                         struct file *filp, void __user *buffer,
2780                                         size_t *lenp, loff_t *ppos)
2781 {
2782         if (write) {
2783                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2784                 rt_cache_flush(flush_delay);
2785                 return 0;
2786         }
2787
2788         return -EINVAL;
2789 }
2790
2791 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2792                                                 int __user *name,
2793                                                 int nlen,
2794                                                 void __user *oldval,
2795                                                 size_t __user *oldlenp,
2796                                                 void __user *newval,
2797                                                 size_t newlen)
2798 {
2799         int delay;
2800         if (newlen != sizeof(int))
2801                 return -EINVAL;
2802         if (get_user(delay, (int __user *)newval))
2803                 return -EFAULT;
2804         rt_cache_flush(delay);
2805         return 0;
2806 }
2807
2808 ctl_table ipv4_route_table[] = {
2809         {
2810                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2811                 .procname       = "flush",
2812                 .data           = &flush_delay,
2813                 .maxlen         = sizeof(int),
2814                 .mode           = 0200,
2815                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2816                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2817         },
2818         {
2819                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2820                 .procname       = "min_delay",
2821                 .data           = &ip_rt_min_delay,
2822                 .maxlen         = sizeof(int),
2823                 .mode           = 0644,
2824                 .proc_handler   = &proc_dointvec_jiffies,
2825                 .strategy       = &sysctl_jiffies,
2826         },
2827         {
2828                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2829                 .procname       = "max_delay",
2830                 .data           = &ip_rt_max_delay,
2831                 .maxlen         = sizeof(int),
2832                 .mode           = 0644,
2833                 .proc_handler   = &proc_dointvec_jiffies,
2834                 .strategy       = &sysctl_jiffies,
2835         },
2836         {
2837                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2838                 .procname       = "gc_thresh",
2839                 .data           = &ipv4_dst_ops.gc_thresh,
2840                 .maxlen         = sizeof(int),
2841                 .mode           = 0644,
2842                 .proc_handler   = &proc_dointvec,
2843         },
2844         {
2845                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2846                 .procname       = "max_size",
2847                 .data           = &ip_rt_max_size,
2848                 .maxlen         = sizeof(int),
2849                 .mode           = 0644,
2850                 .proc_handler   = &proc_dointvec,
2851         },
2852         {
2853                 /*  Deprecated. Use gc_min_interval_ms */
2854
2855                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2856                 .procname       = "gc_min_interval",
2857                 .data           = &ip_rt_gc_min_interval,
2858                 .maxlen         = sizeof(int),
2859                 .mode           = 0644,
2860                 .proc_handler   = &proc_dointvec_jiffies,
2861                 .strategy       = &sysctl_jiffies,
2862         },
2863         {
2864                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2865                 .procname       = "gc_min_interval_ms",
2866                 .data           = &ip_rt_gc_min_interval,
2867                 .maxlen         = sizeof(int),
2868                 .mode           = 0644,
2869                 .proc_handler   = &proc_dointvec_ms_jiffies,
2870                 .strategy       = &sysctl_ms_jiffies,
2871         },
2872         {
2873                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2874                 .procname       = "gc_timeout",
2875                 .data           = &ip_rt_gc_timeout,
2876                 .maxlen         = sizeof(int),
2877                 .mode           = 0644,
2878                 .proc_handler   = &proc_dointvec_jiffies,
2879                 .strategy       = &sysctl_jiffies,
2880         },
2881         {
2882                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2883                 .procname       = "gc_interval",
2884                 .data           = &ip_rt_gc_interval,
2885                 .maxlen         = sizeof(int),
2886                 .mode           = 0644,
2887                 .proc_handler   = &proc_dointvec_jiffies,
2888                 .strategy       = &sysctl_jiffies,
2889         },
2890         {
2891                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2892                 .procname       = "redirect_load",
2893                 .data           = &ip_rt_redirect_load,
2894                 .maxlen         = sizeof(int),
2895                 .mode           = 0644,
2896                 .proc_handler   = &proc_dointvec,
2897         },
2898         {
2899                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2900                 .procname       = "redirect_number",
2901                 .data           = &ip_rt_redirect_number,
2902                 .maxlen         = sizeof(int),
2903                 .mode           = 0644,
2904                 .proc_handler   = &proc_dointvec,
2905         },
2906         {
2907                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2908                 .procname       = "redirect_silence",
2909                 .data           = &ip_rt_redirect_silence,
2910                 .maxlen         = sizeof(int),
2911                 .mode           = 0644,
2912                 .proc_handler   = &proc_dointvec,
2913         },
2914         {
2915                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2916                 .procname       = "error_cost",
2917                 .data           = &ip_rt_error_cost,
2918                 .maxlen         = sizeof(int),
2919                 .mode           = 0644,
2920                 .proc_handler   = &proc_dointvec,
2921         },
2922         {
2923                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2924                 .procname       = "error_burst",
2925                 .data           = &ip_rt_error_burst,
2926                 .maxlen         = sizeof(int),
2927                 .mode           = 0644,
2928                 .proc_handler   = &proc_dointvec,
2929         },
2930         {
2931                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2932                 .procname       = "gc_elasticity",
2933                 .data           = &ip_rt_gc_elasticity,
2934                 .maxlen         = sizeof(int),
2935                 .mode           = 0644,
2936                 .proc_handler   = &proc_dointvec,
2937         },
2938         {
2939                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2940                 .procname       = "mtu_expires",
2941                 .data           = &ip_rt_mtu_expires,
2942                 .maxlen         = sizeof(int),
2943                 .mode           = 0644,
2944                 .proc_handler   = &proc_dointvec_jiffies,
2945                 .strategy       = &sysctl_jiffies,
2946         },
2947         {
2948                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2949                 .procname       = "min_pmtu",
2950                 .data           = &ip_rt_min_pmtu,
2951                 .maxlen         = sizeof(int),
2952                 .mode           = 0644,
2953                 .proc_handler   = &proc_dointvec,
2954         },
2955         {
2956                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2957                 .procname       = "min_adv_mss",
2958                 .data           = &ip_rt_min_advmss,
2959                 .maxlen         = sizeof(int),
2960                 .mode           = 0644,
2961                 .proc_handler   = &proc_dointvec,
2962         },
2963         {
2964                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2965                 .procname       = "secret_interval",
2966                 .data           = &ip_rt_secret_interval,
2967                 .maxlen         = sizeof(int),
2968                 .mode           = 0644,
2969                 .proc_handler   = &proc_dointvec_jiffies,
2970                 .strategy       = &sysctl_jiffies,
2971         },
2972         { .ctl_name = 0 }
2973 };
2974 #endif
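/*
 * Illustration (not part of the original file): with CONFIG_SYSCTL the
 * table above surfaces as /proc/sys/net/ipv4/route/.  Writing to the
 * write-only "flush" entry hands the written value to
 * ipv4_sysctl_rtcache_flush(), which flushes the route cache, e.g.:
 *
 *	echo 0 > /proc/sys/net/ipv4/route/flush
 *
 * Entries handled by proc_dointvec_jiffies() are exposed in seconds and
 * converted to jiffies internally; gc_min_interval_ms exposes the same
 * ip_rt_gc_min_interval variable in milliseconds via
 * proc_dointvec_ms_jiffies().
 */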
2975
2976 #ifdef CONFIG_NET_CLS_ROUTE
2977 struct ip_rt_acct *ip_rt_acct __read_mostly;
2978 #endif /* CONFIG_NET_CLS_ROUTE */
2979
2980 static __initdata unsigned long rhash_entries;
2981 static int __init set_rhash_entries(char *str)
2982 {
2983         if (!str)
2984                 return 0;
2985         rhash_entries = simple_strtoul(str, &str, 0);
2986         return 1;
2987 }
2988 __setup("rhash_entries=", set_rhash_entries);
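/*
 * Illustration (not part of the original file): "rhash_entries=" is a
 * kernel boot parameter.  Booting with, say, "rhash_entries=65536" makes
 * alloc_large_system_hash() below size the route cache hash table at
 * roughly that many buckets (rounded to a power of two) instead of
 * scaling it from the amount of system memory.
 */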
2989
2990 int __init ip_rt_init(void)
2991 {
2992         int rc = 0;
2993
2994         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2995                              (jiffies ^ (jiffies >> 7)));
2996
2997 #ifdef CONFIG_NET_CLS_ROUTE
2998         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
2999         if (!ip_rt_acct)
3000                 panic("IP: failed to allocate ip_rt_acct\n");
3001 #endif
3002
3003         ipv4_dst_ops.kmem_cachep =
3004                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3005                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3006
3007         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3008
3009         rt_hash_table = (struct rt_hash_bucket *)
3010                 alloc_large_system_hash("IP route cache",
3011                                         sizeof(struct rt_hash_bucket),
3012                                         rhash_entries,
3013                                         (num_physpages >= 128 * 1024) ?
3014                                         15 : 17,
3015                                         0,
3016                                         &rt_hash_log,
3017                                         &rt_hash_mask,
3018                                         0);
3019         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3020         rt_hash_lock_init();
3021
3022         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3023         ip_rt_max_size = (rt_hash_mask + 1) * 16;
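	/*
	 * Worked example (illustrative, not in the original file): with a
	 * 2^16-bucket hash table (rt_hash_mask == 0xffff), these defaults
	 * become gc_thresh = 65536 and ip_rt_max_size = 1048576 cached
	 * routes; both stay tunable through the gc_thresh and max_size
	 * sysctls above.
	 */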
3024
3025         devinet_init();
3026         ip_fib_init();
3027
3028         setup_timer(&rt_flush_timer, rt_run_flush, 0);
3029         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3030
3031         /* All the timers started at system startup tend
3032            to synchronize. Perturb them a bit.
3033          */
3034         schedule_delayed_work(&expires_work,
3035                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3036
3037         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3038                 ip_rt_secret_interval;
3039         add_timer(&rt_secret_timer);
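	/*
	 * Worked example (illustrative, assuming the usual default
	 * ip_rt_secret_interval of 10 minutes): the expiry computed above is
	 * jiffies + (random % interval) + interval, so the first secret
	 * rebuild fires somewhere between 10 and 20 minutes after boot
	 * rather than at a fixed, synchronized instant.
	 */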
3040
3041         if (ip_rt_proc_init(&init_net))
3042                 printk(KERN_ERR "Unable to create route proc files\n");
3043 #ifdef CONFIG_XFRM
3044         xfrm_init();
3045         xfrm4_init();
3046 #endif
3047         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3048
3049         return rc;
3050 }
3051
3052 EXPORT_SYMBOL(__ip_select_ident);
3053 EXPORT_SYMBOL(ip_route_input);
3054 EXPORT_SYMBOL(ip_route_output_key);