[NETNS][DST] dst: pass the dst_ops as parameter to the gc functions
[safe/jmp/linux-2.6] / net / ipv4 / route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
/*
 * Extract, as a u32, the bits of oldflp->fl4_tos selected by
 * (IPTOS_RT_MASK | RTO_ONLINK).
 *
 * The argument is parenthesized so that any pointer-valued expression
 * (e.g. "p + 1") can be passed safely.
 */
#define RT_FL_TOS(oldflp) \
    ((u32)((oldflp)->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
/* NOTE(review): presumably the MTU ceiling applied to routes -- its users are
 * outside this excerpt. */
116 #define IP_MAX_MTU      0xFFF0
117
/* Initial value of ip_rt_gc_timeout and of the collector's "expire" state. */
118 #define RT_GC_TIMEOUT (300*HZ)
119
/*
 * Route cache tunables.  rt_cache_flush() reads ip_rt_min_delay and
 * ip_rt_max_delay; rt_check_expire() and rt_garbage_collect() read the
 * ip_rt_gc_* values; rt_secret_rebuild() reads ip_rt_secret_interval.
 * NOTE(review): the HZ-based values look like jiffies counts -- confirm.
 */
120 static int ip_rt_min_delay              = 2 * HZ;
121 static int ip_rt_max_delay              = 10 * HZ;
122 static int ip_rt_max_size;
123 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
124 static int ip_rt_gc_interval            = 60 * HZ;
125 static int ip_rt_gc_min_interval        = HZ / 2;
126 static int ip_rt_redirect_number        = 9;
127 static int ip_rt_redirect_load          = HZ / 50;
128 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
129 static int ip_rt_error_cost             = HZ;
130 static int ip_rt_error_burst            = 5 * HZ;
131 static int ip_rt_gc_elasticity          = 8;
132 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
133 static int ip_rt_min_pmtu               = 512 + 20 + 20;
134 static int ip_rt_min_advmss             = 256;
135 static int ip_rt_secret_interval        = 10 * 60 * HZ;
/* Set by rt_secret_rebuild(); makes the next rt_worker_func() run a full flush. */
136 static int ip_rt_flush_expected;
/* Set by rt_cache_flush() to now + ip_rt_max_delay while a delayed flush is
 * pending; reset to 0 by rt_run_flush(). */
137 static unsigned long rt_deadline;
138
/* printk() at KERN_DEBUG level. */
139 #define RTprint(a...)   printk(KERN_DEBUG a)
140
/*
 * rt_flush_timer is armed by rt_cache_flush(); expires_work runs
 * rt_worker_func(); rt_secret_timer is re-armed by rt_secret_rebuild().
 */
141 static struct timer_list rt_flush_timer;
142 static void rt_worker_func(struct work_struct *work);
143 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
144 static struct timer_list rt_secret_timer;
145
146 /*
147  *      Interface to generic destination cache.
148  */
149
/* Forward declarations of the callbacks installed in ipv4_dst_ops below. */
150 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
151 static void              ipv4_dst_destroy(struct dst_entry *dst);
152 static void              ipv4_dst_ifdown(struct dst_entry *dst,
153                                          struct net_device *dev, int how);
154 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
155 static void              ipv4_link_failure(struct sk_buff *skb);
156 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
157 static int rt_garbage_collect(struct dst_ops *ops);
158
159
/*
 * IPv4 operations table for the generic destination cache.  The collector
 * hook (.gc) is rt_garbage_collect(), which receives the dst_ops pointer as
 * its argument; each entry is sized as a struct rtable.
 */
160 static struct dst_ops ipv4_dst_ops = {
161         .family =               AF_INET,
162         .protocol =             __constant_htons(ETH_P_IP),
163         .gc =                   rt_garbage_collect,
164         .check =                ipv4_dst_check,
165         .destroy =              ipv4_dst_destroy,
166         .ifdown =               ipv4_dst_ifdown,
167         .negative_advice =      ipv4_negative_advice,
168         .link_failure =         ipv4_link_failure,
169         .update_pmtu =          ip_rt_update_pmtu,
170         .local_out =            ip_local_out,
171         .entry_size =           sizeof(struct rtable),
172 };
173
174 #define ECN_OR_COST(class)      TC_PRIO_##class
175
176 const __u8 ip_tos2prio[16] = {
177         TC_PRIO_BESTEFFORT,
178         ECN_OR_COST(FILLER),
179         TC_PRIO_BESTEFFORT,
180         ECN_OR_COST(BESTEFFORT),
181         TC_PRIO_BULK,
182         ECN_OR_COST(BULK),
183         TC_PRIO_BULK,
184         ECN_OR_COST(BULK),
185         TC_PRIO_INTERACTIVE,
186         ECN_OR_COST(INTERACTIVE),
187         TC_PRIO_INTERACTIVE,
188         ECN_OR_COST(INTERACTIVE),
189         TC_PRIO_INTERACTIVE_BULK,
190         ECN_OR_COST(INTERACTIVE_BULK),
191         TC_PRIO_INTERACTIVE_BULK,
192         ECN_OR_COST(INTERACTIVE_BULK)
193 };
194
195
196 /*
197  * Route cache.
198  */
199
200 /* The locking scheme is rather straightforward:
201  *
202  * 1) Read-Copy Update protects the buckets of the central route hash.
203  * 2) Only writers remove entries, and they hold the lock
204  *    as they look at rtable reference counts.
205  * 3) Only readers acquire references to rtable entries,
206  *    they do so with atomic increments and with the
207  *    lock held.
208  */
209
/* One slot of the route hash table: the head of a chain of rtable entries
 * linked through u.dst.rt_next. */
210 struct rt_hash_bucket {
211         struct rtable   *chain;
212 };
213 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
214         defined(CONFIG_PROVE_LOCKING)
215 /*
216  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
217  * The size of this table is a power of two and depends on NR_CPUS.
218  * (With lockdep spinlock_t is quite big, so the table is kept small there.)
219  */
220 #ifdef CONFIG_LOCKDEP
221 # define RT_HASH_LOCK_SZ        256
222 #else
223 # if NR_CPUS >= 32
224 #  define RT_HASH_LOCK_SZ       4096
225 # elif NR_CPUS >= 16
226 #  define RT_HASH_LOCK_SZ       2048
227 # elif NR_CPUS >= 8
228 #  define RT_HASH_LOCK_SZ       1024
229 # elif NR_CPUS >= 4
230 #  define RT_HASH_LOCK_SZ       512
231 # else
232 #  define RT_HASH_LOCK_SZ       256
233 # endif
234 #endif
235
/* Table of RT_HASH_LOCK_SZ spinlocks, allocated by rt_hash_lock_init(). */
236 static spinlock_t       *rt_hash_locks;
/*
 * Address of the spinlock guarding hash bucket "slot"; buckets share locks
 * modulo RT_HASH_LOCK_SZ.  The whole expansion is parenthesized so the result
 * can be used safely inside larger expressions.
 */
# define rt_hash_lock_addr(slot) (&rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)])
238
239 static __init void rt_hash_lock_init(void)
240 {
241         int i;
242
243         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
244                         GFP_KERNEL);
245         if (!rt_hash_locks)
246                 panic("IP: failed to allocate rt_hash_locks\n");
247
248         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
249                 spin_lock_init(&rt_hash_locks[i]);
250 }
251 #else
/* In this configuration there is no lock table: the lock address is NULL. */
252 # define rt_hash_lock_addr(slot) NULL
253
/* Nothing to allocate or initialise when no lock table is built. */
254 static inline void rt_hash_lock_init(void)
255 {
256 }
257 #endif
258
/* Bucket array of the route cache; valid indices are 0..rt_hash_mask. */
259 static struct rt_hash_bucket    *rt_hash_table;
260 static unsigned                 rt_hash_mask;
/* NOTE(review): presumably log2 of the bucket count -- initialised outside
 * this excerpt. */
261 static unsigned int             rt_hash_log;
/* Seed mixed into rt_hash_code(); refreshed with random bytes by
 * rt_run_flush() and rt_secret_rebuild(). */
262 static unsigned int             rt_hash_rnd;
263
/* Per-cpu statistics block; RT_CACHE_STAT_INC() bumps one field on the
 * current CPU's copy. */
264 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
265 #define RT_CACHE_STAT_INC(field) \
266         (__raw_get_cpu_var(rt_cache_stat).field++)
267
268 static int rt_intern_hash(unsigned hash, struct rtable *rth,
269                                 struct rtable **res);
270
271 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
272 {
273         return (jhash_2words(daddr, saddr, rt_hash_rnd)
274                 & rt_hash_mask);
275 }
276
/*
 * Bucket index for a route key: the addresses are force-cast from __be32 to
 * u32, and idx shifted left by 5 is XORed into the source address before
 * both are handed to rt_hash_code().
 */
277 #define rt_hash(daddr, saddr, idx) \
278         rt_hash_code((__force u32)(__be32)(daddr),\
279                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
280
281 #ifdef CONFIG_PROC_FS
/* Per-open iterator state for the rt_cache seq_file: the hash bucket
 * currently being walked (counts down from rt_hash_mask to 0). */
282 struct rt_cache_iter_state {
283         int bucket;
284 };
285
286 static struct rtable *rt_cache_get_first(struct seq_file *seq)
287 {
288         struct rtable *r = NULL;
289         struct rt_cache_iter_state *st = seq->private;
290
291         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
292                 rcu_read_lock_bh();
293                 r = rt_hash_table[st->bucket].chain;
294                 if (r)
295                         break;
296                 rcu_read_unlock_bh();
297         }
298         return rcu_dereference(r);
299 }
300
301 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
302 {
303         struct rt_cache_iter_state *st = seq->private;
304
305         r = r->u.dst.rt_next;
306         while (!r) {
307                 rcu_read_unlock_bh();
308                 if (--st->bucket < 0)
309                         break;
310                 rcu_read_lock_bh();
311                 r = rt_hash_table[st->bucket].chain;
312         }
313         return rcu_dereference(r);
314 }
315
316 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
317 {
318         struct rtable *r = rt_cache_get_first(seq);
319
320         if (r)
321                 while (pos && (r = rt_cache_get_next(seq, r)))
322                         --pos;
323         return pos ? NULL : r;
324 }
325
326 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
327 {
328         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
329 }
330
331 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
332 {
333         struct rtable *r = NULL;
334
335         if (v == SEQ_START_TOKEN)
336                 r = rt_cache_get_first(seq);
337         else
338                 r = rt_cache_get_next(seq, v);
339         ++*pos;
340         return r;
341 }
342
343 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
344 {
345         if (v && v != SEQ_START_TOKEN)
346                 rcu_read_unlock_bh();
347 }
348
349 static int rt_cache_seq_show(struct seq_file *seq, void *v)
350 {
351         if (v == SEQ_START_TOKEN)
352                 seq_printf(seq, "%-127s\n",
353                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
354                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
355                            "HHUptod\tSpecDst");
356         else {
357                 struct rtable *r = v;
358                 char temp[256];
359
360                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
361                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
362                         r->u.dst.dev ? r->u.dst.dev->name : "*",
363                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
364                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
365                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
366                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
367                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
368                         dst_metric(&r->u.dst, RTAX_WINDOW),
369                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
370                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
371                         r->fl.fl4_tos,
372                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
373                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
374                                        dev_queue_xmit) : 0,
375                         r->rt_spec_dst);
376                 seq_printf(seq, "%-127s\n", temp);
377         }
378         return 0;
379 }
380
381 static const struct seq_operations rt_cache_seq_ops = {
382         .start  = rt_cache_seq_start,
383         .next   = rt_cache_seq_next,
384         .stop   = rt_cache_seq_stop,
385         .show   = rt_cache_seq_show,
386 };
387
388 static int rt_cache_seq_open(struct inode *inode, struct file *file)
389 {
390         return seq_open_private(file, &rt_cache_seq_ops,
391                         sizeof(struct rt_cache_iter_state));
392 }
393
394 static const struct file_operations rt_cache_seq_fops = {
395         .owner   = THIS_MODULE,
396         .open    = rt_cache_seq_open,
397         .read    = seq_read,
398         .llseek  = seq_lseek,
399         .release = seq_release_private,
400 };
401
402
403 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
404 {
405         int cpu;
406
407         if (*pos == 0)
408                 return SEQ_START_TOKEN;
409
410         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
411                 if (!cpu_possible(cpu))
412                         continue;
413                 *pos = cpu+1;
414                 return &per_cpu(rt_cache_stat, cpu);
415         }
416         return NULL;
417 }
418
419 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
420 {
421         int cpu;
422
423         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
424                 if (!cpu_possible(cpu))
425                         continue;
426                 *pos = cpu+1;
427                 return &per_cpu(rt_cache_stat, cpu);
428         }
429         return NULL;
430
431 }
432
/* seq_file ->stop for the per-cpu statistics file: nothing to release. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
437
438 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
439 {
440         struct rt_cache_stat *st = v;
441
442         if (v == SEQ_START_TOKEN) {
443                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
444                 return 0;
445         }
446
447         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
448                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
449                    atomic_read(&ipv4_dst_ops.entries),
450                    st->in_hit,
451                    st->in_slow_tot,
452                    st->in_slow_mc,
453                    st->in_no_route,
454                    st->in_brd,
455                    st->in_martian_dst,
456                    st->in_martian_src,
457
458                    st->out_hit,
459                    st->out_slow_tot,
460                    st->out_slow_mc,
461
462                    st->gc_total,
463                    st->gc_ignored,
464                    st->gc_goal_miss,
465                    st->gc_dst_overflow,
466                    st->in_hlist_search,
467                    st->out_hlist_search
468                 );
469         return 0;
470 }
471
472 static const struct seq_operations rt_cpu_seq_ops = {
473         .start  = rt_cpu_seq_start,
474         .next   = rt_cpu_seq_next,
475         .stop   = rt_cpu_seq_stop,
476         .show   = rt_cpu_seq_show,
477 };
478
479
480 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
481 {
482         return seq_open(file, &rt_cpu_seq_ops);
483 }
484
485 static const struct file_operations rt_cpu_seq_fops = {
486         .owner   = THIS_MODULE,
487         .open    = rt_cpu_seq_open,
488         .read    = seq_read,
489         .llseek  = seq_lseek,
490         .release = seq_release,
491 };
492
493 #ifdef CONFIG_NET_CLS_ROUTE
494 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
495                            int length, int *eof, void *data)
496 {
497         unsigned int i;
498
499         if ((offset & 3) || (length & 3))
500                 return -EIO;
501
502         if (offset >= sizeof(struct ip_rt_acct) * 256) {
503                 *eof = 1;
504                 return 0;
505         }
506
507         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
508                 length = sizeof(struct ip_rt_acct) * 256 - offset;
509                 *eof = 1;
510         }
511
512         offset /= sizeof(u32);
513
514         if (length > 0) {
515                 u32 *dst = (u32 *) buffer;
516
517                 *start = buffer;
518                 memset(dst, 0, length);
519
520                 for_each_possible_cpu(i) {
521                         unsigned int j;
522                         u32 *src;
523
524                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
525                         for (j = 0; j < length/4; j++)
526                                 dst[j] += src[j];
527                 }
528         }
529         return length;
530 }
531 #endif
532
533 static __init int ip_rt_proc_init(struct net *net)
534 {
535         struct proc_dir_entry *pde;
536
537         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
538                         &rt_cache_seq_fops);
539         if (!pde)
540                 goto err1;
541
542         pde = create_proc_entry("rt_cache", S_IRUGO, net->proc_net_stat);
543         if (!pde)
544                 goto err2;
545
546         pde->proc_fops = &rt_cpu_seq_fops;
547
548 #ifdef CONFIG_NET_CLS_ROUTE
549         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
550                         ip_rt_acct_read, NULL);
551         if (!pde)
552                 goto err3;
553 #endif
554         return 0;
555
556 #ifdef CONFIG_NET_CLS_ROUTE
557 err3:
558         remove_proc_entry("rt_cache", net->proc_net_stat);
559 #endif
560 err2:
561         remove_proc_entry("rt_cache", net->proc_net);
562 err1:
563         return -ENOMEM;
564 }
565 #else
/* Without CONFIG_PROC_FS there is nothing to register; report success. */
static inline int ip_rt_proc_init(struct net *net)
{
        const int ok = 0;

        return ok;
}
570 #endif /* CONFIG_PROC_FS */
571
572 static __inline__ void rt_free(struct rtable *rt)
573 {
574         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
575 }
576
577 static __inline__ void rt_drop(struct rtable *rt)
578 {
579         ip_rt_put(rt);
580         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
581 }
582
583 static __inline__ int rt_fast_clean(struct rtable *rth)
584 {
585         /* Kill broadcast/multicast entries very aggresively, if they
586            collide in hash table with more useful entries */
587         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
588                 rth->fl.iif && rth->u.dst.rt_next;
589 }
590
591 static __inline__ int rt_valuable(struct rtable *rth)
592 {
593         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
594                 rth->u.dst.expires;
595 }
596
597 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
598 {
599         unsigned long age;
600         int ret = 0;
601
602         if (atomic_read(&rth->u.dst.__refcnt))
603                 goto out;
604
605         ret = 1;
606         if (rth->u.dst.expires &&
607             time_after_eq(jiffies, rth->u.dst.expires))
608                 goto out;
609
610         age = jiffies - rth->u.dst.lastuse;
611         ret = 0;
612         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
613             (age <= tmo2 && rt_valuable(rth)))
614                 goto out;
615         ret = 1;
616 out:    return ret;
617 }
618
619 /* Bits of score are:
620  * 31: very valuable
621  * 30: not quite useless
622  * 29..0: usage counter
623  */
624 static inline u32 rt_score(struct rtable *rt)
625 {
626         u32 score = jiffies - rt->u.dst.lastuse;
627
628         score = ~score & ~(3<<30);
629
630         if (rt_valuable(rt))
631                 score |= (1<<31);
632
633         if (!rt->fl.iif ||
634             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
635                 score |= (1<<30);
636
637         return score;
638 }
639
640 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
641 {
642         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
643                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
644                 (fl1->mark ^ fl2->mark) |
645                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
646                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
647                 (fl1->oif ^ fl2->oif) |
648                 (fl1->iif ^ fl2->iif)) == 0;
649 }
650
651 /*
652  * Perform a full scan of hash table and free all entries.
653  * Can be called by a softirq or a process.
654  * In the later case, we want to be reschedule if necessary
655  */
656 static void rt_do_flush(int process_context)
657 {
658         unsigned int i;
659         struct rtable *rth, *next;
660
661         for (i = 0; i <= rt_hash_mask; i++) {
662                 if (process_context && need_resched())
663                         cond_resched();
664                 rth = rt_hash_table[i].chain;
665                 if (!rth)
666                         continue;
667
668                 spin_lock_bh(rt_hash_lock_addr(i));
669                 rth = rt_hash_table[i].chain;
670                 rt_hash_table[i].chain = NULL;
671                 spin_unlock_bh(rt_hash_lock_addr(i));
672
673                 for (; rth; rth = next) {
674                         next = rth->u.dst.rt_next;
675                         rt_free(rth);
676                 }
677         }
678 }
679
680 static void rt_check_expire(void)
681 {
682         static unsigned int rover;
683         unsigned int i = rover, goal;
684         struct rtable *rth, **rthp;
685         u64 mult;
686
687         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
688         if (ip_rt_gc_timeout > 1)
689                 do_div(mult, ip_rt_gc_timeout);
690         goal = (unsigned int)mult;
691         if (goal > rt_hash_mask)
692                 goal = rt_hash_mask + 1;
693         for (; goal > 0; goal--) {
694                 unsigned long tmo = ip_rt_gc_timeout;
695
696                 i = (i + 1) & rt_hash_mask;
697                 rthp = &rt_hash_table[i].chain;
698
699                 if (need_resched())
700                         cond_resched();
701
702                 if (*rthp == NULL)
703                         continue;
704                 spin_lock_bh(rt_hash_lock_addr(i));
705                 while ((rth = *rthp) != NULL) {
706                         if (rth->u.dst.expires) {
707                                 /* Entry is expired even if it is in use */
708                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
709                                         tmo >>= 1;
710                                         rthp = &rth->u.dst.rt_next;
711                                         continue;
712                                 }
713                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
714                                 tmo >>= 1;
715                                 rthp = &rth->u.dst.rt_next;
716                                 continue;
717                         }
718
719                         /* Cleanup aged off entries. */
720                         *rthp = rth->u.dst.rt_next;
721                         rt_free(rth);
722                 }
723                 spin_unlock_bh(rt_hash_lock_addr(i));
724         }
725         rover = i;
726 }
727
728 /*
729  * rt_worker_func() is run in process context.
730  * If a whole flush was scheduled, it is done.
731  * Else, we call rt_check_expire() to scan part of the hash table
732  */
733 static void rt_worker_func(struct work_struct *work)
734 {
735         if (ip_rt_flush_expected) {
736                 ip_rt_flush_expected = 0;
737                 rt_do_flush(1);
738         } else
739                 rt_check_expire();
740         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
741 }
742
743 /* This can run from both BH and non-BH contexts, the latter
744  * in the case of a forced flush event.
745  */
746 static void rt_run_flush(unsigned long process_context)
747 {
748         rt_deadline = 0;
749
750         get_random_bytes(&rt_hash_rnd, 4);
751
752         rt_do_flush(process_context);
753 }
754
755 static DEFINE_SPINLOCK(rt_flush_lock);
756
757 void rt_cache_flush(int delay)
758 {
759         unsigned long now = jiffies;
760         int user_mode = !in_softirq();
761
762         if (delay < 0)
763                 delay = ip_rt_min_delay;
764
765         spin_lock_bh(&rt_flush_lock);
766
767         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
768                 long tmo = (long)(rt_deadline - now);
769
770                 /* If flush timer is already running
771                    and flush request is not immediate (delay > 0):
772
773                    if deadline is not achieved, prolongate timer to "delay",
774                    otherwise fire it at deadline time.
775                  */
776
777                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
778                         tmo = 0;
779
780                 if (delay > tmo)
781                         delay = tmo;
782         }
783
784         if (delay <= 0) {
785                 spin_unlock_bh(&rt_flush_lock);
786                 rt_run_flush(user_mode);
787                 return;
788         }
789
790         if (rt_deadline == 0)
791                 rt_deadline = now + ip_rt_max_delay;
792
793         mod_timer(&rt_flush_timer, now+delay);
794         spin_unlock_bh(&rt_flush_lock);
795 }
796
797 /*
798  * We change rt_hash_rnd and ask next rt_worker_func() invocation
799  * to perform a flush in process context
800  */
801 static void rt_secret_rebuild(unsigned long dummy)
802 {
803         get_random_bytes(&rt_hash_rnd, 4);
804         ip_rt_flush_expected = 1;
805         cancel_delayed_work(&expires_work);
806         schedule_delayed_work(&expires_work, HZ/10);
807         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
808 }
809
810 /*
811    Short description of GC goals.
812
813    We want to build an algorithm which keeps the routing cache
814    at some equilibrium point, where the number of aged-off entries
815    is kept approximately equal to the number of newly generated ones.
816
817    The current expiration strength is the variable "expire".
818    We try to adjust it dynamically, so that if networking
819    is idle "expire" is large enough to keep enough warm entries,
820    and when load increases it is reduced to limit the cache size.
821  */
822
823 static int rt_garbage_collect(struct dst_ops *ops)
824 {
825         static unsigned long expire = RT_GC_TIMEOUT;
826         static unsigned long last_gc;
827         static int rover;
828         static int equilibrium;
829         struct rtable *rth, **rthp;
830         unsigned long now = jiffies;
831         int goal;
832
833         /*
834          * Garbage collection is pretty expensive,
835          * do not make it too frequently.
836          */
837
838         RT_CACHE_STAT_INC(gc_total);
839
840         if (now - last_gc < ip_rt_gc_min_interval &&
841             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
842                 RT_CACHE_STAT_INC(gc_ignored);
843                 goto out;
844         }
845
846         /* Calculate number of entries, which we want to expire now. */
847         goal = atomic_read(&ipv4_dst_ops.entries) -
848                 (ip_rt_gc_elasticity << rt_hash_log);
849         if (goal <= 0) {
850                 if (equilibrium < ipv4_dst_ops.gc_thresh)
851                         equilibrium = ipv4_dst_ops.gc_thresh;
852                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
853                 if (goal > 0) {
854                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
855                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
856                 }
857         } else {
858                 /* We are in dangerous area. Try to reduce cache really
859                  * aggressively.
860                  */
861                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
862                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
863         }
864
865         if (now - last_gc >= ip_rt_gc_min_interval)
866                 last_gc = now;
867
868         if (goal <= 0) {
869                 equilibrium += goal;
870                 goto work_done;
871         }
872
873         do {
874                 int i, k;
875
876                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
877                         unsigned long tmo = expire;
878
879                         k = (k + 1) & rt_hash_mask;
880                         rthp = &rt_hash_table[k].chain;
881                         spin_lock_bh(rt_hash_lock_addr(k));
882                         while ((rth = *rthp) != NULL) {
883                                 if (!rt_may_expire(rth, tmo, expire)) {
884                                         tmo >>= 1;
885                                         rthp = &rth->u.dst.rt_next;
886                                         continue;
887                                 }
888                                 *rthp = rth->u.dst.rt_next;
889                                 rt_free(rth);
890                                 goal--;
891                         }
892                         spin_unlock_bh(rt_hash_lock_addr(k));
893                         if (goal <= 0)
894                                 break;
895                 }
896                 rover = k;
897
898                 if (goal <= 0)
899                         goto work_done;
900
901                 /* Goal is not achieved. We stop process if:
902
903                    - if expire reduced to zero. Otherwise, expire is halved.
904                    - if table is not full.
905                    - if we are called from interrupt.
906                    - jiffies check is just fallback/debug loop breaker.
907                      We will not spin here for long time in any case.
908                  */
909
910                 RT_CACHE_STAT_INC(gc_goal_miss);
911
912                 if (expire == 0)
913                         break;
914
915                 expire >>= 1;
916 #if RT_CACHE_DEBUG >= 2
917                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
918                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
919 #endif
920
921                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
922                         goto out;
923         } while (!in_softirq() && time_before_eq(jiffies, now));
924
925         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
926                 goto out;
927         if (net_ratelimit())
928                 printk(KERN_WARNING "dst cache overflow\n");
929         RT_CACHE_STAT_INC(gc_dst_overflow);
930         return 1;
931
932 work_done:
933         expire += ip_rt_gc_min_interval;
934         if (expire > ip_rt_gc_timeout ||
935             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
936                 expire = ip_rt_gc_timeout;
937 #if RT_CACHE_DEBUG >= 2
938         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
939                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
940 #endif
941 out:    return 0;
942 }
943
944 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
945 {
946         struct rtable   *rth, **rthp;
947         unsigned long   now;
948         struct rtable *cand, **candp;
949         u32             min_score;
950         int             chain_length;
951         int attempts = !in_softirq();
952
953 restart:
954         chain_length = 0;
955         min_score = ~(u32)0;
956         cand = NULL;
957         candp = NULL;
958         now = jiffies;
959
960         rthp = &rt_hash_table[hash].chain;
961
962         spin_lock_bh(rt_hash_lock_addr(hash));
963         while ((rth = *rthp) != NULL) {
964                 if (compare_keys(&rth->fl, &rt->fl)) {
965                         /* Put it first */
966                         *rthp = rth->u.dst.rt_next;
967                         /*
968                          * Since lookup is lockfree, the deletion
969                          * must be visible to another weakly ordered CPU before
970                          * the insertion at the start of the hash chain.
971                          */
972                         rcu_assign_pointer(rth->u.dst.rt_next,
973                                            rt_hash_table[hash].chain);
974                         /*
975                          * Since lookup is lockfree, the update writes
976                          * must be ordered for consistency on SMP.
977                          */
978                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
979
980                         dst_use(&rth->u.dst, now);
981                         spin_unlock_bh(rt_hash_lock_addr(hash));
982
983                         rt_drop(rt);
984                         *rp = rth;
985                         return 0;
986                 }
987
988                 if (!atomic_read(&rth->u.dst.__refcnt)) {
989                         u32 score = rt_score(rth);
990
991                         if (score <= min_score) {
992                                 cand = rth;
993                                 candp = rthp;
994                                 min_score = score;
995                         }
996                 }
997
998                 chain_length++;
999
1000                 rthp = &rth->u.dst.rt_next;
1001         }
1002
1003         if (cand) {
1004                 /* ip_rt_gc_elasticity used to be average length of chain
1005                  * length, when exceeded gc becomes really aggressive.
1006                  *
1007                  * The second limit is less certain. At the moment it allows
1008                  * only 2 entries per bucket. We will see.
1009                  */
1010                 if (chain_length > ip_rt_gc_elasticity) {
1011                         *candp = cand->u.dst.rt_next;
1012                         rt_free(cand);
1013                 }
1014         }
1015
1016         /* Try to bind route to arp only if it is output
1017            route or unicast forwarding path.
1018          */
1019         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1020                 int err = arp_bind_neighbour(&rt->u.dst);
1021                 if (err) {
1022                         spin_unlock_bh(rt_hash_lock_addr(hash));
1023
1024                         if (err != -ENOBUFS) {
1025                                 rt_drop(rt);
1026                                 return err;
1027                         }
1028
1029                         /* Neighbour tables are full and nothing
1030                            can be released. Try to shrink route cache,
1031                            it is most likely it holds some neighbour records.
1032                          */
1033                         if (attempts-- > 0) {
1034                                 int saved_elasticity = ip_rt_gc_elasticity;
1035                                 int saved_int = ip_rt_gc_min_interval;
1036                                 ip_rt_gc_elasticity     = 1;
1037                                 ip_rt_gc_min_interval   = 0;
1038                                 rt_garbage_collect(&ipv4_dst_ops);
1039                                 ip_rt_gc_min_interval   = saved_int;
1040                                 ip_rt_gc_elasticity     = saved_elasticity;
1041                                 goto restart;
1042                         }
1043
1044                         if (net_ratelimit())
1045                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1046                         rt_drop(rt);
1047                         return -ENOBUFS;
1048                 }
1049         }
1050
1051         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1052 #if RT_CACHE_DEBUG >= 2
1053         if (rt->u.dst.rt_next) {
1054                 struct rtable *trt;
1055                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1056                        NIPQUAD(rt->rt_dst));
1057                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1058                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1059                 printk("\n");
1060         }
1061 #endif
1062         rt_hash_table[hash].chain = rt;
1063         spin_unlock_bh(rt_hash_lock_addr(hash));
1064         *rp = rt;
1065         return 0;
1066 }
1067
1068 void rt_bind_peer(struct rtable *rt, int create)
1069 {
1070         static DEFINE_SPINLOCK(rt_peer_lock);
1071         struct inet_peer *peer;
1072
1073         peer = inet_getpeer(rt->rt_dst, create);
1074
1075         spin_lock_bh(&rt_peer_lock);
1076         if (rt->peer == NULL) {
1077                 rt->peer = peer;
1078                 peer = NULL;
1079         }
1080         spin_unlock_bh(&rt_peer_lock);
1081         if (peer)
1082                 inet_putpeer(peer);
1083 }
1084
1085 /*
1086  * Peer allocation may fail only in serious out-of-memory conditions.  However
1087  * we still can generate some output.
1088  * Random ID selection looks a bit dangerous because we have no chances to
1089  * select ID being unique in a reasonable period of time.
1090  * But broken packet identifier may be better than no packet at all.
1091  */
1092 static void ip_select_fb_ident(struct iphdr *iph)
1093 {
1094         static DEFINE_SPINLOCK(ip_fb_id_lock);
1095         static u32 ip_fallback_id;
1096         u32 salt;
1097
1098         spin_lock_bh(&ip_fb_id_lock);
1099         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1100         iph->id = htons(salt & 0xFFFF);
1101         ip_fallback_id = salt;
1102         spin_unlock_bh(&ip_fb_id_lock);
1103 }
1104
1105 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1106 {
1107         struct rtable *rt = (struct rtable *) dst;
1108
1109         if (rt) {
1110                 if (rt->peer == NULL)
1111                         rt_bind_peer(rt, 1);
1112
1113                 /* If peer is attached to destination, it is never detached,
1114                    so that we need not to grab a lock to dereference it.
1115                  */
1116                 if (rt->peer) {
1117                         iph->id = htons(inet_getid(rt->peer, more));
1118                         return;
1119                 }
1120         } else
1121                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1122                        __builtin_return_address(0));
1123
1124         ip_select_fb_ident(iph);
1125 }
1126
1127 static void rt_del(unsigned hash, struct rtable *rt)
1128 {
1129         struct rtable **rthp;
1130
1131         spin_lock_bh(rt_hash_lock_addr(hash));
1132         ip_rt_put(rt);
1133         for (rthp = &rt_hash_table[hash].chain; *rthp;
1134              rthp = &(*rthp)->u.dst.rt_next)
1135                 if (*rthp == rt) {
1136                         *rthp = rt->u.dst.rt_next;
1137                         rt_free(rt);
1138                         break;
1139                 }
1140         spin_unlock_bh(rt_hash_lock_addr(hash));
1141 }
1142
1143 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1144                     __be32 saddr, struct net_device *dev)
1145 {
1146         int i, k;
1147         struct in_device *in_dev = in_dev_get(dev);
1148         struct rtable *rth, **rthp;
1149         __be32  skeys[2] = { saddr, 0 };
1150         int  ikeys[2] = { dev->ifindex, 0 };
1151         struct netevent_redirect netevent;
1152
1153         if (!in_dev)
1154                 return;
1155
1156         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1157             || ipv4_is_multicast(new_gw) || ipv4_is_badclass(new_gw)
1158             || ipv4_is_zeronet(new_gw))
1159                 goto reject_redirect;
1160
1161         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1162                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1163                         goto reject_redirect;
1164                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1165                         goto reject_redirect;
1166         } else {
1167                 if (inet_addr_type(&init_net, new_gw) != RTN_UNICAST)
1168                         goto reject_redirect;
1169         }
1170
1171         for (i = 0; i < 2; i++) {
1172                 for (k = 0; k < 2; k++) {
1173                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1174
1175                         rthp=&rt_hash_table[hash].chain;
1176
1177                         rcu_read_lock();
1178                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1179                                 struct rtable *rt;
1180
1181                                 if (rth->fl.fl4_dst != daddr ||
1182                                     rth->fl.fl4_src != skeys[i] ||
1183                                     rth->fl.oif != ikeys[k] ||
1184                                     rth->fl.iif != 0) {
1185                                         rthp = &rth->u.dst.rt_next;
1186                                         continue;
1187                                 }
1188
1189                                 if (rth->rt_dst != daddr ||
1190                                     rth->rt_src != saddr ||
1191                                     rth->u.dst.error ||
1192                                     rth->rt_gateway != old_gw ||
1193                                     rth->u.dst.dev != dev)
1194                                         break;
1195
1196                                 dst_hold(&rth->u.dst);
1197                                 rcu_read_unlock();
1198
1199                                 rt = dst_alloc(&ipv4_dst_ops);
1200                                 if (rt == NULL) {
1201                                         ip_rt_put(rth);
1202                                         in_dev_put(in_dev);
1203                                         return;
1204                                 }
1205
1206                                 /* Copy all the information. */
1207                                 *rt = *rth;
1208                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1209                                 rt->u.dst.__use         = 1;
1210                                 atomic_set(&rt->u.dst.__refcnt, 1);
1211                                 rt->u.dst.child         = NULL;
1212                                 if (rt->u.dst.dev)
1213                                         dev_hold(rt->u.dst.dev);
1214                                 if (rt->idev)
1215                                         in_dev_hold(rt->idev);
1216                                 rt->u.dst.obsolete      = 0;
1217                                 rt->u.dst.lastuse       = jiffies;
1218                                 rt->u.dst.path          = &rt->u.dst;
1219                                 rt->u.dst.neighbour     = NULL;
1220                                 rt->u.dst.hh            = NULL;
1221                                 rt->u.dst.xfrm          = NULL;
1222
1223                                 rt->rt_flags            |= RTCF_REDIRECTED;
1224
1225                                 /* Gateway is different ... */
1226                                 rt->rt_gateway          = new_gw;
1227
1228                                 /* Redirect received -> path was valid */
1229                                 dst_confirm(&rth->u.dst);
1230
1231                                 if (rt->peer)
1232                                         atomic_inc(&rt->peer->refcnt);
1233
1234                                 if (arp_bind_neighbour(&rt->u.dst) ||
1235                                     !(rt->u.dst.neighbour->nud_state &
1236                                             NUD_VALID)) {
1237                                         if (rt->u.dst.neighbour)
1238                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1239                                         ip_rt_put(rth);
1240                                         rt_drop(rt);
1241                                         goto do_next;
1242                                 }
1243
1244                                 netevent.old = &rth->u.dst;
1245                                 netevent.new = &rt->u.dst;
1246                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1247                                                         &netevent);
1248
1249                                 rt_del(hash, rth);
1250                                 if (!rt_intern_hash(hash, rt, &rt))
1251                                         ip_rt_put(rt);
1252                                 goto do_next;
1253                         }
1254                         rcu_read_unlock();
1255                 do_next:
1256                         ;
1257                 }
1258         }
1259         in_dev_put(in_dev);
1260         return;
1261
1262 reject_redirect:
1263 #ifdef CONFIG_IP_ROUTE_VERBOSE
1264         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1265                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1266                         "%u.%u.%u.%u ignored.\n"
1267                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1268                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1269                        NIPQUAD(saddr), NIPQUAD(daddr));
1270 #endif
1271         in_dev_put(in_dev);
1272 }
1273
1274 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1275 {
1276         struct rtable *rt = (struct rtable*)dst;
1277         struct dst_entry *ret = dst;
1278
1279         if (rt) {
1280                 if (dst->obsolete) {
1281                         ip_rt_put(rt);
1282                         ret = NULL;
1283                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1284                            rt->u.dst.expires) {
1285                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1286                                                 rt->fl.oif);
1287 #if RT_CACHE_DEBUG >= 1
1288                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1289                                           "%u.%u.%u.%u/%02x dropped\n",
1290                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1291 #endif
1292                         rt_del(hash, rt);
1293                         ret = NULL;
1294                 }
1295         }
1296         return ret;
1297 }
1298
1299 /*
1300  * Algorithm:
1301  *      1. The first ip_rt_redirect_number redirects are sent
1302  *         with exponential backoff, then we stop sending them at all,
1303  *         assuming that the host ignores our redirects.
1304  *      2. If we did not see packets requiring redirects
1305  *         during ip_rt_redirect_silence, we assume that the host
1306  *         forgot redirected route and start to send redirects again.
1307  *
1308  * This algorithm is much cheaper and more intelligent than dumb load limiting
1309  * in icmp.c.
1310  *
1311  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1312  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1313  */
1314
1315 void ip_rt_send_redirect(struct sk_buff *skb)
1316 {
1317         struct rtable *rt = (struct rtable*)skb->dst;
1318         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1319
1320         if (!in_dev)
1321                 return;
1322
1323         if (!IN_DEV_TX_REDIRECTS(in_dev))
1324                 goto out;
1325
1326         /* No redirected packets during ip_rt_redirect_silence;
1327          * reset the algorithm.
1328          */
1329         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1330                 rt->u.dst.rate_tokens = 0;
1331
1332         /* Too many ignored redirects; do not send anything
1333          * set u.dst.rate_last to the last seen redirected packet.
1334          */
1335         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1336                 rt->u.dst.rate_last = jiffies;
1337                 goto out;
1338         }
1339
1340         /* Check for load limit; set rate_last to the latest sent
1341          * redirect.
1342          */
1343         if (rt->u.dst.rate_tokens == 0 ||
1344             time_after(jiffies,
1345                        (rt->u.dst.rate_last +
1346                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1347                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1348                 rt->u.dst.rate_last = jiffies;
1349                 ++rt->u.dst.rate_tokens;
1350 #ifdef CONFIG_IP_ROUTE_VERBOSE
1351                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1352                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1353                     net_ratelimit())
1354                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1355                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1356                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1357                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1358 #endif
1359         }
1360 out:
1361         in_dev_put(in_dev);
1362 }
1363
1364 static int ip_error(struct sk_buff *skb)
1365 {
1366         struct rtable *rt = (struct rtable*)skb->dst;
1367         unsigned long now;
1368         int code;
1369
1370         switch (rt->u.dst.error) {
1371                 case EINVAL:
1372                 default:
1373                         goto out;
1374                 case EHOSTUNREACH:
1375                         code = ICMP_HOST_UNREACH;
1376                         break;
1377                 case ENETUNREACH:
1378                         code = ICMP_NET_UNREACH;
1379                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1380                         break;
1381                 case EACCES:
1382                         code = ICMP_PKT_FILTERED;
1383                         break;
1384         }
1385
1386         now = jiffies;
1387         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1388         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1389                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1390         rt->u.dst.rate_last = now;
1391         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1392                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1393                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1394         }
1395
1396 out:    kfree_skb(skb);
1397         return 0;
1398 }
1399
/*
 *	MTU plateau values, largest first; guess_mtu() returns the first
 *	entry that is smaller than the MTU it is given.
 *
 *	The last two values are not from the RFC but
 *	are needed for AMPRnet AX.25 paths.
 */

static const unsigned short mtu_plateau[] = {
	32000, 17914, 8166, 4352, 2002,
	 1492,   576,  296,  216,  128,
};
1407
1408 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1409 {
1410         int i;
1411
1412         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1413                 if (old_mtu > mtu_plateau[i])
1414                         return mtu_plateau[i];
1415         return 68;
1416 }
1417
1418 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1419 {
1420         int i;
1421         unsigned short old_mtu = ntohs(iph->tot_len);
1422         struct rtable *rth;
1423         __be32  skeys[2] = { iph->saddr, 0, };
1424         __be32  daddr = iph->daddr;
1425         unsigned short est_mtu = 0;
1426
1427         if (ipv4_config.no_pmtu_disc)
1428                 return 0;
1429
1430         for (i = 0; i < 2; i++) {
1431                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1432
1433                 rcu_read_lock();
1434                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1435                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1436                         if (rth->fl.fl4_dst == daddr &&
1437                             rth->fl.fl4_src == skeys[i] &&
1438                             rth->rt_dst  == daddr &&
1439                             rth->rt_src  == iph->saddr &&
1440                             rth->fl.iif == 0 &&
1441                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1442                                 unsigned short mtu = new_mtu;
1443
1444                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1445
1446                                         /* BSD 4.2 compatibility hack :-( */
1447                                         if (mtu == 0 &&
1448                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1449                                             old_mtu >= 68 + (iph->ihl << 2))
1450                                                 old_mtu -= iph->ihl << 2;
1451
1452                                         mtu = guess_mtu(old_mtu);
1453                                 }
1454                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1455                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1456                                                 dst_confirm(&rth->u.dst);
1457                                                 if (mtu < ip_rt_min_pmtu) {
1458                                                         mtu = ip_rt_min_pmtu;
1459                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1460                                                                 (1 << RTAX_MTU);
1461                                                 }
1462                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1463                                                 dst_set_expires(&rth->u.dst,
1464                                                         ip_rt_mtu_expires);
1465                                         }
1466                                         est_mtu = mtu;
1467                                 }
1468                         }
1469                 }
1470                 rcu_read_unlock();
1471         }
1472         return est_mtu ? : new_mtu;
1473 }
1474
1475 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1476 {
1477         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1478             !(dst_metric_locked(dst, RTAX_MTU))) {
1479                 if (mtu < ip_rt_min_pmtu) {
1480                         mtu = ip_rt_min_pmtu;
1481                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1482                 }
1483                 dst->metrics[RTAX_MTU-1] = mtu;
1484                 dst_set_expires(dst, ip_rt_mtu_expires);
1485                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1486         }
1487 }
1488
1489 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1490 {
1491         return NULL;
1492 }
1493
1494 static void ipv4_dst_destroy(struct dst_entry *dst)
1495 {
1496         struct rtable *rt = (struct rtable *) dst;
1497         struct inet_peer *peer = rt->peer;
1498         struct in_device *idev = rt->idev;
1499
1500         if (peer) {
1501                 rt->peer = NULL;
1502                 inet_putpeer(peer);
1503         }
1504
1505         if (idev) {
1506                 rt->idev = NULL;
1507                 in_dev_put(idev);
1508         }
1509 }
1510
1511 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1512                             int how)
1513 {
1514         struct rtable *rt = (struct rtable *) dst;
1515         struct in_device *idev = rt->idev;
1516         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1517                 struct in_device *loopback_idev =
1518                         in_dev_get(dev->nd_net->loopback_dev);
1519                 if (loopback_idev) {
1520                         rt->idev = loopback_idev;
1521                         in_dev_put(idev);
1522                 }
1523         }
1524 }
1525
1526 static void ipv4_link_failure(struct sk_buff *skb)
1527 {
1528         struct rtable *rt;
1529
1530         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1531
1532         rt = (struct rtable *) skb->dst;
1533         if (rt)
1534                 dst_set_expires(&rt->u.dst, 0);
1535 }
1536
1537 static int ip_rt_bug(struct sk_buff *skb)
1538 {
1539         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1540                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1541                 skb->dev ? skb->dev->name : "?");
1542         kfree_skb(skb);
1543         return 0;
1544 }
1545
1546 /*
1547    We do not cache source address of outgoing interface,
1548    because it is used only by IP RR, TS and SRR options,
1549    so that it out of fast path.
1550
1551    BTW remember: "addr" is allowed to be not aligned
1552    in IP options!
1553  */
1554
1555 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1556 {
1557         __be32 src;
1558         struct fib_result res;
1559
1560         if (rt->fl.iif == 0)
1561                 src = rt->rt_src;
1562         else if (fib_lookup(&rt->fl, &res) == 0) {
1563                 src = FIB_RES_PREFSRC(res);
1564                 fib_res_put(&res);
1565         } else
1566                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1567                                         RT_SCOPE_UNIVERSE);
1568         memcpy(addr, &src, 4);
1569 }
1570
#ifdef CONFIG_NET_CLS_ROUTE
/*
 * Merge @tag into the route's tclassid.  The low and the high 16-bit
 * halves are handled independently, and a half is only filled in when it
 * is still zero in the route.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
	struct dst_entry *dst = &rt->u.dst;
	const u32 low_half = 0xFFFF;
	const u32 high_half = 0xFFFF0000;

	if ((dst->tclassid & low_half) == 0)
		dst->tclassid |= tag & low_half;
	if ((dst->tclassid & high_half) == 0)
		dst->tclassid |= tag & high_half;
}
#endif
1580
1581 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1582 {
1583         struct fib_info *fi = res->fi;
1584
1585         if (fi) {
1586                 if (FIB_RES_GW(*res) &&
1587                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1588                         rt->rt_gateway = FIB_RES_GW(*res);
1589                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1590                        sizeof(rt->u.dst.metrics));
1591                 if (fi->fib_mtu == 0) {
1592                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1593                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1594                             rt->rt_gateway != rt->rt_dst &&
1595                             rt->u.dst.dev->mtu > 576)
1596                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1597                 }
1598 #ifdef CONFIG_NET_CLS_ROUTE
1599                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1600 #endif
1601         } else
1602                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1603
1604         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1605                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1606         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1607                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1608         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1609                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1610                                        ip_rt_min_advmss);
1611         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1612                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1613
1614 #ifdef CONFIG_NET_CLS_ROUTE
1615 #ifdef CONFIG_IP_MULTIPLE_TABLES
1616         set_class_tag(rt, fib_rules_tclass(res));
1617 #endif
1618         set_class_tag(rt, itag);
1619 #endif
1620         rt->rt_type = res->type;
1621 }
1622
1623 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1624                                 u8 tos, struct net_device *dev, int our)
1625 {
1626         unsigned hash;
1627         struct rtable *rth;
1628         __be32 spec_dst;
1629         struct in_device *in_dev = in_dev_get(dev);
1630         u32 itag = 0;
1631
1632         /* Primary sanity checks. */
1633
1634         if (in_dev == NULL)
1635                 return -EINVAL;
1636
1637         if (ipv4_is_multicast(saddr) || ipv4_is_badclass(saddr) ||
1638             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1639                 goto e_inval;
1640
1641         if (ipv4_is_zeronet(saddr)) {
1642                 if (!ipv4_is_local_multicast(daddr))
1643                         goto e_inval;
1644                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1645         } else if (fib_validate_source(saddr, 0, tos, 0,
1646                                         dev, &spec_dst, &itag) < 0)
1647                 goto e_inval;
1648
1649         rth = dst_alloc(&ipv4_dst_ops);
1650         if (!rth)
1651                 goto e_nobufs;
1652
1653         rth->u.dst.output= ip_rt_bug;
1654
1655         atomic_set(&rth->u.dst.__refcnt, 1);
1656         rth->u.dst.flags= DST_HOST;
1657         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1658                 rth->u.dst.flags |= DST_NOPOLICY;
1659         rth->fl.fl4_dst = daddr;
1660         rth->rt_dst     = daddr;
1661         rth->fl.fl4_tos = tos;
1662         rth->fl.mark    = skb->mark;
1663         rth->fl.fl4_src = saddr;
1664         rth->rt_src     = saddr;
1665 #ifdef CONFIG_NET_CLS_ROUTE
1666         rth->u.dst.tclassid = itag;
1667 #endif
1668         rth->rt_iif     =
1669         rth->fl.iif     = dev->ifindex;
1670         rth->u.dst.dev  = init_net.loopback_dev;
1671         dev_hold(rth->u.dst.dev);
1672         rth->idev       = in_dev_get(rth->u.dst.dev);
1673         rth->fl.oif     = 0;
1674         rth->rt_gateway = daddr;
1675         rth->rt_spec_dst= spec_dst;
1676         rth->rt_type    = RTN_MULTICAST;
1677         rth->rt_flags   = RTCF_MULTICAST;
1678         if (our) {
1679                 rth->u.dst.input= ip_local_deliver;
1680                 rth->rt_flags |= RTCF_LOCAL;
1681         }
1682
1683 #ifdef CONFIG_IP_MROUTE
1684         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1685                 rth->u.dst.input = ip_mr_input;
1686 #endif
1687         RT_CACHE_STAT_INC(in_slow_mc);
1688
1689         in_dev_put(in_dev);
1690         hash = rt_hash(daddr, saddr, dev->ifindex);
1691         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1692
1693 e_nobufs:
1694         in_dev_put(in_dev);
1695         return -ENOBUFS;
1696
1697 e_inval:
1698         in_dev_put(in_dev);
1699         return -EINVAL;
1700 }
1701
1702
1703 static void ip_handle_martian_source(struct net_device *dev,
1704                                      struct in_device *in_dev,
1705                                      struct sk_buff *skb,
1706                                      __be32 daddr,
1707                                      __be32 saddr)
1708 {
1709         RT_CACHE_STAT_INC(in_martian_src);
1710 #ifdef CONFIG_IP_ROUTE_VERBOSE
1711         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1712                 /*
1713                  *      RFC1812 recommendation, if source is martian,
1714                  *      the only hint is MAC header.
1715                  */
1716                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1717                         "%u.%u.%u.%u, on dev %s\n",
1718                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1719                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1720                         int i;
1721                         const unsigned char *p = skb_mac_header(skb);
1722                         printk(KERN_WARNING "ll header: ");
1723                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1724                                 printk("%02x", *p);
1725                                 if (i < (dev->hard_header_len - 1))
1726                                         printk(":");
1727                         }
1728                         printk("\n");
1729                 }
1730         }
1731 #endif
1732 }
1733
1734 static inline int __mkroute_input(struct sk_buff *skb,
1735                                   struct fib_result* res,
1736                                   struct in_device *in_dev,
1737                                   __be32 daddr, __be32 saddr, u32 tos,
1738                                   struct rtable **result)
1739 {
1740
1741         struct rtable *rth;
1742         int err;
1743         struct in_device *out_dev;
1744         unsigned flags = 0;
1745         __be32 spec_dst;
1746         u32 itag;
1747
1748         /* get a working reference to the output device */
1749         out_dev = in_dev_get(FIB_RES_DEV(*res));
1750         if (out_dev == NULL) {
1751                 if (net_ratelimit())
1752                         printk(KERN_CRIT "Bug in ip_route_input" \
1753                                "_slow(). Please, report\n");
1754                 return -EINVAL;
1755         }
1756
1757
1758         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1759                                   in_dev->dev, &spec_dst, &itag);
1760         if (err < 0) {
1761                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1762                                          saddr);
1763
1764                 err = -EINVAL;
1765                 goto cleanup;
1766         }
1767
1768         if (err)
1769                 flags |= RTCF_DIRECTSRC;
1770
1771         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1772             (IN_DEV_SHARED_MEDIA(out_dev) ||
1773              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1774                 flags |= RTCF_DOREDIRECT;
1775
1776         if (skb->protocol != htons(ETH_P_IP)) {
1777                 /* Not IP (i.e. ARP). Do not create route, if it is
1778                  * invalid for proxy arp. DNAT routes are always valid.
1779                  */
1780                 if (out_dev == in_dev) {
1781                         err = -EINVAL;
1782                         goto cleanup;
1783                 }
1784         }
1785
1786
1787         rth = dst_alloc(&ipv4_dst_ops);
1788         if (!rth) {
1789                 err = -ENOBUFS;
1790                 goto cleanup;
1791         }
1792
1793         atomic_set(&rth->u.dst.__refcnt, 1);
1794         rth->u.dst.flags= DST_HOST;
1795         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1796                 rth->u.dst.flags |= DST_NOPOLICY;
1797         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1798                 rth->u.dst.flags |= DST_NOXFRM;
1799         rth->fl.fl4_dst = daddr;
1800         rth->rt_dst     = daddr;
1801         rth->fl.fl4_tos = tos;
1802         rth->fl.mark    = skb->mark;
1803         rth->fl.fl4_src = saddr;
1804         rth->rt_src     = saddr;
1805         rth->rt_gateway = daddr;
1806         rth->rt_iif     =
1807                 rth->fl.iif     = in_dev->dev->ifindex;
1808         rth->u.dst.dev  = (out_dev)->dev;
1809         dev_hold(rth->u.dst.dev);
1810         rth->idev       = in_dev_get(rth->u.dst.dev);
1811         rth->fl.oif     = 0;
1812         rth->rt_spec_dst= spec_dst;
1813
1814         rth->u.dst.input = ip_forward;
1815         rth->u.dst.output = ip_output;
1816
1817         rt_set_nexthop(rth, res, itag);
1818
1819         rth->rt_flags = flags;
1820
1821         *result = rth;
1822         err = 0;
1823  cleanup:
1824         /* release the working reference to the output device */
1825         in_dev_put(out_dev);
1826         return err;
1827 }
1828
1829 static inline int ip_mkroute_input(struct sk_buff *skb,
1830                                    struct fib_result* res,
1831                                    const struct flowi *fl,
1832                                    struct in_device *in_dev,
1833                                    __be32 daddr, __be32 saddr, u32 tos)
1834 {
1835         struct rtable* rth = NULL;
1836         int err;
1837         unsigned hash;
1838
1839 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1840         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1841                 fib_select_multipath(fl, res);
1842 #endif
1843
1844         /* create a routing cache entry */
1845         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1846         if (err)
1847                 return err;
1848
1849         /* put it into the cache */
1850         hash = rt_hash(daddr, saddr, fl->iif);
1851         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1852 }
1853
1854 /*
1855  *      NOTE. We drop all the packets that have local source
1856  *      addresses, because every properly looped back packet
1857  *      must have the correct destination already attached by the output routine.
1858  *
1859  *      This approach solves two big problems:
1860  *      1. Non-simplex devices are handled properly.
1861  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1862  */
1863
/*
 * ip_route_input_slow - input routing when the route cache has no entry.
 *
 * Classifies the (daddr, saddr) pair as martian, broadcast, local,
 * unreachable or forwardable.  Martians are only counted/logged; for the
 * other cases an rtable is built and passed to rt_intern_hash(), which
 * stores the result in skb->dst (the unicast forwarding case is delegated
 * to ip_mkroute_input()).
 *
 * Returns the result of rt_intern_hash()/ip_mkroute_input() when an entry
 * is built; otherwise -EINVAL (no in_device, martian source, non-IP
 * broadcast), -EHOSTUNREACH (martian destination, or forwarding disabled
 * on the input device) or -ENOBUFS (dst_alloc() failed).
 *
 * The in_device reference taken at entry is dropped at "done"; the
 * fib_result is released there only if fib_lookup() succeeded (free_res).
 */
1864 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1865                                u8 tos, struct net_device *dev)
1866 {
1867         struct fib_result res;
1868         struct in_device *in_dev = in_dev_get(dev);
1869         struct flowi fl = { .nl_u = { .ip4_u =
1870                                       { .daddr = daddr,
1871                                         .saddr = saddr,
1872                                         .tos = tos,
1873                                         .scope = RT_SCOPE_UNIVERSE,
1874                                       } },
1875                             .mark = skb->mark,
1876                             .iif = dev->ifindex };
1877         unsigned        flags = 0;
1878         u32             itag = 0;
1879         struct rtable * rth;
1880         unsigned        hash;
1881         __be32          spec_dst;
1882         int             err = -EINVAL;
1883         int             free_res = 0;
1884
1885         /* IP on this device is disabled. */
1886
1887         if (!in_dev)
1888                 goto out;
1889
1890         /* Check for the most weird martians, which can be not detected
1891            by fib_lookup.
1892          */
1893
1894         if (ipv4_is_multicast(saddr) || ipv4_is_badclass(saddr) ||
1895             ipv4_is_loopback(saddr))
1896                 goto martian_source;
1897
             /* Limited broadcast, or both addresses zero: no FIB lookup is done. */
1898         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1899                 goto brd_input;
1900
1901         /* Accept zero addresses only to limited broadcast;
1902          * I even do not know to fix it or not. Waiting for complains :-)
1903          */
1904         if (ipv4_is_zeronet(saddr))
1905                 goto martian_source;
1906
1907         if (ipv4_is_badclass(daddr) || ipv4_is_zeronet(daddr) ||
1908             ipv4_is_loopback(daddr))
1909                 goto martian_destination;
1910
1911         /*
1912          *      Now we are ready to route packet.
1913          */
1914         if ((err = fib_lookup(&fl, &res)) != 0) {
1915                 if (!IN_DEV_FORWARD(in_dev))
1916                         goto e_hostunreach;
1917                 goto no_route;
1918         }
             /* From here on "done" must release the lookup result. */
1919         free_res = 1;
1920
1921         RT_CACHE_STAT_INC(in_slow_tot);
1922
1923         if (res.type == RTN_BROADCAST)
1924                 goto brd_input;
1925
1926         if (res.type == RTN_LOCAL) {
1927                 int result;
1928                 result = fib_validate_source(saddr, daddr, tos,
1929                                              init_net.loopback_dev->ifindex,
1930                                              dev, &spec_dst, &itag);
1931                 if (result < 0)
1932                         goto martian_source;
1933                 if (result)
1934                         flags |= RTCF_DIRECTSRC;
                     /* The spec_dst written by fib_validate_source() is overridden. */
1935                 spec_dst = daddr;
1936                 goto local_input;
1937         }
1938
1939         if (!IN_DEV_FORWARD(in_dev))
1940                 goto e_hostunreach;
1941         if (res.type != RTN_UNICAST)
1942                 goto martian_destination;
1943
1944         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1945 done:
1946         in_dev_put(in_dev);
1947         if (free_res)
1948                 fib_res_put(&res);
     /* "out" is jumped to directly only when in_dev is NULL: nothing to put. */
1949 out:    return err;
1950
     /* Limited broadcast, 0.0.0.0 -> 0.0.0.0, or a route of type RTN_BROADCAST. */
1951 brd_input:
1952         if (skb->protocol != htons(ETH_P_IP))
1953                 goto e_inval;
1954
1955         if (ipv4_is_zeronet(saddr))
1956                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1957         else {
1958                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1959                                           &itag);
1960                 if (err < 0)
1961                         goto martian_source;
1962                 if (err)
1963                         flags |= RTCF_DIRECTSRC;
1964         }
1965         flags |= RTCF_BROADCAST;
1966         res.type = RTN_BROADCAST;
1967         RT_CACHE_STAT_INC(in_brd);
1968
     /*
      * Shared tail for local, broadcast and unreachable entries: the route
      * is bound to the loopback device and res.type selects the final shape.
      */
1969 local_input:
1970         rth = dst_alloc(&ipv4_dst_ops);
1971         if (!rth)
1972                 goto e_nobufs;
1973
1974         rth->u.dst.output= ip_rt_bug;
1975
1976         atomic_set(&rth->u.dst.__refcnt, 1);
1977         rth->u.dst.flags= DST_HOST;
1978         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1979                 rth->u.dst.flags |= DST_NOPOLICY;
1980         rth->fl.fl4_dst = daddr;
1981         rth->rt_dst     = daddr;
1982         rth->fl.fl4_tos = tos;
1983         rth->fl.mark    = skb->mark;
1984         rth->fl.fl4_src = saddr;
1985         rth->rt_src     = saddr;
1986 #ifdef CONFIG_NET_CLS_ROUTE
1987         rth->u.dst.tclassid = itag;
1988 #endif
1989         rth->rt_iif     =
1990         rth->fl.iif     = dev->ifindex;
1991         rth->u.dst.dev  = init_net.loopback_dev;
1992         dev_hold(rth->u.dst.dev);
1993         rth->idev       = in_dev_get(rth->u.dst.dev);
1994         rth->rt_gateway = daddr;
1995         rth->rt_spec_dst= spec_dst;
1996         rth->u.dst.input= ip_local_deliver;
1997         rth->rt_flags   = flags|RTCF_LOCAL;
             /*
              * Only the no_route path sets RTN_UNREACHABLE; err then still holds
              * the fib_lookup() failure code (with -ESRCH mapped to -ENETUNREACH)
              * and is stored negated in dst.error.
              */
1998         if (res.type == RTN_UNREACHABLE) {
1999                 rth->u.dst.input= ip_error;
2000                 rth->u.dst.error= -err;
2001                 rth->rt_flags   &= ~RTCF_LOCAL;
2002         }
2003         rth->rt_type    = res.type;
2004         hash = rt_hash(daddr, saddr, fl.iif);
2005         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2006         goto done;
2007
     /* fib_lookup() failed while forwarding is enabled on the input device. */
2008 no_route:
2009         RT_CACHE_STAT_INC(in_no_route);
2010         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2011         res.type = RTN_UNREACHABLE;
2012         if (err == -ESRCH)
2013                 err = -ENETUNREACH;
2014         goto local_input;
2015
2016         /*
2017          *      Do not cache martian addresses: they should be logged (RFC1812)
2018          */
2019 martian_destination:
2020         RT_CACHE_STAT_INC(in_martian_dst);
2021 #ifdef CONFIG_IP_ROUTE_VERBOSE
2022         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2023                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2024                         "%u.%u.%u.%u, dev %s\n",
2025                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2026 #endif
2027
     /* martian_destination falls through to here. */
2028 e_hostunreach:
2029         err = -EHOSTUNREACH;
2030         goto done;
2031
2032 e_inval:
2033         err = -EINVAL;
2034         goto done;
2035
2036 e_nobufs:
2037         err = -ENOBUFS;
2038         goto done;
2039
2040 martian_source:
2041         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2042         goto e_inval;
2043 }
2044
2045 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2046                    u8 tos, struct net_device *dev)
2047 {
2048         struct rtable * rth;
2049         unsigned        hash;
2050         int iif = dev->ifindex;
2051
2052         tos &= IPTOS_RT_MASK;
2053         hash = rt_hash(daddr, saddr, iif);
2054
2055         rcu_read_lock();
2056         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2057              rth = rcu_dereference(rth->u.dst.rt_next)) {
2058                 if (rth->fl.fl4_dst == daddr &&
2059                     rth->fl.fl4_src == saddr &&
2060                     rth->fl.iif == iif &&
2061                     rth->fl.oif == 0 &&
2062                     rth->fl.mark == skb->mark &&
2063                     rth->fl.fl4_tos == tos) {
2064                         dst_use(&rth->u.dst, jiffies);
2065                         RT_CACHE_STAT_INC(in_hit);
2066                         rcu_read_unlock();
2067                         skb->dst = (struct dst_entry*)rth;
2068                         return 0;
2069                 }
2070                 RT_CACHE_STAT_INC(in_hlist_search);
2071         }
2072         rcu_read_unlock();
2073
2074         /* Multicast recognition logic is moved from route cache to here.
2075            The problem was that too many Ethernet cards have broken/missing
2076            hardware multicast filters :-( As result the host on multicasting
2077            network acquires a lot of useless route cache entries, sort of
2078            SDR messages from all the world. Now we try to get rid of them.
2079            Really, provided software IP multicast filter is organized
2080            reasonably (at least, hashed), it does not result in a slowdown
2081            comparing with route cache reject entries.
2082            Note, that multicast routers are not affected, because
2083            route cache entry is created eventually.
2084          */
2085         if (ipv4_is_multicast(daddr)) {
2086                 struct in_device *in_dev;
2087
2088                 rcu_read_lock();
2089                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2090                         int our = ip_check_mc(in_dev, daddr, saddr,
2091                                 ip_hdr(skb)->protocol);
2092                         if (our
2093 #ifdef CONFIG_IP_MROUTE
2094                             || (!ipv4_is_local_multicast(daddr) &&
2095                                 IN_DEV_MFORWARD(in_dev))
2096 #endif
2097                             ) {
2098                                 rcu_read_unlock();
2099                                 return ip_route_input_mc(skb, daddr, saddr,
2100                                                          tos, dev, our);
2101                         }
2102                 }
2103                 rcu_read_unlock();
2104                 return -EINVAL;
2105         }
2106         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2107 }
2108
2109 static inline int __mkroute_output(struct rtable **result,
2110                                    struct fib_result* res,
2111                                    const struct flowi *fl,
2112                                    const struct flowi *oldflp,
2113                                    struct net_device *dev_out,
2114                                    unsigned flags)
2115 {
2116         struct rtable *rth;
2117         struct in_device *in_dev;
2118         u32 tos = RT_FL_TOS(oldflp);
2119         int err = 0;
2120
2121         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2122                 return -EINVAL;
2123
2124         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2125                 res->type = RTN_BROADCAST;
2126         else if (ipv4_is_multicast(fl->fl4_dst))
2127                 res->type = RTN_MULTICAST;
2128         else if (ipv4_is_badclass(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2129                 return -EINVAL;
2130
2131         if (dev_out->flags & IFF_LOOPBACK)
2132                 flags |= RTCF_LOCAL;
2133
2134         /* get work reference to inet device */
2135         in_dev = in_dev_get(dev_out);
2136         if (!in_dev)
2137                 return -EINVAL;
2138
2139         if (res->type == RTN_BROADCAST) {
2140                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2141                 if (res->fi) {
2142                         fib_info_put(res->fi);
2143                         res->fi = NULL;
2144                 }
2145         } else if (res->type == RTN_MULTICAST) {
2146                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2147                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2148                                  oldflp->proto))
2149                         flags &= ~RTCF_LOCAL;
2150                 /* If multicast route do not exist use
2151                    default one, but do not gateway in this case.
2152                    Yes, it is hack.
2153                  */
2154                 if (res->fi && res->prefixlen < 4) {
2155                         fib_info_put(res->fi);
2156                         res->fi = NULL;
2157                 }
2158         }
2159
2160
2161         rth = dst_alloc(&ipv4_dst_ops);
2162         if (!rth) {
2163                 err = -ENOBUFS;
2164                 goto cleanup;
2165         }
2166
2167         atomic_set(&rth->u.dst.__refcnt, 1);
2168         rth->u.dst.flags= DST_HOST;
2169         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2170                 rth->u.dst.flags |= DST_NOXFRM;
2171         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2172                 rth->u.dst.flags |= DST_NOPOLICY;
2173
2174         rth->fl.fl4_dst = oldflp->fl4_dst;
2175         rth->fl.fl4_tos = tos;
2176         rth->fl.fl4_src = oldflp->fl4_src;
2177         rth->fl.oif     = oldflp->oif;
2178         rth->fl.mark    = oldflp->mark;
2179         rth->rt_dst     = fl->fl4_dst;
2180         rth->rt_src     = fl->fl4_src;
2181         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2182         /* get references to the devices that are to be hold by the routing
2183            cache entry */
2184         rth->u.dst.dev  = dev_out;
2185         dev_hold(dev_out);
2186         rth->idev       = in_dev_get(dev_out);
2187         rth->rt_gateway = fl->fl4_dst;
2188         rth->rt_spec_dst= fl->fl4_src;
2189
2190         rth->u.dst.output=ip_output;
2191
2192         RT_CACHE_STAT_INC(out_slow_tot);
2193
2194         if (flags & RTCF_LOCAL) {
2195                 rth->u.dst.input = ip_local_deliver;
2196                 rth->rt_spec_dst = fl->fl4_dst;
2197         }
2198         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2199                 rth->rt_spec_dst = fl->fl4_src;
2200                 if (flags & RTCF_LOCAL &&
2201                     !(dev_out->flags & IFF_LOOPBACK)) {
2202                         rth->u.dst.output = ip_mc_output;
2203                         RT_CACHE_STAT_INC(out_slow_mc);
2204                 }
2205 #ifdef CONFIG_IP_MROUTE
2206                 if (res->type == RTN_MULTICAST) {
2207                         if (IN_DEV_MFORWARD(in_dev) &&
2208                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2209                                 rth->u.dst.input = ip_mr_input;
2210                                 rth->u.dst.output = ip_mc_output;
2211                         }
2212                 }
2213 #endif
2214         }
2215
2216         rt_set_nexthop(rth, res, 0);
2217
2218         rth->rt_flags = flags;
2219
2220         *result = rth;
2221  cleanup:
2222         /* release work reference to inet device */
2223         in_dev_put(in_dev);
2224
2225         return err;
2226 }
2227
2228 static inline int ip_mkroute_output(struct rtable **rp,
2229                                     struct fib_result* res,
2230                                     const struct flowi *fl,
2231                                     const struct flowi *oldflp,
2232                                     struct net_device *dev_out,
2233                                     unsigned flags)
2234 {
2235         struct rtable *rth = NULL;
2236         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2237         unsigned hash;
2238         if (err == 0) {
2239                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2240                 err = rt_intern_hash(hash, rth, rp);
2241         }
2242
2243         return err;
2244 }
2245
2246 /*
2247  * Major route resolver routine.
2248  */
2249
/*
 * ip_route_output_slow - output routing when the route cache has no entry.
 *
 * Works on a private copy (fl) of the caller's key so that the source
 * address, destination and output interface can be filled in, while
 * oldflp is passed on unchanged to ip_mkroute_output().
 *
 * When dev_out is non-NULL a device reference is held on it: it is taken
 * explicitly with dev_hold() for the loopback and FIB devices, and
 * ip_dev_find()/dev_get_by_index() are treated as returning a held device
 * (their results are released with dev_put()).  Each path that replaces
 * dev_out puts the old device first; the last one is put after
 * ip_mkroute_output() returns.
 *
 * Returns the result of ip_mkroute_output(), or -EINVAL (unusable source
 * address), -ENODEV (oif does not exist or has no in_device) or
 * -ENETUNREACH (no route and no oif given).
 */
2250 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2251 {
2252         u32 tos = RT_FL_TOS(oldflp);
2253         struct flowi fl = { .nl_u = { .ip4_u =
2254                                       { .daddr = oldflp->fl4_dst,
2255                                         .saddr = oldflp->fl4_src,
2256                                         .tos = tos & IPTOS_RT_MASK,
2257                                         .scope = ((tos & RTO_ONLINK) ?
2258                                                   RT_SCOPE_LINK :
2259                                                   RT_SCOPE_UNIVERSE),
2260                                       } },
2261                             .mark = oldflp->mark,
2262                             .iif = init_net.loopback_dev->ifindex,
2263                             .oif = oldflp->oif };
2264         struct fib_result res;
2265         unsigned flags = 0;
2266         struct net_device *dev_out = NULL;
2267         int free_res = 0;
2268         int err;
2269
2270
             /* fib_lookup() may never run; __mkroute_output() still reads res.fi. */
2271         res.fi          = NULL;
2272 #ifdef CONFIG_IP_MULTIPLE_TABLES
2273         res.r           = NULL;
2274 #endif
2275
2276         if (oldflp->fl4_src) {
                     /* Every "goto out" in this branch reports -EINVAL. */
2277                 err = -EINVAL;
2278                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2279                     ipv4_is_badclass(oldflp->fl4_src) ||
2280                     ipv4_is_zeronet(oldflp->fl4_src))
2281                         goto out;
2282
2283                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2284                 dev_out = ip_dev_find(oldflp->fl4_src);
2285                 if (dev_out == NULL)
2286                         goto out;
2287
2288                 /* I removed check for oif == dev_out->oif here.
2289                    It was wrong for two reasons:
2290                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2291                       assigned to multiple interfaces.
2292                    2. Moreover, we are allowed to send packets with saddr
2293                       of another iface. --ANK
2294                  */
2295
2296                 if (oldflp->oif == 0
2297                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2298                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2299                         /* Special hack: user can direct multicasts
2300                            and limited broadcast via necessary interface
2301                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2302                            This hack is not just for fun, it allows
2303                            vic,vat and friends to work.
2304                            They bind socket to loopback, set ttl to zero
2305                            and expect that it will work.
2306                            From the viewpoint of routing cache they are broken,
2307                            because we are not allowed to build multicast path
2308                            with loopback source addr (look, routing cache
2309                            cannot know, that ttl is zero, so that packet
2310                            will not leave this host and route is valid).
2311                            Luckily, this hack is good workaround.
2312                          */
2313
2314                         fl.oif = dev_out->ifindex;
2315                         goto make_route;
2316                 }
                     /* The source is local; the device it was found on is not kept. */
2317                 if (dev_out)
2318                         dev_put(dev_out);
2319                 dev_out = NULL;
2320         }
2321
2322
2323         if (oldflp->oif) {
2324                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2325                 err = -ENODEV;
2326                 if (dev_out == NULL)
2327                         goto out;
2328
2329                 /* RACE: Check return value of inet_select_addr instead. */
2330                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2331                         dev_put(dev_out);
2332                         goto out;       /* Wrong error code */
2333                 }
2334
2335                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2336                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2337                         if (!fl.fl4_src)
2338                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2339                                                               RT_SCOPE_LINK);
2340                         goto make_route;
2341                 }
2342                 if (!fl.fl4_src) {
2343                         if (ipv4_is_multicast(oldflp->fl4_dst))
2344                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2345                                                               fl.fl4_scope);
2346                         else if (!oldflp->fl4_dst)
2347                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2348                                                               RT_SCOPE_HOST);
2349                 }
2350         }
2351
             /* No destination given: route to ourselves via the loopback device. */
2352         if (!fl.fl4_dst) {
2353                 fl.fl4_dst = fl.fl4_src;
2354                 if (!fl.fl4_dst)
2355                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2356                 if (dev_out)
2357                         dev_put(dev_out);
2358                 dev_out = init_net.loopback_dev;
2359                 dev_hold(dev_out);
2360                 fl.oif = init_net.loopback_dev->ifindex;
2361                 res.type = RTN_LOCAL;
2362                 flags |= RTCF_LOCAL;
2363                 goto make_route;
2364         }
2365
2366         if (fib_lookup(&fl, &res)) {
2367                 res.fi = NULL;
2368                 if (oldflp->oif) {
2369                         /* Apparently, routing tables are wrong. Assume,
2370                            that the destination is on link.
2371
2372                            WHY? DW.
2373                            Because we are allowed to send to iface
2374                            even if it has NO routes and NO assigned
2375                            addresses. When oif is specified, routing
2376                            tables are looked up with only one purpose:
2377                            to catch if destination is gatewayed, rather than
2378                            direct. Moreover, if MSG_DONTROUTE is set,
2379                            we send packet, ignoring both routing tables
2380                            and ifaddr state. --ANK
2381
2382
2383                            We could make it even if oif is unknown,
2384                            likely IPv6, but we do not.
2385                          */
2386
2387                         if (fl.fl4_src == 0)
2388                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2389                                                               RT_SCOPE_LINK);
2390                         res.type = RTN_UNICAST;
2391                         goto make_route;
2392                 }
2393                 if (dev_out)
2394                         dev_put(dev_out);
2395                 err = -ENETUNREACH;
2396                 goto out;
2397         }
             /* Lookup succeeded: res must be released after make_route. */
2398         free_res = 1;
2399
2400         if (res.type == RTN_LOCAL) {
2401                 if (!fl.fl4_src)
2402                         fl.fl4_src = fl.fl4_dst;
2403                 if (dev_out)
2404                         dev_put(dev_out);
2405                 dev_out = init_net.loopback_dev;
2406                 dev_hold(dev_out);
2407                 fl.oif = dev_out->ifindex;
2408                 if (res.fi)
2409                         fib_info_put(res.fi);
2410                 res.fi = NULL;
2411                 flags |= RTCF_LOCAL;
2412                 goto make_route;
2413         }
2414
2415 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2416         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2417                 fib_select_multipath(&fl, &res);
2418         else
2419 #endif
2420         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2421                 fib_select_default(&fl, &res);
2422
2423         if (!fl.fl4_src)
2424                 fl.fl4_src = FIB_RES_PREFSRC(res);
2425
             /* Switch to the device selected by the FIB result. */
2426         if (dev_out)
2427                 dev_put(dev_out);
2428         dev_out = FIB_RES_DEV(res);
2429         dev_hold(dev_out);
2430         fl.oif = dev_out->ifindex;
2431
2432
     /*
      * Two early paths jump here without having assigned res.type; both have
      * a limited-broadcast or multicast destination (assuming local multicast
      * is a subset of multicast), for which __mkroute_output() sets the type
      * itself.
      */
2433 make_route:
2434         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2435
2436
2437         if (free_res)
2438                 fib_res_put(&res);
2439         if (dev_out)
2440                 dev_put(dev_out);
2441 out:    return err;
2442 }
2443
2444 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2445 {
2446         unsigned hash;
2447         struct rtable *rth;
2448
2449         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2450
2451         rcu_read_lock_bh();
2452         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2453                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2454                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2455                     rth->fl.fl4_src == flp->fl4_src &&
2456                     rth->fl.iif == 0 &&
2457                     rth->fl.oif == flp->oif &&
2458                     rth->fl.mark == flp->mark &&
2459                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2460                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2461                         dst_use(&rth->u.dst, jiffies);
2462                         RT_CACHE_STAT_INC(out_hit);
2463                         rcu_read_unlock_bh();
2464                         *rp = rth;
2465                         return 0;
2466                 }
2467                 RT_CACHE_STAT_INC(out_hlist_search);
2468         }
2469         rcu_read_unlock_bh();
2470
2471         return ip_route_output_slow(rp, flp);
2472 }
2473
2474 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2475
2476 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2477 {
2478 }
2479
2480 static struct dst_ops ipv4_dst_blackhole_ops = {
2481         .family                 =       AF_INET,
2482         .protocol               =       __constant_htons(ETH_P_IP),
2483         .destroy                =       ipv4_dst_destroy,
2484         .check                  =       ipv4_dst_check,
2485         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2486         .entry_size             =       sizeof(struct rtable),
2487 };
2488
2489
2490 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2491 {
2492         struct rtable *ort = *rp;
2493         struct rtable *rt = (struct rtable *)
2494                 dst_alloc(&ipv4_dst_blackhole_ops);
2495
2496         if (rt) {
2497                 struct dst_entry *new = &rt->u.dst;
2498
2499                 atomic_set(&new->__refcnt, 1);
2500                 new->__use = 1;
2501                 new->input = dst_discard;
2502                 new->output = dst_discard;
2503                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2504
2505                 new->dev = ort->u.dst.dev;
2506                 if (new->dev)
2507                         dev_hold(new->dev);
2508
2509                 rt->fl = ort->fl;
2510
2511                 rt->idev = ort->idev;
2512                 if (rt->idev)
2513                         in_dev_hold(rt->idev);
2514                 rt->rt_flags = ort->rt_flags;
2515                 rt->rt_type = ort->rt_type;
2516                 rt->rt_dst = ort->rt_dst;
2517                 rt->rt_src = ort->rt_src;
2518                 rt->rt_iif = ort->rt_iif;
2519                 rt->rt_gateway = ort->rt_gateway;
2520                 rt->rt_spec_dst = ort->rt_spec_dst;
2521                 rt->peer = ort->peer;
2522                 if (rt->peer)
2523                         atomic_inc(&rt->peer->refcnt);
2524
2525                 dst_free(new);
2526         }
2527
2528         dst_release(&(*rp)->u.dst);
2529         *rp = rt;
2530         return (rt ? 0 : -ENOMEM);
2531 }
2532
2533 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2534 {
2535         int err;
2536
2537         if ((err = __ip_route_output_key(rp, flp)) != 0)
2538                 return err;
2539
2540         if (flp->proto) {
2541                 if (!flp->fl4_src)
2542                         flp->fl4_src = (*rp)->rt_src;
2543                 if (!flp->fl4_dst)
2544                         flp->fl4_dst = (*rp)->rt_dst;
2545                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2546                                     flags ? XFRM_LOOKUP_WAIT : 0);
2547                 if (err == -EREMOTE)
2548                         err = ipv4_dst_blackhole(rp, flp, sk);
2549
2550                 return err;
2551         }
2552
2553         return 0;
2554 }
2555
2556 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2557
2558 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2559 {
2560         return ip_route_output_flow(rp, flp, NULL, 0);
2561 }
2562
2563 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2564                         int nowait, unsigned int flags)
2565 {
2566         struct rtable *rt = (struct rtable*)skb->dst;
2567         struct rtmsg *r;
2568         struct nlmsghdr *nlh;
2569         long expires;
2570         u32 id = 0, ts = 0, tsage = 0, error;
2571
2572         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2573         if (nlh == NULL)
2574                 return -EMSGSIZE;
2575
2576         r = nlmsg_data(nlh);
2577         r->rtm_family    = AF_INET;
2578         r->rtm_dst_len  = 32;
2579         r->rtm_src_len  = 0;
2580         r->rtm_tos      = rt->fl.fl4_tos;
2581         r->rtm_table    = RT_TABLE_MAIN;
2582         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2583         r->rtm_type     = rt->rt_type;
2584         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2585         r->rtm_protocol = RTPROT_UNSPEC;
2586         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2587         if (rt->rt_flags & RTCF_NOTIFY)
2588                 r->rtm_flags |= RTM_F_NOTIFY;
2589
2590         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2591
2592         if (rt->fl.fl4_src) {
2593                 r->rtm_src_len = 32;
2594                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2595         }
2596         if (rt->u.dst.dev)
2597                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2598 #ifdef CONFIG_NET_CLS_ROUTE
2599         if (rt->u.dst.tclassid)
2600                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2601 #endif
2602         if (rt->fl.iif)
2603                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2604         else if (rt->rt_src != rt->fl.fl4_src)
2605                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2606
2607         if (rt->rt_dst != rt->rt_gateway)
2608                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2609
2610         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2611                 goto nla_put_failure;
2612
2613         error = rt->u.dst.error;
2614         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2615         if (rt->peer) {
2616                 id = rt->peer->ip_id_count;
2617                 if (rt->peer->tcp_ts_stamp) {
2618                         ts = rt->peer->tcp_ts;
2619                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2620                 }
2621         }
2622
2623         if (rt->fl.iif) {
2624 #ifdef CONFIG_IP_MROUTE
2625                 __be32 dst = rt->rt_dst;
2626
2627                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2628                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2629                         int err = ipmr_get_route(skb, r, nowait);
2630                         if (err <= 0) {
2631                                 if (!nowait) {
2632                                         if (err == 0)
2633                                                 return 0;
2634                                         goto nla_put_failure;
2635                                 } else {
2636                                         if (err == -EMSGSIZE)
2637                                                 goto nla_put_failure;
2638                                         error = err;
2639                                 }
2640                         }
2641                 } else
2642 #endif
2643                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2644         }
2645
2646         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2647                                expires, error) < 0)
2648                 goto nla_put_failure;
2649
2650         return nlmsg_end(skb, nlh);
2651
2652 nla_put_failure:
2653         nlmsg_cancel(skb, nlh);
2654         return -EMSGSIZE;
2655 }
2656
2657 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2658 {
2659         struct net *net = in_skb->sk->sk_net;
2660         struct rtmsg *rtm;
2661         struct nlattr *tb[RTA_MAX+1];
2662         struct rtable *rt = NULL;
2663         __be32 dst = 0;
2664         __be32 src = 0;
2665         u32 iif;
2666         int err;
2667         struct sk_buff *skb;
2668
2669         if (net != &init_net)
2670                 return -EINVAL;
2671
2672         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2673         if (err < 0)
2674                 goto errout;
2675
2676         rtm = nlmsg_data(nlh);
2677
2678         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2679         if (skb == NULL) {
2680                 err = -ENOBUFS;
2681                 goto errout;
2682         }
2683
2684         /* Reserve room for dummy headers, this skb can pass
2685            through good chunk of routing engine.
2686          */
2687         skb_reset_mac_header(skb);
2688         skb_reset_network_header(skb);
2689
2690         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2691         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2692         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2693
2694         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2695         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2696         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2697
2698         if (iif) {
2699                 struct net_device *dev;
2700
2701                 dev = __dev_get_by_index(&init_net, iif);
2702                 if (dev == NULL) {
2703                         err = -ENODEV;
2704                         goto errout_free;
2705                 }
2706
2707                 skb->protocol   = htons(ETH_P_IP);
2708                 skb->dev        = dev;
2709                 local_bh_disable();
2710                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2711                 local_bh_enable();
2712
2713                 rt = (struct rtable*) skb->dst;
2714                 if (err == 0 && rt->u.dst.error)
2715                         err = -rt->u.dst.error;
2716         } else {
2717                 struct flowi fl = {
2718                         .nl_u = {
2719                                 .ip4_u = {
2720                                         .daddr = dst,
2721                                         .saddr = src,
2722                                         .tos = rtm->rtm_tos,
2723                                 },
2724                         },
2725                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2726                 };
2727                 err = ip_route_output_key(&rt, &fl);
2728         }
2729
2730         if (err)
2731                 goto errout_free;
2732
2733         skb->dst = &rt->u.dst;
2734         if (rtm->rtm_flags & RTM_F_NOTIFY)
2735                 rt->rt_flags |= RTCF_NOTIFY;
2736
2737         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2738                                 RTM_NEWROUTE, 0, 0);
2739         if (err <= 0)
2740                 goto errout_free;
2741
2742         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2743 errout:
2744         return err;
2745
2746 errout_free:
2747         kfree_skb(skb);
2748         goto errout;
2749 }
2750
2751 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2752 {
2753         struct rtable *rt;
2754         int h, s_h;
2755         int idx, s_idx;
2756
2757         s_h = cb->args[0];
2758         if (s_h < 0)
2759                 s_h = 0;
2760         s_idx = idx = cb->args[1];
2761         for (h = s_h; h <= rt_hash_mask; h++) {
2762                 rcu_read_lock_bh();
2763                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2764                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2765                         if (idx < s_idx)
2766                                 continue;
2767                         skb->dst = dst_clone(&rt->u.dst);
2768                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2769                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2770                                          1, NLM_F_MULTI) <= 0) {
2771                                 dst_release(xchg(&skb->dst, NULL));
2772                                 rcu_read_unlock_bh();
2773                                 goto done;
2774                         }
2775                         dst_release(xchg(&skb->dst, NULL));
2776                 }
2777                 rcu_read_unlock_bh();
2778                 s_idx = 0;
2779         }
2780
2781 done:
2782         cb->args[0] = h;
2783         cb->args[1] = idx;
2784         return skb->len;
2785 }
2786
/*
 * Multicast event hook: requests a route cache flush with delay 0.
 * @in_dev is not used.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	const int delay = 0;

	rt_cache_flush(delay);
}
2791
2792 #ifdef CONFIG_SYSCTL
2793 static int flush_delay;
2794
2795 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2796                                         struct file *filp, void __user *buffer,
2797                                         size_t *lenp, loff_t *ppos)
2798 {
2799         if (write) {
2800                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2801                 rt_cache_flush(flush_delay);
2802                 return 0;
2803         }
2804
2805         return -EINVAL;
2806 }
2807
2808 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2809                                                 int __user *name,
2810                                                 int nlen,
2811                                                 void __user *oldval,
2812                                                 size_t __user *oldlenp,
2813                                                 void __user *newval,
2814                                                 size_t newlen)
2815 {
2816         int delay;
2817         if (newlen != sizeof(int))
2818                 return -EINVAL;
2819         if (get_user(delay, (int __user *)newval))
2820                 return -EFAULT;
2821         rt_cache_flush(delay);
2822         return 0;
2823 }
2824
2825 ctl_table ipv4_route_table[] = {
2826         {
2827                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2828                 .procname       = "flush",
2829                 .data           = &flush_delay,
2830                 .maxlen         = sizeof(int),
2831                 .mode           = 0200,
2832                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2833                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2834         },
2835         {
2836                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2837                 .procname       = "min_delay",
2838                 .data           = &ip_rt_min_delay,
2839                 .maxlen         = sizeof(int),
2840                 .mode           = 0644,
2841                 .proc_handler   = &proc_dointvec_jiffies,
2842                 .strategy       = &sysctl_jiffies,
2843         },
2844         {
2845                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2846                 .procname       = "max_delay",
2847                 .data           = &ip_rt_max_delay,
2848                 .maxlen         = sizeof(int),
2849                 .mode           = 0644,
2850                 .proc_handler   = &proc_dointvec_jiffies,
2851                 .strategy       = &sysctl_jiffies,
2852         },
2853         {
2854                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2855                 .procname       = "gc_thresh",
2856                 .data           = &ipv4_dst_ops.gc_thresh,
2857                 .maxlen         = sizeof(int),
2858                 .mode           = 0644,
2859                 .proc_handler   = &proc_dointvec,
2860         },
2861         {
2862                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2863                 .procname       = "max_size",
2864                 .data           = &ip_rt_max_size,
2865                 .maxlen         = sizeof(int),
2866                 .mode           = 0644,
2867                 .proc_handler   = &proc_dointvec,
2868         },
2869         {
2870                 /*  Deprecated. Use gc_min_interval_ms */
2871
2872                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2873                 .procname       = "gc_min_interval",
2874                 .data           = &ip_rt_gc_min_interval,
2875                 .maxlen         = sizeof(int),
2876                 .mode           = 0644,
2877                 .proc_handler   = &proc_dointvec_jiffies,
2878                 .strategy       = &sysctl_jiffies,
2879         },
2880         {
2881                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2882                 .procname       = "gc_min_interval_ms",
2883                 .data           = &ip_rt_gc_min_interval,
2884                 .maxlen         = sizeof(int),
2885                 .mode           = 0644,
2886                 .proc_handler   = &proc_dointvec_ms_jiffies,
2887                 .strategy       = &sysctl_ms_jiffies,
2888         },
2889         {
2890                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2891                 .procname       = "gc_timeout",
2892                 .data           = &ip_rt_gc_timeout,
2893                 .maxlen         = sizeof(int),
2894                 .mode           = 0644,
2895                 .proc_handler   = &proc_dointvec_jiffies,
2896                 .strategy       = &sysctl_jiffies,
2897         },
2898         {
2899                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2900                 .procname       = "gc_interval",
2901                 .data           = &ip_rt_gc_interval,
2902                 .maxlen         = sizeof(int),
2903                 .mode           = 0644,
2904                 .proc_handler   = &proc_dointvec_jiffies,
2905                 .strategy       = &sysctl_jiffies,
2906         },
2907         {
2908                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2909                 .procname       = "redirect_load",
2910                 .data           = &ip_rt_redirect_load,
2911                 .maxlen         = sizeof(int),
2912                 .mode           = 0644,
2913                 .proc_handler   = &proc_dointvec,
2914         },
2915         {
2916                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2917                 .procname       = "redirect_number",
2918                 .data           = &ip_rt_redirect_number,
2919                 .maxlen         = sizeof(int),
2920                 .mode           = 0644,
2921                 .proc_handler   = &proc_dointvec,
2922         },
2923         {
2924                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2925                 .procname       = "redirect_silence",
2926                 .data           = &ip_rt_redirect_silence,
2927                 .maxlen         = sizeof(int),
2928                 .mode           = 0644,
2929                 .proc_handler   = &proc_dointvec,
2930         },
2931         {
2932                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2933                 .procname       = "error_cost",
2934                 .data           = &ip_rt_error_cost,
2935                 .maxlen         = sizeof(int),
2936                 .mode           = 0644,
2937                 .proc_handler   = &proc_dointvec,
2938         },
2939         {
2940                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2941                 .procname       = "error_burst",
2942                 .data           = &ip_rt_error_burst,
2943                 .maxlen         = sizeof(int),
2944                 .mode           = 0644,
2945                 .proc_handler   = &proc_dointvec,
2946         },
2947         {
2948                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2949                 .procname       = "gc_elasticity",
2950                 .data           = &ip_rt_gc_elasticity,
2951                 .maxlen         = sizeof(int),
2952                 .mode           = 0644,
2953                 .proc_handler   = &proc_dointvec,
2954         },
2955         {
2956                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2957                 .procname       = "mtu_expires",
2958                 .data           = &ip_rt_mtu_expires,
2959                 .maxlen         = sizeof(int),
2960                 .mode           = 0644,
2961                 .proc_handler   = &proc_dointvec_jiffies,
2962                 .strategy       = &sysctl_jiffies,
2963         },
2964         {
2965                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2966                 .procname       = "min_pmtu",
2967                 .data           = &ip_rt_min_pmtu,
2968                 .maxlen         = sizeof(int),
2969                 .mode           = 0644,
2970                 .proc_handler   = &proc_dointvec,
2971         },
2972         {
2973                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2974                 .procname       = "min_adv_mss",
2975                 .data           = &ip_rt_min_advmss,
2976                 .maxlen         = sizeof(int),
2977                 .mode           = 0644,
2978                 .proc_handler   = &proc_dointvec,
2979         },
2980         {
2981                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2982                 .procname       = "secret_interval",
2983                 .data           = &ip_rt_secret_interval,
2984                 .maxlen         = sizeof(int),
2985                 .mode           = 0644,
2986                 .proc_handler   = &proc_dointvec_jiffies,
2987                 .strategy       = &sysctl_jiffies,
2988         },
2989         { .ctl_name = 0 }
2990 };
2991 #endif
2992
2993 #ifdef CONFIG_NET_CLS_ROUTE
2994 struct ip_rt_acct *ip_rt_acct __read_mostly;
2995 #endif /* CONFIG_NET_CLS_ROUTE */
2996
2997 static __initdata unsigned long rhash_entries;
2998 static int __init set_rhash_entries(char *str)
2999 {
3000         if (!str)
3001                 return 0;
3002         rhash_entries = simple_strtoul(str, &str, 0);
3003         return 1;
3004 }
3005 __setup("rhash_entries=", set_rhash_entries);
3006
3007 int __init ip_rt_init(void)
3008 {
3009         int rc = 0;
3010
3011         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
3012                              (jiffies ^ (jiffies >> 7)));
3013
3014 #ifdef CONFIG_NET_CLS_ROUTE
3015         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3016         if (!ip_rt_acct)
3017                 panic("IP: failed to allocate ip_rt_acct\n");
3018 #endif
3019
3020         ipv4_dst_ops.kmem_cachep =
3021                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3022                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3023
3024         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3025
3026         rt_hash_table = (struct rt_hash_bucket *)
3027                 alloc_large_system_hash("IP route cache",
3028                                         sizeof(struct rt_hash_bucket),
3029                                         rhash_entries,
3030                                         (num_physpages >= 128 * 1024) ?
3031                                         15 : 17,
3032                                         0,
3033                                         &rt_hash_log,
3034                                         &rt_hash_mask,
3035                                         0);
3036         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3037         rt_hash_lock_init();
3038
3039         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3040         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3041
3042         devinet_init();
3043         ip_fib_init();
3044
3045         setup_timer(&rt_flush_timer, rt_run_flush, 0);
3046         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3047
3048         /* All the timers, started at system startup tend
3049            to synchronize. Perturb it a bit.
3050          */
3051         schedule_delayed_work(&expires_work,
3052                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3053
3054         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3055                 ip_rt_secret_interval;
3056         add_timer(&rt_secret_timer);
3057
3058         if (ip_rt_proc_init(&init_net))
3059                 printk(KERN_ERR "Unable to create route proc files\n");
3060 #ifdef CONFIG_XFRM
3061         xfrm_init();
3062         xfrm4_init();
3063 #endif
3064         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3065
3066         return rc;
3067 }
3068
3069 EXPORT_SYMBOL(__ip_select_ident);
3070 EXPORT_SYMBOL(ip_route_input);
3071 EXPORT_SYMBOL(ip_route_output_key);