[NETNS]: Process ip_rt_redirect in the correct namespace.
[safe/jmp/linux-2.6] net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval            = 60 * HZ;
123 static int ip_rt_gc_min_interval        = HZ / 2;
124 static int ip_rt_redirect_number        = 9;
125 static int ip_rt_redirect_load          = HZ / 50;
126 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost             = HZ;
128 static int ip_rt_error_burst            = 5 * HZ;
129 static int ip_rt_gc_elasticity          = 8;
130 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu               = 512 + 20 + 20;
132 static int ip_rt_min_advmss             = 256;
133 static int ip_rt_secret_interval        = 10 * 60 * HZ;
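/*
 * Worked example of the defaults above (assuming HZ = 1000, an illustrative
 * value, not something this file fixes):
 *   ip_rt_redirect_silence = (1000/50) << 10 = 20480 jiffies, i.e. ~20.5 s
 *   ip_rt_min_pmtu         = 512 + 20 + 20   = 552 bytes
 *                            (payload + IPv4 header + TCP header)
 *   ip_rt_mtu_expires      = 10 * 60 * 1000  = 600000 jiffies, i.e. 10 min
 */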
134
135 #define RTprint(a...)   printk(KERN_DEBUG a)
136
137 static void rt_worker_func(struct work_struct *work);
138 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
139 static struct timer_list rt_secret_timer;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void              ipv4_dst_destroy(struct dst_entry *dst);
147 static void              ipv4_dst_ifdown(struct dst_entry *dst,
148                                          struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154
155 static struct dst_ops ipv4_dst_ops = {
156         .family =               AF_INET,
157         .protocol =             __constant_htons(ETH_P_IP),
158         .gc =                   rt_garbage_collect,
159         .check =                ipv4_dst_check,
160         .destroy =              ipv4_dst_destroy,
161         .ifdown =               ipv4_dst_ifdown,
162         .negative_advice =      ipv4_negative_advice,
163         .link_failure =         ipv4_link_failure,
164         .update_pmtu =          ip_rt_update_pmtu,
165         .local_out =            ip_local_out,
166         .entry_size =           sizeof(struct rtable),
167         .entries =              ATOMIC_INIT(0),
168 };
169
170 #define ECN_OR_COST(class)      TC_PRIO_##class
171
172 const __u8 ip_tos2prio[16] = {
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(FILLER),
175         TC_PRIO_BESTEFFORT,
176         ECN_OR_COST(BESTEFFORT),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_BULK,
180         ECN_OR_COST(BULK),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE,
184         ECN_OR_COST(INTERACTIVE),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK),
187         TC_PRIO_INTERACTIVE_BULK,
188         ECN_OR_COST(INTERACTIVE_BULK)
189 };
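/*
 * Sketch of how the table above is consumed (the real helper lives in the
 * route headers, so treat the exact spelling as an assumption):
 *
 *   prio = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * i.e. the four IP TOS bits select one of 16 traffic-control priorities,
 * with every second slot (ECN_OR_COST) covering the low-cost variant of
 * its class.
 */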
190
191
192 /*
193  * Route cache.
194  */
195
196 /* The locking scheme is rather straightforward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries,
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
205
206 struct rt_hash_bucket {
207         struct rtable   *chain;
208 };
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210         defined(CONFIG_PROVE_LOCKING)
211 /*
212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
213  * The size of this table is a power of two and depends on the number of CPUs.
214  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
215  */
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ        256
218 #else
219 # if NR_CPUS >= 32
220 #  define RT_HASH_LOCK_SZ       4096
221 # elif NR_CPUS >= 16
222 #  define RT_HASH_LOCK_SZ       2048
223 # elif NR_CPUS >= 8
224 #  define RT_HASH_LOCK_SZ       1024
225 # elif NR_CPUS >= 4
226 #  define RT_HASH_LOCK_SZ       512
227 # else
228 #  define RT_HASH_LOCK_SZ       256
229 # endif
230 #endif
231
232 static spinlock_t       *rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
234
235 static __init void rt_hash_lock_init(void)
236 {
237         int i;
238
239         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
240                         GFP_KERNEL);
241         if (!rt_hash_locks)
242                 panic("IP: failed to allocate rt_hash_locks\n");
243
244         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
245                 spin_lock_init(&rt_hash_locks[i]);
246 }
247 #else
248 # define rt_hash_lock_addr(slot) NULL
249
250 static inline void rt_hash_lock_init(void)
251 {
252 }
253 #endif
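/*
 * Sketch of how the striped locks above are used (the pattern appears in
 * rt_intern_hash() and friends below): the hash slot selects one of the
 * RT_HASH_LOCK_SZ spinlocks, so RT_HASH_LOCK_SZ must stay a power of two
 * for the mask in rt_hash_lock_addr() to work:
 *
 *   spin_lock_bh(rt_hash_lock_addr(hash));
 *   ... unlink / relink rt_hash_table[hash].chain ...
 *   spin_unlock_bh(rt_hash_lock_addr(hash));
 */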
254
255 static struct rt_hash_bucket    *rt_hash_table;
256 static unsigned                 rt_hash_mask;
257 static unsigned int             rt_hash_log;
258 static atomic_t                 rt_genid;
259
260 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
261 #define RT_CACHE_STAT_INC(field) \
262         (__raw_get_cpu_var(rt_cache_stat).field++)
263
264 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
265 {
266         return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
267                 & rt_hash_mask;
268 }
269
270 #define rt_hash(daddr, saddr, idx) \
271         rt_hash_code((__force u32)(__be32)(daddr),\
272                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
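/*
 * A lookup therefore hashes (daddr, saddr ^ (oif << 5)) together with the
 * current rt_genid; readers walk the bucket under rcu_read_lock(), e.g.
 * (a sketch of the pattern used throughout this file):
 *
 *   hash = rt_hash(daddr, saddr, oif);
 *   rcu_read_lock();
 *   for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
 *        rth = rcu_dereference(rth->u.dst.rt_next))
 *           ...;
 *   rcu_read_unlock();
 *
 * Because rt_genid is part of the hash input, bumping it makes every old
 * entry effectively unreachable without walking the whole table.
 */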
273
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276         int bucket;
277         int genid;
278 };
279
280 static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
281 {
282         struct rtable *r = NULL;
283
284         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
285                 rcu_read_lock_bh();
286                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
287                 while (r) {
288                         if (r->rt_genid == st->genid)
289                                 return r;
290                         r = rcu_dereference(r->u.dst.rt_next);
291                 }
292                 rcu_read_unlock_bh();
293         }
294         return r;
295 }
296
297 static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st, struct rtable *r)
298 {
299         r = r->u.dst.rt_next;
300         while (!r) {
301                 rcu_read_unlock_bh();
302                 if (--st->bucket < 0)
303                         break;
304                 rcu_read_lock_bh();
305                 r = rt_hash_table[st->bucket].chain;
306         }
307         return rcu_dereference(r);
308 }
309
310 static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
311 {
312         struct rtable *r = rt_cache_get_first(st);
313
314         if (r)
315                 while (pos && (r = rt_cache_get_next(st, r))) {
316                         if (r->rt_genid != st->genid)
317                                 continue;
318                         --pos;
319                 }
320         return pos ? NULL : r;
321 }
322
323 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
324 {
325         struct rt_cache_iter_state *st = seq->private;
326
327         if (*pos)
328                 return rt_cache_get_idx(st, *pos - 1);
329         st->genid = atomic_read(&rt_genid);
330         return SEQ_START_TOKEN;
331 }
332
333 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
334 {
335         struct rtable *r;
336         struct rt_cache_iter_state *st = seq->private;
337
338         if (v == SEQ_START_TOKEN)
339                 r = rt_cache_get_first(st);
340         else
341                 r = rt_cache_get_next(st, v);
342         ++*pos;
343         return r;
344 }
345
346 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
347 {
348         if (v && v != SEQ_START_TOKEN)
349                 rcu_read_unlock_bh();
350 }
351
352 static int rt_cache_seq_show(struct seq_file *seq, void *v)
353 {
354         if (v == SEQ_START_TOKEN)
355                 seq_printf(seq, "%-127s\n",
356                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
357                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
358                            "HHUptod\tSpecDst");
359         else {
360                 struct rtable *r = v;
361                 char temp[256];
362
363                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
364                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
365                         r->u.dst.dev ? r->u.dst.dev->name : "*",
366                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
367                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
368                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
369                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
370                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
371                         dst_metric(&r->u.dst, RTAX_WINDOW),
372                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
373                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
374                         r->fl.fl4_tos,
375                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
376                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
377                                        dev_queue_xmit) : 0,
378                         r->rt_spec_dst);
379                 seq_printf(seq, "%-127s\n", temp);
380         }
381         return 0;
382 }
383
384 static const struct seq_operations rt_cache_seq_ops = {
385         .start  = rt_cache_seq_start,
386         .next   = rt_cache_seq_next,
387         .stop   = rt_cache_seq_stop,
388         .show   = rt_cache_seq_show,
389 };
390
391 static int rt_cache_seq_open(struct inode *inode, struct file *file)
392 {
393         return seq_open_private(file, &rt_cache_seq_ops,
394                         sizeof(struct rt_cache_iter_state));
395 }
396
397 static const struct file_operations rt_cache_seq_fops = {
398         .owner   = THIS_MODULE,
399         .open    = rt_cache_seq_open,
400         .read    = seq_read,
401         .llseek  = seq_lseek,
402         .release = seq_release_private,
403 };
404
405
406 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
407 {
408         int cpu;
409
410         if (*pos == 0)
411                 return SEQ_START_TOKEN;
412
413         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
414                 if (!cpu_possible(cpu))
415                         continue;
416                 *pos = cpu+1;
417                 return &per_cpu(rt_cache_stat, cpu);
418         }
419         return NULL;
420 }
421
422 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
423 {
424         int cpu;
425
426         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
427                 if (!cpu_possible(cpu))
428                         continue;
429                 *pos = cpu+1;
430                 return &per_cpu(rt_cache_stat, cpu);
431         }
432         return NULL;
433
434 }
435
436 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
437 {
438
439 }
440
441 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
442 {
443         struct rt_cache_stat *st = v;
444
445         if (v == SEQ_START_TOKEN) {
446                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
447                 return 0;
448         }
449
450         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
451                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
452                    atomic_read(&ipv4_dst_ops.entries),
453                    st->in_hit,
454                    st->in_slow_tot,
455                    st->in_slow_mc,
456                    st->in_no_route,
457                    st->in_brd,
458                    st->in_martian_dst,
459                    st->in_martian_src,
460
461                    st->out_hit,
462                    st->out_slow_tot,
463                    st->out_slow_mc,
464
465                    st->gc_total,
466                    st->gc_ignored,
467                    st->gc_goal_miss,
468                    st->gc_dst_overflow,
469                    st->in_hlist_search,
470                    st->out_hlist_search
471                 );
472         return 0;
473 }
474
475 static const struct seq_operations rt_cpu_seq_ops = {
476         .start  = rt_cpu_seq_start,
477         .next   = rt_cpu_seq_next,
478         .stop   = rt_cpu_seq_stop,
479         .show   = rt_cpu_seq_show,
480 };
481
482
483 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
484 {
485         return seq_open(file, &rt_cpu_seq_ops);
486 }
487
488 static const struct file_operations rt_cpu_seq_fops = {
489         .owner   = THIS_MODULE,
490         .open    = rt_cpu_seq_open,
491         .read    = seq_read,
492         .llseek  = seq_lseek,
493         .release = seq_release,
494 };
495
496 #ifdef CONFIG_NET_CLS_ROUTE
497 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
498                            int length, int *eof, void *data)
499 {
500         unsigned int i;
501
502         if ((offset & 3) || (length & 3))
503                 return -EIO;
504
505         if (offset >= sizeof(struct ip_rt_acct) * 256) {
506                 *eof = 1;
507                 return 0;
508         }
509
510         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
511                 length = sizeof(struct ip_rt_acct) * 256 - offset;
512                 *eof = 1;
513         }
514
515         offset /= sizeof(u32);
516
517         if (length > 0) {
518                 u32 *dst = (u32 *) buffer;
519
520                 *start = buffer;
521                 memset(dst, 0, length);
522
523                 for_each_possible_cpu(i) {
524                         unsigned int j;
525                         u32 *src;
526
527                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
528                         for (j = 0; j < length/4; j++)
529                                 dst[j] += src[j];
530                 }
531         }
532         return length;
533 }
534 #endif
535
536 static __init int ip_rt_proc_init(struct net *net)
537 {
538         struct proc_dir_entry *pde;
539
540         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
541                         &rt_cache_seq_fops);
542         if (!pde)
543                 goto err1;
544
545         pde = proc_create("rt_cache", S_IRUGO,
546                           net->proc_net_stat, &rt_cpu_seq_fops);
547         if (!pde)
548                 goto err2;
549
550 #ifdef CONFIG_NET_CLS_ROUTE
551         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
552                         ip_rt_acct_read, NULL);
553         if (!pde)
554                 goto err3;
555 #endif
556         return 0;
557
558 #ifdef CONFIG_NET_CLS_ROUTE
559 err3:
560         remove_proc_entry("rt_cache", net->proc_net_stat);
561 #endif
562 err2:
563         remove_proc_entry("rt_cache", net->proc_net);
564 err1:
565         return -ENOMEM;
566 }
567 #else
568 static inline int ip_rt_proc_init(struct net *net)
569 {
570         return 0;
571 }
572 #endif /* CONFIG_PROC_FS */
573
574 static __inline__ void rt_free(struct rtable *rt)
575 {
576         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
577 }
578
579 static __inline__ void rt_drop(struct rtable *rt)
580 {
581         ip_rt_put(rt);
582         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
583 }
584
585 static __inline__ int rt_fast_clean(struct rtable *rth)
586 {
587         /* Kill broadcast/multicast entries very aggressively if they
588            collide in the hash table with more useful entries */
589         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
590                 rth->fl.iif && rth->u.dst.rt_next;
591 }
592
593 static __inline__ int rt_valuable(struct rtable *rth)
594 {
595         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
596                 rth->u.dst.expires;
597 }
598
599 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
600 {
601         unsigned long age;
602         int ret = 0;
603
604         if (atomic_read(&rth->u.dst.__refcnt))
605                 goto out;
606
607         ret = 1;
608         if (rth->u.dst.expires &&
609             time_after_eq(jiffies, rth->u.dst.expires))
610                 goto out;
611
612         age = jiffies - rth->u.dst.lastuse;
613         ret = 0;
614         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
615             (age <= tmo2 && rt_valuable(rth)))
616                 goto out;
617         ret = 1;
618 out:    return ret;
619 }
620
621 /* Bits of score are:
622  * 31: very valuable
623  * 30: not quite useless
624  * 29..0: usage counter
625  */
626 static inline u32 rt_score(struct rtable *rt)
627 {
628         u32 score = jiffies - rt->u.dst.lastuse;
629
630         score = ~score & ~(3<<30);
631
632         if (rt_valuable(rt))
633                 score |= (1<<31);
634
635         if (!rt->fl.iif ||
636             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
637                 score |= (1<<30);
638
639         return score;
640 }
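/*
 * Example of how the score is read: the low 30 bits are the inverted age,
 * so an entry touched just now scores close to 2^30 - 1 while an old idle
 * entry scores low; bit 30 marks output routes and plain unicast entries,
 * and bit 31 marks redirected/notify or expiring entries.  rt_intern_hash()
 * below evicts the entry with the *minimum* score when a chain grows past
 * ip_rt_gc_elasticity.
 */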
641
642 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
643 {
644         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
645                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
646                 (fl1->mark ^ fl2->mark) |
647                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
648                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
649                 (fl1->oif ^ fl2->oif) |
650                 (fl1->iif ^ fl2->iif)) == 0;
651 }
652
653 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
654 {
655         return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
656 }
657
658 /*
659  * Perform a full scan of the hash table and free all entries.
660  * Can be called by a softirq or a process.
661  * In the latter case, we want to reschedule if necessary.
662  */
663 static void rt_do_flush(int process_context)
664 {
665         unsigned int i;
666         struct rtable *rth, *next;
667
668         for (i = 0; i <= rt_hash_mask; i++) {
669                 if (process_context && need_resched())
670                         cond_resched();
671                 rth = rt_hash_table[i].chain;
672                 if (!rth)
673                         continue;
674
675                 spin_lock_bh(rt_hash_lock_addr(i));
676                 rth = rt_hash_table[i].chain;
677                 rt_hash_table[i].chain = NULL;
678                 spin_unlock_bh(rt_hash_lock_addr(i));
679
680                 for (; rth; rth = next) {
681                         next = rth->u.dst.rt_next;
682                         rt_free(rth);
683                 }
684         }
685 }
686
687 static void rt_check_expire(void)
688 {
689         static unsigned int rover;
690         unsigned int i = rover, goal;
691         struct rtable *rth, **rthp;
692         u64 mult;
693
694         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
695         if (ip_rt_gc_timeout > 1)
696                 do_div(mult, ip_rt_gc_timeout);
697         goal = (unsigned int)mult;
698         if (goal > rt_hash_mask)
699                 goal = rt_hash_mask + 1;
700         for (; goal > 0; goal--) {
701                 unsigned long tmo = ip_rt_gc_timeout;
702
703                 i = (i + 1) & rt_hash_mask;
704                 rthp = &rt_hash_table[i].chain;
705
706                 if (need_resched())
707                         cond_resched();
708
709                 if (*rthp == NULL)
710                         continue;
711                 spin_lock_bh(rt_hash_lock_addr(i));
712                 while ((rth = *rthp) != NULL) {
713                         if (rth->rt_genid != atomic_read(&rt_genid)) {
714                                 *rthp = rth->u.dst.rt_next;
715                                 rt_free(rth);
716                                 continue;
717                         }
718                         if (rth->u.dst.expires) {
719                                 /* Entry is expired even if it is in use */
720                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
721                                         tmo >>= 1;
722                                         rthp = &rth->u.dst.rt_next;
723                                         continue;
724                                 }
725                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
726                                 tmo >>= 1;
727                                 rthp = &rth->u.dst.rt_next;
728                                 continue;
729                         }
730
731                         /* Cleanup aged off entries. */
732                         *rthp = rth->u.dst.rt_next;
733                         rt_free(rth);
734                 }
735                 spin_unlock_bh(rt_hash_lock_addr(i));
736         }
737         rover = i;
738 }
739
740 /*
741  * rt_worker_func() is run in process context.
742  * We call rt_check_expire() to scan part of the hash table.
743  */
744 static void rt_worker_func(struct work_struct *work)
745 {
746         rt_check_expire();
747         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
748 }
749
750 /*
751  * Perturbation of rt_genid by a small quantity [1..256].
752  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
753  * many times (2^24) without repeating a recent rt_genid.
754  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
755  */
756 static void rt_cache_invalidate(void)
757 {
758         unsigned char shuffle;
759
760         get_random_bytes(&shuffle, sizeof(shuffle));
761         atomic_add(shuffle + 1U, &rt_genid);
762 }
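/*
 * Worked example: each invalidation adds a random value in [1..256] to
 * rt_genid, so at least 2^24 invalidations are needed before a recently
 * used generation number can come around again.  Stale entries are not
 * freed here; they are dropped lazily when rt_check_expire(),
 * rt_intern_hash() or rt_del() walks over them and sees a mismatching
 * rt_genid.
 */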
763
764 /*
765  * delay < 0  : invalidate cache (fast : entries will be deleted later)
766  * delay >= 0 : invalidate & flush cache (can be long)
767  */
768 void rt_cache_flush(int delay)
769 {
770         rt_cache_invalidate();
771         if (delay >= 0)
772                 rt_do_flush(!in_softirq());
773 }
774
775 /*
776  * We change rt_genid and let gc do the cleanup
777  */
778 static void rt_secret_rebuild(unsigned long dummy)
779 {
780         rt_cache_invalidate();
781         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
782 }
783
784 /*
785    Short description of GC goals.
786
787    We want to build an algorithm which keeps the routing cache
788    at an equilibrium point, where the number of aged-off entries
789    is kept approximately equal to the number of newly generated ones.
790
791    The current expiration strength is the variable "expire".
792    We try to adjust it dynamically, so that when the network
793    is idle "expire" is large enough to keep plenty of warm entries,
794    and when load increases it shrinks to limit the cache size.
795  */
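/*
 * Rough numbers (illustrative, with the default ip_rt_gc_elasticity of 8):
 * the collector only has work to do once the cache holds more than
 * 8 << rt_hash_log entries, i.e. more than eight entries per hash bucket
 * on average; "goal" below is the excess over that threshold.
 */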
796
797 static int rt_garbage_collect(struct dst_ops *ops)
798 {
799         static unsigned long expire = RT_GC_TIMEOUT;
800         static unsigned long last_gc;
801         static int rover;
802         static int equilibrium;
803         struct rtable *rth, **rthp;
804         unsigned long now = jiffies;
805         int goal;
806
807         /*
808          * Garbage collection is pretty expensive,
809          * do not make it too frequently.
810          */
811
812         RT_CACHE_STAT_INC(gc_total);
813
814         if (now - last_gc < ip_rt_gc_min_interval &&
815             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
816                 RT_CACHE_STAT_INC(gc_ignored);
817                 goto out;
818         }
819
820         /* Calculate the number of entries which we want to expire now. */
821         goal = atomic_read(&ipv4_dst_ops.entries) -
822                 (ip_rt_gc_elasticity << rt_hash_log);
823         if (goal <= 0) {
824                 if (equilibrium < ipv4_dst_ops.gc_thresh)
825                         equilibrium = ipv4_dst_ops.gc_thresh;
826                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
827                 if (goal > 0) {
828                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
829                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
830                 }
831         } else {
832                 /* We are in a dangerous area. Try to reduce the cache really
833                  * aggressively.
834                  */
835                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
836                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
837         }
838
839         if (now - last_gc >= ip_rt_gc_min_interval)
840                 last_gc = now;
841
842         if (goal <= 0) {
843                 equilibrium += goal;
844                 goto work_done;
845         }
846
847         do {
848                 int i, k;
849
850                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
851                         unsigned long tmo = expire;
852
853                         k = (k + 1) & rt_hash_mask;
854                         rthp = &rt_hash_table[k].chain;
855                         spin_lock_bh(rt_hash_lock_addr(k));
856                         while ((rth = *rthp) != NULL) {
857                                 if (rth->rt_genid == atomic_read(&rt_genid) &&
858                                         !rt_may_expire(rth, tmo, expire)) {
859                                         tmo >>= 1;
860                                         rthp = &rth->u.dst.rt_next;
861                                         continue;
862                                 }
863                                 *rthp = rth->u.dst.rt_next;
864                                 rt_free(rth);
865                                 goal--;
866                         }
867                         spin_unlock_bh(rt_hash_lock_addr(k));
868                         if (goal <= 0)
869                                 break;
870                 }
871                 rover = k;
872
873                 if (goal <= 0)
874                         goto work_done;
875
876                 /* The goal was not achieved. We stop the process if:
877
878                    - expire has been reduced to zero; otherwise expire is halved.
879                    - the table is not full.
880                    - we are called from interrupt context.
881                    - the jiffies check is just a fallback/debug loop breaker.
882                      We will not spin here for a long time in any case.
883                  */
884
885                 RT_CACHE_STAT_INC(gc_goal_miss);
886
887                 if (expire == 0)
888                         break;
889
890                 expire >>= 1;
891 #if RT_CACHE_DEBUG >= 2
892                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
893                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
894 #endif
895
896                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
897                         goto out;
898         } while (!in_softirq() && time_before_eq(jiffies, now));
899
900         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
901                 goto out;
902         if (net_ratelimit())
903                 printk(KERN_WARNING "dst cache overflow\n");
904         RT_CACHE_STAT_INC(gc_dst_overflow);
905         return 1;
906
907 work_done:
908         expire += ip_rt_gc_min_interval;
909         if (expire > ip_rt_gc_timeout ||
910             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
911                 expire = ip_rt_gc_timeout;
912 #if RT_CACHE_DEBUG >= 2
913         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
914                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
915 #endif
916 out:    return 0;
917 }
918
919 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
920 {
921         struct rtable   *rth, **rthp;
922         unsigned long   now;
923         struct rtable *cand, **candp;
924         u32             min_score;
925         int             chain_length;
926         int attempts = !in_softirq();
927
928 restart:
929         chain_length = 0;
930         min_score = ~(u32)0;
931         cand = NULL;
932         candp = NULL;
933         now = jiffies;
934
935         rthp = &rt_hash_table[hash].chain;
936
937         spin_lock_bh(rt_hash_lock_addr(hash));
938         while ((rth = *rthp) != NULL) {
939                 if (rth->rt_genid != atomic_read(&rt_genid)) {
940                         *rthp = rth->u.dst.rt_next;
941                         rt_free(rth);
942                         continue;
943                 }
944                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
945                         /* Put it first */
946                         *rthp = rth->u.dst.rt_next;
947                         /*
948                          * Since lookup is lockfree, the deletion
949                          * must be visible to another weakly ordered CPU before
950                          * the insertion at the start of the hash chain.
951                          */
952                         rcu_assign_pointer(rth->u.dst.rt_next,
953                                            rt_hash_table[hash].chain);
954                         /*
955                          * Since lookup is lockfree, the update writes
956                          * must be ordered for consistency on SMP.
957                          */
958                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
959
960                         dst_use(&rth->u.dst, now);
961                         spin_unlock_bh(rt_hash_lock_addr(hash));
962
963                         rt_drop(rt);
964                         *rp = rth;
965                         return 0;
966                 }
967
968                 if (!atomic_read(&rth->u.dst.__refcnt)) {
969                         u32 score = rt_score(rth);
970
971                         if (score <= min_score) {
972                                 cand = rth;
973                                 candp = rthp;
974                                 min_score = score;
975                         }
976                 }
977
978                 chain_length++;
979
980                 rthp = &rth->u.dst.rt_next;
981         }
982
983         if (cand) {
984                 /* ip_rt_gc_elasticity used to be the average chain length;
985                  * when it is exceeded, gc becomes really aggressive.
986                  *
987                  * The second limit is less certain. At the moment it allows
988                  * only 2 entries per bucket. We will see.
989                  */
990                 if (chain_length > ip_rt_gc_elasticity) {
991                         *candp = cand->u.dst.rt_next;
992                         rt_free(cand);
993                 }
994         }
995
996         /* Try to bind the route to ARP only if it is an output
997            route or on the unicast forwarding path.
998          */
999         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1000                 int err = arp_bind_neighbour(&rt->u.dst);
1001                 if (err) {
1002                         spin_unlock_bh(rt_hash_lock_addr(hash));
1003
1004                         if (err != -ENOBUFS) {
1005                                 rt_drop(rt);
1006                                 return err;
1007                         }
1008
1009                         /* Neighbour tables are full and nothing
1010                            can be released. Try to shrink the route cache;
1011                            most likely it holds some neighbour records.
1012                          */
1013                         if (attempts-- > 0) {
1014                                 int saved_elasticity = ip_rt_gc_elasticity;
1015                                 int saved_int = ip_rt_gc_min_interval;
1016                                 ip_rt_gc_elasticity     = 1;
1017                                 ip_rt_gc_min_interval   = 0;
1018                                 rt_garbage_collect(&ipv4_dst_ops);
1019                                 ip_rt_gc_min_interval   = saved_int;
1020                                 ip_rt_gc_elasticity     = saved_elasticity;
1021                                 goto restart;
1022                         }
1023
1024                         if (net_ratelimit())
1025                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1026                         rt_drop(rt);
1027                         return -ENOBUFS;
1028                 }
1029         }
1030
1031         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1032 #if RT_CACHE_DEBUG >= 2
1033         if (rt->u.dst.rt_next) {
1034                 struct rtable *trt;
1035                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1036                        NIPQUAD(rt->rt_dst));
1037                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1038                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1039                 printk("\n");
1040         }
1041 #endif
1042         rt_hash_table[hash].chain = rt;
1043         spin_unlock_bh(rt_hash_lock_addr(hash));
1044         *rp = rt;
1045         return 0;
1046 }
1047
1048 void rt_bind_peer(struct rtable *rt, int create)
1049 {
1050         static DEFINE_SPINLOCK(rt_peer_lock);
1051         struct inet_peer *peer;
1052
1053         peer = inet_getpeer(rt->rt_dst, create);
1054
1055         spin_lock_bh(&rt_peer_lock);
1056         if (rt->peer == NULL) {
1057                 rt->peer = peer;
1058                 peer = NULL;
1059         }
1060         spin_unlock_bh(&rt_peer_lock);
1061         if (peer)
1062                 inet_putpeer(peer);
1063 }
1064
1065 /*
1066  * Peer allocation may fail only in serious out-of-memory conditions.  However
1067  * we can still generate some output.
1068  * Random ID selection looks a bit dangerous because we have no chance to
1069  * select an ID that stays unique for a reasonable period of time.
1070  * But a broken packet identifier may be better than no packet at all.
1071  */
1072 static void ip_select_fb_ident(struct iphdr *iph)
1073 {
1074         static DEFINE_SPINLOCK(ip_fb_id_lock);
1075         static u32 ip_fallback_id;
1076         u32 salt;
1077
1078         spin_lock_bh(&ip_fb_id_lock);
1079         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1080         iph->id = htons(salt & 0xFFFF);
1081         ip_fallback_id = salt;
1082         spin_unlock_bh(&ip_fb_id_lock);
1083 }
1084
1085 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1086 {
1087         struct rtable *rt = (struct rtable *) dst;
1088
1089         if (rt) {
1090                 if (rt->peer == NULL)
1091                         rt_bind_peer(rt, 1);
1092
1093                 /* If peer is attached to destination, it is never detached,
1094                    so we need not grab a lock to dereference it.
1095                  */
1096                 if (rt->peer) {
1097                         iph->id = htons(inet_getid(rt->peer, more));
1098                         return;
1099                 }
1100         } else
1101                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1102                        __builtin_return_address(0));
1103
1104         ip_select_fb_ident(iph);
1105 }
1106
1107 static void rt_del(unsigned hash, struct rtable *rt)
1108 {
1109         struct rtable **rthp, *aux;
1110
1111         rthp = &rt_hash_table[hash].chain;
1112         spin_lock_bh(rt_hash_lock_addr(hash));
1113         ip_rt_put(rt);
1114         while ((aux = *rthp) != NULL) {
1115                 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1116                         *rthp = aux->u.dst.rt_next;
1117                         rt_free(aux);
1118                         continue;
1119                 }
1120                 rthp = &aux->u.dst.rt_next;
1121         }
1122         spin_unlock_bh(rt_hash_lock_addr(hash));
1123 }
1124
1125 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1126                     __be32 saddr, struct net_device *dev)
1127 {
1128         int i, k;
1129         struct in_device *in_dev = in_dev_get(dev);
1130         struct rtable *rth, **rthp;
1131         __be32  skeys[2] = { saddr, 0 };
1132         int  ikeys[2] = { dev->ifindex, 0 };
1133         struct netevent_redirect netevent;
1134         struct net *net;
1135
1136         if (!in_dev)
1137                 return;
1138
1139         net = dev->nd_net;
1140         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1141             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1142             || ipv4_is_zeronet(new_gw))
1143                 goto reject_redirect;
1144
1145         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1146                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1147                         goto reject_redirect;
1148                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1149                         goto reject_redirect;
1150         } else {
1151                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1152                         goto reject_redirect;
1153         }
1154
1155         for (i = 0; i < 2; i++) {
1156                 for (k = 0; k < 2; k++) {
1157                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1158
1159                         rthp=&rt_hash_table[hash].chain;
1160
1161                         rcu_read_lock();
1162                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1163                                 struct rtable *rt;
1164
1165                                 if (rth->fl.fl4_dst != daddr ||
1166                                     rth->fl.fl4_src != skeys[i] ||
1167                                     rth->fl.oif != ikeys[k] ||
1168                                     rth->fl.iif != 0 ||
1169                                     rth->rt_genid != atomic_read(&rt_genid) ||
1170                                     rth->u.dst.dev->nd_net != net) {
1171                                         rthp = &rth->u.dst.rt_next;
1172                                         continue;
1173                                 }
1174
1175                                 if (rth->rt_dst != daddr ||
1176                                     rth->rt_src != saddr ||
1177                                     rth->u.dst.error ||
1178                                     rth->rt_gateway != old_gw ||
1179                                     rth->u.dst.dev != dev)
1180                                         break;
1181
1182                                 dst_hold(&rth->u.dst);
1183                                 rcu_read_unlock();
1184
1185                                 rt = dst_alloc(&ipv4_dst_ops);
1186                                 if (rt == NULL) {
1187                                         ip_rt_put(rth);
1188                                         in_dev_put(in_dev);
1189                                         return;
1190                                 }
1191
1192                                 /* Copy all the information. */
1193                                 *rt = *rth;
1194                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1195                                 rt->u.dst.__use         = 1;
1196                                 atomic_set(&rt->u.dst.__refcnt, 1);
1197                                 rt->u.dst.child         = NULL;
1198                                 if (rt->u.dst.dev)
1199                                         dev_hold(rt->u.dst.dev);
1200                                 if (rt->idev)
1201                                         in_dev_hold(rt->idev);
1202                                 rt->u.dst.obsolete      = 0;
1203                                 rt->u.dst.lastuse       = jiffies;
1204                                 rt->u.dst.path          = &rt->u.dst;
1205                                 rt->u.dst.neighbour     = NULL;
1206                                 rt->u.dst.hh            = NULL;
1207                                 rt->u.dst.xfrm          = NULL;
1208                                 rt->rt_genid            = atomic_read(&rt_genid);
1209                                 rt->rt_flags            |= RTCF_REDIRECTED;
1210
1211                                 /* Gateway is different ... */
1212                                 rt->rt_gateway          = new_gw;
1213
1214                                 /* Redirect received -> path was valid */
1215                                 dst_confirm(&rth->u.dst);
1216
1217                                 if (rt->peer)
1218                                         atomic_inc(&rt->peer->refcnt);
1219
1220                                 if (arp_bind_neighbour(&rt->u.dst) ||
1221                                     !(rt->u.dst.neighbour->nud_state &
1222                                             NUD_VALID)) {
1223                                         if (rt->u.dst.neighbour)
1224                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1225                                         ip_rt_put(rth);
1226                                         rt_drop(rt);
1227                                         goto do_next;
1228                                 }
1229
1230                                 netevent.old = &rth->u.dst;
1231                                 netevent.new = &rt->u.dst;
1232                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1233                                                         &netevent);
1234
1235                                 rt_del(hash, rth);
1236                                 if (!rt_intern_hash(hash, rt, &rt))
1237                                         ip_rt_put(rt);
1238                                 goto do_next;
1239                         }
1240                         rcu_read_unlock();
1241                 do_next:
1242                         ;
1243                 }
1244         }
1245         in_dev_put(in_dev);
1246         return;
1247
1248 reject_redirect:
1249 #ifdef CONFIG_IP_ROUTE_VERBOSE
1250         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1251                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1252                         "%u.%u.%u.%u ignored.\n"
1253                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1254                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1255                        NIPQUAD(saddr), NIPQUAD(daddr));
1256 #endif
1257         in_dev_put(in_dev);
1258 }
1259
1260 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1261 {
1262         struct rtable *rt = (struct rtable*)dst;
1263         struct dst_entry *ret = dst;
1264
1265         if (rt) {
1266                 if (dst->obsolete) {
1267                         ip_rt_put(rt);
1268                         ret = NULL;
1269                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1270                            rt->u.dst.expires) {
1271                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1272                                                 rt->fl.oif);
1273 #if RT_CACHE_DEBUG >= 1
1274                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1275                                           "%u.%u.%u.%u/%02x dropped\n",
1276                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1277 #endif
1278                         rt_del(hash, rt);
1279                         ret = NULL;
1280                 }
1281         }
1282         return ret;
1283 }
1284
1285 /*
1286  * Algorithm:
1287  *      1. The first ip_rt_redirect_number redirects are sent
1288  *         with exponential backoff, then we stop sending them at all,
1289  *         assuming that the host ignores our redirects.
1290  *      2. If we did not see packets requiring redirects
1291  *         during ip_rt_redirect_silence, we assume that the host
1292  *         forgot the redirected route and we start sending redirects again.
1293  *
1294  * This algorithm is much cheaper and more intelligent than dumb load limiting
1295  * in icmp.c.
1296  *
1297  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1298  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1299  */
1300
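/*
 * Worked example of the backoff (assuming HZ = 1000, purely illustrative):
 * with ip_rt_redirect_load = HZ/50 and ip_rt_redirect_number = 9, the next
 * redirect is only sent once (HZ/50) << rate_tokens jiffies have passed
 * since the previous one, i.e. gaps roughly doubling from 40 ms up to ~5 s
 * before the ninth and last one; after that we stay silent until about
 * 20 s (ip_rt_redirect_silence) without redirect-worthy traffic resets
 * rate_tokens in ip_rt_send_redirect() below.
 */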
1301 void ip_rt_send_redirect(struct sk_buff *skb)
1302 {
1303         struct rtable *rt = (struct rtable*)skb->dst;
1304         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1305
1306         if (!in_dev)
1307                 return;
1308
1309         if (!IN_DEV_TX_REDIRECTS(in_dev))
1310                 goto out;
1311
1312         /* No redirected packets during ip_rt_redirect_silence;
1313          * reset the algorithm.
1314          */
1315         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1316                 rt->u.dst.rate_tokens = 0;
1317
1318         /* Too many ignored redirects; do not send anything.
1319          * Set u.dst.rate_last to the last seen redirected packet.
1320          */
1321         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1322                 rt->u.dst.rate_last = jiffies;
1323                 goto out;
1324         }
1325
1326         /* Check for load limit; set rate_last to the latest sent
1327          * redirect.
1328          */
1329         if (rt->u.dst.rate_tokens == 0 ||
1330             time_after(jiffies,
1331                        (rt->u.dst.rate_last +
1332                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1333                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1334                 rt->u.dst.rate_last = jiffies;
1335                 ++rt->u.dst.rate_tokens;
1336 #ifdef CONFIG_IP_ROUTE_VERBOSE
1337                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1338                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1339                     net_ratelimit())
1340                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1341                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1342                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1343                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1344 #endif
1345         }
1346 out:
1347         in_dev_put(in_dev);
1348 }
1349
1350 static int ip_error(struct sk_buff *skb)
1351 {
1352         struct rtable *rt = (struct rtable*)skb->dst;
1353         unsigned long now;
1354         int code;
1355
1356         switch (rt->u.dst.error) {
1357                 case EINVAL:
1358                 default:
1359                         goto out;
1360                 case EHOSTUNREACH:
1361                         code = ICMP_HOST_UNREACH;
1362                         break;
1363                 case ENETUNREACH:
1364                         code = ICMP_NET_UNREACH;
1365                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1366                         break;
1367                 case EACCES:
1368                         code = ICMP_PKT_FILTERED;
1369                         break;
1370         }
1371
1372         now = jiffies;
1373         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1374         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1375                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1376         rt->u.dst.rate_last = now;
1377         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1378                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1379                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1380         }
1381
1382 out:    kfree_skb(skb);
1383         return 0;
1384 }
1385
1386 /*
1387  *      The last two values are not from the RFC but
1388  *      are needed for AMPRnet AX.25 paths.
1389  */
1390
1391 static const unsigned short mtu_plateau[] =
1392 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1393
1394 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1395 {
1396         int i;
1397
1398         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1399                 if (old_mtu > mtu_plateau[i])
1400                         return mtu_plateau[i];
1401         return 68;
1402 }
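/*
 * Example (per the RFC 1191 plateau search above): a router that follows
 * old BSD behaviour and reports a next-hop MTU of 0 forces us to guess;
 * guess_mtu(1500) walks the table and returns 1492, the next plateau
 * strictly below the offending datagram size, and never less than 68,
 * the IPv4 minimum.
 */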
1403
1404 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1405                                  unsigned short new_mtu)
1406 {
1407         int i;
1408         unsigned short old_mtu = ntohs(iph->tot_len);
1409         struct rtable *rth;
1410         __be32  skeys[2] = { iph->saddr, 0, };
1411         __be32  daddr = iph->daddr;
1412         unsigned short est_mtu = 0;
1413
1414         if (ipv4_config.no_pmtu_disc)
1415                 return 0;
1416
1417         for (i = 0; i < 2; i++) {
1418                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1419
1420                 rcu_read_lock();
1421                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1422                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1423                         if (rth->fl.fl4_dst == daddr &&
1424                             rth->fl.fl4_src == skeys[i] &&
1425                             rth->rt_dst  == daddr &&
1426                             rth->rt_src  == iph->saddr &&
1427                             rth->fl.iif == 0 &&
1428                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1429                             rth->u.dst.dev->nd_net == net &&
1430                             rth->rt_genid == atomic_read(&rt_genid)) {
1431                                 unsigned short mtu = new_mtu;
1432
1433                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1434
1435                                         /* BSD 4.2 compatibility hack :-( */
1436                                         if (mtu == 0 &&
1437                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1438                                             old_mtu >= 68 + (iph->ihl << 2))
1439                                                 old_mtu -= iph->ihl << 2;
1440
1441                                         mtu = guess_mtu(old_mtu);
1442                                 }
1443                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1444                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1445                                                 dst_confirm(&rth->u.dst);
1446                                                 if (mtu < ip_rt_min_pmtu) {
1447                                                         mtu = ip_rt_min_pmtu;
1448                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1449                                                                 (1 << RTAX_MTU);
1450                                                 }
1451                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1452                                                 dst_set_expires(&rth->u.dst,
1453                                                         ip_rt_mtu_expires);
1454                                         }
1455                                         est_mtu = mtu;
1456                                 }
1457                         }
1458                 }
1459                 rcu_read_unlock();
1460         }
1461         return est_mtu ? : new_mtu;
1462 }
1463
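/*
 * Per-dst PMTU update: only ever shrink the cached MTU (and never below
 * 68); clamp to ip_rt_min_pmtu, locking the metric in that case, and
 * raise a NETEVENT_PMTU_UPDATE notification.
 */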
1464 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1465 {
1466         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1467             !(dst_metric_locked(dst, RTAX_MTU))) {
1468                 if (mtu < ip_rt_min_pmtu) {
1469                         mtu = ip_rt_min_pmtu;
1470                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1471                 }
1472                 dst->metrics[RTAX_MTU-1] = mtu;
1473                 dst_set_expires(dst, ip_rt_mtu_expires);
1474                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1475         }
1476 }
1477
1478 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1479 {
1480         return NULL;
1481 }
1482
1483 static void ipv4_dst_destroy(struct dst_entry *dst)
1484 {
1485         struct rtable *rt = (struct rtable *) dst;
1486         struct inet_peer *peer = rt->peer;
1487         struct in_device *idev = rt->idev;
1488
1489         if (peer) {
1490                 rt->peer = NULL;
1491                 inet_putpeer(peer);
1492         }
1493
1494         if (idev) {
1495                 rt->idev = NULL;
1496                 in_dev_put(idev);
1497         }
1498 }
1499
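/*
 * The device behind a cached route is going away: re-point rt->idev at
 * the namespace's loopback device so the dst keeps holding a valid
 * in_device reference until it is finally destroyed.
 */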
1500 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1501                             int how)
1502 {
1503         struct rtable *rt = (struct rtable *) dst;
1504         struct in_device *idev = rt->idev;
1505         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1506                 struct in_device *loopback_idev =
1507                         in_dev_get(dev->nd_net->loopback_dev);
1508                 if (loopback_idev) {
1509                         rt->idev = loopback_idev;
1510                         in_dev_put(idev);
1511                 }
1512         }
1513 }
1514
1515 static void ipv4_link_failure(struct sk_buff *skb)
1516 {
1517         struct rtable *rt;
1518
1519         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1520
1521         rt = (struct rtable *) skb->dst;
1522         if (rt)
1523                 dst_set_expires(&rt->u.dst, 0);
1524 }
1525
1526 static int ip_rt_bug(struct sk_buff *skb)
1527 {
1528         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1529                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1530                 skb->dev ? skb->dev->name : "?");
1531         kfree_skb(skb);
1532         return 0;
1533 }
1534
1535 /*
1536    We do not cache the source address of the outgoing interface,
1537    because it is used only by the IP RR, TS and SRR options,
1538    so it stays out of the fast path.
1539 
1540    BTW remember: "addr" is allowed to be unaligned
1541    in IP options!
1542  */
1543
1544 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1545 {
1546         __be32 src;
1547         struct fib_result res;
1548
1549         if (rt->fl.iif == 0)
1550                 src = rt->rt_src;
1551         else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1552                 src = FIB_RES_PREFSRC(res);
1553                 fib_res_put(&res);
1554         } else
1555                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1556                                         RT_SCOPE_UNIVERSE);
1557         memcpy(addr, &src, 4);
1558 }
1559
1560 #ifdef CONFIG_NET_CLS_ROUTE
1561 static void set_class_tag(struct rtable *rt, u32 tag)
1562 {
1563         if (!(rt->u.dst.tclassid & 0xFFFF))
1564                 rt->u.dst.tclassid |= tag & 0xFFFF;
1565         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1566                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1567 }
1568 #endif
1569
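/*
 * Fill in the nexthop-derived fields of a new cache entry: the gateway,
 * metrics copied from the FIB info (with defaults for MTU, hoplimit and
 * advmss when unset) and, under CONFIG_NET_CLS_ROUTE, the class tags.
 */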
1570 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1571 {
1572         struct fib_info *fi = res->fi;
1573
1574         if (fi) {
1575                 if (FIB_RES_GW(*res) &&
1576                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1577                         rt->rt_gateway = FIB_RES_GW(*res);
1578                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1579                        sizeof(rt->u.dst.metrics));
1580                 if (fi->fib_mtu == 0) {
1581                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1582                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1583                             rt->rt_gateway != rt->rt_dst &&
1584                             rt->u.dst.dev->mtu > 576)
1585                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1586                 }
1587 #ifdef CONFIG_NET_CLS_ROUTE
1588                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1589 #endif
1590         } else
1591                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1592
1593         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1594                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1595         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1596                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1597         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1598                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1599                                        ip_rt_min_advmss);
1600         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1601                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1602
1603 #ifdef CONFIG_NET_CLS_ROUTE
1604 #ifdef CONFIG_IP_MULTIPLE_TABLES
1605         set_class_tag(rt, fib_rules_tclass(res));
1606 #endif
1607         set_class_tag(rt, itag);
1608 #endif
1609         rt->rt_type = res->type;
1610 }
1611
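/*
 * Build a routing cache entry for a multicast packet received on @dev.
 * "our" means the local host belongs to the group, so the packet is
 * also delivered locally via ip_local_deliver().
 */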
1612 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1613                                 u8 tos, struct net_device *dev, int our)
1614 {
1615         unsigned hash;
1616         struct rtable *rth;
1617         __be32 spec_dst;
1618         struct in_device *in_dev = in_dev_get(dev);
1619         u32 itag = 0;
1620
1621         /* Primary sanity checks. */
1622
1623         if (in_dev == NULL)
1624                 return -EINVAL;
1625
1626         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1627             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1628                 goto e_inval;
1629
1630         if (ipv4_is_zeronet(saddr)) {
1631                 if (!ipv4_is_local_multicast(daddr))
1632                         goto e_inval;
1633                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1634         } else if (fib_validate_source(saddr, 0, tos, 0,
1635                                         dev, &spec_dst, &itag) < 0)
1636                 goto e_inval;
1637
1638         rth = dst_alloc(&ipv4_dst_ops);
1639         if (!rth)
1640                 goto e_nobufs;
1641
1642         rth->u.dst.output= ip_rt_bug;
1643
1644         atomic_set(&rth->u.dst.__refcnt, 1);
1645         rth->u.dst.flags= DST_HOST;
1646         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1647                 rth->u.dst.flags |= DST_NOPOLICY;
1648         rth->fl.fl4_dst = daddr;
1649         rth->rt_dst     = daddr;
1650         rth->fl.fl4_tos = tos;
1651         rth->fl.mark    = skb->mark;
1652         rth->fl.fl4_src = saddr;
1653         rth->rt_src     = saddr;
1654 #ifdef CONFIG_NET_CLS_ROUTE
1655         rth->u.dst.tclassid = itag;
1656 #endif
1657         rth->rt_iif     =
1658         rth->fl.iif     = dev->ifindex;
1659         rth->u.dst.dev  = init_net.loopback_dev;
1660         dev_hold(rth->u.dst.dev);
1661         rth->idev       = in_dev_get(rth->u.dst.dev);
1662         rth->fl.oif     = 0;
1663         rth->rt_gateway = daddr;
1664         rth->rt_spec_dst= spec_dst;
1665         rth->rt_genid   = atomic_read(&rt_genid);
1666         rth->rt_flags   = RTCF_MULTICAST;
1667         rth->rt_type    = RTN_MULTICAST;
1668         if (our) {
1669                 rth->u.dst.input= ip_local_deliver;
1670                 rth->rt_flags |= RTCF_LOCAL;
1671         }
1672
1673 #ifdef CONFIG_IP_MROUTE
1674         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1675                 rth->u.dst.input = ip_mr_input;
1676 #endif
1677         RT_CACHE_STAT_INC(in_slow_mc);
1678
1679         in_dev_put(in_dev);
1680         hash = rt_hash(daddr, saddr, dev->ifindex);
1681         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1682
1683 e_nobufs:
1684         in_dev_put(in_dev);
1685         return -ENOBUFS;
1686
1687 e_inval:
1688         in_dev_put(in_dev);
1689         return -EINVAL;
1690 }
1691
1692
1693 static void ip_handle_martian_source(struct net_device *dev,
1694                                      struct in_device *in_dev,
1695                                      struct sk_buff *skb,
1696                                      __be32 daddr,
1697                                      __be32 saddr)
1698 {
1699         RT_CACHE_STAT_INC(in_martian_src);
1700 #ifdef CONFIG_IP_ROUTE_VERBOSE
1701         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1702                 /*
1703                  *      RFC 1812 recommendation: if the source is martian,
1704                  *      the only hint is the MAC header.
1705                  */
1706                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1707                         "%u.%u.%u.%u, on dev %s\n",
1708                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1709                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1710                         int i;
1711                         const unsigned char *p = skb_mac_header(skb);
1712                         printk(KERN_WARNING "ll header: ");
1713                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1714                                 printk("%02x", *p);
1715                                 if (i < (dev->hard_header_len - 1))
1716                                         printk(":");
1717                         }
1718                         printk("\n");
1719                 }
1720         }
1721 #endif
1722 }
1723
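/*
 * Construct (but do not hash) the cache entry for a forwarded packet:
 * validate the source address, decide whether an ICMP redirect should
 * be suggested (RTCF_DOREDIRECT) and wire up ip_forward()/ip_output().
 */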
1724 static inline int __mkroute_input(struct sk_buff *skb,
1725                                   struct fib_result* res,
1726                                   struct in_device *in_dev,
1727                                   __be32 daddr, __be32 saddr, u32 tos,
1728                                   struct rtable **result)
1729 {
1730
1731         struct rtable *rth;
1732         int err;
1733         struct in_device *out_dev;
1734         unsigned flags = 0;
1735         __be32 spec_dst;
1736         u32 itag;
1737
1738         /* get a working reference to the output device */
1739         out_dev = in_dev_get(FIB_RES_DEV(*res));
1740         if (out_dev == NULL) {
1741                 if (net_ratelimit())
1742                         printk(KERN_CRIT "Bug in ip_route_input" \
1743                                "_slow(). Please, report\n");
1744                 return -EINVAL;
1745         }
1746
1747
1748         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1749                                   in_dev->dev, &spec_dst, &itag);
1750         if (err < 0) {
1751                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1752                                          saddr);
1753
1754                 err = -EINVAL;
1755                 goto cleanup;
1756         }
1757
1758         if (err)
1759                 flags |= RTCF_DIRECTSRC;
1760
1761         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1762             (IN_DEV_SHARED_MEDIA(out_dev) ||
1763              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1764                 flags |= RTCF_DOREDIRECT;
1765
1766         if (skb->protocol != htons(ETH_P_IP)) {
1767                 /* Not IP (i.e. ARP). Do not create a route if it is
1768                  * invalid for proxy ARP. DNAT routes are always valid.
1769                  */
1770                 if (out_dev == in_dev) {
1771                         err = -EINVAL;
1772                         goto cleanup;
1773                 }
1774         }
1775
1776
1777         rth = dst_alloc(&ipv4_dst_ops);
1778         if (!rth) {
1779                 err = -ENOBUFS;
1780                 goto cleanup;
1781         }
1782
1783         atomic_set(&rth->u.dst.__refcnt, 1);
1784         rth->u.dst.flags= DST_HOST;
1785         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1786                 rth->u.dst.flags |= DST_NOPOLICY;
1787         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1788                 rth->u.dst.flags |= DST_NOXFRM;
1789         rth->fl.fl4_dst = daddr;
1790         rth->rt_dst     = daddr;
1791         rth->fl.fl4_tos = tos;
1792         rth->fl.mark    = skb->mark;
1793         rth->fl.fl4_src = saddr;
1794         rth->rt_src     = saddr;
1795         rth->rt_gateway = daddr;
1796         rth->rt_iif     =
1797                 rth->fl.iif     = in_dev->dev->ifindex;
1798         rth->u.dst.dev  = (out_dev)->dev;
1799         dev_hold(rth->u.dst.dev);
1800         rth->idev       = in_dev_get(rth->u.dst.dev);
1801         rth->fl.oif     = 0;
1802         rth->rt_spec_dst= spec_dst;
1803
1804         rth->u.dst.input = ip_forward;
1805         rth->u.dst.output = ip_output;
1806         rth->rt_genid = atomic_read(&rt_genid);
1807
1808         rt_set_nexthop(rth, res, itag);
1809
1810         rth->rt_flags = flags;
1811
1812         *result = rth;
1813         err = 0;
1814  cleanup:
1815         /* release the working reference to the output device */
1816         in_dev_put(out_dev);
1817         return err;
1818 }
1819
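/*
 * Thin wrapper around __mkroute_input(): pick a nexthop for multipath
 * routes, then insert the new entry into the routing cache hash table.
 */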
1820 static inline int ip_mkroute_input(struct sk_buff *skb,
1821                                    struct fib_result* res,
1822                                    const struct flowi *fl,
1823                                    struct in_device *in_dev,
1824                                    __be32 daddr, __be32 saddr, u32 tos)
1825 {
1826         struct rtable* rth = NULL;
1827         int err;
1828         unsigned hash;
1829
1830 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1831         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1832                 fib_select_multipath(fl, res);
1833 #endif
1834
1835         /* create a routing cache entry */
1836         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1837         if (err)
1838                 return err;
1839
1840         /* put it into the cache */
1841         hash = rt_hash(daddr, saddr, fl->iif);
1842         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1843 }
1844
1845 /*
1846  *      NOTE. We drop all packets that have a local source
1847  *      address, because every properly looped-back packet
1848  *      must already have the correct destination attached by the output routine.
1849  *
1850  *      This approach solves two big problems:
1851  *      1. Non-simplex devices are handled properly.
1852  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1853  */
1854
1855 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1856                                u8 tos, struct net_device *dev)
1857 {
1858         struct fib_result res;
1859         struct in_device *in_dev = in_dev_get(dev);
1860         struct flowi fl = { .nl_u = { .ip4_u =
1861                                       { .daddr = daddr,
1862                                         .saddr = saddr,
1863                                         .tos = tos,
1864                                         .scope = RT_SCOPE_UNIVERSE,
1865                                       } },
1866                             .mark = skb->mark,
1867                             .iif = dev->ifindex };
1868         unsigned        flags = 0;
1869         u32             itag = 0;
1870         struct rtable * rth;
1871         unsigned        hash;
1872         __be32          spec_dst;
1873         int             err = -EINVAL;
1874         int             free_res = 0;
1875         struct net    * net = dev->nd_net;
1876
1877         /* IP on this device is disabled. */
1878
1879         if (!in_dev)
1880                 goto out;
1881
1882         /* Check for the most weird martians, which cannot be detected
1883            by fib_lookup.
1884          */
1885
1886         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1887             ipv4_is_loopback(saddr))
1888                 goto martian_source;
1889
1890         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1891                 goto brd_input;
1892
1893         /* Accept zero source addresses only for limited broadcast;
1894          * I do not even know whether to fix this or not. Waiting for complaints :-)
1895          */
1896         if (ipv4_is_zeronet(saddr))
1897                 goto martian_source;
1898
1899         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1900             ipv4_is_loopback(daddr))
1901                 goto martian_destination;
1902
1903         /*
1904          *      Now we are ready to route the packet.
1905          */
1906         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1907                 if (!IN_DEV_FORWARD(in_dev))
1908                         goto e_hostunreach;
1909                 goto no_route;
1910         }
1911         free_res = 1;
1912
1913         RT_CACHE_STAT_INC(in_slow_tot);
1914
1915         if (res.type == RTN_BROADCAST)
1916                 goto brd_input;
1917
1918         if (res.type == RTN_LOCAL) {
1919                 int result;
1920                 result = fib_validate_source(saddr, daddr, tos,
1921                                              net->loopback_dev->ifindex,
1922                                              dev, &spec_dst, &itag);
1923                 if (result < 0)
1924                         goto martian_source;
1925                 if (result)
1926                         flags |= RTCF_DIRECTSRC;
1927                 spec_dst = daddr;
1928                 goto local_input;
1929         }
1930
1931         if (!IN_DEV_FORWARD(in_dev))
1932                 goto e_hostunreach;
1933         if (res.type != RTN_UNICAST)
1934                 goto martian_destination;
1935
1936         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1937 done:
1938         in_dev_put(in_dev);
1939         if (free_res)
1940                 fib_res_put(&res);
1941 out:    return err;
1942
1943 brd_input:
1944         if (skb->protocol != htons(ETH_P_IP))
1945                 goto e_inval;
1946
1947         if (ipv4_is_zeronet(saddr))
1948                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1949         else {
1950                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1951                                           &itag);
1952                 if (err < 0)
1953                         goto martian_source;
1954                 if (err)
1955                         flags |= RTCF_DIRECTSRC;
1956         }
1957         flags |= RTCF_BROADCAST;
1958         res.type = RTN_BROADCAST;
1959         RT_CACHE_STAT_INC(in_brd);
1960
1961 local_input:
1962         rth = dst_alloc(&ipv4_dst_ops);
1963         if (!rth)
1964                 goto e_nobufs;
1965
1966         rth->u.dst.output= ip_rt_bug;
1967         rth->rt_genid = atomic_read(&rt_genid);
1968
1969         atomic_set(&rth->u.dst.__refcnt, 1);
1970         rth->u.dst.flags= DST_HOST;
1971         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1972                 rth->u.dst.flags |= DST_NOPOLICY;
1973         rth->fl.fl4_dst = daddr;
1974         rth->rt_dst     = daddr;
1975         rth->fl.fl4_tos = tos;
1976         rth->fl.mark    = skb->mark;
1977         rth->fl.fl4_src = saddr;
1978         rth->rt_src     = saddr;
1979 #ifdef CONFIG_NET_CLS_ROUTE
1980         rth->u.dst.tclassid = itag;
1981 #endif
1982         rth->rt_iif     =
1983         rth->fl.iif     = dev->ifindex;
1984         rth->u.dst.dev  = net->loopback_dev;
1985         dev_hold(rth->u.dst.dev);
1986         rth->idev       = in_dev_get(rth->u.dst.dev);
1987         rth->rt_gateway = daddr;
1988         rth->rt_spec_dst= spec_dst;
1989         rth->u.dst.input= ip_local_deliver;
1990         rth->rt_flags   = flags|RTCF_LOCAL;
1991         if (res.type == RTN_UNREACHABLE) {
1992                 rth->u.dst.input= ip_error;
1993                 rth->u.dst.error= -err;
1994                 rth->rt_flags   &= ~RTCF_LOCAL;
1995         }
1996         rth->rt_type    = res.type;
1997         hash = rt_hash(daddr, saddr, fl.iif);
1998         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1999         goto done;
2000
2001 no_route:
2002         RT_CACHE_STAT_INC(in_no_route);
2003         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2004         res.type = RTN_UNREACHABLE;
2005         if (err == -ESRCH)
2006                 err = -ENETUNREACH;
2007         goto local_input;
2008
2009         /*
2010          *      Do not cache martian addresses: they should be logged (RFC1812)
2011          */
2012 martian_destination:
2013         RT_CACHE_STAT_INC(in_martian_dst);
2014 #ifdef CONFIG_IP_ROUTE_VERBOSE
2015         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2016                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2017                         "%u.%u.%u.%u, dev %s\n",
2018                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2019 #endif
2020
2021 e_hostunreach:
2022         err = -EHOSTUNREACH;
2023         goto done;
2024
2025 e_inval:
2026         err = -EINVAL;
2027         goto done;
2028
2029 e_nobufs:
2030         err = -ENOBUFS;
2031         goto done;
2032
2033 martian_source:
2034         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2035         goto e_inval;
2036 }
2037
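/*
 * Input routing fast path: look the flow up in the routing cache and
 * fall back to ip_route_input_mc()/ip_route_input_slow() on a miss.
 */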
2038 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2039                    u8 tos, struct net_device *dev)
2040 {
2041         struct rtable * rth;
2042         unsigned        hash;
2043         int iif = dev->ifindex;
2044         struct net *net;
2045
2046         net = dev->nd_net;
2047         tos &= IPTOS_RT_MASK;
2048         hash = rt_hash(daddr, saddr, iif);
2049
2050         rcu_read_lock();
2051         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2052              rth = rcu_dereference(rth->u.dst.rt_next)) {
2053                 if (rth->fl.fl4_dst == daddr &&
2054                     rth->fl.fl4_src == saddr &&
2055                     rth->fl.iif == iif &&
2056                     rth->fl.oif == 0 &&
2057                     rth->fl.mark == skb->mark &&
2058                     rth->fl.fl4_tos == tos &&
2059                     rth->u.dst.dev->nd_net == net &&
2060                     rth->rt_genid == atomic_read(&rt_genid)) {
2061                         dst_use(&rth->u.dst, jiffies);
2062                         RT_CACHE_STAT_INC(in_hit);
2063                         rcu_read_unlock();
2064                         skb->dst = (struct dst_entry*)rth;
2065                         return 0;
2066                 }
2067                 RT_CACHE_STAT_INC(in_hlist_search);
2068         }
2069         rcu_read_unlock();
2070
2071         /* Multicast recognition logic was moved from the route cache to here.
2072            The problem was that too many Ethernet cards have broken/missing
2073            hardware multicast filters :-( As a result, a host on a multicast
2074            network acquires a lot of useless route cache entries, e.g. for
2075            SDR messages from all over the world. Now we try to get rid of them.
2076            Really, provided the software IP multicast filter is organized
2077            reasonably (at least, hashed), it does not result in a slowdown
2078            compared with route cache reject entries.
2079            Note that multicast routers are not affected, because a
2080            route cache entry is created eventually.
2081          */
2082         if (ipv4_is_multicast(daddr)) {
2083                 struct in_device *in_dev;
2084
2085                 rcu_read_lock();
2086                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2087                         int our = ip_check_mc(in_dev, daddr, saddr,
2088                                 ip_hdr(skb)->protocol);
2089                         if (our
2090 #ifdef CONFIG_IP_MROUTE
2091                             || (!ipv4_is_local_multicast(daddr) &&
2092                                 IN_DEV_MFORWARD(in_dev))
2093 #endif
2094                             ) {
2095                                 rcu_read_unlock();
2096                                 return ip_route_input_mc(skb, daddr, saddr,
2097                                                          tos, dev, our);
2098                         }
2099                 }
2100                 rcu_read_unlock();
2101                 return -EINVAL;
2102         }
2103         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2104 }
2105
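/*
 * Construct (but do not hash) the cache entry for an output route,
 * classifying the destination as unicast, broadcast or multicast and
 * selecting the proper output handler.
 */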
2106 static inline int __mkroute_output(struct rtable **result,
2107                                    struct fib_result* res,
2108                                    const struct flowi *fl,
2109                                    const struct flowi *oldflp,
2110                                    struct net_device *dev_out,
2111                                    unsigned flags)
2112 {
2113         struct rtable *rth;
2114         struct in_device *in_dev;
2115         u32 tos = RT_FL_TOS(oldflp);
2116         int err = 0;
2117
2118         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2119                 return -EINVAL;
2120
2121         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2122                 res->type = RTN_BROADCAST;
2123         else if (ipv4_is_multicast(fl->fl4_dst))
2124                 res->type = RTN_MULTICAST;
2125         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2126                 return -EINVAL;
2127
2128         if (dev_out->flags & IFF_LOOPBACK)
2129                 flags |= RTCF_LOCAL;
2130
2131         /* get a working reference to the inet device */
2132         in_dev = in_dev_get(dev_out);
2133         if (!in_dev)
2134                 return -EINVAL;
2135
2136         if (res->type == RTN_BROADCAST) {
2137                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2138                 if (res->fi) {
2139                         fib_info_put(res->fi);
2140                         res->fi = NULL;
2141                 }
2142         } else if (res->type == RTN_MULTICAST) {
2143                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2144                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2145                                  oldflp->proto))
2146                         flags &= ~RTCF_LOCAL;
2147                 /* If a multicast route does not exist, use
2148                    the default one, but do not gateway in this case.
2149                    Yes, it is a hack.
2150                  */
2151                 if (res->fi && res->prefixlen < 4) {
2152                         fib_info_put(res->fi);
2153                         res->fi = NULL;
2154                 }
2155         }
2156
2157
2158         rth = dst_alloc(&ipv4_dst_ops);
2159         if (!rth) {
2160                 err = -ENOBUFS;
2161                 goto cleanup;
2162         }
2163
2164         atomic_set(&rth->u.dst.__refcnt, 1);
2165         rth->u.dst.flags= DST_HOST;
2166         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2167                 rth->u.dst.flags |= DST_NOXFRM;
2168         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2169                 rth->u.dst.flags |= DST_NOPOLICY;
2170
2171         rth->fl.fl4_dst = oldflp->fl4_dst;
2172         rth->fl.fl4_tos = tos;
2173         rth->fl.fl4_src = oldflp->fl4_src;
2174         rth->fl.oif     = oldflp->oif;
2175         rth->fl.mark    = oldflp->mark;
2176         rth->rt_dst     = fl->fl4_dst;
2177         rth->rt_src     = fl->fl4_src;
2178         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2179         /* get references to the devices that are to be held by the routing
2180            cache entry */
2181         rth->u.dst.dev  = dev_out;
2182         dev_hold(dev_out);
2183         rth->idev       = in_dev_get(dev_out);
2184         rth->rt_gateway = fl->fl4_dst;
2185         rth->rt_spec_dst= fl->fl4_src;
2186
2187         rth->u.dst.output=ip_output;
2188         rth->rt_genid = atomic_read(&rt_genid);
2189
2190         RT_CACHE_STAT_INC(out_slow_tot);
2191
2192         if (flags & RTCF_LOCAL) {
2193                 rth->u.dst.input = ip_local_deliver;
2194                 rth->rt_spec_dst = fl->fl4_dst;
2195         }
2196         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2197                 rth->rt_spec_dst = fl->fl4_src;
2198                 if (flags & RTCF_LOCAL &&
2199                     !(dev_out->flags & IFF_LOOPBACK)) {
2200                         rth->u.dst.output = ip_mc_output;
2201                         RT_CACHE_STAT_INC(out_slow_mc);
2202                 }
2203 #ifdef CONFIG_IP_MROUTE
2204                 if (res->type == RTN_MULTICAST) {
2205                         if (IN_DEV_MFORWARD(in_dev) &&
2206                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2207                                 rth->u.dst.input = ip_mr_input;
2208                                 rth->u.dst.output = ip_mc_output;
2209                         }
2210                 }
2211 #endif
2212         }
2213
2214         rt_set_nexthop(rth, res, 0);
2215
2216         rth->rt_flags = flags;
2217
2218         *result = rth;
2219  cleanup:
2220         /* release the working reference to the inet device */
2221         in_dev_put(in_dev);
2222
2223         return err;
2224 }
2225
2226 static inline int ip_mkroute_output(struct rtable **rp,
2227                                     struct fib_result* res,
2228                                     const struct flowi *fl,
2229                                     const struct flowi *oldflp,
2230                                     struct net_device *dev_out,
2231                                     unsigned flags)
2232 {
2233         struct rtable *rth = NULL;
2234         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2235         unsigned hash;
2236         if (err == 0) {
2237                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2238                 err = rt_intern_hash(hash, rth, rp);
2239         }
2240
2241         return err;
2242 }
2243
2244 /*
2245  * Major route resolver routine.
2246  */
2247
2248 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2249                                 const struct flowi *oldflp)
2250 {
2251         u32 tos = RT_FL_TOS(oldflp);
2252         struct flowi fl = { .nl_u = { .ip4_u =
2253                                       { .daddr = oldflp->fl4_dst,
2254                                         .saddr = oldflp->fl4_src,
2255                                         .tos = tos & IPTOS_RT_MASK,
2256                                         .scope = ((tos & RTO_ONLINK) ?
2257                                                   RT_SCOPE_LINK :
2258                                                   RT_SCOPE_UNIVERSE),
2259                                       } },
2260                             .mark = oldflp->mark,
2261                             .iif = net->loopback_dev->ifindex,
2262                             .oif = oldflp->oif };
2263         struct fib_result res;
2264         unsigned flags = 0;
2265         struct net_device *dev_out = NULL;
2266         int free_res = 0;
2267         int err;
2268
2269
2270         res.fi          = NULL;
2271 #ifdef CONFIG_IP_MULTIPLE_TABLES
2272         res.r           = NULL;
2273 #endif
2274
2275         if (oldflp->fl4_src) {
2276                 err = -EINVAL;
2277                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2278                     ipv4_is_lbcast(oldflp->fl4_src) ||
2279                     ipv4_is_zeronet(oldflp->fl4_src))
2280                         goto out;
2281
2282                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2283                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2284                 if (dev_out == NULL)
2285                         goto out;
2286
2287                 /* I removed the check for oif == dev_out->oif here.
2288                    It was wrong for two reasons:
2289                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2290                       is assigned to multiple interfaces.
2291                    2. Moreover, we are allowed to send packets with the saddr
2292                       of another iface. --ANK
2293                  */
2294
2295                 if (oldflp->oif == 0
2296                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2297                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2298                         /* Special hack: the user can direct multicasts
2299                            and limited broadcast via the necessary interface
2300                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2301                            This hack is not just for fun, it allows
2302                            vic, vat and friends to work.
2303                            They bind the socket to loopback, set ttl to zero
2304                            and expect that it will work.
2305                            From the viewpoint of the routing cache they are broken,
2306                            because we are not allowed to build a multicast path
2307                            with a loopback source addr (look, the routing cache
2308                            cannot know that ttl is zero, so that the packet
2309                            will not leave this host and the route is valid).
2310                            Luckily, this hack is a good workaround.
2311                          */
2312
2313                         fl.oif = dev_out->ifindex;
2314                         goto make_route;
2315                 }
2316                 if (dev_out)
2317                         dev_put(dev_out);
2318                 dev_out = NULL;
2319         }
2320
2321
2322         if (oldflp->oif) {
2323                 dev_out = dev_get_by_index(net, oldflp->oif);
2324                 err = -ENODEV;
2325                 if (dev_out == NULL)
2326                         goto out;
2327
2328                 /* RACE: Check return value of inet_select_addr instead. */
2329                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2330                         dev_put(dev_out);
2331                         goto out;       /* Wrong error code */
2332                 }
2333
2334                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2335                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2336                         if (!fl.fl4_src)
2337                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2338                                                               RT_SCOPE_LINK);
2339                         goto make_route;
2340                 }
2341                 if (!fl.fl4_src) {
2342                         if (ipv4_is_multicast(oldflp->fl4_dst))
2343                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2344                                                               fl.fl4_scope);
2345                         else if (!oldflp->fl4_dst)
2346                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2347                                                               RT_SCOPE_HOST);
2348                 }
2349         }
2350
2351         if (!fl.fl4_dst) {
2352                 fl.fl4_dst = fl.fl4_src;
2353                 if (!fl.fl4_dst)
2354                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2355                 if (dev_out)
2356                         dev_put(dev_out);
2357                 dev_out = net->loopback_dev;
2358                 dev_hold(dev_out);
2359                 fl.oif = net->loopback_dev->ifindex;
2360                 res.type = RTN_LOCAL;
2361                 flags |= RTCF_LOCAL;
2362                 goto make_route;
2363         }
2364
2365         if (fib_lookup(net, &fl, &res)) {
2366                 res.fi = NULL;
2367                 if (oldflp->oif) {
2368                         /* Apparently, the routing tables are wrong. Assume
2369                            that the destination is on-link.
2370 
2371                            WHY? DW.
2372                            Because we are allowed to send to an iface
2373                            even if it has NO routes and NO assigned
2374                            addresses. When oif is specified, the routing
2375                            tables are looked up with only one purpose:
2376                            to check whether the destination is gatewayed, rather than
2377                            direct. Moreover, if MSG_DONTROUTE is set,
2378                            we send the packet, ignoring both routing tables
2379                            and ifaddr state. --ANK
2380 
2381 
2382                            We could do this even if oif is unknown,
2383                            as IPv6 likely does, but we do not.
2384                          */
2385
2386                         if (fl.fl4_src == 0)
2387                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2388                                                               RT_SCOPE_LINK);
2389                         res.type = RTN_UNICAST;
2390                         goto make_route;
2391                 }
2392                 if (dev_out)
2393                         dev_put(dev_out);
2394                 err = -ENETUNREACH;
2395                 goto out;
2396         }
2397         free_res = 1;
2398
2399         if (res.type == RTN_LOCAL) {
2400                 if (!fl.fl4_src)
2401                         fl.fl4_src = fl.fl4_dst;
2402                 if (dev_out)
2403                         dev_put(dev_out);
2404                 dev_out = net->loopback_dev;
2405                 dev_hold(dev_out);
2406                 fl.oif = dev_out->ifindex;
2407                 if (res.fi)
2408                         fib_info_put(res.fi);
2409                 res.fi = NULL;
2410                 flags |= RTCF_LOCAL;
2411                 goto make_route;
2412         }
2413
2414 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2415         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2416                 fib_select_multipath(&fl, &res);
2417         else
2418 #endif
2419         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2420                 fib_select_default(net, &fl, &res);
2421
2422         if (!fl.fl4_src)
2423                 fl.fl4_src = FIB_RES_PREFSRC(res);
2424
2425         if (dev_out)
2426                 dev_put(dev_out);
2427         dev_out = FIB_RES_DEV(res);
2428         dev_hold(dev_out);
2429         fl.oif = dev_out->ifindex;
2430
2431
2432 make_route:
2433         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2434
2435
2436         if (free_res)
2437                 fib_res_put(&res);
2438         if (dev_out)
2439                 dev_put(dev_out);
2440 out:    return err;
2441 }
2442
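/*
 * Output routing fast path: search the cache under rcu_read_lock_bh()
 * and fall back to ip_route_output_slow() on a miss.
 */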
2443 int __ip_route_output_key(struct net *net, struct rtable **rp,
2444                           const struct flowi *flp)
2445 {
2446         unsigned hash;
2447         struct rtable *rth;
2448
2449         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2450
2451         rcu_read_lock_bh();
2452         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2453                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2454                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2455                     rth->fl.fl4_src == flp->fl4_src &&
2456                     rth->fl.iif == 0 &&
2457                     rth->fl.oif == flp->oif &&
2458                     rth->fl.mark == flp->mark &&
2459                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2460                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2461                     rth->u.dst.dev->nd_net == net &&
2462                     rth->rt_genid == atomic_read(&rt_genid)) {
2463                         dst_use(&rth->u.dst, jiffies);
2464                         RT_CACHE_STAT_INC(out_hit);
2465                         rcu_read_unlock_bh();
2466                         *rp = rth;
2467                         return 0;
2468                 }
2469                 RT_CACHE_STAT_INC(out_hlist_search);
2470         }
2471         rcu_read_unlock_bh();
2472
2473         return ip_route_output_slow(net, rp, flp);
2474 }
2475
2476 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2477
2478 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2479 {
2480 }
2481
2482 static struct dst_ops ipv4_dst_blackhole_ops = {
2483         .family                 =       AF_INET,
2484         .protocol               =       __constant_htons(ETH_P_IP),
2485         .destroy                =       ipv4_dst_destroy,
2486         .check                  =       ipv4_dst_check,
2487         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2488         .entry_size             =       sizeof(struct rtable),
2489         .entries                =       ATOMIC_INIT(0),
2490 };
2491
2492
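/*
 * Replace *rp with a "blackhole" copy of the route: same addresses,
 * metrics and flags, but input/output simply discard packets.  Used
 * below when __xfrm_lookup() returns -EREMOTE.
 */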
2493 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2494 {
2495         struct rtable *ort = *rp;
2496         struct rtable *rt = (struct rtable *)
2497                 dst_alloc(&ipv4_dst_blackhole_ops);
2498
2499         if (rt) {
2500                 struct dst_entry *new = &rt->u.dst;
2501
2502                 atomic_set(&new->__refcnt, 1);
2503                 new->__use = 1;
2504                 new->input = dst_discard;
2505                 new->output = dst_discard;
2506                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2507
2508                 new->dev = ort->u.dst.dev;
2509                 if (new->dev)
2510                         dev_hold(new->dev);
2511
2512                 rt->fl = ort->fl;
2513
2514                 rt->idev = ort->idev;
2515                 if (rt->idev)
2516                         in_dev_hold(rt->idev);
2517                 rt->rt_genid = atomic_read(&rt_genid);
2518                 rt->rt_flags = ort->rt_flags;
2519                 rt->rt_type = ort->rt_type;
2520                 rt->rt_dst = ort->rt_dst;
2521                 rt->rt_src = ort->rt_src;
2522                 rt->rt_iif = ort->rt_iif;
2523                 rt->rt_gateway = ort->rt_gateway;
2524                 rt->rt_spec_dst = ort->rt_spec_dst;
2525                 rt->peer = ort->peer;
2526                 if (rt->peer)
2527                         atomic_inc(&rt->peer->refcnt);
2528
2529                 dst_free(new);
2530         }
2531
2532         dst_release(&(*rp)->u.dst);
2533         *rp = rt;
2534         return (rt ? 0 : -ENOMEM);
2535 }
2536
2537 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2538                          struct sock *sk, int flags)
2539 {
2540         int err;
2541
2542         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2543                 return err;
2544
2545         if (flp->proto) {
2546                 if (!flp->fl4_src)
2547                         flp->fl4_src = (*rp)->rt_src;
2548                 if (!flp->fl4_dst)
2549                         flp->fl4_dst = (*rp)->rt_dst;
2550                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2551                                     flags ? XFRM_LOOKUP_WAIT : 0);
2552                 if (err == -EREMOTE)
2553                         err = ipv4_dst_blackhole(rp, flp, sk);
2554
2555                 return err;
2556         }
2557
2558         return 0;
2559 }
2560
2561 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2562
2563 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2564 {
2565         return ip_route_output_flow(net, rp, flp, NULL, 0);
2566 }
2567
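/*
 * Fill a netlink route message describing the cached route attached to
 * skb->dst; used by both inet_rtm_getroute() and the cache dump in
 * ip_rt_dump().
 */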
2568 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2569                         int nowait, unsigned int flags)
2570 {
2571         struct rtable *rt = (struct rtable*)skb->dst;
2572         struct rtmsg *r;
2573         struct nlmsghdr *nlh;
2574         long expires;
2575         u32 id = 0, ts = 0, tsage = 0, error;
2576
2577         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2578         if (nlh == NULL)
2579                 return -EMSGSIZE;
2580
2581         r = nlmsg_data(nlh);
2582         r->rtm_family    = AF_INET;
2583         r->rtm_dst_len  = 32;
2584         r->rtm_src_len  = 0;
2585         r->rtm_tos      = rt->fl.fl4_tos;
2586         r->rtm_table    = RT_TABLE_MAIN;
2587         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2588         r->rtm_type     = rt->rt_type;
2589         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2590         r->rtm_protocol = RTPROT_UNSPEC;
2591         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2592         if (rt->rt_flags & RTCF_NOTIFY)
2593                 r->rtm_flags |= RTM_F_NOTIFY;
2594
2595         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2596
2597         if (rt->fl.fl4_src) {
2598                 r->rtm_src_len = 32;
2599                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2600         }
2601         if (rt->u.dst.dev)
2602                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2603 #ifdef CONFIG_NET_CLS_ROUTE
2604         if (rt->u.dst.tclassid)
2605                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2606 #endif
2607         if (rt->fl.iif)
2608                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2609         else if (rt->rt_src != rt->fl.fl4_src)
2610                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2611
2612         if (rt->rt_dst != rt->rt_gateway)
2613                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2614
2615         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2616                 goto nla_put_failure;
2617
2618         error = rt->u.dst.error;
2619         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2620         if (rt->peer) {
2621                 id = rt->peer->ip_id_count;
2622                 if (rt->peer->tcp_ts_stamp) {
2623                         ts = rt->peer->tcp_ts;
2624                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2625                 }
2626         }
2627
2628         if (rt->fl.iif) {
2629 #ifdef CONFIG_IP_MROUTE
2630                 __be32 dst = rt->rt_dst;
2631
2632                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2633                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2634                         int err = ipmr_get_route(skb, r, nowait);
2635                         if (err <= 0) {
2636                                 if (!nowait) {
2637                                         if (err == 0)
2638                                                 return 0;
2639                                         goto nla_put_failure;
2640                                 } else {
2641                                         if (err == -EMSGSIZE)
2642                                                 goto nla_put_failure;
2643                                         error = err;
2644                                 }
2645                         }
2646                 } else
2647 #endif
2648                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2649         }
2650
2651         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2652                                expires, error) < 0)
2653                 goto nla_put_failure;
2654
2655         return nlmsg_end(skb, nlh);
2656
2657 nla_put_failure:
2658         nlmsg_cancel(skb, nlh);
2659         return -EMSGSIZE;
2660 }
2661
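/*
 * RTM_GETROUTE handler: resolve the requested flow, either through
 * ip_route_input() when an input interface is given or through
 * ip_route_output_key() otherwise, and report the result via
 * rt_fill_info().
 */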
2662 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2663 {
2664         struct net *net = in_skb->sk->sk_net;
2665         struct rtmsg *rtm;
2666         struct nlattr *tb[RTA_MAX+1];
2667         struct rtable *rt = NULL;
2668         __be32 dst = 0;
2669         __be32 src = 0;
2670         u32 iif;
2671         int err;
2672         struct sk_buff *skb;
2673
2674         if (net != &init_net)
2675                 return -EINVAL;
2676
2677         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2678         if (err < 0)
2679                 goto errout;
2680
2681         rtm = nlmsg_data(nlh);
2682
2683         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2684         if (skb == NULL) {
2685                 err = -ENOBUFS;
2686                 goto errout;
2687         }
2688
2689         /* Reserve room for dummy headers; this skb can pass
2690            through a good chunk of the routing engine.
2691          */
2692         skb_reset_mac_header(skb);
2693         skb_reset_network_header(skb);
2694
2695         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2696         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2697         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2698
2699         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2700         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2701         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2702
2703         if (iif) {
2704                 struct net_device *dev;
2705
2706                 dev = __dev_get_by_index(&init_net, iif);
2707                 if (dev == NULL) {
2708                         err = -ENODEV;
2709                         goto errout_free;
2710                 }
2711
2712                 skb->protocol   = htons(ETH_P_IP);
2713                 skb->dev        = dev;
2714                 local_bh_disable();
2715                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2716                 local_bh_enable();
2717
2718                 rt = (struct rtable*) skb->dst;
2719                 if (err == 0 && rt->u.dst.error)
2720                         err = -rt->u.dst.error;
2721         } else {
2722                 struct flowi fl = {
2723                         .nl_u = {
2724                                 .ip4_u = {
2725                                         .daddr = dst,
2726                                         .saddr = src,
2727                                         .tos = rtm->rtm_tos,
2728                                 },
2729                         },
2730                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2731                 };
2732                 err = ip_route_output_key(&init_net, &rt, &fl);
2733         }
2734
2735         if (err)
2736                 goto errout_free;
2737
2738         skb->dst = &rt->u.dst;
2739         if (rtm->rtm_flags & RTM_F_NOTIFY)
2740                 rt->rt_flags |= RTCF_NOTIFY;
2741
2742         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2743                                 RTM_NEWROUTE, 0, 0);
2744         if (err <= 0)
2745                 goto errout_free;
2746
2747         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2748 errout:
2749         return err;
2750
2751 errout_free:
2752         kfree_skb(skb);
2753         goto errout;
2754 }
2755
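/*
 * Dump the routing cache over netlink, one hash chain at a time,
 * resuming from cb->args[] across successive callbacks.
 */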
2756 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2757 {
2758         struct rtable *rt;
2759         int h, s_h;
2760         int idx, s_idx;
2761
2762         s_h = cb->args[0];
2763         if (s_h < 0)
2764                 s_h = 0;
2765         s_idx = idx = cb->args[1];
2766         for (h = s_h; h <= rt_hash_mask; h++) {
2767                 rcu_read_lock_bh();
2768                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2769                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2770                         if (idx < s_idx)
2771                                 continue;
2772                         if (rt->rt_genid != atomic_read(&rt_genid))
2773                                 continue;
2774                         skb->dst = dst_clone(&rt->u.dst);
2775                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2776                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2777                                          1, NLM_F_MULTI) <= 0) {
2778                                 dst_release(xchg(&skb->dst, NULL));
2779                                 rcu_read_unlock_bh();
2780                                 goto done;
2781                         }
2782                         dst_release(xchg(&skb->dst, NULL));
2783                 }
2784                 rcu_read_unlock_bh();
2785                 s_idx = 0;
2786         }
2787
2788 done:
2789         cb->args[0] = h;
2790         cb->args[1] = idx;
2791         return skb->len;
2792 }
2793
2794 void ip_rt_multicast_event(struct in_device *in_dev)
2795 {
2796         rt_cache_flush(0);
2797 }
2798
2799 #ifdef CONFIG_SYSCTL
2800 static int flush_delay;
2801
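/*
 * Writing to /proc/sys/net/ipv4/route/flush (e.g. "echo 1 > .../flush")
 * flushes the routing cache via rt_cache_flush(), with the written
 * value used as the flush delay; reads are rejected.
 */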
2802 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2803                                         struct file *filp, void __user *buffer,
2804                                         size_t *lenp, loff_t *ppos)
2805 {
2806         if (write) {
2807                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2808                 rt_cache_flush(flush_delay);
2809                 return 0;
2810         }
2811
2812         return -EINVAL;
2813 }
2814
2815 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2816                                                 int __user *name,
2817                                                 int nlen,
2818                                                 void __user *oldval,
2819                                                 size_t __user *oldlenp,
2820                                                 void __user *newval,
2821                                                 size_t newlen)
2822 {
2823         int delay;
2824         if (newlen != sizeof(int))
2825                 return -EINVAL;
2826         if (get_user(delay, (int __user *)newval))
2827                 return -EFAULT;
2828         rt_cache_flush(delay);
2829         return 0;
2830 }
2831
2832 ctl_table ipv4_route_table[] = {
2833         {
2834                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2835                 .procname       = "flush",
2836                 .data           = &flush_delay,
2837                 .maxlen         = sizeof(int),
2838                 .mode           = 0200,
2839                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2840                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2841         },
2842         {
2843                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2844                 .procname       = "gc_thresh",
2845                 .data           = &ipv4_dst_ops.gc_thresh,
2846                 .maxlen         = sizeof(int),
2847                 .mode           = 0644,
2848                 .proc_handler   = &proc_dointvec,
2849         },
2850         {
2851                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2852                 .procname       = "max_size",
2853                 .data           = &ip_rt_max_size,
2854                 .maxlen         = sizeof(int),
2855                 .mode           = 0644,
2856                 .proc_handler   = &proc_dointvec,
2857         },
2858         {
2859                 /*  Deprecated. Use gc_min_interval_ms */
2860
2861                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2862                 .procname       = "gc_min_interval",
2863                 .data           = &ip_rt_gc_min_interval,
2864                 .maxlen         = sizeof(int),
2865                 .mode           = 0644,
2866                 .proc_handler   = &proc_dointvec_jiffies,
2867                 .strategy       = &sysctl_jiffies,
2868         },
2869         {
2870                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2871                 .procname       = "gc_min_interval_ms",
2872                 .data           = &ip_rt_gc_min_interval,
2873                 .maxlen         = sizeof(int),
2874                 .mode           = 0644,
2875                 .proc_handler   = &proc_dointvec_ms_jiffies,
2876                 .strategy       = &sysctl_ms_jiffies,
2877         },
2878         {
2879                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2880                 .procname       = "gc_timeout",
2881                 .data           = &ip_rt_gc_timeout,
2882                 .maxlen         = sizeof(int),
2883                 .mode           = 0644,
2884                 .proc_handler   = &proc_dointvec_jiffies,
2885                 .strategy       = &sysctl_jiffies,
2886         },
2887         {
2888                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2889                 .procname       = "gc_interval",
2890                 .data           = &ip_rt_gc_interval,
2891                 .maxlen         = sizeof(int),
2892                 .mode           = 0644,
2893                 .proc_handler   = &proc_dointvec_jiffies,
2894                 .strategy       = &sysctl_jiffies,
2895         },
2896         {
2897                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2898                 .procname       = "redirect_load",
2899                 .data           = &ip_rt_redirect_load,
2900                 .maxlen         = sizeof(int),
2901                 .mode           = 0644,
2902                 .proc_handler   = &proc_dointvec,
2903         },
2904         {
2905                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2906                 .procname       = "redirect_number",
2907                 .data           = &ip_rt_redirect_number,
2908                 .maxlen         = sizeof(int),
2909                 .mode           = 0644,
2910                 .proc_handler   = &proc_dointvec,
2911         },
2912         {
2913                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2914                 .procname       = "redirect_silence",
2915                 .data           = &ip_rt_redirect_silence,
2916                 .maxlen         = sizeof(int),
2917                 .mode           = 0644,
2918                 .proc_handler   = &proc_dointvec,
2919         },
2920         {
2921                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2922                 .procname       = "error_cost",
2923                 .data           = &ip_rt_error_cost,
2924                 .maxlen         = sizeof(int),
2925                 .mode           = 0644,
2926                 .proc_handler   = &proc_dointvec,
2927         },
2928         {
2929                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2930                 .procname       = "error_burst",
2931                 .data           = &ip_rt_error_burst,
2932                 .maxlen         = sizeof(int),
2933                 .mode           = 0644,
2934                 .proc_handler   = &proc_dointvec,
2935         },
2936         {
2937                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2938                 .procname       = "gc_elasticity",
2939                 .data           = &ip_rt_gc_elasticity,
2940                 .maxlen         = sizeof(int),
2941                 .mode           = 0644,
2942                 .proc_handler   = &proc_dointvec,
2943         },
2944         {
2945                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2946                 .procname       = "mtu_expires",
2947                 .data           = &ip_rt_mtu_expires,
2948                 .maxlen         = sizeof(int),
2949                 .mode           = 0644,
2950                 .proc_handler   = &proc_dointvec_jiffies,
2951                 .strategy       = &sysctl_jiffies,
2952         },
2953         {
2954                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2955                 .procname       = "min_pmtu",
2956                 .data           = &ip_rt_min_pmtu,
2957                 .maxlen         = sizeof(int),
2958                 .mode           = 0644,
2959                 .proc_handler   = &proc_dointvec,
2960         },
2961         {
2962                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2963                 .procname       = "min_adv_mss",
2964                 .data           = &ip_rt_min_advmss,
2965                 .maxlen         = sizeof(int),
2966                 .mode           = 0644,
2967                 .proc_handler   = &proc_dointvec,
2968         },
2969         {
2970                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2971                 .procname       = "secret_interval",
2972                 .data           = &ip_rt_secret_interval,
2973                 .maxlen         = sizeof(int),
2974                 .mode           = 0644,
2975                 .proc_handler   = &proc_dointvec_jiffies,
2976                 .strategy       = &sysctl_jiffies,
2977         },
2978         { .ctl_name = 0 }
2979 };
2980 #endif
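/*
 * Every entry in ipv4_route_table above shows up as a file under
 * /proc/sys/net/ipv4/route/ (assuming the usual registration of the
 * table).  For example, the garbage-collection threshold can be tuned at
 * run time with something like
 *
 *	sysctl -w net.ipv4.route.gc_thresh=65536
 *
 * where 65536 is purely illustrative; ip_rt_init() below derives the
 * default from the size of the route-cache hash table.
 */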
2981
2982 #ifdef CONFIG_NET_CLS_ROUTE
2983 struct ip_rt_acct *ip_rt_acct __read_mostly;
2984 #endif /* CONFIG_NET_CLS_ROUTE */
2985
2986 static __initdata unsigned long rhash_entries;
2987 static int __init set_rhash_entries(char *str)
2988 {
2989         if (!str)
2990                 return 0;
2991         rhash_entries = simple_strtoul(str, &str, 0);
2992         return 1;
2993 }
2994 __setup("rhash_entries=", set_rhash_entries);
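/*
 * "rhash_entries=N" on the kernel command line overrides the
 * automatically chosen size of the route-cache hash table allocated in
 * ip_rt_init() below; the value is still adjusted (typically to a power
 * of two) by alloc_large_system_hash().  Example: booting with
 * rhash_entries=4096.
 */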
2995
2996 int __init ip_rt_init(void)
2997 {
2998         int rc = 0;
2999
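        /* Seed the route-cache generation counter with a value that
         * should differ from boot to boot (a mix of the machine's memory
         * size and the current jiffies).
         */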
3000         atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3001                              (jiffies ^ (jiffies >> 7))));
3002
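        /* Per-CPU accounting buffers for the routing classifier: 256
         * slots per CPU, presumably one per routing realm.
         */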
3003 #ifdef CONFIG_NET_CLS_ROUTE
3004         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3005         if (!ip_rt_acct)
3006                 panic("IP: failed to allocate ip_rt_acct\n");
3007 #endif
3008
3009         ipv4_dst_ops.kmem_cachep =
3010                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3011                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3012
3013         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3014
3015         rt_hash_table = (struct rt_hash_bucket *)
3016                 alloc_large_system_hash("IP route cache",
3017                                         sizeof(struct rt_hash_bucket),
3018                                         rhash_entries,
3019                                         (num_physpages >= 128 * 1024) ?
3020                                         15 : 17,
3021                                         0,
3022                                         &rt_hash_log,
3023                                         &rt_hash_mask,
3024                                         0);
3025         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3026         rt_hash_lock_init();
3027
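        /* With the hash table sized above, let garbage collection kick in
         * once the cache holds roughly one entry per bucket, and cap the
         * cache at an average of 16 entries per bucket.
         */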
3028         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3029         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3030
3031         devinet_init();
3032         ip_fib_init();
3033
3034         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3035
3036         /* All the timers started at system startup tend to
3037            synchronize. Perturb the start time a bit.
3038          */
3039         schedule_delayed_work(&expires_work,
3040                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3041
3042         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3043                 ip_rt_secret_interval;
3044         add_timer(&rt_secret_timer);
3045
3046         if (ip_rt_proc_init(&init_net))
3047                 printk(KERN_ERR "Unable to create route proc files\n");
3048 #ifdef CONFIG_XFRM
3049         xfrm_init();
3050         xfrm4_init();
3051 #endif
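        /* Answer RTM_GETROUTE requests (e.g. "ip route get") through
         * inet_rtm_getroute.
         */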
3052         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3053
3054         return rc;
3055 }
3056
3057 EXPORT_SYMBOL(__ip_select_ident);
3058 EXPORT_SYMBOL(ip_route_input);
3059 EXPORT_SYMBOL(ip_route_output_key);