[NETNS]: Process /proc/net/rt_cache inside a namespace.
[safe/jmp/linux-2.6] net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15  *
16  * Fixes:
17  *              Alan Cox        :       Verify area fixes.
18  *              Alan Cox        :       cli() protects routing changes
19  *              Rui Oliveira    :       ICMP routing table updates
20  *              (rco@di.uminho.pt)      Routing table insertion and update
21  *              Linus Torvalds  :       Rewrote bits to be sensible
22  *              Alan Cox        :       Added BSD route gw semantics
23  *              Alan Cox        :       Super /proc >4K
24  *              Alan Cox        :       MTU in route table
25  *              Alan Cox        :       MSS actually. Also added the window
26  *                                      clamper.
27  *              Sam Lantinga    :       Fixed route matching in rt_del()
28  *              Alan Cox        :       Routing cache support.
29  *              Alan Cox        :       Removed compatibility cruft.
30  *              Alan Cox        :       RTF_REJECT support.
31  *              Alan Cox        :       TCP irtt support.
32  *              Jonathan Naylor :       Added Metric support.
33  *      Miquel van Smoorenburg  :       BSD API fixes.
34  *      Miquel van Smoorenburg  :       Metrics.
35  *              Alan Cox        :       Use __u32 properly
36  *              Alan Cox        :       Aligned routing errors more closely with BSD
37  *                                      our system is still very different.
38  *              Alan Cox        :       Faster /proc handling
39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
40  *                                      routing caches and better behaviour.
41  *
42  *              Olaf Erb        :       irtt wasn't being copied right.
43  *              Bjorn Ekwall    :       Kerneld route support.
44  *              Alan Cox        :       Multicast fixed (I hope)
45  *              Pavel Krauz     :       Limited broadcast fixed
46  *              Mike McLagan    :       Routing by source
47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
48  *                                      route.c and rewritten from scratch.
49  *              Andi Kleen      :       Load-limit warning messages.
50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
54  *              Marc Boucher    :       routing by fwmark
55  *      Robert Olsson           :       Added rt_cache statistics
56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
60  *
61  *              This program is free software; you can redistribute it and/or
62  *              modify it under the terms of the GNU General Public License
63  *              as published by the Free Software Foundation; either version
64  *              2 of the License, or (at your option) any later version.
65  */
66
67 #include <linux/module.h>
68 #include <asm/uaccess.h>
69 #include <asm/system.h>
70 #include <linux/bitops.h>
71 #include <linux/types.h>
72 #include <linux/kernel.h>
73 #include <linux/mm.h>
74 #include <linux/bootmem.h>
75 #include <linux/string.h>
76 #include <linux/socket.h>
77 #include <linux/sockios.h>
78 #include <linux/errno.h>
79 #include <linux/in.h>
80 #include <linux/inet.h>
81 #include <linux/netdevice.h>
82 #include <linux/proc_fs.h>
83 #include <linux/init.h>
84 #include <linux/workqueue.h>
85 #include <linux/skbuff.h>
86 #include <linux/inetdevice.h>
87 #include <linux/igmp.h>
88 #include <linux/pkt_sched.h>
89 #include <linux/mroute.h>
90 #include <linux/netfilter_ipv4.h>
91 #include <linux/random.h>
92 #include <linux/jhash.h>
93 #include <linux/rcupdate.h>
94 #include <linux/times.h>
95 #include <net/dst.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/netevent.h>
108 #include <net/rtnetlink.h>
109 #ifdef CONFIG_SYSCTL
110 #include <linux/sysctl.h>
111 #endif
112
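/* Reduce a flow's TOS to the bits relevant for route lookup (IPTOS_RT_MASK)
 * plus the RTO_ONLINK flag, which is carried in the same field.
 */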
113 #define RT_FL_TOS(oldflp) \
114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
115
116 #define IP_MAX_MTU      0xFFF0
117
118 #define RT_GC_TIMEOUT (300*HZ)
119
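/* Defaults for the ip_rt_* tunables; time values are in jiffies. */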
120 static int ip_rt_max_size;
121 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
122 static int ip_rt_gc_interval            = 60 * HZ;
123 static int ip_rt_gc_min_interval        = HZ / 2;
124 static int ip_rt_redirect_number        = 9;
125 static int ip_rt_redirect_load          = HZ / 50;
126 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
127 static int ip_rt_error_cost             = HZ;
128 static int ip_rt_error_burst            = 5 * HZ;
129 static int ip_rt_gc_elasticity          = 8;
130 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
131 static int ip_rt_min_pmtu               = 512 + 20 + 20;
132 static int ip_rt_min_advmss             = 256;
133 static int ip_rt_secret_interval        = 10 * 60 * HZ;
134
135 #define RTprint(a...)   printk(KERN_DEBUG a)
136
137 static void rt_worker_func(struct work_struct *work);
138 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
139 static struct timer_list rt_secret_timer;
140
141 /*
142  *      Interface to generic destination cache.
143  */
144
145 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
146 static void              ipv4_dst_destroy(struct dst_entry *dst);
147 static void              ipv4_dst_ifdown(struct dst_entry *dst,
148                                          struct net_device *dev, int how);
149 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
150 static void              ipv4_link_failure(struct sk_buff *skb);
151 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
152 static int rt_garbage_collect(struct dst_ops *ops);
153
154
155 static struct dst_ops ipv4_dst_ops = {
156         .family =               AF_INET,
157         .protocol =             __constant_htons(ETH_P_IP),
158         .gc =                   rt_garbage_collect,
159         .check =                ipv4_dst_check,
160         .destroy =              ipv4_dst_destroy,
161         .ifdown =               ipv4_dst_ifdown,
162         .negative_advice =      ipv4_negative_advice,
163         .link_failure =         ipv4_link_failure,
164         .update_pmtu =          ip_rt_update_pmtu,
165         .local_out =            ip_local_out,
166         .entry_size =           sizeof(struct rtable),
167         .entries =              ATOMIC_INIT(0),
168 };
169
170 #define ECN_OR_COST(class)      TC_PRIO_##class
171
172 const __u8 ip_tos2prio[16] = {
173         TC_PRIO_BESTEFFORT,
174         ECN_OR_COST(FILLER),
175         TC_PRIO_BESTEFFORT,
176         ECN_OR_COST(BESTEFFORT),
177         TC_PRIO_BULK,
178         ECN_OR_COST(BULK),
179         TC_PRIO_BULK,
180         ECN_OR_COST(BULK),
181         TC_PRIO_INTERACTIVE,
182         ECN_OR_COST(INTERACTIVE),
183         TC_PRIO_INTERACTIVE,
184         ECN_OR_COST(INTERACTIVE),
185         TC_PRIO_INTERACTIVE_BULK,
186         ECN_OR_COST(INTERACTIVE_BULK),
187         TC_PRIO_INTERACTIVE_BULK,
188         ECN_OR_COST(INTERACTIVE_BULK)
189 };
190
191
192 /*
193  * Route cache.
194  */
195
196 /* The locking scheme is rather straightforward:
197  *
198  * 1) Read-Copy Update protects the buckets of the central route hash.
199  * 2) Only writers remove entries, and they hold the lock
200  *    as they look at rtable reference counts.
201  * 3) Only readers acquire references to rtable entries,
202  *    they do so with atomic increments and with the
203  *    lock held.
204  */
205
206 struct rt_hash_bucket {
207         struct rtable   *chain;
208 };
209 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
210         defined(CONFIG_PROVE_LOCKING)
211 /*
212  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks.
213  * The size of this table is a power of two and depends on the number of CPUs.
214  * (On lockdep we have quite a big spinlock_t, so keep the size down there.)
215  */
216 #ifdef CONFIG_LOCKDEP
217 # define RT_HASH_LOCK_SZ        256
218 #else
219 # if NR_CPUS >= 32
220 #  define RT_HASH_LOCK_SZ       4096
221 # elif NR_CPUS >= 16
222 #  define RT_HASH_LOCK_SZ       2048
223 # elif NR_CPUS >= 8
224 #  define RT_HASH_LOCK_SZ       1024
225 # elif NR_CPUS >= 4
226 #  define RT_HASH_LOCK_SZ       512
227 # else
228 #  define RT_HASH_LOCK_SZ       256
229 # endif
230 #endif
231
232 static spinlock_t       *rt_hash_locks;
233 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
234
235 static __init void rt_hash_lock_init(void)
236 {
237         int i;
238
239         rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ,
240                         GFP_KERNEL);
241         if (!rt_hash_locks)
242                 panic("IP: failed to allocate rt_hash_locks\n");
243
244         for (i = 0; i < RT_HASH_LOCK_SZ; i++)
245                 spin_lock_init(&rt_hash_locks[i]);
246 }
247 #else
248 # define rt_hash_lock_addr(slot) NULL
249
250 static inline void rt_hash_lock_init(void)
251 {
252 }
253 #endif
254
255 static struct rt_hash_bucket    *rt_hash_table;
256 static unsigned                 rt_hash_mask;
257 static unsigned int             rt_hash_log;
258 static atomic_t                 rt_genid;
259
260 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
261 #define RT_CACHE_STAT_INC(field) \
262         (__raw_get_cpu_var(rt_cache_stat).field++)
263
264 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
265 {
266         return jhash_2words(daddr, saddr, atomic_read(&rt_genid))
267                 & rt_hash_mask;
268 }
269
270 #define rt_hash(daddr, saddr, idx) \
271         rt_hash_code((__force u32)(__be32)(daddr),\
272                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
273
274 #ifdef CONFIG_PROC_FS
275 struct rt_cache_iter_state {
276         struct seq_net_private p;
277         int bucket;
278         int genid;
279 };
280
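/* Iterator helpers for /proc/net/rt_cache: walk the hash table from the last
 * bucket down under rcu_read_lock_bh(), showing only entries whose device
 * belongs to the iterator's network namespace and whose rt_genid matches the
 * generation sampled at seq_start time.
 */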
281 static struct rtable *rt_cache_get_first(struct rt_cache_iter_state *st)
282 {
283         struct rtable *r = NULL;
284
285         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
286                 rcu_read_lock_bh();
287                 r = rcu_dereference(rt_hash_table[st->bucket].chain);
288                 while (r) {
289                         if (r->u.dst.dev->nd_net == st->p.net &&
290                             r->rt_genid == st->genid)
291                                 return r;
292                         r = rcu_dereference(r->u.dst.rt_next);
293                 }
294                 rcu_read_unlock_bh();
295         }
296         return r;
297 }
298
299 static struct rtable *__rt_cache_get_next(struct rt_cache_iter_state *st,
300                                           struct rtable *r)
301 {
302         r = r->u.dst.rt_next;
303         while (!r) {
304                 rcu_read_unlock_bh();
305                 if (--st->bucket < 0)
306                         break;
307                 rcu_read_lock_bh();
308                 r = rt_hash_table[st->bucket].chain;
309         }
310         return rcu_dereference(r);
311 }
312
313 static struct rtable *rt_cache_get_next(struct rt_cache_iter_state *st,
314                                         struct rtable *r)
315 {
316         while ((r = __rt_cache_get_next(st, r)) != NULL) {
317                 if (r->u.dst.dev->nd_net != st->p.net)
318                         continue;
319                 if (r->rt_genid == st->genid)
320                         break;
321         }
322         return r;
323 }
324
325 static struct rtable *rt_cache_get_idx(struct rt_cache_iter_state *st, loff_t pos)
326 {
327         struct rtable *r = rt_cache_get_first(st);
328
329         if (r)
330                 while (pos && (r = rt_cache_get_next(st, r)))
331                         --pos;
332         return pos ? NULL : r;
333 }
334
335 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
336 {
337         struct rt_cache_iter_state *st = seq->private;
338
339         if (*pos)
340                 return rt_cache_get_idx(st, *pos - 1);
341         st->genid = atomic_read(&rt_genid);
342         return SEQ_START_TOKEN;
343 }
344
345 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
346 {
347         struct rtable *r;
348         struct rt_cache_iter_state *st = seq->private;
349
350         if (v == SEQ_START_TOKEN)
351                 r = rt_cache_get_first(st);
352         else
353                 r = rt_cache_get_next(st, v);
354         ++*pos;
355         return r;
356 }
357
358 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
359 {
360         if (v && v != SEQ_START_TOKEN)
361                 rcu_read_unlock_bh();
362 }
363
364 static int rt_cache_seq_show(struct seq_file *seq, void *v)
365 {
366         if (v == SEQ_START_TOKEN)
367                 seq_printf(seq, "%-127s\n",
368                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
369                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
370                            "HHUptod\tSpecDst");
371         else {
372                 struct rtable *r = v;
373                 char temp[256];
374
375                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
376                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
377                         r->u.dst.dev ? r->u.dst.dev->name : "*",
378                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
379                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
380                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
381                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
382                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
383                         dst_metric(&r->u.dst, RTAX_WINDOW),
384                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
385                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
386                         r->fl.fl4_tos,
387                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
388                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
389                                        dev_queue_xmit) : 0,
390                         r->rt_spec_dst);
391                 seq_printf(seq, "%-127s\n", temp);
392         }
393         return 0;
394 }
395
396 static const struct seq_operations rt_cache_seq_ops = {
397         .start  = rt_cache_seq_start,
398         .next   = rt_cache_seq_next,
399         .stop   = rt_cache_seq_stop,
400         .show   = rt_cache_seq_show,
401 };
402
403 static int rt_cache_seq_open(struct inode *inode, struct file *file)
404 {
405         return seq_open_net(inode, file, &rt_cache_seq_ops,
406                         sizeof(struct rt_cache_iter_state));
407 }
408
409 static const struct file_operations rt_cache_seq_fops = {
410         .owner   = THIS_MODULE,
411         .open    = rt_cache_seq_open,
412         .read    = seq_read,
413         .llseek  = seq_lseek,
414         .release = seq_release_net,
415 };
416
417
418 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
419 {
420         int cpu;
421
422         if (*pos == 0)
423                 return SEQ_START_TOKEN;
424
425         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
426                 if (!cpu_possible(cpu))
427                         continue;
428                 *pos = cpu+1;
429                 return &per_cpu(rt_cache_stat, cpu);
430         }
431         return NULL;
432 }
433
434 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
435 {
436         int cpu;
437
438         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
439                 if (!cpu_possible(cpu))
440                         continue;
441                 *pos = cpu+1;
442                 return &per_cpu(rt_cache_stat, cpu);
443         }
444         return NULL;
445
446 }
447
448 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
449 {
450
451 }
452
453 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
454 {
455         struct rt_cache_stat *st = v;
456
457         if (v == SEQ_START_TOKEN) {
458                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
459                 return 0;
460         }
461
462         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
463                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
464                    atomic_read(&ipv4_dst_ops.entries),
465                    st->in_hit,
466                    st->in_slow_tot,
467                    st->in_slow_mc,
468                    st->in_no_route,
469                    st->in_brd,
470                    st->in_martian_dst,
471                    st->in_martian_src,
472
473                    st->out_hit,
474                    st->out_slow_tot,
475                    st->out_slow_mc,
476
477                    st->gc_total,
478                    st->gc_ignored,
479                    st->gc_goal_miss,
480                    st->gc_dst_overflow,
481                    st->in_hlist_search,
482                    st->out_hlist_search
483                 );
484         return 0;
485 }
486
487 static const struct seq_operations rt_cpu_seq_ops = {
488         .start  = rt_cpu_seq_start,
489         .next   = rt_cpu_seq_next,
490         .stop   = rt_cpu_seq_stop,
491         .show   = rt_cpu_seq_show,
492 };
493
494
495 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
496 {
497         return seq_open(file, &rt_cpu_seq_ops);
498 }
499
500 static const struct file_operations rt_cpu_seq_fops = {
501         .owner   = THIS_MODULE,
502         .open    = rt_cpu_seq_open,
503         .read    = seq_read,
504         .llseek  = seq_lseek,
505         .release = seq_release,
506 };
507
508 #ifdef CONFIG_NET_CLS_ROUTE
509 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
510                            int length, int *eof, void *data)
511 {
512         unsigned int i;
513
514         if ((offset & 3) || (length & 3))
515                 return -EIO;
516
517         if (offset >= sizeof(struct ip_rt_acct) * 256) {
518                 *eof = 1;
519                 return 0;
520         }
521
522         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
523                 length = sizeof(struct ip_rt_acct) * 256 - offset;
524                 *eof = 1;
525         }
526
527         offset /= sizeof(u32);
528
529         if (length > 0) {
530                 u32 *dst = (u32 *) buffer;
531
532                 *start = buffer;
533                 memset(dst, 0, length);
534
535                 for_each_possible_cpu(i) {
536                         unsigned int j;
537                         u32 *src;
538
539                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
540                         for (j = 0; j < length/4; j++)
541                                 dst[j] += src[j];
542                 }
543         }
544         return length;
545 }
546 #endif
547
548 static __init int ip_rt_proc_init(struct net *net)
549 {
550         struct proc_dir_entry *pde;
551
552         pde = proc_net_fops_create(net, "rt_cache", S_IRUGO,
553                         &rt_cache_seq_fops);
554         if (!pde)
555                 goto err1;
556
557         pde = proc_create("rt_cache", S_IRUGO,
558                           net->proc_net_stat, &rt_cpu_seq_fops);
559         if (!pde)
560                 goto err2;
561
562 #ifdef CONFIG_NET_CLS_ROUTE
563         pde = create_proc_read_entry("rt_acct", 0, net->proc_net,
564                         ip_rt_acct_read, NULL);
565         if (!pde)
566                 goto err3;
567 #endif
568         return 0;
569
570 #ifdef CONFIG_NET_CLS_ROUTE
571 err3:
572         remove_proc_entry("rt_cache", net->proc_net_stat);
573 #endif
574 err2:
575         remove_proc_entry("rt_cache", net->proc_net);
576 err1:
577         return -ENOMEM;
578 }
579 #else
580 static inline int ip_rt_proc_init(struct net *net)
581 {
582         return 0;
583 }
584 #endif /* CONFIG_PROC_FS */
585
586 static __inline__ void rt_free(struct rtable *rt)
587 {
588         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
589 }
590
591 static __inline__ void rt_drop(struct rtable *rt)
592 {
593         ip_rt_put(rt);
594         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
595 }
596
597 static __inline__ int rt_fast_clean(struct rtable *rth)
598 {
599         /* Kill broadcast/multicast entries very aggressively, if they
600            collide in the hash table with more useful entries */
601         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
602                 rth->fl.iif && rth->u.dst.rt_next;
603 }
604
605 static __inline__ int rt_valuable(struct rtable *rth)
606 {
607         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
608                 rth->u.dst.expires;
609 }
610
611 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
612 {
613         unsigned long age;
614         int ret = 0;
615
616         if (atomic_read(&rth->u.dst.__refcnt))
617                 goto out;
618
619         ret = 1;
620         if (rth->u.dst.expires &&
621             time_after_eq(jiffies, rth->u.dst.expires))
622                 goto out;
623
624         age = jiffies - rth->u.dst.lastuse;
625         ret = 0;
626         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
627             (age <= tmo2 && rt_valuable(rth)))
628                 goto out;
629         ret = 1;
630 out:    return ret;
631 }
632
633 /* Bits of score are:
634  * 31: very valuable
635  * 30: not quite useless
636  * 29..0: usage counter
637  */
638 static inline u32 rt_score(struct rtable *rt)
639 {
640         u32 score = jiffies - rt->u.dst.lastuse;
641
642         score = ~score & ~(3<<30);
643
644         if (rt_valuable(rt))
645                 score |= (1<<31);
646
647         if (!rt->fl.iif ||
648             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
649                 score |= (1<<30);
650
651         return score;
652 }
653
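/* Branch-free comparison of the flow-key fields that identify a cached route:
 * addresses, mark, TOS (read as a 16-bit word), oif and iif.
 */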
654 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
655 {
656         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
657                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
658                 (fl1->mark ^ fl2->mark) |
659                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
660                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
661                 (fl1->oif ^ fl2->oif) |
662                 (fl1->iif ^ fl2->iif)) == 0;
663 }
664
665 static inline int compare_netns(struct rtable *rt1, struct rtable *rt2)
666 {
667         return rt1->u.dst.dev->nd_net == rt2->u.dst.dev->nd_net;
668 }
669
670 /*
671  * Perform a full scan of the hash table and free all entries.
672  * Can be called by a softirq or a process.
673  * In the latter case, we want to reschedule if necessary.
674  */
675 static void rt_do_flush(int process_context)
676 {
677         unsigned int i;
678         struct rtable *rth, *next;
679
680         for (i = 0; i <= rt_hash_mask; i++) {
681                 if (process_context && need_resched())
682                         cond_resched();
683                 rth = rt_hash_table[i].chain;
684                 if (!rth)
685                         continue;
686
687                 spin_lock_bh(rt_hash_lock_addr(i));
688                 rth = rt_hash_table[i].chain;
689                 rt_hash_table[i].chain = NULL;
690                 spin_unlock_bh(rt_hash_lock_addr(i));
691
692                 for (; rth; rth = next) {
693                         next = rth->u.dst.rt_next;
694                         rt_free(rth);
695                 }
696         }
697 }
698
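/* Scan a slice of the hash table (sized so that one full sweep takes roughly
 * ip_rt_gc_timeout), dropping entries from old generations and entries that
 * have expired or aged out.  The static rover remembers where the next call
 * should resume.
 */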
699 static void rt_check_expire(void)
700 {
701         static unsigned int rover;
702         unsigned int i = rover, goal;
703         struct rtable *rth, **rthp;
704         u64 mult;
705
706         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
707         if (ip_rt_gc_timeout > 1)
708                 do_div(mult, ip_rt_gc_timeout);
709         goal = (unsigned int)mult;
710         if (goal > rt_hash_mask)
711                 goal = rt_hash_mask + 1;
712         for (; goal > 0; goal--) {
713                 unsigned long tmo = ip_rt_gc_timeout;
714
715                 i = (i + 1) & rt_hash_mask;
716                 rthp = &rt_hash_table[i].chain;
717
718                 if (need_resched())
719                         cond_resched();
720
721                 if (*rthp == NULL)
722                         continue;
723                 spin_lock_bh(rt_hash_lock_addr(i));
724                 while ((rth = *rthp) != NULL) {
725                         if (rth->rt_genid != atomic_read(&rt_genid)) {
726                                 *rthp = rth->u.dst.rt_next;
727                                 rt_free(rth);
728                                 continue;
729                         }
730                         if (rth->u.dst.expires) {
731                                 /* Entry is expired even if it is in use */
732                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
733                                         tmo >>= 1;
734                                         rthp = &rth->u.dst.rt_next;
735                                         continue;
736                                 }
737                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
738                                 tmo >>= 1;
739                                 rthp = &rth->u.dst.rt_next;
740                                 continue;
741                         }
742
743                         /* Clean up aged-off entries. */
744                         *rthp = rth->u.dst.rt_next;
745                         rt_free(rth);
746                 }
747                 spin_unlock_bh(rt_hash_lock_addr(i));
748         }
749         rover = i;
750 }
751
752 /*
753  * rt_worker_func() is run in process context.
754  * We call rt_check_expire() to scan part of the hash table.
755  */
756 static void rt_worker_func(struct work_struct *work)
757 {
758         rt_check_expire();
759         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
760 }
761
762 /*
763  * Perturbation of rt_genid by a small quantity [1..256].
764  * Using 8 bits of shuffling ensures we can call rt_cache_invalidate()
765  * many times (2^24) without reusing a recent rt_genid.
766  * The Jenkins hash is strong enough that little changes of rt_genid are OK.
767  */
768 static void rt_cache_invalidate(void)
769 {
770         unsigned char shuffle;
771
772         get_random_bytes(&shuffle, sizeof(shuffle));
773         atomic_add(shuffle + 1U, &rt_genid);
774 }
775
776 /*
777  * delay < 0  : invalidate cache (fast : entries will be deleted later)
778  * delay >= 0 : invalidate & flush cache (can be long)
779  */
780 void rt_cache_flush(int delay)
781 {
782         rt_cache_invalidate();
783         if (delay >= 0)
784                 rt_do_flush(!in_softirq());
785 }
786
787 /*
788  * We change rt_genid and let gc do the cleanup
789  */
790 static void rt_secret_rebuild(unsigned long dummy)
791 {
792         rt_cache_invalidate();
793         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
794 }
795
796 /*
797    Short description of GC goals.
798
799    We want to build an algorithm which will keep the routing cache
800    at some equilibrium point, where the number of aged-off entries
801    is kept approximately equal to the number of newly generated ones.
802
803    The current expiration strength is the variable "expire".
804    We try to adjust it dynamically, so that when the network
805    is idle expire is large enough to keep enough warm entries,
806    and when load increases it is reduced to limit the cache size.
807  */
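/* With the defaults above (ip_rt_gc_elasticity = 8) the "dangerous area" path
 * in rt_garbage_collect() is entered once the cache holds more than
 * (8 << rt_hash_log) entries, i.e. more than eight entries per hash bucket on
 * average; below that the collector only trims towards the softer
 * equilibrium / gc_thresh target.
 */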
808
809 static int rt_garbage_collect(struct dst_ops *ops)
810 {
811         static unsigned long expire = RT_GC_TIMEOUT;
812         static unsigned long last_gc;
813         static int rover;
814         static int equilibrium;
815         struct rtable *rth, **rthp;
816         unsigned long now = jiffies;
817         int goal;
818
819         /*
820          * Garbage collection is pretty expensive,
821          * do not run it too frequently.
822          */
823
824         RT_CACHE_STAT_INC(gc_total);
825
826         if (now - last_gc < ip_rt_gc_min_interval &&
827             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
828                 RT_CACHE_STAT_INC(gc_ignored);
829                 goto out;
830         }
831
832         /* Calculate the number of entries which we want to expire now. */
833         goal = atomic_read(&ipv4_dst_ops.entries) -
834                 (ip_rt_gc_elasticity << rt_hash_log);
835         if (goal <= 0) {
836                 if (equilibrium < ipv4_dst_ops.gc_thresh)
837                         equilibrium = ipv4_dst_ops.gc_thresh;
838                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
839                 if (goal > 0) {
840                         equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1);
841                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
842                 }
843         } else {
844                 /* We are in a dangerous area. Try to reduce the cache really
845                  * aggressively.
846                  */
847                 goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1);
848                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
849         }
850
851         if (now - last_gc >= ip_rt_gc_min_interval)
852                 last_gc = now;
853
854         if (goal <= 0) {
855                 equilibrium += goal;
856                 goto work_done;
857         }
858
859         do {
860                 int i, k;
861
862                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
863                         unsigned long tmo = expire;
864
865                         k = (k + 1) & rt_hash_mask;
866                         rthp = &rt_hash_table[k].chain;
867                         spin_lock_bh(rt_hash_lock_addr(k));
868                         while ((rth = *rthp) != NULL) {
869                                 if (rth->rt_genid == atomic_read(&rt_genid) &&
870                                         !rt_may_expire(rth, tmo, expire)) {
871                                         tmo >>= 1;
872                                         rthp = &rth->u.dst.rt_next;
873                                         continue;
874                                 }
875                                 *rthp = rth->u.dst.rt_next;
876                                 rt_free(rth);
877                                 goal--;
878                         }
879                         spin_unlock_bh(rt_hash_lock_addr(k));
880                         if (goal <= 0)
881                                 break;
882                 }
883                 rover = k;
884
885                 if (goal <= 0)
886                         goto work_done;
887
888                 /* Goal is not achieved. We stop the process if:
889
890                    - expire is reduced to zero; otherwise, expire is halved.
891                    - the table is not full.
892                    - we are called from interrupt context.
893                    - the jiffies check is just a fallback/debug loop breaker.
894                      We will not spin here for a long time in any case.
895                  */
896
897                 RT_CACHE_STAT_INC(gc_goal_miss);
898
899                 if (expire == 0)
900                         break;
901
902                 expire >>= 1;
903 #if RT_CACHE_DEBUG >= 2
904                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
905                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
906 #endif
907
908                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
909                         goto out;
910         } while (!in_softirq() && time_before_eq(jiffies, now));
911
912         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
913                 goto out;
914         if (net_ratelimit())
915                 printk(KERN_WARNING "dst cache overflow\n");
916         RT_CACHE_STAT_INC(gc_dst_overflow);
917         return 1;
918
919 work_done:
920         expire += ip_rt_gc_min_interval;
921         if (expire > ip_rt_gc_timeout ||
922             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
923                 expire = ip_rt_gc_timeout;
924 #if RT_CACHE_DEBUG >= 2
925         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
926                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
927 #endif
928 out:    return 0;
929 }
930
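/* Insert @rt into bucket @hash, or reuse an existing entry with identical flow
 * keys and namespace (which is then moved to the front of the chain).  Stale
 * generations are dropped while walking, the lowest-scoring unreferenced entry
 * is remembered as an eviction candidate for overly long chains, and output or
 * unicast-forward routes are bound to an ARP neighbour before being published.
 * On success *rp points to the route that ended up in the cache.
 */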
931 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
932 {
933         struct rtable   *rth, **rthp;
934         unsigned long   now;
935         struct rtable *cand, **candp;
936         u32             min_score;
937         int             chain_length;
938         int attempts = !in_softirq();
939
940 restart:
941         chain_length = 0;
942         min_score = ~(u32)0;
943         cand = NULL;
944         candp = NULL;
945         now = jiffies;
946
947         rthp = &rt_hash_table[hash].chain;
948
949         spin_lock_bh(rt_hash_lock_addr(hash));
950         while ((rth = *rthp) != NULL) {
951                 if (rth->rt_genid != atomic_read(&rt_genid)) {
952                         *rthp = rth->u.dst.rt_next;
953                         rt_free(rth);
954                         continue;
955                 }
956                 if (compare_keys(&rth->fl, &rt->fl) && compare_netns(rth, rt)) {
957                         /* Put it first */
958                         *rthp = rth->u.dst.rt_next;
959                         /*
960                          * Since lookup is lockfree, the deletion
961                          * must be visible to another weakly ordered CPU before
962                          * the insertion at the start of the hash chain.
963                          */
964                         rcu_assign_pointer(rth->u.dst.rt_next,
965                                            rt_hash_table[hash].chain);
966                         /*
967                          * Since lookup is lockfree, the update writes
968                          * must be ordered for consistency on SMP.
969                          */
970                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
971
972                         dst_use(&rth->u.dst, now);
973                         spin_unlock_bh(rt_hash_lock_addr(hash));
974
975                         rt_drop(rt);
976                         *rp = rth;
977                         return 0;
978                 }
979
980                 if (!atomic_read(&rth->u.dst.__refcnt)) {
981                         u32 score = rt_score(rth);
982
983                         if (score <= min_score) {
984                                 cand = rth;
985                                 candp = rthp;
986                                 min_score = score;
987                         }
988                 }
989
990                 chain_length++;
991
992                 rthp = &rth->u.dst.rt_next;
993         }
994
995         if (cand) {
996                 /* ip_rt_gc_elasticity used to be the average chain length;
997                  * when it is exceeded, gc becomes really aggressive.
998                  *
999                  * The second limit is less certain. At the moment it allows
1000                  * only 2 entries per bucket. We will see.
1001                  */
1002                 if (chain_length > ip_rt_gc_elasticity) {
1003                         *candp = cand->u.dst.rt_next;
1004                         rt_free(cand);
1005                 }
1006         }
1007
1008         /* Try to bind the route to arp only if it is an output
1009            route or a unicast forwarding path.
1010          */
1011         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
1012                 int err = arp_bind_neighbour(&rt->u.dst);
1013                 if (err) {
1014                         spin_unlock_bh(rt_hash_lock_addr(hash));
1015
1016                         if (err != -ENOBUFS) {
1017                                 rt_drop(rt);
1018                                 return err;
1019                         }
1020
1021                         /* Neighbour tables are full and nothing
1022                            can be released. Try to shrink the route cache;
1023                            it most likely holds some neighbour records.
1024                          */
1025                         if (attempts-- > 0) {
1026                                 int saved_elasticity = ip_rt_gc_elasticity;
1027                                 int saved_int = ip_rt_gc_min_interval;
1028                                 ip_rt_gc_elasticity     = 1;
1029                                 ip_rt_gc_min_interval   = 0;
1030                                 rt_garbage_collect(&ipv4_dst_ops);
1031                                 ip_rt_gc_min_interval   = saved_int;
1032                                 ip_rt_gc_elasticity     = saved_elasticity;
1033                                 goto restart;
1034                         }
1035
1036                         if (net_ratelimit())
1037                                 printk(KERN_WARNING "Neighbour table overflow.\n");
1038                         rt_drop(rt);
1039                         return -ENOBUFS;
1040                 }
1041         }
1042
1043         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1044 #if RT_CACHE_DEBUG >= 2
1045         if (rt->u.dst.rt_next) {
1046                 struct rtable *trt;
1047                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1048                        NIPQUAD(rt->rt_dst));
1049                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1050                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1051                 printk("\n");
1052         }
1053 #endif
1054         rt_hash_table[hash].chain = rt;
1055         spin_unlock_bh(rt_hash_lock_addr(hash));
1056         *rp = rt;
1057         return 0;
1058 }
1059
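/* Attach the inet_peer entry for rt->rt_dst to the route (allocating one if
 * @create is set).  The spinlock only serialises two CPUs trying to bind a
 * peer to the same route; the loser drops its reference again.
 */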
1060 void rt_bind_peer(struct rtable *rt, int create)
1061 {
1062         static DEFINE_SPINLOCK(rt_peer_lock);
1063         struct inet_peer *peer;
1064
1065         peer = inet_getpeer(rt->rt_dst, create);
1066
1067         spin_lock_bh(&rt_peer_lock);
1068         if (rt->peer == NULL) {
1069                 rt->peer = peer;
1070                 peer = NULL;
1071         }
1072         spin_unlock_bh(&rt_peer_lock);
1073         if (peer)
1074                 inet_putpeer(peer);
1075 }
1076
1077 /*
1078  * Peer allocation may fail only in serious out-of-memory conditions.  However,
1079  * we can still generate some output.
1080  * Random ID selection looks a bit dangerous because we have no chance of
1081  * selecting an ID that is unique within a reasonable period of time.
1082  * But a broken packet identifier may be better than no packet at all.
1083  */
1084 static void ip_select_fb_ident(struct iphdr *iph)
1085 {
1086         static DEFINE_SPINLOCK(ip_fb_id_lock);
1087         static u32 ip_fallback_id;
1088         u32 salt;
1089
1090         spin_lock_bh(&ip_fb_id_lock);
1091         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1092         iph->id = htons(salt & 0xFFFF);
1093         ip_fallback_id = salt;
1094         spin_unlock_bh(&ip_fb_id_lock);
1095 }
1096
1097 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1098 {
1099         struct rtable *rt = (struct rtable *) dst;
1100
1101         if (rt) {
1102                 if (rt->peer == NULL)
1103                         rt_bind_peer(rt, 1);
1104
1105                 /* If a peer is attached to the destination, it is never detached,
1106                    so we need not grab a lock to dereference it.
1107                  */
1108                 if (rt->peer) {
1109                         iph->id = htons(inet_getid(rt->peer, more));
1110                         return;
1111                 }
1112         } else
1113                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1114                        __builtin_return_address(0));
1115
1116         ip_select_fb_ident(iph);
1117 }
1118
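/* Unlink @rt from bucket @hash and drop the caller's reference; stale-generation
 * entries found while walking the chain are freed as well.
 */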
1119 static void rt_del(unsigned hash, struct rtable *rt)
1120 {
1121         struct rtable **rthp, *aux;
1122
1123         rthp = &rt_hash_table[hash].chain;
1124         spin_lock_bh(rt_hash_lock_addr(hash));
1125         ip_rt_put(rt);
1126         while ((aux = *rthp) != NULL) {
1127                 if (aux == rt || (aux->rt_genid != atomic_read(&rt_genid))) {
1128                         *rthp = aux->u.dst.rt_next;
1129                         rt_free(aux);
1130                         continue;
1131                 }
1132                 rthp = &aux->u.dst.rt_next;
1133         }
1134         spin_unlock_bh(rt_hash_lock_addr(hash));
1135 }
1136
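/* Handle an ICMP redirect from old_gw: after sanity-checking the advertised
 * gateway, every matching cache entry is replaced by a copy marked
 * RTCF_REDIRECTED that points at new_gw and is bound to a fresh neighbour.
 */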
1137 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1138                     __be32 saddr, struct net_device *dev)
1139 {
1140         int i, k;
1141         struct in_device *in_dev = in_dev_get(dev);
1142         struct rtable *rth, **rthp;
1143         __be32  skeys[2] = { saddr, 0 };
1144         int  ikeys[2] = { dev->ifindex, 0 };
1145         struct netevent_redirect netevent;
1146         struct net *net;
1147
1148         if (!in_dev)
1149                 return;
1150
1151         net = dev->nd_net;
1152         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1153             || ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw)
1154             || ipv4_is_zeronet(new_gw))
1155                 goto reject_redirect;
1156
1157         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1158                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1159                         goto reject_redirect;
1160                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1161                         goto reject_redirect;
1162         } else {
1163                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
1164                         goto reject_redirect;
1165         }
1166
1167         for (i = 0; i < 2; i++) {
1168                 for (k = 0; k < 2; k++) {
1169                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1170
1171                         rthp=&rt_hash_table[hash].chain;
1172
1173                         rcu_read_lock();
1174                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1175                                 struct rtable *rt;
1176
1177                                 if (rth->fl.fl4_dst != daddr ||
1178                                     rth->fl.fl4_src != skeys[i] ||
1179                                     rth->fl.oif != ikeys[k] ||
1180                                     rth->fl.iif != 0 ||
1181                                     rth->rt_genid != atomic_read(&rt_genid) ||
1182                                     rth->u.dst.dev->nd_net != net) {
1183                                         rthp = &rth->u.dst.rt_next;
1184                                         continue;
1185                                 }
1186
1187                                 if (rth->rt_dst != daddr ||
1188                                     rth->rt_src != saddr ||
1189                                     rth->u.dst.error ||
1190                                     rth->rt_gateway != old_gw ||
1191                                     rth->u.dst.dev != dev)
1192                                         break;
1193
1194                                 dst_hold(&rth->u.dst);
1195                                 rcu_read_unlock();
1196
1197                                 rt = dst_alloc(&ipv4_dst_ops);
1198                                 if (rt == NULL) {
1199                                         ip_rt_put(rth);
1200                                         in_dev_put(in_dev);
1201                                         return;
1202                                 }
1203
1204                                 /* Copy all the information. */
1205                                 *rt = *rth;
1206                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1207                                 rt->u.dst.__use         = 1;
1208                                 atomic_set(&rt->u.dst.__refcnt, 1);
1209                                 rt->u.dst.child         = NULL;
1210                                 if (rt->u.dst.dev)
1211                                         dev_hold(rt->u.dst.dev);
1212                                 if (rt->idev)
1213                                         in_dev_hold(rt->idev);
1214                                 rt->u.dst.obsolete      = 0;
1215                                 rt->u.dst.lastuse       = jiffies;
1216                                 rt->u.dst.path          = &rt->u.dst;
1217                                 rt->u.dst.neighbour     = NULL;
1218                                 rt->u.dst.hh            = NULL;
1219                                 rt->u.dst.xfrm          = NULL;
1220                                 rt->rt_genid            = atomic_read(&rt_genid);
1221                                 rt->rt_flags            |= RTCF_REDIRECTED;
1222
1223                                 /* Gateway is different ... */
1224                                 rt->rt_gateway          = new_gw;
1225
1226                                 /* Redirect received -> path was valid */
1227                                 dst_confirm(&rth->u.dst);
1228
1229                                 if (rt->peer)
1230                                         atomic_inc(&rt->peer->refcnt);
1231
1232                                 if (arp_bind_neighbour(&rt->u.dst) ||
1233                                     !(rt->u.dst.neighbour->nud_state &
1234                                             NUD_VALID)) {
1235                                         if (rt->u.dst.neighbour)
1236                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1237                                         ip_rt_put(rth);
1238                                         rt_drop(rt);
1239                                         goto do_next;
1240                                 }
1241
1242                                 netevent.old = &rth->u.dst;
1243                                 netevent.new = &rt->u.dst;
1244                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1245                                                         &netevent);
1246
1247                                 rt_del(hash, rth);
1248                                 if (!rt_intern_hash(hash, rt, &rt))
1249                                         ip_rt_put(rt);
1250                                 goto do_next;
1251                         }
1252                         rcu_read_unlock();
1253                 do_next:
1254                         ;
1255                 }
1256         }
1257         in_dev_put(in_dev);
1258         return;
1259
1260 reject_redirect:
1261 #ifdef CONFIG_IP_ROUTE_VERBOSE
1262         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1263                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1264                         "%u.%u.%u.%u ignored.\n"
1265                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1266                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1267                        NIPQUAD(saddr), NIPQUAD(daddr));
1268 #endif
1269         in_dev_put(in_dev);
1270 }
1271
1272 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1273 {
1274         struct rtable *rt = (struct rtable*)dst;
1275         struct dst_entry *ret = dst;
1276
1277         if (rt) {
1278                 if (dst->obsolete) {
1279                         ip_rt_put(rt);
1280                         ret = NULL;
1281                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1282                            rt->u.dst.expires) {
1283                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1284                                                 rt->fl.oif);
1285 #if RT_CACHE_DEBUG >= 1
1286                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1287                                           "%u.%u.%u.%u/%02x dropped\n",
1288                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1289 #endif
1290                         rt_del(hash, rt);
1291                         ret = NULL;
1292                 }
1293         }
1294         return ret;
1295 }
1296
1297 /*
1298  * Algorithm:
1299  *      1. The first ip_rt_redirect_number redirects are sent
1300  *         with exponential backoff; then we stop sending them altogether,
1301  *         assuming that the host ignores our redirects.
1302  *      2. If we did not see packets requiring redirects
1303  *         during ip_rt_redirect_silence, we assume that the host
1304  *         forgot the redirected route and start sending redirects again.
1305  *
1306  * This algorithm is much cheaper and more intelligent than dumb load limiting
1307  * in icmp.c.
1308  *
1309  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1310  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1311  */
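/* With the defaults (ip_rt_redirect_number = 9, ip_rt_redirect_load = HZ/50,
 * ip_rt_redirect_silence = (HZ/50) << 10) this means: successive redirects are
 * spaced by ip_rt_redirect_load << rate_tokens, i.e. 40 ms, 80 ms, 160 ms, ...,
 * we give up after nine of them, and we start over after roughly 20 seconds
 * without triggering packets.
 */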
1312
1313 void ip_rt_send_redirect(struct sk_buff *skb)
1314 {
1315         struct rtable *rt = (struct rtable*)skb->dst;
1316         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1317
1318         if (!in_dev)
1319                 return;
1320
1321         if (!IN_DEV_TX_REDIRECTS(in_dev))
1322                 goto out;
1323
1324         /* No redirected packets during ip_rt_redirect_silence;
1325          * reset the algorithm.
1326          */
1327         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1328                 rt->u.dst.rate_tokens = 0;
1329
1330         /* Too many ignored redirects; do not send anything and
1331          * set u.dst.rate_last to the last seen redirected packet.
1332          */
1333         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1334                 rt->u.dst.rate_last = jiffies;
1335                 goto out;
1336         }
1337
1338         /* Check for load limit; set rate_last to the latest sent
1339          * redirect.
1340          */
1341         if (rt->u.dst.rate_tokens == 0 ||
1342             time_after(jiffies,
1343                        (rt->u.dst.rate_last +
1344                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1345                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1346                 rt->u.dst.rate_last = jiffies;
1347                 ++rt->u.dst.rate_tokens;
1348 #ifdef CONFIG_IP_ROUTE_VERBOSE
1349                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1350                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1351                     net_ratelimit())
1352                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1353                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1354                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1355                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1356 #endif
1357         }
1358 out:
1359         in_dev_put(in_dev);
1360 }
1361
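/* Turn the error stored in the route (EHOSTUNREACH, ENETUNREACH, EACCES) into
 * the matching ICMP destination-unreachable message, rate-limited by a token
 * bucket (ip_rt_error_cost / ip_rt_error_burst), then free the skb.
 */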
1362 static int ip_error(struct sk_buff *skb)
1363 {
1364         struct rtable *rt = (struct rtable*)skb->dst;
1365         unsigned long now;
1366         int code;
1367
1368         switch (rt->u.dst.error) {
1369                 case EINVAL:
1370                 default:
1371                         goto out;
1372                 case EHOSTUNREACH:
1373                         code = ICMP_HOST_UNREACH;
1374                         break;
1375                 case ENETUNREACH:
1376                         code = ICMP_NET_UNREACH;
1377                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1378                         break;
1379                 case EACCES:
1380                         code = ICMP_PKT_FILTERED;
1381                         break;
1382         }
1383
1384         now = jiffies;
1385         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1386         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1387                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1388         rt->u.dst.rate_last = now;
1389         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1390                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1391                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1392         }
1393
1394 out:    kfree_skb(skb);
1395         return 0;
1396 }
1397
1398 /*
1399  *      The last two values are not from the RFC but
1400  *      are needed for AMPRnet AX.25 paths.
1401  */
1402
1403 static const unsigned short mtu_plateau[] =
1404 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1405
1406 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1407 {
1408         int i;
1409
1410         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1411                 if (old_mtu > mtu_plateau[i])
1412                         return mtu_plateau[i];
1413         return 68;
1414 }
1415
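/* Process an ICMP "fragmentation needed" report: for every matching cached
 * output route, lower the path MTU (guessing the next plateau value when the
 * advertised MTU is implausible), clamp it to ip_rt_min_pmtu, and arm an
 * ip_rt_mtu_expires timeout.  Returns the estimated path MTU, or new_mtu when
 * nothing better is known.
 */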
1416 unsigned short ip_rt_frag_needed(struct net *net, struct iphdr *iph,
1417                                  unsigned short new_mtu)
1418 {
1419         int i;
1420         unsigned short old_mtu = ntohs(iph->tot_len);
1421         struct rtable *rth;
1422         __be32  skeys[2] = { iph->saddr, 0, };
1423         __be32  daddr = iph->daddr;
1424         unsigned short est_mtu = 0;
1425
1426         if (ipv4_config.no_pmtu_disc)
1427                 return 0;
1428
1429         for (i = 0; i < 2; i++) {
1430                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1431
1432                 rcu_read_lock();
1433                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1434                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1435                         if (rth->fl.fl4_dst == daddr &&
1436                             rth->fl.fl4_src == skeys[i] &&
1437                             rth->rt_dst  == daddr &&
1438                             rth->rt_src  == iph->saddr &&
1439                             rth->fl.iif == 0 &&
1440                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU)) &&
1441                             rth->u.dst.dev->nd_net == net &&
1442                             rth->rt_genid == atomic_read(&rt_genid)) {
1443                                 unsigned short mtu = new_mtu;
1444
1445                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1446
1447                                         /* BSD 4.2 compatibility hack :-( */
1448                                         if (mtu == 0 &&
1449                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1450                                             old_mtu >= 68 + (iph->ihl << 2))
1451                                                 old_mtu -= iph->ihl << 2;
1452
1453                                         mtu = guess_mtu(old_mtu);
1454                                 }
1455                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1456                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1457                                                 dst_confirm(&rth->u.dst);
1458                                                 if (mtu < ip_rt_min_pmtu) {
1459                                                         mtu = ip_rt_min_pmtu;
1460                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1461                                                                 (1 << RTAX_MTU);
1462                                                 }
1463                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1464                                                 dst_set_expires(&rth->u.dst,
1465                                                         ip_rt_mtu_expires);
1466                                         }
1467                                         est_mtu = mtu;
1468                                 }
1469                         }
1470                 }
1471                 rcu_read_unlock();
1472         }
1473         return est_mtu ? : new_mtu;
1474 }
1475
1476 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1477 {
1478         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1479             !(dst_metric_locked(dst, RTAX_MTU))) {
1480                 if (mtu < ip_rt_min_pmtu) {
1481                         mtu = ip_rt_min_pmtu;
1482                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1483                 }
1484                 dst->metrics[RTAX_MTU-1] = mtu;
1485                 dst_set_expires(dst, ip_rt_mtu_expires);
1486                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1487         }
1488 }
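/*
 * Editor's note, not part of the original file: RTAX_LOCK is a bitmask
 * metric; setting bit RTAX_MTU in metrics[RTAX_LOCK-1] is what
 * dst_metric_locked(dst, RTAX_MTU) tests.  The code above sets it once
 * the path MTU has been clamped to ip_rt_min_pmtu, so that further
 * ICMP-driven updates can no longer change the MTU metric.  The test
 * amounts to:
 *
 *	locked = dst->metrics[RTAX_LOCK-1] & (1 << RTAX_MTU);
 */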
1489
1490 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1491 {
1492         return NULL;
1493 }
1494
1495 static void ipv4_dst_destroy(struct dst_entry *dst)
1496 {
1497         struct rtable *rt = (struct rtable *) dst;
1498         struct inet_peer *peer = rt->peer;
1499         struct in_device *idev = rt->idev;
1500
1501         if (peer) {
1502                 rt->peer = NULL;
1503                 inet_putpeer(peer);
1504         }
1505
1506         if (idev) {
1507                 rt->idev = NULL;
1508                 in_dev_put(idev);
1509         }
1510 }
1511
1512 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1513                             int how)
1514 {
1515         struct rtable *rt = (struct rtable *) dst;
1516         struct in_device *idev = rt->idev;
1517         if (dev != dev->nd_net->loopback_dev && idev && idev->dev == dev) {
1518                 struct in_device *loopback_idev =
1519                         in_dev_get(dev->nd_net->loopback_dev);
1520                 if (loopback_idev) {
1521                         rt->idev = loopback_idev;
1522                         in_dev_put(idev);
1523                 }
1524         }
1525 }
1526
1527 static void ipv4_link_failure(struct sk_buff *skb)
1528 {
1529         struct rtable *rt;
1530
1531         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1532
1533         rt = (struct rtable *) skb->dst;
1534         if (rt)
1535                 dst_set_expires(&rt->u.dst, 0);
1536 }
1537
1538 static int ip_rt_bug(struct sk_buff *skb)
1539 {
1540         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1541                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1542                 skb->dev ? skb->dev->name : "?");
1543         kfree_skb(skb);
1544         return 0;
1545 }
1546
1547 /*
1548    We do not cache the source address of the outgoing interface,
1549    because it is used only by the IP RR, TS and SRR options,
1550    so it is out of the fast path.
1551
1552    BTW remember: "addr" is allowed to be unaligned
1553    in IP options!
1554  */
1555
1556 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1557 {
1558         __be32 src;
1559         struct fib_result res;
1560
1561         if (rt->fl.iif == 0)
1562                 src = rt->rt_src;
1563         else if (fib_lookup(rt->u.dst.dev->nd_net, &rt->fl, &res) == 0) {
1564                 src = FIB_RES_PREFSRC(res);
1565                 fib_res_put(&res);
1566         } else
1567                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1568                                         RT_SCOPE_UNIVERSE);
1569         memcpy(addr, &src, 4);
1570 }
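/*
 * Editor's note, not part of the original file: the memcpy() above is
 * deliberate.  "addr" points into the IP options area and may sit at any
 * byte offset, so a direct 32-bit store could trap on strict-alignment
 * architectures:
 *
 *	*(__be32 *)addr = src;		wrong: assumes 4-byte alignment
 *	memcpy(addr, &src, 4);		safe at any alignment
 */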
1571
1572 #ifdef CONFIG_NET_CLS_ROUTE
1573 static void set_class_tag(struct rtable *rt, u32 tag)
1574 {
1575         if (!(rt->u.dst.tclassid & 0xFFFF))
1576                 rt->u.dst.tclassid |= tag & 0xFFFF;
1577         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1578                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1579 }
1580 #endif
1581
1582 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1583 {
1584         struct fib_info *fi = res->fi;
1585
1586         if (fi) {
1587                 if (FIB_RES_GW(*res) &&
1588                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1589                         rt->rt_gateway = FIB_RES_GW(*res);
1590                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1591                        sizeof(rt->u.dst.metrics));
1592                 if (fi->fib_mtu == 0) {
1593                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1594                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1595                             rt->rt_gateway != rt->rt_dst &&
1596                             rt->u.dst.dev->mtu > 576)
1597                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1598                 }
1599 #ifdef CONFIG_NET_CLS_ROUTE
1600                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1601 #endif
1602         } else
1603                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1604
1605         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1606                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1607         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1608                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1609         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1610                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1611                                        ip_rt_min_advmss);
1612         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1613                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1614
1615 #ifdef CONFIG_NET_CLS_ROUTE
1616 #ifdef CONFIG_IP_MULTIPLE_TABLES
1617         set_class_tag(rt, fib_rules_tclass(res));
1618 #endif
1619         set_class_tag(rt, itag);
1620 #endif
1621         rt->rt_type = res->type;
1622 }
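/*
 * Editor's note, not part of the original file: the RTAX_ADVMSS default
 * chosen above is the device MTU minus 40 bytes (a 20-byte IPv4 header
 * plus a 20-byte TCP header, no options), floored at ip_rt_min_advmss and
 * capped at 65535 - 40 so the resulting segment still fits the 16-bit IP
 * total length.  For a standard 1500-byte Ethernet MTU this gives the
 * familiar advertised MSS of 1460.
 */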
1623
1624 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1625                                 u8 tos, struct net_device *dev, int our)
1626 {
1627         unsigned hash;
1628         struct rtable *rth;
1629         __be32 spec_dst;
1630         struct in_device *in_dev = in_dev_get(dev);
1631         u32 itag = 0;
1632
1633         /* Primary sanity checks. */
1634
1635         if (in_dev == NULL)
1636                 return -EINVAL;
1637
1638         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1639             ipv4_is_loopback(saddr) || skb->protocol != htons(ETH_P_IP))
1640                 goto e_inval;
1641
1642         if (ipv4_is_zeronet(saddr)) {
1643                 if (!ipv4_is_local_multicast(daddr))
1644                         goto e_inval;
1645                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1646         } else if (fib_validate_source(saddr, 0, tos, 0,
1647                                         dev, &spec_dst, &itag) < 0)
1648                 goto e_inval;
1649
1650         rth = dst_alloc(&ipv4_dst_ops);
1651         if (!rth)
1652                 goto e_nobufs;
1653
1654         rth->u.dst.output= ip_rt_bug;
1655
1656         atomic_set(&rth->u.dst.__refcnt, 1);
1657         rth->u.dst.flags= DST_HOST;
1658         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1659                 rth->u.dst.flags |= DST_NOPOLICY;
1660         rth->fl.fl4_dst = daddr;
1661         rth->rt_dst     = daddr;
1662         rth->fl.fl4_tos = tos;
1663         rth->fl.mark    = skb->mark;
1664         rth->fl.fl4_src = saddr;
1665         rth->rt_src     = saddr;
1666 #ifdef CONFIG_NET_CLS_ROUTE
1667         rth->u.dst.tclassid = itag;
1668 #endif
1669         rth->rt_iif     =
1670         rth->fl.iif     = dev->ifindex;
1671         rth->u.dst.dev  = init_net.loopback_dev;
1672         dev_hold(rth->u.dst.dev);
1673         rth->idev       = in_dev_get(rth->u.dst.dev);
1674         rth->fl.oif     = 0;
1675         rth->rt_gateway = daddr;
1676         rth->rt_spec_dst= spec_dst;
1677         rth->rt_genid   = atomic_read(&rt_genid);
1678         rth->rt_flags   = RTCF_MULTICAST;
1679         rth->rt_type    = RTN_MULTICAST;
1680         if (our) {
1681                 rth->u.dst.input= ip_local_deliver;
1682                 rth->rt_flags |= RTCF_LOCAL;
1683         }
1684
1685 #ifdef CONFIG_IP_MROUTE
1686         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1687                 rth->u.dst.input = ip_mr_input;
1688 #endif
1689         RT_CACHE_STAT_INC(in_slow_mc);
1690
1691         in_dev_put(in_dev);
1692         hash = rt_hash(daddr, saddr, dev->ifindex);
1693         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1694
1695 e_nobufs:
1696         in_dev_put(in_dev);
1697         return -ENOBUFS;
1698
1699 e_inval:
1700         in_dev_put(in_dev);
1701         return -EINVAL;
1702 }
1703
1704
1705 static void ip_handle_martian_source(struct net_device *dev,
1706                                      struct in_device *in_dev,
1707                                      struct sk_buff *skb,
1708                                      __be32 daddr,
1709                                      __be32 saddr)
1710 {
1711         RT_CACHE_STAT_INC(in_martian_src);
1712 #ifdef CONFIG_IP_ROUTE_VERBOSE
1713         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1714                 /*
1715                  *      RFC1812 recommendation: if the source is martian,
1716                  *      the only hint is the MAC header.
1717                  */
1718                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1719                         "%u.%u.%u.%u, on dev %s\n",
1720                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1721                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1722                         int i;
1723                         const unsigned char *p = skb_mac_header(skb);
1724                         printk(KERN_WARNING "ll header: ");
1725                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1726                                 printk("%02x", *p);
1727                                 if (i < (dev->hard_header_len - 1))
1728                                         printk(":");
1729                         }
1730                         printk("\n");
1731                 }
1732         }
1733 #endif
1734 }
1735
1736 static inline int __mkroute_input(struct sk_buff *skb,
1737                                   struct fib_result* res,
1738                                   struct in_device *in_dev,
1739                                   __be32 daddr, __be32 saddr, u32 tos,
1740                                   struct rtable **result)
1741 {
1742
1743         struct rtable *rth;
1744         int err;
1745         struct in_device *out_dev;
1746         unsigned flags = 0;
1747         __be32 spec_dst;
1748         u32 itag;
1749
1750         /* get a working reference to the output device */
1751         out_dev = in_dev_get(FIB_RES_DEV(*res));
1752         if (out_dev == NULL) {
1753                 if (net_ratelimit())
1754                         printk(KERN_CRIT "Bug in ip_route_input" \
1755                                "_slow(). Please, report\n");
1756                 return -EINVAL;
1757         }
1758
1759
1760         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1761                                   in_dev->dev, &spec_dst, &itag);
1762         if (err < 0) {
1763                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1764                                          saddr);
1765
1766                 err = -EINVAL;
1767                 goto cleanup;
1768         }
1769
1770         if (err)
1771                 flags |= RTCF_DIRECTSRC;
1772
1773         if (out_dev == in_dev && err && !(flags & RTCF_MASQ) &&
1774             (IN_DEV_SHARED_MEDIA(out_dev) ||
1775              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1776                 flags |= RTCF_DOREDIRECT;
1777
1778         if (skb->protocol != htons(ETH_P_IP)) {
1779                 /* Not IP (i.e. ARP). Do not create a route if it is
1780                  * invalid for proxy arp. DNAT routes are always valid.
1781                  */
1782                 if (out_dev == in_dev) {
1783                         err = -EINVAL;
1784                         goto cleanup;
1785                 }
1786         }
1787
1788
1789         rth = dst_alloc(&ipv4_dst_ops);
1790         if (!rth) {
1791                 err = -ENOBUFS;
1792                 goto cleanup;
1793         }
1794
1795         atomic_set(&rth->u.dst.__refcnt, 1);
1796         rth->u.dst.flags= DST_HOST;
1797         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1798                 rth->u.dst.flags |= DST_NOPOLICY;
1799         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1800                 rth->u.dst.flags |= DST_NOXFRM;
1801         rth->fl.fl4_dst = daddr;
1802         rth->rt_dst     = daddr;
1803         rth->fl.fl4_tos = tos;
1804         rth->fl.mark    = skb->mark;
1805         rth->fl.fl4_src = saddr;
1806         rth->rt_src     = saddr;
1807         rth->rt_gateway = daddr;
1808         rth->rt_iif     =
1809                 rth->fl.iif     = in_dev->dev->ifindex;
1810         rth->u.dst.dev  = (out_dev)->dev;
1811         dev_hold(rth->u.dst.dev);
1812         rth->idev       = in_dev_get(rth->u.dst.dev);
1813         rth->fl.oif     = 0;
1814         rth->rt_spec_dst= spec_dst;
1815
1816         rth->u.dst.input = ip_forward;
1817         rth->u.dst.output = ip_output;
1818         rth->rt_genid = atomic_read(&rt_genid);
1819
1820         rt_set_nexthop(rth, res, itag);
1821
1822         rth->rt_flags = flags;
1823
1824         *result = rth;
1825         err = 0;
1826  cleanup:
1827         /* release the working reference to the output device */
1828         in_dev_put(out_dev);
1829         return err;
1830 }
1831
1832 static inline int ip_mkroute_input(struct sk_buff *skb,
1833                                    struct fib_result* res,
1834                                    const struct flowi *fl,
1835                                    struct in_device *in_dev,
1836                                    __be32 daddr, __be32 saddr, u32 tos)
1837 {
1838         struct rtable* rth = NULL;
1839         int err;
1840         unsigned hash;
1841
1842 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1843         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1844                 fib_select_multipath(fl, res);
1845 #endif
1846
1847         /* create a routing cache entry */
1848         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1849         if (err)
1850                 return err;
1851
1852         /* put it into the cache */
1853         hash = rt_hash(daddr, saddr, fl->iif);
1854         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1855 }
1856
1857 /*
1858  *      NOTE. We drop all packets that have a local source
1859  *      address, because every properly looped-back packet
1860  *      must already have the correct destination attached by the output routine.
1861  *
1862  *      Such an approach solves two big problems:
1863  *      1. Non-simplex devices are handled properly.
1864  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1865  */
1866
1867 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1868                                u8 tos, struct net_device *dev)
1869 {
1870         struct fib_result res;
1871         struct in_device *in_dev = in_dev_get(dev);
1872         struct flowi fl = { .nl_u = { .ip4_u =
1873                                       { .daddr = daddr,
1874                                         .saddr = saddr,
1875                                         .tos = tos,
1876                                         .scope = RT_SCOPE_UNIVERSE,
1877                                       } },
1878                             .mark = skb->mark,
1879                             .iif = dev->ifindex };
1880         unsigned        flags = 0;
1881         u32             itag = 0;
1882         struct rtable * rth;
1883         unsigned        hash;
1884         __be32          spec_dst;
1885         int             err = -EINVAL;
1886         int             free_res = 0;
1887         struct net    * net = dev->nd_net;
1888
1889         /* IP on this device is disabled. */
1890
1891         if (!in_dev)
1892                 goto out;
1893
1894         /* Check for the most weird martians, which cannot be detected
1895            by fib_lookup.
1896          */
1897
1898         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1899             ipv4_is_loopback(saddr))
1900                 goto martian_source;
1901
1902         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1903                 goto brd_input;
1904
1905         /* Accept zero addresses only for the limited broadcast;
1906          * I do not even know whether to fix this or not. Waiting for complaints :-)
1907          */
1908         if (ipv4_is_zeronet(saddr))
1909                 goto martian_source;
1910
1911         if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) ||
1912             ipv4_is_loopback(daddr))
1913                 goto martian_destination;
1914
1915         /*
1916          *      Now we are ready to route the packet.
1917          */
1918         if ((err = fib_lookup(net, &fl, &res)) != 0) {
1919                 if (!IN_DEV_FORWARD(in_dev))
1920                         goto e_hostunreach;
1921                 goto no_route;
1922         }
1923         free_res = 1;
1924
1925         RT_CACHE_STAT_INC(in_slow_tot);
1926
1927         if (res.type == RTN_BROADCAST)
1928                 goto brd_input;
1929
1930         if (res.type == RTN_LOCAL) {
1931                 int result;
1932                 result = fib_validate_source(saddr, daddr, tos,
1933                                              net->loopback_dev->ifindex,
1934                                              dev, &spec_dst, &itag);
1935                 if (result < 0)
1936                         goto martian_source;
1937                 if (result)
1938                         flags |= RTCF_DIRECTSRC;
1939                 spec_dst = daddr;
1940                 goto local_input;
1941         }
1942
1943         if (!IN_DEV_FORWARD(in_dev))
1944                 goto e_hostunreach;
1945         if (res.type != RTN_UNICAST)
1946                 goto martian_destination;
1947
1948         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1949 done:
1950         in_dev_put(in_dev);
1951         if (free_res)
1952                 fib_res_put(&res);
1953 out:    return err;
1954
1955 brd_input:
1956         if (skb->protocol != htons(ETH_P_IP))
1957                 goto e_inval;
1958
1959         if (ipv4_is_zeronet(saddr))
1960                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1961         else {
1962                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1963                                           &itag);
1964                 if (err < 0)
1965                         goto martian_source;
1966                 if (err)
1967                         flags |= RTCF_DIRECTSRC;
1968         }
1969         flags |= RTCF_BROADCAST;
1970         res.type = RTN_BROADCAST;
1971         RT_CACHE_STAT_INC(in_brd);
1972
1973 local_input:
1974         rth = dst_alloc(&ipv4_dst_ops);
1975         if (!rth)
1976                 goto e_nobufs;
1977
1978         rth->u.dst.output= ip_rt_bug;
1979         rth->rt_genid = atomic_read(&rt_genid);
1980
1981         atomic_set(&rth->u.dst.__refcnt, 1);
1982         rth->u.dst.flags= DST_HOST;
1983         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1984                 rth->u.dst.flags |= DST_NOPOLICY;
1985         rth->fl.fl4_dst = daddr;
1986         rth->rt_dst     = daddr;
1987         rth->fl.fl4_tos = tos;
1988         rth->fl.mark    = skb->mark;
1989         rth->fl.fl4_src = saddr;
1990         rth->rt_src     = saddr;
1991 #ifdef CONFIG_NET_CLS_ROUTE
1992         rth->u.dst.tclassid = itag;
1993 #endif
1994         rth->rt_iif     =
1995         rth->fl.iif     = dev->ifindex;
1996         rth->u.dst.dev  = net->loopback_dev;
1997         dev_hold(rth->u.dst.dev);
1998         rth->idev       = in_dev_get(rth->u.dst.dev);
1999         rth->rt_gateway = daddr;
2000         rth->rt_spec_dst= spec_dst;
2001         rth->u.dst.input= ip_local_deliver;
2002         rth->rt_flags   = flags|RTCF_LOCAL;
2003         if (res.type == RTN_UNREACHABLE) {
2004                 rth->u.dst.input= ip_error;
2005                 rth->u.dst.error= -err;
2006                 rth->rt_flags   &= ~RTCF_LOCAL;
2007         }
2008         rth->rt_type    = res.type;
2009         hash = rt_hash(daddr, saddr, fl.iif);
2010         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
2011         goto done;
2012
2013 no_route:
2014         RT_CACHE_STAT_INC(in_no_route);
2015         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
2016         res.type = RTN_UNREACHABLE;
2017         if (err == -ESRCH)
2018                 err = -ENETUNREACH;
2019         goto local_input;
2020
2021         /*
2022          *      Do not cache martian addresses: they should be logged (RFC1812)
2023          */
2024 martian_destination:
2025         RT_CACHE_STAT_INC(in_martian_dst);
2026 #ifdef CONFIG_IP_ROUTE_VERBOSE
2027         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
2028                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
2029                         "%u.%u.%u.%u, dev %s\n",
2030                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
2031 #endif
2032
2033 e_hostunreach:
2034         err = -EHOSTUNREACH;
2035         goto done;
2036
2037 e_inval:
2038         err = -EINVAL;
2039         goto done;
2040
2041 e_nobufs:
2042         err = -ENOBUFS;
2043         goto done;
2044
2045 martian_source:
2046         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2047         goto e_inval;
2048 }
2049
2050 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2051                    u8 tos, struct net_device *dev)
2052 {
2053         struct rtable * rth;
2054         unsigned        hash;
2055         int iif = dev->ifindex;
2056         struct net *net;
2057
2058         net = dev->nd_net;
2059         tos &= IPTOS_RT_MASK;
2060         hash = rt_hash(daddr, saddr, iif);
2061
2062         rcu_read_lock();
2063         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2064              rth = rcu_dereference(rth->u.dst.rt_next)) {
2065                 if (rth->fl.fl4_dst == daddr &&
2066                     rth->fl.fl4_src == saddr &&
2067                     rth->fl.iif == iif &&
2068                     rth->fl.oif == 0 &&
2069                     rth->fl.mark == skb->mark &&
2070                     rth->fl.fl4_tos == tos &&
2071                     rth->u.dst.dev->nd_net == net &&
2072                     rth->rt_genid == atomic_read(&rt_genid)) {
2073                         dst_use(&rth->u.dst, jiffies);
2074                         RT_CACHE_STAT_INC(in_hit);
2075                         rcu_read_unlock();
2076                         skb->dst = (struct dst_entry*)rth;
2077                         return 0;
2078                 }
2079                 RT_CACHE_STAT_INC(in_hlist_search);
2080         }
2081         rcu_read_unlock();
2082
2083         /* Multicast recognition logic was moved from the route cache to here.
2084            The problem was that too many Ethernet cards have broken/missing
2085            hardware multicast filters :-( As a result, a host on a multicast
2086            network acquires a lot of useless route cache entries, e.g. from
2087            SDR messages from all over the world. Now we try to get rid of them.
2088            Really, provided the software IP multicast filter is organized
2089            reasonably (at least, hashed), it does not result in a slowdown
2090            compared with route cache reject entries.
2091            Note that multicast routers are not affected, because a
2092            route cache entry is created eventually.
2093          */
2094         if (ipv4_is_multicast(daddr)) {
2095                 struct in_device *in_dev;
2096
2097                 rcu_read_lock();
2098                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2099                         int our = ip_check_mc(in_dev, daddr, saddr,
2100                                 ip_hdr(skb)->protocol);
2101                         if (our
2102 #ifdef CONFIG_IP_MROUTE
2103                             || (!ipv4_is_local_multicast(daddr) &&
2104                                 IN_DEV_MFORWARD(in_dev))
2105 #endif
2106                             ) {
2107                                 rcu_read_unlock();
2108                                 return ip_route_input_mc(skb, daddr, saddr,
2109                                                          tos, dev, our);
2110                         }
2111                 }
2112                 rcu_read_unlock();
2113                 return -EINVAL;
2114         }
2115         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2116 }
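/*
 * Editor's note, not part of the original file: each cached rtable is
 * stamped with the rt_genid value current when it was created, and every
 * lookup above additionally requires rt_genid == atomic_read(&rt_genid).
 * Entries from an older generation simply stop matching, so the whole
 * cache can be invalidated by advancing the counter instead of walking
 * and unlinking every hash chain on the spot.
 */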
2117
2118 static inline int __mkroute_output(struct rtable **result,
2119                                    struct fib_result* res,
2120                                    const struct flowi *fl,
2121                                    const struct flowi *oldflp,
2122                                    struct net_device *dev_out,
2123                                    unsigned flags)
2124 {
2125         struct rtable *rth;
2126         struct in_device *in_dev;
2127         u32 tos = RT_FL_TOS(oldflp);
2128         int err = 0;
2129
2130         if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2131                 return -EINVAL;
2132
2133         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2134                 res->type = RTN_BROADCAST;
2135         else if (ipv4_is_multicast(fl->fl4_dst))
2136                 res->type = RTN_MULTICAST;
2137         else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst))
2138                 return -EINVAL;
2139
2140         if (dev_out->flags & IFF_LOOPBACK)
2141                 flags |= RTCF_LOCAL;
2142
2143         /* get a working reference to the inet device */
2144         in_dev = in_dev_get(dev_out);
2145         if (!in_dev)
2146                 return -EINVAL;
2147
2148         if (res->type == RTN_BROADCAST) {
2149                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2150                 if (res->fi) {
2151                         fib_info_put(res->fi);
2152                         res->fi = NULL;
2153                 }
2154         } else if (res->type == RTN_MULTICAST) {
2155                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2156                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2157                                  oldflp->proto))
2158                         flags &= ~RTCF_LOCAL;
2159                 /* If a multicast route does not exist, use the
2160                    default one, but do not gateway in this case.
2161                    Yes, it is a hack.
2162                  */
2163                 if (res->fi && res->prefixlen < 4) {
2164                         fib_info_put(res->fi);
2165                         res->fi = NULL;
2166                 }
2167         }
2168
2169
2170         rth = dst_alloc(&ipv4_dst_ops);
2171         if (!rth) {
2172                 err = -ENOBUFS;
2173                 goto cleanup;
2174         }
2175
2176         atomic_set(&rth->u.dst.__refcnt, 1);
2177         rth->u.dst.flags= DST_HOST;
2178         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2179                 rth->u.dst.flags |= DST_NOXFRM;
2180         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2181                 rth->u.dst.flags |= DST_NOPOLICY;
2182
2183         rth->fl.fl4_dst = oldflp->fl4_dst;
2184         rth->fl.fl4_tos = tos;
2185         rth->fl.fl4_src = oldflp->fl4_src;
2186         rth->fl.oif     = oldflp->oif;
2187         rth->fl.mark    = oldflp->mark;
2188         rth->rt_dst     = fl->fl4_dst;
2189         rth->rt_src     = fl->fl4_src;
2190         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2191         /* get references to the devices that are to be held by the routing
2192            cache entry */
2193         rth->u.dst.dev  = dev_out;
2194         dev_hold(dev_out);
2195         rth->idev       = in_dev_get(dev_out);
2196         rth->rt_gateway = fl->fl4_dst;
2197         rth->rt_spec_dst= fl->fl4_src;
2198
2199         rth->u.dst.output=ip_output;
2200         rth->rt_genid = atomic_read(&rt_genid);
2201
2202         RT_CACHE_STAT_INC(out_slow_tot);
2203
2204         if (flags & RTCF_LOCAL) {
2205                 rth->u.dst.input = ip_local_deliver;
2206                 rth->rt_spec_dst = fl->fl4_dst;
2207         }
2208         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2209                 rth->rt_spec_dst = fl->fl4_src;
2210                 if (flags & RTCF_LOCAL &&
2211                     !(dev_out->flags & IFF_LOOPBACK)) {
2212                         rth->u.dst.output = ip_mc_output;
2213                         RT_CACHE_STAT_INC(out_slow_mc);
2214                 }
2215 #ifdef CONFIG_IP_MROUTE
2216                 if (res->type == RTN_MULTICAST) {
2217                         if (IN_DEV_MFORWARD(in_dev) &&
2218                             !ipv4_is_local_multicast(oldflp->fl4_dst)) {
2219                                 rth->u.dst.input = ip_mr_input;
2220                                 rth->u.dst.output = ip_mc_output;
2221                         }
2222                 }
2223 #endif
2224         }
2225
2226         rt_set_nexthop(rth, res, 0);
2227
2228         rth->rt_flags = flags;
2229
2230         *result = rth;
2231  cleanup:
2232         /* release the working reference to the inet device */
2233         in_dev_put(in_dev);
2234
2235         return err;
2236 }
2237
2238 static inline int ip_mkroute_output(struct rtable **rp,
2239                                     struct fib_result* res,
2240                                     const struct flowi *fl,
2241                                     const struct flowi *oldflp,
2242                                     struct net_device *dev_out,
2243                                     unsigned flags)
2244 {
2245         struct rtable *rth = NULL;
2246         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2247         unsigned hash;
2248         if (err == 0) {
2249                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2250                 err = rt_intern_hash(hash, rth, rp);
2251         }
2252
2253         return err;
2254 }
2255
2256 /*
2257  * Major route resolver routine.
2258  */
2259
2260 static int ip_route_output_slow(struct net *net, struct rtable **rp,
2261                                 const struct flowi *oldflp)
2262 {
2263         u32 tos = RT_FL_TOS(oldflp);
2264         struct flowi fl = { .nl_u = { .ip4_u =
2265                                       { .daddr = oldflp->fl4_dst,
2266                                         .saddr = oldflp->fl4_src,
2267                                         .tos = tos & IPTOS_RT_MASK,
2268                                         .scope = ((tos & RTO_ONLINK) ?
2269                                                   RT_SCOPE_LINK :
2270                                                   RT_SCOPE_UNIVERSE),
2271                                       } },
2272                             .mark = oldflp->mark,
2273                             .iif = net->loopback_dev->ifindex,
2274                             .oif = oldflp->oif };
2275         struct fib_result res;
2276         unsigned flags = 0;
2277         struct net_device *dev_out = NULL;
2278         int free_res = 0;
2279         int err;
2280
2281
2282         res.fi          = NULL;
2283 #ifdef CONFIG_IP_MULTIPLE_TABLES
2284         res.r           = NULL;
2285 #endif
2286
2287         if (oldflp->fl4_src) {
2288                 err = -EINVAL;
2289                 if (ipv4_is_multicast(oldflp->fl4_src) ||
2290                     ipv4_is_lbcast(oldflp->fl4_src) ||
2291                     ipv4_is_zeronet(oldflp->fl4_src))
2292                         goto out;
2293
2294                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2295                 dev_out = ip_dev_find(net, oldflp->fl4_src);
2296                 if (dev_out == NULL)
2297                         goto out;
2298
2299                 /* I removed the check for oif == dev_out->oif here.
2300                    It was wrong for two reasons:
2301                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2302                       is assigned to multiple interfaces.
2303                    2. Moreover, we are allowed to send packets with the saddr
2304                       of another iface. --ANK
2305                  */
2306
2307                 if (oldflp->oif == 0
2308                     && (ipv4_is_multicast(oldflp->fl4_dst) ||
2309                         oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2310                         /* Special hack: the user can direct multicasts
2311                            and limited broadcasts via the necessary interface
2312                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2313                            This hack is not just for fun, it allows
2314                            vic, vat and friends to work.
2315                            They bind a socket to loopback, set the ttl to zero
2316                            and expect that it will work.
2317                            From the viewpoint of the routing cache they are broken,
2318                            because we are not allowed to build a multicast path
2319                            with a loopback source addr (look, the routing cache
2320                            cannot know that the ttl is zero, so the packet
2321                            will not leave this host and the route is valid).
2322                            Luckily, this hack is a good workaround.
2323                          */
2324
2325                         fl.oif = dev_out->ifindex;
2326                         goto make_route;
2327                 }
2328                 if (dev_out)
2329                         dev_put(dev_out);
2330                 dev_out = NULL;
2331         }
2332
2333
2334         if (oldflp->oif) {
2335                 dev_out = dev_get_by_index(net, oldflp->oif);
2336                 err = -ENODEV;
2337                 if (dev_out == NULL)
2338                         goto out;
2339
2340                 /* RACE: Check return value of inet_select_addr instead. */
2341                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2342                         dev_put(dev_out);
2343                         goto out;       /* Wrong error code */
2344                 }
2345
2346                 if (ipv4_is_local_multicast(oldflp->fl4_dst) ||
2347                     oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2348                         if (!fl.fl4_src)
2349                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2350                                                               RT_SCOPE_LINK);
2351                         goto make_route;
2352                 }
2353                 if (!fl.fl4_src) {
2354                         if (ipv4_is_multicast(oldflp->fl4_dst))
2355                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2356                                                               fl.fl4_scope);
2357                         else if (!oldflp->fl4_dst)
2358                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2359                                                               RT_SCOPE_HOST);
2360                 }
2361         }
2362
2363         if (!fl.fl4_dst) {
2364                 fl.fl4_dst = fl.fl4_src;
2365                 if (!fl.fl4_dst)
2366                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2367                 if (dev_out)
2368                         dev_put(dev_out);
2369                 dev_out = net->loopback_dev;
2370                 dev_hold(dev_out);
2371                 fl.oif = net->loopback_dev->ifindex;
2372                 res.type = RTN_LOCAL;
2373                 flags |= RTCF_LOCAL;
2374                 goto make_route;
2375         }
2376
2377         if (fib_lookup(net, &fl, &res)) {
2378                 res.fi = NULL;
2379                 if (oldflp->oif) {
2380                         /* Apparently, the routing tables are wrong. Assume
2381                            that the destination is on-link.
2382
2383                            WHY? DW.
2384                            Because we are allowed to send to an iface
2385                            even if it has NO routes and NO assigned
2386                            addresses. When oif is specified, the routing
2387                            tables are looked up with only one purpose:
2388                            to check whether the destination is gatewayed, rather than
2389                            direct. Moreover, if MSG_DONTROUTE is set,
2390                            we send the packet, ignoring both routing tables
2391                            and ifaddr state. --ANK
2392
2393
2394                            We could do this even if oif is unknown
2395                            (likely IPv6), but we do not.
2396                          */
2397
2398                         if (fl.fl4_src == 0)
2399                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2400                                                               RT_SCOPE_LINK);
2401                         res.type = RTN_UNICAST;
2402                         goto make_route;
2403                 }
2404                 if (dev_out)
2405                         dev_put(dev_out);
2406                 err = -ENETUNREACH;
2407                 goto out;
2408         }
2409         free_res = 1;
2410
2411         if (res.type == RTN_LOCAL) {
2412                 if (!fl.fl4_src)
2413                         fl.fl4_src = fl.fl4_dst;
2414                 if (dev_out)
2415                         dev_put(dev_out);
2416                 dev_out = net->loopback_dev;
2417                 dev_hold(dev_out);
2418                 fl.oif = dev_out->ifindex;
2419                 if (res.fi)
2420                         fib_info_put(res.fi);
2421                 res.fi = NULL;
2422                 flags |= RTCF_LOCAL;
2423                 goto make_route;
2424         }
2425
2426 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2427         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2428                 fib_select_multipath(&fl, &res);
2429         else
2430 #endif
2431         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2432                 fib_select_default(net, &fl, &res);
2433
2434         if (!fl.fl4_src)
2435                 fl.fl4_src = FIB_RES_PREFSRC(res);
2436
2437         if (dev_out)
2438                 dev_put(dev_out);
2439         dev_out = FIB_RES_DEV(res);
2440         dev_hold(dev_out);
2441         fl.oif = dev_out->ifindex;
2442
2443
2444 make_route:
2445         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2446
2447
2448         if (free_res)
2449                 fib_res_put(&res);
2450         if (dev_out)
2451                 dev_put(dev_out);
2452 out:    return err;
2453 }
2454
2455 int __ip_route_output_key(struct net *net, struct rtable **rp,
2456                           const struct flowi *flp)
2457 {
2458         unsigned hash;
2459         struct rtable *rth;
2460
2461         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2462
2463         rcu_read_lock_bh();
2464         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2465                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2466                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2467                     rth->fl.fl4_src == flp->fl4_src &&
2468                     rth->fl.iif == 0 &&
2469                     rth->fl.oif == flp->oif &&
2470                     rth->fl.mark == flp->mark &&
2471                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2472                             (IPTOS_RT_MASK | RTO_ONLINK)) &&
2473                     rth->u.dst.dev->nd_net == net &&
2474                     rth->rt_genid == atomic_read(&rt_genid)) {
2475                         dst_use(&rth->u.dst, jiffies);
2476                         RT_CACHE_STAT_INC(out_hit);
2477                         rcu_read_unlock_bh();
2478                         *rp = rth;
2479                         return 0;
2480                 }
2481                 RT_CACHE_STAT_INC(out_hlist_search);
2482         }
2483         rcu_read_unlock_bh();
2484
2485         return ip_route_output_slow(net, rp, flp);
2486 }
2487
2488 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2489
2490 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2491 {
2492 }
2493
2494 static struct dst_ops ipv4_dst_blackhole_ops = {
2495         .family                 =       AF_INET,
2496         .protocol               =       __constant_htons(ETH_P_IP),
2497         .destroy                =       ipv4_dst_destroy,
2498         .check                  =       ipv4_dst_check,
2499         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2500         .entry_size             =       sizeof(struct rtable),
2501         .entries                =       ATOMIC_INIT(0),
2502 };
2503
2504
2505 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2506 {
2507         struct rtable *ort = *rp;
2508         struct rtable *rt = (struct rtable *)
2509                 dst_alloc(&ipv4_dst_blackhole_ops);
2510
2511         if (rt) {
2512                 struct dst_entry *new = &rt->u.dst;
2513
2514                 atomic_set(&new->__refcnt, 1);
2515                 new->__use = 1;
2516                 new->input = dst_discard;
2517                 new->output = dst_discard;
2518                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2519
2520                 new->dev = ort->u.dst.dev;
2521                 if (new->dev)
2522                         dev_hold(new->dev);
2523
2524                 rt->fl = ort->fl;
2525
2526                 rt->idev = ort->idev;
2527                 if (rt->idev)
2528                         in_dev_hold(rt->idev);
2529                 rt->rt_genid = atomic_read(&rt_genid);
2530                 rt->rt_flags = ort->rt_flags;
2531                 rt->rt_type = ort->rt_type;
2532                 rt->rt_dst = ort->rt_dst;
2533                 rt->rt_src = ort->rt_src;
2534                 rt->rt_iif = ort->rt_iif;
2535                 rt->rt_gateway = ort->rt_gateway;
2536                 rt->rt_spec_dst = ort->rt_spec_dst;
2537                 rt->peer = ort->peer;
2538                 if (rt->peer)
2539                         atomic_inc(&rt->peer->refcnt);
2540
2541                 dst_free(new);
2542         }
2543
2544         dst_release(&(*rp)->u.dst);
2545         *rp = rt;
2546         return (rt ? 0 : -ENOMEM);
2547 }
2548
2549 int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp,
2550                          struct sock *sk, int flags)
2551 {
2552         int err;
2553
2554         if ((err = __ip_route_output_key(net, rp, flp)) != 0)
2555                 return err;
2556
2557         if (flp->proto) {
2558                 if (!flp->fl4_src)
2559                         flp->fl4_src = (*rp)->rt_src;
2560                 if (!flp->fl4_dst)
2561                         flp->fl4_dst = (*rp)->rt_dst;
2562                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk,
2563                                     flags ? XFRM_LOOKUP_WAIT : 0);
2564                 if (err == -EREMOTE)
2565                         err = ipv4_dst_blackhole(rp, flp, sk);
2566
2567                 return err;
2568         }
2569
2570         return 0;
2571 }
2572
2573 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2574
2575 int ip_route_output_key(struct net *net, struct rtable **rp, struct flowi *flp)
2576 {
2577         return ip_route_output_flow(net, rp, flp, NULL, 0);
2578 }
2579
2580 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2581                         int nowait, unsigned int flags)
2582 {
2583         struct rtable *rt = (struct rtable*)skb->dst;
2584         struct rtmsg *r;
2585         struct nlmsghdr *nlh;
2586         long expires;
2587         u32 id = 0, ts = 0, tsage = 0, error;
2588
2589         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2590         if (nlh == NULL)
2591                 return -EMSGSIZE;
2592
2593         r = nlmsg_data(nlh);
2594         r->rtm_family    = AF_INET;
2595         r->rtm_dst_len  = 32;
2596         r->rtm_src_len  = 0;
2597         r->rtm_tos      = rt->fl.fl4_tos;
2598         r->rtm_table    = RT_TABLE_MAIN;
2599         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2600         r->rtm_type     = rt->rt_type;
2601         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2602         r->rtm_protocol = RTPROT_UNSPEC;
2603         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2604         if (rt->rt_flags & RTCF_NOTIFY)
2605                 r->rtm_flags |= RTM_F_NOTIFY;
2606
2607         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2608
2609         if (rt->fl.fl4_src) {
2610                 r->rtm_src_len = 32;
2611                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2612         }
2613         if (rt->u.dst.dev)
2614                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2615 #ifdef CONFIG_NET_CLS_ROUTE
2616         if (rt->u.dst.tclassid)
2617                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2618 #endif
2619         if (rt->fl.iif)
2620                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2621         else if (rt->rt_src != rt->fl.fl4_src)
2622                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2623
2624         if (rt->rt_dst != rt->rt_gateway)
2625                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2626
2627         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2628                 goto nla_put_failure;
2629
2630         error = rt->u.dst.error;
2631         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2632         if (rt->peer) {
2633                 id = rt->peer->ip_id_count;
2634                 if (rt->peer->tcp_ts_stamp) {
2635                         ts = rt->peer->tcp_ts;
2636                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2637                 }
2638         }
2639
2640         if (rt->fl.iif) {
2641 #ifdef CONFIG_IP_MROUTE
2642                 __be32 dst = rt->rt_dst;
2643
2644                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2645                     IPV4_DEVCONF_ALL(&init_net, MC_FORWARDING)) {
2646                         int err = ipmr_get_route(skb, r, nowait);
2647                         if (err <= 0) {
2648                                 if (!nowait) {
2649                                         if (err == 0)
2650                                                 return 0;
2651                                         goto nla_put_failure;
2652                                 } else {
2653                                         if (err == -EMSGSIZE)
2654                                                 goto nla_put_failure;
2655                                         error = err;
2656                                 }
2657                         }
2658                 } else
2659 #endif
2660                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2661         }
2662
2663         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2664                                expires, error) < 0)
2665                 goto nla_put_failure;
2666
2667         return nlmsg_end(skb, nlh);
2668
2669 nla_put_failure:
2670         nlmsg_cancel(skb, nlh);
2671         return -EMSGSIZE;
2672 }
2673
2674 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2675 {
2676         struct net *net = in_skb->sk->sk_net;
2677         struct rtmsg *rtm;
2678         struct nlattr *tb[RTA_MAX+1];
2679         struct rtable *rt = NULL;
2680         __be32 dst = 0;
2681         __be32 src = 0;
2682         u32 iif;
2683         int err;
2684         struct sk_buff *skb;
2685
2686         if (net != &init_net)
2687                 return -EINVAL;
2688
2689         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2690         if (err < 0)
2691                 goto errout;
2692
2693         rtm = nlmsg_data(nlh);
2694
2695         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2696         if (skb == NULL) {
2697                 err = -ENOBUFS;
2698                 goto errout;
2699         }
2700
2701         /* Reserve room for dummy headers; this skb can pass
2702            through a good chunk of the routing engine.
2703          */
2704         skb_reset_mac_header(skb);
2705         skb_reset_network_header(skb);
2706
2707         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2708         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2709         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2710
2711         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2712         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2713         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2714
2715         if (iif) {
2716                 struct net_device *dev;
2717
2718                 dev = __dev_get_by_index(&init_net, iif);
2719                 if (dev == NULL) {
2720                         err = -ENODEV;
2721                         goto errout_free;
2722                 }
2723
2724                 skb->protocol   = htons(ETH_P_IP);
2725                 skb->dev        = dev;
2726                 local_bh_disable();
2727                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2728                 local_bh_enable();
2729
2730                 rt = (struct rtable*) skb->dst;
2731                 if (err == 0 && rt->u.dst.error)
2732                         err = -rt->u.dst.error;
2733         } else {
2734                 struct flowi fl = {
2735                         .nl_u = {
2736                                 .ip4_u = {
2737                                         .daddr = dst,
2738                                         .saddr = src,
2739                                         .tos = rtm->rtm_tos,
2740                                 },
2741                         },
2742                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2743                 };
2744                 err = ip_route_output_key(&init_net, &rt, &fl);
2745         }
2746
2747         if (err)
2748                 goto errout_free;
2749
2750         skb->dst = &rt->u.dst;
2751         if (rtm->rtm_flags & RTM_F_NOTIFY)
2752                 rt->rt_flags |= RTCF_NOTIFY;
2753
2754         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2755                                 RTM_NEWROUTE, 0, 0);
2756         if (err <= 0)
2757                 goto errout_free;
2758
2759         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2760 errout:
2761         return err;
2762
2763 errout_free:
2764         kfree_skb(skb);
2765         goto errout;
2766 }
2767
2768 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2769 {
2770         struct rtable *rt;
2771         int h, s_h;
2772         int idx, s_idx;
2773
2774         s_h = cb->args[0];
2775         if (s_h < 0)
2776                 s_h = 0;
2777         s_idx = idx = cb->args[1];
2778         for (h = s_h; h <= rt_hash_mask; h++) {
2779                 rcu_read_lock_bh();
2780                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2781                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2782                         if (idx < s_idx)
2783                                 continue;
2784                         if (rt->rt_genid != atomic_read(&rt_genid))
2785                                 continue;
2786                         skb->dst = dst_clone(&rt->u.dst);
2787                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2788                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2789                                          1, NLM_F_MULTI) <= 0) {
2790                                 dst_release(xchg(&skb->dst, NULL));
2791                                 rcu_read_unlock_bh();
2792                                 goto done;
2793                         }
2794                         dst_release(xchg(&skb->dst, NULL));
2795                 }
2796                 rcu_read_unlock_bh();
2797                 s_idx = 0;
2798         }
2799
2800 done:
2801         cb->args[0] = h;
2802         cb->args[1] = idx;
2803         return skb->len;
2804 }
2805
2806 void ip_rt_multicast_event(struct in_device *in_dev)
2807 {
2808         rt_cache_flush(0);
2809 }
2810
2811 #ifdef CONFIG_SYSCTL
2812 static int flush_delay;
2813
2814 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2815                                         struct file *filp, void __user *buffer,
2816                                         size_t *lenp, loff_t *ppos)
2817 {
2818         if (write) {
2819                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2820                 rt_cache_flush(flush_delay);
2821                 return 0;
2822         }
2823
2824         return -EINVAL;
2825 }
2826
2827 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2828                                                 int __user *name,
2829                                                 int nlen,
2830                                                 void __user *oldval,
2831                                                 size_t __user *oldlenp,
2832                                                 void __user *newval,
2833                                                 size_t newlen)
2834 {
2835         int delay;
2836         if (newlen != sizeof(int))
2837                 return -EINVAL;
2838         if (get_user(delay, (int __user *)newval))
2839                 return -EFAULT;
2840         rt_cache_flush(delay);
2841         return 0;
2842 }
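/*
 * Editor's sketch, not part of the original file: the flush handlers above
 * are reached from userspace through /proc/sys/net/ipv4/route/flush (a
 * write-only file, see the 0200 mode in the table below).  Writing an
 * integer hands that value to rt_cache_flush() as the delay; writing "0"
 * flushes the routing cache immediately.  A minimal userspace equivalent
 * of "echo 0 > /proc/sys/net/ipv4/route/flush":
 */
#if 0	/* userspace illustration only, not kernel code */
#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/proc/sys/net/ipv4/route/flush", "w");

	if (!f)
		return 1;		/* needs root, or the file is absent */
	fputs("0\n", f);		/* request an immediate cache flush */
	fclose(f);
	return 0;
}
#endif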
2843
2844 ctl_table ipv4_route_table[] = {
2845         {
2846                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2847                 .procname       = "flush",
2848                 .data           = &flush_delay,
2849                 .maxlen         = sizeof(int),
2850                 .mode           = 0200,
2851                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2852                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2853         },
2854         {
2855                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2856                 .procname       = "gc_thresh",
2857                 .data           = &ipv4_dst_ops.gc_thresh,
2858                 .maxlen         = sizeof(int),
2859                 .mode           = 0644,
2860                 .proc_handler   = &proc_dointvec,
2861         },
2862         {
2863                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2864                 .procname       = "max_size",
2865                 .data           = &ip_rt_max_size,
2866                 .maxlen         = sizeof(int),
2867                 .mode           = 0644,
2868                 .proc_handler   = &proc_dointvec,
2869         },
2870         {
2871                 /*  Deprecated. Use gc_min_interval_ms */
2872
2873                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2874                 .procname       = "gc_min_interval",
2875                 .data           = &ip_rt_gc_min_interval,
2876                 .maxlen         = sizeof(int),
2877                 .mode           = 0644,
2878                 .proc_handler   = &proc_dointvec_jiffies,
2879                 .strategy       = &sysctl_jiffies,
2880         },
2881         {
2882                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2883                 .procname       = "gc_min_interval_ms",
2884                 .data           = &ip_rt_gc_min_interval,
2885                 .maxlen         = sizeof(int),
2886                 .mode           = 0644,
2887                 .proc_handler   = &proc_dointvec_ms_jiffies,
2888                 .strategy       = &sysctl_ms_jiffies,
2889         },
2890         {
2891                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2892                 .procname       = "gc_timeout",
2893                 .data           = &ip_rt_gc_timeout,
2894                 .maxlen         = sizeof(int),
2895                 .mode           = 0644,
2896                 .proc_handler   = &proc_dointvec_jiffies,
2897                 .strategy       = &sysctl_jiffies,
2898         },
2899         {
2900                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2901                 .procname       = "gc_interval",
2902                 .data           = &ip_rt_gc_interval,
2903                 .maxlen         = sizeof(int),
2904                 .mode           = 0644,
2905                 .proc_handler   = &proc_dointvec_jiffies,
2906                 .strategy       = &sysctl_jiffies,
2907         },
2908         {
2909                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2910                 .procname       = "redirect_load",
2911                 .data           = &ip_rt_redirect_load,
2912                 .maxlen         = sizeof(int),
2913                 .mode           = 0644,
2914                 .proc_handler   = &proc_dointvec,
2915         },
2916         {
2917                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2918                 .procname       = "redirect_number",
2919                 .data           = &ip_rt_redirect_number,
2920                 .maxlen         = sizeof(int),
2921                 .mode           = 0644,
2922                 .proc_handler   = &proc_dointvec,
2923         },
2924         {
2925                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2926                 .procname       = "redirect_silence",
2927                 .data           = &ip_rt_redirect_silence,
2928                 .maxlen         = sizeof(int),
2929                 .mode           = 0644,
2930                 .proc_handler   = &proc_dointvec,
2931         },
2932         {
2933                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2934                 .procname       = "error_cost",
2935                 .data           = &ip_rt_error_cost,
2936                 .maxlen         = sizeof(int),
2937                 .mode           = 0644,
2938                 .proc_handler   = &proc_dointvec,
2939         },
2940         {
2941                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2942                 .procname       = "error_burst",
2943                 .data           = &ip_rt_error_burst,
2944                 .maxlen         = sizeof(int),
2945                 .mode           = 0644,
2946                 .proc_handler   = &proc_dointvec,
2947         },
2948         {
2949                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2950                 .procname       = "gc_elasticity",
2951                 .data           = &ip_rt_gc_elasticity,
2952                 .maxlen         = sizeof(int),
2953                 .mode           = 0644,
2954                 .proc_handler   = &proc_dointvec,
2955         },
2956         {
2957                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2958                 .procname       = "mtu_expires",
2959                 .data           = &ip_rt_mtu_expires,
2960                 .maxlen         = sizeof(int),
2961                 .mode           = 0644,
2962                 .proc_handler   = &proc_dointvec_jiffies,
2963                 .strategy       = &sysctl_jiffies,
2964         },
2965         {
2966                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2967                 .procname       = "min_pmtu",
2968                 .data           = &ip_rt_min_pmtu,
2969                 .maxlen         = sizeof(int),
2970                 .mode           = 0644,
2971                 .proc_handler   = &proc_dointvec,
2972         },
2973         {
2974                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2975                 .procname       = "min_adv_mss",
2976                 .data           = &ip_rt_min_advmss,
2977                 .maxlen         = sizeof(int),
2978                 .mode           = 0644,
2979                 .proc_handler   = &proc_dointvec,
2980         },
2981         {
2982                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2983                 .procname       = "secret_interval",
2984                 .data           = &ip_rt_secret_interval,
2985                 .maxlen         = sizeof(int),
2986                 .mode           = 0644,
2987                 .proc_handler   = &proc_dointvec_jiffies,
2988                 .strategy       = &sysctl_jiffies,
2989         },
2990         { .ctl_name = 0 }
2991 };
2992 #endif
2993
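/*
 * Per-cpu counters for the "route" packet classifier
 * (CONFIG_NET_CLS_ROUTE), allocated in ip_rt_init() below with 256
 * slots per cpu for per-realm traffic accounting.
 */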
2994 #ifdef CONFIG_NET_CLS_ROUTE
2995 struct ip_rt_acct *ip_rt_acct __read_mostly;
2996 #endif /* CONFIG_NET_CLS_ROUTE */
2997
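/*
 * "rhash_entries=N" on the kernel command line forces the number of
 * route cache hash buckets; left at zero, alloc_large_system_hash()
 * in ip_rt_init() below picks a size based on available memory.
 */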
2998 static __initdata unsigned long rhash_entries;
2999 static int __init set_rhash_entries(char *str)
3000 {
3001         if (!str)
3002                 return 0;
3003         rhash_entries = simple_strtoul(str, &str, 0);
3004         return 1;
3005 }
3006 __setup("rhash_entries=", set_rhash_entries);
3007
3008 int __init ip_rt_init(void)
3009 {
3010         int rc = 0;
3011
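	/*
	 * Seed the cache generation id from the memory size and current
	 * jiffies so it differs between boots; cached routes whose
	 * rt_genid no longer matches are treated as stale (see the dump
	 * loop above).
	 */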
3012         atomic_set(&rt_genid, (int) ((num_physpages ^ (num_physpages>>8)) ^
3013                              (jiffies ^ (jiffies >> 7))));
3014
3015 #ifdef CONFIG_NET_CLS_ROUTE
3016         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
3017         if (!ip_rt_acct)
3018                 panic("IP: failed to allocate ip_rt_acct\n");
3019 #endif
3020
3021         ipv4_dst_ops.kmem_cachep =
3022                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3023                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3024
3025         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3026
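	/*
	 * Size the hash table: rhash_entries (boot parameter above)
	 * forces an explicit size, otherwise alloc_large_system_hash()
	 * scales it with memory; machines with at least 128K physical
	 * pages (512 MB with 4 KB pages) pass the smaller scale value,
	 * which yields proportionally more buckets.
	 */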
3027         rt_hash_table = (struct rt_hash_bucket *)
3028                 alloc_large_system_hash("IP route cache",
3029                                         sizeof(struct rt_hash_bucket),
3030                                         rhash_entries,
3031                                         (num_physpages >= 128 * 1024) ?
3032                                         15 : 17,
3033                                         0,
3034                                         &rt_hash_log,
3035                                         &rt_hash_mask,
3036                                         0);
3037         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
3038         rt_hash_lock_init();
3039
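	/*
	 * Start garbage collection once there is roughly one cached
	 * route per hash bucket, and cap the cache at an average of 16
	 * entries per bucket.
	 */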
3040         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
3041         ip_rt_max_size = (rt_hash_mask + 1) * 16;
3042
3043         devinet_init();
3044         ip_fib_init();
3045
3046         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
3047
3048         /* All the timers started at system startup tend
3049            to synchronize. Perturb it a bit.
3050          */
3051         schedule_delayed_work(&expires_work,
3052                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
3053
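	/*
	 * rt_secret_rebuild (set up above) fires every
	 * ip_rt_secret_interval to invalidate the cached routes; its
	 * first expiry is randomised for the same reason as the GC work
	 * above.
	 */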
3054         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3055                 ip_rt_secret_interval;
3056         add_timer(&rt_secret_timer);
3057
3058         if (ip_rt_proc_init(&init_net))
3059                 printk(KERN_ERR "Unable to create route proc files\n");
3060 #ifdef CONFIG_XFRM
3061         xfrm_init();
3062         xfrm4_init();
3063 #endif
3064         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3065
3066         return rc;
3067 }
3068
3069 EXPORT_SYMBOL(__ip_select_ident);
3070 EXPORT_SYMBOL(ip_route_input);
3071 EXPORT_SYMBOL(ip_route_output_key);