SAFE public projects git trees. - safe/jmp/linux-2.6/blob - net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Version:     $Id: route.c,v 1.103 2002/01/12 07:44:09 davem Exp $
   9  *
  10  * Authors:     Ross Biro
  11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  13  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  14  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  15  *
  16  * Fixes:
  17  *              Alan Cox        :       Verify area fixes.
  18  *              Alan Cox        :       cli() protects routing changes
  19  *              Rui Oliveira    :       ICMP routing table updates
  20  *              (rco@di.uminho.pt)      Routing table insertion and update
  21  *              Linus Torvalds  :       Rewrote bits to be sensible
  22  *              Alan Cox        :       Added BSD route gw semantics
  23  *              Alan Cox        :       Super /proc >4K
  24  *              Alan Cox        :       MTU in route table
  25  *              Alan Cox        :       MSS actually. Also added the window
  26  *                                      clamper.
  27  *              Sam Lantinga    :       Fixed route matching in rt_del()
  28  *              Alan Cox        :       Routing cache support.
  29  *              Alan Cox        :       Removed compatibility cruft.
  30  *              Alan Cox        :       RTF_REJECT support.
  31  *              Alan Cox        :       TCP irtt support.
  32  *              Jonathan Naylor :       Added Metric support.
  33  *      Miquel van Smoorenburg  :       BSD API fixes.
  34  *      Miquel van Smoorenburg  :       Metrics.
  35  *              Alan Cox        :       Use __u32 properly
  36  *              Alan Cox        :       Aligned routing errors more closely with BSD
  37  *                                      our system is still very different.
  38  *              Alan Cox        :       Faster /proc handling
  39  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  40  *                                      routing caches and better behaviour.
  41  *
  42  *              Olaf Erb        :       irtt wasn't being copied right.
  43  *              Bjorn Ekwall    :       Kerneld route support.
  44  *              Alan Cox        :       Multicast fixed (I hope)
  45  *              Pavel Krauz     :       Limited broadcast fixed
  46  *              Mike McLagan    :       Routing by source
  47  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  48  *                                      route.c and rewritten from scratch.
  49  *              Andi Kleen      :       Load-limit warning messages.
  50  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  51  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  52  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  53  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  54  *              Marc Boucher    :       routing by fwmark
  55  *      Robert Olsson           :       Added rt_cache statistics
  56  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  57  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  58  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  59  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  60  *
  61  *              This program is free software; you can redistribute it and/or
  62  *              modify it under the terms of the GNU General Public License
  63  *              as published by the Free Software Foundation; either version
  64  *              2 of the License, or (at your option) any later version.
  65  */
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <asm/system.h>
  70 #include <linux/bitops.h>
  71 #include <linux/types.h>
  72 #include <linux/kernel.h>
  73 #include <linux/mm.h>
  74 #include <linux/bootmem.h>
  75 #include <linux/string.h>
  76 #include <linux/socket.h>
  77 #include <linux/sockios.h>
  78 #include <linux/errno.h>
  79 #include <linux/in.h>
  80 #include <linux/inet.h>
  81 #include <linux/netdevice.h>
  82 #include <linux/proc_fs.h>
  83 #include <linux/init.h>
  84 #include <linux/workqueue.h>
  85 #include <linux/skbuff.h>
  86 #include <linux/inetdevice.h>
  87 #include <linux/igmp.h>
  88 #include <linux/pkt_sched.h>
  89 #include <linux/mroute.h>
  90 #include <linux/netfilter_ipv4.h>
  91 #include <linux/random.h>
  92 #include <linux/jhash.h>
  93 #include <linux/rcupdate.h>
  94 #include <linux/times.h>
  95 #include <net/dst.h>
  96 #include <net/net_namespace.h>
  97 #include <net/protocol.h>
  98 #include <net/ip.h>
  99 #include <net/route.h>
 100 #include <net/inetpeer.h>
 101 #include <net/sock.h>
 102 #include <net/ip_fib.h>
 103 #include <net/arp.h>
 104 #include <net/tcp.h>
 105 #include <net/icmp.h>
 106 #include <net/xfrm.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #endif
 112
 113 #define RT_FL_TOS(oldflp) \
 114     ((u32)(oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK)))
 115
 116 #define IP_MAX_MTU      0xFFF0
 117
 118 #define RT_GC_TIMEOUT (300*HZ)
 119
 120 static int ip_rt_min_delay              = 2 * HZ;
 121 static int ip_rt_max_delay              = 10 * HZ;
 122 static int ip_rt_max_size;
 123 static int ip_rt_gc_timeout             = RT_GC_TIMEOUT;
 124 static int ip_rt_gc_interval            = 60 * HZ;
 125 static int ip_rt_gc_min_interval        = HZ / 2;
 126 static int ip_rt_redirect_number        = 9;
 127 static int ip_rt_redirect_load          = HZ / 50;
 128 static int ip_rt_redirect_silence       = ((HZ / 50) << (9 + 1));
 129 static int ip_rt_error_cost             = HZ;
 130 static int ip_rt_error_burst            = 5 * HZ;
 131 static int ip_rt_gc_elasticity          = 8;
 132 static int ip_rt_mtu_expires            = 10 * 60 * HZ;
 133 static int ip_rt_min_pmtu               = 512 + 20 + 20;
 134 static int ip_rt_min_advmss             = 256;
 135 static int ip_rt_secret_interval        = 10 * 60 * HZ;
 136 static int ip_rt_flush_expected;
 137 static unsigned long rt_deadline;
 138
 139 #define RTprint(a...)   printk(KERN_DEBUG a)
 140
 141 static struct timer_list rt_flush_timer;
 142 static void rt_worker_func(struct work_struct *work);
 143 static DECLARE_DELAYED_WORK(expires_work, rt_worker_func);
 144 static struct timer_list rt_secret_timer;
 145
 146 /*
 147  *      Interface to generic destination cache.
 148  */
 149
 150 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 151 static void              ipv4_dst_destroy(struct dst_entry *dst);
 152 static void              ipv4_dst_ifdown(struct dst_entry *dst,
 153                                          struct net_device *dev, int how);
 154 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 155 static void              ipv4_link_failure(struct sk_buff *skb);
 156 static void              ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
 157 static int rt_garbage_collect(void);
 158
 159
 160 static struct dst_ops ipv4_dst_ops = {
 161         .family =               AF_INET,
 162         .protocol =             __constant_htons(ETH_P_IP),
 163         .gc =                   rt_garbage_collect,
 164         .check =                ipv4_dst_check,
 165         .destroy =              ipv4_dst_destroy,
 166         .ifdown =               ipv4_dst_ifdown,
 167         .negative_advice =      ipv4_negative_advice,
 168         .link_failure =         ipv4_link_failure,
 169         .update_pmtu =          ip_rt_update_pmtu,
 170         .local_out =            ip_local_out,
 171         .entry_size =           sizeof(struct rtable),
 172 };
 173
 174 #define ECN_OR_COST(class)      TC_PRIO_##class
 175
 176 const __u8 ip_tos2prio[16] = {
 177         TC_PRIO_BESTEFFORT,
 178         ECN_OR_COST(FILLER),
 179         TC_PRIO_BESTEFFORT,
 180         ECN_OR_COST(BESTEFFORT),
 181         TC_PRIO_BULK,
 182         ECN_OR_COST(BULK),
 183         TC_PRIO_BULK,
 184         ECN_OR_COST(BULK),
 185         TC_PRIO_INTERACTIVE,
 186         ECN_OR_COST(INTERACTIVE),
 187         TC_PRIO_INTERACTIVE,
 188         ECN_OR_COST(INTERACTIVE),
 189         TC_PRIO_INTERACTIVE_BULK,
 190         ECN_OR_COST(INTERACTIVE_BULK),
 191         TC_PRIO_INTERACTIVE_BULK,
 192         ECN_OR_COST(INTERACTIVE_BULK)
 193 };
 194
 195
 196 /*
 197  * Route cache.
 198  */
 199
 200 /* The locking scheme is rather straight forward:
 201  *
 202  * 1) Read-Copy Update protects the buckets of the central route hash.
 203  * 2) Only writers remove entries, and they hold the lock
 204  *    as they look at rtable reference counts.
 205  * 3) Only readers acquire references to rtable entries,
 206  *    they do so with atomic increments and with the
 207  *    lock held.
 208  */
 209
 210 struct rt_hash_bucket {
 211         struct rtable   *chain;
 212 };
 213 #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || \
 214         defined(CONFIG_PROVE_LOCKING)
 215 /*
 216  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
 217  * The size of this table is a power of two and depends on the number of CPUS.
 218  * (on lockdep we have a quite big spinlock_t, so keep the size down there)
 219  */
 220 #ifdef CONFIG_LOCKDEP
 221 # define RT_HASH_LOCK_SZ        256
 222 #else
 223 # if NR_CPUS >= 32
 224 #  define RT_HASH_LOCK_SZ       4096
 225 # elif NR_CPUS >= 16
 226 #  define RT_HASH_LOCK_SZ       2048
 227 # elif NR_CPUS >= 8
 228 #  define RT_HASH_LOCK_SZ       1024
 229 # elif NR_CPUS >= 4
 230 #  define RT_HASH_LOCK_SZ       512
 231 # else
 232 #  define RT_HASH_LOCK_SZ       256
 233 # endif
 234 #endif
 235
 236 static spinlock_t       *rt_hash_locks;
 237 # define rt_hash_lock_addr(slot) &rt_hash_locks[(slot) & (RT_HASH_LOCK_SZ - 1)]
 238 # define rt_hash_lock_init()    { \
 239                 int i; \
 240                 rt_hash_locks = kmalloc(sizeof(spinlock_t) * RT_HASH_LOCK_SZ, GFP_KERNEL); \
 241                 if (!rt_hash_locks) panic("IP: failed to allocate rt_hash_locks\n"); \
 242                 for (i = 0; i < RT_HASH_LOCK_SZ; i++) \
 243                         spin_lock_init(&rt_hash_locks[i]); \
 244                 }
 245 #else
 246 # define rt_hash_lock_addr(slot) NULL
 247 # define rt_hash_lock_init()
 248 #endif
 249
 250 static struct rt_hash_bucket    *rt_hash_table;
 251 static unsigned                 rt_hash_mask;
 252 static unsigned int             rt_hash_log;
 253 static unsigned int             rt_hash_rnd;
 254
 255 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 256 #define RT_CACHE_STAT_INC(field) \
 257         (__raw_get_cpu_var(rt_cache_stat).field++)
 258
 259 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 260                                 struct rtable **res);
 261
 262 static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 263 {
 264         return (jhash_2words(daddr, saddr, rt_hash_rnd)
 265                 & rt_hash_mask);
 266 }
 267
 268 #define rt_hash(daddr, saddr, idx) \
 269         rt_hash_code((__force u32)(__be32)(daddr),\
 270                      (__force u32)(__be32)(saddr) ^ ((idx) << 5))
 271
 272 #ifdef CONFIG_PROC_FS
 273 struct rt_cache_iter_state {
 274         int bucket;
 275 };
 276
 277 static struct rtable *rt_cache_get_first(struct seq_file *seq)
 278 {
 279         struct rtable *r = NULL;
 280         struct rt_cache_iter_state *st = seq->private;
 281
 282         for (st->bucket = rt_hash_mask; st->bucket >= 0; --st->bucket) {
 283                 rcu_read_lock_bh();
 284                 r = rt_hash_table[st->bucket].chain;
 285                 if (r)
 286                         break;
 287                 rcu_read_unlock_bh();
 288         }
 289         return rcu_dereference(r);
 290 }
 291
 292 static struct rtable *rt_cache_get_next(struct seq_file *seq, struct rtable *r)
 293 {
 294         struct rt_cache_iter_state *st = seq->private;
 295
 296         r = r->u.dst.rt_next;
 297         while (!r) {
 298                 rcu_read_unlock_bh();
 299                 if (--st->bucket < 0)
 300                         break;
 301                 rcu_read_lock_bh();
 302                 r = rt_hash_table[st->bucket].chain;
 303         }
 304         return rcu_dereference(r);
 305 }
 306
 307 static struct rtable *rt_cache_get_idx(struct seq_file *seq, loff_t pos)
 308 {
 309         struct rtable *r = rt_cache_get_first(seq);
 310
 311         if (r)
 312                 while (pos && (r = rt_cache_get_next(seq, r)))
 313                         --pos;
 314         return pos ? NULL : r;
 315 }
 316
 317 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 318 {
 319         return *pos ? rt_cache_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
 320 }
 321
 322 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 323 {
 324         struct rtable *r = NULL;
 325
 326         if (v == SEQ_START_TOKEN)
 327                 r = rt_cache_get_first(seq);
 328         else
 329                 r = rt_cache_get_next(seq, v);
 330         ++*pos;
 331         return r;
 332 }
 333
 334 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 335 {
 336         if (v && v != SEQ_START_TOKEN)
 337                 rcu_read_unlock_bh();
 338 }
 339
 340 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 341 {
 342         if (v == SEQ_START_TOKEN)
 343                 seq_printf(seq, "%-127s\n",
 344                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 345                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 346                            "HHUptod\tSpecDst");
 347         else {
 348                 struct rtable *r = v;
 349                 char temp[256];
 350
 351                 sprintf(temp, "%s\t%08lX\t%08lX\t%8X\t%d\t%u\t%d\t"
 352                               "%08lX\t%d\t%u\t%u\t%02X\t%d\t%1d\t%08X",
 353                         r->u.dst.dev ? r->u.dst.dev->name : "*",
 354                         (unsigned long)r->rt_dst, (unsigned long)r->rt_gateway,
 355                         r->rt_flags, atomic_read(&r->u.dst.__refcnt),
 356                         r->u.dst.__use, 0, (unsigned long)r->rt_src,
 357                         (dst_metric(&r->u.dst, RTAX_ADVMSS) ?
 358                              (int)dst_metric(&r->u.dst, RTAX_ADVMSS) + 40 : 0),
 359                         dst_metric(&r->u.dst, RTAX_WINDOW),
 360                         (int)((dst_metric(&r->u.dst, RTAX_RTT) >> 3) +
 361                               dst_metric(&r->u.dst, RTAX_RTTVAR)),
 362                         r->fl.fl4_tos,
 363                         r->u.dst.hh ? atomic_read(&r->u.dst.hh->hh_refcnt) : -1,
 364                         r->u.dst.hh ? (r->u.dst.hh->hh_output ==
 365                                        dev_queue_xmit) : 0,
 366                         r->rt_spec_dst);
 367                 seq_printf(seq, "%-127s\n", temp);
 368         }
 369         return 0;
 370 }
 371
 372 static const struct seq_operations rt_cache_seq_ops = {
 373         .start  = rt_cache_seq_start,
 374         .next   = rt_cache_seq_next,
 375         .stop   = rt_cache_seq_stop,
 376         .show   = rt_cache_seq_show,
 377 };
 378
 379 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 380 {
 381         return seq_open_private(file, &rt_cache_seq_ops,
 382                         sizeof(struct rt_cache_iter_state));
 383 }
 384
 385 static const struct file_operations rt_cache_seq_fops = {
 386         .owner   = THIS_MODULE,
 387         .open    = rt_cache_seq_open,
 388         .read    = seq_read,
 389         .llseek  = seq_lseek,
 390         .release = seq_release_private,
 391 };
 392
 393
 394 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 395 {
 396         int cpu;
 397
 398         if (*pos == 0)
 399                 return SEQ_START_TOKEN;
 400
 401         for (cpu = *pos-1; cpu < NR_CPUS; ++cpu) {
 402                 if (!cpu_possible(cpu))
 403                         continue;
 404                 *pos = cpu+1;
 405                 return &per_cpu(rt_cache_stat, cpu);
 406         }
 407         return NULL;
 408 }
 409
 410 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 411 {
 412         int cpu;
 413
 414         for (cpu = *pos; cpu < NR_CPUS; ++cpu) {
 415                 if (!cpu_possible(cpu))
 416                         continue;
 417                 *pos = cpu+1;
 418                 return &per_cpu(rt_cache_stat, cpu);
 419         }
 420         return NULL;
 421
 422 }
 423
 424 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 425 {
 426
 427 }
 428
 429 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 430 {
 431         struct rt_cache_stat *st = v;
 432
 433         if (v == SEQ_START_TOKEN) {
 434                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 435                 return 0;
 436         }
 437
 438         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 439                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 440                    atomic_read(&ipv4_dst_ops.entries),
 441                    st->in_hit,
 442                    st->in_slow_tot,
 443                    st->in_slow_mc,
 444                    st->in_no_route,
 445                    st->in_brd,
 446                    st->in_martian_dst,
 447                    st->in_martian_src,
 448
 449                    st->out_hit,
 450                    st->out_slow_tot,
 451                    st->out_slow_mc,
 452
 453                    st->gc_total,
 454                    st->gc_ignored,
 455                    st->gc_goal_miss,
 456                    st->gc_dst_overflow,
 457                    st->in_hlist_search,
 458                    st->out_hlist_search
 459                 );
 460         return 0;
 461 }
 462
 463 static const struct seq_operations rt_cpu_seq_ops = {
 464         .start  = rt_cpu_seq_start,
 465         .next   = rt_cpu_seq_next,
 466         .stop   = rt_cpu_seq_stop,
 467         .show   = rt_cpu_seq_show,
 468 };
 469
 470
 471 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 472 {
 473         return seq_open(file, &rt_cpu_seq_ops);
 474 }
 475
 476 static const struct file_operations rt_cpu_seq_fops = {
 477         .owner   = THIS_MODULE,
 478         .open    = rt_cpu_seq_open,
 479         .read    = seq_read,
 480         .llseek  = seq_lseek,
 481         .release = seq_release,
 482 };
 483
 484 #ifdef CONFIG_NET_CLS_ROUTE
 485 static int ip_rt_acct_read(char *buffer, char **start, off_t offset,
 486                            int length, int *eof, void *data)
 487 {
 488         unsigned int i;
 489
 490         if ((offset & 3) || (length & 3))
 491                 return -EIO;
 492
 493         if (offset >= sizeof(struct ip_rt_acct) * 256) {
 494                 *eof = 1;
 495                 return 0;
 496         }
 497
 498         if (offset + length >= sizeof(struct ip_rt_acct) * 256) {
 499                 length = sizeof(struct ip_rt_acct) * 256 - offset;
 500                 *eof = 1;
 501         }
 502
 503         offset /= sizeof(u32);
 504
 505         if (length > 0) {
 506                 u32 *dst = (u32 *) buffer;
 507
 508                 *start = buffer;
 509                 memset(dst, 0, length);
 510
 511                 for_each_possible_cpu(i) {
 512                         unsigned int j;
 513                         u32 *src;
 514
 515                         src = ((u32 *) per_cpu_ptr(ip_rt_acct, i)) + offset;
 516                         for (j = 0; j < length/4; j++)
 517                                 dst[j] += src[j];
 518                 }
 519         }
 520         return length;
 521 }
 522 #endif
 523 #endif /* CONFIG_PROC_FS */
 524
 525 static __inline__ void rt_free(struct rtable *rt)
 526 {
 527         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 528 }
 529
 530 static __inline__ void rt_drop(struct rtable *rt)
 531 {
 532         ip_rt_put(rt);
 533         call_rcu_bh(&rt->u.dst.rcu_head, dst_rcu_free);
 534 }
 535
 536 static __inline__ int rt_fast_clean(struct rtable *rth)
 537 {
 538         /* Kill broadcast/multicast entries very aggresively, if they
 539            collide in hash table with more useful entries */
 540         return (rth->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) &&
 541                 rth->fl.iif && rth->u.dst.rt_next;
 542 }
 543
 544 static __inline__ int rt_valuable(struct rtable *rth)
 545 {
 546         return (rth->rt_flags & (RTCF_REDIRECTED | RTCF_NOTIFY)) ||
 547                 rth->u.dst.expires;
 548 }
 549
 550 static int rt_may_expire(struct rtable *rth, unsigned long tmo1, unsigned long tmo2)
 551 {
 552         unsigned long age;
 553         int ret = 0;
 554
 555         if (atomic_read(&rth->u.dst.__refcnt))
 556                 goto out;
 557
 558         ret = 1;
 559         if (rth->u.dst.expires &&
 560             time_after_eq(jiffies, rth->u.dst.expires))
 561                 goto out;
 562
 563         age = jiffies - rth->u.dst.lastuse;
 564         ret = 0;
 565         if ((age <= tmo1 && !rt_fast_clean(rth)) ||
 566             (age <= tmo2 && rt_valuable(rth)))
 567                 goto out;
 568         ret = 1;
 569 out:    return ret;
 570 }
 571
 572 /* Bits of score are:
 573  * 31: very valuable
 574  * 30: not quite useless
 575  * 29..0: usage counter
 576  */
 577 static inline u32 rt_score(struct rtable *rt)
 578 {
 579         u32 score = jiffies - rt->u.dst.lastuse;
 580
 581         score = ~score & ~(3<<30);
 582
 583         if (rt_valuable(rt))
 584                 score |= (1<<31);
 585
 586         if (!rt->fl.iif ||
 587             !(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST|RTCF_LOCAL)))
 588                 score |= (1<<30);
 589
 590         return score;
 591 }
 592
 593 static inline int compare_keys(struct flowi *fl1, struct flowi *fl2)
 594 {
 595         return ((__force u32)((fl1->nl_u.ip4_u.daddr ^ fl2->nl_u.ip4_u.daddr) |
 596                 (fl1->nl_u.ip4_u.saddr ^ fl2->nl_u.ip4_u.saddr)) |
 597                 (fl1->mark ^ fl2->mark) |
 598                 (*(u16 *)&fl1->nl_u.ip4_u.tos ^
 599                  *(u16 *)&fl2->nl_u.ip4_u.tos) |
 600                 (fl1->oif ^ fl2->oif) |
 601                 (fl1->iif ^ fl2->iif)) == 0;
 602 }
 603
 604 /*
 605  * Perform a full scan of hash table and free all entries.
 606  * Can be called by a softirq or a process.
 607  * In the later case, we want to be reschedule if necessary
 608  */
 609 static void rt_do_flush(int process_context)
 610 {
 611         unsigned int i;
 612         struct rtable *rth, *next;
 613
 614         for (i = 0; i <= rt_hash_mask; i++) {
 615                 if (process_context && need_resched())
 616                         cond_resched();
 617                 rth = rt_hash_table[i].chain;
 618                 if (!rth)
 619                         continue;
 620
 621                 spin_lock_bh(rt_hash_lock_addr(i));
 622                 rth = rt_hash_table[i].chain;
 623                 rt_hash_table[i].chain = NULL;
 624                 spin_unlock_bh(rt_hash_lock_addr(i));
 625
 626                 for (; rth; rth = next) {
 627                         next = rth->u.dst.rt_next;
 628                         rt_free(rth);
 629                 }
 630         }
 631 }
 632
 633 static void rt_check_expire(void)
 634 {
 635         static unsigned int rover;
 636         unsigned int i = rover, goal;
 637         struct rtable *rth, **rthp;
 638         u64 mult;
 639
 640         mult = ((u64)ip_rt_gc_interval) << rt_hash_log;
 641         if (ip_rt_gc_timeout > 1)
 642                 do_div(mult, ip_rt_gc_timeout);
 643         goal = (unsigned int)mult;
 644         if (goal > rt_hash_mask)
 645                 goal = rt_hash_mask + 1;
 646         for (; goal > 0; goal--) {
 647                 unsigned long tmo = ip_rt_gc_timeout;
 648
 649                 i = (i + 1) & rt_hash_mask;
 650                 rthp = &rt_hash_table[i].chain;
 651
 652                 if (need_resched())
 653                         cond_resched();
 654
 655                 if (*rthp == NULL)
 656                         continue;
 657                 spin_lock_bh(rt_hash_lock_addr(i));
 658                 while ((rth = *rthp) != NULL) {
 659                         if (rth->u.dst.expires) {
 660                                 /* Entry is expired even if it is in use */
 661                                 if (time_before_eq(jiffies, rth->u.dst.expires)) {
 662                                         tmo >>= 1;
 663                                         rthp = &rth->u.dst.rt_next;
 664                                         continue;
 665                                 }
 666                         } else if (!rt_may_expire(rth, tmo, ip_rt_gc_timeout)) {
 667                                 tmo >>= 1;
 668                                 rthp = &rth->u.dst.rt_next;
 669                                 continue;
 670                         }
 671
 672                         /* Cleanup aged off entries. */
 673                         *rthp = rth->u.dst.rt_next;
 674                         rt_free(rth);
 675                 }
 676                 spin_unlock_bh(rt_hash_lock_addr(i));
 677         }
 678         rover = i;
 679 }
 680
 681 /*
 682  * rt_worker_func() is run in process context.
 683  * If a whole flush was scheduled, it is done.
 684  * Else, we call rt_check_expire() to scan part of the hash table
 685  */
 686 static void rt_worker_func(struct work_struct *work)
 687 {
 688         if (ip_rt_flush_expected) {
 689                 ip_rt_flush_expected = 0;
 690                 rt_do_flush(1);
 691         } else
 692                 rt_check_expire();
 693         schedule_delayed_work(&expires_work, ip_rt_gc_interval);
 694 }
 695
 696 /* This can run from both BH and non-BH contexts, the latter
 697  * in the case of a forced flush event.
 698  */
 699 static void rt_run_flush(unsigned long process_context)
 700 {
 701         rt_deadline = 0;
 702
 703         get_random_bytes(&rt_hash_rnd, 4);
 704
 705         rt_do_flush(process_context);
 706 }
 707
 708 static DEFINE_SPINLOCK(rt_flush_lock);
 709
 710 void rt_cache_flush(int delay)
 711 {
 712         unsigned long now = jiffies;
 713         int user_mode = !in_softirq();
 714
 715         if (delay < 0)
 716                 delay = ip_rt_min_delay;
 717
 718         spin_lock_bh(&rt_flush_lock);
 719
 720         if (del_timer(&rt_flush_timer) && delay > 0 && rt_deadline) {
 721                 long tmo = (long)(rt_deadline - now);
 722
 723                 /* If flush timer is already running
 724                    and flush request is not immediate (delay > 0):
 725
 726                    if deadline is not achieved, prolongate timer to "delay",
 727                    otherwise fire it at deadline time.
 728                  */
 729
 730                 if (user_mode && tmo < ip_rt_max_delay-ip_rt_min_delay)
 731                         tmo = 0;
 732
 733                 if (delay > tmo)
 734                         delay = tmo;
 735         }
 736
 737         if (delay <= 0) {
 738                 spin_unlock_bh(&rt_flush_lock);
 739                 rt_run_flush(user_mode);
 740                 return;
 741         }
 742
 743         if (rt_deadline == 0)
 744                 rt_deadline = now + ip_rt_max_delay;
 745
 746         mod_timer(&rt_flush_timer, now+delay);
 747         spin_unlock_bh(&rt_flush_lock);
 748 }
 749
 750 /*
 751  * We change rt_hash_rnd and ask next rt_worker_func() invocation
 752  * to perform a flush in process context
 753  */
 754 static void rt_secret_rebuild(unsigned long dummy)
 755 {
 756         get_random_bytes(&rt_hash_rnd, 4);
 757         ip_rt_flush_expected = 1;
 758         cancel_delayed_work(&expires_work);
 759         schedule_delayed_work(&expires_work, HZ/10);
 760         mod_timer(&rt_secret_timer, jiffies + ip_rt_secret_interval);
 761 }
 762
 763 /*
 764    Short description of GC goals.
 765
 766    We want to build algorithm, which will keep routing cache
 767    at some equilibrium point, when number of aged off entries
 768    is kept approximately equal to newly generated ones.
 769
 770    Current expiration strength is variable "expire".
 771    We try to adjust it dynamically, so that if networking
 772    is idle expires is large enough to keep enough of warm entries,
 773    and when load increases it reduces to limit cache size.
 774  */
 775
 776 static int rt_garbage_collect(void)
 777 {
 778         static unsigned long expire = RT_GC_TIMEOUT;
 779         static unsigned long last_gc;
 780         static int rover;
 781         static int equilibrium;
 782         struct rtable *rth, **rthp;
 783         unsigned long now = jiffies;
 784         int goal;
 785
 786         /*
 787          * Garbage collection is pretty expensive,
 788          * do not make it too frequently.
 789          */
 790
 791         RT_CACHE_STAT_INC(gc_total);
 792
 793         if (now - last_gc < ip_rt_gc_min_interval &&
 794             atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) {
 795                 RT_CACHE_STAT_INC(gc_ignored);
 796                 goto out;
 797         }
 798
 799         /* Calculate number of entries, which we want to expire now. */
 800         goal = atomic_read(&ipv4_dst_ops.entries) -
 801                 (ip_rt_gc_elasticity << rt_hash_log);
 802         if (goal <= 0) {
 803                 if (equilibrium < ipv4_dst_ops.gc_thresh)
 804                         equilibrium = ipv4_dst_ops.gc_thresh;
 805                 goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 806                 if (goal > 0) {
 807                         equilibrium += min_t(unsigned int, goal / 2, rt_hash_mask + 1);
 808                         goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium;
 809                 }
 810         } else {
 811                 /* We are in dangerous area. Try to reduce cache really
 812                  * aggressively.
 813                  */
 814                 goal = max_t(unsigned int, goal / 2, rt_hash_mask + 1);
 815                 equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal;
 816         }
 817
 818         if (now - last_gc >= ip_rt_gc_min_interval)
 819                 last_gc = now;
 820
 821         if (goal <= 0) {
 822                 equilibrium += goal;
 823                 goto work_done;
 824         }
 825
 826         do {
 827                 int i, k;
 828
 829                 for (i = rt_hash_mask, k = rover; i >= 0; i--) {
 830                         unsigned long tmo = expire;
 831
 832                         k = (k + 1) & rt_hash_mask;
 833                         rthp = &rt_hash_table[k].chain;
 834                         spin_lock_bh(rt_hash_lock_addr(k));
 835                         while ((rth = *rthp) != NULL) {
 836                                 if (!rt_may_expire(rth, tmo, expire)) {
 837                                         tmo >>= 1;
 838                                         rthp = &rth->u.dst.rt_next;
 839                                         continue;
 840                                 }
 841                                 *rthp = rth->u.dst.rt_next;
 842                                 rt_free(rth);
 843                                 goal--;
 844                         }
 845                         spin_unlock_bh(rt_hash_lock_addr(k));
 846                         if (goal <= 0)
 847                                 break;
 848                 }
 849                 rover = k;
 850
 851                 if (goal <= 0)
 852                         goto work_done;
 853
 854                 /* Goal is not achieved. We stop process if:
 855
 856                    - if expire reduced to zero. Otherwise, expire is halfed.
 857                    - if table is not full.
 858                    - if we are called from interrupt.
 859                    - jiffies check is just fallback/debug loop breaker.
 860                      We will not spin here for long time in any case.
 861                  */
 862
 863                 RT_CACHE_STAT_INC(gc_goal_miss);
 864
 865                 if (expire == 0)
 866                         break;
 867
 868                 expire >>= 1;
 869 #if RT_CACHE_DEBUG >= 2
 870                 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire,
 871                                 atomic_read(&ipv4_dst_ops.entries), goal, i);
 872 #endif
 873
 874                 if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 875                         goto out;
 876         } while (!in_softirq() && time_before_eq(jiffies, now));
 877
 878         if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size)
 879                 goto out;
 880         if (net_ratelimit())
 881                 printk(KERN_WARNING "dst cache overflow\n");
 882         RT_CACHE_STAT_INC(gc_dst_overflow);
 883         return 1;
 884
 885 work_done:
 886         expire += ip_rt_gc_min_interval;
 887         if (expire > ip_rt_gc_timeout ||
 888             atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh)
 889                 expire = ip_rt_gc_timeout;
 890 #if RT_CACHE_DEBUG >= 2
 891         printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire,
 892                         atomic_read(&ipv4_dst_ops.entries), goal, rover);
 893 #endif
 894 out:    return 0;
 895 }
 896
 897 static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp)
 898 {
 899         struct rtable   *rth, **rthp;
 900         unsigned long   now;
 901         struct rtable *cand, **candp;
 902         u32             min_score;
 903         int             chain_length;
 904         int attempts = !in_softirq();
 905
 906 restart:
 907         chain_length = 0;
 908         min_score = ~(u32)0;
 909         cand = NULL;
 910         candp = NULL;
 911         now = jiffies;
 912
 913         rthp = &rt_hash_table[hash].chain;
 914
 915         spin_lock_bh(rt_hash_lock_addr(hash));
 916         while ((rth = *rthp) != NULL) {
 917                 if (compare_keys(&rth->fl, &rt->fl)) {
 918                         /* Put it first */
 919                         *rthp = rth->u.dst.rt_next;
 920                         /*
 921                          * Since lookup is lockfree, the deletion
 922                          * must be visible to another weakly ordered CPU before
 923                          * the insertion at the start of the hash chain.
 924                          */
 925                         rcu_assign_pointer(rth->u.dst.rt_next,
 926                                            rt_hash_table[hash].chain);
 927                         /*
 928                          * Since lookup is lockfree, the update writes
 929                          * must be ordered for consistency on SMP.
 930                          */
 931                         rcu_assign_pointer(rt_hash_table[hash].chain, rth);
 932
 933                         dst_use(&rth->u.dst, now);
 934                         spin_unlock_bh(rt_hash_lock_addr(hash));
 935
 936                         rt_drop(rt);
 937                         *rp = rth;
 938                         return 0;
 939                 }
 940
 941                 if (!atomic_read(&rth->u.dst.__refcnt)) {
 942                         u32 score = rt_score(rth);
 943
 944                         if (score <= min_score) {
 945                                 cand = rth;
 946                                 candp = rthp;
 947                                 min_score = score;
 948                         }
 949                 }
 950
 951                 chain_length++;
 952
 953                 rthp = &rth->u.dst.rt_next;
 954         }
 955
 956         if (cand) {
 957                 /* ip_rt_gc_elasticity used to be average length of chain
 958                  * length, when exceeded gc becomes really aggressive.
 959                  *
 960                  * The second limit is less certain. At the moment it allows
 961                  * only 2 entries per bucket. We will see.
 962                  */
 963                 if (chain_length > ip_rt_gc_elasticity) {
 964                         *candp = cand->u.dst.rt_next;
 965                         rt_free(cand);
 966                 }
 967         }
 968
 969         /* Try to bind route to arp only if it is output
 970            route or unicast forwarding path.
 971          */
 972         if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) {
 973                 int err = arp_bind_neighbour(&rt->u.dst);
 974                 if (err) {
 975                         spin_unlock_bh(rt_hash_lock_addr(hash));
 976
 977                         if (err != -ENOBUFS) {
 978                                 rt_drop(rt);
 979                                 return err;
 980                         }
 981
 982                         /* Neighbour tables are full and nothing
 983                            can be released. Try to shrink route cache,
 984                            it is most likely it holds some neighbour records.
 985                          */
 986                         if (attempts-- > 0) {
 987                                 int saved_elasticity = ip_rt_gc_elasticity;
 988                                 int saved_int = ip_rt_gc_min_interval;
 989                                 ip_rt_gc_elasticity     = 1;
 990                                 ip_rt_gc_min_interval   = 0;
 991                                 rt_garbage_collect();
 992                                 ip_rt_gc_min_interval   = saved_int;
 993                                 ip_rt_gc_elasticity     = saved_elasticity;
 994                                 goto restart;
 995                         }
 996
 997                         if (net_ratelimit())
 998                                 printk(KERN_WARNING "Neighbour table overflow.\n");
 999                         rt_drop(rt);
1000                         return -ENOBUFS;
1001                 }
1002         }
1003
1004         rt->u.dst.rt_next = rt_hash_table[hash].chain;
1005 #if RT_CACHE_DEBUG >= 2
1006         if (rt->u.dst.rt_next) {
1007                 struct rtable *trt;
1008                 printk(KERN_DEBUG "rt_cache @%02x: %u.%u.%u.%u", hash,
1009                        NIPQUAD(rt->rt_dst));
1010                 for (trt = rt->u.dst.rt_next; trt; trt = trt->u.dst.rt_next)
1011                         printk(" . %u.%u.%u.%u", NIPQUAD(trt->rt_dst));
1012                 printk("\n");
1013         }
1014 #endif
1015         rt_hash_table[hash].chain = rt;
1016         spin_unlock_bh(rt_hash_lock_addr(hash));
1017         *rp = rt;
1018         return 0;
1019 }
1020
1021 void rt_bind_peer(struct rtable *rt, int create)
1022 {
1023         static DEFINE_SPINLOCK(rt_peer_lock);
1024         struct inet_peer *peer;
1025
1026         peer = inet_getpeer(rt->rt_dst, create);
1027
1028         spin_lock_bh(&rt_peer_lock);
1029         if (rt->peer == NULL) {
1030                 rt->peer = peer;
1031                 peer = NULL;
1032         }
1033         spin_unlock_bh(&rt_peer_lock);
1034         if (peer)
1035                 inet_putpeer(peer);
1036 }
1037
1038 /*
1039  * Peer allocation may fail only in serious out-of-memory conditions.  However
1040  * we still can generate some output.
1041  * Random ID selection looks a bit dangerous because we have no chances to
1042  * select ID being unique in a reasonable period of time.
1043  * But broken packet identifier may be better than no packet at all.
1044  */
1045 static void ip_select_fb_ident(struct iphdr *iph)
1046 {
1047         static DEFINE_SPINLOCK(ip_fb_id_lock);
1048         static u32 ip_fallback_id;
1049         u32 salt;
1050
1051         spin_lock_bh(&ip_fb_id_lock);
1052         salt = secure_ip_id((__force __be32)ip_fallback_id ^ iph->daddr);
1053         iph->id = htons(salt & 0xFFFF);
1054         ip_fallback_id = salt;
1055         spin_unlock_bh(&ip_fb_id_lock);
1056 }
1057
1058 void __ip_select_ident(struct iphdr *iph, struct dst_entry *dst, int more)
1059 {
1060         struct rtable *rt = (struct rtable *) dst;
1061
1062         if (rt) {
1063                 if (rt->peer == NULL)
1064                         rt_bind_peer(rt, 1);
1065
1066                 /* If peer is attached to destination, it is never detached,
1067                    so that we need not to grab a lock to dereference it.
1068                  */
1069                 if (rt->peer) {
1070                         iph->id = htons(inet_getid(rt->peer, more));
1071                         return;
1072                 }
1073         } else
1074                 printk(KERN_DEBUG "rt_bind_peer(0) @%p\n",
1075                        __builtin_return_address(0));
1076
1077         ip_select_fb_ident(iph);
1078 }
1079
1080 static void rt_del(unsigned hash, struct rtable *rt)
1081 {
1082         struct rtable **rthp;
1083
1084         spin_lock_bh(rt_hash_lock_addr(hash));
1085         ip_rt_put(rt);
1086         for (rthp = &rt_hash_table[hash].chain; *rthp;
1087              rthp = &(*rthp)->u.dst.rt_next)
1088                 if (*rthp == rt) {
1089                         *rthp = rt->u.dst.rt_next;
1090                         rt_free(rt);
1091                         break;
1092                 }
1093         spin_unlock_bh(rt_hash_lock_addr(hash));
1094 }
1095
1096 void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw,
1097                     __be32 saddr, struct net_device *dev)
1098 {
1099         int i, k;
1100         struct in_device *in_dev = in_dev_get(dev);
1101         struct rtable *rth, **rthp;
1102         __be32  skeys[2] = { saddr, 0 };
1103         int  ikeys[2] = { dev->ifindex, 0 };
1104         struct netevent_redirect netevent;
1105
1106         if (!in_dev)
1107                 return;
1108
1109         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev)
1110             || MULTICAST(new_gw) || BADCLASS(new_gw) || ZERONET(new_gw))
1111                 goto reject_redirect;
1112
1113         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
1114                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
1115                         goto reject_redirect;
1116                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
1117                         goto reject_redirect;
1118         } else {
1119                 if (inet_addr_type(new_gw) != RTN_UNICAST)
1120                         goto reject_redirect;
1121         }
1122
1123         for (i = 0; i < 2; i++) {
1124                 for (k = 0; k < 2; k++) {
1125                         unsigned hash = rt_hash(daddr, skeys[i], ikeys[k]);
1126
1127                         rthp=&rt_hash_table[hash].chain;
1128
1129                         rcu_read_lock();
1130                         while ((rth = rcu_dereference(*rthp)) != NULL) {
1131                                 struct rtable *rt;
1132
1133                                 if (rth->fl.fl4_dst != daddr ||
1134                                     rth->fl.fl4_src != skeys[i] ||
1135                                     rth->fl.oif != ikeys[k] ||
1136                                     rth->fl.iif != 0) {
1137                                         rthp = &rth->u.dst.rt_next;
1138                                         continue;
1139                                 }
1140
1141                                 if (rth->rt_dst != daddr ||
1142                                     rth->rt_src != saddr ||
1143                                     rth->u.dst.error ||
1144                                     rth->rt_gateway != old_gw ||
1145                                     rth->u.dst.dev != dev)
1146                                         break;
1147
1148                                 dst_hold(&rth->u.dst);
1149                                 rcu_read_unlock();
1150
1151                                 rt = dst_alloc(&ipv4_dst_ops);
1152                                 if (rt == NULL) {
1153                                         ip_rt_put(rth);
1154                                         in_dev_put(in_dev);
1155                                         return;
1156                                 }
1157
1158                                 /* Copy all the information. */
1159                                 *rt = *rth;
1160                                 INIT_RCU_HEAD(&rt->u.dst.rcu_head);
1161                                 rt->u.dst.__use         = 1;
1162                                 atomic_set(&rt->u.dst.__refcnt, 1);
1163                                 rt->u.dst.child         = NULL;
1164                                 if (rt->u.dst.dev)
1165                                         dev_hold(rt->u.dst.dev);
1166                                 if (rt->idev)
1167                                         in_dev_hold(rt->idev);
1168                                 rt->u.dst.obsolete      = 0;
1169                                 rt->u.dst.lastuse       = jiffies;
1170                                 rt->u.dst.path          = &rt->u.dst;
1171                                 rt->u.dst.neighbour     = NULL;
1172                                 rt->u.dst.hh            = NULL;
1173                                 rt->u.dst.xfrm          = NULL;
1174
1175                                 rt->rt_flags            |= RTCF_REDIRECTED;
1176
1177                                 /* Gateway is different ... */
1178                                 rt->rt_gateway          = new_gw;
1179
1180                                 /* Redirect received -> path was valid */
1181                                 dst_confirm(&rth->u.dst);
1182
1183                                 if (rt->peer)
1184                                         atomic_inc(&rt->peer->refcnt);
1185
1186                                 if (arp_bind_neighbour(&rt->u.dst) ||
1187                                     !(rt->u.dst.neighbour->nud_state &
1188                                             NUD_VALID)) {
1189                                         if (rt->u.dst.neighbour)
1190                                                 neigh_event_send(rt->u.dst.neighbour, NULL);
1191                                         ip_rt_put(rth);
1192                                         rt_drop(rt);
1193                                         goto do_next;
1194                                 }
1195
1196                                 netevent.old = &rth->u.dst;
1197                                 netevent.new = &rt->u.dst;
1198                                 call_netevent_notifiers(NETEVENT_REDIRECT,
1199                                                         &netevent);
1200
1201                                 rt_del(hash, rth);
1202                                 if (!rt_intern_hash(hash, rt, &rt))
1203                                         ip_rt_put(rt);
1204                                 goto do_next;
1205                         }
1206                         rcu_read_unlock();
1207                 do_next:
1208                         ;
1209                 }
1210         }
1211         in_dev_put(in_dev);
1212         return;
1213
1214 reject_redirect:
1215 #ifdef CONFIG_IP_ROUTE_VERBOSE
1216         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1217                 printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
1218                         "%u.%u.%u.%u ignored.\n"
1219                         "  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
1220                        NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
1221                        NIPQUAD(saddr), NIPQUAD(daddr));
1222 #endif
1223         in_dev_put(in_dev);
1224 }
1225
1226 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
1227 {
1228         struct rtable *rt = (struct rtable*)dst;
1229         struct dst_entry *ret = dst;
1230
1231         if (rt) {
1232                 if (dst->obsolete) {
1233                         ip_rt_put(rt);
1234                         ret = NULL;
1235                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
1236                            rt->u.dst.expires) {
1237                         unsigned hash = rt_hash(rt->fl.fl4_dst, rt->fl.fl4_src,
1238                                                 rt->fl.oif);
1239 #if RT_CACHE_DEBUG >= 1
1240                         printk(KERN_DEBUG "ipv4_negative_advice: redirect to "
1241                                           "%u.%u.%u.%u/%02x dropped\n",
1242                                 NIPQUAD(rt->rt_dst), rt->fl.fl4_tos);
1243 #endif
1244                         rt_del(hash, rt);
1245                         ret = NULL;
1246                 }
1247         }
1248         return ret;
1249 }
1250
1251 /*
1252  * Algorithm:
1253  *      1. The first ip_rt_redirect_number redirects are sent
1254  *         with exponential backoff, then we stop sending them at all,
1255  *         assuming that the host ignores our redirects.
1256  *      2. If we did not see packets requiring redirects
1257  *         during ip_rt_redirect_silence, we assume that the host
1258  *         forgot redirected route and start to send redirects again.
1259  *
1260  * This algorithm is much cheaper and more intelligent than dumb load limiting
1261  * in icmp.c.
1262  *
1263  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
1264  * and "frag. need" (breaks PMTU discovery) in icmp.c.
1265  */
1266
1267 void ip_rt_send_redirect(struct sk_buff *skb)
1268 {
1269         struct rtable *rt = (struct rtable*)skb->dst;
1270         struct in_device *in_dev = in_dev_get(rt->u.dst.dev);
1271
1272         if (!in_dev)
1273                 return;
1274
1275         if (!IN_DEV_TX_REDIRECTS(in_dev))
1276                 goto out;
1277
1278         /* No redirected packets during ip_rt_redirect_silence;
1279          * reset the algorithm.
1280          */
1281         if (time_after(jiffies, rt->u.dst.rate_last + ip_rt_redirect_silence))
1282                 rt->u.dst.rate_tokens = 0;
1283
1284         /* Too many ignored redirects; do not send anything
1285          * set u.dst.rate_last to the last seen redirected packet.
1286          */
1287         if (rt->u.dst.rate_tokens >= ip_rt_redirect_number) {
1288                 rt->u.dst.rate_last = jiffies;
1289                 goto out;
1290         }
1291
1292         /* Check for load limit; set rate_last to the latest sent
1293          * redirect.
1294          */
1295         if (rt->u.dst.rate_tokens == 0 ||
1296             time_after(jiffies,
1297                        (rt->u.dst.rate_last +
1298                         (ip_rt_redirect_load << rt->u.dst.rate_tokens)))) {
1299                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, rt->rt_gateway);
1300                 rt->u.dst.rate_last = jiffies;
1301                 ++rt->u.dst.rate_tokens;
1302 #ifdef CONFIG_IP_ROUTE_VERBOSE
1303                 if (IN_DEV_LOG_MARTIANS(in_dev) &&
1304                     rt->u.dst.rate_tokens == ip_rt_redirect_number &&
1305                     net_ratelimit())
1306                         printk(KERN_WARNING "host %u.%u.%u.%u/if%d ignores "
1307                                 "redirects for %u.%u.%u.%u to %u.%u.%u.%u.\n",
1308                                 NIPQUAD(rt->rt_src), rt->rt_iif,
1309                                 NIPQUAD(rt->rt_dst), NIPQUAD(rt->rt_gateway));
1310 #endif
1311         }
1312 out:
1313         in_dev_put(in_dev);
1314 }
1315
1316 static int ip_error(struct sk_buff *skb)
1317 {
1318         struct rtable *rt = (struct rtable*)skb->dst;
1319         unsigned long now;
1320         int code;
1321
1322         switch (rt->u.dst.error) {
1323                 case EINVAL:
1324                 default:
1325                         goto out;
1326                 case EHOSTUNREACH:
1327                         code = ICMP_HOST_UNREACH;
1328                         break;
1329                 case ENETUNREACH:
1330                         code = ICMP_NET_UNREACH;
1331                         IP_INC_STATS_BH(IPSTATS_MIB_INNOROUTES);
1332                         break;
1333                 case EACCES:
1334                         code = ICMP_PKT_FILTERED;
1335                         break;
1336         }
1337
1338         now = jiffies;
1339         rt->u.dst.rate_tokens += now - rt->u.dst.rate_last;
1340         if (rt->u.dst.rate_tokens > ip_rt_error_burst)
1341                 rt->u.dst.rate_tokens = ip_rt_error_burst;
1342         rt->u.dst.rate_last = now;
1343         if (rt->u.dst.rate_tokens >= ip_rt_error_cost) {
1344                 rt->u.dst.rate_tokens -= ip_rt_error_cost;
1345                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1346         }
1347
1348 out:    kfree_skb(skb);
1349         return 0;
1350 }
1351
1352 /*
1353  *      The last two values are not from the RFC but
1354  *      are needed for AMPRnet AX.25 paths.
1355  */
1356
1357 static const unsigned short mtu_plateau[] =
1358 {32000, 17914, 8166, 4352, 2002, 1492, 576, 296, 216, 128 };
1359
1360 static __inline__ unsigned short guess_mtu(unsigned short old_mtu)
1361 {
1362         int i;
1363
1364         for (i = 0; i < ARRAY_SIZE(mtu_plateau); i++)
1365                 if (old_mtu > mtu_plateau[i])
1366                         return mtu_plateau[i];
1367         return 68;
1368 }
1369
1370 unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu)
1371 {
1372         int i;
1373         unsigned short old_mtu = ntohs(iph->tot_len);
1374         struct rtable *rth;
1375         __be32  skeys[2] = { iph->saddr, 0, };
1376         __be32  daddr = iph->daddr;
1377         unsigned short est_mtu = 0;
1378
1379         if (ipv4_config.no_pmtu_disc)
1380                 return 0;
1381
1382         for (i = 0; i < 2; i++) {
1383                 unsigned hash = rt_hash(daddr, skeys[i], 0);
1384
1385                 rcu_read_lock();
1386                 for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
1387                      rth = rcu_dereference(rth->u.dst.rt_next)) {
1388                         if (rth->fl.fl4_dst == daddr &&
1389                             rth->fl.fl4_src == skeys[i] &&
1390                             rth->rt_dst  == daddr &&
1391                             rth->rt_src  == iph->saddr &&
1392                             rth->fl.iif == 0 &&
1393                             !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
1394                                 unsigned short mtu = new_mtu;
1395
1396                                 if (new_mtu < 68 || new_mtu >= old_mtu) {
1397
1398                                         /* BSD 4.2 compatibility hack :-( */
1399                                         if (mtu == 0 &&
1400                                             old_mtu >= rth->u.dst.metrics[RTAX_MTU-1] &&
1401                                             old_mtu >= 68 + (iph->ihl << 2))
1402                                                 old_mtu -= iph->ihl << 2;
1403
1404                                         mtu = guess_mtu(old_mtu);
1405                                 }
1406                                 if (mtu <= rth->u.dst.metrics[RTAX_MTU-1]) {
1407                                         if (mtu < rth->u.dst.metrics[RTAX_MTU-1]) {
1408                                                 dst_confirm(&rth->u.dst);
1409                                                 if (mtu < ip_rt_min_pmtu) {
1410                                                         mtu = ip_rt_min_pmtu;
1411                                                         rth->u.dst.metrics[RTAX_LOCK-1] |=
1412                                                                 (1 << RTAX_MTU);
1413                                                 }
1414                                                 rth->u.dst.metrics[RTAX_MTU-1] = mtu;
1415                                                 dst_set_expires(&rth->u.dst,
1416                                                         ip_rt_mtu_expires);
1417                                         }
1418                                         est_mtu = mtu;
1419                                 }
1420                         }
1421                 }
1422                 rcu_read_unlock();
1423         }
1424         return est_mtu ? : new_mtu;
1425 }
1426
1427 static void ip_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
1428 {
1429         if (dst->metrics[RTAX_MTU-1] > mtu && mtu >= 68 &&
1430             !(dst_metric_locked(dst, RTAX_MTU))) {
1431                 if (mtu < ip_rt_min_pmtu) {
1432                         mtu = ip_rt_min_pmtu;
1433                         dst->metrics[RTAX_LOCK-1] |= (1 << RTAX_MTU);
1434                 }
1435                 dst->metrics[RTAX_MTU-1] = mtu;
1436                 dst_set_expires(dst, ip_rt_mtu_expires);
1437                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
1438         }
1439 }
1440
1441 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1442 {
1443         return NULL;
1444 }
1445
1446 static void ipv4_dst_destroy(struct dst_entry *dst)
1447 {
1448         struct rtable *rt = (struct rtable *) dst;
1449         struct inet_peer *peer = rt->peer;
1450         struct in_device *idev = rt->idev;
1451
1452         if (peer) {
1453                 rt->peer = NULL;
1454                 inet_putpeer(peer);
1455         }
1456
1457         if (idev) {
1458                 rt->idev = NULL;
1459                 in_dev_put(idev);
1460         }
1461 }
1462
1463 static void ipv4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
1464                             int how)
1465 {
1466         struct rtable *rt = (struct rtable *) dst;
1467         struct in_device *idev = rt->idev;
1468         if (dev != init_net.loopback_dev && idev && idev->dev == dev) {
1469                 struct in_device *loopback_idev = in_dev_get(init_net.loopback_dev);
1470                 if (loopback_idev) {
1471                         rt->idev = loopback_idev;
1472                         in_dev_put(idev);
1473                 }
1474         }
1475 }
1476
1477 static void ipv4_link_failure(struct sk_buff *skb)
1478 {
1479         struct rtable *rt;
1480
1481         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1482
1483         rt = (struct rtable *) skb->dst;
1484         if (rt)
1485                 dst_set_expires(&rt->u.dst, 0);
1486 }
1487
1488 static int ip_rt_bug(struct sk_buff *skb)
1489 {
1490         printk(KERN_DEBUG "ip_rt_bug: %u.%u.%u.%u -> %u.%u.%u.%u, %s\n",
1491                 NIPQUAD(ip_hdr(skb)->saddr), NIPQUAD(ip_hdr(skb)->daddr),
1492                 skb->dev ? skb->dev->name : "?");
1493         kfree_skb(skb);
1494         return 0;
1495 }
1496
1497 /*
1498    We do not cache source address of outgoing interface,
1499    because it is used only by IP RR, TS and SRR options,
1500    so that it out of fast path.
1501
1502    BTW remember: "addr" is allowed to be not aligned
1503    in IP options!
1504  */
1505
1506 void ip_rt_get_source(u8 *addr, struct rtable *rt)
1507 {
1508         __be32 src;
1509         struct fib_result res;
1510
1511         if (rt->fl.iif == 0)
1512                 src = rt->rt_src;
1513         else if (fib_lookup(&rt->fl, &res) == 0) {
1514                 src = FIB_RES_PREFSRC(res);
1515                 fib_res_put(&res);
1516         } else
1517                 src = inet_select_addr(rt->u.dst.dev, rt->rt_gateway,
1518                                         RT_SCOPE_UNIVERSE);
1519         memcpy(addr, &src, 4);
1520 }
1521
1522 #ifdef CONFIG_NET_CLS_ROUTE
1523 static void set_class_tag(struct rtable *rt, u32 tag)
1524 {
1525         if (!(rt->u.dst.tclassid & 0xFFFF))
1526                 rt->u.dst.tclassid |= tag & 0xFFFF;
1527         if (!(rt->u.dst.tclassid & 0xFFFF0000))
1528                 rt->u.dst.tclassid |= tag & 0xFFFF0000;
1529 }
1530 #endif
1531
1532 static void rt_set_nexthop(struct rtable *rt, struct fib_result *res, u32 itag)
1533 {
1534         struct fib_info *fi = res->fi;
1535
1536         if (fi) {
1537                 if (FIB_RES_GW(*res) &&
1538                     FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK)
1539                         rt->rt_gateway = FIB_RES_GW(*res);
1540                 memcpy(rt->u.dst.metrics, fi->fib_metrics,
1541                        sizeof(rt->u.dst.metrics));
1542                 if (fi->fib_mtu == 0) {
1543                         rt->u.dst.metrics[RTAX_MTU-1] = rt->u.dst.dev->mtu;
1544                         if (rt->u.dst.metrics[RTAX_LOCK-1] & (1 << RTAX_MTU) &&
1545                             rt->rt_gateway != rt->rt_dst &&
1546                             rt->u.dst.dev->mtu > 576)
1547                                 rt->u.dst.metrics[RTAX_MTU-1] = 576;
1548                 }
1549 #ifdef CONFIG_NET_CLS_ROUTE
1550                 rt->u.dst.tclassid = FIB_RES_NH(*res).nh_tclassid;
1551 #endif
1552         } else
1553                 rt->u.dst.metrics[RTAX_MTU-1]= rt->u.dst.dev->mtu;
1554
1555         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1556                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = sysctl_ip_default_ttl;
1557         if (rt->u.dst.metrics[RTAX_MTU-1] > IP_MAX_MTU)
1558                 rt->u.dst.metrics[RTAX_MTU-1] = IP_MAX_MTU;
1559         if (rt->u.dst.metrics[RTAX_ADVMSS-1] == 0)
1560                 rt->u.dst.metrics[RTAX_ADVMSS-1] = max_t(unsigned int, rt->u.dst.dev->mtu - 40,
1561                                        ip_rt_min_advmss);
1562         if (rt->u.dst.metrics[RTAX_ADVMSS-1] > 65535 - 40)
1563                 rt->u.dst.metrics[RTAX_ADVMSS-1] = 65535 - 40;
1564
1565 #ifdef CONFIG_NET_CLS_ROUTE
1566 #ifdef CONFIG_IP_MULTIPLE_TABLES
1567         set_class_tag(rt, fib_rules_tclass(res));
1568 #endif
1569         set_class_tag(rt, itag);
1570 #endif
1571         rt->rt_type = res->type;
1572 }
1573
1574 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1575                                 u8 tos, struct net_device *dev, int our)
1576 {
1577         unsigned hash;
1578         struct rtable *rth;
1579         __be32 spec_dst;
1580         struct in_device *in_dev = in_dev_get(dev);
1581         u32 itag = 0;
1582
1583         /* Primary sanity checks. */
1584
1585         if (in_dev == NULL)
1586                 return -EINVAL;
1587
1588         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr) ||
1589             skb->protocol != htons(ETH_P_IP))
1590                 goto e_inval;
1591
1592         if (ZERONET(saddr)) {
1593                 if (!LOCAL_MCAST(daddr))
1594                         goto e_inval;
1595                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1596         } else if (fib_validate_source(saddr, 0, tos, 0,
1597                                         dev, &spec_dst, &itag) < 0)
1598                 goto e_inval;
1599
1600         rth = dst_alloc(&ipv4_dst_ops);
1601         if (!rth)
1602                 goto e_nobufs;
1603
1604         rth->u.dst.output= ip_rt_bug;
1605
1606         atomic_set(&rth->u.dst.__refcnt, 1);
1607         rth->u.dst.flags= DST_HOST;
1608         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1609                 rth->u.dst.flags |= DST_NOPOLICY;
1610         rth->fl.fl4_dst = daddr;
1611         rth->rt_dst     = daddr;
1612         rth->fl.fl4_tos = tos;
1613         rth->fl.mark    = skb->mark;
1614         rth->fl.fl4_src = saddr;
1615         rth->rt_src     = saddr;
1616 #ifdef CONFIG_NET_CLS_ROUTE
1617         rth->u.dst.tclassid = itag;
1618 #endif
1619         rth->rt_iif     =
1620         rth->fl.iif     = dev->ifindex;
1621         rth->u.dst.dev  = init_net.loopback_dev;
1622         dev_hold(rth->u.dst.dev);
1623         rth->idev       = in_dev_get(rth->u.dst.dev);
1624         rth->fl.oif     = 0;
1625         rth->rt_gateway = daddr;
1626         rth->rt_spec_dst= spec_dst;
1627         rth->rt_type    = RTN_MULTICAST;
1628         rth->rt_flags   = RTCF_MULTICAST;
1629         if (our) {
1630                 rth->u.dst.input= ip_local_deliver;
1631                 rth->rt_flags |= RTCF_LOCAL;
1632         }
1633
1634 #ifdef CONFIG_IP_MROUTE
1635         if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
1636                 rth->u.dst.input = ip_mr_input;
1637 #endif
1638         RT_CACHE_STAT_INC(in_slow_mc);
1639
1640         in_dev_put(in_dev);
1641         hash = rt_hash(daddr, saddr, dev->ifindex);
1642         return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
1643
1644 e_nobufs:
1645         in_dev_put(in_dev);
1646         return -ENOBUFS;
1647
1648 e_inval:
1649         in_dev_put(in_dev);
1650         return -EINVAL;
1651 }
1652
1653
1654 static void ip_handle_martian_source(struct net_device *dev,
1655                                      struct in_device *in_dev,
1656                                      struct sk_buff *skb,
1657                                      __be32 daddr,
1658                                      __be32 saddr)
1659 {
1660         RT_CACHE_STAT_INC(in_martian_src);
1661 #ifdef CONFIG_IP_ROUTE_VERBOSE
1662         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1663                 /*
1664                  *      RFC1812 recommendation, if source is martian,
1665                  *      the only hint is MAC header.
1666                  */
1667                 printk(KERN_WARNING "martian source %u.%u.%u.%u from "
1668                         "%u.%u.%u.%u, on dev %s\n",
1669                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1670                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1671                         int i;
1672                         const unsigned char *p = skb_mac_header(skb);
1673                         printk(KERN_WARNING "ll header: ");
1674                         for (i = 0; i < dev->hard_header_len; i++, p++) {
1675                                 printk("%02x", *p);
1676                                 if (i < (dev->hard_header_len - 1))
1677                                         printk(":");
1678                         }
1679                         printk("\n");
1680                 }
1681         }
1682 #endif
1683 }
1684
1685 static inline int __mkroute_input(struct sk_buff *skb,
1686                                   struct fib_result* res,
1687                                   struct in_device *in_dev,
1688                                   __be32 daddr, __be32 saddr, u32 tos,
1689                                   struct rtable **result)
1690 {
1691
1692         struct rtable *rth;
1693         int err;
1694         struct in_device *out_dev;
1695         unsigned flags = 0;
1696         __be32 spec_dst;
1697         u32 itag;
1698
1699         /* get a working reference to the output device */
1700         out_dev = in_dev_get(FIB_RES_DEV(*res));
1701         if (out_dev == NULL) {
1702                 if (net_ratelimit())
1703                         printk(KERN_CRIT "Bug in ip_route_input" \
1704                                "_slow(). Please, report\n");
1705                 return -EINVAL;
1706         }
1707
1708
1709         err = fib_validate_source(saddr, daddr, tos, FIB_RES_OIF(*res),
1710                                   in_dev->dev, &spec_dst, &itag);
1711         if (err < 0) {
1712                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1713                                          saddr);
1714
1715                 err = -EINVAL;
1716                 goto cleanup;
1717         }
1718
1719         if (err)
1720                 flags |= RTCF_DIRECTSRC;
1721
1722         if (out_dev == in_dev && err && !(flags & (RTCF_NAT | RTCF_MASQ)) &&
1723             (IN_DEV_SHARED_MEDIA(out_dev) ||
1724              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1725                 flags |= RTCF_DOREDIRECT;
1726
1727         if (skb->protocol != htons(ETH_P_IP)) {
1728                 /* Not IP (i.e. ARP). Do not create route, if it is
1729                  * invalid for proxy arp. DNAT routes are always valid.
1730                  */
1731                 if (out_dev == in_dev && !(flags & RTCF_DNAT)) {
1732                         err = -EINVAL;
1733                         goto cleanup;
1734                 }
1735         }
1736
1737
1738         rth = dst_alloc(&ipv4_dst_ops);
1739         if (!rth) {
1740                 err = -ENOBUFS;
1741                 goto cleanup;
1742         }
1743
1744         atomic_set(&rth->u.dst.__refcnt, 1);
1745         rth->u.dst.flags= DST_HOST;
1746         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1747                 rth->u.dst.flags |= DST_NOPOLICY;
1748         if (IN_DEV_CONF_GET(out_dev, NOXFRM))
1749                 rth->u.dst.flags |= DST_NOXFRM;
1750         rth->fl.fl4_dst = daddr;
1751         rth->rt_dst     = daddr;
1752         rth->fl.fl4_tos = tos;
1753         rth->fl.mark    = skb->mark;
1754         rth->fl.fl4_src = saddr;
1755         rth->rt_src     = saddr;
1756         rth->rt_gateway = daddr;
1757         rth->rt_iif     =
1758                 rth->fl.iif     = in_dev->dev->ifindex;
1759         rth->u.dst.dev  = (out_dev)->dev;
1760         dev_hold(rth->u.dst.dev);
1761         rth->idev       = in_dev_get(rth->u.dst.dev);
1762         rth->fl.oif     = 0;
1763         rth->rt_spec_dst= spec_dst;
1764
1765         rth->u.dst.input = ip_forward;
1766         rth->u.dst.output = ip_output;
1767
1768         rt_set_nexthop(rth, res, itag);
1769
1770         rth->rt_flags = flags;
1771
1772         *result = rth;
1773         err = 0;
1774  cleanup:
1775         /* release the working reference to the output device */
1776         in_dev_put(out_dev);
1777         return err;
1778 }
1779
1780 static inline int ip_mkroute_input(struct sk_buff *skb,
1781                                    struct fib_result* res,
1782                                    const struct flowi *fl,
1783                                    struct in_device *in_dev,
1784                                    __be32 daddr, __be32 saddr, u32 tos)
1785 {
1786         struct rtable* rth = NULL;
1787         int err;
1788         unsigned hash;
1789
1790 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1791         if (res->fi && res->fi->fib_nhs > 1 && fl->oif == 0)
1792                 fib_select_multipath(fl, res);
1793 #endif
1794
1795         /* create a routing cache entry */
1796         err = __mkroute_input(skb, res, in_dev, daddr, saddr, tos, &rth);
1797         if (err)
1798                 return err;
1799
1800         /* put it into the cache */
1801         hash = rt_hash(daddr, saddr, fl->iif);
1802         return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1803 }
1804
1805 /*
1806  *      NOTE. We drop all the packets that has local source
1807  *      addresses, because every properly looped back packet
1808  *      must have correct destination already attached by output routine.
1809  *
1810  *      Such approach solves two big problems:
1811  *      1. Not simplex devices are handled properly.
1812  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1813  */
1814
1815 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1816                                u8 tos, struct net_device *dev)
1817 {
1818         struct fib_result res;
1819         struct in_device *in_dev = in_dev_get(dev);
1820         struct flowi fl = { .nl_u = { .ip4_u =
1821                                       { .daddr = daddr,
1822                                         .saddr = saddr,
1823                                         .tos = tos,
1824                                         .scope = RT_SCOPE_UNIVERSE,
1825                                       } },
1826                             .mark = skb->mark,
1827                             .iif = dev->ifindex };
1828         unsigned        flags = 0;
1829         u32             itag = 0;
1830         struct rtable * rth;
1831         unsigned        hash;
1832         __be32          spec_dst;
1833         int             err = -EINVAL;
1834         int             free_res = 0;
1835
1836         /* IP on this device is disabled. */
1837
1838         if (!in_dev)
1839                 goto out;
1840
1841         /* Check for the most weird martians, which can be not detected
1842            by fib_lookup.
1843          */
1844
1845         if (MULTICAST(saddr) || BADCLASS(saddr) || LOOPBACK(saddr))
1846                 goto martian_source;
1847
1848         if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0))
1849                 goto brd_input;
1850
1851         /* Accept zero addresses only to limited broadcast;
1852          * I even do not know to fix it or not. Waiting for complains :-)
1853          */
1854         if (ZERONET(saddr))
1855                 goto martian_source;
1856
1857         if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr))
1858                 goto martian_destination;
1859
1860         /*
1861          *      Now we are ready to route packet.
1862          */
1863         if ((err = fib_lookup(&fl, &res)) != 0) {
1864                 if (!IN_DEV_FORWARD(in_dev))
1865                         goto e_hostunreach;
1866                 goto no_route;
1867         }
1868         free_res = 1;
1869
1870         RT_CACHE_STAT_INC(in_slow_tot);
1871
1872         if (res.type == RTN_BROADCAST)
1873                 goto brd_input;
1874
1875         if (res.type == RTN_LOCAL) {
1876                 int result;
1877                 result = fib_validate_source(saddr, daddr, tos,
1878                                              init_net.loopback_dev->ifindex,
1879                                              dev, &spec_dst, &itag);
1880                 if (result < 0)
1881                         goto martian_source;
1882                 if (result)
1883                         flags |= RTCF_DIRECTSRC;
1884                 spec_dst = daddr;
1885                 goto local_input;
1886         }
1887
1888         if (!IN_DEV_FORWARD(in_dev))
1889                 goto e_hostunreach;
1890         if (res.type != RTN_UNICAST)
1891                 goto martian_destination;
1892
1893         err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos);
1894 done:
1895         in_dev_put(in_dev);
1896         if (free_res)
1897                 fib_res_put(&res);
1898 out:    return err;
1899
1900 brd_input:
1901         if (skb->protocol != htons(ETH_P_IP))
1902                 goto e_inval;
1903
1904         if (ZERONET(saddr))
1905                 spec_dst = inet_select_addr(dev, 0, RT_SCOPE_LINK);
1906         else {
1907                 err = fib_validate_source(saddr, 0, tos, 0, dev, &spec_dst,
1908                                           &itag);
1909                 if (err < 0)
1910                         goto martian_source;
1911                 if (err)
1912                         flags |= RTCF_DIRECTSRC;
1913         }
1914         flags |= RTCF_BROADCAST;
1915         res.type = RTN_BROADCAST;
1916         RT_CACHE_STAT_INC(in_brd);
1917
1918 local_input:
1919         rth = dst_alloc(&ipv4_dst_ops);
1920         if (!rth)
1921                 goto e_nobufs;
1922
1923         rth->u.dst.output= ip_rt_bug;
1924
1925         atomic_set(&rth->u.dst.__refcnt, 1);
1926         rth->u.dst.flags= DST_HOST;
1927         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
1928                 rth->u.dst.flags |= DST_NOPOLICY;
1929         rth->fl.fl4_dst = daddr;
1930         rth->rt_dst     = daddr;
1931         rth->fl.fl4_tos = tos;
1932         rth->fl.mark    = skb->mark;
1933         rth->fl.fl4_src = saddr;
1934         rth->rt_src     = saddr;
1935 #ifdef CONFIG_NET_CLS_ROUTE
1936         rth->u.dst.tclassid = itag;
1937 #endif
1938         rth->rt_iif     =
1939         rth->fl.iif     = dev->ifindex;
1940         rth->u.dst.dev  = init_net.loopback_dev;
1941         dev_hold(rth->u.dst.dev);
1942         rth->idev       = in_dev_get(rth->u.dst.dev);
1943         rth->rt_gateway = daddr;
1944         rth->rt_spec_dst= spec_dst;
1945         rth->u.dst.input= ip_local_deliver;
1946         rth->rt_flags   = flags|RTCF_LOCAL;
1947         if (res.type == RTN_UNREACHABLE) {
1948                 rth->u.dst.input= ip_error;
1949                 rth->u.dst.error= -err;
1950                 rth->rt_flags   &= ~RTCF_LOCAL;
1951         }
1952         rth->rt_type    = res.type;
1953         hash = rt_hash(daddr, saddr, fl.iif);
1954         err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
1955         goto done;
1956
1957 no_route:
1958         RT_CACHE_STAT_INC(in_no_route);
1959         spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
1960         res.type = RTN_UNREACHABLE;
1961         if (err == -ESRCH)
1962                 err = -ENETUNREACH;
1963         goto local_input;
1964
1965         /*
1966          *      Do not cache martian addresses: they should be logged (RFC1812)
1967          */
1968 martian_destination:
1969         RT_CACHE_STAT_INC(in_martian_dst);
1970 #ifdef CONFIG_IP_ROUTE_VERBOSE
1971         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
1972                 printk(KERN_WARNING "martian destination %u.%u.%u.%u from "
1973                         "%u.%u.%u.%u, dev %s\n",
1974                         NIPQUAD(daddr), NIPQUAD(saddr), dev->name);
1975 #endif
1976
1977 e_hostunreach:
1978         err = -EHOSTUNREACH;
1979         goto done;
1980
1981 e_inval:
1982         err = -EINVAL;
1983         goto done;
1984
1985 e_nobufs:
1986         err = -ENOBUFS;
1987         goto done;
1988
1989 martian_source:
1990         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
1991         goto e_inval;
1992 }
1993
1994 int ip_route_input(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1995                    u8 tos, struct net_device *dev)
1996 {
1997         struct rtable * rth;
1998         unsigned        hash;
1999         int iif = dev->ifindex;
2000
2001         tos &= IPTOS_RT_MASK;
2002         hash = rt_hash(daddr, saddr, iif);
2003
2004         rcu_read_lock();
2005         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2006              rth = rcu_dereference(rth->u.dst.rt_next)) {
2007                 if (rth->fl.fl4_dst == daddr &&
2008                     rth->fl.fl4_src == saddr &&
2009                     rth->fl.iif == iif &&
2010                     rth->fl.oif == 0 &&
2011                     rth->fl.mark == skb->mark &&
2012                     rth->fl.fl4_tos == tos) {
2013                         dst_use(&rth->u.dst, jiffies);
2014                         RT_CACHE_STAT_INC(in_hit);
2015                         rcu_read_unlock();
2016                         skb->dst = (struct dst_entry*)rth;
2017                         return 0;
2018                 }
2019                 RT_CACHE_STAT_INC(in_hlist_search);
2020         }
2021         rcu_read_unlock();
2022
2023         /* Multicast recognition logic is moved from route cache to here.
2024            The problem was that too many Ethernet cards have broken/missing
2025            hardware multicast filters :-( As result the host on multicasting
2026            network acquires a lot of useless route cache entries, sort of
2027            SDR messages from all the world. Now we try to get rid of them.
2028            Really, provided software IP multicast filter is organized
2029            reasonably (at least, hashed), it does not result in a slowdown
2030            comparing with route cache reject entries.
2031            Note, that multicast routers are not affected, because
2032            route cache entry is created eventually.
2033          */
2034         if (MULTICAST(daddr)) {
2035                 struct in_device *in_dev;
2036
2037                 rcu_read_lock();
2038                 if ((in_dev = __in_dev_get_rcu(dev)) != NULL) {
2039                         int our = ip_check_mc(in_dev, daddr, saddr,
2040                                 ip_hdr(skb)->protocol);
2041                         if (our
2042 #ifdef CONFIG_IP_MROUTE
2043                             || (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
2044 #endif
2045                             ) {
2046                                 rcu_read_unlock();
2047                                 return ip_route_input_mc(skb, daddr, saddr,
2048                                                          tos, dev, our);
2049                         }
2050                 }
2051                 rcu_read_unlock();
2052                 return -EINVAL;
2053         }
2054         return ip_route_input_slow(skb, daddr, saddr, tos, dev);
2055 }
2056
2057 static inline int __mkroute_output(struct rtable **result,
2058                                    struct fib_result* res,
2059                                    const struct flowi *fl,
2060                                    const struct flowi *oldflp,
2061                                    struct net_device *dev_out,
2062                                    unsigned flags)
2063 {
2064         struct rtable *rth;
2065         struct in_device *in_dev;
2066         u32 tos = RT_FL_TOS(oldflp);
2067         int err = 0;
2068
2069         if (LOOPBACK(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK))
2070                 return -EINVAL;
2071
2072         if (fl->fl4_dst == htonl(0xFFFFFFFF))
2073                 res->type = RTN_BROADCAST;
2074         else if (MULTICAST(fl->fl4_dst))
2075                 res->type = RTN_MULTICAST;
2076         else if (BADCLASS(fl->fl4_dst) || ZERONET(fl->fl4_dst))
2077                 return -EINVAL;
2078
2079         if (dev_out->flags & IFF_LOOPBACK)
2080                 flags |= RTCF_LOCAL;
2081
2082         /* get work reference to inet device */
2083         in_dev = in_dev_get(dev_out);
2084         if (!in_dev)
2085                 return -EINVAL;
2086
2087         if (res->type == RTN_BROADCAST) {
2088                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2089                 if (res->fi) {
2090                         fib_info_put(res->fi);
2091                         res->fi = NULL;
2092                 }
2093         } else if (res->type == RTN_MULTICAST) {
2094                 flags |= RTCF_MULTICAST|RTCF_LOCAL;
2095                 if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src,
2096                                  oldflp->proto))
2097                         flags &= ~RTCF_LOCAL;
2098                 /* If multicast route do not exist use
2099                    default one, but do not gateway in this case.
2100                    Yes, it is hack.
2101                  */
2102                 if (res->fi && res->prefixlen < 4) {
2103                         fib_info_put(res->fi);
2104                         res->fi = NULL;
2105                 }
2106         }
2107
2108
2109         rth = dst_alloc(&ipv4_dst_ops);
2110         if (!rth) {
2111                 err = -ENOBUFS;
2112                 goto cleanup;
2113         }
2114
2115         atomic_set(&rth->u.dst.__refcnt, 1);
2116         rth->u.dst.flags= DST_HOST;
2117         if (IN_DEV_CONF_GET(in_dev, NOXFRM))
2118                 rth->u.dst.flags |= DST_NOXFRM;
2119         if (IN_DEV_CONF_GET(in_dev, NOPOLICY))
2120                 rth->u.dst.flags |= DST_NOPOLICY;
2121
2122         rth->fl.fl4_dst = oldflp->fl4_dst;
2123         rth->fl.fl4_tos = tos;
2124         rth->fl.fl4_src = oldflp->fl4_src;
2125         rth->fl.oif     = oldflp->oif;
2126         rth->fl.mark    = oldflp->mark;
2127         rth->rt_dst     = fl->fl4_dst;
2128         rth->rt_src     = fl->fl4_src;
2129         rth->rt_iif     = oldflp->oif ? : dev_out->ifindex;
2130         /* get references to the devices that are to be hold by the routing
2131            cache entry */
2132         rth->u.dst.dev  = dev_out;
2133         dev_hold(dev_out);
2134         rth->idev       = in_dev_get(dev_out);
2135         rth->rt_gateway = fl->fl4_dst;
2136         rth->rt_spec_dst= fl->fl4_src;
2137
2138         rth->u.dst.output=ip_output;
2139
2140         RT_CACHE_STAT_INC(out_slow_tot);
2141
2142         if (flags & RTCF_LOCAL) {
2143                 rth->u.dst.input = ip_local_deliver;
2144                 rth->rt_spec_dst = fl->fl4_dst;
2145         }
2146         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2147                 rth->rt_spec_dst = fl->fl4_src;
2148                 if (flags & RTCF_LOCAL &&
2149                     !(dev_out->flags & IFF_LOOPBACK)) {
2150                         rth->u.dst.output = ip_mc_output;
2151                         RT_CACHE_STAT_INC(out_slow_mc);
2152                 }
2153 #ifdef CONFIG_IP_MROUTE
2154                 if (res->type == RTN_MULTICAST) {
2155                         if (IN_DEV_MFORWARD(in_dev) &&
2156                             !LOCAL_MCAST(oldflp->fl4_dst)) {
2157                                 rth->u.dst.input = ip_mr_input;
2158                                 rth->u.dst.output = ip_mc_output;
2159                         }
2160                 }
2161 #endif
2162         }
2163
2164         rt_set_nexthop(rth, res, 0);
2165
2166         rth->rt_flags = flags;
2167
2168         *result = rth;
2169  cleanup:
2170         /* release work reference to inet device */
2171         in_dev_put(in_dev);
2172
2173         return err;
2174 }
2175
2176 static inline int ip_mkroute_output(struct rtable **rp,
2177                                     struct fib_result* res,
2178                                     const struct flowi *fl,
2179                                     const struct flowi *oldflp,
2180                                     struct net_device *dev_out,
2181                                     unsigned flags)
2182 {
2183         struct rtable *rth = NULL;
2184         int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
2185         unsigned hash;
2186         if (err == 0) {
2187                 hash = rt_hash(oldflp->fl4_dst, oldflp->fl4_src, oldflp->oif);
2188                 err = rt_intern_hash(hash, rth, rp);
2189         }
2190
2191         return err;
2192 }
2193
2194 /*
2195  * Major route resolver routine.
2196  */
2197
2198 static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp)
2199 {
2200         u32 tos = RT_FL_TOS(oldflp);
2201         struct flowi fl = { .nl_u = { .ip4_u =
2202                                       { .daddr = oldflp->fl4_dst,
2203                                         .saddr = oldflp->fl4_src,
2204                                         .tos = tos & IPTOS_RT_MASK,
2205                                         .scope = ((tos & RTO_ONLINK) ?
2206                                                   RT_SCOPE_LINK :
2207                                                   RT_SCOPE_UNIVERSE),
2208                                       } },
2209                             .mark = oldflp->mark,
2210                             .iif = init_net.loopback_dev->ifindex,
2211                             .oif = oldflp->oif };
2212         struct fib_result res;
2213         unsigned flags = 0;
2214         struct net_device *dev_out = NULL;
2215         int free_res = 0;
2216         int err;
2217
2218
2219         res.fi          = NULL;
2220 #ifdef CONFIG_IP_MULTIPLE_TABLES
2221         res.r           = NULL;
2222 #endif
2223
2224         if (oldflp->fl4_src) {
2225                 err = -EINVAL;
2226                 if (MULTICAST(oldflp->fl4_src) ||
2227                     BADCLASS(oldflp->fl4_src) ||
2228                     ZERONET(oldflp->fl4_src))
2229                         goto out;
2230
2231                 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2232                 dev_out = ip_dev_find(oldflp->fl4_src);
2233                 if (dev_out == NULL)
2234                         goto out;
2235
2236                 /* I removed check for oif == dev_out->oif here.
2237                    It was wrong for two reasons:
2238                    1. ip_dev_find(saddr) can return wrong iface, if saddr is
2239                       assigned to multiple interfaces.
2240                    2. Moreover, we are allowed to send packets with saddr
2241                       of another iface. --ANK
2242                  */
2243
2244                 if (oldflp->oif == 0
2245                     && (MULTICAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) {
2246                         /* Special hack: user can direct multicasts
2247                            and limited broadcast via necessary interface
2248                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2249                            This hack is not just for fun, it allows
2250                            vic,vat and friends to work.
2251                            They bind socket to loopback, set ttl to zero
2252                            and expect that it will work.
2253                            From the viewpoint of routing cache they are broken,
2254                            because we are not allowed to build multicast path
2255                            with loopback source addr (look, routing cache
2256                            cannot know, that ttl is zero, so that packet
2257                            will not leave this host and route is valid).
2258                            Luckily, this hack is good workaround.
2259                          */
2260
2261                         fl.oif = dev_out->ifindex;
2262                         goto make_route;
2263                 }
2264                 if (dev_out)
2265                         dev_put(dev_out);
2266                 dev_out = NULL;
2267         }
2268
2269
2270         if (oldflp->oif) {
2271                 dev_out = dev_get_by_index(&init_net, oldflp->oif);
2272                 err = -ENODEV;
2273                 if (dev_out == NULL)
2274                         goto out;
2275
2276                 /* RACE: Check return value of inet_select_addr instead. */
2277                 if (__in_dev_get_rtnl(dev_out) == NULL) {
2278                         dev_put(dev_out);
2279                         goto out;       /* Wrong error code */
2280                 }
2281
2282                 if (LOCAL_MCAST(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) {
2283                         if (!fl.fl4_src)
2284                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2285                                                               RT_SCOPE_LINK);
2286                         goto make_route;
2287                 }
2288                 if (!fl.fl4_src) {
2289                         if (MULTICAST(oldflp->fl4_dst))
2290                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2291                                                               fl.fl4_scope);
2292                         else if (!oldflp->fl4_dst)
2293                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2294                                                               RT_SCOPE_HOST);
2295                 }
2296         }
2297
2298         if (!fl.fl4_dst) {
2299                 fl.fl4_dst = fl.fl4_src;
2300                 if (!fl.fl4_dst)
2301                         fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK);
2302                 if (dev_out)
2303                         dev_put(dev_out);
2304                 dev_out = init_net.loopback_dev;
2305                 dev_hold(dev_out);
2306                 fl.oif = init_net.loopback_dev->ifindex;
2307                 res.type = RTN_LOCAL;
2308                 flags |= RTCF_LOCAL;
2309                 goto make_route;
2310         }
2311
2312         if (fib_lookup(&fl, &res)) {
2313                 res.fi = NULL;
2314                 if (oldflp->oif) {
2315                         /* Apparently, routing tables are wrong. Assume,
2316                            that the destination is on link.
2317
2318                            WHY? DW.
2319                            Because we are allowed to send to iface
2320                            even if it has NO routes and NO assigned
2321                            addresses. When oif is specified, routing
2322                            tables are looked up with only one purpose:
2323                            to catch if destination is gatewayed, rather than
2324                            direct. Moreover, if MSG_DONTROUTE is set,
2325                            we send packet, ignoring both routing tables
2326                            and ifaddr state. --ANK
2327
2328
2329                            We could make it even if oif is unknown,
2330                            likely IPv6, but we do not.
2331                          */
2332
2333                         if (fl.fl4_src == 0)
2334                                 fl.fl4_src = inet_select_addr(dev_out, 0,
2335                                                               RT_SCOPE_LINK);
2336                         res.type = RTN_UNICAST;
2337                         goto make_route;
2338                 }
2339                 if (dev_out)
2340                         dev_put(dev_out);
2341                 err = -ENETUNREACH;
2342                 goto out;
2343         }
2344         free_res = 1;
2345
2346         if (res.type == RTN_LOCAL) {
2347                 if (!fl.fl4_src)
2348                         fl.fl4_src = fl.fl4_dst;
2349                 if (dev_out)
2350                         dev_put(dev_out);
2351                 dev_out = init_net.loopback_dev;
2352                 dev_hold(dev_out);
2353                 fl.oif = dev_out->ifindex;
2354                 if (res.fi)
2355                         fib_info_put(res.fi);
2356                 res.fi = NULL;
2357                 flags |= RTCF_LOCAL;
2358                 goto make_route;
2359         }
2360
2361 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2362         if (res.fi->fib_nhs > 1 && fl.oif == 0)
2363                 fib_select_multipath(&fl, &res);
2364         else
2365 #endif
2366         if (!res.prefixlen && res.type == RTN_UNICAST && !fl.oif)
2367                 fib_select_default(&fl, &res);
2368
2369         if (!fl.fl4_src)
2370                 fl.fl4_src = FIB_RES_PREFSRC(res);
2371
2372         if (dev_out)
2373                 dev_put(dev_out);
2374         dev_out = FIB_RES_DEV(res);
2375         dev_hold(dev_out);
2376         fl.oif = dev_out->ifindex;
2377
2378
2379 make_route:
2380         err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags);
2381
2382
2383         if (free_res)
2384                 fib_res_put(&res);
2385         if (dev_out)
2386                 dev_put(dev_out);
2387 out:    return err;
2388 }
2389
2390 int __ip_route_output_key(struct rtable **rp, const struct flowi *flp)
2391 {
2392         unsigned hash;
2393         struct rtable *rth;
2394
2395         hash = rt_hash(flp->fl4_dst, flp->fl4_src, flp->oif);
2396
2397         rcu_read_lock_bh();
2398         for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
2399                 rth = rcu_dereference(rth->u.dst.rt_next)) {
2400                 if (rth->fl.fl4_dst == flp->fl4_dst &&
2401                     rth->fl.fl4_src == flp->fl4_src &&
2402                     rth->fl.iif == 0 &&
2403                     rth->fl.oif == flp->oif &&
2404                     rth->fl.mark == flp->mark &&
2405                     !((rth->fl.fl4_tos ^ flp->fl4_tos) &
2406                             (IPTOS_RT_MASK | RTO_ONLINK))) {
2407                         dst_use(&rth->u.dst, jiffies);
2408                         RT_CACHE_STAT_INC(out_hit);
2409                         rcu_read_unlock_bh();
2410                         *rp = rth;
2411                         return 0;
2412                 }
2413                 RT_CACHE_STAT_INC(out_hlist_search);
2414         }
2415         rcu_read_unlock_bh();
2416
2417         return ip_route_output_slow(rp, flp);
2418 }
2419
2420 EXPORT_SYMBOL_GPL(__ip_route_output_key);
2421
2422 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
2423 {
2424 }
2425
2426 static struct dst_ops ipv4_dst_blackhole_ops = {
2427         .family                 =       AF_INET,
2428         .protocol               =       __constant_htons(ETH_P_IP),
2429         .destroy                =       ipv4_dst_destroy,
2430         .check                  =       ipv4_dst_check,
2431         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2432         .entry_size             =       sizeof(struct rtable),
2433 };
2434
2435
2436 static int ipv4_dst_blackhole(struct rtable **rp, struct flowi *flp, struct sock *sk)
2437 {
2438         struct rtable *ort = *rp;
2439         struct rtable *rt = (struct rtable *)
2440                 dst_alloc(&ipv4_dst_blackhole_ops);
2441
2442         if (rt) {
2443                 struct dst_entry *new = &rt->u.dst;
2444
2445                 atomic_set(&new->__refcnt, 1);
2446                 new->__use = 1;
2447                 new->input = dst_discard;
2448                 new->output = dst_discard;
2449                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
2450
2451                 new->dev = ort->u.dst.dev;
2452                 if (new->dev)
2453                         dev_hold(new->dev);
2454
2455                 rt->fl = ort->fl;
2456
2457                 rt->idev = ort->idev;
2458                 if (rt->idev)
2459                         in_dev_hold(rt->idev);
2460                 rt->rt_flags = ort->rt_flags;
2461                 rt->rt_type = ort->rt_type;
2462                 rt->rt_dst = ort->rt_dst;
2463                 rt->rt_src = ort->rt_src;
2464                 rt->rt_iif = ort->rt_iif;
2465                 rt->rt_gateway = ort->rt_gateway;
2466                 rt->rt_spec_dst = ort->rt_spec_dst;
2467                 rt->peer = ort->peer;
2468                 if (rt->peer)
2469                         atomic_inc(&rt->peer->refcnt);
2470
2471                 dst_free(new);
2472         }
2473
2474         dst_release(&(*rp)->u.dst);
2475         *rp = rt;
2476         return (rt ? 0 : -ENOMEM);
2477 }
2478
2479 int ip_route_output_flow(struct rtable **rp, struct flowi *flp, struct sock *sk, int flags)
2480 {
2481         int err;
2482
2483         if ((err = __ip_route_output_key(rp, flp)) != 0)
2484                 return err;
2485
2486         if (flp->proto) {
2487                 if (!flp->fl4_src)
2488                         flp->fl4_src = (*rp)->rt_src;
2489                 if (!flp->fl4_dst)
2490                         flp->fl4_dst = (*rp)->rt_dst;
2491                 err = __xfrm_lookup((struct dst_entry **)rp, flp, sk, flags);
2492                 if (err == -EREMOTE)
2493                         err = ipv4_dst_blackhole(rp, flp, sk);
2494
2495                 return err;
2496         }
2497
2498         return 0;
2499 }
2500
2501 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2502
2503 int ip_route_output_key(struct rtable **rp, struct flowi *flp)
2504 {
2505         return ip_route_output_flow(rp, flp, NULL, 0);
2506 }
2507
2508 static int rt_fill_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
2509                         int nowait, unsigned int flags)
2510 {
2511         struct rtable *rt = (struct rtable*)skb->dst;
2512         struct rtmsg *r;
2513         struct nlmsghdr *nlh;
2514         long expires;
2515         u32 id = 0, ts = 0, tsage = 0, error;
2516
2517         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*r), flags);
2518         if (nlh == NULL)
2519                 return -EMSGSIZE;
2520
2521         r = nlmsg_data(nlh);
2522         r->rtm_family    = AF_INET;
2523         r->rtm_dst_len  = 32;
2524         r->rtm_src_len  = 0;
2525         r->rtm_tos      = rt->fl.fl4_tos;
2526         r->rtm_table    = RT_TABLE_MAIN;
2527         NLA_PUT_U32(skb, RTA_TABLE, RT_TABLE_MAIN);
2528         r->rtm_type     = rt->rt_type;
2529         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2530         r->rtm_protocol = RTPROT_UNSPEC;
2531         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2532         if (rt->rt_flags & RTCF_NOTIFY)
2533                 r->rtm_flags |= RTM_F_NOTIFY;
2534
2535         NLA_PUT_BE32(skb, RTA_DST, rt->rt_dst);
2536
2537         if (rt->fl.fl4_src) {
2538                 r->rtm_src_len = 32;
2539                 NLA_PUT_BE32(skb, RTA_SRC, rt->fl.fl4_src);
2540         }
2541         if (rt->u.dst.dev)
2542                 NLA_PUT_U32(skb, RTA_OIF, rt->u.dst.dev->ifindex);
2543 #ifdef CONFIG_NET_CLS_ROUTE
2544         if (rt->u.dst.tclassid)
2545                 NLA_PUT_U32(skb, RTA_FLOW, rt->u.dst.tclassid);
2546 #endif
2547         if (rt->fl.iif)
2548                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_spec_dst);
2549         else if (rt->rt_src != rt->fl.fl4_src)
2550                 NLA_PUT_BE32(skb, RTA_PREFSRC, rt->rt_src);
2551
2552         if (rt->rt_dst != rt->rt_gateway)
2553                 NLA_PUT_BE32(skb, RTA_GATEWAY, rt->rt_gateway);
2554
2555         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2556                 goto nla_put_failure;
2557
2558         error = rt->u.dst.error;
2559         expires = rt->u.dst.expires ? rt->u.dst.expires - jiffies : 0;
2560         if (rt->peer) {
2561                 id = rt->peer->ip_id_count;
2562                 if (rt->peer->tcp_ts_stamp) {
2563                         ts = rt->peer->tcp_ts;
2564                         tsage = get_seconds() - rt->peer->tcp_ts_stamp;
2565                 }
2566         }
2567
2568         if (rt->fl.iif) {
2569 #ifdef CONFIG_IP_MROUTE
2570                 __be32 dst = rt->rt_dst;
2571
2572                 if (MULTICAST(dst) && !LOCAL_MCAST(dst) &&
2573                     IPV4_DEVCONF_ALL(MC_FORWARDING)) {
2574                         int err = ipmr_get_route(skb, r, nowait);
2575                         if (err <= 0) {
2576                                 if (!nowait) {
2577                                         if (err == 0)
2578                                                 return 0;
2579                                         goto nla_put_failure;
2580                                 } else {
2581                                         if (err == -EMSGSIZE)
2582                                                 goto nla_put_failure;
2583                                         error = err;
2584                                 }
2585                         }
2586                 } else
2587 #endif
2588                         NLA_PUT_U32(skb, RTA_IIF, rt->fl.iif);
2589         }
2590
2591         if (rtnl_put_cacheinfo(skb, &rt->u.dst, id, ts, tsage,
2592                                expires, error) < 0)
2593                 goto nla_put_failure;
2594
2595         return nlmsg_end(skb, nlh);
2596
2597 nla_put_failure:
2598         nlmsg_cancel(skb, nlh);
2599         return -EMSGSIZE;
2600 }
2601
2602 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2603 {
2604         struct net *net = in_skb->sk->sk_net;
2605         struct rtmsg *rtm;
2606         struct nlattr *tb[RTA_MAX+1];
2607         struct rtable *rt = NULL;
2608         __be32 dst = 0;
2609         __be32 src = 0;
2610         u32 iif;
2611         int err;
2612         struct sk_buff *skb;
2613
2614         if (net != &init_net)
2615                 return -EINVAL;
2616
2617         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2618         if (err < 0)
2619                 goto errout;
2620
2621         rtm = nlmsg_data(nlh);
2622
2623         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2624         if (skb == NULL) {
2625                 err = -ENOBUFS;
2626                 goto errout;
2627         }
2628
2629         /* Reserve room for dummy headers, this skb can pass
2630            through good chunk of routing engine.
2631          */
2632         skb_reset_mac_header(skb);
2633         skb_reset_network_header(skb);
2634
2635         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2636         ip_hdr(skb)->protocol = IPPROTO_ICMP;
2637         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2638
2639         src = tb[RTA_SRC] ? nla_get_be32(tb[RTA_SRC]) : 0;
2640         dst = tb[RTA_DST] ? nla_get_be32(tb[RTA_DST]) : 0;
2641         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2642
2643         if (iif) {
2644                 struct net_device *dev;
2645
2646                 dev = __dev_get_by_index(&init_net, iif);
2647                 if (dev == NULL) {
2648                         err = -ENODEV;
2649                         goto errout_free;
2650                 }
2651
2652                 skb->protocol   = htons(ETH_P_IP);
2653                 skb->dev        = dev;
2654                 local_bh_disable();
2655                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2656                 local_bh_enable();
2657
2658                 rt = (struct rtable*) skb->dst;
2659                 if (err == 0 && rt->u.dst.error)
2660                         err = -rt->u.dst.error;
2661         } else {
2662                 struct flowi fl = {
2663                         .nl_u = {
2664                                 .ip4_u = {
2665                                         .daddr = dst,
2666                                         .saddr = src,
2667                                         .tos = rtm->rtm_tos,
2668                                 },
2669                         },
2670                         .oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0,
2671                 };
2672                 err = ip_route_output_key(&rt, &fl);
2673         }
2674
2675         if (err)
2676                 goto errout_free;
2677
2678         skb->dst = &rt->u.dst;
2679         if (rtm->rtm_flags & RTM_F_NOTIFY)
2680                 rt->rt_flags |= RTCF_NOTIFY;
2681
2682         err = rt_fill_info(skb, NETLINK_CB(in_skb).pid, nlh->nlmsg_seq,
2683                                 RTM_NEWROUTE, 0, 0);
2684         if (err <= 0)
2685                 goto errout_free;
2686
2687         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2688 errout:
2689         return err;
2690
2691 errout_free:
2692         kfree_skb(skb);
2693         goto errout;
2694 }
2695
2696 int ip_rt_dump(struct sk_buff *skb,  struct netlink_callback *cb)
2697 {
2698         struct rtable *rt;
2699         int h, s_h;
2700         int idx, s_idx;
2701
2702         s_h = cb->args[0];
2703         if (s_h < 0)
2704                 s_h = 0;
2705         s_idx = idx = cb->args[1];
2706         for (h = s_h; h <= rt_hash_mask; h++) {
2707                 rcu_read_lock_bh();
2708                 for (rt = rcu_dereference(rt_hash_table[h].chain), idx = 0; rt;
2709                      rt = rcu_dereference(rt->u.dst.rt_next), idx++) {
2710                         if (idx < s_idx)
2711                                 continue;
2712                         skb->dst = dst_clone(&rt->u.dst);
2713                         if (rt_fill_info(skb, NETLINK_CB(cb->skb).pid,
2714                                          cb->nlh->nlmsg_seq, RTM_NEWROUTE,
2715                                          1, NLM_F_MULTI) <= 0) {
2716                                 dst_release(xchg(&skb->dst, NULL));
2717                                 rcu_read_unlock_bh();
2718                                 goto done;
2719                         }
2720                         dst_release(xchg(&skb->dst, NULL));
2721                 }
2722                 rcu_read_unlock_bh();
2723                 s_idx = 0;
2724         }
2725
2726 done:
2727         cb->args[0] = h;
2728         cb->args[1] = idx;
2729         return skb->len;
2730 }
2731
2732 void ip_rt_multicast_event(struct in_device *in_dev)
2733 {
2734         rt_cache_flush(0);
2735 }
2736
2737 #ifdef CONFIG_SYSCTL
2738 static int flush_delay;
2739
2740 static int ipv4_sysctl_rtcache_flush(ctl_table *ctl, int write,
2741                                         struct file *filp, void __user *buffer,
2742                                         size_t *lenp, loff_t *ppos)
2743 {
2744         if (write) {
2745                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2746                 rt_cache_flush(flush_delay);
2747                 return 0;
2748         }
2749
2750         return -EINVAL;
2751 }
2752
2753 static int ipv4_sysctl_rtcache_flush_strategy(ctl_table *table,
2754                                                 int __user *name,
2755                                                 int nlen,
2756                                                 void __user *oldval,
2757                                                 size_t __user *oldlenp,
2758                                                 void __user *newval,
2759                                                 size_t newlen)
2760 {
2761         int delay;
2762         if (newlen != sizeof(int))
2763                 return -EINVAL;
2764         if (get_user(delay, (int __user *)newval))
2765                 return -EFAULT;
2766         rt_cache_flush(delay);
2767         return 0;
2768 }
2769
2770 ctl_table ipv4_route_table[] = {
2771         {
2772                 .ctl_name       = NET_IPV4_ROUTE_FLUSH,
2773                 .procname       = "flush",
2774                 .data           = &flush_delay,
2775                 .maxlen         = sizeof(int),
2776                 .mode           = 0200,
2777                 .proc_handler   = &ipv4_sysctl_rtcache_flush,
2778                 .strategy       = &ipv4_sysctl_rtcache_flush_strategy,
2779         },
2780         {
2781                 .ctl_name       = NET_IPV4_ROUTE_MIN_DELAY,
2782                 .procname       = "min_delay",
2783                 .data           = &ip_rt_min_delay,
2784                 .maxlen         = sizeof(int),
2785                 .mode           = 0644,
2786                 .proc_handler   = &proc_dointvec_jiffies,
2787                 .strategy       = &sysctl_jiffies,
2788         },
2789         {
2790                 .ctl_name       = NET_IPV4_ROUTE_MAX_DELAY,
2791                 .procname       = "max_delay",
2792                 .data           = &ip_rt_max_delay,
2793                 .maxlen         = sizeof(int),
2794                 .mode           = 0644,
2795                 .proc_handler   = &proc_dointvec_jiffies,
2796                 .strategy       = &sysctl_jiffies,
2797         },
2798         {
2799                 .ctl_name       = NET_IPV4_ROUTE_GC_THRESH,
2800                 .procname       = "gc_thresh",
2801                 .data           = &ipv4_dst_ops.gc_thresh,
2802                 .maxlen         = sizeof(int),
2803                 .mode           = 0644,
2804                 .proc_handler   = &proc_dointvec,
2805         },
2806         {
2807                 .ctl_name       = NET_IPV4_ROUTE_MAX_SIZE,
2808                 .procname       = "max_size",
2809                 .data           = &ip_rt_max_size,
2810                 .maxlen         = sizeof(int),
2811                 .mode           = 0644,
2812                 .proc_handler   = &proc_dointvec,
2813         },
2814         {
2815                 /*  Deprecated. Use gc_min_interval_ms */
2816
2817                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL,
2818                 .procname       = "gc_min_interval",
2819                 .data           = &ip_rt_gc_min_interval,
2820                 .maxlen         = sizeof(int),
2821                 .mode           = 0644,
2822                 .proc_handler   = &proc_dointvec_jiffies,
2823                 .strategy       = &sysctl_jiffies,
2824         },
2825         {
2826                 .ctl_name       = NET_IPV4_ROUTE_GC_MIN_INTERVAL_MS,
2827                 .procname       = "gc_min_interval_ms",
2828                 .data           = &ip_rt_gc_min_interval,
2829                 .maxlen         = sizeof(int),
2830                 .mode           = 0644,
2831                 .proc_handler   = &proc_dointvec_ms_jiffies,
2832                 .strategy       = &sysctl_ms_jiffies,
2833         },
2834         {
2835                 .ctl_name       = NET_IPV4_ROUTE_GC_TIMEOUT,
2836                 .procname       = "gc_timeout",
2837                 .data           = &ip_rt_gc_timeout,
2838                 .maxlen         = sizeof(int),
2839                 .mode           = 0644,
2840                 .proc_handler   = &proc_dointvec_jiffies,
2841                 .strategy       = &sysctl_jiffies,
2842         },
2843         {
2844                 .ctl_name       = NET_IPV4_ROUTE_GC_INTERVAL,
2845                 .procname       = "gc_interval",
2846                 .data           = &ip_rt_gc_interval,
2847                 .maxlen         = sizeof(int),
2848                 .mode           = 0644,
2849                 .proc_handler   = &proc_dointvec_jiffies,
2850                 .strategy       = &sysctl_jiffies,
2851         },
2852         {
2853                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_LOAD,
2854                 .procname       = "redirect_load",
2855                 .data           = &ip_rt_redirect_load,
2856                 .maxlen         = sizeof(int),
2857                 .mode           = 0644,
2858                 .proc_handler   = &proc_dointvec,
2859         },
2860         {
2861                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_NUMBER,
2862                 .procname       = "redirect_number",
2863                 .data           = &ip_rt_redirect_number,
2864                 .maxlen         = sizeof(int),
2865                 .mode           = 0644,
2866                 .proc_handler   = &proc_dointvec,
2867         },
2868         {
2869                 .ctl_name       = NET_IPV4_ROUTE_REDIRECT_SILENCE,
2870                 .procname       = "redirect_silence",
2871                 .data           = &ip_rt_redirect_silence,
2872                 .maxlen         = sizeof(int),
2873                 .mode           = 0644,
2874                 .proc_handler   = &proc_dointvec,
2875         },
2876         {
2877                 .ctl_name       = NET_IPV4_ROUTE_ERROR_COST,
2878                 .procname       = "error_cost",
2879                 .data           = &ip_rt_error_cost,
2880                 .maxlen         = sizeof(int),
2881                 .mode           = 0644,
2882                 .proc_handler   = &proc_dointvec,
2883         },
2884         {
2885                 .ctl_name       = NET_IPV4_ROUTE_ERROR_BURST,
2886                 .procname       = "error_burst",
2887                 .data           = &ip_rt_error_burst,
2888                 .maxlen         = sizeof(int),
2889                 .mode           = 0644,
2890                 .proc_handler   = &proc_dointvec,
2891         },
2892         {
2893                 .ctl_name       = NET_IPV4_ROUTE_GC_ELASTICITY,
2894                 .procname       = "gc_elasticity",
2895                 .data           = &ip_rt_gc_elasticity,
2896                 .maxlen         = sizeof(int),
2897                 .mode           = 0644,
2898                 .proc_handler   = &proc_dointvec,
2899         },
2900         {
2901                 .ctl_name       = NET_IPV4_ROUTE_MTU_EXPIRES,
2902                 .procname       = "mtu_expires",
2903                 .data           = &ip_rt_mtu_expires,
2904                 .maxlen         = sizeof(int),
2905                 .mode           = 0644,
2906                 .proc_handler   = &proc_dointvec_jiffies,
2907                 .strategy       = &sysctl_jiffies,
2908         },
2909         {
2910                 .ctl_name       = NET_IPV4_ROUTE_MIN_PMTU,
2911                 .procname       = "min_pmtu",
2912                 .data           = &ip_rt_min_pmtu,
2913                 .maxlen         = sizeof(int),
2914                 .mode           = 0644,
2915                 .proc_handler   = &proc_dointvec,
2916         },
2917         {
2918                 .ctl_name       = NET_IPV4_ROUTE_MIN_ADVMSS,
2919                 .procname       = "min_adv_mss",
2920                 .data           = &ip_rt_min_advmss,
2921                 .maxlen         = sizeof(int),
2922                 .mode           = 0644,
2923                 .proc_handler   = &proc_dointvec,
2924         },
2925         {
2926                 .ctl_name       = NET_IPV4_ROUTE_SECRET_INTERVAL,
2927                 .procname       = "secret_interval",
2928                 .data           = &ip_rt_secret_interval,
2929                 .maxlen         = sizeof(int),
2930                 .mode           = 0644,
2931                 .proc_handler   = &proc_dointvec_jiffies,
2932                 .strategy       = &sysctl_jiffies,
2933         },
2934         { .ctl_name = 0 }
2935 };
2936 #endif
2937
2938 #ifdef CONFIG_NET_CLS_ROUTE
2939 struct ip_rt_acct *ip_rt_acct __read_mostly;
2940 #endif /* CONFIG_NET_CLS_ROUTE */
2941
2942 static __initdata unsigned long rhash_entries;
2943 static int __init set_rhash_entries(char *str)
2944 {
2945         if (!str)
2946                 return 0;
2947         rhash_entries = simple_strtoul(str, &str, 0);
2948         return 1;
2949 }
2950 __setup("rhash_entries=", set_rhash_entries);
2951
2952 int __init ip_rt_init(void)
2953 {
2954         int rc = 0;
2955
2956         rt_hash_rnd = (int) ((num_physpages ^ (num_physpages>>8)) ^
2957                              (jiffies ^ (jiffies >> 7)));
2958
2959 #ifdef CONFIG_NET_CLS_ROUTE
2960         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct));
2961         if (!ip_rt_acct)
2962                 panic("IP: failed to allocate ip_rt_acct\n");
2963 #endif
2964
2965         ipv4_dst_ops.kmem_cachep =
2966                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2967                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2968
2969         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2970
2971         rt_hash_table = (struct rt_hash_bucket *)
2972                 alloc_large_system_hash("IP route cache",
2973                                         sizeof(struct rt_hash_bucket),
2974                                         rhash_entries,
2975                                         (num_physpages >= 128 * 1024) ?
2976                                         15 : 17,
2977                                         0,
2978                                         &rt_hash_log,
2979                                         &rt_hash_mask,
2980                                         0);
2981         memset(rt_hash_table, 0, (rt_hash_mask + 1) * sizeof(struct rt_hash_bucket));
2982         rt_hash_lock_init();
2983
2984         ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
2985         ip_rt_max_size = (rt_hash_mask + 1) * 16;
2986
2987         devinet_init();
2988         ip_fib_init();
2989
2990         setup_timer(&rt_flush_timer, rt_run_flush, 0);
2991         setup_timer(&rt_secret_timer, rt_secret_rebuild, 0);
2992
2993         /* All the timers, started at system startup tend
2994            to synchronize. Perturb it a bit.
2995          */
2996         schedule_delayed_work(&expires_work,
2997                 net_random() % ip_rt_gc_interval + ip_rt_gc_interval);
2998
2999         rt_secret_timer.expires = jiffies + net_random() % ip_rt_secret_interval +
3000                 ip_rt_secret_interval;
3001         add_timer(&rt_secret_timer);
3002
3003 #ifdef CONFIG_PROC_FS
3004         {
3005         struct proc_dir_entry *rtstat_pde = NULL; /* keep gcc happy */
3006         if (!proc_net_fops_create(&init_net, "rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
3007             !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO,
3008                                              init_net.proc_net_stat))) {
3009                 return -ENOMEM;
3010         }
3011         rtstat_pde->proc_fops = &rt_cpu_seq_fops;
3012         }
3013 #ifdef CONFIG_NET_CLS_ROUTE
3014         create_proc_read_entry("rt_acct", 0, init_net.proc_net, ip_rt_acct_read, NULL);
3015 #endif
3016 #endif
3017 #ifdef CONFIG_XFRM
3018         xfrm_init();
3019         xfrm4_init();
3020 #endif
3021         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL);
3022
3023         return rc;
3024 }
3025
3026 EXPORT_SYMBOL(__ip_select_ident);
3027 EXPORT_SYMBOL(ip_route_input);
3028 EXPORT_SYMBOL(ip_route_output_key);