4278cec522c5b842d44a83170b38fef29460365b
[safe/jmp/linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      $Id: route.c,v 1.56 2001/10/31 21:55:55 davem Exp $
9  *
10  *      This program is free software; you can redistribute it and/or
11  *      modify it under the terms of the GNU General Public License
12  *      as published by the Free Software Foundation; either version
13  *      2 of the License, or (at your option) any later version.
14  */
15
16 /*      Changes:
17  *
18  *      YOSHIFUJI Hideaki @USAGI
19  *              reworked default router selection.
20  *              - respect outgoing interface
21  *              - select from (probably) reachable routers (i.e.
22  *              routers in REACHABLE, STALE, DELAY or PROBE states).
23  *              - always select the same router if it is (probably)
24  *              reachable.  otherwise, round-robin the list.
25  *      Ville Nuorvala
26  *              Fixed routing subtrees.
27  */
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/types.h>
32 #include <linux/times.h>
33 #include <linux/socket.h>
34 #include <linux/sockios.h>
35 #include <linux/net.h>
36 #include <linux/route.h>
37 #include <linux/netdevice.h>
38 #include <linux/in6.h>
39 #include <linux/init.h>
40 #include <linux/if_arp.h>
41 #include <linux/proc_fs.h>
42 #include <linux/seq_file.h>
43 #include <linux/nsproxy.h>
44 #include <net/net_namespace.h>
45 #include <net/snmp.h>
46 #include <net/ipv6.h>
47 #include <net/ip6_fib.h>
48 #include <net/ip6_route.h>
49 #include <net/ndisc.h>
50 #include <net/addrconf.h>
51 #include <net/tcp.h>
52 #include <linux/rtnetlink.h>
53 #include <net/dst.h>
54 #include <net/xfrm.h>
55 #include <net/netevent.h>
56 #include <net/netlink.h>
57
58 #include <asm/uaccess.h>
59
60 #ifdef CONFIG_SYSCTL
61 #include <linux/sysctl.h>
62 #endif
63
/*
 * Debug verbosity for this file.  With RT6_DEBUG at 3 or above, RDBG()
 * and RT6_TRACE() expand to printk() calls; below that they expand to
 * nothing (RT6_TRACE() to an empty statement).
 */
#define RT6_DEBUG 2

#if RT6_DEBUG < 3
#define RDBG(x)
#define RT6_TRACE(x...) do { ; } while (0)
#else
#define RDBG(x) printk x
#define RT6_TRACE(x...) printk(KERN_DEBUG x)
#endif

/*
 * Compile-time switch tested by ip6_pol_route(): when non-zero, routes
 * that already have a nexthop (or are RTF_NONEXTHOP) are cloned through
 * rt6_alloc_clone(); when zero they are returned as they are.
 */
#define CLONE_OFFLINK_ROUTE 0
76
77 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
78 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
79 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
80 static void             ip6_dst_destroy(struct dst_entry *);
81 static void             ip6_dst_ifdown(struct dst_entry *,
82                                        struct net_device *dev, int how);
83 static int               ip6_dst_gc(struct dst_ops *ops);
84
85 static int              ip6_pkt_discard(struct sk_buff *skb);
86 static int              ip6_pkt_discard_out(struct sk_buff *skb);
87 static void             ip6_link_failure(struct sk_buff *skb);
88 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
89
90 #ifdef CONFIG_IPV6_ROUTE_INFO
91 static struct rt6_info *rt6_add_route_info(struct net *net,
92                                            struct in6_addr *prefix, int prefixlen,
93                                            struct in6_addr *gwaddr, int ifindex,
94                                            unsigned pref);
95 static struct rt6_info *rt6_get_route_info(struct net *net,
96                                            struct in6_addr *prefix, int prefixlen,
97                                            struct in6_addr *gwaddr, int ifindex);
98 #endif
99
100 static struct dst_ops ip6_dst_ops = {
101         .family                 =       AF_INET6,
102         .protocol               =       __constant_htons(ETH_P_IPV6),
103         .gc                     =       ip6_dst_gc,
104         .gc_thresh              =       1024,
105         .check                  =       ip6_dst_check,
106         .destroy                =       ip6_dst_destroy,
107         .ifdown                 =       ip6_dst_ifdown,
108         .negative_advice        =       ip6_negative_advice,
109         .link_failure           =       ip6_link_failure,
110         .update_pmtu            =       ip6_rt_update_pmtu,
111         .local_out              =       ip6_local_out,
112         .entry_size             =       sizeof(struct rt6_info),
113         .entries                =       ATOMIC_INIT(0),
114 };
115
116 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
117 {
118 }
119
120 static struct dst_ops ip6_dst_blackhole_ops = {
121         .family                 =       AF_INET6,
122         .protocol               =       __constant_htons(ETH_P_IPV6),
123         .destroy                =       ip6_dst_destroy,
124         .check                  =       ip6_dst_check,
125         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
126         .entry_size             =       sizeof(struct rt6_info),
127         .entries                =       ATOMIC_INIT(0),
128 };
129
130 struct rt6_info ip6_null_entry = {
131         .u = {
132                 .dst = {
133                         .__refcnt       = ATOMIC_INIT(1),
134                         .__use          = 1,
135                         .obsolete       = -1,
136                         .error          = -ENETUNREACH,
137                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
138                         .input          = ip6_pkt_discard,
139                         .output         = ip6_pkt_discard_out,
140                         .ops            = &ip6_dst_ops,
141                         .path           = (struct dst_entry*)&ip6_null_entry,
142                 }
143         },
144         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
145         .rt6i_metric    = ~(u32) 0,
146         .rt6i_ref       = ATOMIC_INIT(1),
147 };
148
#ifdef CONFIG_IPV6_MULTIPLE_TABLES

static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct sk_buff *skb);

/*
 * Statically initialised reject route carrying -EACCES; packets are
 * handed to ip6_pkt_prohibit() / ip6_pkt_prohibit_out().
 */
struct rt6_info ip6_prohibit_entry = {
        .u.dst = {
                .__refcnt = ATOMIC_INIT(1),
                .__use    = 1,
                .obsolete = -1,
                .error    = -EACCES,
                .metrics  = { [RTAX_HOPLIMIT - 1] = 255, },
                .input    = ip6_pkt_prohibit,
                .output   = ip6_pkt_prohibit_out,
                .ops      = &ip6_dst_ops,
                /* the entry is its own path */
                .path     = (struct dst_entry *)&ip6_prohibit_entry,
        },
        .rt6i_flags  = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric = ~(u32) 0,
        .rt6i_ref    = ATOMIC_INIT(1),
};

/*
 * Statically initialised reject route carrying -EINVAL; both input and
 * output are dst_discard().
 */
struct rt6_info ip6_blk_hole_entry = {
        .u.dst = {
                .__refcnt = ATOMIC_INIT(1),
                .__use    = 1,
                .obsolete = -1,
                .error    = -EINVAL,
                .metrics  = { [RTAX_HOPLIMIT - 1] = 255, },
                .input    = dst_discard,
                .output   = dst_discard,
                .ops      = &ip6_dst_ops,
                /* the entry is its own path */
                .path     = (struct dst_entry *)&ip6_blk_hole_entry,
        },
        .rt6i_flags  = (RTF_REJECT | RTF_NONEXTHOP),
        .rt6i_metric = ~(u32) 0,
        .rt6i_ref    = ATOMIC_INIT(1),
};

#endif
193
194 /* allocate dst with ip6_dst_ops */
195 static __inline__ struct rt6_info *ip6_dst_alloc(void)
196 {
197         return (struct rt6_info *)dst_alloc(&ip6_dst_ops);
198 }
199
200 static void ip6_dst_destroy(struct dst_entry *dst)
201 {
202         struct rt6_info *rt = (struct rt6_info *)dst;
203         struct inet6_dev *idev = rt->rt6i_idev;
204
205         if (idev != NULL) {
206                 rt->rt6i_idev = NULL;
207                 in6_dev_put(idev);
208         }
209 }
210
211 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
212                            int how)
213 {
214         struct rt6_info *rt = (struct rt6_info *)dst;
215         struct inet6_dev *idev = rt->rt6i_idev;
216         struct net_device *loopback_dev =
217                 dev->nd_net->loopback_dev;
218
219         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
220                 struct inet6_dev *loopback_idev =
221                         in6_dev_get(loopback_dev);
222                 if (loopback_idev != NULL) {
223                         rt->rt6i_idev = loopback_idev;
224                         in6_dev_put(idev);
225                 }
226         }
227 }
228
229 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
230 {
231         return (rt->rt6i_flags & RTF_EXPIRES &&
232                 time_after(jiffies, rt->rt6i_expires));
233 }
234
235 static inline int rt6_need_strict(struct in6_addr *daddr)
236 {
237         return (ipv6_addr_type(daddr) &
238                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
239 }
240
241 /*
242  *      Route lookup. Any table->tb6_lock is implied.
243  */
244
245 static __inline__ struct rt6_info *rt6_device_match(struct rt6_info *rt,
246                                                     int oif,
247                                                     int strict)
248 {
249         struct rt6_info *local = NULL;
250         struct rt6_info *sprt;
251
252         if (oif) {
253                 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
254                         struct net_device *dev = sprt->rt6i_dev;
255                         if (dev->ifindex == oif)
256                                 return sprt;
257                         if (dev->flags & IFF_LOOPBACK) {
258                                 if (sprt->rt6i_idev == NULL ||
259                                     sprt->rt6i_idev->dev->ifindex != oif) {
260                                         if (strict && oif)
261                                                 continue;
262                                         if (local && (!oif ||
263                                                       local->rt6i_idev->dev->ifindex == oif))
264                                                 continue;
265                                 }
266                                 local = sprt;
267                         }
268                 }
269
270                 if (local)
271                         return local;
272
273                 if (strict)
274                         return &ip6_null_entry;
275         }
276         return rt;
277 }
278
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: send a Neighbour Solicitation towards the
 * route's nexthop when its neighbour entry is not in a NUD_VALID state.
 * Probes are rate-limited: nothing is sent unless jiffies is past
 * neigh->updated + the interface's rtr_probe_interval, and
 * neigh->updated is refreshed when a probe goes out.
 * A NULL @rt or a route without nexthop is silently ignored.
 */
static void rt6_probe(struct rt6_info *rt)
{
        struct neighbour *nexthop;
        struct in6_addr solicited_node;
        struct in6_addr *target;

        if (rt == NULL)
                return;

        nexthop = rt->rt6i_nexthop;
        /* cheap unlocked test first */
        if (nexthop == NULL || (nexthop->nud_state & NUD_VALID))
                return;

        read_lock_bh(&nexthop->lock);
        /* re-test under the lock, then apply the rate limit */
        if ((nexthop->nud_state & NUD_VALID) ||
            !time_after(jiffies, nexthop->updated + rt->rt6i_idev->cnf.rtr_probe_interval)) {
                read_unlock_bh(&nexthop->lock);
                return;
        }
        nexthop->updated = jiffies;
        read_unlock_bh(&nexthop->lock);

        /* solicit the nexthop via its solicited-node multicast address */
        target = (struct in6_addr *)&nexthop->primary_key;
        addrconf_addr_solict_mult(target, &solicited_node);
        ndisc_send_ns(rt->rt6i_dev, NULL, target, &solicited_node, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
314
315 /*
316  * Default Router Selection (RFC 2461 6.3.6)
317  */
318 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
319 {
320         struct net_device *dev = rt->rt6i_dev;
321         if (!oif || dev->ifindex == oif)
322                 return 2;
323         if ((dev->flags & IFF_LOOPBACK) &&
324             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
325                 return 1;
326         return 0;
327 }
328
329 static inline int rt6_check_neigh(struct rt6_info *rt)
330 {
331         struct neighbour *neigh = rt->rt6i_nexthop;
332         int m;
333         if (rt->rt6i_flags & RTF_NONEXTHOP ||
334             !(rt->rt6i_flags & RTF_GATEWAY))
335                 m = 1;
336         else if (neigh) {
337                 read_lock_bh(&neigh->lock);
338                 if (neigh->nud_state & NUD_VALID)
339                         m = 2;
340 #ifdef CONFIG_IPV6_ROUTER_PREF
341                 else if (neigh->nud_state & NUD_FAILED)
342                         m = 0;
343 #endif
344                 else
345                         m = 1;
346                 read_unlock_bh(&neigh->lock);
347         } else
348                 m = 0;
349         return m;
350 }
351
352 static int rt6_score_route(struct rt6_info *rt, int oif,
353                            int strict)
354 {
355         int m, n;
356
357         m = rt6_check_dev(rt, oif);
358         if (!m && (strict & RT6_LOOKUP_F_IFACE))
359                 return -1;
360 #ifdef CONFIG_IPV6_ROUTER_PREF
361         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
362 #endif
363         n = rt6_check_neigh(rt);
364         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
365                 return -1;
366         return m;
367 }
368
369 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
370                                    int *mpri, struct rt6_info *match)
371 {
372         int m;
373
374         if (rt6_check_expired(rt))
375                 goto out;
376
377         m = rt6_score_route(rt, oif, strict);
378         if (m < 0)
379                 goto out;
380
381         if (m > *mpri) {
382                 if (strict & RT6_LOOKUP_F_REACHABLE)
383                         rt6_probe(match);
384                 *mpri = m;
385                 match = rt;
386         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
387                 rt6_probe(rt);
388         }
389
390 out:
391         return match;
392 }
393
394 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
395                                      struct rt6_info *rr_head,
396                                      u32 metric, int oif, int strict)
397 {
398         struct rt6_info *rt, *match;
399         int mpri = -1;
400
401         match = NULL;
402         for (rt = rr_head; rt && rt->rt6i_metric == metric;
403              rt = rt->u.dst.rt6_next)
404                 match = find_match(rt, oif, strict, &mpri, match);
405         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
406              rt = rt->u.dst.rt6_next)
407                 match = find_match(rt, oif, strict, &mpri, match);
408
409         return match;
410 }
411
412 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
413 {
414         struct rt6_info *match, *rt0;
415
416         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
417                   __FUNCTION__, fn->leaf, oif);
418
419         rt0 = fn->rr_ptr;
420         if (!rt0)
421                 fn->rr_ptr = rt0 = fn->leaf;
422
423         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
424
425         if (!match &&
426             (strict & RT6_LOOKUP_F_REACHABLE)) {
427                 struct rt6_info *next = rt0->u.dst.rt6_next;
428
429                 /* no entries matched; do round-robin */
430                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
431                         next = fn->leaf;
432
433                 if (next != rt0)
434                         fn->rr_ptr = next;
435         }
436
437         RT6_TRACE("%s() => %p\n",
438                   __FUNCTION__, match);
439
440         return (match ? match : &ip6_null_entry);
441 }
442
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a received route information option.
 *
 * @dev:    device the option arrived on
 * @opt:    start of the option, interpreted as struct route_info
 * @len:    number of bytes available at @opt
 * @gwaddr: gateway address for the route
 *
 * Validates the option, then adds, refreshes or (for a zero lifetime)
 * deletes the matching route.  Returns 0, or -EINVAL when the option
 * fails validation.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
                  struct in6_addr *gwaddr)
{
        struct net *net = dev->nd_net;
        struct route_info *rinfo = (struct route_info *) opt;
        struct in6_addr masked_prefix;
        struct in6_addr *prefix;
        struct rt6_info *rt;
        unsigned int pref;
        u32 lifetime;

        if (len < sizeof(struct route_info))
                return -EINVAL;

        /*
         * Sanity check for prefix_len and length: the length field must
         * be large enough for the advertised prefix length.
         */
        if (rinfo->length > 3)
                return -EINVAL;
        if (rinfo->prefix_len > 128)
                return -EINVAL;
        if (rinfo->prefix_len > 64 && rinfo->length < 2)
                return -EINVAL;
        if (rinfo->prefix_len > 0 && rinfo->length < 1)
                return -EINVAL;

        pref = rinfo->route_pref;
        if (pref == ICMPV6_ROUTER_PREF_INVALID)
                pref = ICMPV6_ROUTER_PREF_MEDIUM;

        /*
         * 0xffffffff means "infinite" and is kept as is; any other value
         * is clamped so that HZ * lifetime cannot overflow.
         */
        lifetime = ntohl(rinfo->lifetime);
        if (lifetime != 0xffffffff && lifetime > 0x7fffffff/HZ)
                lifetime = 0x7fffffff/HZ - 1;

        if (rinfo->length != 3) {
                /* shorter option: take only prefix_len bits of the prefix */
                ipv6_addr_prefix(&masked_prefix,
                                 (struct in6_addr *)rinfo->prefix,
                                 rinfo->prefix_len);
                prefix = &masked_prefix;
        } else {
                prefix = (struct in6_addr *)rinfo->prefix;
        }

        rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
                                dev->ifindex);

        /* zero lifetime withdraws an existing route */
        if (rt != NULL && lifetime == 0) {
                ip6_del_rt(rt);
                rt = NULL;
        }

        if (rt != NULL) {
                /* existing route: refresh the preference bits */
                rt->rt6i_flags = RTF_ROUTEINFO |
                                 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
        } else if (lifetime != 0) {
                rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
                                        pref);
        }

        if (rt == NULL)
                return 0;

        if (lifetime == 0xffffffff) {
                rt->rt6i_flags &= ~RTF_EXPIRES;
        } else {
                rt->rt6i_expires = jiffies + HZ * lifetime;
                rt->rt6i_flags |= RTF_EXPIRES;
        }
        dst_release(&rt->u.dst);

        return 0;
}
#endif
522
/*
 * BACKTRACK(saddr) - climb the fib6 tree after a failed selection.
 *
 * Expects the enclosing function to provide locals "rt" and "fn" and
 * labels "out" and "restart".  Does nothing unless rt is &ip6_null_entry.
 * Otherwise it moves fn towards the root, re-entering a parent's source
 * subtree via fib6_lookup() where one exists and fn is not that subtree,
 * and jumps to "restart" at the first node flagged RTN_RTINFO, or to
 * "out" once fn is flagged RTN_TL_ROOT.
 */
#define BACKTRACK(saddr) \
do { \
        if (rt == &ip6_null_entry) { \
                struct fib6_node *up_node; \
                for (;;) { \
                        if (fn->fn_flags & RTN_TL_ROOT) \
                                goto out; \
                        up_node = fn->parent; \
                        if (FIB6_SUBTREE(up_node) && \
                            FIB6_SUBTREE(up_node) != fn) \
                                fn = fib6_lookup(FIB6_SUBTREE(up_node), NULL, saddr); \
                        else \
                                fn = up_node; \
                        if (fn->fn_flags & RTN_RTINFO) \
                                goto restart; \
                } \
        } \
} while(0)
540
541 static struct rt6_info *ip6_pol_route_lookup(struct fib6_table *table,
542                                              struct flowi *fl, int flags)
543 {
544         struct fib6_node *fn;
545         struct rt6_info *rt;
546
547         read_lock_bh(&table->tb6_lock);
548         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
549 restart:
550         rt = fn->leaf;
551         rt = rt6_device_match(rt, fl->oif, flags);
552         BACKTRACK(&fl->fl6_src);
553 out:
554         dst_use(&rt->u.dst, jiffies);
555         read_unlock_bh(&table->tb6_lock);
556         return rt;
557
558 }
559
560 struct rt6_info *rt6_lookup(struct net *net, struct in6_addr *daddr,
561                             struct in6_addr *saddr, int oif, int strict)
562 {
563         struct flowi fl = {
564                 .oif = oif,
565                 .nl_u = {
566                         .ip6_u = {
567                                 .daddr = *daddr,
568                         },
569                 },
570         };
571         struct dst_entry *dst;
572         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
573
574         if (saddr) {
575                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
576                 flags |= RT6_LOOKUP_F_HAS_SADDR;
577         }
578
579         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
580         if (dst->error == 0)
581                 return (struct rt6_info *) dst;
582
583         dst_release(dst);
584
585         return NULL;
586 }
587
588 EXPORT_SYMBOL(rt6_lookup);
589
590 /* ip6_ins_rt is called with FREE table->tb6_lock.
591    It takes new route entry, the addition fails by any reason the
592    route is freed. In any case, if caller does not hold it, it may
593    be destroyed.
594  */
595
596 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
597 {
598         int err;
599         struct fib6_table *table;
600
601         table = rt->rt6i_table;
602         write_lock_bh(&table->tb6_lock);
603         err = fib6_add(&table->tb6_root, rt, info);
604         write_unlock_bh(&table->tb6_lock);
605
606         return err;
607 }
608
609 int ip6_ins_rt(struct rt6_info *rt)
610 {
611         struct nl_info info = {
612                 .nl_net = &init_net,
613         };
614         return __ip6_ins_rt(rt, &info);
615 }
616
617 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
618                                       struct in6_addr *saddr)
619 {
620         struct rt6_info *rt;
621
622         /*
623          *      Clone the route.
624          */
625
626         rt = ip6_rt_copy(ort);
627
628         if (rt) {
629                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
630                         if (rt->rt6i_dst.plen != 128 &&
631                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
632                                 rt->rt6i_flags |= RTF_ANYCAST;
633                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
634                 }
635
636                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
637                 rt->rt6i_dst.plen = 128;
638                 rt->rt6i_flags |= RTF_CACHE;
639                 rt->u.dst.flags |= DST_HOST;
640
641 #ifdef CONFIG_IPV6_SUBTREES
642                 if (rt->rt6i_src.plen && saddr) {
643                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
644                         rt->rt6i_src.plen = 128;
645                 }
646 #endif
647
648                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
649
650         }
651
652         return rt;
653 }
654
655 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
656 {
657         struct rt6_info *rt = ip6_rt_copy(ort);
658         if (rt) {
659                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
660                 rt->rt6i_dst.plen = 128;
661                 rt->rt6i_flags |= RTF_CACHE;
662                 rt->u.dst.flags |= DST_HOST;
663                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
664         }
665         return rt;
666 }
667
668 static struct rt6_info *ip6_pol_route(struct fib6_table *table, int oif,
669                                             struct flowi *fl, int flags)
670 {
671         struct fib6_node *fn;
672         struct rt6_info *rt, *nrt;
673         int strict = 0;
674         int attempts = 3;
675         int err;
676         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
677
678         strict |= flags & RT6_LOOKUP_F_IFACE;
679
680 relookup:
681         read_lock_bh(&table->tb6_lock);
682
683 restart_2:
684         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
685
686 restart:
687         rt = rt6_select(fn, oif, strict | reachable);
688         BACKTRACK(&fl->fl6_src);
689         if (rt == &ip6_null_entry ||
690             rt->rt6i_flags & RTF_CACHE)
691                 goto out;
692
693         dst_hold(&rt->u.dst);
694         read_unlock_bh(&table->tb6_lock);
695
696         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
697                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
698         else {
699 #if CLONE_OFFLINK_ROUTE
700                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
701 #else
702                 goto out2;
703 #endif
704         }
705
706         dst_release(&rt->u.dst);
707         rt = nrt ? : &ip6_null_entry;
708
709         dst_hold(&rt->u.dst);
710         if (nrt) {
711                 err = ip6_ins_rt(nrt);
712                 if (!err)
713                         goto out2;
714         }
715
716         if (--attempts <= 0)
717                 goto out2;
718
719         /*
720          * Race condition! In the gap, when table->tb6_lock was
721          * released someone could insert this route.  Relookup.
722          */
723         dst_release(&rt->u.dst);
724         goto relookup;
725
726 out:
727         if (reachable) {
728                 reachable = 0;
729                 goto restart_2;
730         }
731         dst_hold(&rt->u.dst);
732         read_unlock_bh(&table->tb6_lock);
733 out2:
734         rt->u.dst.lastuse = jiffies;
735         rt->u.dst.__use++;
736
737         return rt;
738 }
739
740 static struct rt6_info *ip6_pol_route_input(struct fib6_table *table,
741                                             struct flowi *fl, int flags)
742 {
743         return ip6_pol_route(table, fl->iif, fl, flags);
744 }
745
746 void ip6_route_input(struct sk_buff *skb)
747 {
748         struct ipv6hdr *iph = ipv6_hdr(skb);
749         int flags = RT6_LOOKUP_F_HAS_SADDR;
750         struct flowi fl = {
751                 .iif = skb->dev->ifindex,
752                 .nl_u = {
753                         .ip6_u = {
754                                 .daddr = iph->daddr,
755                                 .saddr = iph->saddr,
756                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
757                         },
758                 },
759                 .mark = skb->mark,
760                 .proto = iph->nexthdr,
761         };
762
763         if (rt6_need_strict(&iph->daddr))
764                 flags |= RT6_LOOKUP_F_IFACE;
765
766         skb->dst = fib6_rule_lookup(&init_net, &fl, flags, ip6_pol_route_input);
767 }
768
769 static struct rt6_info *ip6_pol_route_output(struct fib6_table *table,
770                                              struct flowi *fl, int flags)
771 {
772         return ip6_pol_route(table, fl->oif, fl, flags);
773 }
774
775 struct dst_entry * ip6_route_output(struct sock *sk, struct flowi *fl)
776 {
777         int flags = 0;
778
779         if (rt6_need_strict(&fl->fl6_dst))
780                 flags |= RT6_LOOKUP_F_IFACE;
781
782         if (!ipv6_addr_any(&fl->fl6_src))
783                 flags |= RT6_LOOKUP_F_HAS_SADDR;
784
785         return fib6_rule_lookup(&init_net, fl, flags, ip6_pol_route_output);
786 }
787
788 EXPORT_SYMBOL(ip6_route_output);
789
790 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
791 {
792         struct rt6_info *ort = (struct rt6_info *) *dstp;
793         struct rt6_info *rt = (struct rt6_info *)
794                 dst_alloc(&ip6_dst_blackhole_ops);
795         struct dst_entry *new = NULL;
796
797         if (rt) {
798                 new = &rt->u.dst;
799
800                 atomic_set(&new->__refcnt, 1);
801                 new->__use = 1;
802                 new->input = dst_discard;
803                 new->output = dst_discard;
804
805                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
806                 new->dev = ort->u.dst.dev;
807                 if (new->dev)
808                         dev_hold(new->dev);
809                 rt->rt6i_idev = ort->rt6i_idev;
810                 if (rt->rt6i_idev)
811                         in6_dev_hold(rt->rt6i_idev);
812                 rt->rt6i_expires = 0;
813
814                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
815                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
816                 rt->rt6i_metric = 0;
817
818                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
819 #ifdef CONFIG_IPV6_SUBTREES
820                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
821 #endif
822
823                 dst_free(new);
824         }
825
826         dst_release(*dstp);
827         *dstp = new;
828         return (new ? 0 : -ENOMEM);
829 }
830 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
831
832 /*
833  *      Destination cache support functions
834  */
835
836 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
837 {
838         struct rt6_info *rt;
839
840         rt = (struct rt6_info *) dst;
841
842         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
843                 return dst;
844
845         return NULL;
846 }
847
848 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
849 {
850         struct rt6_info *rt = (struct rt6_info *) dst;
851
852         if (rt) {
853                 if (rt->rt6i_flags & RTF_CACHE)
854                         ip6_del_rt(rt);
855                 else
856                         dst_release(dst);
857         }
858         return NULL;
859 }
860
861 static void ip6_link_failure(struct sk_buff *skb)
862 {
863         struct rt6_info *rt;
864
865         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
866
867         rt = (struct rt6_info *) skb->dst;
868         if (rt) {
869                 if (rt->rt6i_flags&RTF_CACHE) {
870                         dst_set_expires(&rt->u.dst, 0);
871                         rt->rt6i_flags |= RTF_EXPIRES;
872                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
873                         rt->rt6i_node->fn_sernum = -1;
874         }
875 }
876
877 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
878 {
879         struct rt6_info *rt6 = (struct rt6_info*)dst;
880
881         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
882                 rt6->rt6i_flags |= RTF_MODIFIED;
883                 if (mtu < IPV6_MIN_MTU) {
884                         mtu = IPV6_MIN_MTU;
885                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
886                 }
887                 dst->metrics[RTAX_MTU-1] = mtu;
888                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
889         }
890 }
891
892 static int ipv6_get_mtu(struct net_device *dev);
893
894 static inline unsigned int ipv6_advmss(unsigned int mtu)
895 {
896         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
897
898         if (mtu < init_net.ipv6.sysctl.ip6_rt_min_advmss)
899                 mtu = init_net.ipv6.sysctl.ip6_rt_min_advmss;
900
901         /*
902          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
903          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
904          * IPV6_MAXPLEN is also valid and means: "any MSS,
905          * rely only on pmtu discovery"
906          */
907         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
908                 mtu = IPV6_MAXPLEN;
909         return mtu;
910 }
911
912 static struct dst_entry *icmp6_dst_gc_list;
913 static DEFINE_SPINLOCK(icmp6_dst_lock);
914
915 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
916                                   struct neighbour *neigh,
917                                   struct in6_addr *addr)
918 {
919         struct rt6_info *rt;
920         struct inet6_dev *idev = in6_dev_get(dev);
921
922         if (unlikely(idev == NULL))
923                 return NULL;
924
925         rt = ip6_dst_alloc();
926         if (unlikely(rt == NULL)) {
927                 in6_dev_put(idev);
928                 goto out;
929         }
930
931         dev_hold(dev);
932         if (neigh)
933                 neigh_hold(neigh);
934         else
935                 neigh = ndisc_get_neigh(dev, addr);
936
937         rt->rt6i_dev      = dev;
938         rt->rt6i_idev     = idev;
939         rt->rt6i_nexthop  = neigh;
940         atomic_set(&rt->u.dst.__refcnt, 1);
941         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
942         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
943         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
944         rt->u.dst.output  = ip6_output;
945
946 #if 0   /* there's no chance to use these for ndisc */
947         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
948                                 ? DST_HOST
949                                 : 0;
950         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
951         rt->rt6i_dst.plen = 128;
952 #endif
953
954         spin_lock_bh(&icmp6_dst_lock);
955         rt->u.dst.next = icmp6_dst_gc_list;
956         icmp6_dst_gc_list = &rt->u.dst;
957         spin_unlock_bh(&icmp6_dst_lock);
958
959         fib6_force_start_gc(dev->nd_net);
960
961 out:
962         return &rt->u.dst;
963 }
964
965 int icmp6_dst_gc(int *more)
966 {
967         struct dst_entry *dst, *next, **pprev;
968         int freed;
969
970         next = NULL;
971         freed = 0;
972
973         spin_lock_bh(&icmp6_dst_lock);
974         pprev = &icmp6_dst_gc_list;
975
976         while ((dst = *pprev) != NULL) {
977                 if (!atomic_read(&dst->__refcnt)) {
978                         *pprev = dst->next;
979                         dst_free(dst);
980                         freed++;
981                 } else {
982                         pprev = &dst->next;
983                         (*more)++;
984                 }
985         }
986
987         spin_unlock_bh(&icmp6_dst_lock);
988
989         return freed;
990 }
991
992 static int ip6_dst_gc(struct dst_ops *ops)
993 {
994         static unsigned expire = 30*HZ;
995         static unsigned long last_gc;
996         unsigned long now = jiffies;
997
998         if (time_after(last_gc + init_net.ipv6.sysctl.ip6_rt_gc_min_interval, now) &&
999             atomic_read(&ip6_dst_ops.entries) <= init_net.ipv6.sysctl.ip6_rt_max_size)
1000                 goto out;
1001
1002         expire++;
1003         fib6_run_gc(expire, &init_net);
1004         last_gc = now;
1005         if (atomic_read(&ip6_dst_ops.entries) < ip6_dst_ops.gc_thresh)
1006                 expire = init_net.ipv6.sysctl.ip6_rt_gc_timeout>>1;
1007
1008 out:
1009         expire -= expire>>init_net.ipv6.sysctl.ip6_rt_gc_elasticity;
1010         return (atomic_read(&ip6_dst_ops.entries) > init_net.ipv6.sysctl.ip6_rt_max_size);
1011 }
1012
/* Clean the host part of a prefix. This is not necessary in the radix tree,
   but it results in cleaner routing tables.

   Remove it only once everything works!
 */
1018
1019 static int ipv6_get_mtu(struct net_device *dev)
1020 {
1021         int mtu = IPV6_MIN_MTU;
1022         struct inet6_dev *idev;
1023
1024         idev = in6_dev_get(dev);
1025         if (idev) {
1026                 mtu = idev->cnf.mtu6;
1027                 in6_dev_put(idev);
1028         }
1029         return mtu;
1030 }
1031
1032 int ipv6_get_hoplimit(struct net_device *dev)
1033 {
1034         int hoplimit = ipv6_devconf.hop_limit;
1035         struct inet6_dev *idev;
1036
1037         idev = in6_dev_get(dev);
1038         if (idev) {
1039                 hoplimit = idev->cnf.hop_limit;
1040                 in6_dev_put(idev);
1041         }
1042         return hoplimit;
1043 }
1044
1045 /*
1046  *
1047  */
1048
1049 int ip6_route_add(struct fib6_config *cfg)
1050 {
1051         int err;
1052         struct rt6_info *rt = NULL;
1053         struct net_device *dev = NULL;
1054         struct inet6_dev *idev = NULL;
1055         struct fib6_table *table;
1056         int addr_type;
1057
1058         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1059                 return -EINVAL;
1060 #ifndef CONFIG_IPV6_SUBTREES
1061         if (cfg->fc_src_len)
1062                 return -EINVAL;
1063 #endif
1064         if (cfg->fc_ifindex) {
1065                 err = -ENODEV;
1066                 dev = dev_get_by_index(&init_net, cfg->fc_ifindex);
1067                 if (!dev)
1068                         goto out;
1069                 idev = in6_dev_get(dev);
1070                 if (!idev)
1071                         goto out;
1072         }
1073
1074         if (cfg->fc_metric == 0)
1075                 cfg->fc_metric = IP6_RT_PRIO_USER;
1076
1077         table = fib6_new_table(&init_net, cfg->fc_table);
1078         if (table == NULL) {
1079                 err = -ENOBUFS;
1080                 goto out;
1081         }
1082
1083         rt = ip6_dst_alloc();
1084
1085         if (rt == NULL) {
1086                 err = -ENOMEM;
1087                 goto out;
1088         }
1089
1090         rt->u.dst.obsolete = -1;
1091         rt->rt6i_expires = jiffies + clock_t_to_jiffies(cfg->fc_expires);
1092
1093         if (cfg->fc_protocol == RTPROT_UNSPEC)
1094                 cfg->fc_protocol = RTPROT_BOOT;
1095         rt->rt6i_protocol = cfg->fc_protocol;
1096
1097         addr_type = ipv6_addr_type(&cfg->fc_dst);
1098
1099         if (addr_type & IPV6_ADDR_MULTICAST)
1100                 rt->u.dst.input = ip6_mc_input;
1101         else
1102                 rt->u.dst.input = ip6_forward;
1103
1104         rt->u.dst.output = ip6_output;
1105
1106         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1107         rt->rt6i_dst.plen = cfg->fc_dst_len;
1108         if (rt->rt6i_dst.plen == 128)
1109                rt->u.dst.flags = DST_HOST;
1110
1111 #ifdef CONFIG_IPV6_SUBTREES
1112         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1113         rt->rt6i_src.plen = cfg->fc_src_len;
1114 #endif
1115
1116         rt->rt6i_metric = cfg->fc_metric;
1117
1118         /* We cannot add true routes via loopback here,
1119            they would result in kernel looping; promote them to reject routes
1120          */
1121         if ((cfg->fc_flags & RTF_REJECT) ||
1122             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1123                 /* hold loopback dev/idev if we haven't done so. */
1124                 if (dev != init_net.loopback_dev) {
1125                         if (dev) {
1126                                 dev_put(dev);
1127                                 in6_dev_put(idev);
1128                         }
1129                         dev = init_net.loopback_dev;
1130                         dev_hold(dev);
1131                         idev = in6_dev_get(dev);
1132                         if (!idev) {
1133                                 err = -ENODEV;
1134                                 goto out;
1135                         }
1136                 }
1137                 rt->u.dst.output = ip6_pkt_discard_out;
1138                 rt->u.dst.input = ip6_pkt_discard;
1139                 rt->u.dst.error = -ENETUNREACH;
1140                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1141                 goto install_route;
1142         }
1143
1144         if (cfg->fc_flags & RTF_GATEWAY) {
1145                 struct in6_addr *gw_addr;
1146                 int gwa_type;
1147
1148                 gw_addr = &cfg->fc_gateway;
1149                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1150                 gwa_type = ipv6_addr_type(gw_addr);
1151
1152                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1153                         struct rt6_info *grt;
1154
1155                         /* IPv6 strictly inhibits using not link-local
1156                            addresses as nexthop address.
1157                            Otherwise, router will not able to send redirects.
1158                            It is very good, but in some (rare!) circumstances
1159                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1160                            some exceptions. --ANK
1161                          */
1162                         err = -EINVAL;
1163                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1164                                 goto out;
1165
1166                         grt = rt6_lookup(&init_net, gw_addr, NULL, cfg->fc_ifindex, 1);
1167
1168                         err = -EHOSTUNREACH;
1169                         if (grt == NULL)
1170                                 goto out;
1171                         if (dev) {
1172                                 if (dev != grt->rt6i_dev) {
1173                                         dst_release(&grt->u.dst);
1174                                         goto out;
1175                                 }
1176                         } else {
1177                                 dev = grt->rt6i_dev;
1178                                 idev = grt->rt6i_idev;
1179                                 dev_hold(dev);
1180                                 in6_dev_hold(grt->rt6i_idev);
1181                         }
1182                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1183                                 err = 0;
1184                         dst_release(&grt->u.dst);
1185
1186                         if (err)
1187                                 goto out;
1188                 }
1189                 err = -EINVAL;
1190                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1191                         goto out;
1192         }
1193
1194         err = -ENODEV;
1195         if (dev == NULL)
1196                 goto out;
1197
1198         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1199                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1200                 if (IS_ERR(rt->rt6i_nexthop)) {
1201                         err = PTR_ERR(rt->rt6i_nexthop);
1202                         rt->rt6i_nexthop = NULL;
1203                         goto out;
1204                 }
1205         }
1206
1207         rt->rt6i_flags = cfg->fc_flags;
1208
1209 install_route:
1210         if (cfg->fc_mx) {
1211                 struct nlattr *nla;
1212                 int remaining;
1213
1214                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1215                         int type = nla_type(nla);
1216
1217                         if (type) {
1218                                 if (type > RTAX_MAX) {
1219                                         err = -EINVAL;
1220                                         goto out;
1221                                 }
1222
1223                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1224                         }
1225                 }
1226         }
1227
1228         if (rt->u.dst.metrics[RTAX_HOPLIMIT-1] == 0)
1229                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1230         if (!rt->u.dst.metrics[RTAX_MTU-1])
1231                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1232         if (!rt->u.dst.metrics[RTAX_ADVMSS-1])
1233                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1234         rt->u.dst.dev = dev;
1235         rt->rt6i_idev = idev;
1236         rt->rt6i_table = table;
1237
1238         cfg->fc_nlinfo.nl_net = dev->nd_net;
1239
1240         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1241
1242 out:
1243         if (dev)
1244                 dev_put(dev);
1245         if (idev)
1246                 in6_dev_put(idev);
1247         if (rt)
1248                 dst_free(&rt->u.dst);
1249         return err;
1250 }
1251
1252 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1253 {
1254         int err;
1255         struct fib6_table *table;
1256
1257         if (rt == &ip6_null_entry)
1258                 return -ENOENT;
1259
1260         table = rt->rt6i_table;
1261         write_lock_bh(&table->tb6_lock);
1262
1263         err = fib6_del(rt, info);
1264         dst_release(&rt->u.dst);
1265
1266         write_unlock_bh(&table->tb6_lock);
1267
1268         return err;
1269 }
1270
1271 int ip6_del_rt(struct rt6_info *rt)
1272 {
1273         struct nl_info info = {
1274                 .nl_net = &init_net,
1275         };
1276         return __ip6_del_rt(rt, &info);
1277 }
1278
1279 static int ip6_route_del(struct fib6_config *cfg)
1280 {
1281         struct fib6_table *table;
1282         struct fib6_node *fn;
1283         struct rt6_info *rt;
1284         int err = -ESRCH;
1285
1286         table = fib6_get_table(&init_net, cfg->fc_table);
1287         if (table == NULL)
1288                 return err;
1289
1290         read_lock_bh(&table->tb6_lock);
1291
1292         fn = fib6_locate(&table->tb6_root,
1293                          &cfg->fc_dst, cfg->fc_dst_len,
1294                          &cfg->fc_src, cfg->fc_src_len);
1295
1296         if (fn) {
1297                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1298                         if (cfg->fc_ifindex &&
1299                             (rt->rt6i_dev == NULL ||
1300                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1301                                 continue;
1302                         if (cfg->fc_flags & RTF_GATEWAY &&
1303                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1304                                 continue;
1305                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1306                                 continue;
1307                         dst_hold(&rt->u.dst);
1308                         read_unlock_bh(&table->tb6_lock);
1309
1310                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1311                 }
1312         }
1313         read_unlock_bh(&table->tb6_lock);
1314
1315         return err;
1316 }
1317
1318 /*
1319  *      Handle redirects
1320  */
1321 struct ip6rd_flowi {
1322         struct flowi fl;
1323         struct in6_addr gateway;
1324 };
1325
1326 static struct rt6_info *__ip6_route_redirect(struct fib6_table *table,
1327                                              struct flowi *fl,
1328                                              int flags)
1329 {
1330         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1331         struct rt6_info *rt;
1332         struct fib6_node *fn;
1333
1334         /*
1335          * Get the "current" route for this destination and
1336          * check if the redirect has come from approriate router.
1337          *
1338          * RFC 2461 specifies that redirects should only be
1339          * accepted if they come from the nexthop to the target.
1340          * Due to the way the routes are chosen, this notion
1341          * is a bit fuzzy and one might need to check all possible
1342          * routes.
1343          */
1344
1345         read_lock_bh(&table->tb6_lock);
1346         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1347 restart:
1348         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1349                 /*
1350                  * Current route is on-link; redirect is always invalid.
1351                  *
1352                  * Seems, previous statement is not true. It could
1353                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1354                  * But then router serving it might decide, that we should
1355                  * know truth 8)8) --ANK (980726).
1356                  */
1357                 if (rt6_check_expired(rt))
1358                         continue;
1359                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1360                         continue;
1361                 if (fl->oif != rt->rt6i_dev->ifindex)
1362                         continue;
1363                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1364                         continue;
1365                 break;
1366         }
1367
1368         if (!rt)
1369                 rt = &ip6_null_entry;
1370         BACKTRACK(&fl->fl6_src);
1371 out:
1372         dst_hold(&rt->u.dst);
1373
1374         read_unlock_bh(&table->tb6_lock);
1375
1376         return rt;
1377 };
1378
1379 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1380                                            struct in6_addr *src,
1381                                            struct in6_addr *gateway,
1382                                            struct net_device *dev)
1383 {
1384         int flags = RT6_LOOKUP_F_HAS_SADDR;
1385         struct ip6rd_flowi rdfl = {
1386                 .fl = {
1387                         .oif = dev->ifindex,
1388                         .nl_u = {
1389                                 .ip6_u = {
1390                                         .daddr = *dest,
1391                                         .saddr = *src,
1392                                 },
1393                         },
1394                 },
1395                 .gateway = *gateway,
1396         };
1397
1398         if (rt6_need_strict(dest))
1399                 flags |= RT6_LOOKUP_F_IFACE;
1400
1401         return (struct rt6_info *)fib6_rule_lookup(&init_net,
1402                                                    (struct flowi *)&rdfl,
1403                                                    flags, __ip6_route_redirect);
1404 }
1405
1406 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1407                   struct in6_addr *saddr,
1408                   struct neighbour *neigh, u8 *lladdr, int on_link)
1409 {
1410         struct rt6_info *rt, *nrt = NULL;
1411         struct netevent_redirect netevent;
1412
1413         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1414
1415         if (rt == &ip6_null_entry) {
1416                 if (net_ratelimit())
1417                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1418                                "for redirect target\n");
1419                 goto out;
1420         }
1421
1422         /*
1423          *      We have finally decided to accept it.
1424          */
1425
1426         neigh_update(neigh, lladdr, NUD_STALE,
1427                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1428                      NEIGH_UPDATE_F_OVERRIDE|
1429                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1430                                      NEIGH_UPDATE_F_ISROUTER))
1431                      );
1432
1433         /*
1434          * Redirect received -> path was valid.
1435          * Look, redirects are sent only in response to data packets,
1436          * so that this nexthop apparently is reachable. --ANK
1437          */
1438         dst_confirm(&rt->u.dst);
1439
1440         /* Duplicate redirect: silently ignore. */
1441         if (neigh == rt->u.dst.neighbour)
1442                 goto out;
1443
1444         nrt = ip6_rt_copy(rt);
1445         if (nrt == NULL)
1446                 goto out;
1447
1448         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1449         if (on_link)
1450                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1451
1452         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1453         nrt->rt6i_dst.plen = 128;
1454         nrt->u.dst.flags |= DST_HOST;
1455
1456         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1457         nrt->rt6i_nexthop = neigh_clone(neigh);
1458         /* Reset pmtu, it may be better */
1459         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1460         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&nrt->u.dst));
1461
1462         if (ip6_ins_rt(nrt))
1463                 goto out;
1464
1465         netevent.old = &rt->u.dst;
1466         netevent.new = &nrt->u.dst;
1467         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1468
1469         if (rt->rt6i_flags&RTF_CACHE) {
1470                 ip6_del_rt(rt);
1471                 return;
1472         }
1473
1474 out:
1475         dst_release(&rt->u.dst);
1476         return;
1477 }
1478
1479 /*
1480  *      Handle ICMP "packet too big" messages
1481  *      i.e. Path MTU discovery
1482  */
1483
1484 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1485                         struct net_device *dev, u32 pmtu)
1486 {
1487         struct rt6_info *rt, *nrt;
1488         int allfrag = 0;
1489
1490         rt = rt6_lookup(dev->nd_net, daddr, saddr, dev->ifindex, 0);
1491         if (rt == NULL)
1492                 return;
1493
1494         if (pmtu >= dst_mtu(&rt->u.dst))
1495                 goto out;
1496
1497         if (pmtu < IPV6_MIN_MTU) {
1498                 /*
1499                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1500                  * MTU (1280) and a fragment header should always be included
1501                  * after a node receiving Too Big message reporting PMTU is
1502                  * less than the IPv6 Minimum Link MTU.
1503                  */
1504                 pmtu = IPV6_MIN_MTU;
1505                 allfrag = 1;
1506         }
1507
1508         /* New mtu received -> path was valid.
1509            They are sent only in response to data packets,
1510            so that this nexthop apparently is reachable. --ANK
1511          */
1512         dst_confirm(&rt->u.dst);
1513
1514         /* Host route. If it is static, it would be better
1515            not to override it, but add new one, so that
1516            when cache entry will expire old pmtu
1517            would return automatically.
1518          */
1519         if (rt->rt6i_flags & RTF_CACHE) {
1520                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1521                 if (allfrag)
1522                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1523                 dst_set_expires(&rt->u.dst, init_net.ipv6.sysctl.ip6_rt_mtu_expires);
1524                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1525                 goto out;
1526         }
1527
1528         /* Network route.
1529            Two cases are possible:
1530            1. It is connected route. Action: COW
1531            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1532          */
1533         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1534                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1535         else
1536                 nrt = rt6_alloc_clone(rt, daddr);
1537
1538         if (nrt) {
1539                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1540                 if (allfrag)
1541                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1542
1543                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1544                  * happened within 5 mins, the recommended timer is 10 mins.
1545                  * Here this route expiration time is set to ip6_rt_mtu_expires
1546                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1547                  * and detecting PMTU increase will be automatically happened.
1548                  */
1549                 dst_set_expires(&nrt->u.dst, init_net.ipv6.sysctl.ip6_rt_mtu_expires);
1550                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1551
1552                 ip6_ins_rt(nrt);
1553         }
1554 out:
1555         dst_release(&rt->u.dst);
1556 }
1557
1558 /*
1559  *      Misc support functions
1560  */
1561
1562 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1563 {
1564         struct rt6_info *rt = ip6_dst_alloc();
1565
1566         if (rt) {
1567                 rt->u.dst.input = ort->u.dst.input;
1568                 rt->u.dst.output = ort->u.dst.output;
1569
1570                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1571                 rt->u.dst.error = ort->u.dst.error;
1572                 rt->u.dst.dev = ort->u.dst.dev;
1573                 if (rt->u.dst.dev)
1574                         dev_hold(rt->u.dst.dev);
1575                 rt->rt6i_idev = ort->rt6i_idev;
1576                 if (rt->rt6i_idev)
1577                         in6_dev_hold(rt->rt6i_idev);
1578                 rt->u.dst.lastuse = jiffies;
1579                 rt->rt6i_expires = 0;
1580
1581                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1582                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1583                 rt->rt6i_metric = 0;
1584
1585                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1586 #ifdef CONFIG_IPV6_SUBTREES
1587                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1588 #endif
1589                 rt->rt6i_table = ort->rt6i_table;
1590         }
1591         return rt;
1592 }
1593
1594 #ifdef CONFIG_IPV6_ROUTE_INFO
1595 static struct rt6_info *rt6_get_route_info(struct net *net,
1596                                            struct in6_addr *prefix, int prefixlen,
1597                                            struct in6_addr *gwaddr, int ifindex)
1598 {
1599         struct fib6_node *fn;
1600         struct rt6_info *rt = NULL;
1601         struct fib6_table *table;
1602
1603         table = fib6_get_table(net, RT6_TABLE_INFO);
1604         if (table == NULL)
1605                 return NULL;
1606
1607         write_lock_bh(&table->tb6_lock);
1608         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1609         if (!fn)
1610                 goto out;
1611
1612         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1613                 if (rt->rt6i_dev->ifindex != ifindex)
1614                         continue;
1615                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1616                         continue;
1617                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1618                         continue;
1619                 dst_hold(&rt->u.dst);
1620                 break;
1621         }
1622 out:
1623         write_unlock_bh(&table->tb6_lock);
1624         return rt;
1625 }
1626
1627 static struct rt6_info *rt6_add_route_info(struct net *net,
1628                                            struct in6_addr *prefix, int prefixlen,
1629                                            struct in6_addr *gwaddr, int ifindex,
1630                                            unsigned pref)
1631 {
1632         struct fib6_config cfg = {
1633                 .fc_table       = RT6_TABLE_INFO,
1634                 .fc_metric      = IP6_RT_PRIO_USER,
1635                 .fc_ifindex     = ifindex,
1636                 .fc_dst_len     = prefixlen,
1637                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1638                                   RTF_UP | RTF_PREF(pref),
1639                 .fc_nlinfo.pid = 0,
1640                 .fc_nlinfo.nlh = NULL,
1641                 .fc_nlinfo.nl_net = net,
1642         };
1643
1644         ipv6_addr_copy(&cfg.fc_dst, prefix);
1645         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1646
1647         /* We should treat it as a default route if prefix length is 0. */
1648         if (!prefixlen)
1649                 cfg.fc_flags |= RTF_DEFAULT;
1650
1651         ip6_route_add(&cfg);
1652
1653         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1654 }
1655 #endif
1656
1657 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1658 {
1659         struct rt6_info *rt;
1660         struct fib6_table *table;
1661
1662         table = fib6_get_table(&init_net, RT6_TABLE_DFLT);
1663         if (table == NULL)
1664                 return NULL;
1665
1666         write_lock_bh(&table->tb6_lock);
1667         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1668                 if (dev == rt->rt6i_dev &&
1669                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1670                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1671                         break;
1672         }
1673         if (rt)
1674                 dst_hold(&rt->u.dst);
1675         write_unlock_bh(&table->tb6_lock);
1676         return rt;
1677 }
1678
1679 EXPORT_SYMBOL(rt6_get_dflt_router);
1680
1681 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1682                                      struct net_device *dev,
1683                                      unsigned int pref)
1684 {
1685         struct fib6_config cfg = {
1686                 .fc_table       = RT6_TABLE_DFLT,
1687                 .fc_metric      = IP6_RT_PRIO_USER,
1688                 .fc_ifindex     = dev->ifindex,
1689                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1690                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1691         };
1692
1693         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1694
1695         ip6_route_add(&cfg);
1696
1697         return rt6_get_dflt_router(gwaddr, dev);
1698 }
1699
1700 void rt6_purge_dflt_routers(void)
1701 {
1702         struct rt6_info *rt;
1703         struct fib6_table *table;
1704
1705         /* NOTE: Keep consistent with rt6_get_dflt_router */
1706         table = fib6_get_table(&init_net, RT6_TABLE_DFLT);
1707         if (table == NULL)
1708                 return;
1709
1710 restart:
1711         read_lock_bh(&table->tb6_lock);
1712         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1713                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1714                         dst_hold(&rt->u.dst);
1715                         read_unlock_bh(&table->tb6_lock);
1716                         ip6_del_rt(rt);
1717                         goto restart;
1718                 }
1719         }
1720         read_unlock_bh(&table->tb6_lock);
1721 }
1722
1723 static void rtmsg_to_fib6_config(struct in6_rtmsg *rtmsg,
1724                                  struct fib6_config *cfg)
1725 {
1726         memset(cfg, 0, sizeof(*cfg));
1727
1728         cfg->fc_table = RT6_TABLE_MAIN;
1729         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1730         cfg->fc_metric = rtmsg->rtmsg_metric;
1731         cfg->fc_expires = rtmsg->rtmsg_info;
1732         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1733         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1734         cfg->fc_flags = rtmsg->rtmsg_flags;
1735
1736         cfg->fc_nlinfo.nl_net = &init_net;
1737
1738         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1739         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1740         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1741 }
1742
1743 int ipv6_route_ioctl(unsigned int cmd, void __user *arg)
1744 {
1745         struct fib6_config cfg;
1746         struct in6_rtmsg rtmsg;
1747         int err;
1748
1749         switch(cmd) {
1750         case SIOCADDRT:         /* Add a route */
1751         case SIOCDELRT:         /* Delete a route */
1752                 if (!capable(CAP_NET_ADMIN))
1753                         return -EPERM;
1754                 err = copy_from_user(&rtmsg, arg,
1755                                      sizeof(struct in6_rtmsg));
1756                 if (err)
1757                         return -EFAULT;
1758
1759                 rtmsg_to_fib6_config(&rtmsg, &cfg);
1760
1761                 rtnl_lock();
1762                 switch (cmd) {
1763                 case SIOCADDRT:
1764                         err = ip6_route_add(&cfg);
1765                         break;
1766                 case SIOCDELRT:
1767                         err = ip6_route_del(&cfg);
1768                         break;
1769                 default:
1770                         err = -EINVAL;
1771                 }
1772                 rtnl_unlock();
1773
1774                 return err;
1775         }
1776
1777         return -EINVAL;
1778 }
1779
1780 /*
1781  *      Drop the packet on the floor
1782  */
1783
1784 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1785 {
1786         int type;
1787         switch (ipstats_mib_noroutes) {
1788         case IPSTATS_MIB_INNOROUTES:
1789                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1790                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1791                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1792                         break;
1793                 }
1794                 /* FALLTHROUGH */
1795         case IPSTATS_MIB_OUTNOROUTES:
1796                 IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1797                 break;
1798         }
1799         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1800         kfree_skb(skb);
1801         return 0;
1802 }
1803
1804 static int ip6_pkt_discard(struct sk_buff *skb)
1805 {
1806         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1807 }
1808
1809 static int ip6_pkt_discard_out(struct sk_buff *skb)
1810 {
1811         skb->dev = skb->dst->dev;
1812         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1813 }
1814
1815 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1816
1817 static int ip6_pkt_prohibit(struct sk_buff *skb)
1818 {
1819         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1820 }
1821
1822 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1823 {
1824         skb->dev = skb->dst->dev;
1825         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1826 }
1827
1828 #endif
1829
1830 /*
1831  *      Allocate a dst for local (unicast / anycast) address.
1832  */
1833
1834 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1835                                     const struct in6_addr *addr,
1836                                     int anycast)
1837 {
1838         struct rt6_info *rt = ip6_dst_alloc();
1839
1840         if (rt == NULL)
1841                 return ERR_PTR(-ENOMEM);
1842
1843         dev_hold(init_net.loopback_dev);
1844         in6_dev_hold(idev);
1845
1846         rt->u.dst.flags = DST_HOST;
1847         rt->u.dst.input = ip6_input;
1848         rt->u.dst.output = ip6_output;
1849         rt->rt6i_dev = init_net.loopback_dev;
1850         rt->rt6i_idev = idev;
1851         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1852         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dst_mtu(&rt->u.dst));
1853         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1854         rt->u.dst.obsolete = -1;
1855
1856         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1857         if (anycast)
1858                 rt->rt6i_flags |= RTF_ANYCAST;
1859         else
1860                 rt->rt6i_flags |= RTF_LOCAL;
1861         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1862         if (rt->rt6i_nexthop == NULL) {
1863                 dst_free(&rt->u.dst);
1864                 return ERR_PTR(-ENOMEM);
1865         }
1866
1867         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1868         rt->rt6i_dst.plen = 128;
1869         rt->rt6i_table = fib6_get_table(&init_net, RT6_TABLE_LOCAL);
1870
1871         atomic_set(&rt->u.dst.__refcnt, 1);
1872
1873         return rt;
1874 }
1875
1876 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1877 {
1878         if (((void*)rt->rt6i_dev == arg || arg == NULL) &&
1879             rt != &ip6_null_entry) {
1880                 RT6_TRACE("deleted by ifdown %p\n", rt);
1881                 return -1;
1882         }
1883         return 0;
1884 }
1885
1886 void rt6_ifdown(struct net *net, struct net_device *dev)
1887 {
1888         fib6_clean_all(net, fib6_ifdown, 0, dev);
1889 }
1890
/* Argument bundle handed to rt6_mtu_change_route() by rt6_mtu_change(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned mtu;		/* the new MTU */
};
1896
1897 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1898 {
1899         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1900         struct inet6_dev *idev;
1901
1902         /* In IPv6 pmtu discovery is not optional,
1903            so that RTAX_MTU lock cannot disable it.
1904            We still use this lock to block changes
1905            caused by addrconf/ndisc.
1906         */
1907
1908         idev = __in6_dev_get(arg->dev);
1909         if (idev == NULL)
1910                 return 0;
1911
1912         /* For administrative MTU increase, there is no way to discover
1913            IPv6 PMTU increase, so PMTU increase should be updated here.
1914            Since RFC 1981 doesn't include administrative MTU increase
1915            update PMTU increase is a MUST. (i.e. jumbo frame)
1916          */
1917         /*
1918            If new MTU is less than route PMTU, this new MTU will be the
1919            lowest MTU in the path, update the route PMTU to reflect PMTU
1920            decreases; if new MTU is greater than route PMTU, and the
1921            old MTU is the lowest MTU in the path, update the route PMTU
1922            to reflect the increase. In this case if the other nodes' MTU
1923            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1924            PMTU discouvery.
1925          */
1926         if (rt->rt6i_dev == arg->dev &&
1927             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1928             (dst_mtu(&rt->u.dst) >= arg->mtu ||
1929              (dst_mtu(&rt->u.dst) < arg->mtu &&
1930               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1931                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1932                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(arg->mtu);
1933         }
1934         return 0;
1935 }
1936
1937 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1938 {
1939         struct rt6_mtu_change_arg arg = {
1940                 .dev = dev,
1941                 .mtu = mtu,
1942         };
1943
1944         fib6_clean_all(dev->nd_net, rt6_mtu_change_route, 0, &arg);
1945 }
1946
1947 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1948         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1949         [RTA_OIF]               = { .type = NLA_U32 },
1950         [RTA_IIF]               = { .type = NLA_U32 },
1951         [RTA_PRIORITY]          = { .type = NLA_U32 },
1952         [RTA_METRICS]           = { .type = NLA_NESTED },
1953 };
1954
1955 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1956                               struct fib6_config *cfg)
1957 {
1958         struct rtmsg *rtm;
1959         struct nlattr *tb[RTA_MAX+1];
1960         int err;
1961
1962         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1963         if (err < 0)
1964                 goto errout;
1965
1966         err = -EINVAL;
1967         rtm = nlmsg_data(nlh);
1968         memset(cfg, 0, sizeof(*cfg));
1969
1970         cfg->fc_table = rtm->rtm_table;
1971         cfg->fc_dst_len = rtm->rtm_dst_len;
1972         cfg->fc_src_len = rtm->rtm_src_len;
1973         cfg->fc_flags = RTF_UP;
1974         cfg->fc_protocol = rtm->rtm_protocol;
1975
1976         if (rtm->rtm_type == RTN_UNREACHABLE)
1977                 cfg->fc_flags |= RTF_REJECT;
1978
1979         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
1980         cfg->fc_nlinfo.nlh = nlh;
1981         cfg->fc_nlinfo.nl_net = skb->sk->sk_net;
1982
1983         if (tb[RTA_GATEWAY]) {
1984                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
1985                 cfg->fc_flags |= RTF_GATEWAY;
1986         }
1987
1988         if (tb[RTA_DST]) {
1989                 int plen = (rtm->rtm_dst_len + 7) >> 3;
1990
1991                 if (nla_len(tb[RTA_DST]) < plen)
1992                         goto errout;
1993
1994                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1995         }
1996
1997         if (tb[RTA_SRC]) {
1998                 int plen = (rtm->rtm_src_len + 7) >> 3;
1999
2000                 if (nla_len(tb[RTA_SRC]) < plen)
2001                         goto errout;
2002
2003                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2004         }
2005
2006         if (tb[RTA_OIF])
2007                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2008
2009         if (tb[RTA_PRIORITY])
2010                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2011
2012         if (tb[RTA_METRICS]) {
2013                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2014                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2015         }
2016
2017         if (tb[RTA_TABLE])
2018                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2019
2020         err = 0;
2021 errout:
2022         return err;
2023 }
2024
2025 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2026 {
2027         struct net *net = skb->sk->sk_net;
2028         struct fib6_config cfg;
2029         int err;
2030
2031         if (net != &init_net)
2032                 return -EINVAL;
2033
2034         err = rtm_to_fib6_config(skb, nlh, &cfg);
2035         if (err < 0)
2036                 return err;
2037
2038         return ip6_route_del(&cfg);
2039 }
2040
2041 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2042 {
2043         struct net *net = skb->sk->sk_net;
2044         struct fib6_config cfg;
2045         int err;
2046
2047         if (net != &init_net)
2048                 return -EINVAL;
2049
2050         err = rtm_to_fib6_config(skb, nlh, &cfg);
2051         if (err < 0)
2052                 return err;
2053
2054         return ip6_route_add(&cfg);
2055 }
2056
2057 static inline size_t rt6_nlmsg_size(void)
2058 {
2059         return NLMSG_ALIGN(sizeof(struct rtmsg))
2060                + nla_total_size(16) /* RTA_SRC */
2061                + nla_total_size(16) /* RTA_DST */
2062                + nla_total_size(16) /* RTA_GATEWAY */
2063                + nla_total_size(16) /* RTA_PREFSRC */
2064                + nla_total_size(4) /* RTA_TABLE */
2065                + nla_total_size(4) /* RTA_IIF */
2066                + nla_total_size(4) /* RTA_OIF */
2067                + nla_total_size(4) /* RTA_PRIORITY */
2068                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2069                + nla_total_size(sizeof(struct rta_cacheinfo));
2070 }
2071
2072 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2073                          struct in6_addr *dst, struct in6_addr *src,
2074                          int iif, int type, u32 pid, u32 seq,
2075                          int prefix, unsigned int flags)
2076 {
2077         struct rtmsg *rtm;
2078         struct nlmsghdr *nlh;
2079         long expires;
2080         u32 table;
2081
2082         if (prefix) {   /* user wants prefix routes only */
2083                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2084                         /* success since this is not a prefix route */
2085                         return 1;
2086                 }
2087         }
2088
2089         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2090         if (nlh == NULL)
2091                 return -EMSGSIZE;
2092
2093         rtm = nlmsg_data(nlh);
2094         rtm->rtm_family = AF_INET6;
2095         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2096         rtm->rtm_src_len = rt->rt6i_src.plen;
2097         rtm->rtm_tos = 0;
2098         if (rt->rt6i_table)
2099                 table = rt->rt6i_table->tb6_id;
2100         else
2101                 table = RT6_TABLE_UNSPEC;
2102         rtm->rtm_table = table;
2103         NLA_PUT_U32(skb, RTA_TABLE, table);
2104         if (rt->rt6i_flags&RTF_REJECT)
2105                 rtm->rtm_type = RTN_UNREACHABLE;
2106         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2107                 rtm->rtm_type = RTN_LOCAL;
2108         else
2109                 rtm->rtm_type = RTN_UNICAST;
2110         rtm->rtm_flags = 0;
2111         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2112         rtm->rtm_protocol = rt->rt6i_protocol;
2113         if (rt->rt6i_flags&RTF_DYNAMIC)
2114                 rtm->rtm_protocol = RTPROT_REDIRECT;
2115         else if (rt->rt6i_flags & RTF_ADDRCONF)
2116                 rtm->rtm_protocol = RTPROT_KERNEL;
2117         else if (rt->rt6i_flags&RTF_DEFAULT)
2118                 rtm->rtm_protocol = RTPROT_RA;
2119
2120         if (rt->rt6i_flags&RTF_CACHE)
2121                 rtm->rtm_flags |= RTM_F_CLONED;
2122
2123         if (dst) {
2124                 NLA_PUT(skb, RTA_DST, 16, dst);
2125                 rtm->rtm_dst_len = 128;
2126         } else if (rtm->rtm_dst_len)
2127                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2128 #ifdef CONFIG_IPV6_SUBTREES
2129         if (src) {
2130                 NLA_PUT(skb, RTA_SRC, 16, src);
2131                 rtm->rtm_src_len = 128;
2132         } else if (rtm->rtm_src_len)
2133                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2134 #endif
2135         if (iif)
2136                 NLA_PUT_U32(skb, RTA_IIF, iif);
2137         else if (dst) {
2138                 struct in6_addr saddr_buf;
2139                 if (ipv6_dev_get_saddr(ip6_dst_idev(&rt->u.dst)->dev,
2140                                        dst, &saddr_buf) == 0)
2141                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2142         }
2143
2144         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2145                 goto nla_put_failure;
2146
2147         if (rt->u.dst.neighbour)
2148                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2149
2150         if (rt->u.dst.dev)
2151                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2152
2153         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2154
2155         expires = rt->rt6i_expires ? rt->rt6i_expires - jiffies : 0;
2156         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2157                                expires, rt->u.dst.error) < 0)
2158                 goto nla_put_failure;
2159
2160         return nlmsg_end(skb, nlh);
2161
2162 nla_put_failure:
2163         nlmsg_cancel(skb, nlh);
2164         return -EMSGSIZE;
2165 }
2166
2167 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2168 {
2169         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2170         int prefix;
2171
2172         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2173                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2174                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2175         } else
2176                 prefix = 0;
2177
2178         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2179                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2180                      prefix, NLM_F_MULTI);
2181 }
2182
2183 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2184 {
2185         struct net *net = in_skb->sk->sk_net;
2186         struct nlattr *tb[RTA_MAX+1];
2187         struct rt6_info *rt;
2188         struct sk_buff *skb;
2189         struct rtmsg *rtm;
2190         struct flowi fl;
2191         int err, iif = 0;
2192
2193         if (net != &init_net)
2194                 return -EINVAL;
2195
2196         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2197         if (err < 0)
2198                 goto errout;
2199
2200         err = -EINVAL;
2201         memset(&fl, 0, sizeof(fl));
2202
2203         if (tb[RTA_SRC]) {
2204                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2205                         goto errout;
2206
2207                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2208         }
2209
2210         if (tb[RTA_DST]) {
2211                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2212                         goto errout;
2213
2214                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2215         }
2216
2217         if (tb[RTA_IIF])
2218                 iif = nla_get_u32(tb[RTA_IIF]);
2219
2220         if (tb[RTA_OIF])
2221                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2222
2223         if (iif) {
2224                 struct net_device *dev;
2225                 dev = __dev_get_by_index(&init_net, iif);
2226                 if (!dev) {
2227                         err = -ENODEV;
2228                         goto errout;
2229                 }
2230         }
2231
2232         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2233         if (skb == NULL) {
2234                 err = -ENOBUFS;
2235                 goto errout;
2236         }
2237
2238         /* Reserve room for dummy headers, this skb can pass
2239            through good chunk of routing engine.
2240          */
2241         skb_reset_mac_header(skb);
2242         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2243
2244         rt = (struct rt6_info*) ip6_route_output(NULL, &fl);
2245         skb->dst = &rt->u.dst;
2246
2247         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2248                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2249                             nlh->nlmsg_seq, 0, 0);
2250         if (err < 0) {
2251                 kfree_skb(skb);
2252                 goto errout;
2253         }
2254
2255         err = rtnl_unicast(skb, &init_net, NETLINK_CB(in_skb).pid);
2256 errout:
2257         return err;
2258 }
2259
2260 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2261 {
2262         struct sk_buff *skb;
2263         u32 seq;
2264         int err;
2265
2266         err = -ENOBUFS;
2267         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2268
2269         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2270         if (skb == NULL)
2271                 goto errout;
2272
2273         err = rt6_fill_node(skb, rt, NULL, NULL, 0,
2274                                 event, info->pid, seq, 0, 0);
2275         if (err < 0) {
2276                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2277                 WARN_ON(err == -EMSGSIZE);
2278                 kfree_skb(skb);
2279                 goto errout;
2280         }
2281         err = rtnl_notify(skb, &init_net, info->pid,
2282                                 RTNLGRP_IPV6_ROUTE, info->nlh, gfp_any());
2283 errout:
2284         if (err < 0)
2285                 rtnl_set_sk_err(&init_net, RTNLGRP_IPV6_ROUTE, err);
2286 }
2287
2288 /*
2289  *      /proc
2290  */
2291
2292 #ifdef CONFIG_PROC_FS
2293
/*
 * Width of one route record: the sum of the field widths listed below.
 * NOTE(review): neither this macro nor struct rt6_proc_arg is referenced
 * by the seq_file based code visible in this section -- confirm whether
 * they are still needed.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2304
2305 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2306 {
2307         struct seq_file *m = p_arg;
2308
2309         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2310                    rt->rt6i_dst.plen);
2311
2312 #ifdef CONFIG_IPV6_SUBTREES
2313         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2314                    rt->rt6i_src.plen);
2315 #else
2316         seq_puts(m, "00000000000000000000000000000000 00 ");
2317 #endif
2318
2319         if (rt->rt6i_nexthop) {
2320                 seq_printf(m, NIP6_SEQFMT,
2321                            NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2322         } else {
2323                 seq_puts(m, "00000000000000000000000000000000");
2324         }
2325         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2326                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2327                    rt->u.dst.__use, rt->rt6i_flags,
2328                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2329         return 0;
2330 }
2331
2332 static int ipv6_route_show(struct seq_file *m, void *v)
2333 {
2334         struct net *net = (struct net *)m->private;
2335         fib6_clean_all(net, rt6_info_route, 0, m);
2336         return 0;
2337 }
2338
2339 static int ipv6_route_open(struct inode *inode, struct file *file)
2340 {
2341         struct net *net = get_proc_net(inode);
2342         if (!net)
2343                 return -ENXIO;
2344         return single_open(file, ipv6_route_show, net);
2345 }
2346
2347 static int ipv6_route_release(struct inode *inode, struct file *file)
2348 {
2349         struct seq_file *seq = file->private_data;
2350         struct net *net = seq->private;
2351         put_net(net);
2352         return single_release(inode, file);
2353 }
2354
2355 static const struct file_operations ipv6_route_proc_fops = {
2356         .owner          = THIS_MODULE,
2357         .open           = ipv6_route_open,
2358         .read           = seq_read,
2359         .llseek         = seq_lseek,
2360         .release        = ipv6_route_release,
2361 };
2362
2363 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2364 {
2365         struct net *net = (struct net *)seq->private;
2366         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2367                    net->ipv6.rt6_stats->fib_nodes,
2368                    net->ipv6.rt6_stats->fib_route_nodes,
2369                    net->ipv6.rt6_stats->fib_rt_alloc,
2370                    net->ipv6.rt6_stats->fib_rt_entries,
2371                    net->ipv6.rt6_stats->fib_rt_cache,
2372                    atomic_read(&ip6_dst_ops.entries),
2373                    net->ipv6.rt6_stats->fib_discarded_routes);
2374
2375         return 0;
2376 }
2377
2378 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2379 {
2380         struct net *net = get_proc_net(inode);
2381         return single_open(file, rt6_stats_seq_show, net);
2382 }
2383
2384 static int rt6_stats_seq_release(struct inode *inode, struct file *file)
2385 {
2386         struct seq_file *seq = file->private_data;
2387         struct net *net = (struct net *)seq->private;
2388         put_net(net);
2389         return single_release(inode, file);
2390 }
2391
2392 static const struct file_operations rt6_stats_seq_fops = {
2393         .owner   = THIS_MODULE,
2394         .open    = rt6_stats_seq_open,
2395         .read    = seq_read,
2396         .llseek  = seq_lseek,
2397         .release = rt6_stats_seq_release,
2398 };
2399 #endif  /* CONFIG_PROC_FS */
2400
2401 #ifdef CONFIG_SYSCTL
2402
2403 static
2404 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2405                               void __user *buffer, size_t *lenp, loff_t *ppos)
2406 {
2407         struct net *net = current->nsproxy->net_ns;
2408         int delay = net->ipv6.sysctl.flush_delay;
2409         if (write) {
2410                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2411                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2412                 return 0;
2413         } else
2414                 return -EINVAL;
2415 }
2416
2417 ctl_table ipv6_route_table_template[] = {
2418         {
2419                 .procname       =       "flush",
2420                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2421                 .maxlen         =       sizeof(int),
2422                 .mode           =       0200,
2423                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2424         },
2425         {
2426                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2427                 .procname       =       "gc_thresh",
2428                 .data           =       &ip6_dst_ops.gc_thresh,
2429                 .maxlen         =       sizeof(int),
2430                 .mode           =       0644,
2431                 .proc_handler   =       &proc_dointvec,
2432         },
2433         {
2434                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2435                 .procname       =       "max_size",
2436                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2437                 .maxlen         =       sizeof(int),
2438                 .mode           =       0644,
2439                 .proc_handler   =       &proc_dointvec,
2440         },
2441         {
2442                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2443                 .procname       =       "gc_min_interval",
2444                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2445                 .maxlen         =       sizeof(int),
2446                 .mode           =       0644,
2447                 .proc_handler   =       &proc_dointvec_jiffies,
2448                 .strategy       =       &sysctl_jiffies,
2449         },
2450         {
2451                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2452                 .procname       =       "gc_timeout",
2453                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2454                 .maxlen         =       sizeof(int),
2455                 .mode           =       0644,
2456                 .proc_handler   =       &proc_dointvec_jiffies,
2457                 .strategy       =       &sysctl_jiffies,
2458         },
2459         {
2460                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2461                 .procname       =       "gc_interval",
2462                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2463                 .maxlen         =       sizeof(int),
2464                 .mode           =       0644,
2465                 .proc_handler   =       &proc_dointvec_jiffies,
2466                 .strategy       =       &sysctl_jiffies,
2467         },
2468         {
2469                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2470                 .procname       =       "gc_elasticity",
2471                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2472                 .maxlen         =       sizeof(int),
2473                 .mode           =       0644,
2474                 .proc_handler   =       &proc_dointvec_jiffies,
2475                 .strategy       =       &sysctl_jiffies,
2476         },
2477         {
2478                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2479                 .procname       =       "mtu_expires",
2480                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2481                 .maxlen         =       sizeof(int),
2482                 .mode           =       0644,
2483                 .proc_handler   =       &proc_dointvec_jiffies,
2484                 .strategy       =       &sysctl_jiffies,
2485         },
2486         {
2487                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2488                 .procname       =       "min_adv_mss",
2489                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2490                 .maxlen         =       sizeof(int),
2491                 .mode           =       0644,
2492                 .proc_handler   =       &proc_dointvec_jiffies,
2493                 .strategy       =       &sysctl_jiffies,
2494         },
2495         {
2496                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2497                 .procname       =       "gc_min_interval_ms",
2498                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2499                 .maxlen         =       sizeof(int),
2500                 .mode           =       0644,
2501                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2502                 .strategy       =       &sysctl_ms_jiffies,
2503         },
2504         { .ctl_name = 0 }
2505 };
2506
2507 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2508 {
2509         struct ctl_table *table;
2510
2511         table = kmemdup(ipv6_route_table_template,
2512                         sizeof(ipv6_route_table_template),
2513                         GFP_KERNEL);
2514
2515         if (table) {
2516                 table[0].data = &net->ipv6.sysctl.flush_delay;
2517                 /* table[1].data will be handled when we have
2518                    routes per namespace */
2519                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2520                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2521                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2522                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2523                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2524                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2525                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2526         }
2527
2528         return table;
2529 }
2530 #endif
2531
/*
 * Per-namespace init: create the ipv6_route and rt6_stats proc entries
 * when procfs is configured.  The creation results are not checked;
 * always returns 0.
 */
static int ip6_route_net_init(struct net *net)
{
#ifdef CONFIG_PROC_FS
	/* Mode 0 is passed as-is for ipv6_route. */
	proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
	proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
#endif

	return 0;
}
2540
/*
 * Per-namespace exit: remove the proc entries created by
 * ip6_route_net_init().
 */
static void ip6_route_net_exit(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_net_remove(net, "ipv6_route");
	proc_net_remove(net, "rt6_stats");
#endif
}
2548
2549 static struct pernet_operations ip6_route_net_ops = {
2550         .init = ip6_route_net_init,
2551         .exit = ip6_route_net_exit,
2552 };
2553
2554 int __init ip6_route_init(void)
2555 {
2556         int ret;
2557
2558         ip6_dst_ops.kmem_cachep =
2559                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2560                                   SLAB_HWCACHE_ALIGN, NULL);
2561         if (!ip6_dst_ops.kmem_cachep)
2562                 return -ENOMEM;
2563
2564         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops.kmem_cachep;
2565
2566         ret = fib6_init();
2567         if (ret)
2568                 goto out_kmem_cache;
2569
2570         ret = xfrm6_init();
2571         if (ret)
2572                 goto out_fib6_init;
2573
2574         ret = fib6_rules_init();
2575         if (ret)
2576                 goto xfrm6_init;
2577
2578         ret = -ENOBUFS;
2579         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2580             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2581             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2582                 goto fib6_rules_init;
2583
2584         ret = register_pernet_subsys(&ip6_route_net_ops);
2585         if (ret)
2586                 goto fib6_rules_init;
2587 out:
2588         return ret;
2589
2590 fib6_rules_init:
2591         fib6_rules_cleanup();
2592 xfrm6_init:
2593         xfrm6_fini();
2594 out_fib6_init:
2595         rt6_ifdown(&init_net, NULL);
2596         fib6_gc_cleanup();
2597 out_kmem_cache:
2598         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2599         goto out;
2600 }
2601
2602 void ip6_route_cleanup(void)
2603 {
2604         unregister_pernet_subsys(&ip6_route_net_ops);
2605         fib6_rules_cleanup();
2606         xfrm6_fini();
2607         rt6_ifdown(&init_net, NULL);
2608         fib6_gc_cleanup();
2609         kmem_cache_destroy(ip6_dst_ops.kmem_cachep);
2610 }