Merge branch 'master' of master.kernel.org:/pub/scm/linux/kernel/git/davem/net-2.6
[safe/jmp/linux-2.6] / net / ipv6 / route.c
1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #include <linux/capability.h>
28 #include <linux/errno.h>
29 #include <linux/types.h>
30 #include <linux/times.h>
31 #include <linux/socket.h>
32 #include <linux/sockios.h>
33 #include <linux/net.h>
34 #include <linux/route.h>
35 #include <linux/netdevice.h>
36 #include <linux/in6.h>
37 #include <linux/mroute6.h>
38 #include <linux/init.h>
39 #include <linux/if_arp.h>
40 #include <linux/proc_fs.h>
41 #include <linux/seq_file.h>
42 #include <linux/nsproxy.h>
43 #include <net/net_namespace.h>
44 #include <net/snmp.h>
45 #include <net/ipv6.h>
46 #include <net/ip6_fib.h>
47 #include <net/ip6_route.h>
48 #include <net/ndisc.h>
49 #include <net/addrconf.h>
50 #include <net/tcp.h>
51 #include <linux/rtnetlink.h>
52 #include <net/dst.h>
53 #include <net/xfrm.h>
54 #include <net/netevent.h>
55 #include <net/netlink.h>
56
57 #include <asm/uaccess.h>
58
59 #ifdef CONFIG_SYSCTL
60 #include <linux/sysctl.h>
61 #endif
62
/*
 * Compile-time debug switches for this file.
 *
 * RT6_DEBUG selects verbosity: at 3 or above RDBG() and RT6_TRACE() expand
 * to printk calls, below that they expand to nothing.
 */
63 /* Set to 3 to get tracing. */
64 #define RT6_DEBUG 2
65
66 #if RT6_DEBUG >= 3
/* RDBG() pastes its argument directly after printk, so the argument must be
 * a parenthesised argument list. */
67 #define RDBG(x) printk x
68 #define RT6_TRACE(x...) printk(KERN_DEBUG x)
69 #else
70 #define RDBG(x)
71 #define RT6_TRACE(x...) do { ; } while (0)
72 #endif
73
/*
 * When 0, ip6_pol_route() returns a route that already has a nexthop (or is
 * RTF_NONEXTHOP) as-is instead of cloning it with rt6_alloc_clone().
 */
74 #define CLONE_OFFLINK_ROUTE 0
75
/*
 * Forward declarations for file-local helpers that are referenced before
 * their definitions: the dst_ops callbacks installed in
 * ip6_dst_ops_template, the input/output handlers used by the reject-route
 * templates, and ip6_rt_copy() used by the route-cloning helpers.
 */
76 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort);
77 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
78 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
79 static void             ip6_dst_destroy(struct dst_entry *);
80 static void             ip6_dst_ifdown(struct dst_entry *,
81                                        struct net_device *dev, int how);
82 static int               ip6_dst_gc(struct dst_ops *ops);
83
84 static int              ip6_pkt_discard(struct sk_buff *skb);
85 static int              ip6_pkt_discard_out(struct sk_buff *skb);
86 static void             ip6_link_failure(struct sk_buff *skb);
87 static void             ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu);
88
/* Route Information option support; both helpers are used by rt6_route_rcv(). */
89 #ifdef CONFIG_IPV6_ROUTE_INFO
90 static struct rt6_info *rt6_add_route_info(struct net *net,
91                                            struct in6_addr *prefix, int prefixlen,
92                                            struct in6_addr *gwaddr, int ifindex,
93                                            unsigned pref);
94 static struct rt6_info *rt6_get_route_info(struct net *net,
95                                            struct in6_addr *prefix, int prefixlen,
96                                            struct in6_addr *gwaddr, int ifindex);
97 #endif
98
/*
 * dst_ops template for IPv6 routes: wires the generic dst cache hooks to the
 * ip6_* callbacks in this file and sizes entries as struct rt6_info.
 * NOTE(review): callers in this file allocate through net->ipv6.ip6_dst_ops,
 * presumably a per-namespace copy of this template - confirm in the init code.
 */
99 static struct dst_ops ip6_dst_ops_template = {
100         .family                 =       AF_INET6,
101         .protocol               =       __constant_htons(ETH_P_IPV6),
102         .gc                     =       ip6_dst_gc,
103         .gc_thresh              =       1024,
104         .check                  =       ip6_dst_check,
105         .destroy                =       ip6_dst_destroy,
106         .ifdown                 =       ip6_dst_ifdown,
107         .negative_advice        =       ip6_negative_advice,
108         .link_failure           =       ip6_link_failure,
109         .update_pmtu            =       ip6_rt_update_pmtu,
110         .local_out              =       __ip6_local_out,
111         .entry_size             =       sizeof(struct rt6_info),
112         .entries                =       ATOMIC_INIT(0),
113 };
114
/*
 * update_pmtu hook for ip6_dst_blackhole_ops: intentionally a no-op, so PMTU
 * updates delivered to a blackhole dst leave its metrics untouched.
 */
115 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu)
116 {
117 }
118
/*
 * dst_ops used for the entries allocated by ip6_dst_blackhole(). Only the
 * destroy, check and (no-op) update_pmtu hooks are provided; no gc, ifdown,
 * negative_advice or link_failure callbacks are set.
 */
119 static struct dst_ops ip6_dst_blackhole_ops = {
120         .family                 =       AF_INET6,
121         .protocol               =       __constant_htons(ETH_P_IPV6),
122         .destroy                =       ip6_dst_destroy,
123         .check                  =       ip6_dst_check,
124         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
125         .entry_size             =       sizeof(struct rt6_info),
126         .entries                =       ATOMIC_INIT(0),
127 };
128
/*
 * Template for the "no route" entry: a reject route with the largest
 * possible metric whose dst reports -ENETUNREACH and hands packets to
 * ip6_pkt_discard()/ip6_pkt_discard_out().
 * NOTE(review): lookups in this file return net->ipv6.ip6_null_entry,
 * presumably a per-namespace copy of this template - confirm in the init code.
 */
129 static struct rt6_info ip6_null_entry_template = {
130         .u = {
131                 .dst = {
132                         .__refcnt       = ATOMIC_INIT(1),
133                         .__use          = 1,
134                         .obsolete       = -1,
135                         .error          = -ENETUNREACH,
136                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
137                         .input          = ip6_pkt_discard,
138                         .output         = ip6_pkt_discard_out,
139                 }
140         },
141         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
142         .rt6i_metric    = ~(u32) 0,
143         .rt6i_ref       = ATOMIC_INIT(1),
144 };
145
146 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
147
/* Handlers installed in ip6_prohibit_entry_template; declared here because
 * they are referenced before their definitions. */
148 static int ip6_pkt_prohibit(struct sk_buff *skb);
149 static int ip6_pkt_prohibit_out(struct sk_buff *skb);
150
/*
 * Template for the "prohibit" reject route (only built with
 * CONFIG_IPV6_MULTIPLE_TABLES): same shape as the null entry, but the dst
 * reports -EACCES and uses the ip6_pkt_prohibit*() handlers.
 */
151 static struct rt6_info ip6_prohibit_entry_template = {
152         .u = {
153                 .dst = {
154                         .__refcnt       = ATOMIC_INIT(1),
155                         .__use          = 1,
156                         .obsolete       = -1,
157                         .error          = -EACCES,
158                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
159                         .input          = ip6_pkt_prohibit,
160                         .output         = ip6_pkt_prohibit_out,
161                 }
162         },
163         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
164         .rt6i_metric    = ~(u32) 0,
165         .rt6i_ref       = ATOMIC_INIT(1),
166 };
167
/*
 * Template for the "blackhole" reject route (only built with
 * CONFIG_IPV6_MULTIPLE_TABLES): the dst reports -EINVAL and both input and
 * output are dst_discard, i.e. no dedicated ip6_pkt_* handler is involved.
 */
168 static struct rt6_info ip6_blk_hole_entry_template = {
169         .u = {
170                 .dst = {
171                         .__refcnt       = ATOMIC_INIT(1),
172                         .__use          = 1,
173                         .obsolete       = -1,
174                         .error          = -EINVAL,
175                         .metrics        = { [RTAX_HOPLIMIT - 1] = 255, },
176                         .input          = dst_discard,
177                         .output         = dst_discard,
178                 }
179         },
180         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
181         .rt6i_metric    = ~(u32) 0,
182         .rt6i_ref       = ATOMIC_INIT(1),
183 };
184
185 #endif
186
/*
 * Allocate a dst entry from @ops and return it typed as struct rt6_info.
 * Returns whatever dst_alloc() returned, including NULL on failure.
 */
static inline struct rt6_info *ip6_dst_alloc(struct dst_ops *ops)
{
	struct rt6_info *rt;

	rt = (struct rt6_info *)dst_alloc(ops);
	return rt;
}
192
193 static void ip6_dst_destroy(struct dst_entry *dst)
194 {
195         struct rt6_info *rt = (struct rt6_info *)dst;
196         struct inet6_dev *idev = rt->rt6i_idev;
197
198         if (idev != NULL) {
199                 rt->rt6i_idev = NULL;
200                 in6_dev_put(idev);
201         }
202 }
203
204 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
205                            int how)
206 {
207         struct rt6_info *rt = (struct rt6_info *)dst;
208         struct inet6_dev *idev = rt->rt6i_idev;
209         struct net_device *loopback_dev =
210                 dev_net(dev)->loopback_dev;
211
212         if (dev != loopback_dev && idev != NULL && idev->dev == dev) {
213                 struct inet6_dev *loopback_idev =
214                         in6_dev_get(loopback_dev);
215                 if (loopback_idev != NULL) {
216                         rt->rt6i_idev = loopback_idev;
217                         in6_dev_put(idev);
218                 }
219         }
220 }
221
222 static __inline__ int rt6_check_expired(const struct rt6_info *rt)
223 {
224         return (rt->rt6i_flags & RTF_EXPIRES &&
225                 time_after(jiffies, rt->rt6i_expires));
226 }
227
228 static inline int rt6_need_strict(struct in6_addr *daddr)
229 {
230         return (ipv6_addr_type(daddr) &
231                 (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL));
232 }
233
234 /*
235  *      Route lookup. Any table->tb6_lock is implied.
236  */
237
238 static inline struct rt6_info *rt6_device_match(struct net *net,
239                                                     struct rt6_info *rt,
240                                                     int oif,
241                                                     int strict)
242 {
243         struct rt6_info *local = NULL;
244         struct rt6_info *sprt;
245
246         if (oif) {
247                 for (sprt = rt; sprt; sprt = sprt->u.dst.rt6_next) {
248                         struct net_device *dev = sprt->rt6i_dev;
249                         if (dev->ifindex == oif)
250                                 return sprt;
251                         if (dev->flags & IFF_LOOPBACK) {
252                                 if (sprt->rt6i_idev == NULL ||
253                                     sprt->rt6i_idev->dev->ifindex != oif) {
254                                         if (strict && oif)
255                                                 continue;
256                                         if (local && (!oif ||
257                                                       local->rt6i_idev->dev->ifindex == oif))
258                                                 continue;
259                                 }
260                                 local = sprt;
261                         }
262                 }
263
264                 if (local)
265                         return local;
266
267                 if (strict)
268                         return net->ipv6.ip6_null_entry;
269         }
270         return rt;
271 }
272
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Router Reachability Probing: if the nexthop neighbour of @rt is not in a
 * NUD_VALID state and the last update is older than the interface's
 * rtr_probe_interval, send a Neighbour Solicitation to its solicited-node
 * multicast address. A NULL @rt or a route without nexthop is ignored.
 *
 * The probe is rate-limited by stamping neigh->updated with the current
 * jiffies before sending.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct neighbour *neigh = rt ? rt->rt6i_nexthop : NULL;
	struct in6_addr mcaddr;
	struct in6_addr *target;
	int due;

	/* Cheap unlocked pre-check; re-evaluated under the lock below. */
	if (!neigh || (neigh->nud_state & NUD_VALID))
		return;

	read_lock_bh(&neigh->lock);
	due = !(neigh->nud_state & NUD_VALID) &&
	      time_after(jiffies, neigh->updated + rt->rt6i_idev->cnf.rtr_probe_interval);
	if (due)
		neigh->updated = jiffies;
	read_unlock_bh(&neigh->lock);

	if (!due)
		return;

	/* The solicitation is sent after the neighbour lock is dropped. */
	target = (struct in6_addr *)&neigh->primary_key;
	addrconf_addr_solict_mult(target, &mcaddr);
	ndisc_send_ns(rt->rt6i_dev, NULL, target, &mcaddr, NULL);
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF probing is compiled out. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
308
309 /*
310  * Default Router Selection (RFC 2461 6.3.6)
311  */
312 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
313 {
314         struct net_device *dev = rt->rt6i_dev;
315         if (!oif || dev->ifindex == oif)
316                 return 2;
317         if ((dev->flags & IFF_LOOPBACK) &&
318             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
319                 return 1;
320         return 0;
321 }
322
323 static inline int rt6_check_neigh(struct rt6_info *rt)
324 {
325         struct neighbour *neigh = rt->rt6i_nexthop;
326         int m;
327         if (rt->rt6i_flags & RTF_NONEXTHOP ||
328             !(rt->rt6i_flags & RTF_GATEWAY))
329                 m = 1;
330         else if (neigh) {
331                 read_lock_bh(&neigh->lock);
332                 if (neigh->nud_state & NUD_VALID)
333                         m = 2;
334 #ifdef CONFIG_IPV6_ROUTER_PREF
335                 else if (neigh->nud_state & NUD_FAILED)
336                         m = 0;
337 #endif
338                 else
339                         m = 1;
340                 read_unlock_bh(&neigh->lock);
341         } else
342                 m = 0;
343         return m;
344 }
345
346 static int rt6_score_route(struct rt6_info *rt, int oif,
347                            int strict)
348 {
349         int m, n;
350
351         m = rt6_check_dev(rt, oif);
352         if (!m && (strict & RT6_LOOKUP_F_IFACE))
353                 return -1;
354 #ifdef CONFIG_IPV6_ROUTER_PREF
355         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
356 #endif
357         n = rt6_check_neigh(rt);
358         if (!n && (strict & RT6_LOOKUP_F_REACHABLE))
359                 return -1;
360         return m;
361 }
362
363 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
364                                    int *mpri, struct rt6_info *match)
365 {
366         int m;
367
368         if (rt6_check_expired(rt))
369                 goto out;
370
371         m = rt6_score_route(rt, oif, strict);
372         if (m < 0)
373                 goto out;
374
375         if (m > *mpri) {
376                 if (strict & RT6_LOOKUP_F_REACHABLE)
377                         rt6_probe(match);
378                 *mpri = m;
379                 match = rt;
380         } else if (strict & RT6_LOOKUP_F_REACHABLE) {
381                 rt6_probe(rt);
382         }
383
384 out:
385         return match;
386 }
387
388 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
389                                      struct rt6_info *rr_head,
390                                      u32 metric, int oif, int strict)
391 {
392         struct rt6_info *rt, *match;
393         int mpri = -1;
394
395         match = NULL;
396         for (rt = rr_head; rt && rt->rt6i_metric == metric;
397              rt = rt->u.dst.rt6_next)
398                 match = find_match(rt, oif, strict, &mpri, match);
399         for (rt = fn->leaf; rt && rt != rr_head && rt->rt6i_metric == metric;
400              rt = rt->u.dst.rt6_next)
401                 match = find_match(rt, oif, strict, &mpri, match);
402
403         return match;
404 }
405
406 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
407 {
408         struct rt6_info *match, *rt0;
409         struct net *net;
410
411         RT6_TRACE("%s(fn->leaf=%p, oif=%d)\n",
412                   __func__, fn->leaf, oif);
413
414         rt0 = fn->rr_ptr;
415         if (!rt0)
416                 fn->rr_ptr = rt0 = fn->leaf;
417
418         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict);
419
420         if (!match &&
421             (strict & RT6_LOOKUP_F_REACHABLE)) {
422                 struct rt6_info *next = rt0->u.dst.rt6_next;
423
424                 /* no entries matched; do round-robin */
425                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
426                         next = fn->leaf;
427
428                 if (next != rt0)
429                         fn->rr_ptr = next;
430         }
431
432         RT6_TRACE("%s() => %p\n",
433                   __func__, match);
434
435         net = dev_net(rt0->rt6i_dev);
436         return (match ? match : net->ipv6.ip6_null_entry);
437 }
438
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process one Route Information option received from router @gwaddr on @dev.
 *
 * @opt points at the option and @len is the number of bytes available.
 * The option is rejected with -EINVAL when it is shorter than
 * struct route_info, when its length/prefix_len fields are inconsistent, or
 * when it carries the reserved preference value (RFC 4191 section 2.3
 * requires such an option to be ignored).
 *
 * A zero lifetime deletes an existing matching route; a non-zero lifetime
 * adds the route if it is missing, or refreshes the flags and preference of
 * the existing one. Finite lifetimes set RTF_EXPIRES and rt6i_expires,
 * otherwise RTF_EXPIRES is cleared.
 *
 * Returns 0 on success (including when no route results) or -EINVAL.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  struct in6_addr *gwaddr)
{
	struct net *net = dev_net(dev);
	struct route_info *rinfo = (struct route_info *) opt;
	struct in6_addr prefix_buf, *prefix;
	unsigned int pref;
	unsigned long lifetime;
	struct rt6_info *rt;

	/*
	 * Check the sign explicitly: comparing a negative int against
	 * sizeof() would convert it to a huge unsigned value and pass.
	 */
	if (len < 0 || (size_t)len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 128) {
		return -EINVAL;
	} else if (rinfo->prefix_len > 64) {
		if (rinfo->length < 2) {
			return -EINVAL;
		}
	} else if (rinfo->prefix_len > 0) {
		if (rinfo->length < 1) {
			return -EINVAL;
		}
	}

	/* The reserved preference value means the option must be ignored. */
	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3)
		prefix = (struct in6_addr *)rinfo->prefix;
	else {
		/* Shorter option: copy only prefix_len bits into a local buffer. */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	rt = rt6_get_route_info(net, prefix, rinfo->prefix_len, gwaddr,
				dev->ifindex);

	/* Lifetime 0 withdraws a previously learned route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt && lifetime)
		rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr, dev->ifindex,
					pref);
	else if (rt)
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);

	if (rt) {
		if (!addrconf_finite_timeout(lifetime)) {
			rt->rt6i_flags &= ~RTF_EXPIRES;
		} else {
			rt->rt6i_expires = jiffies + HZ * lifetime;
			rt->rt6i_flags |= RTF_EXPIRES;
		}
		/* Release the route obtained from the get/add helper above. */
		dst_release(&rt->u.dst);
	}
	return 0;
}
#endif
512
/*
 * BACKTRACK - climb the fib6 tree when the current node produced no route.
 *
 * The macro relies on the enclosing function providing the locals `rt` and
 * `fn` and the labels `restart` and `out`.
 *
 * If `rt` is the namespace's null entry it loops: jump to `out` when the
 * current node is flagged RTN_TL_ROOT; otherwise step to the parent, or, if
 * the parent has a subtree that is not the current node, to the result of
 * fib6_lookup() in that subtree keyed by `saddr`. It jumps to `restart` as
 * soon as the new node is flagged RTN_RTINFO. If `rt` is any other route the
 * macro does nothing.
 */
513 #define BACKTRACK(__net, saddr)                 \
514 do { \
515         if (rt == __net->ipv6.ip6_null_entry) { \
516                 struct fib6_node *pn; \
517                 while (1) { \
518                         if (fn->fn_flags & RTN_TL_ROOT) \
519                                 goto out; \
520                         pn = fn->parent; \
521                         if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn) \
522                                 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr); \
523                         else \
524                                 fn = pn; \
525                         if (fn->fn_flags & RTN_RTINFO) \
526                                 goto restart; \
527                 } \
528         } \
529 } while(0)
530
531 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
532                                              struct fib6_table *table,
533                                              struct flowi *fl, int flags)
534 {
535         struct fib6_node *fn;
536         struct rt6_info *rt;
537
538         read_lock_bh(&table->tb6_lock);
539         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
540 restart:
541         rt = fn->leaf;
542         rt = rt6_device_match(net, rt, fl->oif, flags);
543         BACKTRACK(net, &fl->fl6_src);
544 out:
545         dst_use(&rt->u.dst, jiffies);
546         read_unlock_bh(&table->tb6_lock);
547         return rt;
548
549 }
550
551 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
552                             const struct in6_addr *saddr, int oif, int strict)
553 {
554         struct flowi fl = {
555                 .oif = oif,
556                 .nl_u = {
557                         .ip6_u = {
558                                 .daddr = *daddr,
559                         },
560                 },
561         };
562         struct dst_entry *dst;
563         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
564
565         if (saddr) {
566                 memcpy(&fl.fl6_src, saddr, sizeof(*saddr));
567                 flags |= RT6_LOOKUP_F_HAS_SADDR;
568         }
569
570         dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_lookup);
571         if (dst->error == 0)
572                 return (struct rt6_info *) dst;
573
574         dst_release(dst);
575
576         return NULL;
577 }
578
579 EXPORT_SYMBOL(rt6_lookup);
580
581 /* ip6_ins_rt must be called with table->tb6_lock NOT held.
582    It takes a new route entry; if the addition fails for any reason,
583    the route is freed. In either case, if the caller does not hold the
584    route, it may be destroyed.
585  */
586
587 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info)
588 {
589         int err;
590         struct fib6_table *table;
591
592         table = rt->rt6i_table;
593         write_lock_bh(&table->tb6_lock);
594         err = fib6_add(&table->tb6_root, rt, info);
595         write_unlock_bh(&table->tb6_lock);
596
597         return err;
598 }
599
600 int ip6_ins_rt(struct rt6_info *rt)
601 {
602         struct nl_info info = {
603                 .nl_net = dev_net(rt->rt6i_dev),
604         };
605         return __ip6_ins_rt(rt, &info);
606 }
607
608 static struct rt6_info *rt6_alloc_cow(struct rt6_info *ort, struct in6_addr *daddr,
609                                       struct in6_addr *saddr)
610 {
611         struct rt6_info *rt;
612
613         /*
614          *      Clone the route.
615          */
616
617         rt = ip6_rt_copy(ort);
618
619         if (rt) {
620                 if (!(rt->rt6i_flags&RTF_GATEWAY)) {
621                         if (rt->rt6i_dst.plen != 128 &&
622                             ipv6_addr_equal(&rt->rt6i_dst.addr, daddr))
623                                 rt->rt6i_flags |= RTF_ANYCAST;
624                         ipv6_addr_copy(&rt->rt6i_gateway, daddr);
625                 }
626
627                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
628                 rt->rt6i_dst.plen = 128;
629                 rt->rt6i_flags |= RTF_CACHE;
630                 rt->u.dst.flags |= DST_HOST;
631
632 #ifdef CONFIG_IPV6_SUBTREES
633                 if (rt->rt6i_src.plen && saddr) {
634                         ipv6_addr_copy(&rt->rt6i_src.addr, saddr);
635                         rt->rt6i_src.plen = 128;
636                 }
637 #endif
638
639                 rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
640
641         }
642
643         return rt;
644 }
645
646 static struct rt6_info *rt6_alloc_clone(struct rt6_info *ort, struct in6_addr *daddr)
647 {
648         struct rt6_info *rt = ip6_rt_copy(ort);
649         if (rt) {
650                 ipv6_addr_copy(&rt->rt6i_dst.addr, daddr);
651                 rt->rt6i_dst.plen = 128;
652                 rt->rt6i_flags |= RTF_CACHE;
653                 rt->u.dst.flags |= DST_HOST;
654                 rt->rt6i_nexthop = neigh_clone(ort->rt6i_nexthop);
655         }
656         return rt;
657 }
658
/*
 * Core per-table route resolution shared by ip6_pol_route_input() and
 * ip6_pol_route_output(), which pass fl->iif and fl->oif respectively as @oif.
 *
 * Looks up fl's destination in @table and selects a route with rt6_select().
 * A selected route that has no nexthop neighbour and is not RTF_NONEXTHOP is
 * copied into a host route with rt6_alloc_cow() and inserted into the table;
 * the lookup is retried (at most 3 attempts in total) if that insertion
 * fails. Every return path takes a dst_hold() on the route that is returned
 * and updates its lastuse/__use counters.
 */
659 static struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int oif,
660                                       struct flowi *fl, int flags)
661 {
662         struct fib6_node *fn;
663         struct rt6_info *rt, *nrt;
664         int strict = 0;
665         int attempts = 3;
666         int err;
        /* With forwarding disabled the first pass insists on a reachable
         * nexthop; the "out" label below retries once without that. */
667         int reachable = ipv6_devconf.forwarding ? 0 : RT6_LOOKUP_F_REACHABLE;
668
        /* Only the interface-match bit of flags is forwarded to rt6_select(). */
669         strict |= flags & RT6_LOOKUP_F_IFACE;
670
671 relookup:
672         read_lock_bh(&table->tb6_lock);
673
674 restart_2:
675         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
676
677 restart:
678         rt = rt6_select(fn, oif, strict | reachable);
679
        /* May jump to "restart" with a new fn, or to "out" at the table root. */
680         BACKTRACK(net, &fl->fl6_src);
        /* Nothing to clone for the null entry or an already cached route. */
681         if (rt == net->ipv6.ip6_null_entry ||
682             rt->rt6i_flags & RTF_CACHE)
683                 goto out;
684
        /* Pin the route before the table lock is dropped. */
685         dst_hold(&rt->u.dst);
686         read_unlock_bh(&table->tb6_lock);
687
688         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
689                 nrt = rt6_alloc_cow(rt, &fl->fl6_dst, &fl->fl6_src);
690         else {
691 #if CLONE_OFFLINK_ROUTE
692                 nrt = rt6_alloc_clone(rt, &fl->fl6_dst);
693 #else
                /* Cloning disabled: return the route itself, still held. */
694                 goto out2;
695 #endif
696         }
697
        /* Switch from the original to the copy, or to the null entry if the
         * copy could not be allocated. */
698         dst_release(&rt->u.dst);
699         rt = nrt ? : net->ipv6.ip6_null_entry;
700
701         dst_hold(&rt->u.dst);
702         if (nrt) {
703                 err = ip6_ins_rt(nrt);
704                 if (!err)
705                         goto out2;
706         }
707
        /* Out of retries: return what we have (the copy or the null entry). */
708         if (--attempts <= 0)
709                 goto out2;
710
711         /*
712          * Race condition! In the gap, when table->tb6_lock was
713          * released someone could insert this route.  Relookup.
714          */
715         dst_release(&rt->u.dst);
716         goto relookup;
717
718 out:
        /* Reached with tb6_lock held. If the reachability requirement is
         * still active, drop it and redo the lookup once. */
719         if (reachable) {
720                 reachable = 0;
721                 goto restart_2;
722         }
723         dst_hold(&rt->u.dst);
724         read_unlock_bh(&table->tb6_lock);
725 out2:
726         rt->u.dst.lastuse = jiffies;
727         rt->u.dst.__use++;
728
729         return rt;
730 }
731
732 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
733                                             struct flowi *fl, int flags)
734 {
735         return ip6_pol_route(net, table, fl->iif, fl, flags);
736 }
737
738 void ip6_route_input(struct sk_buff *skb)
739 {
740         struct ipv6hdr *iph = ipv6_hdr(skb);
741         struct net *net = dev_net(skb->dev);
742         int flags = RT6_LOOKUP_F_HAS_SADDR;
743         struct flowi fl = {
744                 .iif = skb->dev->ifindex,
745                 .nl_u = {
746                         .ip6_u = {
747                                 .daddr = iph->daddr,
748                                 .saddr = iph->saddr,
749                                 .flowlabel = (* (__be32 *) iph)&IPV6_FLOWINFO_MASK,
750                         },
751                 },
752                 .mark = skb->mark,
753                 .proto = iph->nexthdr,
754         };
755
756         if (rt6_need_strict(&iph->daddr))
757                 flags |= RT6_LOOKUP_F_IFACE;
758
759         skb->dst = fib6_rule_lookup(net, &fl, flags, ip6_pol_route_input);
760 }
761
762 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
763                                              struct flowi *fl, int flags)
764 {
765         return ip6_pol_route(net, table, fl->oif, fl, flags);
766 }
767
768 struct dst_entry * ip6_route_output(struct net *net, struct sock *sk,
769                                     struct flowi *fl)
770 {
771         int flags = 0;
772
773         if (rt6_need_strict(&fl->fl6_dst))
774                 flags |= RT6_LOOKUP_F_IFACE;
775
776         if (!ipv6_addr_any(&fl->fl6_src))
777                 flags |= RT6_LOOKUP_F_HAS_SADDR;
778         else if (sk) {
779                 unsigned int prefs = inet6_sk(sk)->srcprefs;
780                 if (prefs & IPV6_PREFER_SRC_TMP)
781                         flags |= RT6_LOOKUP_F_SRCPREF_TMP;
782                 if (prefs & IPV6_PREFER_SRC_PUBLIC)
783                         flags |= RT6_LOOKUP_F_SRCPREF_PUBLIC;
784                 if (prefs & IPV6_PREFER_SRC_COA)
785                         flags |= RT6_LOOKUP_F_SRCPREF_COA;
786         }
787
788         return fib6_rule_lookup(net, fl, flags, ip6_pol_route_output);
789 }
790
791 EXPORT_SYMBOL(ip6_route_output);
792
793 int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl)
794 {
795         struct rt6_info *ort = (struct rt6_info *) *dstp;
796         struct rt6_info *rt = (struct rt6_info *)
797                 dst_alloc(&ip6_dst_blackhole_ops);
798         struct dst_entry *new = NULL;
799
800         if (rt) {
801                 new = &rt->u.dst;
802
803                 atomic_set(&new->__refcnt, 1);
804                 new->__use = 1;
805                 new->input = dst_discard;
806                 new->output = dst_discard;
807
808                 memcpy(new->metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
809                 new->dev = ort->u.dst.dev;
810                 if (new->dev)
811                         dev_hold(new->dev);
812                 rt->rt6i_idev = ort->rt6i_idev;
813                 if (rt->rt6i_idev)
814                         in6_dev_hold(rt->rt6i_idev);
815                 rt->rt6i_expires = 0;
816
817                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
818                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
819                 rt->rt6i_metric = 0;
820
821                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
822 #ifdef CONFIG_IPV6_SUBTREES
823                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
824 #endif
825
826                 dst_free(new);
827         }
828
829         dst_release(*dstp);
830         *dstp = new;
831         return (new ? 0 : -ENOMEM);
832 }
833 EXPORT_SYMBOL_GPL(ip6_dst_blackhole);
834
835 /*
836  *      Destination cache support functions
837  */
838
839 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
840 {
841         struct rt6_info *rt;
842
843         rt = (struct rt6_info *) dst;
844
845         if (rt && rt->rt6i_node && (rt->rt6i_node->fn_sernum == cookie))
846                 return dst;
847
848         return NULL;
849 }
850
851 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
852 {
853         struct rt6_info *rt = (struct rt6_info *) dst;
854
855         if (rt) {
856                 if (rt->rt6i_flags & RTF_CACHE)
857                         ip6_del_rt(rt);
858                 else
859                         dst_release(dst);
860         }
861         return NULL;
862 }
863
864 static void ip6_link_failure(struct sk_buff *skb)
865 {
866         struct rt6_info *rt;
867
868         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0, skb->dev);
869
870         rt = (struct rt6_info *) skb->dst;
871         if (rt) {
872                 if (rt->rt6i_flags&RTF_CACHE) {
873                         dst_set_expires(&rt->u.dst, 0);
874                         rt->rt6i_flags |= RTF_EXPIRES;
875                 } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT))
876                         rt->rt6i_node->fn_sernum = -1;
877         }
878 }
879
880 static void ip6_rt_update_pmtu(struct dst_entry *dst, u32 mtu)
881 {
882         struct rt6_info *rt6 = (struct rt6_info*)dst;
883
884         if (mtu < dst_mtu(dst) && rt6->rt6i_dst.plen == 128) {
885                 rt6->rt6i_flags |= RTF_MODIFIED;
886                 if (mtu < IPV6_MIN_MTU) {
887                         mtu = IPV6_MIN_MTU;
888                         dst->metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
889                 }
890                 dst->metrics[RTAX_MTU-1] = mtu;
891                 call_netevent_notifiers(NETEVENT_PMTU_UPDATE, dst);
892         }
893 }
894
895 static int ipv6_get_mtu(struct net_device *dev);
896
897 static inline unsigned int ipv6_advmss(struct net *net, unsigned int mtu)
898 {
899         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
900
901         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
902                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
903
904         /*
905          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
906          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
907          * IPV6_MAXPLEN is also valid and means: "any MSS,
908          * rely only on pmtu discovery"
909          */
910         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
911                 mtu = IPV6_MAXPLEN;
912         return mtu;
913 }
914
915 static struct dst_entry *icmp6_dst_gc_list;
916 static DEFINE_SPINLOCK(icmp6_dst_lock);
917
918 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
919                                   struct neighbour *neigh,
920                                   const struct in6_addr *addr)
921 {
922         struct rt6_info *rt;
923         struct inet6_dev *idev = in6_dev_get(dev);
924         struct net *net = dev_net(dev);
925
926         if (unlikely(idev == NULL))
927                 return NULL;
928
929         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
930         if (unlikely(rt == NULL)) {
931                 in6_dev_put(idev);
932                 goto out;
933         }
934
935         dev_hold(dev);
936         if (neigh)
937                 neigh_hold(neigh);
938         else
939                 neigh = ndisc_get_neigh(dev, addr);
940
941         rt->rt6i_dev      = dev;
942         rt->rt6i_idev     = idev;
943         rt->rt6i_nexthop  = neigh;
944         atomic_set(&rt->u.dst.__refcnt, 1);
945         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = 255;
946         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
947         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
948         rt->u.dst.output  = ip6_output;
949
950 #if 0   /* there's no chance to use these for ndisc */
951         rt->u.dst.flags   = ipv6_addr_type(addr) & IPV6_ADDR_UNICAST
952                                 ? DST_HOST
953                                 : 0;
954         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
955         rt->rt6i_dst.plen = 128;
956 #endif
957
958         spin_lock_bh(&icmp6_dst_lock);
959         rt->u.dst.next = icmp6_dst_gc_list;
960         icmp6_dst_gc_list = &rt->u.dst;
961         spin_unlock_bh(&icmp6_dst_lock);
962
963         fib6_force_start_gc(net);
964
965 out:
966         return &rt->u.dst;
967 }
968
969 int icmp6_dst_gc(int *more)
970 {
971         struct dst_entry *dst, *next, **pprev;
972         int freed;
973
974         next = NULL;
975         freed = 0;
976
977         spin_lock_bh(&icmp6_dst_lock);
978         pprev = &icmp6_dst_gc_list;
979
980         while ((dst = *pprev) != NULL) {
981                 if (!atomic_read(&dst->__refcnt)) {
982                         *pprev = dst->next;
983                         dst_free(dst);
984                         freed++;
985                 } else {
986                         pprev = &dst->next;
987                         (*more)++;
988                 }
989         }
990
991         spin_unlock_bh(&icmp6_dst_lock);
992
993         return freed;
994 }
995
996 static int ip6_dst_gc(struct dst_ops *ops)
997 {
998         unsigned long now = jiffies;
999         struct net *net = ops->dst_net;
1000         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1001         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1002         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1003         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1004         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1005
1006         if (time_after(rt_last_gc + rt_min_interval, now) &&
1007             atomic_read(&ops->entries) <= rt_max_size)
1008                 goto out;
1009
1010         net->ipv6.ip6_rt_gc_expire++;
1011         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net);
1012         net->ipv6.ip6_rt_last_gc = now;
1013         if (atomic_read(&ops->entries) < ops->gc_thresh)
1014                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1015 out:
1016         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1017         return (atomic_read(&ops->entries) > rt_max_size);
1018 }
1019
1020 /* Clean host part of a prefix. Not necessary in radix tree,
1021    but results in cleaner routing tables.
1022
1023    Remove it only when all the things will work!
1024  */
1025
1026 static int ipv6_get_mtu(struct net_device *dev)
1027 {
1028         int mtu = IPV6_MIN_MTU;
1029         struct inet6_dev *idev;
1030
1031         idev = in6_dev_get(dev);
1032         if (idev) {
1033                 mtu = idev->cnf.mtu6;
1034                 in6_dev_put(idev);
1035         }
1036         return mtu;
1037 }
1038
1039 int ip6_dst_hoplimit(struct dst_entry *dst)
1040 {
1041         int hoplimit = dst_metric(dst, RTAX_HOPLIMIT);
1042         if (hoplimit < 0) {
1043                 struct net_device *dev = dst->dev;
1044                 struct inet6_dev *idev = in6_dev_get(dev);
1045                 if (idev) {
1046                         hoplimit = idev->cnf.hop_limit;
1047                         in6_dev_put(idev);
1048                 } else
1049                         hoplimit = ipv6_devconf.hop_limit;
1050         }
1051         return hoplimit;
1052 }
1053
1054 /*
1055  *
1056  */
1057
1058 int ip6_route_add(struct fib6_config *cfg)
1059 {
1060         int err;
1061         struct net *net = cfg->fc_nlinfo.nl_net;
1062         struct rt6_info *rt = NULL;
1063         struct net_device *dev = NULL;
1064         struct inet6_dev *idev = NULL;
1065         struct fib6_table *table;
1066         int addr_type;
1067
1068         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1069                 return -EINVAL;
1070 #ifndef CONFIG_IPV6_SUBTREES
1071         if (cfg->fc_src_len)
1072                 return -EINVAL;
1073 #endif
1074         if (cfg->fc_ifindex) {
1075                 err = -ENODEV;
1076                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1077                 if (!dev)
1078                         goto out;
1079                 idev = in6_dev_get(dev);
1080                 if (!idev)
1081                         goto out;
1082         }
1083
1084         if (cfg->fc_metric == 0)
1085                 cfg->fc_metric = IP6_RT_PRIO_USER;
1086
1087         table = fib6_new_table(net, cfg->fc_table);
1088         if (table == NULL) {
1089                 err = -ENOBUFS;
1090                 goto out;
1091         }
1092
1093         rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1094
1095         if (rt == NULL) {
1096                 err = -ENOMEM;
1097                 goto out;
1098         }
1099
1100         rt->u.dst.obsolete = -1;
1101         rt->rt6i_expires = (cfg->fc_flags & RTF_EXPIRES) ?
1102                                 jiffies + clock_t_to_jiffies(cfg->fc_expires) :
1103                                 0;
1104
1105         if (cfg->fc_protocol == RTPROT_UNSPEC)
1106                 cfg->fc_protocol = RTPROT_BOOT;
1107         rt->rt6i_protocol = cfg->fc_protocol;
1108
1109         addr_type = ipv6_addr_type(&cfg->fc_dst);
1110
1111         if (addr_type & IPV6_ADDR_MULTICAST)
1112                 rt->u.dst.input = ip6_mc_input;
1113         else
1114                 rt->u.dst.input = ip6_forward;
1115
1116         rt->u.dst.output = ip6_output;
1117
1118         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1119         rt->rt6i_dst.plen = cfg->fc_dst_len;
1120         if (rt->rt6i_dst.plen == 128)
1121                rt->u.dst.flags = DST_HOST;
1122
1123 #ifdef CONFIG_IPV6_SUBTREES
1124         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1125         rt->rt6i_src.plen = cfg->fc_src_len;
1126 #endif
1127
1128         rt->rt6i_metric = cfg->fc_metric;
1129
1130         /* We cannot add true routes via loopback here,
1131            they would result in kernel looping; promote them to reject routes
1132          */
1133         if ((cfg->fc_flags & RTF_REJECT) ||
1134             (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) {
1135                 /* hold loopback dev/idev if we haven't done so. */
1136                 if (dev != net->loopback_dev) {
1137                         if (dev) {
1138                                 dev_put(dev);
1139                                 in6_dev_put(idev);
1140                         }
1141                         dev = net->loopback_dev;
1142                         dev_hold(dev);
1143                         idev = in6_dev_get(dev);
1144                         if (!idev) {
1145                                 err = -ENODEV;
1146                                 goto out;
1147                         }
1148                 }
1149                 rt->u.dst.output = ip6_pkt_discard_out;
1150                 rt->u.dst.input = ip6_pkt_discard;
1151                 rt->u.dst.error = -ENETUNREACH;
1152                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1153                 goto install_route;
1154         }
1155
1156         if (cfg->fc_flags & RTF_GATEWAY) {
1157                 struct in6_addr *gw_addr;
1158                 int gwa_type;
1159
1160                 gw_addr = &cfg->fc_gateway;
1161                 ipv6_addr_copy(&rt->rt6i_gateway, gw_addr);
1162                 gwa_type = ipv6_addr_type(gw_addr);
1163
1164                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
1165                         struct rt6_info *grt;
1166
1167                         /* IPv6 strictly inhibits using not link-local
1168                            addresses as nexthop address.
1169                            Otherwise, router will not able to send redirects.
1170                            It is very good, but in some (rare!) circumstances
1171                            (SIT, PtP, NBMA NOARP links) it is handy to allow
1172                            some exceptions. --ANK
1173                          */
1174                         err = -EINVAL;
1175                         if (!(gwa_type&IPV6_ADDR_UNICAST))
1176                                 goto out;
1177
1178                         grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, 1);
1179
1180                         err = -EHOSTUNREACH;
1181                         if (grt == NULL)
1182                                 goto out;
1183                         if (dev) {
1184                                 if (dev != grt->rt6i_dev) {
1185                                         dst_release(&grt->u.dst);
1186                                         goto out;
1187                                 }
1188                         } else {
1189                                 dev = grt->rt6i_dev;
1190                                 idev = grt->rt6i_idev;
1191                                 dev_hold(dev);
1192                                 in6_dev_hold(grt->rt6i_idev);
1193                         }
1194                         if (!(grt->rt6i_flags&RTF_GATEWAY))
1195                                 err = 0;
1196                         dst_release(&grt->u.dst);
1197
1198                         if (err)
1199                                 goto out;
1200                 }
1201                 err = -EINVAL;
1202                 if (dev == NULL || (dev->flags&IFF_LOOPBACK))
1203                         goto out;
1204         }
1205
1206         err = -ENODEV;
1207         if (dev == NULL)
1208                 goto out;
1209
1210         if (cfg->fc_flags & (RTF_GATEWAY | RTF_NONEXTHOP)) {
1211                 rt->rt6i_nexthop = __neigh_lookup_errno(&nd_tbl, &rt->rt6i_gateway, dev);
1212                 if (IS_ERR(rt->rt6i_nexthop)) {
1213                         err = PTR_ERR(rt->rt6i_nexthop);
1214                         rt->rt6i_nexthop = NULL;
1215                         goto out;
1216                 }
1217         }
1218
1219         rt->rt6i_flags = cfg->fc_flags;
1220
1221 install_route:
1222         if (cfg->fc_mx) {
1223                 struct nlattr *nla;
1224                 int remaining;
1225
1226                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1227                         int type = nla_type(nla);
1228
1229                         if (type) {
1230                                 if (type > RTAX_MAX) {
1231                                         err = -EINVAL;
1232                                         goto out;
1233                                 }
1234
1235                                 rt->u.dst.metrics[type - 1] = nla_get_u32(nla);
1236                         }
1237                 }
1238         }
1239
1240         if (dst_metric(&rt->u.dst, RTAX_HOPLIMIT) == 0)
1241                 rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1242         if (!dst_metric(&rt->u.dst, RTAX_MTU))
1243                 rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(dev);
1244         if (!dst_metric(&rt->u.dst, RTAX_ADVMSS))
1245                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1246         rt->u.dst.dev = dev;
1247         rt->rt6i_idev = idev;
1248         rt->rt6i_table = table;
1249
1250         cfg->fc_nlinfo.nl_net = dev_net(dev);
1251
1252         return __ip6_ins_rt(rt, &cfg->fc_nlinfo);
1253
1254 out:
1255         if (dev)
1256                 dev_put(dev);
1257         if (idev)
1258                 in6_dev_put(idev);
1259         if (rt)
1260                 dst_free(&rt->u.dst);
1261         return err;
1262 }
1263
1264 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1265 {
1266         int err;
1267         struct fib6_table *table;
1268         struct net *net = dev_net(rt->rt6i_dev);
1269
1270         if (rt == net->ipv6.ip6_null_entry)
1271                 return -ENOENT;
1272
1273         table = rt->rt6i_table;
1274         write_lock_bh(&table->tb6_lock);
1275
1276         err = fib6_del(rt, info);
1277         dst_release(&rt->u.dst);
1278
1279         write_unlock_bh(&table->tb6_lock);
1280
1281         return err;
1282 }
1283
1284 int ip6_del_rt(struct rt6_info *rt)
1285 {
1286         struct nl_info info = {
1287                 .nl_net = dev_net(rt->rt6i_dev),
1288         };
1289         return __ip6_del_rt(rt, &info);
1290 }
1291
1292 static int ip6_route_del(struct fib6_config *cfg)
1293 {
1294         struct fib6_table *table;
1295         struct fib6_node *fn;
1296         struct rt6_info *rt;
1297         int err = -ESRCH;
1298
1299         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
1300         if (table == NULL)
1301                 return err;
1302
1303         read_lock_bh(&table->tb6_lock);
1304
1305         fn = fib6_locate(&table->tb6_root,
1306                          &cfg->fc_dst, cfg->fc_dst_len,
1307                          &cfg->fc_src, cfg->fc_src_len);
1308
1309         if (fn) {
1310                 for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1311                         if (cfg->fc_ifindex &&
1312                             (rt->rt6i_dev == NULL ||
1313                              rt->rt6i_dev->ifindex != cfg->fc_ifindex))
1314                                 continue;
1315                         if (cfg->fc_flags & RTF_GATEWAY &&
1316                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1317                                 continue;
1318                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1319                                 continue;
1320                         dst_hold(&rt->u.dst);
1321                         read_unlock_bh(&table->tb6_lock);
1322
1323                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
1324                 }
1325         }
1326         read_unlock_bh(&table->tb6_lock);
1327
1328         return err;
1329 }
1330
1331 /*
1332  *      Handle redirects
1333  */
1334 struct ip6rd_flowi {
1335         struct flowi fl;
1336         struct in6_addr gateway;
1337 };
1338
1339 static struct rt6_info *__ip6_route_redirect(struct net *net,
1340                                              struct fib6_table *table,
1341                                              struct flowi *fl,
1342                                              int flags)
1343 {
1344         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl;
1345         struct rt6_info *rt;
1346         struct fib6_node *fn;
1347
1348         /*
1349          * Get the "current" route for this destination and
1350          * check if the redirect has come from approriate router.
1351          *
1352          * RFC 2461 specifies that redirects should only be
1353          * accepted if they come from the nexthop to the target.
1354          * Due to the way the routes are chosen, this notion
1355          * is a bit fuzzy and one might need to check all possible
1356          * routes.
1357          */
1358
1359         read_lock_bh(&table->tb6_lock);
1360         fn = fib6_lookup(&table->tb6_root, &fl->fl6_dst, &fl->fl6_src);
1361 restart:
1362         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1363                 /*
1364                  * Current route is on-link; redirect is always invalid.
1365                  *
1366                  * Seems, previous statement is not true. It could
1367                  * be node, which looks for us as on-link (f.e. proxy ndisc)
1368                  * But then router serving it might decide, that we should
1369                  * know truth 8)8) --ANK (980726).
1370                  */
1371                 if (rt6_check_expired(rt))
1372                         continue;
1373                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1374                         continue;
1375                 if (fl->oif != rt->rt6i_dev->ifindex)
1376                         continue;
1377                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1378                         continue;
1379                 break;
1380         }
1381
1382         if (!rt)
1383                 rt = net->ipv6.ip6_null_entry;
1384         BACKTRACK(net, &fl->fl6_src);
1385 out:
1386         dst_hold(&rt->u.dst);
1387
1388         read_unlock_bh(&table->tb6_lock);
1389
1390         return rt;
1391 };
1392
1393 static struct rt6_info *ip6_route_redirect(struct in6_addr *dest,
1394                                            struct in6_addr *src,
1395                                            struct in6_addr *gateway,
1396                                            struct net_device *dev)
1397 {
1398         int flags = RT6_LOOKUP_F_HAS_SADDR;
1399         struct net *net = dev_net(dev);
1400         struct ip6rd_flowi rdfl = {
1401                 .fl = {
1402                         .oif = dev->ifindex,
1403                         .nl_u = {
1404                                 .ip6_u = {
1405                                         .daddr = *dest,
1406                                         .saddr = *src,
1407                                 },
1408                         },
1409                 },
1410                 .gateway = *gateway,
1411         };
1412
1413         if (rt6_need_strict(dest))
1414                 flags |= RT6_LOOKUP_F_IFACE;
1415
1416         return (struct rt6_info *)fib6_rule_lookup(net, (struct flowi *)&rdfl,
1417                                                    flags, __ip6_route_redirect);
1418 }
1419
1420 void rt6_redirect(struct in6_addr *dest, struct in6_addr *src,
1421                   struct in6_addr *saddr,
1422                   struct neighbour *neigh, u8 *lladdr, int on_link)
1423 {
1424         struct rt6_info *rt, *nrt = NULL;
1425         struct netevent_redirect netevent;
1426         struct net *net = dev_net(neigh->dev);
1427
1428         rt = ip6_route_redirect(dest, src, saddr, neigh->dev);
1429
1430         if (rt == net->ipv6.ip6_null_entry) {
1431                 if (net_ratelimit())
1432                         printk(KERN_DEBUG "rt6_redirect: source isn't a valid nexthop "
1433                                "for redirect target\n");
1434                 goto out;
1435         }
1436
1437         /*
1438          *      We have finally decided to accept it.
1439          */
1440
1441         neigh_update(neigh, lladdr, NUD_STALE,
1442                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
1443                      NEIGH_UPDATE_F_OVERRIDE|
1444                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
1445                                      NEIGH_UPDATE_F_ISROUTER))
1446                      );
1447
1448         /*
1449          * Redirect received -> path was valid.
1450          * Look, redirects are sent only in response to data packets,
1451          * so that this nexthop apparently is reachable. --ANK
1452          */
1453         dst_confirm(&rt->u.dst);
1454
1455         /* Duplicate redirect: silently ignore. */
1456         if (neigh == rt->u.dst.neighbour)
1457                 goto out;
1458
1459         nrt = ip6_rt_copy(rt);
1460         if (nrt == NULL)
1461                 goto out;
1462
1463         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
1464         if (on_link)
1465                 nrt->rt6i_flags &= ~RTF_GATEWAY;
1466
1467         ipv6_addr_copy(&nrt->rt6i_dst.addr, dest);
1468         nrt->rt6i_dst.plen = 128;
1469         nrt->u.dst.flags |= DST_HOST;
1470
1471         ipv6_addr_copy(&nrt->rt6i_gateway, (struct in6_addr*)neigh->primary_key);
1472         nrt->rt6i_nexthop = neigh_clone(neigh);
1473         /* Reset pmtu, it may be better */
1474         nrt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(neigh->dev);
1475         nrt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(dev_net(neigh->dev),
1476                                                         dst_mtu(&nrt->u.dst));
1477
1478         if (ip6_ins_rt(nrt))
1479                 goto out;
1480
1481         netevent.old = &rt->u.dst;
1482         netevent.new = &nrt->u.dst;
1483         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
1484
1485         if (rt->rt6i_flags&RTF_CACHE) {
1486                 ip6_del_rt(rt);
1487                 return;
1488         }
1489
1490 out:
1491         dst_release(&rt->u.dst);
1492         return;
1493 }
1494
1495 /*
1496  *      Handle ICMP "packet too big" messages
1497  *      i.e. Path MTU discovery
1498  */
1499
1500 void rt6_pmtu_discovery(struct in6_addr *daddr, struct in6_addr *saddr,
1501                         struct net_device *dev, u32 pmtu)
1502 {
1503         struct rt6_info *rt, *nrt;
1504         struct net *net = dev_net(dev);
1505         int allfrag = 0;
1506
1507         rt = rt6_lookup(net, daddr, saddr, dev->ifindex, 0);
1508         if (rt == NULL)
1509                 return;
1510
1511         if (pmtu >= dst_mtu(&rt->u.dst))
1512                 goto out;
1513
1514         if (pmtu < IPV6_MIN_MTU) {
1515                 /*
1516                  * According to RFC2460, PMTU is set to the IPv6 Minimum Link
1517                  * MTU (1280) and a fragment header should always be included
1518                  * after a node receiving Too Big message reporting PMTU is
1519                  * less than the IPv6 Minimum Link MTU.
1520                  */
1521                 pmtu = IPV6_MIN_MTU;
1522                 allfrag = 1;
1523         }
1524
1525         /* New mtu received -> path was valid.
1526            They are sent only in response to data packets,
1527            so that this nexthop apparently is reachable. --ANK
1528          */
1529         dst_confirm(&rt->u.dst);
1530
1531         /* Host route. If it is static, it would be better
1532            not to override it, but add new one, so that
1533            when cache entry will expire old pmtu
1534            would return automatically.
1535          */
1536         if (rt->rt6i_flags & RTF_CACHE) {
1537                 rt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1538                 if (allfrag)
1539                         rt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1540                 dst_set_expires(&rt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1541                 rt->rt6i_flags |= RTF_MODIFIED|RTF_EXPIRES;
1542                 goto out;
1543         }
1544
1545         /* Network route.
1546            Two cases are possible:
1547            1. It is connected route. Action: COW
1548            2. It is gatewayed route or NONEXTHOP route. Action: clone it.
1549          */
1550         if (!rt->rt6i_nexthop && !(rt->rt6i_flags & RTF_NONEXTHOP))
1551                 nrt = rt6_alloc_cow(rt, daddr, saddr);
1552         else
1553                 nrt = rt6_alloc_clone(rt, daddr);
1554
1555         if (nrt) {
1556                 nrt->u.dst.metrics[RTAX_MTU-1] = pmtu;
1557                 if (allfrag)
1558                         nrt->u.dst.metrics[RTAX_FEATURES-1] |= RTAX_FEATURE_ALLFRAG;
1559
1560                 /* According to RFC 1981, detecting PMTU increase shouldn't be
1561                  * happened within 5 mins, the recommended timer is 10 mins.
1562                  * Here this route expiration time is set to ip6_rt_mtu_expires
1563                  * which is 10 mins. After 10 mins the decreased pmtu is expired
1564                  * and detecting PMTU increase will be automatically happened.
1565                  */
1566                 dst_set_expires(&nrt->u.dst, net->ipv6.sysctl.ip6_rt_mtu_expires);
1567                 nrt->rt6i_flags |= RTF_DYNAMIC|RTF_EXPIRES;
1568
1569                 ip6_ins_rt(nrt);
1570         }
1571 out:
1572         dst_release(&rt->u.dst);
1573 }
1574
1575 /*
1576  *      Misc support functions
1577  */
1578
1579 static struct rt6_info * ip6_rt_copy(struct rt6_info *ort)
1580 {
1581         struct net *net = dev_net(ort->rt6i_dev);
1582         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1583
1584         if (rt) {
1585                 rt->u.dst.input = ort->u.dst.input;
1586                 rt->u.dst.output = ort->u.dst.output;
1587
1588                 memcpy(rt->u.dst.metrics, ort->u.dst.metrics, RTAX_MAX*sizeof(u32));
1589                 rt->u.dst.error = ort->u.dst.error;
1590                 rt->u.dst.dev = ort->u.dst.dev;
1591                 if (rt->u.dst.dev)
1592                         dev_hold(rt->u.dst.dev);
1593                 rt->rt6i_idev = ort->rt6i_idev;
1594                 if (rt->rt6i_idev)
1595                         in6_dev_hold(rt->rt6i_idev);
1596                 rt->u.dst.lastuse = jiffies;
1597                 rt->rt6i_expires = 0;
1598
1599                 ipv6_addr_copy(&rt->rt6i_gateway, &ort->rt6i_gateway);
1600                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_EXPIRES;
1601                 rt->rt6i_metric = 0;
1602
1603                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1604 #ifdef CONFIG_IPV6_SUBTREES
1605                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1606 #endif
1607                 rt->rt6i_table = ort->rt6i_table;
1608         }
1609         return rt;
1610 }
1611
1612 #ifdef CONFIG_IPV6_ROUTE_INFO
1613 static struct rt6_info *rt6_get_route_info(struct net *net,
1614                                            struct in6_addr *prefix, int prefixlen,
1615                                            struct in6_addr *gwaddr, int ifindex)
1616 {
1617         struct fib6_node *fn;
1618         struct rt6_info *rt = NULL;
1619         struct fib6_table *table;
1620
1621         table = fib6_get_table(net, RT6_TABLE_INFO);
1622         if (table == NULL)
1623                 return NULL;
1624
1625         write_lock_bh(&table->tb6_lock);
1626         fn = fib6_locate(&table->tb6_root, prefix ,prefixlen, NULL, 0);
1627         if (!fn)
1628                 goto out;
1629
1630         for (rt = fn->leaf; rt; rt = rt->u.dst.rt6_next) {
1631                 if (rt->rt6i_dev->ifindex != ifindex)
1632                         continue;
1633                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
1634                         continue;
1635                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
1636                         continue;
1637                 dst_hold(&rt->u.dst);
1638                 break;
1639         }
1640 out:
1641         write_unlock_bh(&table->tb6_lock);
1642         return rt;
1643 }
1644
1645 static struct rt6_info *rt6_add_route_info(struct net *net,
1646                                            struct in6_addr *prefix, int prefixlen,
1647                                            struct in6_addr *gwaddr, int ifindex,
1648                                            unsigned pref)
1649 {
1650         struct fib6_config cfg = {
1651                 .fc_table       = RT6_TABLE_INFO,
1652                 .fc_metric      = IP6_RT_PRIO_USER,
1653                 .fc_ifindex     = ifindex,
1654                 .fc_dst_len     = prefixlen,
1655                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
1656                                   RTF_UP | RTF_PREF(pref),
1657                 .fc_nlinfo.pid = 0,
1658                 .fc_nlinfo.nlh = NULL,
1659                 .fc_nlinfo.nl_net = net,
1660         };
1661
1662         ipv6_addr_copy(&cfg.fc_dst, prefix);
1663         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1664
1665         /* We should treat it as a default route if prefix length is 0. */
1666         if (!prefixlen)
1667                 cfg.fc_flags |= RTF_DEFAULT;
1668
1669         ip6_route_add(&cfg);
1670
1671         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, ifindex);
1672 }
1673 #endif
1674
1675 struct rt6_info *rt6_get_dflt_router(struct in6_addr *addr, struct net_device *dev)
1676 {
1677         struct rt6_info *rt;
1678         struct fib6_table *table;
1679
1680         table = fib6_get_table(dev_net(dev), RT6_TABLE_DFLT);
1681         if (table == NULL)
1682                 return NULL;
1683
1684         write_lock_bh(&table->tb6_lock);
1685         for (rt = table->tb6_root.leaf; rt; rt=rt->u.dst.rt6_next) {
1686                 if (dev == rt->rt6i_dev &&
1687                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1688                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
1689                         break;
1690         }
1691         if (rt)
1692                 dst_hold(&rt->u.dst);
1693         write_unlock_bh(&table->tb6_lock);
1694         return rt;
1695 }
1696
1697 struct rt6_info *rt6_add_dflt_router(struct in6_addr *gwaddr,
1698                                      struct net_device *dev,
1699                                      unsigned int pref)
1700 {
1701         struct fib6_config cfg = {
1702                 .fc_table       = RT6_TABLE_DFLT,
1703                 .fc_metric      = IP6_RT_PRIO_USER,
1704                 .fc_ifindex     = dev->ifindex,
1705                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
1706                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
1707                 .fc_nlinfo.pid = 0,
1708                 .fc_nlinfo.nlh = NULL,
1709                 .fc_nlinfo.nl_net = dev_net(dev),
1710         };
1711
1712         ipv6_addr_copy(&cfg.fc_gateway, gwaddr);
1713
1714         ip6_route_add(&cfg);
1715
1716         return rt6_get_dflt_router(gwaddr, dev);
1717 }
1718
1719 void rt6_purge_dflt_routers(struct net *net)
1720 {
1721         struct rt6_info *rt;
1722         struct fib6_table *table;
1723
1724         /* NOTE: Keep consistent with rt6_get_dflt_router */
1725         table = fib6_get_table(net, RT6_TABLE_DFLT);
1726         if (table == NULL)
1727                 return;
1728
1729 restart:
1730         read_lock_bh(&table->tb6_lock);
1731         for (rt = table->tb6_root.leaf; rt; rt = rt->u.dst.rt6_next) {
1732                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF)) {
1733                         dst_hold(&rt->u.dst);
1734                         read_unlock_bh(&table->tb6_lock);
1735                         ip6_del_rt(rt);
1736                         goto restart;
1737                 }
1738         }
1739         read_unlock_bh(&table->tb6_lock);
1740 }
1741
1742 static void rtmsg_to_fib6_config(struct net *net,
1743                                  struct in6_rtmsg *rtmsg,
1744                                  struct fib6_config *cfg)
1745 {
1746         memset(cfg, 0, sizeof(*cfg));
1747
1748         cfg->fc_table = RT6_TABLE_MAIN;
1749         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
1750         cfg->fc_metric = rtmsg->rtmsg_metric;
1751         cfg->fc_expires = rtmsg->rtmsg_info;
1752         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
1753         cfg->fc_src_len = rtmsg->rtmsg_src_len;
1754         cfg->fc_flags = rtmsg->rtmsg_flags;
1755
1756         cfg->fc_nlinfo.nl_net = net;
1757
1758         ipv6_addr_copy(&cfg->fc_dst, &rtmsg->rtmsg_dst);
1759         ipv6_addr_copy(&cfg->fc_src, &rtmsg->rtmsg_src);
1760         ipv6_addr_copy(&cfg->fc_gateway, &rtmsg->rtmsg_gateway);
1761 }
1762
1763 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1764 {
1765         struct fib6_config cfg;
1766         struct in6_rtmsg rtmsg;
1767         int err;
1768
1769         switch(cmd) {
1770         case SIOCADDRT:         /* Add a route */
1771         case SIOCDELRT:         /* Delete a route */
1772                 if (!capable(CAP_NET_ADMIN))
1773                         return -EPERM;
1774                 err = copy_from_user(&rtmsg, arg,
1775                                      sizeof(struct in6_rtmsg));
1776                 if (err)
1777                         return -EFAULT;
1778
1779                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
1780
1781                 rtnl_lock();
1782                 switch (cmd) {
1783                 case SIOCADDRT:
1784                         err = ip6_route_add(&cfg);
1785                         break;
1786                 case SIOCDELRT:
1787                         err = ip6_route_del(&cfg);
1788                         break;
1789                 default:
1790                         err = -EINVAL;
1791                 }
1792                 rtnl_unlock();
1793
1794                 return err;
1795         }
1796
1797         return -EINVAL;
1798 }
1799
1800 /*
1801  *      Drop the packet on the floor
1802  */
1803
1804 static int ip6_pkt_drop(struct sk_buff *skb, int code, int ipstats_mib_noroutes)
1805 {
1806         int type;
1807         switch (ipstats_mib_noroutes) {
1808         case IPSTATS_MIB_INNOROUTES:
1809                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
1810                 if (type == IPV6_ADDR_ANY || type == IPV6_ADDR_RESERVED) {
1811                         IP6_INC_STATS(ip6_dst_idev(skb->dst), IPSTATS_MIB_INADDRERRORS);
1812                         break;
1813                 }
1814                 /* FALLTHROUGH */
1815         case IPSTATS_MIB_OUTNOROUTES:
1816                 IP6_INC_STATS(ip6_dst_idev(skb->dst), ipstats_mib_noroutes);
1817                 break;
1818         }
1819         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0, skb->dev);
1820         kfree_skb(skb);
1821         return 0;
1822 }
1823
1824 static int ip6_pkt_discard(struct sk_buff *skb)
1825 {
1826         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
1827 }
1828
1829 static int ip6_pkt_discard_out(struct sk_buff *skb)
1830 {
1831         skb->dev = skb->dst->dev;
1832         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1833 }
1834
1835 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
1836
1837 static int ip6_pkt_prohibit(struct sk_buff *skb)
1838 {
1839         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
1840 }
1841
1842 static int ip6_pkt_prohibit_out(struct sk_buff *skb)
1843 {
1844         skb->dev = skb->dst->dev;
1845         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
1846 }
1847
1848 #endif
1849
1850 /*
1851  *      Allocate a dst for local (unicast / anycast) address.
1852  */
1853
1854 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
1855                                     const struct in6_addr *addr,
1856                                     int anycast)
1857 {
1858         struct net *net = dev_net(idev->dev);
1859         struct rt6_info *rt = ip6_dst_alloc(net->ipv6.ip6_dst_ops);
1860
1861         if (rt == NULL)
1862                 return ERR_PTR(-ENOMEM);
1863
1864         dev_hold(net->loopback_dev);
1865         in6_dev_hold(idev);
1866
1867         rt->u.dst.flags = DST_HOST;
1868         rt->u.dst.input = ip6_input;
1869         rt->u.dst.output = ip6_output;
1870         rt->rt6i_dev = net->loopback_dev;
1871         rt->rt6i_idev = idev;
1872         rt->u.dst.metrics[RTAX_MTU-1] = ipv6_get_mtu(rt->rt6i_dev);
1873         rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, dst_mtu(&rt->u.dst));
1874         rt->u.dst.metrics[RTAX_HOPLIMIT-1] = -1;
1875         rt->u.dst.obsolete = -1;
1876
1877         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
1878         if (anycast)
1879                 rt->rt6i_flags |= RTF_ANYCAST;
1880         else
1881                 rt->rt6i_flags |= RTF_LOCAL;
1882         rt->rt6i_nexthop = ndisc_get_neigh(rt->rt6i_dev, &rt->rt6i_gateway);
1883         if (rt->rt6i_nexthop == NULL) {
1884                 dst_free(&rt->u.dst);
1885                 return ERR_PTR(-ENOMEM);
1886         }
1887
1888         ipv6_addr_copy(&rt->rt6i_dst.addr, addr);
1889         rt->rt6i_dst.plen = 128;
1890         rt->rt6i_table = fib6_get_table(net, RT6_TABLE_LOCAL);
1891
1892         atomic_set(&rt->u.dst.__refcnt, 1);
1893
1894         return rt;
1895 }
1896
/* Argument bundle handed from rt6_ifdown() to the fib6_ifdown() callback. */
struct arg_dev_net {
	struct net_device *dev;	/* device to match; NULL matches any device */
	struct net *net;	/* namespace whose null entry must be kept */
};
1901
1902 static int fib6_ifdown(struct rt6_info *rt, void *arg)
1903 {
1904         struct net_device *dev = ((struct arg_dev_net *)arg)->dev;
1905         struct net *net = ((struct arg_dev_net *)arg)->net;
1906
1907         if (((void *)rt->rt6i_dev == dev || dev == NULL) &&
1908             rt != net->ipv6.ip6_null_entry) {
1909                 RT6_TRACE("deleted by ifdown %p\n", rt);
1910                 return -1;
1911         }
1912         return 0;
1913 }
1914
1915 void rt6_ifdown(struct net *net, struct net_device *dev)
1916 {
1917         struct arg_dev_net adn = {
1918                 .dev = dev,
1919                 .net = net,
1920         };
1921
1922         fib6_clean_all(net, fib6_ifdown, 0, &adn);
1923 }
1924
/* Argument bundle handed from rt6_mtu_change() to rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* its new MTU */
};
1930
1931 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
1932 {
1933         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
1934         struct inet6_dev *idev;
1935         struct net *net = dev_net(arg->dev);
1936
1937         /* In IPv6 pmtu discovery is not optional,
1938            so that RTAX_MTU lock cannot disable it.
1939            We still use this lock to block changes
1940            caused by addrconf/ndisc.
1941         */
1942
1943         idev = __in6_dev_get(arg->dev);
1944         if (idev == NULL)
1945                 return 0;
1946
1947         /* For administrative MTU increase, there is no way to discover
1948            IPv6 PMTU increase, so PMTU increase should be updated here.
1949            Since RFC 1981 doesn't include administrative MTU increase
1950            update PMTU increase is a MUST. (i.e. jumbo frame)
1951          */
1952         /*
1953            If new MTU is less than route PMTU, this new MTU will be the
1954            lowest MTU in the path, update the route PMTU to reflect PMTU
1955            decreases; if new MTU is greater than route PMTU, and the
1956            old MTU is the lowest MTU in the path, update the route PMTU
1957            to reflect the increase. In this case if the other nodes' MTU
1958            also have the lowest MTU, TOO BIG MESSAGE will be lead to
1959            PMTU discouvery.
1960          */
1961         if (rt->rt6i_dev == arg->dev &&
1962             !dst_metric_locked(&rt->u.dst, RTAX_MTU) &&
1963             (dst_mtu(&rt->u.dst) >= arg->mtu ||
1964              (dst_mtu(&rt->u.dst) < arg->mtu &&
1965               dst_mtu(&rt->u.dst) == idev->cnf.mtu6))) {
1966                 rt->u.dst.metrics[RTAX_MTU-1] = arg->mtu;
1967                 rt->u.dst.metrics[RTAX_ADVMSS-1] = ipv6_advmss(net, arg->mtu);
1968         }
1969         return 0;
1970 }
1971
1972 void rt6_mtu_change(struct net_device *dev, unsigned mtu)
1973 {
1974         struct rt6_mtu_change_arg arg = {
1975                 .dev = dev,
1976                 .mtu = mtu,
1977         };
1978
1979         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, 0, &arg);
1980 }
1981
1982 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
1983         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
1984         [RTA_OIF]               = { .type = NLA_U32 },
1985         [RTA_IIF]               = { .type = NLA_U32 },
1986         [RTA_PRIORITY]          = { .type = NLA_U32 },
1987         [RTA_METRICS]           = { .type = NLA_NESTED },
1988 };
1989
1990 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
1991                               struct fib6_config *cfg)
1992 {
1993         struct rtmsg *rtm;
1994         struct nlattr *tb[RTA_MAX+1];
1995         int err;
1996
1997         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
1998         if (err < 0)
1999                 goto errout;
2000
2001         err = -EINVAL;
2002         rtm = nlmsg_data(nlh);
2003         memset(cfg, 0, sizeof(*cfg));
2004
2005         cfg->fc_table = rtm->rtm_table;
2006         cfg->fc_dst_len = rtm->rtm_dst_len;
2007         cfg->fc_src_len = rtm->rtm_src_len;
2008         cfg->fc_flags = RTF_UP;
2009         cfg->fc_protocol = rtm->rtm_protocol;
2010
2011         if (rtm->rtm_type == RTN_UNREACHABLE)
2012                 cfg->fc_flags |= RTF_REJECT;
2013
2014         cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid;
2015         cfg->fc_nlinfo.nlh = nlh;
2016         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2017
2018         if (tb[RTA_GATEWAY]) {
2019                 nla_memcpy(&cfg->fc_gateway, tb[RTA_GATEWAY], 16);
2020                 cfg->fc_flags |= RTF_GATEWAY;
2021         }
2022
2023         if (tb[RTA_DST]) {
2024                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2025
2026                 if (nla_len(tb[RTA_DST]) < plen)
2027                         goto errout;
2028
2029                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2030         }
2031
2032         if (tb[RTA_SRC]) {
2033                 int plen = (rtm->rtm_src_len + 7) >> 3;
2034
2035                 if (nla_len(tb[RTA_SRC]) < plen)
2036                         goto errout;
2037
2038                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2039         }
2040
2041         if (tb[RTA_OIF])
2042                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2043
2044         if (tb[RTA_PRIORITY])
2045                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2046
2047         if (tb[RTA_METRICS]) {
2048                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2049                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2050         }
2051
2052         if (tb[RTA_TABLE])
2053                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2054
2055         err = 0;
2056 errout:
2057         return err;
2058 }
2059
2060 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2061 {
2062         struct fib6_config cfg;
2063         int err;
2064
2065         err = rtm_to_fib6_config(skb, nlh, &cfg);
2066         if (err < 0)
2067                 return err;
2068
2069         return ip6_route_del(&cfg);
2070 }
2071
2072 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
2073 {
2074         struct fib6_config cfg;
2075         int err;
2076
2077         err = rtm_to_fib6_config(skb, nlh, &cfg);
2078         if (err < 0)
2079                 return err;
2080
2081         return ip6_route_add(&cfg);
2082 }
2083
2084 static inline size_t rt6_nlmsg_size(void)
2085 {
2086         return NLMSG_ALIGN(sizeof(struct rtmsg))
2087                + nla_total_size(16) /* RTA_SRC */
2088                + nla_total_size(16) /* RTA_DST */
2089                + nla_total_size(16) /* RTA_GATEWAY */
2090                + nla_total_size(16) /* RTA_PREFSRC */
2091                + nla_total_size(4) /* RTA_TABLE */
2092                + nla_total_size(4) /* RTA_IIF */
2093                + nla_total_size(4) /* RTA_OIF */
2094                + nla_total_size(4) /* RTA_PRIORITY */
2095                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
2096                + nla_total_size(sizeof(struct rta_cacheinfo));
2097 }
2098
2099 static int rt6_fill_node(struct sk_buff *skb, struct rt6_info *rt,
2100                          struct in6_addr *dst, struct in6_addr *src,
2101                          int iif, int type, u32 pid, u32 seq,
2102                          int prefix, int nowait, unsigned int flags)
2103 {
2104         struct rtmsg *rtm;
2105         struct nlmsghdr *nlh;
2106         long expires;
2107         u32 table;
2108
2109         if (prefix) {   /* user wants prefix routes only */
2110                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
2111                         /* success since this is not a prefix route */
2112                         return 1;
2113                 }
2114         }
2115
2116         nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtm), flags);
2117         if (nlh == NULL)
2118                 return -EMSGSIZE;
2119
2120         rtm = nlmsg_data(nlh);
2121         rtm->rtm_family = AF_INET6;
2122         rtm->rtm_dst_len = rt->rt6i_dst.plen;
2123         rtm->rtm_src_len = rt->rt6i_src.plen;
2124         rtm->rtm_tos = 0;
2125         if (rt->rt6i_table)
2126                 table = rt->rt6i_table->tb6_id;
2127         else
2128                 table = RT6_TABLE_UNSPEC;
2129         rtm->rtm_table = table;
2130         NLA_PUT_U32(skb, RTA_TABLE, table);
2131         if (rt->rt6i_flags&RTF_REJECT)
2132                 rtm->rtm_type = RTN_UNREACHABLE;
2133         else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK))
2134                 rtm->rtm_type = RTN_LOCAL;
2135         else
2136                 rtm->rtm_type = RTN_UNICAST;
2137         rtm->rtm_flags = 0;
2138         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
2139         rtm->rtm_protocol = rt->rt6i_protocol;
2140         if (rt->rt6i_flags&RTF_DYNAMIC)
2141                 rtm->rtm_protocol = RTPROT_REDIRECT;
2142         else if (rt->rt6i_flags & RTF_ADDRCONF)
2143                 rtm->rtm_protocol = RTPROT_KERNEL;
2144         else if (rt->rt6i_flags&RTF_DEFAULT)
2145                 rtm->rtm_protocol = RTPROT_RA;
2146
2147         if (rt->rt6i_flags&RTF_CACHE)
2148                 rtm->rtm_flags |= RTM_F_CLONED;
2149
2150         if (dst) {
2151                 NLA_PUT(skb, RTA_DST, 16, dst);
2152                 rtm->rtm_dst_len = 128;
2153         } else if (rtm->rtm_dst_len)
2154                 NLA_PUT(skb, RTA_DST, 16, &rt->rt6i_dst.addr);
2155 #ifdef CONFIG_IPV6_SUBTREES
2156         if (src) {
2157                 NLA_PUT(skb, RTA_SRC, 16, src);
2158                 rtm->rtm_src_len = 128;
2159         } else if (rtm->rtm_src_len)
2160                 NLA_PUT(skb, RTA_SRC, 16, &rt->rt6i_src.addr);
2161 #endif
2162         if (iif) {
2163 #ifdef CONFIG_IPV6_MROUTE
2164                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
2165                         int err = ip6mr_get_route(skb, rtm, nowait);
2166                         if (err <= 0) {
2167                                 if (!nowait) {
2168                                         if (err == 0)
2169                                                 return 0;
2170                                         goto nla_put_failure;
2171                                 } else {
2172                                         if (err == -EMSGSIZE)
2173                                                 goto nla_put_failure;
2174                                 }
2175                         }
2176                 } else
2177 #endif
2178                         NLA_PUT_U32(skb, RTA_IIF, iif);
2179         } else if (dst) {
2180                 struct in6_addr saddr_buf;
2181                 if (ipv6_dev_get_saddr(ip6_dst_idev(&rt->u.dst)->dev,
2182                                        dst, 0, &saddr_buf) == 0)
2183                         NLA_PUT(skb, RTA_PREFSRC, 16, &saddr_buf);
2184         }
2185
2186         if (rtnetlink_put_metrics(skb, rt->u.dst.metrics) < 0)
2187                 goto nla_put_failure;
2188
2189         if (rt->u.dst.neighbour)
2190                 NLA_PUT(skb, RTA_GATEWAY, 16, &rt->u.dst.neighbour->primary_key);
2191
2192         if (rt->u.dst.dev)
2193                 NLA_PUT_U32(skb, RTA_OIF, rt->rt6i_dev->ifindex);
2194
2195         NLA_PUT_U32(skb, RTA_PRIORITY, rt->rt6i_metric);
2196
2197         if (!(rt->rt6i_flags & RTF_EXPIRES))
2198                 expires = 0;
2199         else if (rt->rt6i_expires - jiffies < INT_MAX)
2200                 expires = rt->rt6i_expires - jiffies;
2201         else
2202                 expires = INT_MAX;
2203
2204         if (rtnl_put_cacheinfo(skb, &rt->u.dst, 0, 0, 0,
2205                                expires, rt->u.dst.error) < 0)
2206                 goto nla_put_failure;
2207
2208         return nlmsg_end(skb, nlh);
2209
2210 nla_put_failure:
2211         nlmsg_cancel(skb, nlh);
2212         return -EMSGSIZE;
2213 }
2214
2215 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
2216 {
2217         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
2218         int prefix;
2219
2220         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
2221                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
2222                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
2223         } else
2224                 prefix = 0;
2225
2226         return rt6_fill_node(arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
2227                      NETLINK_CB(arg->cb->skb).pid, arg->cb->nlh->nlmsg_seq,
2228                      prefix, 0, NLM_F_MULTI);
2229 }
2230
2231 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr* nlh, void *arg)
2232 {
2233         struct net *net = sock_net(in_skb->sk);
2234         struct nlattr *tb[RTA_MAX+1];
2235         struct rt6_info *rt;
2236         struct sk_buff *skb;
2237         struct rtmsg *rtm;
2238         struct flowi fl;
2239         int err, iif = 0;
2240
2241         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2242         if (err < 0)
2243                 goto errout;
2244
2245         err = -EINVAL;
2246         memset(&fl, 0, sizeof(fl));
2247
2248         if (tb[RTA_SRC]) {
2249                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
2250                         goto errout;
2251
2252                 ipv6_addr_copy(&fl.fl6_src, nla_data(tb[RTA_SRC]));
2253         }
2254
2255         if (tb[RTA_DST]) {
2256                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
2257                         goto errout;
2258
2259                 ipv6_addr_copy(&fl.fl6_dst, nla_data(tb[RTA_DST]));
2260         }
2261
2262         if (tb[RTA_IIF])
2263                 iif = nla_get_u32(tb[RTA_IIF]);
2264
2265         if (tb[RTA_OIF])
2266                 fl.oif = nla_get_u32(tb[RTA_OIF]);
2267
2268         if (iif) {
2269                 struct net_device *dev;
2270                 dev = __dev_get_by_index(net, iif);
2271                 if (!dev) {
2272                         err = -ENODEV;
2273                         goto errout;
2274                 }
2275         }
2276
2277         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2278         if (skb == NULL) {
2279                 err = -ENOBUFS;
2280                 goto errout;
2281         }
2282
2283         /* Reserve room for dummy headers, this skb can pass
2284            through good chunk of routing engine.
2285          */
2286         skb_reset_mac_header(skb);
2287         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
2288
2289         rt = (struct rt6_info*) ip6_route_output(net, NULL, &fl);
2290         skb->dst = &rt->u.dst;
2291
2292         err = rt6_fill_node(skb, rt, &fl.fl6_dst, &fl.fl6_src, iif,
2293                             RTM_NEWROUTE, NETLINK_CB(in_skb).pid,
2294                             nlh->nlmsg_seq, 0, 0, 0);
2295         if (err < 0) {
2296                 kfree_skb(skb);
2297                 goto errout;
2298         }
2299
2300         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).pid);
2301 errout:
2302         return err;
2303 }
2304
2305 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info)
2306 {
2307         struct sk_buff *skb;
2308         struct net *net = info->nl_net;
2309         u32 seq;
2310         int err;
2311
2312         err = -ENOBUFS;
2313         seq = info->nlh != NULL ? info->nlh->nlmsg_seq : 0;
2314
2315         skb = nlmsg_new(rt6_nlmsg_size(), gfp_any());
2316         if (skb == NULL)
2317                 goto errout;
2318
2319         err = rt6_fill_node(skb, rt, NULL, NULL, 0,
2320                                 event, info->pid, seq, 0, 0, 0);
2321         if (err < 0) {
2322                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
2323                 WARN_ON(err == -EMSGSIZE);
2324                 kfree_skb(skb);
2325                 goto errout;
2326         }
2327         err = rtnl_notify(skb, net, info->pid, RTNLGRP_IPV6_ROUTE,
2328                           info->nlh, gfp_any());
2329 errout:
2330         if (err < 0)
2331                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
2332 }
2333
2334 static int ip6_route_dev_notify(struct notifier_block *this,
2335                                 unsigned long event, void *data)
2336 {
2337         struct net_device *dev = (struct net_device *)data;
2338         struct net *net = dev_net(dev);
2339
2340         if (event == NETDEV_REGISTER && (dev->flags & IFF_LOOPBACK)) {
2341                 net->ipv6.ip6_null_entry->u.dst.dev = dev;
2342                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
2343 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2344                 net->ipv6.ip6_prohibit_entry->u.dst.dev = dev;
2345                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
2346                 net->ipv6.ip6_blk_hole_entry->u.dst.dev = dev;
2347                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
2348 #endif
2349         }
2350
2351         return NOTIFY_OK;
2352 }
2353
2354 /*
2355  *      /proc
2356  */
2357
2358 #ifdef CONFIG_PROC_FS
2359
/*
 * NOTE(review): neither RT6_INFO_LEN nor struct rt6_proc_arg is used by
 * the seq_file based /proc code below; they look like leftovers from a
 * buffer-based implementation - confirm before removing.
 */
#define RT6_INFO_LEN (32 + 4 + 32 + 4 + 32 + 40 + 5 + 1)

struct rt6_proc_arg {
	char *buffer;
	int offset;
	int length;
	int skip;
	int len;
};
2370
2371 static int rt6_info_route(struct rt6_info *rt, void *p_arg)
2372 {
2373         struct seq_file *m = p_arg;
2374
2375         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_dst.addr),
2376                    rt->rt6i_dst.plen);
2377
2378 #ifdef CONFIG_IPV6_SUBTREES
2379         seq_printf(m, NIP6_SEQFMT " %02x ", NIP6(rt->rt6i_src.addr),
2380                    rt->rt6i_src.plen);
2381 #else
2382         seq_puts(m, "00000000000000000000000000000000 00 ");
2383 #endif
2384
2385         if (rt->rt6i_nexthop) {
2386                 seq_printf(m, NIP6_SEQFMT,
2387                            NIP6(*((struct in6_addr *)rt->rt6i_nexthop->primary_key)));
2388         } else {
2389                 seq_puts(m, "00000000000000000000000000000000");
2390         }
2391         seq_printf(m, " %08x %08x %08x %08x %8s\n",
2392                    rt->rt6i_metric, atomic_read(&rt->u.dst.__refcnt),
2393                    rt->u.dst.__use, rt->rt6i_flags,
2394                    rt->rt6i_dev ? rt->rt6i_dev->name : "");
2395         return 0;
2396 }
2397
2398 static int ipv6_route_show(struct seq_file *m, void *v)
2399 {
2400         struct net *net = (struct net *)m->private;
2401         fib6_clean_all(net, rt6_info_route, 0, m);
2402         return 0;
2403 }
2404
2405 static int ipv6_route_open(struct inode *inode, struct file *file)
2406 {
2407         int err;
2408         struct net *net = get_proc_net(inode);
2409         if (!net)
2410                 return -ENXIO;
2411
2412         err = single_open(file, ipv6_route_show, net);
2413         if (err < 0) {
2414                 put_net(net);
2415                 return err;
2416         }
2417
2418         return 0;
2419 }
2420
2421 static int ipv6_route_release(struct inode *inode, struct file *file)
2422 {
2423         struct seq_file *seq = file->private_data;
2424         struct net *net = seq->private;
2425         put_net(net);
2426         return single_release(inode, file);
2427 }
2428
2429 static const struct file_operations ipv6_route_proc_fops = {
2430         .owner          = THIS_MODULE,
2431         .open           = ipv6_route_open,
2432         .read           = seq_read,
2433         .llseek         = seq_lseek,
2434         .release        = ipv6_route_release,
2435 };
2436
2437 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
2438 {
2439         struct net *net = (struct net *)seq->private;
2440         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
2441                    net->ipv6.rt6_stats->fib_nodes,
2442                    net->ipv6.rt6_stats->fib_route_nodes,
2443                    net->ipv6.rt6_stats->fib_rt_alloc,
2444                    net->ipv6.rt6_stats->fib_rt_entries,
2445                    net->ipv6.rt6_stats->fib_rt_cache,
2446                    atomic_read(&net->ipv6.ip6_dst_ops->entries),
2447                    net->ipv6.rt6_stats->fib_discarded_routes);
2448
2449         return 0;
2450 }
2451
2452 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
2453 {
2454         int err;
2455         struct net *net = get_proc_net(inode);
2456         if (!net)
2457                 return -ENXIO;
2458
2459         err = single_open(file, rt6_stats_seq_show, net);
2460         if (err < 0) {
2461                 put_net(net);
2462                 return err;
2463         }
2464
2465         return 0;
2466 }
2467
2468 static int rt6_stats_seq_release(struct inode *inode, struct file *file)
2469 {
2470         struct seq_file *seq = file->private_data;
2471         struct net *net = (struct net *)seq->private;
2472         put_net(net);
2473         return single_release(inode, file);
2474 }
2475
2476 static const struct file_operations rt6_stats_seq_fops = {
2477         .owner   = THIS_MODULE,
2478         .open    = rt6_stats_seq_open,
2479         .read    = seq_read,
2480         .llseek  = seq_lseek,
2481         .release = rt6_stats_seq_release,
2482 };
2483 #endif  /* CONFIG_PROC_FS */
2484
2485 #ifdef CONFIG_SYSCTL
2486
2487 static
2488 int ipv6_sysctl_rtcache_flush(ctl_table *ctl, int write, struct file * filp,
2489                               void __user *buffer, size_t *lenp, loff_t *ppos)
2490 {
2491         struct net *net = current->nsproxy->net_ns;
2492         int delay = net->ipv6.sysctl.flush_delay;
2493         if (write) {
2494                 proc_dointvec(ctl, write, filp, buffer, lenp, ppos);
2495                 fib6_run_gc(delay <= 0 ? ~0UL : (unsigned long)delay, net);
2496                 return 0;
2497         } else
2498                 return -EINVAL;
2499 }
2500
2501 ctl_table ipv6_route_table_template[] = {
2502         {
2503                 .procname       =       "flush",
2504                 .data           =       &init_net.ipv6.sysctl.flush_delay,
2505                 .maxlen         =       sizeof(int),
2506                 .mode           =       0200,
2507                 .proc_handler   =       &ipv6_sysctl_rtcache_flush
2508         },
2509         {
2510                 .ctl_name       =       NET_IPV6_ROUTE_GC_THRESH,
2511                 .procname       =       "gc_thresh",
2512                 .data           =       &ip6_dst_ops_template.gc_thresh,
2513                 .maxlen         =       sizeof(int),
2514                 .mode           =       0644,
2515                 .proc_handler   =       &proc_dointvec,
2516         },
2517         {
2518                 .ctl_name       =       NET_IPV6_ROUTE_MAX_SIZE,
2519                 .procname       =       "max_size",
2520                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
2521                 .maxlen         =       sizeof(int),
2522                 .mode           =       0644,
2523                 .proc_handler   =       &proc_dointvec,
2524         },
2525         {
2526                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL,
2527                 .procname       =       "gc_min_interval",
2528                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2529                 .maxlen         =       sizeof(int),
2530                 .mode           =       0644,
2531                 .proc_handler   =       &proc_dointvec_jiffies,
2532                 .strategy       =       &sysctl_jiffies,
2533         },
2534         {
2535                 .ctl_name       =       NET_IPV6_ROUTE_GC_TIMEOUT,
2536                 .procname       =       "gc_timeout",
2537                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
2538                 .maxlen         =       sizeof(int),
2539                 .mode           =       0644,
2540                 .proc_handler   =       &proc_dointvec_jiffies,
2541                 .strategy       =       &sysctl_jiffies,
2542         },
2543         {
2544                 .ctl_name       =       NET_IPV6_ROUTE_GC_INTERVAL,
2545                 .procname       =       "gc_interval",
2546                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
2547                 .maxlen         =       sizeof(int),
2548                 .mode           =       0644,
2549                 .proc_handler   =       &proc_dointvec_jiffies,
2550                 .strategy       =       &sysctl_jiffies,
2551         },
2552         {
2553                 .ctl_name       =       NET_IPV6_ROUTE_GC_ELASTICITY,
2554                 .procname       =       "gc_elasticity",
2555                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
2556                 .maxlen         =       sizeof(int),
2557                 .mode           =       0644,
2558                 .proc_handler   =       &proc_dointvec_jiffies,
2559                 .strategy       =       &sysctl_jiffies,
2560         },
2561         {
2562                 .ctl_name       =       NET_IPV6_ROUTE_MTU_EXPIRES,
2563                 .procname       =       "mtu_expires",
2564                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
2565                 .maxlen         =       sizeof(int),
2566                 .mode           =       0644,
2567                 .proc_handler   =       &proc_dointvec_jiffies,
2568                 .strategy       =       &sysctl_jiffies,
2569         },
2570         {
2571                 .ctl_name       =       NET_IPV6_ROUTE_MIN_ADVMSS,
2572                 .procname       =       "min_adv_mss",
2573                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
2574                 .maxlen         =       sizeof(int),
2575                 .mode           =       0644,
2576                 .proc_handler   =       &proc_dointvec_jiffies,
2577                 .strategy       =       &sysctl_jiffies,
2578         },
2579         {
2580                 .ctl_name       =       NET_IPV6_ROUTE_GC_MIN_INTERVAL_MS,
2581                 .procname       =       "gc_min_interval_ms",
2582                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
2583                 .maxlen         =       sizeof(int),
2584                 .mode           =       0644,
2585                 .proc_handler   =       &proc_dointvec_ms_jiffies,
2586                 .strategy       =       &sysctl_ms_jiffies,
2587         },
2588         { .ctl_name = 0 }
2589 };
2590
2591 struct ctl_table *ipv6_route_sysctl_init(struct net *net)
2592 {
2593         struct ctl_table *table;
2594
2595         table = kmemdup(ipv6_route_table_template,
2596                         sizeof(ipv6_route_table_template),
2597                         GFP_KERNEL);
2598
2599         if (table) {
2600                 table[0].data = &net->ipv6.sysctl.flush_delay;
2601                 table[1].data = &net->ipv6.ip6_dst_ops->gc_thresh;
2602                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
2603                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
2604                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
2605                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
2606                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
2607                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
2608                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
2609         }
2610
2611         return table;
2612 }
2613 #endif
2614
2615 static int ip6_route_net_init(struct net *net)
2616 {
2617         int ret = -ENOMEM;
2618
2619         net->ipv6.ip6_dst_ops = kmemdup(&ip6_dst_ops_template,
2620                                         sizeof(*net->ipv6.ip6_dst_ops),
2621                                         GFP_KERNEL);
2622         if (!net->ipv6.ip6_dst_ops)
2623                 goto out;
2624         net->ipv6.ip6_dst_ops->dst_net = hold_net(net);
2625
2626         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
2627                                            sizeof(*net->ipv6.ip6_null_entry),
2628                                            GFP_KERNEL);
2629         if (!net->ipv6.ip6_null_entry)
2630                 goto out_ip6_dst_ops;
2631         net->ipv6.ip6_null_entry->u.dst.path =
2632                 (struct dst_entry *)net->ipv6.ip6_null_entry;
2633         net->ipv6.ip6_null_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2634
2635 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2636         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
2637                                                sizeof(*net->ipv6.ip6_prohibit_entry),
2638                                                GFP_KERNEL);
2639         if (!net->ipv6.ip6_prohibit_entry) {
2640                 kfree(net->ipv6.ip6_null_entry);
2641                 goto out;
2642         }
2643         net->ipv6.ip6_prohibit_entry->u.dst.path =
2644                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
2645         net->ipv6.ip6_prohibit_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2646
2647         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
2648                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
2649                                                GFP_KERNEL);
2650         if (!net->ipv6.ip6_blk_hole_entry) {
2651                 kfree(net->ipv6.ip6_null_entry);
2652                 kfree(net->ipv6.ip6_prohibit_entry);
2653                 goto out;
2654         }
2655         net->ipv6.ip6_blk_hole_entry->u.dst.path =
2656                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
2657         net->ipv6.ip6_blk_hole_entry->u.dst.ops = net->ipv6.ip6_dst_ops;
2658 #endif
2659
2660 #ifdef CONFIG_PROC_FS
2661         proc_net_fops_create(net, "ipv6_route", 0, &ipv6_route_proc_fops);
2662         proc_net_fops_create(net, "rt6_stats", S_IRUGO, &rt6_stats_seq_fops);
2663 #endif
2664         net->ipv6.ip6_rt_gc_expire = 30*HZ;
2665
2666         ret = 0;
2667 out:
2668         return ret;
2669
2670 out_ip6_dst_ops:
2671         release_net(net->ipv6.ip6_dst_ops->dst_net);
2672         kfree(net->ipv6.ip6_dst_ops);
2673         goto out;
2674 }
2675
2676 static void ip6_route_net_exit(struct net *net)
2677 {
2678 #ifdef CONFIG_PROC_FS
2679         proc_net_remove(net, "ipv6_route");
2680         proc_net_remove(net, "rt6_stats");
2681 #endif
2682         kfree(net->ipv6.ip6_null_entry);
2683 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2684         kfree(net->ipv6.ip6_prohibit_entry);
2685         kfree(net->ipv6.ip6_blk_hole_entry);
2686 #endif
2687         release_net(net->ipv6.ip6_dst_ops->dst_net);
2688         kfree(net->ipv6.ip6_dst_ops);
2689 }
2690
2691 static struct pernet_operations ip6_route_net_ops = {
2692         .init = ip6_route_net_init,
2693         .exit = ip6_route_net_exit,
2694 };
2695
2696 static struct notifier_block ip6_route_dev_notifier = {
2697         .notifier_call = ip6_route_dev_notify,
2698         .priority = 0,
2699 };
2700
2701 int __init ip6_route_init(void)
2702 {
2703         int ret;
2704
2705         ret = -ENOMEM;
2706         ip6_dst_ops_template.kmem_cachep =
2707                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
2708                                   SLAB_HWCACHE_ALIGN, NULL);
2709         if (!ip6_dst_ops_template.kmem_cachep)
2710                 goto out;;
2711
2712         ret = register_pernet_subsys(&ip6_route_net_ops);
2713         if (ret)
2714                 goto out_kmem_cache;
2715
2716         /* Registering of the loopback is done before this portion of code,
2717          * the loopback reference in rt6_info will not be taken, do it
2718          * manually for init_net */
2719         init_net.ipv6.ip6_null_entry->u.dst.dev = init_net.loopback_dev;
2720         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2721   #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2722         init_net.ipv6.ip6_prohibit_entry->u.dst.dev = init_net.loopback_dev;
2723         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2724         init_net.ipv6.ip6_blk_hole_entry->u.dst.dev = init_net.loopback_dev;
2725         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
2726   #endif
2727         ret = fib6_init();
2728         if (ret)
2729                 goto out_register_subsys;
2730
2731         ret = xfrm6_init();
2732         if (ret)
2733                 goto out_fib6_init;
2734
2735         ret = fib6_rules_init();
2736         if (ret)
2737                 goto xfrm6_init;
2738
2739         ret = -ENOBUFS;
2740         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL) ||
2741             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL) ||
2742             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL))
2743                 goto fib6_rules_init;
2744
2745         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
2746         if (ret)
2747                 goto fib6_rules_init;
2748
2749 out:
2750         return ret;
2751
2752 fib6_rules_init:
2753         fib6_rules_cleanup();
2754 xfrm6_init:
2755         xfrm6_fini();
2756 out_fib6_init:
2757         fib6_gc_cleanup();
2758 out_register_subsys:
2759         unregister_pernet_subsys(&ip6_route_net_ops);
2760 out_kmem_cache:
2761         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2762         goto out;
2763 }
2764
2765 void ip6_route_cleanup(void)
2766 {
2767         unregister_netdevice_notifier(&ip6_route_dev_notifier);
2768         fib6_rules_cleanup();
2769         xfrm6_fini();
2770         fib6_gc_cleanup();
2771         unregister_pernet_subsys(&ip6_route_net_ops);
2772         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
2773 }