net/ipv4/fib_semantics.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *              This program is free software; you can redistribute it and/or
11  *              modify it under the terms of the GNU General Public License
12  *              as published by the Free Software Foundation; either version
13  *              2 of the License, or (at your option) any later version.
14  */
15
16 #include <asm/uaccess.h>
17 #include <asm/system.h>
18 #include <linux/bitops.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/jiffies.h>
22 #include <linux/mm.h>
23 #include <linux/string.h>
24 #include <linux/socket.h>
25 #include <linux/sockios.h>
26 #include <linux/errno.h>
27 #include <linux/in.h>
28 #include <linux/inet.h>
29 #include <linux/inetdevice.h>
30 #include <linux/netdevice.h>
31 #include <linux/if_arp.h>
32 #include <linux/proc_fs.h>
33 #include <linux/skbuff.h>
34 #include <linux/init.h>
35
36 #include <net/arp.h>
37 #include <net/ip.h>
38 #include <net/protocol.h>
39 #include <net/route.h>
40 #include <net/tcp.h>
41 #include <net/sock.h>
42 #include <net/ip_fib.h>
43 #include <net/netlink.h>
44 #include <net/nexthop.h>
45
46 #include "fib_lookup.h"
47
48 static DEFINE_SPINLOCK(fib_info_lock);
49 static struct hlist_head *fib_info_hash;
50 static struct hlist_head *fib_info_laddrhash;
51 static unsigned int fib_hash_size;
52 static unsigned int fib_info_cnt;
53
54 #define DEVINDEX_HASHBITS 8
55 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
56 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
57
58 #ifdef CONFIG_IP_ROUTE_MULTIPATH
59
60 static DEFINE_SPINLOCK(fib_multipath_lock);
61
62 #define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \
63 for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++)
64
65 #define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \
66 for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++)
67
68 #else /* CONFIG_IP_ROUTE_MULTIPATH */
69
70 /* Hope that gcc will optimize it to get rid of the dummy loop */
71
72 #define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \
73 for (nhsel=0; nhsel < 1; nhsel++)
74
75 #define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \
76 for (nhsel=0; nhsel < 1; nhsel++)
77
78 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
79
80 #define endfor_nexthops(fi) }
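/*
 * Usage note: for_nexthops()/change_nexthops() open a block and declare the
 * iteration variables (nhsel, and nh or nexthop_nh); the matching
 * endfor_nexthops() supplies the closing brace.  A typical walk looks like:
 *
 *	for_nexthops(fi) {
 *		if (nh->nh_flags & RTNH_F_DEAD)
 *			continue;
 *	} endfor_nexthops(fi);
 */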
81
82
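/*
 * Per route-type properties: the error that fib_semantic_match() returns
 * when a route of this type is hit (0 means the route is usable for
 * forwarding/delivery), and the widest scope a route of this type may be
 * configured with (checked in fib_create_info()).
 */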
83 static const struct
84 {
85         int     error;
86         u8      scope;
87 } fib_props[RTN_MAX + 1] = {
88         {
89                 .error  = 0,
90                 .scope  = RT_SCOPE_NOWHERE,
91         },      /* RTN_UNSPEC */
92         {
93                 .error  = 0,
94                 .scope  = RT_SCOPE_UNIVERSE,
95         },      /* RTN_UNICAST */
96         {
97                 .error  = 0,
98                 .scope  = RT_SCOPE_HOST,
99         },      /* RTN_LOCAL */
100         {
101                 .error  = 0,
102                 .scope  = RT_SCOPE_LINK,
103         },      /* RTN_BROADCAST */
104         {
105                 .error  = 0,
106                 .scope  = RT_SCOPE_LINK,
107         },      /* RTN_ANYCAST */
108         {
109                 .error  = 0,
110                 .scope  = RT_SCOPE_UNIVERSE,
111         },      /* RTN_MULTICAST */
112         {
113                 .error  = -EINVAL,
114                 .scope  = RT_SCOPE_UNIVERSE,
115         },      /* RTN_BLACKHOLE */
116         {
117                 .error  = -EHOSTUNREACH,
118                 .scope  = RT_SCOPE_UNIVERSE,
119         },      /* RTN_UNREACHABLE */
120         {
121                 .error  = -EACCES,
122                 .scope  = RT_SCOPE_UNIVERSE,
123         },      /* RTN_PROHIBIT */
124         {
125                 .error  = -EAGAIN,
126                 .scope  = RT_SCOPE_UNIVERSE,
127         },      /* RTN_THROW */
128         {
129                 .error  = -EINVAL,
130                 .scope  = RT_SCOPE_NOWHERE,
131         },      /* RTN_NAT */
132         {
133                 .error  = -EINVAL,
134                 .scope  = RT_SCOPE_NOWHERE,
135         },      /* RTN_XRESOLVE */
136 };
137
138
139 /* Release a nexthop info record */
140
141 void free_fib_info(struct fib_info *fi)
142 {
143         if (fi->fib_dead == 0) {
144                 printk(KERN_WARNING "Freeing alive fib_info %p\n", fi);
145                 return;
146         }
147         change_nexthops(fi) {
148                 if (nexthop_nh->nh_dev)
149                         dev_put(nexthop_nh->nh_dev);
150                 nexthop_nh->nh_dev = NULL;
151         } endfor_nexthops(fi);
152         fib_info_cnt--;
153         release_net(fi->fib_net);
154         kfree(fi);
155 }
156
157 void fib_release_info(struct fib_info *fi)
158 {
159         spin_lock_bh(&fib_info_lock);
160         if (fi && --fi->fib_treeref == 0) {
161                 hlist_del(&fi->fib_hash);
162                 if (fi->fib_prefsrc)
163                         hlist_del(&fi->fib_lhash);
164                 change_nexthops(fi) {
165                         if (!nexthop_nh->nh_dev)
166                                 continue;
167                         hlist_del(&nexthop_nh->nh_hash);
168                 } endfor_nexthops(fi)
169                 fi->fib_dead = 1;
170                 fib_info_put(fi);
171         }
172         spin_unlock_bh(&fib_info_lock);
173 }
174
175 static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
176 {
177         const struct fib_nh *onh = ofi->fib_nh;
178
179         for_nexthops(fi) {
180                 if (nh->nh_oif != onh->nh_oif ||
181                     nh->nh_gw  != onh->nh_gw ||
182                     nh->nh_scope != onh->nh_scope ||
183 #ifdef CONFIG_IP_ROUTE_MULTIPATH
184                     nh->nh_weight != onh->nh_weight ||
185 #endif
186 #ifdef CONFIG_NET_CLS_ROUTE
187                     nh->nh_tclassid != onh->nh_tclassid ||
188 #endif
189                     ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD))
190                         return -1;
191                 onh++;
192         } endfor_nexthops(fi);
193         return 0;
194 }
195
196 static inline unsigned int fib_devindex_hashfn(unsigned int val)
197 {
198         unsigned int mask = DEVINDEX_HASHSIZE - 1;
199
200         return (val ^
201                 (val >> DEVINDEX_HASHBITS) ^
202                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
203 }
204
205 static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
206 {
207         unsigned int mask = (fib_hash_size - 1);
208         unsigned int val = fi->fib_nhs;
209
210         val ^= fi->fib_protocol;
211         val ^= (__force u32)fi->fib_prefsrc;
212         val ^= fi->fib_priority;
213         for_nexthops(fi) {
214                 val ^= fib_devindex_hashfn(nh->nh_oif);
215         } endfor_nexthops(fi)
216
217         return (val ^ (val >> 7) ^ (val >> 12)) & mask;
218 }
219
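/* Look for an existing fib_info that is semantically identical to *nfi,
 * so that routes with the same nexthop configuration can share a single
 * refcounted fib_info instead of allocating a new one.
 */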
220 static struct fib_info *fib_find_info(const struct fib_info *nfi)
221 {
222         struct hlist_head *head;
223         struct hlist_node *node;
224         struct fib_info *fi;
225         unsigned int hash;
226
227         hash = fib_info_hashfn(nfi);
228         head = &fib_info_hash[hash];
229
230         hlist_for_each_entry(fi, node, head, fib_hash) {
231                 if (!net_eq(fi->fib_net, nfi->fib_net))
232                         continue;
233                 if (fi->fib_nhs != nfi->fib_nhs)
234                         continue;
235                 if (nfi->fib_protocol == fi->fib_protocol &&
236                     nfi->fib_prefsrc == fi->fib_prefsrc &&
237                     nfi->fib_priority == fi->fib_priority &&
238                     memcmp(nfi->fib_metrics, fi->fib_metrics,
239                            sizeof(fi->fib_metrics)) == 0 &&
240                     ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 &&
241                     (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
242                         return fi;
243         }
244
245         return NULL;
246 }
247
248 /* Check that the gateway is already configured.
249    Used only by the redirect accept routine.
250  */
251
252 int ip_fib_check_default(__be32 gw, struct net_device *dev)
253 {
254         struct hlist_head *head;
255         struct hlist_node *node;
256         struct fib_nh *nh;
257         unsigned int hash;
258
259         spin_lock(&fib_info_lock);
260
261         hash = fib_devindex_hashfn(dev->ifindex);
262         head = &fib_info_devhash[hash];
263         hlist_for_each_entry(nh, node, head, nh_hash) {
264                 if (nh->nh_dev == dev &&
265                     nh->nh_gw == gw &&
266                     !(nh->nh_flags&RTNH_F_DEAD)) {
267                         spin_unlock(&fib_info_lock);
268                         return 0;
269                 }
270         }
271
272         spin_unlock(&fib_info_lock);
273
274         return -1;
275 }
276
277 static inline size_t fib_nlmsg_size(struct fib_info *fi)
278 {
279         size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
280                          + nla_total_size(4) /* RTA_TABLE */
281                          + nla_total_size(4) /* RTA_DST */
282                          + nla_total_size(4) /* RTA_PRIORITY */
283                          + nla_total_size(4); /* RTA_PREFSRC */
284
285         /* space for nested metrics */
286         payload += nla_total_size((RTAX_MAX * nla_total_size(4)));
287
288         if (fi->fib_nhs) {
289                 /* Also handles the special case fib_nhs == 1 */
290
291                 /* each nexthop is packed in an attribute */
292                 size_t nhsize = nla_total_size(sizeof(struct rtnexthop));
293
294                 /* may contain flow and gateway attribute */
295                 nhsize += 2 * nla_total_size(4);
296
297                 /* all nexthops are packed in a nested attribute */
298                 payload += nla_total_size(fi->fib_nhs * nhsize);
299         }
300
301         return payload;
302 }
303
304 void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
305                int dst_len, u32 tb_id, struct nl_info *info,
306                unsigned int nlm_flags)
307 {
308         struct sk_buff *skb;
309         u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
310         int err = -ENOBUFS;
311
312         skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
313         if (skb == NULL)
314                 goto errout;
315
316         err = fib_dump_info(skb, info->pid, seq, event, tb_id,
317                             fa->fa_type, fa->fa_scope, key, dst_len,
318                             fa->fa_tos, fa->fa_info, nlm_flags);
319         if (err < 0) {
320                 /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
321                 WARN_ON(err == -EMSGSIZE);
322                 kfree_skb(skb);
323                 goto errout;
324         }
325         rtnl_notify(skb, info->nl_net, info->pid, RTNLGRP_IPV4_ROUTE,
326                     info->nlh, GFP_KERNEL);
327         return;
328 errout:
329         if (err < 0)
330                 rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
331 }
332
333 /* Return the first fib alias matching TOS with
334  * priority less than or equal to PRIO.
335  */
336 struct fib_alias *fib_find_alias(struct list_head *fah, u8 tos, u32 prio)
337 {
338         if (fah) {
339                 struct fib_alias *fa;
340                 list_for_each_entry(fa, fah, fa_list) {
341                         if (fa->fa_tos > tos)
342                                 continue;
343                         if (fa->fa_info->fib_priority >= prio ||
344                             fa->fa_tos < tos)
345                                 return fa;
346                 }
347         }
348         return NULL;
349 }
350
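/* Judge whether a default route's gateway looks dead, based on the ARP
 * neighbour state of its first nexthop.  A route whose neighbour entry is
 * not REACHABLE may still be remembered in *last_resort as the best
 * fallback candidate seen so far.
 */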
351 int fib_detect_death(struct fib_info *fi, int order,
352                      struct fib_info **last_resort, int *last_idx, int dflt)
353 {
354         struct neighbour *n;
355         int state = NUD_NONE;
356
357         n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
358         if (n) {
359                 state = n->nud_state;
360                 neigh_release(n);
361         }
362         if (state == NUD_REACHABLE)
363                 return 0;
364         if ((state&NUD_VALID) && order != dflt)
365                 return 0;
366         if ((state&NUD_VALID) ||
367             (*last_idx<0 && order > dflt)) {
368                 *last_resort = fi;
369                 *last_idx = order;
370         }
371         return 1;
372 }
373
374 #ifdef CONFIG_IP_ROUTE_MULTIPATH
375
376 static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
377 {
378         int nhs = 0;
379
380         while (rtnh_ok(rtnh, remaining)) {
381                 nhs++;
382                 rtnh = rtnh_next(rtnh, &remaining);
383         }
384
385         /* leftover implies invalid nexthop configuration, discard it */
386         return remaining > 0 ? 0 : nhs;
387 }
388
389 static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
390                        int remaining, struct fib_config *cfg)
391 {
392         change_nexthops(fi) {
393                 int attrlen;
394
395                 if (!rtnh_ok(rtnh, remaining))
396                         return -EINVAL;
397
398                 nexthop_nh->nh_flags =
399                         (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
400                 nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
401                 nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;
402
403                 attrlen = rtnh_attrlen(rtnh);
404                 if (attrlen > 0) {
405                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
406
407                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
408                         nexthop_nh->nh_gw = nla ? nla_get_be32(nla) : 0;
409 #ifdef CONFIG_NET_CLS_ROUTE
410                         nla = nla_find(attrs, attrlen, RTA_FLOW);
411                         nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
412 #endif
413                 }
414
415                 rtnh = rtnh_next(rtnh, &remaining);
416         } endfor_nexthops(fi);
417
418         return 0;
419 }
420
421 #endif
422
423 int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
424 {
425 #ifdef CONFIG_IP_ROUTE_MULTIPATH
426         struct rtnexthop *rtnh;
427         int remaining;
428 #endif
429
430         if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
431                 return 1;
432
433         if (cfg->fc_oif || cfg->fc_gw) {
434                 if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
435                     (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
436                         return 0;
437                 return 1;
438         }
439
440 #ifdef CONFIG_IP_ROUTE_MULTIPATH
441         if (cfg->fc_mp == NULL)
442                 return 0;
443
444         rtnh = cfg->fc_mp;
445         remaining = cfg->fc_mp_len;
446
447         for_nexthops(fi) {
448                 int attrlen;
449
450                 if (!rtnh_ok(rtnh, remaining))
451                         return -EINVAL;
452
453                 if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
454                         return 1;
455
456                 attrlen = rtnh_attrlen(rtnh);
457                 if (attrlen > 0) {
458                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
459
460                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
461                         if (nla && nla_get_be32(nla) != nh->nh_gw)
462                                 return 1;
463 #ifdef CONFIG_NET_CLS_ROUTE
464                         nla = nla_find(attrs, attrlen, RTA_FLOW);
465                         if (nla && nla_get_u32(nla) != nh->nh_tclassid)
466                                 return 1;
467 #endif
468                 }
469
470                 rtnh = rtnh_next(rtnh, &remaining);
471         } endfor_nexthops(fi);
472 #endif
473         return 0;
474 }
475
476
477 /*
478    Picture
479    -------
480
481    The semantics of nexthops are very messy for historical reasons.
482    We have to take into account that:
483    a) the gateway can actually be a local interface address,
484       so that a gatewayed route is direct.
485    b) the gateway must be an on-link address, possibly
486       described not by an ifaddr, but also by a direct route.
487    c) if both gateway and interface are specified, they should not
488       contradict each other.
489    d) if we use tunnel routes, the gateway may not be on-link.
490
491    Attempting to reconcile all of these (alas, self-contradictory) conditions
492    results in pretty ugly and hairy code with obscure logic.
493
494    I chose to generalize it instead, so that the size of the code
495    practically does not increase, but it becomes
496    much more general.
497    Every prefix is assigned a "scope" value: "host" is a local address,
498    "link" is a direct route,
499    [ ... "site" ... "interior" ... ]
500    and "universe" is a true gateway route with global meaning.
501
502    Every prefix refers to a set of "nexthop"s (gw, oif),
503    where the gw must have a narrower scope. This recursion stops
504    when the gw has LOCAL scope or when the "nexthop" is declared ONLINK,
505    which means that the gw is forced to be on-link.
506
507    The code is still hairy, but now it is apparently logically
508    consistent and very flexible. E.g. as a by-product it allows
509    independent exterior and interior routing processes to
510    coexist in peace.
511
512    Normally it looks like the following:
513
514    {universe prefix}  -> (gw, oif) [scope link]
515                           |
516                           |-> {link prefix} -> (gw, oif) [scope local]
517                                                 |
518                                                 |-> {local prefix} (terminal node)
519  */
520
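/*
 * Concrete illustration of the recursion above (addresses are invented):
 * a universe-scope route "via 10.0.0.1" needs a nexthop of narrower scope,
 * so fib_check_nh() looks up 10.0.0.1 and finds the link-scope prefix
 * 10.0.0.0/24 on some device; that prefix in turn resolves to the device's
 * host-scope local address, at which point the recursion terminates.
 */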
521 static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
522                         struct fib_nh *nh)
523 {
524         int err;
525         struct net *net;
526
527         net = cfg->fc_nlinfo.nl_net;
528         if (nh->nh_gw) {
529                 struct fib_result res;
530
531                 if (nh->nh_flags&RTNH_F_ONLINK) {
532                         struct net_device *dev;
533
534                         if (cfg->fc_scope >= RT_SCOPE_LINK)
535                                 return -EINVAL;
536                         if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST)
537                                 return -EINVAL;
538                         if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL)
539                                 return -ENODEV;
540                         if (!(dev->flags&IFF_UP))
541                                 return -ENETDOWN;
542                         nh->nh_dev = dev;
543                         dev_hold(dev);
544                         nh->nh_scope = RT_SCOPE_LINK;
545                         return 0;
546                 }
547                 {
548                         struct flowi fl = {
549                                 .nl_u = {
550                                         .ip4_u = {
551                                                 .daddr = nh->nh_gw,
552                                                 .scope = cfg->fc_scope + 1,
553                                         },
554                                 },
555                                 .oif = nh->nh_oif,
556                         };
557
558                         /* Not strictly necessary, but seeing why requires a bit of thinking */
559                         if (fl.fl4_scope < RT_SCOPE_LINK)
560                                 fl.fl4_scope = RT_SCOPE_LINK;
561                         if ((err = fib_lookup(net, &fl, &res)) != 0)
562                                 return err;
563                 }
564                 err = -EINVAL;
565                 if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
566                         goto out;
567                 nh->nh_scope = res.scope;
568                 nh->nh_oif = FIB_RES_OIF(res);
569                 if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL)
570                         goto out;
571                 dev_hold(nh->nh_dev);
572                 err = -ENETDOWN;
573                 if (!(nh->nh_dev->flags & IFF_UP))
574                         goto out;
575                 err = 0;
576 out:
577                 fib_res_put(&res);
578                 return err;
579         } else {
580                 struct in_device *in_dev;
581
582                 if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK))
583                         return -EINVAL;
584
585                 in_dev = inetdev_by_index(net, nh->nh_oif);
586                 if (in_dev == NULL)
587                         return -ENODEV;
588                 if (!(in_dev->dev->flags&IFF_UP)) {
589                         in_dev_put(in_dev);
590                         return -ENETDOWN;
591                 }
592                 nh->nh_dev = in_dev->dev;
593                 dev_hold(nh->nh_dev);
594                 nh->nh_scope = RT_SCOPE_HOST;
595                 in_dev_put(in_dev);
596         }
597         return 0;
598 }
599
600 static inline unsigned int fib_laddr_hashfn(__be32 val)
601 {
602         unsigned int mask = (fib_hash_size - 1);
603
604         return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask;
605 }
606
607 static struct hlist_head *fib_hash_alloc(int bytes)
608 {
609         if (bytes <= PAGE_SIZE)
610                 return kzalloc(bytes, GFP_KERNEL);
611         else
612                 return (struct hlist_head *)
613                         __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes));
614 }
615
616 static void fib_hash_free(struct hlist_head *hash, int bytes)
617 {
618         if (!hash)
619                 return;
620
621         if (bytes <= PAGE_SIZE)
622                 kfree(hash);
623         else
624                 free_pages((unsigned long) hash, get_order(bytes));
625 }
626
627 static void fib_hash_move(struct hlist_head *new_info_hash,
628                           struct hlist_head *new_laddrhash,
629                           unsigned int new_size)
630 {
631         struct hlist_head *old_info_hash, *old_laddrhash;
632         unsigned int old_size = fib_hash_size;
633         unsigned int i, bytes;
634
635         spin_lock_bh(&fib_info_lock);
636         old_info_hash = fib_info_hash;
637         old_laddrhash = fib_info_laddrhash;
638         fib_hash_size = new_size;
639
640         for (i = 0; i < old_size; i++) {
641                 struct hlist_head *head = &fib_info_hash[i];
642                 struct hlist_node *node, *n;
643                 struct fib_info *fi;
644
645                 hlist_for_each_entry_safe(fi, node, n, head, fib_hash) {
646                         struct hlist_head *dest;
647                         unsigned int new_hash;
648
649                         hlist_del(&fi->fib_hash);
650
651                         new_hash = fib_info_hashfn(fi);
652                         dest = &new_info_hash[new_hash];
653                         hlist_add_head(&fi->fib_hash, dest);
654                 }
655         }
656         fib_info_hash = new_info_hash;
657
658         for (i = 0; i < old_size; i++) {
659                 struct hlist_head *lhead = &fib_info_laddrhash[i];
660                 struct hlist_node *node, *n;
661                 struct fib_info *fi;
662
663                 hlist_for_each_entry_safe(fi, node, n, lhead, fib_lhash) {
664                         struct hlist_head *ldest;
665                         unsigned int new_hash;
666
667                         hlist_del(&fi->fib_lhash);
668
669                         new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
670                         ldest = &new_laddrhash[new_hash];
671                         hlist_add_head(&fi->fib_lhash, ldest);
672                 }
673         }
674         fib_info_laddrhash = new_laddrhash;
675
676         spin_unlock_bh(&fib_info_lock);
677
678         bytes = old_size * sizeof(struct hlist_head *);
679         fib_hash_free(old_info_hash, bytes);
680         fib_hash_free(old_laddrhash, bytes);
681 }
682
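/* Build a fib_info from a netlink route configuration: count and fill in
 * the nexthops, validate them against the requested scope, and then either
 * link the new fib_info into the hash tables or, if an identical one
 * already exists, drop the new one and reuse the old.
 */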
683 struct fib_info *fib_create_info(struct fib_config *cfg)
684 {
685         int err;
686         struct fib_info *fi = NULL;
687         struct fib_info *ofi;
688         int nhs = 1;
689         struct net *net = cfg->fc_nlinfo.nl_net;
690
691         /* Fast check to catch the most weird cases */
692         if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
693                 goto err_inval;
694
695 #ifdef CONFIG_IP_ROUTE_MULTIPATH
696         if (cfg->fc_mp) {
697                 nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
698                 if (nhs == 0)
699                         goto err_inval;
700         }
701 #endif
702
703         err = -ENOBUFS;
704         if (fib_info_cnt >= fib_hash_size) {
705                 unsigned int new_size = fib_hash_size << 1;
706                 struct hlist_head *new_info_hash;
707                 struct hlist_head *new_laddrhash;
708                 unsigned int bytes;
709
710                 if (!new_size)
711                         new_size = 1;
712                 bytes = new_size * sizeof(struct hlist_head *);
713                 new_info_hash = fib_hash_alloc(bytes);
714                 new_laddrhash = fib_hash_alloc(bytes);
715                 if (!new_info_hash || !new_laddrhash) {
716                         fib_hash_free(new_info_hash, bytes);
717                         fib_hash_free(new_laddrhash, bytes);
718                 } else
719                         fib_hash_move(new_info_hash, new_laddrhash, new_size);
720
721                 if (!fib_hash_size)
722                         goto failure;
723         }
724
725         fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
726         if (fi == NULL)
727                 goto failure;
728         fib_info_cnt++;
729
730         fi->fib_net = hold_net(net);
731         fi->fib_protocol = cfg->fc_protocol;
732         fi->fib_flags = cfg->fc_flags;
733         fi->fib_priority = cfg->fc_priority;
734         fi->fib_prefsrc = cfg->fc_prefsrc;
735
736         fi->fib_nhs = nhs;
737         change_nexthops(fi) {
738                 nexthop_nh->nh_parent = fi;
739         } endfor_nexthops(fi)
740
741         if (cfg->fc_mx) {
742                 struct nlattr *nla;
743                 int remaining;
744
745                 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
746                         int type = nla_type(nla);
747
748                         if (type) {
749                                 if (type > RTAX_MAX)
750                                         goto err_inval;
751                                 fi->fib_metrics[type - 1] = nla_get_u32(nla);
752                         }
753                 }
754         }
755
756         if (cfg->fc_mp) {
757 #ifdef CONFIG_IP_ROUTE_MULTIPATH
758                 err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
759                 if (err != 0)
760                         goto failure;
761                 if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
762                         goto err_inval;
763                 if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
764                         goto err_inval;
765 #ifdef CONFIG_NET_CLS_ROUTE
766                 if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
767                         goto err_inval;
768 #endif
769 #else
770                 goto err_inval;
771 #endif
772         } else {
773                 struct fib_nh *nh = fi->fib_nh;
774
775                 nh->nh_oif = cfg->fc_oif;
776                 nh->nh_gw = cfg->fc_gw;
777                 nh->nh_flags = cfg->fc_flags;
778 #ifdef CONFIG_NET_CLS_ROUTE
779                 nh->nh_tclassid = cfg->fc_flow;
780 #endif
781 #ifdef CONFIG_IP_ROUTE_MULTIPATH
782                 nh->nh_weight = 1;
783 #endif
784         }
785
786         if (fib_props[cfg->fc_type].error) {
787                 if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
788                         goto err_inval;
789                 goto link_it;
790         }
791
792         if (cfg->fc_scope > RT_SCOPE_HOST)
793                 goto err_inval;
794
795         if (cfg->fc_scope == RT_SCOPE_HOST) {
796                 struct fib_nh *nh = fi->fib_nh;
797
798                 /* Local address is added. */
799                 if (nhs != 1 || nh->nh_gw)
800                         goto err_inval;
801                 nh->nh_scope = RT_SCOPE_NOWHERE;
802                 nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
803                 err = -ENODEV;
804                 if (nh->nh_dev == NULL)
805                         goto failure;
806         } else {
807                 change_nexthops(fi) {
808                         if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0)
809                                 goto failure;
810                 } endfor_nexthops(fi)
811         }
812
813         if (fi->fib_prefsrc) {
814                 if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
815                     fi->fib_prefsrc != cfg->fc_dst)
816                         if (inet_addr_type(net, fi->fib_prefsrc) != RTN_LOCAL)
817                                 goto err_inval;
818         }
819
820 link_it:
821         if ((ofi = fib_find_info(fi)) != NULL) {
822                 fi->fib_dead = 1;
823                 free_fib_info(fi);
824                 ofi->fib_treeref++;
825                 return ofi;
826         }
827
828         fi->fib_treeref++;
829         atomic_inc(&fi->fib_clntref);
830         spin_lock_bh(&fib_info_lock);
831         hlist_add_head(&fi->fib_hash,
832                        &fib_info_hash[fib_info_hashfn(fi)]);
833         if (fi->fib_prefsrc) {
834                 struct hlist_head *head;
835
836                 head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
837                 hlist_add_head(&fi->fib_lhash, head);
838         }
839         change_nexthops(fi) {
840                 struct hlist_head *head;
841                 unsigned int hash;
842
843                 if (!nexthop_nh->nh_dev)
844                         continue;
845                 hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
846                 head = &fib_info_devhash[hash];
847                 hlist_add_head(&nexthop_nh->nh_hash, head);
848         } endfor_nexthops(fi)
849         spin_unlock_bh(&fib_info_lock);
850         return fi;
851
852 err_inval:
853         err = -EINVAL;
854
855 failure:
856         if (fi) {
857                 fi->fib_dead = 1;
858                 free_fib_info(fi);
859         }
860
861         return ERR_PTR(err);
862 }
863
864 /* Note! fib_semantic_match intentionally uses  RCU list functions. */
865 int fib_semantic_match(struct list_head *head, const struct flowi *flp,
866                        struct fib_result *res, int prefixlen)
867 {
868         struct fib_alias *fa;
869         int nh_sel = 0;
870
871         list_for_each_entry_rcu(fa, head, fa_list) {
872                 int err;
873
874                 if (fa->fa_tos &&
875                     fa->fa_tos != flp->fl4_tos)
876                         continue;
877
878                 if (fa->fa_scope < flp->fl4_scope)
879                         continue;
880
881                 fa->fa_state |= FA_S_ACCESSED;
882
883                 err = fib_props[fa->fa_type].error;
884                 if (err == 0) {
885                         struct fib_info *fi = fa->fa_info;
886
887                         if (fi->fib_flags & RTNH_F_DEAD)
888                                 continue;
889
890                         switch (fa->fa_type) {
891                         case RTN_UNICAST:
892                         case RTN_LOCAL:
893                         case RTN_BROADCAST:
894                         case RTN_ANYCAST:
895                         case RTN_MULTICAST:
896                                 for_nexthops(fi) {
897                                         if (nh->nh_flags&RTNH_F_DEAD)
898                                                 continue;
899                                         if (!flp->oif || flp->oif == nh->nh_oif)
900                                                 break;
901                                 }
902 #ifdef CONFIG_IP_ROUTE_MULTIPATH
903                                 if (nhsel < fi->fib_nhs) {
904                                         nh_sel = nhsel;
905                                         goto out_fill_res;
906                                 }
907 #else
908                                 if (nhsel < 1) {
909                                         goto out_fill_res;
910                                 }
911 #endif
912                                 endfor_nexthops(fi);
913                                 continue;
914
915                         default:
916                                 printk(KERN_WARNING "fib_semantic_match bad type %#x\n",
917                                         fa->fa_type);
918                                 return -EINVAL;
919                         }
920                 }
921                 return err;
922         }
923         return 1;
924
925 out_fill_res:
926         res->prefixlen = prefixlen;
927         res->nh_sel = nh_sel;
928         res->type = fa->fa_type;
929         res->scope = fa->fa_scope;
930         res->fi = fa->fa_info;
931         atomic_inc(&res->fi->fib_clntref);
932         return 0;
933 }
934
935 /* Find an appropriate source address for this destination */
936
937 __be32 __fib_res_prefsrc(struct fib_result *res)
938 {
939         return inet_select_addr(FIB_RES_DEV(*res), FIB_RES_GW(*res), res->scope);
940 }
941
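/* Fill an rtmsg-based route message describing this route into the skb;
 * single-nexthop routes get flat RTA_GATEWAY/RTA_OIF attributes, while
 * multipath routes are packed into a nested RTA_MULTIPATH attribute.
 */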
942 int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event,
943                   u32 tb_id, u8 type, u8 scope, __be32 dst, int dst_len, u8 tos,
944                   struct fib_info *fi, unsigned int flags)
945 {
946         struct nlmsghdr *nlh;
947         struct rtmsg *rtm;
948
949         nlh = nlmsg_put(skb, pid, seq, event, sizeof(*rtm), flags);
950         if (nlh == NULL)
951                 return -EMSGSIZE;
952
953         rtm = nlmsg_data(nlh);
954         rtm->rtm_family = AF_INET;
955         rtm->rtm_dst_len = dst_len;
956         rtm->rtm_src_len = 0;
957         rtm->rtm_tos = tos;
958         if (tb_id < 256)
959                 rtm->rtm_table = tb_id;
960         else
961                 rtm->rtm_table = RT_TABLE_COMPAT;
962         NLA_PUT_U32(skb, RTA_TABLE, tb_id);
963         rtm->rtm_type = type;
964         rtm->rtm_flags = fi->fib_flags;
965         rtm->rtm_scope = scope;
966         rtm->rtm_protocol = fi->fib_protocol;
967
968         if (rtm->rtm_dst_len)
969                 NLA_PUT_BE32(skb, RTA_DST, dst);
970
971         if (fi->fib_priority)
972                 NLA_PUT_U32(skb, RTA_PRIORITY, fi->fib_priority);
973
974         if (rtnetlink_put_metrics(skb, fi->fib_metrics) < 0)
975                 goto nla_put_failure;
976
977         if (fi->fib_prefsrc)
978                 NLA_PUT_BE32(skb, RTA_PREFSRC, fi->fib_prefsrc);
979
980         if (fi->fib_nhs == 1) {
981                 if (fi->fib_nh->nh_gw)
982                         NLA_PUT_BE32(skb, RTA_GATEWAY, fi->fib_nh->nh_gw);
983
984                 if (fi->fib_nh->nh_oif)
985                         NLA_PUT_U32(skb, RTA_OIF, fi->fib_nh->nh_oif);
986 #ifdef CONFIG_NET_CLS_ROUTE
987                 if (fi->fib_nh[0].nh_tclassid)
988                         NLA_PUT_U32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid);
989 #endif
990         }
991 #ifdef CONFIG_IP_ROUTE_MULTIPATH
992         if (fi->fib_nhs > 1) {
993                 struct rtnexthop *rtnh;
994                 struct nlattr *mp;
995
996                 mp = nla_nest_start(skb, RTA_MULTIPATH);
997                 if (mp == NULL)
998                         goto nla_put_failure;
999
1000                 for_nexthops(fi) {
1001                         rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
1002                         if (rtnh == NULL)
1003                                 goto nla_put_failure;
1004
1005                         rtnh->rtnh_flags = nh->nh_flags & 0xFF;
1006                         rtnh->rtnh_hops = nh->nh_weight - 1;
1007                         rtnh->rtnh_ifindex = nh->nh_oif;
1008
1009                         if (nh->nh_gw)
1010                                 NLA_PUT_BE32(skb, RTA_GATEWAY, nh->nh_gw);
1011 #ifdef CONFIG_NET_CLS_ROUTE
1012                         if (nh->nh_tclassid)
1013                                 NLA_PUT_U32(skb, RTA_FLOW, nh->nh_tclassid);
1014 #endif
1015                         /* length of rtnetlink header + attributes */
1016                         rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
1017                 } endfor_nexthops(fi);
1018
1019                 nla_nest_end(skb, mp);
1020         }
1021 #endif
1022         return nlmsg_end(skb, nlh);
1023
1024 nla_put_failure:
1025         nlmsg_cancel(skb, nlh);
1026         return -EMSGSIZE;
1027 }
1028
1029 /*
1030    Update the FIB if:
1031    - a local address disappeared -> we must delete all the entries
1032      referring to it.
1033    - a device went down -> we must shut down all nexthops going via it.
1034  */
1035 int fib_sync_down_addr(struct net *net, __be32 local)
1036 {
1037         int ret = 0;
1038         unsigned int hash = fib_laddr_hashfn(local);
1039         struct hlist_head *head = &fib_info_laddrhash[hash];
1040         struct hlist_node *node;
1041         struct fib_info *fi;
1042
1043         if (fib_info_laddrhash == NULL || local == 0)
1044                 return 0;
1045
1046         hlist_for_each_entry(fi, node, head, fib_lhash) {
1047                 if (!net_eq(fi->fib_net, net))
1048                         continue;
1049                 if (fi->fib_prefsrc == local) {
1050                         fi->fib_flags |= RTNH_F_DEAD;
1051                         ret++;
1052                 }
1053         }
1054         return ret;
1055 }
1056
1057 int fib_sync_down_dev(struct net_device *dev, int force)
1058 {
1059         int ret = 0;
1060         int scope = RT_SCOPE_NOWHERE;
1061         struct fib_info *prev_fi = NULL;
1062         unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1063         struct hlist_head *head = &fib_info_devhash[hash];
1064         struct hlist_node *node;
1065         struct fib_nh *nh;
1066
1067         if (force)
1068                 scope = -1;
1069
1070         hlist_for_each_entry(nh, node, head, nh_hash) {
1071                 struct fib_info *fi = nh->nh_parent;
1072                 int dead;
1073
1074                 BUG_ON(!fi->fib_nhs);
1075                 if (nh->nh_dev != dev || fi == prev_fi)
1076                         continue;
1077                 prev_fi = fi;
1078                 dead = 0;
1079                 change_nexthops(fi) {
1080                         if (nexthop_nh->nh_flags&RTNH_F_DEAD)
1081                                 dead++;
1082                         else if (nexthop_nh->nh_dev == dev &&
1083                                  nexthop_nh->nh_scope != scope) {
1084                                 nexthop_nh->nh_flags |= RTNH_F_DEAD;
1085 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1086                                 spin_lock_bh(&fib_multipath_lock);
1087                                 fi->fib_power -= nexthop_nh->nh_power;
1088                                 nexthop_nh->nh_power = 0;
1089                                 spin_unlock_bh(&fib_multipath_lock);
1090 #endif
1091                                 dead++;
1092                         }
1093 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1094                         if (force > 1 && nexthop_nh->nh_dev == dev) {
1095                                 dead = fi->fib_nhs;
1096                                 break;
1097                         }
1098 #endif
1099                 } endfor_nexthops(fi)
1100                 if (dead == fi->fib_nhs) {
1101                         fi->fib_flags |= RTNH_F_DEAD;
1102                         ret++;
1103                 }
1104         }
1105
1106         return ret;
1107 }
1108
1109 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1110
1111 /*
1112    A dead device goes up. We wake up its dead nexthops.
1113    This makes sense only for multipath routes.
1114  */
1115
1116 int fib_sync_up(struct net_device *dev)
1117 {
1118         struct fib_info *prev_fi;
1119         unsigned int hash;
1120         struct hlist_head *head;
1121         struct hlist_node *node;
1122         struct fib_nh *nh;
1123         int ret;
1124
1125         if (!(dev->flags&IFF_UP))
1126                 return 0;
1127
1128         prev_fi = NULL;
1129         hash = fib_devindex_hashfn(dev->ifindex);
1130         head = &fib_info_devhash[hash];
1131         ret = 0;
1132
1133         hlist_for_each_entry(nh, node, head, nh_hash) {
1134                 struct fib_info *fi = nh->nh_parent;
1135                 int alive;
1136
1137                 BUG_ON(!fi->fib_nhs);
1138                 if (nh->nh_dev != dev || fi == prev_fi)
1139                         continue;
1140
1141                 prev_fi = fi;
1142                 alive = 0;
1143                 change_nexthops(fi) {
1144                         if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
1145                                 alive++;
1146                                 continue;
1147                         }
1148                         if (nexthop_nh->nh_dev == NULL ||
1149                             !(nexthop_nh->nh_dev->flags&IFF_UP))
1150                                 continue;
1151                         if (nexthop_nh->nh_dev != dev ||
1152                             !__in_dev_get_rtnl(dev))
1153                                 continue;
1154                         alive++;
1155                         spin_lock_bh(&fib_multipath_lock);
1156                         nexthop_nh->nh_power = 0;
1157                         nexthop_nh->nh_flags &= ~RTNH_F_DEAD;
1158                         spin_unlock_bh(&fib_multipath_lock);
1159                 } endfor_nexthops(fi)
1160
1161                 if (alive > 0) {
1162                         fi->fib_flags &= ~RTNH_F_DEAD;
1163                         ret++;
1164                 }
1165         }
1166
1167         return ret;
1168 }
1169
1170 /*
1171    The algorithm is suboptimal, but it provides really
1172    fair weighted route distribution.
1173  */
1174
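/*
 * Worked example (weights invented): with two live nexthops of weight 3
 * and 1, fib_power is refilled to 4; every selection below decrements the
 * chosen nexthop's nh_power and fib_power, so out of each refill cycle of
 * 4 selections nexthop 0 is used 3 times and nexthop 1 once, provided
 * jiffies % fi->fib_power is roughly uniform.
 */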
1175 void fib_select_multipath(const struct flowi *flp, struct fib_result *res)
1176 {
1177         struct fib_info *fi = res->fi;
1178         int w;
1179
1180         spin_lock_bh(&fib_multipath_lock);
1181         if (fi->fib_power <= 0) {
1182                 int power = 0;
1183                 change_nexthops(fi) {
1184                         if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) {
1185                                 power += nexthop_nh->nh_weight;
1186                                 nexthop_nh->nh_power = nexthop_nh->nh_weight;
1187                         }
1188                 } endfor_nexthops(fi);
1189                 fi->fib_power = power;
1190                 if (power <= 0) {
1191                         spin_unlock_bh(&fib_multipath_lock);
1192                         /* Race condition: route has just become dead. */
1193                         res->nh_sel = 0;
1194                         return;
1195                 }
1196         }
1197
1198
1199         /* w should be a random number in [0..fi->fib_power-1];
1200            jiffies % fi->fib_power is a pretty bad approximation of one.
1201          */
1202
1203         w = jiffies % fi->fib_power;
1204
1205         change_nexthops(fi) {
1206                 if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) &&
1207                     nexthop_nh->nh_power) {
1208                         if ((w -= nexthop_nh->nh_power) <= 0) {
1209                                 nexthop_nh->nh_power--;
1210                                 fi->fib_power--;
1211                                 res->nh_sel = nhsel;
1212                                 spin_unlock_bh(&fib_multipath_lock);
1213                                 return;
1214                         }
1215                 }
1216         } endfor_nexthops(fi);
1217
1218         /* Race condition: route has just become dead. */
1219         res->nh_sel = 0;
1220         spin_unlock_bh(&fib_multipath_lock);
1221 }
1222 #endif