/*
 *      IP multicast routing support for mrouted 3.6/3.8
 *
 *              (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *        Linux Consultancy and Custom Driver Development
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Fixes:
 *      Michael Chastain        :       Incorrect size of copying.
 *      Alan Cox                :       Added the cache manager code
 *      Alan Cox                :       Fixed the clone/copy bug and device race.
 *      Mike McLagan            :       Routing by source
 *      Malcolm Beattie         :       Buffer handling fixes.
 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
 *      SVR Anand               :       Fixed several multicast bugs and problems.
 *      Alexey Kuznetsov        :       Status, optimisations and more.
 *      Brad Parker             :       Better behaviour on mrouted upcall
 *                                      overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
 *                                      Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <linux/slab.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>
#include <net/fib_rules.h>

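/* PIM support is compiled in if either PIMv1 or PIMv2 is configured. */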
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

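/*
 * One multicast routing table: the VIF table, the hash of resolved MFC
 * entries, the queue of unresolved entries with its expiry timer, and
 * the daemon socket that owns the table.
 */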
struct mr_table {
        struct list_head        list;
#ifdef CONFIG_NET_NS
        struct net              *net;
#endif
        u32                     id;
        struct sock             *mroute_sk;
        struct timer_list       ipmr_expire_timer;
        struct list_head        mfc_unres_queue;
        struct list_head        mfc_cache_array[MFC_LINES];
        struct vif_device       vif_table[MAXVIFS];
        int                     maxvif;
        atomic_t                cache_resolve_queue_len;
        int                     mroute_do_assert;
        int                     mroute_do_pim;
#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
        int                     mroute_reg_vif_num;
#endif
};

struct ipmr_rule {
        struct fib_rule         common;
};

struct ipmr_result {
        struct mr_table         *mrt;
};

/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that all changes are serialized via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We have returned to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and protected with the
   weak lock mrt_lock. The queue of unresolved entries is protected
   with the strong spinlock mfc_unres_lock.

   This way the data path is entirely free of exclusive locks.
 */

static struct kmem_cache *mrt_cachep __read_mostly;

static struct mr_table *ipmr_new_table(struct net *net, u32 id);
static int ip_mr_forward(struct net *net, struct mr_table *mrt,
                         struct sk_buff *skb, struct mfc_cache *cache,
                         int local);
static int ipmr_cache_report(struct mr_table *mrt,
                             struct sk_buff *pkt, vifi_t vifi, int assert);
static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
                              struct mfc_cache *c, struct rtmsg *rtm);
static void ipmr_expire_process(unsigned long arg);

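/*
 * Table selection: with CONFIG_IP_MROUTE_MULTIPLE_TABLES, tables live on
 * net->ipv4.mr_tables and are picked via fib rules; otherwise there is a
 * single table, net->ipv4.mrt, and the helpers below collapse accordingly.
 */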
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
#define ipmr_for_each_table(mrt, net) \
        list_for_each_entry_rcu(mrt, &net->ipv4.mr_tables, list)

static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
        struct mr_table *mrt;

        ipmr_for_each_table(mrt, net) {
                if (mrt->id == id)
                        return mrt;
        }
        return NULL;
}

static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
                           struct mr_table **mrt)
{
        struct ipmr_result res;
        struct fib_lookup_arg arg = { .result = &res, };
        int err;

        err = fib_rules_lookup(net->ipv4.mr_rules_ops, flp, 0, &arg);
        if (err < 0)
                return err;
        *mrt = res.mrt;
        return 0;
}

static int ipmr_rule_action(struct fib_rule *rule, struct flowi *flp,
                            int flags, struct fib_lookup_arg *arg)
{
        struct ipmr_result *res = arg->result;
        struct mr_table *mrt;

        switch (rule->action) {
        case FR_ACT_TO_TBL:
                break;
        case FR_ACT_UNREACHABLE:
                return -ENETUNREACH;
        case FR_ACT_PROHIBIT:
                return -EACCES;
        case FR_ACT_BLACKHOLE:
        default:
                return -EINVAL;
        }

        mrt = ipmr_get_table(rule->fr_net, rule->table);
        if (mrt == NULL)
                return -EAGAIN;
        res->mrt = mrt;
        return 0;
}

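/* IPMR rules carry no selectors beyond the generic ones, so every rule
 * matches every flow. */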
static int ipmr_rule_match(struct fib_rule *rule, struct flowi *fl, int flags)
{
        return 1;
}

static const struct nla_policy ipmr_rule_policy[FRA_MAX + 1] = {
        FRA_GENERIC_POLICY,
};

static int ipmr_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
                               struct fib_rule_hdr *frh, struct nlattr **tb)
{
        return 0;
}

static int ipmr_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
                             struct nlattr **tb)
{
        return 1;
}

static int ipmr_rule_fill(struct fib_rule *rule, struct sk_buff *skb,
                          struct fib_rule_hdr *frh)
{
        frh->dst_len = 0;
        frh->src_len = 0;
        frh->tos     = 0;
        return 0;
}

static const struct fib_rules_ops __net_initdata ipmr_rules_ops_template = {
        .family         = RTNL_FAMILY_IPMR,
        .rule_size      = sizeof(struct ipmr_rule),
        .addr_size      = sizeof(u32),
        .action         = ipmr_rule_action,
        .match          = ipmr_rule_match,
        .configure      = ipmr_rule_configure,
        .compare        = ipmr_rule_compare,
        .default_pref   = fib_default_rule_pref,
        .fill           = ipmr_rule_fill,
        .nlgroup        = RTNLGRP_IPV4_RULE,
        .policy         = ipmr_rule_policy,
        .owner          = THIS_MODULE,
};

static int __net_init ipmr_rules_init(struct net *net)
{
        struct fib_rules_ops *ops;
        struct mr_table *mrt;
        int err;

        ops = fib_rules_register(&ipmr_rules_ops_template, net);
        if (IS_ERR(ops))
                return PTR_ERR(ops);

        INIT_LIST_HEAD(&net->ipv4.mr_tables);

        mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
        if (mrt == NULL) {
                err = -ENOMEM;
                goto err1;
        }

        err = fib_default_rule_add(ops, 0x7fff, RT_TABLE_DEFAULT, 0);
        if (err < 0)
                goto err2;

        net->ipv4.mr_rules_ops = ops;
        return 0;

err2:
        kfree(mrt);
err1:
        fib_rules_unregister(ops);
        return err;
}

static void __net_exit ipmr_rules_exit(struct net *net)
{
        struct mr_table *mrt, *next;

        list_for_each_entry_safe(mrt, next, &net->ipv4.mr_tables, list) {
                list_del(&mrt->list);
                kfree(mrt);
        }
        fib_rules_unregister(net->ipv4.mr_rules_ops);
}
#else
#define ipmr_for_each_table(mrt, net) \
        for (mrt = net->ipv4.mrt; mrt; mrt = NULL)

static struct mr_table *ipmr_get_table(struct net *net, u32 id)
{
        return net->ipv4.mrt;
}

static int ipmr_fib_lookup(struct net *net, struct flowi *flp,
                           struct mr_table **mrt)
{
        *mrt = net->ipv4.mrt;
        return 0;
}

static int __net_init ipmr_rules_init(struct net *net)
{
        net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
        return net->ipv4.mrt ? 0 : -ENOMEM;
}

static void __net_exit ipmr_rules_exit(struct net *net)
{
        kfree(net->ipv4.mrt);
}
#endif

static struct mr_table *ipmr_new_table(struct net *net, u32 id)
{
        struct mr_table *mrt;
        unsigned int i;

        mrt = ipmr_get_table(net, id);
        if (mrt != NULL)
                return mrt;

        mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
        if (mrt == NULL)
                return NULL;
        write_pnet(&mrt->net, net);
        mrt->id = id;

        /* Forwarding cache */
        for (i = 0; i < MFC_LINES; i++)
                INIT_LIST_HEAD(&mrt->mfc_cache_array[i]);

        INIT_LIST_HEAD(&mrt->mfc_unres_queue);

        setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
                    (unsigned long)mrt);

#ifdef CONFIG_IP_PIMSM
        mrt->mroute_reg_vif_num = -1;
#endif
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
        list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
#endif
        return mrt;
}

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
{
        struct net *net = dev_net(dev);

        dev_close(dev);

        dev = __dev_get_by_name(net, "tunl0");
        if (dev) {
                const struct net_device_ops *ops = dev->netdev_ops;
                struct ifreq ifr;
                struct ip_tunnel_parm p;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

                if (ops->ndo_do_ioctl) {
                        mm_segment_t oldfs = get_fs();

                        set_fs(KERNEL_DS);
                        ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
                        set_fs(oldfs);
                }
        }
}

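/*
 * Create a DVMRP tunnel: ask the tunl0 device to add a new ipip tunnel
 * via its SIOCADDTUNNEL ioctl (issued under KERNEL_DS, since the ioctl
 * expects a userspace pointer), then bring the new dvmrp%d device up.
 */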
static
struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
{
        struct net_device  *dev;

        dev = __dev_get_by_name(net, "tunl0");

        if (dev) {
                const struct net_device_ops *ops = dev->netdev_ops;
                int err;
                struct ifreq ifr;
                struct ip_tunnel_parm p;
                struct in_device  *in_dev;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

                if (ops->ndo_do_ioctl) {
                        mm_segment_t oldfs = get_fs();

                        set_fs(KERNEL_DS);
                        err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
                        set_fs(oldfs);
                } else
                        err = -EOPNOTSUPP;

                dev = NULL;

                if (err == 0 &&
                    (dev = __dev_get_by_name(net, p.name)) != NULL) {
                        dev->flags |= IFF_MULTICAST;

                        in_dev = __in_dev_get_rtnl(dev);
                        if (in_dev == NULL)
                                goto failure;

                        ipv4_devconf_setall(in_dev);
                        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

                        if (dev_open(dev))
                                goto failure;
                        dev_hold(dev);
                }
        }
        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}

#ifdef CONFIG_IP_PIMSM

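/*
 * The pimreg device never forwards anything itself: every packet handed
 * to it is bounced to the daemon as an IGMPMSG_WHOLEPKT upcall.
 */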
static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct mr_table *mrt;
        struct flowi fl = {
                .oif            = dev->ifindex,
                .iif            = skb->skb_iif,
                .mark           = skb->mark,
        };
        int err;

        err = ipmr_fib_lookup(net, &fl, &mrt);
        if (err < 0)
                return err;

        read_lock(&mrt_lock);
        dev->stats.tx_bytes += skb->len;
        dev->stats.tx_packets++;
        ipmr_cache_report(mrt, skb, mrt->mroute_reg_vif_num, IGMPMSG_WHOLEPKT);
        read_unlock(&mrt_lock);
        kfree_skb(skb);
        return NETDEV_TX_OK;
}

static const struct net_device_ops reg_vif_netdev_ops = {
        .ndo_start_xmit = reg_vif_xmit,
};

static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->netdev_ops         = &reg_vif_netdev_ops;
        dev->destructor         = free_netdev;
        dev->features           |= NETIF_F_NETNS_LOCAL;
}

static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
        struct net_device *dev;
        struct in_device *in_dev;
        char name[IFNAMSIZ];

        if (mrt->id == RT_TABLE_DEFAULT)
                sprintf(name, "pimreg");
        else
                sprintf(name, "pimreg%u", mrt->id);

        dev = alloc_netdev(0, name, reg_vif_setup);

        if (dev == NULL)
                return NULL;

        dev_net_set(dev, net);

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }
        dev->iflink = 0;

        rcu_read_lock();
        if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
                rcu_read_unlock();
                goto failure;
        }

        ipv4_devconf_setall(in_dev);
        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
        rcu_read_unlock();

        if (dev_open(dev))
                goto failure;

        dev_hold(dev);

        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}
#endif

/*
 *      Delete a VIF entry
 *      @notify: Set to 1 if the caller is a notifier_call
 */

static int vif_delete(struct mr_table *mrt, int vifi, int notify,
                      struct list_head *head)
{
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= mrt->maxvif)
                return -EADDRNOTAVAIL;

        v = &mrt->vif_table[vifi];

        write_lock_bh(&mrt_lock);
        dev = v->dev;
        v->dev = NULL;

        if (!dev) {
                write_unlock_bh(&mrt_lock);
                return -EADDRNOTAVAIL;
        }

#ifdef CONFIG_IP_PIMSM
        if (vifi == mrt->mroute_reg_vif_num)
                mrt->mroute_reg_vif_num = -1;
#endif

        if (vifi+1 == mrt->maxvif) {
                int tmp;
                for (tmp = vifi-1; tmp >= 0; tmp--) {
                        if (VIF_EXISTS(mrt, tmp))
                                break;
                }
                mrt->maxvif = tmp+1;
        }

        write_unlock_bh(&mrt_lock);

        dev_set_allmulti(dev, -1);

        if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
                IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
                ip_rt_multicast_event(in_dev);
        }

        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
                unregister_netdevice_queue(dev, head);

        dev_put(dev);
        return 0;
}

static inline void ipmr_cache_free(struct mfc_cache *c)
{
        kmem_cache_free(mrt_cachep, c);
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
{
        struct net *net = read_pnet(&mrt->net);
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&mrt->cache_resolve_queue_len);

        while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = NLMSG_DATA(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));

                        rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
                } else
                        kfree_skb(skb);
        }

        ipmr_cache_free(c);
}


/* Timer process for the unresolved queue. */

static void ipmr_expire_process(unsigned long arg)
{
        struct mr_table *mrt = (struct mr_table *)arg;
        unsigned long now;
        unsigned long expires;
        struct mfc_cache *c, *next;

        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&mrt->ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (list_empty(&mrt->mfc_unres_queue))
                goto out;

        now = jiffies;
        expires = 10*HZ;

        list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        continue;
                }

                list_del(&c->list);
                ipmr_destroy_unres(mrt, c);
        }

        if (!list_empty(&mrt->mfc_unres_queue))
                mod_timer(&mrt->ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock. */

static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
                                   unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

        for (vifi = 0; vifi < mrt->maxvif; vifi++) {
                if (VIF_EXISTS(mrt, vifi) &&
                    ttls[vifi] && ttls[vifi] < 255) {
                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                        if (cache->mfc_un.res.minvif > vifi)
                                cache->mfc_un.res.minvif = vifi;
                        if (cache->mfc_un.res.maxvif <= vifi)
                                cache->mfc_un.res.maxvif = vifi + 1;
                }
        }
}

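/*
 * Add a virtual interface. Called under RTNL from ip_mroute_setsockopt(),
 * so only the final publish of the new entry needs mrt_lock.
 */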
static int vif_add(struct net *net, struct mr_table *mrt,
                   struct vifctl *vifc, int mrtsock)
{
        int vifi = vifc->vifc_vifi;
        struct vif_device *v = &mrt->vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;
        int err;

        /* Is vif busy ? */
        if (VIF_EXISTS(mrt, vifi))
                return -EADDRINUSE;

        switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
        case VIFF_REGISTER:
                /*
                 * Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (mrt->mroute_reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif(net, mrt);
                if (!dev)
                        return -ENOBUFS;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        unregister_netdevice(dev);
                        dev_put(dev);
                        return err;
                }
                break;
#endif
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(net, vifc);
                if (!dev)
                        return -ENOBUFS;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        ipmr_del_tunnel(dev, vifc);
                        dev_put(dev);
                        return err;
                }
                break;

        case VIFF_USE_IFINDEX:
        case 0:
                if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
                        dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex);
                        if (dev && dev->ip_ptr == NULL) {
                                dev_put(dev);
                                return -EADDRNOTAVAIL;
                        }
                } else
                        dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr);

                if (!dev)
                        return -EADDRNOTAVAIL;
                err = dev_set_allmulti(dev, 1);
                if (err) {
                        dev_put(dev);
                        return err;
                }
                break;
        default:
                return -EINVAL;
        }

        if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) {
                dev_put(dev);
                return -EADDRNOTAVAIL;
        }
        IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
        ip_rt_multicast_event(in_dev);

        /*
         *      Fill in the VIF structures
         */
        v->rate_limit = vifc->vifc_rate_limit;
        v->local = vifc->vifc_lcl_addr.s_addr;
        v->remote = vifc->vifc_rmt_addr.s_addr;
        v->flags = vifc->vifc_flags;
        if (!mrtsock)
                v->flags |= VIFF_STATIC;
        v->threshold = vifc->vifc_threshold;
        v->bytes_in = 0;
        v->bytes_out = 0;
        v->pkt_in = 0;
        v->pkt_out = 0;
        v->link = dev->ifindex;
        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                v->link = dev->iflink;

        /* And finish update writing critical data */
        write_lock_bh(&mrt_lock);
        v->dev = dev;
#ifdef CONFIG_IP_PIMSM
        if (v->flags&VIFF_REGISTER)
                mrt->mroute_reg_vif_num = vifi;
#endif
        if (vifi+1 > mrt->maxvif)
                mrt->maxvif = vifi+1;
        write_unlock_bh(&mrt_lock);
        return 0;
}

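/* Look up a resolved (origin, group) entry; callers hold mrt_lock. */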
static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt,
                                         __be32 origin,
                                         __be32 mcastgrp)
{
        int line = MFC_HASH(mcastgrp, origin);
        struct mfc_cache *c;

        list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
                if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp)
                        return c;
        }
        return NULL;
}

/*
 *      Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
        struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
        if (c == NULL)
                return NULL;
        c->mfc_un.res.minvif = MAXVIFS;
        return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
        struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
        if (c == NULL)
                return NULL;
        skb_queue_head_init(&c->mfc_un.unres.unresolved);
        c->mfc_un.unres.expires = jiffies + 10*HZ;
        return c;
}

/*
 *      A cache entry has gone into a resolved state from queued
 */

static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
                               struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /*
         *      Play the pending entries through our router
         */

        while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

                        if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) {
                                nlh->nlmsg_len = (skb_tail_pointer(skb) -
                                                  (u8 *)nlh);
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = NLMSG_DATA(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }

                        rtnl_unicast(skb, net, NETLINK_CB(skb).pid);
                } else
                        ip_mr_forward(net, mrt, skb, c, 0);
        }
}

/*
 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *      expects the following bizarre scheme.
 *
 *      Called under mrt_lock.
 */

static int ipmr_cache_report(struct mr_table *mrt,
                             struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        const int ihl = ip_hdrlen(pkt);
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        int ret;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                msg = (struct igmpmsg *)skb_network_header(skb);
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = mrt->mroute_reg_vif_num;
                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
                                             sizeof(struct iphdr));
        } else
#endif
        {

        /*
         *      Copy the IP header
         */

        skb->network_header = skb->tail;
        skb_put(skb, ihl);
        skb_copy_to_linear_data(skb, pkt->data, ihl);
        ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
        msg = (struct igmpmsg *)skb_network_header(skb);
        msg->im_vif = vifi;
        skb_dst_set(skb, dst_clone(skb_dst(pkt)));

        /*
         *      Add our header
         */

        igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
        igmp->type      =
        msg->im_msgtype = assert;
        igmp->code      =       0;
        ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
        skb->transport_header = skb->network_header;
        }

        if (mrt->mroute_sk == NULL) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        ret = sock_queue_rcv_skb(mrt->mroute_sk, skb);
        if (ret < 0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}

/*
 *      Queue a packet for resolution and report it to mrouted. The
 *      unresolved queue is protected by mfc_unres_lock.
 */

static int
ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
{
        bool found = false;
        int err;
        struct mfc_cache *c;
        const struct iphdr *iph = ip_hdr(skb);

        spin_lock_bh(&mfc_unres_lock);
        list_for_each_entry(c, &mrt->mfc_unres_queue, list) {
                if (c->mfc_mcastgrp == iph->daddr &&
                    c->mfc_origin == iph->saddr) {
                        found = true;
                        break;
                }
        }

        if (!found) {
                /*
                 *      Create a new entry if allowable
                 */

                if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
                    (c = ipmr_cache_alloc_unres()) == NULL) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /*
                 *      Fill in the new cache entry
                 */
                c->mfc_parent   = -1;
                c->mfc_origin   = iph->saddr;
                c->mfc_mcastgrp = iph->daddr;

                /*
                 *      Reflect first query at mrouted.
                 */
                err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
                if (err < 0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_cache_free(c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&mrt->cache_resolve_queue_len);
                list_add(&c->list, &mrt->mfc_unres_queue);

                if (atomic_read(&mrt->cache_resolve_queue_len) == 1)
                        mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires);
        }

        /*
         *      See if we can append the packet
         */
        if (c->mfc_un.unres.unresolved.qlen > 3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/*
 *      MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc)
{
        int line;
        struct mfc_cache *c, *next;

        line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        write_lock_bh(&mrt_lock);
                        list_del(&c->list);
                        write_unlock_bh(&mrt_lock);

                        ipmr_cache_free(c);
                        return 0;
                }
        }
        return -ENOENT;
}

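/*
 * Add or update an MFC entry. An existing (origin, group) pair is updated
 * in place; otherwise a new entry is hashed in, and any packets waiting on
 * the unresolved queue for that pair are replayed through the router.
 */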
static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
                        struct mfcctl *mfc, int mrtsock)
{
        bool found = false;
        int line;
        struct mfc_cache *uc, *c;

        if (mfc->mfcc_parent >= MAXVIFS)
                return -ENFILE;

        line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        list_for_each_entry(c, &mrt->mfc_cache_array[line], list) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        found = true;
                        break;
                }
        }

        if (found) {
                write_lock_bh(&mrt_lock);
                c->mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->mfc_flags |= MFC_STATIC;
                write_unlock_bh(&mrt_lock);
                return 0;
        }

        if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c = ipmr_cache_alloc();
        if (c == NULL)
                return -ENOMEM;

        c->mfc_origin = mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
        c->mfc_parent = mfc->mfcc_parent;
        ipmr_update_thresholds(mrt, c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->mfc_flags |= MFC_STATIC;

        write_lock_bh(&mrt_lock);
        list_add(&c->list, &mrt->mfc_cache_array[line]);
        write_unlock_bh(&mrt_lock);

        /*
         *      Check to see if we resolved a queued list. If so we
         *      need to send on the frames and tidy up.
         */
        found = false;
        spin_lock_bh(&mfc_unres_lock);
        list_for_each_entry(uc, &mrt->mfc_unres_queue, list) {
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        list_del(&uc->list);
                        atomic_dec(&mrt->cache_resolve_queue_len);
                        found = true;
                        break;
                }
        }
        if (list_empty(&mrt->mfc_unres_queue))
                del_timer(&mrt->ipmr_expire_timer);
        spin_unlock_bh(&mfc_unres_lock);

        if (found) {
                ipmr_cache_resolve(net, mrt, uc, c);
                ipmr_cache_free(uc);
        }
        return 0;
}

/*
 *      Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct mr_table *mrt)
{
        int i;
        LIST_HEAD(list);
        struct mfc_cache *c, *next;

        /*
         *      Shut down all active vif entries
         */
        for (i = 0; i < mrt->maxvif; i++) {
                if (!(mrt->vif_table[i].flags&VIFF_STATIC))
                        vif_delete(mrt, i, 0, &list);
        }
        unregister_netdevice_many(&list);

        /*
         *      Wipe the cache
         */
        for (i = 0; i < MFC_LINES; i++) {
                list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
                        if (c->mfc_flags&MFC_STATIC)
                                continue;
                        write_lock_bh(&mrt_lock);
                        list_del(&c->list);
                        write_unlock_bh(&mrt_lock);

                        ipmr_cache_free(c);
                }
        }

        if (atomic_read(&mrt->cache_resolve_queue_len) != 0) {
                spin_lock_bh(&mfc_unres_lock);
                list_for_each_entry_safe(c, next, &mrt->mfc_unres_queue, list) {
                        list_del(&c->list);
                        ipmr_destroy_unres(mrt, c);
                }
                spin_unlock_bh(&mfc_unres_lock);
        }
}

static void mrtsock_destruct(struct sock *sk)
{
        struct net *net = sock_net(sk);
        struct mr_table *mrt;

        rtnl_lock();
        ipmr_for_each_table(mrt, net) {
                if (sk == mrt->mroute_sk) {
                        IPV4_DEVCONF_ALL(net, MC_FORWARDING)--;

                        write_lock_bh(&mrt_lock);
                        mrt->mroute_sk = NULL;
                        write_unlock_bh(&mrt_lock);

                        mroute_clean_tables(mrt);
                }
        }
        rtnl_unlock();
}

/*
 *      Socket options and virtual interface manipulation. The whole
 *      virtual interface system is a complete heap, but unfortunately
 *      that's how BSD mrouted happens to think. Maybe one day with a proper
 *      MOSPF/PIM router set up we can clean this up.
 */

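/*
 * Typical daemon usage (a sketch, not part of this file): the routing
 * daemon opens a raw IGMP socket and enables multicast routing with
 * MRT_INIT before configuring VIFs and forwarding cache entries:
 *
 *      int one = 1;
 *      int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *      // then MRT_ADD_VIF with a struct vifctl,
 *      // and MRT_ADD_MFC with a struct mfcctl
 */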
int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
{
        int ret;
        struct vifctl vif;
        struct mfcctl mfc;
        struct net *net = sock_net(sk);
        struct mr_table *mrt;

        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
        if (mrt == NULL)
                return -ENOENT;

        if (optname != MRT_INIT) {
                if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN))
                        return -EACCES;
        }

        switch (optname) {
        case MRT_INIT:
                if (sk->sk_type != SOCK_RAW ||
                    inet_sk(sk)->inet_num != IPPROTO_IGMP)
                        return -EOPNOTSUPP;
                if (optlen != sizeof(int))
                        return -ENOPROTOOPT;

                rtnl_lock();
                if (mrt->mroute_sk) {
                        rtnl_unlock();
                        return -EADDRINUSE;
                }

                ret = ip_ra_control(sk, 1, mrtsock_destruct);
                if (ret == 0) {
                        write_lock_bh(&mrt_lock);
                        mrt->mroute_sk = sk;
                        write_unlock_bh(&mrt_lock);

                        IPV4_DEVCONF_ALL(net, MC_FORWARDING)++;
                }
                rtnl_unlock();
                return ret;
        case MRT_DONE:
                if (sk != mrt->mroute_sk)
                        return -EACCES;
                return ip_ra_control(sk, 0, NULL);
        case MRT_ADD_VIF:
        case MRT_DEL_VIF:
                if (optlen != sizeof(vif))
                        return -EINVAL;
                if (copy_from_user(&vif, optval, sizeof(vif)))
                        return -EFAULT;
                if (vif.vifc_vifi >= MAXVIFS)
                        return -ENFILE;
                rtnl_lock();
                if (optname == MRT_ADD_VIF) {
                        ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk);
                } else {
                        ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
                }
                rtnl_unlock();
                return ret;

                /*
                 *      Manipulate the forwarding caches. These live
                 *      in a sort of kernel/user symbiosis.
                 */
        case MRT_ADD_MFC:
        case MRT_DEL_MFC:
                if (optlen != sizeof(mfc))
                        return -EINVAL;
                if (copy_from_user(&mfc, optval, sizeof(mfc)))
                        return -EFAULT;
                rtnl_lock();
                if (optname == MRT_DEL_MFC)
                        ret = ipmr_mfc_delete(mrt, &mfc);
                else
                        ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk);
                rtnl_unlock();
                return ret;
                /*
                 *      Control PIM assert.
                 */
        case MRT_ASSERT:
        {
                int v;
                if (get_user(v, (int __user *)optval))
                        return -EFAULT;
                mrt->mroute_do_assert = (v) ? 1 : 0;
                return 0;
        }
#ifdef CONFIG_IP_PIMSM
        case MRT_PIM:
        {
                int v;

                if (get_user(v, (int __user *)optval))
                        return -EFAULT;
                v = (v) ? 1 : 0;

                rtnl_lock();
                ret = 0;
                if (v != mrt->mroute_do_pim) {
                        mrt->mroute_do_pim = v;
                        mrt->mroute_do_assert = v;
                }
                rtnl_unlock();
                return ret;
        }
#endif
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
        case MRT_TABLE:
        {
                u32 v;

                if (optlen != sizeof(u32))
                        return -EINVAL;
                if (get_user(v, (u32 __user *)optval))
                        return -EFAULT;
                if (sk == mrt->mroute_sk)
                        return -EBUSY;

                rtnl_lock();
                ret = 0;
                if (!ipmr_new_table(net, v))
                        ret = -ENOMEM;
                raw_sk(sk)->ipmr_table = v;
                rtnl_unlock();
                return ret;
        }
#endif
        /*
         *      Spurious command, or MRT_VERSION which you cannot
         *      set.
         */
        default:
                return -ENOPROTOOPT;
        }
}

/*
 *      Getsockopt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
{
        int olr;
        int val;
        struct net *net = sock_net(sk);
        struct mr_table *mrt;

        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
        if (mrt == NULL)
                return -ENOENT;

        if (optname != MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
           optname != MRT_PIM &&
#endif
           optname != MRT_ASSERT)
                return -ENOPROTOOPT;

        if (get_user(olr, optlen))
                return -EFAULT;

        olr = min_t(unsigned int, olr, sizeof(int));
        if (olr < 0)
                return -EINVAL;

        if (put_user(olr, optlen))
                return -EFAULT;
        if (optname == MRT_VERSION)
                val = 0x0305;
#ifdef CONFIG_IP_PIMSM
        else if (optname == MRT_PIM)
                val = mrt->mroute_do_pim;
#endif
        else
                val = mrt->mroute_do_assert;
        if (copy_to_user(optval, &val, olr))
                return -EFAULT;
        return 0;
}

/*
 *      The IP multicast ioctl support routines.
 */

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
        struct sioc_sg_req sr;
        struct sioc_vif_req vr;
        struct vif_device *vif;
        struct mfc_cache *c;
        struct net *net = sock_net(sk);
        struct mr_table *mrt;

        mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
        if (mrt == NULL)
                return -ENOENT;

        switch (cmd) {
        case SIOCGETVIFCNT:
                if (copy_from_user(&vr, arg, sizeof(vr)))
                        return -EFAULT;
                if (vr.vifi >= mrt->maxvif)
                        return -EINVAL;
                read_lock(&mrt_lock);
                vif = &mrt->vif_table[vr.vifi];
                if (VIF_EXISTS(mrt, vr.vifi)) {
                        vr.icount = vif->pkt_in;
                        vr.ocount = vif->pkt_out;
                        vr.ibytes = vif->bytes_in;
                        vr.obytes = vif->bytes_out;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg, &vr, sizeof(vr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        case SIOCGETSGCNT:
                if (copy_from_user(&sr, arg, sizeof(sr)))
                        return -EFAULT;

                read_lock(&mrt_lock);
                c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr);
                if (c) {
                        sr.pktcnt = c->mfc_un.res.pkt;
                        sr.bytecnt = c->mfc_un.res.bytes;
                        sr.wrong_if = c->mfc_un.res.wrong_if;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg, &sr, sizeof(sr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        default:
                return -ENOIOCTLCMD;
        }
}


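/*
 * Device notifier: when a device unregisters, remove every VIF that
 * references it, in all tables.
 */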
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;
        struct net *net = dev_net(dev);
        struct mr_table *mrt;
        struct vif_device *v;
        int ct;
        LIST_HEAD(list);

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;

        ipmr_for_each_table(mrt, net) {
                v = &mrt->vif_table[0];
                for (ct = 0; ct < mrt->maxvif; ct++, v++) {
                        if (v->dev == dev)
                                vif_delete(mrt, ct, 1, &list);
                }
        }
        unregister_netdevice_many(&list);
        return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier = {
        .notifier_call = ipmr_device_event,
};

/*
 *      Encapsulate a packet by attaching a valid IPIP header to it.
 *      This avoids tunnel drivers and other mess and gives us the speed so
 *      important for multicast video.
 */

static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct iphdr *iph;
        struct iphdr *old_iph = ip_hdr(skb);

        skb_push(skb, sizeof(struct iphdr));
        skb->transport_header = skb->network_header;
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);

        iph->version    =       4;
        iph->tos        =       old_iph->tos;
        iph->ttl        =       old_iph->ttl;
        iph->frag_off   =       0;
        iph->daddr      =       daddr;
        iph->saddr      =       saddr;
        iph->protocol   =       IPPROTO_IPIP;
        iph->ihl        =       5;
        iph->tot_len    =       htons(skb->len);
        ip_select_ident(iph, skb_dst(skb), NULL);
        ip_send_check(iph);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        nf_reset(skb);
}

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
        struct ip_options *opt = &(IPCB(skb)->opt);

        IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);

        if (unlikely(opt->optlen))
                ip_forward_options(skb);

        return dst_output(skb);
}

/*
 *      Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
                            struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct vif_device *vif = &mrt->vif_table[vifi];
        struct net_device *dev;
        struct rtable *rt;
        int    encap = 0;

        if (vif->dev == NULL)
                goto out_free;

#ifdef CONFIG_IP_PIMSM
        if (vif->flags & VIFF_REGISTER) {
                vif->pkt_out++;
                vif->bytes_out += skb->len;
                vif->dev->stats.tx_bytes += skb->len;
                vif->dev->stats.tx_packets++;
                ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
                goto out_free;
        }
#endif

        if (vif->flags&VIFF_TUNNEL) {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = vif->remote,
                                                .saddr = vif->local,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(net, &rt, &fl))
                        goto out_free;
                encap = sizeof(struct iphdr);
        } else {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = iph->daddr,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(net, &rt, &fl))
                        goto out_free;
        }

        dev = rt->u.dst.dev;

        if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
                /* Do not fragment multicasts. Alas, IPv4 does not
                   allow sending ICMP here, so the packets will simply
                   disappear into a black hole.
                 */

                IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
                ip_rt_put(rt);
                goto out_free;
        }

        encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

        if (skb_cow(skb, encap)) {
                ip_rt_put(rt);
                goto out_free;
        }

        vif->pkt_out++;
        vif->bytes_out += skb->len;

        skb_dst_drop(skb);
        skb_dst_set(skb, &rt->u.dst);
        ip_decrease_ttl(ip_hdr(skb));

        /* FIXME: forward and output firewalls used to be called here.
         * What do we do with netfilter? -- RR */
        if (vif->flags & VIFF_TUNNEL) {
                ip_encap(skb, vif->local, vif->remote);
                /* FIXME: extra output firewall step used to be here. --RR */
                vif->dev->stats.tx_packets++;
                vif->dev->stats.tx_bytes += skb->len;
        }

        IPCB(skb)->flags |= IPSKB_FORWARDED;

        /*
         * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
         * locally not only before forwarding, but also after forwarding on
         * all output interfaces. Clearly, if the mrouter runs a multicasting
         * program, it should receive packets regardless of which interface
         * the program has joined on. Otherwise the program would have to
         * join on all interfaces. On the other hand, a multihomed host (or a
         * router, but not an mrouter) cannot join on more than one
         * interface - that would result in receiving duplicate packets.
         */
        NF_HOOK(NFPROTO_IPV4, NF_INET_FORWARD, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;

out_free:
        kfree_skb(skb);
}
1612
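/*
 * Return the index of the vif bound to "dev", or -1 if no vif matches
 * (the search below simply runs off the low end of the table).
 */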
1613 static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
1614 {
1615         int ct;
1616
1617         for (ct = mrt->maxvif-1; ct >= 0; ct--) {
1618                 if (mrt->vif_table[ct].dev == dev)
1619                         break;
1620         }
1621         return ct;
1622 }
1623
1624 /* "local" means that we should preserve one skb (for local delivery) */
1625
1626 static int ip_mr_forward(struct net *net, struct mr_table *mrt,
1627                          struct sk_buff *skb, struct mfc_cache *cache,
1628                          int local)
1629 {
1630         int psend = -1;
1631         int vif, ct;
1632
1633         vif = cache->mfc_parent;
1634         cache->mfc_un.res.pkt++;
1635         cache->mfc_un.res.bytes += skb->len;
1636
1637         /*
1638          * Wrong interface: drop packet and (maybe) send PIM assert.
1639          */
1640         if (mrt->vif_table[vif].dev != skb->dev) {
1641                 int true_vifi;
1642
1643                 if (skb_rtable(skb)->fl.iif == 0) {
1644                         /* It is our own packet, looped back.
1645                            Very complicated situation...
1646
1647                            The best workaround until the routing daemons
1648                            are fixed is not to redistribute a packet that
1649                            was sent through the wrong interface. It means
1650                            multicast applications WILL NOT work for (S,G)
1651                            entries whose default multicast route points to
1652                            the wrong oif. In any case, it is not a good
1653                            idea to run multicast applications on a router.
1654                          */
1655                         goto dont_forward;
1656                 }
1657
1658                 cache->mfc_un.res.wrong_if++;
1659                 true_vifi = ipmr_find_vif(mrt, skb->dev);
1660
1661                 if (true_vifi >= 0 && mrt->mroute_do_assert &&
1662                     /* PIM-SM uses asserts when switching from the RPT to
1663                        the SPT, so we cannot insist that the packet arrived
1664                        on an oif. That is bad, but otherwise we would have
1665                        to move a large chunk of pimd into the kernel. Ough... --ANK
1666                      */
1667                     (mrt->mroute_do_pim ||
1668                      cache->mfc_un.res.ttls[true_vifi] < 255) &&
1669                     time_after(jiffies,
1670                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1671                         cache->mfc_un.res.last_assert = jiffies;
1672                         ipmr_cache_report(mrt, skb, true_vifi, IGMPMSG_WRONGVIF);
1673                 }
1674                 goto dont_forward;
1675         }
1676
1677         mrt->vif_table[vif].pkt_in++;
1678         mrt->vif_table[vif].bytes_in += skb->len;
1679
1680         /*
1681          *      Forward the frame
1682          */
1683         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1684                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1685                         if (psend != -1) {
1686                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1687                                 if (skb2)
1688                                         ipmr_queue_xmit(net, mrt, skb2, cache,
1689                                                         psend);
1690                         }
1691                         psend = ct;
1692                 }
1693         }
1694         if (psend != -1) {
1695                 if (local) {
1696                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1697                         if (skb2)
1698                                 ipmr_queue_xmit(net, mrt, skb2, cache, psend);
1699                 } else {
1700                         ipmr_queue_xmit(net, mrt, skb, cache, psend);
1701                         return 0;
1702                 }
1703         }
1704
1705 dont_forward:
1706         if (!local)
1707                 kfree_skb(skb);
1708         return 0;
1709 }
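/*
 * A note on the "psend" pattern above (illustration only, not extra
 * kernel code): the loop lags the transmit by one eligible vif, so
 * every send except the last operates on a clone, and the final send
 * can consume the original skb when no local copy is required:
 *
 *      psend = -1;
 *      for each eligible vif ct {
 *              if (psend != -1)
 *                      xmit(clone(skb), psend);
 *              psend = ct;
 *      }
 *      if (psend != -1)
 *              xmit(skb, psend);       <- original skb, not a clone
 *
 * This saves one skb_clone() per packet on the common single-oif path.
 */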
1710
1711
1712 /*
1713  *      Multicast packets for forwarding arrive here
1714  */
1715
1716 int ip_mr_input(struct sk_buff *skb)
1717 {
1718         struct mfc_cache *cache;
1719         struct net *net = dev_net(skb->dev);
1720         int local = skb_rtable(skb)->rt_flags & RTCF_LOCAL;
1721         struct mr_table *mrt;
1722         int err;
1723
1724         /* A packet that is looped back after forwarding must not be
1725            forwarded a second time, but it can still be delivered locally.
1726          */
1727         if (IPCB(skb)->flags & IPSKB_FORWARDED)
1728                 goto dont_forward;
1729
1730         err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt);
1731         if (err < 0)
1732                 return err;
1733
1734         if (!local) {
1735                 if (IPCB(skb)->opt.router_alert) {
1736                         if (ip_call_ra_chain(skb))
1737                                 return 0;
1738                 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1739                         /* IGMPv1 (and broken IGMPv2 implementations such
1740                            as Cisco IOS <= 11.2(8)) do not put the router
1741                            alert option in IGMP packets destined to
1742                            routable groups. That is very bad, because it
1743                            means we can forward NO IGMP messages at all.
1744                          */
1745                         read_lock(&mrt_lock);
1746                         if (mrt->mroute_sk) {
1747                                 nf_reset(skb);
1748                                 raw_rcv(mrt->mroute_sk, skb);
1749                                 read_unlock(&mrt_lock);
1750                                 return 0;
1751                         }
1752                         read_unlock(&mrt_lock);
1753                 }
1754         }
1755
1756         read_lock(&mrt_lock);
1757         cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1758
1759         /*
1760          *      No usable cache entry
1761          */
1762         if (cache == NULL) {
1763                 int vif;
1764
1765                 if (local) {
1766                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1767                         ip_local_deliver(skb);
1768                         if (skb2 == NULL) {
1769                                 read_unlock(&mrt_lock);
1770                                 return -ENOBUFS;
1771                         }
1772                         skb = skb2;
1773                 }
1774
1775                 vif = ipmr_find_vif(mrt, skb->dev);
1776                 if (vif >= 0) {
1777                         int err2 = ipmr_cache_unresolved(mrt, vif, skb);
1778                         read_unlock(&mrt_lock);
1779
1780                         return err2;
1781                 }
1782                 read_unlock(&mrt_lock);
1783                 kfree_skb(skb);
1784                 return -ENODEV;
1785         }
1786
1787         ip_mr_forward(net, mrt, skb, cache, local);
1788
1789         read_unlock(&mrt_lock);
1790
1791         if (local)
1792                 return ip_local_deliver(skb);
1793
1794         return 0;
1795
1796 dont_forward:
1797         if (local)
1798                 return ip_local_deliver(skb);
1799         kfree_skb(skb);
1800         return 0;
1801 }
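/*
 * Hedged userspace sketch (not part of this file): the raw_rcv()
 * delivery in ip_mr_input() only happens once a daemon has registered
 * itself as mrt->mroute_sk. A minimal mrouted-style daemon would do
 * roughly this, assuming <linux/mroute.h> is available:
 *
 *      int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int one = 1;
 *
 *      setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 * After MRT_INIT succeeds, IGMP packets and IGMPMSG_* upcalls (such as
 * IGMPMSG_NOCACHE and IGMPMSG_WRONGVIF) arrive via recv() on s.
 */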
1802
1803 #ifdef CONFIG_IP_PIMSM
1804 static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
1805                      unsigned int pimlen)
1806 {
1807         struct net_device *reg_dev = NULL;
1808         struct iphdr *encap;
1809
1810         encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1811         /*
1812            Check that:
1813            a. packet is really destined to a multicast group
1814            b. packet is not a NULL-REGISTER
1815            c. packet is not truncated
1816          */
1817         if (!ipv4_is_multicast(encap->daddr) ||
1818             encap->tot_len == 0 ||
1819             ntohs(encap->tot_len) + pimlen > skb->len)
1820                 return 1;
1821
1822         read_lock(&mrt_lock);
1823         if (mrt->mroute_reg_vif_num >= 0)
1824                 reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
1825         if (reg_dev)
1826                 dev_hold(reg_dev);
1827         read_unlock(&mrt_lock);
1828
1829         if (reg_dev == NULL)
1830                 return 1;
1831
1832         skb->mac_header = skb->network_header;
1833         skb_pull(skb, (u8 *)encap - skb->data);
1834         skb_reset_network_header(skb);
1835         skb->protocol = htons(ETH_P_IP);
1836         skb->ip_summed = CHECKSUM_NONE;
1837         skb->pkt_type = PACKET_HOST;
1838
1839         skb_tunnel_rx(skb, reg_dev);
1840
1841         netif_rx(skb);
1842         dev_put(reg_dev);
1843
1844         return 0;
1845 }
1846 #endif
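/*
 * For reference, the PIM register packet that __pim_rcv() unwraps looks
 * like this on the wire (a sketch; RFC 4601 has the authoritative
 * layout):
 *
 *      outer IP header (protocol IPPROTO_PIM)
 *      PIM register header     <- pimlen bytes past the transport header
 *      inner IP header         <- "encap" points here
 *      inner payload
 *
 * skb_pull() strips everything up to the inner header, so the packet
 * can be re-injected on the register vif as an ordinary multicast
 * datagram.
 */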
1847
1848 #ifdef CONFIG_IP_PIMSM_V1
1849 /*
1850  * Handle PIMv1 messages, which arrive encapsulated in IGMP packets
1851  */
1852
1853 int pim_rcv_v1(struct sk_buff *skb)
1854 {
1855         struct igmphdr *pim;
1856         struct net *net = dev_net(skb->dev);
1857         struct mr_table *mrt;
1858
1859         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1860                 goto drop;
1861
1862         pim = igmp_hdr(skb);
1863
1864         if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1865                 goto drop;
1866
1867         if (!mrt->mroute_do_pim ||
1868             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1869                 goto drop;
1870
1871         if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1872 drop:
1873                 kfree_skb(skb);
1874         }
1875         return 0;
1876 }
1877 #endif
1878
1879 #ifdef CONFIG_IP_PIMSM_V2
1880 static int pim_rcv(struct sk_buff *skb)
1881 {
1882         struct pimreghdr *pim;
1883         struct net *net = dev_net(skb->dev);
1884         struct mr_table *mrt;
1885
1886         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1887                 goto drop;
1888
1889         pim = (struct pimreghdr *)skb_transport_header(skb);
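        /*
         * Accept only a non-NULL register of the right type whose
         * checksum verifies either over the PIM header alone or over
         * the whole packet; implementations disagree on which of the
         * two ranges the register checksum should cover.
         */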
1890         if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
1891             (pim->flags & PIM_NULL_REGISTER) ||
1892             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1893              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1894                 goto drop;
1895
1896         if (ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt) < 0)
1897                 goto drop;
1898
1899         if (__pim_rcv(mrt, skb, sizeof(*pim))) {
1900 drop:
1901                 kfree_skb(skb);
1902         }
1903         return 0;
1904 }
1905 #endif
1906
1907 static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
1908                               struct mfc_cache *c, struct rtmsg *rtm)
1909 {
1910         int ct;
1911         struct rtnexthop *nhp;
1912         u8 *b = skb_tail_pointer(skb);
1913         struct rtattr *mp_head;
1914
1915         /* If the cache entry is unresolved, don't try to fill in IIF and OIF */
1916         if (c->mfc_parent >= MAXVIFS)
1917                 return -ENOENT;
1918
1919         if (VIF_EXISTS(mrt, c->mfc_parent))
1920                 RTA_PUT(skb, RTA_IIF, 4, &mrt->vif_table[c->mfc_parent].dev->ifindex);
1921
1922         mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1923
1924         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1925                 if (VIF_EXISTS(mrt, ct) && c->mfc_un.res.ttls[ct] < 255) {
1926                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1927                                 goto rtattr_failure;
1928                         nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1929                         nhp->rtnh_flags = 0;
1930                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1931                         nhp->rtnh_ifindex = mrt->vif_table[ct].dev->ifindex;
1932                         nhp->rtnh_len = sizeof(*nhp);
1933                 }
1934         }
1935         mp_head->rta_type = RTA_MULTIPATH;
1936         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1937         rtm->rtm_type = RTN_MULTICAST;
1938         return 1;
1939
1940 rtattr_failure:
1941         nlmsg_trim(skb, b);
1942         return -EMSGSIZE;
1943 }
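/*
 * The netlink payload produced above is, in sketch form:
 *
 *      RTA_IIF         ifindex of the parent (input) vif
 *      RTA_MULTIPATH   array of struct rtnexthop, one per output vif,
 *                      with rtnh_hops reused to carry that vif's TTL
 *                      threshold rather than a real hop count
 */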
1944
1945 int ipmr_get_route(struct net *net,
1946                    struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1947 {
1948         int err;
1949         struct mr_table *mrt;
1950         struct mfc_cache *cache;
1951         struct rtable *rt = skb_rtable(skb);
1952
1953         mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
1954         if (mrt == NULL)
1955                 return -ENOENT;
1956
1957         read_lock(&mrt_lock);
1958         cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst);
1959
1960         if (cache == NULL) {
1961                 struct sk_buff *skb2;
1962                 struct iphdr *iph;
1963                 struct net_device *dev;
1964                 int vif;
1965
1966                 if (nowait) {
1967                         read_unlock(&mrt_lock);
1968                         return -EAGAIN;
1969                 }
1970
1971                 dev = skb->dev;
1972                 if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) {
1973                         read_unlock(&mrt_lock);
1974                         return -ENODEV;
1975                 }
1976                 skb2 = skb_clone(skb, GFP_ATOMIC);
1977                 if (!skb2) {
1978                         read_unlock(&mrt_lock);
1979                         return -ENOMEM;
1980                 }
1981
1982                 skb_push(skb2, sizeof(struct iphdr));
1983                 skb_reset_network_header(skb2);
1984                 iph = ip_hdr(skb2);
1985                 iph->ihl = sizeof(struct iphdr) >> 2;
1986                 iph->saddr = rt->rt_src;
1987                 iph->daddr = rt->rt_dst;
1988                 iph->version = 0;
1989                 err = ipmr_cache_unresolved(mrt, vif, skb2);
1990                 read_unlock(&mrt_lock);
1991                 return err;
1992         }
1993
1994         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1995                 cache->mfc_flags |= MFC_NOTIFY;
1996         err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
1997         read_unlock(&mrt_lock);
1998         return err;
1999 }
2000
2001 static int ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
2002                             u32 pid, u32 seq, struct mfc_cache *c)
2003 {
2004         struct nlmsghdr *nlh;
2005         struct rtmsg *rtm;
2006
2007         nlh = nlmsg_put(skb, pid, seq, RTM_NEWROUTE, sizeof(*rtm), NLM_F_MULTI);
2008         if (nlh == NULL)
2009                 return -EMSGSIZE;
2010
2011         rtm = nlmsg_data(nlh);
2012         rtm->rtm_family   = RTNL_FAMILY_IPMR;
2013         rtm->rtm_dst_len  = 32;
2014         rtm->rtm_src_len  = 32;
2015         rtm->rtm_tos      = 0;
2016         rtm->rtm_table    = mrt->id;
2017         NLA_PUT_U32(skb, RTA_TABLE, mrt->id);
2018         rtm->rtm_type     = RTN_MULTICAST;
2019         rtm->rtm_scope    = RT_SCOPE_UNIVERSE;
2020         rtm->rtm_protocol = RTPROT_UNSPEC;
2021         rtm->rtm_flags    = 0;
2022
2023         NLA_PUT_BE32(skb, RTA_SRC, c->mfc_origin);
2024         NLA_PUT_BE32(skb, RTA_DST, c->mfc_mcastgrp);
2025
2026         if (__ipmr_fill_mroute(mrt, skb, c, rtm) < 0)
2027                 goto nla_put_failure;
2028
2029         return nlmsg_end(skb, nlh);
2030
2031 nla_put_failure:
2032         nlmsg_cancel(skb, nlh);
2033         return -EMSGSIZE;
2034 }
2035
2036 static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb)
2037 {
2038         struct net *net = sock_net(skb->sk);
2039         struct mr_table *mrt;
2040         struct mfc_cache *mfc;
2041         unsigned int t = 0, s_t;
2042         unsigned int h = 0, s_h;
2043         unsigned int e = 0, s_e;
2044
2045         s_t = cb->args[0];
2046         s_h = cb->args[1];
2047         s_e = cb->args[2];
2048
2049         read_lock(&mrt_lock);
2050         ipmr_for_each_table(mrt, net) {
2051                 if (t < s_t)
2052                         goto next_table;
2053                 if (t > s_t)
2054                         s_h = 0;
2055                 for (h = s_h; h < MFC_LINES; h++) {
2056                         list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) {
2057                                 if (e < s_e)
2058                                         goto next_entry;
2059                                 if (ipmr_fill_mroute(mrt, skb,
2060                                                      NETLINK_CB(cb->skb).pid,
2061                                                      cb->nlh->nlmsg_seq,
2062                                                      mfc) < 0)
2063                                         goto done;
2064 next_entry:
2065                                 e++;
2066                         }
2067                         e = s_e = 0;
2068                 }
2069                 s_h = 0;
2070 next_table:
2071                 t++;
2072         }
2073 done:
2074         read_unlock(&mrt_lock);
2075
2076         cb->args[2] = e;
2077         cb->args[1] = h;
2078         cb->args[0] = t;
2079
2080         return skb->len;
2081 }
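/*
 * Dump resumption sketch: cb->args[] records how far the previous
 * invocation got - args[0] is the table index, args[1] the hash line
 * and args[2] the entry within that line - so when the skb fills up,
 * the next call can skip straight past everything already emitted.
 */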
2082
2083 #ifdef CONFIG_PROC_FS
2084 /*
2085  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
2086  */
2087 struct ipmr_vif_iter {
2088         struct seq_net_private p;
2089         struct mr_table *mrt;
2090         int ct;
2091 };
2092
2093 static struct vif_device *ipmr_vif_seq_idx(struct net *net,
2094                                            struct ipmr_vif_iter *iter,
2095                                            loff_t pos)
2096 {
2097         struct mr_table *mrt = iter->mrt;
2098
2099         for (iter->ct = 0; iter->ct < mrt->maxvif; ++iter->ct) {
2100                 if (!VIF_EXISTS(mrt, iter->ct))
2101                         continue;
2102                 if (pos-- == 0)
2103                         return &mrt->vif_table[iter->ct];
2104         }
2105         return NULL;
2106 }
2107
2108 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
2109         __acquires(mrt_lock)
2110 {
2111         struct ipmr_vif_iter *iter = seq->private;
2112         struct net *net = seq_file_net(seq);
2113         struct mr_table *mrt;
2114
2115         mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2116         if (mrt == NULL)
2117                 return ERR_PTR(-ENOENT);
2118
2119         iter->mrt = mrt;
2120
2121         read_lock(&mrt_lock);
2122         return *pos ? ipmr_vif_seq_idx(net, seq->private, *pos - 1)
2123                 : SEQ_START_TOKEN;
2124 }
2125
2126 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2127 {
2128         struct ipmr_vif_iter *iter = seq->private;
2129         struct net *net = seq_file_net(seq);
2130         struct mr_table *mrt = iter->mrt;
2131
2132         ++*pos;
2133         if (v == SEQ_START_TOKEN)
2134                 return ipmr_vif_seq_idx(net, iter, 0);
2135
2136         while (++iter->ct < mrt->maxvif) {
2137                 if (!VIF_EXISTS(mrt, iter->ct))
2138                         continue;
2139                 return &mrt->vif_table[iter->ct];
2140         }
2141         return NULL;
2142 }
2143
2144 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
2145         __releases(mrt_lock)
2146 {
2147         read_unlock(&mrt_lock);
2148 }
2149
2150 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
2151 {
2152         struct ipmr_vif_iter *iter = seq->private;
2153         struct mr_table *mrt = iter->mrt;
2154
2155         if (v == SEQ_START_TOKEN) {
2156                 seq_puts(seq,
2157                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
2158         } else {
2159                 const struct vif_device *vif = v;
2160                 const char *name =  vif->dev ? vif->dev->name : "none";
2161
2162                 seq_printf(seq,
2163                            "%2td %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
2164                            vif - mrt->vif_table,
2165                            name, vif->bytes_in, vif->pkt_in,
2166                            vif->bytes_out, vif->pkt_out,
2167                            vif->flags, vif->local, vif->remote);
2168         }
2169         return 0;
2170 }
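/*
 * Example /proc/net/ip_mr_vif output (values illustrative only; the
 * column widths come from the format string above, and the Local and
 * Remote addresses are raw __be32 values printed as hex):
 *
 *      Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *       0 eth0           1500      10      1500      10 00000 0100000A 00000000
 */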
2171
2172 static const struct seq_operations ipmr_vif_seq_ops = {
2173         .start = ipmr_vif_seq_start,
2174         .next  = ipmr_vif_seq_next,
2175         .stop  = ipmr_vif_seq_stop,
2176         .show  = ipmr_vif_seq_show,
2177 };
2178
2179 static int ipmr_vif_open(struct inode *inode, struct file *file)
2180 {
2181         return seq_open_net(inode, file, &ipmr_vif_seq_ops,
2182                             sizeof(struct ipmr_vif_iter));
2183 }
2184
2185 static const struct file_operations ipmr_vif_fops = {
2186         .owner   = THIS_MODULE,
2187         .open    = ipmr_vif_open,
2188         .read    = seq_read,
2189         .llseek  = seq_lseek,
2190         .release = seq_release_net,
2191 };
2192
2193 struct ipmr_mfc_iter {
2194         struct seq_net_private p;
2195         struct mr_table *mrt;
2196         struct list_head *cache;
2197         int ct;
2198 };
2199
2200
2201 static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net,
2202                                           struct ipmr_mfc_iter *it, loff_t pos)
2203 {
2204         struct mr_table *mrt = it->mrt;
2205         struct mfc_cache *mfc;
2206
2207         read_lock(&mrt_lock);
2208         for (it->ct = 0; it->ct < MFC_LINES; it->ct++) {
2209                 it->cache = &mrt->mfc_cache_array[it->ct];
2210                 list_for_each_entry(mfc, it->cache, list)
2211                         if (pos-- == 0)
2212                                 return mfc;
2213         }
2214         read_unlock(&mrt_lock);
2215
2216         spin_lock_bh(&mfc_unres_lock);
2217         it->cache = &mrt->mfc_unres_queue;
2218         list_for_each_entry(mfc, it->cache, list)
2219                 if (pos-- == 0)
2220                         return mfc;
2221         spin_unlock_bh(&mfc_unres_lock);
2222
2223         it->cache = NULL;
2224         return NULL;
2225 }
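/*
 * Locking note: when ipmr_mfc_seq_idx() stops inside mfc_cache_array
 * it returns with mrt_lock read-held; when it stops inside
 * mfc_unres_queue it returns with mfc_unres_lock held instead.
 * ipmr_mfc_seq_stop() inspects it->cache to release whichever lock is
 * still taken.
 */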
2226
2227
2228 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
2229 {
2230         struct ipmr_mfc_iter *it = seq->private;
2231         struct net *net = seq_file_net(seq);
2232         struct mr_table *mrt;
2233
2234         mrt = ipmr_get_table(net, RT_TABLE_DEFAULT);
2235         if (mrt == NULL)
2236                 return ERR_PTR(-ENOENT);
2237
2238         it->mrt = mrt;
2239         it->cache = NULL;
2240         it->ct = 0;
2241         return *pos ? ipmr_mfc_seq_idx(net, seq->private, *pos - 1)
2242                 : SEQ_START_TOKEN;
2243 }
2244
2245 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2246 {
2247         struct mfc_cache *mfc = v;
2248         struct ipmr_mfc_iter *it = seq->private;
2249         struct net *net = seq_file_net(seq);
2250         struct mr_table *mrt = it->mrt;
2251
2252         ++*pos;
2253
2254         if (v == SEQ_START_TOKEN)
2255                 return ipmr_mfc_seq_idx(net, seq->private, 0);
2256
2257         if (mfc->list.next != it->cache)
2258                 return list_entry(mfc->list.next, struct mfc_cache, list);
2259
2260         if (it->cache == &mrt->mfc_unres_queue)
2261                 goto end_of_list;
2262
2263         BUG_ON(it->cache != &mrt->mfc_cache_array[it->ct]);
2264
2265         while (++it->ct < MFC_LINES) {
2266                 it->cache = &mrt->mfc_cache_array[it->ct];
2267                 if (list_empty(it->cache))
2268                         continue;
2269                 return list_first_entry(it->cache, struct mfc_cache, list);
2270         }
2271
2272         /* exhausted cache_array, show unresolved */
2273         read_unlock(&mrt_lock);
2274         it->cache = &mrt->mfc_unres_queue;
2275         it->ct = 0;
2276
2277         spin_lock_bh(&mfc_unres_lock);
2278         if (!list_empty(it->cache))
2279                 return list_first_entry(it->cache, struct mfc_cache, list);
2280
2281  end_of_list:
2282         spin_unlock_bh(&mfc_unres_lock);
2283         it->cache = NULL;
2284
2285         return NULL;
2286 }
2287
2288 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
2289 {
2290         struct ipmr_mfc_iter *it = seq->private;
2291         struct mr_table *mrt = it->mrt;
2292
2293         if (it->cache == &mrt->mfc_unres_queue)
2294                 spin_unlock_bh(&mfc_unres_lock);
2295         else if (it->cache == &mrt->mfc_cache_array[it->ct])
2296                 read_unlock(&mrt_lock);
2297 }
2298
2299 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
2300 {
2301         int n;
2302
2303         if (v == SEQ_START_TOKEN) {
2304                 seq_puts(seq,
2305                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
2306         } else {
2307                 const struct mfc_cache *mfc = v;
2308                 const struct ipmr_mfc_iter *it = seq->private;
2309                 const struct mr_table *mrt = it->mrt;
2310
2311                 seq_printf(seq, "%08X %08X %-3hd",
2312                            (__force u32) mfc->mfc_mcastgrp,
2313                            (__force u32) mfc->mfc_origin,
2314                            mfc->mfc_parent);
2315
2316                 if (it->cache != &mrt->mfc_unres_queue) {
2317                         seq_printf(seq, " %8lu %8lu %8lu",
2318                                    mfc->mfc_un.res.pkt,
2319                                    mfc->mfc_un.res.bytes,
2320                                    mfc->mfc_un.res.wrong_if);
2321                         for (n = mfc->mfc_un.res.minvif;
2322                              n < mfc->mfc_un.res.maxvif; n++) {
2323                                 if (VIF_EXISTS(mrt, n) &&
2324                                     mfc->mfc_un.res.ttls[n] < 255)
2325                                         seq_printf(seq,
2326                                            " %2d:%-3d",
2327                                            n, mfc->mfc_un.res.ttls[n]);
2328                         }
2329                 } else {
2330                         /* unresolved mfc_caches don't contain
2331                          * pkt, bytes and wrong_if values
2332                          */
2333                         seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
2334                 }
2335                 seq_putc(seq, '\n');
2336         }
2337         return 0;
2338 }
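/*
 * Example /proc/net/ip_mr_cache line (values illustrative only; Group
 * and Origin are the stored __be32 values printed as hex, so their
 * byte order depends on the host): a resolved entry forwarding to
 * vifs 1 and 2 with TTL threshold 1 might read roughly:
 *
 *      Group    Origin   Iif     Pkts    Bytes    Wrong Oifs
 *      010000E0 0A00000A 0         12     1024        0  1:1   2:1
 */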
2339
2340 static const struct seq_operations ipmr_mfc_seq_ops = {
2341         .start = ipmr_mfc_seq_start,
2342         .next  = ipmr_mfc_seq_next,
2343         .stop  = ipmr_mfc_seq_stop,
2344         .show  = ipmr_mfc_seq_show,
2345 };
2346
2347 static int ipmr_mfc_open(struct inode *inode, struct file *file)
2348 {
2349         return seq_open_net(inode, file, &ipmr_mfc_seq_ops,
2350                             sizeof(struct ipmr_mfc_iter));
2351 }
2352
2353 static const struct file_operations ipmr_mfc_fops = {
2354         .owner   = THIS_MODULE,
2355         .open    = ipmr_mfc_open,
2356         .read    = seq_read,
2357         .llseek  = seq_lseek,
2358         .release = seq_release_net,
2359 };
2360 #endif
2361
2362 #ifdef CONFIG_IP_PIMSM_V2
2363 static const struct net_protocol pim_protocol = {
2364         .handler        =       pim_rcv,
2365         .netns_ok       =       1,
2366 };
2367 #endif
2368
2369
2370 /*
2371  *      Setup for IP multicast routing
2372  */
2373 static int __net_init ipmr_net_init(struct net *net)
2374 {
2375         int err;
2376
2377         err = ipmr_rules_init(net);
2378         if (err < 0)
2379                 goto fail;
2380
2381 #ifdef CONFIG_PROC_FS
2382         err = -ENOMEM;
2383         if (!proc_net_fops_create(net, "ip_mr_vif", 0, &ipmr_vif_fops))
2384                 goto proc_vif_fail;
2385         if (!proc_net_fops_create(net, "ip_mr_cache", 0, &ipmr_mfc_fops))
2386                 goto proc_cache_fail;
2387 #endif
2388         return 0;
2389
2390 #ifdef CONFIG_PROC_FS
2391 proc_cache_fail:
2392         proc_net_remove(net, "ip_mr_vif");
2393 proc_vif_fail:
2394         ipmr_rules_exit(net);
2395 #endif
2396 fail:
2397         return err;
2398 }
2399
2400 static void __net_exit ipmr_net_exit(struct net *net)
2401 {
2402 #ifdef CONFIG_PROC_FS
2403         proc_net_remove(net, "ip_mr_cache");
2404         proc_net_remove(net, "ip_mr_vif");
2405 #endif
2406         ipmr_rules_exit(net);
2407 }
2408
2409 static struct pernet_operations ipmr_net_ops = {
2410         .init = ipmr_net_init,
2411         .exit = ipmr_net_exit,
2412 };
2413
2414 int __init ip_mr_init(void)
2415 {
2416         int err;
2417
2418         mrt_cachep = kmem_cache_create("ip_mrt_cache",
2419                                        sizeof(struct mfc_cache),
2420                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
2421                                        NULL);
2422         if (!mrt_cachep)
2423                 return -ENOMEM;
2424
2425         err = register_pernet_subsys(&ipmr_net_ops);
2426         if (err)
2427                 goto reg_pernet_fail;
2428
2429         err = register_netdevice_notifier(&ip_mr_notifier);
2430         if (err)
2431                 goto reg_notif_fail;
2432 #ifdef CONFIG_IP_PIMSM_V2
2433         if (inet_add_protocol(&pim_protocol, IPPROTO_PIM) < 0) {
2434                 printk(KERN_ERR "ip_mr_init: can't add PIM protocol\n");
2435                 err = -EAGAIN;
2436                 goto add_proto_fail;
2437         }
2438 #endif
2439         rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE, NULL, ipmr_rtm_dumproute);
2440         return 0;
2441
2442 #ifdef CONFIG_IP_PIMSM_V2
2443 add_proto_fail:
2444         unregister_netdevice_notifier(&ip_mr_notifier);
2445 #endif
2446 reg_notif_fail:
2447         unregister_pernet_subsys(&ipmr_net_ops);
2448 reg_pernet_fail:
2449         kmem_cache_destroy(mrt_cachep);
2450         return err;
2451 }