netns: ipmr: declare mroute_do_assert and mroute_do_pim per-namespace
net/ipv4/ipmr.c
/*
 *	IP multicast routing support for mrouted 3.6/3.8
 *
 *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *	  Linux Consultancy and Custom Driver Development
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Fixes:
 *	Michael Chastain	:	Incorrect size of copying.
 *	Alan Cox		:	Added the cache manager code
 *	Alan Cox		:	Fixed the clone/copy bug and device race.
 *	Mike McLagan		:	Routing by source
 *	Malcolm Beattie		:	Buffer handling fixes.
 *	Alexey Kuznetsov	:	Double buffer free and other fixes.
 *	SVR Anand		:	Fixed several multicast bugs and problems.
 *	Alexey Kuznetsov	:	Status, optimisations and more.
 *	Brad Parker		:	Better behaviour on mrouted upcall
 *					overflow.
 *	Carlos Picoto		:	PIMv1 Support
 *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
 *					Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif
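
/* CONFIG_IP_PIMSM is not a Kconfig option of its own: it is derived here
 * so that code shared by the PIMv1 and PIMv2 paths can sit behind a
 * single #ifdef. */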

/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that changes are serialized via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *	Multicast router control variables
 */

#define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)

static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We revert to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and protected
   by the weak rwlock mrt_lock. The queue of unresolved entries is
   protected by the strong spinlock mfc_unres_lock.

   This way the data path is entirely free of exclusive locks.
 */

static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

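/*
 * Note on the two tunnel helpers below: they drive the IPIP tunnel
 * driver through its ndo_do_ioctl() handler, which expects a user-space
 * pointer, so a kernel buffer is passed under a temporary
 * set_fs(KERNEL_DS).  An ugly but long-standing kernel idiom.
 */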
static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
{
	dev_close(dev);

	dev = __dev_get_by_name(&init_net, "tunl0");
	if (dev) {
		const struct net_device_ops *ops = dev->netdev_ops;
		struct ifreq ifr;
		struct ip_tunnel_parm p;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
			set_fs(oldfs);
		}
	}
}

static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
	struct net_device  *dev;

	dev = __dev_get_by_name(&init_net, "tunl0");

	if (dev) {
		const struct net_device_ops *ops = dev->netdev_ops;
		int err;
		struct ifreq ifr;
		struct ip_tunnel_parm p;
		struct in_device  *in_dev;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
			set_fs(oldfs);
		} else
			err = -EOPNOTSUPP;

		dev = NULL;

		if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
			dev->flags |= IFF_MULTICAST;

			in_dev = __in_dev_get_rtnl(dev);
			if (in_dev == NULL)
				goto failure;

			ipv4_devconf_setall(in_dev);
			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

			if (dev_open(dev))
				goto failure;
			dev_hold(dev);
		}
	}
	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}

#ifdef CONFIG_IP_PIMSM

static int reg_vif_num = -1;

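/*
 * The "pimreg" device never transmits on a wire: anything sent through
 * it is bounced straight up to the user-space daemon as an
 * IGMPMSG_WHOLEPKT report and then freed.
 */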
static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
	read_lock(&mrt_lock);
	dev->stats.tx_bytes += skb->len;
	dev->stats.tx_packets++;
	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
	read_unlock(&mrt_lock);
	kfree_skb(skb);
	return 0;
}

static const struct net_device_ops reg_vif_netdev_ops = {
	.ndo_start_xmit	= reg_vif_xmit,
};

static void reg_vif_setup(struct net_device *dev)
{
	dev->type		= ARPHRD_PIMREG;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
	dev->flags		= IFF_NOARP;
	dev->netdev_ops		= &reg_vif_netdev_ops;
	dev->destructor		= free_netdev;
}

static struct net_device *ipmr_reg_vif(void)
{
	struct net_device *dev;
	struct in_device *in_dev;

	dev = alloc_netdev(0, "pimreg", reg_vif_setup);

	if (dev == NULL)
		return NULL;

	if (register_netdevice(dev)) {
		free_netdev(dev);
		return NULL;
	}
	dev->iflink = 0;

	rcu_read_lock();
	if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
		rcu_read_unlock();
		goto failure;
	}

	ipv4_devconf_setall(in_dev);
	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
	rcu_read_unlock();

	if (dev_open(dev))
		goto failure;

	dev_hold(dev);

	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}
#endif

/*
 *	Delete a VIF entry
 *	@notify: Set to 1 if the caller is a notifier_call
 */

static int vif_delete(int vifi, int notify)
{
	struct vif_device *v;
	struct net_device *dev;
	struct in_device *in_dev;

	if (vifi < 0 || vifi >= init_net.ipv4.maxvif)
		return -EADDRNOTAVAIL;

	v = &init_net.ipv4.vif_table[vifi];

	write_lock_bh(&mrt_lock);
	dev = v->dev;
	v->dev = NULL;

	if (!dev) {
		write_unlock_bh(&mrt_lock);
		return -EADDRNOTAVAIL;
	}

#ifdef CONFIG_IP_PIMSM
	if (vifi == reg_vif_num)
		reg_vif_num = -1;
#endif

	if (vifi+1 == init_net.ipv4.maxvif) {
		int tmp;
		for (tmp=vifi-1; tmp>=0; tmp--) {
			if (VIF_EXISTS(&init_net, tmp))
				break;
		}
		init_net.ipv4.maxvif = tmp+1;
	}

	write_unlock_bh(&mrt_lock);

	dev_set_allmulti(dev, -1);

	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
		ip_rt_multicast_event(in_dev);
	}

	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
		unregister_netdevice(dev);

	dev_put(dev);
	return 0;
}

static inline void ipmr_cache_free(struct mfc_cache *c)
{
	release_net(mfc_net(c));
	kmem_cache_free(mrt_cachep, c);
}
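
/* The release_net() above drops the namespace reference taken when the
 * entry was created via mfc_net_set() (see ipmr_cache_alloc() below). */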

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting an error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	atomic_dec(&init_net.ipv4.cache_resolve_queue_len);

	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
			nlh->nlmsg_type = NLMSG_ERROR;
			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
			skb_trim(skb, nlh->nlmsg_len);
			e = NLMSG_DATA(nlh);
			e->error = -ETIMEDOUT;
			memset(&e->msg, 0, sizeof(e->msg));

			rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
		} else
			kfree_skb(skb);
	}

	ipmr_cache_free(c);
}


/* Single timer process for all the unresolved queue. */
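/* Note that it only trylocks mfc_unres_lock; on contention it re-arms
 * itself HZ/10 later instead of spinning in timer context. */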

static void ipmr_expire_process(unsigned long dummy)
{
	unsigned long now;
	unsigned long expires;
	struct mfc_cache *c, **cp;

	if (!spin_trylock(&mfc_unres_lock)) {
		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
		return;
	}

	if (mfc_unres_queue == NULL)
		goto out;

	now = jiffies;
	expires = 10*HZ;
	cp = &mfc_unres_queue;

	while ((c=*cp) != NULL) {
		if (time_after(c->mfc_un.unres.expires, now)) {
			unsigned long interval = c->mfc_un.unres.expires - now;
			if (interval < expires)
				expires = interval;
			cp = &c->next;
			continue;
		}

		*cp = c->next;

		ipmr_destroy_unres(c);
	}

	if (mfc_unres_queue != NULL)
		mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
	spin_unlock(&mfc_unres_lock);
}

/* Fill the oifs list. It is called with mrt_lock write-locked. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
	int vifi;

	cache->mfc_un.res.minvif = MAXVIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

	for (vifi = 0; vifi < init_net.ipv4.maxvif; vifi++) {
		if (VIF_EXISTS(&init_net, vifi) &&
		    ttls[vifi] && ttls[vifi] < 255) {
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
}
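
/*
 * minvif/maxvif computed above merely bound the forwarding loop in
 * ip_mr_forward(); a ttl of 255 (the memset default) marks a vif that
 * is not part of this route.
 */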

static int vif_add(struct vifctl *vifc, int mrtsock)
{
	int vifi = vifc->vifc_vifi;
	struct vif_device *v = &init_net.ipv4.vif_table[vifi];
	struct net_device *dev;
	struct in_device *in_dev;
	int err;

	/* Is vif busy? */
	if (VIF_EXISTS(&init_net, vifi))
		return -EADDRINUSE;

	switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
	case VIFF_REGISTER:
		/*
		 * Special Purpose VIF in PIM
		 * All the packets will be sent to the daemon
		 */
		if (reg_vif_num >= 0)
			return -EADDRINUSE;
		dev = ipmr_reg_vif();
		if (!dev)
			return -ENOBUFS;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			unregister_netdevice(dev);
			dev_put(dev);
			return err;
		}
		break;
#endif
	case VIFF_TUNNEL:
		dev = ipmr_new_tunnel(vifc);
		if (!dev)
			return -ENOBUFS;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			ipmr_del_tunnel(dev, vifc);
			dev_put(dev);
			return err;
		}
		break;
	case 0:
		dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
		if (!dev)
			return -EADDRNOTAVAIL;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			dev_put(dev);
			return err;
		}
		break;
	default:
		return -EINVAL;
	}

	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
		return -EADDRNOTAVAIL;
	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
	ip_rt_multicast_event(in_dev);

	/*
	 *	Fill in the VIF structures
	 */
	v->rate_limit = vifc->vifc_rate_limit;
	v->local = vifc->vifc_lcl_addr.s_addr;
	v->remote = vifc->vifc_rmt_addr.s_addr;
	v->flags = vifc->vifc_flags;
	if (!mrtsock)
		v->flags |= VIFF_STATIC;
	v->threshold = vifc->vifc_threshold;
	v->bytes_in = 0;
	v->bytes_out = 0;
	v->pkt_in = 0;
	v->pkt_out = 0;
	v->link = dev->ifindex;
	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
		v->link = dev->iflink;

	/* And finish update writing critical data */
	write_lock_bh(&mrt_lock);
	v->dev = dev;
#ifdef CONFIG_IP_PIMSM
	if (v->flags&VIFF_REGISTER)
		reg_vif_num = vifi;
#endif
	if (vifi+1 > init_net.ipv4.maxvif)
		init_net.ipv4.maxvif = vifi+1;
	write_unlock_bh(&mrt_lock);
	return 0;
}
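
/*
 * Ordering above matters: v->dev is written last, under mrt_lock, so
 * the data path (which tests VIF_EXISTS(), i.e. v->dev != NULL) never
 * sees a half-initialised vif.
 */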

static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
{
	int line = MFC_HASH(mcastgrp, origin);
	struct mfc_cache *c;

	for (c = init_net.ipv4.mfc_cache_array[line]; c; c = c->next) {
		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
			break;
	}
	return c;
}
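
/* Callers must hold mrt_lock (at least for reading) across the lookup
 * and any use of the returned entry. */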

/*
 *	Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(struct net *net)
{
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
	if (c == NULL)
		return NULL;
	c->mfc_un.res.minvif = MAXVIFS;
	mfc_net_set(c, net);
	return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
{
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
	if (c == NULL)
		return NULL;
	skb_queue_head_init(&c->mfc_un.unres.unresolved);
	c->mfc_un.unres.expires = jiffies + 10*HZ;
	mfc_net_set(c, net);
	return c;
}

/*
 *	A cache entry has gone into a resolved state from queued
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	/*
	 *	Play the pending entries through our router
	 */

	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
				nlh->nlmsg_len = (skb_tail_pointer(skb) -
						  (u8 *)nlh);
			} else {
				nlh->nlmsg_type = NLMSG_ERROR;
				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
				skb_trim(skb, nlh->nlmsg_len);
				e = NLMSG_DATA(nlh);
				e->error = -EMSGSIZE;
				memset(&e->msg, 0, sizeof(e->msg));
			}

			rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
		} else
			ip_mr_forward(skb, c, 0);
	}
}
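
/*
 * The iph->version == 0 test above distinguishes synthetic skbs queued
 * by ipmr_get_route() (which zeroes the version field as a marker) from
 * real packets that were waiting for this route.
 */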

/*
 *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *	expects the following bizarre scheme.
 *
 *	Called under mrt_lock.
 */

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
	struct sk_buff *skb;
	const int ihl = ip_hdrlen(pkt);
	struct igmphdr *igmp;
	struct igmpmsg *msg;
	int ret;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT)
		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
	else
#endif
		skb = alloc_skb(128, GFP_ATOMIC);

	if (!skb)
		return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT) {
		/* Ugly, but we have no choice with this interface.
		   Duplicate old header, fix ihl, length etc.
		   And all this only to mangle msg->im_msgtype and
		   to set msg->im_mbz to "mbz" :-)
		 */
		skb_push(skb, sizeof(struct iphdr));
		skb_reset_network_header(skb);
		skb_reset_transport_header(skb);
		msg = (struct igmpmsg *)skb_network_header(skb);
		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
		msg->im_msgtype = IGMPMSG_WHOLEPKT;
		msg->im_mbz = 0;
		msg->im_vif = reg_vif_num;
		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
					     sizeof(struct iphdr));
	} else
#endif
	{

	/*
	 *	Copy the IP header
	 */

	skb->network_header = skb->tail;
	skb_put(skb, ihl);
	skb_copy_to_linear_data(skb, pkt->data, ihl);
	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
	msg = (struct igmpmsg *)skb_network_header(skb);
	msg->im_vif = vifi;
	skb->dst = dst_clone(pkt->dst);

	/*
	 *	Add our header
	 */

	igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
	igmp->type	=
	msg->im_msgtype = assert;
	igmp->code	=	0;
	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
	skb->transport_header = skb->network_header;
	}

	if (init_net.ipv4.mroute_sk == NULL) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/*
	 *	Deliver to mrouted
	 */
	ret = sock_queue_rcv_skb(init_net.ipv4.mroute_sk, skb);
	if (ret < 0) {
		if (net_ratelimit())
			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
		kfree_skb(skb);
	}

	return ret;
}
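
/*
 * Rough userspace-side sketch (not from this file; handle_upcall and
 * mroute_fd are purely illustrative names): the daemon distinguishes
 * these reports from genuine IGMP by the zeroed protocol field set
 * above, e.g.
 *
 *	char buf[2048];
 *	ssize_t n = read(mroute_fd, buf, sizeof(buf));
 *	struct iphdr *iph = (struct iphdr *)buf;
 *	if (n > 0 && iph->protocol == 0)
 *		handle_upcall((struct igmpmsg *)buf);
 *
 * Note that struct igmpmsg overlays the IP header, so the protocol byte
 * is exactly the im_mbz ("must be zero") field.
 */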

/*
 *	Queue a packet for resolution; the cache entry is looked up
 *	(or created) with mfc_unres_lock held.
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
	int err;
	struct mfc_cache *c;
	const struct iphdr *iph = ip_hdr(skb);

	spin_lock_bh(&mfc_unres_lock);
	for (c=mfc_unres_queue; c; c=c->next) {
		if (net_eq(mfc_net(c), &init_net) &&
		    c->mfc_mcastgrp == iph->daddr &&
		    c->mfc_origin == iph->saddr)
			break;
	}

	if (c == NULL) {
		/*
		 *	Create a new entry if allowable
		 */

		if (atomic_read(&init_net.ipv4.cache_resolve_queue_len) >= 10 ||
		    (c = ipmr_cache_alloc_unres(&init_net)) == NULL) {
			spin_unlock_bh(&mfc_unres_lock);

			kfree_skb(skb);
			return -ENOBUFS;
		}

		/*
		 *	Fill in the new cache entry
		 */
		c->mfc_parent	= -1;
		c->mfc_origin	= iph->saddr;
		c->mfc_mcastgrp	= iph->daddr;

		/*
		 *	Reflect first query at mrouted.
		 */
		if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
			/* If the report failed throw the cache entry
			   out - Brad Parker
			 */
			spin_unlock_bh(&mfc_unres_lock);

			ipmr_cache_free(c);
			kfree_skb(skb);
			return err;
		}

		atomic_inc(&init_net.ipv4.cache_resolve_queue_len);
		c->next = mfc_unres_queue;
		mfc_unres_queue = c;

		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
	}

	/*
	 *	See if we can append the packet
	 */
	if (c->mfc_un.unres.unresolved.qlen>3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
		err = 0;
	}

	spin_unlock_bh(&mfc_unres_lock);
	return err;
}
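
/*
 * Both bounds above are deliberately tiny: at most ten unresolved
 * entries at a time, and only a handful of packets parked on each, so
 * an unanswered upcall cannot pin much kernel memory.
 */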

/*
 *	MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mfcctl *mfc)
{
	int line;
	struct mfc_cache *c, **cp;

	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp = &init_net.ipv4.mfc_cache_array[line];
	     (c = *cp) != NULL; cp = &c->next) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			ipmr_cache_free(c);
			return 0;
		}
	}
	return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
	int line;
	struct mfc_cache *uc, *c, **cp;

	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp = &init_net.ipv4.mfc_cache_array[line];
	     (c = *cp) != NULL; cp = &c->next) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
			break;
	}

	if (c != NULL) {
		write_lock_bh(&mrt_lock);
		c->mfc_parent = mfc->mfcc_parent;
		ipmr_update_thresholds(c, mfc->mfcc_ttls);
		if (!mrtsock)
			c->mfc_flags |= MFC_STATIC;
		write_unlock_bh(&mrt_lock);
		return 0;
	}

	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
		return -EINVAL;

	c = ipmr_cache_alloc(&init_net);
	if (c == NULL)
		return -ENOMEM;

	c->mfc_origin = mfc->mfcc_origin.s_addr;
	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
	c->mfc_parent = mfc->mfcc_parent;
	ipmr_update_thresholds(c, mfc->mfcc_ttls);
	if (!mrtsock)
		c->mfc_flags |= MFC_STATIC;

	write_lock_bh(&mrt_lock);
	c->next = init_net.ipv4.mfc_cache_array[line];
	init_net.ipv4.mfc_cache_array[line] = c;
	write_unlock_bh(&mrt_lock);

	/*
	 *	Check to see if we resolved a queued list. If so we
	 *	need to send on the frames and tidy up.
	 */
	spin_lock_bh(&mfc_unres_lock);
	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
	     cp = &uc->next) {
		if (net_eq(mfc_net(uc), &init_net) &&
		    uc->mfc_origin == c->mfc_origin &&
		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
			*cp = uc->next;
			atomic_dec(&init_net.ipv4.cache_resolve_queue_len);
			break;
		}
	}
	if (mfc_unres_queue == NULL)
		del_timer(&ipmr_expire_timer);
	spin_unlock_bh(&mfc_unres_lock);

	if (uc) {
		ipmr_cache_resolve(uc, c);
		ipmr_cache_free(uc);
	}
	return 0;
}

/*
 *	Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
	int i;

	/*
	 *	Shut down all active vif entries
	 */
	for (i = 0; i < init_net.ipv4.maxvif; i++) {
		if (!(init_net.ipv4.vif_table[i].flags&VIFF_STATIC))
			vif_delete(i, 0);
	}

	/*
	 *	Wipe the cache
	 */
	for (i=0; i<MFC_LINES; i++) {
		struct mfc_cache *c, **cp;

		cp = &init_net.ipv4.mfc_cache_array[i];
		while ((c = *cp) != NULL) {
			if (c->mfc_flags&MFC_STATIC) {
				cp = &c->next;
				continue;
			}
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			ipmr_cache_free(c);
		}
	}

	if (atomic_read(&init_net.ipv4.cache_resolve_queue_len) != 0) {
		struct mfc_cache *c, **cp;

		spin_lock_bh(&mfc_unres_lock);
		cp = &mfc_unres_queue;
		while ((c = *cp) != NULL) {
			if (!net_eq(mfc_net(c), &init_net)) {
				cp = &c->next;
				continue;
			}
			*cp = c->next;

			ipmr_destroy_unres(c);
		}
		spin_unlock_bh(&mfc_unres_lock);
	}
}

static void mrtsock_destruct(struct sock *sk)
{
	rtnl_lock();
	if (sk == init_net.ipv4.mroute_sk) {
		IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;

		write_lock_bh(&mrt_lock);
		init_net.ipv4.mroute_sk = NULL;
		write_unlock_bh(&mrt_lock);

		mroute_clean_tables(sk);
	}
	rtnl_unlock();
}

/*
 *	Socket options and virtual interface manipulation. The whole
 *	virtual interface system is a complete heap, but unfortunately
 *	that's how BSD mrouted happens to think. Maybe one day with a proper
 *	MOSPF/PIM router set up we can clean this up.
 */

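/*
 * Hedged userspace sketch (not part of this file; addresses and vif
 * numbers are purely illustrative) of how a daemon drives this
 * interface:
 *
 *	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	int one = 1;
 *	setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 *	struct vifctl vc = { .vifc_vifi = 0, .vifc_threshold = 1 };
 *	vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
 *	setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *
 *	struct mfcctl mc = { .mfcc_parent = 0 };
 *	mc.mfcc_origin.s_addr   = inet_addr("198.51.100.2");
 *	mc.mfcc_mcastgrp.s_addr = inet_addr("233.252.0.1");
 *	mc.mfcc_ttls[1] = 1;
 *	setsockopt(fd, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 *
 * MRT_INIT is only accepted on a raw IGMP socket, and only one mroute
 * socket may exist at a time (see the -EADDRINUSE check below).
 */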
int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
{
	int ret;
	struct vifctl vif;
	struct mfcctl mfc;

	if (optname != MRT_INIT) {
		if (sk != init_net.ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
			return -EACCES;
	}

	switch (optname) {
	case MRT_INIT:
		if (sk->sk_type != SOCK_RAW ||
		    inet_sk(sk)->num != IPPROTO_IGMP)
			return -EOPNOTSUPP;
		if (optlen != sizeof(int))
			return -ENOPROTOOPT;

		rtnl_lock();
		if (init_net.ipv4.mroute_sk) {
			rtnl_unlock();
			return -EADDRINUSE;
		}

		ret = ip_ra_control(sk, 1, mrtsock_destruct);
		if (ret == 0) {
			write_lock_bh(&mrt_lock);
			init_net.ipv4.mroute_sk = sk;
			write_unlock_bh(&mrt_lock);

			IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
		}
		rtnl_unlock();
		return ret;
	case MRT_DONE:
		if (sk != init_net.ipv4.mroute_sk)
			return -EACCES;
		return ip_ra_control(sk, 0, NULL);
	case MRT_ADD_VIF:
	case MRT_DEL_VIF:
		if (optlen != sizeof(vif))
			return -EINVAL;
		if (copy_from_user(&vif, optval, sizeof(vif)))
			return -EFAULT;
		if (vif.vifc_vifi >= MAXVIFS)
			return -ENFILE;
		rtnl_lock();
		if (optname == MRT_ADD_VIF) {
			ret = vif_add(&vif, sk == init_net.ipv4.mroute_sk);
		} else {
			ret = vif_delete(vif.vifc_vifi, 0);
		}
		rtnl_unlock();
		return ret;

		/*
		 *	Manipulate the forwarding caches. These live
		 *	in a sort of kernel/user symbiosis.
		 */
	case MRT_ADD_MFC:
	case MRT_DEL_MFC:
		if (optlen != sizeof(mfc))
			return -EINVAL;
		if (copy_from_user(&mfc, optval, sizeof(mfc)))
			return -EFAULT;
		rtnl_lock();
		if (optname == MRT_DEL_MFC)
			ret = ipmr_mfc_delete(&mfc);
		else
			ret = ipmr_mfc_add(&mfc, sk == init_net.ipv4.mroute_sk);
		rtnl_unlock();
		return ret;
		/*
		 *	Control PIM assert.
		 */
	case MRT_ASSERT:
	{
		int v;
		if (get_user(v, (int __user *)optval))
			return -EFAULT;
		init_net.ipv4.mroute_do_assert = (v) ? 1 : 0;
		return 0;
	}
#ifdef CONFIG_IP_PIMSM
	case MRT_PIM:
	{
		int v;

		if (get_user(v, (int __user *)optval))
			return -EFAULT;
		v = (v) ? 1 : 0;

		rtnl_lock();
		ret = 0;
		if (v != init_net.ipv4.mroute_do_pim) {
			init_net.ipv4.mroute_do_pim = v;
			init_net.ipv4.mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
			if (init_net.ipv4.mroute_do_pim)
				ret = inet_add_protocol(&pim_protocol,
							IPPROTO_PIM);
			else
				ret = inet_del_protocol(&pim_protocol,
							IPPROTO_PIM);
			if (ret < 0)
				ret = -EAGAIN;
#endif
		}
		rtnl_unlock();
		return ret;
	}
#endif
	/*
	 *	Spurious command, or MRT_VERSION which you cannot
	 *	set.
	 */
	default:
		return -ENOPROTOOPT;
	}
}
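
/*
 * Note how MRT_PIM above keeps mroute_do_pim and mroute_do_assert in
 * lock step and (un)registers the PIMv2 protocol handler as a side
 * effect; both flags are the per-namespace fields this patch moves
 * into struct net.
 */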

/*
 *	Getsockopt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
{
	int olr;
	int val;

	if (optname != MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
	   optname != MRT_PIM &&
#endif
	   optname != MRT_ASSERT)
		return -ENOPROTOOPT;

	if (get_user(olr, optlen))
		return -EFAULT;

	olr = min_t(unsigned int, olr, sizeof(int));
	if (olr < 0)
		return -EINVAL;

	if (put_user(olr, optlen))
		return -EFAULT;
	if (optname == MRT_VERSION)
		val = 0x0305;
#ifdef CONFIG_IP_PIMSM
	else if (optname == MRT_PIM)
		val = init_net.ipv4.mroute_do_pim;
#endif
	else
		val = init_net.ipv4.mroute_do_assert;
	if (copy_to_user(optval, &val, olr))
		return -EFAULT;
	return 0;
}

/*
 *	The IP multicast ioctl support routines.
 */

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
	struct sioc_sg_req sr;
	struct sioc_vif_req vr;
	struct vif_device *vif;
	struct mfc_cache *c;

	switch (cmd) {
	case SIOCGETVIFCNT:
		if (copy_from_user(&vr, arg, sizeof(vr)))
			return -EFAULT;
		if (vr.vifi >= init_net.ipv4.maxvif)
			return -EINVAL;
		read_lock(&mrt_lock);
		vif = &init_net.ipv4.vif_table[vr.vifi];
		if (VIF_EXISTS(&init_net, vr.vifi)) {
			vr.icount = vif->pkt_in;
			vr.ocount = vif->pkt_out;
			vr.ibytes = vif->bytes_in;
			vr.obytes = vif->bytes_out;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg, &vr, sizeof(vr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	case SIOCGETSGCNT:
		if (copy_from_user(&sr, arg, sizeof(sr)))
			return -EFAULT;

		read_lock(&mrt_lock);
		c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
		if (c) {
			sr.pktcnt = c->mfc_un.res.pkt;
			sr.bytecnt = c->mfc_un.res.bytes;
			sr.wrong_if = c->mfc_un.res.wrong_if;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg, &sr, sizeof(sr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	default:
		return -ENOIOCTLCMD;
	}
}


static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
	struct vif_device *v;
	int ct;

	if (!net_eq(dev_net(dev), &init_net))
		return NOTIFY_DONE;

	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;
	v = &init_net.ipv4.vif_table[0];
	for (ct = 0; ct < init_net.ipv4.maxvif; ct++, v++) {
		if (v->dev == dev)
			vif_delete(ct, 1);
	}
	return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier = {
	.notifier_call = ipmr_device_event,
};

/*
 *	Encapsulate a packet by attaching a valid IPIP header to it.
 *	This avoids tunnel drivers and other mess and gives us the speed so
 *	important for multicast video.
 */

static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
	struct iphdr *iph;
	struct iphdr *old_iph = ip_hdr(skb);

	skb_push(skb, sizeof(struct iphdr));
	skb->transport_header = skb->network_header;
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);

	iph->version	=	4;
	iph->tos	=	old_iph->tos;
	iph->ttl	=	old_iph->ttl;
	iph->frag_off	=	0;
	iph->daddr	=	daddr;
	iph->saddr	=	saddr;
	iph->protocol	=	IPPROTO_IPIP;
	iph->ihl	=	5;
	iph->tot_len	=	htons(skb->len);
	ip_select_ident(iph, skb->dst, NULL);
	ip_send_check(iph);

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	nf_reset(skb);
}
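
/*
 * The outer header deliberately copies the inner TOS and TTL, so the
 * encapsulated packet keeps the sender's scoping; on the wire it is
 * indistinguishable from ordinary IPIP.
 */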

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
	struct ip_options *opt = &(IPCB(skb)->opt);

	IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);

	if (unlikely(opt->optlen))
		ip_forward_options(skb);

	return dst_output(skb);
}

/*
 *	Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
	const struct iphdr *iph = ip_hdr(skb);
	struct vif_device *vif = &init_net.ipv4.vif_table[vifi];
	struct net_device *dev;
	struct rtable *rt;
	int    encap = 0;

	if (vif->dev == NULL)
		goto out_free;

#ifdef CONFIG_IP_PIMSM
	if (vif->flags & VIFF_REGISTER) {
		vif->pkt_out++;
		vif->bytes_out += skb->len;
		vif->dev->stats.tx_bytes += skb->len;
		vif->dev->stats.tx_packets++;
		ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
		kfree_skb(skb);
		return;
	}
#endif

	if (vif->flags&VIFF_TUNNEL) {
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = vif->remote,
						.saddr = vif->local,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(&init_net, &rt, &fl))
			goto out_free;
		encap = sizeof(struct iphdr);
	} else {
		struct flowi fl = { .oif = vif->link,
				    .nl_u = { .ip4_u =
					      { .daddr = iph->daddr,
						.tos = RT_TOS(iph->tos) } },
				    .proto = IPPROTO_IPIP };
		if (ip_route_output_key(&init_net, &rt, &fl))
			goto out_free;
	}

	dev = rt->u.dst.dev;

	if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
		/* Do not fragment multicasts. Alas, IPv4 does not
		   allow us to send ICMP here, so oversized packets
		   silently disappear into a black hole.
		 */

		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
		ip_rt_put(rt);
		goto out_free;
	}

	encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

	if (skb_cow(skb, encap)) {
		ip_rt_put(rt);
		goto out_free;
	}

	vif->pkt_out++;
	vif->bytes_out += skb->len;

	dst_release(skb->dst);
	skb->dst = &rt->u.dst;
	ip_decrease_ttl(ip_hdr(skb));

	/* FIXME: forward and output firewalls used to be called here.
	 * What do we do with netfilter? -- RR */
	if (vif->flags & VIFF_TUNNEL) {
		ip_encap(skb, vif->local, vif->remote);
		/* FIXME: extra output firewall step used to be here. --RR */
		vif->dev->stats.tx_packets++;
		vif->dev->stats.tx_bytes += skb->len;
	}

	IPCB(skb)->flags |= IPSKB_FORWARDED;

	/*
	 * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
	 * locally not only before forwarding, but also after forwarding on
	 * all output interfaces. Clearly, if the mrouter runs a multicast
	 * program, that program should receive packets regardless of which
	 * interface it joined on. If we did not do this, the program would
	 * have to join on all interfaces. On the other hand, a multihomed
	 * host (or a router, but not an mrouter) cannot join on more than
	 * one interface - that would result in receiving multiple copies
	 * of each packet.
	 */
	NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
		ipmr_forward_finish);
	return;

out_free:
	kfree_skb(skb);
	return;
}

static int ipmr_find_vif(struct net_device *dev)
{
	int ct;
	for (ct = init_net.ipv4.maxvif-1; ct >= 0; ct--) {
		if (init_net.ipv4.vif_table[ct].dev == dev)
			break;
	}
	return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
	int psend = -1;
	int vif, ct;

	vif = cache->mfc_parent;
	cache->mfc_un.res.pkt++;
	cache->mfc_un.res.bytes += skb->len;

	/*
	 * Wrong interface: drop packet and (maybe) send PIM assert.
	 */
	if (init_net.ipv4.vif_table[vif].dev != skb->dev) {
		int true_vifi;

		if (skb->rtable->fl.iif == 0) {
			/* It is our own packet, looped back.
			   Very complicated situation...

			   The best workaround until routing daemons are
			   fixed is not to redistribute a packet if it was
			   sent through the wrong interface. It means that
			   multicast applications WILL NOT work for
			   (S,G) entries whose default multicast route points
			   to the wrong oif. In any case, it is not a good
			   idea to run multicast applications on a router.
			 */
			goto dont_forward;
		}

		cache->mfc_un.res.wrong_if++;
		true_vifi = ipmr_find_vif(skb->dev);

		if (true_vifi >= 0 && init_net.ipv4.mroute_do_assert &&
		    /* PIM-SM uses asserts when switching from the RPT to
		       the SPT, so we cannot check that the packet arrived
		       on an oif. It is bad, but otherwise we would need to
		       move a pretty large chunk of pimd into the kernel.
		       Ough... --ANK
		     */
		    (init_net.ipv4.mroute_do_pim ||
		     cache->mfc_un.res.ttls[true_vifi] < 255) &&
		    time_after(jiffies,
			       cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
			cache->mfc_un.res.last_assert = jiffies;
			ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
		}
		goto dont_forward;
	}

	init_net.ipv4.vif_table[vif].pkt_in++;
	init_net.ipv4.vif_table[vif].bytes_in += skb->len;

	/*
	 *	Forward the frame
	 */
	for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
		if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
			if (psend != -1) {
				struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
				if (skb2)
					ipmr_queue_xmit(skb2, cache, psend);
			}
			psend = ct;
		}
	}
	if (psend != -1) {
		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			if (skb2)
				ipmr_queue_xmit(skb2, cache, psend);
		} else {
			ipmr_queue_xmit(skb, cache, psend);
			return 0;
		}
	}

dont_forward:
	if (!local)
		kfree_skb(skb);
	return 0;
}
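
/*
 * The forwarding loop above is frugal with copies: each newly found
 * output vif transmits a clone of the *previous* candidate, and the
 * final vif gets the original skb, unless one copy must survive for
 * local delivery.
 */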


/*
 *	Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
	struct mfc_cache *cache;
	int local = skb->rtable->rt_flags&RTCF_LOCAL;

	/* A packet looped back after forwarding should not be
	   forwarded a second time, but it can still be delivered
	   locally.
	 */
	if (IPCB(skb)->flags&IPSKB_FORWARDED)
		goto dont_forward;

	if (!local) {
		if (IPCB(skb)->opt.router_alert) {
			if (ip_call_ra_chain(skb))
				return 0;
		} else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
			/* IGMPv1 (and broken IGMPv2 implementations, such
			   as Cisco IOS <= 11.2(8)) do not put the router
			   alert option into IGMP packets destined to
			   routable groups. This is very bad, because it
			   means that we can forward NO IGMP messages.
			 */
			read_lock(&mrt_lock);
			if (init_net.ipv4.mroute_sk) {
				nf_reset(skb);
				raw_rcv(init_net.ipv4.mroute_sk, skb);
				read_unlock(&mrt_lock);
				return 0;
			}
			read_unlock(&mrt_lock);
		}
	}

	read_lock(&mrt_lock);
	cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);

	/*
	 *	No usable cache entry
	 */
	if (cache == NULL) {
		int vif;

		if (local) {
			struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
			ip_local_deliver(skb);
			if (skb2 == NULL) {
				read_unlock(&mrt_lock);
				return -ENOBUFS;
			}
			skb = skb2;
		}

		vif = ipmr_find_vif(skb->dev);
		if (vif >= 0) {
			int err = ipmr_cache_unresolved(vif, skb);
			read_unlock(&mrt_lock);

			return err;
		}
		read_unlock(&mrt_lock);
		kfree_skb(skb);
		return -ENODEV;
	}

	ip_mr_forward(skb, cache, local);

	read_unlock(&mrt_lock);

	if (local)
		return ip_local_deliver(skb);

	return 0;

dont_forward:
	if (local)
		return ip_local_deliver(skb);
	kfree_skb(skb);
	return 0;
}

#ifdef CONFIG_IP_PIMSM
static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
{
	struct net_device *reg_dev = NULL;
	struct iphdr *encap;

	encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
	/*
	   Check that:
	   a. packet is really destined to a multicast group
	   b. packet is not a NULL-REGISTER
	   c. packet is not truncated
	 */
	if (!ipv4_is_multicast(encap->daddr) ||
	    encap->tot_len == 0 ||
	    ntohs(encap->tot_len) + pimlen > skb->len)
		return 1;

	read_lock(&mrt_lock);
	if (reg_vif_num >= 0)
		reg_dev = init_net.ipv4.vif_table[reg_vif_num].dev;
	if (reg_dev)
		dev_hold(reg_dev);
	read_unlock(&mrt_lock);

	if (reg_dev == NULL)
		return 1;

	skb->mac_header = skb->network_header;
	skb_pull(skb, (u8 *)encap - skb->data);
	skb_reset_network_header(skb);
	skb->dev = reg_dev;
	skb->protocol = htons(ETH_P_IP);
	skb->ip_summed = 0;
	skb->pkt_type = PACKET_HOST;
	dst_release(skb->dst);
	skb->dst = NULL;
	reg_dev->stats.rx_bytes += skb->len;
	reg_dev->stats.rx_packets++;
	nf_reset(skb);
	netif_rx(skb);
	dev_put(reg_dev);

	return 0;
}
#endif
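
/*
 * After decapsulation, __pim_rcv() re-injects the inner packet with
 * netif_rx() so it re-enters the IP stack as traffic received on the
 * pimreg device itself.
 */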
1536
1537 #ifdef CONFIG_IP_PIMSM_V1
1538 /*
1539  * Handle IGMP messages of PIMv1
1540  */
1541
1542 int pim_rcv_v1(struct sk_buff * skb)
1543 {
1544         struct igmphdr *pim;
1545
1546         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1547                 goto drop;
1548
1549         pim = igmp_hdr(skb);
1550
1551         if (!init_net.ipv4.mroute_do_pim ||
1552             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1553                 goto drop;
1554
1555         if (__pim_rcv(skb, sizeof(*pim))) {
1556 drop:
1557                 kfree_skb(skb);
1558         }
1559         return 0;
1560 }
1561 #endif

#ifdef CONFIG_IP_PIMSM_V2
static int pim_rcv(struct sk_buff *skb)
{
	struct pimreghdr *pim;

	if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
		goto drop;

	pim = (struct pimreghdr *)skb_transport_header(skb);
	/* Accept a checksum computed over either the PIM header alone
	 * or the whole packet, to interoperate with older peers. */
	if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) ||
	    (pim->flags & PIM_NULL_REGISTER) ||
	    (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
	     csum_fold(skb_checksum(skb, 0, skb->len, 0))))
		goto drop;

	if (__pim_rcv(skb, sizeof(*pim))) {
drop:
		kfree_skb(skb);
	}
	return 0;
}
#endif
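
/*
 * A sketch of the PIMv2 register header validated above, assuming the
 * struct pimreghdr layout in <linux/pim.h> (type byte carrying version
 * and message-type nibbles, flags word carrying the register bits):
 *
 *	 0       7 8      15 16                           31
 *	+---------+---------+------------------------------+
 *	|ver=2|t=1| reserved|           checksum           |
 *	+---------+---------+------------------------------+
 *	|B|N|              reserved                        |
 *	+------------------------------------------------- +
 *
 * so the type check matches (PIM_VERSION << 4) | PIM_REGISTER == 0x21,
 * and PIM_NULL_REGISTER tests the N (null-register) bit.
 */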

static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
	int ct;
	struct rtnexthop *nhp;
	struct net_device *dev = init_net.ipv4.vif_table[c->mfc_parent].dev;
	u8 *b = skb_tail_pointer(skb);
	struct rtattr *mp_head;

	if (dev)
		RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

	mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));

	for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
		if (c->mfc_un.res.ttls[ct] < 255) {
			if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
				goto rtattr_failure;
			nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
			nhp->rtnh_flags = 0;
			nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
			nhp->rtnh_ifindex = init_net.ipv4.vif_table[ct].dev->ifindex;
			nhp->rtnh_len = sizeof(*nhp);
		}
	}
	mp_head->rta_type = RTA_MULTIPATH;
	mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
	rtm->rtm_type = RTN_MULTICAST;
	return 1;

rtattr_failure:
	nlmsg_trim(skb, b);
	return -EMSGSIZE;
}
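
/*
 * The attributes emitted above give userspace a view like this (a
 * sketch of the rtnetlink encoding, not an additional API): one
 * RTA_IIF attribute naming the parent interface, then one
 * RTA_MULTIPATH attribute whose packed rtnexthop entries describe each
 * output vif, with the vif's TTL threshold carried in rtnh_hops:
 *
 *	RTA_IIF       -> parent vif's ifindex
 *	RTA_MULTIPATH -> { rtnh_ifindex = oif, rtnh_hops = ttl }, ...
 */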

int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
	int err;
	struct mfc_cache *cache;
	struct rtable *rt = skb->rtable;

	read_lock(&mrt_lock);
	cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);

	if (cache == NULL) {
		struct sk_buff *skb2;
		struct iphdr *iph;
		struct net_device *dev;
		int vif;

		if (nowait) {
			read_unlock(&mrt_lock);
			return -EAGAIN;
		}

		dev = skb->dev;
		if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
			read_unlock(&mrt_lock);
			return -ENODEV;
		}
		skb2 = skb_clone(skb, GFP_ATOMIC);
		if (!skb2) {
			read_unlock(&mrt_lock);
			return -ENOMEM;
		}

		/* Queue a clone with a minimal synthetic IP header; the
		 * impossible version of 0 marks it as a resolution
		 * request rather than a real queued datagram. */
		skb_push(skb2, sizeof(struct iphdr));
		skb_reset_network_header(skb2);
		iph = ip_hdr(skb2);
		iph->ihl = sizeof(struct iphdr) >> 2;
		iph->saddr = rt->rt_src;
		iph->daddr = rt->rt_dst;
		iph->version = 0;
		err = ipmr_cache_unresolved(vif, skb2);
		read_unlock(&mrt_lock);
		return err;
	}

	if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
		cache->mfc_flags |= MFC_NOTIFY;
	err = ipmr_fill_mroute(skb, cache, rtm);
	read_unlock(&mrt_lock);
	return err;
}
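
/*
 * Illustrative trigger (hypothetical addresses): a userspace query such
 * as
 *
 *	ip route get 224.1.2.3 from 10.0.0.1 iif eth0
 *
 * reaches this function via rtnetlink. With no cache entry yet and
 * nowait == 0 the request is queued for resolution by the routing
 * daemon; as the nowait handling above shows, callers that must not
 * block (e.g. route dumps) get -EAGAIN instead.
 */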

#ifdef CONFIG_PROC_FS
/*
 *	The /proc interfaces to multicast routing:
 *	/proc/net/ip_mr_cache and /proc/net/ip_mr_vif
 */
struct ipmr_vif_iter {
	int ct;
};

static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
					   loff_t pos)
{
	for (iter->ct = 0; iter->ct < init_net.ipv4.maxvif; ++iter->ct) {
		if (!VIF_EXISTS(&init_net, iter->ct))
			continue;
		if (pos-- == 0)
			return &init_net.ipv4.vif_table[iter->ct];
	}
	return NULL;
}

static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(mrt_lock)
{
	read_lock(&mrt_lock);
	return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct ipmr_vif_iter *iter = seq->private;

	++*pos;
	if (v == SEQ_START_TOKEN)
		return ipmr_vif_seq_idx(iter, 0);

	while (++iter->ct < init_net.ipv4.maxvif) {
		if (!VIF_EXISTS(&init_net, iter->ct))
			continue;
		return &init_net.ipv4.vif_table[iter->ct];
	}
	return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
	__releases(mrt_lock)
{
	read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
			 "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
	} else {
		const struct vif_device *vif = v;
		const char *name = vif->dev ? vif->dev->name : "none";

		seq_printf(seq,
			   "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
			   vif - init_net.ipv4.vif_table,
			   name, vif->bytes_in, vif->pkt_in,
			   vif->bytes_out, vif->pkt_out,
			   vif->flags, vif->local, vif->remote);
	}
	return 0;
}

static const struct seq_operations ipmr_vif_seq_ops = {
	.start = ipmr_vif_seq_start,
	.next  = ipmr_vif_seq_next,
	.stop  = ipmr_vif_seq_stop,
	.show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &ipmr_vif_seq_ops,
			sizeof(struct ipmr_vif_iter));
}

static const struct file_operations ipmr_vif_fops = {
	.owner   = THIS_MODULE,
	.open    = ipmr_vif_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};
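
/*
 * For reference, given the format strings above, reading the vif table
 * produces output shaped like this (values below are made up):
 *
 *	Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *	 0 eth0           1500      10      3000      20 00000 0A000001 00000000
 *	 1 pimreg            0       0         0       0 00004 00000000 00000000
 *
 * Local and Remote print as raw hex words, and Flags is the vif's
 * VIFF_* bit mask.
 */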

struct ipmr_mfc_iter {
	struct mfc_cache **cache;
	int ct;
};

static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
{
	struct mfc_cache *mfc;

	it->cache = init_net.ipv4.mfc_cache_array;
	read_lock(&mrt_lock);
	for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
		for (mfc = init_net.ipv4.mfc_cache_array[it->ct];
		     mfc; mfc = mfc->next)
			if (pos-- == 0)
				return mfc;
	read_unlock(&mrt_lock);

	it->cache = &mfc_unres_queue;
	spin_lock_bh(&mfc_unres_lock);
	for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
		if (pos-- == 0)
			return mfc;
	spin_unlock_bh(&mfc_unres_lock);

	it->cache = NULL;
	return NULL;
}

static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
	struct ipmr_mfc_iter *it = seq->private;
	it->cache = NULL;
	it->ct = 0;
	return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
		: SEQ_START_TOKEN;
}

static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct mfc_cache *mfc = v;
	struct ipmr_mfc_iter *it = seq->private;

	++*pos;

	if (v == SEQ_START_TOKEN)
		return ipmr_mfc_seq_idx(seq->private, 0);

	if (mfc->next)
		return mfc->next;

	if (it->cache == &mfc_unres_queue)
		goto end_of_list;

	BUG_ON(it->cache != init_net.ipv4.mfc_cache_array);

	while (++it->ct < MFC_LINES) {
		mfc = init_net.ipv4.mfc_cache_array[it->ct];
		if (mfc)
			return mfc;
	}

	/* exhausted cache_array, show unresolved */
	read_unlock(&mrt_lock);
	it->cache = &mfc_unres_queue;
	it->ct = 0;

	spin_lock_bh(&mfc_unres_lock);
	mfc = mfc_unres_queue;
	if (mfc)
		return mfc;

end_of_list:
	spin_unlock_bh(&mfc_unres_lock);
	it->cache = NULL;

	return NULL;
}

static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
	struct ipmr_mfc_iter *it = seq->private;

	/* it->cache records which list the iteration stopped in, and
	 * therefore which of the two locks taken while walking is
	 * still held and must be dropped here. */
	if (it->cache == &mfc_unres_queue)
		spin_unlock_bh(&mfc_unres_lock);
	else if (it->cache == init_net.ipv4.mfc_cache_array)
		read_unlock(&mrt_lock);
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
	int n;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq,
		 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
	} else {
		const struct mfc_cache *mfc = v;
		const struct ipmr_mfc_iter *it = seq->private;

		seq_printf(seq, "%08lX %08lX %-3hd",
			   (unsigned long) mfc->mfc_mcastgrp,
			   (unsigned long) mfc->mfc_origin,
			   mfc->mfc_parent);

		if (it->cache != &mfc_unres_queue) {
			seq_printf(seq, " %8lu %8lu %8lu",
				   mfc->mfc_un.res.pkt,
				   mfc->mfc_un.res.bytes,
				   mfc->mfc_un.res.wrong_if);
			for (n = mfc->mfc_un.res.minvif;
			     n < mfc->mfc_un.res.maxvif; n++) {
				if (VIF_EXISTS(&init_net, n) &&
				    mfc->mfc_un.res.ttls[n] < 255)
					seq_printf(seq,
					   " %2d:%-3d",
					   n, mfc->mfc_un.res.ttls[n]);
			}
		} else {
			/* unresolved mfc_caches don't contain
			 * pkt, bytes and wrong_if values
			 */
			seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
		}
		seq_putc(seq, '\n');
	}
	return 0;
}

static const struct seq_operations ipmr_mfc_seq_ops = {
	.start = ipmr_mfc_seq_start,
	.next  = ipmr_mfc_seq_next,
	.stop  = ipmr_mfc_seq_stop,
	.show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
	return seq_open_private(file, &ipmr_mfc_seq_ops,
			sizeof(struct ipmr_mfc_iter));
}

static const struct file_operations ipmr_mfc_fops = {
	.owner   = THIS_MODULE,
	.open    = ipmr_mfc_open,
	.read    = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release_private,
};
#endif
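
/*
 * Likewise for reference, /proc/net/ip_mr_cache lines produced by the
 * show routine above look like this (a made-up (S,G) entry arriving on
 * vif 0 and forwarded to vifs 1 and 2 with TTL threshold 1):
 *
 *	Group    Origin   Iif     Pkts    Bytes    Wrong Oifs
 *	E1010101 0A000001 0         42     6300        0  1:1   2:1
 *
 * Unresolved entries print zeroed counters and no Oifs list.
 */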

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol = {
	.handler	=	pim_rcv,
};
#endif

/*
 *	Setup for IP multicast routing
 */
static int __net_init ipmr_net_init(struct net *net)
{
	int err = 0;

	net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device),
				      GFP_KERNEL);
	if (!net->ipv4.vif_table) {
		err = -ENOMEM;
		goto fail;
	}

	/* Forwarding cache */
	net->ipv4.mfc_cache_array = kcalloc(MFC_LINES,
					    sizeof(struct mfc_cache *),
					    GFP_KERNEL);
	if (!net->ipv4.mfc_cache_array) {
		err = -ENOMEM;
		goto fail_mfc_cache;
	}
	return 0;

fail_mfc_cache:
	kfree(net->ipv4.vif_table);
fail:
	return err;
}

static void __net_exit ipmr_net_exit(struct net *net)
{
	kfree(net->ipv4.mfc_cache_array);
	kfree(net->ipv4.vif_table);
}

static struct pernet_operations ipmr_net_ops = {
	.init = ipmr_net_init,
	.exit = ipmr_net_exit,
};
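
/*
 * With ipmr_net_ops registered below, every struct net gets its own
 * vif table and forwarding cache: ipmr_net_init() runs for each
 * namespace as it is created (including init_net at boot), and
 * ipmr_net_exit() frees both tables when the namespace goes away.
 */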

int __init ip_mr_init(void)
{
	int err;

	mrt_cachep = kmem_cache_create("ip_mrt_cache",
				       sizeof(struct mfc_cache),
				       0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
				       NULL);
	if (!mrt_cachep)
		return -ENOMEM;

	err = register_pernet_subsys(&ipmr_net_ops);
	if (err)
		goto reg_pernet_fail;

	setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
	err = register_netdevice_notifier(&ip_mr_notifier);
	if (err)
		goto reg_notif_fail;
#ifdef CONFIG_PROC_FS
	err = -ENOMEM;
	if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops))
		goto proc_vif_fail;
	if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops))
		goto proc_cache_fail;
#endif
	return 0;

	/* Error paths unwind in reverse order of the setup above. */
#ifdef CONFIG_PROC_FS
proc_cache_fail:
	proc_net_remove(&init_net, "ip_mr_vif");
proc_vif_fail:
	unregister_netdevice_notifier(&ip_mr_notifier);
#endif
reg_notif_fail:
	del_timer(&ipmr_expire_timer);
	unregister_pernet_subsys(&ipmr_net_ops);
reg_pernet_fail:
	kmem_cache_destroy(mrt_cachep);
	return err;
}