netns: ipmr: declare counter cache_resolve_queue_len per-namespace
net/ipv4/ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Fixes:
13  *      Michael Chastain        :       Incorrect size of copying.
14  *      Alan Cox                :       Added the cache manager code
15  *      Alan Cox                :       Fixed the clone/copy bug and device race.
16  *      Mike McLagan            :       Routing by source
17  *      Malcolm Beattie         :       Buffer handling fixes.
18  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
19  *      SVR Anand               :       Fixed several multicast bugs and problems.
20  *      Alexey Kuznetsov        :       Status, optimisations and more.
21  *      Brad Parker             :       Better behaviour on mrouted upcall
22  *                                      overflow.
23  *      Carlos Picoto           :       PIMv1 Support
24  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
25  *                                      Relax this requirement to work with older peers.
26  *
27  */
28
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <linux/types.h>
32 #include <linux/capability.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <linux/mm.h>
36 #include <linux/kernel.h>
37 #include <linux/fcntl.h>
38 #include <linux/stat.h>
39 #include <linux/socket.h>
40 #include <linux/in.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/inetdevice.h>
44 #include <linux/igmp.h>
45 #include <linux/proc_fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/mroute.h>
48 #include <linux/init.h>
49 #include <linux/if_ether.h>
50 #include <net/net_namespace.h>
51 #include <net/ip.h>
52 #include <net/protocol.h>
53 #include <linux/skbuff.h>
54 #include <net/route.h>
55 #include <net/sock.h>
56 #include <net/icmp.h>
57 #include <net/udp.h>
58 #include <net/raw.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
62 #include <net/ipip.h>
63 #include <net/checksum.h>
64 #include <net/netlink.h>
65
66 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67 #define CONFIG_IP_PIMSM 1
68 #endif
69
70 /* Big lock, protecting vif table, mrt cache and mroute socket state.
71    Note that the changes are serialized via rtnl_lock.
72  */
73
74 static DEFINE_RWLOCK(mrt_lock);
75
76 /*
77  *      Multicast router control variables
78  */
79
80 #define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)
81
82 static int mroute_do_assert;                            /* Set in PIM assert    */
83 static int mroute_do_pim;
84
85 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
86
87 /* Special spinlock for queue of unresolved entries */
88 static DEFINE_SPINLOCK(mfc_unres_lock);
89
90 /* We return to Alan's original scheme. The hash table of resolved
91    entries is changed only in process context and is protected by
92    the weak lock mrt_lock. The queue of unresolved entries is
93    protected by the strong spinlock mfc_unres_lock.
94
95    This way the data path is entirely free of exclusive locks.
96  */
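/*
 * A minimal sketch of the resulting discipline (editorial illustration,
 * mirroring the code in this file): readers on the data path take the
 * cheap read side of mrt_lock, while the unresolved queue always needs
 * the spinlock, whatever the context.
 *
 *	read_lock(&mrt_lock);
 *	c = ipmr_cache_find(origin, mcastgrp);	// resolved entries: weak lock
 *	read_unlock(&mrt_lock);
 *
 *	spin_lock_bh(&mfc_unres_lock);		// unresolved queue: strong lock
 *	// walk or modify mfc_unres_queue here
 *	spin_unlock_bh(&mfc_unres_lock);
 */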
97
98 static struct kmem_cache *mrt_cachep __read_mostly;
99
100 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
101 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
102 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
103
104 #ifdef CONFIG_IP_PIMSM_V2
105 static struct net_protocol pim_protocol;
106 #endif
107
108 static struct timer_list ipmr_expire_timer;
109
110 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
111
112 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
113 {
114         dev_close(dev);
115
116         dev = __dev_get_by_name(&init_net, "tunl0");
117         if (dev) {
118                 const struct net_device_ops *ops = dev->netdev_ops;
119                 struct ifreq ifr;
120                 struct ip_tunnel_parm p;
121
122                 memset(&p, 0, sizeof(p));
123                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
124                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
125                 p.iph.version = 4;
126                 p.iph.ihl = 5;
127                 p.iph.protocol = IPPROTO_IPIP;
128                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
129                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
130
131                 if (ops->ndo_do_ioctl) {
132                         mm_segment_t oldfs = get_fs();
133
134                         set_fs(KERNEL_DS);
135                         ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
136                         set_fs(oldfs);
137                 }
138         }
139 }
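/*
 * The get_fs()/set_fs(KERNEL_DS) dance above is the standard trick for
 * invoking a driver ioctl handler, which expects a user-space pointer,
 * with an on-stack kernel buffer. A hedged sketch of the pattern:
 *
 *	mm_segment_t oldfs = get_fs();
 *	set_fs(KERNEL_DS);	// user-access checks now accept kernel pointers
 *	err = ops->ndo_do_ioctl(dev, &ifr, cmd);
 *	set_fs(oldfs);		// always restore the previous limit
 */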
140
141 static
142 struct net_device *ipmr_new_tunnel(struct vifctl *v)
143 {
144         struct net_device  *dev;
145
146         dev = __dev_get_by_name(&init_net, "tunl0");
147
148         if (dev) {
149                 const struct net_device_ops *ops = dev->netdev_ops;
150                 int err;
151                 struct ifreq ifr;
152                 struct ip_tunnel_parm p;
153                 struct in_device  *in_dev;
154
155                 memset(&p, 0, sizeof(p));
156                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
157                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
158                 p.iph.version = 4;
159                 p.iph.ihl = 5;
160                 p.iph.protocol = IPPROTO_IPIP;
161                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
162                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
163
164                 if (ops->ndo_do_ioctl) {
165                         mm_segment_t oldfs = get_fs();
166
167                         set_fs(KERNEL_DS);
168                         err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
169                         set_fs(oldfs);
170                 } else
171                         err = -EOPNOTSUPP;
172
173                 dev = NULL;
174
175                 if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
176                         dev->flags |= IFF_MULTICAST;
177
178                         in_dev = __in_dev_get_rtnl(dev);
179                         if (in_dev == NULL)
180                                 goto failure;
181
182                         ipv4_devconf_setall(in_dev);
183                         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
184
185                         if (dev_open(dev))
186                                 goto failure;
187                         dev_hold(dev);
188                 }
189         }
190         return dev;
191
192 failure:
193         /* allow the register to be completed before unregistering. */
194         rtnl_unlock();
195         rtnl_lock();
196
197         unregister_netdevice(dev);
198         return NULL;
199 }
200
201 #ifdef CONFIG_IP_PIMSM
202
203 static int reg_vif_num = -1;
204
205 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
206 {
207         read_lock(&mrt_lock);
208         dev->stats.tx_bytes += skb->len;
209         dev->stats.tx_packets++;
210         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
211         read_unlock(&mrt_lock);
212         kfree_skb(skb);
213         return 0;
214 }
215
216 static const struct net_device_ops reg_vif_netdev_ops = {
217         .ndo_start_xmit = reg_vif_xmit,
218 };
219
220 static void reg_vif_setup(struct net_device *dev)
221 {
222         dev->type               = ARPHRD_PIMREG;
223         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
224         dev->flags              = IFF_NOARP;
225         dev->netdev_ops         = &reg_vif_netdev_ops;
226         dev->destructor         = free_netdev;
227 }
228
229 static struct net_device *ipmr_reg_vif(void)
230 {
231         struct net_device *dev;
232         struct in_device *in_dev;
233
234         dev = alloc_netdev(0, "pimreg", reg_vif_setup);
235
236         if (dev == NULL)
237                 return NULL;
238
239         if (register_netdevice(dev)) {
240                 free_netdev(dev);
241                 return NULL;
242         }
243         dev->iflink = 0;
244
245         rcu_read_lock();
246         if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
247                 rcu_read_unlock();
248                 goto failure;
249         }
250
251         ipv4_devconf_setall(in_dev);
252         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
253         rcu_read_unlock();
254
255         if (dev_open(dev))
256                 goto failure;
257
258         dev_hold(dev);
259
260         return dev;
261
262 failure:
263         /* allow the register to be completed before unregistering. */
264         rtnl_unlock();
265         rtnl_lock();
266
267         unregister_netdevice(dev);
268         return NULL;
269 }
270 #endif
271
272 /*
273  *      Delete a VIF entry
274  *      @notify: Set to 1 if the caller is a notifier_call
275  */
276
277 static int vif_delete(int vifi, int notify)
278 {
279         struct vif_device *v;
280         struct net_device *dev;
281         struct in_device *in_dev;
282
283         if (vifi < 0 || vifi >= init_net.ipv4.maxvif)
284                 return -EADDRNOTAVAIL;
285
286         v = &init_net.ipv4.vif_table[vifi];
287
288         write_lock_bh(&mrt_lock);
289         dev = v->dev;
290         v->dev = NULL;
291
292         if (!dev) {
293                 write_unlock_bh(&mrt_lock);
294                 return -EADDRNOTAVAIL;
295         }
296
297 #ifdef CONFIG_IP_PIMSM
298         if (vifi == reg_vif_num)
299                 reg_vif_num = -1;
300 #endif
301
302         if (vifi+1 == init_net.ipv4.maxvif) {
303                 int tmp;
304                 for (tmp=vifi-1; tmp>=0; tmp--) {
305                         if (VIF_EXISTS(&init_net, tmp))
306                                 break;
307                 }
308                 init_net.ipv4.maxvif = tmp+1;
309         }
310
311         write_unlock_bh(&mrt_lock);
312
313         dev_set_allmulti(dev, -1);
314
315         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
316                 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
317                 ip_rt_multicast_event(in_dev);
318         }
319
320         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
321                 unregister_netdevice(dev);
322
323         dev_put(dev);
324         return 0;
325 }
326
327 static inline void ipmr_cache_free(struct mfc_cache *c)
328 {
329         release_net(mfc_net(c));
330         kmem_cache_free(mrt_cachep, c);
331 }
332
333 /* Destroy an unresolved cache entry, killing queued skbs
334    and reporting error to netlink readers.
335  */
336
337 static void ipmr_destroy_unres(struct mfc_cache *c)
338 {
339         struct sk_buff *skb;
340         struct nlmsgerr *e;
341
342         atomic_dec(&init_net.ipv4.cache_resolve_queue_len);
343
344         while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
345                 if (ip_hdr(skb)->version == 0) {
346                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
347                         nlh->nlmsg_type = NLMSG_ERROR;
348                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
349                         skb_trim(skb, nlh->nlmsg_len);
350                         e = NLMSG_DATA(nlh);
351                         e->error = -ETIMEDOUT;
352                         memset(&e->msg, 0, sizeof(e->msg));
353
354                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
355                 } else
356                         kfree_skb(skb);
357         }
358
359         ipmr_cache_free(c);
360 }
361
362
363 /* Single timer process for all the unresolved queue. */
364
365 static void ipmr_expire_process(unsigned long dummy)
366 {
367         unsigned long now;
368         unsigned long expires;
369         struct mfc_cache *c, **cp;
370
371         if (!spin_trylock(&mfc_unres_lock)) {
372                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
373                 return;
374         }
375
376         if (mfc_unres_queue == NULL)
377                 goto out;
378
379         now = jiffies;
380         expires = 10*HZ;
381         cp = &mfc_unres_queue;
382
383         while ((c=*cp) != NULL) {
384                 if (time_after(c->mfc_un.unres.expires, now)) {
385                         unsigned long interval = c->mfc_un.unres.expires - now;
386                         if (interval < expires)
387                                 expires = interval;
388                         cp = &c->next;
389                         continue;
390                 }
391
392                 *cp = c->next;
393
394                 ipmr_destroy_unres(c);
395         }
396
397         if (mfc_unres_queue != NULL)
398                 mod_timer(&ipmr_expire_timer, jiffies + expires);
399
400 out:
401         spin_unlock(&mfc_unres_lock);
402 }
403
404 /* Fill oifs list. It is called under write locked mrt_lock. */
405
406 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
407 {
408         int vifi;
409
410         cache->mfc_un.res.minvif = MAXVIFS;
411         cache->mfc_un.res.maxvif = 0;
412         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
413
414         for (vifi = 0; vifi < init_net.ipv4.maxvif; vifi++) {
415                 if (VIF_EXISTS(&init_net, vifi) &&
416                     ttls[vifi] && ttls[vifi] < 255) {
417                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
418                         if (cache->mfc_un.res.minvif > vifi)
419                                 cache->mfc_un.res.minvif = vifi;
420                         if (cache->mfc_un.res.maxvif <= vifi)
421                                 cache->mfc_un.res.maxvif = vifi + 1;
422                 }
423         }
424 }
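/*
 * Illustration of the ttls[] convention consumed above (values are
 * made up): 0 or 255 means "never forward on this vif", anything else
 * is a TTL threshold the packet must exceed (see ip_mr_forward()).
 *
 *	unsigned char ttls[MAXVIFS] = { [0] = 1, [2] = 64 };
 *	// => minvif == 0, maxvif == 3; packets leave vif 0 if ttl > 1,
 *	//    leave vif 2 only if ttl > 64, and skip every other vif.
 */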
425
426 static int vif_add(struct vifctl *vifc, int mrtsock)
427 {
428         int vifi = vifc->vifc_vifi;
429         struct vif_device *v = &init_net.ipv4.vif_table[vifi];
430         struct net_device *dev;
431         struct in_device *in_dev;
432         int err;
433
434         /* Is vif busy ? */
435         if (VIF_EXISTS(&init_net, vifi))
436                 return -EADDRINUSE;
437
438         switch (vifc->vifc_flags) {
439 #ifdef CONFIG_IP_PIMSM
440         case VIFF_REGISTER:
441                 /*
442                  * Special Purpose VIF in PIM
443                  * All the packets will be sent to the daemon
444                  */
445                 if (reg_vif_num >= 0)
446                         return -EADDRINUSE;
447                 dev = ipmr_reg_vif();
448                 if (!dev)
449                         return -ENOBUFS;
450                 err = dev_set_allmulti(dev, 1);
451                 if (err) {
452                         unregister_netdevice(dev);
453                         dev_put(dev);
454                         return err;
455                 }
456                 break;
457 #endif
458         case VIFF_TUNNEL:
459                 dev = ipmr_new_tunnel(vifc);
460                 if (!dev)
461                         return -ENOBUFS;
462                 err = dev_set_allmulti(dev, 1);
463                 if (err) {
464                         ipmr_del_tunnel(dev, vifc);
465                         dev_put(dev);
466                         return err;
467                 }
468                 break;
469         case 0:
470                 dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
471                 if (!dev)
472                         return -EADDRNOTAVAIL;
473                 err = dev_set_allmulti(dev, 1);
474                 if (err) {
475                         dev_put(dev);
476                         return err;
477                 }
478                 break;
479         default:
480                 return -EINVAL;
481         }
482
483         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
484                 return -EADDRNOTAVAIL;
485         IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
486         ip_rt_multicast_event(in_dev);
487
488         /*
489          *      Fill in the VIF structures
490          */
491         v->rate_limit = vifc->vifc_rate_limit;
492         v->local = vifc->vifc_lcl_addr.s_addr;
493         v->remote = vifc->vifc_rmt_addr.s_addr;
494         v->flags = vifc->vifc_flags;
495         if (!mrtsock)
496                 v->flags |= VIFF_STATIC;
497         v->threshold = vifc->vifc_threshold;
498         v->bytes_in = 0;
499         v->bytes_out = 0;
500         v->pkt_in = 0;
501         v->pkt_out = 0;
502         v->link = dev->ifindex;
503         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
504                 v->link = dev->iflink;
505
506         /* And finish update writing critical data */
507         write_lock_bh(&mrt_lock);
508         v->dev = dev;
509 #ifdef CONFIG_IP_PIMSM
510         if (v->flags&VIFF_REGISTER)
511                 reg_vif_num = vifi;
512 #endif
513         if (vifi+1 > init_net.ipv4.maxvif)
514                 init_net.ipv4.maxvif = vifi+1;
515         write_unlock_bh(&mrt_lock);
516         return 0;
517 }
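/*
 * How a routing daemon drives vif_add() from user space - a hedged
 * sketch, not lifted from mrouted itself; error handling is omitted
 * and the address is a documentation example:
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	int on = 1;
 *	struct vifctl vc = {
 *		.vifc_vifi	 = 0,
 *		.vifc_flags	 = 0,	// or VIFF_TUNNEL / VIFF_REGISTER
 *		.vifc_threshold	 = 1,
 *		.vifc_rate_limit = 0,
 *	};
 *
 *	setsockopt(s, IPPROTO_IP, MRT_INIT, &on, sizeof(on));
 *	vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
 *	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 */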
518
519 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
520 {
521         int line = MFC_HASH(mcastgrp, origin);
522         struct mfc_cache *c;
523
524         for (c = init_net.ipv4.mfc_cache_array[line]; c; c = c->next) {
525                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
526                         break;
527         }
528         return c;
529 }
530
531 /*
532  *      Allocate a multicast cache entry
533  */
534 static struct mfc_cache *ipmr_cache_alloc(struct net *net)
535 {
536         struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
537         if (c == NULL)
538                 return NULL;
539         c->mfc_un.res.minvif = MAXVIFS;
540         mfc_net_set(c, net);
541         return c;
542 }
543
544 static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
545 {
546         struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
547         if (c == NULL)
548                 return NULL;
549         skb_queue_head_init(&c->mfc_un.unres.unresolved);
550         c->mfc_un.unres.expires = jiffies + 10*HZ;
551         mfc_net_set(c, net);
552         return c;
553 }
554
555 /*
556  *      A cache entry has gone into a resolved state from queued
557  */
558
559 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
560 {
561         struct sk_buff *skb;
562         struct nlmsgerr *e;
563
564         /*
565          *      Play the pending entries through our router
566          */
567
568         while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
569                 if (ip_hdr(skb)->version == 0) {
570                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
571
572                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
573                                 nlh->nlmsg_len = (skb_tail_pointer(skb) -
574                                                   (u8 *)nlh);
575                         } else {
576                                 nlh->nlmsg_type = NLMSG_ERROR;
577                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
578                                 skb_trim(skb, nlh->nlmsg_len);
579                                 e = NLMSG_DATA(nlh);
580                                 e->error = -EMSGSIZE;
581                                 memset(&e->msg, 0, sizeof(e->msg));
582                         }
583
584                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
585                 } else
586                         ip_mr_forward(skb, c, 0);
587         }
588 }
589
590 /*
591  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
592  *      expects the following bizarre scheme.
593  *
594  *      Called under mrt_lock.
595  */
596
597 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
598 {
599         struct sk_buff *skb;
600         const int ihl = ip_hdrlen(pkt);
601         struct igmphdr *igmp;
602         struct igmpmsg *msg;
603         int ret;
604
605 #ifdef CONFIG_IP_PIMSM
606         if (assert == IGMPMSG_WHOLEPKT)
607                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
608         else
609 #endif
610                 skb = alloc_skb(128, GFP_ATOMIC);
611
612         if (!skb)
613                 return -ENOBUFS;
614
615 #ifdef CONFIG_IP_PIMSM
616         if (assert == IGMPMSG_WHOLEPKT) {
617                 /* Ugly, but we have no choice with this interface.
618                    Duplicate old header, fix ihl, length etc.
619                    And all this only to mangle msg->im_msgtype and
620                    to set msg->im_mbz to "mbz" :-)
621                  */
622                 skb_push(skb, sizeof(struct iphdr));
623                 skb_reset_network_header(skb);
624                 skb_reset_transport_header(skb);
625                 msg = (struct igmpmsg *)skb_network_header(skb);
626                 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
627                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
628                 msg->im_mbz = 0;
629                 msg->im_vif = reg_vif_num;
630                 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
631                 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
632                                              sizeof(struct iphdr));
633         } else
634 #endif
635         {
636
637         /*
638          *      Copy the IP header
639          */
640
641         skb->network_header = skb->tail;
642         skb_put(skb, ihl);
643         skb_copy_to_linear_data(skb, pkt->data, ihl);
644         ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
645         msg = (struct igmpmsg *)skb_network_header(skb);
646         msg->im_vif = vifi;
647         skb->dst = dst_clone(pkt->dst);
648
649         /*
650          *      Add our header
651          */
652
653         igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
654         igmp->type      =
655         msg->im_msgtype = assert;
656         igmp->code      =       0;
657         ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
658         skb->transport_header = skb->network_header;
659         }
660
661         if (init_net.ipv4.mroute_sk == NULL) {
662                 kfree_skb(skb);
663                 return -EINVAL;
664         }
665
666         /*
667          *      Deliver to mrouted
668          */
669         ret = sock_queue_rcv_skb(init_net.ipv4.mroute_sk, skb);
670         if (ret < 0) {
671                 if (net_ratelimit())
672                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
673                 kfree_skb(skb);
674         }
675
676         return ret;
677 }
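/*
 * On the other end, the daemon reads the reports built above from the
 * same raw IGMP socket it passed to MRT_INIT. A hedged user-space
 * sketch of consuming the upcall:
 *
 *	char buf[1500];
 *	ssize_t n = read(s, buf, sizeof(buf));
 *	struct igmpmsg *m = (struct igmpmsg *)buf;
 *
 *	if (n > 0 && m->im_mbz == 0) {	// real IGMP never carries 0 here
 *		switch (m->im_msgtype) {
 *		case IGMPMSG_NOCACHE:	// resolve (im_src, im_dst), then MRT_ADD_MFC
 *		case IGMPMSG_WRONGVIF:	// PIM assert processing
 *		case IGMPMSG_WHOLEPKT:	// PIM register encapsulation
 *			break;
 *		}
 *	}
 */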
678
679 /*
680  *      Queue a packet for resolution. It gets a locked cache entry!
681  */
682
683 static int
684 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
685 {
686         int err;
687         struct mfc_cache *c;
688         const struct iphdr *iph = ip_hdr(skb);
689
690         spin_lock_bh(&mfc_unres_lock);
691         for (c=mfc_unres_queue; c; c=c->next) {
692                 if (net_eq(mfc_net(c), &init_net) &&
693                     c->mfc_mcastgrp == iph->daddr &&
694                     c->mfc_origin == iph->saddr)
695                         break;
696         }
697
698         if (c == NULL) {
699                 /*
700                  *      Create a new entry if allowable
701                  */
702
703                 if (atomic_read(&init_net.ipv4.cache_resolve_queue_len) >= 10 ||
704                     (c = ipmr_cache_alloc_unres(&init_net)) == NULL) {
705                         spin_unlock_bh(&mfc_unres_lock);
706
707                         kfree_skb(skb);
708                         return -ENOBUFS;
709                 }
710
711                 /*
712                  *      Fill in the new cache entry
713                  */
714                 c->mfc_parent   = -1;
715                 c->mfc_origin   = iph->saddr;
716                 c->mfc_mcastgrp = iph->daddr;
717
718                 /*
719                  *      Reflect first query at mrouted.
720                  */
721                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
722                         /* If the report failed throw the cache entry
723                            out - Brad Parker
724                          */
725                         spin_unlock_bh(&mfc_unres_lock);
726
727                         ipmr_cache_free(c);
728                         kfree_skb(skb);
729                         return err;
730                 }
731
732                 atomic_inc(&init_net.ipv4.cache_resolve_queue_len);
733                 c->next = mfc_unres_queue;
734                 mfc_unres_queue = c;
735
736                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
737         }
738
739         /*
740          *      See if we can append the packet
741          */
742         if (c->mfc_un.unres.unresolved.qlen>3) {
743                 kfree_skb(skb);
744                 err = -ENOBUFS;
745         } else {
746                 skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
747                 err = 0;
748         }
749
750         spin_unlock_bh(&mfc_unres_lock);
751         return err;
752 }
753
754 /*
755  *      MFC cache manipulation by user space mroute daemon
756  */
757
758 static int ipmr_mfc_delete(struct mfcctl *mfc)
759 {
760         int line;
761         struct mfc_cache *c, **cp;
762
763         line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
764
765         for (cp = &init_net.ipv4.mfc_cache_array[line];
766              (c = *cp) != NULL; cp = &c->next) {
767                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
768                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
769                         write_lock_bh(&mrt_lock);
770                         *cp = c->next;
771                         write_unlock_bh(&mrt_lock);
772
773                         ipmr_cache_free(c);
774                         return 0;
775                 }
776         }
777         return -ENOENT;
778 }
779
780 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
781 {
782         int line;
783         struct mfc_cache *uc, *c, **cp;
784
785         line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
786
787         for (cp = &init_net.ipv4.mfc_cache_array[line];
788              (c = *cp) != NULL; cp = &c->next) {
789                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
790                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
791                         break;
792         }
793
794         if (c != NULL) {
795                 write_lock_bh(&mrt_lock);
796                 c->mfc_parent = mfc->mfcc_parent;
797                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
798                 if (!mrtsock)
799                         c->mfc_flags |= MFC_STATIC;
800                 write_unlock_bh(&mrt_lock);
801                 return 0;
802         }
803
804         if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
805                 return -EINVAL;
806
807         c = ipmr_cache_alloc(&init_net);
808         if (c == NULL)
809                 return -ENOMEM;
810
811         c->mfc_origin = mfc->mfcc_origin.s_addr;
812         c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
813         c->mfc_parent = mfc->mfcc_parent;
814         ipmr_update_thresholds(c, mfc->mfcc_ttls);
815         if (!mrtsock)
816                 c->mfc_flags |= MFC_STATIC;
817
818         write_lock_bh(&mrt_lock);
819         c->next = init_net.ipv4.mfc_cache_array[line];
820         init_net.ipv4.mfc_cache_array[line] = c;
821         write_unlock_bh(&mrt_lock);
822
823         /*
824          *      Check to see if we resolved a queued list. If so we
825          *      need to send on the frames and tidy up.
826          */
827         spin_lock_bh(&mfc_unres_lock);
828         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
829              cp = &uc->next) {
830                 if (net_eq(mfc_net(uc), &init_net) &&
831                     uc->mfc_origin == c->mfc_origin &&
832                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
833                         *cp = uc->next;
834                         atomic_dec(&init_net.ipv4.cache_resolve_queue_len);
835                         break;
836                 }
837         }
838         if (mfc_unres_queue == NULL)
839                 del_timer(&ipmr_expire_timer);
840         spin_unlock_bh(&mfc_unres_lock);
841
842         if (uc) {
843                 ipmr_cache_resolve(uc, c);
844                 ipmr_cache_free(uc);
845         }
846         return 0;
847 }
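/*
 * The matching user-space call for the (S,G) entry handled above - a
 * hedged sketch; the addresses are documentation examples:
 *
 *	struct mfcctl mc = { 0 };
 *
 *	mc.mfcc_origin.s_addr   = inet_addr("192.0.2.10");	// S
 *	mc.mfcc_mcastgrp.s_addr = inet_addr("233.252.0.1");	// G
 *	mc.mfcc_parent  = 0;		// expected incoming vif
 *	mc.mfcc_ttls[2] = 1;		// forward on vif 2, TTL threshold 1
 *	setsockopt(s, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 */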
848
849 /*
850  *      Close the multicast socket, and clear the vif tables etc
851  */
852
853 static void mroute_clean_tables(struct sock *sk)
854 {
855         int i;
856
857         /*
858          *      Shut down all active vif entries
859          */
860         for (i = 0; i < init_net.ipv4.maxvif; i++) {
861                 if (!(init_net.ipv4.vif_table[i].flags&VIFF_STATIC))
862                         vif_delete(i, 0);
863         }
864
865         /*
866          *      Wipe the cache
867          */
868         for (i=0; i<MFC_LINES; i++) {
869                 struct mfc_cache *c, **cp;
870
871                 cp = &init_net.ipv4.mfc_cache_array[i];
872                 while ((c = *cp) != NULL) {
873                         if (c->mfc_flags&MFC_STATIC) {
874                                 cp = &c->next;
875                                 continue;
876                         }
877                         write_lock_bh(&mrt_lock);
878                         *cp = c->next;
879                         write_unlock_bh(&mrt_lock);
880
881                         ipmr_cache_free(c);
882                 }
883         }
884
885         if (atomic_read(&init_net.ipv4.cache_resolve_queue_len) != 0) {
886                 struct mfc_cache *c, **cp;
887
888                 spin_lock_bh(&mfc_unres_lock);
889                 cp = &mfc_unres_queue;
890                 while ((c = *cp) != NULL) {
891                         if (!net_eq(mfc_net(c), &init_net)) {
892                                 cp = &c->next;
893                                 continue;
894                         }
895                         *cp = c->next;
896
897                         ipmr_destroy_unres(c);
898                 }
899                 spin_unlock_bh(&mfc_unres_lock);
900         }
901 }
902
903 static void mrtsock_destruct(struct sock *sk)
904 {
905         rtnl_lock();
906         if (sk == init_net.ipv4.mroute_sk) {
907                 IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;
908
909                 write_lock_bh(&mrt_lock);
910                 init_net.ipv4.mroute_sk = NULL;
911                 write_unlock_bh(&mrt_lock);
912
913                 mroute_clean_tables(sk);
914         }
915         rtnl_unlock();
916 }
917
918 /*
919  *      Socket options and virtual interface manipulation. The whole
920  *      virtual interface system is a complete heap, but unfortunately
921  *      that's how BSD mrouted happens to think. Maybe one day with a proper
922  *      MOSPF/PIM router set up we can clean this up.
923  */
924
925 int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
926 {
927         int ret;
928         struct vifctl vif;
929         struct mfcctl mfc;
930
931         if (optname != MRT_INIT) {
932                 if (sk != init_net.ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
933                         return -EACCES;
934         }
935
936         switch (optname) {
937         case MRT_INIT:
938                 if (sk->sk_type != SOCK_RAW ||
939                     inet_sk(sk)->num != IPPROTO_IGMP)
940                         return -EOPNOTSUPP;
941                 if (optlen != sizeof(int))
942                         return -ENOPROTOOPT;
943
944                 rtnl_lock();
945                 if (init_net.ipv4.mroute_sk) {
946                         rtnl_unlock();
947                         return -EADDRINUSE;
948                 }
949
950                 ret = ip_ra_control(sk, 1, mrtsock_destruct);
951                 if (ret == 0) {
952                         write_lock_bh(&mrt_lock);
953                         init_net.ipv4.mroute_sk = sk;
954                         write_unlock_bh(&mrt_lock);
955
956                         IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
957                 }
958                 rtnl_unlock();
959                 return ret;
960         case MRT_DONE:
961                 if (sk != init_net.ipv4.mroute_sk)
962                         return -EACCES;
963                 return ip_ra_control(sk, 0, NULL);
964         case MRT_ADD_VIF:
965         case MRT_DEL_VIF:
966                 if (optlen != sizeof(vif))
967                         return -EINVAL;
968                 if (copy_from_user(&vif, optval, sizeof(vif)))
969                         return -EFAULT;
970                 if (vif.vifc_vifi >= MAXVIFS)
971                         return -ENFILE;
972                 rtnl_lock();
973                 if (optname == MRT_ADD_VIF) {
974                         ret = vif_add(&vif, sk == init_net.ipv4.mroute_sk);
975                 } else {
976                         ret = vif_delete(vif.vifc_vifi, 0);
977                 }
978                 rtnl_unlock();
979                 return ret;
980
981                 /*
982                  *      Manipulate the forwarding caches. These live
983                  *      in a sort of kernel/user symbiosis.
984                  */
985         case MRT_ADD_MFC:
986         case MRT_DEL_MFC:
987                 if (optlen != sizeof(mfc))
988                         return -EINVAL;
989                 if (copy_from_user(&mfc, optval, sizeof(mfc)))
990                         return -EFAULT;
991                 rtnl_lock();
992                 if (optname == MRT_DEL_MFC)
993                         ret = ipmr_mfc_delete(&mfc);
994                 else
995                         ret = ipmr_mfc_add(&mfc, sk == init_net.ipv4.mroute_sk);
996                 rtnl_unlock();
997                 return ret;
998                 /*
999                  *      Control PIM assert.
1000                  */
1001         case MRT_ASSERT:
1002         {
1003                 int v;
1004                 if (get_user(v,(int __user *)optval))
1005                         return -EFAULT;
1006                 mroute_do_assert=(v)?1:0;
1007                 return 0;
1008         }
1009 #ifdef CONFIG_IP_PIMSM
1010         case MRT_PIM:
1011         {
1012                 int v;
1013
1014                 if (get_user(v,(int __user *)optval))
1015                         return -EFAULT;
1016                 v = (v) ? 1 : 0;
1017
1018                 rtnl_lock();
1019                 ret = 0;
1020                 if (v != mroute_do_pim) {
1021                         mroute_do_pim = v;
1022                         mroute_do_assert = v;
1023 #ifdef CONFIG_IP_PIMSM_V2
1024                         if (mroute_do_pim)
1025                                 ret = inet_add_protocol(&pim_protocol,
1026                                                         IPPROTO_PIM);
1027                         else
1028                                 ret = inet_del_protocol(&pim_protocol,
1029                                                         IPPROTO_PIM);
1030                         if (ret < 0)
1031                                 ret = -EAGAIN;
1032 #endif
1033                 }
1034                 rtnl_unlock();
1035                 return ret;
1036         }
1037 #endif
1038         /*
1039          *      Spurious command, or MRT_VERSION which you cannot
1040          *      set.
1041          */
1042         default:
1043                 return -ENOPROTOOPT;
1044         }
1045 }
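/*
 * Putting the options together, the lifetime of a multicast routing
 * daemon against this interface looks roughly like this (a hedged
 * outline, not a complete program):
 *
 *	s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	setsockopt(s, IPPROTO_IP, MRT_INIT, ...);	// become THE mroute socket
 *	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, ...);	// once per interface
 *	// main loop: read() upcalls, answer with MRT_ADD_MFC / MRT_DEL_MFC
 *	setsockopt(s, IPPROTO_IP, MRT_DONE, ...);	// or just close(s); either
 *							// way mrtsock_destruct()
 *							// tears the tables down
 */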
1046
1047 /*
1048  *      Getsockopt support for the multicast routing system.
1049  */
1050
1051 int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1052 {
1053         int olr;
1054         int val;
1055
1056         if (optname != MRT_VERSION &&
1057 #ifdef CONFIG_IP_PIMSM
1058            optname!=MRT_PIM &&
1059 #endif
1060            optname!=MRT_ASSERT)
1061                 return -ENOPROTOOPT;
1062
1063         if (get_user(olr, optlen))
1064                 return -EFAULT;
1065
1066         olr = min_t(unsigned int, olr, sizeof(int));
1067         if (olr < 0)
1068                 return -EINVAL;
1069
1070         if (put_user(olr, optlen))
1071                 return -EFAULT;
1072         if (optname == MRT_VERSION)
1073                 val = 0x0305;
1074 #ifdef CONFIG_IP_PIMSM
1075         else if (optname == MRT_PIM)
1076                 val = mroute_do_pim;
1077 #endif
1078         else
1079                 val = mroute_do_assert;
1080         if (copy_to_user(optval, &val, olr))
1081                 return -EFAULT;
1082         return 0;
1083 }
1084
1085 /*
1086  *      The IP multicast ioctl support routines.
1087  */
1088
1089 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1090 {
1091         struct sioc_sg_req sr;
1092         struct sioc_vif_req vr;
1093         struct vif_device *vif;
1094         struct mfc_cache *c;
1095
1096         switch (cmd) {
1097         case SIOCGETVIFCNT:
1098                 if (copy_from_user(&vr, arg, sizeof(vr)))
1099                         return -EFAULT;
1100                 if (vr.vifi >= init_net.ipv4.maxvif)
1101                         return -EINVAL;
1102                 read_lock(&mrt_lock);
1103                 vif = &init_net.ipv4.vif_table[vr.vifi];
1104                 if (VIF_EXISTS(&init_net, vr.vifi)) {
1105                         vr.icount = vif->pkt_in;
1106                         vr.ocount = vif->pkt_out;
1107                         vr.ibytes = vif->bytes_in;
1108                         vr.obytes = vif->bytes_out;
1109                         read_unlock(&mrt_lock);
1110
1111                         if (copy_to_user(arg, &vr, sizeof(vr)))
1112                                 return -EFAULT;
1113                         return 0;
1114                 }
1115                 read_unlock(&mrt_lock);
1116                 return -EADDRNOTAVAIL;
1117         case SIOCGETSGCNT:
1118                 if (copy_from_user(&sr, arg, sizeof(sr)))
1119                         return -EFAULT;
1120
1121                 read_lock(&mrt_lock);
1122                 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1123                 if (c) {
1124                         sr.pktcnt = c->mfc_un.res.pkt;
1125                         sr.bytecnt = c->mfc_un.res.bytes;
1126                         sr.wrong_if = c->mfc_un.res.wrong_if;
1127                         read_unlock(&mrt_lock);
1128
1129                         if (copy_to_user(arg, &sr, sizeof(sr)))
1130                                 return -EFAULT;
1131                         return 0;
1132                 }
1133                 read_unlock(&mrt_lock);
1134                 return -EADDRNOTAVAIL;
1135         default:
1136                 return -ENOIOCTLCMD;
1137         }
1138 }
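/*
 * Reading the counters maintained above from user space (hedged
 * sketch; s is the MRT_INIT socket, addresses are examples):
 *
 *	struct sioc_vif_req vr = { .vifi = 0 };
 *	struct sioc_sg_req sr;
 *
 *	ioctl(s, SIOCGETVIFCNT, &vr);	// vr.icount/.ocount/.ibytes/.obytes
 *
 *	sr.src.s_addr = inet_addr("192.0.2.10");
 *	sr.grp.s_addr = inet_addr("233.252.0.1");
 *	ioctl(s, SIOCGETSGCNT, &sr);	// sr.pktcnt/.bytecnt/.wrong_if
 */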
1139
1140
1141 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1142 {
1143         struct net_device *dev = ptr;
1144         struct vif_device *v;
1145         int ct;
1146
1147         if (!net_eq(dev_net(dev), &init_net))
1148                 return NOTIFY_DONE;
1149
1150         if (event != NETDEV_UNREGISTER)
1151                 return NOTIFY_DONE;
1152         v = &init_net.ipv4.vif_table[0];
1153         for (ct = 0; ct < init_net.ipv4.maxvif; ct++, v++) {
1154                 if (v->dev == dev)
1155                         vif_delete(ct, 1);
1156         }
1157         return NOTIFY_DONE;
1158 }
1159
1160
1161 static struct notifier_block ip_mr_notifier = {
1162         .notifier_call = ipmr_device_event,
1163 };
1164
1165 /*
1166  *      Encapsulate a packet by attaching a valid IPIP header to it.
1167  *      This avoids tunnel drivers and other mess and gives us the speed so
1168  *      important for multicast video.
1169  */
1170
1171 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1172 {
1173         struct iphdr *iph;
1174         struct iphdr *old_iph = ip_hdr(skb);
1175
1176         skb_push(skb, sizeof(struct iphdr));
1177         skb->transport_header = skb->network_header;
1178         skb_reset_network_header(skb);
1179         iph = ip_hdr(skb);
1180
1181         iph->version    =       4;
1182         iph->tos        =       old_iph->tos;
1183         iph->ttl        =       old_iph->ttl;
1184         iph->frag_off   =       0;
1185         iph->daddr      =       daddr;
1186         iph->saddr      =       saddr;
1187         iph->protocol   =       IPPROTO_IPIP;
1188         iph->ihl        =       5;
1189         iph->tot_len    =       htons(skb->len);
1190         ip_select_ident(iph, skb->dst, NULL);
1191         ip_send_check(iph);
1192
1193         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1194         nf_reset(skb);
1195 }
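/*
 * The on-the-wire result of ip_encap() (illustrative layout):
 *
 *	+----------------------------+--------------------------+---------+
 *	| new outer IPv4 header      | original IPv4 header     | payload |
 *	| proto = IPPROTO_IPIP,      | (untouched; now sits at  |         |
 *	| saddr/daddr = tunnel ends, | the transport header)    |         |
 *	| tos/ttl copied from inner  |                          |         |
 *	+----------------------------+--------------------------+---------+
 */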
1196
1197 static inline int ipmr_forward_finish(struct sk_buff *skb)
1198 {
1199         struct ip_options * opt = &(IPCB(skb)->opt);
1200
1201         IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1202
1203         if (unlikely(opt->optlen))
1204                 ip_forward_options(skb);
1205
1206         return dst_output(skb);
1207 }
1208
1209 /*
1210  *      Processing handlers for ipmr_forward
1211  */
1212
1213 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1214 {
1215         const struct iphdr *iph = ip_hdr(skb);
1216         struct vif_device *vif = &init_net.ipv4.vif_table[vifi];
1217         struct net_device *dev;
1218         struct rtable *rt;
1219         int    encap = 0;
1220
1221         if (vif->dev == NULL)
1222                 goto out_free;
1223
1224 #ifdef CONFIG_IP_PIMSM
1225         if (vif->flags & VIFF_REGISTER) {
1226                 vif->pkt_out++;
1227                 vif->bytes_out += skb->len;
1228                 vif->dev->stats.tx_bytes += skb->len;
1229                 vif->dev->stats.tx_packets++;
1230                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1231                 kfree_skb(skb);
1232                 return;
1233         }
1234 #endif
1235
1236         if (vif->flags&VIFF_TUNNEL) {
1237                 struct flowi fl = { .oif = vif->link,
1238                                     .nl_u = { .ip4_u =
1239                                               { .daddr = vif->remote,
1240                                                 .saddr = vif->local,
1241                                                 .tos = RT_TOS(iph->tos) } },
1242                                     .proto = IPPROTO_IPIP };
1243                 if (ip_route_output_key(&init_net, &rt, &fl))
1244                         goto out_free;
1245                 encap = sizeof(struct iphdr);
1246         } else {
1247                 struct flowi fl = { .oif = vif->link,
1248                                     .nl_u = { .ip4_u =
1249                                               { .daddr = iph->daddr,
1250                                                 .tos = RT_TOS(iph->tos) } },
1251                                     .proto = IPPROTO_IPIP };
1252                 if (ip_route_output_key(&init_net, &rt, &fl))
1253                         goto out_free;
1254         }
1255
1256         dev = rt->u.dst.dev;
1257
1258         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1259                 /* Do not fragment multicasts. Alas, IPv4 does not
1260                    allow us to send ICMP here, so such packets will
1261                    simply disappear into a blackhole.
1262                  */
1263
1264                 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1265                 ip_rt_put(rt);
1266                 goto out_free;
1267         }
1268
1269         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1270
1271         if (skb_cow(skb, encap)) {
1272                 ip_rt_put(rt);
1273                 goto out_free;
1274         }
1275
1276         vif->pkt_out++;
1277         vif->bytes_out += skb->len;
1278
1279         dst_release(skb->dst);
1280         skb->dst = &rt->u.dst;
1281         ip_decrease_ttl(ip_hdr(skb));
1282
1283         /* FIXME: forward and output firewalls used to be called here.
1284          * What do we do with netfilter? -- RR */
1285         if (vif->flags & VIFF_TUNNEL) {
1286                 ip_encap(skb, vif->local, vif->remote);
1287                 /* FIXME: extra output firewall step used to be here. --RR */
1288                 vif->dev->stats.tx_packets++;
1289                 vif->dev->stats.tx_bytes += skb->len;
1290         }
1291
1292         IPCB(skb)->flags |= IPSKB_FORWARDED;
1293
1294         /*
1295          * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1296          * not only before forwarding, but also after forwarding on all output
1297          * interfaces. Clearly, if the mrouter runs a multicasting
1298          * program, that program should receive packets regardless of which
1299          * interface it is joined on.
1300          * If we did not do this, the program would have to join on all
1301          * interfaces. On the other hand, a multihomed host (or a router, but
1302          * not an mrouter) cannot join on more than one interface - it would
1303          * result in receiving multiple copies of each packet.
1304          */
1305         NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
1306                 ipmr_forward_finish);
1307         return;
1308
1309 out_free:
1310         kfree_skb(skb);
1311         return;
1312 }
1313
1314 static int ipmr_find_vif(struct net_device *dev)
1315 {
1316         int ct;
1317         for (ct = init_net.ipv4.maxvif-1; ct >= 0; ct--) {
1318                 if (init_net.ipv4.vif_table[ct].dev == dev)
1319                         break;
1320         }
1321         return ct;
1322 }
1323
1324 /* "local" means that we should preserve one skb (for local delivery) */
1325
1326 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1327 {
1328         int psend = -1;
1329         int vif, ct;
1330
1331         vif = cache->mfc_parent;
1332         cache->mfc_un.res.pkt++;
1333         cache->mfc_un.res.bytes += skb->len;
1334
1335         /*
1336          * Wrong interface: drop packet and (maybe) send PIM assert.
1337          */
1338         if (init_net.ipv4.vif_table[vif].dev != skb->dev) {
1339                 int true_vifi;
1340
1341                 if (skb->rtable->fl.iif == 0) {
1342                         /* It is our own packet, looped back.
1343                            Very complicated situation...
1344
1345                            The best workaround until routing daemons are
1346                            fixed is not to redistribute a packet if it was
1347                            sent through the wrong interface. It means that
1348                            multicast applications WILL NOT work for
1349                            (S,G) pairs whose default multicast route points
1350                            to the wrong oif. In any case, it is not a good
1351                            idea to run multicasting applications on a router.
1352                          */
1353                         goto dont_forward;
1354                 }
1355
1356                 cache->mfc_un.res.wrong_if++;
1357                 true_vifi = ipmr_find_vif(skb->dev);
1358
1359                 if (true_vifi >= 0 && mroute_do_assert &&
1360                     /* PIM-SM uses asserts when switching from RPT to SPT,
1361                        so we cannot check that the packet arrived on an oif.
1362                        It is bad, but otherwise we would need to move a pretty
1363                        large chunk of pimd into the kernel. Ough... --ANK
1364                      */
1365                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1366                     time_after(jiffies,
1367                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1368                         cache->mfc_un.res.last_assert = jiffies;
1369                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1370                 }
1371                 goto dont_forward;
1372         }
1373
1374         init_net.ipv4.vif_table[vif].pkt_in++;
1375         init_net.ipv4.vif_table[vif].bytes_in += skb->len;
1376
1377         /*
1378          *      Forward the frame
1379          */
1380         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1381                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1382                         if (psend != -1) {
1383                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1384                                 if (skb2)
1385                                         ipmr_queue_xmit(skb2, cache, psend);
1386                         }
1387                         psend = ct;
1388                 }
1389         }
1390         if (psend != -1) {
1391                 if (local) {
1392                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1393                         if (skb2)
1394                                 ipmr_queue_xmit(skb2, cache, psend);
1395                 } else {
1396                         ipmr_queue_xmit(skb, cache, psend);
1397                         return 0;
1398                 }
1399         }
1400
1401 dont_forward:
1402         if (!local)
1403                 kfree_skb(skb);
1404         return 0;
1405 }
1406
1407
1408 /*
1409  *      Multicast packets for forwarding arrive here
1410  */
1411
1412 int ip_mr_input(struct sk_buff *skb)
1413 {
1414         struct mfc_cache *cache;
1415         int local = skb->rtable->rt_flags&RTCF_LOCAL;
1416
1417         /* Packet is looped back after forwarding; it should not be
1418            forwarded a second time, but can still be delivered locally.
1419          */
1420         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1421                 goto dont_forward;
1422
1423         if (!local) {
1424                     if (IPCB(skb)->opt.router_alert) {
1425                             if (ip_call_ra_chain(skb))
1426                                     return 0;
1427                     } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1428                             /* IGMPv1 (and broken IGMPv2 implementations such as
1429                                Cisco IOS <= 11.2(8)) do not put the router alert
1430                                option in IGMP packets destined to routable
1431                                groups. This is very bad, because it means
1432                                that we can forward NO IGMP messages.
1433                              */
1434                             read_lock(&mrt_lock);
1435                             if (init_net.ipv4.mroute_sk) {
1436                                     nf_reset(skb);
1437                                     raw_rcv(init_net.ipv4.mroute_sk, skb);
1438                                     read_unlock(&mrt_lock);
1439                                     return 0;
1440                             }
1441                             read_unlock(&mrt_lock);
1442                     }
1443         }
1444
1445         read_lock(&mrt_lock);
1446         cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1447
1448         /*
1449          *      No usable cache entry
1450          */
1451         if (cache == NULL) {
1452                 int vif;
1453
1454                 if (local) {
1455                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1456                         ip_local_deliver(skb);
1457                         if (skb2 == NULL) {
1458                                 read_unlock(&mrt_lock);
1459                                 return -ENOBUFS;
1460                         }
1461                         skb = skb2;
1462                 }
1463
1464                 vif = ipmr_find_vif(skb->dev);
1465                 if (vif >= 0) {
1466                         int err = ipmr_cache_unresolved(vif, skb);
1467                         read_unlock(&mrt_lock);
1468
1469                         return err;
1470                 }
1471                 read_unlock(&mrt_lock);
1472                 kfree_skb(skb);
1473                 return -ENODEV;
1474         }
1475
1476         ip_mr_forward(skb, cache, local);
1477
1478         read_unlock(&mrt_lock);
1479
1480         if (local)
1481                 return ip_local_deliver(skb);
1482
1483         return 0;
1484
1485 dont_forward:
1486         if (local)
1487                 return ip_local_deliver(skb);
1488         kfree_skb(skb);
1489         return 0;
1490 }
1491
1492 #ifdef CONFIG_IP_PIMSM
1493 static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1494 {
1495         struct net_device *reg_dev = NULL;
1496         struct iphdr *encap;
1497
1498         encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1499         /*
1500            Check that:
1501            a. packet is really destined to a multicast group
1502            b. packet is not a NULL-REGISTER
1503            c. packet is not truncated
1504          */
1505         if (!ipv4_is_multicast(encap->daddr) ||
1506             encap->tot_len == 0 ||
1507             ntohs(encap->tot_len) + pimlen > skb->len)
1508                 return 1;
1509
1510         read_lock(&mrt_lock);
1511         if (reg_vif_num >= 0)
1512                 reg_dev = init_net.ipv4.vif_table[reg_vif_num].dev;
1513         if (reg_dev)
1514                 dev_hold(reg_dev);
1515         read_unlock(&mrt_lock);
1516
1517         if (reg_dev == NULL)
1518                 return 1;
1519
1520         skb->mac_header = skb->network_header;
1521         skb_pull(skb, (u8*)encap - skb->data);
1522         skb_reset_network_header(skb);
1523         skb->dev = reg_dev;
1524         skb->protocol = htons(ETH_P_IP);
1525         skb->ip_summed = CHECKSUM_NONE;
1526         skb->pkt_type = PACKET_HOST;
1527         dst_release(skb->dst);
1528         skb->dst = NULL;
1529         reg_dev->stats.rx_bytes += skb->len;
1530         reg_dev->stats.rx_packets++;
1531         nf_reset(skb);
1532         netif_rx(skb);
1533         dev_put(reg_dev);
1534
1535         return 0;
1536 }
1537 #endif
1538
1539 #ifdef CONFIG_IP_PIMSM_V1
1540 /*
1541  * Handle IGMP messages of PIMv1
1542  */
1543
1544 int pim_rcv_v1(struct sk_buff * skb)
1545 {
1546         struct igmphdr *pim;
1547
1548         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1549                 goto drop;
1550
1551         pim = igmp_hdr(skb);
1552
1553         if (!mroute_do_pim ||
1554             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1555                 goto drop;
1556
1557         if (__pim_rcv(skb, sizeof(*pim))) {
1558 drop:
1559                 kfree_skb(skb);
1560         }
1561         return 0;
1562 }
1563 #endif
1564
1565 #ifdef CONFIG_IP_PIMSM_V2
1566 static int pim_rcv(struct sk_buff * skb)
1567 {
1568         struct pimreghdr *pim;
1569
1570         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1571                 goto drop;
1572
1573         pim = (struct pimreghdr *)skb_transport_header(skb);
1574         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1575             (pim->flags&PIM_NULL_REGISTER) ||
1576             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1577              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1578                 goto drop;
1579
1580         if (__pim_rcv(skb, sizeof(*pim))) {
1581 drop:
1582                 kfree_skb(skb);
1583         }
1584         return 0;
1585 }
1586 #endif
1587
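/*
 * Fill an rtnetlink reply for a resolved cache entry: the parent vif
 * goes out as RTA_IIF and every output vif with a TTL threshold below
 * 255 becomes one nexthop of an RTA_MULTIPATH attribute.
 */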
1588 static int
1589 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1590 {
1591         int ct;
1592         struct rtnexthop *nhp;
1593         struct net_device *dev = init_net.ipv4.vif_table[c->mfc_parent].dev;
1594         u8 *b = skb_tail_pointer(skb);
1595         struct rtattr *mp_head;
1596
1597         if (dev)
1598                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1599
1600         mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1601
1602         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1603                 if (c->mfc_un.res.ttls[ct] < 255) {
1604                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1605                                 goto rtattr_failure;
1606                         nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1607                         nhp->rtnh_flags = 0;
1608                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1609                         nhp->rtnh_ifindex = init_net.ipv4.vif_table[ct].dev->ifindex;
1610                         nhp->rtnh_len = sizeof(*nhp);
1611                 }
1612         }
1613         mp_head->rta_type = RTA_MULTIPATH;
1614         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1615         rtm->rtm_type = RTN_MULTICAST;
1616         return 1;
1617
1618 rtattr_failure:
1619         nlmsg_trim(skb, b);
1620         return -EMSGSIZE;
1621 }
1622
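/*
 * Answer an RTM_GETROUTE request for a multicast route.  Without a
 * matching cache entry we either return -EAGAIN (nowait) or clone the
 * skb and queue it as an unresolved entry for mrouted to resolve.
 */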
1623 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1624 {
1625         int err;
1626         struct mfc_cache *cache;
1627         struct rtable *rt = skb->rtable;
1628
1629         read_lock(&mrt_lock);
1630         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1631
1632         if (cache == NULL) {
1633                 struct sk_buff *skb2;
1634                 struct iphdr *iph;
1635                 struct net_device *dev;
1636                 int vif;
1637
1638                 if (nowait) {
1639                         read_unlock(&mrt_lock);
1640                         return -EAGAIN;
1641                 }
1642
1643                 dev = skb->dev;
1644                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1645                         read_unlock(&mrt_lock);
1646                         return -ENODEV;
1647                 }
1648                 skb2 = skb_clone(skb, GFP_ATOMIC);
1649                 if (!skb2) {
1650                         read_unlock(&mrt_lock);
1651                         return -ENOMEM;
1652                 }
1653
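                /*
                 * Build a skeleton IP header on the clone: only the fields
                 * the cache code needs are filled in, and version 0 marks
                 * this as a netlink-generated request rather than a real
                 * packet.
                 */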
1654                 skb_push(skb2, sizeof(struct iphdr));
1655                 skb_reset_network_header(skb2);
1656                 iph = ip_hdr(skb2);
1657                 iph->ihl = sizeof(struct iphdr) >> 2;
1658                 iph->saddr = rt->rt_src;
1659                 iph->daddr = rt->rt_dst;
1660                 iph->version = 0;
1661                 err = ipmr_cache_unresolved(vif, skb2);
1662                 read_unlock(&mrt_lock);
1663                 return err;
1664         }
1665
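        /*
         * Resolved entry: flag it for notification if the caller asked
         * for RTM_F_NOTIFY and fill in the reply from the cache.
         */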
1666         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1667                 cache->mfc_flags |= MFC_NOTIFY;
1668         err = ipmr_fill_mroute(skb, cache, rtm);
1669         read_unlock(&mrt_lock);
1670         return err;
1671 }
1672
1673 #ifdef CONFIG_PROC_FS
1674 /*
1675  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1676  */
1677 struct ipmr_vif_iter {
1678         int ct;
1679 };
1680
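/*
 * Advance to the pos'th live entry in the vif table, skipping the holes
 * left by deleted interfaces.
 */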
1681 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1682                                            loff_t pos)
1683 {
1684         for (iter->ct = 0; iter->ct < init_net.ipv4.maxvif; ++iter->ct) {
1685                 if (!VIF_EXISTS(&init_net, iter->ct))
1686                         continue;
1687                 if (pos-- == 0)
1688                         return &init_net.ipv4.vif_table[iter->ct];
1689         }
1690         return NULL;
1691 }
1692
1693 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1694         __acquires(mrt_lock)
1695 {
1696         read_lock(&mrt_lock);
1697         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1698                 : SEQ_START_TOKEN;
1699 }
1700
1701 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1702 {
1703         struct ipmr_vif_iter *iter = seq->private;
1704
1705         ++*pos;
1706         if (v == SEQ_START_TOKEN)
1707                 return ipmr_vif_seq_idx(iter, 0);
1708
1709         while (++iter->ct < init_net.ipv4.maxvif) {
1710                 if (!VIF_EXISTS(&init_net, iter->ct))
1711                         continue;
1712                 return &init_net.ipv4.vif_table[iter->ct];
1713         }
1714         return NULL;
1715 }
1716
1717 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1718         __releases(mrt_lock)
1719 {
1720         read_unlock(&mrt_lock);
1721 }
1722
1723 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1724 {
1725         if (v == SEQ_START_TOKEN) {
1726                 seq_puts(seq,
1727                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1728         } else {
1729                 const struct vif_device *vif = v;
1730                 const char *name = vif->dev ? vif->dev->name : "none";
1731
1732                 seq_printf(seq,
1733                            "%2td %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1734                            vif - init_net.ipv4.vif_table,
1735                            name, vif->bytes_in, vif->pkt_in,
1736                            vif->bytes_out, vif->pkt_out,
1737                            vif->flags, vif->local, vif->remote);
1738         }
1739         return 0;
1740 }
1741
1742 static const struct seq_operations ipmr_vif_seq_ops = {
1743         .start = ipmr_vif_seq_start,
1744         .next  = ipmr_vif_seq_next,
1745         .stop  = ipmr_vif_seq_stop,
1746         .show  = ipmr_vif_seq_show,
1747 };
1748
1749 static int ipmr_vif_open(struct inode *inode, struct file *file)
1750 {
1751         return seq_open_private(file, &ipmr_vif_seq_ops,
1752                         sizeof(struct ipmr_vif_iter));
1753 }
1754
1755 static const struct file_operations ipmr_vif_fops = {
1756         .owner   = THIS_MODULE,
1757         .open    = ipmr_vif_open,
1758         .read    = seq_read,
1759         .llseek  = seq_lseek,
1760         .release = seq_release_private,
1761 };
1762
1763 struct ipmr_mfc_iter {
1764         struct mfc_cache **cache;
1765         int ct;
1766 };
1767
1768
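/*
 * Position the iterator over the two collections in display order:
 * resolved entries in mfc_cache_array (under mrt_lock) first, then the
 * unresolved queue (under mfc_unres_lock).  Whichever lock covers the
 * returned entry is left held; ipmr_mfc_seq_stop() releases it based on
 * it->cache.
 */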
1769 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1770 {
1771         struct mfc_cache *mfc;
1772
1773         it->cache = init_net.ipv4.mfc_cache_array;
1774         read_lock(&mrt_lock);
1775         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1776                 for (mfc = init_net.ipv4.mfc_cache_array[it->ct];
1777                      mfc; mfc = mfc->next)
1778                         if (pos-- == 0)
1779                                 return mfc;
1780         read_unlock(&mrt_lock);
1781
1782         it->cache = &mfc_unres_queue;
1783         spin_lock_bh(&mfc_unres_lock);
1784         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1785                 if (pos-- == 0)
1786                         return mfc;
1787         spin_unlock_bh(&mfc_unres_lock);
1788
1789         it->cache = NULL;
1790         return NULL;
1791 }
1792
1793
1794 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1795 {
1796         struct ipmr_mfc_iter *it = seq->private;
1797         it->cache = NULL;
1798         it->ct = 0;
1799         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1800                 : SEQ_START_TOKEN;
1801 }
1802
1803 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1804 {
1805         struct mfc_cache *mfc = v;
1806         struct ipmr_mfc_iter *it = seq->private;
1807
1808         ++*pos;
1809
1810         if (v == SEQ_START_TOKEN)
1811                 return ipmr_mfc_seq_idx(seq->private, 0);
1812
1813         if (mfc->next)
1814                 return mfc->next;
1815
1816         if (it->cache == &mfc_unres_queue)
1817                 goto end_of_list;
1818
1819         BUG_ON(it->cache != init_net.ipv4.mfc_cache_array);
1820
1821         while (++it->ct < MFC_LINES) {
1822                 mfc = init_net.ipv4.mfc_cache_array[it->ct];
1823                 if (mfc)
1824                         return mfc;
1825         }
1826
1827         /* exhausted cache_array, show unresolved */
1828         read_unlock(&mrt_lock);
1829         it->cache = &mfc_unres_queue;
1830         it->ct = 0;
1831
1832         spin_lock_bh(&mfc_unres_lock);
1833         mfc = mfc_unres_queue;
1834         if (mfc)
1835                 return mfc;
1836
1837  end_of_list:
1838         spin_unlock_bh(&mfc_unres_lock);
1839         it->cache = NULL;
1840
1841         return NULL;
1842 }
1843
1844 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1845 {
1846         struct ipmr_mfc_iter *it = seq->private;
1847
1848         if (it->cache == &mfc_unres_queue)
1849                 spin_unlock_bh(&mfc_unres_lock);
1850         else if (it->cache == init_net.ipv4.mfc_cache_array)
1851                 read_unlock(&mrt_lock);
1852 }
1853
1854 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1855 {
1856         int n;
1857
1858         if (v == SEQ_START_TOKEN) {
1859                 seq_puts(seq,
1860                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1861         } else {
1862                 const struct mfc_cache *mfc = v;
1863                 const struct ipmr_mfc_iter *it = seq->private;
1864
1865                 seq_printf(seq, "%08lX %08lX %-3hd",
1866                            (unsigned long) mfc->mfc_mcastgrp,
1867                            (unsigned long) mfc->mfc_origin,
1868                            mfc->mfc_parent);
1869
1870                 if (it->cache != &mfc_unres_queue) {
1871                         seq_printf(seq, " %8lu %8lu %8lu",
1872                                    mfc->mfc_un.res.pkt,
1873                                    mfc->mfc_un.res.bytes,
1874                                    mfc->mfc_un.res.wrong_if);
1875                         for (n = mfc->mfc_un.res.minvif;
1876                              n < mfc->mfc_un.res.maxvif; n++) {
1877                                 if (VIF_EXISTS(&init_net, n) &&
1878                                     mfc->mfc_un.res.ttls[n] < 255)
1879                                         seq_printf(seq,
1880                                            " %2d:%-3d",
1881                                            n, mfc->mfc_un.res.ttls[n]);
1882                         }
1883                 } else {
1884                         /* unresolved mfc_caches don't contain
1885                          * pkt, bytes and wrong_if values
1886                          */
1887                         seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
1888                 }
1889                 seq_putc(seq, '\n');
1890         }
1891         return 0;
1892 }
1893
1894 static const struct seq_operations ipmr_mfc_seq_ops = {
1895         .start = ipmr_mfc_seq_start,
1896         .next  = ipmr_mfc_seq_next,
1897         .stop  = ipmr_mfc_seq_stop,
1898         .show  = ipmr_mfc_seq_show,
1899 };
1900
1901 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1902 {
1903         return seq_open_private(file, &ipmr_mfc_seq_ops,
1904                         sizeof(struct ipmr_mfc_iter));
1905 }
1906
1907 static const struct file_operations ipmr_mfc_fops = {
1908         .owner   = THIS_MODULE,
1909         .open    = ipmr_mfc_open,
1910         .read    = seq_read,
1911         .llseek  = seq_lseek,
1912         .release = seq_release_private,
1913 };
1914 #endif
1915
1916 #ifdef CONFIG_IP_PIMSM_V2
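/*
 * Receive handler hooked into the IP stack for IPPROTO_PIM packets.
 */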
1917 static struct net_protocol pim_protocol = {
1918         .handler        =       pim_rcv,
1919 };
1920 #endif
1921
1922
1923 /*
1924  *      Setup for IP multicast routing
1925  */
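/*
 * Per-namespace setup: each struct net gets its own vif table and
 * multicast forwarding cache, allocated when the namespace is created
 * and released when it goes away.
 */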
1926 static int __net_init ipmr_net_init(struct net *net)
1927 {
1928         int err = 0;
1929
1930         net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device),
1931                                       GFP_KERNEL);
1932         if (!net->ipv4.vif_table) {
1933                 err = -ENOMEM;
1934                 goto fail;
1935         }
1936
1937         /* Forwarding cache */
1938         net->ipv4.mfc_cache_array = kcalloc(MFC_LINES,
1939                                             sizeof(struct mfc_cache *),
1940                                             GFP_KERNEL);
1941         if (!net->ipv4.mfc_cache_array) {
1942                 err = -ENOMEM;
1943                 goto fail_mfc_cache;
1944         }
1945         return 0;
1946
1947 fail_mfc_cache:
1948         kfree(net->ipv4.vif_table);
1949 fail:
1950         return err;
1951 }
1952
1953 static void __net_exit ipmr_net_exit(struct net *net)
1954 {
1955         kfree(net->ipv4.mfc_cache_array);
1956         kfree(net->ipv4.vif_table);
1957 }
1958
1959 static struct pernet_operations ipmr_net_ops = {
1960         .init = ipmr_net_init,
1961         .exit = ipmr_net_exit,
1962 };
1963
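/*
 * Module initialisation: the mfc_cache slab is created first, then the
 * pernet ops, expire timer, netdevice notifier and /proc entries; each
 * failure path unwinds the steps already done, in reverse order.
 */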
1964 int __init ip_mr_init(void)
1965 {
1966         int err;
1967
1968         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1969                                        sizeof(struct mfc_cache),
1970                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1971                                        NULL);
1972         if (!mrt_cachep)
1973                 return -ENOMEM;
1974
1975         err = register_pernet_subsys(&ipmr_net_ops);
1976         if (err)
1977                 goto reg_pernet_fail;
1978
1979         setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1980         err = register_netdevice_notifier(&ip_mr_notifier);
1981         if (err)
1982                 goto reg_notif_fail;
1983 #ifdef CONFIG_PROC_FS
1984         err = -ENOMEM;
1985         if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops))
1986                 goto proc_vif_fail;
1987         if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops))
1988                 goto proc_cache_fail;
1989 #endif
1990         return 0;
1991 #ifdef CONFIG_PROC_FS
1992 proc_cache_fail:
1993         proc_net_remove(&init_net, "ip_mr_vif");
1994 proc_vif_fail:
1995         unregister_netdevice_notifier(&ip_mr_notifier);
1996 #endif
1997 reg_notif_fail:
1998         del_timer(&ipmr_expire_timer);
1999         unregister_pernet_subsys(&ipmr_net_ops);
2000 reg_pernet_fail:
2001         kmem_cache_destroy(mrt_cachep);
2002         return err;
2003 }