netns: ipmr: allocate mroute_socket per-namespace.
net/ipv4/ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Fixes:
13  *      Michael Chastain        :       Incorrect size of copying.
14  *      Alan Cox                :       Added the cache manager code
15  *      Alan Cox                :       Fixed the clone/copy bug and device race.
16  *      Mike McLagan            :       Routing by source
17  *      Malcolm Beattie         :       Buffer handling fixes.
18  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
19  *      SVR Anand               :       Fixed several multicast bugs and problems.
20  *      Alexey Kuznetsov        :       Status, optimisations and more.
21  *      Brad Parker             :       Better behaviour on mrouted upcall
22  *                                      overflow.
23  *      Carlos Picoto           :       PIMv1 Support
24  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
25  *                                      Relax this requirement to work with older peers.
26  *
27  */
28
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <linux/types.h>
32 #include <linux/capability.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <linux/mm.h>
36 #include <linux/kernel.h>
37 #include <linux/fcntl.h>
38 #include <linux/stat.h>
39 #include <linux/socket.h>
40 #include <linux/in.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/inetdevice.h>
44 #include <linux/igmp.h>
45 #include <linux/proc_fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/mroute.h>
48 #include <linux/init.h>
49 #include <linux/if_ether.h>
50 #include <net/net_namespace.h>
51 #include <net/ip.h>
52 #include <net/protocol.h>
53 #include <linux/skbuff.h>
54 #include <net/route.h>
55 #include <net/sock.h>
56 #include <net/icmp.h>
57 #include <net/udp.h>
58 #include <net/raw.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
62 #include <net/ipip.h>
63 #include <net/checksum.h>
64 #include <net/netlink.h>
65
66 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67 #define CONFIG_IP_PIMSM 1
68 #endif
69
70 /* Big lock, protecting vif table, mrt cache and mroute socket state.
71    Note that the changes are semaphored via rtnl_lock.
72  */
73
74 static DEFINE_RWLOCK(mrt_lock);
75
76 /*
77  *      Multicast router control variables
78  */
79
80 static struct vif_device vif_table[MAXVIFS];            /* Devices              */
81 static int maxvif;
82
83 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
84
85 static int mroute_do_assert;                            /* Set in PIM assert    */
86 static int mroute_do_pim;
87
88 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
89
90 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
91 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
92
93 /* Special spinlock for queue of unresolved entries */
94 static DEFINE_SPINLOCK(mfc_unres_lock);
95
96 /* We return to Alan's original scheme. The hash table of resolved
97    entries is changed only in process context and protected
98    with the weak lock mrt_lock. The queue of unresolved entries is
99    protected with the strong spinlock mfc_unres_lock.
100
101    In this case the data path is entirely free of exclusive locks.
102  */
103
104 static struct kmem_cache *mrt_cachep __read_mostly;
105
106 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
107 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
108 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
109
110 #ifdef CONFIG_IP_PIMSM_V2
111 static struct net_protocol pim_protocol;
112 #endif
113
114 static struct timer_list ipmr_expire_timer;
115
116 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
117
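/* Delete the "dvmrp%d" IPIP tunnel backing a DVMRP VIF: close the VIF's
 * device, then hand tunl0 a SIOCDELTUNNEL ioctl (under KERNEL_DS, since
 * the parameter block lives in kernel space).
 */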
118 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
119 {
120         dev_close(dev);
121
122         dev = __dev_get_by_name(&init_net, "tunl0");
123         if (dev) {
124                 const struct net_device_ops *ops = dev->netdev_ops;
125                 struct ifreq ifr;
126                 struct ip_tunnel_parm p;
127
128                 memset(&p, 0, sizeof(p));
129                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
130                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
131                 p.iph.version = 4;
132                 p.iph.ihl = 5;
133                 p.iph.protocol = IPPROTO_IPIP;
134                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
135                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
136
137                 if (ops->ndo_do_ioctl) {
138                         mm_segment_t oldfs = get_fs();
139
140                         set_fs(KERNEL_DS);
141                         ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
142                         set_fs(oldfs);
143                 }
144         }
145 }
146
147 static
148 struct net_device *ipmr_new_tunnel(struct vifctl *v)
149 {
150         struct net_device  *dev;
151
152         dev = __dev_get_by_name(&init_net, "tunl0");
153
154         if (dev) {
155                 const struct net_device_ops *ops = dev->netdev_ops;
156                 int err;
157                 struct ifreq ifr;
158                 struct ip_tunnel_parm p;
159                 struct in_device  *in_dev;
160
161                 memset(&p, 0, sizeof(p));
162                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
163                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
164                 p.iph.version = 4;
165                 p.iph.ihl = 5;
166                 p.iph.protocol = IPPROTO_IPIP;
167                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
168                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
169
170                 if (ops->ndo_do_ioctl) {
171                         mm_segment_t oldfs = get_fs();
172
173                         set_fs(KERNEL_DS);
174                         err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
175                         set_fs(oldfs);
176                 } else
177                         err = -EOPNOTSUPP;
178
179                 dev = NULL;
180
181                 if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
182                         dev->flags |= IFF_MULTICAST;
183
184                         in_dev = __in_dev_get_rtnl(dev);
185                         if (in_dev == NULL)
186                                 goto failure;
187
188                         ipv4_devconf_setall(in_dev);
189                         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
190
191                         if (dev_open(dev))
192                                 goto failure;
193                         dev_hold(dev);
194                 }
195         }
196         return dev;
197
198 failure:
199         /* allow the register to be completed before unregistering. */
200         rtnl_unlock();
201         rtnl_lock();
202
203         unregister_netdevice(dev);
204         return NULL;
205 }
206
207 #ifdef CONFIG_IP_PIMSM
208
209 static int reg_vif_num = -1;
210
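/* Transmit hook of the pimreg pseudo-device. Packets routed here are
 * never sent on a wire; they are accounted, bounced up to the user space
 * PIM daemon as an IGMPMSG_WHOLEPKT report, and then freed.
 */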
211 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
212 {
213         read_lock(&mrt_lock);
214         dev->stats.tx_bytes += skb->len;
215         dev->stats.tx_packets++;
216         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
217         read_unlock(&mrt_lock);
218         kfree_skb(skb);
219         return 0;
220 }
221
222 static const struct net_device_ops reg_vif_netdev_ops = {
223         .ndo_start_xmit = reg_vif_xmit,
224 };
225
226 static void reg_vif_setup(struct net_device *dev)
227 {
228         dev->type               = ARPHRD_PIMREG;
229         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
230         dev->flags              = IFF_NOARP;
231         dev->netdev_ops         = &reg_vif_netdev_ops;
232         dev->destructor         = free_netdev;
233 }
234
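/* Create and register the single "pimreg" device used for PIM REGISTER
 * handling. On any failure after registration we drop and retake the
 * RTNL lock so that the pending registration completes before the
 * device is unregistered again.
 */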
235 static struct net_device *ipmr_reg_vif(void)
236 {
237         struct net_device *dev;
238         struct in_device *in_dev;
239
240         dev = alloc_netdev(0, "pimreg", reg_vif_setup);
241
242         if (dev == NULL)
243                 return NULL;
244
245         if (register_netdevice(dev)) {
246                 free_netdev(dev);
247                 return NULL;
248         }
249         dev->iflink = 0;
250
251         rcu_read_lock();
252         if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
253                 rcu_read_unlock();
254                 goto failure;
255         }
256
257         ipv4_devconf_setall(in_dev);
258         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
259         rcu_read_unlock();
260
261         if (dev_open(dev))
262                 goto failure;
263
264         dev_hold(dev);
265
266         return dev;
267
268 failure:
269         /* allow the register to be completed before unregistering. */
270         rtnl_unlock();
271         rtnl_lock();
272
273         unregister_netdevice(dev);
274         return NULL;
275 }
276 #endif
277
278 /*
279  *      Delete a VIF entry
280  *      @notify: Set to 1 if the caller is a notifier_call
281  */
282
283 static int vif_delete(int vifi, int notify)
284 {
285         struct vif_device *v;
286         struct net_device *dev;
287         struct in_device *in_dev;
288
289         if (vifi < 0 || vifi >= maxvif)
290                 return -EADDRNOTAVAIL;
291
292         v = &vif_table[vifi];
293
294         write_lock_bh(&mrt_lock);
295         dev = v->dev;
296         v->dev = NULL;
297
298         if (!dev) {
299                 write_unlock_bh(&mrt_lock);
300                 return -EADDRNOTAVAIL;
301         }
302
303 #ifdef CONFIG_IP_PIMSM
304         if (vifi == reg_vif_num)
305                 reg_vif_num = -1;
306 #endif
307
308         if (vifi+1 == maxvif) {
309                 int tmp;
310                 for (tmp=vifi-1; tmp>=0; tmp--) {
311                         if (VIF_EXISTS(tmp))
312                                 break;
313                 }
314                 maxvif = tmp+1;
315         }
316
317         write_unlock_bh(&mrt_lock);
318
319         dev_set_allmulti(dev, -1);
320
321         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
322                 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
323                 ip_rt_multicast_event(in_dev);
324         }
325
326         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
327                 unregister_netdevice(dev);
328
329         dev_put(dev);
330         return 0;
331 }
332
333 /* Destroy an unresolved cache entry, killing queued skbs
334    and reporting error to netlink readers.
335  */
336
337 static void ipmr_destroy_unres(struct mfc_cache *c)
338 {
339         struct sk_buff *skb;
340         struct nlmsgerr *e;
341
342         atomic_dec(&cache_resolve_queue_len);
343
344         while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
345                 if (ip_hdr(skb)->version == 0) {
346                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
347                         nlh->nlmsg_type = NLMSG_ERROR;
348                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
349                         skb_trim(skb, nlh->nlmsg_len);
350                         e = NLMSG_DATA(nlh);
351                         e->error = -ETIMEDOUT;
352                         memset(&e->msg, 0, sizeof(e->msg));
353
354                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
355                 } else
356                         kfree_skb(skb);
357         }
358
359         kmem_cache_free(mrt_cachep, c);
360 }
361
362
363 /* Single timer process for all the unresolved queue. */
364
365 static void ipmr_expire_process(unsigned long dummy)
366 {
367         unsigned long now;
368         unsigned long expires;
369         struct mfc_cache *c, **cp;
370
371         if (!spin_trylock(&mfc_unres_lock)) {
372                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
373                 return;
374         }
375
376         if (atomic_read(&cache_resolve_queue_len) == 0)
377                 goto out;
378
379         now = jiffies;
380         expires = 10*HZ;
381         cp = &mfc_unres_queue;
382
383         while ((c=*cp) != NULL) {
384                 if (time_after(c->mfc_un.unres.expires, now)) {
385                         unsigned long interval = c->mfc_un.unres.expires - now;
386                         if (interval < expires)
387                                 expires = interval;
388                         cp = &c->next;
389                         continue;
390                 }
391
392                 *cp = c->next;
393
394                 ipmr_destroy_unres(c);
395         }
396
397         if (atomic_read(&cache_resolve_queue_len))
398                 mod_timer(&ipmr_expire_timer, jiffies + expires);
399
400 out:
401         spin_unlock(&mfc_unres_lock);
402 }
403
404 /* Fill oifs list. It is called under write locked mrt_lock. */
405
406 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
407 {
408         int vifi;
409
410         cache->mfc_un.res.minvif = MAXVIFS;
411         cache->mfc_un.res.maxvif = 0;
412         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
413
414         for (vifi=0; vifi<maxvif; vifi++) {
415                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
416                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
417                         if (cache->mfc_un.res.minvif > vifi)
418                                 cache->mfc_un.res.minvif = vifi;
419                         if (cache->mfc_un.res.maxvif <= vifi)
420                                 cache->mfc_un.res.maxvif = vifi + 1;
421                 }
422         }
423 }
424
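/* Add a virtual interface as described by @vifc. Depending on the flags
 * this is the special PIM register VIF, a DVMRP tunnel, or an ordinary
 * device found by its local address; in every case the device is put
 * into allmulti mode and multicast forwarding is enabled on it.
 */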
425 static int vif_add(struct vifctl *vifc, int mrtsock)
426 {
427         int vifi = vifc->vifc_vifi;
428         struct vif_device *v = &vif_table[vifi];
429         struct net_device *dev;
430         struct in_device *in_dev;
431         int err;
432
433         /* Is vif busy ? */
434         if (VIF_EXISTS(vifi))
435                 return -EADDRINUSE;
436
437         switch (vifc->vifc_flags) {
438 #ifdef CONFIG_IP_PIMSM
439         case VIFF_REGISTER:
440                 /*
441                  * Special Purpose VIF in PIM
442                  * All the packets will be sent to the daemon
443                  */
444                 if (reg_vif_num >= 0)
445                         return -EADDRINUSE;
446                 dev = ipmr_reg_vif();
447                 if (!dev)
448                         return -ENOBUFS;
449                 err = dev_set_allmulti(dev, 1);
450                 if (err) {
451                         unregister_netdevice(dev);
452                         dev_put(dev);
453                         return err;
454                 }
455                 break;
456 #endif
457         case VIFF_TUNNEL:
458                 dev = ipmr_new_tunnel(vifc);
459                 if (!dev)
460                         return -ENOBUFS;
461                 err = dev_set_allmulti(dev, 1);
462                 if (err) {
463                         ipmr_del_tunnel(dev, vifc);
464                         dev_put(dev);
465                         return err;
466                 }
467                 break;
468         case 0:
469                 dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
470                 if (!dev)
471                         return -EADDRNOTAVAIL;
472                 err = dev_set_allmulti(dev, 1);
473                 if (err) {
474                         dev_put(dev);
475                         return err;
476                 }
477                 break;
478         default:
479                 return -EINVAL;
480         }
481
482         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
483                 return -EADDRNOTAVAIL;
484         IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
485         ip_rt_multicast_event(in_dev);
486
487         /*
488          *      Fill in the VIF structures
489          */
490         v->rate_limit = vifc->vifc_rate_limit;
491         v->local = vifc->vifc_lcl_addr.s_addr;
492         v->remote = vifc->vifc_rmt_addr.s_addr;
493         v->flags = vifc->vifc_flags;
494         if (!mrtsock)
495                 v->flags |= VIFF_STATIC;
496         v->threshold = vifc->vifc_threshold;
497         v->bytes_in = 0;
498         v->bytes_out = 0;
499         v->pkt_in = 0;
500         v->pkt_out = 0;
501         v->link = dev->ifindex;
502         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
503                 v->link = dev->iflink;
504
505         /* And finish update writing critical data */
506         write_lock_bh(&mrt_lock);
507         v->dev = dev;
508 #ifdef CONFIG_IP_PIMSM
509         if (v->flags&VIFF_REGISTER)
510                 reg_vif_num = vifi;
511 #endif
512         if (vifi+1 > maxvif)
513                 maxvif = vifi+1;
514         write_unlock_bh(&mrt_lock);
515         return 0;
516 }
517
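/* Look up a resolved (origin, group) entry in the forwarding cache.
 * Called with mrt_lock held.
 */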
518 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
519 {
520         int line = MFC_HASH(mcastgrp, origin);
521         struct mfc_cache *c;
522
523         for (c=mfc_cache_array[line]; c; c = c->next) {
524                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
525                         break;
526         }
527         return c;
528 }
529
530 /*
531  *      Allocate a multicast cache entry
532  */
533 static struct mfc_cache *ipmr_cache_alloc(void)
534 {
535         struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
536         if (c == NULL)
537                 return NULL;
538         c->mfc_un.res.minvif = MAXVIFS;
539         return c;
540 }
541
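/* As above, but for an unresolved entry: atomic allocation (we may be
 * in softirq context) with an skb queue and a 10 second expiry.
 */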
542 static struct mfc_cache *ipmr_cache_alloc_unres(void)
543 {
544         struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
545         if (c == NULL)
546                 return NULL;
547         skb_queue_head_init(&c->mfc_un.unres.unresolved);
548         c->mfc_un.unres.expires = jiffies + 10*HZ;
549         return c;
550 }
551
552 /*
553  *      A cache entry has gone into a resolved state from queued
554  */
555
556 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
557 {
558         struct sk_buff *skb;
559         struct nlmsgerr *e;
560
561         /*
562          *      Play the pending entries through our router
563          */
564
565         while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
566                 if (ip_hdr(skb)->version == 0) {
567                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
568
569                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
570                                 nlh->nlmsg_len = (skb_tail_pointer(skb) -
571                                                   (u8 *)nlh);
572                         } else {
573                                 nlh->nlmsg_type = NLMSG_ERROR;
574                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
575                                 skb_trim(skb, nlh->nlmsg_len);
576                                 e = NLMSG_DATA(nlh);
577                                 e->error = -EMSGSIZE;
578                                 memset(&e->msg, 0, sizeof(e->msg));
579                         }
580
581                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
582                 } else
583                         ip_mr_forward(skb, c, 0);
584         }
585 }
586
587 /*
588  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
589  *      expects the following bizarre scheme.
590  *
591  *      Called under mrt_lock.
592  */
593
594 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
595 {
596         struct sk_buff *skb;
597         const int ihl = ip_hdrlen(pkt);
598         struct igmphdr *igmp;
599         struct igmpmsg *msg;
600         int ret;
601
602 #ifdef CONFIG_IP_PIMSM
603         if (assert == IGMPMSG_WHOLEPKT)
604                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
605         else
606 #endif
607                 skb = alloc_skb(128, GFP_ATOMIC);
608
609         if (!skb)
610                 return -ENOBUFS;
611
612 #ifdef CONFIG_IP_PIMSM
613         if (assert == IGMPMSG_WHOLEPKT) {
614                 /* Ugly, but we have no choice with this interface.
615                    Duplicate old header, fix ihl, length etc.
616                    And all this only to mangle msg->im_msgtype and
617                    to set msg->im_mbz to "mbz" :-)
618                  */
619                 skb_push(skb, sizeof(struct iphdr));
620                 skb_reset_network_header(skb);
621                 skb_reset_transport_header(skb);
622                 msg = (struct igmpmsg *)skb_network_header(skb);
623                 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
624                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
625                 msg->im_mbz = 0;
626                 msg->im_vif = reg_vif_num;
627                 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
628                 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
629                                              sizeof(struct iphdr));
630         } else
631 #endif
632         {
633
634         /*
635          *      Copy the IP header
636          */
637
638         skb->network_header = skb->tail;
639         skb_put(skb, ihl);
640         skb_copy_to_linear_data(skb, pkt->data, ihl);
641         ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
642         msg = (struct igmpmsg *)skb_network_header(skb);
643         msg->im_vif = vifi;
644         skb->dst = dst_clone(pkt->dst);
645
646         /*
647          *      Add our header
648          */
649
650         igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
651         igmp->type      =
652         msg->im_msgtype = assert;
653         igmp->code      =       0;
654         ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
655         skb->transport_header = skb->network_header;
656         }
657
658         if (init_net.ipv4.mroute_sk == NULL) {
659                 kfree_skb(skb);
660                 return -EINVAL;
661         }
662
663         /*
664          *      Deliver to mrouted
665          */
666         ret = sock_queue_rcv_skb(init_net.ipv4.mroute_sk, skb);
667         if (ret < 0) {
668                 if (net_ratelimit())
669                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
670                 kfree_skb(skb);
671         }
672
673         return ret;
674 }
675
676 /*
677  *      Queue a packet for resolution. It gets a locked cache entry!
678  */
679
680 static int
681 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
682 {
683         int err;
684         struct mfc_cache *c;
685         const struct iphdr *iph = ip_hdr(skb);
686
687         spin_lock_bh(&mfc_unres_lock);
688         for (c=mfc_unres_queue; c; c=c->next) {
689                 if (c->mfc_mcastgrp == iph->daddr &&
690                     c->mfc_origin == iph->saddr)
691                         break;
692         }
693
694         if (c == NULL) {
695                 /*
696                  *      Create a new entry if allowable
697                  */
698
699                 if (atomic_read(&cache_resolve_queue_len) >= 10 ||
700                     (c=ipmr_cache_alloc_unres())==NULL) {
701                         spin_unlock_bh(&mfc_unres_lock);
702
703                         kfree_skb(skb);
704                         return -ENOBUFS;
705                 }
706
707                 /*
708                  *      Fill in the new cache entry
709                  */
710                 c->mfc_parent   = -1;
711                 c->mfc_origin   = iph->saddr;
712                 c->mfc_mcastgrp = iph->daddr;
713
714                 /*
715                  *      Reflect first query at mrouted.
716                  */
717                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
718                         /* If the report failed throw the cache entry
719                            out - Brad Parker
720                          */
721                         spin_unlock_bh(&mfc_unres_lock);
722
723                         kmem_cache_free(mrt_cachep, c);
724                         kfree_skb(skb);
725                         return err;
726                 }
727
728                 atomic_inc(&cache_resolve_queue_len);
729                 c->next = mfc_unres_queue;
730                 mfc_unres_queue = c;
731
732                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
733         }
734
735         /*
736          *      See if we can append the packet
737          */
738         if (c->mfc_un.unres.unresolved.qlen>3) {
739                 kfree_skb(skb);
740                 err = -ENOBUFS;
741         } else {
742                 skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
743                 err = 0;
744         }
745
746         spin_unlock_bh(&mfc_unres_lock);
747         return err;
748 }
749
750 /*
751  *      MFC cache manipulation by user space mroute daemon
752  */
753
754 static int ipmr_mfc_delete(struct mfcctl *mfc)
755 {
756         int line;
757         struct mfc_cache *c, **cp;
758
759         line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
760
761         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
762                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
763                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
764                         write_lock_bh(&mrt_lock);
765                         *cp = c->next;
766                         write_unlock_bh(&mrt_lock);
767
768                         kmem_cache_free(mrt_cachep, c);
769                         return 0;
770                 }
771         }
772         return -ENOENT;
773 }
774
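/* Add or update an MFC entry. If packets for this (S,G) were already
 * queued as unresolved, the matching entry is spliced out of the
 * unresolved queue and its pending skbs are replayed through
 * ipmr_cache_resolve().
 */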
775 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
776 {
777         int line;
778         struct mfc_cache *uc, *c, **cp;
779
780         line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
781
782         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
783                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
784                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
785                         break;
786         }
787
788         if (c != NULL) {
789                 write_lock_bh(&mrt_lock);
790                 c->mfc_parent = mfc->mfcc_parent;
791                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
792                 if (!mrtsock)
793                         c->mfc_flags |= MFC_STATIC;
794                 write_unlock_bh(&mrt_lock);
795                 return 0;
796         }
797
798         if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
799                 return -EINVAL;
800
801         c = ipmr_cache_alloc();
802         if (c == NULL)
803                 return -ENOMEM;
804
805         c->mfc_origin = mfc->mfcc_origin.s_addr;
806         c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
807         c->mfc_parent = mfc->mfcc_parent;
808         ipmr_update_thresholds(c, mfc->mfcc_ttls);
809         if (!mrtsock)
810                 c->mfc_flags |= MFC_STATIC;
811
812         write_lock_bh(&mrt_lock);
813         c->next = mfc_cache_array[line];
814         mfc_cache_array[line] = c;
815         write_unlock_bh(&mrt_lock);
816
817         /*
818          *      Check to see if we resolved a queued list. If so we
819          *      need to send on the frames and tidy up.
820          */
821         spin_lock_bh(&mfc_unres_lock);
822         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
823              cp = &uc->next) {
824                 if (uc->mfc_origin == c->mfc_origin &&
825                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
826                         *cp = uc->next;
827                         if (atomic_dec_and_test(&cache_resolve_queue_len))
828                                 del_timer(&ipmr_expire_timer);
829                         break;
830                 }
831         }
832         spin_unlock_bh(&mfc_unres_lock);
833
834         if (uc) {
835                 ipmr_cache_resolve(uc, c);
836                 kmem_cache_free(mrt_cachep, uc);
837         }
838         return 0;
839 }
840
841 /*
842  *      Close the multicast socket, and clear the vif tables etc
843  */
844
845 static void mroute_clean_tables(struct sock *sk)
846 {
847         int i;
848
849         /*
850          *      Shut down all active vif entries
851          */
852         for (i=0; i<maxvif; i++) {
853                 if (!(vif_table[i].flags&VIFF_STATIC))
854                         vif_delete(i, 0);
855         }
856
857         /*
858          *      Wipe the cache
859          */
860         for (i=0; i<MFC_LINES; i++) {
861                 struct mfc_cache *c, **cp;
862
863                 cp = &mfc_cache_array[i];
864                 while ((c = *cp) != NULL) {
865                         if (c->mfc_flags&MFC_STATIC) {
866                                 cp = &c->next;
867                                 continue;
868                         }
869                         write_lock_bh(&mrt_lock);
870                         *cp = c->next;
871                         write_unlock_bh(&mrt_lock);
872
873                         kmem_cache_free(mrt_cachep, c);
874                 }
875         }
876
877         if (atomic_read(&cache_resolve_queue_len) != 0) {
878                 struct mfc_cache *c;
879
880                 spin_lock_bh(&mfc_unres_lock);
881                 while (mfc_unres_queue != NULL) {
882                         c = mfc_unres_queue;
883                         mfc_unres_queue = c->next;
884                         spin_unlock_bh(&mfc_unres_lock);
885
886                         ipmr_destroy_unres(c);
887
888                         spin_lock_bh(&mfc_unres_lock);
889                 }
890                 spin_unlock_bh(&mfc_unres_lock);
891         }
892 }
893
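/* Destructor for the mroute socket: runs when the IP_ROUTER_ALERT
 * registration is dropped, disables multicast forwarding, and cleans
 * out all non-static VIFs and cache entries.
 */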
894 static void mrtsock_destruct(struct sock *sk)
895 {
896         rtnl_lock();
897         if (sk == init_net.ipv4.mroute_sk) {
898                 IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;
899
900                 write_lock_bh(&mrt_lock);
901                 init_net.ipv4.mroute_sk = NULL;
902                 write_unlock_bh(&mrt_lock);
903
904                 mroute_clean_tables(sk);
905         }
906         rtnl_unlock();
907 }
908
909 /*
910  *      Socket options and virtual interface manipulation. The whole
911  *      virtual interface system is a complete heap, but unfortunately
912  *      that's how BSD mrouted happens to think. Maybe one day with a proper
913  *      MOSPF/PIM router set up we can clean this up.
914  */
915
916 int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
917 {
918         int ret;
919         struct vifctl vif;
920         struct mfcctl mfc;
921
922         if (optname != MRT_INIT) {
923                 if (sk != init_net.ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
924                         return -EACCES;
925         }
926
927         switch (optname) {
928         case MRT_INIT:
929                 if (sk->sk_type != SOCK_RAW ||
930                     inet_sk(sk)->num != IPPROTO_IGMP)
931                         return -EOPNOTSUPP;
932                 if (optlen != sizeof(int))
933                         return -ENOPROTOOPT;
934
935                 rtnl_lock();
936                 if (init_net.ipv4.mroute_sk) {
937                         rtnl_unlock();
938                         return -EADDRINUSE;
939                 }
940
941                 ret = ip_ra_control(sk, 1, mrtsock_destruct);
942                 if (ret == 0) {
943                         write_lock_bh(&mrt_lock);
944                         init_net.ipv4.mroute_sk = sk;
945                         write_unlock_bh(&mrt_lock);
946
947                         IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
948                 }
949                 rtnl_unlock();
950                 return ret;
951         case MRT_DONE:
952                 if (sk != init_net.ipv4.mroute_sk)
953                         return -EACCES;
954                 return ip_ra_control(sk, 0, NULL);
955         case MRT_ADD_VIF:
956         case MRT_DEL_VIF:
957                 if (optlen != sizeof(vif))
958                         return -EINVAL;
959                 if (copy_from_user(&vif, optval, sizeof(vif)))
960                         return -EFAULT;
961                 if (vif.vifc_vifi >= MAXVIFS)
962                         return -ENFILE;
963                 rtnl_lock();
964                 if (optname == MRT_ADD_VIF) {
965                         ret = vif_add(&vif, sk == init_net.ipv4.mroute_sk);
966                 } else {
967                         ret = vif_delete(vif.vifc_vifi, 0);
968                 }
969                 rtnl_unlock();
970                 return ret;
971
972                 /*
973                  *      Manipulate the forwarding caches. These live
974                  *      in a sort of kernel/user symbiosis.
975                  */
976         case MRT_ADD_MFC:
977         case MRT_DEL_MFC:
978                 if (optlen != sizeof(mfc))
979                         return -EINVAL;
980                 if (copy_from_user(&mfc, optval, sizeof(mfc)))
981                         return -EFAULT;
982                 rtnl_lock();
983                 if (optname == MRT_DEL_MFC)
984                         ret = ipmr_mfc_delete(&mfc);
985                 else
986                         ret = ipmr_mfc_add(&mfc, sk == init_net.ipv4.mroute_sk);
987                 rtnl_unlock();
988                 return ret;
989                 /*
990                  *      Control PIM assert.
991                  */
992         case MRT_ASSERT:
993         {
994                 int v;
995                 if (get_user(v, (int __user *)optval))
996                         return -EFAULT;
997                 mroute_do_assert = (v) ? 1 : 0;
998                 return 0;
999         }
1000 #ifdef CONFIG_IP_PIMSM
1001         case MRT_PIM:
1002         {
1003                 int v;
1004
1005                 if (get_user(v, (int __user *)optval))
1006                         return -EFAULT;
1007                 v = (v) ? 1 : 0;
1008
1009                 rtnl_lock();
1010                 ret = 0;
1011                 if (v != mroute_do_pim) {
1012                         mroute_do_pim = v;
1013                         mroute_do_assert = v;
1014 #ifdef CONFIG_IP_PIMSM_V2
1015                         if (mroute_do_pim)
1016                                 ret = inet_add_protocol(&pim_protocol,
1017                                                         IPPROTO_PIM);
1018                         else
1019                                 ret = inet_del_protocol(&pim_protocol,
1020                                                         IPPROTO_PIM);
1021                         if (ret < 0)
1022                                 ret = -EAGAIN;
1023 #endif
1024                 }
1025                 rtnl_unlock();
1026                 return ret;
1027         }
1028 #endif
1029         /*
1030          *      Spurious command, or MRT_VERSION which you cannot
1031          *      set.
1032          */
1033         default:
1034                 return -ENOPROTOOPT;
1035         }
1036 }
1037
1038 /*
1039  *      Getsock opt support for the multicast routing system.
1040  */
1041
1042 int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1043 {
1044         int olr;
1045         int val;
1046
1047         if (optname != MRT_VERSION &&
1048 #ifdef CONFIG_IP_PIMSM
1049            optname!=MRT_PIM &&
1050 #endif
1051            optname!=MRT_ASSERT)
1052                 return -ENOPROTOOPT;
1053
1054         if (get_user(olr, optlen))
1055                 return -EFAULT;
1056
1057         if (olr < 0)
1058                 return -EINVAL;
1059         olr = min_t(unsigned int, olr, sizeof(int));
1060
1061         if (put_user(olr, optlen))
1062                 return -EFAULT;
1063         if (optname == MRT_VERSION)
1064                 val = 0x0305;
1065 #ifdef CONFIG_IP_PIMSM
1066         else if (optname == MRT_PIM)
1067                 val = mroute_do_pim;
1068 #endif
1069         else
1070                 val = mroute_do_assert;
1071         if (copy_to_user(optval, &val, olr))
1072                 return -EFAULT;
1073         return 0;
1074 }
1075
1076 /*
1077  *      The IP multicast ioctl support routines.
1078  */
1079
1080 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1081 {
1082         struct sioc_sg_req sr;
1083         struct sioc_vif_req vr;
1084         struct vif_device *vif;
1085         struct mfc_cache *c;
1086
1087         switch (cmd) {
1088         case SIOCGETVIFCNT:
1089                 if (copy_from_user(&vr, arg, sizeof(vr)))
1090                         return -EFAULT;
1091                 if (vr.vifi >= maxvif)
1092                         return -EINVAL;
1093                 read_lock(&mrt_lock);
1094                 vif=&vif_table[vr.vifi];
1095                 if (VIF_EXISTS(vr.vifi))        {
1096                         vr.icount = vif->pkt_in;
1097                         vr.ocount = vif->pkt_out;
1098                         vr.ibytes = vif->bytes_in;
1099                         vr.obytes = vif->bytes_out;
1100                         read_unlock(&mrt_lock);
1101
1102                         if (copy_to_user(arg, &vr, sizeof(vr)))
1103                                 return -EFAULT;
1104                         return 0;
1105                 }
1106                 read_unlock(&mrt_lock);
1107                 return -EADDRNOTAVAIL;
1108         case SIOCGETSGCNT:
1109                 if (copy_from_user(&sr, arg, sizeof(sr)))
1110                         return -EFAULT;
1111
1112                 read_lock(&mrt_lock);
1113                 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1114                 if (c) {
1115                         sr.pktcnt = c->mfc_un.res.pkt;
1116                         sr.bytecnt = c->mfc_un.res.bytes;
1117                         sr.wrong_if = c->mfc_un.res.wrong_if;
1118                         read_unlock(&mrt_lock);
1119
1120                         if (copy_to_user(arg, &sr, sizeof(sr)))
1121                                 return -EFAULT;
1122                         return 0;
1123                 }
1124                 read_unlock(&mrt_lock);
1125                 return -EADDRNOTAVAIL;
1126         default:
1127                 return -ENOIOCTLCMD;
1128         }
1129 }
1130
1131
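/* Netdevice notifier: when a device is unregistered, delete every VIF
 * bound to it so the routing state never points at a stale net_device.
 */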
1132 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1133 {
1134         struct net_device *dev = ptr;
1135         struct vif_device *v;
1136         int ct;
1137
1138         if (!net_eq(dev_net(dev), &init_net))
1139                 return NOTIFY_DONE;
1140
1141         if (event != NETDEV_UNREGISTER)
1142                 return NOTIFY_DONE;
1143         v=&vif_table[0];
1144         for (ct=0; ct<maxvif; ct++,v++) {
1145                 if (v->dev == dev)
1146                         vif_delete(ct, 1);
1147         }
1148         return NOTIFY_DONE;
1149 }
1150
1151
1152 static struct notifier_block ip_mr_notifier = {
1153         .notifier_call = ipmr_device_event,
1154 };
1155
1156 /*
1157  *      Encapsulate a packet by attaching a valid IPIP header to it.
1158  *      This avoids tunnel drivers and other mess and gives us the speed so
1159  *      important for multicast video.
1160  */
1161
1162 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1163 {
1164         struct iphdr *iph;
1165         struct iphdr *old_iph = ip_hdr(skb);
1166
1167         skb_push(skb, sizeof(struct iphdr));
1168         skb->transport_header = skb->network_header;
1169         skb_reset_network_header(skb);
1170         iph = ip_hdr(skb);
1171
1172         iph->version    =       4;
1173         iph->tos        =       old_iph->tos;
1174         iph->ttl        =       old_iph->ttl;
1175         iph->frag_off   =       0;
1176         iph->daddr      =       daddr;
1177         iph->saddr      =       saddr;
1178         iph->protocol   =       IPPROTO_IPIP;
1179         iph->ihl        =       5;
1180         iph->tot_len    =       htons(skb->len);
1181         ip_select_ident(iph, skb->dst, NULL);
1182         ip_send_check(iph);
1183
1184         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1185         nf_reset(skb);
1186 }
1187
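/* Last step of forwarding one copy: count it, process any forwarded IP
 * options, and hand the skb to dst_output() for transmission.
 */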
1188 static inline int ipmr_forward_finish(struct sk_buff *skb)
1189 {
1190         struct ip_options * opt = &(IPCB(skb)->opt);
1191
1192         IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1193
1194         if (unlikely(opt->optlen))
1195                 ip_forward_options(skb);
1196
1197         return dst_output(skb);
1198 }
1199
1200 /*
1201  *      Processing handlers for ipmr_forward
1202  */
1203
1204 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1205 {
1206         const struct iphdr *iph = ip_hdr(skb);
1207         struct vif_device *vif = &vif_table[vifi];
1208         struct net_device *dev;
1209         struct rtable *rt;
1210         int    encap = 0;
1211
1212         if (vif->dev == NULL)
1213                 goto out_free;
1214
1215 #ifdef CONFIG_IP_PIMSM
1216         if (vif->flags & VIFF_REGISTER) {
1217                 vif->pkt_out++;
1218                 vif->bytes_out += skb->len;
1219                 vif->dev->stats.tx_bytes += skb->len;
1220                 vif->dev->stats.tx_packets++;
1221                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1222                 kfree_skb(skb);
1223                 return;
1224         }
1225 #endif
1226
1227         if (vif->flags&VIFF_TUNNEL) {
1228                 struct flowi fl = { .oif = vif->link,
1229                                     .nl_u = { .ip4_u =
1230                                               { .daddr = vif->remote,
1231                                                 .saddr = vif->local,
1232                                                 .tos = RT_TOS(iph->tos) } },
1233                                     .proto = IPPROTO_IPIP };
1234                 if (ip_route_output_key(&init_net, &rt, &fl))
1235                         goto out_free;
1236                 encap = sizeof(struct iphdr);
1237         } else {
1238                 struct flowi fl = { .oif = vif->link,
1239                                     .nl_u = { .ip4_u =
1240                                               { .daddr = iph->daddr,
1241                                                 .tos = RT_TOS(iph->tos) } },
1242                                     .proto = IPPROTO_IPIP };
1243                 if (ip_route_output_key(&init_net, &rt, &fl))
1244                         goto out_free;
1245         }
1246
1247         dev = rt->u.dst.dev;
1248
1249         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1250                 /* Do not fragment multicasts. Alas, IPv4 does not
1251                    allow sending ICMP here, so the packets will
1252                    disappear into a black hole.
1253                  */
1254
1255                 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1256                 ip_rt_put(rt);
1257                 goto out_free;
1258         }
1259
1260         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1261
1262         if (skb_cow(skb, encap)) {
1263                 ip_rt_put(rt);
1264                 goto out_free;
1265         }
1266
1267         vif->pkt_out++;
1268         vif->bytes_out += skb->len;
1269
1270         dst_release(skb->dst);
1271         skb->dst = &rt->u.dst;
1272         ip_decrease_ttl(ip_hdr(skb));
1273
1274         /* FIXME: forward and output firewalls used to be called here.
1275          * What do we do with netfilter? -- RR */
1276         if (vif->flags & VIFF_TUNNEL) {
1277                 ip_encap(skb, vif->local, vif->remote);
1278                 /* FIXME: extra output firewall step used to be here. --RR */
1279                 vif->dev->stats.tx_packets++;
1280                 vif->dev->stats.tx_bytes += skb->len;
1281         }
1282
1283         IPCB(skb)->flags |= IPSKB_FORWARDED;
1284
1285         /*
1286          * RFC1584 teaches that a DVMRP/PIM router must deliver packets locally
1287          * not only before forwarding, but also after forwarding on all output
1288          * interfaces. Clearly, if the mrouter runs a multicasting
1289          * program, that program should receive packets regardless of which
1290          * interface it is joined on.
1291          * If we did not do this, the program would have to join on all
1292          * interfaces. On the other hand, a multihomed host (or router, but
1293          * not mrouter) cannot join on more than one interface - it would
1294          * result in receiving multiple packets.
1295          */
1296         NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
1297                 ipmr_forward_finish);
1298         return;
1299
1300 out_free:
1301         kfree_skb(skb);
1302         return;
1303 }
1304
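/* Map a device back to its VIF index, or -1 if it is not a VIF. */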
1305 static int ipmr_find_vif(struct net_device *dev)
1306 {
1307         int ct;
1308         for (ct=maxvif-1; ct>=0; ct--) {
1309                 if (vif_table[ct].dev == dev)
1310                         break;
1311         }
1312         return ct;
1313 }
1314
1315 /* "local" means that we should preserve one skb (for local delivery) */
1316
1317 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1318 {
1319         int psend = -1;
1320         int vif, ct;
1321
1322         vif = cache->mfc_parent;
1323         cache->mfc_un.res.pkt++;
1324         cache->mfc_un.res.bytes += skb->len;
1325
1326         /*
1327          * Wrong interface: drop packet and (maybe) send PIM assert.
1328          */
1329         if (vif_table[vif].dev != skb->dev) {
1330                 int true_vifi;
1331
1332                 if (skb->rtable->fl.iif == 0) {
1333                         /* It is our own packet, looped back.
1334                            Very complicated situation...
1335
1336                            The best workaround until routing daemons are
1337                            fixed is not to redistribute a packet if it was
1338                            sent through the wrong interface. This means that
1339                            multicast applications WILL NOT work for
1340                            (S,G) entries whose default multicast route points
1341                            to the wrong oif. In any case, it is not a good
1342                            idea to run multicasting applications on a router.
1343                          */
1344                         goto dont_forward;
1345                 }
1346
1347                 cache->mfc_un.res.wrong_if++;
1348                 true_vifi = ipmr_find_vif(skb->dev);
1349
1350                 if (true_vifi >= 0 && mroute_do_assert &&
1351                     /* pimsm uses asserts when switching from RPT to SPT,
1352                        so we cannot check that the packet arrived on an oif.
1353                        That is bad, but otherwise we would need to move a
1354                        pretty large chunk of pimd into the kernel. Ough... --ANK
1355                      */
1356                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1357                     time_after(jiffies,
1358                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1359                         cache->mfc_un.res.last_assert = jiffies;
1360                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1361                 }
1362                 goto dont_forward;
1363         }
1364
1365         vif_table[vif].pkt_in++;
1366         vif_table[vif].bytes_in += skb->len;
1367
1368         /*
1369          *      Forward the frame
1370          */
1371         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1372                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1373                         if (psend != -1) {
1374                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1375                                 if (skb2)
1376                                         ipmr_queue_xmit(skb2, cache, psend);
1377                         }
1378                         psend = ct;
1379                 }
1380         }
1381         if (psend != -1) {
1382                 if (local) {
1383                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1384                         if (skb2)
1385                                 ipmr_queue_xmit(skb2, cache, psend);
1386                 } else {
1387                         ipmr_queue_xmit(skb, cache, psend);
1388                         return 0;
1389                 }
1390         }
1391
1392 dont_forward:
1393         if (!local)
1394                 kfree_skb(skb);
1395         return 0;
1396 }
1397
1398
1399 /*
1400  *      Multicast packets for forwarding arrive here
1401  */
1402
1403 int ip_mr_input(struct sk_buff *skb)
1404 {
1405         struct mfc_cache *cache;
1406         int local = skb->rtable->rt_flags&RTCF_LOCAL;
1407
1408         /* A packet looped back after forwarding must not be
1409            forwarded a second time, but it can still be delivered locally.
1410          */
1411         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1412                 goto dont_forward;
1413
1414         if (!local) {
1415                 if (IPCB(skb)->opt.router_alert) {
1416                         if (ip_call_ra_chain(skb))
1417                                 return 0;
1418                 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1419                         /* IGMPv1 (and broken IGMPv2 implementations such as
1420                            Cisco IOS <= 11.2(8)) do not put the router alert
1421                            option in IGMP packets destined for routable
1422                            groups. That is very bad, because it means
1423                            we can forward NO IGMP messages.
1424                          */
1425                         read_lock(&mrt_lock);
1426                         if (init_net.ipv4.mroute_sk) {
1427                                 nf_reset(skb);
1428                                 raw_rcv(init_net.ipv4.mroute_sk, skb);
1429                                 read_unlock(&mrt_lock);
1430                                 return 0;
1431                         }
1432                         read_unlock(&mrt_lock);
1433                 }
1434         }
1435
1436         read_lock(&mrt_lock);
1437         cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1438
1439         /*
1440          *      No usable cache entry
1441          */
1442         if (cache == NULL) {
1443                 int vif;
1444
1445                 if (local) {
1446                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1447                         ip_local_deliver(skb);
1448                         if (skb2 == NULL) {
1449                                 read_unlock(&mrt_lock);
1450                                 return -ENOBUFS;
1451                         }
1452                         skb = skb2;
1453                 }
1454
1455                 vif = ipmr_find_vif(skb->dev);
1456                 if (vif >= 0) {
1457                         int err = ipmr_cache_unresolved(vif, skb);
1458                         read_unlock(&mrt_lock);
1459
1460                         return err;
1461                 }
1462                 read_unlock(&mrt_lock);
1463                 kfree_skb(skb);
1464                 return -ENODEV;
1465         }
1466
1467         ip_mr_forward(skb, cache, local);
1468
1469         read_unlock(&mrt_lock);
1470
1471         if (local)
1472                 return ip_local_deliver(skb);
1473
1474         return 0;
1475
1476 dont_forward:
1477         if (local)
1478                 return ip_local_deliver(skb);
1479         kfree_skb(skb);
1480         return 0;
1481 }
1482
1483 #ifdef CONFIG_IP_PIMSM
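/* Common PIMv1/v2 REGISTER processing: sanity check the encapsulated
 * multicast packet and, if a pimreg VIF exists, strip the outer headers
 * and feed the inner packet back in through netif_rx().
 * Returns 0 on success and 1 if the caller should free the skb.
 */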
1484 static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1485 {
1486         struct net_device *reg_dev = NULL;
1487         struct iphdr *encap;
1488
1489         encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1490         /*
1491            Check that:
1492            a. packet is really destined to a multicast group
1493            b. packet is not a NULL-REGISTER
1494            c. packet is not truncated
1495          */
1496         if (!ipv4_is_multicast(encap->daddr) ||
1497             encap->tot_len == 0 ||
1498             ntohs(encap->tot_len) + pimlen > skb->len)
1499                 return 1;
1500
1501         read_lock(&mrt_lock);
1502         if (reg_vif_num >= 0)
1503                 reg_dev = vif_table[reg_vif_num].dev;
1504         if (reg_dev)
1505                 dev_hold(reg_dev);
1506         read_unlock(&mrt_lock);
1507
1508         if (reg_dev == NULL)
1509                 return 1;
1510
1511         skb->mac_header = skb->network_header;
1512         skb_pull(skb, (u8*)encap - skb->data);
1513         skb_reset_network_header(skb);
1514         skb->dev = reg_dev;
1515         skb->protocol = htons(ETH_P_IP);
1516         skb->ip_summed = CHECKSUM_NONE;
1517         skb->pkt_type = PACKET_HOST;
1518         dst_release(skb->dst);
1519         skb->dst = NULL;
1520         reg_dev->stats.rx_bytes += skb->len;
1521         reg_dev->stats.rx_packets++;
1522         nf_reset(skb);
1523         netif_rx(skb);
1524         dev_put(reg_dev);
1525
1526         return 0;
1527 }
1528 #endif
1529
1530 #ifdef CONFIG_IP_PIMSM_V1
1531 /*
1532  * Handle IGMP messages of PIMv1
1533  */
1534
1535 int pim_rcv_v1(struct sk_buff * skb)
1536 {
1537         struct igmphdr *pim;
1538
1539         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1540                 goto drop;
1541
1542         pim = igmp_hdr(skb);
1543
1544         if (!mroute_do_pim ||
1545             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1546                 goto drop;
1547
1548         if (__pim_rcv(skb, sizeof(*pim))) {
1549 drop:
1550                 kfree_skb(skb);
1551         }
1552         return 0;
1553 }
1554 #endif
1555
1556 #ifdef CONFIG_IP_PIMSM_V2
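/* Handle PIMv2 REGISTER messages: check the type and NULL-register flag
 * and verify the checksum (over the PIM header alone, or over the whole
 * packet for older peers) before decapsulating via __pim_rcv().
 */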
1557 static int pim_rcv(struct sk_buff * skb)
1558 {
1559         struct pimreghdr *pim;
1560
1561         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1562                 goto drop;
1563
1564         pim = (struct pimreghdr *)skb_transport_header(skb);
1565         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1566             (pim->flags&PIM_NULL_REGISTER) ||
1567             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1568              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1569                 goto drop;
1570
1571         if (__pim_rcv(skb, sizeof(*pim))) {
1572 drop:
1573                 kfree_skb(skb);
1574         }
1575         return 0;
1576 }
1577 #endif
1578
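/* Encode the oif list of a resolved cache entry as an RTA_MULTIPATH
 * attribute for rtnetlink. Returns 1 on success, -EMSGSIZE if the skb
 * has no room left.
 */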
static int
ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
{
        int ct;
        struct rtnexthop *nhp;
        struct net_device *dev = vif_table[c->mfc_parent].dev;
        u8 *b = skb_tail_pointer(skb);
        struct rtattr *mp_head;

        if (dev)
                RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);

        mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));

        for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
                if (c->mfc_un.res.ttls[ct] < 255) {
                        if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
                                goto rtattr_failure;
                        nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
                        nhp->rtnh_flags = 0;
                        nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
                        nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
                        nhp->rtnh_len = sizeof(*nhp);
                }
        }
        mp_head->rta_type = RTA_MULTIPATH;
        mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
        rtm->rtm_type = RTN_MULTICAST;
        return 1;

rtattr_failure:
        nlmsg_trim(skb, b);
        return -EMSGSIZE;
}

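/*
 * Resolve the multicast route for an RTM_GETROUTE request.  If no
 * cache entry exists yet, a copy of the skb is queued on the
 * unresolved list; iph->version is set to 0 below so the cache
 * resolver can tell this netlink-generated skb apart from a real
 * multicast packet.
 */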
int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
        int err;
        struct mfc_cache *cache;
        struct rtable *rt = skb->rtable;

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);

        if (cache == NULL) {
                struct sk_buff *skb2;
                struct iphdr *iph;
                struct net_device *dev;
                int vif;

                if (nowait) {
                        read_unlock(&mrt_lock);
                        return -EAGAIN;
                }

                dev = skb->dev;
                if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
                        read_unlock(&mrt_lock);
                        return -ENODEV;
                }
                skb2 = skb_clone(skb, GFP_ATOMIC);
                if (!skb2) {
                        read_unlock(&mrt_lock);
                        return -ENOMEM;
                }

                skb_push(skb2, sizeof(struct iphdr));
                skb_reset_network_header(skb2);
                iph = ip_hdr(skb2);
                iph->ihl = sizeof(struct iphdr) >> 2;
                iph->saddr = rt->rt_src;
                iph->daddr = rt->rt_dst;
                iph->version = 0;
                err = ipmr_cache_unresolved(vif, skb2);
                read_unlock(&mrt_lock);
                return err;
        }

        if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
                cache->mfc_flags |= MFC_NOTIFY;
        err = ipmr_fill_mroute(skb, cache, rtm);
        read_unlock(&mrt_lock);
        return err;
}

#ifdef CONFIG_PROC_FS
/*
 *      The /proc interfaces to multicast routing:
 *      /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
 */
struct ipmr_vif_iter {
        int ct;
};

static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
                                           loff_t pos)
{
        for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
                if (!VIF_EXISTS(iter->ct))
                        continue;
                if (pos-- == 0)
                        return &vif_table[iter->ct];
        }
        return NULL;
}

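/*
 * mrt_lock is read-held for the whole traversal: taken in ->start()
 * and dropped in ->stop(), as the __acquires/__releases annotations
 * below document for sparse.
 */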
static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(mrt_lock)
{
        read_lock(&mrt_lock);
        return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
                : SEQ_START_TOKEN;
}

static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ipmr_vif_iter *iter = seq->private;

        ++*pos;
        if (v == SEQ_START_TOKEN)
                return ipmr_vif_seq_idx(iter, 0);

        while (++iter->ct < maxvif) {
                if (!VIF_EXISTS(iter->ct))
                        continue;
                return &vif_table[iter->ct];
        }
        return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
        __releases(mrt_lock)
{
        read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
        } else {
                const struct vif_device *vif = v;
                const char *name = vif->dev ? vif->dev->name : "none";

                seq_printf(seq,
                           "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
                           vif - vif_table,
                           name, vif->bytes_in, vif->pkt_in,
                           vif->bytes_out, vif->pkt_out,
                           vif->flags, vif->local, vif->remote);
        }
        return 0;
}

static const struct seq_operations ipmr_vif_seq_ops = {
        .start = ipmr_vif_seq_start,
        .next  = ipmr_vif_seq_next,
        .stop  = ipmr_vif_seq_stop,
        .show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
        return seq_open_private(file, &ipmr_vif_seq_ops,
                        sizeof(struct ipmr_vif_iter));
}

static const struct file_operations ipmr_vif_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_vif_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};
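
/*
 * A (hypothetical) sample of the resulting /proc/net/ip_mr_vif output,
 * constructed from the format strings above; the values are made up:
 *
 * Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *  0 eth0           1500      10      3000      20 00000 C0A80001 00000000
 */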

struct ipmr_mfc_iter {
        struct mfc_cache **cache;
        int ct;
};

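/*
 * Walking the MFC cache requires a lock handoff: entries from
 * mfc_cache_array are returned with mrt_lock read-held, while entries
 * from mfc_unres_queue are returned with mfc_unres_lock held.
 * Whichever lock protects the last entry returned is released in
 * ->stop() (or when ->next() crosses from one list to the other).
 */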
static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
{
        struct mfc_cache *mfc;

        it->cache = mfc_cache_array;
        read_lock(&mrt_lock);
        for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
                for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
                        if (pos-- == 0)
                                return mfc;
        read_unlock(&mrt_lock);

        it->cache = &mfc_unres_queue;
        spin_lock_bh(&mfc_unres_lock);
        for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
                if (pos-- == 0)
                        return mfc;
        spin_unlock_bh(&mfc_unres_lock);

        it->cache = NULL;
        return NULL;
}

static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct ipmr_mfc_iter *it = seq->private;

        it->cache = NULL;
        it->ct = 0;
        return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
                : SEQ_START_TOKEN;
}

static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct mfc_cache *mfc = v;
        struct ipmr_mfc_iter *it = seq->private;

        ++*pos;

        if (v == SEQ_START_TOKEN)
                return ipmr_mfc_seq_idx(seq->private, 0);

        if (mfc->next)
                return mfc->next;

        if (it->cache == &mfc_unres_queue)
                goto end_of_list;

        BUG_ON(it->cache != mfc_cache_array);

        while (++it->ct < MFC_LINES) {
                mfc = mfc_cache_array[it->ct];
                if (mfc)
                        return mfc;
        }

        /* exhausted cache_array, show unresolved */
        read_unlock(&mrt_lock);
        it->cache = &mfc_unres_queue;
        it->ct = 0;

        spin_lock_bh(&mfc_unres_lock);
        mfc = mfc_unres_queue;
        if (mfc)
                return mfc;

 end_of_list:
        spin_unlock_bh(&mfc_unres_lock);
        it->cache = NULL;

        return NULL;
}

static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
        struct ipmr_mfc_iter *it = seq->private;

        if (it->cache == &mfc_unres_queue)
                spin_unlock_bh(&mfc_unres_lock);
        else if (it->cache == mfc_cache_array)
                read_unlock(&mrt_lock);
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
        int n;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                 "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
        } else {
                const struct mfc_cache *mfc = v;
                const struct ipmr_mfc_iter *it = seq->private;

                seq_printf(seq, "%08lX %08lX %-3hd",
                           (unsigned long) mfc->mfc_mcastgrp,
                           (unsigned long) mfc->mfc_origin,
                           mfc->mfc_parent);

                if (it->cache != &mfc_unres_queue) {
                        seq_printf(seq, " %8lu %8lu %8lu",
                                   mfc->mfc_un.res.pkt,
                                   mfc->mfc_un.res.bytes,
                                   mfc->mfc_un.res.wrong_if);
                        for (n = mfc->mfc_un.res.minvif;
                             n < mfc->mfc_un.res.maxvif; n++) {
                                if (VIF_EXISTS(n) &&
                                    mfc->mfc_un.res.ttls[n] < 255)
                                        seq_printf(seq,
                                                   " %2d:%-3d",
                                                   n, mfc->mfc_un.res.ttls[n]);
                        }
                } else {
                        /* unresolved mfc_caches don't contain
                         * pkt, bytes and wrong_if values
                         */
                        seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
                }
                seq_putc(seq, '\n');
        }
        return 0;
}

static const struct seq_operations ipmr_mfc_seq_ops = {
        .start = ipmr_mfc_seq_start,
        .next  = ipmr_mfc_seq_next,
        .stop  = ipmr_mfc_seq_stop,
        .show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
        return seq_open_private(file, &ipmr_mfc_seq_ops,
                        sizeof(struct ipmr_mfc_iter));
}

static const struct file_operations ipmr_mfc_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_mfc_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};
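
/*
 * A (hypothetical) sample of /proc/net/ip_mr_cache, built from the
 * format strings above; the values are made up.  Oifs entries are
 * printed as "vif:ttl" pairs:
 *
 * Group    Origin   Iif     Pkts    Bytes    Wrong Oifs
 * E1000001 C0A80001 0            10     1500        0  1:1
 */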
#endif

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol = {
        .handler        =       pim_rcv,
};
#endif
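/*
 * Note: pim_protocol is not registered at boot; it is added and
 * removed with inet_add_protocol()/inet_del_protocol() for IPPROTO_PIM
 * when user space toggles the MRT_PIM socket option.
 */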

/*
 *      Setup for IP multicast routing
 */

int __init ip_mr_init(void)
{
        int err;

        mrt_cachep = kmem_cache_create("ip_mrt_cache",
                                       sizeof(struct mfc_cache),
                                       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                                       NULL);
        if (!mrt_cachep)
                return -ENOMEM;

        setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
        err = register_netdevice_notifier(&ip_mr_notifier);
        if (err)
                goto reg_notif_fail;
#ifdef CONFIG_PROC_FS
        err = -ENOMEM;
        if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops))
                goto proc_vif_fail;
        if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops))
                goto proc_cache_fail;
#endif
        return 0;
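
/*
 * Error unwind: each label undoes the registrations made before the
 * failing step, in reverse order of setup.
 */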
#ifdef CONFIG_PROC_FS
proc_cache_fail:
        proc_net_remove(&init_net, "ip_mr_vif");
proc_vif_fail:
        unregister_netdevice_notifier(&ip_mr_notifier);
#endif
reg_notif_fail:
        del_timer(&ipmr_expire_timer);
        kmem_cache_destroy(mrt_cachep);
        return err;
}