ipv4: Do cleanup for ip_mr_init
net/ipv4/ipmr.c
/*
 *      IP multicast routing support for mrouted 3.6/3.8
 *
 *              (c) 1995 Alan Cox, <alan@redhat.com>
 *        Linux Consultancy and Custom Driver Development
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Fixes:
 *      Michael Chastain        :       Incorrect size of copying.
 *      Alan Cox                :       Added the cache manager code
 *      Alan Cox                :       Fixed the clone/copy bug and device race.
 *      Mike McLagan            :       Routing by source
 *      Malcolm Beattie         :       Buffer handling fixes.
 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
 *      SVR Anand               :       Fixed several multicast bugs and problems.
 *      Alexey Kuznetsov        :       Status, optimisations and more.
 *      Brad Parker             :       Better behaviour on mrouted upcall
 *                                      overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
 *                                      Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

static struct sock *mroute_socket;


/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];            /* Devices              */
static int maxvif;

#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)

static int mroute_do_assert;                            /* Set in PIM assert    */
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */

static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to the original Alan scheme. The hash table of resolved
   entries is changed only in process context and is protected
   by the weak lock mrt_lock. The queue of unresolved entries is
   protected by the strong spinlock mfc_unres_lock.

   This way the data path is entirely free of exclusive locks.
 */

static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
        struct net_device  *dev;

        dev = __dev_get_by_name(&init_net, "tunl0");

        if (dev) {
                int err;
                struct ifreq ifr;
                mm_segment_t    oldfs;
                struct ip_tunnel_parm p;
                struct in_device  *in_dev;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

                oldfs = get_fs(); set_fs(KERNEL_DS);
                err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
                set_fs(oldfs);

                dev = NULL;

                if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
                        dev->flags |= IFF_MULTICAST;

                        in_dev = __in_dev_get_rtnl(dev);
                        if (in_dev == NULL)
                                goto failure;

                        ipv4_devconf_setall(in_dev);
                        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

                        if (dev_open(dev))
                                goto failure;
                }
        }
        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}

#ifdef CONFIG_IP_PIMSM

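/*
 * The PIM register VIF is backed by a synthetic "pimreg" device.
 * Packets transmitted on it are never sent on the wire: reg_vif_xmit()
 * below bounces them to the PIM daemon as IGMPMSG_WHOLEPKT upcalls,
 * and pim_rcv()/pim_rcv_v1() re-inject decapsulated REGISTER payloads
 * through this device.
 */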
static int reg_vif_num = -1;

static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        read_lock(&mrt_lock);
        dev->stats.tx_bytes += skb->len;
        dev->stats.tx_packets++;
        ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
        read_unlock(&mrt_lock);
        kfree_skb(skb);
        return 0;
}

static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->hard_start_xmit    = reg_vif_xmit;
        dev->destructor         = free_netdev;
}

static struct net_device *ipmr_reg_vif(void)
{
        struct net_device *dev;
        struct in_device *in_dev;

        dev = alloc_netdev(0, "pimreg", reg_vif_setup);

        if (dev == NULL)
                return NULL;

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }
        dev->iflink = 0;

        rcu_read_lock();
        if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
                rcu_read_unlock();
                goto failure;
        }

        ipv4_devconf_setall(in_dev);
        IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
        rcu_read_unlock();

        if (dev_open(dev))
                goto failure;

        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}
#endif

/*
 *      Delete a VIF entry
 */

static int vif_delete(int vifi)
{
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= maxvif)
                return -EADDRNOTAVAIL;

        v = &vif_table[vifi];

        write_lock_bh(&mrt_lock);
        dev = v->dev;
        v->dev = NULL;

        if (!dev) {
                write_unlock_bh(&mrt_lock);
                return -EADDRNOTAVAIL;
        }

#ifdef CONFIG_IP_PIMSM
        if (vifi == reg_vif_num)
                reg_vif_num = -1;
#endif

        if (vifi+1 == maxvif) {
                int tmp;
                for (tmp=vifi-1; tmp>=0; tmp--) {
                        if (VIF_EXISTS(tmp))
                                break;
                }
                maxvif = tmp+1;
        }

        write_unlock_bh(&mrt_lock);

        dev_set_allmulti(dev, -1);

        if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
                IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
                ip_rt_multicast_event(in_dev);
        }

        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                unregister_netdevice(dev);

        dev_put(dev);
        return 0;
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&cache_resolve_queue_len);

        while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = NLMSG_DATA(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));

                        rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
                } else
                        kfree_skb(skb);
        }

        kmem_cache_free(mrt_cachep, c);
}


/* Single timer handler for the whole unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
        unsigned long now;
        unsigned long expires;
        struct mfc_cache *c, **cp;

        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (atomic_read(&cache_resolve_queue_len) == 0)
                goto out;

        now = jiffies;
        expires = 10*HZ;
        cp = &mfc_unres_queue;

        while ((c=*cp) != NULL) {
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        cp = &c->next;
                        continue;
                }

                *cp = c->next;

                ipmr_destroy_unres(c);
        }

        if (atomic_read(&cache_resolve_queue_len))
                mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Fill the oif list. Called with mrt_lock write-locked. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

        for (vifi=0; vifi<maxvif; vifi++) {
                if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                        if (cache->mfc_un.res.minvif > vifi)
                                cache->mfc_un.res.minvif = vifi;
                        if (cache->mfc_un.res.maxvif <= vifi)
                                cache->mfc_un.res.maxvif = vifi + 1;
                }
        }
}

static int vif_add(struct vifctl *vifc, int mrtsock)
{
        int vifi = vifc->vifc_vifi;
        struct vif_device *v = &vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;

        /* Is vif busy ? */
        if (VIF_EXISTS(vifi))
                return -EADDRINUSE;

        switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
        case VIFF_REGISTER:
                /*
                 * Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif();
                if (!dev)
                        return -ENOBUFS;
                break;
#endif
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(vifc);
                if (!dev)
                        return -ENOBUFS;
                break;
        case 0:
                dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
                if (!dev)
                        return -EADDRNOTAVAIL;
                dev_put(dev);
                break;
        default:
                return -EINVAL;
        }

        if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
                return -EADDRNOTAVAIL;
        IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
        dev_set_allmulti(dev, +1);
        ip_rt_multicast_event(in_dev);

        /*
         *      Fill in the VIF structures
         */
        v->rate_limit=vifc->vifc_rate_limit;
        v->local=vifc->vifc_lcl_addr.s_addr;
        v->remote=vifc->vifc_rmt_addr.s_addr;
        v->flags=vifc->vifc_flags;
        if (!mrtsock)
                v->flags |= VIFF_STATIC;
        v->threshold=vifc->vifc_threshold;
        v->bytes_in = 0;
        v->bytes_out = 0;
        v->pkt_in = 0;
        v->pkt_out = 0;
        v->link = dev->ifindex;
        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                v->link = dev->iflink;

        /* And finish update writing critical data */
        write_lock_bh(&mrt_lock);
        dev_hold(dev);
        v->dev=dev;
#ifdef CONFIG_IP_PIMSM
        if (v->flags&VIFF_REGISTER)
                reg_vif_num = vifi;
#endif
        if (vifi+1 > maxvif)
                maxvif = vifi+1;
        write_unlock_bh(&mrt_lock);
        return 0;
}

static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
{
        int line=MFC_HASH(mcastgrp,origin);
        struct mfc_cache *c;

        for (c=mfc_cache_array[line]; c; c = c->next) {
                if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
                        break;
        }
        return c;
}

/*
 *      Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
        struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
        if (c==NULL)
                return NULL;
        c->mfc_un.res.minvif = MAXVIFS;
        return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
        struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
        if (c==NULL)
                return NULL;
        skb_queue_head_init(&c->mfc_un.unres.unresolved);
        c->mfc_un.unres.expires = jiffies + 10*HZ;
        return c;
}

/*
 *      A cache entry has gone from queued into a resolved state
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /*
         *      Play the pending entries through our router
         */

        while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

                        if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
                                nlh->nlmsg_len = (skb_tail_pointer(skb) -
                                                  (u8 *)nlh);
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = NLMSG_DATA(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }

                        rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
                } else
                        ip_mr_forward(skb, c, 0);
        }
}

/*
 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *      expects the following bizarre scheme.
 *
 *      Called under mrt_lock.
 */

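/*
 * The upcall skb built below is laid out as a struct igmpmsg: the
 * original IP header (with ip->protocol cleared as a marker in the
 * non-WHOLEPKT case), im_msgtype set to the assert type and im_vif
 * naming the VIF the packet arrived on. mrouted reads these off the
 * raw IGMP socket.
 */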
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        const int ihl = ip_hdrlen(pkt);
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        int ret;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                msg = (struct igmpmsg *)skb_network_header(skb);
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = reg_vif_num;
                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
                                             sizeof(struct iphdr));
        } else
#endif
        {

        /*
         *      Copy the IP header
         */

        skb->network_header = skb->tail;
        skb_put(skb, ihl);
        skb_copy_to_linear_data(skb, pkt->data, ihl);
        ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
        msg = (struct igmpmsg *)skb_network_header(skb);
        msg->im_vif = vifi;
        skb->dst = dst_clone(pkt->dst);

        /*
         *      Add our header
         */

        igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
        igmp->type      =
        msg->im_msgtype = assert;
        igmp->code      =       0;
        ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
        skb->transport_header = skb->network_header;
        }

        if (mroute_socket == NULL) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}

/*
 *      Queue a packet for resolution, attaching it to an unresolved
 *      cache entry (created under mfc_unres_lock if necessary).
 */

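/*
 * Note the hard limits enforced below: at most 10 entries may wait in
 * the unresolved queue at any time, and each entry buffers at most
 * four pending packets; anything beyond that is dropped with -ENOBUFS.
 */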
static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
        int err;
        struct mfc_cache *c;
        const struct iphdr *iph = ip_hdr(skb);

        spin_lock_bh(&mfc_unres_lock);
        for (c=mfc_unres_queue; c; c=c->next) {
                if (c->mfc_mcastgrp == iph->daddr &&
                    c->mfc_origin == iph->saddr)
                        break;
        }

        if (c == NULL) {
                /*
                 *      Create a new entry if allowable
                 */

                if (atomic_read(&cache_resolve_queue_len)>=10 ||
                    (c=ipmr_cache_alloc_unres())==NULL) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /*
                 *      Fill in the new cache entry
                 */
                c->mfc_parent   = -1;
                c->mfc_origin   = iph->saddr;
                c->mfc_mcastgrp = iph->daddr;

                /*
                 *      Reflect first query at mrouted.
                 */
                if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        kmem_cache_free(mrt_cachep, c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&cache_resolve_queue_len);
                c->next = mfc_unres_queue;
                mfc_unres_queue = c;

                mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
        }

        /*
         *      See if we can append the packet
         */
        if (c->mfc_un.unres.unresolved.qlen>3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/*
 *      MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mfcctl *mfc)
{
        int line;
        struct mfc_cache *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                        return 0;
                }
        }
        return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
        int line;
        struct mfc_cache *uc, *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
                        break;
        }

        if (c != NULL) {
                write_lock_bh(&mrt_lock);
                c->mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->mfc_flags |= MFC_STATIC;
                write_unlock_bh(&mrt_lock);
                return 0;
        }

        if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c=ipmr_cache_alloc();
        if (c==NULL)
                return -ENOMEM;

        c->mfc_origin=mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
        c->mfc_parent=mfc->mfcc_parent;
        ipmr_update_thresholds(c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->mfc_flags |= MFC_STATIC;

        write_lock_bh(&mrt_lock);
        c->next = mfc_cache_array[line];
        mfc_cache_array[line] = c;
        write_unlock_bh(&mrt_lock);

        /*
         *      Check to see if we resolved a queued entry. If so we
         *      need to send the queued frames on and tidy up.
         */
        spin_lock_bh(&mfc_unres_lock);
        for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
             cp = &uc->next) {
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        *cp = uc->next;
                        if (atomic_dec_and_test(&cache_resolve_queue_len))
                                del_timer(&ipmr_expire_timer);
                        break;
                }
        }
        spin_unlock_bh(&mfc_unres_lock);

        if (uc) {
                ipmr_cache_resolve(uc, c);
                kmem_cache_free(mrt_cachep, uc);
        }
        return 0;
}

/*
 *      Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
        int i;

        /*
         *      Shut down all active vif entries
         */
        for (i=0; i<maxvif; i++) {
                if (!(vif_table[i].flags&VIFF_STATIC))
                        vif_delete(i);
        }

        /*
         *      Wipe the cache
         */
        for (i=0;i<MFC_LINES;i++) {
                struct mfc_cache *c, **cp;

                cp = &mfc_cache_array[i];
                while ((c = *cp) != NULL) {
                        if (c->mfc_flags&MFC_STATIC) {
                                cp = &c->next;
                                continue;
                        }
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                }
        }

        if (atomic_read(&cache_resolve_queue_len) != 0) {
                struct mfc_cache *c;

                spin_lock_bh(&mfc_unres_lock);
                while (mfc_unres_queue != NULL) {
                        c = mfc_unres_queue;
                        mfc_unres_queue = c->next;
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_destroy_unres(c);

                        spin_lock_bh(&mfc_unres_lock);
                }
                spin_unlock_bh(&mfc_unres_lock);
        }
}

static void mrtsock_destruct(struct sock *sk)
{
        rtnl_lock();
        if (sk == mroute_socket) {
                IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;

                write_lock_bh(&mrt_lock);
                mroute_socket=NULL;
                write_unlock_bh(&mrt_lock);

                mroute_clean_tables(sk);
        }
        rtnl_unlock();
}

/*
 *      Socket options and virtual interface manipulation. The whole
 *      virtual interface system is a complete heap, but unfortunately
 *      that's how BSD mrouted happens to think. Maybe one day with a proper
 *      MOSPF/PIM router set up we can clean this up.
 */

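/*
 * Userspace sketch (not part of this file): a routing daemon such as
 * mrouted becomes the mroute socket roughly like this:
 *
 *      int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *      int on = 1;
 *      setsockopt(s, IPPROTO_IP, MRT_INIT, &on, sizeof(on));
 *
 * Only a raw IGMP socket is accepted, and only one mroute socket may
 * exist at a time; a second MRT_INIT fails with -EADDRINUSE.
 */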
int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
{
        int ret;
        struct vifctl vif;
        struct mfcctl mfc;

        if (optname != MRT_INIT) {
                if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
                        return -EACCES;
        }

        switch (optname) {
        case MRT_INIT:
                if (sk->sk_type != SOCK_RAW ||
                    inet_sk(sk)->num != IPPROTO_IGMP)
                        return -EOPNOTSUPP;
                if (optlen!=sizeof(int))
                        return -ENOPROTOOPT;

                rtnl_lock();
                if (mroute_socket) {
                        rtnl_unlock();
                        return -EADDRINUSE;
                }

                ret = ip_ra_control(sk, 1, mrtsock_destruct);
                if (ret == 0) {
                        write_lock_bh(&mrt_lock);
                        mroute_socket=sk;
                        write_unlock_bh(&mrt_lock);

                        IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
                }
                rtnl_unlock();
                return ret;
        case MRT_DONE:
                if (sk!=mroute_socket)
                        return -EACCES;
                return ip_ra_control(sk, 0, NULL);
        case MRT_ADD_VIF:
        case MRT_DEL_VIF:
                if (optlen!=sizeof(vif))
                        return -EINVAL;
                if (copy_from_user(&vif,optval,sizeof(vif)))
                        return -EFAULT;
                if (vif.vifc_vifi >= MAXVIFS)
                        return -ENFILE;
                rtnl_lock();
                if (optname==MRT_ADD_VIF) {
                        ret = vif_add(&vif, sk==mroute_socket);
                } else {
                        ret = vif_delete(vif.vifc_vifi);
                }
                rtnl_unlock();
                return ret;

                /*
                 *      Manipulate the forwarding caches. These live
                 *      in a sort of kernel/user symbiosis.
                 */
        case MRT_ADD_MFC:
        case MRT_DEL_MFC:
                if (optlen!=sizeof(mfc))
                        return -EINVAL;
                if (copy_from_user(&mfc,optval, sizeof(mfc)))
                        return -EFAULT;
                rtnl_lock();
                if (optname==MRT_DEL_MFC)
                        ret = ipmr_mfc_delete(&mfc);
                else
                        ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
                rtnl_unlock();
                return ret;
                /*
                 *      Control PIM assert.
                 */
        case MRT_ASSERT:
        {
                int v;
                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                mroute_do_assert=(v)?1:0;
                return 0;
        }
#ifdef CONFIG_IP_PIMSM
        case MRT_PIM:
        {
                int v;

                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                v = (v) ? 1 : 0;

                rtnl_lock();
                ret = 0;
                if (v != mroute_do_pim) {
                        mroute_do_pim = v;
                        mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
                        if (mroute_do_pim)
                                ret = inet_add_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        else
                                ret = inet_del_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        if (ret < 0)
                                ret = -EAGAIN;
#endif
                }
                rtnl_unlock();
                return ret;
        }
#endif
        /*
         *      Spurious command, or MRT_VERSION which you cannot
         *      set.
         */
        default:
                return -ENOPROTOOPT;
        }
}

/*
 *      Getsockopt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
{
        int olr;
        int val;

        if (optname!=MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
           optname!=MRT_PIM &&
#endif
           optname!=MRT_ASSERT)
                return -ENOPROTOOPT;

        if (get_user(olr, optlen))
                return -EFAULT;

        if (olr < 0)
                return -EINVAL;
        olr = min_t(unsigned int, olr, sizeof(int));

        if (put_user(olr,optlen))
                return -EFAULT;
        if (optname==MRT_VERSION)
                val=0x0305;
#ifdef CONFIG_IP_PIMSM
        else if (optname==MRT_PIM)
                val=mroute_do_pim;
#endif
        else
                val=mroute_do_assert;
        if (copy_to_user(optval,&val,olr))
                return -EFAULT;
        return 0;
}

/*
 *      The IP multicast ioctl support routines.
 */

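/*
 * Userspace sketch (not part of this file): per-VIF counters can be
 * read back with SIOCGETVIFCNT on the mroute socket, for example:
 *
 *      struct sioc_vif_req vr = { .vifi = 0 };
 *      if (ioctl(s, SIOCGETVIFCNT, &vr) == 0)
 *              printf("%lu pkts in\n", vr.icount);
 *
 * SIOCGETSGCNT works the same way with a struct sioc_sg_req keyed by
 * (src, grp).
 */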
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
        struct sioc_sg_req sr;
        struct sioc_vif_req vr;
        struct vif_device *vif;
        struct mfc_cache *c;

        switch (cmd) {
        case SIOCGETVIFCNT:
                if (copy_from_user(&vr,arg,sizeof(vr)))
                        return -EFAULT;
                if (vr.vifi>=maxvif)
                        return -EINVAL;
                read_lock(&mrt_lock);
                vif=&vif_table[vr.vifi];
                if (VIF_EXISTS(vr.vifi))        {
                        vr.icount=vif->pkt_in;
                        vr.ocount=vif->pkt_out;
                        vr.ibytes=vif->bytes_in;
                        vr.obytes=vif->bytes_out;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg,&vr,sizeof(vr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        case SIOCGETSGCNT:
                if (copy_from_user(&sr,arg,sizeof(sr)))
                        return -EFAULT;

                read_lock(&mrt_lock);
                c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
                if (c) {
                        sr.pktcnt = c->mfc_un.res.pkt;
                        sr.bytecnt = c->mfc_un.res.bytes;
                        sr.wrong_if = c->mfc_un.res.wrong_if;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg,&sr,sizeof(sr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        default:
                return -ENOIOCTLCMD;
        }
}


static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct net_device *dev = ptr;
        struct vif_device *v;
        int ct;

        if (dev_net(dev) != &init_net)
                return NOTIFY_DONE;

        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;
        v=&vif_table[0];
        for (ct=0;ct<maxvif;ct++,v++) {
                if (v->dev==dev)
                        vif_delete(ct);
        }
        return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier={
        .notifier_call = ipmr_device_event,
};

/*
 *      Encapsulate a packet by attaching a valid IPIP header to it.
 *      This avoids tunnel drivers and other mess and gives us the speed so
 *      important for multicast video.
 */

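/*
 * The outer header built below is a minimal 20-byte IPv4 header
 * (ihl = 5, protocol = IPPROTO_IPIP); TOS and TTL are copied from the
 * inner header, so the encapsulated hop behaves like the original
 * datagram would have.
 */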
static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct iphdr *iph;
        struct iphdr *old_iph = ip_hdr(skb);

        skb_push(skb, sizeof(struct iphdr));
        skb->transport_header = skb->network_header;
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);

        iph->version    =       4;
        iph->tos        =       old_iph->tos;
        iph->ttl        =       old_iph->ttl;
        iph->frag_off   =       0;
        iph->daddr      =       daddr;
        iph->saddr      =       saddr;
        iph->protocol   =       IPPROTO_IPIP;
        iph->ihl        =       5;
        iph->tot_len    =       htons(skb->len);
        ip_select_ident(iph, skb->dst, NULL);
        ip_send_check(iph);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        nf_reset(skb);
}

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
        struct ip_options * opt = &(IPCB(skb)->opt);

        IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

        if (unlikely(opt->optlen))
                ip_forward_options(skb);

        return dst_output(skb);
}

/*
 *      Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct vif_device *vif = &vif_table[vifi];
        struct net_device *dev;
        struct rtable *rt;
        int    encap = 0;

        if (vif->dev == NULL)
                goto out_free;

#ifdef CONFIG_IP_PIMSM
        if (vif->flags & VIFF_REGISTER) {
                vif->pkt_out++;
                vif->bytes_out+=skb->len;
                vif->dev->stats.tx_bytes += skb->len;
                vif->dev->stats.tx_packets++;
                ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
                kfree_skb(skb);
                return;
        }
#endif

        if (vif->flags&VIFF_TUNNEL) {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = vif->remote,
                                                .saddr = vif->local,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&init_net, &rt, &fl))
                        goto out_free;
                encap = sizeof(struct iphdr);
        } else {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = iph->daddr,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&init_net, &rt, &fl))
                        goto out_free;
        }

        dev = rt->u.dst.dev;

        if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
                /* Do not fragment multicasts. Alas, IPv4 does not
                   allow us to send ICMP here, so such packets simply
                   disappear into a black hole.
                 */

                IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                ip_rt_put(rt);
                goto out_free;
        }

        encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

        if (skb_cow(skb, encap)) {
                ip_rt_put(rt);
                goto out_free;
        }

        vif->pkt_out++;
        vif->bytes_out+=skb->len;

        dst_release(skb->dst);
        skb->dst = &rt->u.dst;
        ip_decrease_ttl(ip_hdr(skb));

        /* FIXME: forward and output firewalls used to be called here.
         * What do we do with netfilter? -- RR */
        if (vif->flags & VIFF_TUNNEL) {
                ip_encap(skb, vif->local, vif->remote);
                /* FIXME: extra output firewall step used to be here. --RR */
                vif->dev->stats.tx_packets++;
                vif->dev->stats.tx_bytes += skb->len;
        }

        IPCB(skb)->flags |= IPSKB_FORWARDED;

        /*
         * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
         * locally not only before forwarding, but also after forwarding on
         * all output interfaces. Clearly, if the mrouter runs a multicasting
         * program, that program should receive packets regardless of the
         * interface it is joined on.
         * If we did not do this, the program would have to join on all
         * interfaces. On the other hand, a multihomed host (or router, but
         * not an mrouter) cannot join on more than one interface - doing so
         * would result in receiving multiple packets.
         */
        NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;

out_free:
        kfree_skb(skb);
        return;
}

static int ipmr_find_vif(struct net_device *dev)
{
        int ct;
        for (ct=maxvif-1; ct>=0; ct--) {
                if (vif_table[ct].dev == dev)
                        break;
        }
        return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
        int psend = -1;
        int vif, ct;

        vif = cache->mfc_parent;
        cache->mfc_un.res.pkt++;
        cache->mfc_un.res.bytes += skb->len;

        /*
         * Wrong interface: drop packet and (maybe) send PIM assert.
         */
        if (vif_table[vif].dev != skb->dev) {
                int true_vifi;

                if (skb->rtable->fl.iif == 0) {
                        /* It is our own packet, looped back.
                           A very complicated situation...

                           The best workaround, until the routing daemons
                           are fixed, is not to redistribute a packet if it
                           was sent through the wrong interface. This means
                           that multicast applications WILL NOT work for
                           (S,G) entries whose default multicast route points
                           to the wrong oif. In any case, it is not a good
                           idea to run multicasting applications on a router.
                         */
                        goto dont_forward;
                }

                cache->mfc_un.res.wrong_if++;
                true_vifi = ipmr_find_vif(skb->dev);

                if (true_vifi >= 0 && mroute_do_assert &&
                    /* PIM-SM uses asserts when switching from RPT to SPT,
                       so we cannot check that the packet arrived on an oif.
                       That is bad, but otherwise we would need to move a
                       pretty large chunk of pimd into the kernel. Ouch... --ANK
                     */
                    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
                    time_after(jiffies,
                               cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
                        cache->mfc_un.res.last_assert = jiffies;
                        ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
                }
                goto dont_forward;
        }

        vif_table[vif].pkt_in++;
        vif_table[vif].bytes_in+=skb->len;

        /*
         *      Forward the frame
         */
        for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
                if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
                        if (psend != -1) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        ipmr_queue_xmit(skb2, cache, psend);
                        }
                        psend=ct;
                }
        }
        if (psend != -1) {
                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (skb2)
                                ipmr_queue_xmit(skb2, cache, psend);
                } else {
                        ipmr_queue_xmit(skb, cache, psend);
                        return 0;
                }
        }

dont_forward:
        if (!local)
                kfree_skb(skb);
        return 0;
}


/*
 *      Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
        struct mfc_cache *cache;
        int local = skb->rtable->rt_flags&RTCF_LOCAL;

        /* The packet is looped back after forwarding; it should not be
           forwarded a second time, but it can still be delivered locally.
         */
        if (IPCB(skb)->flags&IPSKB_FORWARDED)
                goto dont_forward;

        if (!local) {
                    if (IPCB(skb)->opt.router_alert) {
                            if (ip_call_ra_chain(skb))
                                    return 0;
                    } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
                            /* IGMPv1 (and broken IGMPv2 implementations such as
                               Cisco IOS <= 11.2(8)) do not put the router alert
                               option in IGMP packets destined for routable
                               groups. This is very bad, because it means
                               that we can forward NO IGMP messages.
                             */
                            read_lock(&mrt_lock);
                            if (mroute_socket) {
                                    nf_reset(skb);
                                    raw_rcv(mroute_socket, skb);
                                    read_unlock(&mrt_lock);
                                    return 0;
                            }
                            read_unlock(&mrt_lock);
                    }
        }

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);

        /*
         *      No usable cache entry
         */
        if (cache==NULL) {
                int vif;

                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        ip_local_deliver(skb);
                        if (skb2 == NULL) {
                                read_unlock(&mrt_lock);
                                return -ENOBUFS;
                        }
                        skb = skb2;
                }

                vif = ipmr_find_vif(skb->dev);
                if (vif >= 0) {
                        int err = ipmr_cache_unresolved(vif, skb);
                        read_unlock(&mrt_lock);

                        return err;
                }
                read_unlock(&mrt_lock);
                kfree_skb(skb);
                return -ENODEV;
        }

        ip_mr_forward(skb, cache, local);

        read_unlock(&mrt_lock);

        if (local)
                return ip_local_deliver(skb);

        return 0;

dont_forward:
        if (local)
                return ip_local_deliver(skb);
        kfree_skb(skb);
        return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 */

int pim_rcv_v1(struct sk_buff * skb)
{
        struct igmphdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = igmp_hdr(skb);

        if (!mroute_do_pim ||
            skb->len < sizeof(*pim) + sizeof(*encap) ||
            pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
                goto drop;

        encap = (struct iphdr *)(skb_transport_header(skb) +
                                 sizeof(struct igmphdr));
        /*
           Check that:
           a. packet is really destined to a multicast group
           b. packet is not a NULL-REGISTER
           c. packet is not truncated
         */
1457         if (!ipv4_is_multicast(encap->daddr) ||
1458             encap->tot_len == 0 ||
1459             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1460                 goto drop;
1461
1462         read_lock(&mrt_lock);
1463         if (reg_vif_num >= 0)
1464                 reg_dev = vif_table[reg_vif_num].dev;
1465         if (reg_dev)
1466                 dev_hold(reg_dev);
1467         read_unlock(&mrt_lock);
1468
1469         if (reg_dev == NULL)
1470                 goto drop;
1471
1472         skb->mac_header = skb->network_header;
1473         skb_pull(skb, (u8*)encap - skb->data);
1474         skb_reset_network_header(skb);
1475         skb->dev = reg_dev;
1476         skb->protocol = htons(ETH_P_IP);
1477         skb->ip_summed = 0;
1478         skb->pkt_type = PACKET_HOST;
1479         dst_release(skb->dst);
1480         skb->dst = NULL;
1481         reg_dev->stats.rx_bytes += skb->len;
1482         reg_dev->stats.rx_packets++;
1483         nf_reset(skb);
1484         netif_rx(skb);
1485         dev_put(reg_dev);
1486         return 0;
1487  drop:
1488         kfree_skb(skb);
1489         return 0;
1490 }
1491 #endif
1492
1493 #ifdef CONFIG_IP_PIMSM_V2
1494 static int pim_rcv(struct sk_buff * skb)
1495 {
1496         struct pimreghdr *pim;
1497         struct iphdr   *encap;
1498         struct net_device  *reg_dev = NULL;
1499
1500         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1501                 goto drop;
1502
1503         pim = (struct pimreghdr *)skb_transport_header(skb);
1504         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1505             (pim->flags&PIM_NULL_REGISTER) ||
1506             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1507              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1508                 goto drop;
1509
1510         /* check if the inner packet is destined to mcast group */
1511         encap = (struct iphdr *)(skb_transport_header(skb) +
1512                                  sizeof(struct pimreghdr));
1513         if (!ipv4_is_multicast(encap->daddr) ||
1514             encap->tot_len == 0 ||
1515             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1516                 goto drop;
1517
1518         read_lock(&mrt_lock);
1519         if (reg_vif_num >= 0)
1520                 reg_dev = vif_table[reg_vif_num].dev;
1521         if (reg_dev)
1522                 dev_hold(reg_dev);
1523         read_unlock(&mrt_lock);
1524
1525         if (reg_dev == NULL)
1526                 goto drop;
1527
1528         skb->mac_header = skb->network_header;
1529         skb_pull(skb, (u8*)encap - skb->data);
1530         skb_reset_network_header(skb);
1531         skb->dev = reg_dev;
1532         skb->protocol = htons(ETH_P_IP);
1533         skb->ip_summed = 0;
1534         skb->pkt_type = PACKET_HOST;
1535         dst_release(skb->dst);
1536         reg_dev->stats.rx_bytes += skb->len;
1537         reg_dev->stats.rx_packets++;
1538         skb->dst = NULL;
1539         nf_reset(skb);
1540         netif_rx(skb);
1541         dev_put(reg_dev);
1542         return 0;
1543  drop:
1544         kfree_skb(skb);
1545         return 0;
1546 }
1547 #endif
1548
1549 static int
1550 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1551 {
1552         int ct;
1553         struct rtnexthop *nhp;
1554         struct net_device *dev = vif_table[c->mfc_parent].dev;
1555         u8 *b = skb_tail_pointer(skb);
1556         struct rtattr *mp_head;
1557
1558         if (dev)
1559                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1560
1561         mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1562
1563         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1564                 if (c->mfc_un.res.ttls[ct] < 255) {
1565                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1566                                 goto rtattr_failure;
1567                         nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1568                         nhp->rtnh_flags = 0;
1569                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1570                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1571                         nhp->rtnh_len = sizeof(*nhp);
1572                 }
1573         }
1574         mp_head->rta_type = RTA_MULTIPATH;
1575         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1576         rtm->rtm_type = RTN_MULTICAST;
1577         return 1;
1578
1579 rtattr_failure:
1580         nlmsg_trim(skb, b);
1581         return -EMSGSIZE;
1582 }
1583
1584 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1585 {
1586         int err;
1587         struct mfc_cache *cache;
1588         struct rtable *rt = skb->rtable;
1589
1590         read_lock(&mrt_lock);
1591         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1592
1593         if (cache==NULL) {
1594                 struct sk_buff *skb2;
1595                 struct iphdr *iph;
1596                 struct net_device *dev;
1597                 int vif;
1598
1599                 if (nowait) {
1600                         read_unlock(&mrt_lock);
1601                         return -EAGAIN;
1602                 }
1603
1604                 dev = skb->dev;
1605                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1606                         read_unlock(&mrt_lock);
1607                         return -ENODEV;
1608                 }
1609                 skb2 = skb_clone(skb, GFP_ATOMIC);
1610                 if (!skb2) {
1611                         read_unlock(&mrt_lock);
1612                         return -ENOMEM;
1613                 }
1614
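                /*
                 * Build a minimal IP header on the clone so that it can
                 * be queued on the unresolved list and reported to the
                 * multicast routing daemon; version 0 presumably marks
                 * the header as synthetic rather than a real packet.
                 */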
1615                 skb_push(skb2, sizeof(struct iphdr));
1616                 skb_reset_network_header(skb2);
1617                 iph = ip_hdr(skb2);
1618                 iph->ihl = sizeof(struct iphdr) >> 2;
1619                 iph->saddr = rt->rt_src;
1620                 iph->daddr = rt->rt_dst;
1621                 iph->version = 0;
1622                 err = ipmr_cache_unresolved(vif, skb2);
1623                 read_unlock(&mrt_lock);
1624                 return err;
1625         }
1626
1627         if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
1628                 cache->mfc_flags |= MFC_NOTIFY;
1629         err = ipmr_fill_mroute(skb, cache, rtm);
1630         read_unlock(&mrt_lock);
1631         return err;
1632 }
1633
1634 #ifdef CONFIG_PROC_FS
1635 /*
1636  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1637  */
1638 struct ipmr_vif_iter {
1639         int ct;
1640 };
1641
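/*
 * seq_file iterator over the vif table: only slots with a device
 * attached (VIF_EXISTS()) are visited, and the whole walk runs under
 * read_lock(&mrt_lock).  SEQ_START_TOKEN makes ->show() emit the
 * header row first.
 */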
1642 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1643                                            loff_t pos)
1644 {
1645         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1646                 if (!VIF_EXISTS(iter->ct))
1647                         continue;
1648                 if (pos-- == 0)
1649                         return &vif_table[iter->ct];
1650         }
1651         return NULL;
1652 }
1653
1654 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1655         __acquires(mrt_lock)
1656 {
1657         read_lock(&mrt_lock);
1658         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1659                 : SEQ_START_TOKEN;
1660 }
1661
1662 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1663 {
1664         struct ipmr_vif_iter *iter = seq->private;
1665
1666         ++*pos;
1667         if (v == SEQ_START_TOKEN)
1668                 return ipmr_vif_seq_idx(iter, 0);
1669
1670         while (++iter->ct < maxvif) {
1671                 if (!VIF_EXISTS(iter->ct))
1672                         continue;
1673                 return &vif_table[iter->ct];
1674         }
1675         return NULL;
1676 }
1677
1678 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1679         __releases(mrt_lock)
1680 {
1681         read_unlock(&mrt_lock);
1682 }
1683
1684 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1685 {
1686         if (v == SEQ_START_TOKEN) {
1687                 seq_puts(seq,
1688                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1689         } else {
1690                 const struct vif_device *vif = v;
1691                 const char *name = vif->dev ? vif->dev->name : "none";
1692
1693                 seq_printf(seq,
1694                            "%2zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1695                            vif - vif_table,
1696                            name, vif->bytes_in, vif->pkt_in,
1697                            vif->bytes_out, vif->pkt_out,
1698                            vif->flags, vif->local, vif->remote);
1699         }
1700         return 0;
1701 }
1702
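/*
 * Hypothetical sample of the resulting /proc/net/ip_mr_vif output
 * (values invented for illustration; Local/Remote are raw hex words):
 *
 * Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *  0 eth0           1500      10      3000      20 00000 C0A80101 00000000
 */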
1703 static const struct seq_operations ipmr_vif_seq_ops = {
1704         .start = ipmr_vif_seq_start,
1705         .next  = ipmr_vif_seq_next,
1706         .stop  = ipmr_vif_seq_stop,
1707         .show  = ipmr_vif_seq_show,
1708 };
1709
1710 static int ipmr_vif_open(struct inode *inode, struct file *file)
1711 {
1712         return seq_open_private(file, &ipmr_vif_seq_ops,
1713                         sizeof(struct ipmr_vif_iter));
1714 }
1715
1716 static const struct file_operations ipmr_vif_fops = {
1717         .owner   = THIS_MODULE,
1718         .open    = ipmr_vif_open,
1719         .read    = seq_read,
1720         .llseek  = seq_lseek,
1721         .release = seq_release_private,
1722 };
1723
1724 struct ipmr_mfc_iter {
1725         struct mfc_cache **cache;
1726         int ct;
1727 };
1728
1729
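/*
 * it->cache doubles as a lock-state marker: mfc_cache_array means
 * mrt_lock is held, &mfc_unres_queue means mfc_unres_lock is held,
 * and NULL means no lock.  ipmr_mfc_seq_stop() relies on this to
 * release whichever lock the iterator still holds.
 */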
1730 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1731 {
1732         struct mfc_cache *mfc;
1733
1734         it->cache = mfc_cache_array;
1735         read_lock(&mrt_lock);
1736         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1737                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1738                         if (pos-- == 0)
1739                                 return mfc;
1740         read_unlock(&mrt_lock);
1741
1742         it->cache = &mfc_unres_queue;
1743         spin_lock_bh(&mfc_unres_lock);
1744         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1745                 if (pos-- == 0)
1746                         return mfc;
1747         spin_unlock_bh(&mfc_unres_lock);
1748
1749         it->cache = NULL;
1750         return NULL;
1751 }
1752
1753
1754 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1755 {
1756         struct ipmr_mfc_iter *it = seq->private;
1757         it->cache = NULL;
1758         it->ct = 0;
1759         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1760                 : SEQ_START_TOKEN;
1761 }
1762
1763 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1764 {
1765         struct mfc_cache *mfc = v;
1766         struct ipmr_mfc_iter *it = seq->private;
1767
1768         ++*pos;
1769
1770         if (v == SEQ_START_TOKEN)
1771                 return ipmr_mfc_seq_idx(seq->private, 0);
1772
1773         if (mfc->next)
1774                 return mfc->next;
1775
1776         if (it->cache == &mfc_unres_queue)
1777                 goto end_of_list;
1778
1779         BUG_ON(it->cache != mfc_cache_array);
1780
1781         while (++it->ct < MFC_LINES) {
1782                 mfc = mfc_cache_array[it->ct];
1783                 if (mfc)
1784                         return mfc;
1785         }
1786
1787         /* exhausted cache_array, show unresolved */
1788         read_unlock(&mrt_lock);
1789         it->cache = &mfc_unres_queue;
1790         it->ct = 0;
1791
1792         spin_lock_bh(&mfc_unres_lock);
1793         mfc = mfc_unres_queue;
1794         if (mfc)
1795                 return mfc;
1796
1797  end_of_list:
1798         spin_unlock_bh(&mfc_unres_lock);
1799         it->cache = NULL;
1800
1801         return NULL;
1802 }
1803
1804 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1805 {
1806         struct ipmr_mfc_iter *it = seq->private;
1807
1808         if (it->cache == &mfc_unres_queue)
1809                 spin_unlock_bh(&mfc_unres_lock);
1810         else if (it->cache == mfc_cache_array)
1811                 read_unlock(&mrt_lock);
1812 }
1813
1814 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1815 {
1816         int n;
1817
1818         if (v == SEQ_START_TOKEN) {
1819                 seq_puts(seq,
1820                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1821         } else {
1822                 const struct mfc_cache *mfc = v;
1823                 const struct ipmr_mfc_iter *it = seq->private;
1824
1825                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1826                            (unsigned long) mfc->mfc_mcastgrp,
1827                            (unsigned long) mfc->mfc_origin,
1828                            mfc->mfc_parent,
1829                            mfc->mfc_un.res.pkt,
1830                            mfc->mfc_un.res.bytes,
1831                            mfc->mfc_un.res.wrong_if);
1832
1833                 if (it->cache != &mfc_unres_queue) {
1834                         for (n = mfc->mfc_un.res.minvif;
1835                              n < mfc->mfc_un.res.maxvif; n++) {
1836                                 if (VIF_EXISTS(n) &&
1837                                     mfc->mfc_un.res.ttls[n] < 255)
1838                                         seq_printf(seq,
1839                                                    " %2d:%-3d",
1840                                                    n, mfc->mfc_un.res.ttls[n]);
1841                         }
1842                 }
1843                 seq_putc(seq, '\n');
1844         }
1845         return 0;
1846 }
1847
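/*
 * Hypothetical sample of /proc/net/ip_mr_cache output (values invented;
 * Group/Origin are printed as raw hex words):
 *
 * Group    Origin   Iif     Pkts    Bytes    Wrong Oifs
 * 010101E0 0101A8C0 1          5     4200        0  2:1    3:1
 */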
1848 static const struct seq_operations ipmr_mfc_seq_ops = {
1849         .start = ipmr_mfc_seq_start,
1850         .next  = ipmr_mfc_seq_next,
1851         .stop  = ipmr_mfc_seq_stop,
1852         .show  = ipmr_mfc_seq_show,
1853 };
1854
1855 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1856 {
1857         return seq_open_private(file, &ipmr_mfc_seq_ops,
1858                         sizeof(struct ipmr_mfc_iter));
1859 }
1860
1861 static const struct file_operations ipmr_mfc_fops = {
1862         .owner   = THIS_MODULE,
1863         .open    = ipmr_mfc_open,
1864         .read    = seq_read,
1865         .llseek  = seq_lseek,
1866         .release = seq_release_private,
1867 };
1868 #endif
1869
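/*
 * IPPROTO_PIM handler.  This struct is registered with
 * inet_add_protocol() when userspace enables PIM routing via the
 * MRT_PIM setsockopt handled earlier in this file.
 */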
1870 #ifdef CONFIG_IP_PIMSM_V2
1871 static struct net_protocol pim_protocol = {
1872         .handler        =       pim_rcv,
1873 };
1874 #endif
1875
1876
1877 /*
1878  *      Setup for IP multicast routing
1879  */
1880
1881 int __init ip_mr_init(void)
1882 {
1883         int err;
1884
1885         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1886                                        sizeof(struct mfc_cache),
1887                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1888                                        NULL);
1889         if (!mrt_cachep)
1890                 return -ENOMEM;
1891
1892         setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1893         err = register_netdevice_notifier(&ip_mr_notifier);
1894         if (err)
1895                 goto reg_notif_fail;
1896 #ifdef CONFIG_PROC_FS
1897         err = -ENOMEM;
1898         if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops))
1899                 goto proc_vif_fail;
1900         if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops))
1901                 goto proc_cache_fail;
1902 #endif
1903         return 0;
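/*
 * Unwind in reverse order of the setup above; each label falls
 * through to undo the steps that had already succeeded.
 */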
1904 #ifdef CONFIG_PROC_FS
1905 proc_cache_fail:
1906         proc_net_remove(&init_net, "ip_mr_vif");
1907 proc_vif_fail:
1908         unregister_netdevice_notifier(&ip_mr_notifier);
1909 #endif
1910 reg_notif_fail:
1911         del_timer(&ipmr_expire_timer);  /* harmless if never armed */
1912         kmem_cache_destroy(mrt_cachep);
1913         return err;
1914 }