/*
 *      IP multicast routing support for mrouted 3.6/3.8
 *
 *              (c) 1995 Alan Cox, <alan@redhat.com>
 *        Linux Consultancy and Custom Driver Development
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 *
 *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
 *
 *      Fixes:
 *      Michael Chastain        :       Incorrect size of copying.
 *      Alan Cox                :       Added the cache manager code
 *      Alan Cox                :       Fixed the clone/copy bug and device race.
 *      Mike McLagan            :       Routing by source
 *      Malcolm Beattie         :       Buffer handling fixes.
 *      Alexey Kuznetsov        :       Double buffer free and other fixes.
 *      SVR Anand               :       Fixed several multicast bugs and problems.
 *      Alexey Kuznetsov        :       Status, optimisations and more.
 *      Brad Parker             :       Better behaviour on mrouted upcall
 *                                      overflow.
 *      Carlos Picoto           :       PIMv1 Support
 *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
 *                                      Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

static struct sock *mroute_socket;

/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that changes are serialized via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *      Multicast router control variables
 */

static struct vif_device vif_table[MAXVIFS];            /* Devices              */
static int maxvif;

#define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
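/* A VIF slot is in use iff its dev pointer is set; maxvif is one past
   the highest slot in use, so table scans can stop early. */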

static int mroute_do_assert;                            /* Set in PIM assert    */
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */

static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and protected
   with the weak lock mrt_lock. The queue of unresolved entries is
   protected with the strong spinlock mfc_unres_lock.

   In this case the data path is entirely free of exclusive locks.
 */

static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
        struct net_device  *dev;

        dev = __dev_get_by_name("tunl0");

        if (dev) {
                int err;
                struct ifreq ifr;
                mm_segment_t    oldfs;
                struct ip_tunnel_parm p;
                struct in_device  *in_dev;

                memset(&p, 0, sizeof(p));
                p.iph.daddr = v->vifc_rmt_addr.s_addr;
                p.iph.saddr = v->vifc_lcl_addr.s_addr;
                p.iph.version = 4;
                p.iph.ihl = 5;
                p.iph.protocol = IPPROTO_IPIP;
                sprintf(p.name, "dvmrp%d", v->vifc_vifi);
                ifr.ifr_ifru.ifru_data = (void*)&p;
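
                /* Drive the tunnel driver's SIOCADDTUNNEL ioctl from kernel
                   context: lift the user-space address limit temporarily so
                   the handler accepts our on-stack ip_tunnel_parm. */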

                oldfs = get_fs(); set_fs(KERNEL_DS);
                err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
                set_fs(oldfs);

                dev = NULL;

                if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
                        dev->flags |= IFF_MULTICAST;

                        in_dev = __in_dev_get_rtnl(dev);
                        if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
                                goto failure;
                        in_dev->cnf.rp_filter = 0;

                        if (dev_open(dev))
                                goto failure;
                }
        }
        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}

#ifdef CONFIG_IP_PIMSM

static int reg_vif_num = -1;
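/* Index of the special register VIF that feeds whole packets to the
   PIM daemon; -1 while no such VIF is configured. */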

static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
        read_lock(&mrt_lock);
        ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
        ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
        read_unlock(&mrt_lock);
        kfree_skb(skb);
        return 0;
}

static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
{
        return (struct net_device_stats*)netdev_priv(dev);
}

static void reg_vif_setup(struct net_device *dev)
{
        dev->type               = ARPHRD_PIMREG;
        dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
        dev->flags              = IFF_NOARP;
        dev->hard_start_xmit    = reg_vif_xmit;
        dev->get_stats          = reg_vif_get_stats;
        dev->destructor         = free_netdev;
}

static struct net_device *ipmr_reg_vif(void)
{
        struct net_device *dev;
        struct in_device *in_dev;

        dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
                           reg_vif_setup);

        if (dev == NULL)
                return NULL;

        if (register_netdevice(dev)) {
                free_netdev(dev);
                return NULL;
        }
        dev->iflink = 0;

        if ((in_dev = inetdev_init(dev)) == NULL)
                goto failure;

        in_dev->cnf.rp_filter = 0;

        if (dev_open(dev))
                goto failure;

        return dev;

failure:
        /* allow the register to be completed before unregistering. */
        rtnl_unlock();
        rtnl_lock();

        unregister_netdevice(dev);
        return NULL;
}
#endif

/*
 *      Delete a VIF entry
 */

static int vif_delete(int vifi)
{
        struct vif_device *v;
        struct net_device *dev;
        struct in_device *in_dev;

        if (vifi < 0 || vifi >= maxvif)
                return -EADDRNOTAVAIL;

        v = &vif_table[vifi];

        write_lock_bh(&mrt_lock);
        dev = v->dev;
        v->dev = NULL;

        if (!dev) {
                write_unlock_bh(&mrt_lock);
                return -EADDRNOTAVAIL;
        }

#ifdef CONFIG_IP_PIMSM
        if (vifi == reg_vif_num)
                reg_vif_num = -1;
#endif

        if (vifi+1 == maxvif) {
                int tmp;
                for (tmp=vifi-1; tmp>=0; tmp--) {
                        if (VIF_EXISTS(tmp))
                                break;
                }
                maxvif = tmp+1;
        }

        write_unlock_bh(&mrt_lock);

        dev_set_allmulti(dev, -1);

        if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
                in_dev->cnf.mc_forwarding--;
                ip_rt_multicast_event(in_dev);
        }

        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                unregister_netdevice(dev);

        dev_put(dev);
        return 0;
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        atomic_dec(&cache_resolve_queue_len);

        while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
                        nlh->nlmsg_type = NLMSG_ERROR;
                        nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                        skb_trim(skb, nlh->nlmsg_len);
                        e = NLMSG_DATA(nlh);
                        e->error = -ETIMEDOUT;
                        memset(&e->msg, 0, sizeof(e->msg));

                        rtnl_unicast(skb, NETLINK_CB(skb).pid);
                } else
                        kfree_skb(skb);
        }

        kmem_cache_free(mrt_cachep, c);
}


/* Single timer process for all the unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
        unsigned long now;
        unsigned long expires;
        struct mfc_cache *c, **cp;

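        /* Runs from the timer in softirq context: if user context holds
           mfc_unres_lock, do not spin; re-arm and try again shortly. */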
        if (!spin_trylock(&mfc_unres_lock)) {
                mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
                return;
        }

        if (atomic_read(&cache_resolve_queue_len) == 0)
                goto out;

        now = jiffies;
        expires = 10*HZ;
        cp = &mfc_unres_queue;

        while ((c=*cp) != NULL) {
                if (time_after(c->mfc_un.unres.expires, now)) {
                        unsigned long interval = c->mfc_un.unres.expires - now;
                        if (interval < expires)
                                expires = interval;
                        cp = &c->next;
                        continue;
                }

                *cp = c->next;

                ipmr_destroy_unres(c);
        }

        if (atomic_read(&cache_resolve_queue_len))
                mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
        spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
        int vifi;

        cache->mfc_un.res.minvif = MAXVIFS;
        cache->mfc_un.res.maxvif = 0;
        memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
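        /* A threshold of 255 means "never forward on this VIF"; real
           thresholds are 1..254 and gate forwarding on the packet TTL. */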

        for (vifi=0; vifi<maxvif; vifi++) {
                if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
                        cache->mfc_un.res.ttls[vifi] = ttls[vifi];
                        if (cache->mfc_un.res.minvif > vifi)
                                cache->mfc_un.res.minvif = vifi;
                        if (cache->mfc_un.res.maxvif <= vifi)
                                cache->mfc_un.res.maxvif = vifi + 1;
                }
        }
}

static int vif_add(struct vifctl *vifc, int mrtsock)
{
        int vifi = vifc->vifc_vifi;
        struct vif_device *v = &vif_table[vifi];
        struct net_device *dev;
        struct in_device *in_dev;

        /* Is vif busy ? */
        if (VIF_EXISTS(vifi))
                return -EADDRINUSE;

        switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
        case VIFF_REGISTER:
                /*
                 * Special Purpose VIF in PIM
                 * All the packets will be sent to the daemon
                 */
                if (reg_vif_num >= 0)
                        return -EADDRINUSE;
                dev = ipmr_reg_vif();
                if (!dev)
                        return -ENOBUFS;
                break;
#endif
        case VIFF_TUNNEL:
                dev = ipmr_new_tunnel(vifc);
                if (!dev)
                        return -ENOBUFS;
                break;
        case 0:
                dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
                if (!dev)
                        return -EADDRNOTAVAIL;
                dev_put(dev);
                break;
        default:
                return -EINVAL;
        }

        if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
                return -EADDRNOTAVAIL;
        in_dev->cnf.mc_forwarding++;
        dev_set_allmulti(dev, +1);
        ip_rt_multicast_event(in_dev);

        /*
         *      Fill in the VIF structures
         */
        v->rate_limit=vifc->vifc_rate_limit;
        v->local=vifc->vifc_lcl_addr.s_addr;
        v->remote=vifc->vifc_rmt_addr.s_addr;
        v->flags=vifc->vifc_flags;
        if (!mrtsock)
                v->flags |= VIFF_STATIC;
        v->threshold=vifc->vifc_threshold;
        v->bytes_in = 0;
        v->bytes_out = 0;
        v->pkt_in = 0;
        v->pkt_out = 0;
        v->link = dev->ifindex;
        if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
                v->link = dev->iflink;

        /* And finish update writing critical data */
        write_lock_bh(&mrt_lock);
        dev_hold(dev);
        v->dev=dev;
#ifdef CONFIG_IP_PIMSM
        if (v->flags&VIFF_REGISTER)
                reg_vif_num = vifi;
#endif
        if (vifi+1 > maxvif)
                maxvif = vifi+1;
        write_unlock_bh(&mrt_lock);
        return 0;
}

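/* Look up a resolved (origin, group) entry. Entries are hashed on
   (group, origin); called with mrt_lock held. */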
static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
{
        int line=MFC_HASH(mcastgrp,origin);
        struct mfc_cache *c;

        for (c=mfc_cache_array[line]; c; c = c->next) {
                if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
                        break;
        }
        return c;
}

/*
 *      Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(void)
{
        struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
        if (c==NULL)
                return NULL;
        c->mfc_un.res.minvif = MAXVIFS;
        return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(void)
{
        struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
        if (c==NULL)
                return NULL;
        skb_queue_head_init(&c->mfc_un.unres.unresolved);
        c->mfc_un.unres.expires = jiffies + 10*HZ;
        return c;
}

/*
 *      A cache entry has gone into a resolved state from queued
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
        struct sk_buff *skb;
        struct nlmsgerr *e;

        /*
         *      Play the pending entries through our router
         */

        while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
                if (ip_hdr(skb)->version == 0) {
                        struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

                        if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
                                nlh->nlmsg_len = (skb_tail_pointer(skb) -
                                                  (u8 *)nlh);
                        } else {
                                nlh->nlmsg_type = NLMSG_ERROR;
                                nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
                                skb_trim(skb, nlh->nlmsg_len);
                                e = NLMSG_DATA(nlh);
                                e->error = -EMSGSIZE;
                                memset(&e->msg, 0, sizeof(e->msg));
                        }

                        rtnl_unicast(skb, NETLINK_CB(skb).pid);
                } else
                        ip_mr_forward(skb, c, 0);
        }
}

/*
 *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *      expects the following bizarre scheme.
 *
 *      Called under mrt_lock.
 */

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
        struct sk_buff *skb;
        const int ihl = ip_hdrlen(pkt);
        struct igmphdr *igmp;
        struct igmpmsg *msg;
        int ret;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT)
                skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
        else
#endif
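                /* A small skb suffices for an ordinary upcall: the copied
                   IP header plus a struct igmphdr pseudo-header. */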
                skb = alloc_skb(128, GFP_ATOMIC);

        if (!skb)
                return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
        if (assert == IGMPMSG_WHOLEPKT) {
                /* Ugly, but we have no choice with this interface.
                   Duplicate old header, fix ihl, length etc.
                   And all this only to mangle msg->im_msgtype and
                   to set msg->im_mbz to "mbz" :-)
                 */
                skb_push(skb, sizeof(struct iphdr));
                skb_reset_network_header(skb);
                skb_reset_transport_header(skb);
                msg = (struct igmpmsg *)skb_network_header(skb);
                memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
                msg->im_msgtype = IGMPMSG_WHOLEPKT;
                msg->im_mbz = 0;
                msg->im_vif = reg_vif_num;
                ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
                ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
                                             sizeof(struct iphdr));
        } else
#endif
        {

        /*
         *      Copy the IP header
         */

        skb->network_header = skb->tail;
        skb_put(skb, ihl);
        memcpy(skb->data,pkt->data,ihl);
        ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
        msg = (struct igmpmsg *)skb_network_header(skb);
        msg->im_vif = vifi;
        skb->dst = dst_clone(pkt->dst);

        /*
         *      Add our header
         */

        igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
        igmp->type      =
        msg->im_msgtype = assert;
        igmp->code      =       0;
        ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
        skb->transport_header = skb->network_header;
        }

        if (mroute_socket == NULL) {
                kfree_skb(skb);
                return -EINVAL;
        }

        /*
         *      Deliver to mrouted
         */
        if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
                if (net_ratelimit())
                        printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
                kfree_skb(skb);
        }

        return ret;
}

/*
 *      Queue a packet for resolution. It gets a locked cache entry!
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
        int err;
        struct mfc_cache *c;
        const struct iphdr *iph = ip_hdr(skb);

        spin_lock_bh(&mfc_unres_lock);
        for (c=mfc_unres_queue; c; c=c->next) {
                if (c->mfc_mcastgrp == iph->daddr &&
                    c->mfc_origin == iph->saddr)
                        break;
        }

        if (c == NULL) {
                /*
                 *      Create a new entry if allowable
                 */

                if (atomic_read(&cache_resolve_queue_len)>=10 ||
                    (c=ipmr_cache_alloc_unres())==NULL) {
                        spin_unlock_bh(&mfc_unres_lock);

                        kfree_skb(skb);
                        return -ENOBUFS;
                }

                /*
                 *      Fill in the new cache entry
                 */
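                /* mfc_parent stays -1 (unresolved) until the daemon answers
                   the upcall with MRT_ADD_MFC. */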
                c->mfc_parent   = -1;
                c->mfc_origin   = iph->saddr;
                c->mfc_mcastgrp = iph->daddr;

                /*
                 *      Reflect first query at mrouted.
                 */
                if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
                        /* If the report failed throw the cache entry
                           out - Brad Parker
                         */
                        spin_unlock_bh(&mfc_unres_lock);

                        kmem_cache_free(mrt_cachep, c);
                        kfree_skb(skb);
                        return err;
                }

                atomic_inc(&cache_resolve_queue_len);
                c->next = mfc_unres_queue;
                mfc_unres_queue = c;

                mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
        }

        /*
         *      See if we can append the packet
         */
        if (c->mfc_un.unres.unresolved.qlen>3) {
                kfree_skb(skb);
                err = -ENOBUFS;
        } else {
                skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
                err = 0;
        }

        spin_unlock_bh(&mfc_unres_lock);
        return err;
}

/*
 *      MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mfcctl *mfc)
{
        int line;
        struct mfc_cache *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                        return 0;
                }
        }
        return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
        int line;
        struct mfc_cache *uc, *c, **cp;

        line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

        for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
                if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
                    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
                        break;
        }

        if (c != NULL) {
                write_lock_bh(&mrt_lock);
                c->mfc_parent = mfc->mfcc_parent;
                ipmr_update_thresholds(c, mfc->mfcc_ttls);
                if (!mrtsock)
                        c->mfc_flags |= MFC_STATIC;
                write_unlock_bh(&mrt_lock);
                return 0;
        }

        if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
                return -EINVAL;

        c=ipmr_cache_alloc();
        if (c==NULL)
                return -ENOMEM;

        c->mfc_origin=mfc->mfcc_origin.s_addr;
        c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
        c->mfc_parent=mfc->mfcc_parent;
        ipmr_update_thresholds(c, mfc->mfcc_ttls);
        if (!mrtsock)
                c->mfc_flags |= MFC_STATIC;

        write_lock_bh(&mrt_lock);
        c->next = mfc_cache_array[line];
        mfc_cache_array[line] = c;
        write_unlock_bh(&mrt_lock);

        /*
         *      Check to see if we resolved a queued list. If so we
         *      need to send on the frames and tidy up.
         */
        spin_lock_bh(&mfc_unres_lock);
        for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
             cp = &uc->next) {
                if (uc->mfc_origin == c->mfc_origin &&
                    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
                        *cp = uc->next;
                        if (atomic_dec_and_test(&cache_resolve_queue_len))
                                del_timer(&ipmr_expire_timer);
                        break;
                }
        }
        spin_unlock_bh(&mfc_unres_lock);

        if (uc) {
                ipmr_cache_resolve(uc, c);
                kmem_cache_free(mrt_cachep, uc);
        }
        return 0;
}

/*
 *      Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
        int i;

        /*
         *      Shut down all active vif entries
         */
        for (i=0; i<maxvif; i++) {
                if (!(vif_table[i].flags&VIFF_STATIC))
                        vif_delete(i);
        }

        /*
         *      Wipe the cache
         */
        for (i=0;i<MFC_LINES;i++) {
                struct mfc_cache *c, **cp;

                cp = &mfc_cache_array[i];
                while ((c = *cp) != NULL) {
                        if (c->mfc_flags&MFC_STATIC) {
                                cp = &c->next;
                                continue;
                        }
                        write_lock_bh(&mrt_lock);
                        *cp = c->next;
                        write_unlock_bh(&mrt_lock);

                        kmem_cache_free(mrt_cachep, c);
                }
        }

        if (atomic_read(&cache_resolve_queue_len) != 0) {
                struct mfc_cache *c;

                spin_lock_bh(&mfc_unres_lock);
                while (mfc_unres_queue != NULL) {
                        c = mfc_unres_queue;
                        mfc_unres_queue = c->next;
                        spin_unlock_bh(&mfc_unres_lock);

                        ipmr_destroy_unres(c);

                        spin_lock_bh(&mfc_unres_lock);
                }
                spin_unlock_bh(&mfc_unres_lock);
        }
}

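/* Destructor installed via ip_ra_control(): runs when the mroute socket
   goes away and tears down all non-static VIFs and cache entries. */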
static void mrtsock_destruct(struct sock *sk)
{
        rtnl_lock();
        if (sk == mroute_socket) {
                ipv4_devconf.mc_forwarding--;

                write_lock_bh(&mrt_lock);
                mroute_socket=NULL;
                write_unlock_bh(&mrt_lock);

                mroute_clean_tables(sk);
        }
        rtnl_unlock();
}

/*
 *      Socket options and virtual interface manipulation. The whole
 *      virtual interface system is a complete heap, but unfortunately
 *      that's how BSD mrouted happens to think. Maybe one day with a proper
 *      MOSPF/PIM router set up we can clean this up.
 */
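
/*
 *      For orientation only - a hypothetical sketch (error handling and
 *      most vifctl fields omitted) of how a daemon drives this interface
 *      from user space:
 *
 *              int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *              int one = 1;
 *              struct vifctl vc = { .vifc_vifi = 0, .vifc_threshold = 1 };
 *
 *              setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *              setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *              ... read IGMPMSG_* upcalls from s, answer with MRT_ADD_MFC ...
 *              setsockopt(s, IPPROTO_IP, MRT_DONE, &one, sizeof(one));
 */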

int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
{
        int ret;
        struct vifctl vif;
        struct mfcctl mfc;

        if (optname != MRT_INIT) {
                if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
                        return -EACCES;
        }

        switch (optname) {
        case MRT_INIT:
                if (sk->sk_type != SOCK_RAW ||
                    inet_sk(sk)->num != IPPROTO_IGMP)
                        return -EOPNOTSUPP;
                if (optlen!=sizeof(int))
                        return -ENOPROTOOPT;

                rtnl_lock();
                if (mroute_socket) {
                        rtnl_unlock();
                        return -EADDRINUSE;
                }

                ret = ip_ra_control(sk, 1, mrtsock_destruct);
                if (ret == 0) {
                        write_lock_bh(&mrt_lock);
                        mroute_socket=sk;
                        write_unlock_bh(&mrt_lock);

                        ipv4_devconf.mc_forwarding++;
                }
                rtnl_unlock();
                return ret;
        case MRT_DONE:
                if (sk!=mroute_socket)
                        return -EACCES;
                return ip_ra_control(sk, 0, NULL);
        case MRT_ADD_VIF:
        case MRT_DEL_VIF:
                if (optlen!=sizeof(vif))
                        return -EINVAL;
                if (copy_from_user(&vif,optval,sizeof(vif)))
                        return -EFAULT;
                if (vif.vifc_vifi >= MAXVIFS)
                        return -ENFILE;
                rtnl_lock();
                if (optname==MRT_ADD_VIF) {
                        ret = vif_add(&vif, sk==mroute_socket);
                } else {
                        ret = vif_delete(vif.vifc_vifi);
                }
                rtnl_unlock();
                return ret;

                /*
                 *      Manipulate the forwarding caches. These live
                 *      in a sort of kernel/user symbiosis.
                 */
        case MRT_ADD_MFC:
        case MRT_DEL_MFC:
                if (optlen!=sizeof(mfc))
                        return -EINVAL;
                if (copy_from_user(&mfc,optval, sizeof(mfc)))
                        return -EFAULT;
                rtnl_lock();
                if (optname==MRT_DEL_MFC)
                        ret = ipmr_mfc_delete(&mfc);
                else
                        ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
                rtnl_unlock();
                return ret;
                /*
                 *      Control PIM assert.
                 */
        case MRT_ASSERT:
        {
                int v;
                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                mroute_do_assert=(v)?1:0;
                return 0;
        }
#ifdef CONFIG_IP_PIMSM
        case MRT_PIM:
        {
                int v, ret;
                if (get_user(v,(int __user *)optval))
                        return -EFAULT;
                v = (v)?1:0;
                rtnl_lock();
                ret = 0;
                if (v != mroute_do_pim) {
                        mroute_do_pim = v;
                        mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
                        if (mroute_do_pim)
                                ret = inet_add_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        else
                                ret = inet_del_protocol(&pim_protocol,
                                                        IPPROTO_PIM);
                        if (ret < 0)
                                ret = -EAGAIN;
#endif
                }
                rtnl_unlock();
                return ret;
        }
#endif
        /*
         *      Spurious command, or MRT_VERSION which you cannot
         *      set.
         */
        default:
                return -ENOPROTOOPT;
        }
}

/*
 *      Getsock opt support for the multicast routing system.
 */

int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
{
        int olr;
        int val;

        if (optname!=MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
           optname!=MRT_PIM &&
#endif
           optname!=MRT_ASSERT)
                return -ENOPROTOOPT;

        if (get_user(olr, optlen))
                return -EFAULT;

        olr = min_t(unsigned int, olr, sizeof(int));
        if (olr < 0)
                return -EINVAL;

        if (put_user(olr,optlen))
                return -EFAULT;
        if (optname==MRT_VERSION)
                val=0x0305;
#ifdef CONFIG_IP_PIMSM
        else if (optname==MRT_PIM)
                val=mroute_do_pim;
#endif
        else
                val=mroute_do_assert;
        if (copy_to_user(optval,&val,olr))
                return -EFAULT;
        return 0;
}

/*
 *      The IP multicast ioctl support routines.
 */

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
        struct sioc_sg_req sr;
        struct sioc_vif_req vr;
        struct vif_device *vif;
        struct mfc_cache *c;

        switch (cmd) {
        case SIOCGETVIFCNT:
                if (copy_from_user(&vr,arg,sizeof(vr)))
                        return -EFAULT;
                if (vr.vifi>=maxvif)
                        return -EINVAL;
                read_lock(&mrt_lock);
                vif=&vif_table[vr.vifi];
                if (VIF_EXISTS(vr.vifi)) {
                        vr.icount=vif->pkt_in;
                        vr.ocount=vif->pkt_out;
                        vr.ibytes=vif->bytes_in;
                        vr.obytes=vif->bytes_out;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg,&vr,sizeof(vr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        case SIOCGETSGCNT:
                if (copy_from_user(&sr,arg,sizeof(sr)))
                        return -EFAULT;

                read_lock(&mrt_lock);
                c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
                if (c) {
                        sr.pktcnt = c->mfc_un.res.pkt;
                        sr.bytecnt = c->mfc_un.res.bytes;
                        sr.wrong_if = c->mfc_un.res.wrong_if;
                        read_unlock(&mrt_lock);

                        if (copy_to_user(arg,&sr,sizeof(sr)))
                                return -EFAULT;
                        return 0;
                }
                read_unlock(&mrt_lock);
                return -EADDRNOTAVAIL;
        default:
                return -ENOIOCTLCMD;
        }
}

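/* Netdevice notifier: when a device unregisters, drop any VIFs that
   were bound to it so the table never holds a stale pointer. */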
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
        struct vif_device *v;
        int ct;
        if (event != NETDEV_UNREGISTER)
                return NOTIFY_DONE;
        v=&vif_table[0];
        for (ct=0;ct<maxvif;ct++,v++) {
                if (v->dev==ptr)
                        vif_delete(ct);
        }
        return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier={
        .notifier_call = ipmr_device_event,
};

/*
 *      Encapsulate a packet by attaching a valid IPIP header to it.
 *      This avoids tunnel drivers and other mess and gives us the speed so
 *      important for multicast video.
 */

static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
{
        struct iphdr *iph;
        struct iphdr *old_iph = ip_hdr(skb);

        skb_push(skb, sizeof(struct iphdr));
        skb->transport_header = skb->network_header;
        skb_reset_network_header(skb);
        iph = ip_hdr(skb);

        iph->version    =       4;
        iph->tos        =       old_iph->tos;
        iph->ttl        =       old_iph->ttl;
        iph->frag_off   =       0;
        iph->daddr      =       daddr;
        iph->saddr      =       saddr;
        iph->protocol   =       IPPROTO_IPIP;
        iph->ihl        =       5;
        iph->tot_len    =       htons(skb->len);
        ip_select_ident(iph, skb->dst, NULL);
        ip_send_check(iph);

        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
        nf_reset(skb);
}

static inline int ipmr_forward_finish(struct sk_buff *skb)
{
        struct ip_options * opt = &(IPCB(skb)->opt);

        IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);

        if (unlikely(opt->optlen))
                ip_forward_options(skb);

        return dst_output(skb);
}

/*
 *      Processing handlers for ipmr_forward
 */

static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
{
        const struct iphdr *iph = ip_hdr(skb);
        struct vif_device *vif = &vif_table[vifi];
        struct net_device *dev;
        struct rtable *rt;
        int    encap = 0;

        if (vif->dev == NULL)
                goto out_free;

#ifdef CONFIG_IP_PIMSM
        if (vif->flags & VIFF_REGISTER) {
                vif->pkt_out++;
                vif->bytes_out+=skb->len;
                ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
                ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
                ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
                kfree_skb(skb);
                return;
        }
#endif

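        /* Pick the route for this VIF: to the remote tunnel endpoint for
           IPIP tunnels, otherwise straight towards the group address. */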
        if (vif->flags&VIFF_TUNNEL) {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = vif->remote,
                                                .saddr = vif->local,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&rt, &fl))
                        goto out_free;
                encap = sizeof(struct iphdr);
        } else {
                struct flowi fl = { .oif = vif->link,
                                    .nl_u = { .ip4_u =
                                              { .daddr = iph->daddr,
                                                .tos = RT_TOS(iph->tos) } },
                                    .proto = IPPROTO_IPIP };
                if (ip_route_output_key(&rt, &fl))
                        goto out_free;
        }

        dev = rt->u.dst.dev;

        if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
                /* Do not fragment multicasts. Alas, IPv4 does not
                   allow us to send ICMP here, so oversized packets
                   simply disappear into a black hole.
                 */

                IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
                ip_rt_put(rt);
                goto out_free;
        }

        encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;

        if (skb_cow(skb, encap)) {
                ip_rt_put(rt);
                goto out_free;
        }

        vif->pkt_out++;
        vif->bytes_out+=skb->len;

        dst_release(skb->dst);
        skb->dst = &rt->u.dst;
        ip_decrease_ttl(ip_hdr(skb));

        /* FIXME: forward and output firewalls used to be called here.
         * What do we do with netfilter? -- RR */
        if (vif->flags & VIFF_TUNNEL) {
                ip_encap(skb, vif->local, vif->remote);
                /* FIXME: extra output firewall step used to be here. --RR */
                ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
                ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
        }

        IPCB(skb)->flags |= IPSKB_FORWARDED;

        /*
         * RFC 1584 teaches that a DVMRP/PIM router must deliver packets
         * locally not only before forwarding, but also after forwarding on
         * all output interfaces. Clearly, if the mrouter runs a multicast
         * program, it should receive packets regardless of which interface
         * the program has joined on.
         * If we did not do this, the program would have to join on all
         * interfaces. On the other hand, a multihomed host (or router, but
         * not an mrouter) cannot join on more than one interface - it would
         * result in receiving multiple packets.
         */
        NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
                ipmr_forward_finish);
        return;

out_free:
        kfree_skb(skb);
        return;
}

static int ipmr_find_vif(struct net_device *dev)
{
        int ct;
        for (ct=maxvif-1; ct>=0; ct--) {
                if (vif_table[ct].dev == dev)
                        break;
        }
        return ct;
}

/* "local" means that we should preserve one skb (for local delivery) */

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
{
        int psend = -1;
        int vif, ct;

        vif = cache->mfc_parent;
        cache->mfc_un.res.pkt++;
        cache->mfc_un.res.bytes += skb->len;

        /*
         * Wrong interface: drop packet and (maybe) send PIM assert.
         */
        if (vif_table[vif].dev != skb->dev) {
                int true_vifi;

                if (((struct rtable*)skb->dst)->fl.iif == 0) {
                        /* It is our own packet, looped back.
                           Very complicated situation...

                           The best workaround until routing daemons are
                           fixed is not to redistribute a packet if it was
                           sent through the wrong interface. This means that
                           multicast applications WILL NOT work for
                           (S,G) pairs whose default multicast route points
                           to the wrong oif. In any case, it is not a good
                           idea to run multicast applications on a router.
                         */
                        goto dont_forward;
                }

                cache->mfc_un.res.wrong_if++;
                true_vifi = ipmr_find_vif(skb->dev);

                if (true_vifi >= 0 && mroute_do_assert &&
                    /* pimsm uses asserts when switching from RPT to SPT,
                       so we cannot check that the packet arrived on an oif.
                       It is bad, but otherwise we would need to move a pretty
                       large chunk of pimd into the kernel. Ough... --ANK
                     */
                    (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
                    time_after(jiffies,
                               cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
                        cache->mfc_un.res.last_assert = jiffies;
                        ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
                }
                goto dont_forward;
        }

        vif_table[vif].pkt_in++;
        vif_table[vif].bytes_in+=skb->len;

        /*
         *      Forward the frame
         */
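        /* Clone the skb for every output VIF whose TTL threshold the packet
           exceeds, transmitting the previous match each time; the final match
           gets the original skb (or a clone, if a local copy must survive). */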
        for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
                if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
                        if (psend != -1) {
                                struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                                if (skb2)
                                        ipmr_queue_xmit(skb2, cache, psend);
                        }
                        psend=ct;
                }
        }
        if (psend != -1) {
                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        if (skb2)
                                ipmr_queue_xmit(skb2, cache, psend);
                } else {
                        ipmr_queue_xmit(skb, cache, psend);
                        return 0;
                }
        }

dont_forward:
        if (!local)
                kfree_skb(skb);
        return 0;
}


/*
 *      Multicast packets for forwarding arrive here
 */

int ip_mr_input(struct sk_buff *skb)
{
        struct mfc_cache *cache;
        int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;

        /* Packet is looped back after forward; it should not be
           forwarded a second time, but it can still be delivered locally.
         */
        if (IPCB(skb)->flags&IPSKB_FORWARDED)
                goto dont_forward;

        if (!local) {
                if (IPCB(skb)->opt.router_alert) {
                        if (ip_call_ra_chain(skb))
                                return 0;
                } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
                        /* IGMPv1 (and broken IGMPv2 implementations such as
                           Cisco IOS <= 11.2(8)) do not put the router alert
                           option into IGMP packets destined to routable
                           groups. This is very bad, because it means
                           we can forward NO IGMP messages.
                         */
                        read_lock(&mrt_lock);
                        if (mroute_socket) {
                                nf_reset(skb);
                                raw_rcv(mroute_socket, skb);
                                read_unlock(&mrt_lock);
                                return 0;
                        }
                        read_unlock(&mrt_lock);
                }
        }

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);

        /*
         *      No usable cache entry
         */
        if (cache==NULL) {
                int vif;

                if (local) {
                        struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
                        ip_local_deliver(skb);
                        if (skb2 == NULL) {
                                read_unlock(&mrt_lock);
                                return -ENOBUFS;
                        }
                        skb = skb2;
                }

                vif = ipmr_find_vif(skb->dev);
                if (vif >= 0) {
                        int err = ipmr_cache_unresolved(vif, skb);
                        read_unlock(&mrt_lock);

                        return err;
                }
                read_unlock(&mrt_lock);
                kfree_skb(skb);
                return -ENODEV;
        }

        ip_mr_forward(skb, cache, local);

        read_unlock(&mrt_lock);

        if (local)
                return ip_local_deliver(skb);

        return 0;

dont_forward:
        if (local)
                return ip_local_deliver(skb);
        kfree_skb(skb);
        return 0;
}

#ifdef CONFIG_IP_PIMSM_V1
/*
 * Handle IGMP messages of PIMv1
 */

int pim_rcv_v1(struct sk_buff * skb)
{
        struct igmphdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = igmp_hdr(skb);

        if (!mroute_do_pim ||
            skb->len < sizeof(*pim) + sizeof(*encap) ||
            pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
                goto drop;

        encap = (struct iphdr *)(skb_transport_header(skb) +
                                 sizeof(struct igmphdr));
        /*
           Check that:
           a. the packet is really destined to a multicast group
           b. the packet is not a NULL-REGISTER
           c. the packet is not truncated
         */
        if (!MULTICAST(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
                goto drop;

        read_lock(&mrt_lock);
        if (reg_vif_num >= 0)
                reg_dev = vif_table[reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                goto drop;

        skb->mac_header = skb->network_header;
        skb_pull(skb, (u8*)encap - skb->data);
        skb_reset_network_header(skb);
        skb->dev = reg_dev;
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = 0;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        skb->dst = NULL;
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);
        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}
#endif

#ifdef CONFIG_IP_PIMSM_V2
static int pim_rcv(struct sk_buff * skb)
{
        struct pimreghdr *pim;
        struct iphdr   *encap;
        struct net_device  *reg_dev = NULL;

        if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
                goto drop;

        pim = (struct pimreghdr *)skb_transport_header(skb);
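        /* Per the header note, accept a checksum computed over the PIM
           header alone as well as one over the whole packet, to stay
           compatible with older peers. */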
        if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
            (pim->flags&PIM_NULL_REGISTER) ||
            (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
             csum_fold(skb_checksum(skb, 0, skb->len, 0))))
                goto drop;

        /* check if the inner packet is destined to mcast group */
        encap = (struct iphdr *)(skb_transport_header(skb) +
                                 sizeof(struct pimreghdr));
        if (!MULTICAST(encap->daddr) ||
            encap->tot_len == 0 ||
            ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
                goto drop;

        read_lock(&mrt_lock);
        if (reg_vif_num >= 0)
                reg_dev = vif_table[reg_vif_num].dev;
        if (reg_dev)
                dev_hold(reg_dev);
        read_unlock(&mrt_lock);

        if (reg_dev == NULL)
                goto drop;

        skb->mac_header = skb->network_header;
        skb_pull(skb, (u8*)encap - skb->data);
        skb_reset_network_header(skb);
        skb->dev = reg_dev;
        skb->protocol = htons(ETH_P_IP);
        skb->ip_summed = 0;
        skb->pkt_type = PACKET_HOST;
        dst_release(skb->dst);
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
        ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
        skb->dst = NULL;
        nf_reset(skb);
        netif_rx(skb);
        dev_put(reg_dev);
        return 0;
 drop:
        kfree_skb(skb);
        return 0;
}
#endif

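/* Dump one resolved (S,G) entry into an rtnetlink message: the parent
   VIF's device as RTA_IIF, and the outputs as an RTA_MULTIPATH nexthop
   list with the TTL thresholds carried in rtnh_hops. */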
1542 static int
1543 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1544 {
1545         int ct;
1546         struct rtnexthop *nhp;
1547         struct net_device *dev = vif_table[c->mfc_parent].dev;
1548         u8 *b = skb_tail_pointer(skb);
1549         struct rtattr *mp_head;
1550
1551         if (dev)
1552                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1553
1554         mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1555
1556         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1557                 if (c->mfc_un.res.ttls[ct] < 255) {
1558                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1559                                 goto rtattr_failure;
1560                         nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1561                         nhp->rtnh_flags = 0;
1562                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1563                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1564                         nhp->rtnh_len = sizeof(*nhp);
1565                 }
1566         }
1567         mp_head->rta_type = RTA_MULTIPATH;
1568         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1569         rtm->rtm_type = RTN_MULTICAST;
1570         return 1;
1571
1572 rtattr_failure:
1573         skb_trim(skb, b - skb->data);
1574         return -EMSGSIZE;
1575 }
1576
int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
{
        int err;
        struct mfc_cache *cache;
        struct rtable *rt = (struct rtable*)skb->dst;

        read_lock(&mrt_lock);
        cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);

        if (cache == NULL) {
                struct sk_buff *skb2;
                struct iphdr *iph;
                struct net_device *dev;
                int vif;

                if (nowait) {
                        read_unlock(&mrt_lock);
                        return -EAGAIN;
                }

                dev = skb->dev;
                if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
                        read_unlock(&mrt_lock);
                        return -ENODEV;
                }
                skb2 = skb_clone(skb, GFP_ATOMIC);
                if (!skb2) {
                        read_unlock(&mrt_lock);
                        return -ENOMEM;
                }

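                /*
                 * Build a skeletal IP header on the clone.  version 0
                 * never appears on the wire and marks this skb as a
                 * pending rtnetlink request, so the cache resolver
                 * completes it via rtnetlink instead of forwarding it.
                 */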
                skb_push(skb2, sizeof(struct iphdr));
                skb_reset_network_header(skb2);
                iph = ip_hdr(skb2);
                iph->ihl = sizeof(struct iphdr) >> 2;
                iph->saddr = rt->rt_src;
                iph->daddr = rt->rt_dst;
                iph->version = 0;
                err = ipmr_cache_unresolved(vif, skb2);
                read_unlock(&mrt_lock);
                return err;
        }

        if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
                cache->mfc_flags |= MFC_NOTIFY;
        err = ipmr_fill_mroute(skb, cache, rtm);
        read_unlock(&mrt_lock);
        return err;
}

#ifdef CONFIG_PROC_FS
/*
 *      The /proc interfaces to multicast routing:
 *      /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
 */
struct ipmr_vif_iter {
        int ct;
};

static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
                                           loff_t pos)
{
        for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
                if (!VIF_EXISTS(iter->ct))
                        continue;
                if (pos-- == 0)
                        return &vif_table[iter->ct];
        }
        return NULL;
}

static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
{
        read_lock(&mrt_lock);
        return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
                : SEQ_START_TOKEN;
}

static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct ipmr_vif_iter *iter = seq->private;

        ++*pos;
        if (v == SEQ_START_TOKEN)
                return ipmr_vif_seq_idx(iter, 0);

        while (++iter->ct < maxvif) {
                if (!VIF_EXISTS(iter->ct))
                        continue;
                return &vif_table[iter->ct];
        }
        return NULL;
}

static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
{
        read_unlock(&mrt_lock);
}

static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
        } else {
                const struct vif_device *vif = v;
                const char *name = vif->dev ? vif->dev->name : "none";

                seq_printf(seq,
                           "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
                           vif - vif_table,
                           name, vif->bytes_in, vif->pkt_in,
                           vif->bytes_out, vif->pkt_out,
                           vif->flags, vif->local, vif->remote);
        }
        return 0;
}
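
/*
 * Illustrative /proc/net/ip_mr_vif output (the values are made up):
 *
 *  Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *   0 pimreg            0       0         0       0 00004 00000000 00000000
 *   1 eth0         123456     789    654321     987 00000 C0A80101 00000000
 */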

static const struct seq_operations ipmr_vif_seq_ops = {
        .start = ipmr_vif_seq_start,
        .next  = ipmr_vif_seq_next,
        .stop  = ipmr_vif_seq_stop,
        .show  = ipmr_vif_seq_show,
};

static int ipmr_vif_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;

        rc = seq_open(file, &ipmr_vif_seq_ops);
        if (rc)
                goto out_kfree;

        s->ct = 0;
        seq = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static const struct file_operations ipmr_vif_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_vif_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};

struct ipmr_mfc_iter {
        struct mfc_cache **cache;
        int ct;
};
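
/*
 * it->cache records which table the iterator is currently walking:
 * mfc_cache_array (resolved entries, protected by mrt_lock) or
 * &mfc_unres_queue (unresolved entries, protected by mfc_unres_lock).
 * ipmr_mfc_seq_stop() uses it to drop whichever lock is still held.
 */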

static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
{
        struct mfc_cache *mfc;

        it->cache = mfc_cache_array;
        read_lock(&mrt_lock);
        for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
                for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
                        if (pos-- == 0)
                                return mfc;
        read_unlock(&mrt_lock);

        it->cache = &mfc_unres_queue;
        spin_lock_bh(&mfc_unres_lock);
        for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
                if (pos-- == 0)
                        return mfc;
        spin_unlock_bh(&mfc_unres_lock);

        it->cache = NULL;
        return NULL;
}

static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
{
        struct ipmr_mfc_iter *it = seq->private;
        it->cache = NULL;
        it->ct = 0;
        return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
                : SEQ_START_TOKEN;
}

static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct mfc_cache *mfc = v;
        struct ipmr_mfc_iter *it = seq->private;

        ++*pos;

        if (v == SEQ_START_TOKEN)
                return ipmr_mfc_seq_idx(seq->private, 0);

        if (mfc->next)
                return mfc->next;

        if (it->cache == &mfc_unres_queue)
                goto end_of_list;

        BUG_ON(it->cache != mfc_cache_array);

        while (++it->ct < MFC_LINES) {
                mfc = mfc_cache_array[it->ct];
                if (mfc)
                        return mfc;
        }

        /* exhausted cache_array, show unresolved */
        read_unlock(&mrt_lock);
        it->cache = &mfc_unres_queue;
        it->ct = 0;

        spin_lock_bh(&mfc_unres_lock);
        mfc = mfc_unres_queue;
        if (mfc)
                return mfc;

 end_of_list:
        spin_unlock_bh(&mfc_unres_lock);
        it->cache = NULL;

        return NULL;
}

static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
{
        struct ipmr_mfc_iter *it = seq->private;

        if (it->cache == &mfc_unres_queue)
                spin_unlock_bh(&mfc_unres_lock);
        else if (it->cache == mfc_cache_array)
                read_unlock(&mrt_lock);
}

static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
{
        int n;

        if (v == SEQ_START_TOKEN) {
                seq_puts(seq,
                         "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
        } else {
                const struct mfc_cache *mfc = v;
                const struct ipmr_mfc_iter *it = seq->private;

                seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
                           (unsigned long) mfc->mfc_mcastgrp,
                           (unsigned long) mfc->mfc_origin,
                           mfc->mfc_parent,
                           mfc->mfc_un.res.pkt,
                           mfc->mfc_un.res.bytes,
                           mfc->mfc_un.res.wrong_if);

                if (it->cache != &mfc_unres_queue) {
                        for (n = mfc->mfc_un.res.minvif;
                             n < mfc->mfc_un.res.maxvif; n++) {
                                if (VIF_EXISTS(n) &&
                                    mfc->mfc_un.res.ttls[n] < 255)
                                        seq_printf(seq,
                                                   " %2d:%-3d",
                                                   n, mfc->mfc_un.res.ttls[n]);
                        }
                }
                seq_putc(seq, '\n');
        }
        return 0;
}
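
/*
 * Illustrative /proc/net/ip_mr_cache output (the values are made up);
 * each "Oifs" column entry is vif-index:TTL-threshold:
 *
 * Group    Origin   Iif     Pkts    Bytes    Wrong Oifs
 * E2010101 C0A80101 0           42     6300        0  1:1    2:1
 */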

static const struct seq_operations ipmr_mfc_seq_ops = {
        .start = ipmr_mfc_seq_start,
        .next  = ipmr_mfc_seq_next,
        .stop  = ipmr_mfc_seq_stop,
        .show  = ipmr_mfc_seq_show,
};

static int ipmr_mfc_open(struct inode *inode, struct file *file)
{
        struct seq_file *seq;
        int rc = -ENOMEM;
        struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);

        if (!s)
                goto out;

        rc = seq_open(file, &ipmr_mfc_seq_ops);
        if (rc)
                goto out_kfree;

        seq = file->private_data;
        seq->private = s;
out:
        return rc;
out_kfree:
        kfree(s);
        goto out;
}

static const struct file_operations ipmr_mfc_fops = {
        .owner   = THIS_MODULE,
        .open    = ipmr_mfc_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release_private,
};
#endif

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol = {
        .handler        =       pim_rcv,
};
#endif
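
/*
 * pim_protocol is hooked into the inet protocol table for IPPROTO_PIM
 * when PIM support is switched on at run time, so that PIMv2 Register
 * messages are decapsulated by pim_rcv() above.
 */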

/*
 *      Setup for IP multicast routing
 */

void __init ip_mr_init(void)
{
        mrt_cachep = kmem_cache_create("ip_mrt_cache",
                                       sizeof(struct mfc_cache),
                                       0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
                                       NULL, NULL);
        init_timer(&ipmr_expire_timer);
        ipmr_expire_timer.function = ipmr_expire_process;
        register_netdevice_notifier(&ip_mr_notifier);
#ifdef CONFIG_PROC_FS
        proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
        proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
#endif
}