[SK_BUFF]: Some more layer header conversions
net/ipv4/ipmr.c
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@redhat.com>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Version: $Id: ipmr.c,v 1.65 2001/10/31 21:55:54 davem Exp $
13  *
14  *      Fixes:
15  *      Michael Chastain        :       Incorrect size of copying.
16  *      Alan Cox                :       Added the cache manager code
17  *      Alan Cox                :       Fixed the clone/copy bug and device race.
18  *      Mike McLagan            :       Routing by source
19  *      Malcolm Beattie         :       Buffer handling fixes.
20  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
21  *      SVR Anand               :       Fixed several multicast bugs and problems.
22  *      Alexey Kuznetsov        :       Status, optimisations and more.
23  *      Brad Parker             :       Better behaviour on mrouted upcall
24  *                                      overflow.
25  *      Carlos Picoto           :       PIMv1 Support
26  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
27  *                                      Relax this requirement to work with older peers.
28  *
29  */
30
31 #include <asm/system.h>
32 #include <asm/uaccess.h>
33 #include <linux/types.h>
34 #include <linux/capability.h>
35 #include <linux/errno.h>
36 #include <linux/timer.h>
37 #include <linux/mm.h>
38 #include <linux/kernel.h>
39 #include <linux/fcntl.h>
40 #include <linux/stat.h>
41 #include <linux/socket.h>
42 #include <linux/in.h>
43 #include <linux/inet.h>
44 #include <linux/netdevice.h>
45 #include <linux/inetdevice.h>
46 #include <linux/igmp.h>
47 #include <linux/proc_fs.h>
48 #include <linux/seq_file.h>
49 #include <linux/mroute.h>
50 #include <linux/init.h>
51 #include <linux/if_ether.h>
52 #include <net/ip.h>
53 #include <net/protocol.h>
54 #include <linux/skbuff.h>
55 #include <net/route.h>
56 #include <net/sock.h>
57 #include <net/icmp.h>
58 #include <net/udp.h>
59 #include <net/raw.h>
60 #include <linux/notifier.h>
61 #include <linux/if_arp.h>
62 #include <linux/netfilter_ipv4.h>
63 #include <net/ipip.h>
64 #include <net/checksum.h>
65
66 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67 #define CONFIG_IP_PIMSM 1
68 #endif
69
70 static struct sock *mroute_socket;
71
72
73 /* Big lock, protecting the vif table, the mrt cache and the mroute
74    socket state.  Note that all changes are serialized via rtnl_lock.
75  */
76
77 static DEFINE_RWLOCK(mrt_lock);
78
79 /*
80  *      Multicast router control variables
81  */
82
83 static struct vif_device vif_table[MAXVIFS];            /* Devices              */
84 static int maxvif;
85
86 #define VIF_EXISTS(idx) (vif_table[idx].dev != NULL)
87
88 static int mroute_do_assert;                            /* Set in PIM assert    */
89 static int mroute_do_pim;
90
91 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
92
93 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
94 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
95
96 /* Special spinlock for queue of unresolved entries */
97 static DEFINE_SPINLOCK(mfc_unres_lock);
98
99 /* We return to Alan's original scheme.  The hash table of resolved
100    entries is changed only in process context and is protected by the
101    weak reader/writer lock mrt_lock.  The queue of unresolved entries
102    is protected by the strong spinlock mfc_unres_lock.
103
104    This way the data path is entirely free of exclusive locks.
105  */
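/* A minimal illustration (not part of the original file) of the discipline
 * just described: fast-path lookups take only the read side of mrt_lock,
 * configuration takes the write side under rtnl_lock, and the unresolved
 * queue always uses its own spinlock:
 *
 *	read_lock(&mrt_lock);                   // data path, e.g. ip_mr_input()
 *	c = ipmr_cache_find(saddr, daddr);
 *	... use c ...
 *	read_unlock(&mrt_lock);
 *
 *	spin_lock_bh(&mfc_unres_lock);          // unresolved queue, BH-safe
 *	... walk mfc_unres_queue ...
 *	spin_unlock_bh(&mfc_unres_lock);
 */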
106
107 static struct kmem_cache *mrt_cachep __read_mostly;
108
109 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
110 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
111 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
112
113 #ifdef CONFIG_IP_PIMSM_V2
114 static struct net_protocol pim_protocol;
115 #endif
116
117 static struct timer_list ipmr_expire_timer;
118
119 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
120
121 static
122 struct net_device *ipmr_new_tunnel(struct vifctl *v)
123 {
124         struct net_device  *dev;
125
126         dev = __dev_get_by_name("tunl0");
127
128         if (dev) {
129                 int err;
130                 struct ifreq ifr;
131                 mm_segment_t    oldfs;
132                 struct ip_tunnel_parm p;
133                 struct in_device  *in_dev;
134
135                 memset(&p, 0, sizeof(p));
136                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
137                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
138                 p.iph.version = 4;
139                 p.iph.ihl = 5;
140                 p.iph.protocol = IPPROTO_IPIP;
141                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
142                 ifr.ifr_ifru.ifru_data = (void*)&p;
143
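                /* dev->do_ioctl() expects a user-space pointer in ifr;
                   temporarily lift the address-space limit so that the
                   kernel pointer &p passes the access checks below. */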
144                 oldfs = get_fs(); set_fs(KERNEL_DS);
145                 err = dev->do_ioctl(dev, &ifr, SIOCADDTUNNEL);
146                 set_fs(oldfs);
147
148                 dev = NULL;
149
150                 if (err == 0 && (dev = __dev_get_by_name(p.name)) != NULL) {
151                         dev->flags |= IFF_MULTICAST;
152
153                         in_dev = __in_dev_get_rtnl(dev);
154                         if (in_dev == NULL && (in_dev = inetdev_init(dev)) == NULL)
155                                 goto failure;
156                         in_dev->cnf.rp_filter = 0;
157
158                         if (dev_open(dev))
159                                 goto failure;
160                 }
161         }
162         return dev;
163
164 failure:
165         /* allow the register to be completed before unregistering. */
166         rtnl_unlock();
167         rtnl_lock();
168
169         unregister_netdevice(dev);
170         return NULL;
171 }
172
173 #ifdef CONFIG_IP_PIMSM
174
175 static int reg_vif_num = -1;
176
177 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
178 {
179         read_lock(&mrt_lock);
180         ((struct net_device_stats*)netdev_priv(dev))->tx_bytes += skb->len;
181         ((struct net_device_stats*)netdev_priv(dev))->tx_packets++;
182         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
183         read_unlock(&mrt_lock);
184         kfree_skb(skb);
185         return 0;
186 }
187
188 static struct net_device_stats *reg_vif_get_stats(struct net_device *dev)
189 {
190         return (struct net_device_stats*)netdev_priv(dev);
191 }
192
193 static void reg_vif_setup(struct net_device *dev)
194 {
195         dev->type               = ARPHRD_PIMREG;
196         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
197         dev->flags              = IFF_NOARP;
198         dev->hard_start_xmit    = reg_vif_xmit;
199         dev->get_stats          = reg_vif_get_stats;
200         dev->destructor         = free_netdev;
201 }
202
203 static struct net_device *ipmr_reg_vif(void)
204 {
205         struct net_device *dev;
206         struct in_device *in_dev;
207
208         dev = alloc_netdev(sizeof(struct net_device_stats), "pimreg",
209                            reg_vif_setup);
210
211         if (dev == NULL)
212                 return NULL;
213
214         if (register_netdevice(dev)) {
215                 free_netdev(dev);
216                 return NULL;
217         }
218         dev->iflink = 0;
219
220         if ((in_dev = inetdev_init(dev)) == NULL)
221                 goto failure;
222
223         in_dev->cnf.rp_filter = 0;
224
225         if (dev_open(dev))
226                 goto failure;
227
228         return dev;
229
230 failure:
231         /* allow the register to be completed before unregistering. */
232         rtnl_unlock();
233         rtnl_lock();
234
235         unregister_netdevice(dev);
236         return NULL;
237 }
238 #endif
239
240 /*
241  *      Delete a VIF entry
242  */
243
244 static int vif_delete(int vifi)
245 {
246         struct vif_device *v;
247         struct net_device *dev;
248         struct in_device *in_dev;
249
250         if (vifi < 0 || vifi >= maxvif)
251                 return -EADDRNOTAVAIL;
252
253         v = &vif_table[vifi];
254
255         write_lock_bh(&mrt_lock);
256         dev = v->dev;
257         v->dev = NULL;
258
259         if (!dev) {
260                 write_unlock_bh(&mrt_lock);
261                 return -EADDRNOTAVAIL;
262         }
263
264 #ifdef CONFIG_IP_PIMSM
265         if (vifi == reg_vif_num)
266                 reg_vif_num = -1;
267 #endif
268
269         if (vifi+1 == maxvif) {
270                 int tmp;
271                 for (tmp=vifi-1; tmp>=0; tmp--) {
272                         if (VIF_EXISTS(tmp))
273                                 break;
274                 }
275                 maxvif = tmp+1;
276         }
277
278         write_unlock_bh(&mrt_lock);
279
280         dev_set_allmulti(dev, -1);
281
282         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
283                 in_dev->cnf.mc_forwarding--;
284                 ip_rt_multicast_event(in_dev);
285         }
286
287         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
288                 unregister_netdevice(dev);
289
290         dev_put(dev);
291         return 0;
292 }
293
294 /* Destroy an unresolved cache entry, killing any queued skbs
295    and reporting an error to netlink readers.
296  */
297
298 static void ipmr_destroy_unres(struct mfc_cache *c)
299 {
300         struct sk_buff *skb;
301         struct nlmsgerr *e;
302
303         atomic_dec(&cache_resolve_queue_len);
304
305         while ((skb=skb_dequeue(&c->mfc_un.unres.unresolved))) {
306                 if (ip_hdr(skb)->version == 0) {
307                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
308                         nlh->nlmsg_type = NLMSG_ERROR;
309                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
310                         skb_trim(skb, nlh->nlmsg_len);
311                         e = NLMSG_DATA(nlh);
312                         e->error = -ETIMEDOUT;
313                         memset(&e->msg, 0, sizeof(e->msg));
314
315                         rtnl_unicast(skb, NETLINK_CB(skb).pid);
316                 } else
317                         kfree_skb(skb);
318         }
319
320         kmem_cache_free(mrt_cachep, c);
321 }
322
323
324 /* A single timer handles expiry for the whole unresolved queue. */
325
326 static void ipmr_expire_process(unsigned long dummy)
327 {
328         unsigned long now;
329         unsigned long expires;
330         struct mfc_cache *c, **cp;
331
332         if (!spin_trylock(&mfc_unres_lock)) {
333                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
334                 return;
335         }
336
337         if (atomic_read(&cache_resolve_queue_len) == 0)
338                 goto out;
339
340         now = jiffies;
341         expires = 10*HZ;
342         cp = &mfc_unres_queue;
343
344         while ((c=*cp) != NULL) {
345                 if (time_after(c->mfc_un.unres.expires, now)) {
346                         unsigned long interval = c->mfc_un.unres.expires - now;
347                         if (interval < expires)
348                                 expires = interval;
349                         cp = &c->next;
350                         continue;
351                 }
352
353                 *cp = c->next;
354
355                 ipmr_destroy_unres(c);
356         }
357
358         if (atomic_read(&cache_resolve_queue_len))
359                 mod_timer(&ipmr_expire_timer, jiffies + expires);
360
361 out:
362         spin_unlock(&mfc_unres_lock);
363 }
364
365 /* Fill the oif list.  Called with mrt_lock held for writing. */
366
367 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
368 {
369         int vifi;
370
371         cache->mfc_un.res.minvif = MAXVIFS;
372         cache->mfc_un.res.maxvif = 0;
373         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
374
375         for (vifi=0; vifi<maxvif; vifi++) {
376                 if (VIF_EXISTS(vifi) && ttls[vifi] && ttls[vifi] < 255) {
377                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
378                         if (cache->mfc_un.res.minvif > vifi)
379                                 cache->mfc_un.res.minvif = vifi;
380                         if (cache->mfc_un.res.maxvif <= vifi)
381                                 cache->mfc_un.res.maxvif = vifi + 1;
382                 }
383         }
384 }
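/* A worked example of the threshold logic above (assuming vifs 1 and 3
 * exist): given ttls = { 0, 2, 255, 3, ... }, slots 0 (ttl 0) and 2
 * (ttl 255) mean "do not forward", so only vifs 1 and 3 get copies and
 * the result is minvif = 1, maxvif = 4 (one past the last forwarding
 * vif), res.ttls[1] = 2, res.ttls[3] = 3.  A packet is later forwarded
 * on vif ct only if its IP ttl is strictly greater than res.ttls[ct].
 */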
385
386 static int vif_add(struct vifctl *vifc, int mrtsock)
387 {
388         int vifi = vifc->vifc_vifi;
389         struct vif_device *v = &vif_table[vifi];
390         struct net_device *dev;
391         struct in_device *in_dev;
392
393         /* Is vif busy ? */
394         if (VIF_EXISTS(vifi))
395                 return -EADDRINUSE;
396
397         switch (vifc->vifc_flags) {
398 #ifdef CONFIG_IP_PIMSM
399         case VIFF_REGISTER:
400                 /*
401                  * Special Purpose VIF in PIM
402                  * All the packets will be sent to the daemon
403                  */
404                 if (reg_vif_num >= 0)
405                         return -EADDRINUSE;
406                 dev = ipmr_reg_vif();
407                 if (!dev)
408                         return -ENOBUFS;
409                 break;
410 #endif
411         case VIFF_TUNNEL:
412                 dev = ipmr_new_tunnel(vifc);
413                 if (!dev)
414                         return -ENOBUFS;
415                 break;
416         case 0:
417                 dev = ip_dev_find(vifc->vifc_lcl_addr.s_addr);
418                 if (!dev)
419                         return -EADDRNOTAVAIL;
420                 dev_put(dev);
421                 break;
422         default:
423                 return -EINVAL;
424         }
425
426         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
427                 return -EADDRNOTAVAIL;
428         in_dev->cnf.mc_forwarding++;
429         dev_set_allmulti(dev, +1);
430         ip_rt_multicast_event(in_dev);
431
432         /*
433          *      Fill in the VIF structures
434          */
435         v->rate_limit=vifc->vifc_rate_limit;
436         v->local=vifc->vifc_lcl_addr.s_addr;
437         v->remote=vifc->vifc_rmt_addr.s_addr;
438         v->flags=vifc->vifc_flags;
439         if (!mrtsock)
440                 v->flags |= VIFF_STATIC;
441         v->threshold=vifc->vifc_threshold;
442         v->bytes_in = 0;
443         v->bytes_out = 0;
444         v->pkt_in = 0;
445         v->pkt_out = 0;
446         v->link = dev->ifindex;
447         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
448                 v->link = dev->iflink;
449
450         /* And finish update writing critical data */
451         write_lock_bh(&mrt_lock);
452         dev_hold(dev);
453         v->dev=dev;
454 #ifdef CONFIG_IP_PIMSM
455         if (v->flags&VIFF_REGISTER)
456                 reg_vif_num = vifi;
457 #endif
458         if (vifi+1 > maxvif)
459                 maxvif = vifi+1;
460         write_unlock_bh(&mrt_lock);
461         return 0;
462 }
463
464 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
465 {
466         int line=MFC_HASH(mcastgrp,origin);
467         struct mfc_cache *c;
468
469         for (c=mfc_cache_array[line]; c; c = c->next) {
470                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
471                         break;
472         }
473         return c;
474 }
475
476 /*
477  *      Allocate a multicast cache entry
478  */
479 static struct mfc_cache *ipmr_cache_alloc(void)
480 {
481         struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
482         if (c==NULL)
483                 return NULL;
484         c->mfc_un.res.minvif = MAXVIFS;
485         return c;
486 }
487
488 static struct mfc_cache *ipmr_cache_alloc_unres(void)
489 {
490         struct mfc_cache *c=kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
491         if (c==NULL)
492                 return NULL;
493         skb_queue_head_init(&c->mfc_un.unres.unresolved);
494         c->mfc_un.unres.expires = jiffies + 10*HZ;
495         return c;
496 }
497
498 /*
499  *      A cache entry has gone from the unresolved queue into a resolved state
500  */
501
502 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
503 {
504         struct sk_buff *skb;
505         struct nlmsgerr *e;
506
507         /*
508          *      Play the pending entries through our router
509          */
510
511         while ((skb=__skb_dequeue(&uc->mfc_un.unres.unresolved))) {
512                 if (ip_hdr(skb)->version == 0) {
513                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
514
515                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
516                                 nlh->nlmsg_len = skb->tail - (u8*)nlh;
517                         } else {
518                                 nlh->nlmsg_type = NLMSG_ERROR;
519                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
520                                 skb_trim(skb, nlh->nlmsg_len);
521                                 e = NLMSG_DATA(nlh);
522                                 e->error = -EMSGSIZE;
523                                 memset(&e->msg, 0, sizeof(e->msg));
524                         }
525
526                         rtnl_unicast(skb, NETLINK_CB(skb).pid);
527                 } else
528                         ip_mr_forward(skb, c, 0);
529         }
530 }
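/* A note on the ip_hdr(skb)->version == 0 test above: the unresolved
 * queue holds both real data packets and pending netlink replies queued
 * by ipmr_get_route(), which marks the latter by building a pseudo IP
 * header with iph->version = 0 (see ipmr_get_route() below).  Genuine
 * IPv4 packets always carry version 4, so the two are easy to tell apart.
 */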
531
532 /*
533  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
534  *      expects the following bizarre scheme.
535  *
536  *      Called under mrt_lock.
537  */
538
539 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
540 {
541         struct sk_buff *skb;
542         const int ihl = ip_hdrlen(pkt);
543         struct igmphdr *igmp;
544         struct igmpmsg *msg;
545         int ret;
546
547 #ifdef CONFIG_IP_PIMSM
548         if (assert == IGMPMSG_WHOLEPKT)
549                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
550         else
551 #endif
552                 skb = alloc_skb(128, GFP_ATOMIC);
553
554         if (!skb)
555                 return -ENOBUFS;
556
557 #ifdef CONFIG_IP_PIMSM
558         if (assert == IGMPMSG_WHOLEPKT) {
559                 /* Ugly, but we have no choice with this interface.
560                    Duplicate old header, fix ihl, length etc.
561                    And all this only to mangle msg->im_msgtype and
562                    to set msg->im_mbz to "mbz" :-)
563                  */
564                 skb_push(skb, sizeof(struct iphdr));
565                 skb_reset_network_header(skb);
566                 skb_reset_transport_header(skb);
567                 msg = (struct igmpmsg *)skb_network_header(skb);
568                 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
569                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
570                 msg->im_mbz = 0;
571                 msg->im_vif = reg_vif_num;
572                 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
573                 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
574                                              sizeof(struct iphdr));
575         } else
576 #endif
577         {
578
579         /*
580          *      Copy the IP header
581          */
582
583         skb_set_network_header(skb, skb->tail - skb->data);
584         skb_put(skb, ihl);
585         memcpy(skb->data,pkt->data,ihl);
586         ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
587         msg = (struct igmpmsg *)skb_network_header(skb);
588         msg->im_vif = vifi;
589         skb->dst = dst_clone(pkt->dst);
590
591         /*
592          *      Add our header
593          */
594
595         igmp=(struct igmphdr *)skb_put(skb,sizeof(struct igmphdr));
596         igmp->type      =
597         msg->im_msgtype = assert;
598         igmp->code      =       0;
599         ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
600         skb->h.raw = skb->nh.raw;
601         }
602
603         if (mroute_socket == NULL) {
604                 kfree_skb(skb);
605                 return -EINVAL;
606         }
607
608         /*
609          *      Deliver to mrouted
610          */
611         if ((ret=sock_queue_rcv_skb(mroute_socket,skb))<0) {
612                 if (net_ratelimit())
613                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
614                 kfree_skb(skb);
615         }
616
617         return ret;
618 }
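/* A minimal sketch of the receiving side of this upcall, as a typical
 * user-space daemon might implement it (an assumption for illustration,
 * not code from mrouted itself): the daemon reads its IGMP raw socket and
 * treats datagrams whose IP protocol field is zero (im_mbz overlays it)
 * as struct igmpmsg upcalls:
 *
 *	char buf[2048];
 *	ssize_t n = recv(igmp_sock, buf, sizeof(buf), 0);
 *	struct igmpmsg *m = (struct igmpmsg *)buf;
 *
 *	if (n >= (ssize_t)sizeof(*m) && m->im_mbz == 0) {
 *		switch (m->im_msgtype) {
 *		case IGMPMSG_NOCACHE:	// resolve (im_src, im_dst) and
 *					// install a route via MRT_ADD_MFC
 *		case IGMPMSG_WRONGVIF:	// PIM assert processing
 *		case IGMPMSG_WHOLEPKT:	// PIM register encapsulation
 *			break;
 *		}
 *	}
 */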
619
620 /*
621  *      Queue a packet for resolution; the packet is attached to a (possibly new) cache entry under mfc_unres_lock.
622  */
623
624 static int
625 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
626 {
627         int err;
628         struct mfc_cache *c;
629         const struct iphdr *iph = ip_hdr(skb);
630
631         spin_lock_bh(&mfc_unres_lock);
632         for (c=mfc_unres_queue; c; c=c->next) {
633                 if (c->mfc_mcastgrp == iph->daddr &&
634                     c->mfc_origin == iph->saddr)
635                         break;
636         }
637
638         if (c == NULL) {
639                 /*
640                  *      Create a new entry if allowable
641                  */
642
643                 if (atomic_read(&cache_resolve_queue_len)>=10 ||
644                     (c=ipmr_cache_alloc_unres())==NULL) {
645                         spin_unlock_bh(&mfc_unres_lock);
646
647                         kfree_skb(skb);
648                         return -ENOBUFS;
649                 }
650
651                 /*
652                  *      Fill in the new cache entry
653                  */
654                 c->mfc_parent   = -1;
655                 c->mfc_origin   = iph->saddr;
656                 c->mfc_mcastgrp = iph->daddr;
657
658                 /*
659                  *      Reflect first query at mrouted.
660                  */
661                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
662                         /* If the report failed throw the cache entry
663                            out - Brad Parker
664                          */
665                         spin_unlock_bh(&mfc_unres_lock);
666
667                         kmem_cache_free(mrt_cachep, c);
668                         kfree_skb(skb);
669                         return err;
670                 }
671
672                 atomic_inc(&cache_resolve_queue_len);
673                 c->next = mfc_unres_queue;
674                 mfc_unres_queue = c;
675
676                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
677         }
678
679         /*
680          *      See if we can append the packet
681          */
682         if (c->mfc_un.unres.unresolved.qlen>3) {
683                 kfree_skb(skb);
684                 err = -ENOBUFS;
685         } else {
686                 skb_queue_tail(&c->mfc_un.unres.unresolved,skb);
687                 err = 0;
688         }
689
690         spin_unlock_bh(&mfc_unres_lock);
691         return err;
692 }
693
694 /*
695  *      MFC cache manipulation by user space mroute daemon
696  */
697
698 static int ipmr_mfc_delete(struct mfcctl *mfc)
699 {
700         int line;
701         struct mfc_cache *c, **cp;
702
703         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
704
705         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
706                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
707                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
708                         write_lock_bh(&mrt_lock);
709                         *cp = c->next;
710                         write_unlock_bh(&mrt_lock);
711
712                         kmem_cache_free(mrt_cachep, c);
713                         return 0;
714                 }
715         }
716         return -ENOENT;
717 }
718
719 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
720 {
721         int line;
722         struct mfc_cache *uc, *c, **cp;
723
724         line=MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
725
726         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
727                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
728                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
729                         break;
730         }
731
732         if (c != NULL) {
733                 write_lock_bh(&mrt_lock);
734                 c->mfc_parent = mfc->mfcc_parent;
735                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
736                 if (!mrtsock)
737                         c->mfc_flags |= MFC_STATIC;
738                 write_unlock_bh(&mrt_lock);
739                 return 0;
740         }
741
742         if (!MULTICAST(mfc->mfcc_mcastgrp.s_addr))
743                 return -EINVAL;
744
745         c=ipmr_cache_alloc();
746         if (c==NULL)
747                 return -ENOMEM;
748
749         c->mfc_origin=mfc->mfcc_origin.s_addr;
750         c->mfc_mcastgrp=mfc->mfcc_mcastgrp.s_addr;
751         c->mfc_parent=mfc->mfcc_parent;
752         ipmr_update_thresholds(c, mfc->mfcc_ttls);
753         if (!mrtsock)
754                 c->mfc_flags |= MFC_STATIC;
755
756         write_lock_bh(&mrt_lock);
757         c->next = mfc_cache_array[line];
758         mfc_cache_array[line] = c;
759         write_unlock_bh(&mrt_lock);
760
761         /*
762          *      Check whether this entry resolves a queued unresolved entry.
763          *      If so, we need to send the queued frames on and tidy up.
764          */
765         spin_lock_bh(&mfc_unres_lock);
766         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
767              cp = &uc->next) {
768                 if (uc->mfc_origin == c->mfc_origin &&
769                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
770                         *cp = uc->next;
771                         if (atomic_dec_and_test(&cache_resolve_queue_len))
772                                 del_timer(&ipmr_expire_timer);
773                         break;
774                 }
775         }
776         spin_unlock_bh(&mfc_unres_lock);
777
778         if (uc) {
779                 ipmr_cache_resolve(uc, c);
780                 kmem_cache_free(mrt_cachep, uc);
781         }
782         return 0;
783 }
784
785 /*
786  *      Close the multicast socket, and clear the vif tables etc
787  */
788
789 static void mroute_clean_tables(struct sock *sk)
790 {
791         int i;
792
793         /*
794          *      Shut down all active vif entries
795          */
796         for (i=0; i<maxvif; i++) {
797                 if (!(vif_table[i].flags&VIFF_STATIC))
798                         vif_delete(i);
799         }
800
801         /*
802          *      Wipe the cache
803          */
804         for (i=0;i<MFC_LINES;i++) {
805                 struct mfc_cache *c, **cp;
806
807                 cp = &mfc_cache_array[i];
808                 while ((c = *cp) != NULL) {
809                         if (c->mfc_flags&MFC_STATIC) {
810                                 cp = &c->next;
811                                 continue;
812                         }
813                         write_lock_bh(&mrt_lock);
814                         *cp = c->next;
815                         write_unlock_bh(&mrt_lock);
816
817                         kmem_cache_free(mrt_cachep, c);
818                 }
819         }
820
821         if (atomic_read(&cache_resolve_queue_len) != 0) {
822                 struct mfc_cache *c;
823
824                 spin_lock_bh(&mfc_unres_lock);
825                 while (mfc_unres_queue != NULL) {
826                         c = mfc_unres_queue;
827                         mfc_unres_queue = c->next;
828                         spin_unlock_bh(&mfc_unres_lock);
829
830                         ipmr_destroy_unres(c);
831
832                         spin_lock_bh(&mfc_unres_lock);
833                 }
834                 spin_unlock_bh(&mfc_unres_lock);
835         }
836 }
837
838 static void mrtsock_destruct(struct sock *sk)
839 {
840         rtnl_lock();
841         if (sk == mroute_socket) {
842                 ipv4_devconf.mc_forwarding--;
843
844                 write_lock_bh(&mrt_lock);
845                 mroute_socket=NULL;
846                 write_unlock_bh(&mrt_lock);
847
848                 mroute_clean_tables(sk);
849         }
850         rtnl_unlock();
851 }
852
853 /*
854  *      Socket options and virtual interface manipulation. The whole
855  *      virtual interface system is a complete heap, but unfortunately
856  *      that's how BSD mrouted happens to think. Maybe one day with a proper
857  *      MOSPF/PIM router set up we can clean this up.
858  */
859
860 int ip_mroute_setsockopt(struct sock *sk,int optname,char __user *optval,int optlen)
861 {
862         int ret;
863         struct vifctl vif;
864         struct mfcctl mfc;
865
866         if (optname != MRT_INIT) {
867                 if (sk != mroute_socket && !capable(CAP_NET_ADMIN))
868                         return -EACCES;
869         }
870
871         switch (optname) {
872         case MRT_INIT:
873                 if (sk->sk_type != SOCK_RAW ||
874                     inet_sk(sk)->num != IPPROTO_IGMP)
875                         return -EOPNOTSUPP;
876                 if (optlen!=sizeof(int))
877                         return -ENOPROTOOPT;
878
879                 rtnl_lock();
880                 if (mroute_socket) {
881                         rtnl_unlock();
882                         return -EADDRINUSE;
883                 }
884
885                 ret = ip_ra_control(sk, 1, mrtsock_destruct);
886                 if (ret == 0) {
887                         write_lock_bh(&mrt_lock);
888                         mroute_socket=sk;
889                         write_unlock_bh(&mrt_lock);
890
891                         ipv4_devconf.mc_forwarding++;
892                 }
893                 rtnl_unlock();
894                 return ret;
895         case MRT_DONE:
896                 if (sk!=mroute_socket)
897                         return -EACCES;
898                 return ip_ra_control(sk, 0, NULL);
899         case MRT_ADD_VIF:
900         case MRT_DEL_VIF:
901                 if (optlen!=sizeof(vif))
902                         return -EINVAL;
903                 if (copy_from_user(&vif,optval,sizeof(vif)))
904                         return -EFAULT;
905                 if (vif.vifc_vifi >= MAXVIFS)
906                         return -ENFILE;
907                 rtnl_lock();
908                 if (optname==MRT_ADD_VIF) {
909                         ret = vif_add(&vif, sk==mroute_socket);
910                 } else {
911                         ret = vif_delete(vif.vifc_vifi);
912                 }
913                 rtnl_unlock();
914                 return ret;
915
916                 /*
917                  *      Manipulate the forwarding caches. These live
918                  *      in a sort of kernel/user symbiosis.
919                  */
920         case MRT_ADD_MFC:
921         case MRT_DEL_MFC:
922                 if (optlen!=sizeof(mfc))
923                         return -EINVAL;
924                 if (copy_from_user(&mfc,optval, sizeof(mfc)))
925                         return -EFAULT;
926                 rtnl_lock();
927                 if (optname==MRT_DEL_MFC)
928                         ret = ipmr_mfc_delete(&mfc);
929                 else
930                         ret = ipmr_mfc_add(&mfc, sk==mroute_socket);
931                 rtnl_unlock();
932                 return ret;
933                 /*
934                  *      Control PIM assert.
935                  */
936         case MRT_ASSERT:
937         {
938                 int v;
939                 if (get_user(v,(int __user *)optval))
940                         return -EFAULT;
941                 mroute_do_assert=(v)?1:0;
942                 return 0;
943         }
944 #ifdef CONFIG_IP_PIMSM
945         case MRT_PIM:
946         {
947                 int v, ret;
948                 if (get_user(v,(int __user *)optval))
949                         return -EFAULT;
950                 v = (v)?1:0;
951                 rtnl_lock();
952                 ret = 0;
953                 if (v != mroute_do_pim) {
954                         mroute_do_pim = v;
955                         mroute_do_assert = v;
956 #ifdef CONFIG_IP_PIMSM_V2
957                         if (mroute_do_pim)
958                                 ret = inet_add_protocol(&pim_protocol,
959                                                         IPPROTO_PIM);
960                         else
961                                 ret = inet_del_protocol(&pim_protocol,
962                                                         IPPROTO_PIM);
963                         if (ret < 0)
964                                 ret = -EAGAIN;
965 #endif
966                 }
967                 rtnl_unlock();
968                 return ret;
969         }
970 #endif
971         /*
972          *      Spurious command, or MRT_VERSION which you cannot
973          *      set.
974          */
975         default:
976                 return -ENOPROTOOPT;
977         }
978 }
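/* A minimal user-space sketch of driving this interface (illustrative
 * only; addresses and vif numbers are made up, error handling omitted):
 *
 *	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	int one = 1;
 *	setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
 *
 *	struct vifctl vc = { .vifc_vifi = 1, .vifc_threshold = 1 };
 *	vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
 *	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *
 *	struct mfcctl mc = { .mfcc_parent = 1 };
 *	mc.mfcc_origin.s_addr   = inet_addr("192.0.2.2");
 *	mc.mfcc_mcastgrp.s_addr = inet_addr("233.252.0.1");
 *	mc.mfcc_ttls[2] = 1;		// forward on vif 2 with threshold 1
 *	setsockopt(s, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
 */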
979
980 /*
981  *      getsockopt() support for the multicast routing system.
982  */
983
984 int ip_mroute_getsockopt(struct sock *sk,int optname,char __user *optval,int __user *optlen)
985 {
986         int olr;
987         int val;
988
989         if (optname!=MRT_VERSION &&
990 #ifdef CONFIG_IP_PIMSM
991            optname!=MRT_PIM &&
992 #endif
993            optname!=MRT_ASSERT)
994                 return -ENOPROTOOPT;
995
996         if (get_user(olr, optlen))
997                 return -EFAULT;
998
999         if (olr < 0)
1000                 return -EINVAL;
1001         olr = min_t(unsigned int, olr, sizeof(int));
1002
1003         if (put_user(olr,optlen))
1004                 return -EFAULT;
1005         if (optname==MRT_VERSION)
1006                 val=0x0305;
1007 #ifdef CONFIG_IP_PIMSM
1008         else if (optname==MRT_PIM)
1009                 val=mroute_do_pim;
1010 #endif
1011         else
1012                 val=mroute_do_assert;
1013         if (copy_to_user(optval,&val,olr))
1014                 return -EFAULT;
1015         return 0;
1016 }
1017
1018 /*
1019  *      The IP multicast ioctl support routines.
1020  */
1021
1022 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1023 {
1024         struct sioc_sg_req sr;
1025         struct sioc_vif_req vr;
1026         struct vif_device *vif;
1027         struct mfc_cache *c;
1028
1029         switch (cmd) {
1030         case SIOCGETVIFCNT:
1031                 if (copy_from_user(&vr,arg,sizeof(vr)))
1032                         return -EFAULT;
1033                 if (vr.vifi>=maxvif)
1034                         return -EINVAL;
1035                 read_lock(&mrt_lock);
1036                 vif=&vif_table[vr.vifi];
1037                 if (VIF_EXISTS(vr.vifi))        {
1038                         vr.icount=vif->pkt_in;
1039                         vr.ocount=vif->pkt_out;
1040                         vr.ibytes=vif->bytes_in;
1041                         vr.obytes=vif->bytes_out;
1042                         read_unlock(&mrt_lock);
1043
1044                         if (copy_to_user(arg,&vr,sizeof(vr)))
1045                                 return -EFAULT;
1046                         return 0;
1047                 }
1048                 read_unlock(&mrt_lock);
1049                 return -EADDRNOTAVAIL;
1050         case SIOCGETSGCNT:
1051                 if (copy_from_user(&sr,arg,sizeof(sr)))
1052                         return -EFAULT;
1053
1054                 read_lock(&mrt_lock);
1055                 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1056                 if (c) {
1057                         sr.pktcnt = c->mfc_un.res.pkt;
1058                         sr.bytecnt = c->mfc_un.res.bytes;
1059                         sr.wrong_if = c->mfc_un.res.wrong_if;
1060                         read_unlock(&mrt_lock);
1061
1062                         if (copy_to_user(arg,&sr,sizeof(sr)))
1063                                 return -EFAULT;
1064                         return 0;
1065                 }
1066                 read_unlock(&mrt_lock);
1067                 return -EADDRNOTAVAIL;
1068         default:
1069                 return -ENOIOCTLCMD;
1070         }
1071 }
1072
1073
1074 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1075 {
1076         struct vif_device *v;
1077         int ct;
1078         if (event != NETDEV_UNREGISTER)
1079                 return NOTIFY_DONE;
1080         v=&vif_table[0];
1081         for (ct=0;ct<maxvif;ct++,v++) {
1082                 if (v->dev==ptr)
1083                         vif_delete(ct);
1084         }
1085         return NOTIFY_DONE;
1086 }
1087
1088
1089 static struct notifier_block ip_mr_notifier={
1090         .notifier_call = ipmr_device_event,
1091 };
1092
1093 /*
1094  *      Encapsulate a packet by attaching a valid IPIP header to it.
1095  *      This avoids tunnel drivers and other mess and gives us the speed so
1096  *      important for multicast video.
1097  */
1098
1099 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1100 {
1101         struct iphdr *iph;
1102         struct iphdr *old_iph = ip_hdr(skb);
1103
1104         skb_push(skb, sizeof(struct iphdr));
1105         skb->h.raw = skb->nh.raw;
1106         skb_reset_network_header(skb);
1107         iph = ip_hdr(skb);
1108
1109         iph->version    =       4;
1110         iph->tos        =       old_iph->tos;
1111         iph->ttl        =       old_iph->ttl;
1112         iph->frag_off   =       0;
1113         iph->daddr      =       daddr;
1114         iph->saddr      =       saddr;
1115         iph->protocol   =       IPPROTO_IPIP;
1116         iph->ihl        =       5;
1117         iph->tot_len    =       htons(skb->len);
1118         ip_select_ident(iph, skb->dst, NULL);
1119         ip_send_check(iph);
1120
1121         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1122         nf_reset(skb);
1123 }
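/* Resulting layout after ip_encap() (sketch):
 *
 *	+------------------------+------------------------+---------+
 *	| outer iphdr            | original multicast     | payload |
 *	| proto = IPPROTO_IPIP   | iphdr                  |         |
 *	+------------------------+------------------------+---------+
 *
 * The outer header inherits tos and ttl from the inner one, so the
 * tunnel hop does not alter the packet's multicast scope.
 */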
1124
1125 static inline int ipmr_forward_finish(struct sk_buff *skb)
1126 {
1127         struct ip_options * opt = &(IPCB(skb)->opt);
1128
1129         IP_INC_STATS_BH(IPSTATS_MIB_OUTFORWDATAGRAMS);
1130
1131         if (unlikely(opt->optlen))
1132                 ip_forward_options(skb);
1133
1134         return dst_output(skb);
1135 }
1136
1137 /*
1138  *      Processing handlers for ipmr_forward
1139  */
1140
1141 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1142 {
1143         const struct iphdr *iph = ip_hdr(skb);
1144         struct vif_device *vif = &vif_table[vifi];
1145         struct net_device *dev;
1146         struct rtable *rt;
1147         int    encap = 0;
1148
1149         if (vif->dev == NULL)
1150                 goto out_free;
1151
1152 #ifdef CONFIG_IP_PIMSM
1153         if (vif->flags & VIFF_REGISTER) {
1154                 vif->pkt_out++;
1155                 vif->bytes_out+=skb->len;
1156                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_bytes += skb->len;
1157                 ((struct net_device_stats*)netdev_priv(vif->dev))->tx_packets++;
1158                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1159                 kfree_skb(skb);
1160                 return;
1161         }
1162 #endif
1163
1164         if (vif->flags&VIFF_TUNNEL) {
1165                 struct flowi fl = { .oif = vif->link,
1166                                     .nl_u = { .ip4_u =
1167                                               { .daddr = vif->remote,
1168                                                 .saddr = vif->local,
1169                                                 .tos = RT_TOS(iph->tos) } },
1170                                     .proto = IPPROTO_IPIP };
1171                 if (ip_route_output_key(&rt, &fl))
1172                         goto out_free;
1173                 encap = sizeof(struct iphdr);
1174         } else {
1175                 struct flowi fl = { .oif = vif->link,
1176                                     .nl_u = { .ip4_u =
1177                                               { .daddr = iph->daddr,
1178                                                 .tos = RT_TOS(iph->tos) } },
1179                                     .proto = IPPROTO_IPIP };
1180                 if (ip_route_output_key(&rt, &fl))
1181                         goto out_free;
1182         }
1183
1184         dev = rt->u.dst.dev;
1185
1186         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1187                 /* Do not fragment multicasts.  Alas, IPv4 does not
1188                    allow us to send ICMP here, so oversized packets
1189                    simply disappear into a black hole.
1190                  */
1191
1192                 IP_INC_STATS_BH(IPSTATS_MIB_FRAGFAILS);
1193                 ip_rt_put(rt);
1194                 goto out_free;
1195         }
1196
1197         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1198
1199         if (skb_cow(skb, encap)) {
1200                 ip_rt_put(rt);
1201                 goto out_free;
1202         }
1203
1204         vif->pkt_out++;
1205         vif->bytes_out+=skb->len;
1206
1207         dst_release(skb->dst);
1208         skb->dst = &rt->u.dst;
1209         ip_decrease_ttl(ip_hdr(skb));
1210
1211         /* FIXME: forward and output firewalls used to be called here.
1212          * What do we do with netfilter? -- RR */
1213         if (vif->flags & VIFF_TUNNEL) {
1214                 ip_encap(skb, vif->local, vif->remote);
1215                 /* FIXME: extra output firewall step used to be here. --RR */
1216                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_packets++;
1217                 ((struct ip_tunnel *)netdev_priv(vif->dev))->stat.tx_bytes+=skb->len;
1218         }
1219
1220         IPCB(skb)->flags |= IPSKB_FORWARDED;
1221
1222         /*
1223          * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1224          * not only before forwarding, but also after forwarding on all output
1225          * interfaces.  Clearly, if the mrouter runs a multicasting
1226          * program, that program should receive packets regardless of which
1227          * interface it joined on.
1228          * If we did not do this, the program would have to join on all
1229          * interfaces.  On the other hand, a multihoming host (or a router, but
1230          * not an mrouter) cannot join on more than one interface - that would
1231          * result in receiving multiple copies of each packet.
1232          */
1233         NF_HOOK(PF_INET, NF_IP_FORWARD, skb, skb->dev, dev,
1234                 ipmr_forward_finish);
1235         return;
1236
1237 out_free:
1238         kfree_skb(skb);
1239         return;
1240 }
1241
1242 static int ipmr_find_vif(struct net_device *dev)
1243 {
1244         int ct;
1245         for (ct=maxvif-1; ct>=0; ct--) {
1246                 if (vif_table[ct].dev == dev)
1247                         break;
1248         }
1249         return ct;
1250 }
1251
1252 /* "local" means that we should preserve one skb (for local delivery) */
1253
1254 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1255 {
1256         int psend = -1;
1257         int vif, ct;
1258
1259         vif = cache->mfc_parent;
1260         cache->mfc_un.res.pkt++;
1261         cache->mfc_un.res.bytes += skb->len;
1262
1263         /*
1264          * Wrong interface: drop packet and (maybe) send PIM assert.
1265          */
1266         if (vif_table[vif].dev != skb->dev) {
1267                 int true_vifi;
1268
1269                 if (((struct rtable*)skb->dst)->fl.iif == 0) {
1270                         /* It is our own packet, looped back.
1271                            Very complicated situation...
1272
1273                            The best workaround until the routing daemons are
1274                            fixed is not to redistribute a packet if it was
1275                            sent through the wrong interface.  It means that
1276                            multicast applications WILL NOT work for
1277                            (S,G) entries whose default multicast route points
1278                            to the wrong oif.  In any case, it is not a good
1279                            idea to run multicasting applications on a router.
1280                          */
1281                         goto dont_forward;
1282                 }
1283
1284                 cache->mfc_un.res.wrong_if++;
1285                 true_vifi = ipmr_find_vif(skb->dev);
1286
1287                 if (true_vifi >= 0 && mroute_do_assert &&
1288                     /* PIM-SM uses asserts when switching from RPT to SPT,
1289                        so we cannot check that the packet arrived on an oif.
1290                        It is bad, but otherwise we would need to move a pretty
1291                        large chunk of pimd into the kernel.  Ough... --ANK
1292                      */
1293                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1294                     time_after(jiffies,
1295                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1296                         cache->mfc_un.res.last_assert = jiffies;
1297                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1298                 }
1299                 goto dont_forward;
1300         }
1301
1302         vif_table[vif].pkt_in++;
1303         vif_table[vif].bytes_in+=skb->len;
1304
1305         /*
1306          *      Forward the frame
1307          */
1308         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1309                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1310                         if (psend != -1) {
1311                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1312                                 if (skb2)
1313                                         ipmr_queue_xmit(skb2, cache, psend);
1314                         }
1315                         psend=ct;
1316                 }
1317         }
1318         if (psend != -1) {
1319                 if (local) {
1320                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1321                         if (skb2)
1322                                 ipmr_queue_xmit(skb2, cache, psend);
1323                 } else {
1324                         ipmr_queue_xmit(skb, cache, psend);
1325                         return 0;
1326                 }
1327         }
1328
1329 dont_forward:
1330         if (!local)
1331                 kfree_skb(skb);
1332         return 0;
1333 }
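/* A worked example of the clone-minimizing loop above, reusing the
 * thresholds from the ipmr_update_thresholds() example (minvif = 1,
 * maxvif = 4, ttls[1] = 2, ttls[3] = 3, other slots 255) and a packet
 * with ttl 5: ct = 3 passes the ttl test and merely records psend = 3;
 * ct = 2 is skipped (ttl 255); ct = 1 passes, so a clone is sent on
 * vif 3 and psend becomes 1.  After the loop the original skb itself
 * goes out on vif 1 (or one more clone, if local delivery still needs
 * the original).  Forwarding on n vifs thus costs only n-1 clones.
 */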
1334
1335
1336 /*
1337  *      Multicast packets for forwarding arrive here
1338  */
1339
1340 int ip_mr_input(struct sk_buff *skb)
1341 {
1342         struct mfc_cache *cache;
1343         int local = ((struct rtable*)skb->dst)->rt_flags&RTCF_LOCAL;
1344
1345         /* The packet was looped back after forwarding; it must not be
1346            forwarded a second time, but it can still be delivered locally.
1347          */
1348         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1349                 goto dont_forward;
1350
1351         if (!local) {
1352                     if (IPCB(skb)->opt.router_alert) {
1353                             if (ip_call_ra_chain(skb))
1354                                     return 0;
1355                     } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1356                             /* IGMPv1 (and broken IGMPv2 implementations such as
1357                                Cisco IOS <= 11.2(8)) do not put the router alert
1358                                option into IGMP packets destined to routable
1359                                groups.  That is very bad, because it means
1360                                we can forward NO IGMP messages at all.
1361                              */
1362                             read_lock(&mrt_lock);
1363                             if (mroute_socket) {
1364                                     nf_reset(skb);
1365                                     raw_rcv(mroute_socket, skb);
1366                                     read_unlock(&mrt_lock);
1367                                     return 0;
1368                             }
1369                             read_unlock(&mrt_lock);
1370                     }
1371         }
1372
1373         read_lock(&mrt_lock);
1374         cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1375
1376         /*
1377          *      No usable cache entry
1378          */
1379         if (cache==NULL) {
1380                 int vif;
1381
1382                 if (local) {
1383                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1384                         ip_local_deliver(skb);
1385                         if (skb2 == NULL) {
1386                                 read_unlock(&mrt_lock);
1387                                 return -ENOBUFS;
1388                         }
1389                         skb = skb2;
1390                 }
1391
1392                 vif = ipmr_find_vif(skb->dev);
1393                 if (vif >= 0) {
1394                         int err = ipmr_cache_unresolved(vif, skb);
1395                         read_unlock(&mrt_lock);
1396
1397                         return err;
1398                 }
1399                 read_unlock(&mrt_lock);
1400                 kfree_skb(skb);
1401                 return -ENODEV;
1402         }
1403
1404         ip_mr_forward(skb, cache, local);
1405
1406         read_unlock(&mrt_lock);
1407
1408         if (local)
1409                 return ip_local_deliver(skb);
1410
1411         return 0;
1412
1413 dont_forward:
1414         if (local)
1415                 return ip_local_deliver(skb);
1416         kfree_skb(skb);
1417         return 0;
1418 }
1419
1420 #ifdef CONFIG_IP_PIMSM_V1
1421 /*
1422  * Handle IGMP messages of PIMv1
1423  */
1424
1425 int pim_rcv_v1(struct sk_buff * skb)
1426 {
1427         struct igmphdr *pim;
1428         struct iphdr   *encap;
1429         struct net_device  *reg_dev = NULL;
1430
1431         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1432                 goto drop;
1433
1434         pim = igmp_hdr(skb);
1435
1436         if (!mroute_do_pim ||
1437             skb->len < sizeof(*pim) + sizeof(*encap) ||
1438             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1439                 goto drop;
1440
1441         encap = (struct iphdr *)(skb_transport_header(skb) +
1442                                  sizeof(struct igmphdr));
1443         /*
1444            Check that:
1445            a. packet is really destined to a multicast group
1446            b. packet is not a NULL-REGISTER
1447            c. packet is not truncated
1448          */
1449         if (!MULTICAST(encap->daddr) ||
1450             encap->tot_len == 0 ||
1451             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1452                 goto drop;
1453
1454         read_lock(&mrt_lock);
1455         if (reg_vif_num >= 0)
1456                 reg_dev = vif_table[reg_vif_num].dev;
1457         if (reg_dev)
1458                 dev_hold(reg_dev);
1459         read_unlock(&mrt_lock);
1460
1461         if (reg_dev == NULL)
1462                 goto drop;
1463
1464         skb->mac.raw = skb->nh.raw;
1465         skb_pull(skb, (u8*)encap - skb->data);
1466         skb_reset_network_header(skb);
1467         skb->dev = reg_dev;
1468         skb->protocol = htons(ETH_P_IP);
1469         skb->ip_summed = 0;
1470         skb->pkt_type = PACKET_HOST;
1471         dst_release(skb->dst);
1472         skb->dst = NULL;
1473         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1474         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1475         nf_reset(skb);
1476         netif_rx(skb);
1477         dev_put(reg_dev);
1478         return 0;
1479  drop:
1480         kfree_skb(skb);
1481         return 0;
1482 }
1483 #endif
1484
1485 #ifdef CONFIG_IP_PIMSM_V2
1486 static int pim_rcv(struct sk_buff * skb)
1487 {
1488         struct pimreghdr *pim;
1489         struct iphdr   *encap;
1490         struct net_device  *reg_dev = NULL;
1491
1492         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(*encap)))
1493                 goto drop;
1494
1495         pim = (struct pimreghdr *)skb_transport_header(skb);
1496         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1497             (pim->flags&PIM_NULL_REGISTER) ||
1498             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1499              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1500                 goto drop;
1501
1502         /* check if the inner packet is destined to mcast group */
1503         encap = (struct iphdr *)(skb_transport_header(skb) +
1504                                  sizeof(struct pimreghdr));
1505         if (!MULTICAST(encap->daddr) ||
1506             encap->tot_len == 0 ||
1507             ntohs(encap->tot_len) + sizeof(*pim) > skb->len)
1508                 goto drop;
1509
1510         read_lock(&mrt_lock);
1511         if (reg_vif_num >= 0)
1512                 reg_dev = vif_table[reg_vif_num].dev;
1513         if (reg_dev)
1514                 dev_hold(reg_dev);
1515         read_unlock(&mrt_lock);
1516
1517         if (reg_dev == NULL)
1518                 goto drop;
1519
1520         skb->mac.raw = skb->nh.raw;
1521         skb_pull(skb, (u8*)encap - skb->data);
1522         skb_reset_network_header(skb);
1523         skb->dev = reg_dev;
1524         skb->protocol = htons(ETH_P_IP);
1525         skb->ip_summed = 0;
1526         skb->pkt_type = PACKET_HOST;
1527         dst_release(skb->dst);
1528         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_bytes += skb->len;
1529         ((struct net_device_stats*)netdev_priv(reg_dev))->rx_packets++;
1530         skb->dst = NULL;
1531         nf_reset(skb);
1532         netif_rx(skb);
1533         dev_put(reg_dev);
1534         return 0;
1535  drop:
1536         kfree_skb(skb);
1537         return 0;
1538 }
1539 #endif
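/* Decapsulation sketch shared by both PIM receive paths above: a
 * register packet arrives as
 *
 *	[ outer iphdr | pimreghdr (igmphdr for v1) | inner iphdr | data ]
 *
 * and after skb_pull()/skb_reset_network_header() the skb starts at the
 * inner multicast header, so netif_rx() re-injects it as if it had been
 * received directly on the pimreg device.
 */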
1540
1541 static int
1542 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1543 {
1544         int ct;
1545         struct rtnexthop *nhp;
1546         struct net_device *dev = vif_table[c->mfc_parent].dev;
1547         u8 *b = skb->tail;
1548         struct rtattr *mp_head;
1549
1550         if (dev)
1551                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1552
1553         mp_head = (struct rtattr*)skb_put(skb, RTA_LENGTH(0));
1554
1555         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1556                 if (c->mfc_un.res.ttls[ct] < 255) {
1557                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1558                                 goto rtattr_failure;
1559                         nhp = (struct rtnexthop*)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1560                         nhp->rtnh_flags = 0;
1561                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1562                         nhp->rtnh_ifindex = vif_table[ct].dev->ifindex;
1563                         nhp->rtnh_len = sizeof(*nhp);
1564                 }
1565         }
1566         mp_head->rta_type = RTA_MULTIPATH;
1567         mp_head->rta_len = skb->tail - (u8*)mp_head;
1568         rtm->rtm_type = RTN_MULTICAST;
1569         return 1;
1570
1571 rtattr_failure:
1572         skb_trim(skb, b - skb->data);
1573         return -EMSGSIZE;
1574 }
1575
1576 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1577 {
1578         int err;
1579         struct mfc_cache *cache;
1580         struct rtable *rt = (struct rtable*)skb->dst;
1581
1582         read_lock(&mrt_lock);
1583         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1584
1585         if (cache==NULL) {
1586                 struct sk_buff *skb2;
1587                 struct iphdr *iph;
1588                 struct net_device *dev;
1589                 int vif;
1590
1591                 if (nowait) {
1592                         read_unlock(&mrt_lock);
1593                         return -EAGAIN;
1594                 }
1595
1596                 dev = skb->dev;
1597                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1598                         read_unlock(&mrt_lock);
1599                         return -ENODEV;
1600                 }
1601                 skb2 = skb_clone(skb, GFP_ATOMIC);
1602                 if (!skb2) {
1603                         read_unlock(&mrt_lock);
1604                         return -ENOMEM;
1605                 }
1606
1607                 skb_push(skb2, sizeof(struct iphdr));
1608                 skb_reset_network_header(skb2);
1609                 iph = ip_hdr(skb2);
1610                 iph->ihl = sizeof(struct iphdr) >> 2;
1611                 iph->saddr = rt->rt_src;
1612                 iph->daddr = rt->rt_dst;
1613                 iph->version = 0;
1614                 err = ipmr_cache_unresolved(vif, skb2);
1615                 read_unlock(&mrt_lock);
1616                 return err;
1617         }
1618
1619         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1620                 cache->mfc_flags |= MFC_NOTIFY;
1621         err = ipmr_fill_mroute(skb, cache, rtm);
1622         read_unlock(&mrt_lock);
1623         return err;
1624 }
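/*
 * Note on the unresolved path above: when no cache entry exists yet, a
 * clone of the skb is given a minimal synthetic IP header carrying just
 * the source and group addresses; version is deliberately left 0 so
 * later code can tell this pending netlink request apart from a real
 * forwarded packet when the entry resolves or times out.
 */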
1625
1626 #ifdef CONFIG_PROC_FS
1627 /*
1628  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1629  */
1630 struct ipmr_vif_iter {
1631         int ct;
1632 };
1633
1634 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1635                                            loff_t pos)
1636 {
1637         for (iter->ct = 0; iter->ct < maxvif; ++iter->ct) {
1638                 if (!VIF_EXISTS(iter->ct))
1639                         continue;
1640                 if (pos-- == 0)
1641                         return &vif_table[iter->ct];
1642         }
1643         return NULL;
1644 }
1645
1646 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1647 {
1648         read_lock(&mrt_lock);
1649         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1650                 : SEQ_START_TOKEN;
1651 }
1652
1653 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1654 {
1655         struct ipmr_vif_iter *iter = seq->private;
1656
1657         ++*pos;
1658         if (v == SEQ_START_TOKEN)
1659                 return ipmr_vif_seq_idx(iter, 0);
1660
1661         while (++iter->ct < maxvif) {
1662                 if (!VIF_EXISTS(iter->ct))
1663                         continue;
1664                 return &vif_table[iter->ct];
1665         }
1666         return NULL;
1667 }
1668
1669 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1670 {
1671         read_unlock(&mrt_lock);
1672 }
1673
1674 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1675 {
1676         if (v == SEQ_START_TOKEN) {
1677                 seq_puts(seq,
1678                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1679         } else {
1680                 const struct vif_device *vif = v;
1681                 const char *name = vif->dev ? vif->dev->name : "none";
1682
1683                 seq_printf(seq,
1684                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1685                            vif - vif_table,
1686                            name, vif->bytes_in, vif->pkt_in,
1687                            vif->bytes_out, vif->pkt_out,
1688                            vif->flags, vif->local, vif->remote);
1689         }
1690         return 0;
1691 }
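/*
 * Illustrative /proc/net/ip_mr_vif output produced by the format above
 * (the counters and addresses here are invented for the example):
 *
 *  Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *   0 eth0          14436     123     25892     204 00000 C0A80001 00000000
 *   1 pimreg            0       0      1420      11 00004 00000000 00000000
 */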
1692
1693 static const struct seq_operations ipmr_vif_seq_ops = {
1694         .start = ipmr_vif_seq_start,
1695         .next  = ipmr_vif_seq_next,
1696         .stop  = ipmr_vif_seq_stop,
1697         .show  = ipmr_vif_seq_show,
1698 };
1699
1700 static int ipmr_vif_open(struct inode *inode, struct file *file)
1701 {
1702         struct seq_file *seq;
1703         int rc = -ENOMEM;
1704         struct ipmr_vif_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1705
1706         if (!s)
1707                 goto out;
1708
1709         rc = seq_open(file, &ipmr_vif_seq_ops);
1710         if (rc)
1711                 goto out_kfree;
1712
1713         s->ct = 0;
1714         seq = file->private_data;
1715         seq->private = s;
1716 out:
1717         return rc;
1718 out_kfree:
1719         kfree(s);
1720         goto out;
1721
1722 }
1723
1724 static const struct file_operations ipmr_vif_fops = {
1725         .owner   = THIS_MODULE,
1726         .open    = ipmr_vif_open,
1727         .read    = seq_read,
1728         .llseek  = seq_lseek,
1729         .release = seq_release_private,
1730 };
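/*
 * ipmr_vif_open() follows the usual seq_file pattern: allocate a private
 * iterator, seq_open() the file, then hang the iterator off
 * seq->private.  The matching seq_release_private() in ipmr_vif_fops
 * frees that iterator on close; a plain seq_release() here would leak
 * it.
 */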
1731
1732 struct ipmr_mfc_iter {
1733         struct mfc_cache **cache;
1734         int ct;
1735 };
1736
1737
1738 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1739 {
1740         struct mfc_cache *mfc;
1741
1742         it->cache = mfc_cache_array;
1743         read_lock(&mrt_lock);
1744         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1745                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1746                         if (pos-- == 0)
1747                                 return mfc;
1748         read_unlock(&mrt_lock);
1749
1750         it->cache = &mfc_unres_queue;
1751         spin_lock_bh(&mfc_unres_lock);
1752         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1753                 if (pos-- == 0)
1754                         return mfc;
1755         spin_unlock_bh(&mfc_unres_lock);
1756
1757         it->cache = NULL;
1758         return NULL;
1759 }
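/*
 * Locking for the mfc iterator is a handoff: resolved entries are
 * walked with mrt_lock read-held, and once the hash array is exhausted
 * the iterator drops it and takes mfc_unres_lock (a BH spinlock) for
 * the unresolved queue.  it->cache records which table, and therefore
 * which lock, is currently held, so ipmr_mfc_seq_stop() can release the
 * correct one wherever the walk ended.
 */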
1760
1761
1762 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1763 {
1764         struct ipmr_mfc_iter *it = seq->private;
1765         it->cache = NULL;
1766         it->ct = 0;
1767         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1768                 : SEQ_START_TOKEN;
1769 }
1770
1771 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1772 {
1773         struct mfc_cache *mfc = v;
1774         struct ipmr_mfc_iter *it = seq->private;
1775
1776         ++*pos;
1777
1778         if (v == SEQ_START_TOKEN)
1779                 return ipmr_mfc_seq_idx(seq->private, 0);
1780
1781         if (mfc->next)
1782                 return mfc->next;
1783
1784         if (it->cache == &mfc_unres_queue)
1785                 goto end_of_list;
1786
1787         BUG_ON(it->cache != mfc_cache_array);
1788
1789         while (++it->ct < MFC_LINES) {
1790                 mfc = mfc_cache_array[it->ct];
1791                 if (mfc)
1792                         return mfc;
1793         }
1794
1795         /* exhausted cache_array, show unresolved */
1796         read_unlock(&mrt_lock);
1797         it->cache = &mfc_unres_queue;
1798         it->ct = 0;
1799
1800         spin_lock_bh(&mfc_unres_lock);
1801         mfc = mfc_unres_queue;
1802         if (mfc)
1803                 return mfc;
1804
1805  end_of_list:
1806         spin_unlock_bh(&mfc_unres_lock);
1807         it->cache = NULL;
1808
1809         return NULL;
1810 }
1811
1812 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1813 {
1814         struct ipmr_mfc_iter *it = seq->private;
1815
1816         if (it->cache == &mfc_unres_queue)
1817                 spin_unlock_bh(&mfc_unres_lock);
1818         else if (it->cache == mfc_cache_array)
1819                 read_unlock(&mrt_lock);
1820 }
1821
1822 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1823 {
1824         int n;
1825
1826         if (v == SEQ_START_TOKEN) {
1827                 seq_puts(seq,
1828                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1829         } else {
1830                 const struct mfc_cache *mfc = v;
1831                 const struct ipmr_mfc_iter *it = seq->private;
1832
1833                 seq_printf(seq, "%08lX %08lX %-3d %8ld %8ld %8ld",
1834                            (unsigned long) mfc->mfc_mcastgrp,
1835                            (unsigned long) mfc->mfc_origin,
1836                            mfc->mfc_parent,
1837                            mfc->mfc_un.res.pkt,
1838                            mfc->mfc_un.res.bytes,
1839                            mfc->mfc_un.res.wrong_if);
1840
1841                 if (it->cache != &mfc_unres_queue) {
1842                         for (n = mfc->mfc_un.res.minvif;
1843                              n < mfc->mfc_un.res.maxvif; n++) {
1844                                 if (VIF_EXISTS(n) &&
1845                                     mfc->mfc_un.res.ttls[n] < 255)
1846                                         seq_printf(seq,
1847                                                    " %2d:%-3d",
1848                                                    n, mfc->mfc_un.res.ttls[n]);
1849                         }
1850                 }
1851                 seq_putc(seq, '\n');
1852         }
1853         return 0;
1854 }
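/*
 * Illustrative /proc/net/ip_mr_cache output from the format above.
 * Group/Origin are hex in network byte order and Oifs are "vif:ttl"
 * pairs; the values here are invented for the example:
 *
 *  Group    Origin   Iif     Pkts    Bytes    Wrong Oifs
 *  010000E0 0100A8C0 0         47     4230        0  1:1    2:1
 *  020000E0 0200A8C0 1          0        0        0
 */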
1855
1856 static const struct seq_operations ipmr_mfc_seq_ops = {
1857         .start = ipmr_mfc_seq_start,
1858         .next  = ipmr_mfc_seq_next,
1859         .stop  = ipmr_mfc_seq_stop,
1860         .show  = ipmr_mfc_seq_show,
1861 };
1862
1863 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1864 {
1865         struct seq_file *seq;
1866         int rc = -ENOMEM;
1867         struct ipmr_mfc_iter *s = kmalloc(sizeof(*s), GFP_KERNEL);
1868
1869         if (!s)
1870                 goto out;
1871
1872         rc = seq_open(file, &ipmr_mfc_seq_ops);
1873         if (rc)
1874                 goto out_kfree;
1875
1876         seq = file->private_data;
1877         seq->private = s;
1878 out:
1879         return rc;
1880 out_kfree:
1881         kfree(s);
1882         goto out;
1883
1884 }
1885
1886 static const struct file_operations ipmr_mfc_fops = {
1887         .owner   = THIS_MODULE,
1888         .open    = ipmr_mfc_open,
1889         .read    = seq_read,
1890         .llseek  = seq_lseek,
1891         .release = seq_release_private,
1892 };
1893 #endif
1894
1895 #ifdef CONFIG_IP_PIMSM_V2
1896 static struct net_protocol pim_protocol = {
1897         .handler        =       pim_rcv,
1898 };
1899 #endif
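/*
 * This handler is not registered unconditionally: when user space
 * enables PIM via the multicast-routing socket options, the protocol is
 * attached for IPPROTO_PIM, roughly as sketched below (the actual
 * registration lives elsewhere in this file):
 */
#if 0
	inet_add_protocol(&pim_protocol, IPPROTO_PIM);
#endif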
1900
1901
1902 /*
1903  *      Setup for IP multicast routing
1904  */
1905
1906 void __init ip_mr_init(void)
1907 {
1908         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1909                                        sizeof(struct mfc_cache),
1910                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1911                                        NULL, NULL);
1912         init_timer(&ipmr_expire_timer);
1913         ipmr_expire_timer.function = ipmr_expire_process;
1914         register_netdevice_notifier(&ip_mr_notifier);
1915 #ifdef CONFIG_PROC_FS
1916         proc_net_fops_create("ip_mr_vif", 0, &ipmr_vif_fops);
1917         proc_net_fops_create("ip_mr_cache", 0, &ipmr_mfc_fops);
1918 #endif
1919 }
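/*
 * ip_mr_init() runs once during IPv4 start-up: it creates the slab
 * cache backing mfc_cache entries, arms the expiry timer that ages the
 * unresolved queue, registers the notifier that tears down vifs when
 * their devices disappear, and exposes the two /proc views above.
 */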