netns: ipmr: dynamically allocate vif_table
net/ipv4/ipmr.c
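
This commit moves vif_table off the static global it used to be and hangs it
per-namespace off struct netns_ipv4 (note the init_net.ipv4.vif_table and
VIF_EXISTS(&init_net, ...) accesses throughout). A minimal sketch of the
pernet init/exit pair such a conversion typically uses follows; the hook
names and structure here are illustrative assumptions, not the verbatim
patch text.

static int __net_init ipmr_net_init(struct net *net)
{
	/* One slot per possible vif, zeroed so that VIF_EXISTS()
	 * (dev == NULL) reads as "free" initially.
	 */
	net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device),
				      GFP_KERNEL);
	return net->ipv4.vif_table ? 0 : -ENOMEM;
}

static void __net_exit ipmr_net_exit(struct net *net)
{
	kfree(net->ipv4.vif_table);
}

static struct pernet_operations ipmr_net_ops = {
	.init = ipmr_net_init,
	.exit = ipmr_net_exit,
};
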
1 /*
2  *      IP multicast routing support for mrouted 3.6/3.8
3  *
4  *              (c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
5  *        Linux Consultancy and Custom Driver Development
6  *
7  *      This program is free software; you can redistribute it and/or
8  *      modify it under the terms of the GNU General Public License
9  *      as published by the Free Software Foundation; either version
10  *      2 of the License, or (at your option) any later version.
11  *
12  *      Fixes:
13  *      Michael Chastain        :       Incorrect size of copying.
14  *      Alan Cox                :       Added the cache manager code
15  *      Alan Cox                :       Fixed the clone/copy bug and device race.
16  *      Mike McLagan            :       Routing by source
17  *      Malcolm Beattie         :       Buffer handling fixes.
18  *      Alexey Kuznetsov        :       Double buffer free and other fixes.
19  *      SVR Anand               :       Fixed several multicast bugs and problems.
20  *      Alexey Kuznetsov        :       Status, optimisations and more.
21  *      Brad Parker             :       Better behaviour on mrouted upcall
22  *                                      overflow.
23  *      Carlos Picoto           :       PIMv1 Support
24  *      Pavlin Ivanov Radoslavov:       PIMv2 Registers must checksum only PIM header
25  *                                      Relax this requirement to work with older peers.
26  *
27  */
28
29 #include <asm/system.h>
30 #include <asm/uaccess.h>
31 #include <linux/types.h>
32 #include <linux/capability.h>
33 #include <linux/errno.h>
34 #include <linux/timer.h>
35 #include <linux/mm.h>
36 #include <linux/kernel.h>
37 #include <linux/fcntl.h>
38 #include <linux/stat.h>
39 #include <linux/socket.h>
40 #include <linux/in.h>
41 #include <linux/inet.h>
42 #include <linux/netdevice.h>
43 #include <linux/inetdevice.h>
44 #include <linux/igmp.h>
45 #include <linux/proc_fs.h>
46 #include <linux/seq_file.h>
47 #include <linux/mroute.h>
48 #include <linux/init.h>
49 #include <linux/if_ether.h>
50 #include <net/net_namespace.h>
51 #include <net/ip.h>
52 #include <net/protocol.h>
53 #include <linux/skbuff.h>
54 #include <net/route.h>
55 #include <net/sock.h>
56 #include <net/icmp.h>
57 #include <net/udp.h>
58 #include <net/raw.h>
59 #include <linux/notifier.h>
60 #include <linux/if_arp.h>
61 #include <linux/netfilter_ipv4.h>
62 #include <net/ipip.h>
63 #include <net/checksum.h>
64 #include <net/netlink.h>
65
66 #if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
67 #define CONFIG_IP_PIMSM 1
68 #endif
69
70 /* Big lock, protecting vif table, mrt cache and mroute socket state.
71    Note that changes are serialized via rtnl_lock.
72  */
73
74 static DEFINE_RWLOCK(mrt_lock);
75
76 /*
77  *      Multicast router control variables
78  */
79
80 #define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)
81
82 static int mroute_do_assert;                            /* Set in PIM assert    */
83 static int mroute_do_pim;
84
85 static struct mfc_cache *mfc_cache_array[MFC_LINES];    /* Forwarding cache     */
86
87 static struct mfc_cache *mfc_unres_queue;               /* Queue of unresolved entries */
88 static atomic_t cache_resolve_queue_len;                /* Size of unresolved   */
89
90 /* Special spinlock for queue of unresolved entries */
91 static DEFINE_SPINLOCK(mfc_unres_lock);
92
93 /* We return to Alan's original scheme. The hash table of resolved
94    entries is changed only in process context and protected
95    with the weak lock mrt_lock. The queue of unresolved entries is
96    protected with the strong spinlock mfc_unres_lock.
97
98    In this case the data path is entirely free of exclusive locks.
99  */
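
/* For illustration only - hypothetical helpers, not part of this file.
 * The discipline described above: data-path readers take mrt_lock shared;
 * updaters, already serialized by rtnl_lock, take it exclusive with BHs
 * disabled because readers run in softirq context.
 */
#if 0	/* sketch */
static void example_reader(void)
{
	read_lock(&mrt_lock);
	/* ... look up vif_table / mfc_cache_array ... */
	read_unlock(&mrt_lock);
}

static void example_updater(void)
{
	ASSERT_RTNL();
	write_lock_bh(&mrt_lock);
	/* ... unlink or relink entries ... */
	write_unlock_bh(&mrt_lock);
}
#endif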
100
101 static struct kmem_cache *mrt_cachep __read_mostly;
102
103 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
104 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
105 static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);
106
107 #ifdef CONFIG_IP_PIMSM_V2
108 static struct net_protocol pim_protocol;
109 #endif
110
111 static struct timer_list ipmr_expire_timer;
112
113 /* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */
114
115 static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
116 {
117         dev_close(dev);
118
119         dev = __dev_get_by_name(&init_net, "tunl0");
120         if (dev) {
121                 const struct net_device_ops *ops = dev->netdev_ops;
122                 struct ifreq ifr;
123                 struct ip_tunnel_parm p;
124
125                 memset(&p, 0, sizeof(p));
126                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
127                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
128                 p.iph.version = 4;
129                 p.iph.ihl = 5;
130                 p.iph.protocol = IPPROTO_IPIP;
131                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
132                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
133
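                /* The get_fs()/set_fs(KERNEL_DS) dance below lets the tunnel
                 * driver's ioctl handler, which normally expects a user-space
                 * pointer in ifr, accept this kernel-space ifreq instead.
                 */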
134                 if (ops->ndo_do_ioctl) {
135                         mm_segment_t oldfs = get_fs();
136
137                         set_fs(KERNEL_DS);
138                         ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
139                         set_fs(oldfs);
140                 }
141         }
142 }
143
144 static
145 struct net_device *ipmr_new_tunnel(struct vifctl *v)
146 {
147         struct net_device  *dev;
148
149         dev = __dev_get_by_name(&init_net, "tunl0");
150
151         if (dev) {
152                 const struct net_device_ops *ops = dev->netdev_ops;
153                 int err;
154                 struct ifreq ifr;
155                 struct ip_tunnel_parm p;
156                 struct in_device  *in_dev;
157
158                 memset(&p, 0, sizeof(p));
159                 p.iph.daddr = v->vifc_rmt_addr.s_addr;
160                 p.iph.saddr = v->vifc_lcl_addr.s_addr;
161                 p.iph.version = 4;
162                 p.iph.ihl = 5;
163                 p.iph.protocol = IPPROTO_IPIP;
164                 sprintf(p.name, "dvmrp%d", v->vifc_vifi);
165                 ifr.ifr_ifru.ifru_data = (__force void __user *)&p;
166
167                 if (ops->ndo_do_ioctl) {
168                         mm_segment_t oldfs = get_fs();
169
170                         set_fs(KERNEL_DS);
171                         err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
172                         set_fs(oldfs);
173                 } else
174                         err = -EOPNOTSUPP;
175
176                 dev = NULL;
177
178                 if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
179                         dev->flags |= IFF_MULTICAST;
180
181                         in_dev = __in_dev_get_rtnl(dev);
182                         if (in_dev == NULL)
183                                 goto failure;
184
185                         ipv4_devconf_setall(in_dev);
186                         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
187
188                         if (dev_open(dev))
189                                 goto failure;
190                         dev_hold(dev);
191                 }
192         }
193         return dev;
194
195 failure:
196         /* allow the register to be completed before unregistering. */
197         rtnl_unlock();
198         rtnl_lock();
199
200         unregister_netdevice(dev);
201         return NULL;
202 }
203
204 #ifdef CONFIG_IP_PIMSM
205
206 static int reg_vif_num = -1;
207
208 static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
209 {
210         read_lock(&mrt_lock);
211         dev->stats.tx_bytes += skb->len;
212         dev->stats.tx_packets++;
213         ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
214         read_unlock(&mrt_lock);
215         kfree_skb(skb);
216         return 0;
217 }
218
219 static const struct net_device_ops reg_vif_netdev_ops = {
220         .ndo_start_xmit = reg_vif_xmit,
221 };
222
223 static void reg_vif_setup(struct net_device *dev)
224 {
225         dev->type               = ARPHRD_PIMREG;
226         dev->mtu                = ETH_DATA_LEN - sizeof(struct iphdr) - 8;
227         dev->flags              = IFF_NOARP;
228         dev->netdev_ops         = &reg_vif_netdev_ops;
229         dev->destructor         = free_netdev;
230 }
231
232 static struct net_device *ipmr_reg_vif(void)
233 {
234         struct net_device *dev;
235         struct in_device *in_dev;
236
237         dev = alloc_netdev(0, "pimreg", reg_vif_setup);
238
239         if (dev == NULL)
240                 return NULL;
241
242         if (register_netdevice(dev)) {
243                 free_netdev(dev);
244                 return NULL;
245         }
246         dev->iflink = 0;
247
248         rcu_read_lock();
249         if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
250                 rcu_read_unlock();
251                 goto failure;
252         }
253
254         ipv4_devconf_setall(in_dev);
255         IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
256         rcu_read_unlock();
257
258         if (dev_open(dev))
259                 goto failure;
260
261         dev_hold(dev);
262
263         return dev;
264
265 failure:
266         /* allow the register to be completed before unregistering. */
267         rtnl_unlock();
268         rtnl_lock();
269
270         unregister_netdevice(dev);
271         return NULL;
272 }
273 #endif
274
275 /*
276  *      Delete a VIF entry
277  *      @notify: Set to 1 if the caller is a notifier_call
278  */
279
280 static int vif_delete(int vifi, int notify)
281 {
282         struct vif_device *v;
283         struct net_device *dev;
284         struct in_device *in_dev;
285
286         if (vifi < 0 || vifi >= init_net.ipv4.maxvif)
287                 return -EADDRNOTAVAIL;
288
289         v = &init_net.ipv4.vif_table[vifi];
290
291         write_lock_bh(&mrt_lock);
292         dev = v->dev;
293         v->dev = NULL;
294
295         if (!dev) {
296                 write_unlock_bh(&mrt_lock);
297                 return -EADDRNOTAVAIL;
298         }
299
300 #ifdef CONFIG_IP_PIMSM
301         if (vifi == reg_vif_num)
302                 reg_vif_num = -1;
303 #endif
304
305         if (vifi+1 == init_net.ipv4.maxvif) {
306                 int tmp;
307                 for (tmp=vifi-1; tmp>=0; tmp--) {
308                         if (VIF_EXISTS(&init_net, tmp))
309                                 break;
310                 }
311                 init_net.ipv4.maxvif = tmp+1;
312         }
313
314         write_unlock_bh(&mrt_lock);
315
316         dev_set_allmulti(dev, -1);
317
318         if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
319                 IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
320                 ip_rt_multicast_event(in_dev);
321         }
322
323         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
324                 unregister_netdevice(dev);
325
326         dev_put(dev);
327         return 0;
328 }
329
330 /* Destroy an unresolved cache entry, killing queued skbs
331    and reporting error to netlink readers.
332  */
333
334 static void ipmr_destroy_unres(struct mfc_cache *c)
335 {
336         struct sk_buff *skb;
337         struct nlmsgerr *e;
338
339         atomic_dec(&cache_resolve_queue_len);
340
341         while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
342                 if (ip_hdr(skb)->version == 0) {
343                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
344                         nlh->nlmsg_type = NLMSG_ERROR;
345                         nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
346                         skb_trim(skb, nlh->nlmsg_len);
347                         e = NLMSG_DATA(nlh);
348                         e->error = -ETIMEDOUT;
349                         memset(&e->msg, 0, sizeof(e->msg));
350
351                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
352                 } else
353                         kfree_skb(skb);
354         }
355
356         kmem_cache_free(mrt_cachep, c);
357 }
358
359
360 /* Single timer process for all the unresolved queue. */
361
362 static void ipmr_expire_process(unsigned long dummy)
363 {
364         unsigned long now;
365         unsigned long expires;
366         struct mfc_cache *c, **cp;
367
368         if (!spin_trylock(&mfc_unres_lock)) {
369                 mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
370                 return;
371         }
372
373         if (atomic_read(&cache_resolve_queue_len) == 0)
374                 goto out;
375
376         now = jiffies;
377         expires = 10*HZ;
378         cp = &mfc_unres_queue;
379
380         while ((c=*cp) != NULL) {
381                 if (time_after(c->mfc_un.unres.expires, now)) {
382                         unsigned long interval = c->mfc_un.unres.expires - now;
383                         if (interval < expires)
384                                 expires = interval;
385                         cp = &c->next;
386                         continue;
387                 }
388
389                 *cp = c->next;
390
391                 ipmr_destroy_unres(c);
392         }
393
394         if (atomic_read(&cache_resolve_queue_len))
395                 mod_timer(&ipmr_expire_timer, jiffies + expires);
396
397 out:
398         spin_unlock(&mfc_unres_lock);
399 }
400
401 /* Fill oifs list. It is called under write locked mrt_lock. */
402
403 static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
404 {
405         int vifi;
406
407         cache->mfc_un.res.minvif = MAXVIFS;
408         cache->mfc_un.res.maxvif = 0;
409         memset(cache->mfc_un.res.ttls, 255, MAXVIFS);
410
411         for (vifi = 0; vifi < init_net.ipv4.maxvif; vifi++) {
412                 if (VIF_EXISTS(&init_net, vifi) &&
413                     ttls[vifi] && ttls[vifi] < 255) {
414                         cache->mfc_un.res.ttls[vifi] = ttls[vifi];
415                         if (cache->mfc_un.res.minvif > vifi)
416                                 cache->mfc_un.res.minvif = vifi;
417                         if (cache->mfc_un.res.maxvif <= vifi)
418                                 cache->mfc_un.res.maxvif = vifi + 1;
419                 }
420         }
421 }
422
423 static int vif_add(struct vifctl *vifc, int mrtsock)
424 {
425         int vifi = vifc->vifc_vifi;
426         struct vif_device *v = &init_net.ipv4.vif_table[vifi];
427         struct net_device *dev;
428         struct in_device *in_dev;
429         int err;
430
431         /* Is vif busy ? */
432         if (VIF_EXISTS(&init_net, vifi))
433                 return -EADDRINUSE;
434
435         switch (vifc->vifc_flags) {
436 #ifdef CONFIG_IP_PIMSM
437         case VIFF_REGISTER:
438                 /*
439                  * Special Purpose VIF in PIM
440                  * All the packets will be sent to the daemon
441                  */
442                 if (reg_vif_num >= 0)
443                         return -EADDRINUSE;
444                 dev = ipmr_reg_vif();
445                 if (!dev)
446                         return -ENOBUFS;
447                 err = dev_set_allmulti(dev, 1);
448                 if (err) {
449                         unregister_netdevice(dev);
450                         dev_put(dev);
451                         return err;
452                 }
453                 break;
454 #endif
455         case VIFF_TUNNEL:
456                 dev = ipmr_new_tunnel(vifc);
457                 if (!dev)
458                         return -ENOBUFS;
459                 err = dev_set_allmulti(dev, 1);
460                 if (err) {
461                         ipmr_del_tunnel(dev, vifc);
462                         dev_put(dev);
463                         return err;
464                 }
465                 break;
466         case 0:
467                 dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
468                 if (!dev)
469                         return -EADDRNOTAVAIL;
470                 err = dev_set_allmulti(dev, 1);
471                 if (err) {
472                         dev_put(dev);
473                         return err;
474                 }
475                 break;
476         default:
477                 return -EINVAL;
478         }
479
480         if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
481                 return -EADDRNOTAVAIL;
482         IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
483         ip_rt_multicast_event(in_dev);
484
485         /*
486          *      Fill in the VIF structures
487          */
488         v->rate_limit = vifc->vifc_rate_limit;
489         v->local = vifc->vifc_lcl_addr.s_addr;
490         v->remote = vifc->vifc_rmt_addr.s_addr;
491         v->flags = vifc->vifc_flags;
492         if (!mrtsock)
493                 v->flags |= VIFF_STATIC;
494         v->threshold = vifc->vifc_threshold;
495         v->bytes_in = 0;
496         v->bytes_out = 0;
497         v->pkt_in = 0;
498         v->pkt_out = 0;
499         v->link = dev->ifindex;
500         if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
501                 v->link = dev->iflink;
502
503         /* And finish update writing critical data */
504         write_lock_bh(&mrt_lock);
505         v->dev = dev;
506 #ifdef CONFIG_IP_PIMSM
507         if (v->flags&VIFF_REGISTER)
508                 reg_vif_num = vifi;
509 #endif
510         if (vifi+1 > init_net.ipv4.maxvif)
511                 init_net.ipv4.maxvif = vifi+1;
512         write_unlock_bh(&mrt_lock);
513         return 0;
514 }
515
516 static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
517 {
518         int line = MFC_HASH(mcastgrp, origin);
519         struct mfc_cache *c;
520
521         for (c=mfc_cache_array[line]; c; c = c->next) {
522                 if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
523                         break;
524         }
525         return c;
526 }
527
528 /*
529  *      Allocate a multicast cache entry
530  */
531 static struct mfc_cache *ipmr_cache_alloc(void)
532 {
533         struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
534         if (c == NULL)
535                 return NULL;
536         c->mfc_un.res.minvif = MAXVIFS;
537         return c;
538 }
539
540 static struct mfc_cache *ipmr_cache_alloc_unres(void)
541 {
542         struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
543         if (c == NULL)
544                 return NULL;
545         skb_queue_head_init(&c->mfc_un.unres.unresolved);
546         c->mfc_un.unres.expires = jiffies + 10*HZ;
547         return c;
548 }
549
550 /*
551  *      A cache entry has gone into a resolved state from queued
552  */
553
554 static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
555 {
556         struct sk_buff *skb;
557         struct nlmsgerr *e;
558
559         /*
560          *      Play the pending entries through our router
561          */
562
563         while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
564                 if (ip_hdr(skb)->version == 0) {
565                         struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
566
567                         if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
568                                 nlh->nlmsg_len = (skb_tail_pointer(skb) -
569                                                   (u8 *)nlh);
570                         } else {
571                                 nlh->nlmsg_type = NLMSG_ERROR;
572                                 nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
573                                 skb_trim(skb, nlh->nlmsg_len);
574                                 e = NLMSG_DATA(nlh);
575                                 e->error = -EMSGSIZE;
576                                 memset(&e->msg, 0, sizeof(e->msg));
577                         }
578
579                         rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
580                 } else
581                         ip_mr_forward(skb, c, 0);
582         }
583 }
584
585 /*
586  *      Bounce a cache query up to mrouted. We could use netlink for this but mrouted
587  *      expects the following bizarre scheme.
588  *
589  *      Called under mrt_lock.
590  */
591
592 static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
593 {
594         struct sk_buff *skb;
595         const int ihl = ip_hdrlen(pkt);
596         struct igmphdr *igmp;
597         struct igmpmsg *msg;
598         int ret;
599
600 #ifdef CONFIG_IP_PIMSM
601         if (assert == IGMPMSG_WHOLEPKT)
602                 skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
603         else
604 #endif
605                 skb = alloc_skb(128, GFP_ATOMIC);
606
607         if (!skb)
608                 return -ENOBUFS;
609
610 #ifdef CONFIG_IP_PIMSM
611         if (assert == IGMPMSG_WHOLEPKT) {
612                 /* Ugly, but we have no choice with this interface.
613                    Duplicate old header, fix ihl, length etc.
614                    And all this only to mangle msg->im_msgtype and
615                    to set msg->im_mbz to "mbz" :-)
616                  */
617                 skb_push(skb, sizeof(struct iphdr));
618                 skb_reset_network_header(skb);
619                 skb_reset_transport_header(skb);
620                 msg = (struct igmpmsg *)skb_network_header(skb);
621                 memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
622                 msg->im_msgtype = IGMPMSG_WHOLEPKT;
623                 msg->im_mbz = 0;
624                 msg->im_vif = reg_vif_num;
625                 ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
626                 ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
627                                              sizeof(struct iphdr));
628         } else
629 #endif
630         {
631
632         /*
633          *      Copy the IP header
634          */
635
636         skb->network_header = skb->tail;
637         skb_put(skb, ihl);
638         skb_copy_to_linear_data(skb, pkt->data, ihl);
639         ip_hdr(skb)->protocol = 0;                      /* Flag to the kernel this is a route add */
640         msg = (struct igmpmsg *)skb_network_header(skb);
641         msg->im_vif = vifi;
642         skb->dst = dst_clone(pkt->dst);
643
644         /*
645          *      Add our header
646          */
647
648         igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
649         igmp->type      =
650         msg->im_msgtype = assert;
651         igmp->code      =       0;
652         ip_hdr(skb)->tot_len = htons(skb->len);                 /* Fix the length */
653         skb->transport_header = skb->network_header;
654         }
655
656         if (init_net.ipv4.mroute_sk == NULL) {
657                 kfree_skb(skb);
658                 return -EINVAL;
659         }
660
661         /*
662          *      Deliver to mrouted
663          */
664         ret = sock_queue_rcv_skb(init_net.ipv4.mroute_sk, skb);
665         if (ret < 0) {
666                 if (net_ratelimit())
667                         printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
668                 kfree_skb(skb);
669         }
670
671         return ret;
672 }
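
/* For illustration only (userspace, hence #if 0): a hedged sketch of how a
 * daemon like mrouted might consume the upcalls queued above. It reads a
 * struct igmpmsg from the raw IGMP socket it registered with MRT_INIT;
 * im_mbz == 0 distinguishes upcalls from real IGMP, since the kernel stored
 * 0 in the protocol field. Error handling is elided.
 */
#if 0	/* userspace example */
#include <sys/socket.h>
#include <netinet/in.h>
#include <linux/mroute.h>

static void read_upcall(int mrouter_fd)
{
	char buf[2048];
	struct igmpmsg *msg = (struct igmpmsg *)buf;
	ssize_t n = recv(mrouter_fd, buf, sizeof(buf), 0);

	if (n < (ssize_t)sizeof(*msg) || msg->im_mbz != 0)
		return;
	switch (msg->im_msgtype) {
	case IGMPMSG_NOCACHE:	/* resolve: install an entry via MRT_ADD_MFC */
		break;
	case IGMPMSG_WRONGVIF:	/* PIM assert processing */
		break;
	case IGMPMSG_WHOLEPKT:	/* PIM register processing */
		break;
	}
}
#endif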
673
674 /*
675  *      Queue a packet for resolution; an unresolved cache entry is created
676  *      for it under mfc_unres_lock if needed.
676  */
677
678 static int
679 ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
680 {
681         int err;
682         struct mfc_cache *c;
683         const struct iphdr *iph = ip_hdr(skb);
684
685         spin_lock_bh(&mfc_unres_lock);
686         for (c=mfc_unres_queue; c; c=c->next) {
687                 if (c->mfc_mcastgrp == iph->daddr &&
688                     c->mfc_origin == iph->saddr)
689                         break;
690         }
691
692         if (c == NULL) {
693                 /*
694                  *      Create a new entry if allowable
695                  */
696
697                 if (atomic_read(&cache_resolve_queue_len) >= 10 ||
698                     (c=ipmr_cache_alloc_unres())==NULL) {
699                         spin_unlock_bh(&mfc_unres_lock);
700
701                         kfree_skb(skb);
702                         return -ENOBUFS;
703                 }
704
705                 /*
706                  *      Fill in the new cache entry
707                  */
708                 c->mfc_parent   = -1;
709                 c->mfc_origin   = iph->saddr;
710                 c->mfc_mcastgrp = iph->daddr;
711
712                 /*
713                  *      Reflect first query at mrouted.
714                  */
715                 if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
716                         /* If the report failed throw the cache entry
717                            out - Brad Parker
718                          */
719                         spin_unlock_bh(&mfc_unres_lock);
720
721                         kmem_cache_free(mrt_cachep, c);
722                         kfree_skb(skb);
723                         return err;
724                 }
725
726                 atomic_inc(&cache_resolve_queue_len);
727                 c->next = mfc_unres_queue;
728                 mfc_unres_queue = c;
729
730                 mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
731         }
732
733         /*
734          *      See if we can append the packet
735          */
736         if (c->mfc_un.unres.unresolved.qlen>3) {
737                 kfree_skb(skb);
738                 err = -ENOBUFS;
739         } else {
740                 skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
741                 err = 0;
742         }
743
744         spin_unlock_bh(&mfc_unres_lock);
745         return err;
746 }
747
748 /*
749  *      MFC cache manipulation by user space mroute daemon
750  */
751
752 static int ipmr_mfc_delete(struct mfcctl *mfc)
753 {
754         int line;
755         struct mfc_cache *c, **cp;
756
757         line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
758
759         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
760                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
761                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
762                         write_lock_bh(&mrt_lock);
763                         *cp = c->next;
764                         write_unlock_bh(&mrt_lock);
765
766                         kmem_cache_free(mrt_cachep, c);
767                         return 0;
768                 }
769         }
770         return -ENOENT;
771 }
772
773 static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
774 {
775         int line;
776         struct mfc_cache *uc, *c, **cp;
777
778         line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);
779
780         for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
781                 if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
782                     c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
783                         break;
784         }
785
786         if (c != NULL) {
787                 write_lock_bh(&mrt_lock);
788                 c->mfc_parent = mfc->mfcc_parent;
789                 ipmr_update_thresholds(c, mfc->mfcc_ttls);
790                 if (!mrtsock)
791                         c->mfc_flags |= MFC_STATIC;
792                 write_unlock_bh(&mrt_lock);
793                 return 0;
794         }
795
796         if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
797                 return -EINVAL;
798
799         c = ipmr_cache_alloc();
800         if (c == NULL)
801                 return -ENOMEM;
802
803         c->mfc_origin = mfc->mfcc_origin.s_addr;
804         c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
805         c->mfc_parent = mfc->mfcc_parent;
806         ipmr_update_thresholds(c, mfc->mfcc_ttls);
807         if (!mrtsock)
808                 c->mfc_flags |= MFC_STATIC;
809
810         write_lock_bh(&mrt_lock);
811         c->next = mfc_cache_array[line];
812         mfc_cache_array[line] = c;
813         write_unlock_bh(&mrt_lock);
814
815         /*
816          *      Check to see if we resolved a queued list. If so we
817          *      need to send on the frames and tidy up.
818          */
819         spin_lock_bh(&mfc_unres_lock);
820         for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
821              cp = &uc->next) {
822                 if (uc->mfc_origin == c->mfc_origin &&
823                     uc->mfc_mcastgrp == c->mfc_mcastgrp) {
824                         *cp = uc->next;
825                         if (atomic_dec_and_test(&cache_resolve_queue_len))
826                                 del_timer(&ipmr_expire_timer);
827                         break;
828                 }
829         }
830         spin_unlock_bh(&mfc_unres_lock);
831
832         if (uc) {
833                 ipmr_cache_resolve(uc, c);
834                 kmem_cache_free(mrt_cachep, uc);
835         }
836         return 0;
837 }
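
/* For illustration only (userspace, hence #if 0): a hedged sketch of
 * installing the (S,G) entry that ipmr_mfc_add() above parses. mfcc_ttls[]
 * carries one TTL threshold per vif; 0 disables forwarding on that vif.
 * Addresses are documentation examples.
 */
#if 0	/* userspace example */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/mroute.h>
#include <string.h>

static int add_route(int mrouter_fd)
{
	struct mfcctl mc;

	memset(&mc, 0, sizeof(mc));
	mc.mfcc_origin.s_addr   = inet_addr("192.0.2.1");	/* source S */
	mc.mfcc_mcastgrp.s_addr = inet_addr("233.252.0.1");	/* group G  */
	mc.mfcc_parent = 0;		/* incoming vif */
	mc.mfcc_ttls[1] = 1;		/* forward on vif 1 if TTL > 1 */
	return setsockopt(mrouter_fd, IPPROTO_IP, MRT_ADD_MFC,
			  &mc, sizeof(mc));
}
#endif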
838
839 /*
840  *      Close the multicast socket, and clear the vif tables etc
841  */
842
843 static void mroute_clean_tables(struct sock *sk)
844 {
845         int i;
846
847         /*
848          *      Shut down all active vif entries
849          */
850         for (i = 0; i < init_net.ipv4.maxvif; i++) {
851                 if (!(init_net.ipv4.vif_table[i].flags&VIFF_STATIC))
852                         vif_delete(i, 0);
853         }
854
855         /*
856          *      Wipe the cache
857          */
858         for (i=0; i<MFC_LINES; i++) {
859                 struct mfc_cache *c, **cp;
860
861                 cp = &mfc_cache_array[i];
862                 while ((c = *cp) != NULL) {
863                         if (c->mfc_flags&MFC_STATIC) {
864                                 cp = &c->next;
865                                 continue;
866                         }
867                         write_lock_bh(&mrt_lock);
868                         *cp = c->next;
869                         write_unlock_bh(&mrt_lock);
870
871                         kmem_cache_free(mrt_cachep, c);
872                 }
873         }
874
875         if (atomic_read(&cache_resolve_queue_len) != 0) {
876                 struct mfc_cache *c;
877
878                 spin_lock_bh(&mfc_unres_lock);
879                 while (mfc_unres_queue != NULL) {
880                         c = mfc_unres_queue;
881                         mfc_unres_queue = c->next;
882                         spin_unlock_bh(&mfc_unres_lock);
883
884                         ipmr_destroy_unres(c);
885
886                         spin_lock_bh(&mfc_unres_lock);
887                 }
888                 spin_unlock_bh(&mfc_unres_lock);
889         }
890 }
891
892 static void mrtsock_destruct(struct sock *sk)
893 {
894         rtnl_lock();
895         if (sk == init_net.ipv4.mroute_sk) {
896                 IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;
897
898                 write_lock_bh(&mrt_lock);
899                 init_net.ipv4.mroute_sk = NULL;
900                 write_unlock_bh(&mrt_lock);
901
902                 mroute_clean_tables(sk);
903         }
904         rtnl_unlock();
905 }
906
907 /*
908  *      Socket options and virtual interface manipulation. The whole
909  *      virtual interface system is a complete heap, but unfortunately
910  *      that's how BSD mrouted happens to think. Maybe one day with a proper
911  *      MOSPF/PIM router set up we can clean this up.
912  */
913
914 int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
915 {
916         int ret;
917         struct vifctl vif;
918         struct mfcctl mfc;
919
920         if (optname != MRT_INIT) {
921                 if (sk != init_net.ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
922                         return -EACCES;
923         }
924
925         switch (optname) {
926         case MRT_INIT:
927                 if (sk->sk_type != SOCK_RAW ||
928                     inet_sk(sk)->num != IPPROTO_IGMP)
929                         return -EOPNOTSUPP;
930                 if (optlen != sizeof(int))
931                         return -ENOPROTOOPT;
932
933                 rtnl_lock();
934                 if (init_net.ipv4.mroute_sk) {
935                         rtnl_unlock();
936                         return -EADDRINUSE;
937                 }
938
939                 ret = ip_ra_control(sk, 1, mrtsock_destruct);
940                 if (ret == 0) {
941                         write_lock_bh(&mrt_lock);
942                         init_net.ipv4.mroute_sk = sk;
943                         write_unlock_bh(&mrt_lock);
944
945                         IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
946                 }
947                 rtnl_unlock();
948                 return ret;
949         case MRT_DONE:
950                 if (sk != init_net.ipv4.mroute_sk)
951                         return -EACCES;
952                 return ip_ra_control(sk, 0, NULL);
953         case MRT_ADD_VIF:
954         case MRT_DEL_VIF:
955                 if (optlen != sizeof(vif))
956                         return -EINVAL;
957                 if (copy_from_user(&vif, optval, sizeof(vif)))
958                         return -EFAULT;
959                 if (vif.vifc_vifi >= MAXVIFS)
960                         return -ENFILE;
961                 rtnl_lock();
962                 if (optname == MRT_ADD_VIF) {
963                         ret = vif_add(&vif, sk == init_net.ipv4.mroute_sk);
964                 } else {
965                         ret = vif_delete(vif.vifc_vifi, 0);
966                 }
967                 rtnl_unlock();
968                 return ret;
969
970                 /*
971                  *      Manipulate the forwarding caches. These live
972                  *      in a sort of kernel/user symbiosis.
973                  */
974         case MRT_ADD_MFC:
975         case MRT_DEL_MFC:
976                 if (optlen != sizeof(mfc))
977                         return -EINVAL;
978                 if (copy_from_user(&mfc, optval, sizeof(mfc)))
979                         return -EFAULT;
980                 rtnl_lock();
981                 if (optname == MRT_DEL_MFC)
982                         ret = ipmr_mfc_delete(&mfc);
983                 else
984                         ret = ipmr_mfc_add(&mfc, sk == init_net.ipv4.mroute_sk);
985                 rtnl_unlock();
986                 return ret;
987                 /*
988                  *      Control PIM assert.
989                  */
990         case MRT_ASSERT:
991         {
992                 int v;
993                 if (get_user(v,(int __user *)optval))
994                         return -EFAULT;
995                 mroute_do_assert=(v)?1:0;
996                 return 0;
997         }
998 #ifdef CONFIG_IP_PIMSM
999         case MRT_PIM:
1000         {
1001                 int v;
1002
1003                 if (get_user(v,(int __user *)optval))
1004                         return -EFAULT;
1005                 v = (v) ? 1 : 0;
1006
1007                 rtnl_lock();
1008                 ret = 0;
1009                 if (v != mroute_do_pim) {
1010                         mroute_do_pim = v;
1011                         mroute_do_assert = v;
1012 #ifdef CONFIG_IP_PIMSM_V2
1013                         if (mroute_do_pim)
1014                                 ret = inet_add_protocol(&pim_protocol,
1015                                                         IPPROTO_PIM);
1016                         else
1017                                 ret = inet_del_protocol(&pim_protocol,
1018                                                         IPPROTO_PIM);
1019                         if (ret < 0)
1020                                 ret = -EAGAIN;
1021 #endif
1022                 }
1023                 rtnl_unlock();
1024                 return ret;
1025         }
1026 #endif
1027         /*
1028          *      Spurious command, or MRT_VERSION which you cannot
1029          *      set.
1030          */
1031         default:
1032                 return -ENOPROTOOPT;
1033         }
1034 }
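
/* For illustration only (userspace, hence #if 0): a hedged sketch of the
 * control sequence the setsockopt handlers above implement - open a raw
 * IGMP socket, become the mrouter with MRT_INIT, then add a vif. Values
 * are illustrative.
 */
#if 0	/* userspace example */
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/mroute.h>
#include <string.h>

static int become_mrouter(void)
{
	int one = 1;
	int fd = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
	struct vifctl vc;

	if (fd < 0 ||
	    setsockopt(fd, IPPROTO_IP, MRT_INIT, &one, sizeof(one)) < 0)
		return -1;

	memset(&vc, 0, sizeof(vc));
	vc.vifc_vifi = 0;				/* slot in vif_table */
	vc.vifc_threshold = 1;				/* minimum forwarding TTL */
	vc.vifc_lcl_addr.s_addr = inet_addr("192.0.2.1");
	if (setsockopt(fd, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc)) < 0)
		return -1;
	return fd;
}
#endif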
1035
1036 /*
1037  *      Getsock opt support for the multicast routing system.
1038  */
1039
1040 int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
1041 {
1042         int olr;
1043         int val;
1044
1045         if (optname != MRT_VERSION &&
1046 #ifdef CONFIG_IP_PIMSM
1047            optname!=MRT_PIM &&
1048 #endif
1049            optname!=MRT_ASSERT)
1050                 return -ENOPROTOOPT;
1051
1052         if (get_user(olr, optlen))
1053                 return -EFAULT;
1054
1055         if (olr < 0)
1056                 return -EINVAL;
1057         olr = min_t(unsigned int, olr, sizeof(int));
1058
1059         if (put_user(olr, optlen))
1060                 return -EFAULT;
1061         if (optname == MRT_VERSION)
1062                 val = 0x0305;
1063 #ifdef CONFIG_IP_PIMSM
1064         else if (optname == MRT_PIM)
1065                 val = mroute_do_pim;
1066 #endif
1067         else
1068                 val = mroute_do_assert;
1069         if (copy_to_user(optval, &val, olr))
1070                 return -EFAULT;
1071         return 0;
1072 }
1073
1074 /*
1075  *      The IP multicast ioctl support routines.
1076  */
1077
1078 int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
1079 {
1080         struct sioc_sg_req sr;
1081         struct sioc_vif_req vr;
1082         struct vif_device *vif;
1083         struct mfc_cache *c;
1084
1085         switch (cmd) {
1086         case SIOCGETVIFCNT:
1087                 if (copy_from_user(&vr, arg, sizeof(vr)))
1088                         return -EFAULT;
1089                 if (vr.vifi >= init_net.ipv4.maxvif)
1090                         return -EINVAL;
1091                 read_lock(&mrt_lock);
1092                 vif = &init_net.ipv4.vif_table[vr.vifi];
1093                 if (VIF_EXISTS(&init_net, vr.vifi)) {
1094                         vr.icount = vif->pkt_in;
1095                         vr.ocount = vif->pkt_out;
1096                         vr.ibytes = vif->bytes_in;
1097                         vr.obytes = vif->bytes_out;
1098                         read_unlock(&mrt_lock);
1099
1100                         if (copy_to_user(arg, &vr, sizeof(vr)))
1101                                 return -EFAULT;
1102                         return 0;
1103                 }
1104                 read_unlock(&mrt_lock);
1105                 return -EADDRNOTAVAIL;
1106         case SIOCGETSGCNT:
1107                 if (copy_from_user(&sr, arg, sizeof(sr)))
1108                         return -EFAULT;
1109
1110                 read_lock(&mrt_lock);
1111                 c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
1112                 if (c) {
1113                         sr.pktcnt = c->mfc_un.res.pkt;
1114                         sr.bytecnt = c->mfc_un.res.bytes;
1115                         sr.wrong_if = c->mfc_un.res.wrong_if;
1116                         read_unlock(&mrt_lock);
1117
1118                         if (copy_to_user(arg, &sr, sizeof(sr)))
1119                                 return -EFAULT;
1120                         return 0;
1121                 }
1122                 read_unlock(&mrt_lock);
1123                 return -EADDRNOTAVAIL;
1124         default:
1125                 return -ENOIOCTLCMD;
1126         }
1127 }
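
/* For illustration only (userspace, hence #if 0): reading the per-flow
 * counters that the SIOCGETSGCNT branch above fills in.
 */
#if 0	/* userspace example */
#include <sys/ioctl.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/mroute.h>
#include <string.h>

static unsigned long flow_packets(int mrouter_fd)
{
	struct sioc_sg_req sr;

	memset(&sr, 0, sizeof(sr));
	sr.src.s_addr = inet_addr("192.0.2.1");
	sr.grp.s_addr = inet_addr("233.252.0.1");
	if (ioctl(mrouter_fd, SIOCGETSGCNT, &sr) < 0)
		return 0;
	return sr.pktcnt;
}
#endif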
1128
1129
1130 static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
1131 {
1132         struct net_device *dev = ptr;
1133         struct vif_device *v;
1134         int ct;
1135
1136         if (!net_eq(dev_net(dev), &init_net))
1137                 return NOTIFY_DONE;
1138
1139         if (event != NETDEV_UNREGISTER)
1140                 return NOTIFY_DONE;
1141         v = &init_net.ipv4.vif_table[0];
1142         for (ct = 0; ct < init_net.ipv4.maxvif; ct++, v++) {
1143                 if (v->dev == dev)
1144                         vif_delete(ct, 1);
1145         }
1146         return NOTIFY_DONE;
1147 }
1148
1149
1150 static struct notifier_block ip_mr_notifier = {
1151         .notifier_call = ipmr_device_event,
1152 };
1153
1154 /*
1155  *      Encapsulate a packet by attaching a valid IPIP header to it.
1156  *      This avoids tunnel drivers and other mess and gives us the speed so
1157  *      important for multicast video.
1158  */
1159
1160 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1161 {
1162         struct iphdr *iph;
1163         struct iphdr *old_iph = ip_hdr(skb);
1164
1165         skb_push(skb, sizeof(struct iphdr));
1166         skb->transport_header = skb->network_header;
1167         skb_reset_network_header(skb);
1168         iph = ip_hdr(skb);
1169
1170         iph->version    =       4;
1171         iph->tos        =       old_iph->tos;
1172         iph->ttl        =       old_iph->ttl;
1173         iph->frag_off   =       0;
1174         iph->daddr      =       daddr;
1175         iph->saddr      =       saddr;
1176         iph->protocol   =       IPPROTO_IPIP;
1177         iph->ihl        =       5;
1178         iph->tot_len    =       htons(skb->len);
1179         ip_select_ident(iph, skb->dst, NULL);
1180         ip_send_check(iph);
1181
1182         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1183         nf_reset(skb);
1184 }
1185
1186 static inline int ipmr_forward_finish(struct sk_buff *skb)
1187 {
1188         struct ip_options * opt = &(IPCB(skb)->opt);
1189
1190         IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1191
1192         if (unlikely(opt->optlen))
1193                 ip_forward_options(skb);
1194
1195         return dst_output(skb);
1196 }
1197
1198 /*
1199  *      Processing handlers for ipmr_forward
1200  */
1201
1202 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1203 {
1204         const struct iphdr *iph = ip_hdr(skb);
1205         struct vif_device *vif = &init_net.ipv4.vif_table[vifi];
1206         struct net_device *dev;
1207         struct rtable *rt;
1208         int    encap = 0;
1209
1210         if (vif->dev == NULL)
1211                 goto out_free;
1212
1213 #ifdef CONFIG_IP_PIMSM
1214         if (vif->flags & VIFF_REGISTER) {
1215                 vif->pkt_out++;
1216                 vif->bytes_out += skb->len;
1217                 vif->dev->stats.tx_bytes += skb->len;
1218                 vif->dev->stats.tx_packets++;
1219                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1220                 kfree_skb(skb);
1221                 return;
1222         }
1223 #endif
1224
1225         if (vif->flags&VIFF_TUNNEL) {
1226                 struct flowi fl = { .oif = vif->link,
1227                                     .nl_u = { .ip4_u =
1228                                               { .daddr = vif->remote,
1229                                                 .saddr = vif->local,
1230                                                 .tos = RT_TOS(iph->tos) } },
1231                                     .proto = IPPROTO_IPIP };
1232                 if (ip_route_output_key(&init_net, &rt, &fl))
1233                         goto out_free;
1234                 encap = sizeof(struct iphdr);
1235         } else {
1236                 struct flowi fl = { .oif = vif->link,
1237                                     .nl_u = { .ip4_u =
1238                                               { .daddr = iph->daddr,
1239                                                 .tos = RT_TOS(iph->tos) } },
1240                                     .proto = IPPROTO_IPIP };
1241                 if (ip_route_output_key(&init_net, &rt, &fl))
1242                         goto out_free;
1243         }
1244
1245         dev = rt->u.dst.dev;
1246
1247         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1248                 /* Do not fragment multicasts. Alas, IPv4 does not
1249                    allow us to send ICMP here, so such packets will
1250                    disappear into a black hole.
1251                  */
1252
1253                 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1254                 ip_rt_put(rt);
1255                 goto out_free;
1256         }
1257
1258         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1259
1260         if (skb_cow(skb, encap)) {
1261                 ip_rt_put(rt);
1262                 goto out_free;
1263         }
1264
1265         vif->pkt_out++;
1266         vif->bytes_out += skb->len;
1267
1268         dst_release(skb->dst);
1269         skb->dst = &rt->u.dst;
1270         ip_decrease_ttl(ip_hdr(skb));
1271
1272         /* FIXME: forward and output firewalls used to be called here.
1273          * What do we do with netfilter? -- RR */
1274         if (vif->flags & VIFF_TUNNEL) {
1275                 ip_encap(skb, vif->local, vif->remote);
1276                 /* FIXME: extra output firewall step used to be here. --RR */
1277                 vif->dev->stats.tx_packets++;
1278                 vif->dev->stats.tx_bytes += skb->len;
1279         }
1280
1281         IPCB(skb)->flags |= IPSKB_FORWARDED;
1282
1283         /*
1284          * RFC 1584 teaches that a DVMRP/PIM router must deliver packets locally
1285          * not only before forwarding, but also after forwarding on all output
1286          * interfaces. Clearly, if the mrouter runs a multicasting
1287          * program, that program should receive packets regardless of which
1288          * interface it joined on.
1289          * If we did not do this, the program would have to join on all
1290          * interfaces. On the other hand, a multihomed host (or router, but
1291          * not an mrouter) cannot join on more than one interface - it would
1292          * result in receiving duplicate packets.
1293          */
1294         NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
1295                 ipmr_forward_finish);
1296         return;
1297
1298 out_free:
1299         kfree_skb(skb);
1300         return;
1301 }
1302
1303 static int ipmr_find_vif(struct net_device *dev)
1304 {
1305         int ct;
1306         for (ct = init_net.ipv4.maxvif-1; ct >= 0; ct--) {
1307                 if (init_net.ipv4.vif_table[ct].dev == dev)
1308                         break;
1309         }
1310         return ct;
1311 }
1312
1313 /* "local" means that we should preserve one skb (for local delivery) */
1314
1315 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1316 {
1317         int psend = -1;
1318         int vif, ct;
1319
1320         vif = cache->mfc_parent;
1321         cache->mfc_un.res.pkt++;
1322         cache->mfc_un.res.bytes += skb->len;
1323
1324         /*
1325          * Wrong interface: drop packet and (maybe) send PIM assert.
1326          */
1327         if (init_net.ipv4.vif_table[vif].dev != skb->dev) {
1328                 int true_vifi;
1329
1330                 if (skb->rtable->fl.iif == 0) {
1331                         /* It is our own packet, looped back.
1332                            Very complicated situation...
1333
1334                            The best workaround until routing daemons are
1335                            fixed is not to redistribute a packet if it was
1336                            sent through the wrong interface. It means that
1337                            multicast applications WILL NOT work for
1338                            (S,G) entries whose default multicast route points
1339                            to the wrong oif. In any case, it is not a good
1340                            idea to run multicasting applications on a router.
1341                          */
1342                         goto dont_forward;
1343                 }
1344
1345                 cache->mfc_un.res.wrong_if++;
1346                 true_vifi = ipmr_find_vif(skb->dev);
1347
1348                 if (true_vifi >= 0 && mroute_do_assert &&
1349                     /* pimsm uses asserts when switching from RPT to SPT,
1350                        so we cannot check that the packet arrived on an oif.
1351                        It is bad, but otherwise we would need to move a pretty
1352                        large chunk of pimd into the kernel. Ough... --ANK
1353                      */
1354                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1355                     time_after(jiffies,
1356                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1357                         cache->mfc_un.res.last_assert = jiffies;
1358                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1359                 }
1360                 goto dont_forward;
1361         }
1362
1363         init_net.ipv4.vif_table[vif].pkt_in++;
1364         init_net.ipv4.vif_table[vif].bytes_in += skb->len;
1365
1366         /*
1367          *      Forward the frame
1368          */
1369         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1370                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1371                         if (psend != -1) {
1372                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1373                                 if (skb2)
1374                                         ipmr_queue_xmit(skb2, cache, psend);
1375                         }
1376                         psend = ct;
1377                 }
1378         }
1379         if (psend != -1) {
1380                 if (local) {
1381                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1382                         if (skb2)
1383                                 ipmr_queue_xmit(skb2, cache, psend);
1384                 } else {
1385                         ipmr_queue_xmit(skb, cache, psend);
1386                         return 0;
1387                 }
1388         }
1389
1390 dont_forward:
1391         if (!local)
1392                 kfree_skb(skb);
1393         return 0;
1394 }
1395
1396
1397 /*
1398  *      Multicast packets for forwarding arrive here
1399  */
1400
1401 int ip_mr_input(struct sk_buff *skb)
1402 {
1403         struct mfc_cache *cache;
1404         int local = skb->rtable->rt_flags&RTCF_LOCAL;
1405
1406         /* Packet is looped back after forward, it should not be
1407            forwarded second time, but still can be delivered locally.
1408          */
1409         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1410                 goto dont_forward;
1411
1412         if (!local) {
1413                 if (IPCB(skb)->opt.router_alert) {
1414                         if (ip_call_ra_chain(skb))
1415                                 return 0;
1416                 } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) {
1417                         /* IGMPv1 (and broken IGMPv2 implementations such as
1418                            Cisco IOS <= 11.2(8)) do not put the router alert
1419                            option in IGMP packets destined to routable
1420                            groups. It is very bad, because it means
1421                            that we can forward NO IGMP messages.
1422                          */
1423                         read_lock(&mrt_lock);
1424                         if (init_net.ipv4.mroute_sk) {
1425                                 nf_reset(skb);
1426                                 raw_rcv(init_net.ipv4.mroute_sk, skb);
1427                                 read_unlock(&mrt_lock);
1428                                 return 0;
1429                         }
1430                         read_unlock(&mrt_lock);
1431                 }
1432         }
1433
1434         read_lock(&mrt_lock);
1435         cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1436
1437         /*
1438          *      No usable cache entry
1439          */
1440         if (cache == NULL) {
1441                 int vif;
1442
1443                 if (local) {
1444                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1445                         ip_local_deliver(skb);
1446                         if (skb2 == NULL) {
1447                                 read_unlock(&mrt_lock);
1448                                 return -ENOBUFS;
1449                         }
1450                         skb = skb2;
1451                 }
1452
1453                 vif = ipmr_find_vif(skb->dev);
1454                 if (vif >= 0) {
1455                         int err = ipmr_cache_unresolved(vif, skb);
1456                         read_unlock(&mrt_lock);
1457
1458                         return err;
1459                 }
1460                 read_unlock(&mrt_lock);
1461                 kfree_skb(skb);
1462                 return -ENODEV;
1463         }
1464
1465         ip_mr_forward(skb, cache, local);
1466
1467         read_unlock(&mrt_lock);
1468
1469         if (local)
1470                 return ip_local_deliver(skb);
1471
1472         return 0;
1473
1474 dont_forward:
1475         if (local)
1476                 return ip_local_deliver(skb);
1477         kfree_skb(skb);
1478         return 0;
1479 }
1480
1481 #ifdef CONFIG_IP_PIMSM
1482 static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1483 {
1484         struct net_device *reg_dev = NULL;
1485         struct iphdr *encap;
1486
1487         encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1488         /*
1489            Check that:
1490            a. packet is really destined to a multicast group
1491            b. packet is not a NULL-REGISTER
1492            c. packet is not truncated
1493          */
1494         if (!ipv4_is_multicast(encap->daddr) ||
1495             encap->tot_len == 0 ||
1496             ntohs(encap->tot_len) + pimlen > skb->len)
1497                 return 1;
1498
1499         read_lock(&mrt_lock);
1500         if (reg_vif_num >= 0)
1501                 reg_dev = init_net.ipv4.vif_table[reg_vif_num].dev;
1502         if (reg_dev)
1503                 dev_hold(reg_dev);
1504         read_unlock(&mrt_lock);
1505
1506         if (reg_dev == NULL)
1507                 return 1;
1508
1509         skb->mac_header = skb->network_header;
1510         skb_pull(skb, (u8*)encap - skb->data);
1511         skb_reset_network_header(skb);
1512         skb->dev = reg_dev;
1513         skb->protocol = htons(ETH_P_IP);
1514         skb->ip_summed = 0;
1515         skb->pkt_type = PACKET_HOST;
1516         dst_release(skb->dst);
1517         skb->dst = NULL;
1518         reg_dev->stats.rx_bytes += skb->len;
1519         reg_dev->stats.rx_packets++;
1520         nf_reset(skb);
1521         netif_rx(skb);
1522         dev_put(reg_dev);
1523
1524         return 0;
1525 }
1526 #endif
1527
1528 #ifdef CONFIG_IP_PIMSM_V1
1529 /*
1530  * Handle IGMP messages of PIMv1
1531  */
1532
1533 int pim_rcv_v1(struct sk_buff * skb)
1534 {
1535         struct igmphdr *pim;
1536
1537         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1538                 goto drop;
1539
1540         pim = igmp_hdr(skb);
1541
1542         if (!mroute_do_pim ||
1543             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1544                 goto drop;
1545
1546         if (__pim_rcv(skb, sizeof(*pim))) {
1547 drop:
1548                 kfree_skb(skb);
1549         }
1550         return 0;
1551 }
1552 #endif
1553
1554 #ifdef CONFIG_IP_PIMSM_V2
1555 static int pim_rcv(struct sk_buff * skb)
1556 {
1557         struct pimreghdr *pim;
1558
1559         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1560                 goto drop;
1561
1562         pim = (struct pimreghdr *)skb_transport_header(skb);
1563         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1564             (pim->flags&PIM_NULL_REGISTER) ||
1565             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1566              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1567                 goto drop;
1568
1569         if (__pim_rcv(skb, sizeof(*pim))) {
1570 drop:
1571                 kfree_skb(skb);
1572         }
1573         return 0;
1574 }
1575 #endif
1576
1577 static int
1578 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1579 {
1580         int ct;
1581         struct rtnexthop *nhp;
1582         struct net_device *dev = init_net.ipv4.vif_table[c->mfc_parent].dev;
1583         u8 *b = skb_tail_pointer(skb);
1584         struct rtattr *mp_head;
1585
1586         if (dev)
1587                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1588
1589         mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1590
1591         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1592                 if (VIF_EXISTS(&init_net, ct) && c->mfc_un.res.ttls[ct] < 255) {
1593                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1594                                 goto rtattr_failure;
1595                         nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1596                         nhp->rtnh_flags = 0;
1597                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1598                         nhp->rtnh_ifindex = init_net.ipv4.vif_table[ct].dev->ifindex;
1599                         nhp->rtnh_len = sizeof(*nhp);
1600                 }
1601         }
1602         mp_head->rta_type = RTA_MULTIPATH;
1603         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1604         rtm->rtm_type = RTN_MULTICAST;
1605         return 1;
1606
1607 rtattr_failure:
1608         nlmsg_trim(skb, b);
1609         return -EMSGSIZE;
1610 }
1611
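/*
 * rtnetlink route-get helper: look up (src, dst) in the forwarding
 * cache under mrt_lock.  On a miss the skb is cloned, given a minimal
 * dummy IP header and handed to ipmr_cache_unresolved() so the routing
 * daemon is asked to resolve it; with nowait set, -EAGAIN is returned
 * immediately instead.
 */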
1612 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1613 {
1614         int err;
1615         struct mfc_cache *cache;
1616         struct rtable *rt = skb->rtable;
1617
1618         read_lock(&mrt_lock);
1619         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1620
1621         if (cache == NULL) {
1622                 struct sk_buff *skb2;
1623                 struct iphdr *iph;
1624                 struct net_device *dev;
1625                 int vif;
1626
1627                 if (nowait) {
1628                         read_unlock(&mrt_lock);
1629                         return -EAGAIN;
1630                 }
1631
1632                 dev = skb->dev;
1633                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1634                         read_unlock(&mrt_lock);
1635                         return -ENODEV;
1636                 }
1637                 skb2 = skb_clone(skb, GFP_ATOMIC);
1638                 if (!skb2) {
1639                         read_unlock(&mrt_lock);
1640                         return -ENOMEM;
1641                 }
1642
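                /* Build a minimal fake IP header on the clone; version 0
                 * marks the skb as synthesized here rather than taken off
                 * the wire, so the cache-report path can tell them apart.
                 */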
1643                 skb_push(skb2, sizeof(struct iphdr));
1644                 skb_reset_network_header(skb2);
1645                 iph = ip_hdr(skb2);
1646                 iph->ihl = sizeof(struct iphdr) >> 2;
1647                 iph->saddr = rt->rt_src;
1648                 iph->daddr = rt->rt_dst;
1649                 iph->version = 0;
1650                 err = ipmr_cache_unresolved(vif, skb2);
1651                 read_unlock(&mrt_lock);
1652                 return err;
1653         }
1654
1655         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1656                 cache->mfc_flags |= MFC_NOTIFY;
1657         err = ipmr_fill_mroute(skb, cache, rtm);
1658         read_unlock(&mrt_lock);
1659         return err;
1660 }
1661
1662 #ifdef CONFIG_PROC_FS
1663 /*
1664  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_vif and /proc/net/ip_mr_cache
1665  */
1666 struct ipmr_vif_iter {
1667         int ct;
1668 };
1669
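/*
 * /proc/net/ip_mr_vif: walk vif_table under mrt_lock, skipping unused
 * slots via VIF_EXISTS().  The read lock is taken in ->start and only
 * dropped in ->stop, so ->show always runs with it held.
 */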
1670 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1671                                            loff_t pos)
1672 {
1673         for (iter->ct = 0; iter->ct < init_net.ipv4.maxvif; ++iter->ct) {
1674                 if (!VIF_EXISTS(&init_net, iter->ct))
1675                         continue;
1676                 if (pos-- == 0)
1677                         return &init_net.ipv4.vif_table[iter->ct];
1678         }
1679         return NULL;
1680 }
1681
1682 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1683         __acquires(mrt_lock)
1684 {
1685         read_lock(&mrt_lock);
1686         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1687                 : SEQ_START_TOKEN;
1688 }
1689
1690 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1691 {
1692         struct ipmr_vif_iter *iter = seq->private;
1693
1694         ++*pos;
1695         if (v == SEQ_START_TOKEN)
1696                 return ipmr_vif_seq_idx(iter, 0);
1697
1698         while (++iter->ct < init_net.ipv4.maxvif) {
1699                 if (!VIF_EXISTS(&init_net, iter->ct))
1700                         continue;
1701                 return &init_net.ipv4.vif_table[iter->ct];
1702         }
1703         return NULL;
1704 }
1705
1706 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1707         __releases(mrt_lock)
1708 {
1709         read_unlock(&mrt_lock);
1710 }
1711
1712 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1713 {
1714         if (v == SEQ_START_TOKEN) {
1715                 seq_puts(seq,
1716                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1717         } else {
1718                 const struct vif_device *vif = v;
1719                 const char *name = vif->dev ? vif->dev->name : "none";
1720
1721                 seq_printf(seq,
1722                            "%2Zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1723                            vif - init_net.ipv4.vif_table,
1724                            name, vif->bytes_in, vif->pkt_in,
1725                            vif->bytes_out, vif->pkt_out,
1726                            vif->flags, vif->local, vif->remote);
1727         }
1728         return 0;
1729 }
1730
1731 static const struct seq_operations ipmr_vif_seq_ops = {
1732         .start = ipmr_vif_seq_start,
1733         .next  = ipmr_vif_seq_next,
1734         .stop  = ipmr_vif_seq_stop,
1735         .show  = ipmr_vif_seq_show,
1736 };
1737
1738 static int ipmr_vif_open(struct inode *inode, struct file *file)
1739 {
1740         return seq_open_private(file, &ipmr_vif_seq_ops,
1741                         sizeof(struct ipmr_vif_iter));
1742 }
1743
1744 static const struct file_operations ipmr_vif_fops = {
1745         .owner   = THIS_MODULE,
1746         .open    = ipmr_vif_open,
1747         .read    = seq_read,
1748         .llseek  = seq_lseek,
1749         .release = seq_release_private,
1750 };
1751
1752 struct ipmr_mfc_iter {
1753         struct mfc_cache **cache;
1754         int ct;
1755 };
1756
1757
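/*
 * /proc/net/ip_mr_cache: it->cache records which list the cursor is
 * on, the resolved hash table (mfc_cache_array, under mrt_lock) or the
 * unresolved queue (&mfc_unres_queue, under mfc_unres_lock).  Whichever
 * lock matches it->cache is still held when ->start or ->next returns,
 * and ipmr_mfc_seq_stop() releases it.
 */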
1758 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1759 {
1760         struct mfc_cache *mfc;
1761
1762         it->cache = mfc_cache_array;
1763         read_lock(&mrt_lock);
1764         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1765                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1766                         if (pos-- == 0)
1767                                 return mfc;
1768         read_unlock(&mrt_lock);
1769
1770         it->cache = &mfc_unres_queue;
1771         spin_lock_bh(&mfc_unres_lock);
1772         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1773                 if (pos-- == 0)
1774                         return mfc;
1775         spin_unlock_bh(&mfc_unres_lock);
1776
1777         it->cache = NULL;
1778         return NULL;
1779 }
1780
1781
1782 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1783 {
1784         struct ipmr_mfc_iter *it = seq->private;
1785         it->cache = NULL;
1786         it->ct = 0;
1787         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1788                 : SEQ_START_TOKEN;
1789 }
1790
1791 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1792 {
1793         struct mfc_cache *mfc = v;
1794         struct ipmr_mfc_iter *it = seq->private;
1795
1796         ++*pos;
1797
1798         if (v == SEQ_START_TOKEN)
1799                 return ipmr_mfc_seq_idx(seq->private, 0);
1800
1801         if (mfc->next)
1802                 return mfc->next;
1803
1804         if (it->cache == &mfc_unres_queue)
1805                 goto end_of_list;
1806
1807         BUG_ON(it->cache != mfc_cache_array);
1808
1809         while (++it->ct < MFC_LINES) {
1810                 mfc = mfc_cache_array[it->ct];
1811                 if (mfc)
1812                         return mfc;
1813         }
1814
1815         /* exhausted cache_array, show unresolved */
1816         read_unlock(&mrt_lock);
1817         it->cache = &mfc_unres_queue;
1818         it->ct = 0;
1819
1820         spin_lock_bh(&mfc_unres_lock);
1821         mfc = mfc_unres_queue;
1822         if (mfc)
1823                 return mfc;
1824
1825  end_of_list:
1826         spin_unlock_bh(&mfc_unres_lock);
1827         it->cache = NULL;
1828
1829         return NULL;
1830 }
1831
1832 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1833 {
1834         struct ipmr_mfc_iter *it = seq->private;
1835
1836         if (it->cache == &mfc_unres_queue)
1837                 spin_unlock_bh(&mfc_unres_lock);
1838         else if (it->cache == mfc_cache_array)
1839                 read_unlock(&mrt_lock);
1840 }
1841
1842 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1843 {
1844         int n;
1845
1846         if (v == SEQ_START_TOKEN) {
1847                 seq_puts(seq,
1848                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1849         } else {
1850                 const struct mfc_cache *mfc = v;
1851                 const struct ipmr_mfc_iter *it = seq->private;
1852
1853                 seq_printf(seq, "%08lX %08lX %-3hd",
1854                            (unsigned long) mfc->mfc_mcastgrp,
1855                            (unsigned long) mfc->mfc_origin,
1856                            mfc->mfc_parent);
1857
1858                 if (it->cache != &mfc_unres_queue) {
1859                         seq_printf(seq, " %8lu %8lu %8lu",
1860                                    mfc->mfc_un.res.pkt,
1861                                    mfc->mfc_un.res.bytes,
1862                                    mfc->mfc_un.res.wrong_if);
1863                         for (n = mfc->mfc_un.res.minvif;
1864                              n < mfc->mfc_un.res.maxvif; n++) {
1865                                 if (VIF_EXISTS(&init_net, n) &&
1866                                     mfc->mfc_un.res.ttls[n] < 255)
1867                                         seq_printf(seq,
1868                                            " %2d:%-3d",
1869                                            n, mfc->mfc_un.res.ttls[n]);
1870                         }
1871                 } else {
1872                         /* unresolved mfc_caches don't contain
1873                          * pkt, bytes and wrong_if values
1874                          */
1875                         seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
1876                 }
1877                 seq_putc(seq, '\n');
1878         }
1879         return 0;
1880 }
1881
1882 static const struct seq_operations ipmr_mfc_seq_ops = {
1883         .start = ipmr_mfc_seq_start,
1884         .next  = ipmr_mfc_seq_next,
1885         .stop  = ipmr_mfc_seq_stop,
1886         .show  = ipmr_mfc_seq_show,
1887 };
1888
1889 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1890 {
1891         return seq_open_private(file, &ipmr_mfc_seq_ops,
1892                         sizeof(struct ipmr_mfc_iter));
1893 }
1894
1895 static const struct file_operations ipmr_mfc_fops = {
1896         .owner   = THIS_MODULE,
1897         .open    = ipmr_mfc_open,
1898         .read    = seq_read,
1899         .llseek  = seq_lseek,
1900         .release = seq_release_private,
1901 };
1902 #endif
1903
1904 #ifdef CONFIG_IP_PIMSM_V2
1905 static struct net_protocol pim_protocol = {
1906         .handler        =       pim_rcv,
1907 };
1908 #endif
1909
1910
1911 /*
1912  *      Setup for IP multicast routing
1913  */
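/*
 * Per-namespace constructor: kcalloc() returns a zeroed table, so every
 * slot starts with dev == NULL and VIF_EXISTS() is false until a vif is
 * actually added.  The table is freed again in ipmr_net_exit() when the
 * namespace goes away.
 */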
1914 static int __net_init ipmr_net_init(struct net *net)
1915 {
1916         int err = 0;
1917
1918         net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device),
1919                                       GFP_KERNEL);
1920         if (!net->ipv4.vif_table) {
1921                 err = -ENOMEM;
1922                 goto fail;
1923         }
1924 fail:
1925         return err;
1926 }
1927
1928 static void __net_exit ipmr_net_exit(struct net *net)
1929 {
1930         kfree(net->ipv4.vif_table);
1931 }
1932
1933 static struct pernet_operations ipmr_net_ops = {
1934         .init = ipmr_net_init,
1935         .exit = ipmr_net_exit,
1936 };
1937
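/*
 * Module init: create the mfc slab cache, register the per-netns ops,
 * arm the expiry timer for unresolved entries, hook the netdevice
 * notifier and create the /proc files.  Any failure unwinds the earlier
 * steps in reverse order through the labels below.
 */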
1938 int __init ip_mr_init(void)
1939 {
1940         int err;
1941
1942         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1943                                        sizeof(struct mfc_cache),
1944                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1945                                        NULL);
1946         if (!mrt_cachep)
1947                 return -ENOMEM;
1948
1949         err = register_pernet_subsys(&ipmr_net_ops);
1950         if (err)
1951                 goto reg_pernet_fail;
1952
1953         setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1954         err = register_netdevice_notifier(&ip_mr_notifier);
1955         if (err)
1956                 goto reg_notif_fail;
1957 #ifdef CONFIG_PROC_FS
1958         err = -ENOMEM;
1959         if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops))
1960                 goto proc_vif_fail;
1961         if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops))
1962                 goto proc_cache_fail;
1963 #endif
1964         return 0;
1965 #ifdef CONFIG_PROC_FS
1966 proc_cache_fail:
1967         proc_net_remove(&init_net, "ip_mr_vif");
1968 proc_vif_fail:
1969         unregister_netdevice_notifier(&ip_mr_notifier);
1970 #endif
1971 reg_notif_fail:
1972         del_timer(&ipmr_expire_timer);
1973         unregister_pernet_subsys(&ipmr_net_ops);
1974 reg_pernet_fail:
1975         kmem_cache_destroy(mrt_cachep);
1976         return err;
1977 }