netns: ipmr: store netns in struct mfc_cache
net/ipv4/ipmr.c
/*
 *	IP multicast routing support for mrouted 3.6/3.8
 *
 *		(c) 1995 Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *	  Linux Consultancy and Custom Driver Development
 *
 *	This program is free software; you can redistribute it and/or
 *	modify it under the terms of the GNU General Public License
 *	as published by the Free Software Foundation; either version
 *	2 of the License, or (at your option) any later version.
 *
 *	Fixes:
 *	Michael Chastain	:	Incorrect size of copying.
 *	Alan Cox		:	Added the cache manager code
 *	Alan Cox		:	Fixed the clone/copy bug and device race.
 *	Mike McLagan		:	Routing by source
 *	Malcolm Beattie		:	Buffer handling fixes.
 *	Alexey Kuznetsov	:	Double buffer free and other fixes.
 *	SVR Anand		:	Fixed several multicast bugs and problems.
 *	Alexey Kuznetsov	:	Status, optimisations and more.
 *	Brad Parker		:	Better behaviour on mrouted upcall
 *					overflow.
 *	Carlos Picoto		:	PIMv1 Support
 *	Pavlin Ivanov Radoslavov:	PIMv2 Registers must checksum only PIM header
 *					Relax this requirement to work with older peers.
 *
 */

#include <asm/system.h>
#include <asm/uaccess.h>
#include <linux/types.h>
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/fcntl.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/if_ether.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/route.h>
#include <net/sock.h>
#include <net/icmp.h>
#include <net/udp.h>
#include <net/raw.h>
#include <linux/notifier.h>
#include <linux/if_arp.h>
#include <linux/netfilter_ipv4.h>
#include <net/ipip.h>
#include <net/checksum.h>
#include <net/netlink.h>

#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
#define CONFIG_IP_PIMSM 1
#endif

/* Big lock, protecting vif table, mrt cache and mroute socket state.
   Note that the changes are semaphored via rtnl_lock.
 */

static DEFINE_RWLOCK(mrt_lock);

/*
 *	Multicast router control variables
 */

#define VIF_EXISTS(_net, _idx) ((_net)->ipv4.vif_table[_idx].dev != NULL)

static int mroute_do_assert;				/* Set in PIM assert	*/
static int mroute_do_pim;

static struct mfc_cache *mfc_cache_array[MFC_LINES];	/* Forwarding cache	*/

static struct mfc_cache *mfc_unres_queue;		/* Queue of unresolved entries */
static atomic_t cache_resolve_queue_len;		/* Size of unresolved	*/

/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);

/* We return to Alan's original scheme. The hash table of resolved
   entries is changed only in process context and protected
   with the weak lock mrt_lock. The queue of unresolved entries is
   protected with the strong spinlock mfc_unres_lock.

   In this case the data path is entirely free of exclusive locks.
 */
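
/* A minimal sketch (for illustration, mirroring the usage in ip_mr_input
   below) of the reader-side pattern the scheme above permits: lookups in
   the resolved cache only take the read side of mrt_lock, and never touch
   mfc_unres_lock.

	read_lock(&mrt_lock);
	c = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
	if (c)
		ip_mr_forward(skb, c, 0);
	read_unlock(&mrt_lock);
 */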

static struct kmem_cache *mrt_cachep __read_mostly;

static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local);
static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert);
static int ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm);

#ifdef CONFIG_IP_PIMSM_V2
static struct net_protocol pim_protocol;
#endif

static struct timer_list ipmr_expire_timer;

/* Service routines creating virtual interfaces: DVMRP tunnels and PIMREG */

static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
{
	dev_close(dev);

	dev = __dev_get_by_name(&init_net, "tunl0");
	if (dev) {
		const struct net_device_ops *ops = dev->netdev_ops;
		struct ifreq ifr;
		struct ip_tunnel_parm p;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			ops->ndo_do_ioctl(dev, &ifr, SIOCDELTUNNEL);
			set_fs(oldfs);
		}
	}
}

static
struct net_device *ipmr_new_tunnel(struct vifctl *v)
{
	struct net_device  *dev;

	dev = __dev_get_by_name(&init_net, "tunl0");

	if (dev) {
		const struct net_device_ops *ops = dev->netdev_ops;
		int err;
		struct ifreq ifr;
		struct ip_tunnel_parm p;
		struct in_device  *in_dev;

		memset(&p, 0, sizeof(p));
		p.iph.daddr = v->vifc_rmt_addr.s_addr;
		p.iph.saddr = v->vifc_lcl_addr.s_addr;
		p.iph.version = 4;
		p.iph.ihl = 5;
		p.iph.protocol = IPPROTO_IPIP;
		sprintf(p.name, "dvmrp%d", v->vifc_vifi);
		ifr.ifr_ifru.ifru_data = (__force void __user *)&p;

		if (ops->ndo_do_ioctl) {
			mm_segment_t oldfs = get_fs();

			set_fs(KERNEL_DS);
			err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL);
			set_fs(oldfs);
		} else
			err = -EOPNOTSUPP;

		dev = NULL;

		if (err == 0 && (dev = __dev_get_by_name(&init_net, p.name)) != NULL) {
			dev->flags |= IFF_MULTICAST;

			in_dev = __in_dev_get_rtnl(dev);
			if (in_dev == NULL)
				goto failure;

			ipv4_devconf_setall(in_dev);
			IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;

			if (dev_open(dev))
				goto failure;
			dev_hold(dev);
		}
	}
	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}

#ifdef CONFIG_IP_PIMSM

static int reg_vif_num = -1;

static int reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
	read_lock(&mrt_lock);
	dev->stats.tx_bytes += skb->len;
	dev->stats.tx_packets++;
	ipmr_cache_report(skb, reg_vif_num, IGMPMSG_WHOLEPKT);
	read_unlock(&mrt_lock);
	kfree_skb(skb);
	return 0;
}

static const struct net_device_ops reg_vif_netdev_ops = {
	.ndo_start_xmit	= reg_vif_xmit,
};

static void reg_vif_setup(struct net_device *dev)
{
	dev->type		= ARPHRD_PIMREG;
	dev->mtu		= ETH_DATA_LEN - sizeof(struct iphdr) - 8;
	dev->flags		= IFF_NOARP;
	dev->netdev_ops		= &reg_vif_netdev_ops;
	dev->destructor		= free_netdev;
}

static struct net_device *ipmr_reg_vif(void)
{
	struct net_device *dev;
	struct in_device *in_dev;

	dev = alloc_netdev(0, "pimreg", reg_vif_setup);

	if (dev == NULL)
		return NULL;

	if (register_netdevice(dev)) {
		free_netdev(dev);
		return NULL;
	}
	dev->iflink = 0;

	rcu_read_lock();
	if ((in_dev = __in_dev_get_rcu(dev)) == NULL) {
		rcu_read_unlock();
		goto failure;
	}

	ipv4_devconf_setall(in_dev);
	IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
	rcu_read_unlock();

	if (dev_open(dev))
		goto failure;

	dev_hold(dev);

	return dev;

failure:
	/* allow the register to be completed before unregistering. */
	rtnl_unlock();
	rtnl_lock();

	unregister_netdevice(dev);
	return NULL;
}
#endif

/*
 *	Delete a VIF entry
 *	@notify: Set to 1 if the caller is a notifier_call
 */

static int vif_delete(int vifi, int notify)
{
	struct vif_device *v;
	struct net_device *dev;
	struct in_device *in_dev;

	if (vifi < 0 || vifi >= init_net.ipv4.maxvif)
		return -EADDRNOTAVAIL;

	v = &init_net.ipv4.vif_table[vifi];

	write_lock_bh(&mrt_lock);
	dev = v->dev;
	v->dev = NULL;

	if (!dev) {
		write_unlock_bh(&mrt_lock);
		return -EADDRNOTAVAIL;
	}

#ifdef CONFIG_IP_PIMSM
	if (vifi == reg_vif_num)
		reg_vif_num = -1;
#endif

	if (vifi+1 == init_net.ipv4.maxvif) {
		int tmp;
		for (tmp=vifi-1; tmp>=0; tmp--) {
			if (VIF_EXISTS(&init_net, tmp))
				break;
		}
		init_net.ipv4.maxvif = tmp+1;
	}

	write_unlock_bh(&mrt_lock);

	dev_set_allmulti(dev, -1);

	if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) {
		IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--;
		ip_rt_multicast_event(in_dev);
	}

	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify)
		unregister_netdevice(dev);

	dev_put(dev);
	return 0;
}

static inline void ipmr_cache_free(struct mfc_cache *c)
{
	release_net(mfc_net(c));
	kmem_cache_free(mrt_cachep, c);
}

/* Destroy an unresolved cache entry, killing queued skbs
   and reporting error to netlink readers.
 */

static void ipmr_destroy_unres(struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	atomic_dec(&cache_resolve_queue_len);

	while ((skb = skb_dequeue(&c->mfc_un.unres.unresolved))) {
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
			nlh->nlmsg_type = NLMSG_ERROR;
			nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
			skb_trim(skb, nlh->nlmsg_len);
			e = NLMSG_DATA(nlh);
			e->error = -ETIMEDOUT;
			memset(&e->msg, 0, sizeof(e->msg));

			rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
		} else
			kfree_skb(skb);
	}

	ipmr_cache_free(c);
}


/* Single timer process for all the unresolved queue. */

static void ipmr_expire_process(unsigned long dummy)
{
	unsigned long now;
	unsigned long expires;
	struct mfc_cache *c, **cp;

	if (!spin_trylock(&mfc_unres_lock)) {
		mod_timer(&ipmr_expire_timer, jiffies+HZ/10);
		return;
	}

	if (atomic_read(&cache_resolve_queue_len) == 0)
		goto out;

	now = jiffies;
	expires = 10*HZ;
	cp = &mfc_unres_queue;

	while ((c=*cp) != NULL) {
		if (time_after(c->mfc_un.unres.expires, now)) {
			unsigned long interval = c->mfc_un.unres.expires - now;
			if (interval < expires)
				expires = interval;
			cp = &c->next;
			continue;
		}

		*cp = c->next;

		ipmr_destroy_unres(c);
	}

	if (atomic_read(&cache_resolve_queue_len))
		mod_timer(&ipmr_expire_timer, jiffies + expires);

out:
	spin_unlock(&mfc_unres_lock);
}

/* Fill oifs list. It is called under write locked mrt_lock. */

static void ipmr_update_thresholds(struct mfc_cache *cache, unsigned char *ttls)
{
	int vifi;

	cache->mfc_un.res.minvif = MAXVIFS;
	cache->mfc_un.res.maxvif = 0;
	memset(cache->mfc_un.res.ttls, 255, MAXVIFS);

	for (vifi = 0; vifi < init_net.ipv4.maxvif; vifi++) {
		if (VIF_EXISTS(&init_net, vifi) &&
		    ttls[vifi] && ttls[vifi] < 255) {
			cache->mfc_un.res.ttls[vifi] = ttls[vifi];
			if (cache->mfc_un.res.minvif > vifi)
				cache->mfc_un.res.minvif = vifi;
			if (cache->mfc_un.res.maxvif <= vifi)
				cache->mfc_un.res.maxvif = vifi + 1;
		}
	}
}

static int vif_add(struct vifctl *vifc, int mrtsock)
{
	int vifi = vifc->vifc_vifi;
	struct vif_device *v = &init_net.ipv4.vif_table[vifi];
	struct net_device *dev;
	struct in_device *in_dev;
	int err;

	/* Is vif busy? */
	if (VIF_EXISTS(&init_net, vifi))
		return -EADDRINUSE;

	switch (vifc->vifc_flags) {
#ifdef CONFIG_IP_PIMSM
	case VIFF_REGISTER:
		/*
		 * Special Purpose VIF in PIM
		 * All the packets will be sent to the daemon
		 */
		if (reg_vif_num >= 0)
			return -EADDRINUSE;
		dev = ipmr_reg_vif();
		if (!dev)
			return -ENOBUFS;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			unregister_netdevice(dev);
			dev_put(dev);
			return err;
		}
		break;
#endif
	case VIFF_TUNNEL:
		dev = ipmr_new_tunnel(vifc);
		if (!dev)
			return -ENOBUFS;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			ipmr_del_tunnel(dev, vifc);
			dev_put(dev);
			return err;
		}
		break;
	case 0:
		dev = ip_dev_find(&init_net, vifc->vifc_lcl_addr.s_addr);
		if (!dev)
			return -EADDRNOTAVAIL;
		err = dev_set_allmulti(dev, 1);
		if (err) {
			dev_put(dev);
			return err;
		}
		break;
	default:
		return -EINVAL;
	}

	if ((in_dev = __in_dev_get_rtnl(dev)) == NULL)
		return -EADDRNOTAVAIL;
	IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++;
	ip_rt_multicast_event(in_dev);

	/*
	 *	Fill in the VIF structures
	 */
	v->rate_limit = vifc->vifc_rate_limit;
	v->local = vifc->vifc_lcl_addr.s_addr;
	v->remote = vifc->vifc_rmt_addr.s_addr;
	v->flags = vifc->vifc_flags;
	if (!mrtsock)
		v->flags |= VIFF_STATIC;
	v->threshold = vifc->vifc_threshold;
	v->bytes_in = 0;
	v->bytes_out = 0;
	v->pkt_in = 0;
	v->pkt_out = 0;
	v->link = dev->ifindex;
	if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER))
		v->link = dev->iflink;

	/* And finish update writing critical data */
	write_lock_bh(&mrt_lock);
	v->dev = dev;
#ifdef CONFIG_IP_PIMSM
	if (v->flags&VIFF_REGISTER)
		reg_vif_num = vifi;
#endif
	if (vifi+1 > init_net.ipv4.maxvif)
		init_net.ipv4.maxvif = vifi+1;
	write_unlock_bh(&mrt_lock);
	return 0;
}

static struct mfc_cache *ipmr_cache_find(__be32 origin, __be32 mcastgrp)
{
	int line = MFC_HASH(mcastgrp, origin);
	struct mfc_cache *c;

	for (c=mfc_cache_array[line]; c; c = c->next) {
		if (c->mfc_origin==origin && c->mfc_mcastgrp==mcastgrp)
			break;
	}
	return c;
}

/*
 *	Allocate a multicast cache entry
 */
static struct mfc_cache *ipmr_cache_alloc(struct net *net)
{
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
	if (c == NULL)
		return NULL;
	c->mfc_un.res.minvif = MAXVIFS;
	mfc_net_set(c, net);
	return c;
}

static struct mfc_cache *ipmr_cache_alloc_unres(struct net *net)
{
	struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC);
	if (c == NULL)
		return NULL;
	skb_queue_head_init(&c->mfc_un.unres.unresolved);
	c->mfc_un.unres.expires = jiffies + 10*HZ;
	mfc_net_set(c, net);
	return c;
}

/*
 *	A cache entry has gone into a resolved state from queued
 */

static void ipmr_cache_resolve(struct mfc_cache *uc, struct mfc_cache *c)
{
	struct sk_buff *skb;
	struct nlmsgerr *e;

	/*
	 *	Play the pending entries through our router
	 */

	while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
		if (ip_hdr(skb)->version == 0) {
			struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));

			if (ipmr_fill_mroute(skb, c, NLMSG_DATA(nlh)) > 0) {
				nlh->nlmsg_len = (skb_tail_pointer(skb) -
						  (u8 *)nlh);
			} else {
				nlh->nlmsg_type = NLMSG_ERROR;
				nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr));
				skb_trim(skb, nlh->nlmsg_len);
				e = NLMSG_DATA(nlh);
				e->error = -EMSGSIZE;
				memset(&e->msg, 0, sizeof(e->msg));
			}

			rtnl_unicast(skb, &init_net, NETLINK_CB(skb).pid);
		} else
			ip_mr_forward(skb, c, 0);
	}
}

/*
 *	Bounce a cache query up to mrouted. We could use netlink for this but mrouted
 *	expects the following bizarre scheme.
 *
 *	Called under mrt_lock.
 */
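
/* A sketch of what the daemon sees (layout quoted from linux/mroute.h,
   shown here only for illustration): the upcall is queued on the mroute
   socket as a pseudo IP packet whose header doubles as a struct igmpmsg.

	struct igmpmsg {
		__u32 unused1, unused2;
		unsigned char im_msgtype;	// IGMPMSG_NOCACHE etc.
		unsigned char im_mbz;		// must be zero
		unsigned char im_vif;		// arriving vif
		unsigned char unused3;
		struct in_addr im_src, im_dst;
	};

   A userspace reader would typically recv() on its IGMP raw socket and
   dispatch on im_mbz == 0 to tell upcalls apart from real IGMP traffic.
 */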

static int ipmr_cache_report(struct sk_buff *pkt, vifi_t vifi, int assert)
{
	struct sk_buff *skb;
	const int ihl = ip_hdrlen(pkt);
	struct igmphdr *igmp;
	struct igmpmsg *msg;
	int ret;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT)
		skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
	else
#endif
		skb = alloc_skb(128, GFP_ATOMIC);

	if (!skb)
		return -ENOBUFS;

#ifdef CONFIG_IP_PIMSM
	if (assert == IGMPMSG_WHOLEPKT) {
		/* Ugly, but we have no choice with this interface.
		   Duplicate old header, fix ihl, length etc.
		   And all this only to mangle msg->im_msgtype and
		   to set msg->im_mbz to "mbz" :-)
		 */
		skb_push(skb, sizeof(struct iphdr));
		skb_reset_network_header(skb);
		skb_reset_transport_header(skb);
		msg = (struct igmpmsg *)skb_network_header(skb);
		memcpy(msg, skb_network_header(pkt), sizeof(struct iphdr));
		msg->im_msgtype = IGMPMSG_WHOLEPKT;
		msg->im_mbz = 0;
		msg->im_vif = reg_vif_num;
		ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
		ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
					     sizeof(struct iphdr));
	} else
#endif
	{

	/*
	 *	Copy the IP header
	 */

	skb->network_header = skb->tail;
	skb_put(skb, ihl);
	skb_copy_to_linear_data(skb, pkt->data, ihl);
	ip_hdr(skb)->protocol = 0;			/* Flag to the kernel this is a route add */
	msg = (struct igmpmsg *)skb_network_header(skb);
	msg->im_vif = vifi;
	skb->dst = dst_clone(pkt->dst);

	/*
	 *	Add our header
	 */

	igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
	igmp->type	=
	msg->im_msgtype = assert;
	igmp->code	=	0;
	ip_hdr(skb)->tot_len = htons(skb->len);			/* Fix the length */
	skb->transport_header = skb->network_header;
	}

	if (init_net.ipv4.mroute_sk == NULL) {
		kfree_skb(skb);
		return -EINVAL;
	}

	/*
	 *	Deliver to mrouted
	 */
	ret = sock_queue_rcv_skb(init_net.ipv4.mroute_sk, skb);
	if (ret < 0) {
		if (net_ratelimit())
			printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n");
		kfree_skb(skb);
	}

	return ret;
}

/*
 *	Queue a packet for resolution. It gets a locked cache entry!
 */

static int
ipmr_cache_unresolved(vifi_t vifi, struct sk_buff *skb)
{
	int err;
	struct mfc_cache *c;
	const struct iphdr *iph = ip_hdr(skb);

	spin_lock_bh(&mfc_unres_lock);
	for (c=mfc_unres_queue; c; c=c->next) {
		if (c->mfc_mcastgrp == iph->daddr &&
		    c->mfc_origin == iph->saddr)
			break;
	}

	if (c == NULL) {
		/*
		 *	Create a new entry if allowable
		 */

		if (atomic_read(&cache_resolve_queue_len) >= 10 ||
		    (c = ipmr_cache_alloc_unres(&init_net)) == NULL) {
			spin_unlock_bh(&mfc_unres_lock);

			kfree_skb(skb);
			return -ENOBUFS;
		}

		/*
		 *	Fill in the new cache entry
		 */
		c->mfc_parent	= -1;
		c->mfc_origin	= iph->saddr;
		c->mfc_mcastgrp	= iph->daddr;

		/*
		 *	Reflect first query at mrouted.
		 */
		if ((err = ipmr_cache_report(skb, vifi, IGMPMSG_NOCACHE))<0) {
			/* If the report failed throw the cache entry
			   out - Brad Parker
			 */
			spin_unlock_bh(&mfc_unres_lock);

			ipmr_cache_free(c);
			kfree_skb(skb);
			return err;
		}

		atomic_inc(&cache_resolve_queue_len);
		c->next = mfc_unres_queue;
		mfc_unres_queue = c;

		mod_timer(&ipmr_expire_timer, c->mfc_un.unres.expires);
	}

	/*
	 *	See if we can append the packet
	 */
	if (c->mfc_un.unres.unresolved.qlen>3) {
		kfree_skb(skb);
		err = -ENOBUFS;
	} else {
		skb_queue_tail(&c->mfc_un.unres.unresolved, skb);
		err = 0;
	}

	spin_unlock_bh(&mfc_unres_lock);
	return err;
}

/*
 *	MFC cache manipulation by user space mroute daemon
 */

static int ipmr_mfc_delete(struct mfcctl *mfc)
{
	int line;
	struct mfc_cache *c, **cp;

	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) {
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			ipmr_cache_free(c);
			return 0;
		}
	}
	return -ENOENT;
}

static int ipmr_mfc_add(struct mfcctl *mfc, int mrtsock)
{
	int line;
	struct mfc_cache *uc, *c, **cp;

	line = MFC_HASH(mfc->mfcc_mcastgrp.s_addr, mfc->mfcc_origin.s_addr);

	for (cp=&mfc_cache_array[line]; (c=*cp) != NULL; cp = &c->next) {
		if (c->mfc_origin == mfc->mfcc_origin.s_addr &&
		    c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr)
			break;
	}

	if (c != NULL) {
		write_lock_bh(&mrt_lock);
		c->mfc_parent = mfc->mfcc_parent;
		ipmr_update_thresholds(c, mfc->mfcc_ttls);
		if (!mrtsock)
			c->mfc_flags |= MFC_STATIC;
		write_unlock_bh(&mrt_lock);
		return 0;
	}

	if (!ipv4_is_multicast(mfc->mfcc_mcastgrp.s_addr))
		return -EINVAL;

	c = ipmr_cache_alloc(&init_net);
	if (c == NULL)
		return -ENOMEM;

	c->mfc_origin = mfc->mfcc_origin.s_addr;
	c->mfc_mcastgrp = mfc->mfcc_mcastgrp.s_addr;
	c->mfc_parent = mfc->mfcc_parent;
	ipmr_update_thresholds(c, mfc->mfcc_ttls);
	if (!mrtsock)
		c->mfc_flags |= MFC_STATIC;

	write_lock_bh(&mrt_lock);
	c->next = mfc_cache_array[line];
	mfc_cache_array[line] = c;
	write_unlock_bh(&mrt_lock);

	/*
	 *	Check to see if we resolved a queued list. If so we
	 *	need to send on the frames and tidy up.
	 */
	spin_lock_bh(&mfc_unres_lock);
	for (cp = &mfc_unres_queue; (uc=*cp) != NULL;
	     cp = &uc->next) {
		if (uc->mfc_origin == c->mfc_origin &&
		    uc->mfc_mcastgrp == c->mfc_mcastgrp) {
			*cp = uc->next;
			if (atomic_dec_and_test(&cache_resolve_queue_len))
				del_timer(&ipmr_expire_timer);
			break;
		}
	}
	spin_unlock_bh(&mfc_unres_lock);

	if (uc) {
		ipmr_cache_resolve(uc, c);
		ipmr_cache_free(uc);
	}
	return 0;
}

/*
 *	Close the multicast socket, and clear the vif tables etc
 */

static void mroute_clean_tables(struct sock *sk)
{
	int i;

	/*
	 *	Shut down all active vif entries
	 */
	for (i = 0; i < init_net.ipv4.maxvif; i++) {
		if (!(init_net.ipv4.vif_table[i].flags&VIFF_STATIC))
			vif_delete(i, 0);
	}

	/*
	 *	Wipe the cache
	 */
	for (i=0; i<MFC_LINES; i++) {
		struct mfc_cache *c, **cp;

		cp = &mfc_cache_array[i];
		while ((c = *cp) != NULL) {
			if (c->mfc_flags&MFC_STATIC) {
				cp = &c->next;
				continue;
			}
			write_lock_bh(&mrt_lock);
			*cp = c->next;
			write_unlock_bh(&mrt_lock);

			ipmr_cache_free(c);
		}
	}

	if (atomic_read(&cache_resolve_queue_len) != 0) {
		struct mfc_cache *c;

		spin_lock_bh(&mfc_unres_lock);
		while (mfc_unres_queue != NULL) {
			c = mfc_unres_queue;
			mfc_unres_queue = c->next;
			spin_unlock_bh(&mfc_unres_lock);

			ipmr_destroy_unres(c);

			spin_lock_bh(&mfc_unres_lock);
		}
		spin_unlock_bh(&mfc_unres_lock);
	}
}

static void mrtsock_destruct(struct sock *sk)
{
	rtnl_lock();
	if (sk == init_net.ipv4.mroute_sk) {
		IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)--;

		write_lock_bh(&mrt_lock);
		init_net.ipv4.mroute_sk = NULL;
		write_unlock_bh(&mrt_lock);

		mroute_clean_tables(sk);
	}
	rtnl_unlock();
}

/*
 *	Socket options and virtual interface manipulation. The whole
 *	virtual interface system is a complete heap, but unfortunately
 *	that's how BSD mrouted happens to think. Maybe one day with a proper
 *	MOSPF/PIM router set up we can clean this up.
 */
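
/* An illustrative userspace sketch (not part of this file) of the
   sequence a daemon drives through these options; option and struct
   names are the standard ones from linux/mroute.h, and the vif/mfc
   values below are placeholders:

	int s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
	int one = 1;
	struct vifctl vc = { .vifc_vifi = 0, .vifc_threshold = 1 };
	struct mfcctl mc = { .mfcc_parent = 0 };

	setsockopt(s, IPPROTO_IP, MRT_INIT, &one, sizeof(one));
	vc.vifc_lcl_addr.s_addr = ...;	// a configured local address
	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
	setsockopt(s, IPPROTO_IP, MRT_ADD_MFC, &mc, sizeof(mc));
	...
	setsockopt(s, IPPROTO_IP, MRT_DONE, &one, sizeof(one));
 */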

int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, int optlen)
{
	int ret;
	struct vifctl vif;
	struct mfcctl mfc;

	if (optname != MRT_INIT) {
		if (sk != init_net.ipv4.mroute_sk && !capable(CAP_NET_ADMIN))
			return -EACCES;
	}

	switch (optname) {
	case MRT_INIT:
		if (sk->sk_type != SOCK_RAW ||
		    inet_sk(sk)->num != IPPROTO_IGMP)
			return -EOPNOTSUPP;
		if (optlen != sizeof(int))
			return -ENOPROTOOPT;

		rtnl_lock();
		if (init_net.ipv4.mroute_sk) {
			rtnl_unlock();
			return -EADDRINUSE;
		}

		ret = ip_ra_control(sk, 1, mrtsock_destruct);
		if (ret == 0) {
			write_lock_bh(&mrt_lock);
			init_net.ipv4.mroute_sk = sk;
			write_unlock_bh(&mrt_lock);

			IPV4_DEVCONF_ALL(sock_net(sk), MC_FORWARDING)++;
		}
		rtnl_unlock();
		return ret;
	case MRT_DONE:
		if (sk != init_net.ipv4.mroute_sk)
			return -EACCES;
		return ip_ra_control(sk, 0, NULL);
	case MRT_ADD_VIF:
	case MRT_DEL_VIF:
		if (optlen != sizeof(vif))
			return -EINVAL;
		if (copy_from_user(&vif, optval, sizeof(vif)))
			return -EFAULT;
		if (vif.vifc_vifi >= MAXVIFS)
			return -ENFILE;
		rtnl_lock();
		if (optname == MRT_ADD_VIF) {
			ret = vif_add(&vif, sk == init_net.ipv4.mroute_sk);
		} else {
			ret = vif_delete(vif.vifc_vifi, 0);
		}
		rtnl_unlock();
		return ret;

		/*
		 *	Manipulate the forwarding caches. These live
		 *	in a sort of kernel/user symbiosis.
		 */
	case MRT_ADD_MFC:
	case MRT_DEL_MFC:
		if (optlen != sizeof(mfc))
			return -EINVAL;
		if (copy_from_user(&mfc, optval, sizeof(mfc)))
			return -EFAULT;
		rtnl_lock();
		if (optname == MRT_DEL_MFC)
			ret = ipmr_mfc_delete(&mfc);
		else
			ret = ipmr_mfc_add(&mfc, sk == init_net.ipv4.mroute_sk);
		rtnl_unlock();
		return ret;
		/*
		 *	Control PIM assert.
		 */
	case MRT_ASSERT:
	{
		int v;
		if (get_user(v, (int __user *)optval))
			return -EFAULT;
		mroute_do_assert = (v) ? 1 : 0;
		return 0;
	}
#ifdef CONFIG_IP_PIMSM
	case MRT_PIM:
	{
		int v;

		if (get_user(v, (int __user *)optval))
			return -EFAULT;
		v = (v) ? 1 : 0;

		rtnl_lock();
		ret = 0;
		if (v != mroute_do_pim) {
			mroute_do_pim = v;
			mroute_do_assert = v;
#ifdef CONFIG_IP_PIMSM_V2
			if (mroute_do_pim)
				ret = inet_add_protocol(&pim_protocol,
							IPPROTO_PIM);
			else
				ret = inet_del_protocol(&pim_protocol,
							IPPROTO_PIM);
			if (ret < 0)
				ret = -EAGAIN;
#endif
		}
		rtnl_unlock();
		return ret;
	}
#endif
	/*
	 *	Spurious command, or MRT_VERSION which you cannot
	 *	set.
	 */
	default:
		return -ENOPROTOOPT;
	}
}

/*
 *	getsockopt() support for the multicast routing system.
 */
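
/* Illustrative reader-side sketch (option names from linux/mroute.h):

	int ver;
	socklen_t len = sizeof(ver);
	if (getsockopt(s, IPPROTO_IP, MRT_VERSION, &ver, &len) == 0)
		printf("mroute version 0x%04x\n", ver);	// 0x0305 here
 */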

int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
{
	int olr;
	int val;

	if (optname != MRT_VERSION &&
#ifdef CONFIG_IP_PIMSM
	    optname != MRT_PIM &&
#endif
	    optname != MRT_ASSERT)
		return -ENOPROTOOPT;

	if (get_user(olr, optlen))
		return -EFAULT;

	olr = min_t(unsigned int, olr, sizeof(int));
	if (olr < 0)
		return -EINVAL;

	if (put_user(olr, optlen))
		return -EFAULT;
	if (optname == MRT_VERSION)
		val = 0x0305;
#ifdef CONFIG_IP_PIMSM
	else if (optname == MRT_PIM)
		val = mroute_do_pim;
#endif
	else
		val = mroute_do_assert;
	if (copy_to_user(optval, &val, olr))
		return -EFAULT;
	return 0;
}

/*
 *	The IP multicast ioctl support routines.
 */
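
/* Illustrative sketch of how a daemon polls per-vif counters through the
   ioctl below (struct sioc_vif_req is from linux/mroute.h):

	struct sioc_vif_req vr = { .vifi = 0 };
	if (ioctl(s, SIOCGETVIFCNT, &vr) == 0)
		printf("vif0: %lu pkts in, %lu pkts out\n",
		       vr.icount, vr.ocount);
 */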

int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
	struct sioc_sg_req sr;
	struct sioc_vif_req vr;
	struct vif_device *vif;
	struct mfc_cache *c;

	switch (cmd) {
	case SIOCGETVIFCNT:
		if (copy_from_user(&vr, arg, sizeof(vr)))
			return -EFAULT;
		if (vr.vifi >= init_net.ipv4.maxvif)
			return -EINVAL;
		read_lock(&mrt_lock);
		vif = &init_net.ipv4.vif_table[vr.vifi];
		if (VIF_EXISTS(&init_net, vr.vifi)) {
			vr.icount = vif->pkt_in;
			vr.ocount = vif->pkt_out;
			vr.ibytes = vif->bytes_in;
			vr.obytes = vif->bytes_out;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg, &vr, sizeof(vr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	case SIOCGETSGCNT:
		if (copy_from_user(&sr, arg, sizeof(sr)))
			return -EFAULT;

		read_lock(&mrt_lock);
		c = ipmr_cache_find(sr.src.s_addr, sr.grp.s_addr);
		if (c) {
			sr.pktcnt = c->mfc_un.res.pkt;
			sr.bytecnt = c->mfc_un.res.bytes;
			sr.wrong_if = c->mfc_un.res.wrong_if;
			read_unlock(&mrt_lock);

			if (copy_to_user(arg, &sr, sizeof(sr)))
				return -EFAULT;
			return 0;
		}
		read_unlock(&mrt_lock);
		return -EADDRNOTAVAIL;
	default:
		return -ENOIOCTLCMD;
	}
}


static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
	struct net_device *dev = ptr;
	struct vif_device *v;
	int ct;

	if (!net_eq(dev_net(dev), &init_net))
		return NOTIFY_DONE;

	if (event != NETDEV_UNREGISTER)
		return NOTIFY_DONE;
	v = &init_net.ipv4.vif_table[0];
	for (ct = 0; ct < init_net.ipv4.maxvif; ct++, v++) {
		if (v->dev == dev)
			vif_delete(ct, 1);
	}
	return NOTIFY_DONE;
}


static struct notifier_block ip_mr_notifier = {
	.notifier_call = ipmr_device_event,
};

/*
 *	Encapsulate a packet by attaching a valid IPIP header to it.
 *	This avoids tunnel drivers and other mess and gives us the speed so
 *	important for multicast video.
 */
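
/* Resulting layout, sketched for illustration; the new outer header is
   built in place in front of the original datagram:

	before:          [ iphdr | payload ]
	after ip_encap:  [ outer iphdr (proto IPPROTO_IPIP) | iphdr | payload ]

   TOS and TTL are copied from the inner header, and the outer checksum is
   recomputed with ip_send_check().
 */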
1167
1168 static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr)
1169 {
1170         struct iphdr *iph;
1171         struct iphdr *old_iph = ip_hdr(skb);
1172
1173         skb_push(skb, sizeof(struct iphdr));
1174         skb->transport_header = skb->network_header;
1175         skb_reset_network_header(skb);
1176         iph = ip_hdr(skb);
1177
1178         iph->version    =       4;
1179         iph->tos        =       old_iph->tos;
1180         iph->ttl        =       old_iph->ttl;
1181         iph->frag_off   =       0;
1182         iph->daddr      =       daddr;
1183         iph->saddr      =       saddr;
1184         iph->protocol   =       IPPROTO_IPIP;
1185         iph->ihl        =       5;
1186         iph->tot_len    =       htons(skb->len);
1187         ip_select_ident(iph, skb->dst, NULL);
1188         ip_send_check(iph);
1189
1190         memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
1191         nf_reset(skb);
1192 }
1193
1194 static inline int ipmr_forward_finish(struct sk_buff *skb)
1195 {
1196         struct ip_options * opt = &(IPCB(skb)->opt);
1197
1198         IP_INC_STATS_BH(dev_net(skb->dst->dev), IPSTATS_MIB_OUTFORWDATAGRAMS);
1199
1200         if (unlikely(opt->optlen))
1201                 ip_forward_options(skb);
1202
1203         return dst_output(skb);
1204 }
1205
1206 /*
1207  *      Processing handlers for ipmr_forward
1208  */
1209
1210 static void ipmr_queue_xmit(struct sk_buff *skb, struct mfc_cache *c, int vifi)
1211 {
1212         const struct iphdr *iph = ip_hdr(skb);
1213         struct vif_device *vif = &init_net.ipv4.vif_table[vifi];
1214         struct net_device *dev;
1215         struct rtable *rt;
1216         int    encap = 0;
1217
1218         if (vif->dev == NULL)
1219                 goto out_free;
1220
1221 #ifdef CONFIG_IP_PIMSM
1222         if (vif->flags & VIFF_REGISTER) {
1223                 vif->pkt_out++;
1224                 vif->bytes_out += skb->len;
1225                 vif->dev->stats.tx_bytes += skb->len;
1226                 vif->dev->stats.tx_packets++;
1227                 ipmr_cache_report(skb, vifi, IGMPMSG_WHOLEPKT);
1228                 kfree_skb(skb);
1229                 return;
1230         }
1231 #endif
1232
1233         if (vif->flags&VIFF_TUNNEL) {
1234                 struct flowi fl = { .oif = vif->link,
1235                                     .nl_u = { .ip4_u =
1236                                               { .daddr = vif->remote,
1237                                                 .saddr = vif->local,
1238                                                 .tos = RT_TOS(iph->tos) } },
1239                                     .proto = IPPROTO_IPIP };
1240                 if (ip_route_output_key(&init_net, &rt, &fl))
1241                         goto out_free;
1242                 encap = sizeof(struct iphdr);
1243         } else {
1244                 struct flowi fl = { .oif = vif->link,
1245                                     .nl_u = { .ip4_u =
1246                                               { .daddr = iph->daddr,
1247                                                 .tos = RT_TOS(iph->tos) } },
1248                                     .proto = IPPROTO_IPIP };
1249                 if (ip_route_output_key(&init_net, &rt, &fl))
1250                         goto out_free;
1251         }
1252
1253         dev = rt->u.dst.dev;
1254
1255         if (skb->len+encap > dst_mtu(&rt->u.dst) && (ntohs(iph->frag_off) & IP_DF)) {
1256                 /* Do not fragment multicasts. Alas, IPv4 does not
1257                    allow to send ICMP, so that packets will disappear
1258                    to blackhole.
1259                  */
1260
1261                 IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS);
1262                 ip_rt_put(rt);
1263                 goto out_free;
1264         }
1265
1266         encap += LL_RESERVED_SPACE(dev) + rt->u.dst.header_len;
1267
1268         if (skb_cow(skb, encap)) {
1269                 ip_rt_put(rt);
1270                 goto out_free;
1271         }
1272
1273         vif->pkt_out++;
1274         vif->bytes_out += skb->len;
1275
1276         dst_release(skb->dst);
1277         skb->dst = &rt->u.dst;
1278         ip_decrease_ttl(ip_hdr(skb));
1279
1280         /* FIXME: forward and output firewalls used to be called here.
1281          * What do we do with netfilter? -- RR */
1282         if (vif->flags & VIFF_TUNNEL) {
1283                 ip_encap(skb, vif->local, vif->remote);
1284                 /* FIXME: extra output firewall step used to be here. --RR */
1285                 vif->dev->stats.tx_packets++;
1286                 vif->dev->stats.tx_bytes += skb->len;
1287         }
1288
1289         IPCB(skb)->flags |= IPSKB_FORWARDED;
1290
1291         /*
1292          * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
1293          * not only before forwarding, but after forwarding on all output
1294          * interfaces. It is clear, if mrouter runs a multicasting
1295          * program, it should receive packets not depending to what interface
1296          * program is joined.
1297          * If we will not make it, the program will have to join on all
1298          * interfaces. On the other hand, multihoming host (or router, but
1299          * not mrouter) cannot join to more than one interface - it will
1300          * result in receiving multiple packets.
1301          */
1302         NF_HOOK(PF_INET, NF_INET_FORWARD, skb, skb->dev, dev,
1303                 ipmr_forward_finish);
1304         return;
1305
1306 out_free:
1307         kfree_skb(skb);
1308         return;
1309 }
1310
1311 static int ipmr_find_vif(struct net_device *dev)
1312 {
1313         int ct;
1314         for (ct = init_net.ipv4.maxvif-1; ct >= 0; ct--) {
1315                 if (init_net.ipv4.vif_table[ct].dev == dev)
1316                         break;
1317         }
1318         return ct;
1319 }
1320
1321 /* "local" means that we should preserve one skb (for local delivery) */
1322
1323 static int ip_mr_forward(struct sk_buff *skb, struct mfc_cache *cache, int local)
1324 {
1325         int psend = -1;
1326         int vif, ct;
1327
1328         vif = cache->mfc_parent;
1329         cache->mfc_un.res.pkt++;
1330         cache->mfc_un.res.bytes += skb->len;
1331
1332         /*
1333          * Wrong interface: drop packet and (maybe) send PIM assert.
1334          */
1335         if (init_net.ipv4.vif_table[vif].dev != skb->dev) {
1336                 int true_vifi;
1337
1338                 if (skb->rtable->fl.iif == 0) {
1339                         /* It is our own packet, looped back.
1340                            Very complicated situation...
1341
1342                            The best workaround until routing daemons will be
1343                            fixed is not to redistribute packet, if it was
1344                            send through wrong interface. It means, that
1345                            multicast applications WILL NOT work for
1346                            (S,G), which have default multicast route pointing
1347                            to wrong oif. In any case, it is not a good
1348                            idea to use multicasting applications on router.
1349                          */
1350                         goto dont_forward;
1351                 }
1352
1353                 cache->mfc_un.res.wrong_if++;
1354                 true_vifi = ipmr_find_vif(skb->dev);
1355
1356                 if (true_vifi >= 0 && mroute_do_assert &&
1357                     /* pimsm uses asserts, when switching from RPT to SPT,
1358                        so that we cannot check that packet arrived on an oif.
1359                        It is bad, but otherwise we would need to move pretty
1360                        large chunk of pimd to kernel. Ough... --ANK
1361                      */
1362                     (mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) &&
1363                     time_after(jiffies,
1364                                cache->mfc_un.res.last_assert + MFC_ASSERT_THRESH)) {
1365                         cache->mfc_un.res.last_assert = jiffies;
1366                         ipmr_cache_report(skb, true_vifi, IGMPMSG_WRONGVIF);
1367                 }
1368                 goto dont_forward;
1369         }
1370
1371         init_net.ipv4.vif_table[vif].pkt_in++;
1372         init_net.ipv4.vif_table[vif].bytes_in += skb->len;
1373
1374         /*
1375          *      Forward the frame
1376          */
1377         for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) {
1378                 if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) {
1379                         if (psend != -1) {
1380                                 struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1381                                 if (skb2)
1382                                         ipmr_queue_xmit(skb2, cache, psend);
1383                         }
1384                         psend = ct;
1385                 }
1386         }
1387         if (psend != -1) {
1388                 if (local) {
1389                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1390                         if (skb2)
1391                                 ipmr_queue_xmit(skb2, cache, psend);
1392                 } else {
1393                         ipmr_queue_xmit(skb, cache, psend);
1394                         return 0;
1395                 }
1396         }
1397
1398 dont_forward:
1399         if (!local)
1400                 kfree_skb(skb);
1401         return 0;
1402 }
1403
1404
1405 /*
1406  *      Multicast packets for forwarding arrive here
1407  */
1408
1409 int ip_mr_input(struct sk_buff *skb)
1410 {
1411         struct mfc_cache *cache;
1412         int local = skb->rtable->rt_flags&RTCF_LOCAL;
1413
1414         /* Packet is looped back after forward, it should not be
1415            forwarded second time, but still can be delivered locally.
1416          */
1417         if (IPCB(skb)->flags&IPSKB_FORWARDED)
1418                 goto dont_forward;
1419
1420         if (!local) {
1421                     if (IPCB(skb)->opt.router_alert) {
1422                             if (ip_call_ra_chain(skb))
1423                                     return 0;
1424                     } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){
1425                             /* IGMPv1 (and broken IGMPv2 implementations sort of
1426                                Cisco IOS <= 11.2(8)) do not put router alert
1427                                option to IGMP packets destined to routable
1428                                groups. It is very bad, because it means
1429                                that we can forward NO IGMP messages.
1430                              */
1431                             read_lock(&mrt_lock);
1432                             if (init_net.ipv4.mroute_sk) {
1433                                     nf_reset(skb);
1434                                     raw_rcv(init_net.ipv4.mroute_sk, skb);
1435                                     read_unlock(&mrt_lock);
1436                                     return 0;
1437                             }
1438                             read_unlock(&mrt_lock);
1439                     }
1440         }
1441
1442         read_lock(&mrt_lock);
1443         cache = ipmr_cache_find(ip_hdr(skb)->saddr, ip_hdr(skb)->daddr);
1444
1445         /*
1446          *      No usable cache entry
1447          */
1448         if (cache == NULL) {
1449                 int vif;
1450
1451                 if (local) {
1452                         struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
1453                         ip_local_deliver(skb);
1454                         if (skb2 == NULL) {
1455                                 read_unlock(&mrt_lock);
1456                                 return -ENOBUFS;
1457                         }
1458                         skb = skb2;
1459                 }
1460
1461                 vif = ipmr_find_vif(skb->dev);
1462                 if (vif >= 0) {
1463                         int err = ipmr_cache_unresolved(vif, skb);
1464                         read_unlock(&mrt_lock);
1465
1466                         return err;
1467                 }
1468                 read_unlock(&mrt_lock);
1469                 kfree_skb(skb);
1470                 return -ENODEV;
1471         }
1472
1473         ip_mr_forward(skb, cache, local);
1474
1475         read_unlock(&mrt_lock);
1476
1477         if (local)
1478                 return ip_local_deliver(skb);
1479
1480         return 0;
1481
1482 dont_forward:
1483         if (local)
1484                 return ip_local_deliver(skb);
1485         kfree_skb(skb);
1486         return 0;
1487 }
1488
1489 #ifdef CONFIG_IP_PIMSM
1490 static int __pim_rcv(struct sk_buff *skb, unsigned int pimlen)
1491 {
1492         struct net_device *reg_dev = NULL;
1493         struct iphdr *encap;
1494
1495         encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
1496         /*
1497            Check that:
1498            a. packet is really destinted to a multicast group
1499            b. packet is not a NULL-REGISTER
1500            c. packet is not truncated
1501          */
1502         if (!ipv4_is_multicast(encap->daddr) ||
1503             encap->tot_len == 0 ||
1504             ntohs(encap->tot_len) + pimlen > skb->len)
1505                 return 1;
1506
1507         read_lock(&mrt_lock);
1508         if (reg_vif_num >= 0)
1509                 reg_dev = init_net.ipv4.vif_table[reg_vif_num].dev;
1510         if (reg_dev)
1511                 dev_hold(reg_dev);
1512         read_unlock(&mrt_lock);
1513
1514         if (reg_dev == NULL)
1515                 return 1;
1516
1517         skb->mac_header = skb->network_header;
1518         skb_pull(skb, (u8*)encap - skb->data);
1519         skb_reset_network_header(skb);
1520         skb->dev = reg_dev;
1521         skb->protocol = htons(ETH_P_IP);
1522         skb->ip_summed = 0;
1523         skb->pkt_type = PACKET_HOST;
1524         dst_release(skb->dst);
1525         skb->dst = NULL;
1526         reg_dev->stats.rx_bytes += skb->len;
1527         reg_dev->stats.rx_packets++;
1528         nf_reset(skb);
1529         netif_rx(skb);
1530         dev_put(reg_dev);
1531
1532         return 0;
1533 }
1534 #endif
1535
1536 #ifdef CONFIG_IP_PIMSM_V1
1537 /*
1538  * Handle IGMP messages of PIMv1
1539  */
1540
1541 int pim_rcv_v1(struct sk_buff * skb)
1542 {
1543         struct igmphdr *pim;
1544
1545         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1546                 goto drop;
1547
1548         pim = igmp_hdr(skb);
1549
1550         if (!mroute_do_pim ||
1551             pim->group != PIM_V1_VERSION || pim->code != PIM_V1_REGISTER)
1552                 goto drop;
1553
1554         if (__pim_rcv(skb, sizeof(*pim))) {
1555 drop:
1556                 kfree_skb(skb);
1557         }
1558         return 0;
1559 }
1560 #endif
1561
1562 #ifdef CONFIG_IP_PIMSM_V2
1563 static int pim_rcv(struct sk_buff * skb)
1564 {
1565         struct pimreghdr *pim;
1566
1567         if (!pskb_may_pull(skb, sizeof(*pim) + sizeof(struct iphdr)))
1568                 goto drop;
1569
1570         pim = (struct pimreghdr *)skb_transport_header(skb);
1571         if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) ||
1572             (pim->flags&PIM_NULL_REGISTER) ||
1573             (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 &&
1574              csum_fold(skb_checksum(skb, 0, skb->len, 0))))
1575                 goto drop;
1576
1577         if (__pim_rcv(skb, sizeof(*pim))) {
1578 drop:
1579                 kfree_skb(skb);
1580         }
1581         return 0;
1582 }
1583 #endif
1584
1585 static int
1586 ipmr_fill_mroute(struct sk_buff *skb, struct mfc_cache *c, struct rtmsg *rtm)
1587 {
1588         int ct;
1589         struct rtnexthop *nhp;
1590         struct net_device *dev = init_net.ipv4.vif_table[c->mfc_parent].dev;
1591         u8 *b = skb_tail_pointer(skb);
1592         struct rtattr *mp_head;
1593
1594         if (dev)
1595                 RTA_PUT(skb, RTA_IIF, 4, &dev->ifindex);
1596
1597         mp_head = (struct rtattr *)skb_put(skb, RTA_LENGTH(0));
1598
1599         for (ct = c->mfc_un.res.minvif; ct < c->mfc_un.res.maxvif; ct++) {
1600                 if (c->mfc_un.res.ttls[ct] < 255) {
1601                         if (skb_tailroom(skb) < RTA_ALIGN(RTA_ALIGN(sizeof(*nhp)) + 4))
1602                                 goto rtattr_failure;
1603                         nhp = (struct rtnexthop *)skb_put(skb, RTA_ALIGN(sizeof(*nhp)));
1604                         nhp->rtnh_flags = 0;
1605                         nhp->rtnh_hops = c->mfc_un.res.ttls[ct];
1606                         nhp->rtnh_ifindex = init_net.ipv4.vif_table[ct].dev->ifindex;
1607                         nhp->rtnh_len = sizeof(*nhp);
1608                 }
1609         }
1610         mp_head->rta_type = RTA_MULTIPATH;
1611         mp_head->rta_len = skb_tail_pointer(skb) - (u8 *)mp_head;
1612         rtm->rtm_type = RTN_MULTICAST;
1613         return 1;
1614
1615 rtattr_failure:
1616         nlmsg_trim(skb, b);
1617         return -EMSGSIZE;
1618 }
1619
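/*
 * rtnetlink RTM_GETROUTE support: report the multicast forwarding state
 * for the flow cached in skb->rtable, queueing an upcall to the routing
 * daemon if no cache entry exists yet.
 */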
1620 int ipmr_get_route(struct sk_buff *skb, struct rtmsg *rtm, int nowait)
1621 {
1622         int err;
1623         struct mfc_cache *cache;
1624         struct rtable *rt = skb->rtable;
1625
1626         read_lock(&mrt_lock);
1627         cache = ipmr_cache_find(rt->rt_src, rt->rt_dst);
1628
1629         if (cache == NULL) {
1630                 struct sk_buff *skb2;
1631                 struct iphdr *iph;
1632                 struct net_device *dev;
1633                 int vif;
1634
1635                 if (nowait) {
1636                         read_unlock(&mrt_lock);
1637                         return -EAGAIN;
1638                 }
1639
1640                 dev = skb->dev;
1641                 if (dev == NULL || (vif = ipmr_find_vif(dev)) < 0) {
1642                         read_unlock(&mrt_lock);
1643                         return -ENODEV;
1644                 }
1645                 skb2 = skb_clone(skb, GFP_ATOMIC);
1646                 if (!skb2) {
1647                         read_unlock(&mrt_lock);
1648                         return -ENOMEM;
1649                 }
1650
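                /* Build a dummy IP header on the clone; version 0 marks it
                 * as a pending netlink request rather than a real packet,
                 * which is how ipmr_cache_resolve() tells the two apart
                 * once the daemon resolves the entry.
                 */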
1651                 skb_push(skb2, sizeof(struct iphdr));
1652                 skb_reset_network_header(skb2);
1653                 iph = ip_hdr(skb2);
1654                 iph->ihl = sizeof(struct iphdr) >> 2;
1655                 iph->saddr = rt->rt_src;
1656                 iph->daddr = rt->rt_dst;
1657                 iph->version = 0;
1658                 err = ipmr_cache_unresolved(vif, skb2);
1659                 read_unlock(&mrt_lock);
1660                 return err;
1661         }
1662
1663         if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY))
1664                 cache->mfc_flags |= MFC_NOTIFY;
1665         err = ipmr_fill_mroute(skb, cache, rtm);
1666         read_unlock(&mrt_lock);
1667         return err;
1668 }
1669
1670 #ifdef CONFIG_PROC_FS
1671 /*
1672  *      The /proc interfaces to multicast routing: /proc/net/ip_mr_cache and /proc/net/ip_mr_vif
1673  */
1674 struct ipmr_vif_iter {
1675         int ct;
1676 };
1677
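/* Position the vif walk at offset pos, skipping holes in the table. */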
1678 static struct vif_device *ipmr_vif_seq_idx(struct ipmr_vif_iter *iter,
1679                                            loff_t pos)
1680 {
1681         for (iter->ct = 0; iter->ct < init_net.ipv4.maxvif; ++iter->ct) {
1682                 if (!VIF_EXISTS(&init_net, iter->ct))
1683                         continue;
1684                 if (pos-- == 0)
1685                         return &init_net.ipv4.vif_table[iter->ct];
1686         }
1687         return NULL;
1688 }
1689
1690 static void *ipmr_vif_seq_start(struct seq_file *seq, loff_t *pos)
1691         __acquires(mrt_lock)
1692 {
1693         read_lock(&mrt_lock);
1694         return *pos ? ipmr_vif_seq_idx(seq->private, *pos - 1)
1695                 : SEQ_START_TOKEN;
1696 }
1697
1698 static void *ipmr_vif_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1699 {
1700         struct ipmr_vif_iter *iter = seq->private;
1701
1702         ++*pos;
1703         if (v == SEQ_START_TOKEN)
1704                 return ipmr_vif_seq_idx(iter, 0);
1705
1706         while (++iter->ct < init_net.ipv4.maxvif) {
1707                 if (!VIF_EXISTS(&init_net, iter->ct))
1708                         continue;
1709                 return &init_net.ipv4.vif_table[iter->ct];
1710         }
1711         return NULL;
1712 }
1713
1714 static void ipmr_vif_seq_stop(struct seq_file *seq, void *v)
1715         __releases(mrt_lock)
1716 {
1717         read_unlock(&mrt_lock);
1718 }
1719
1720 static int ipmr_vif_seq_show(struct seq_file *seq, void *v)
1721 {
1722         if (v == SEQ_START_TOKEN) {
1723                 seq_puts(seq,
1724                          "Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote\n");
1725         } else {
1726                 const struct vif_device *vif = v;
1727                 const char *name =  vif->dev ? vif->dev->name : "none";
1728
1729                 seq_printf(seq,
1730                            "%2zd %-10s %8ld %7ld  %8ld %7ld %05X %08X %08X\n",
1731                            vif - init_net.ipv4.vif_table,
1732                            name, vif->bytes_in, vif->pkt_in,
1733                            vif->bytes_out, vif->pkt_out,
1734                            vif->flags, vif->local, vif->remote);
1735         }
1736         return 0;
1737 }
1738
1739 static const struct seq_operations ipmr_vif_seq_ops = {
1740         .start = ipmr_vif_seq_start,
1741         .next  = ipmr_vif_seq_next,
1742         .stop  = ipmr_vif_seq_stop,
1743         .show  = ipmr_vif_seq_show,
1744 };
1745
1746 static int ipmr_vif_open(struct inode *inode, struct file *file)
1747 {
1748         return seq_open_private(file, &ipmr_vif_seq_ops,
1749                         sizeof(struct ipmr_vif_iter));
1750 }
1751
1752 static const struct file_operations ipmr_vif_fops = {
1753         .owner   = THIS_MODULE,
1754         .open    = ipmr_vif_open,
1755         .read    = seq_read,
1756         .llseek  = seq_lseek,
1757         .release = seq_release_private,
1758 };
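/*
 * Illustrative /proc/net/ip_mr_vif output (the header line comes from
 * ipmr_vif_seq_show() above; the data row and its values are invented
 * for the example):
 *
 *	Interface      BytesIn  PktsIn  BytesOut PktsOut Flags Local    Remote
 *	 0 eth0            9324      78      4660      42 00000 C0A80101 00000000
 */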
1759
1760 struct ipmr_mfc_iter {
1761         struct mfc_cache **cache;
1762         int ct;
1763 };
1764
1765
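/*
 * The mfc dump spans two lists guarded by two different locks: the
 * resolved hash buckets under mrt_lock, then the unresolved queue under
 * mfc_unres_lock.  it->cache records which list is being walked so that
 * ipmr_mfc_seq_next() and ipmr_mfc_seq_stop() drop the right lock.
 */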
1766 static struct mfc_cache *ipmr_mfc_seq_idx(struct ipmr_mfc_iter *it, loff_t pos)
1767 {
1768         struct mfc_cache *mfc;
1769
1770         it->cache = mfc_cache_array;
1771         read_lock(&mrt_lock);
1772         for (it->ct = 0; it->ct < MFC_LINES; it->ct++)
1773                 for (mfc = mfc_cache_array[it->ct]; mfc; mfc = mfc->next)
1774                         if (pos-- == 0)
1775                                 return mfc;
1776         read_unlock(&mrt_lock);
1777
1778         it->cache = &mfc_unres_queue;
1779         spin_lock_bh(&mfc_unres_lock);
1780         for (mfc = mfc_unres_queue; mfc; mfc = mfc->next)
1781                 if (pos-- == 0)
1782                         return mfc;
1783         spin_unlock_bh(&mfc_unres_lock);
1784
1785         it->cache = NULL;
1786         return NULL;
1787 }
1788
1789
1790 static void *ipmr_mfc_seq_start(struct seq_file *seq, loff_t *pos)
1791 {
1792         struct ipmr_mfc_iter *it = seq->private;
1793         it->cache = NULL;
1794         it->ct = 0;
1795         return *pos ? ipmr_mfc_seq_idx(seq->private, *pos - 1)
1796                 : SEQ_START_TOKEN;
1797 }
1798
1799 static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1800 {
1801         struct mfc_cache *mfc = v;
1802         struct ipmr_mfc_iter *it = seq->private;
1803
1804         ++*pos;
1805
1806         if (v == SEQ_START_TOKEN)
1807                 return ipmr_mfc_seq_idx(seq->private, 0);
1808
1809         if (mfc->next)
1810                 return mfc->next;
1811
1812         if (it->cache == &mfc_unres_queue)
1813                 goto end_of_list;
1814
1815         BUG_ON(it->cache != mfc_cache_array);
1816
1817         while (++it->ct < MFC_LINES) {
1818                 mfc = mfc_cache_array[it->ct];
1819                 if (mfc)
1820                         return mfc;
1821         }
1822
1823         /* exhausted cache_array, show unresolved */
1824         read_unlock(&mrt_lock);
1825         it->cache = &mfc_unres_queue;
1826         it->ct = 0;
1827
1828         spin_lock_bh(&mfc_unres_lock);
1829         mfc = mfc_unres_queue;
1830         if (mfc)
1831                 return mfc;
1832
1833  end_of_list:
1834         spin_unlock_bh(&mfc_unres_lock);
1835         it->cache = NULL;
1836
1837         return NULL;
1838 }
1839
1840 static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v)
1841 {
1842         struct ipmr_mfc_iter *it = seq->private;
1843
1844         if (it->cache == &mfc_unres_queue)
1845                 spin_unlock_bh(&mfc_unres_lock);
1846         else if (it->cache == mfc_cache_array)
1847                 read_unlock(&mrt_lock);
1848 }
1849
1850 static int ipmr_mfc_seq_show(struct seq_file *seq, void *v)
1851 {
1852         int n;
1853
1854         if (v == SEQ_START_TOKEN) {
1855                 seq_puts(seq,
1856                  "Group    Origin   Iif     Pkts    Bytes    Wrong Oifs\n");
1857         } else {
1858                 const struct mfc_cache *mfc = v;
1859                 const struct ipmr_mfc_iter *it = seq->private;
1860
1861                 seq_printf(seq, "%08lX %08lX %-3hd",
1862                            (unsigned long) mfc->mfc_mcastgrp,
1863                            (unsigned long) mfc->mfc_origin,
1864                            mfc->mfc_parent);
1865
1866                 if (it->cache != &mfc_unres_queue) {
1867                         seq_printf(seq, " %8lu %8lu %8lu",
1868                                    mfc->mfc_un.res.pkt,
1869                                    mfc->mfc_un.res.bytes,
1870                                    mfc->mfc_un.res.wrong_if);
1871                         for (n = mfc->mfc_un.res.minvif;
1872                              n < mfc->mfc_un.res.maxvif; n++) {
1873                                 if (VIF_EXISTS(&init_net, n) &&
1874                                     mfc->mfc_un.res.ttls[n] < 255)
1875                                         seq_printf(seq,
1876                                            " %2d:%-3d",
1877                                            n, mfc->mfc_un.res.ttls[n]);
1878                         }
1879                 } else {
1880                         /* unresolved mfc_caches don't contain
1881                          * pkt, bytes and wrong_if values
1882                          */
1883                         seq_printf(seq, " %8lu %8lu %8lu", 0ul, 0ul, 0ul);
1884                 }
1885                 seq_putc(seq, '\n');
1886         }
1887         return 0;
1888 }
1889
1890 static const struct seq_operations ipmr_mfc_seq_ops = {
1891         .start = ipmr_mfc_seq_start,
1892         .next  = ipmr_mfc_seq_next,
1893         .stop  = ipmr_mfc_seq_stop,
1894         .show  = ipmr_mfc_seq_show,
1895 };
1896
1897 static int ipmr_mfc_open(struct inode *inode, struct file *file)
1898 {
1899         return seq_open_private(file, &ipmr_mfc_seq_ops,
1900                         sizeof(struct ipmr_mfc_iter));
1901 }
1902
1903 static const struct file_operations ipmr_mfc_fops = {
1904         .owner   = THIS_MODULE,
1905         .open    = ipmr_mfc_open,
1906         .read    = seq_read,
1907         .llseek  = seq_lseek,
1908         .release = seq_release_private,
1909 };
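/*
 * Illustrative /proc/net/ip_mr_cache output (the header line comes from
 * ipmr_mfc_seq_show() above; the sample entry is invented):
 *
 *	Group    Origin   Iif     Pkts    Bytes    Wrong Oifs
 *	E2010101 C0A80101 0          123    98765        0  1:1    2:1
 */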
1910 #endif
1911
1912 #ifdef CONFIG_IP_PIMSM_V2
1913 static struct net_protocol pim_protocol = {
1914         .handler        =       pim_rcv,
1915 };
1916 #endif
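/*
 * pim_rcv() is not registered unconditionally at init time; the handler
 * for IPPROTO_PIM is added elsewhere in this file when a daemon enables
 * PIM via the MRT_PIM socket option.
 */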
1917
1918
1919 /*
1920  *      Setup for IP multicast routing
1921  */
1922 static int __net_init ipmr_net_init(struct net *net)
1923 {
1924         int err = 0;
1925
1926         net->ipv4.vif_table = kcalloc(MAXVIFS, sizeof(struct vif_device),
1927                                       GFP_KERNEL);
1928         if (!net->ipv4.vif_table) {
1929                 err = -ENOMEM;
1930                 goto fail;
1931         }
1932 fail:
1933         return err;
1934 }
1935
1936 static void __net_exit ipmr_net_exit(struct net *net)
1937 {
1938         kfree(net->ipv4.vif_table);
1939 }
1940
1941 static struct pernet_operations ipmr_net_ops = {
1942         .init = ipmr_net_init,
1943         .exit = ipmr_net_exit,
1944 };
1945
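/*
 * Module initialisation.  The error unwind below releases everything in
 * reverse order of acquisition: proc entries, netdevice notifier, expire
 * timer, pernet subsystem, then the mfc slab cache.
 */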
1946 int __init ip_mr_init(void)
1947 {
1948         int err;
1949
1950         mrt_cachep = kmem_cache_create("ip_mrt_cache",
1951                                        sizeof(struct mfc_cache),
1952                                        0, SLAB_HWCACHE_ALIGN|SLAB_PANIC,
1953                                        NULL);
1954         if (!mrt_cachep)
1955                 return -ENOMEM;
1956
1957         err = register_pernet_subsys(&ipmr_net_ops);
1958         if (err)
1959                 goto reg_pernet_fail;
1960
1961         setup_timer(&ipmr_expire_timer, ipmr_expire_process, 0);
1962         err = register_netdevice_notifier(&ip_mr_notifier);
1963         if (err)
1964                 goto reg_notif_fail;
1965 #ifdef CONFIG_PROC_FS
1966         err = -ENOMEM;
1967         if (!proc_net_fops_create(&init_net, "ip_mr_vif", 0, &ipmr_vif_fops))
1968                 goto proc_vif_fail;
1969         if (!proc_net_fops_create(&init_net, "ip_mr_cache", 0, &ipmr_mfc_fops))
1970                 goto proc_cache_fail;
1971 #endif
1972         return 0;
1973 #ifdef CONFIG_PROC_FS
1974 proc_cache_fail:
1975         proc_net_remove(&init_net, "ip_mr_vif");
1976 proc_vif_fail:
1977         unregister_netdevice_notifier(&ip_mr_notifier);
1978 #endif
1979 reg_notif_fail:
1980         del_timer(&ipmr_expire_timer);
1981         unregister_pernet_subsys(&ipmr_net_ops);
1982 reg_pernet_fail:
1983         kmem_cache_destroy(mrt_cachep);
1984         return err;
1985 }