[NET]: Make socket creation namespace safe.
[safe/jmp/linux-2.6] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and
38  *                                      packet_set_ring memory leak.
39  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
40  *                                      The convention is that longer addresses
41  *                                      will simply extend the hardware address
42  *                                      byte arrays at the end of sockaddr_ll
43  *                                      and packet_mreq.
44  *
45  *              This program is free software; you can redistribute it and/or
46  *              modify it under the terms of the GNU General Public License
47  *              as published by the Free Software Foundation; either version
48  *              2 of the License, or (at your option) any later version.
49  *
50  */
51
52 #include <linux/types.h>
53 #include <linux/mm.h>
54 #include <linux/capability.h>
55 #include <linux/fcntl.h>
56 #include <linux/socket.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/if_packet.h>
61 #include <linux/wireless.h>
62 #include <linux/kernel.h>
63 #include <linux/kmod.h>
64 #include <net/net_namespace.h>
65 #include <net/ip.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
68 #include <net/sock.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
74 #include <asm/page.h>
75 #include <asm/cacheflush.h>
76 #include <asm/io.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/poll.h>
80 #include <linux/module.h>
81 #include <linux/init.h>
82
83 #ifdef CONFIG_INET
84 #include <net/inet_common.h>
85 #endif
86
87 /*
88    Assumptions:
89    - if device has no dev->hard_header routine, it adds and removes ll header
90      inside itself. In this case ll header is invisible outside of device,
91      but higher levels still should reserve dev->hard_header_len.
92      Some devices are enough clever to reallocate skb, when header
93      will not fit to reserved space (tunnel), another ones are silly
94      (PPP).
95    - packet socket receives packets with pulled ll header,
96      so that SOCK_RAW should push it back.
97
98 On receive:
99 -----------
100
101 Incoming, dev->hard_header!=NULL
102    mac_header -> ll header
103    data       -> data
104
105 Outgoing, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> ll header
108
109 Incoming, dev->hard_header==NULL
110    mac_header -> UNKNOWN position. It very likely points to the ll
111                  header.  PPP does this, which is wrong because it
112                  introduces asymmetry between the rx and tx paths.
113    data       -> data
114
115 Outgoing, dev->hard_header==NULL
116    mac_header -> data. ll header is still not built!
117    data       -> data
118
119 Resume
120   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
121
122
123 On transmit:
124 ------------
125
126 dev->hard_header != NULL
127    mac_header -> ll header
128    data       -> ll header
129
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
131    mac_header -> data
132    data       -> data
133
134    We should set nh.raw on output to the correct position;
135    the packet classifier depends on it.
136  */
137
138 /* List of all packet sockets. */
139 static HLIST_HEAD(packet_sklist);
/* Guards packet_sklist; packet_release() takes it for writing around the list removal. */
140 static DEFINE_RWLOCK(packet_sklist_lock);
141
/* Packet socket counter; packet_sock_destruct() decrements it when a dead socket is freed. */
142 static atomic_t packet_socks_nr;
143
144
145 /* Private packet socket structures. */
146
147 struct packet_mclist
148 {
149         struct packet_mclist    *next;
150         int                     ifindex;
151         int                     count;
152         unsigned short          type;
153         unsigned short          alen;
154         unsigned char           addr[MAX_ADDR_LEN];
155 };
156 /* identical to struct packet_mreq except it has
157  * a longer address field.
158  */
159 struct packet_mreq_max
160 {
161         int             mr_ifindex;
162         unsigned short  mr_type;
163         unsigned short  mr_alen;
164         unsigned char   mr_address[MAX_ADDR_LEN];
165 };
166
/*
 * Forward declarations for helpers defined later in the file.
 * packet_release() calls packet_set_ring() with a zeroed request and
 * closing == 1, and calls packet_flush_mclist() on the socket being closed.
 */
167 #ifdef CONFIG_PACKET_MMAP
168 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
169 #endif
170
171 static void packet_flush_mclist(struct sock *sk);
172
173 struct packet_sock {
174         /* struct sock has to be the first member of packet_sock */
175         struct sock             sk;
176         struct tpacket_stats    stats;
177 #ifdef CONFIG_PACKET_MMAP
178         char *                  *pg_vec;
179         unsigned int            head;
180         unsigned int            frames_per_block;
181         unsigned int            frame_size;
182         unsigned int            frame_max;
183         int                     copy_thresh;
184 #endif
185         struct packet_type      prot_hook;
186         spinlock_t              bind_lock;
187         unsigned int            running:1,      /* prot_hook is attached*/
188                                 auxdata:1,
189                                 origdev:1;
190         int                     ifindex;        /* bound device         */
191         __be16                  num;
192         struct packet_mclist    *mclist;
193 #ifdef CONFIG_PACKET_MMAP
194         atomic_t                mapped;
195         unsigned int            pg_vec_order;
196         unsigned int            pg_vec_pages;
197         unsigned int            pg_vec_len;
198 #endif
199 };
200
201 struct packet_skb_cb {
202         unsigned int origlen;
203         union {
204                 struct sockaddr_pkt pkt;
205                 struct sockaddr_ll ll;
206         } sa;
207 };
208
209 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
210
211 #ifdef CONFIG_PACKET_MMAP
212
213 static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
214 {
215         unsigned int pg_vec_pos, frame_offset;
216
217         pg_vec_pos = position / po->frames_per_block;
218         frame_offset = position % po->frames_per_block;
219
220         return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
221 }
222 #endif
223
/*
 * Convert a generic socket pointer into the packet socket that contains it.
 * A plain cast is sufficient because struct sock is the first member of
 * struct packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        struct packet_sock *po = (struct packet_sock *)sk;

        return po;
}
228
229 static void packet_sock_destruct(struct sock *sk)
230 {
231         BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
232         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
233
234         if (!sock_flag(sk, SOCK_DEAD)) {
235                 printk("Attempt to release alive packet socket: %p\n", sk);
236                 return;
237         }
238
239         atomic_dec(&packet_socks_nr);
240 #ifdef PACKET_REFCNT_DEBUG
241         printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
242 #endif
243 }
244
245
/*
 * Operation tables, defined later in the file.  packet_create() installs
 * packet_ops by default and packet_ops_spkt for SOCK_PACKET sockets.
 */
246 static const struct proto_ops packet_ops;
247
248 static const struct proto_ops packet_ops_spkt;
249
250 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
251 {
252         struct sock *sk;
253         struct sockaddr_pkt *spkt;
254
255         /*
256          *      When we registered the protocol we saved the socket in the data
257          *      field for just this event.
258          */
259
260         sk = pt->af_packet_priv;
261
262         /*
263          *      Yank back the headers [hope the device set this
264          *      right or kerboom...]
265          *
266          *      Incoming packets have ll header pulled,
267          *      push it back.
268          *
269          *      For outgoing ones skb->data == skb_mac_header(skb)
270          *      so that this procedure is noop.
271          */
272
273         if (skb->pkt_type == PACKET_LOOPBACK)
274                 goto out;
275
276         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
277                 goto oom;
278
279         /* drop any routing info */
280         dst_release(skb->dst);
281         skb->dst = NULL;
282
283         /* drop conntrack reference */
284         nf_reset(skb);
285
286         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
287
288         skb_push(skb, skb->data - skb_mac_header(skb));
289
290         /*
291          *      The SOCK_PACKET socket receives _all_ frames.
292          */
293
294         spkt->spkt_family = dev->type;
295         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
296         spkt->spkt_protocol = skb->protocol;
297
298         /*
299          *      Charge the memory to the socket. This is done specifically
300          *      to prevent sockets using all the memory up.
301          */
302
303         if (sock_queue_rcv_skb(sk,skb) == 0)
304                 return 0;
305
306 out:
307         kfree_skb(skb);
308 oom:
309         return 0;
310 }
311
312
313 /*
314  *      Output a raw packet to a device layer. This bypasses all the other
315  *      protocol layers and you must therefore supply it with a complete frame
316  */
317
318 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
319                                struct msghdr *msg, size_t len)
320 {
321         struct sock *sk = sock->sk;
322         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
323         struct sk_buff *skb;
324         struct net_device *dev;
325         __be16 proto=0;
326         int err;
327
328         /*
329          *      Get and verify the address.
330          */
331
332         if (saddr)
333         {
334                 if (msg->msg_namelen < sizeof(struct sockaddr))
335                         return(-EINVAL);
336                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
337                         proto=saddr->spkt_protocol;
338         }
339         else
340                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
341
342         /*
343          *      Find the device first to size check it
344          */
345
346         saddr->spkt_device[13] = 0;
347         dev = dev_get_by_name(saddr->spkt_device);
348         err = -ENODEV;
349         if (dev == NULL)
350                 goto out_unlock;
351
352         err = -ENETDOWN;
353         if (!(dev->flags & IFF_UP))
354                 goto out_unlock;
355
356         /*
357          *      You may not queue a frame bigger than the mtu. This is the lowest level
358          *      raw protocol and you must do your own fragmentation at this level.
359          */
360
361         err = -EMSGSIZE;
362         if (len > dev->mtu + dev->hard_header_len)
363                 goto out_unlock;
364
365         err = -ENOBUFS;
366         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
367
368         /*
369          *      If the write buffer is full, then tough. At this level the user gets to
370          *      deal with the problem - do your own algorithmic backoffs. That's far
371          *      more flexible.
372          */
373
374         if (skb == NULL)
375                 goto out_unlock;
376
377         /*
378          *      Fill it in
379          */
380
381         /* FIXME: Save some space for broken drivers that write a
382          * hard header at transmission time by themselves. PPP is the
383          * notable one here. This should really be fixed at the driver level.
384          */
385         skb_reserve(skb, LL_RESERVED_SPACE(dev));
386         skb_reset_network_header(skb);
387
388         /* Try to align data part correctly */
389         if (dev->hard_header) {
390                 skb->data -= dev->hard_header_len;
391                 skb->tail -= dev->hard_header_len;
392                 if (len < dev->hard_header_len)
393                         skb_reset_network_header(skb);
394         }
395
396         /* Returns -EFAULT on error */
397         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
398         skb->protocol = proto;
399         skb->dev = dev;
400         skb->priority = sk->sk_priority;
401         if (err)
402                 goto out_free;
403
404         /*
405          *      Now send it
406          */
407
408         dev_queue_xmit(skb);
409         dev_put(dev);
410         return(len);
411
412 out_free:
413         kfree_skb(skb);
414 out_unlock:
415         if (dev)
416                 dev_put(dev);
417         return err;
418 }
419
420 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
421                                       unsigned int res)
422 {
423         struct sk_filter *filter;
424
425         rcu_read_lock_bh();
426         filter = rcu_dereference(sk->sk_filter);
427         if (filter != NULL)
428                 res = sk_run_filter(skb, filter->insns, filter->len);
429         rcu_read_unlock_bh();
430
431         return res;
432 }
433
434 /*
435    This function clones the skb lazily, in the hope that most packets
436    are discarded by BPF.
437
438    Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
439    and skb->cb are mangled. It works because (and as long as) packets
440    arriving here are owned by the current CPU. Output packets are cloned
441    by dev_queue_xmit_nit(), input packets are processed by net_bh
442    sequentially, so if we return the skb to its original state on exit,
443    we will not harm anyone.
444  */
445
/*
 * Receive hook for packet sockets that queue skbs on sk_receive_queue.
 * The socket is taken from pt->af_packet_priv.
 *
 * Adjusts skb->data according to the socket type, runs the socket filter,
 * checks the receive buffer limit, clones the skb if it is shared, records
 * a sockaddr_ll and the untrimmed length in skb->cb, trims the skb to the
 * snap length and queues it.
 *
 * Always returns 0.  Queued packets bump tp_packets.  Drops caused by a
 * full receive buffer, a failed clone or a failed trim bump tp_drops.
 * Loopback packets and packets rejected by the filter are dropped without
 * being counted.
 */
446 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
447 {
448         struct sock *sk;
449         struct sockaddr_ll *sll;
450         struct packet_sock *po;
            /* Saved so that a shared skb can be put back into the state it arrived in. */
451         u8 * skb_head = skb->data;
452         int skb_len = skb->len;
453         unsigned int snaplen, res;
454
455         if (skb->pkt_type == PACKET_LOOPBACK)
456                 goto drop;
457
458         sk = pt->af_packet_priv;
459         po = pkt_sk(sk);
460
461         skb->dev = dev;
462
463         if (dev->hard_header) {
464                 /* The device has an explicit notion of ll header,
465                    exported to higher levels.
466
467                    Otherwise, the device hides the details of its frame
468                    structure, so that the corresponding packet head is
469                    never delivered to the user.
470                  */
471                 if (sk->sk_type != SOCK_DGRAM)
472                         skb_push(skb, skb->data - skb_mac_header(skb));
473                 else if (skb->pkt_type == PACKET_OUTGOING) {
474                         /* Special case: outgoing packets have ll header at head */
475                         skb_pull(skb, skb_network_offset(skb));
476                 }
477         }
478
479         snaplen = skb->len;
480
            /* A zero filter result rejects the packet; a smaller one shortens the snapshot. */
481         res = run_filter(skb, sk, snaplen);
482         if (!res)
483                 goto drop_n_restore;
484         if (snaplen > res)
485                 snaplen = res;
486
            /* Refuse the packet if it would take the socket to or past sk_rcvbuf. */
487         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
488             (unsigned)sk->sk_rcvbuf)
489                 goto drop_n_acct;
490
            /*
             * A shared skb is cloned only now, after the filter and buffer
             * checks passed.  The original is restored and released; all
             * further work is done on the clone.
             */
491         if (skb_shared(skb)) {
492                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
493                 if (nskb == NULL)
494                         goto drop_n_acct;
495
496                 if (skb_head != skb->data) {
497                         skb->data = skb_head;
498                         skb->len = skb_len;
499                 }
500                 kfree_skb(skb);
501                 skb = nskb;
502         }
503
            /* Compile-time check that the cb layout plus a MAX_ADDR_LEN address fits in skb->cb. */
504         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
505                      sizeof(skb->cb));
506
507         sll = &PACKET_SKB_CB(skb)->sa.ll;
508         sll->sll_family = AF_PACKET;
509         sll->sll_hatype = dev->type;
510         sll->sll_protocol = skb->protocol;
511         sll->sll_pkttype = skb->pkt_type;
            /* With origdev set, PACKET_HOST frames report the original device's index. */
512         if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
513                 sll->sll_ifindex = orig_dev->ifindex;
514         else
515                 sll->sll_ifindex = dev->ifindex;
516         sll->sll_halen = 0;
517
518         if (dev->hard_header_parse)
519                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
520
            /* Remember the length before the skb is cut down to snaplen. */
521         PACKET_SKB_CB(skb)->origlen = skb->len;
522
523         if (pskb_trim(skb, snaplen))
524                 goto drop_n_acct;
525
526         skb_set_owner_r(skb, sk);
527         skb->dev = NULL;
528         dst_release(skb->dst);
529         skb->dst = NULL;
530
531         /* drop conntrack reference */
532         nf_reset(skb);
533
            /* The statistics are updated under the same lock as the queue itself. */
534         spin_lock(&sk->sk_receive_queue.lock);
535         po->stats.tp_packets++;
536         __skb_queue_tail(&sk->sk_receive_queue, skb);
537         spin_unlock(&sk->sk_receive_queue.lock);
538         sk->sk_data_ready(sk, skb->len);
539         return 0;
540
541 drop_n_acct:
542         spin_lock(&sk->sk_receive_queue.lock);
543         po->stats.tp_drops++;
544         spin_unlock(&sk->sk_receive_queue.lock);
545
546 drop_n_restore:
            /* Undo the data/len adjustment if other holders still reference this skb. */
547         if (skb_head != skb->data && skb_shared(skb)) {
548                 skb->data = skb_head;
549                 skb->len = skb_len;
550         }
551 drop:
552         kfree_skb(skb);
553         return 0;
554 }
555
556 #ifdef CONFIG_PACKET_MMAP
/*
 * Receive hook that copies packets into the socket's frame ring
 * (CONFIG_PACKET_MMAP).  The socket is taken from pt->af_packet_priv.
 *
 * After adjusting skb->data for the socket type and running the filter, the
 * packet (truncated to what fits in one frame) is copied into the frame at
 * po->head, followed by the tpacket_hdr fields and a sockaddr_ll.  The
 * frame's tp_status is written last.  If the packet does not fit and
 * copy_thresh is set, a reference or clone of the skb may additionally be
 * queued on sk_receive_queue.
 *
 * Always returns 0.  The skb passed in is always released here.  A frame
 * whose tp_status is still non-zero means the ring is full: tp_drops is
 * incremented and the packet is dropped.
 */
557 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
558 {
559         struct sock *sk;
560         struct packet_sock *po;
561         struct sockaddr_ll *sll;
562         struct tpacket_hdr *h;
            /* Saved so that a shared skb can be put back into the state it arrived in. */
563         u8 * skb_head = skb->data;
564         int skb_len = skb->len;
565         unsigned int snaplen, res;
            /* TP_STATUS_LOSING is cleared further down when no drops have been recorded. */
566         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
567         unsigned short macoff, netoff;
568         struct sk_buff *copy_skb = NULL;
569         struct timeval tv;
570
571         if (skb->pkt_type == PACKET_LOOPBACK)
572                 goto drop;
573
574         sk = pt->af_packet_priv;
575         po = pkt_sk(sk);
576
577         if (dev->hard_header) {
578                 if (sk->sk_type != SOCK_DGRAM)
579                         skb_push(skb, skb->data - skb_mac_header(skb));
580                 else if (skb->pkt_type == PACKET_OUTGOING) {
581                         /* Special case: outgoing packets have ll header at head */
582                         skb_pull(skb, skb_network_offset(skb));
583                 }
584         }
585
586         if (skb->ip_summed == CHECKSUM_PARTIAL)
587                 status |= TP_STATUS_CSUMNOTREADY;
588
589         snaplen = skb->len;
590
            /* A zero filter result rejects the packet; a smaller one shortens the snapshot. */
591         res = run_filter(skb, sk, snaplen);
592         if (!res)
593                 goto drop_n_restore;
594         if (snaplen > res)
595                 snaplen = res;
596
            /*
             * Work out where the packet goes inside the frame.  For
             * SOCK_DGRAM both offsets coincide; otherwise the network
             * header offset is aligned and the mac header sits maclen
             * bytes in front of it.
             */
597         if (sk->sk_type == SOCK_DGRAM) {
598                 macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
599         } else {
600                 unsigned maclen = skb_network_offset(skb);
601                 netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
602                 macoff = netoff - maclen;
603         }
604
            /*
             * The packet does not fit into one frame.  If copy_thresh is
             * set and the receive buffer has room, keep the skb (a clone
             * when it is shared, otherwise an extra reference) for the
             * receive queue, then cut the snapshot to the space available.
             */
605         if (macoff + snaplen > po->frame_size) {
606                 if (po->copy_thresh &&
607                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
608                     (unsigned)sk->sk_rcvbuf) {
609                         if (skb_shared(skb)) {
610                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
611                         } else {
612                                 copy_skb = skb_get(skb);
                                    /* Makes the restore test at drop_n_restore false for this skb. */
613                                 skb_head = skb->data;
614                         }
615                         if (copy_skb)
616                                 skb_set_owner_r(copy_skb, sk);
617                 }
618                 snaplen = po->frame_size - macoff;
619                 if ((int)snaplen < 0)
620                         snaplen = 0;
621         }
622
            /* Claim the frame at po->head; a non-zero tp_status means it is still in use. */
623         spin_lock(&sk->sk_receive_queue.lock);
624         h = packet_lookup_frame(po, po->head);
625
626         if (h->tp_status)
627                 goto ring_is_full;
628         po->head = po->head != po->frame_max ? po->head+1 : 0;
629         po->stats.tp_packets++;
630         if (copy_skb) {
631                 status |= TP_STATUS_COPY;
632                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
633         }
634         if (!po->stats.tp_drops)
635                 status &= ~TP_STATUS_LOSING;
636         spin_unlock(&sk->sk_receive_queue.lock);
637
638         skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
639
640         h->tp_len = skb->len;
641         h->tp_snaplen = snaplen;
642         h->tp_mac = macoff;
643         h->tp_net = netoff;
            /* Use the skb's own timestamp when it has one, otherwise the current time. */
644         if (skb->tstamp.tv64)
645                 tv = ktime_to_timeval(skb->tstamp);
646         else
647                 do_gettimeofday(&tv);
648         h->tp_sec = tv.tv_sec;
649         h->tp_usec = tv.tv_usec;
650
            /* The sockaddr_ll is stored directly after the aligned tpacket_hdr. */
651         sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
652         sll->sll_halen = 0;
653         if (dev->hard_header_parse)
654                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
655         sll->sll_family = AF_PACKET;
656         sll->sll_hatype = dev->type;
657         sll->sll_protocol = skb->protocol;
658         sll->sll_pkttype = skb->pkt_type;
659         if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
660                 sll->sll_ifindex = orig_dev->ifindex;
661         else
662                 sll->sll_ifindex = dev->ifindex;
663
            /* tp_status is written only after every other frame field, then a full barrier. */
664         h->tp_status = status;
665         smp_mb();
666
            /* Flush the data cache for every page touched, from the header to the last copied byte. */
667         {
668                 struct page *p_start, *p_end;
669                 u8 *h_end = (u8 *)h + macoff + snaplen - 1;
670
671                 p_start = virt_to_page(h);
672                 p_end = virt_to_page(h_end);
673                 while (p_start <= p_end) {
674                         flush_dcache_page(p_start);
675                         p_start++;
676                 }
677         }
678
679         sk->sk_data_ready(sk, 0);
680
            /* The success path falls through: the original skb is released in every case. */
681 drop_n_restore:
682         if (skb_head != skb->data && skb_shared(skb)) {
683                 skb->data = skb_head;
684                 skb->len = skb_len;
685         }
686 drop:
687         kfree_skb(skb);
688         return 0;
689
690 ring_is_full:
            /* Entered with sk_receive_queue.lock held. */
691         po->stats.tp_drops++;
692         spin_unlock(&sk->sk_receive_queue.lock);
693
694         sk->sk_data_ready(sk, 0);
695         if (copy_skb)
696                 kfree_skb(copy_skb);
697         goto drop_n_restore;
698 }
699
700 #endif
701
702
703 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
704                           struct msghdr *msg, size_t len)
705 {
706         struct sock *sk = sock->sk;
707         struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
708         struct sk_buff *skb;
709         struct net_device *dev;
710         __be16 proto;
711         unsigned char *addr;
712         int ifindex, err, reserve = 0;
713
714         /*
715          *      Get and verify the address.
716          */
717
718         if (saddr == NULL) {
719                 struct packet_sock *po = pkt_sk(sk);
720
721                 ifindex = po->ifindex;
722                 proto   = po->num;
723                 addr    = NULL;
724         } else {
725                 err = -EINVAL;
726                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
727                         goto out;
728                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
729                         goto out;
730                 ifindex = saddr->sll_ifindex;
731                 proto   = saddr->sll_protocol;
732                 addr    = saddr->sll_addr;
733         }
734
735
736         dev = dev_get_by_index(ifindex);
737         err = -ENXIO;
738         if (dev == NULL)
739                 goto out_unlock;
740         if (sock->type == SOCK_RAW)
741                 reserve = dev->hard_header_len;
742
743         err = -ENETDOWN;
744         if (!(dev->flags & IFF_UP))
745                 goto out_unlock;
746
747         err = -EMSGSIZE;
748         if (len > dev->mtu+reserve)
749                 goto out_unlock;
750
751         skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
752                                 msg->msg_flags & MSG_DONTWAIT, &err);
753         if (skb==NULL)
754                 goto out_unlock;
755
756         skb_reserve(skb, LL_RESERVED_SPACE(dev));
757         skb_reset_network_header(skb);
758
759         if (dev->hard_header) {
760                 int res;
761                 err = -EINVAL;
762                 res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
763                 if (sock->type != SOCK_DGRAM) {
764                         skb_reset_tail_pointer(skb);
765                         skb->len = 0;
766                 } else if (res < 0)
767                         goto out_free;
768         }
769
770         /* Returns -EFAULT on error */
771         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
772         if (err)
773                 goto out_free;
774
775         skb->protocol = proto;
776         skb->dev = dev;
777         skb->priority = sk->sk_priority;
778
779         /*
780          *      Now send it
781          */
782
783         err = dev_queue_xmit(skb);
784         if (err > 0 && (err = net_xmit_errno(err)) != 0)
785                 goto out_unlock;
786
787         dev_put(dev);
788
789         return(len);
790
791 out_free:
792         kfree_skb(skb);
793 out_unlock:
794         if (dev)
795                 dev_put(dev);
796 out:
797         return err;
798 }
799
800 /*
801  *      Close a PACKET socket. This is fairly simple. We immediately go
802  *      to 'closed' state and remove our protocol entry in the device list.
803  */
804
805 static int packet_release(struct socket *sock)
806 {
807         struct sock *sk = sock->sk;
808         struct packet_sock *po;
809
810         if (!sk)
811                 return 0;
812
813         po = pkt_sk(sk);
814
815         write_lock_bh(&packet_sklist_lock);
816         sk_del_node_init(sk);
817         write_unlock_bh(&packet_sklist_lock);
818
819         /*
820          *      Unhook packet receive handler.
821          */
822
823         if (po->running) {
824                 /*
825                  *      Remove the protocol hook
826                  */
827                 dev_remove_pack(&po->prot_hook);
828                 po->running = 0;
829                 po->num = 0;
830                 __sock_put(sk);
831         }
832
833         packet_flush_mclist(sk);
834
835 #ifdef CONFIG_PACKET_MMAP
836         if (po->pg_vec) {
837                 struct tpacket_req req;
838                 memset(&req, 0, sizeof(req));
839                 packet_set_ring(sk, &req, 1);
840         }
841 #endif
842
843         /*
844          *      Now the socket is dead. No more input will appear.
845          */
846
847         sock_orphan(sk);
848         sock->sk = NULL;
849
850         /* Purge queues */
851
852         skb_queue_purge(&sk->sk_receive_queue);
853
854         sock_put(sk);
855         return 0;
856 }
857
858 /*
859  *      Attach a packet hook.
860  */
861
862 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
863 {
864         struct packet_sock *po = pkt_sk(sk);
865         /*
866          *      Detach an existing hook if present.
867          */
868
869         lock_sock(sk);
870
871         spin_lock(&po->bind_lock);
872         if (po->running) {
873                 __sock_put(sk);
874                 po->running = 0;
875                 po->num = 0;
876                 spin_unlock(&po->bind_lock);
877                 dev_remove_pack(&po->prot_hook);
878                 spin_lock(&po->bind_lock);
879         }
880
881         po->num = protocol;
882         po->prot_hook.type = protocol;
883         po->prot_hook.dev = dev;
884
885         po->ifindex = dev ? dev->ifindex : 0;
886
887         if (protocol == 0)
888                 goto out_unlock;
889
890         if (dev) {
891                 if (dev->flags&IFF_UP) {
892                         dev_add_pack(&po->prot_hook);
893                         sock_hold(sk);
894                         po->running = 1;
895                 } else {
896                         sk->sk_err = ENETDOWN;
897                         if (!sock_flag(sk, SOCK_DEAD))
898                                 sk->sk_error_report(sk);
899                 }
900         } else {
901                 dev_add_pack(&po->prot_hook);
902                 sock_hold(sk);
903                 po->running = 1;
904         }
905
906 out_unlock:
907         spin_unlock(&po->bind_lock);
908         release_sock(sk);
909         return 0;
910 }
911
912 /*
913  *      Bind a packet socket to a device
914  */
915
916 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
917 {
918         struct sock *sk=sock->sk;
919         char name[15];
920         struct net_device *dev;
921         int err = -ENODEV;
922
923         /*
924          *      Check legality
925          */
926
927         if (addr_len != sizeof(struct sockaddr))
928                 return -EINVAL;
929         strlcpy(name,uaddr->sa_data,sizeof(name));
930
931         dev = dev_get_by_name(name);
932         if (dev) {
933                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
934                 dev_put(dev);
935         }
936         return err;
937 }
938
939 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
940 {
941         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
942         struct sock *sk=sock->sk;
943         struct net_device *dev = NULL;
944         int err;
945
946
947         /*
948          *      Check legality
949          */
950
951         if (addr_len < sizeof(struct sockaddr_ll))
952                 return -EINVAL;
953         if (sll->sll_family != AF_PACKET)
954                 return -EINVAL;
955
956         if (sll->sll_ifindex) {
957                 err = -ENODEV;
958                 dev = dev_get_by_index(sll->sll_ifindex);
959                 if (dev == NULL)
960                         goto out;
961         }
962         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
963         if (dev)
964                 dev_put(dev);
965
966 out:
967         return err;
968 }
969
/*
 *	Protocol descriptor handed to sk_alloc() by packet_create().
 *	obj_size is the size of struct packet_sock.
 */
970 static struct proto packet_proto = {
971         .name     = "PACKET",
972         .owner    = THIS_MODULE,
973         .obj_size = sizeof(struct packet_sock),
974 };
975
976 /*
977  *      Create a packet of type SOCK_PACKET.
978  */
979
980 static int packet_create(struct net *net, struct socket *sock, int protocol)
981 {
982         struct sock *sk;
983         struct packet_sock *po;
984         __be16 proto = (__force __be16)protocol; /* weird, but documented */
985         int err;
986
987         if (net != &init_net)
988                 return -EAFNOSUPPORT;
989
990         if (!capable(CAP_NET_RAW))
991                 return -EPERM;
992         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
993             sock->type != SOCK_PACKET)
994                 return -ESOCKTNOSUPPORT;
995
996         sock->state = SS_UNCONNECTED;
997
998         err = -ENOBUFS;
999         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, 1);
1000         if (sk == NULL)
1001                 goto out;
1002
1003         sock->ops = &packet_ops;
1004         if (sock->type == SOCK_PACKET)
1005                 sock->ops = &packet_ops_spkt;
1006
1007         sock_init_data(sock, sk);
1008
1009         po = pkt_sk(sk);
1010         sk->sk_family = PF_PACKET;
1011         po->num = proto;
1012
1013         sk->sk_destruct = packet_sock_destruct;
1014         atomic_inc(&packet_socks_nr);
1015
1016         /*
1017          *      Attach a protocol block
1018          */
1019
1020         spin_lock_init(&po->bind_lock);
1021         po->prot_hook.func = packet_rcv;
1022
1023         if (sock->type == SOCK_PACKET)
1024                 po->prot_hook.func = packet_rcv_spkt;
1025
1026         po->prot_hook.af_packet_priv = sk;
1027
1028         if (proto) {
1029                 po->prot_hook.type = proto;
1030                 dev_add_pack(&po->prot_hook);
1031                 sock_hold(sk);
1032                 po->running = 1;
1033         }
1034
1035         write_lock_bh(&packet_sklist_lock);
1036         sk_add_node(sk, &packet_sklist);
1037         write_unlock_bh(&packet_sklist_lock);
1038         return(0);
1039 out:
1040         return err;
1041 }
1042
1043 /*
1044  *      Pull a packet from our receive queue and hand it to the user.
1045  *      If necessary we block.
1046  */
1047
1048 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1049                           struct msghdr *msg, size_t len, int flags)
1050 {
1051         struct sock *sk = sock->sk;
1052         struct sk_buff *skb;
1053         int copied, err;
1054         struct sockaddr_ll *sll;
1055
1056         err = -EINVAL;
1057         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1058                 goto out;
1059
1060 #if 0
1061         /* What error should we return now? EUNATTACH? */
1062         if (pkt_sk(sk)->ifindex < 0)
1063                 return -ENODEV;
1064 #endif
1065
1066         /*
1067          *      Call the generic datagram receiver. This handles all sorts
1068          *      of horrible races and re-entrancy so we can forget about it
1069          *      in the protocol layers.
1070          *
1071          *      Now it will return ENETDOWN, if device have just gone down,
1072          *      but then it will block.
1073          */
1074
1075         skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1076
1077         /*
1078          *      An error occurred so return it. Because skb_recv_datagram()
1079          *      handles the blocking we don't see and worry about blocking
1080          *      retries.
1081          */
1082
1083         if (skb == NULL)
1084                 goto out;
1085
1086         /*
1087          *      If the address length field is there to be filled in, we fill
1088          *      it in now.
1089          */
1090
1091         sll = &PACKET_SKB_CB(skb)->sa.ll;
1092         if (sock->type == SOCK_PACKET)
1093                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1094         else
1095                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1096
1097         /*
1098          *      You lose any data beyond the buffer you gave. If it worries a
1099          *      user program they can ask the device for its MTU anyway.
1100          */
1101
1102         copied = skb->len;
1103         if (copied > len)
1104         {
1105                 copied=len;
1106                 msg->msg_flags|=MSG_TRUNC;
1107         }
1108
1109         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1110         if (err)
1111                 goto out_free;
1112
1113         sock_recv_timestamp(msg, sk, skb);
1114
1115         if (msg->msg_name)
1116                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1117                        msg->msg_namelen);
1118
1119         if (pkt_sk(sk)->auxdata) {
1120                 struct tpacket_auxdata aux;
1121
1122                 aux.tp_status = TP_STATUS_USER;
1123                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1124                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1125                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1126                 aux.tp_snaplen = skb->len;
1127                 aux.tp_mac = 0;
1128                 aux.tp_net = skb_network_offset(skb);
1129
1130                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1131         }
1132
1133         /*
1134          *      Free or return the buffer as appropriate. Again this
1135          *      hides all the races and re-entrancy issues from us.
1136          */
1137         err = (flags&MSG_TRUNC) ? skb->len : copied;
1138
1139 out_free:
1140         skb_free_datagram(sk, skb);
1141 out:
1142         return err;
1143 }
1144
1145 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1146                                int *uaddr_len, int peer)
1147 {
1148         struct net_device *dev;
1149         struct sock *sk = sock->sk;
1150
1151         if (peer)
1152                 return -EOPNOTSUPP;
1153
1154         uaddr->sa_family = AF_PACKET;
1155         dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1156         if (dev) {
1157                 strlcpy(uaddr->sa_data, dev->name, 15);
1158                 dev_put(dev);
1159         } else
1160                 memset(uaddr->sa_data, 0, 14);
1161         *uaddr_len = sizeof(*uaddr);
1162
1163         return 0;
1164 }
1165
1166 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1167                           int *uaddr_len, int peer)
1168 {
1169         struct net_device *dev;
1170         struct sock *sk = sock->sk;
1171         struct packet_sock *po = pkt_sk(sk);
1172         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1173
1174         if (peer)
1175                 return -EOPNOTSUPP;
1176
1177         sll->sll_family = AF_PACKET;
1178         sll->sll_ifindex = po->ifindex;
1179         sll->sll_protocol = po->num;
1180         dev = dev_get_by_index(po->ifindex);
1181         if (dev) {
1182                 sll->sll_hatype = dev->type;
1183                 sll->sll_halen = dev->addr_len;
1184                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1185                 dev_put(dev);
1186         } else {
1187                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1188                 sll->sll_halen = 0;
1189         }
1190         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1191
1192         return 0;
1193 }
1194
1195 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1196 {
1197         switch (i->type) {
1198         case PACKET_MR_MULTICAST:
1199                 if (what > 0)
1200                         dev_mc_add(dev, i->addr, i->alen, 0);
1201                 else
1202                         dev_mc_delete(dev, i->addr, i->alen, 0);
1203                 break;
1204         case PACKET_MR_PROMISC:
1205                 dev_set_promiscuity(dev, what);
1206                 break;
1207         case PACKET_MR_ALLMULTI:
1208                 dev_set_allmulti(dev, what);
1209                 break;
1210         default:;
1211         }
1212 }
1213
1214 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1215 {
1216         for ( ; i; i=i->next) {
1217                 if (i->ifindex == dev->ifindex)
1218                         packet_dev_mc(dev, i, what);
1219         }
1220 }
1221
1222 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1223 {
1224         struct packet_sock *po = pkt_sk(sk);
1225         struct packet_mclist *ml, *i;
1226         struct net_device *dev;
1227         int err;
1228
1229         rtnl_lock();
1230
1231         err = -ENODEV;
1232         dev = __dev_get_by_index(mreq->mr_ifindex);
1233         if (!dev)
1234                 goto done;
1235
1236         err = -EINVAL;
1237         if (mreq->mr_alen > dev->addr_len)
1238                 goto done;
1239
1240         err = -ENOBUFS;
1241         i = kmalloc(sizeof(*i), GFP_KERNEL);
1242         if (i == NULL)
1243                 goto done;
1244
1245         err = 0;
1246         for (ml = po->mclist; ml; ml = ml->next) {
1247                 if (ml->ifindex == mreq->mr_ifindex &&
1248                     ml->type == mreq->mr_type &&
1249                     ml->alen == mreq->mr_alen &&
1250                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1251                         ml->count++;
1252                         /* Free the new element ... */
1253                         kfree(i);
1254                         goto done;
1255                 }
1256         }
1257
1258         i->type = mreq->mr_type;
1259         i->ifindex = mreq->mr_ifindex;
1260         i->alen = mreq->mr_alen;
1261         memcpy(i->addr, mreq->mr_address, i->alen);
1262         i->count = 1;
1263         i->next = po->mclist;
1264         po->mclist = i;
1265         packet_dev_mc(dev, i, +1);
1266
1267 done:
1268         rtnl_unlock();
1269         return err;
1270 }
1271
1272 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1273 {
1274         struct packet_mclist *ml, **mlp;
1275
1276         rtnl_lock();
1277
1278         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1279                 if (ml->ifindex == mreq->mr_ifindex &&
1280                     ml->type == mreq->mr_type &&
1281                     ml->alen == mreq->mr_alen &&
1282                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1283                         if (--ml->count == 0) {
1284                                 struct net_device *dev;
1285                                 *mlp = ml->next;
1286                                 dev = dev_get_by_index(ml->ifindex);
1287                                 if (dev) {
1288                                         packet_dev_mc(dev, ml, -1);
1289                                         dev_put(dev);
1290                                 }
1291                                 kfree(ml);
1292                         }
1293                         rtnl_unlock();
1294                         return 0;
1295                 }
1296         }
1297         rtnl_unlock();
1298         return -EADDRNOTAVAIL;
1299 }
1300
1301 static void packet_flush_mclist(struct sock *sk)
1302 {
1303         struct packet_sock *po = pkt_sk(sk);
1304         struct packet_mclist *ml;
1305
1306         if (!po->mclist)
1307                 return;
1308
1309         rtnl_lock();
1310         while ((ml = po->mclist) != NULL) {
1311                 struct net_device *dev;
1312
1313                 po->mclist = ml->next;
1314                 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1315                         packet_dev_mc(dev, ml, -1);
1316                         dev_put(dev);
1317                 }
1318                 kfree(ml);
1319         }
1320         rtnl_unlock();
1321 }
1322
1323 static int
1324 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1325 {
1326         struct sock *sk = sock->sk;
1327         struct packet_sock *po = pkt_sk(sk);
1328         int ret;
1329
1330         if (level != SOL_PACKET)
1331                 return -ENOPROTOOPT;
1332
1333         switch(optname) {
1334         case PACKET_ADD_MEMBERSHIP:
1335         case PACKET_DROP_MEMBERSHIP:
1336         {
1337                 struct packet_mreq_max mreq;
1338                 int len = optlen;
1339                 memset(&mreq, 0, sizeof(mreq));
1340                 if (len < sizeof(struct packet_mreq))
1341                         return -EINVAL;
1342                 if (len > sizeof(mreq))
1343                         len = sizeof(mreq);
1344                 if (copy_from_user(&mreq,optval,len))
1345                         return -EFAULT;
1346                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1347                         return -EINVAL;
1348                 if (optname == PACKET_ADD_MEMBERSHIP)
1349                         ret = packet_mc_add(sk, &mreq);
1350                 else
1351                         ret = packet_mc_drop(sk, &mreq);
1352                 return ret;
1353         }
1354
1355 #ifdef CONFIG_PACKET_MMAP
1356         case PACKET_RX_RING:
1357         {
1358                 struct tpacket_req req;
1359
1360                 if (optlen<sizeof(req))
1361                         return -EINVAL;
1362                 if (copy_from_user(&req,optval,sizeof(req)))
1363                         return -EFAULT;
1364                 return packet_set_ring(sk, &req, 0);
1365         }
1366         case PACKET_COPY_THRESH:
1367         {
1368                 int val;
1369
1370                 if (optlen!=sizeof(val))
1371                         return -EINVAL;
1372                 if (copy_from_user(&val,optval,sizeof(val)))
1373                         return -EFAULT;
1374
1375                 pkt_sk(sk)->copy_thresh = val;
1376                 return 0;
1377         }
1378 #endif
1379         case PACKET_AUXDATA:
1380         {
1381                 int val;
1382
1383                 if (optlen < sizeof(val))
1384                         return -EINVAL;
1385                 if (copy_from_user(&val, optval, sizeof(val)))
1386                         return -EFAULT;
1387
1388                 po->auxdata = !!val;
1389                 return 0;
1390         }
1391         case PACKET_ORIGDEV:
1392         {
1393                 int val;
1394
1395                 if (optlen < sizeof(val))
1396                         return -EINVAL;
1397                 if (copy_from_user(&val, optval, sizeof(val)))
1398                         return -EFAULT;
1399
1400                 po->origdev = !!val;
1401                 return 0;
1402         }
1403         default:
1404                 return -ENOPROTOOPT;
1405         }
1406 }
1407
1408 static int packet_getsockopt(struct socket *sock, int level, int optname,
1409                              char __user *optval, int __user *optlen)
1410 {
1411         int len;
1412         int val;
1413         struct sock *sk = sock->sk;
1414         struct packet_sock *po = pkt_sk(sk);
1415         void *data;
1416         struct tpacket_stats st;
1417
1418         if (level != SOL_PACKET)
1419                 return -ENOPROTOOPT;
1420
1421         if (get_user(len, optlen))
1422                 return -EFAULT;
1423
1424         if (len < 0)
1425                 return -EINVAL;
1426
1427         switch(optname) {
1428         case PACKET_STATISTICS:
1429                 if (len > sizeof(struct tpacket_stats))
1430                         len = sizeof(struct tpacket_stats);
1431                 spin_lock_bh(&sk->sk_receive_queue.lock);
1432                 st = po->stats;
1433                 memset(&po->stats, 0, sizeof(st));
1434                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1435                 st.tp_packets += st.tp_drops;
1436
1437                 data = &st;
1438                 break;
1439         case PACKET_AUXDATA:
1440                 if (len > sizeof(int))
1441                         len = sizeof(int);
1442                 val = po->auxdata;
1443
1444                 data = &val;
1445                 break;
1446         case PACKET_ORIGDEV:
1447                 if (len > sizeof(int))
1448                         len = sizeof(int);
1449                 val = po->origdev;
1450
1451                 data = &val;
1452                 break;
1453         default:
1454                 return -ENOPROTOOPT;
1455         }
1456
1457         if (put_user(len, optlen))
1458                 return -EFAULT;
1459         if (copy_to_user(optval, data, len))
1460                 return -EFAULT;
1461         return 0;
1462 }
1463
1464
/*
 *	Network device event callback.
 *
 *	Walks every socket on packet_sklist under the read side of
 *	packet_sklist_lock and reacts to the event for the device passed
 *	in data:
 *
 *	NETDEV_UNREGISTER  withdraws the socket's memberships on that
 *	                   device, then continues into the NETDEV_DOWN
 *	                   handling.
 *	NETDEV_DOWN        for a socket bound to the device: unhooks a
 *	                   running socket, drops the reference that was
 *	                   taken when the hook was added, and reports
 *	                   ENETDOWN.  On unregister the binding itself is
 *	                   also cleared (ifindex = -1, no hook device).
 *	NETDEV_UP          re-adds the hook of a socket bound to the
 *	                   device that has a protocol set and is not
 *	                   running.
 *
 *	Always returns NOTIFY_DONE.
 */
1465 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1466 {
1467         struct sock *sk;
1468         struct hlist_node *node;
1469         struct net_device *dev = data;
1470
1471         read_lock(&packet_sklist_lock);
1472         sk_for_each(sk, node, &packet_sklist) {
1473                 struct packet_sock *po = pkt_sk(sk);
1474
1475                 switch (msg) {
1476                 case NETDEV_UNREGISTER:
1477                         if (po->mclist)
1478                                 packet_dev_mclist(dev, po->mclist, -1);
1479                         /* fallthrough */
1480
1481                 case NETDEV_DOWN:
1482                         if (dev->ifindex == po->ifindex) {
                                /* bind_lock protects running, ifindex and the hook */
1483                                 spin_lock(&po->bind_lock);
1484                                 if (po->running) {
1485                                         __dev_remove_pack(&po->prot_hook);
1486                                         __sock_put(sk);
1487                                         po->running = 0;
1488                                         sk->sk_err = ENETDOWN;
1489                                         if (!sock_flag(sk, SOCK_DEAD))
1490                                                 sk->sk_error_report(sk);
1491                                 }
                                /* Device is going away for good: forget the binding */
1492                                 if (msg == NETDEV_UNREGISTER) {
1493                                         po->ifindex = -1;
1494                                         po->prot_hook.dev = NULL;
1495                                 }
1496                                 spin_unlock(&po->bind_lock);
1497                         }
1498                         break;
1499                 case NETDEV_UP:
1500                         spin_lock(&po->bind_lock);
1501                         if (dev->ifindex == po->ifindex && po->num &&
1502                             !po->running) {
1503                                 dev_add_pack(&po->prot_hook);
1504                                 sock_hold(sk);
1505                                 po->running = 1;
1506                         }
1507                         spin_unlock(&po->bind_lock);
1508                         break;
1509                 }
1510         }
1511         read_unlock(&packet_sklist_lock);
1512         return NOTIFY_DONE;
1513 }
1514
1515
1516 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1517                         unsigned long arg)
1518 {
1519         struct sock *sk = sock->sk;
1520
1521         switch(cmd) {
1522                 case SIOCOUTQ:
1523                 {
1524                         int amount = atomic_read(&sk->sk_wmem_alloc);
1525                         return put_user(amount, (int __user *)arg);
1526                 }
1527                 case SIOCINQ:
1528                 {
1529                         struct sk_buff *skb;
1530                         int amount = 0;
1531
1532                         spin_lock_bh(&sk->sk_receive_queue.lock);
1533                         skb = skb_peek(&sk->sk_receive_queue);
1534                         if (skb)
1535                                 amount = skb->len;
1536                         spin_unlock_bh(&sk->sk_receive_queue.lock);
1537                         return put_user(amount, (int __user *)arg);
1538                 }
1539                 case SIOCGSTAMP:
1540                         return sock_get_timestamp(sk, (struct timeval __user *)arg);
1541                 case SIOCGSTAMPNS:
1542                         return sock_get_timestampns(sk, (struct timespec __user *)arg);
1543
1544 #ifdef CONFIG_INET
1545                 case SIOCADDRT:
1546                 case SIOCDELRT:
1547                 case SIOCDARP:
1548                 case SIOCGARP:
1549                 case SIOCSARP:
1550                 case SIOCGIFADDR:
1551                 case SIOCSIFADDR:
1552                 case SIOCGIFBRDADDR:
1553                 case SIOCSIFBRDADDR:
1554                 case SIOCGIFNETMASK:
1555                 case SIOCSIFNETMASK:
1556                 case SIOCGIFDSTADDR:
1557                 case SIOCSIFDSTADDR:
1558                 case SIOCSIFFLAGS:
1559                         return inet_dgram_ops.ioctl(sock, cmd, arg);
1560 #endif
1561
1562                 default:
1563                         return -ENOIOCTLCMD;
1564         }
1565         return 0;
1566 }
1567
1568 #ifndef CONFIG_PACKET_MMAP
1569 #define packet_mmap sock_no_mmap
1570 #define packet_poll datagram_poll
1571 #else
1572
1573 static unsigned int packet_poll(struct file * file, struct socket *sock,
1574                                 poll_table *wait)
1575 {
1576         struct sock *sk = sock->sk;
1577         struct packet_sock *po = pkt_sk(sk);
1578         unsigned int mask = datagram_poll(file, sock, wait);
1579
1580         spin_lock_bh(&sk->sk_receive_queue.lock);
1581         if (po->pg_vec) {
1582                 unsigned last = po->head ? po->head-1 : po->frame_max;
1583                 struct tpacket_hdr *h;
1584
1585                 h = packet_lookup_frame(po, last);
1586
1587                 if (h->tp_status)
1588                         mask |= POLLIN | POLLRDNORM;
1589         }
1590         spin_unlock_bh(&sk->sk_receive_queue.lock);
1591         return mask;
1592 }
1593
1594
1595 /* Dirty? Well, I have not yet found a better way to account
1596  * for user mmaps.
1597  */
1598
1599 static void packet_mm_open(struct vm_area_struct *vma)
1600 {
1601         struct file *file = vma->vm_file;
1602         struct socket * sock = file->private_data;
1603         struct sock *sk = sock->sk;
1604
1605         if (sk)
1606                 atomic_inc(&pkt_sk(sk)->mapped);
1607 }
1608
1609 static void packet_mm_close(struct vm_area_struct *vma)
1610 {
1611         struct file *file = vma->vm_file;
1612         struct socket * sock = file->private_data;
1613         struct sock *sk = sock->sk;
1614
1615         if (sk)
1616                 atomic_dec(&pkt_sk(sk)->mapped);
1617 }
1618
/*
 *	VMA callbacks installed by packet_mmap(); they keep the socket's
 *	count of ring mappings up to date.
 */
1619 static struct vm_operations_struct packet_mmap_ops = {
1620         .open = packet_mm_open,
1621         .close =packet_mm_close,
1622 };
1623
1624 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1625 {
1626         return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1627 }
1628
/*
 *	Release a block vector: every non-NULL block among the first len
 *	slots is freed with the given page order, then the vector itself.
 *	NULL slots are skipped, so a partially filled vector is fine.
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (unlikely(!pg_vec[i]))
			continue;
		free_pages((unsigned long) pg_vec[i], order);
	}
	kfree(pg_vec);
}
1639
1640 static inline char *alloc_one_pg_vec_page(unsigned long order)
1641 {
1642         return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1643                                          order);
1644 }
1645
1646 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1647 {
1648         unsigned int block_nr = req->tp_block_nr;
1649         char **pg_vec;
1650         int i;
1651
1652         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1653         if (unlikely(!pg_vec))
1654                 goto out;
1655
1656         for (i = 0; i < block_nr; i++) {
1657                 pg_vec[i] = alloc_one_pg_vec_page(order);
1658                 if (unlikely(!pg_vec[i]))
1659                         goto out_free_pgvec;
1660         }
1661
1662 out:
1663         return pg_vec;
1664
1665 out_free_pgvec:
1666         free_pg_vec(pg_vec, order, block_nr);
1667         pg_vec = NULL;
1668         goto out;
1669 }
1670
/*
 *	Install, replace or tear down the receive ring of a socket.
 *
 *	A request with a non-zero tp_block_nr is validated and a new block
 *	vector is allocated, with every frame header marked
 *	TP_STATUS_KERNEL.  A request with tp_block_nr == 0 must also have
 *	tp_frame_nr == 0 and removes the ring.
 *
 *	The socket is detached from the network while the ring is
 *	exchanged and re-attached afterwards if it was running.  The
 *	exchange only happens when closing is set or when no user mapping
 *	exists; otherwise -EBUSY is returned and the newly allocated
 *	vector is freed again.  After a successful exchange the local
 *	variables hold the previous vector, order and length, which is
 *	what gets freed at the end.
 *
 *	Returns 0 on success, -EBUSY when a ring is already installed or
 *	still mapped, -EINVAL for an inconsistent request and -ENOMEM when
 *	the vector cannot be allocated.
 */
1671 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1672 {
1673         char **pg_vec = NULL;
1674         struct packet_sock *po = pkt_sk(sk);
1675         int was_running, order = 0;
1676         __be16 num;
1677         int err = 0;
1678
1679         if (req->tp_block_nr) {
                /* NOTE(review): l is assigned below but never read. */
1680                 int i, l;
1681
1682                 /* Sanity tests and some calculations */
1683
1684                 if (unlikely(po->pg_vec))
1685                         return -EBUSY;
1686
1687                 if (unlikely((int)req->tp_block_size <= 0))
1688                         return -EINVAL;
1689                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1690                         return -EINVAL;
1691                 if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
1692                         return -EINVAL;
1693                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1694                         return -EINVAL;
1695
1696                 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1697                 if (unlikely(po->frames_per_block <= 0))
1698                         return -EINVAL;
1699                 if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1700                              req->tp_frame_nr))
1701                         return -EINVAL;
1702
1703                 err = -ENOMEM;
1704                 order = get_order(req->tp_block_size);
1705                 pg_vec = alloc_pg_vec(req, order);
1706                 if (unlikely(!pg_vec))
1707                         goto out;
1708
                /* Mark every frame of every block as owned by the kernel. */
1709                 l = 0;
1710                 for (i = 0; i < req->tp_block_nr; i++) {
1711                         char *ptr = pg_vec[i];
1712                         struct tpacket_hdr *header;
1713                         int k;
1714
1715                         for (k = 0; k < po->frames_per_block; k++) {
1716                                 header = (struct tpacket_hdr *) ptr;
1717                                 header->tp_status = TP_STATUS_KERNEL;
1718                                 ptr += req->tp_frame_size;
1719                         }
1720                 }
1721                 /* Done */
1722         } else {
1723                 if (unlikely(req->tp_frame_nr))
1724                         return -EINVAL;
1725         }
1726
1727         lock_sock(sk);
1728
1729         /* Detach socket from network */
1730         spin_lock(&po->bind_lock);
1731         was_running = po->running;
1732         num = po->num;
1733         if (was_running) {
1734                 __dev_remove_pack(&po->prot_hook);
1735                 po->num = 0;
1736                 po->running = 0;
1737                 __sock_put(sk);
1738         }
1739         spin_unlock(&po->bind_lock);
1740
1741         synchronize_net();
1742
1743         err = -EBUSY;
1744         if (closing || atomic_read(&po->mapped) == 0) {
1745                 err = 0;
        /* XC(a, b): store b in a and yield the previous value of a. */
1746 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1747
1748                 spin_lock_bh(&sk->sk_receive_queue.lock);
1749                 pg_vec = XC(po->pg_vec, pg_vec);
1750                 po->frame_max = (req->tp_frame_nr - 1);
1751                 po->head = 0;
1752                 po->frame_size = req->tp_frame_size;
1753                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1754
1755                 order = XC(po->pg_vec_order, order);
1756                 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1757
1758                 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
                /* Ring installed: receive via tpacket_rcv, else packet_rcv. */
1759                 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1760                 skb_queue_purge(&sk->sk_receive_queue);
1761 #undef XC
1762                 if (atomic_read(&po->mapped))
1763                         printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1764         }
1765
        /* Re-attach the socket if it was running before the exchange. */
1766         spin_lock(&po->bind_lock);
1767         if (was_running && !po->running) {
1768                 sock_hold(sk);
1769                 po->running = 1;
1770                 po->num = num;
1771                 dev_add_pack(&po->prot_hook);
1772         }
1773         spin_unlock(&po->bind_lock);
1774
1775         release_sock(sk);
1776
        /* Previous ring after an exchange, or the unused new one on -EBUSY. */
1777         if (pg_vec)
1778                 free_pg_vec(pg_vec, order, req->tp_block_nr);
1779 out:
1780         return err;
1781 }
1782
1783 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1784 {
1785         struct sock *sk = sock->sk;
1786         struct packet_sock *po = pkt_sk(sk);
1787         unsigned long size;
1788         unsigned long start;
1789         int err = -EINVAL;
1790         int i;
1791
1792         if (vma->vm_pgoff)
1793                 return -EINVAL;
1794
1795         size = vma->vm_end - vma->vm_start;
1796
1797         lock_sock(sk);
1798         if (po->pg_vec == NULL)
1799                 goto out;
1800         if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1801                 goto out;
1802
1803         start = vma->vm_start;
1804         for (i = 0; i < po->pg_vec_len; i++) {
1805                 struct page *page = virt_to_page(po->pg_vec[i]);
1806                 int pg_num;
1807
1808                 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1809                         err = vm_insert_page(vma, start, page);
1810                         if (unlikely(err))
1811                                 goto out;
1812                         start += PAGE_SIZE;
1813                 }
1814         }
1815         atomic_inc(&po->mapped);
1816         vma->vm_ops = &packet_mmap_ops;
1817         err = 0;
1818
1819 out:
1820         release_sock(sk);
1821         return err;
1822 }
1823 #endif
1824
1825
/*
 *	Socket operations for SOCK_PACKET sockets (chosen in
 *	packet_create()).  Uses the _spkt variants of bind, getname and
 *	sendmsg; socket options and mmap are not provided and poll is the
 *	plain datagram_poll.
 */
1826 static const struct proto_ops packet_ops_spkt = {
1827         .family =       PF_PACKET,
1828         .owner =        THIS_MODULE,
1829         .release =      packet_release,
1830         .bind =         packet_bind_spkt,
1831         .connect =      sock_no_connect,
1832         .socketpair =   sock_no_socketpair,
1833         .accept =       sock_no_accept,
1834         .getname =      packet_getname_spkt,
1835         .poll =         datagram_poll,
1836         .ioctl =        packet_ioctl,
1837         .listen =       sock_no_listen,
1838         .shutdown =     sock_no_shutdown,
1839         .setsockopt =   sock_no_setsockopt,
1840         .getsockopt =   sock_no_getsockopt,
1841         .sendmsg =      packet_sendmsg_spkt,
1842         .recvmsg =      packet_recvmsg,
1843         .mmap =         sock_no_mmap,
1844         .sendpage =     sock_no_sendpage,
1845 };
1846
1847 static const struct proto_ops packet_ops = {
1848         .family =       PF_PACKET,
1849         .owner =        THIS_MODULE,
1850         .release =      packet_release,
1851         .bind =         packet_bind,
1852         .connect =      sock_no_connect,
1853         .socketpair =   sock_no_socketpair,
1854         .accept =       sock_no_accept,
1855         .getname =      packet_getname,
1856         .poll =         packet_poll,
1857         .ioctl =        packet_ioctl,
1858         .listen =       sock_no_listen,
1859         .shutdown =     sock_no_shutdown,
1860         .setsockopt =   packet_setsockopt,
1861         .getsockopt =   packet_getsockopt,
1862         .sendmsg =      packet_sendmsg,
1863         .recvmsg =      packet_recvmsg,
1864         .mmap =         packet_mmap,
1865         .sendpage =     sock_no_sendpage,
1866 };
1867
1868 static struct net_proto_family packet_family_ops = {
1869         .family =       PF_PACKET,
1870         .create =       packet_create,
1871         .owner  =       THIS_MODULE,
1872 };
1873
1874 static struct notifier_block packet_netdev_notifier = {
1875         .notifier_call =packet_notifier,
1876 };
1877
1878 #ifdef CONFIG_PROC_FS
1879 static inline struct sock *packet_seq_idx(loff_t off)
1880 {
1881         struct sock *s;
1882         struct hlist_node *node;
1883
1884         sk_for_each(s, node, &packet_sklist) {
1885                 if (!off--)
1886                         return s;
1887         }
1888         return NULL;
1889 }
1890
1891 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1892 {
1893         read_lock(&packet_sklist_lock);
1894         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1895 }
1896
1897 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1898 {
1899         ++*pos;
1900         return  (v == SEQ_START_TOKEN)
1901                 ? sk_head(&packet_sklist)
1902                 : sk_next((struct sock*)v) ;
1903 }
1904
/*
 * seq_file ->stop hook: drops the read lock on packet_sklist_lock that
 * packet_seq_start() acquired.  Neither @seq nor @v is used.
 */
static void packet_seq_stop(struct seq_file *seq, void *v)
{
        read_unlock(&packet_sklist_lock);
}
1909
1910 static int packet_seq_show(struct seq_file *seq, void *v)
1911 {
1912         if (v == SEQ_START_TOKEN)
1913                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1914         else {
1915                 struct sock *s = v;
1916                 const struct packet_sock *po = pkt_sk(s);
1917
1918                 seq_printf(seq,
1919                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1920                            s,
1921                            atomic_read(&s->sk_refcnt),
1922                            s->sk_type,
1923                            ntohs(po->num),
1924                            po->ifindex,
1925                            po->running,
1926                            atomic_read(&s->sk_rmem_alloc),
1927                            sock_i_uid(s),
1928                            sock_i_ino(s) );
1929         }
1930
1931         return 0;
1932 }
1933
1934 static const struct seq_operations packet_seq_ops = {
1935         .start  = packet_seq_start,
1936         .next   = packet_seq_next,
1937         .stop   = packet_seq_stop,
1938         .show   = packet_seq_show,
1939 };
1940
1941 static int packet_seq_open(struct inode *inode, struct file *file)
1942 {
1943         return seq_open(file, &packet_seq_ops);
1944 }
1945
1946 static const struct file_operations packet_seq_fops = {
1947         .owner          = THIS_MODULE,
1948         .open           = packet_seq_open,
1949         .read           = seq_read,
1950         .llseek         = seq_lseek,
1951         .release        = seq_release,
1952 };
1953
1954 #endif
1955
/*
 * Module exit hook.  Tears down, in this order: the "packet" proc entry in
 * init_net, the netdevice notifier, the PF_PACKET socket family and finally
 * the packet_proto registration.
 */
static void __exit packet_exit(void)
{
        proc_net_remove(&init_net, "packet");
        unregister_netdevice_notifier(&packet_netdev_notifier);
        sock_unregister(PF_PACKET);
        proto_unregister(&packet_proto);
}
1963
1964 static int __init packet_init(void)
1965 {
1966         int rc = proto_register(&packet_proto, 0);
1967
1968         if (rc != 0)
1969                 goto out;
1970
1971         sock_register(&packet_family_ops);
1972         register_netdevice_notifier(&packet_netdev_notifier);
1973         proc_net_fops_create(&init_net, "packet", 0, &packet_seq_fops);
1974 out:
1975         return rc;
1976 }
1977
/* Module entry and exit hooks. */
module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
/*
 * NOTE(review): presumably allows the module to be loaded on demand when
 * the PF_PACKET family is first requested - confirm against the macro
 * definition.
 */
MODULE_ALIAS_NETPROTO(PF_PACKET);