Revert "af_packet: add interframe drop cmsg (v6)"
[safe/jmp/linux-2.6] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *              Johann Baudy    :       Added TX RING.
43  *
44  *              This program is free software; you can redistribute it and/or
45  *              modify it under the terms of the GNU General Public License
46  *              as published by the Free Software Foundation; either version
47  *              2 of the License, or (at your option) any later version.
48  *
49  */
50
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82
83 #ifdef CONFIG_INET
84 #include <net/inet_common.h>
85 #endif
86
87 /*
88    Assumptions:
89    - if device has no dev->hard_header routine, it adds and removes ll header
90      inside itself. In this case ll header is invisible outside of device,
91      but higher levels still should reserve dev->hard_header_len.
92      Some devices are clever enough to reallocate the skb when the header
93      does not fit into the reserved space (tunnel); others are silly
94      (PPP).
95    - packet socket receives packets with pulled ll header,
96      so that SOCK_RAW should push it back.
97
98 On receive:
99 -----------
100
101 Incoming, dev->hard_header!=NULL
102    mac_header -> ll header
103    data       -> data
104
105 Outgoing, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> ll header
108
109 Incoming, dev->hard_header==NULL
110    mac_header -> UNKNOWN position. It is very likely that it points to the ll
111                  header.  PPP does this, which is wrong because it introduces
112                  asymmetry between the rx and tx paths.
113    data       -> data
114
115 Outgoing, dev->hard_header==NULL
116    mac_header -> data. ll header is still not built!
117    data       -> data
118
119 Resume
120   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
121
122
123 On transmit:
124 ------------
125
126 dev->hard_header != NULL
127    mac_header -> ll header
128    data       -> ll header
129
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
131    mac_header -> data
132    data       -> data
133
134    We should set nh.raw on output to the correct position;
135    the packet classifier depends on it.
136  */
137
138 /* Private packet socket structures. */
139
/*
 * One node of the singly linked list rooted at packet_sock::mclist
 * (nodes are chained through ->next).
 * NOTE(review): ifindex/type/alen/addr appear to mirror the fields of
 * struct packet_mreq_max; the code that fills and walks this list is not
 * in this part of the file -- confirm there.
 */
140 struct packet_mclist {
141         struct packet_mclist    *next;
142         int                     ifindex;
143         int                     count;
144         unsigned short          type;
145         unsigned short          alen;
        /* Sized for the longest hardware address (MAX_ADDR_LEN bytes). */
146         unsigned char           addr[MAX_ADDR_LEN];
147 };
148 /* Identical to struct packet_mreq except that the address field is
149  * longer: mr_address holds up to MAX_ADDR_LEN bytes.
150  */
151 struct packet_mreq_max {
152         int             mr_ifindex;
153         unsigned short  mr_type;
154         unsigned short  mr_alen;
155         unsigned char   mr_address[MAX_ADDR_LEN];
156 };
157
158 #ifdef CONFIG_PACKET_MMAP
159 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
160                 int closing, int tx_ring);
161
/*
 * Bookkeeping for one frame ring.  struct packet_sock embeds two of
 * these: rx_ring and tx_ring.
 *
 * Frame number N is located in block pg_vec[N / frames_per_block] at byte
 * offset (N % frames_per_block) * frame_size (see packet_lookup_frame()).
 */
162 struct packet_ring_buffer {
        /* Array of block base addresses, indexed by block number. */
163         char                    **pg_vec;
        /* Frame number of the current frame; wraps to 0 after frame_max. */
164         unsigned int            head;
165         unsigned int            frames_per_block;
166         unsigned int            frame_size;
        /* Highest valid frame number (the one preceding frame 0). */
167         unsigned int            frame_max;
168
        /*
         * NOTE(review): the three pg_vec_* fields presumably describe how
         * pg_vec was allocated; they are set outside this part of the
         * file -- confirm in packet_set_ring().
         */
169         unsigned int            pg_vec_order;
170         unsigned int            pg_vec_pages;
171         unsigned int            pg_vec_len;
172
        /* For the TX ring: decremented by tpacket_destruct_skb(). */
173         atomic_t                pending;
174 };
175
176 struct packet_sock;
177 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
178 #endif
179
180 static void packet_flush_mclist(struct sock *sk);
181
/*
 * Per-socket state of an AF_PACKET socket.  pkt_sk() converts a
 * struct sock pointer to this type with a plain cast, which is why
 * sk must stay the first member.
 */
182 struct packet_sock {
183         /* struct sock has to be the first member of packet_sock */
184         struct sock             sk;
        /*
         * tp_packets / tp_drops counters; the receive paths update them
         * while holding sk.sk_receive_queue.lock.
         */
185         struct tpacket_stats    stats;
186 #ifdef CONFIG_PACKET_MMAP
187         struct packet_ring_buffer       rx_ring;
188         struct packet_ring_buffer       tx_ring;
        /*
         * When non-zero, tpacket_rcv() may additionally queue the skb for
         * a packet that does not fit into a ring frame.
         */
189         int                     copy_thresh;
190 #endif
191         struct packet_type      prot_hook;
192         spinlock_t              bind_lock;
        /* Taken by tpacket_snd() for the duration of the send. */
193         struct mutex            pg_vec_lock;
194         unsigned int            running:1,      /* prot_hook is attached*/
195                                 auxdata:1,
        /* origdev: report orig_dev->ifindex instead of dev->ifindex */
196                                 origdev:1;
197         int                     ifindex;        /* bound device         */
198         __be16                  num;
199         struct packet_mclist    *mclist;
200 #ifdef CONFIG_PACKET_MMAP
201         atomic_t                mapped;
        /* Selects the ring frame header layout (TPACKET_V1 / TPACKET_V2). */
202         enum tpacket_versions   tp_version;
        /* Both enter the frame offset computation in tpacket_rcv(). */
203         unsigned int            tp_hdrlen;
204         unsigned int            tp_reserve;
205         unsigned int            tp_loss:1;
206 #endif
207 };
208
/*
 * Per-packet data kept in the skb control buffer (skb->cb); access it
 * through PACKET_SKB_CB().
 */
209 struct packet_skb_cb {
        /* skb->len as recorded by packet_rcv() before trimming to snaplen. */
210         unsigned int origlen;
        /* Source address: .pkt is filled by packet_rcv_spkt(), .ll by packet_rcv(). */
211         union {
212                 struct sockaddr_pkt pkt;
213                 struct sockaddr_ll ll;
214         } sa;
215 };
216
/* View the control buffer of @__skb as a struct packet_skb_cb. */
217 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
218
219 #ifdef CONFIG_PACKET_MMAP
220
221 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
222 {
223         union {
224                 struct tpacket_hdr *h1;
225                 struct tpacket2_hdr *h2;
226                 void *raw;
227         } h;
228
229         h.raw = frame;
230         switch (po->tp_version) {
231         case TPACKET_V1:
232                 h.h1->tp_status = status;
233                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
234                 break;
235         case TPACKET_V2:
236                 h.h2->tp_status = status;
237                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
238                 break;
239         default:
240                 pr_err("TPACKET version not supported\n");
241                 BUG();
242         }
243
244         smp_wmb();
245 }
246
247 static int __packet_get_status(struct packet_sock *po, void *frame)
248 {
249         union {
250                 struct tpacket_hdr *h1;
251                 struct tpacket2_hdr *h2;
252                 void *raw;
253         } h;
254
255         smp_rmb();
256
257         h.raw = frame;
258         switch (po->tp_version) {
259         case TPACKET_V1:
260                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
261                 return h.h1->tp_status;
262         case TPACKET_V2:
263                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
264                 return h.h2->tp_status;
265         default:
266                 pr_err("TPACKET version not supported\n");
267                 BUG();
268                 return 0;
269         }
270 }
271
272 static void *packet_lookup_frame(struct packet_sock *po,
273                 struct packet_ring_buffer *rb,
274                 unsigned int position,
275                 int status)
276 {
277         unsigned int pg_vec_pos, frame_offset;
278         union {
279                 struct tpacket_hdr *h1;
280                 struct tpacket2_hdr *h2;
281                 void *raw;
282         } h;
283
284         pg_vec_pos = position / rb->frames_per_block;
285         frame_offset = position % rb->frames_per_block;
286
287         h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
288
289         if (status != __packet_get_status(po, h.raw))
290                 return NULL;
291
292         return h.raw;
293 }
294
295 static inline void *packet_current_frame(struct packet_sock *po,
296                 struct packet_ring_buffer *rb,
297                 int status)
298 {
299         return packet_lookup_frame(po, rb, rb->head, status);
300 }
301
302 static inline void *packet_previous_frame(struct packet_sock *po,
303                 struct packet_ring_buffer *rb,
304                 int status)
305 {
306         unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
307         return packet_lookup_frame(po, rb, previous, status);
308 }
309
310 static inline void packet_increment_head(struct packet_ring_buffer *buff)
311 {
312         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
313 }
314
315 #endif
316
/*
 * Convert a struct sock pointer into the packet_sock that contains it.
 * This is a plain cast, valid because sk is the first member of
 * struct packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        struct packet_sock *po = (struct packet_sock *)sk;

        return po;
}
321
322 static void packet_sock_destruct(struct sock *sk)
323 {
324         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
325         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
326
327         if (!sock_flag(sk, SOCK_DEAD)) {
328                 pr_err("Attempt to release alive packet socket: %p\n", sk);
329                 return;
330         }
331
332         sk_refcnt_debug_dec(sk);
333 }
334
335
336 static const struct proto_ops packet_ops;
337
338 static const struct proto_ops packet_ops_spkt;
339
340 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
341                            struct packet_type *pt, struct net_device *orig_dev)
342 {
343         struct sock *sk;
344         struct sockaddr_pkt *spkt;
345
346         /*
347          *      When we registered the protocol we saved the socket in the data
348          *      field for just this event.
349          */
350
351         sk = pt->af_packet_priv;
352
353         /*
354          *      Yank back the headers [hope the device set this
355          *      right or kerboom...]
356          *
357          *      Incoming packets have ll header pulled,
358          *      push it back.
359          *
360          *      For outgoing ones skb->data == skb_mac_header(skb)
361          *      so that this procedure is noop.
362          */
363
364         if (skb->pkt_type == PACKET_LOOPBACK)
365                 goto out;
366
367         if (dev_net(dev) != sock_net(sk))
368                 goto out;
369
370         skb = skb_share_check(skb, GFP_ATOMIC);
371         if (skb == NULL)
372                 goto oom;
373
374         /* drop any routing info */
375         skb_dst_drop(skb);
376
377         /* drop conntrack reference */
378         nf_reset(skb);
379
380         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
381
382         skb_push(skb, skb->data - skb_mac_header(skb));
383
384         /*
385          *      The SOCK_PACKET socket receives _all_ frames.
386          */
387
388         spkt->spkt_family = dev->type;
389         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
390         spkt->spkt_protocol = skb->protocol;
391
392         /*
393          *      Charge the memory to the socket. This is done specifically
394          *      to prevent sockets using all the memory up.
395          */
396
397         if (sock_queue_rcv_skb(sk, skb) == 0)
398                 return 0;
399
400 out:
401         kfree_skb(skb);
402 oom:
403         return 0;
404 }
405
406
407 /*
408  *      Output a raw packet to a device layer. This bypasses all the other
409  *      protocol layers and you must therefore supply it with a complete frame
410  */
411
412 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
413                                struct msghdr *msg, size_t len)
414 {
415         struct sock *sk = sock->sk;
416         struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
417         struct sk_buff *skb;
418         struct net_device *dev;
419         __be16 proto = 0;
420         int err;
421
422         /*
423          *      Get and verify the address.
424          */
425
426         if (saddr) {
427                 if (msg->msg_namelen < sizeof(struct sockaddr))
428                         return -EINVAL;
429                 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
430                         proto = saddr->spkt_protocol;
431         } else
432                 return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
433
434         /*
435          *      Find the device first to size check it
436          */
437
438         saddr->spkt_device[13] = 0;
439         dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
440         err = -ENODEV;
441         if (dev == NULL)
442                 goto out_unlock;
443
444         err = -ENETDOWN;
445         if (!(dev->flags & IFF_UP))
446                 goto out_unlock;
447
448         /*
449          * You may not queue a frame bigger than the mtu. This is the lowest level
450          * raw protocol and you must do your own fragmentation at this level.
451          */
452
453         err = -EMSGSIZE;
454         if (len > dev->mtu + dev->hard_header_len)
455                 goto out_unlock;
456
457         err = -ENOBUFS;
458         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
459
460         /*
461          * If the write buffer is full, then tough. At this level the user
462          * gets to deal with the problem - do your own algorithmic backoffs.
463          * That's far more flexible.
464          */
465
466         if (skb == NULL)
467                 goto out_unlock;
468
469         /*
470          *      Fill it in
471          */
472
473         /* FIXME: Save some space for broken drivers that write a
474          * hard header at transmission time by themselves. PPP is the
475          * notable one here. This should really be fixed at the driver level.
476          */
477         skb_reserve(skb, LL_RESERVED_SPACE(dev));
478         skb_reset_network_header(skb);
479
480         /* Try to align data part correctly */
481         if (dev->header_ops) {
482                 skb->data -= dev->hard_header_len;
483                 skb->tail -= dev->hard_header_len;
484                 if (len < dev->hard_header_len)
485                         skb_reset_network_header(skb);
486         }
487
488         /* Returns -EFAULT on error */
489         err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
490         skb->protocol = proto;
491         skb->dev = dev;
492         skb->priority = sk->sk_priority;
493         skb->mark = sk->sk_mark;
494         if (err)
495                 goto out_free;
496
497         /*
498          *      Now send it
499          */
500
501         dev_queue_xmit(skb);
502         dev_put(dev);
503         return len;
504
505 out_free:
506         kfree_skb(skb);
507 out_unlock:
508         if (dev)
509                 dev_put(dev);
510         return err;
511 }
512
513 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
514                                       unsigned int res)
515 {
516         struct sk_filter *filter;
517
518         rcu_read_lock_bh();
519         filter = rcu_dereference(sk->sk_filter);
520         if (filter != NULL)
521                 res = sk_run_filter(skb, filter->insns, filter->len);
522         rcu_read_unlock_bh();
523
524         return res;
525 }
526
527 /*
528    This function makes lazy skb cloning in hope that most of packets
529    are discarded by BPF.
530
531    Note tricky part: we DO mangle shared skb! skb->data, skb->len
532    and skb->cb are mangled. It works because (and until) packets
533    falling here are owned by current CPU. Output packets are cloned
534    by dev_queue_xmit_nit(), input packets are processed by net_bh
535    sequentially, so that if we return skb to original state on exit,
536    we will not harm anyone.
537  */
538
/*
 * Receive hook that queues @skb on the socket stored in
 * pt->af_packet_priv.
 *
 * @skb:      received packet; may be shared, in which case it is cloned
 *            before being queued and the original is restored
 * @dev:      device the packet is attributed to
 * @pt:       packet_type hook; af_packet_priv holds the socket
 * @orig_dev: device reported in sll_ifindex when po->origdev is set
 *
 * Always returns 0.  A packet rejected by the socket filter is dropped
 * without touching the statistics; a packet dropped for lack of receive
 * buffer space, a failed clone or a failed trim increments tp_drops.
 */
539 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
540                       struct packet_type *pt, struct net_device *orig_dev)
541 {
542         struct sock *sk;
543         struct sockaddr_ll *sll;
544         struct packet_sock *po;
        /* Saved so data/len of a shared skb can be put back on exit. */
545         u8 *skb_head = skb->data;
546         int skb_len = skb->len;
547         unsigned int snaplen, res;
548
549         if (skb->pkt_type == PACKET_LOOPBACK)
550                 goto drop;
551
552         sk = pt->af_packet_priv;
553         po = pkt_sk(sk);
554
        /* Ignore packets from devices in another network namespace. */
555         if (dev_net(dev) != sock_net(sk))
556                 goto drop;
557
558         skb->dev = dev;
559
560         if (dev->header_ops) {
561                 /* The device has an explicit notion of ll header,
562                    exported to higher levels.
563
564                    Otherwise, the device hides details of its frame
565                    structure, so that corresponding packet head
566                    never delivered to user.
567                  */
568                 if (sk->sk_type != SOCK_DGRAM)
569                         skb_push(skb, skb->data - skb_mac_header(skb));
570                 else if (skb->pkt_type == PACKET_OUTGOING) {
571                         /* Special case: outgoing packets have ll header at head */
572                         skb_pull(skb, skb_network_offset(skb));
573                 }
574         }
575
576         snaplen = skb->len;
577
        /* A zero verdict drops the packet; otherwise it caps snaplen. */
578         res = run_filter(skb, sk, snaplen);
579         if (!res)
580                 goto drop_n_restore;
581         if (snaplen > res)
582                 snaplen = res;
583
        /* Receive buffer would be exceeded: count a drop. */
584         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
585             (unsigned)sk->sk_rcvbuf)
586                 goto drop_n_acct;
587
588         if (skb_shared(skb)) {
589                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
590                 if (nskb == NULL)
591                         goto drop_n_acct;
592
                /* Undo our push/pull on the shared original before releasing it. */
593                 if (skb_head != skb->data) {
594                         skb->data = skb_head;
595                         skb->len = skb_len;
596                 }
597                 kfree_skb(skb);
598                 skb = nskb;
599         }
600
        /* Compile-time check that the control buffer is large enough. */
601         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
602                      sizeof(skb->cb));
603
604         sll = &PACKET_SKB_CB(skb)->sa.ll;
605         sll->sll_family = AF_PACKET;
606         sll->sll_hatype = dev->type;
607         sll->sll_protocol = skb->protocol;
608         sll->sll_pkttype = skb->pkt_type;
609         if (unlikely(po->origdev))
610                 sll->sll_ifindex = orig_dev->ifindex;
611         else
612                 sll->sll_ifindex = dev->ifindex;
613
614         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
615
        /* Remember the untrimmed length before cutting down to snaplen. */
616         PACKET_SKB_CB(skb)->origlen = skb->len;
617
618         if (pskb_trim(skb, snaplen))
619                 goto drop_n_acct;
620
621         skb_set_owner_r(skb, sk);
622         skb->dev = NULL;
623         skb_dst_drop(skb);
624
625         /* drop conntrack reference */
626         nf_reset(skb);
627
        /* Counter update and enqueue happen under the queue lock. */
628         spin_lock(&sk->sk_receive_queue.lock);
629         po->stats.tp_packets++;
630         __skb_queue_tail(&sk->sk_receive_queue, skb);
631         spin_unlock(&sk->sk_receive_queue.lock);
632         sk->sk_data_ready(sk, skb->len);
633         return 0;
634
635 drop_n_acct:
636         spin_lock(&sk->sk_receive_queue.lock);
637         po->stats.tp_drops++;
638         spin_unlock(&sk->sk_receive_queue.lock);
639
640 drop_n_restore:
        /* Other holders of a shared skb must see the original data/len. */
641         if (skb_head != skb->data && skb_shared(skb)) {
642                 skb->data = skb_head;
643                 skb->len = skb_len;
644         }
645 drop:
646         consume_skb(skb);
647         return 0;
648 }
649
650 #ifdef CONFIG_PACKET_MMAP
/*
 * Receive hook for sockets using the RX ring: the packet is copied into
 * the current ring frame instead of being queued as an skb.
 *
 * @skb:      received packet; always released before returning
 * @dev:      device the packet is attributed to
 * @pt:       packet_type hook; af_packet_priv holds the socket
 * @orig_dev: device reported in sll_ifindex when po->origdev is set
 *
 * Always returns 0.  If the current frame is not in TP_STATUS_KERNEL state
 * the ring is treated as full and tp_drops is incremented.  A packet
 * rejected by the socket filter is dropped without being counted.
 */
651 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
652                        struct packet_type *pt, struct net_device *orig_dev)
653 {
654         struct sock *sk;
655         struct packet_sock *po;
656         struct sockaddr_ll *sll;
657         union {
658                 struct tpacket_hdr *h1;
659                 struct tpacket2_hdr *h2;
660                 void *raw;
661         } h;
        /* Saved so data/len of a shared skb can be put back on exit. */
662         u8 *skb_head = skb->data;
663         int skb_len = skb->len;
664         unsigned int snaplen, res;
        /* LOSING is cleared further down if no drops were recorded so far. */
665         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
        /*
         * NOTE(review): macoff/netoff are unsigned short while
         * po->tp_reserve is unsigned int, so the sums computed below are
         * truncated if tp_reserve is large -- confirm tp_reserve is
         * bounded where it is set.
         */
666         unsigned short macoff, netoff, hdrlen;
667         struct sk_buff *copy_skb = NULL;
668         struct timeval tv;
669         struct timespec ts;
670
671         if (skb->pkt_type == PACKET_LOOPBACK)
672                 goto drop;
673
674         sk = pt->af_packet_priv;
675         po = pkt_sk(sk);
676
        /* Ignore packets from devices in another network namespace. */
677         if (dev_net(dev) != sock_net(sk))
678                 goto drop;
679
680         if (dev->header_ops) {
681                 if (sk->sk_type != SOCK_DGRAM)
682                         skb_push(skb, skb->data - skb_mac_header(skb));
683                 else if (skb->pkt_type == PACKET_OUTGOING) {
684                         /* Special case: outgoing packets have ll header at head */
685                         skb_pull(skb, skb_network_offset(skb));
686                 }
687         }
688
689         if (skb->ip_summed == CHECKSUM_PARTIAL)
690                 status |= TP_STATUS_CSUMNOTREADY;
691
692         snaplen = skb->len;
693
        /* A zero verdict drops the packet; otherwise it caps snaplen. */
694         res = run_filter(skb, sk, snaplen);
695         if (!res)
696                 goto drop_n_restore;
697         if (snaplen > res)
698                 snaplen = res;
699
        /*
         * Compute where inside the frame the packet data starts (macoff)
         * and where the network header starts (netoff).
         * NOTE(review): the constant 16 looks like minimum room kept for
         * the link-level header -- confirm.
         */
700         if (sk->sk_type == SOCK_DGRAM) {
701                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
702                                   po->tp_reserve;
703         } else {
704                 unsigned maclen = skb_network_offset(skb);
705                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
706                                        (maclen < 16 ? 16 : maclen)) +
707                         po->tp_reserve;
708                 macoff = netoff - maclen;
709         }
710
        /*
         * Packet does not fit into one frame: truncate the copy to what
         * fits.  If copy_thresh is set and the receive buffer has room,
         * also keep the skb (or a clone of a shared one) to be queued on
         * the receive queue.
         */
711         if (macoff + snaplen > po->rx_ring.frame_size) {
712                 if (po->copy_thresh &&
713                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
714                     (unsigned)sk->sk_rcvbuf) {
715                         if (skb_shared(skb)) {
716                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
717                         } else {
718                                 copy_skb = skb_get(skb);
719                                 skb_head = skb->data;
720                         }
721                         if (copy_skb)
722                                 skb_set_owner_r(copy_skb, sk);
723                 }
724                 snaplen = po->rx_ring.frame_size - macoff;
725                 if ((int)snaplen < 0)
726                         snaplen = 0;
727         }
728
        /* Claim the current frame and update counters under the queue lock. */
729         spin_lock(&sk->sk_receive_queue.lock);
730         h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
731         if (!h.raw)
732                 goto ring_is_full;
733         packet_increment_head(&po->rx_ring);
734         po->stats.tp_packets++;
735         if (copy_skb) {
736                 status |= TP_STATUS_COPY;
737                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
738         }
739         if (!po->stats.tp_drops)
740                 status &= ~TP_STATUS_LOSING;
741         spin_unlock(&sk->sk_receive_queue.lock);
742
        /* Copy the (possibly truncated) packet into the frame. */
743         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
744
        /* Fill the version-specific frame header. */
745         switch (po->tp_version) {
746         case TPACKET_V1:
747                 h.h1->tp_len = skb->len;
748                 h.h1->tp_snaplen = snaplen;
749                 h.h1->tp_mac = macoff;
750                 h.h1->tp_net = netoff;
                /* Use the skb timestamp if set, else the current time. */
751                 if (skb->tstamp.tv64)
752                         tv = ktime_to_timeval(skb->tstamp);
753                 else
754                         do_gettimeofday(&tv);
755                 h.h1->tp_sec = tv.tv_sec;
756                 h.h1->tp_usec = tv.tv_usec;
757                 hdrlen = sizeof(*h.h1);
758                 break;
759         case TPACKET_V2:
760                 h.h2->tp_len = skb->len;
761                 h.h2->tp_snaplen = snaplen;
762                 h.h2->tp_mac = macoff;
763                 h.h2->tp_net = netoff;
764                 if (skb->tstamp.tv64)
765                         ts = ktime_to_timespec(skb->tstamp);
766                 else
767                         getnstimeofday(&ts);
768                 h.h2->tp_sec = ts.tv_sec;
769                 h.h2->tp_nsec = ts.tv_nsec;
770                 h.h2->tp_vlan_tci = skb->vlan_tci;
771                 hdrlen = sizeof(*h.h2);
772                 break;
773         default:
774                 BUG();
775         }
776
        /* The sockaddr_ll follows the aligned frame header. */
777         sll = h.raw + TPACKET_ALIGN(hdrlen);
778         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
779         sll->sll_family = AF_PACKET;
780         sll->sll_hatype = dev->type;
781         sll->sll_protocol = skb->protocol;
782         sll->sll_pkttype = skb->pkt_type;
783         if (unlikely(po->origdev))
784                 sll->sll_ifindex = orig_dev->ifindex;
785         else
786                 sll->sll_ifindex = dev->ifindex;
787
        /* Publish the frame status only after header and data are written. */
788         __packet_set_status(po, h.raw, status);
789         smp_mb();
        /* Flush every page from the frame start to the last copied byte. */
790         {
791                 struct page *p_start, *p_end;
792                 u8 *h_end = h.raw + macoff + snaplen - 1;
793
794                 p_start = virt_to_page(h.raw);
795                 p_end = virt_to_page(h_end);
796                 while (p_start <= p_end) {
797                         flush_dcache_page(p_start);
798                         p_start++;
799                 }
800         }
801
802         sk->sk_data_ready(sk, 0);
803
        /* Success falls through: the skb is released on every path. */
804 drop_n_restore:
805         if (skb_head != skb->data && skb_shared(skb)) {
806                 skb->data = skb_head;
807                 skb->len = skb_len;
808         }
809 drop:
810         kfree_skb(skb);
811         return 0;
812
        /* Entered with sk_receive_queue.lock held. */
813 ring_is_full:
814         po->stats.tp_drops++;
815         spin_unlock(&sk->sk_receive_queue.lock);
816
817         sk->sk_data_ready(sk, 0);
818         kfree_skb(copy_skb);
819         goto drop_n_restore;
820 }
821
822 static void tpacket_destruct_skb(struct sk_buff *skb)
823 {
824         struct packet_sock *po = pkt_sk(skb->sk);
825         void *ph;
826
827         BUG_ON(skb == NULL);
828
829         if (likely(po->tx_ring.pg_vec)) {
830                 ph = skb_shinfo(skb)->destructor_arg;
831                 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
832                 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
833                 atomic_dec(&po->tx_ring.pending);
834                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
835         }
836
837         sock_wfree(skb);
838 }
839
/*
 * Build a transmit skb from the TX ring frame at @frame.
 *
 * @po:       sending packet socket
 * @skb:      freshly allocated skb to fill
 * @frame:    ring frame; its header supplies tp_len
 * @dev:      output device
 * @size_max: largest tp_len accepted
 * @proto:    value stored in skb->protocol
 * @addr:     destination hardware address, used for SOCK_DGRAM only
 *
 * For SOCK_DGRAM the link-level header is built with dev_hard_header().
 * Otherwise, if the device has a hard header, that many bytes are copied
 * from the frame into the skb head.  The remaining bytes are not copied:
 * the ring pages are attached to the skb as page fragments.
 *
 * Returns tp_len on success, -EMSGSIZE if tp_len exceeds @size_max,
 * -EINVAL if the header cannot be built or the packet is too short,
 * -EFAULT if more than MAX_SKB_FRAGS fragments would be needed, or the
 * error returned by skb_store_bits().
 */
840 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
841                 void *frame, struct net_device *dev, int size_max,
842                 __be16 proto, unsigned char *addr)
843 {
844         union {
845                 struct tpacket_hdr *h1;
846                 struct tpacket2_hdr *h2;
847                 void *raw;
848         } ph;
849         int to_write, offset, len, tp_len, nr_frags, len_max;
850         struct socket *sock = po->sk.sk_socket;
851         struct page *page;
852         void *data;
853         int err;
854
855         ph.raw = frame;
856
857         skb->protocol = proto;
858         skb->dev = dev;
859         skb->priority = po->sk.sk_priority;
860         skb->mark = po->sk.sk_mark;
        /* Remember the frame so tpacket_destruct_skb() can release it. */
861         skb_shinfo(skb)->destructor_arg = ph.raw;
862
        /*
         * NOTE(review): tp_len is a signed int read from the frame header;
         * only the upper bound is checked below -- confirm a negative
         * value cannot reach this point.
         */
863         switch (po->tp_version) {
864         case TPACKET_V2:
865                 tp_len = ph.h2->tp_len;
866                 break;
867         default:
868                 tp_len = ph.h1->tp_len;
869                 break;
870         }
871         if (unlikely(tp_len > size_max)) {
872                 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
873                 return -EMSGSIZE;
874         }
875
876         skb_reserve(skb, LL_RESERVED_SPACE(dev));
877         skb_reset_network_header(skb);
878
        /* Start of the packet data inside the frame. */
879         data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
880         to_write = tp_len;
881
882         if (sock->type == SOCK_DGRAM) {
883                 err = dev_hard_header(skb, dev, ntohs(proto), addr,
884                                 NULL, tp_len);
885                 if (unlikely(err < 0))
886                         return -EINVAL;
887         } else if (dev->hard_header_len) {
888                 /* net device doesn't like empty head */
                /* NOTE(review): the test is "<=" but the message prints "<". */
889                 if (unlikely(tp_len <= dev->hard_header_len)) {
890                         pr_err("packet size is too short (%d < %d)\n",
891                                tp_len, dev->hard_header_len);
892                         return -EINVAL;
893                 }
894
                /* Copy the link-level header from the frame into the skb head. */
895                 skb_push(skb, dev->hard_header_len);
896                 err = skb_store_bits(skb, 0, data,
897                                 dev->hard_header_len);
898                 if (unlikely(err))
899                         return err;
900
901                 data += dev->hard_header_len;
902                 to_write -= dev->hard_header_len;
903         }
904
        /* The first fragment runs from data to the end of its page at most. */
905         err = -EFAULT;
906         page = virt_to_page(data);
907         offset = offset_in_page(data);
908         len_max = PAGE_SIZE - offset;
909         len = ((to_write > len_max) ? len_max : to_write);
910
        /* Account for the whole payload up front, before attaching pages. */
911         skb->data_len = to_write;
912         skb->len += to_write;
913         skb->truesize += to_write;
914         atomic_add(to_write, &po->sk.sk_wmem_alloc);
915
        /* Attach the ring pages as fragments, one page per iteration. */
916         while (likely(to_write)) {
917                 nr_frags = skb_shinfo(skb)->nr_frags;
918
919                 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
920                         pr_err("Packet exceed the number of skb frags(%lu)\n",
921                                MAX_SKB_FRAGS);
922                         return -EFAULT;
923                 }
924
925                 flush_dcache_page(page);
                /* Take a reference on the page for the new fragment. */
926                 get_page(page);
927                 skb_fill_page_desc(skb,
928                                 nr_frags,
929                                 page++, offset, len);
930                 to_write -= len;
                /* Later fragments start at the beginning of their page. */
931                 offset = 0;
932                 len_max = PAGE_SIZE;
933                 len = ((to_write > len_max) ? len_max : to_write);
934         }
935
936         return tp_len;
937 }
938
939 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
940 {
941         struct socket *sock;
942         struct sk_buff *skb;
943         struct net_device *dev;
944         __be16 proto;
945         int ifindex, err, reserve = 0;
946         void *ph;
947         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
948         int tp_len, size_max;
949         unsigned char *addr;
950         int len_sum = 0;
951         int status = 0;
952
953         sock = po->sk.sk_socket;
954
955         mutex_lock(&po->pg_vec_lock);
956
957         err = -EBUSY;
958         if (saddr == NULL) {
959                 ifindex = po->ifindex;
960                 proto   = po->num;
961                 addr    = NULL;
962         } else {
963                 err = -EINVAL;
964                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
965                         goto out;
966                 if (msg->msg_namelen < (saddr->sll_halen
967                                         + offsetof(struct sockaddr_ll,
968                                                 sll_addr)))
969                         goto out;
970                 ifindex = saddr->sll_ifindex;
971                 proto   = saddr->sll_protocol;
972                 addr    = saddr->sll_addr;
973         }
974
975         dev = dev_get_by_index(sock_net(&po->sk), ifindex);
976         err = -ENXIO;
977         if (unlikely(dev == NULL))
978                 goto out;
979
980         reserve = dev->hard_header_len;
981
982         err = -ENETDOWN;
983         if (unlikely(!(dev->flags & IFF_UP)))
984                 goto out_put;
985
986         size_max = po->tx_ring.frame_size
987                 - sizeof(struct skb_shared_info)
988                 - po->tp_hdrlen
989                 - LL_ALLOCATED_SPACE(dev)
990                 - sizeof(struct sockaddr_ll);
991
992         if (size_max > dev->mtu + reserve)
993                 size_max = dev->mtu + reserve;
994
995         do {
996                 ph = packet_current_frame(po, &po->tx_ring,
997                                 TP_STATUS_SEND_REQUEST);
998
999                 if (unlikely(ph == NULL)) {
1000                         schedule();
1001                         continue;
1002                 }
1003
1004                 status = TP_STATUS_SEND_REQUEST;
1005                 skb = sock_alloc_send_skb(&po->sk,
1006                                 LL_ALLOCATED_SPACE(dev)
1007                                 + sizeof(struct sockaddr_ll),
1008                                 0, &err);
1009
1010                 if (unlikely(skb == NULL))
1011                         goto out_status;
1012
1013                 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1014                                 addr);
1015
1016                 if (unlikely(tp_len < 0)) {
1017                         if (po->tp_loss) {
1018                                 __packet_set_status(po, ph,
1019                                                 TP_STATUS_AVAILABLE);
1020                                 packet_increment_head(&po->tx_ring);
1021                                 kfree_skb(skb);
1022                                 continue;
1023                         } else {
1024                                 status = TP_STATUS_WRONG_FORMAT;
1025                                 err = tp_len;
1026                                 goto out_status;
1027                         }
1028                 }
1029
1030                 skb->destructor = tpacket_destruct_skb;
1031                 __packet_set_status(po, ph, TP_STATUS_SENDING);
1032                 atomic_inc(&po->tx_ring.pending);
1033
1034                 status = TP_STATUS_SEND_REQUEST;
1035                 err = dev_queue_xmit(skb);
1036                 if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1037                         goto out_xmit;
1038                 packet_increment_head(&po->tx_ring);
1039                 len_sum += tp_len;
1040         } while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
1041                                         && (atomic_read(&po->tx_ring.pending))))
1042               );
1043
1044         err = len_sum;
1045         goto out_put;
1046
1047 out_xmit:
1048         skb->destructor = sock_wfree;
1049         atomic_dec(&po->tx_ring.pending);
1050 out_status:
1051         __packet_set_status(po, ph, status);
1052         kfree_skb(skb);
1053 out_put:
1054         dev_put(dev);
1055 out:
1056         mutex_unlock(&po->pg_vec_lock);
1057         return err;
1058 }
1059 #endif
1060
1061 static int packet_snd(struct socket *sock,
1062                           struct msghdr *msg, size_t len)
1063 {
1064         struct sock *sk = sock->sk;
1065         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1066         struct sk_buff *skb;
1067         struct net_device *dev;
1068         __be16 proto;
1069         unsigned char *addr;
1070         int ifindex, err, reserve = 0;
1071
1072         /*
1073          *      Get and verify the address.
1074          */
1075
1076         if (saddr == NULL) {
1077                 struct packet_sock *po = pkt_sk(sk);
1078
1079                 ifindex = po->ifindex;
1080                 proto   = po->num;
1081                 addr    = NULL;
1082         } else {
1083                 err = -EINVAL;
1084                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1085                         goto out;
1086                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1087                         goto out;
1088                 ifindex = saddr->sll_ifindex;
1089                 proto   = saddr->sll_protocol;
1090                 addr    = saddr->sll_addr;
1091         }
1092
1093
1094         dev = dev_get_by_index(sock_net(sk), ifindex);
1095         err = -ENXIO;
1096         if (dev == NULL)
1097                 goto out_unlock;
1098         if (sock->type == SOCK_RAW)
1099                 reserve = dev->hard_header_len;
1100
1101         err = -ENETDOWN;
1102         if (!(dev->flags & IFF_UP))
1103                 goto out_unlock;
1104
1105         err = -EMSGSIZE;
1106         if (len > dev->mtu+reserve)
1107                 goto out_unlock;
1108
1109         skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
1110                                 msg->msg_flags & MSG_DONTWAIT, &err);
1111         if (skb == NULL)
1112                 goto out_unlock;
1113
1114         skb_reserve(skb, LL_RESERVED_SPACE(dev));
1115         skb_reset_network_header(skb);
1116
1117         err = -EINVAL;
1118         if (sock->type == SOCK_DGRAM &&
1119             dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
1120                 goto out_free;
1121
1122         /* Returns -EFAULT on error */
1123         err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1124         if (err)
1125                 goto out_free;
1126
1127         skb->protocol = proto;
1128         skb->dev = dev;
1129         skb->priority = sk->sk_priority;
1130         skb->mark = sk->sk_mark;
1131
1132         /*
1133          *      Now send it
1134          */
1135
1136         err = dev_queue_xmit(skb);
1137         if (err > 0 && (err = net_xmit_errno(err)) != 0)
1138                 goto out_unlock;
1139
1140         dev_put(dev);
1141
1142         return len;
1143
1144 out_free:
1145         kfree_skb(skb);
1146 out_unlock:
1147         if (dev)
1148                 dev_put(dev);
1149 out:
1150         return err;
1151 }
1152
1153 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1154                 struct msghdr *msg, size_t len)
1155 {
1156 #ifdef CONFIG_PACKET_MMAP
1157         struct sock *sk = sock->sk;
1158         struct packet_sock *po = pkt_sk(sk);
1159         if (po->tx_ring.pg_vec)
1160                 return tpacket_snd(po, msg);
1161         else
1162 #endif
1163                 return packet_snd(sock, msg, len);
1164 }
1165
1166 /*
1167  *      Close a PACKET socket. This is fairly simple. We immediately go
1168  *      to 'closed' state and remove our protocol entry in the device list.
1169  */
1170
1171 static int packet_release(struct socket *sock)
1172 {
1173         struct sock *sk = sock->sk;
1174         struct packet_sock *po;
1175         struct net *net;
1176 #ifdef CONFIG_PACKET_MMAP
1177         struct tpacket_req req;
1178 #endif
1179
1180         if (!sk)
1181                 return 0;
1182
1183         net = sock_net(sk);
1184         po = pkt_sk(sk);
1185
1186         write_lock_bh(&net->packet.sklist_lock);
1187         sk_del_node_init(sk);
1188         sock_prot_inuse_add(net, sk->sk_prot, -1);
1189         write_unlock_bh(&net->packet.sklist_lock);
1190
1191         /*
1192          *      Unhook packet receive handler.
1193          */
1194
1195         if (po->running) {
1196                 /*
1197                  *      Remove the protocol hook
1198                  */
1199                 dev_remove_pack(&po->prot_hook);
1200                 po->running = 0;
1201                 po->num = 0;
1202                 __sock_put(sk);
1203         }
1204
1205         packet_flush_mclist(sk);
1206
1207 #ifdef CONFIG_PACKET_MMAP
1208         memset(&req, 0, sizeof(req));
1209
1210         if (po->rx_ring.pg_vec)
1211                 packet_set_ring(sk, &req, 1, 0);
1212
1213         if (po->tx_ring.pg_vec)
1214                 packet_set_ring(sk, &req, 1, 1);
1215 #endif
1216
1217         /*
1218          *      Now the socket is dead. No more input will appear.
1219          */
1220
1221         sock_orphan(sk);
1222         sock->sk = NULL;
1223
1224         /* Purge queues */
1225
1226         skb_queue_purge(&sk->sk_receive_queue);
1227         sk_refcnt_debug_release(sk);
1228
1229         sock_put(sk);
1230         return 0;
1231 }
1232
1233 /*
1234  *      Attach a packet hook.
1235  */
1236
1237 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1238 {
1239         struct packet_sock *po = pkt_sk(sk);
1240         /*
1241          *      Detach an existing hook if present.
1242          */
1243
1244         lock_sock(sk);
1245
1246         spin_lock(&po->bind_lock);
1247         if (po->running) {
1248                 __sock_put(sk);
1249                 po->running = 0;
1250                 po->num = 0;
1251                 spin_unlock(&po->bind_lock);
1252                 dev_remove_pack(&po->prot_hook);
1253                 spin_lock(&po->bind_lock);
1254         }
1255
1256         po->num = protocol;
1257         po->prot_hook.type = protocol;
1258         po->prot_hook.dev = dev;
1259
1260         po->ifindex = dev ? dev->ifindex : 0;
1261
1262         if (protocol == 0)
1263                 goto out_unlock;
1264
1265         if (!dev || (dev->flags & IFF_UP)) {
1266                 dev_add_pack(&po->prot_hook);
1267                 sock_hold(sk);
1268                 po->running = 1;
1269         } else {
1270                 sk->sk_err = ENETDOWN;
1271                 if (!sock_flag(sk, SOCK_DEAD))
1272                         sk->sk_error_report(sk);
1273         }
1274
1275 out_unlock:
1276         spin_unlock(&po->bind_lock);
1277         release_sock(sk);
1278         return 0;
1279 }
1280
1281 /*
1282  *      Bind a packet socket to a device
1283  */
1284
1285 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1286                             int addr_len)
1287 {
1288         struct sock *sk = sock->sk;
1289         char name[15];
1290         struct net_device *dev;
1291         int err = -ENODEV;
1292
1293         /*
1294          *      Check legality
1295          */
1296
1297         if (addr_len != sizeof(struct sockaddr))
1298                 return -EINVAL;
1299         strlcpy(name, uaddr->sa_data, sizeof(name));
1300
1301         dev = dev_get_by_name(sock_net(sk), name);
1302         if (dev) {
1303                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1304                 dev_put(dev);
1305         }
1306         return err;
1307 }
1308
1309 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1310 {
1311         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1312         struct sock *sk = sock->sk;
1313         struct net_device *dev = NULL;
1314         int err;
1315
1316
1317         /*
1318          *      Check legality
1319          */
1320
1321         if (addr_len < sizeof(struct sockaddr_ll))
1322                 return -EINVAL;
1323         if (sll->sll_family != AF_PACKET)
1324                 return -EINVAL;
1325
1326         if (sll->sll_ifindex) {
1327                 err = -ENODEV;
1328                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1329                 if (dev == NULL)
1330                         goto out;
1331         }
1332         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1333         if (dev)
1334                 dev_put(dev);
1335
1336 out:
1337         return err;
1338 }
1339
1340 static struct proto packet_proto = {
1341         .name     = "PACKET",
1342         .owner    = THIS_MODULE,
1343         .obj_size = sizeof(struct packet_sock),
1344 };
1345
1346 /*
1347  *      Create a packet of type SOCK_PACKET.
1348  */
1349
1350 static int packet_create(struct net *net, struct socket *sock, int protocol)
1351 {
1352         struct sock *sk;
1353         struct packet_sock *po;
1354         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1355         int err;
1356
1357         if (!capable(CAP_NET_RAW))
1358                 return -EPERM;
1359         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1360             sock->type != SOCK_PACKET)
1361                 return -ESOCKTNOSUPPORT;
1362
1363         sock->state = SS_UNCONNECTED;
1364
1365         err = -ENOBUFS;
1366         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1367         if (sk == NULL)
1368                 goto out;
1369
1370         sock->ops = &packet_ops;
1371         if (sock->type == SOCK_PACKET)
1372                 sock->ops = &packet_ops_spkt;
1373
1374         sock_init_data(sock, sk);
1375
1376         po = pkt_sk(sk);
1377         sk->sk_family = PF_PACKET;
1378         po->num = proto;
1379
1380         sk->sk_destruct = packet_sock_destruct;
1381         sk_refcnt_debug_inc(sk);
1382
1383         /*
1384          *      Attach a protocol block
1385          */
1386
1387         spin_lock_init(&po->bind_lock);
1388         mutex_init(&po->pg_vec_lock);
1389         po->prot_hook.func = packet_rcv;
1390
1391         if (sock->type == SOCK_PACKET)
1392                 po->prot_hook.func = packet_rcv_spkt;
1393
1394         po->prot_hook.af_packet_priv = sk;
1395
1396         if (proto) {
1397                 po->prot_hook.type = proto;
1398                 dev_add_pack(&po->prot_hook);
1399                 sock_hold(sk);
1400                 po->running = 1;
1401         }
1402
1403         write_lock_bh(&net->packet.sklist_lock);
1404         sk_add_node(sk, &net->packet.sklist);
1405         sock_prot_inuse_add(net, &packet_proto, 1);
1406         write_unlock_bh(&net->packet.sklist_lock);
1407         return 0;
1408 out:
1409         return err;
1410 }
1411
1412 /*
1413  *      Pull a packet from our receive queue and hand it to the user.
1414  *      If necessary we block.
1415  */
1416
1417 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1418                           struct msghdr *msg, size_t len, int flags)
1419 {
1420         struct sock *sk = sock->sk;
1421         struct sk_buff *skb;
1422         int copied, err;
1423         struct sockaddr_ll *sll;
1424
1425         err = -EINVAL;
1426         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1427                 goto out;
1428
1429 #if 0
1430         /* What error should we return now? EUNATTACH? */
1431         if (pkt_sk(sk)->ifindex < 0)
1432                 return -ENODEV;
1433 #endif
1434
1435         /*
1436          *      Call the generic datagram receiver. This handles all sorts
1437          *      of horrible races and re-entrancy so we can forget about it
1438          *      in the protocol layers.
1439          *
1440          *      Now it will return ENETDOWN, if device have just gone down,
1441          *      but then it will block.
1442          */
1443
1444         skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1445
1446         /*
1447          *      An error occurred so return it. Because skb_recv_datagram()
1448          *      handles the blocking we don't see and worry about blocking
1449          *      retries.
1450          */
1451
1452         if (skb == NULL)
1453                 goto out;
1454
1455         /*
1456          *      If the address length field is there to be filled in, we fill
1457          *      it in now.
1458          */
1459
1460         sll = &PACKET_SKB_CB(skb)->sa.ll;
1461         if (sock->type == SOCK_PACKET)
1462                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1463         else
1464                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1465
1466         /*
1467          *      You lose any data beyond the buffer you gave. If it worries a
1468          *      user program they can ask the device for its MTU anyway.
1469          */
1470
1471         copied = skb->len;
1472         if (copied > len) {
1473                 copied = len;
1474                 msg->msg_flags |= MSG_TRUNC;
1475         }
1476
1477         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1478         if (err)
1479                 goto out_free;
1480
1481         sock_recv_timestamp(msg, sk, skb);
1482
1483         if (msg->msg_name)
1484                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1485                        msg->msg_namelen);
1486
1487         if (pkt_sk(sk)->auxdata) {
1488                 struct tpacket_auxdata aux;
1489
1490                 aux.tp_status = TP_STATUS_USER;
1491                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1492                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1493                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1494                 aux.tp_snaplen = skb->len;
1495                 aux.tp_mac = 0;
1496                 aux.tp_net = skb_network_offset(skb);
1497                 aux.tp_vlan_tci = skb->vlan_tci;
1498
1499                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1500         }
1501
1502         /*
1503          *      Free or return the buffer as appropriate. Again this
1504          *      hides all the races and re-entrancy issues from us.
1505          */
1506         err = (flags&MSG_TRUNC) ? skb->len : copied;
1507
1508 out_free:
1509         skb_free_datagram(sk, skb);
1510 out:
1511         return err;
1512 }
1513
1514 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1515                                int *uaddr_len, int peer)
1516 {
1517         struct net_device *dev;
1518         struct sock *sk = sock->sk;
1519
1520         if (peer)
1521                 return -EOPNOTSUPP;
1522
1523         uaddr->sa_family = AF_PACKET;
1524         dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1525         if (dev) {
1526                 strlcpy(uaddr->sa_data, dev->name, 15);
1527                 dev_put(dev);
1528         } else
1529                 memset(uaddr->sa_data, 0, 14);
1530         *uaddr_len = sizeof(*uaddr);
1531
1532         return 0;
1533 }
1534
1535 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1536                           int *uaddr_len, int peer)
1537 {
1538         struct net_device *dev;
1539         struct sock *sk = sock->sk;
1540         struct packet_sock *po = pkt_sk(sk);
1541         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1542
1543         if (peer)
1544                 return -EOPNOTSUPP;
1545
1546         sll->sll_family = AF_PACKET;
1547         sll->sll_ifindex = po->ifindex;
1548         sll->sll_protocol = po->num;
1549         dev = dev_get_by_index(sock_net(sk), po->ifindex);
1550         if (dev) {
1551                 sll->sll_hatype = dev->type;
1552                 sll->sll_halen = dev->addr_len;
1553                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1554                 dev_put(dev);
1555         } else {
1556                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1557                 sll->sll_halen = 0;
1558         }
1559         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1560
1561         return 0;
1562 }
1563
1564 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1565                          int what)
1566 {
1567         switch (i->type) {
1568         case PACKET_MR_MULTICAST:
1569                 if (what > 0)
1570                         return dev_mc_add(dev, i->addr, i->alen, 0);
1571                 else
1572                         return dev_mc_delete(dev, i->addr, i->alen, 0);
1573                 break;
1574         case PACKET_MR_PROMISC:
1575                 return dev_set_promiscuity(dev, what);
1576                 break;
1577         case PACKET_MR_ALLMULTI:
1578                 return dev_set_allmulti(dev, what);
1579                 break;
1580         case PACKET_MR_UNICAST:
1581                 if (what > 0)
1582                         return dev_unicast_add(dev, i->addr);
1583                 else
1584                         return dev_unicast_delete(dev, i->addr);
1585                 break;
1586         default:
1587                 break;
1588         }
1589         return 0;
1590 }
1591
1592 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1593 {
1594         for ( ; i; i = i->next) {
1595                 if (i->ifindex == dev->ifindex)
1596                         packet_dev_mc(dev, i, what);
1597         }
1598 }
1599
1600 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1601 {
1602         struct packet_sock *po = pkt_sk(sk);
1603         struct packet_mclist *ml, *i;
1604         struct net_device *dev;
1605         int err;
1606
1607         rtnl_lock();
1608
1609         err = -ENODEV;
1610         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1611         if (!dev)
1612                 goto done;
1613
1614         err = -EINVAL;
1615         if (mreq->mr_alen > dev->addr_len)
1616                 goto done;
1617
1618         err = -ENOBUFS;
1619         i = kmalloc(sizeof(*i), GFP_KERNEL);
1620         if (i == NULL)
1621                 goto done;
1622
1623         err = 0;
1624         for (ml = po->mclist; ml; ml = ml->next) {
1625                 if (ml->ifindex == mreq->mr_ifindex &&
1626                     ml->type == mreq->mr_type &&
1627                     ml->alen == mreq->mr_alen &&
1628                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1629                         ml->count++;
1630                         /* Free the new element ... */
1631                         kfree(i);
1632                         goto done;
1633                 }
1634         }
1635
1636         i->type = mreq->mr_type;
1637         i->ifindex = mreq->mr_ifindex;
1638         i->alen = mreq->mr_alen;
1639         memcpy(i->addr, mreq->mr_address, i->alen);
1640         i->count = 1;
1641         i->next = po->mclist;
1642         po->mclist = i;
1643         err = packet_dev_mc(dev, i, 1);
1644         if (err) {
1645                 po->mclist = i->next;
1646                 kfree(i);
1647         }
1648
1649 done:
1650         rtnl_unlock();
1651         return err;
1652 }
1653
1654 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1655 {
1656         struct packet_mclist *ml, **mlp;
1657
1658         rtnl_lock();
1659
1660         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1661                 if (ml->ifindex == mreq->mr_ifindex &&
1662                     ml->type == mreq->mr_type &&
1663                     ml->alen == mreq->mr_alen &&
1664                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1665                         if (--ml->count == 0) {
1666                                 struct net_device *dev;
1667                                 *mlp = ml->next;
1668                                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1669                                 if (dev) {
1670                                         packet_dev_mc(dev, ml, -1);
1671                                         dev_put(dev);
1672                                 }
1673                                 kfree(ml);
1674                         }
1675                         rtnl_unlock();
1676                         return 0;
1677                 }
1678         }
1679         rtnl_unlock();
1680         return -EADDRNOTAVAIL;
1681 }
1682
1683 static void packet_flush_mclist(struct sock *sk)
1684 {
1685         struct packet_sock *po = pkt_sk(sk);
1686         struct packet_mclist *ml;
1687
1688         if (!po->mclist)
1689                 return;
1690
1691         rtnl_lock();
1692         while ((ml = po->mclist) != NULL) {
1693                 struct net_device *dev;
1694
1695                 po->mclist = ml->next;
1696                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1697                 if (dev != NULL) {
1698                         packet_dev_mc(dev, ml, -1);
1699                         dev_put(dev);
1700                 }
1701                 kfree(ml);
1702         }
1703         rtnl_unlock();
1704 }
1705
1706 static int
1707 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1708 {
1709         struct sock *sk = sock->sk;
1710         struct packet_sock *po = pkt_sk(sk);
1711         int ret;
1712
1713         if (level != SOL_PACKET)
1714                 return -ENOPROTOOPT;
1715
1716         switch (optname) {
1717         case PACKET_ADD_MEMBERSHIP:
1718         case PACKET_DROP_MEMBERSHIP:
1719         {
1720                 struct packet_mreq_max mreq;
1721                 int len = optlen;
1722                 memset(&mreq, 0, sizeof(mreq));
1723                 if (len < sizeof(struct packet_mreq))
1724                         return -EINVAL;
1725                 if (len > sizeof(mreq))
1726                         len = sizeof(mreq);
1727                 if (copy_from_user(&mreq, optval, len))
1728                         return -EFAULT;
1729                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1730                         return -EINVAL;
1731                 if (optname == PACKET_ADD_MEMBERSHIP)
1732                         ret = packet_mc_add(sk, &mreq);
1733                 else
1734                         ret = packet_mc_drop(sk, &mreq);
1735                 return ret;
1736         }
1737
1738 #ifdef CONFIG_PACKET_MMAP
1739         case PACKET_RX_RING:
1740         case PACKET_TX_RING:
1741         {
1742                 struct tpacket_req req;
1743
1744                 if (optlen < sizeof(req))
1745                         return -EINVAL;
1746                 if (copy_from_user(&req, optval, sizeof(req)))
1747                         return -EFAULT;
1748                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1749         }
1750         case PACKET_COPY_THRESH:
1751         {
1752                 int val;
1753
1754                 if (optlen != sizeof(val))
1755                         return -EINVAL;
1756                 if (copy_from_user(&val, optval, sizeof(val)))
1757                         return -EFAULT;
1758
1759                 pkt_sk(sk)->copy_thresh = val;
1760                 return 0;
1761         }
1762         case PACKET_VERSION:
1763         {
1764                 int val;
1765
1766                 if (optlen != sizeof(val))
1767                         return -EINVAL;
1768                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1769                         return -EBUSY;
1770                 if (copy_from_user(&val, optval, sizeof(val)))
1771                         return -EFAULT;
1772                 switch (val) {
1773                 case TPACKET_V1:
1774                 case TPACKET_V2:
1775                         po->tp_version = val;
1776                         return 0;
1777                 default:
1778                         return -EINVAL;
1779                 }
1780         }
1781         case PACKET_RESERVE:
1782         {
1783                 unsigned int val;
1784
1785                 if (optlen != sizeof(val))
1786                         return -EINVAL;
1787                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1788                         return -EBUSY;
1789                 if (copy_from_user(&val, optval, sizeof(val)))
1790                         return -EFAULT;
1791                 po->tp_reserve = val;
1792                 return 0;
1793         }
1794         case PACKET_LOSS:
1795         {
1796                 unsigned int val;
1797
1798                 if (optlen != sizeof(val))
1799                         return -EINVAL;
1800                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1801                         return -EBUSY;
1802                 if (copy_from_user(&val, optval, sizeof(val)))
1803                         return -EFAULT;
1804                 po->tp_loss = !!val;
1805                 return 0;
1806         }
1807 #endif
1808         case PACKET_AUXDATA:
1809         {
1810                 int val;
1811
1812                 if (optlen < sizeof(val))
1813                         return -EINVAL;
1814                 if (copy_from_user(&val, optval, sizeof(val)))
1815                         return -EFAULT;
1816
1817                 po->auxdata = !!val;
1818                 return 0;
1819         }
1820         case PACKET_ORIGDEV:
1821         {
1822                 int val;
1823
1824                 if (optlen < sizeof(val))
1825                         return -EINVAL;
1826                 if (copy_from_user(&val, optval, sizeof(val)))
1827                         return -EFAULT;
1828
1829                 po->origdev = !!val;
1830                 return 0;
1831         }
1832         default:
1833                 return -ENOPROTOOPT;
1834         }
1835 }
1836
1837 static int packet_getsockopt(struct socket *sock, int level, int optname,
1838                              char __user *optval, int __user *optlen)
1839 {
1840         int len;
1841         int val;
1842         struct sock *sk = sock->sk;
1843         struct packet_sock *po = pkt_sk(sk);
1844         void *data;
1845         struct tpacket_stats st;
1846
1847         if (level != SOL_PACKET)
1848                 return -ENOPROTOOPT;
1849
1850         if (get_user(len, optlen))
1851                 return -EFAULT;
1852
1853         if (len < 0)
1854                 return -EINVAL;
1855
1856         switch (optname) {
1857         case PACKET_STATISTICS:
1858                 if (len > sizeof(struct tpacket_stats))
1859                         len = sizeof(struct tpacket_stats);
1860                 spin_lock_bh(&sk->sk_receive_queue.lock);
1861                 st = po->stats;
1862                 memset(&po->stats, 0, sizeof(st));
1863                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1864                 st.tp_packets += st.tp_drops;
1865
1866                 data = &st;
1867                 break;
1868         case PACKET_AUXDATA:
1869                 if (len > sizeof(int))
1870                         len = sizeof(int);
1871                 val = po->auxdata;
1872
1873                 data = &val;
1874                 break;
1875         case PACKET_ORIGDEV:
1876                 if (len > sizeof(int))
1877                         len = sizeof(int);
1878                 val = po->origdev;
1879
1880                 data = &val;
1881                 break;
1882 #ifdef CONFIG_PACKET_MMAP
1883         case PACKET_VERSION:
1884                 if (len > sizeof(int))
1885                         len = sizeof(int);
1886                 val = po->tp_version;
1887                 data = &val;
1888                 break;
1889         case PACKET_HDRLEN:
1890                 if (len > sizeof(int))
1891                         len = sizeof(int);
1892                 if (copy_from_user(&val, optval, len))
1893                         return -EFAULT;
1894                 switch (val) {
1895                 case TPACKET_V1:
1896                         val = sizeof(struct tpacket_hdr);
1897                         break;
1898                 case TPACKET_V2:
1899                         val = sizeof(struct tpacket2_hdr);
1900                         break;
1901                 default:
1902                         return -EINVAL;
1903                 }
1904                 data = &val;
1905                 break;
1906         case PACKET_RESERVE:
1907                 if (len > sizeof(unsigned int))
1908                         len = sizeof(unsigned int);
1909                 val = po->tp_reserve;
1910                 data = &val;
1911                 break;
1912         case PACKET_LOSS:
1913                 if (len > sizeof(unsigned int))
1914                         len = sizeof(unsigned int);
1915                 val = po->tp_loss;
1916                 data = &val;
1917                 break;
1918 #endif
1919         default:
1920                 return -ENOPROTOOPT;
1921         }
1922
1923         if (put_user(len, optlen))
1924                 return -EFAULT;
1925         if (copy_to_user(optval, data, len))
1926                 return -EFAULT;
1927         return 0;
1928 }
1929
1930
1931 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1932 {
1933         struct sock *sk;
1934         struct hlist_node *node;
1935         struct net_device *dev = data;
1936         struct net *net = dev_net(dev);
1937
1938         read_lock(&net->packet.sklist_lock);
1939         sk_for_each(sk, node, &net->packet.sklist) {
1940                 struct packet_sock *po = pkt_sk(sk);
1941
1942                 switch (msg) {
1943                 case NETDEV_UNREGISTER:
1944                         if (po->mclist)
1945                                 packet_dev_mclist(dev, po->mclist, -1);
1946                         /* fallthrough */
1947
1948                 case NETDEV_DOWN:
1949                         if (dev->ifindex == po->ifindex) {
1950                                 spin_lock(&po->bind_lock);
1951                                 if (po->running) {
1952                                         __dev_remove_pack(&po->prot_hook);
1953                                         __sock_put(sk);
1954                                         po->running = 0;
1955                                         sk->sk_err = ENETDOWN;
1956                                         if (!sock_flag(sk, SOCK_DEAD))
1957                                                 sk->sk_error_report(sk);
1958                                 }
1959                                 if (msg == NETDEV_UNREGISTER) {
1960                                         po->ifindex = -1;
1961                                         po->prot_hook.dev = NULL;
1962                                 }
1963                                 spin_unlock(&po->bind_lock);
1964                         }
1965                         break;
1966                 case NETDEV_UP:
1967                         spin_lock(&po->bind_lock);
1968                         if (dev->ifindex == po->ifindex && po->num &&
1969                             !po->running) {
1970                                 dev_add_pack(&po->prot_hook);
1971                                 sock_hold(sk);
1972                                 po->running = 1;
1973                         }
1974                         spin_unlock(&po->bind_lock);
1975                         break;
1976                 }
1977         }
1978         read_unlock(&net->packet.sklist_lock);
1979         return NOTIFY_DONE;
1980 }
1981
1982
1983 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1984                         unsigned long arg)
1985 {
1986         struct sock *sk = sock->sk;
1987
1988         switch (cmd) {
1989         case SIOCOUTQ:
1990         {
1991                 int amount = sk_wmem_alloc_get(sk);
1992
1993                 return put_user(amount, (int __user *)arg);
1994         }
1995         case SIOCINQ:
1996         {
1997                 struct sk_buff *skb;
1998                 int amount = 0;
1999
2000                 spin_lock_bh(&sk->sk_receive_queue.lock);
2001                 skb = skb_peek(&sk->sk_receive_queue);
2002                 if (skb)
2003                         amount = skb->len;
2004                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2005                 return put_user(amount, (int __user *)arg);
2006         }
2007         case SIOCGSTAMP:
2008                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2009         case SIOCGSTAMPNS:
2010                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2011
2012 #ifdef CONFIG_INET
2013         case SIOCADDRT:
2014         case SIOCDELRT:
2015         case SIOCDARP:
2016         case SIOCGARP:
2017         case SIOCSARP:
2018         case SIOCGIFADDR:
2019         case SIOCSIFADDR:
2020         case SIOCGIFBRDADDR:
2021         case SIOCSIFBRDADDR:
2022         case SIOCGIFNETMASK:
2023         case SIOCSIFNETMASK:
2024         case SIOCGIFDSTADDR:
2025         case SIOCSIFDSTADDR:
2026         case SIOCSIFFLAGS:
2027                 if (!net_eq(sock_net(sk), &init_net))
2028                         return -ENOIOCTLCMD;
2029                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2030 #endif
2031
2032         default:
2033                 return -ENOIOCTLCMD;
2034         }
2035         return 0;
2036 }
2037
2038 #ifndef CONFIG_PACKET_MMAP
2039 #define packet_mmap sock_no_mmap
2040 #define packet_poll datagram_poll
2041 #else
2042
2043 static unsigned int packet_poll(struct file *file, struct socket *sock,
2044                                 poll_table *wait)
2045 {
2046         struct sock *sk = sock->sk;
2047         struct packet_sock *po = pkt_sk(sk);
2048         unsigned int mask = datagram_poll(file, sock, wait);
2049
2050         spin_lock_bh(&sk->sk_receive_queue.lock);
2051         if (po->rx_ring.pg_vec) {
2052                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2053                         mask |= POLLIN | POLLRDNORM;
2054         }
2055         spin_unlock_bh(&sk->sk_receive_queue.lock);
2056         spin_lock_bh(&sk->sk_write_queue.lock);
2057         if (po->tx_ring.pg_vec) {
2058                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2059                         mask |= POLLOUT | POLLWRNORM;
2060         }
2061         spin_unlock_bh(&sk->sk_write_queue.lock);
2062         return mask;
2063 }
2064
2065
2066 /* Dirty? Well, I still did not learn better way to account
2067  * for user mmaps.
2068  */
2069
2070 static void packet_mm_open(struct vm_area_struct *vma)
2071 {
2072         struct file *file = vma->vm_file;
2073         struct socket *sock = file->private_data;
2074         struct sock *sk = sock->sk;
2075
2076         if (sk)
2077                 atomic_inc(&pkt_sk(sk)->mapped);
2078 }
2079
2080 static void packet_mm_close(struct vm_area_struct *vma)
2081 {
2082         struct file *file = vma->vm_file;
2083         struct socket *sock = file->private_data;
2084         struct sock *sk = sock->sk;
2085
2086         if (sk)
2087                 atomic_dec(&pkt_sk(sk)->mapped);
2088 }
2089
2090 static const struct vm_operations_struct packet_mmap_ops = {
2091         .open   =       packet_mm_open,
2092         .close  =       packet_mm_close,
2093 };
2094
/*
 * free_pg_vec - release a ring's block vector.
 * @pg_vec: array of @len block addresses; NULL entries are skipped
 * @order:  page order each block was allocated with
 * @len:    number of entries in @pg_vec
 *
 * Frees every allocated block and then the array itself.
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	unsigned int i;	/* same signedness as @len: no mixed comparison */

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i]))
			free_pages((unsigned long) pg_vec[i], order);
	}
	kfree(pg_vec);
}
2105
2106 static inline char *alloc_one_pg_vec_page(unsigned long order)
2107 {
2108         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2109
2110         return (char *) __get_free_pages(gfp_flags, order);
2111 }
2112
2113 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2114 {
2115         unsigned int block_nr = req->tp_block_nr;
2116         char **pg_vec;
2117         int i;
2118
2119         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2120         if (unlikely(!pg_vec))
2121                 goto out;
2122
2123         for (i = 0; i < block_nr; i++) {
2124                 pg_vec[i] = alloc_one_pg_vec_page(order);
2125                 if (unlikely(!pg_vec[i]))
2126                         goto out_free_pgvec;
2127         }
2128
2129 out:
2130         return pg_vec;
2131
2132 out_free_pgvec:
2133         free_pg_vec(pg_vec, order, block_nr);
2134         pg_vec = NULL;
2135         goto out;
2136 }
2137
2138 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2139                 int closing, int tx_ring)
2140 {
2141         char **pg_vec = NULL;
2142         struct packet_sock *po = pkt_sk(sk);
2143         int was_running, order = 0;
2144         struct packet_ring_buffer *rb;
2145         struct sk_buff_head *rb_queue;
2146         __be16 num;
2147         int err;
2148
2149         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2150         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2151
2152         err = -EBUSY;
2153         if (!closing) {
2154                 if (atomic_read(&po->mapped))
2155                         goto out;
2156                 if (atomic_read(&rb->pending))
2157                         goto out;
2158         }
2159
2160         if (req->tp_block_nr) {
2161                 /* Sanity tests and some calculations */
2162                 err = -EBUSY;
2163                 if (unlikely(rb->pg_vec))
2164                         goto out;
2165
2166                 switch (po->tp_version) {
2167                 case TPACKET_V1:
2168                         po->tp_hdrlen = TPACKET_HDRLEN;
2169                         break;
2170                 case TPACKET_V2:
2171                         po->tp_hdrlen = TPACKET2_HDRLEN;
2172                         break;
2173                 }
2174
2175                 err = -EINVAL;
2176                 if (unlikely((int)req->tp_block_size <= 0))
2177                         goto out;
2178                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2179                         goto out;
2180                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2181                                         po->tp_reserve))
2182                         goto out;
2183                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2184                         goto out;
2185
2186                 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2187                 if (unlikely(rb->frames_per_block <= 0))
2188                         goto out;
2189                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2190                                         req->tp_frame_nr))
2191                         goto out;
2192
2193                 err = -ENOMEM;
2194                 order = get_order(req->tp_block_size);
2195                 pg_vec = alloc_pg_vec(req, order);
2196                 if (unlikely(!pg_vec))
2197                         goto out;
2198         }
2199         /* Done */
2200         else {
2201                 err = -EINVAL;
2202                 if (unlikely(req->tp_frame_nr))
2203                         goto out;
2204         }
2205
2206         lock_sock(sk);
2207
2208         /* Detach socket from network */
2209         spin_lock(&po->bind_lock);
2210         was_running = po->running;
2211         num = po->num;
2212         if (was_running) {
2213                 __dev_remove_pack(&po->prot_hook);
2214                 po->num = 0;
2215                 po->running = 0;
2216                 __sock_put(sk);
2217         }
2218         spin_unlock(&po->bind_lock);
2219
2220         synchronize_net();
2221
2222         err = -EBUSY;
2223         mutex_lock(&po->pg_vec_lock);
2224         if (closing || atomic_read(&po->mapped) == 0) {
2225                 err = 0;
2226 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2227                 spin_lock_bh(&rb_queue->lock);
2228                 pg_vec = XC(rb->pg_vec, pg_vec);
2229                 rb->frame_max = (req->tp_frame_nr - 1);
2230                 rb->head = 0;
2231                 rb->frame_size = req->tp_frame_size;
2232                 spin_unlock_bh(&rb_queue->lock);
2233
2234                 order = XC(rb->pg_vec_order, order);
2235                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2236
2237                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2238                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2239                                                 tpacket_rcv : packet_rcv;
2240                 skb_queue_purge(rb_queue);
2241 #undef XC
2242                 if (atomic_read(&po->mapped))
2243                         pr_err("packet_mmap: vma is busy: %d\n",
2244                                atomic_read(&po->mapped));
2245         }
2246         mutex_unlock(&po->pg_vec_lock);
2247
2248         spin_lock(&po->bind_lock);
2249         if (was_running && !po->running) {
2250                 sock_hold(sk);
2251                 po->running = 1;
2252                 po->num = num;
2253                 dev_add_pack(&po->prot_hook);
2254         }
2255         spin_unlock(&po->bind_lock);
2256
2257         release_sock(sk);
2258
2259         if (pg_vec)
2260                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2261 out:
2262         return err;
2263 }
2264
2265 static int packet_mmap(struct file *file, struct socket *sock,
2266                 struct vm_area_struct *vma)
2267 {
2268         struct sock *sk = sock->sk;
2269         struct packet_sock *po = pkt_sk(sk);
2270         unsigned long size, expected_size;
2271         struct packet_ring_buffer *rb;
2272         unsigned long start;
2273         int err = -EINVAL;
2274         int i;
2275
2276         if (vma->vm_pgoff)
2277                 return -EINVAL;
2278
2279         mutex_lock(&po->pg_vec_lock);
2280
2281         expected_size = 0;
2282         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2283                 if (rb->pg_vec) {
2284                         expected_size += rb->pg_vec_len
2285                                                 * rb->pg_vec_pages
2286                                                 * PAGE_SIZE;
2287                 }
2288         }
2289
2290         if (expected_size == 0)
2291                 goto out;
2292
2293         size = vma->vm_end - vma->vm_start;
2294         if (size != expected_size)
2295                 goto out;
2296
2297         start = vma->vm_start;
2298         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2299                 if (rb->pg_vec == NULL)
2300                         continue;
2301
2302                 for (i = 0; i < rb->pg_vec_len; i++) {
2303                         struct page *page = virt_to_page(rb->pg_vec[i]);
2304                         int pg_num;
2305
2306                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2307                                         pg_num++, page++) {
2308                                 err = vm_insert_page(vma, start, page);
2309                                 if (unlikely(err))
2310                                         goto out;
2311                                 start += PAGE_SIZE;
2312                         }
2313                 }
2314         }
2315
2316         atomic_inc(&po->mapped);
2317         vma->vm_ops = &packet_mmap_ops;
2318         err = 0;
2319
2320 out:
2321         mutex_unlock(&po->pg_vec_lock);
2322         return err;
2323 }
2324 #endif
2325
2326
2327 static const struct proto_ops packet_ops_spkt = {
2328         .family =       PF_PACKET,
2329         .owner =        THIS_MODULE,
2330         .release =      packet_release,
2331         .bind =         packet_bind_spkt,
2332         .connect =      sock_no_connect,
2333         .socketpair =   sock_no_socketpair,
2334         .accept =       sock_no_accept,
2335         .getname =      packet_getname_spkt,
2336         .poll =         datagram_poll,
2337         .ioctl =        packet_ioctl,
2338         .listen =       sock_no_listen,
2339         .shutdown =     sock_no_shutdown,
2340         .setsockopt =   sock_no_setsockopt,
2341         .getsockopt =   sock_no_getsockopt,
2342         .sendmsg =      packet_sendmsg_spkt,
2343         .recvmsg =      packet_recvmsg,
2344         .mmap =         sock_no_mmap,
2345         .sendpage =     sock_no_sendpage,
2346 };
2347
2348 static const struct proto_ops packet_ops = {
2349         .family =       PF_PACKET,
2350         .owner =        THIS_MODULE,
2351         .release =      packet_release,
2352         .bind =         packet_bind,
2353         .connect =      sock_no_connect,
2354         .socketpair =   sock_no_socketpair,
2355         .accept =       sock_no_accept,
2356         .getname =      packet_getname,
2357         .poll =         packet_poll,
2358         .ioctl =        packet_ioctl,
2359         .listen =       sock_no_listen,
2360         .shutdown =     sock_no_shutdown,
2361         .setsockopt =   packet_setsockopt,
2362         .getsockopt =   packet_getsockopt,
2363         .sendmsg =      packet_sendmsg,
2364         .recvmsg =      packet_recvmsg,
2365         .mmap =         packet_mmap,
2366         .sendpage =     sock_no_sendpage,
2367 };
2368
2369 static const struct net_proto_family packet_family_ops = {
2370         .family =       PF_PACKET,
2371         .create =       packet_create,
2372         .owner  =       THIS_MODULE,
2373 };
2374
2375 static struct notifier_block packet_netdev_notifier = {
2376         .notifier_call =        packet_notifier,
2377 };
2378
2379 #ifdef CONFIG_PROC_FS
2380 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2381 {
2382         struct sock *s;
2383         struct hlist_node *node;
2384
2385         sk_for_each(s, node, &net->packet.sklist) {
2386                 if (!off--)
2387                         return s;
2388         }
2389         return NULL;
2390 }
2391
2392 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2393         __acquires(seq_file_net(seq)->packet.sklist_lock)
2394 {
2395         struct net *net = seq_file_net(seq);
2396         read_lock(&net->packet.sklist_lock);
2397         return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2398 }
2399
2400 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2401 {
2402         struct net *net = seq_file_net(seq);
2403         ++*pos;
2404         return  (v == SEQ_START_TOKEN)
2405                 ? sk_head(&net->packet.sklist)
2406                 : sk_next((struct sock *)v) ;
2407 }
2408
2409 static void packet_seq_stop(struct seq_file *seq, void *v)
2410         __releases(seq_file_net(seq)->packet.sklist_lock)
2411 {
2412         struct net *net = seq_file_net(seq);
2413         read_unlock(&net->packet.sklist_lock);
2414 }
2415
2416 static int packet_seq_show(struct seq_file *seq, void *v)
2417 {
2418         if (v == SEQ_START_TOKEN)
2419                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2420         else {
2421                 struct sock *s = v;
2422                 const struct packet_sock *po = pkt_sk(s);
2423
2424                 seq_printf(seq,
2425                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2426                            s,
2427                            atomic_read(&s->sk_refcnt),
2428                            s->sk_type,
2429                            ntohs(po->num),
2430                            po->ifindex,
2431                            po->running,
2432                            atomic_read(&s->sk_rmem_alloc),
2433                            sock_i_uid(s),
2434                            sock_i_ino(s));
2435         }
2436
2437         return 0;
2438 }
2439
2440 static const struct seq_operations packet_seq_ops = {
2441         .start  = packet_seq_start,
2442         .next   = packet_seq_next,
2443         .stop   = packet_seq_stop,
2444         .show   = packet_seq_show,
2445 };
2446
2447 static int packet_seq_open(struct inode *inode, struct file *file)
2448 {
2449         return seq_open_net(inode, file, &packet_seq_ops,
2450                             sizeof(struct seq_net_private));
2451 }
2452
2453 static const struct file_operations packet_seq_fops = {
2454         .owner          = THIS_MODULE,
2455         .open           = packet_seq_open,
2456         .read           = seq_read,
2457         .llseek         = seq_lseek,
2458         .release        = seq_release_net,
2459 };
2460
2461 #endif
2462
2463 static int packet_net_init(struct net *net)
2464 {
2465         rwlock_init(&net->packet.sklist_lock);
2466         INIT_HLIST_HEAD(&net->packet.sklist);
2467
2468         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2469                 return -ENOMEM;
2470
2471         return 0;
2472 }
2473
/* Per-namespace teardown: remove the "packet" proc entry again. */
static void packet_net_exit(struct net *net)
{
	proc_net_remove(net, "packet");
}
2478
2479 static struct pernet_operations packet_net_ops = {
2480         .init = packet_net_init,
2481         .exit = packet_net_exit,
2482 };
2483
2484
2485 static void __exit packet_exit(void)
2486 {
2487         unregister_netdevice_notifier(&packet_netdev_notifier);
2488         unregister_pernet_subsys(&packet_net_ops);
2489         sock_unregister(PF_PACKET);
2490         proto_unregister(&packet_proto);
2491 }
2492
2493 static int __init packet_init(void)
2494 {
2495         int rc = proto_register(&packet_proto, 0);
2496
2497         if (rc != 0)
2498                 goto out;
2499
2500         sock_register(&packet_family_ops);
2501         register_pernet_subsys(&packet_net_ops);
2502         register_netdevice_notifier(&packet_netdev_notifier);
2503 out:
2504         return rc;
2505 }
2506
2507 module_init(packet_init);
2508 module_exit(packet_exit);
2509 MODULE_LICENSE("GPL");
2510 MODULE_ALIAS_NETPROTO(PF_PACKET);