net: Move && and || to end of previous line
net/packet/af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *              Johann Baudy    :       Added TX RING.
43  *
44  *              This program is free software; you can redistribute it and/or
45  *              modify it under the terms of the GNU General Public License
46  *              as published by the Free Software Foundation; either version
47  *              2 of the License, or (at your option) any later version.
48  *
49  */
50
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82 #include <linux/if_vlan.h>
83
84 #ifdef CONFIG_INET
85 #include <net/inet_common.h>
86 #endif
87
88 /*
89    Assumptions:
90    - if a device has no dev->hard_header routine, it adds and removes the
91      ll header itself. In this case the ll header is invisible outside the
92      device, but higher levels must still reserve dev->hard_header_len.
93      Some devices are clever enough to reallocate the skb when the header
94      does not fit into the reserved space (tunnels); others are not
95      (PPP).
96    - a packet socket receives packets with the ll header already pulled,
97      so SOCK_RAW has to push it back.
98
99 On receive:
100 -----------
101
102 Incoming, dev->hard_header!=NULL
103    mac_header -> ll header
104    data       -> data
105
106 Outgoing, dev->hard_header!=NULL
107    mac_header -> ll header
108    data       -> ll header
109
110 Incoming, dev->hard_header==NULL
111    mac_header -> UNKNOWN position. It very likely points to the ll
112                  header. PPP does this, which is wrong, because it
113                  introduces asymmetry between the rx and tx paths.
114    data       -> data
115
116 Outgoing, dev->hard_header==NULL
117    mac_header -> data. ll header is still not built!
118    data       -> data
119
120 Summary
121   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
122
123
124 On transmit:
125 ------------
126
127 dev->hard_header != NULL
128    mac_header -> ll header
129    data       -> ll header
130
131 dev->hard_header == NULL (ll header is added by device, we cannot control it)
132    mac_header -> data
133    data       -> data
134
135    We should set nh.raw on output to the correct position;
136    the packet classifier depends on it.
137  */
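/*
 * Editorial sketch (not part of the original source): a minimal userspace
 * view of the header positions described above. With SOCK_RAW the ll
 * header is part of the data you read; with SOCK_DGRAM it has already
 * been pulled and is only reported through sockaddr_ll. Requires
 * CAP_NET_RAW.
 *
 *     #include <sys/socket.h>
 *     #include <linux/if_ether.h>
 *     #include <arpa/inet.h>
 *
 *     int raw   = socket(AF_PACKET, SOCK_RAW,   htons(ETH_P_ALL));
 *     int dgram = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *     char buf[2048];
 *
 *     recv(raw, buf, sizeof(buf), 0);    // buf[0] = first byte of the
 *                                        // Ethernet (ll) header
 *     recv(dgram, buf, sizeof(buf), 0);  // buf[0] = first byte of the
 *                                        // network header; ll header gone
 */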
138
139 /* Private packet socket structures. */
140
141 struct packet_mclist {
142         struct packet_mclist    *next;
143         int                     ifindex;
144         int                     count;
145         unsigned short          type;
146         unsigned short          alen;
147         unsigned char           addr[MAX_ADDR_LEN];
148 };
149 /* identical to struct packet_mreq except it has
150  * a longer address field.
151  */
152 struct packet_mreq_max {
153         int             mr_ifindex;
154         unsigned short  mr_type;
155         unsigned short  mr_alen;
156         unsigned char   mr_address[MAX_ADDR_LEN];
157 };
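/*
 * Editorial sketch (not part of the original source): userspace reaches
 * this structure through setsockopt() with a plain struct packet_mreq;
 * packet_setsockopt() below copies it into the larger packet_mreq_max.
 * Example: enable promiscuous mode for this socket only ('ifindex' from
 * if_nametoindex()):
 *
 *     struct packet_mreq mreq = {
 *             .mr_ifindex = ifindex,
 *             .mr_type    = PACKET_MR_PROMISC,
 *     };
 *     setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */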
158
159 #ifdef CONFIG_PACKET_MMAP
160 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
161                 int closing, int tx_ring);
162
163 struct packet_ring_buffer {
164         char                    **pg_vec;
165         unsigned int            head;
166         unsigned int            frames_per_block;
167         unsigned int            frame_size;
168         unsigned int            frame_max;
169
170         unsigned int            pg_vec_order;
171         unsigned int            pg_vec_pages;
172         unsigned int            pg_vec_len;
173
174         atomic_t                pending;
175 };
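/*
 * Editorial note (not part of the original source): pg_vec is a two-level
 * layout: pg_vec_len blocks of 2^pg_vec_order pages each, every block
 * holding frames_per_block frames of frame_size bytes; frame_max is the
 * highest valid frame index (frame count - 1). packet_lookup_frame()
 * below maps a flat frame index to (block, offset) with a divide and a
 * modulo by frames_per_block.
 */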
176
177 struct packet_sock;
178 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
179 #endif
180
181 static void packet_flush_mclist(struct sock *sk);
182
183 struct packet_sock {
184         /* struct sock has to be the first member of packet_sock */
185         struct sock             sk;
186         struct tpacket_stats    stats;
187 #ifdef CONFIG_PACKET_MMAP
188         struct packet_ring_buffer       rx_ring;
189         struct packet_ring_buffer       tx_ring;
190         int                     copy_thresh;
191 #endif
192         spinlock_t              bind_lock;
193         struct mutex            pg_vec_lock;
194         unsigned int            running:1,      /* prot_hook is attached*/
195                                 auxdata:1,
196                                 origdev:1;
197         int                     ifindex;        /* bound device         */
198         __be16                  num;
199         struct packet_mclist    *mclist;
200 #ifdef CONFIG_PACKET_MMAP
201         atomic_t                mapped;
202         enum tpacket_versions   tp_version;
203         unsigned int            tp_hdrlen;
204         unsigned int            tp_reserve;
205         unsigned int            tp_loss:1;
206 #endif
207         struct packet_type      prot_hook ____cacheline_aligned_in_smp;
208 };
209
210 struct packet_skb_cb {
211         unsigned int origlen;
212         union {
213                 struct sockaddr_pkt pkt;
214                 struct sockaddr_ll ll;
215         } sa;
216 };
217
218 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
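/*
 * Editorial note (not part of the original source): sa.ll is deliberately
 * allowed to overflow past sizeof(struct sockaddr_ll) into the tail of
 * skb->cb[] for hardware addresses longer than 8 bytes; the BUILD_BUG_ON()
 * in packet_rcv() verifies that MAX_ADDR_LEN still fits.
 */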
219
220 #ifdef CONFIG_PACKET_MMAP
221
222 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
223 {
224         union {
225                 struct tpacket_hdr *h1;
226                 struct tpacket2_hdr *h2;
227                 void *raw;
228         } h;
229
230         h.raw = frame;
231         switch (po->tp_version) {
232         case TPACKET_V1:
233                 h.h1->tp_status = status;
234                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
235                 break;
236         case TPACKET_V2:
237                 h.h2->tp_status = status;
238                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
239                 break;
240         default:
241                 pr_err("TPACKET version not supported\n");
242                 BUG();
243         }
244
245         smp_wmb();
246 }
247
248 static int __packet_get_status(struct packet_sock *po, void *frame)
249 {
250         union {
251                 struct tpacket_hdr *h1;
252                 struct tpacket2_hdr *h2;
253                 void *raw;
254         } h;
255
256         smp_rmb();
257
258         h.raw = frame;
259         switch (po->tp_version) {
260         case TPACKET_V1:
261                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
262                 return h.h1->tp_status;
263         case TPACKET_V2:
264                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
265                 return h.h2->tp_status;
266         default:
267                 pr_err("TPACKET version not supported\n");
268                 BUG();
269                 return 0;
270         }
271 }
272
273 static void *packet_lookup_frame(struct packet_sock *po,
274                 struct packet_ring_buffer *rb,
275                 unsigned int position,
276                 int status)
277 {
278         unsigned int pg_vec_pos, frame_offset;
279         union {
280                 struct tpacket_hdr *h1;
281                 struct tpacket2_hdr *h2;
282                 void *raw;
283         } h;
284
285         pg_vec_pos = position / rb->frames_per_block;
286         frame_offset = position % rb->frames_per_block;
287
288         h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
289
290         if (status != __packet_get_status(po, h.raw))
291                 return NULL;
292
293         return h.raw;
294 }
295
296 static inline void *packet_current_frame(struct packet_sock *po,
297                 struct packet_ring_buffer *rb,
298                 int status)
299 {
300         return packet_lookup_frame(po, rb, rb->head, status);
301 }
302
303 static inline void *packet_previous_frame(struct packet_sock *po,
304                 struct packet_ring_buffer *rb,
305                 int status)
306 {
307         unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
308         return packet_lookup_frame(po, rb, previous, status);
309 }
310
311 static inline void packet_increment_head(struct packet_ring_buffer *buff)
312 {
313         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
314 }
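/*
 * Editorial sketch (not part of the original source): the userspace half
 * of the RX ring that packet_lookup_frame()/packet_increment_head() serve.
 * A frame belongs to the kernel while tp_status == TP_STATUS_KERNEL and is
 * handed over with TP_STATUS_USER set. Minimal TPACKET_V1 loop (handle()
 * is a hypothetical consumer; error handling omitted):
 *
 *     struct tpacket_req req = {
 *             .tp_block_size = 4096,
 *             .tp_block_nr   = 64,
 *             .tp_frame_size = 2048,
 *             .tp_frame_nr   = 128,   // 64 blocks * 2 frames per block
 *     };
 *     setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *     char *ring = mmap(NULL, (size_t)req.tp_block_size * req.tp_block_nr,
 *                       PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 *     for (unsigned int i = 0; ; i = (i + 1) % req.tp_frame_nr) {
 *             struct tpacket_hdr *hdr =
 *                     (void *)(ring + i * req.tp_frame_size);
 *             struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *             while (!(hdr->tp_status & TP_STATUS_USER))
 *                     poll(&pfd, 1, -1);
 *             // packet data starts tp_mac bytes into the frame
 *             handle((char *)hdr + hdr->tp_mac, hdr->tp_snaplen);
 *             hdr->tp_status = TP_STATUS_KERNEL;      // hand frame back
 *     }
 */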
315
316 #endif
317
318 static inline struct packet_sock *pkt_sk(struct sock *sk)
319 {
320         return (struct packet_sock *)sk;
321 }
322
323 static void packet_sock_destruct(struct sock *sk)
324 {
325         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
326         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
327
328         if (!sock_flag(sk, SOCK_DEAD)) {
329                 pr_err("Attempt to release alive packet socket: %p\n", sk);
330                 return;
331         }
332
333         sk_refcnt_debug_dec(sk);
334 }
335
336
337 static const struct proto_ops packet_ops;
338
339 static const struct proto_ops packet_ops_spkt;
340
341 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
342                            struct packet_type *pt, struct net_device *orig_dev)
343 {
344         struct sock *sk;
345         struct sockaddr_pkt *spkt;
346
347         /*
348          *      When we registered the protocol we saved the socket in the data
349          *      field for just this event.
350          */
351
352         sk = pt->af_packet_priv;
353
354         /*
355          *      Yank back the headers [hope the device set this
356          *      right or kerboom...]
357          *
358          *      Incoming packets have the ll header pulled;
359          *      push it back.
360          *
361          *      For outgoing ones skb->data == skb_mac_header(skb),
362          *      so this procedure is a no-op.
363          */
364
365         if (skb->pkt_type == PACKET_LOOPBACK)
366                 goto out;
367
368         if (!net_eq(dev_net(dev), sock_net(sk)))
369                 goto out;
370
371         skb = skb_share_check(skb, GFP_ATOMIC);
372         if (skb == NULL)
373                 goto oom;
374
375         /* drop any routing info */
376         skb_dst_drop(skb);
377
378         /* drop conntrack reference */
379         nf_reset(skb);
380
381         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
382
383         skb_push(skb, skb->data - skb_mac_header(skb));
384
385         /*
386          *      The SOCK_PACKET socket receives _all_ frames.
387          */
388
389         spkt->spkt_family = dev->type;
390         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
391         spkt->spkt_protocol = skb->protocol;
392
393         /*
394          *      Charge the memory to the socket. This is done specifically
395          *      to prevent sockets using all the memory up.
396          */
397
398         if (sock_queue_rcv_skb(sk, skb) == 0)
399                 return 0;
400
401 out:
402         kfree_skb(skb);
403 oom:
404         return 0;
405 }
406
407
408 /*
409  *      Output a raw packet to a device layer. This bypasses all the other
410  *      protocol layers and you must therefore supply it with a complete frame
411  */
412
413 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
414                                struct msghdr *msg, size_t len)
415 {
416         struct sock *sk = sock->sk;
417         struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
418         struct sk_buff *skb;
419         struct net_device *dev;
420         __be16 proto = 0;
421         int err;
422
423         /*
424          *      Get and verify the address.
425          */
426
427         if (saddr) {
428                 if (msg->msg_namelen < sizeof(struct sockaddr))
429                         return -EINVAL;
430                 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
431                         proto = saddr->spkt_protocol;
432         } else
433                 return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
434
435         /*
436          *      Find the device first to size check it
437          */
438
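        /* spkt_device is 14 bytes; index 13 forces NUL termination */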
439         saddr->spkt_device[13] = 0;
440         rcu_read_lock();
441         dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
442         err = -ENODEV;
443         if (dev == NULL)
444                 goto out_unlock;
445
446         err = -ENETDOWN;
447         if (!(dev->flags & IFF_UP))
448                 goto out_unlock;
449
450         /*
451          * You may not queue a frame bigger than the mtu. This is the lowest level
452          * raw protocol and you must do your own fragmentation at this level.
453          */
454
455         err = -EMSGSIZE;
456         if (len > dev->mtu + dev->hard_header_len)
457                 goto out_unlock;
458
459         err = -ENOBUFS;
460         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
461
462         /*
463          * If the write buffer is full, then tough. At this level the user
464          * gets to deal with the problem - do your own algorithmic backoffs.
465          * That's far more flexible.
466          */
467
468         if (skb == NULL)
469                 goto out_unlock;
470
471         /*
472          *      Fill it in
473          */
474
475         /* FIXME: Save some space for broken drivers that write a
476          * hard header at transmission time by themselves. PPP is the
477          * notable one here. This should really be fixed at the driver level.
478          */
479         skb_reserve(skb, LL_RESERVED_SPACE(dev));
480         skb_reset_network_header(skb);
481
482         /* Try to align data part correctly */
483         if (dev->header_ops) {
484                 skb->data -= dev->hard_header_len;
485                 skb->tail -= dev->hard_header_len;
486                 if (len < dev->hard_header_len)
487                         skb_reset_network_header(skb);
488         }
489
490         /* Returns -EFAULT on error */
491         err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
492         skb->protocol = proto;
493         skb->dev = dev;
494         skb->priority = sk->sk_priority;
495         skb->mark = sk->sk_mark;
496         if (err)
497                 goto out_free;
498
499         /*
500          *      Now send it
501          */
502
503         dev_queue_xmit(skb);
504         rcu_read_unlock();
505         return len;
506
507 out_free:
508         kfree_skb(skb);
509 out_unlock:
510         rcu_read_unlock();
511         return err;
512 }
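/*
 * Editorial sketch (not part of the original source): a caller of the
 * (obsolete) SOCK_PACKET transmit path above. The frame must be complete,
 * ll header included, and the device is named in sockaddr_pkt ("eth0" is
 * just an example; 'frame'/'len' hold a ready Ethernet frame):
 *
 *     struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *
 *     strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *     spkt.spkt_protocol = htons(ETH_P_IP);
 *     sendto(fd, frame, len, 0, (struct sockaddr *)&spkt, sizeof(spkt));
 */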
513
514 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
515                                       unsigned int res)
516 {
517         struct sk_filter *filter;
518
519         rcu_read_lock_bh();
520         filter = rcu_dereference(sk->sk_filter);
521         if (filter != NULL)
522                 res = sk_run_filter(skb, filter->insns, filter->len);
523         rcu_read_unlock_bh();
524
525         return res;
526 }
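/*
 * Editorial sketch (not part of the original source): the sk_filter
 * consulted above is installed with SO_ATTACH_FILTER. A classic BPF
 * program returning 0 drops the packet; a non-zero return caps the
 * snapshot length. Example: accept everything, truncated to 96 bytes:
 *
 *     struct sock_filter insns[] = {
 *             { BPF_RET | BPF_K, 0, 0, 96 },  // return 96
 *     };
 *     struct sock_fprog prog = { .len = 1, .filter = insns };
 *
 *     setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */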
527
528 /*
529    This function does lazy skb cloning in the hope that most packets
530    are discarded by BPF.
531
532    Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
533    and skb->cb are mangled. It works because (and until) packets
534    arriving here are owned by the current CPU. Output packets are cloned
535    by dev_queue_xmit_nit(), input packets are processed by net_bh
536    sequentially, so if we return the skb to its original state on exit,
537    we will not harm anyone.
538  */
539
540 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
541                       struct packet_type *pt, struct net_device *orig_dev)
542 {
543         struct sock *sk;
544         struct sockaddr_ll *sll;
545         struct packet_sock *po;
546         u8 *skb_head = skb->data;
547         int skb_len = skb->len;
548         unsigned int snaplen, res;
549
550         if (skb->pkt_type == PACKET_LOOPBACK)
551                 goto drop;
552
553         sk = pt->af_packet_priv;
554         po = pkt_sk(sk);
555
556         if (!net_eq(dev_net(dev), sock_net(sk)))
557                 goto drop;
558
559         skb->dev = dev;
560
561         if (dev->header_ops) {
562                 /* The device has an explicit notion of ll header,
563                    exported to higher levels.
564
565                    Otherwise, the device hides the details of its frame
566                    structure, so the corresponding packet header is
567                    never delivered to the user.
568                  */
569                 if (sk->sk_type != SOCK_DGRAM)
570                         skb_push(skb, skb->data - skb_mac_header(skb));
571                 else if (skb->pkt_type == PACKET_OUTGOING) {
572                         /* Special case: outgoing packets have ll header at head */
573                         skb_pull(skb, skb_network_offset(skb));
574                 }
575         }
576
577         snaplen = skb->len;
578
579         res = run_filter(skb, sk, snaplen);
580         if (!res)
581                 goto drop_n_restore;
582         if (snaplen > res)
583                 snaplen = res;
584
585         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
586             (unsigned)sk->sk_rcvbuf)
587                 goto drop_n_acct;
588
589         if (skb_shared(skb)) {
590                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
591                 if (nskb == NULL)
592                         goto drop_n_acct;
593
594                 if (skb_head != skb->data) {
595                         skb->data = skb_head;
596                         skb->len = skb_len;
597                 }
598                 kfree_skb(skb);
599                 skb = nskb;
600         }
601
602         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
603                      sizeof(skb->cb));
604
605         sll = &PACKET_SKB_CB(skb)->sa.ll;
606         sll->sll_family = AF_PACKET;
607         sll->sll_hatype = dev->type;
608         sll->sll_protocol = skb->protocol;
609         sll->sll_pkttype = skb->pkt_type;
610         if (unlikely(po->origdev))
611                 sll->sll_ifindex = orig_dev->ifindex;
612         else
613                 sll->sll_ifindex = dev->ifindex;
614
615         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
616
617         PACKET_SKB_CB(skb)->origlen = skb->len;
618
619         if (pskb_trim(skb, snaplen))
620                 goto drop_n_acct;
621
622         skb_set_owner_r(skb, sk);
623         skb->dev = NULL;
624         skb_dst_drop(skb);
625
626         /* drop conntrack reference */
627         nf_reset(skb);
628
629         spin_lock(&sk->sk_receive_queue.lock);
630         po->stats.tp_packets++;
631         skb->dropcount = atomic_read(&sk->sk_drops);
632         __skb_queue_tail(&sk->sk_receive_queue, skb);
633         spin_unlock(&sk->sk_receive_queue.lock);
634         sk->sk_data_ready(sk, skb->len);
635         return 0;
636
637 drop_n_acct:
638         po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
639
640 drop_n_restore:
641         if (skb_head != skb->data && skb_shared(skb)) {
642                 skb->data = skb_head;
643                 skb->len = skb_len;
644         }
645 drop:
646         consume_skb(skb);
647         return 0;
648 }
649
650 #ifdef CONFIG_PACKET_MMAP
651 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
652                        struct packet_type *pt, struct net_device *orig_dev)
653 {
654         struct sock *sk;
655         struct packet_sock *po;
656         struct sockaddr_ll *sll;
657         union {
658                 struct tpacket_hdr *h1;
659                 struct tpacket2_hdr *h2;
660                 void *raw;
661         } h;
662         u8 *skb_head = skb->data;
663         int skb_len = skb->len;
664         unsigned int snaplen, res;
665         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
666         unsigned short macoff, netoff, hdrlen;
667         struct sk_buff *copy_skb = NULL;
668         struct timeval tv;
669         struct timespec ts;
670
671         if (skb->pkt_type == PACKET_LOOPBACK)
672                 goto drop;
673
674         sk = pt->af_packet_priv;
675         po = pkt_sk(sk);
676
677         if (!net_eq(dev_net(dev), sock_net(sk)))
678                 goto drop;
679
680         if (dev->header_ops) {
681                 if (sk->sk_type != SOCK_DGRAM)
682                         skb_push(skb, skb->data - skb_mac_header(skb));
683                 else if (skb->pkt_type == PACKET_OUTGOING) {
684                         /* Special case: outgoing packets have ll header at head */
685                         skb_pull(skb, skb_network_offset(skb));
686                 }
687         }
688
689         if (skb->ip_summed == CHECKSUM_PARTIAL)
690                 status |= TP_STATUS_CSUMNOTREADY;
691
692         snaplen = skb->len;
693
694         res = run_filter(skb, sk, snaplen);
695         if (!res)
696                 goto drop_n_restore;
697         if (snaplen > res)
698                 snaplen = res;
699
700         if (sk->sk_type == SOCK_DGRAM) {
701                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
702                                   po->tp_reserve;
703         } else {
704                 unsigned maclen = skb_network_offset(skb);
705                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
706                                        (maclen < 16 ? 16 : maclen)) +
707                         po->tp_reserve;
708                 macoff = netoff - maclen;
709         }
710
711         if (macoff + snaplen > po->rx_ring.frame_size) {
712                 if (po->copy_thresh &&
713                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
714                     (unsigned)sk->sk_rcvbuf) {
715                         if (skb_shared(skb)) {
716                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
717                         } else {
718                                 copy_skb = skb_get(skb);
719                                 skb_head = skb->data;
720                         }
721                         if (copy_skb)
722                                 skb_set_owner_r(copy_skb, sk);
723                 }
724                 snaplen = po->rx_ring.frame_size - macoff;
725                 if ((int)snaplen < 0)
726                         snaplen = 0;
727         }
728
729         spin_lock(&sk->sk_receive_queue.lock);
730         h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
731         if (!h.raw)
732                 goto ring_is_full;
733         packet_increment_head(&po->rx_ring);
734         po->stats.tp_packets++;
735         if (copy_skb) {
736                 status |= TP_STATUS_COPY;
737                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
738         }
739         if (!po->stats.tp_drops)
740                 status &= ~TP_STATUS_LOSING;
741         spin_unlock(&sk->sk_receive_queue.lock);
742
743         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
744
745         switch (po->tp_version) {
746         case TPACKET_V1:
747                 h.h1->tp_len = skb->len;
748                 h.h1->tp_snaplen = snaplen;
749                 h.h1->tp_mac = macoff;
750                 h.h1->tp_net = netoff;
751                 if (skb->tstamp.tv64)
752                         tv = ktime_to_timeval(skb->tstamp);
753                 else
754                         do_gettimeofday(&tv);
755                 h.h1->tp_sec = tv.tv_sec;
756                 h.h1->tp_usec = tv.tv_usec;
757                 hdrlen = sizeof(*h.h1);
758                 break;
759         case TPACKET_V2:
760                 h.h2->tp_len = skb->len;
761                 h.h2->tp_snaplen = snaplen;
762                 h.h2->tp_mac = macoff;
763                 h.h2->tp_net = netoff;
764                 if (skb->tstamp.tv64)
765                         ts = ktime_to_timespec(skb->tstamp);
766                 else
767                         getnstimeofday(&ts);
768                 h.h2->tp_sec = ts.tv_sec;
769                 h.h2->tp_nsec = ts.tv_nsec;
770                 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
771                 hdrlen = sizeof(*h.h2);
772                 break;
773         default:
774                 BUG();
775         }
776
777         sll = h.raw + TPACKET_ALIGN(hdrlen);
778         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
779         sll->sll_family = AF_PACKET;
780         sll->sll_hatype = dev->type;
781         sll->sll_protocol = skb->protocol;
782         sll->sll_pkttype = skb->pkt_type;
783         if (unlikely(po->origdev))
784                 sll->sll_ifindex = orig_dev->ifindex;
785         else
786                 sll->sll_ifindex = dev->ifindex;
787
788         __packet_set_status(po, h.raw, status);
789         smp_mb();
790         {
791                 struct page *p_start, *p_end;
792                 u8 *h_end = h.raw + macoff + snaplen - 1;
793
794                 p_start = virt_to_page(h.raw);
795                 p_end = virt_to_page(h_end);
796                 while (p_start <= p_end) {
797                         flush_dcache_page(p_start);
798                         p_start++;
799                 }
800         }
801
802         sk->sk_data_ready(sk, 0);
803
804 drop_n_restore:
805         if (skb_head != skb->data && skb_shared(skb)) {
806                 skb->data = skb_head;
807                 skb->len = skb_len;
808         }
809 drop:
810         kfree_skb(skb);
811         return 0;
812
813 ring_is_full:
814         po->stats.tp_drops++;
815         spin_unlock(&sk->sk_receive_queue.lock);
816
817         sk->sk_data_ready(sk, 0);
818         kfree_skb(copy_skb);
819         goto drop_n_restore;
820 }
821
822 static void tpacket_destruct_skb(struct sk_buff *skb)
823 {
824         struct packet_sock *po = pkt_sk(skb->sk);
825         void *ph;
826
827         BUG_ON(skb == NULL);
828
829         if (likely(po->tx_ring.pg_vec)) {
830                 ph = skb_shinfo(skb)->destructor_arg;
831                 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
832                 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
833                 atomic_dec(&po->tx_ring.pending);
834                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
835         }
836
837         sock_wfree(skb);
838 }
839
840 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
841                 void *frame, struct net_device *dev, int size_max,
842                 __be16 proto, unsigned char *addr)
843 {
844         union {
845                 struct tpacket_hdr *h1;
846                 struct tpacket2_hdr *h2;
847                 void *raw;
848         } ph;
849         int to_write, offset, len, tp_len, nr_frags, len_max;
850         struct socket *sock = po->sk.sk_socket;
851         struct page *page;
852         void *data;
853         int err;
854
855         ph.raw = frame;
856
857         skb->protocol = proto;
858         skb->dev = dev;
859         skb->priority = po->sk.sk_priority;
860         skb->mark = po->sk.sk_mark;
861         skb_shinfo(skb)->destructor_arg = ph.raw;
862
863         switch (po->tp_version) {
864         case TPACKET_V2:
865                 tp_len = ph.h2->tp_len;
866                 break;
867         default:
868                 tp_len = ph.h1->tp_len;
869                 break;
870         }
871         if (unlikely(tp_len > size_max)) {
872                 pr_err("packet size is too large (%d > %d)\n", tp_len, size_max);
873                 return -EMSGSIZE;
874         }
875
876         skb_reserve(skb, LL_RESERVED_SPACE(dev));
877         skb_reset_network_header(skb);
878
879         data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
880         to_write = tp_len;
881
882         if (sock->type == SOCK_DGRAM) {
883                 err = dev_hard_header(skb, dev, ntohs(proto), addr,
884                                 NULL, tp_len);
885                 if (unlikely(err < 0))
886                         return -EINVAL;
887         } else if (dev->hard_header_len) {
888                 /* net device doesn't like empty head */
889                 if (unlikely(tp_len <= dev->hard_header_len)) {
890                         pr_err("packet size is too short (%d < %d)\n",
891                                tp_len, dev->hard_header_len);
892                         return -EINVAL;
893                 }
894
895                 skb_push(skb, dev->hard_header_len);
896                 err = skb_store_bits(skb, 0, data,
897                                 dev->hard_header_len);
898                 if (unlikely(err))
899                         return err;
900
901                 data += dev->hard_header_len;
902                 to_write -= dev->hard_header_len;
903         }
904
905         err = -EFAULT;
906         page = virt_to_page(data);
907         offset = offset_in_page(data);
908         len_max = PAGE_SIZE - offset;
909         len = ((to_write > len_max) ? len_max : to_write);
910
911         skb->data_len = to_write;
912         skb->len += to_write;
913         skb->truesize += to_write;
914         atomic_add(to_write, &po->sk.sk_wmem_alloc);
915
916         while (likely(to_write)) {
917                 nr_frags = skb_shinfo(skb)->nr_frags;
918
919                 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
920                         pr_err("Packet exceeds the number of skb frags (%lu)\n",
921                                MAX_SKB_FRAGS);
922                         return -EFAULT;
923                 }
924
925                 flush_dcache_page(page);
926                 get_page(page);
927                 skb_fill_page_desc(skb,
928                                 nr_frags,
929                                 page++, offset, len);
930                 to_write -= len;
931                 offset = 0;
932                 len_max = PAGE_SIZE;
933                 len = ((to_write > len_max) ? len_max : to_write);
934         }
935
936         return tp_len;
937 }
938
939 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
940 {
941         struct socket *sock;
942         struct sk_buff *skb;
943         struct net_device *dev;
944         __be16 proto;
945         int ifindex, err, reserve = 0;
946         void *ph;
947         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
948         int tp_len, size_max;
949         unsigned char *addr;
950         int len_sum = 0;
951         int status = 0;
952
953         sock = po->sk.sk_socket;
954
955         mutex_lock(&po->pg_vec_lock);
956
957         err = -EBUSY;
958         if (saddr == NULL) {
959                 ifindex = po->ifindex;
960                 proto   = po->num;
961                 addr    = NULL;
962         } else {
963                 err = -EINVAL;
964                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
965                         goto out;
966                 if (msg->msg_namelen < (saddr->sll_halen
967                                         + offsetof(struct sockaddr_ll,
968                                                 sll_addr)))
969                         goto out;
970                 ifindex = saddr->sll_ifindex;
971                 proto   = saddr->sll_protocol;
972                 addr    = saddr->sll_addr;
973         }
974
975         dev = dev_get_by_index(sock_net(&po->sk), ifindex);
976         err = -ENXIO;
977         if (unlikely(dev == NULL))
978                 goto out;
979
980         reserve = dev->hard_header_len;
981
982         err = -ENETDOWN;
983         if (unlikely(!(dev->flags & IFF_UP)))
984                 goto out_put;
985
986         size_max = po->tx_ring.frame_size
987                 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
988
989         if (size_max > dev->mtu + reserve)
990                 size_max = dev->mtu + reserve;
991
992         do {
993                 ph = packet_current_frame(po, &po->tx_ring,
994                                 TP_STATUS_SEND_REQUEST);
995
996                 if (unlikely(ph == NULL)) {
997                         schedule();
998                         continue;
999                 }
1000
1001                 status = TP_STATUS_SEND_REQUEST;
1002                 skb = sock_alloc_send_skb(&po->sk,
1003                                 LL_ALLOCATED_SPACE(dev)
1004                                 + sizeof(struct sockaddr_ll),
1005                                 0, &err);
1006
1007                 if (unlikely(skb == NULL))
1008                         goto out_status;
1009
1010                 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1011                                 addr);
1012
1013                 if (unlikely(tp_len < 0)) {
1014                         if (po->tp_loss) {
1015                                 __packet_set_status(po, ph,
1016                                                 TP_STATUS_AVAILABLE);
1017                                 packet_increment_head(&po->tx_ring);
1018                                 kfree_skb(skb);
1019                                 continue;
1020                         } else {
1021                                 status = TP_STATUS_WRONG_FORMAT;
1022                                 err = tp_len;
1023                                 goto out_status;
1024                         }
1025                 }
1026
1027                 skb->destructor = tpacket_destruct_skb;
1028                 __packet_set_status(po, ph, TP_STATUS_SENDING);
1029                 atomic_inc(&po->tx_ring.pending);
1030
1031                 status = TP_STATUS_SEND_REQUEST;
1032                 err = dev_queue_xmit(skb);
1033                 if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1034                         goto out_xmit;
1035                 packet_increment_head(&po->tx_ring);
1036                 len_sum += tp_len;
1037         } while (likely((ph != NULL) ||
1038                         ((!(msg->msg_flags & MSG_DONTWAIT)) &&
1039                          (atomic_read(&po->tx_ring.pending))))
1040                 );
1041
1042         err = len_sum;
1043         goto out_put;
1044
1045 out_xmit:
1046         skb->destructor = sock_wfree;
1047         atomic_dec(&po->tx_ring.pending);
1048 out_status:
1049         __packet_set_status(po, ph, status);
1050         kfree_skb(skb);
1051 out_put:
1052         dev_put(dev);
1053 out:
1054         mutex_unlock(&po->pg_vec_lock);
1055         return err;
1056 }
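/*
 * Editorial sketch (not part of the original source): the userspace half
 * of tpacket_snd(). After PACKET_TX_RING + mmap() (the TX ring maps after
 * the RX ring when both exist), the sender claims an AVAILABLE frame,
 * fills it, and kicks the kernel with send(). TPACKET_V1; 'frame'/'len'
 * hold a complete ll frame:
 *
 *     struct tpacket_hdr *hdr = (void *)(ring + i * req.tp_frame_size);
 *     char *data = (char *)hdr + TPACKET_HDRLEN - sizeof(struct sockaddr_ll);
 *
 *     while (hdr->tp_status != TP_STATUS_AVAILABLE)
 *             ;                       // or poll() for POLLOUT
 *     memcpy(data, frame, len);
 *     hdr->tp_len = len;
 *     hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *     send(fd, NULL, 0, 0);           // flush every queued frame
 *     // tpacket_destruct_skb() flips the frame back to TP_STATUS_AVAILABLE
 */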
1057 #endif
1058
1059 static int packet_snd(struct socket *sock,
1060                           struct msghdr *msg, size_t len)
1061 {
1062         struct sock *sk = sock->sk;
1063         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1064         struct sk_buff *skb;
1065         struct net_device *dev;
1066         __be16 proto;
1067         unsigned char *addr;
1068         int ifindex, err, reserve = 0;
1069
1070         /*
1071          *      Get and verify the address.
1072          */
1073
1074         if (saddr == NULL) {
1075                 struct packet_sock *po = pkt_sk(sk);
1076
1077                 ifindex = po->ifindex;
1078                 proto   = po->num;
1079                 addr    = NULL;
1080         } else {
1081                 err = -EINVAL;
1082                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1083                         goto out;
1084                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1085                         goto out;
1086                 ifindex = saddr->sll_ifindex;
1087                 proto   = saddr->sll_protocol;
1088                 addr    = saddr->sll_addr;
1089         }
1090
1091
1092         dev = dev_get_by_index(sock_net(sk), ifindex);
1093         err = -ENXIO;
1094         if (dev == NULL)
1095                 goto out_unlock;
1096         if (sock->type == SOCK_RAW)
1097                 reserve = dev->hard_header_len;
1098
1099         err = -ENETDOWN;
1100         if (!(dev->flags & IFF_UP))
1101                 goto out_unlock;
1102
1103         err = -EMSGSIZE;
1104         if (len > dev->mtu+reserve)
1105                 goto out_unlock;
1106
1107         skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
1108                                 msg->msg_flags & MSG_DONTWAIT, &err);
1109         if (skb == NULL)
1110                 goto out_unlock;
1111
1112         skb_reserve(skb, LL_RESERVED_SPACE(dev));
1113         skb_reset_network_header(skb);
1114
1115         err = -EINVAL;
1116         if (sock->type == SOCK_DGRAM &&
1117             dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
1118                 goto out_free;
1119
1120         /* Returns -EFAULT on error */
1121         err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1122         if (err)
1123                 goto out_free;
1124
1125         skb->protocol = proto;
1126         skb->dev = dev;
1127         skb->priority = sk->sk_priority;
1128         skb->mark = sk->sk_mark;
1129
1130         /*
1131          *      Now send it
1132          */
1133
1134         err = dev_queue_xmit(skb);
1135         if (err > 0 && (err = net_xmit_errno(err)) != 0)
1136                 goto out_unlock;
1137
1138         dev_put(dev);
1139
1140         return len;
1141
1142 out_free:
1143         kfree_skb(skb);
1144 out_unlock:
1145         if (dev)
1146                 dev_put(dev);
1147 out:
1148         return err;
1149 }
1150
1151 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1152                 struct msghdr *msg, size_t len)
1153 {
1154 #ifdef CONFIG_PACKET_MMAP
1155         struct sock *sk = sock->sk;
1156         struct packet_sock *po = pkt_sk(sk);
1157         if (po->tx_ring.pg_vec)
1158                 return tpacket_snd(po, msg);
1159         else
1160 #endif
1161                 return packet_snd(sock, msg, len);
1162 }
1163
1164 /*
1165  *      Close a PACKET socket. This is fairly simple. We immediately go
1166  *      to 'closed' state and remove our protocol entry in the device list.
1167  */
1168
1169 static int packet_release(struct socket *sock)
1170 {
1171         struct sock *sk = sock->sk;
1172         struct packet_sock *po;
1173         struct net *net;
1174 #ifdef CONFIG_PACKET_MMAP
1175         struct tpacket_req req;
1176 #endif
1177
1178         if (!sk)
1179                 return 0;
1180
1181         net = sock_net(sk);
1182         po = pkt_sk(sk);
1183
1184         write_lock_bh(&net->packet.sklist_lock);
1185         sk_del_node_init(sk);
1186         sock_prot_inuse_add(net, sk->sk_prot, -1);
1187         write_unlock_bh(&net->packet.sklist_lock);
1188
1189         /*
1190          *      Unhook packet receive handler.
1191          */
1192
1193         if (po->running) {
1194                 /*
1195                  *      Remove the protocol hook
1196                  */
1197                 dev_remove_pack(&po->prot_hook);
1198                 po->running = 0;
1199                 po->num = 0;
1200                 __sock_put(sk);
1201         }
1202
1203         packet_flush_mclist(sk);
1204
1205 #ifdef CONFIG_PACKET_MMAP
1206         memset(&req, 0, sizeof(req));
1207
1208         if (po->rx_ring.pg_vec)
1209                 packet_set_ring(sk, &req, 1, 0);
1210
1211         if (po->tx_ring.pg_vec)
1212                 packet_set_ring(sk, &req, 1, 1);
1213 #endif
1214
1215         /*
1216          *      Now the socket is dead. No more input will appear.
1217          */
1218
1219         sock_orphan(sk);
1220         sock->sk = NULL;
1221
1222         /* Purge queues */
1223
1224         skb_queue_purge(&sk->sk_receive_queue);
1225         sk_refcnt_debug_release(sk);
1226
1227         sock_put(sk);
1228         return 0;
1229 }
1230
1231 /*
1232  *      Attach a packet hook.
1233  */
1234
1235 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1236 {
1237         struct packet_sock *po = pkt_sk(sk);
1238         /*
1239          *      Detach an existing hook if present.
1240          */
1241
1242         lock_sock(sk);
1243
1244         spin_lock(&po->bind_lock);
1245         if (po->running) {
1246                 __sock_put(sk);
1247                 po->running = 0;
1248                 po->num = 0;
1249                 spin_unlock(&po->bind_lock);
1250                 dev_remove_pack(&po->prot_hook);
1251                 spin_lock(&po->bind_lock);
1252         }
1253
1254         po->num = protocol;
1255         po->prot_hook.type = protocol;
1256         po->prot_hook.dev = dev;
1257
1258         po->ifindex = dev ? dev->ifindex : 0;
1259
1260         if (protocol == 0)
1261                 goto out_unlock;
1262
1263         if (!dev || (dev->flags & IFF_UP)) {
1264                 dev_add_pack(&po->prot_hook);
1265                 sock_hold(sk);
1266                 po->running = 1;
1267         } else {
1268                 sk->sk_err = ENETDOWN;
1269                 if (!sock_flag(sk, SOCK_DEAD))
1270                         sk->sk_error_report(sk);
1271         }
1272
1273 out_unlock:
1274         spin_unlock(&po->bind_lock);
1275         release_sock(sk);
1276         return 0;
1277 }
1278
1279 /*
1280  *      Bind a packet socket to a device
1281  */
1282
1283 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1284                             int addr_len)
1285 {
1286         struct sock *sk = sock->sk;
1287         char name[15];
1288         struct net_device *dev;
1289         int err = -ENODEV;
1290
1291         /*
1292          *      Check legality
1293          */
1294
1295         if (addr_len != sizeof(struct sockaddr))
1296                 return -EINVAL;
1297         strlcpy(name, uaddr->sa_data, sizeof(name));
1298
1299         dev = dev_get_by_name(sock_net(sk), name);
1300         if (dev) {
1301                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1302                 dev_put(dev);
1303         }
1304         return err;
1305 }
1306
1307 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1308 {
1309         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1310         struct sock *sk = sock->sk;
1311         struct net_device *dev = NULL;
1312         int err;
1313
1314
1315         /*
1316          *      Check legality
1317          */
1318
1319         if (addr_len < sizeof(struct sockaddr_ll))
1320                 return -EINVAL;
1321         if (sll->sll_family != AF_PACKET)
1322                 return -EINVAL;
1323
1324         if (sll->sll_ifindex) {
1325                 err = -ENODEV;
1326                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1327                 if (dev == NULL)
1328                         goto out;
1329         }
1330         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1331         if (dev)
1332                 dev_put(dev);
1333
1334 out:
1335         return err;
1336 }
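/*
 * Editorial sketch (not part of the original source): binding from
 * userspace. packet_bind() only looks at sll_protocol and sll_ifindex;
 * ifindex 0 means "all devices", protocol 0 leaves the hook detached
 * ("eth0" is just an example name):
 *
 *     struct sockaddr_ll sll = {
 *             .sll_family   = AF_PACKET,
 *             .sll_protocol = htons(ETH_P_ALL),
 *             .sll_ifindex  = if_nametoindex("eth0"),
 *     };
 *     bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */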
1337
1338 static struct proto packet_proto = {
1339         .name     = "PACKET",
1340         .owner    = THIS_MODULE,
1341         .obj_size = sizeof(struct packet_sock),
1342 };
1343
1344 /*
1345  *      Create a packet of type SOCK_PACKET.
1346  */
1347
1348 static int packet_create(struct net *net, struct socket *sock, int protocol,
1349                          int kern)
1350 {
1351         struct sock *sk;
1352         struct packet_sock *po;
1353         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1354         int err;
1355
1356         if (!capable(CAP_NET_RAW))
1357                 return -EPERM;
1358         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1359             sock->type != SOCK_PACKET)
1360                 return -ESOCKTNOSUPPORT;
1361
1362         sock->state = SS_UNCONNECTED;
1363
1364         err = -ENOBUFS;
1365         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1366         if (sk == NULL)
1367                 goto out;
1368
1369         sock->ops = &packet_ops;
1370         if (sock->type == SOCK_PACKET)
1371                 sock->ops = &packet_ops_spkt;
1372
1373         sock_init_data(sock, sk);
1374
1375         po = pkt_sk(sk);
1376         sk->sk_family = PF_PACKET;
1377         po->num = proto;
1378
1379         sk->sk_destruct = packet_sock_destruct;
1380         sk_refcnt_debug_inc(sk);
1381
1382         /*
1383          *      Attach a protocol block
1384          */
1385
1386         spin_lock_init(&po->bind_lock);
1387         mutex_init(&po->pg_vec_lock);
1388         po->prot_hook.func = packet_rcv;
1389
1390         if (sock->type == SOCK_PACKET)
1391                 po->prot_hook.func = packet_rcv_spkt;
1392
1393         po->prot_hook.af_packet_priv = sk;
1394
1395         if (proto) {
1396                 po->prot_hook.type = proto;
1397                 dev_add_pack(&po->prot_hook);
1398                 sock_hold(sk);
1399                 po->running = 1;
1400         }
1401
1402         write_lock_bh(&net->packet.sklist_lock);
1403         sk_add_node(sk, &net->packet.sklist);
1404         sock_prot_inuse_add(net, &packet_proto, 1);
1405         write_unlock_bh(&net->packet.sklist_lock);
1406         return 0;
1407 out:
1408         return err;
1409 }
1410
1411 /*
1412  *      Pull a packet from our receive queue and hand it to the user.
1413  *      If necessary we block.
1414  */
1415
1416 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1417                           struct msghdr *msg, size_t len, int flags)
1418 {
1419         struct sock *sk = sock->sk;
1420         struct sk_buff *skb;
1421         int copied, err;
1422         struct sockaddr_ll *sll;
1423
1424         err = -EINVAL;
1425         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1426                 goto out;
1427
1428 #if 0
1429         /* What error should we return now? EUNATTACH? */
1430         if (pkt_sk(sk)->ifindex < 0)
1431                 return -ENODEV;
1432 #endif
1433
1434         /*
1435          *      Call the generic datagram receiver. This handles all sorts
1436          *      of horrible races and re-entrancy so we can forget about it
1437          *      in the protocol layers.
1438          *
1439          *      Now it will return ENETDOWN if the device has just gone down,
1440          *      but then it will block.
1441          */
1442
1443         skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1444
1445         /*
1446          *      An error occurred so return it. Because skb_recv_datagram()
1447          *      handles the blocking, we don't have to see or worry about
1448          *      blocking retries.
1449          */
1450
1451         if (skb == NULL)
1452                 goto out;
1453
1454         /*
1455          *      If the address length field is there to be filled in, we fill
1456          *      it in now.
1457          */
1458
1459         sll = &PACKET_SKB_CB(skb)->sa.ll;
1460         if (sock->type == SOCK_PACKET)
1461                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1462         else
1463                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1464
1465         /*
1466          *      You lose any data beyond the buffer you gave. If this worries a
1467          *      user program, it can ask the device for its MTU anyway.
1468          */
1469
1470         copied = skb->len;
1471         if (copied > len) {
1472                 copied = len;
1473                 msg->msg_flags |= MSG_TRUNC;
1474         }
1475
1476         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1477         if (err)
1478                 goto out_free;
1479
1480         sock_recv_ts_and_drops(msg, sk, skb);
1481
1482         if (msg->msg_name)
1483                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1484                        msg->msg_namelen);
1485
1486         if (pkt_sk(sk)->auxdata) {
1487                 struct tpacket_auxdata aux;
1488
1489                 aux.tp_status = TP_STATUS_USER;
1490                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1491                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1492                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1493                 aux.tp_snaplen = skb->len;
1494                 aux.tp_mac = 0;
1495                 aux.tp_net = skb_network_offset(skb);
1496                 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1497
1498                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1499         }
1500
1501         /*
1502          *      Free or return the buffer as appropriate. Again this
1503          *      hides all the races and re-entrancy issues from us.
1504          */
1505         err = (flags&MSG_TRUNC) ? skb->len : copied;
1506
1507 out_free:
1508         skb_free_datagram(sk, skb);
1509 out:
1510         return err;
1511 }
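/*
 * Editorial sketch (not part of the original source): consuming the
 * auxdata control message emitted above when PACKET_AUXDATA is enabled.
 * tp_len is the original wire length, tp_snaplen what was actually kept:
 *
 *     int one = 1;
 *     char data[2048], ctl[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *     struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *     struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *                           .msg_control = ctl,
 *                           .msg_controllen = sizeof(ctl) };
 *     struct cmsghdr *c;
 *
 *     setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *     recvmsg(fd, &msg, 0);
 *     for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c))
 *             if (c->cmsg_level == SOL_PACKET && c->cmsg_type == PACKET_AUXDATA) {
 *                     struct tpacket_auxdata *aux = (void *)CMSG_DATA(c);
 *                     // aux->tp_len, aux->tp_snaplen, aux->tp_vlan_tci, ...
 *             }
 */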
1512
1513 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1514                                int *uaddr_len, int peer)
1515 {
1516         struct net_device *dev;
1517         struct sock *sk = sock->sk;
1518
1519         if (peer)
1520                 return -EOPNOTSUPP;
1521
1522         uaddr->sa_family = AF_PACKET;
1523         rcu_read_lock();
1524         dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1525         if (dev)
1526                 strlcpy(uaddr->sa_data, dev->name, 15);
1527         else
1528                 memset(uaddr->sa_data, 0, 14);
1529         rcu_read_unlock();
1530         *uaddr_len = sizeof(*uaddr);
1531
1532         return 0;
1533 }
1534
1535 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1536                           int *uaddr_len, int peer)
1537 {
1538         struct net_device *dev;
1539         struct sock *sk = sock->sk;
1540         struct packet_sock *po = pkt_sk(sk);
1541         DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1542
1543         if (peer)
1544                 return -EOPNOTSUPP;
1545
1546         sll->sll_family = AF_PACKET;
1547         sll->sll_ifindex = po->ifindex;
1548         sll->sll_protocol = po->num;
1549         rcu_read_lock();
1550         dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1551         if (dev) {
1552                 sll->sll_hatype = dev->type;
1553                 sll->sll_halen = dev->addr_len;
1554                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1555         } else {
1556                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1557                 sll->sll_halen = 0;
1558         }
1559         rcu_read_unlock();
1560         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1561
1562         return 0;
1563 }
1564
1565 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1566                          int what)
1567 {
1568         switch (i->type) {
1569         case PACKET_MR_MULTICAST:
1570                 if (what > 0)
1571                         return dev_mc_add(dev, i->addr, i->alen, 0);
1572                 else
1573                         return dev_mc_delete(dev, i->addr, i->alen, 0);
1574                 break;
1575         case PACKET_MR_PROMISC:
1576                 return dev_set_promiscuity(dev, what);
1577                 break;
1578         case PACKET_MR_ALLMULTI:
1579                 return dev_set_allmulti(dev, what);
1580                 break;
1581         case PACKET_MR_UNICAST:
1582                 if (what > 0)
1583                         return dev_unicast_add(dev, i->addr);
1584                 else
1585                         return dev_unicast_delete(dev, i->addr);
1586                 break;
1587         default:
1588                 break;
1589         }
1590         return 0;
1591 }
1592
1593 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1594 {
1595         for ( ; i; i = i->next) {
1596                 if (i->ifindex == dev->ifindex)
1597                         packet_dev_mc(dev, i, what);
1598         }
1599 }
1600
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *ml, *i;
        struct net_device *dev;
        int err;

        rtnl_lock();

        err = -ENODEV;
        dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
        if (!dev)
                goto done;

        err = -EINVAL;
        if (mreq->mr_alen > dev->addr_len)
                goto done;

        err = -ENOBUFS;
        i = kmalloc(sizeof(*i), GFP_KERNEL);
        if (i == NULL)
                goto done;

        err = 0;
        for (ml = po->mclist; ml; ml = ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        ml->count++;
                        /* Free the new element ... */
                        kfree(i);
                        goto done;
                }
        }

        i->type = mreq->mr_type;
        i->ifindex = mreq->mr_ifindex;
        i->alen = mreq->mr_alen;
        memcpy(i->addr, mreq->mr_address, i->alen);
        i->count = 1;
        i->next = po->mclist;
        po->mclist = i;
        err = packet_dev_mc(dev, i, 1);
        if (err) {
                po->mclist = i->next;
                kfree(i);
        }

done:
        rtnl_unlock();
        return err;
}

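/*
 * PACKET_DROP_MEMBERSHIP: find the matching entry, drop one reference,
 * and only when the count hits zero unlink it and deprogram the
 * device.
 */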
static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_mclist *ml, **mlp;

        rtnl_lock();

        for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        if (--ml->count == 0) {
                                struct net_device *dev;
                                *mlp = ml->next;
                                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
                                if (dev)
                                        packet_dev_mc(dev, ml, -1);
                                kfree(ml);
                        }
                        rtnl_unlock();
                        return 0;
                }
        }
        rtnl_unlock();
        return -EADDRNOTAVAIL;
}

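/*
 * Release every membership the socket still holds when it is torn
 * down.  Entries are deprogrammed from their device regardless of the
 * remaining reference count.
 */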
static void packet_flush_mclist(struct sock *sk)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *ml;

        if (!po->mclist)
                return;

        rtnl_lock();
        while ((ml = po->mclist) != NULL) {
                struct net_device *dev;

                po->mclist = ml->next;
                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
                if (dev != NULL)
                        packet_dev_mc(dev, ml, -1);
                kfree(ml);
        }
        rtnl_unlock();
}

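/*
 * Socket-option handler for SOL_PACKET.  The membership options copy
 * in a packet_mreq (or the larger packet_mreq_max for long hardware
 * addresses); the ring and tpacket options are only compiled in with
 * CONFIG_PACKET_MMAP.
 *
 * A minimal userspace sketch (illustrative only; "fd" is assumed to be
 * an AF_PACKET socket and "ifindex" a valid interface index):
 *
 *      struct packet_mreq mreq = {
 *              .mr_ifindex = ifindex,
 *              .mr_type    = PACKET_MR_PROMISC,
 *      };
 *      setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *                 &mreq, sizeof(mreq));
 */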
static int
packet_setsockopt(struct socket *sock, int level, int optname,
                  char __user *optval, unsigned int optlen)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        int ret;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        switch (optname) {
        case PACKET_ADD_MEMBERSHIP:
        case PACKET_DROP_MEMBERSHIP:
        {
                struct packet_mreq_max mreq;
                int len = optlen;
                memset(&mreq, 0, sizeof(mreq));
                if (len < sizeof(struct packet_mreq))
                        return -EINVAL;
                if (len > sizeof(mreq))
                        len = sizeof(mreq);
                if (copy_from_user(&mreq, optval, len))
                        return -EFAULT;
                if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
                        return -EINVAL;
                if (optname == PACKET_ADD_MEMBERSHIP)
                        ret = packet_mc_add(sk, &mreq);
                else
                        ret = packet_mc_drop(sk, &mreq);
                return ret;
        }

#ifdef CONFIG_PACKET_MMAP
        case PACKET_RX_RING:
        case PACKET_TX_RING:
        {
                struct tpacket_req req;

                if (optlen < sizeof(req))
                        return -EINVAL;
                if (copy_from_user(&req, optval, sizeof(req)))
                        return -EFAULT;
                return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
        }
        case PACKET_COPY_THRESH:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                pkt_sk(sk)->copy_thresh = val;
                return 0;
        }
        case PACKET_VERSION:
        {
                int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
                        return -EBUSY;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;
                switch (val) {
                case TPACKET_V1:
                case TPACKET_V2:
                        po->tp_version = val;
                        return 0;
                default:
                        return -EINVAL;
                }
        }
        case PACKET_RESERVE:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
                        return -EBUSY;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;
                po->tp_reserve = val;
                return 0;
        }
        case PACKET_LOSS:
        {
                unsigned int val;

                if (optlen != sizeof(val))
                        return -EINVAL;
                if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
                        return -EBUSY;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;
                po->tp_loss = !!val;
                return 0;
        }
#endif
        case PACKET_AUXDATA:
        {
                int val;

                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->auxdata = !!val;
                return 0;
        }
        case PACKET_ORIGDEV:
        {
                int val;

                if (optlen < sizeof(val))
                        return -EINVAL;
                if (copy_from_user(&val, optval, sizeof(val)))
                        return -EFAULT;

                po->origdev = !!val;
                return 0;
        }
        default:
                return -ENOPROTOOPT;
        }
}

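/*
 * getsockopt(SOL_PACKET): the caller's buffer length is clamped to the
 * size of the option being read and the clamped length is written back
 * through @optlen.  PACKET_STATISTICS also resets the counters.
 */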
static int packet_getsockopt(struct socket *sock, int level, int optname,
                             char __user *optval, int __user *optlen)
{
        int len;
        int val;
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        void *data;
        struct tpacket_stats st;

        if (level != SOL_PACKET)
                return -ENOPROTOOPT;

        if (get_user(len, optlen))
                return -EFAULT;

        if (len < 0)
                return -EINVAL;

        switch (optname) {
        case PACKET_STATISTICS:
                if (len > sizeof(struct tpacket_stats))
                        len = sizeof(struct tpacket_stats);
                spin_lock_bh(&sk->sk_receive_queue.lock);
                st = po->stats;
                memset(&po->stats, 0, sizeof(st));
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                st.tp_packets += st.tp_drops;

                data = &st;
                break;
        case PACKET_AUXDATA:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = po->auxdata;

                data = &val;
                break;
        case PACKET_ORIGDEV:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = po->origdev;

                data = &val;
                break;
#ifdef CONFIG_PACKET_MMAP
        case PACKET_VERSION:
                if (len > sizeof(int))
                        len = sizeof(int);
                val = po->tp_version;
                data = &val;
                break;
        case PACKET_HDRLEN:
                if (len > sizeof(int))
                        len = sizeof(int);
                if (copy_from_user(&val, optval, len))
                        return -EFAULT;
                switch (val) {
                case TPACKET_V1:
                        val = sizeof(struct tpacket_hdr);
                        break;
                case TPACKET_V2:
                        val = sizeof(struct tpacket2_hdr);
                        break;
                default:
                        return -EINVAL;
                }
                data = &val;
                break;
        case PACKET_RESERVE:
                if (len > sizeof(unsigned int))
                        len = sizeof(unsigned int);
                val = po->tp_reserve;
                data = &val;
                break;
        case PACKET_LOSS:
                if (len > sizeof(unsigned int))
                        len = sizeof(unsigned int);
                val = po->tp_loss;
                data = &val;
                break;
#endif
        default:
                return -ENOPROTOOPT;
        }

        if (put_user(len, optlen))
                return -EFAULT;
        if (copy_to_user(optval, data, len))
                return -EFAULT;
        return 0;
}


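/*
 * Netdevice notifier: on NETDEV_UNREGISTER drop memberships and unbind
 * the socket (its ifindex is cleared); on NETDEV_DOWN just stop
 * delivery and report ENETDOWN; on NETDEV_UP re-attach the protocol
 * hook if the socket was bound to this device.
 */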
static int packet_notifier(struct notifier_block *this, unsigned long msg,
                           void *data)
{
        struct sock *sk;
        struct hlist_node *node;
        struct net_device *dev = data;
        struct net *net = dev_net(dev);

        read_lock(&net->packet.sklist_lock);
        sk_for_each(sk, node, &net->packet.sklist) {
                struct packet_sock *po = pkt_sk(sk);

                switch (msg) {
                case NETDEV_UNREGISTER:
                        if (po->mclist)
                                packet_dev_mclist(dev, po->mclist, -1);
                        /* fallthrough */

                case NETDEV_DOWN:
                        if (dev->ifindex == po->ifindex) {
                                spin_lock(&po->bind_lock);
                                if (po->running) {
                                        __dev_remove_pack(&po->prot_hook);
                                        __sock_put(sk);
                                        po->running = 0;
                                        sk->sk_err = ENETDOWN;
                                        if (!sock_flag(sk, SOCK_DEAD))
                                                sk->sk_error_report(sk);
                                }
                                if (msg == NETDEV_UNREGISTER) {
                                        po->ifindex = -1;
                                        po->prot_hook.dev = NULL;
                                }
                                spin_unlock(&po->bind_lock);
                        }
                        break;
                case NETDEV_UP:
                        spin_lock(&po->bind_lock);
                        if (dev->ifindex == po->ifindex && po->num &&
                            !po->running) {
                                dev_add_pack(&po->prot_hook);
                                sock_hold(sk);
                                po->running = 1;
                        }
                        spin_unlock(&po->bind_lock);
                        break;
                }
        }
        read_unlock(&net->packet.sklist_lock);
        return NOTIFY_DONE;
}


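/*
 * Note that SIOCINQ reports the size of the next pending packet, not
 * the total number of queued bytes, and that the INET ioctls are
 * forwarded to inet_dgram_ops only in the initial network namespace.
 */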
static int packet_ioctl(struct socket *sock, unsigned int cmd,
                        unsigned long arg)
{
        struct sock *sk = sock->sk;

        switch (cmd) {
        case SIOCOUTQ:
        {
                int amount = sk_wmem_alloc_get(sk);

                return put_user(amount, (int __user *)arg);
        }
        case SIOCINQ:
        {
                struct sk_buff *skb;
                int amount = 0;

                spin_lock_bh(&sk->sk_receive_queue.lock);
                skb = skb_peek(&sk->sk_receive_queue);
                if (skb)
                        amount = skb->len;
                spin_unlock_bh(&sk->sk_receive_queue.lock);
                return put_user(amount, (int __user *)arg);
        }
        case SIOCGSTAMP:
                return sock_get_timestamp(sk, (struct timeval __user *)arg);
        case SIOCGSTAMPNS:
                return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
        case SIOCADDRT:
        case SIOCDELRT:
        case SIOCDARP:
        case SIOCGARP:
        case SIOCSARP:
        case SIOCGIFADDR:
        case SIOCSIFADDR:
        case SIOCGIFBRDADDR:
        case SIOCSIFBRDADDR:
        case SIOCGIFNETMASK:
        case SIOCSIFNETMASK:
        case SIOCGIFDSTADDR:
        case SIOCSIFDSTADDR:
        case SIOCSIFFLAGS:
                if (!net_eq(sock_net(sk), &init_net))
                        return -ENOIOCTLCMD;
                return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

        default:
                return -ENOIOCTLCMD;
        }
        return 0;
}

#ifndef CONFIG_PACKET_MMAP
#define packet_mmap sock_no_mmap
#define packet_poll datagram_poll
#else

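/*
 * Ring-aware poll: in addition to the normal datagram_poll() state,
 * report POLLIN when the receive ring has a frame ready for userspace
 * and POLLOUT when the transmit ring has a free slot.
 */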
static unsigned int packet_poll(struct file *file, struct socket *sock,
                                poll_table *wait)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        unsigned int mask = datagram_poll(file, sock, wait);

        spin_lock_bh(&sk->sk_receive_queue.lock);
        if (po->rx_ring.pg_vec) {
                if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
                        mask |= POLLIN | POLLRDNORM;
        }
        spin_unlock_bh(&sk->sk_receive_queue.lock);
        spin_lock_bh(&sk->sk_write_queue.lock);
        if (po->tx_ring.pg_vec) {
                if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
                        mask |= POLLOUT | POLLWRNORM;
        }
        spin_unlock_bh(&sk->sk_write_queue.lock);
        return mask;
}


/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct socket *sock = file->private_data;
        struct sock *sk = sock->sk;

        if (sk)
                atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
        struct file *file = vma->vm_file;
        struct socket *sock = file->private_data;
        struct sock *sk = sock->sk;

        if (sk)
                atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
        .open   =       packet_mm_open,
        .close  =       packet_mm_close,
};

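/*
 * The ring is carried in a "page vector": an array of tp_block_nr
 * pointers, each to a physically contiguous block of 2^order pages
 * allocated with __GFP_COMP | __GFP_ZERO, so a whole block can be
 * mapped and is handed to userspace pre-zeroed.
 */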
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
        int i;

        for (i = 0; i < len; i++) {
                if (likely(pg_vec[i]))
                        free_pages((unsigned long) pg_vec[i], order);
        }
        kfree(pg_vec);
}

static inline char *alloc_one_pg_vec_page(unsigned long order)
{
        gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;

        return (char *) __get_free_pages(gfp_flags, order);
}

static char **alloc_pg_vec(struct tpacket_req *req, int order)
{
        unsigned int block_nr = req->tp_block_nr;
        char **pg_vec;
        int i;

        pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
        if (unlikely(!pg_vec))
                goto out;

        for (i = 0; i < block_nr; i++) {
                pg_vec[i] = alloc_one_pg_vec_page(order);
                if (unlikely(!pg_vec[i]))
                        goto out_free_pgvec;
        }

out:
        return pg_vec;

out_free_pgvec:
        free_pg_vec(pg_vec, order, block_nr);
        pg_vec = NULL;
        goto out;
}

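/*
 * Install (tp_block_nr != 0) or tear down (tp_block_nr == 0) a ring
 * for one direction.  The sequence is: validate the geometry and
 * allocate the new page vector, detach the protocol hook and
 * synchronize, swap the old and new vectors under pg_vec_lock
 * (refusing if the old ring is still mmapped), then re-attach the
 * hook.  The swapped-out vector is freed after the socket lock is
 * dropped.
 */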
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring)
{
        char **pg_vec = NULL;
        struct packet_sock *po = pkt_sk(sk);
        int was_running, order = 0;
        struct packet_ring_buffer *rb;
        struct sk_buff_head *rb_queue;
        __be16 num;
        int err;

        rb = tx_ring ? &po->tx_ring : &po->rx_ring;
        rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

        err = -EBUSY;
        if (!closing) {
                if (atomic_read(&po->mapped))
                        goto out;
                if (atomic_read(&rb->pending))
                        goto out;
        }

        if (req->tp_block_nr) {
                /* Sanity tests and some calculations */
                err = -EBUSY;
                if (unlikely(rb->pg_vec))
                        goto out;

                switch (po->tp_version) {
                case TPACKET_V1:
                        po->tp_hdrlen = TPACKET_HDRLEN;
                        break;
                case TPACKET_V2:
                        po->tp_hdrlen = TPACKET2_HDRLEN;
                        break;
                }

                err = -EINVAL;
                if (unlikely((int)req->tp_block_size <= 0))
                        goto out;
                if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
                        goto out;
                if (unlikely(req->tp_frame_size < po->tp_hdrlen +
                                        po->tp_reserve))
                        goto out;
                if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
                        goto out;

                rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
                if (unlikely(rb->frames_per_block <= 0))
                        goto out;
                if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
                                        req->tp_frame_nr))
                        goto out;

                err = -ENOMEM;
                order = get_order(req->tp_block_size);
                pg_vec = alloc_pg_vec(req, order);
                if (unlikely(!pg_vec))
                        goto out;
        } else {
                /* Done */
                err = -EINVAL;
                if (unlikely(req->tp_frame_nr))
                        goto out;
        }

        lock_sock(sk);

        /* Detach socket from network */
        spin_lock(&po->bind_lock);
        was_running = po->running;
        num = po->num;
        if (was_running) {
                __dev_remove_pack(&po->prot_hook);
                po->num = 0;
                po->running = 0;
                __sock_put(sk);
        }
        spin_unlock(&po->bind_lock);

        synchronize_net();

        err = -EBUSY;
        mutex_lock(&po->pg_vec_lock);
        if (closing || atomic_read(&po->mapped) == 0) {
                err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
                spin_lock_bh(&rb_queue->lock);
                pg_vec = XC(rb->pg_vec, pg_vec);
                rb->frame_max = (req->tp_frame_nr - 1);
                rb->head = 0;
                rb->frame_size = req->tp_frame_size;
                spin_unlock_bh(&rb_queue->lock);

                order = XC(rb->pg_vec_order, order);
                req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);

                rb->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
                po->prot_hook.func = (po->rx_ring.pg_vec) ?
                                                tpacket_rcv : packet_rcv;
                skb_queue_purge(rb_queue);
#undef XC
                if (atomic_read(&po->mapped))
                        pr_err("packet_mmap: vma is busy: %d\n",
                               atomic_read(&po->mapped));
        }
        mutex_unlock(&po->pg_vec_lock);

        spin_lock(&po->bind_lock);
        if (was_running && !po->running) {
                sock_hold(sk);
                po->running = 1;
                po->num = num;
                dev_add_pack(&po->prot_hook);
        }
        spin_unlock(&po->bind_lock);

        release_sock(sk);

        if (pg_vec)
                free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
        return err;
}

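/*
 * Map the configured ring(s) into a single contiguous VMA; the rx ring
 * (if any) comes first, the tx ring after it, and the requested size
 * must match exactly.  A minimal userspace sketch for an rx-only ring
 * (illustrative; the sizes are assumptions, constrained only by the
 * checks in packet_set_ring()):
 *
 *      struct tpacket_req req = {
 *              .tp_block_size = 4096,          (multiple of PAGE_SIZE)
 *              .tp_frame_size = 2048,          (multiple of TPACKET_ALIGNMENT)
 *              .tp_block_nr   = 64,
 *              .tp_frame_nr   = 128,           (block_nr * frames per block)
 *      };
 *      setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *      ring = mmap(NULL, req.tp_block_nr * req.tp_block_size,
 *                  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */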
static int packet_mmap(struct file *file, struct socket *sock,
                struct vm_area_struct *vma)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        unsigned long size, expected_size;
        struct packet_ring_buffer *rb;
        unsigned long start;
        int err = -EINVAL;
        int i;

        if (vma->vm_pgoff)
                return -EINVAL;

        mutex_lock(&po->pg_vec_lock);

        expected_size = 0;
        for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
                if (rb->pg_vec) {
                        expected_size += rb->pg_vec_len
                                                * rb->pg_vec_pages
                                                * PAGE_SIZE;
                }
        }

        if (expected_size == 0)
                goto out;

        size = vma->vm_end - vma->vm_start;
        if (size != expected_size)
                goto out;

        start = vma->vm_start;
        for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
                if (rb->pg_vec == NULL)
                        continue;

                for (i = 0; i < rb->pg_vec_len; i++) {
                        struct page *page = virt_to_page(rb->pg_vec[i]);
                        int pg_num;

                        for (pg_num = 0; pg_num < rb->pg_vec_pages;
                                        pg_num++, page++) {
                                err = vm_insert_page(vma, start, page);
                                if (unlikely(err))
                                        goto out;
                                start += PAGE_SIZE;
                        }
                }
        }

        atomic_inc(&po->mapped);
        vma->vm_ops = &packet_mmap_ops;
        err = 0;

out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}
#endif


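/*
 * Two ops tables: packet_ops_spkt backs the legacy SOCK_PACKET sockets
 * (no socket options, no mmap), while packet_ops serves SOCK_RAW and
 * SOCK_DGRAM AF_PACKET sockets.
 */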
static const struct proto_ops packet_ops_spkt = {
        .family =       PF_PACKET,
        .owner =        THIS_MODULE,
        .release =      packet_release,
        .bind =         packet_bind_spkt,
        .connect =      sock_no_connect,
        .socketpair =   sock_no_socketpair,
        .accept =       sock_no_accept,
        .getname =      packet_getname_spkt,
        .poll =         datagram_poll,
        .ioctl =        packet_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     sock_no_shutdown,
        .setsockopt =   sock_no_setsockopt,
        .getsockopt =   sock_no_getsockopt,
        .sendmsg =      packet_sendmsg_spkt,
        .recvmsg =      packet_recvmsg,
        .mmap =         sock_no_mmap,
        .sendpage =     sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
        .family =       PF_PACKET,
        .owner =        THIS_MODULE,
        .release =      packet_release,
        .bind =         packet_bind,
        .connect =      sock_no_connect,
        .socketpair =   sock_no_socketpair,
        .accept =       sock_no_accept,
        .getname =      packet_getname,
        .poll =         packet_poll,
        .ioctl =        packet_ioctl,
        .listen =       sock_no_listen,
        .shutdown =     sock_no_shutdown,
        .setsockopt =   packet_setsockopt,
        .getsockopt =   packet_getsockopt,
        .sendmsg =      packet_sendmsg,
        .recvmsg =      packet_recvmsg,
        .mmap =         packet_mmap,
        .sendpage =     sock_no_sendpage,
};

static const struct net_proto_family packet_family_ops = {
        .family =       PF_PACKET,
        .create =       packet_create,
        .owner  =       THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
        .notifier_call =        packet_notifier,
};

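/*
 * /proc/net/packet: one line per packet socket in the namespace,
 * walked under the per-namespace sklist_lock.
 */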
#ifdef CONFIG_PROC_FS
static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
{
        struct sock *s;
        struct hlist_node *node;

        sk_for_each(s, node, &net->packet.sklist) {
                if (!off--)
                        return s;
        }
        return NULL;
}

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
        __acquires(seq_file_net(seq)->packet.sklist_lock)
{
        struct net *net = seq_file_net(seq);
        read_lock(&net->packet.sklist_lock);
        return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        struct net *net = seq_file_net(seq);
        ++*pos;
        return (v == SEQ_START_TOKEN)
                ? sk_head(&net->packet.sklist)
                : sk_next((struct sock *)v);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
        __releases(seq_file_net(seq)->packet.sklist_lock)
{
        struct net *net = seq_file_net(seq);
        read_unlock(&net->packet.sklist_lock);
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
        else {
                struct sock *s = v;
                const struct packet_sock *po = pkt_sk(s);

                seq_printf(seq,
                           "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
                           s,
                           atomic_read(&s->sk_refcnt),
                           s->sk_type,
                           ntohs(po->num),
                           po->ifindex,
                           po->running,
                           atomic_read(&s->sk_rmem_alloc),
                           sock_i_uid(s),
                           sock_i_ino(s));
        }

        return 0;
}

static const struct seq_operations packet_seq_ops = {
        .start  = packet_seq_start,
        .next   = packet_seq_next,
        .stop   = packet_seq_stop,
        .show   = packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
        return seq_open_net(inode, file, &packet_seq_ops,
                            sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
        .owner          = THIS_MODULE,
        .open           = packet_seq_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = seq_release_net,
};

#endif

static int packet_net_init(struct net *net)
{
        rwlock_init(&net->packet.sklist_lock);
        INIT_HLIST_HEAD(&net->packet.sklist);

        if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
                return -ENOMEM;

        return 0;
}

static void packet_net_exit(struct net *net)
{
        proc_net_remove(net, "packet");
}

static struct pernet_operations packet_net_ops = {
        .init = packet_net_init,
        .exit = packet_net_exit,
};


static void __exit packet_exit(void)
{
        unregister_netdevice_notifier(&packet_netdev_notifier);
        unregister_pernet_subsys(&packet_net_ops);
        sock_unregister(PF_PACKET);
        proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
        int rc = proto_register(&packet_proto, 0);

        if (rc != 0)
                goto out;

        sock_register(&packet_family_ops);
        register_pernet_subsys(&packet_net_ops);
        register_netdevice_notifier(&packet_netdev_notifier);
out:
        return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);