/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              PACKET - implements raw packet sockets.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *              Alan Cox        :       verify_area() now used correctly
 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 *              Alan Cox        :       tidied skbuff lists.
 *              Alan Cox        :       Now uses generic datagram routines I
 *                                      added. Also fixed the peek/read crash
 *                                      from all old Linux datagram code.
 *              Alan Cox        :       Uses the improved datagram code.
 *              Alan Cox        :       Added NULL's for socket options.
 *              Alan Cox        :       Re-commented the code.
 *              Alan Cox        :       Use new kernel side addressing
 *              Rob Janssen     :       Correct MTU usage.
 *              Dave Platt      :       Counter leaks caused by incorrect
 *                                      interrupt locking and some slightly
 *                                      dubious gcc output. Can you read
 *                                      compiler: it said _VOLATILE_
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       New buffers. Use sk->mac.raw.
 *              Alan Cox        :       sendmsg/recvmsg support.
 *              Alan Cox        :       Protocol setting support
 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 *      Cyrus Durgin            :       Fixed kerneld for kmod.
 *      Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
 *                                      The convention is that longer addresses
 *                                      will simply extend the hardware address
 *                                      byte arrays at the end of sockaddr_ll
 *                                      and packet_mreq.
 *              Johann Baudy    :       Added TX RING.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels should still reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnels); others are silly
     (PPP).
   - A packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header.  PPP does this, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary:
  If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by the device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */
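
/*
 * What the above means to a user of this family, as a minimal userspace
 * sketch (illustrative only, not part of this file's build; error
 * handling omitted): a SOCK_RAW packet socket hands the application
 * frames that start at the link-layer header, while SOCK_DGRAM hands it
 * frames that start at the network-layer payload.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>
 *	#include <arpa/inet.h>
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *	// With SOCK_RAW, buf[0] is the first byte of the ll (e.g. Ethernet)
 *	// header; with SOCK_DGRAM it would be the first payload byte.
 *	ssize_t n = recv(fd, buf, sizeof(buf), 0);
 */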

/* Private packet socket structures. */

struct packet_mclist {
	struct packet_mclist	*next;
	int			ifindex;
	int			count;
	unsigned short		type;
	unsigned short		alen;
	unsigned char		addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
	int		mr_ifindex;
	unsigned short	mr_type;
	unsigned short	mr_alen;
	unsigned char	mr_address[MAX_ADDR_LEN];
};

#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
		int closing, int tx_ring);

struct packet_ring_buffer {
	char			**pg_vec;
	unsigned int		head;
	unsigned int		frames_per_block;
	unsigned int		frame_size;
	unsigned int		frame_max;

	unsigned int		pg_vec_order;
	unsigned int		pg_vec_pages;
	unsigned int		pg_vec_len;

	atomic_t		pending;
};

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
#endif

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct tpacket_stats	stats;
#ifdef CONFIG_PACKET_MMAP
	struct packet_ring_buffer	rx_ring;
	struct packet_ring_buffer	tx_ring;
	int			copy_thresh;
#endif
	spinlock_t		bind_lock;
	struct mutex		pg_vec_lock;
	unsigned int		running:1,	/* prot_hook is attached */
				auxdata:1,
				origdev:1;
	int			ifindex;	/* bound device		*/
	__be16			num;
	struct packet_mclist	*mclist;
#ifdef CONFIG_PACKET_MMAP
	atomic_t		mapped;
	enum tpacket_versions	tp_version;
	unsigned int		tp_hdrlen;
	unsigned int		tp_reserve;
	unsigned int		tp_loss:1;
#endif
	struct packet_type	prot_hook ____cacheline_aligned_in_smp;
};

struct packet_skb_cb {
	unsigned int origlen;
	union {
		struct sockaddr_pkt pkt;
		struct sockaddr_ll ll;
	} sa;
};

#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))

#ifdef CONFIG_PACKET_MMAP

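/*
 * Write a ring frame's tp_status word, the ownership handshake that
 * userspace polls.  The dcache flush keeps the mmap'ed view coherent on
 * architectures with aliasing caches; the barrier orders the update
 * against surrounding stores.
 */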
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_status = status;
		flush_dcache_page(virt_to_page(&h.h1->tp_status));
		break;
	case TPACKET_V2:
		h.h2->tp_status = status;
		flush_dcache_page(virt_to_page(&h.h2->tp_status));
		break;
	default:
		pr_err("TPACKET version not supported\n");
		BUG();
	}

	smp_wmb();
}

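/*
 * Read back a frame's tp_status word, the mirror image of
 * __packet_set_status(): barrier first, then flush the cache page so we
 * observe userspace's most recent update.
 */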
static int __packet_get_status(struct packet_sock *po, void *frame)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	smp_rmb();

	h.raw = frame;
	switch (po->tp_version) {
	case TPACKET_V1:
		flush_dcache_page(virt_to_page(&h.h1->tp_status));
		return h.h1->tp_status;
	case TPACKET_V2:
		flush_dcache_page(virt_to_page(&h.h2->tp_status));
		return h.h2->tp_status;
	default:
		pr_err("TPACKET version not supported\n");
		BUG();
		return 0;
	}
}

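/*
 * Map a linear frame index onto the page vector: frames are packed
 * frames_per_block to a block, so frame 'position' lives in block
 * (position / frames_per_block) at byte offset
 * (position % frames_per_block) * frame_size.  Returns the frame only
 * if its status matches the caller's expectation, else NULL.
 */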
static void *packet_lookup_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		unsigned int position,
		int status)
{
	unsigned int pg_vec_pos, frame_offset;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;

	pg_vec_pos = position / rb->frames_per_block;
	frame_offset = position % rb->frames_per_block;

	h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);

	if (status != __packet_get_status(po, h.raw))
		return NULL;

	return h.raw;
}

static inline void *packet_current_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	return packet_lookup_frame(po, rb, rb->head, status);
}

static inline void *packet_previous_frame(struct packet_sock *po,
		struct packet_ring_buffer *rb,
		int status)
{
	unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
	return packet_lookup_frame(po, rb, previous, status);
}

static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
	buff->head = buff->head != buff->frame_max ? buff->head + 1 : 0;
}

#endif

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
	WARN_ON(atomic_read(&sk->sk_rmem_alloc));
	WARN_ON(atomic_read(&sk->sk_wmem_alloc));

	if (!sock_flag(sk, SOCK_DEAD)) {
		pr_err("Attempt to release alive packet socket: %p\n", sk);
		return;
	}

	sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
			   struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have the ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so this procedure is a no-op.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	if (dev_net(dev) != sock_net(sk))
		goto out;

	skb = skb_share_check(skb, GFP_ATOMIC);
	if (skb == NULL)
		goto oom;

	/* drop any routing info */
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk, skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}


/*
 *	Output a raw packet to a device layer. This bypasses all the other
 *	protocol layers and you must therefore supply it with a complete frame.
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto = 0;
	int err;

	/*
	 *	Get and verify the address.
	 */

	if (saddr) {
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return -EINVAL;
		if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
			proto = saddr->spkt_protocol;
	} else
		return -ENOTCONN;	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	saddr->spkt_device[13] = 0;
	dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 * You may not queue a frame bigger than the mtu. This is the lowest level
	 * raw protocol and you must do your own fragmentation at this level.
	 */

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len)
		goto out_unlock;

	err = -ENOBUFS;
	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

	/*
	 * If the write buffer is full, then tough. At this level the user
	 * gets to deal with the problem - do your own algorithmic backoffs.
	 * That's far more flexible.
	 */

	if (skb == NULL)
		goto out_unlock;

	/*
	 *	Fill it in
	 */

	/* FIXME: Save some space for broken drivers that write a
	 * hard header at transmission time by themselves. PPP is the
	 * notable one here. This should really be fixed at the driver level.
	 */
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	/* Try to align data part correctly */
	if (dev->header_ops) {
		skb->data -= dev->hard_header_len;
		skb->tail -= dev->hard_header_len;
		if (len < dev->hard_header_len)
			skb_reset_network_header(skb);
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;
	if (err)
		goto out_free;

	/*
	 *	Now send it
	 */

	dev_queue_xmit(skb);
	dev_put(dev);
	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
	return err;
}

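/*
 * Run the socket's attached BPF filter, if any, over the skb and return
 * the number of bytes to keep (0 means drop).  The filter is read under
 * rcu_read_lock_bh(), matching the RCU update side of sk->sk_filter.
 */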
static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
				      unsigned int res)
{
	struct sk_filter *filter;

	rcu_read_lock_bh();
	filter = rcu_dereference(sk->sk_filter);
	if (filter != NULL)
		res = sk_run_filter(skb, filter->insns, filter->len);
	rcu_read_unlock_bh();

	return res;
}

/*
   This function does lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on
   exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
		      struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (dev_net(dev) != sock_net(sk))
		goto drop;

	skb->dev = dev;

	if (dev->header_ops) {
		/* The device has an explicit notion of ll header,
		   exported to higher levels.

		   Otherwise, the device hides details of its frame
		   structure, so that the corresponding packet head is
		   never delivered to the user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	skb_dst_drop(skb);

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	skb->dropcount = atomic_read(&sk->sk_drops);
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	consume_skb(skb);
	return 0;
}

#ifdef CONFIG_PACKET_MMAP
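/*
 * Receive path for mmap'ed (PACKET_RX_RING) sockets: instead of queueing
 * the skb, copy up to snaplen bytes into the next free ring frame, fill
 * in the tpacket header and sockaddr_ll, flip the frame's status over to
 * userspace and wake the reader.  The skb itself is only queued when it
 * does not fit in a frame and copy_thresh allows a fallback copy.
 */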
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
		       struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} h;
	u8 *skb_head = skb->data;
	int skb_len = skb->len;
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff, hdrlen;
	struct sk_buff *copy_skb = NULL;
	struct timeval tv;
	struct timespec ts;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (dev_net(dev) != sock_net(sk))
		goto drop;

	if (dev->header_ops) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
				  po->tp_reserve;
	} else {
		unsigned maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(po->tp_hdrlen +
				       (maclen < 16 ? 16 : maclen)) +
			po->tp_reserve;
		macoff = netoff - maclen;
	}

	if (macoff + snaplen > po->rx_ring.frame_size) {
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->rx_ring.frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}

	spin_lock(&sk->sk_receive_queue.lock);
	h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
	if (!h.raw)
		goto ring_is_full;
	packet_increment_head(&po->rx_ring);
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

	switch (po->tp_version) {
	case TPACKET_V1:
		h.h1->tp_len = skb->len;
		h.h1->tp_snaplen = snaplen;
		h.h1->tp_mac = macoff;
		h.h1->tp_net = netoff;
		if (skb->tstamp.tv64)
			tv = ktime_to_timeval(skb->tstamp);
		else
			do_gettimeofday(&tv);
		h.h1->tp_sec = tv.tv_sec;
		h.h1->tp_usec = tv.tv_usec;
		hdrlen = sizeof(*h.h1);
		break;
	case TPACKET_V2:
		h.h2->tp_len = skb->len;
		h.h2->tp_snaplen = snaplen;
		h.h2->tp_mac = macoff;
		h.h2->tp_net = netoff;
		if (skb->tstamp.tv64)
			ts = ktime_to_timespec(skb->tstamp);
		else
			getnstimeofday(&ts);
		h.h2->tp_sec = ts.tv_sec;
		h.h2->tp_nsec = ts.tv_nsec;
		h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
		hdrlen = sizeof(*h.h2);
		break;
	default:
		BUG();
	}

	sll = h.raw + TPACKET_ALIGN(hdrlen);
	sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev))
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	__packet_set_status(po, h.raw, status);
	smp_mb();
	{
		struct page *p_start, *p_end;
		u8 *h_end = h.raw + macoff + snaplen - 1;

		p_start = virt_to_page(h.raw);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	sk->sk_data_ready(sk, 0);
	kfree_skb(copy_skb);
	goto drop_n_restore;
}

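/*
 * Destructor for skbs built by tpacket_fill_skb(): once the driver is
 * done with the skb, hand the backing TX ring frame back to userspace
 * by marking it TP_STATUS_AVAILABLE, then release the write memory.
 */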
static void tpacket_destruct_skb(struct sk_buff *skb)
{
	struct packet_sock *po = pkt_sk(skb->sk);
	void *ph;

	BUG_ON(skb == NULL);

	if (likely(po->tx_ring.pg_vec)) {
		ph = skb_shinfo(skb)->destructor_arg;
		BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
		BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
		atomic_dec(&po->tx_ring.pending);
		__packet_set_status(po, ph, TP_STATUS_AVAILABLE);
	}

	sock_wfree(skb);
}

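/*
 * Build an skb directly on top of a TX ring frame: the payload is not
 * copied, the frame's pages are attached to the skb as paged fragments
 * (with an extra page reference each), so data leaves the socket
 * zero-copy.  Only the hard header, if the device needs one, is copied
 * into the linear area.  Returns the frame's tp_len or a negative error.
 */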
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
		void *frame, struct net_device *dev, int size_max,
		__be16 proto, unsigned char *addr)
{
	union {
		struct tpacket_hdr *h1;
		struct tpacket2_hdr *h2;
		void *raw;
	} ph;
	int to_write, offset, len, tp_len, nr_frags, len_max;
	struct socket *sock = po->sk.sk_socket;
	struct page *page;
	void *data;
	int err;

	ph.raw = frame;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = po->sk.sk_priority;
	skb->mark = po->sk.sk_mark;
	skb_shinfo(skb)->destructor_arg = ph.raw;

	switch (po->tp_version) {
	case TPACKET_V2:
		tp_len = ph.h2->tp_len;
		break;
	default:
		tp_len = ph.h1->tp_len;
		break;
	}
	if (unlikely(tp_len > size_max)) {
		pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
		return -EMSGSIZE;
	}

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
	to_write = tp_len;

	if (sock->type == SOCK_DGRAM) {
		err = dev_hard_header(skb, dev, ntohs(proto), addr,
				NULL, tp_len);
		if (unlikely(err < 0))
			return -EINVAL;
	} else if (dev->hard_header_len) {
		/* net device doesn't like empty head */
		if (unlikely(tp_len <= dev->hard_header_len)) {
			pr_err("packet size is too short (%d < %d)\n",
			       tp_len, dev->hard_header_len);
			return -EINVAL;
		}

		skb_push(skb, dev->hard_header_len);
		err = skb_store_bits(skb, 0, data,
				dev->hard_header_len);
		if (unlikely(err))
			return err;

		data += dev->hard_header_len;
		to_write -= dev->hard_header_len;
	}

	err = -EFAULT;
	page = virt_to_page(data);
	offset = offset_in_page(data);
	len_max = PAGE_SIZE - offset;
	len = ((to_write > len_max) ? len_max : to_write);

	skb->data_len = to_write;
	skb->len += to_write;
	skb->truesize += to_write;
	atomic_add(to_write, &po->sk.sk_wmem_alloc);

	while (likely(to_write)) {
		nr_frags = skb_shinfo(skb)->nr_frags;

		if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
			pr_err("Packet exceed the number of skb frags(%lu)\n",
			       MAX_SKB_FRAGS);
			return -EFAULT;
		}

		flush_dcache_page(page);
		get_page(page);
		skb_fill_page_desc(skb,
				nr_frags,
				page++, offset, len);
		to_write -= len;
		offset = 0;
		len_max = PAGE_SIZE;
		len = ((to_write > len_max) ? len_max : to_write);
	}

	return tp_len;
}

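/*
 * Transmit loop for PACKET_TX_RING: walk the TX ring, turn each frame
 * marked TP_STATUS_SEND_REQUEST into an skb via tpacket_fill_skb(),
 * mark it TP_STATUS_SENDING and hand it to dev_queue_xmit().  Frames
 * return to TP_STATUS_AVAILABLE from tpacket_destruct_skb().  Unless
 * MSG_DONTWAIT is set, keep looping until all in-flight frames drain.
 */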
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
	struct socket *sock;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	int ifindex, err, reserve = 0;
	void *ph;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	int tp_len, size_max;
	unsigned char *addr;
	int len_sum = 0;
	int status = 0;

	sock = po->sk.sk_socket;

	mutex_lock(&po->pg_vec_lock);

	err = -EBUSY;
	if (saddr == NULL) {
		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen
					+ offsetof(struct sockaddr_ll,
						sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}

	dev = dev_get_by_index(sock_net(&po->sk), ifindex);
	err = -ENXIO;
	if (unlikely(dev == NULL))
		goto out;

	reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (unlikely(!(dev->flags & IFF_UP)))
		goto out_put;

	size_max = po->tx_ring.frame_size
		- (po->tp_hdrlen - sizeof(struct sockaddr_ll));

	if (size_max > dev->mtu + reserve)
		size_max = dev->mtu + reserve;

	do {
		ph = packet_current_frame(po, &po->tx_ring,
				TP_STATUS_SEND_REQUEST);

		if (unlikely(ph == NULL)) {
			schedule();
			continue;
		}

		status = TP_STATUS_SEND_REQUEST;
		skb = sock_alloc_send_skb(&po->sk,
				LL_ALLOCATED_SPACE(dev)
				+ sizeof(struct sockaddr_ll),
				0, &err);

		if (unlikely(skb == NULL))
			goto out_status;

		tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
				addr);

		if (unlikely(tp_len < 0)) {
			if (po->tp_loss) {
				__packet_set_status(po, ph,
						TP_STATUS_AVAILABLE);
				packet_increment_head(&po->tx_ring);
				kfree_skb(skb);
				continue;
			} else {
				status = TP_STATUS_WRONG_FORMAT;
				err = tp_len;
				goto out_status;
			}
		}

		skb->destructor = tpacket_destruct_skb;
		__packet_set_status(po, ph, TP_STATUS_SENDING);
		atomic_inc(&po->tx_ring.pending);

		status = TP_STATUS_SEND_REQUEST;
		err = dev_queue_xmit(skb);
		if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
			goto out_xmit;
		packet_increment_head(&po->tx_ring);
		len_sum += tp_len;
	} while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
					&& (atomic_read(&po->tx_ring.pending))))
	      );

	err = len_sum;
	goto out_put;

out_xmit:
	skb->destructor = sock_wfree;
	atomic_dec(&po->tx_ring.pending);
out_status:
	__packet_set_status(po, ph, status);
	kfree_skb(skb);
out_put:
	dev_put(dev);
out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
#endif

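/*
 * Ordinary (non-mmap) send path: allocate an skb sized for the frame
 * plus link-layer headroom, copy the payload in from userspace and hand
 * it straight to dev_queue_xmit(), bypassing all protocol layers.
 */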
static int packet_snd(struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr;
	int ifindex, err, reserve = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr == NULL) {
		struct packet_sock *po = pkt_sk(sk);

		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}


	dev = dev_get_by_index(sock_net(sk), ifindex);
	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	err = -EMSGSIZE;
	if (len > dev->mtu + reserve)
		goto out_unlock;

	skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
				msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb == NULL)
		goto out_unlock;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	err = -EINVAL;
	if (sock->type == SOCK_DGRAM &&
	    dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
		goto out_free;

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
	if (err)
		goto out_free;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	skb->mark = sk->sk_mark;

	/*
	 *	Now send it
	 */

	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return len;

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}

static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
		struct msghdr *msg, size_t len)
{
#ifdef CONFIG_PACKET_MMAP
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	if (po->tx_ring.pg_vec)
		return tpacket_snd(po, msg);
	else
#endif
		return packet_snd(sock, msg, len);
}

/*
 *	Close a PACKET socket. This is fairly simple. We immediately go
 *	to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;
	struct net *net;
#ifdef CONFIG_PACKET_MMAP
	struct tpacket_req req;
#endif

	if (!sk)
		return 0;

	net = sock_net(sk);
	po = pkt_sk(sk);

	write_lock_bh(&net->packet.sklist_lock);
	sk_del_node_init(sk);
	sock_prot_inuse_add(net, sk->sk_prot, -1);
	write_unlock_bh(&net->packet.sklist_lock);

	/*
	 *	Unhook packet receive handler.
	 */

	if (po->running) {
		/*
		 *	Remove the protocol hook
		 */
		dev_remove_pack(&po->prot_hook);
		po->running = 0;
		po->num = 0;
		__sock_put(sk);
	}

	packet_flush_mclist(sk);

#ifdef CONFIG_PACKET_MMAP
	memset(&req, 0, sizeof(req));

	if (po->rx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 0);

	if (po->tx_ring.pg_vec)
		packet_set_ring(sk, &req, 1, 1);
#endif

	/*
	 *	Now the socket is dead. No more input will appear.
	 */

	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);
	sk_refcnt_debug_release(sk);

	sock_put(sk);
	return 0;
}

/*
 *	Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (po->running) {
		__sock_put(sk);
		po->running = 0;
		po->num = 0;
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	if (protocol == 0)
		goto out_unlock;

	if (!dev || (dev->flags & IFF_UP)) {
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	} else {
		sk->sk_err = ENETDOWN;
		if (!sock_flag(sk, SOCK_DEAD))
			sk->sk_error_report(sk);
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}

/*
 *	Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
			    int addr_len)
{
	struct sock *sk = sock->sk;
	char name[15];
	struct net_device *dev;
	int err = -ENODEV;

	/*
	 *	Check legality
	 */

	if (addr_len != sizeof(struct sockaddr))
		return -EINVAL;
	strlcpy(name, uaddr->sa_data, sizeof(name));

	dev = dev_get_by_name(sock_net(sk), name);
	if (dev) {
		err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
		dev_put(dev);
	}
	return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
	struct sock *sk = sock->sk;
	struct net_device *dev = NULL;
	int err;


	/*
	 *	Check legality
	 */

	if (addr_len < sizeof(struct sockaddr_ll))
		return -EINVAL;
	if (sll->sll_family != AF_PACKET)
		return -EINVAL;

	if (sll->sll_ifindex) {
		err = -ENODEV;
		dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
		if (dev == NULL)
			goto out;
	}
	err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
	if (dev)
		dev_put(dev);

out:
	return err;
}

static struct proto packet_proto = {
	.name	  = "PACKET",
	.owner	  = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};

/*
 *	Create a packet socket (SOCK_DGRAM, SOCK_RAW or SOCK_PACKET).
 */

static int packet_create(struct net *net, struct socket *sock, int protocol)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
	    sock->type != SOCK_PACKET)
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
	if (sk == NULL)
		goto out;

	sock->ops = &packet_ops;
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;

	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	sk->sk_destruct = packet_sock_destruct;
	sk_refcnt_debug_inc(sk);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	mutex_init(&po->pg_vec_lock);
	po->prot_hook.func = packet_rcv;

	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;

	po->prot_hook.af_packet_priv = sk;

	if (proto) {
		po->prot_hook.type = proto;
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

	write_lock_bh(&net->packet.sklist_lock);
	sk_add_node(sk, &net->packet.sklist);
	sock_prot_inuse_add(net, &packet_proto, 1);
	write_unlock_bh(&net->packet.sklist_lock);
	return 0;
out:
	return err;
}

/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	struct sockaddr_ll *sll;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN if the device has just gone down,
	 *	but then it will block.
	 */

	skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't need to see or worry about
	 *	blocking retries.
	 */

	if (skb == NULL)
		goto out;

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries a
	 *	user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len) {
		copied = len;
		msg->msg_flags |= MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_ts_and_drops(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);

	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);
		aux.tp_vlan_tci = vlan_tx_tag_get(skb);

		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}

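/*
 * getsockname() handlers.  SOCK_PACKET reports only the bound device
 * name; AF_PACKET proper reports ifindex, protocol and, when the device
 * still exists, its hardware type and address.
 */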
static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
			       int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;

	if (peer)
		return -EOPNOTSUPP;

	uaddr->sa_family = AF_PACKET;
	dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
	if (dev) {
		strlcpy(uaddr->sa_data, dev->name, 15);
		dev_put(dev);
	} else
		memset(uaddr->sa_data, 0, 14);
	*uaddr_len = sizeof(*uaddr);

	return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
			  int *uaddr_len, int peer)
{
	struct net_device *dev;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;

	if (peer)
		return -EOPNOTSUPP;

	sll->sll_family = AF_PACKET;
	sll->sll_ifindex = po->ifindex;
	sll->sll_protocol = po->num;
	dev = dev_get_by_index(sock_net(sk), po->ifindex);
	if (dev) {
		sll->sll_hatype = dev->type;
		sll->sll_halen = dev->addr_len;
		memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
		dev_put(dev);
	} else {
		sll->sll_hatype = 0;	/* Bad: we have no ARPHRD_UNSPEC */
		sll->sll_halen = 0;
	}
	*uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

	return 0;
}

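/*
 * Apply one membership entry to a device: 'what' is +1 to add the
 * address (or bump promiscuity/allmulti) and -1 to remove it again.
 */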
static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
			 int what)
{
	switch (i->type) {
	case PACKET_MR_MULTICAST:
		if (what > 0)
			return dev_mc_add(dev, i->addr, i->alen, 0);
		else
			return dev_mc_delete(dev, i->addr, i->alen, 0);
		break;
	case PACKET_MR_PROMISC:
		return dev_set_promiscuity(dev, what);
		break;
	case PACKET_MR_ALLMULTI:
		return dev_set_allmulti(dev, what);
		break;
	case PACKET_MR_UNICAST:
		if (what > 0)
			return dev_unicast_add(dev, i->addr);
		else
			return dev_unicast_delete(dev, i->addr);
		break;
	default:
		break;
	}
	return 0;
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
	for ( ; i; i = i->next) {
		if (i->ifindex == dev->ifindex)
			packet_dev_mc(dev, i, what);
	}
}

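/*
 * PACKET_ADD_MEMBERSHIP: memberships are kept per socket in a refcounted
 * singly linked list (po->mclist), so adding the same address twice only
 * bumps the count and the device sees each address at most once.
 */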
1597 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1598 {
1599         struct packet_sock *po = pkt_sk(sk);
1600         struct packet_mclist *ml, *i;
1601         struct net_device *dev;
1602         int err;
1603
1604         rtnl_lock();
1605
1606         err = -ENODEV;
1607         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1608         if (!dev)
1609                 goto done;
1610
1611         err = -EINVAL;
1612         if (mreq->mr_alen > dev->addr_len)
1613                 goto done;
1614
1615         err = -ENOBUFS;
1616         i = kmalloc(sizeof(*i), GFP_KERNEL);
1617         if (i == NULL)
1618                 goto done;
1619
1620         err = 0;
1621         for (ml = po->mclist; ml; ml = ml->next) {
1622                 if (ml->ifindex == mreq->mr_ifindex &&
1623                     ml->type == mreq->mr_type &&
1624                     ml->alen == mreq->mr_alen &&
1625                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1626                         ml->count++;
1627                         /* Free the new element ... */
1628                         kfree(i);
1629                         goto done;
1630                 }
1631         }
1632
1633         i->type = mreq->mr_type;
1634         i->ifindex = mreq->mr_ifindex;
1635         i->alen = mreq->mr_alen;
1636         memcpy(i->addr, mreq->mr_address, i->alen);
1637         i->count = 1;
1638         i->next = po->mclist;
1639         po->mclist = i;
1640         err = packet_dev_mc(dev, i, 1);
1641         if (err) {
1642                 po->mclist = i->next;
1643                 kfree(i);
1644         }
1645
1646 done:
1647         rtnl_unlock();
1648         return err;
1649 }
1650
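/*
 * PACKET_DROP_MEMBERSHIP: drop one reference; the entry is unlinked and
 * the device filter removed only when the use count reaches zero.
 */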
1651 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1652 {
1653         struct packet_mclist *ml, **mlp;
1654
1655         rtnl_lock();
1656
1657         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1658                 if (ml->ifindex == mreq->mr_ifindex &&
1659                     ml->type == mreq->mr_type &&
1660                     ml->alen == mreq->mr_alen &&
1661                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1662                         if (--ml->count == 0) {
1663                                 struct net_device *dev;
1664                                 *mlp = ml->next;
1665                                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1666                                 if (dev)
1667                                         packet_dev_mc(dev, ml, -1);
1668                                 kfree(ml);
1669                         }
1670                         rtnl_unlock();
1671                         return 0;
1672                 }
1673         }
1674         rtnl_unlock();
1675         return -EADDRNOTAVAIL;
1676 }
1677
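/* Release every remaining membership, e.g. when the socket is closed. */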
1678 static void packet_flush_mclist(struct sock *sk)
1679 {
1680         struct packet_sock *po = pkt_sk(sk);
1681         struct packet_mclist *ml;
1682
1683         if (!po->mclist)
1684                 return;
1685
1686         rtnl_lock();
1687         while ((ml = po->mclist) != NULL) {
1688                 struct net_device *dev;
1689
1690                 po->mclist = ml->next;
1691                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1692                 if (dev != NULL)
1693                         packet_dev_mc(dev, ml, -1);
1694                 kfree(ml);
1695         }
1696         rtnl_unlock();
1697 }
1698
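/*
 * SOL_PACKET socket options.  As a purely illustrative userspace sketch
 * (fd and the interface index are hypothetical, not taken from this
 * file), enabling promiscuous mode through the membership interface
 * looks roughly like:
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = 2,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */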
1699 static int
1700 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1701 {
1702         struct sock *sk = sock->sk;
1703         struct packet_sock *po = pkt_sk(sk);
1704         int ret;
1705
1706         if (level != SOL_PACKET)
1707                 return -ENOPROTOOPT;
1708
1709         switch (optname) {
1710         case PACKET_ADD_MEMBERSHIP:
1711         case PACKET_DROP_MEMBERSHIP:
1712         {
1713                 struct packet_mreq_max mreq;
1714                 int len = optlen;
1715                 memset(&mreq, 0, sizeof(mreq));
1716                 if (len < sizeof(struct packet_mreq))
1717                         return -EINVAL;
1718                 if (len > sizeof(mreq))
1719                         len = sizeof(mreq);
1720                 if (copy_from_user(&mreq, optval, len))
1721                         return -EFAULT;
1722                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1723                         return -EINVAL;
1724                 if (optname == PACKET_ADD_MEMBERSHIP)
1725                         ret = packet_mc_add(sk, &mreq);
1726                 else
1727                         ret = packet_mc_drop(sk, &mreq);
1728                 return ret;
1729         }
1730
1731 #ifdef CONFIG_PACKET_MMAP
1732         case PACKET_RX_RING:
1733         case PACKET_TX_RING:
1734         {
1735                 struct tpacket_req req;
1736
1737                 if (optlen < sizeof(req))
1738                         return -EINVAL;
1739                 if (copy_from_user(&req, optval, sizeof(req)))
1740                         return -EFAULT;
1741                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1742         }
1743         case PACKET_COPY_THRESH:
1744         {
1745                 int val;
1746
1747                 if (optlen != sizeof(val))
1748                         return -EINVAL;
1749                 if (copy_from_user(&val, optval, sizeof(val)))
1750                         return -EFAULT;
1751
1752                 pkt_sk(sk)->copy_thresh = val;
1753                 return 0;
1754         }
1755         case PACKET_VERSION:
1756         {
1757                 int val;
1758
1759                 if (optlen != sizeof(val))
1760                         return -EINVAL;
1761                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1762                         return -EBUSY;
1763                 if (copy_from_user(&val, optval, sizeof(val)))
1764                         return -EFAULT;
1765                 switch (val) {
1766                 case TPACKET_V1:
1767                 case TPACKET_V2:
1768                         po->tp_version = val;
1769                         return 0;
1770                 default:
1771                         return -EINVAL;
1772                 }
1773         }
1774         case PACKET_RESERVE:
1775         {
1776                 unsigned int val;
1777
1778                 if (optlen != sizeof(val))
1779                         return -EINVAL;
1780                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1781                         return -EBUSY;
1782                 if (copy_from_user(&val, optval, sizeof(val)))
1783                         return -EFAULT;
1784                 po->tp_reserve = val;
1785                 return 0;
1786         }
1787         case PACKET_LOSS:
1788         {
1789                 unsigned int val;
1790
1791                 if (optlen != sizeof(val))
1792                         return -EINVAL;
1793                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1794                         return -EBUSY;
1795                 if (copy_from_user(&val, optval, sizeof(val)))
1796                         return -EFAULT;
1797                 po->tp_loss = !!val;
1798                 return 0;
1799         }
1800 #endif
1801         case PACKET_AUXDATA:
1802         {
1803                 int val;
1804
1805                 if (optlen < sizeof(val))
1806                         return -EINVAL;
1807                 if (copy_from_user(&val, optval, sizeof(val)))
1808                         return -EFAULT;
1809
1810                 po->auxdata = !!val;
1811                 return 0;
1812         }
1813         case PACKET_ORIGDEV:
1814         {
1815                 int val;
1816
1817                 if (optlen < sizeof(val))
1818                         return -EINVAL;
1819                 if (copy_from_user(&val, optval, sizeof(val)))
1820                         return -EFAULT;
1821
1822                 po->origdev = !!val;
1823                 return 0;
1824         }
1825         default:
1826                 return -ENOPROTOOPT;
1827         }
1828 }
1829
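/*
 * getsockopt: len is only ever clamped downwards and is written back
 * through optlen before the data is copied out.  PACKET_STATISTICS
 * folds tp_drops into tp_packets and zeroes the counters on read.
 */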
1830 static int packet_getsockopt(struct socket *sock, int level, int optname,
1831                              char __user *optval, int __user *optlen)
1832 {
1833         int len;
1834         int val;
1835         struct sock *sk = sock->sk;
1836         struct packet_sock *po = pkt_sk(sk);
1837         void *data;
1838         struct tpacket_stats st;
1839
1840         if (level != SOL_PACKET)
1841                 return -ENOPROTOOPT;
1842
1843         if (get_user(len, optlen))
1844                 return -EFAULT;
1845
1846         if (len < 0)
1847                 return -EINVAL;
1848
1849         switch (optname) {
1850         case PACKET_STATISTICS:
1851                 if (len > sizeof(struct tpacket_stats))
1852                         len = sizeof(struct tpacket_stats);
1853                 spin_lock_bh(&sk->sk_receive_queue.lock);
1854                 st = po->stats;
1855                 memset(&po->stats, 0, sizeof(st));
1856                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1857                 st.tp_packets += st.tp_drops;
1858
1859                 data = &st;
1860                 break;
1861         case PACKET_AUXDATA:
1862                 if (len > sizeof(int))
1863                         len = sizeof(int);
1864                 val = po->auxdata;
1865
1866                 data = &val;
1867                 break;
1868         case PACKET_ORIGDEV:
1869                 if (len > sizeof(int))
1870                         len = sizeof(int);
1871                 val = po->origdev;
1872
1873                 data = &val;
1874                 break;
1875 #ifdef CONFIG_PACKET_MMAP
1876         case PACKET_VERSION:
1877                 if (len > sizeof(int))
1878                         len = sizeof(int);
1879                 val = po->tp_version;
1880                 data = &val;
1881                 break;
1882         case PACKET_HDRLEN:
1883                 if (len != sizeof(int))
1884                         return -EINVAL; /* never switch on a partly uninitialized val */
1885                 if (copy_from_user(&val, optval, len))
1886                         return -EFAULT;
1887                 switch (val) {
1888                 case TPACKET_V1:
1889                         val = sizeof(struct tpacket_hdr);
1890                         break;
1891                 case TPACKET_V2:
1892                         val = sizeof(struct tpacket2_hdr);
1893                         break;
1894                 default:
1895                         return -EINVAL;
1896                 }
1897                 data = &val;
1898                 break;
1899         case PACKET_RESERVE:
1900                 if (len > sizeof(unsigned int))
1901                         len = sizeof(unsigned int);
1902                 val = po->tp_reserve;
1903                 data = &val;
1904                 break;
1905         case PACKET_LOSS:
1906                 if (len > sizeof(unsigned int))
1907                         len = sizeof(unsigned int);
1908                 val = po->tp_loss;
1909                 data = &val;
1910                 break;
1911 #endif
1912         default:
1913                 return -ENOPROTOOPT;
1914         }
1915
1916         if (put_user(len, optlen))
1917                 return -EFAULT;
1918         if (copy_to_user(optval, data, len))
1919                 return -EFAULT;
1920         return 0;
1921 }
1922
1923
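/*
 * Netdevice notifier: on NETDEV_UNREGISTER drop each socket's
 * memberships on the device, then (as for NETDEV_DOWN) detach the
 * protocol hook and raise ENETDOWN; on NETDEV_UP re-attach sockets
 * still bound to the device.
 */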
1924 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1925 {
1926         struct sock *sk;
1927         struct hlist_node *node;
1928         struct net_device *dev = data;
1929         struct net *net = dev_net(dev);
1930
1931         read_lock(&net->packet.sklist_lock);
1932         sk_for_each(sk, node, &net->packet.sklist) {
1933                 struct packet_sock *po = pkt_sk(sk);
1934
1935                 switch (msg) {
1936                 case NETDEV_UNREGISTER:
1937                         if (po->mclist)
1938                                 packet_dev_mclist(dev, po->mclist, -1);
1939                         /* fallthrough */
1940
1941                 case NETDEV_DOWN:
1942                         if (dev->ifindex == po->ifindex) {
1943                                 spin_lock(&po->bind_lock);
1944                                 if (po->running) {
1945                                         __dev_remove_pack(&po->prot_hook);
1946                                         __sock_put(sk);
1947                                         po->running = 0;
1948                                         sk->sk_err = ENETDOWN;
1949                                         if (!sock_flag(sk, SOCK_DEAD))
1950                                                 sk->sk_error_report(sk);
1951                                 }
1952                                 if (msg == NETDEV_UNREGISTER) {
1953                                         po->ifindex = -1;
1954                                         po->prot_hook.dev = NULL;
1955                                 }
1956                                 spin_unlock(&po->bind_lock);
1957                         }
1958                         break;
1959                 case NETDEV_UP:
1960                         spin_lock(&po->bind_lock);
1961                         if (dev->ifindex == po->ifindex && po->num &&
1962                             !po->running) {
1963                                 dev_add_pack(&po->prot_hook);
1964                                 sock_hold(sk);
1965                                 po->running = 1;
1966                         }
1967                         spin_unlock(&po->bind_lock);
1968                         break;
1969                 }
1970         }
1971         read_unlock(&net->packet.sklist_lock);
1972         return NOTIFY_DONE;
1973 }
1974
1975
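/*
 * ioctls: SIOCOUTQ reports unsent queue bytes and SIOCINQ the length of
 * the next pending packet; the classic inet interface ioctls are
 * forwarded to inet_dgram_ops, but only in the initial namespace.
 */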
1976 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1977                         unsigned long arg)
1978 {
1979         struct sock *sk = sock->sk;
1980
1981         switch (cmd) {
1982         case SIOCOUTQ:
1983         {
1984                 int amount = sk_wmem_alloc_get(sk);
1985
1986                 return put_user(amount, (int __user *)arg);
1987         }
1988         case SIOCINQ:
1989         {
1990                 struct sk_buff *skb;
1991                 int amount = 0;
1992
1993                 spin_lock_bh(&sk->sk_receive_queue.lock);
1994                 skb = skb_peek(&sk->sk_receive_queue);
1995                 if (skb)
1996                         amount = skb->len;
1997                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1998                 return put_user(amount, (int __user *)arg);
1999         }
2000         case SIOCGSTAMP:
2001                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2002         case SIOCGSTAMPNS:
2003                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2004
2005 #ifdef CONFIG_INET
2006         case SIOCADDRT:
2007         case SIOCDELRT:
2008         case SIOCDARP:
2009         case SIOCGARP:
2010         case SIOCSARP:
2011         case SIOCGIFADDR:
2012         case SIOCSIFADDR:
2013         case SIOCGIFBRDADDR:
2014         case SIOCSIFBRDADDR:
2015         case SIOCGIFNETMASK:
2016         case SIOCSIFNETMASK:
2017         case SIOCGIFDSTADDR:
2018         case SIOCSIFDSTADDR:
2019         case SIOCSIFFLAGS:
2020                 if (!net_eq(sock_net(sk), &init_net))
2021                         return -ENOIOCTLCMD;
2022                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2023 #endif
2024
2025         default:
2026                 return -ENOIOCTLCMD;
2027         }
2029 }
2030
2031 #ifndef CONFIG_PACKET_MMAP
2032 #define packet_mmap sock_no_mmap
2033 #define packet_poll datagram_poll
2034 #else
2035
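/*
 * With rings mapped, extend datagram_poll(): POLLIN when the rx slot
 * just before head is no longer kernel-owned (it holds a frame for
 * userspace), POLLOUT when the current tx slot is free to be filled.
 */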
2036 static unsigned int packet_poll(struct file *file, struct socket *sock,
2037                                 poll_table *wait)
2038 {
2039         struct sock *sk = sock->sk;
2040         struct packet_sock *po = pkt_sk(sk);
2041         unsigned int mask = datagram_poll(file, sock, wait);
2042
2043         spin_lock_bh(&sk->sk_receive_queue.lock);
2044         if (po->rx_ring.pg_vec) {
2045                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2046                         mask |= POLLIN | POLLRDNORM;
2047         }
2048         spin_unlock_bh(&sk->sk_receive_queue.lock);
2049         spin_lock_bh(&sk->sk_write_queue.lock);
2050         if (po->tx_ring.pg_vec) {
2051                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2052                         mask |= POLLOUT | POLLWRNORM;
2053         }
2054         spin_unlock_bh(&sk->sk_write_queue.lock);
2055         return mask;
2056 }
2057
2058
2059 /* Dirty? Well, I have not yet found a better way to account
2060  * for user mmaps.
2061  */
2062
2063 static void packet_mm_open(struct vm_area_struct *vma)
2064 {
2065         struct file *file = vma->vm_file;
2066         struct socket *sock = file->private_data;
2067         struct sock *sk = sock->sk;
2068
2069         if (sk)
2070                 atomic_inc(&pkt_sk(sk)->mapped);
2071 }
2072
2073 static void packet_mm_close(struct vm_area_struct *vma)
2074 {
2075         struct file *file = vma->vm_file;
2076         struct socket *sock = file->private_data;
2077         struct sock *sk = sock->sk;
2078
2079         if (sk)
2080                 atomic_dec(&pkt_sk(sk)->mapped);
2081 }
2082
2083 static const struct vm_operations_struct packet_mmap_ops = {
2084         .open   =       packet_mm_open,
2085         .close  =       packet_mm_close,
2086 };
2087
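/*
 * Ring memory is a vector of blocks, each a physically contiguous
 * allocation of 2^order pages; the helpers below manage that vector.
 */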
2088 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2089 {
2090         int i;
2091
2092         for (i = 0; i < len; i++) {
2093                 if (likely(pg_vec[i]))
2094                         free_pages((unsigned long) pg_vec[i], order);
2095         }
2096         kfree(pg_vec);
2097 }
2098
2099 static inline char *alloc_one_pg_vec_page(unsigned long order)
2100 {
2101         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2102
2103         return (char *) __get_free_pages(gfp_flags, order);
2104 }
2105
2106 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2107 {
2108         unsigned int block_nr = req->tp_block_nr;
2109         char **pg_vec;
2110         int i;
2111
2112         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2113         if (unlikely(!pg_vec))
2114                 goto out;
2115
2116         for (i = 0; i < block_nr; i++) {
2117                 pg_vec[i] = alloc_one_pg_vec_page(order);
2118                 if (unlikely(!pg_vec[i]))
2119                         goto out_free_pgvec;
2120         }
2121
2122 out:
2123         return pg_vec;
2124
2125 out_free_pgvec:
2126         free_pg_vec(pg_vec, order, block_nr);
2127         pg_vec = NULL;
2128         goto out;
2129 }
2130
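/*
 * Install an rx or tx ring, or tear it down when tp_block_nr is zero.
 * The geometry checks below require tp_block_size to be a positive
 * multiple of PAGE_SIZE, tp_frame_size to be TPACKET_ALIGNMENT-aligned
 * and at least tp_hdrlen + tp_reserve, and tp_frame_nr to equal
 * frames_per_block * tp_block_nr.
 *
 * Purely as an illustrative sketch (fd is hypothetical and the sizes
 * assume PAGE_SIZE == 4096), a userspace rx ring setup might read:
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 128,		(2 frames/block * 64 blocks)
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */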
2131 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2132                 int closing, int tx_ring)
2133 {
2134         char **pg_vec = NULL;
2135         struct packet_sock *po = pkt_sk(sk);
2136         int was_running, order = 0;
2137         struct packet_ring_buffer *rb;
2138         struct sk_buff_head *rb_queue;
2139         __be16 num;
2140         int err;
2141
2142         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2143         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2144
2145         err = -EBUSY;
2146         if (!closing) {
2147                 if (atomic_read(&po->mapped))
2148                         goto out;
2149                 if (atomic_read(&rb->pending))
2150                         goto out;
2151         }
2152
2153         if (req->tp_block_nr) {
2154                 /* Sanity tests and some calculations */
2155                 err = -EBUSY;
2156                 if (unlikely(rb->pg_vec))
2157                         goto out;
2158
2159                 switch (po->tp_version) {
2160                 case TPACKET_V1:
2161                         po->tp_hdrlen = TPACKET_HDRLEN;
2162                         break;
2163                 case TPACKET_V2:
2164                         po->tp_hdrlen = TPACKET2_HDRLEN;
2165                         break;
2166                 }
2167
2168                 err = -EINVAL;
2169                 if (unlikely((int)req->tp_block_size <= 0))
2170                         goto out;
2171                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2172                         goto out;
2173                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2174                                         po->tp_reserve))
2175                         goto out;
2176                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2177                         goto out;
2178
2179                 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2180                 if (unlikely(rb->frames_per_block <= 0))
2181                         goto out;
2182                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2183                                         req->tp_frame_nr))
2184                         goto out;
2185
2186                 err = -ENOMEM;
2187                 order = get_order(req->tp_block_size);
2188                 pg_vec = alloc_pg_vec(req, order);
2189                 if (unlikely(!pg_vec))
2190                         goto out;
2191         } else {
2192                 /* tp_block_nr == 0: the caller is tearing the ring down */
2194                 err = -EINVAL;
2195                 if (unlikely(req->tp_frame_nr))
2196                         goto out;
2197         }
2198
2199         lock_sock(sk);
2200
2201         /* Detach socket from network */
2202         spin_lock(&po->bind_lock);
2203         was_running = po->running;
2204         num = po->num;
2205         if (was_running) {
2206                 __dev_remove_pack(&po->prot_hook);
2207                 po->num = 0;
2208                 po->running = 0;
2209                 __sock_put(sk);
2210         }
2211         spin_unlock(&po->bind_lock);
2212
2213         synchronize_net();
2214
2215         err = -EBUSY;
2216         mutex_lock(&po->pg_vec_lock);
2217         if (closing || atomic_read(&po->mapped) == 0) {
2218                 err = 0;
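/* XC(a, b): store b in a and hand back a's previous value (a swap) */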
2219 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2220                 spin_lock_bh(&rb_queue->lock);
2221                 pg_vec = XC(rb->pg_vec, pg_vec);
2222                 rb->frame_max = (req->tp_frame_nr - 1);
2223                 rb->head = 0;
2224                 rb->frame_size = req->tp_frame_size;
2225                 spin_unlock_bh(&rb_queue->lock);
2226
2227                 order = XC(rb->pg_vec_order, order);
2228                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2229
2230                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2231                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2232                                                 tpacket_rcv : packet_rcv;
2233                 skb_queue_purge(rb_queue);
2234 #undef XC
2235                 if (atomic_read(&po->mapped))
2236                         pr_err("packet_mmap: vma is busy: %d\n",
2237                                atomic_read(&po->mapped));
2238         }
2239         mutex_unlock(&po->pg_vec_lock);
2240
2241         spin_lock(&po->bind_lock);
2242         if (was_running && !po->running) {
2243                 sock_hold(sk);
2244                 po->running = 1;
2245                 po->num = num;
2246                 dev_add_pack(&po->prot_hook);
2247         }
2248         spin_unlock(&po->bind_lock);
2249
2250         release_sock(sk);
2251
2252         if (pg_vec)
2253                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2254 out:
2255         return err;
2256 }
2257
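/*
 * mmap() both rings as one contiguous mapping, rx ring first then tx
 * (the pointer loop below relies on rx_ring and tx_ring being adjacent
 * members).  The vma must start at offset zero and match the combined
 * ring size exactly; every page is inserted up front, so the ops need
 * no fault handler.
 */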
2258 static int packet_mmap(struct file *file, struct socket *sock,
2259                 struct vm_area_struct *vma)
2260 {
2261         struct sock *sk = sock->sk;
2262         struct packet_sock *po = pkt_sk(sk);
2263         unsigned long size, expected_size;
2264         struct packet_ring_buffer *rb;
2265         unsigned long start;
2266         int err = -EINVAL;
2267         int i;
2268
2269         if (vma->vm_pgoff)
2270                 return -EINVAL;
2271
2272         mutex_lock(&po->pg_vec_lock);
2273
2274         expected_size = 0;
2275         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2276                 if (rb->pg_vec) {
2277                         expected_size += rb->pg_vec_len
2278                                                 * rb->pg_vec_pages
2279                                                 * PAGE_SIZE;
2280                 }
2281         }
2282
2283         if (expected_size == 0)
2284                 goto out;
2285
2286         size = vma->vm_end - vma->vm_start;
2287         if (size != expected_size)
2288                 goto out;
2289
2290         start = vma->vm_start;
2291         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2292                 if (rb->pg_vec == NULL)
2293                         continue;
2294
2295                 for (i = 0; i < rb->pg_vec_len; i++) {
2296                         struct page *page = virt_to_page(rb->pg_vec[i]);
2297                         int pg_num;
2298
2299                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2300                                         pg_num++, page++) {
2301                                 err = vm_insert_page(vma, start, page);
2302                                 if (unlikely(err))
2303                                         goto out;
2304                                 start += PAGE_SIZE;
2305                         }
2306                 }
2307         }
2308
2309         atomic_inc(&po->mapped);
2310         vma->vm_ops = &packet_mmap_ops;
2311         err = 0;
2312
2313 out:
2314         mutex_unlock(&po->pg_vec_lock);
2315         return err;
2316 }
2317 #endif
2318
2319
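/*
 * Two op vectors: packet_ops_spkt serves the obsolete SOCK_PACKET
 * interface, while packet_ops serves SOCK_RAW/SOCK_DGRAM sockets and
 * adds membership setsockopts, poll on the rings and mmap.
 */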
2320 static const struct proto_ops packet_ops_spkt = {
2321         .family =       PF_PACKET,
2322         .owner =        THIS_MODULE,
2323         .release =      packet_release,
2324         .bind =         packet_bind_spkt,
2325         .connect =      sock_no_connect,
2326         .socketpair =   sock_no_socketpair,
2327         .accept =       sock_no_accept,
2328         .getname =      packet_getname_spkt,
2329         .poll =         datagram_poll,
2330         .ioctl =        packet_ioctl,
2331         .listen =       sock_no_listen,
2332         .shutdown =     sock_no_shutdown,
2333         .setsockopt =   sock_no_setsockopt,
2334         .getsockopt =   sock_no_getsockopt,
2335         .sendmsg =      packet_sendmsg_spkt,
2336         .recvmsg =      packet_recvmsg,
2337         .mmap =         sock_no_mmap,
2338         .sendpage =     sock_no_sendpage,
2339 };
2340
2341 static const struct proto_ops packet_ops = {
2342         .family =       PF_PACKET,
2343         .owner =        THIS_MODULE,
2344         .release =      packet_release,
2345         .bind =         packet_bind,
2346         .connect =      sock_no_connect,
2347         .socketpair =   sock_no_socketpair,
2348         .accept =       sock_no_accept,
2349         .getname =      packet_getname,
2350         .poll =         packet_poll,
2351         .ioctl =        packet_ioctl,
2352         .listen =       sock_no_listen,
2353         .shutdown =     sock_no_shutdown,
2354         .setsockopt =   packet_setsockopt,
2355         .getsockopt =   packet_getsockopt,
2356         .sendmsg =      packet_sendmsg,
2357         .recvmsg =      packet_recvmsg,
2358         .mmap =         packet_mmap,
2359         .sendpage =     sock_no_sendpage,
2360 };
2361
2362 static const struct net_proto_family packet_family_ops = {
2363         .family =       PF_PACKET,
2364         .create =       packet_create,
2365         .owner  =       THIS_MODULE,
2366 };
2367
2368 static struct notifier_block packet_netdev_notifier = {
2369         .notifier_call =        packet_notifier,
2370 };
2371
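/* /proc/net/packet: one line per packet socket, via the seq_file API. */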
2372 #ifdef CONFIG_PROC_FS
2373 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2374 {
2375         struct sock *s;
2376         struct hlist_node *node;
2377
2378         sk_for_each(s, node, &net->packet.sklist) {
2379                 if (!off--)
2380                         return s;
2381         }
2382         return NULL;
2383 }
2384
2385 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2386         __acquires(seq_file_net(seq)->packet.sklist_lock)
2387 {
2388         struct net *net = seq_file_net(seq);
2389         read_lock(&net->packet.sklist_lock);
2390         return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2391 }
2392
2393 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2394 {
2395         struct net *net = seq_file_net(seq);
2396         ++*pos;
2397         return (v == SEQ_START_TOKEN)
2398                 ? sk_head(&net->packet.sklist)
2399                 : sk_next((struct sock *)v);
2400 }
2401
2402 static void packet_seq_stop(struct seq_file *seq, void *v)
2403         __releases(seq_file_net(seq)->packet.sklist_lock)
2404 {
2405         struct net *net = seq_file_net(seq);
2406         read_unlock(&net->packet.sklist_lock);
2407 }
2408
2409 static int packet_seq_show(struct seq_file *seq, void *v)
2410 {
2411         if (v == SEQ_START_TOKEN)
2412                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2413         else {
2414                 struct sock *s = v;
2415                 const struct packet_sock *po = pkt_sk(s);
2416
2417                 seq_printf(seq,
2418                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2419                            s,
2420                            atomic_read(&s->sk_refcnt),
2421                            s->sk_type,
2422                            ntohs(po->num),
2423                            po->ifindex,
2424                            po->running,
2425                            atomic_read(&s->sk_rmem_alloc),
2426                            sock_i_uid(s),
2427                            sock_i_ino(s));
2428         }
2429
2430         return 0;
2431 }
2432
2433 static const struct seq_operations packet_seq_ops = {
2434         .start  = packet_seq_start,
2435         .next   = packet_seq_next,
2436         .stop   = packet_seq_stop,
2437         .show   = packet_seq_show,
2438 };
2439
2440 static int packet_seq_open(struct inode *inode, struct file *file)
2441 {
2442         return seq_open_net(inode, file, &packet_seq_ops,
2443                             sizeof(struct seq_net_private));
2444 }
2445
2446 static const struct file_operations packet_seq_fops = {
2447         .owner          = THIS_MODULE,
2448         .open           = packet_seq_open,
2449         .read           = seq_read,
2450         .llseek         = seq_lseek,
2451         .release        = seq_release_net,
2452 };
2453
2454 #endif
2455
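/* Per-namespace state: the socket list and a /proc/net/packet entry. */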
2456 static int packet_net_init(struct net *net)
2457 {
2458         rwlock_init(&net->packet.sklist_lock);
2459         INIT_HLIST_HEAD(&net->packet.sklist);
2460
2461         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2462                 return -ENOMEM;
2463
2464         return 0;
2465 }
2466
2467 static void packet_net_exit(struct net *net)
2468 {
2469         proc_net_remove(net, "packet");
2470 }
2471
2472 static struct pernet_operations packet_net_ops = {
2473         .init = packet_net_init,
2474         .exit = packet_net_exit,
2475 };
2476
2477
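/* Undo packet_init()'s registrations in reverse order. */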
2478 static void __exit packet_exit(void)
2479 {
2480         unregister_netdevice_notifier(&packet_netdev_notifier);
2481         unregister_pernet_subsys(&packet_net_ops);
2482         sock_unregister(PF_PACKET);
2483         proto_unregister(&packet_proto);
2484 }
2485
2486 static int __init packet_init(void)
2487 {
2488         int rc = proto_register(&packet_proto, 0);
2489
2490         if (rc != 0)
2491                 goto out;
2492
2493         sock_register(&packet_family_ops);
2494         register_pernet_subsys(&packet_net_ops);
2495         register_netdevice_notifier(&packet_netdev_notifier);
2496 out:
2497         return rc;
2498 }
2499
2500 module_init(packet_init);
2501 module_exit(packet_exit);
2502 MODULE_LICENSE("GPL");
2503 MODULE_ALIAS_NETPROTO(PF_PACKET);