af_packet: Don't use skb after dev_queue_xmit()
net/packet/af_packet.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              PACKET - implements raw packet sockets.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *              Alan Cox        :       verify_area() now used correctly
 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 *              Alan Cox        :       tidied skbuff lists.
 *              Alan Cox        :       Now uses generic datagram routines I
 *                                      added. Also fixed the peek/read crash
 *                                      from all old Linux datagram code.
 *              Alan Cox        :       Uses the improved datagram code.
 *              Alan Cox        :       Added NULL's for socket options.
 *              Alan Cox        :       Re-commented the code.
 *              Alan Cox        :       Use new kernel side addressing
 *              Rob Janssen     :       Correct MTU usage.
 *              Dave Platt      :       Counter leaks caused by incorrect
 *                                      interrupt locking and some slightly
 *                                      dubious gcc output. Can you read
 *                                      compiler: it said _VOLATILE_
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       New buffers. Use sk->mac.raw.
 *              Alan Cox        :       sendmsg/recvmsg support.
 *              Alan Cox        :       Protocol setting support
 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 *      Cyrus Durgin            :       Fixed kerneld for kmod.
 *      Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
 *                                      The convention is that longer addresses
 *                                      will simply extend the hardware address
 *                                      byte arrays at the end of sockaddr_ll
 *                                      and packet_mreq.
 *              Johann Baudy    :       Added TX RING.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>
#include <linux/if_vlan.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if the device has no dev->hard_header routine, it adds and removes
     the ll header inside itself. In this case the ll header is invisible
     outside of the device, but higher levels still should reserve
     dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     does not fit into the reserved space (tunnels); others are silly
     (PPP).
   - a packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It very likely points to the ll
                 header.  PPP does this, which is wrong, because it
                 introduces asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary:
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

struct packet_mclist {
        struct packet_mclist    *next;
        int                     ifindex;
        int                     count;
        unsigned short          type;
        unsigned short          alen;
        unsigned char           addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
        int             mr_ifindex;
        unsigned short  mr_type;
        unsigned short  mr_alen;
        unsigned char   mr_address[MAX_ADDR_LEN];
};

#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring);

struct packet_ring_buffer {
        char                    **pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
        unsigned int            frame_max;

        unsigned int            pg_vec_order;
        unsigned int            pg_vec_pages;
        unsigned int            pg_vec_len;

        atomic_t                pending;
};

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
#endif

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
        /* struct sock has to be the first member of packet_sock */
        struct sock             sk;
        struct tpacket_stats    stats;
#ifdef CONFIG_PACKET_MMAP
        struct packet_ring_buffer       rx_ring;
        struct packet_ring_buffer       tx_ring;
        int                     copy_thresh;
#endif
        spinlock_t              bind_lock;
        struct mutex            pg_vec_lock;
        unsigned int            running:1,      /* prot_hook is attached*/
                                auxdata:1,
                                origdev:1;
        int                     ifindex;        /* bound device         */
        __be16                  num;
        struct packet_mclist    *mclist;
#ifdef CONFIG_PACKET_MMAP
        atomic_t                mapped;
        enum tpacket_versions   tp_version;
        unsigned int            tp_hdrlen;
        unsigned int            tp_reserve;
        unsigned int            tp_loss:1;
#endif
        struct packet_type      prot_hook ____cacheline_aligned_in_smp;
};

struct packet_skb_cb {
        unsigned int origlen;
        union {
                struct sockaddr_pkt pkt;
                struct sockaddr_ll ll;
        } sa;
};

#define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
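/* Usage example: sll = &PACKET_SKB_CB(skb)->sa.ll;  the BUILD_BUG_ON() in
 * packet_rcv() checks that this layout still fits within skb->cb[]. */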

#ifdef CONFIG_PACKET_MMAP

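/*
 * Frame status accessors for the mmap ring.  The kernel publishes a frame
 * to user space by setting tp_status; smp_wmb()/smp_rmb() order the status
 * update against the frame data on both sides.
 */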
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_status = status;
                flush_dcache_page(virt_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
                h.h2->tp_status = status;
                flush_dcache_page(virt_to_page(&h.h2->tp_status));
                break;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
        }

        smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        smp_rmb();

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                flush_dcache_page(virt_to_page(&h.h1->tp_status));
                return h.h1->tp_status;
        case TPACKET_V2:
                flush_dcache_page(virt_to_page(&h.h2->tp_status));
                return h.h2->tp_status;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
                return 0;
        }
}

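/* Locate a frame by index in the ring's page vector and return its
 * address, or NULL if it is not in the requested status. */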
static void *packet_lookup_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                unsigned int position,
                int status)
{
        unsigned int pg_vec_pos, frame_offset;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        pg_vec_pos = position / rb->frames_per_block;
        frame_offset = position % rb->frames_per_block;

        h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);

        if (status != __packet_get_status(po, h.raw))
                return NULL;

        return h.raw;
}

static inline void *packet_current_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        return packet_lookup_frame(po, rb, rb->head, status);
}

static inline void *packet_previous_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
        return packet_lookup_frame(po, rb, previous, status);
}

static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
        buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

#endif

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(atomic_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD)) {
                pr_err("Attempt to release alive packet socket: %p\n", sk);
                return;
        }

        sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         *      When we registered the protocol we saved the socket in the data
         *      field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         *      Yank back the headers [hope the device set this
         *      right or kerboom...]
         *
         *      Incoming packets have the ll header pulled,
         *      push it back.
         *
         *      For outgoing ones skb->data == skb_mac_header(skb),
         *      so this procedure is a no-op.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto out;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                goto oom;

        /* drop any routing info */
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spkt = &PACKET_SKB_CB(skb)->sa.pkt;

        skb_push(skb, skb->data - skb_mac_header(skb));

        /*
         *      The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         *      Charge the memory to the socket. This is done specifically
         *      to prevent sockets using all the memory up.
         */

        if (sock_queue_rcv_skb(sk, skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}


/*
 *      Output a raw packet to the device layer. This bypasses all the other
 *      protocol layers and you must therefore supply it with a complete frame.
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
                               struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
        struct sk_buff *skb = NULL;
        struct net_device *dev;
        __be16 proto = 0;
        int err;

        /*
         *      Get and verify the address.
         */

        if (saddr) {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return -EINVAL;
                if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
                        proto = saddr->spkt_protocol;
        } else
                return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */

        /*
         *      Find the device first so we can size-check against it.
         */

        saddr->spkt_device[13] = 0;
retry:
        rcu_read_lock();
        dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        /*
         * You may not queue a frame bigger than the MTU. This is the lowest level
         * raw protocol and you must do your own fragmentation at this level.
         */

        err = -EMSGSIZE;
        if (len > dev->mtu + dev->hard_header_len)
                goto out_unlock;

        if (!skb) {
                size_t reserved = LL_RESERVED_SPACE(dev);
                unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;

                rcu_read_unlock();
                skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
                if (skb == NULL)
                        return -ENOBUFS;
                /* FIXME: Save some space for broken drivers that write a hard
                 * header at transmission time by themselves. PPP is the notable
                 * one here. This should really be fixed at the driver level.
                 */
                skb_reserve(skb, reserved);
                skb_reset_network_header(skb);

                /* Try to align the data part correctly */
                if (hhlen) {
                        skb->data -= hhlen;
                        skb->tail -= hhlen;
                        if (len < hhlen)
                                skb_reset_network_header(skb);
                }
                err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
                if (err)
                        goto out_free;
                goto retry;
        }


        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        dev_queue_xmit(skb);
        rcu_read_unlock();
        return len;

out_unlock:
        rcu_read_unlock();
out_free:
        kfree_skb(skb);
        return err;
}

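/* Run the socket's attached BPF filter, if any.  Returns the number of
 * bytes of the packet to keep; 0 means drop it. */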
static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
                                      unsigned int res)
{
        struct sk_filter *filter;

        rcu_read_lock_bh();
        filter = rcu_dereference(sk->sk_filter);
        if (filter != NULL)
                res = sk_run_filter(skb, filter->insns, filter->len);
        rcu_read_unlock_bh();

        return res;
}

/*
   This function performs lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on
   exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_ll *sll;
        struct packet_sock *po;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto drop;

        skb->dev = dev;

        if (dev->header_ops) {
                /* The device has an explicit notion of ll header,
                   exported to higher levels.

                   Otherwise, the device hides details of its frame
                   structure, so that the corresponding packet head is
                   never delivered to the user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf)
                goto drop_n_acct;

        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                kfree_skb(skb);
                skb = nskb;
        }

        BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
                     sizeof(skb->cb));

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

        PACKET_SKB_CB(skb)->origlen = skb->len;

        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_packets++;
        skb->dropcount = atomic_read(&sk->sk_drops);
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk, skb->len);
        return 0;

drop_n_acct:
        po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        consume_skb(skb);
        return 0;
}

#ifdef CONFIG_PACKET_MMAP
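/*
 * Ring-buffer receive path: copy the packet into the next TP_STATUS_KERNEL
 * frame of the rx ring and publish it to user space by updating the frame
 * status.  Packets that overflow the frame may additionally be queued as
 * regular skbs when copy_thresh allows it.
 */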
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
        unsigned short macoff, netoff, hdrlen;
        struct sk_buff *copy_skb = NULL;
        struct timeval tv;
        struct timespec ts;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (!net_eq(dev_net(dev), sock_net(sk)))
                goto drop;

        if (dev->header_ops) {
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL)
                status |= TP_STATUS_CSUMNOTREADY;

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
                                  po->tp_reserve;
        } else {
                unsigned maclen = skb_network_offset(skb);
                netoff = TPACKET_ALIGN(po->tp_hdrlen +
                                       (maclen < 16 ? 16 : maclen)) +
                        po->tp_reserve;
                macoff = netoff - maclen;
        }

        if (macoff + snaplen > po->rx_ring.frame_size) {
                if (po->copy_thresh &&
                    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
                    (unsigned)sk->sk_rcvbuf) {
                        if (skb_shared(skb)) {
                                copy_skb = skb_clone(skb, GFP_ATOMIC);
                        } else {
                                copy_skb = skb_get(skb);
                                skb_head = skb->data;
                        }
                        if (copy_skb)
                                skb_set_owner_r(copy_skb, sk);
                }
                snaplen = po->rx_ring.frame_size - macoff;
                if ((int)snaplen < 0)
                        snaplen = 0;
        }

        spin_lock(&sk->sk_receive_queue.lock);
        h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
        if (!h.raw)
                goto ring_is_full;
        packet_increment_head(&po->rx_ring);
        po->stats.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        if (!po->stats.tp_drops)
                status &= ~TP_STATUS_LOSING;
        spin_unlock(&sk->sk_receive_queue.lock);

        skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_len = skb->len;
                h.h1->tp_snaplen = snaplen;
                h.h1->tp_mac = macoff;
                h.h1->tp_net = netoff;
                if (skb->tstamp.tv64)
                        tv = ktime_to_timeval(skb->tstamp);
                else
                        do_gettimeofday(&tv);
                h.h1->tp_sec = tv.tv_sec;
                h.h1->tp_usec = tv.tv_usec;
                hdrlen = sizeof(*h.h1);
                break;
        case TPACKET_V2:
                h.h2->tp_len = skb->len;
                h.h2->tp_snaplen = snaplen;
                h.h2->tp_mac = macoff;
                h.h2->tp_net = netoff;
                if (skb->tstamp.tv64)
                        ts = ktime_to_timespec(skb->tstamp);
                else
                        getnstimeofday(&ts);
                h.h2->tp_sec = ts.tv_sec;
                h.h2->tp_nsec = ts.tv_nsec;
                h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
                hdrlen = sizeof(*h.h2);
                break;
        default:
                BUG();
        }

        sll = h.raw + TPACKET_ALIGN(hdrlen);
        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        __packet_set_status(po, h.raw, status);
        smp_mb();
        {
                struct page *p_start, *p_end;
                u8 *h_end = h.raw + macoff + snaplen - 1;

                p_start = virt_to_page(h.raw);
                p_end = virt_to_page(h_end);
                while (p_start <= p_end) {
                        flush_dcache_page(p_start);
                        p_start++;
                }
        }

        sk->sk_data_ready(sk, 0);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;

ring_is_full:
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

        sk->sk_data_ready(sk, 0);
        kfree_skb(copy_skb);
        goto drop_n_restore;
}

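/* skb destructor for TX_RING packets: once the driver is done with the
 * skb, mark the originating ring frame TP_STATUS_AVAILABLE again. */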
static void tpacket_destruct_skb(struct sk_buff *skb)
{
        struct packet_sock *po = pkt_sk(skb->sk);
        void *ph;

        BUG_ON(skb == NULL);

        if (likely(po->tx_ring.pg_vec)) {
                ph = skb_shinfo(skb)->destructor_arg;
                BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
                BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
                atomic_dec(&po->tx_ring.pending);
                __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
        }

        sock_wfree(skb);
}

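/* Build an skb from a TX_RING frame: the hard header is copied into the
 * linear area, the rest of the frame is attached as page fragments. */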
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                void *frame, struct net_device *dev, int size_max,
                __be16 proto, unsigned char *addr)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } ph;
        int to_write, offset, len, tp_len, nr_frags, len_max;
        struct socket *sock = po->sk.sk_socket;
        struct page *page;
        void *data;
        int err;

        ph.raw = frame;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = po->sk.sk_priority;
        skb->mark = po->sk.sk_mark;
        skb_shinfo(skb)->destructor_arg = ph.raw;

        switch (po->tp_version) {
        case TPACKET_V2:
                tp_len = ph.h2->tp_len;
                break;
        default:
                tp_len = ph.h1->tp_len;
                break;
        }
        if (unlikely(tp_len > size_max)) {
                pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
                return -EMSGSIZE;
        }

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
        to_write = tp_len;

        if (sock->type == SOCK_DGRAM) {
                err = dev_hard_header(skb, dev, ntohs(proto), addr,
                                NULL, tp_len);
                if (unlikely(err < 0))
                        return -EINVAL;
        } else if (dev->hard_header_len) {
                /* net device doesn't like empty head */
                if (unlikely(tp_len <= dev->hard_header_len)) {
                        pr_err("packet size is too short (%d < %d)\n",
                               tp_len, dev->hard_header_len);
                        return -EINVAL;
                }

                skb_push(skb, dev->hard_header_len);
                err = skb_store_bits(skb, 0, data,
                                dev->hard_header_len);
                if (unlikely(err))
                        return err;

                data += dev->hard_header_len;
                to_write -= dev->hard_header_len;
        }

        err = -EFAULT;
        page = virt_to_page(data);
        offset = offset_in_page(data);
        len_max = PAGE_SIZE - offset;
        len = ((to_write > len_max) ? len_max : to_write);

        skb->data_len = to_write;
        skb->len += to_write;
        skb->truesize += to_write;
        atomic_add(to_write, &po->sk.sk_wmem_alloc);

        while (likely(to_write)) {
                nr_frags = skb_shinfo(skb)->nr_frags;

                if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
                        pr_err("Packet exceeds the max number of skb frags (%lu)\n",
                               MAX_SKB_FRAGS);
                        return -EFAULT;
                }

                flush_dcache_page(page);
                get_page(page);
                skb_fill_page_desc(skb,
                                nr_frags,
                                page++, offset, len);
                to_write -= len;
                offset = 0;
                len_max = PAGE_SIZE;
                len = ((to_write > len_max) ? len_max : to_write);
        }

        return tp_len;
}

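/* TX_RING send loop: walk the tx ring, turn each TP_STATUS_SEND_REQUEST
 * frame into an skb and pass it to dev_queue_xmit(). */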
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
        struct socket *sock;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        int ifindex, err, reserve = 0;
        void *ph;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        int tp_len, size_max;
        unsigned char *addr;
        int len_sum = 0;
        int status = 0;

        sock = po->sk.sk_socket;

        mutex_lock(&po->pg_vec_lock);

        err = -EBUSY;
        if (saddr == NULL) {
                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen
                                        + offsetof(struct sockaddr_ll,
                                                sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }

        dev = dev_get_by_index(sock_net(&po->sk), ifindex);
        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out;

        reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;

        size_max = po->tx_ring.frame_size
                - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

        if (size_max > dev->mtu + reserve)
                size_max = dev->mtu + reserve;

        do {
                ph = packet_current_frame(po, &po->tx_ring,
                                TP_STATUS_SEND_REQUEST);

                if (unlikely(ph == NULL)) {
                        schedule();
                        continue;
                }

                status = TP_STATUS_SEND_REQUEST;
                skb = sock_alloc_send_skb(&po->sk,
                                LL_ALLOCATED_SPACE(dev)
                                + sizeof(struct sockaddr_ll),
                                0, &err);

                if (unlikely(skb == NULL))
                        goto out_status;

                tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
                                addr);

                if (unlikely(tp_len < 0)) {
                        if (po->tp_loss) {
                                __packet_set_status(po, ph,
                                                TP_STATUS_AVAILABLE);
                                packet_increment_head(&po->tx_ring);
                                kfree_skb(skb);
                                continue;
                        } else {
                                status = TP_STATUS_WRONG_FORMAT;
                                err = tp_len;
                                goto out_status;
                        }
                }

                skb->destructor = tpacket_destruct_skb;
                __packet_set_status(po, ph, TP_STATUS_SENDING);
                atomic_inc(&po->tx_ring.pending);

                status = TP_STATUS_SEND_REQUEST;
                err = dev_queue_xmit(skb);
                if (unlikely(err > 0)) {
                        err = net_xmit_errno(err);
                        if (err && __packet_get_status(po, ph) ==
                                   TP_STATUS_AVAILABLE) {
                                /* skb was destructed already */
                                skb = NULL;
                                goto out_status;
                        }
                        /*
                         * skb was dropped but not destructed yet;
                         * let's treat it like congestion or err < 0
                         */
                        err = 0;
                }
                packet_increment_head(&po->tx_ring);
                len_sum += tp_len;
        } while (likely((ph != NULL) ||
                        ((!(msg->msg_flags & MSG_DONTWAIT)) &&
                         (atomic_read(&po->tx_ring.pending))))
                );

        err = len_sum;
        goto out_put;

out_status:
        __packet_set_status(po, ph, status);
        kfree_skb(skb);
out_put:
        dev_put(dev);
out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}
#endif

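/* Common (non-mmap) transmit path: allocate an skb, copy the message in
 * and queue it on the resolved device. */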
static int packet_snd(struct socket *sock,
                          struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        unsigned char *addr;
        int ifindex, err, reserve = 0;

        /*
         *      Get and verify the address.
         */

        if (saddr == NULL) {
                struct packet_sock *po = pkt_sk(sk);

                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }


        dev = dev_get_by_index(sock_net(sk), ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        err = -EMSGSIZE;
        if (len > dev->mtu+reserve)
                goto out_unlock;

        skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
                                msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto out_unlock;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        err = -EINVAL;
        if (sock->type == SOCK_DGRAM &&
            dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
                goto out_free;

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
        if (err)
                goto out_free;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        skb->mark = sk->sk_mark;

        /*
         *      Now send it
         */

        err = dev_queue_xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;

        dev_put(dev);

        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
out:
        return err;
}

static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
                struct msghdr *msg, size_t len)
{
#ifdef CONFIG_PACKET_MMAP
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        if (po->tx_ring.pg_vec)
                return tpacket_snd(po, msg);
        else
#endif
                return packet_snd(sock, msg, len);
}

/*
 *      Close a PACKET socket. This is fairly simple. We immediately go
 *      to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct net *net;
#ifdef CONFIG_PACKET_MMAP
        struct tpacket_req req;
#endif

        if (!sk)
                return 0;

        net = sock_net(sk);
        po = pkt_sk(sk);

        write_lock_bh(&net->packet.sklist_lock);
        sk_del_node_init(sk);
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        write_unlock_bh(&net->packet.sklist_lock);

        /*
         *      Unhook packet receive handler.
         */

        if (po->running) {
                /*
                 *      Remove the protocol hook
                 */
                dev_remove_pack(&po->prot_hook);
                po->running = 0;
                po->num = 0;
                __sock_put(sk);
        }

        packet_flush_mclist(sk);

#ifdef CONFIG_PACKET_MMAP
        memset(&req, 0, sizeof(req));

        if (po->rx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 0);

        if (po->tx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 1);
#endif

        /*
         *      Now the socket is dead. No more input will appear.
         */

        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);
        sk_refcnt_debug_release(sk);

        sock_put(sk);
        return 0;
}

/*
 *      Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
        struct packet_sock *po = pkt_sk(sk);
        /*
         *      Detach an existing hook if present.
         */

        lock_sock(sk);

        spin_lock(&po->bind_lock);
        if (po->running) {
                __sock_put(sk);
                po->running = 0;
                po->num = 0;
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        po->ifindex = dev ? dev->ifindex : 0;

        if (protocol == 0)
                goto out_unlock;

        if (!dev || (dev->flags & IFF_UP)) {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        } else {
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_error_report(sk);
        }

out_unlock:
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}

/*
 *      Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
                            int addr_len)
{
        struct sock *sk = sock->sk;
        char name[15];
        struct net_device *dev;
        int err = -ENODEV;

        /*
         *      Check legality
         */

        if (addr_len != sizeof(struct sockaddr))
                return -EINVAL;
        strlcpy(name, uaddr->sa_data, sizeof(name));

        dev = dev_get_by_name(sock_net(sk), name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
        struct sock *sk = sock->sk;
        struct net_device *dev = NULL;
        int err;


        /*
         *      Check legality
         */

        if (addr_len < sizeof(struct sockaddr_ll))
                return -EINVAL;
        if (sll->sll_family != AF_PACKET)
                return -EINVAL;

        if (sll->sll_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
                if (dev == NULL)
                        goto out;
        }
        err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
        if (dev)
                dev_put(dev);

out:
        return err;
}

static struct proto packet_proto = {
        .name     = "PACKET",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct packet_sock),
};

/*
 *      Create a packet socket (SOCK_RAW, SOCK_DGRAM or the obsolete
 *      SOCK_PACKET type).
 */

static int packet_create(struct net *net, struct socket *sock, int protocol,
                         int kern)
{
        struct sock *sk;
        struct packet_sock *po;
        __be16 proto = (__force __be16)protocol; /* weird, but documented */
        int err;

        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
        if (sk == NULL)
                goto out;

        sock->ops = &packet_ops;
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;

        sock_init_data(sock, sk);

        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        po->num = proto;

        sk->sk_destruct = packet_sock_destruct;
        sk_refcnt_debug_inc(sk);

        /*
         *      Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        mutex_init(&po->pg_vec_lock);
        po->prot_hook.func = packet_rcv;

        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;

        po->prot_hook.af_packet_priv = sk;

        if (proto) {
                po->prot_hook.type = proto;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

        write_lock_bh(&net->packet.sklist_lock);
        sk_add_node(sk, &net->packet.sklist);
        sock_prot_inuse_add(net, &packet_proto, 1);
        write_unlock_bh(&net->packet.sklist_lock);
        return 0;
out:
        return err;
}

/*
 *      Pull a packet from our receive queue and hand it to the user.
 *      If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len, int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;
        struct sockaddr_ll *sll;

        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        /*
         *      Call the generic datagram receiver. This handles all sorts
         *      of horrible races and re-entrancy so we can forget about it
         *      in the protocol layers.
         *
         *      Now it will return ENETDOWN if the device has just gone down,
         *      but then it will block.
         */

        skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

        /*
         *      An error occurred, so return it. Since skb_recv_datagram()
         *      handles the blocking, we don't need to see or worry about
         *      blocking retries.
         */

        if (skb == NULL)
                goto out;

        /*
         *      If the address length field is there to be filled in, we fill
         *      it in now.
         */

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        if (sock->type == SOCK_PACKET)
                msg->msg_namelen = sizeof(struct sockaddr_pkt);
        else
                msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

        /*
         *      You lose any data beyond the buffer you gave. If it worries
         *      a user program, it can ask the device for its MTU anyway.
         */

        copied = skb->len;
        if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
        }

        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free;

        sock_recv_ts_and_drops(msg, sk, skb);

        if (msg->msg_name)
                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
                       msg->msg_namelen);

        if (pkt_sk(sk)->auxdata) {
                struct tpacket_auxdata aux;

                aux.tp_status = TP_STATUS_USER;
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        aux.tp_status |= TP_STATUS_CSUMNOTREADY;
                aux.tp_len = PACKET_SKB_CB(skb)->origlen;
                aux.tp_snaplen = skb->len;
                aux.tp_mac = 0;
                aux.tp_net = skb_network_offset(skb);
                aux.tp_vlan_tci = vlan_tx_tag_get(skb);

                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
        }

        /*
         *      Free or return the buffer as appropriate. Again this
         *      hides all the races and re-entrancy issues from us.
         */
        err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;
}

static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                               int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;

        if (peer)
                return -EOPNOTSUPP;

        uaddr->sa_family = AF_PACKET;
        rcu_read_lock();
        dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
        if (dev)
                strlcpy(uaddr->sa_data, dev->name, 15);
        else
                memset(uaddr->sa_data, 0, 14);
        rcu_read_unlock();
        *uaddr_len = sizeof(*uaddr);

        return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
                          int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);

        if (peer)
                return -EOPNOTSUPP;

        sll->sll_family = AF_PACKET;
        sll->sll_ifindex = po->ifindex;
        sll->sll_protocol = po->num;
        rcu_read_lock();
        dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
        if (dev) {
                sll->sll_hatype = dev->type;
                sll->sll_halen = dev->addr_len;
                memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
        } else {
                sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
                sll->sll_halen = 0;
        }
        rcu_read_unlock();
        *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

        return 0;
}

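/* Apply a membership entry to a device; 'what' > 0 takes a reference
 * (add), 'what' < 0 drops one (delete). */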
static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
                         int what)
{
        switch (i->type) {
        case PACKET_MR_MULTICAST:
                if (what > 0)
                        return dev_mc_add(dev, i->addr, i->alen, 0);
                else
                        return dev_mc_delete(dev, i->addr, i->alen, 0);
                break;
        case PACKET_MR_PROMISC:
                return dev_set_promiscuity(dev, what);
                break;
        case PACKET_MR_ALLMULTI:
                return dev_set_allmulti(dev, what);
                break;
        case PACKET_MR_UNICAST:
                if (what > 0)
                        return dev_unicast_add(dev, i->addr);
                else
                        return dev_unicast_delete(dev, i->addr);
                break;
        default:
                break;
        }
        return 0;
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
        for ( ; i; i = i->next) {
                if (i->ifindex == dev->ifindex)
                        packet_dev_mc(dev, i, what);
        }
}

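/* Register a new membership on the socket, or bump the refcount of an
 * identical existing entry. */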
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *ml, *i;
        struct net_device *dev;
        int err;

        rtnl_lock();

        err = -ENODEV;
        dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
        if (!dev)
                goto done;

        err = -EINVAL;
        if (mreq->mr_alen > dev->addr_len)
                goto done;

        err = -ENOBUFS;
        i = kmalloc(sizeof(*i), GFP_KERNEL);
        if (i == NULL)
                goto done;

        err = 0;
        for (ml = po->mclist; ml; ml = ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        ml->count++;
                        /* Free the new element ... */
                        kfree(i);
                        goto done;
                }
        }

        i->type = mreq->mr_type;
        i->ifindex = mreq->mr_ifindex;
        i->alen = mreq->mr_alen;
        memcpy(i->addr, mreq->mr_address, i->alen);
        i->count = 1;
        i->next = po->mclist;
        po->mclist = i;
        err = packet_dev_mc(dev, i, 1);
        if (err) {
                po->mclist = i->next;
                kfree(i);
        }

done:
        rtnl_unlock();
        return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_mclist *ml, **mlp;

        rtnl_lock();

        for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        if (--ml->count == 0) {
                                struct net_device *dev;
                                *mlp = ml->next;
                                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
                                if (dev)
                                        packet_dev_mc(dev, ml, -1);
                                kfree(ml);
                        }
                        rtnl_unlock();
                        return 0;
                }
        }
        rtnl_unlock();
        return -EADDRNOTAVAIL;
}

static void packet_flush_mclist(struct sock *sk)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *ml;

        if (!po->mclist)
                return;

        rtnl_lock();
        while ((ml = po->mclist) != NULL) {
                struct net_device *dev;

                po->mclist = ml->next;
                dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
                if (dev != NULL)
                        packet_dev_mc(dev, ml, -1);
                kfree(ml);
        }
        rtnl_unlock();
}

1703 static int
1704 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1705 {
1706         struct sock *sk = sock->sk;
1707         struct packet_sock *po = pkt_sk(sk);
1708         int ret;
1709
1710         if (level != SOL_PACKET)
1711                 return -ENOPROTOOPT;
1712
1713         switch (optname) {
1714         case PACKET_ADD_MEMBERSHIP:
1715         case PACKET_DROP_MEMBERSHIP:
1716         {
1717                 struct packet_mreq_max mreq;
1718                 int len = optlen;
1719                 memset(&mreq, 0, sizeof(mreq));
1720                 if (len < sizeof(struct packet_mreq))
1721                         return -EINVAL;
1722                 if (len > sizeof(mreq))
1723                         len = sizeof(mreq);
1724                 if (copy_from_user(&mreq, optval, len))
1725                         return -EFAULT;
1726                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1727                         return -EINVAL;
1728                 if (optname == PACKET_ADD_MEMBERSHIP)
1729                         ret = packet_mc_add(sk, &mreq);
1730                 else
1731                         ret = packet_mc_drop(sk, &mreq);
1732                 return ret;
1733         }
1734
1735 #ifdef CONFIG_PACKET_MMAP
1736         case PACKET_RX_RING:
1737         case PACKET_TX_RING:
1738         {
1739                 struct tpacket_req req;
1740
1741                 if (optlen < sizeof(req))
1742                         return -EINVAL;
1743                 if (copy_from_user(&req, optval, sizeof(req)))
1744                         return -EFAULT;
1745                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1746         }
1747         case PACKET_COPY_THRESH:
1748         {
1749                 int val;
1750
1751                 if (optlen != sizeof(val))
1752                         return -EINVAL;
1753                 if (copy_from_user(&val, optval, sizeof(val)))
1754                         return -EFAULT;
1755
1756                 pkt_sk(sk)->copy_thresh = val;
1757                 return 0;
1758         }
1759         case PACKET_VERSION:
1760         {
1761                 int val;
1762
1763                 if (optlen != sizeof(val))
1764                         return -EINVAL;
1765                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1766                         return -EBUSY;
1767                 if (copy_from_user(&val, optval, sizeof(val)))
1768                         return -EFAULT;
1769                 switch (val) {
1770                 case TPACKET_V1:
1771                 case TPACKET_V2:
1772                         po->tp_version = val;
1773                         return 0;
1774                 default:
1775                         return -EINVAL;
1776                 }
1777         }
1778         case PACKET_RESERVE:
1779         {
1780                 unsigned int val;
1781
1782                 if (optlen != sizeof(val))
1783                         return -EINVAL;
1784                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1785                         return -EBUSY;
1786                 if (copy_from_user(&val, optval, sizeof(val)))
1787                         return -EFAULT;
1788                 po->tp_reserve = val;
1789                 return 0;
1790         }
1791         case PACKET_LOSS:
1792         {
1793                 unsigned int val;
1794
1795                 if (optlen != sizeof(val))
1796                         return -EINVAL;
1797                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1798                         return -EBUSY;
1799                 if (copy_from_user(&val, optval, sizeof(val)))
1800                         return -EFAULT;
1801                 po->tp_loss = !!val;
1802                 return 0;
1803         }
1804 #endif
1805         case PACKET_AUXDATA:
1806         {
1807                 int val;
1808
1809                 if (optlen < sizeof(val))
1810                         return -EINVAL;
1811                 if (copy_from_user(&val, optval, sizeof(val)))
1812                         return -EFAULT;
1813
1814                 po->auxdata = !!val;
1815                 return 0;
1816         }
1817         case PACKET_ORIGDEV:
1818         {
1819                 int val;
1820
1821                 if (optlen < sizeof(val))
1822                         return -EINVAL;
1823                 if (copy_from_user(&val, optval, sizeof(val)))
1824                         return -EFAULT;
1825
1826                 po->origdev = !!val;
1827                 return 0;
1828         }
1829         default:
1830                 return -ENOPROTOOPT;
1831         }
1832 }
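/*
 * Example (userspace, minimal sketch): exercising the membership path
 * above.  PACKET_MR_PROMISC goes through packet_dev_mc() and is therefore
 * refcounted and undone by packet_mc_drop() or socket close
 * (packet_flush_mclist()), unlike a raw SIOCSIFFLAGS toggle.  Assumes "fd"
 * is an AF_PACKET socket and "ifindex" names the target device; error
 * handling is elided.
 *
 *	struct packet_mreq mreq;
 *
 *	memset(&mreq, 0, sizeof(mreq));
 *	mreq.mr_ifindex = ifindex;
 *	mreq.mr_type = PACKET_MR_PROMISC;
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */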
1833
1834 static int packet_getsockopt(struct socket *sock, int level, int optname,
1835                              char __user *optval, int __user *optlen)
1836 {
1837         int len;
1838         int val;
1839         struct sock *sk = sock->sk;
1840         struct packet_sock *po = pkt_sk(sk);
1841         void *data;
1842         struct tpacket_stats st;
1843
1844         if (level != SOL_PACKET)
1845                 return -ENOPROTOOPT;
1846
1847         if (get_user(len, optlen))
1848                 return -EFAULT;
1849
1850         if (len < 0)
1851                 return -EINVAL;
1852
1853         switch (optname) {
1854         case PACKET_STATISTICS:
1855                 if (len > sizeof(struct tpacket_stats))
1856                         len = sizeof(struct tpacket_stats);
1857                 spin_lock_bh(&sk->sk_receive_queue.lock);
1858                 st = po->stats;
1859                 memset(&po->stats, 0, sizeof(st));
1860                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1861                 st.tp_packets += st.tp_drops;
1862
1863                 data = &st;
1864                 break;
1865         case PACKET_AUXDATA:
1866                 if (len > sizeof(int))
1867                         len = sizeof(int);
1868                 val = po->auxdata;
1869
1870                 data = &val;
1871                 break;
1872         case PACKET_ORIGDEV:
1873                 if (len > sizeof(int))
1874                         len = sizeof(int);
1875                 val = po->origdev;
1876
1877                 data = &val;
1878                 break;
1879 #ifdef CONFIG_PACKET_MMAP
1880         case PACKET_VERSION:
1881                 if (len > sizeof(int))
1882                         len = sizeof(int);
1883                 val = po->tp_version;
1884                 data = &val;
1885                 break;
1886         case PACKET_HDRLEN:
1887                 if (len > sizeof(int))
1888                         len = sizeof(int);
1889                 if (copy_from_user(&val, optval, len))
1890                         return -EFAULT;
1891                 switch (val) {
1892                 case TPACKET_V1:
1893                         val = sizeof(struct tpacket_hdr);
1894                         break;
1895                 case TPACKET_V2:
1896                         val = sizeof(struct tpacket2_hdr);
1897                         break;
1898                 default:
1899                         return -EINVAL;
1900                 }
1901                 data = &val;
1902                 break;
1903         case PACKET_RESERVE:
1904                 if (len > sizeof(unsigned int))
1905                         len = sizeof(unsigned int);
1906                 val = po->tp_reserve;
1907                 data = &val;
1908                 break;
1909         case PACKET_LOSS:
1910                 if (len > sizeof(unsigned int))
1911                         len = sizeof(unsigned int);
1912                 val = po->tp_loss;
1913                 data = &val;
1914                 break;
1915 #endif
1916         default:
1917                 return -ENOPROTOOPT;
1918         }
1919
1920         if (put_user(len, optlen))
1921                 return -EFAULT;
1922         if (copy_to_user(optval, data, len))
1923                 return -EFAULT;
1924         return 0;
1925 }
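/*
 * Example (userspace, minimal sketch): fetching PACKET_STATISTICS.  As the
 * code above shows, the counters are zeroed on every read, and tp_packets
 * already includes tp_drops.  Assumes "fd" is an AF_PACKET socket; error
 * handling is elided.
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *
 *	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
 *		printf("packets=%u drops=%u\n", st.tp_packets, st.tp_drops);
 */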
1926
1927
1928 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1929 {
1930         struct sock *sk;
1931         struct hlist_node *node;
1932         struct net_device *dev = data;
1933         struct net *net = dev_net(dev);
1934
1935         read_lock(&net->packet.sklist_lock);
1936         sk_for_each(sk, node, &net->packet.sklist) {
1937                 struct packet_sock *po = pkt_sk(sk);
1938
1939                 switch (msg) {
1940                 case NETDEV_UNREGISTER:
1941                         if (po->mclist)
1942                                 packet_dev_mclist(dev, po->mclist, -1);
1943                         /* fallthrough */
1944
1945                 case NETDEV_DOWN:
1946                         if (dev->ifindex == po->ifindex) {
1947                                 spin_lock(&po->bind_lock);
1948                                 if (po->running) {
1949                                         __dev_remove_pack(&po->prot_hook);
1950                                         __sock_put(sk);
1951                                         po->running = 0;
1952                                         sk->sk_err = ENETDOWN;
1953                                         if (!sock_flag(sk, SOCK_DEAD))
1954                                                 sk->sk_error_report(sk);
1955                                 }
1956                                 if (msg == NETDEV_UNREGISTER) {
1957                                         po->ifindex = -1;
1958                                         po->prot_hook.dev = NULL;
1959                                 }
1960                                 spin_unlock(&po->bind_lock);
1961                         }
1962                         break;
1963                 case NETDEV_UP:
1964                         spin_lock(&po->bind_lock);
1965                         if (dev->ifindex == po->ifindex && po->num &&
1966                             !po->running) {
1967                                 dev_add_pack(&po->prot_hook);
1968                                 sock_hold(sk);
1969                                 po->running = 1;
1970                         }
1971                         spin_unlock(&po->bind_lock);
1972                         break;
1973                 }
1974         }
1975         read_unlock(&net->packet.sklist_lock);
1976         return NOTIFY_DONE;
1977 }
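/*
 * Example (userspace, minimal sketch): the ENETDOWN set above surfaces as
 * the errno of the next receive call on the affected socket.
 * rebind_or_exit() is a hypothetical recovery helper; error handling
 * beyond the errno check is elided.
 *
 *	char buf[2048];
 *	ssize_t n = recv(fd, buf, sizeof(buf), 0);
 *
 *	if (n < 0 && errno == ENETDOWN)
 *		rebind_or_exit(fd);
 */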
1978
1979
1980 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1981                         unsigned long arg)
1982 {
1983         struct sock *sk = sock->sk;
1984
1985         switch (cmd) {
1986         case SIOCOUTQ:
1987         {
1988                 int amount = sk_wmem_alloc_get(sk);
1989
1990                 return put_user(amount, (int __user *)arg);
1991         }
1992         case SIOCINQ:
1993         {
1994                 struct sk_buff *skb;
1995                 int amount = 0;
1996
1997                 spin_lock_bh(&sk->sk_receive_queue.lock);
1998                 skb = skb_peek(&sk->sk_receive_queue);
1999                 if (skb)
2000                         amount = skb->len;
2001                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2002                 return put_user(amount, (int __user *)arg);
2003         }
2004         case SIOCGSTAMP:
2005                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2006         case SIOCGSTAMPNS:
2007                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2008
2009 #ifdef CONFIG_INET
2010         case SIOCADDRT:
2011         case SIOCDELRT:
2012         case SIOCDARP:
2013         case SIOCGARP:
2014         case SIOCSARP:
2015         case SIOCGIFADDR:
2016         case SIOCSIFADDR:
2017         case SIOCGIFBRDADDR:
2018         case SIOCSIFBRDADDR:
2019         case SIOCGIFNETMASK:
2020         case SIOCSIFNETMASK:
2021         case SIOCGIFDSTADDR:
2022         case SIOCSIFDSTADDR:
2023         case SIOCSIFFLAGS:
2024                 if (!net_eq(sock_net(sk), &init_net))
2025                         return -ENOIOCTLCMD;
2026                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2027 #endif
2028
2029         default:
2030                 return -ENOIOCTLCMD;
2031         }
2032         return 0;
2033 }
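/*
 * Example (userspace, minimal sketch): note that SIOCINQ, as implemented
 * above, reports the length of the next queued packet rather than the
 * total number of queued bytes.  Assumes "fd" is an AF_PACKET socket;
 * error handling is elided.
 *
 *	int next_len = 0;
 *
 *	if (ioctl(fd, SIOCINQ, &next_len) == 0 && next_len > 0)
 *		printf("next packet: %d bytes\n", next_len);
 */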
2034
2035 #ifndef CONFIG_PACKET_MMAP
2036 #define packet_mmap sock_no_mmap
2037 #define packet_poll datagram_poll
2038 #else
2039
2040 static unsigned int packet_poll(struct file *file, struct socket *sock,
2041                                 poll_table *wait)
2042 {
2043         struct sock *sk = sock->sk;
2044         struct packet_sock *po = pkt_sk(sk);
2045         unsigned int mask = datagram_poll(file, sock, wait);
2046
2047         spin_lock_bh(&sk->sk_receive_queue.lock);
2048         if (po->rx_ring.pg_vec) {
2049                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2050                         mask |= POLLIN | POLLRDNORM;
2051         }
2052         spin_unlock_bh(&sk->sk_receive_queue.lock);
2053         spin_lock_bh(&sk->sk_write_queue.lock);
2054         if (po->tx_ring.pg_vec) {
2055                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2056                         mask |= POLLOUT | POLLWRNORM;
2057         }
2058         spin_unlock_bh(&sk->sk_write_queue.lock);
2059         return mask;
2060 }
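/*
 * Example (userspace, minimal sketch): sleeping on ring traffic with
 * poll(2).  Per packet_poll() above, POLLIN is raised once the receive
 * ring holds a frame for the user, and POLLOUT once a TX frame slot is
 * free again, so readers need not spin on frame status words.
 * consume_ready_frames() is a hypothetical helper that would walk the
 * mapped ring until tp_status returns to TP_STATUS_KERNEL.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	while (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		consume_ready_frames();
 */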
2061
2062
2063 /* Dirty? Well, I still have not learned a better way to account
2064  * for user mmaps.
2065  */
2066
2067 static void packet_mm_open(struct vm_area_struct *vma)
2068 {
2069         struct file *file = vma->vm_file;
2070         struct socket *sock = file->private_data;
2071         struct sock *sk = sock->sk;
2072
2073         if (sk)
2074                 atomic_inc(&pkt_sk(sk)->mapped);
2075 }
2076
2077 static void packet_mm_close(struct vm_area_struct *vma)
2078 {
2079         struct file *file = vma->vm_file;
2080         struct socket *sock = file->private_data;
2081         struct sock *sk = sock->sk;
2082
2083         if (sk)
2084                 atomic_dec(&pkt_sk(sk)->mapped);
2085 }
2086
2087 static const struct vm_operations_struct packet_mmap_ops = {
2088         .open   =       packet_mm_open,
2089         .close  =       packet_mm_close,
2090 };
2091
2092 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2093 {
2094         unsigned int i;
2095
2096         for (i = 0; i < len; i++) {
2097                 if (likely(pg_vec[i]))
2098                         free_pages((unsigned long) pg_vec[i], order);
2099         }
2100         kfree(pg_vec);
2101 }
2102
2103 static inline char *alloc_one_pg_vec_page(unsigned long order)
2104 {
2105         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2106
2107         return (char *) __get_free_pages(gfp_flags, order);
2108 }
2109
2110 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2111 {
2112         unsigned int block_nr = req->tp_block_nr;
2113         char **pg_vec;
2114         int i;
2115
2116         pg_vec = kcalloc(block_nr, sizeof(char *), GFP_KERNEL);
2117         if (unlikely(!pg_vec))
2118                 goto out;
2119
2120         for (i = 0; i < block_nr; i++) {
2121                 pg_vec[i] = alloc_one_pg_vec_page(order);
2122                 if (unlikely(!pg_vec[i]))
2123                         goto out_free_pgvec;
2124         }
2125
2126 out:
2127         return pg_vec;
2128
2129 out_free_pgvec:
2130         free_pg_vec(pg_vec, order, block_nr);
2131         pg_vec = NULL;
2132         goto out;
2133 }
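/*
 * Sizing note: packet_set_ring() below passes order = get_order(tp_block_size),
 * so each ring block is a single physically contiguous allocation.  With
 * 4 KiB pages, for example, a tp_block_size of 16384 means order 2, i.e.
 * four contiguous pages per block, which becomes harder to satisfy as
 * memory fragments.
 */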
2134
2135 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2136                 int closing, int tx_ring)
2137 {
2138         char **pg_vec = NULL;
2139         struct packet_sock *po = pkt_sk(sk);
2140         int was_running, order = 0;
2141         struct packet_ring_buffer *rb;
2142         struct sk_buff_head *rb_queue;
2143         __be16 num;
2144         int err;
2145
2146         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2147         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2148
2149         err = -EBUSY;
2150         if (!closing) {
2151                 if (atomic_read(&po->mapped))
2152                         goto out;
2153                 if (atomic_read(&rb->pending))
2154                         goto out;
2155         }
2156
2157         if (req->tp_block_nr) {
2158                 /* Sanity tests and some calculations */
2159                 err = -EBUSY;
2160                 if (unlikely(rb->pg_vec))
2161                         goto out;
2162
2163                 switch (po->tp_version) {
2164                 case TPACKET_V1:
2165                         po->tp_hdrlen = TPACKET_HDRLEN;
2166                         break;
2167                 case TPACKET_V2:
2168                         po->tp_hdrlen = TPACKET2_HDRLEN;
2169                         break;
2170                 }
2171
2172                 err = -EINVAL;
2173                 if (unlikely((int)req->tp_block_size <= 0))
2174                         goto out;
2175                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2176                         goto out;
2177                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2178                                         po->tp_reserve))
2179                         goto out;
2180                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2181                         goto out;
2182
2183                 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2184                 if (unlikely(rb->frames_per_block <= 0))
2185                         goto out;
2186                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2187                                         req->tp_frame_nr))
2188                         goto out;
2189
2190                 err = -ENOMEM;
2191                 order = get_order(req->tp_block_size);
2192                 pg_vec = alloc_pg_vec(req, order);
2193                 if (unlikely(!pg_vec))
2194                         goto out;
2195         } else {
2196                 /* tp_block_nr == 0: only ring teardown (tp_frame_nr == 0) is valid */
2198                 err = -EINVAL;
2199                 if (unlikely(req->tp_frame_nr))
2200                         goto out;
2201         }
2202
2203         lock_sock(sk);
2204
2205         /* Detach socket from network */
2206         spin_lock(&po->bind_lock);
2207         was_running = po->running;
2208         num = po->num;
2209         if (was_running) {
2210                 __dev_remove_pack(&po->prot_hook);
2211                 po->num = 0;
2212                 po->running = 0;
2213                 __sock_put(sk);
2214         }
2215         spin_unlock(&po->bind_lock);
2216
2217         synchronize_net();
2218
2219         err = -EBUSY;
2220         mutex_lock(&po->pg_vec_lock);
2221         if (closing || atomic_read(&po->mapped) == 0) {
2222                 err = 0;
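/* XC(a, b): store b in a and evaluate to a's old value */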
2223 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2224                 spin_lock_bh(&rb_queue->lock);
2225                 pg_vec = XC(rb->pg_vec, pg_vec);
2226                 rb->frame_max = (req->tp_frame_nr - 1);
2227                 rb->head = 0;
2228                 rb->frame_size = req->tp_frame_size;
2229                 spin_unlock_bh(&rb_queue->lock);
2230
2231                 order = XC(rb->pg_vec_order, order);
2232                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2233
2234                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2235                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2236                                                 tpacket_rcv : packet_rcv;
2237                 skb_queue_purge(rb_queue);
2238 #undef XC
2239                 if (atomic_read(&po->mapped))
2240                         pr_err("packet_mmap: vma is busy: %d\n",
2241                                atomic_read(&po->mapped));
2242         }
2243         mutex_unlock(&po->pg_vec_lock);
2244
2245         spin_lock(&po->bind_lock);
2246         if (was_running && !po->running) {
2247                 sock_hold(sk);
2248                 po->running = 1;
2249                 po->num = num;
2250                 dev_add_pack(&po->prot_hook);
2251         }
2252         spin_unlock(&po->bind_lock);
2253
2254         release_sock(sk);
2255
2256         if (pg_vec)
2257                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2258 out:
2259         return err;
2260 }
2261
2262 static int packet_mmap(struct file *file, struct socket *sock,
2263                 struct vm_area_struct *vma)
2264 {
2265         struct sock *sk = sock->sk;
2266         struct packet_sock *po = pkt_sk(sk);
2267         unsigned long size, expected_size;
2268         struct packet_ring_buffer *rb;
2269         unsigned long start;
2270         int err = -EINVAL;
2271         int i;
2272
2273         if (vma->vm_pgoff)
2274                 return -EINVAL;
2275
2276         mutex_lock(&po->pg_vec_lock);
2277
2278         expected_size = 0;
2279         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2280                 if (rb->pg_vec) {
2281                         expected_size += rb->pg_vec_len
2282                                                 * rb->pg_vec_pages
2283                                                 * PAGE_SIZE;
2284                 }
2285         }
2286
2287         if (expected_size == 0)
2288                 goto out;
2289
2290         size = vma->vm_end - vma->vm_start;
2291         if (size != expected_size)
2292                 goto out;
2293
2294         start = vma->vm_start;
2295         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2296                 if (rb->pg_vec == NULL)
2297                         continue;
2298
2299                 for (i = 0; i < rb->pg_vec_len; i++) {
2300                         struct page *page = virt_to_page(rb->pg_vec[i]);
2301                         int pg_num;
2302
2303                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2304                                         pg_num++, page++) {
2305                                 err = vm_insert_page(vma, start, page);
2306                                 if (unlikely(err))
2307                                         goto out;
2308                                 start += PAGE_SIZE;
2309                         }
2310                 }
2311         }
2312
2313         atomic_inc(&po->mapped);
2314         vma->vm_ops = &packet_mmap_ops;
2315         err = 0;
2316
2317 out:
2318         mutex_unlock(&po->pg_vec_lock);
2319         return err;
2320 }
2321 #endif
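/*
 * Example (userspace, minimal sketch): attaching and mapping an RX ring,
 * assuming a kernel built with CONFIG_PACKET_MMAP.  packet_set_ring()
 * above requires tp_block_size to be a multiple of PAGE_SIZE,
 * tp_frame_size to be TPACKET_ALIGNMENT-aligned and at least tp_hdrlen +
 * tp_reserve, and tp_frame_nr to equal
 * (tp_block_size / tp_frame_size) * tp_block_nr; packet_mmap() then
 * expects one mapping of the whole ring at pgoff 0.  Error handling is
 * elided.
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 16384,
 *		.tp_block_nr	= 4,
 *		.tp_frame_size	= 2048,
 *		.tp_frame_nr	= 32,
 *	};
 *	void *ring;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */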
2322
2323
2324 static const struct proto_ops packet_ops_spkt = {
2325         .family =       PF_PACKET,
2326         .owner =        THIS_MODULE,
2327         .release =      packet_release,
2328         .bind =         packet_bind_spkt,
2329         .connect =      sock_no_connect,
2330         .socketpair =   sock_no_socketpair,
2331         .accept =       sock_no_accept,
2332         .getname =      packet_getname_spkt,
2333         .poll =         datagram_poll,
2334         .ioctl =        packet_ioctl,
2335         .listen =       sock_no_listen,
2336         .shutdown =     sock_no_shutdown,
2337         .setsockopt =   sock_no_setsockopt,
2338         .getsockopt =   sock_no_getsockopt,
2339         .sendmsg =      packet_sendmsg_spkt,
2340         .recvmsg =      packet_recvmsg,
2341         .mmap =         sock_no_mmap,
2342         .sendpage =     sock_no_sendpage,
2343 };
2344
2345 static const struct proto_ops packet_ops = {
2346         .family =       PF_PACKET,
2347         .owner =        THIS_MODULE,
2348         .release =      packet_release,
2349         .bind =         packet_bind,
2350         .connect =      sock_no_connect,
2351         .socketpair =   sock_no_socketpair,
2352         .accept =       sock_no_accept,
2353         .getname =      packet_getname,
2354         .poll =         packet_poll,
2355         .ioctl =        packet_ioctl,
2356         .listen =       sock_no_listen,
2357         .shutdown =     sock_no_shutdown,
2358         .setsockopt =   packet_setsockopt,
2359         .getsockopt =   packet_getsockopt,
2360         .sendmsg =      packet_sendmsg,
2361         .recvmsg =      packet_recvmsg,
2362         .mmap =         packet_mmap,
2363         .sendpage =     sock_no_sendpage,
2364 };
2365
2366 static const struct net_proto_family packet_family_ops = {
2367         .family =       PF_PACKET,
2368         .create =       packet_create,
2369         .owner  =       THIS_MODULE,
2370 };
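/*
 * Example (userspace, minimal sketch): the family registered above routes
 * socket(2) to packet_create(); a later bind(2) with a sockaddr_ll pins
 * the socket to one interface.  Assumes CAP_NET_RAW, <linux/if_ether.h>
 * and <net/if.h>, and that "eth0" is a stand-in device name; error
 * handling is elided.
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	struct sockaddr_ll sll;
 *
 *	memset(&sll, 0, sizeof(sll));
 *	sll.sll_family = AF_PACKET;
 *	sll.sll_protocol = htons(ETH_P_ALL);
 *	sll.sll_ifindex = if_nametoindex("eth0");
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */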
2371
2372 static struct notifier_block packet_netdev_notifier = {
2373         .notifier_call =        packet_notifier,
2374 };
2375
2376 #ifdef CONFIG_PROC_FS
2377 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2378 {
2379         struct sock *s;
2380         struct hlist_node *node;
2381
2382         sk_for_each(s, node, &net->packet.sklist) {
2383                 if (!off--)
2384                         return s;
2385         }
2386         return NULL;
2387 }
2388
2389 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2390         __acquires(seq_file_net(seq)->packet.sklist_lock)
2391 {
2392         struct net *net = seq_file_net(seq);
2393         read_lock(&net->packet.sklist_lock);
2394         return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2395 }
2396
2397 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2398 {
2399         struct net *net = seq_file_net(seq);
2400         ++*pos;
2401         return (v == SEQ_START_TOKEN)
2402                 ? sk_head(&net->packet.sklist)
2403                 : sk_next((struct sock *)v);
2404 }
2405
2406 static void packet_seq_stop(struct seq_file *seq, void *v)
2407         __releases(seq_file_net(seq)->packet.sklist_lock)
2408 {
2409         struct net *net = seq_file_net(seq);
2410         read_unlock(&net->packet.sklist_lock);
2411 }
2412
2413 static int packet_seq_show(struct seq_file *seq, void *v)
2414 {
2415         if (v == SEQ_START_TOKEN)
2416                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2417         else {
2418                 struct sock *s = v;
2419                 const struct packet_sock *po = pkt_sk(s);
2420
2421                 seq_printf(seq,
2422                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2423                            s,
2424                            atomic_read(&s->sk_refcnt),
2425                            s->sk_type,
2426                            ntohs(po->num),
2427                            po->ifindex,
2428                            po->running,
2429                            atomic_read(&s->sk_rmem_alloc),
2430                            sock_i_uid(s),
2431                            sock_i_ino(s));
2432         }
2433
2434         return 0;
2435 }
2436
2437 static const struct seq_operations packet_seq_ops = {
2438         .start  = packet_seq_start,
2439         .next   = packet_seq_next,
2440         .stop   = packet_seq_stop,
2441         .show   = packet_seq_show,
2442 };
2443
2444 static int packet_seq_open(struct inode *inode, struct file *file)
2445 {
2446         return seq_open_net(inode, file, &packet_seq_ops,
2447                             sizeof(struct seq_net_private));
2448 }
2449
2450 static const struct file_operations packet_seq_fops = {
2451         .owner          = THIS_MODULE,
2452         .open           = packet_seq_open,
2453         .read           = seq_read,
2454         .llseek         = seq_lseek,
2455         .release        = seq_release_net,
2456 };
2457
2458 #endif
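/*
 * Example (userspace, minimal sketch): the seq_file above appears per
 * network namespace as /proc/net/packet (registered by packet_net_init()
 * below), one line per packet socket under the header row printed by
 * packet_seq_show().  Error handling is elided.
 *
 *	FILE *f = fopen("/proc/net/packet", "r");
 *	char line[256];
 *
 *	while (f && fgets(line, sizeof(line), f))
 *		fputs(line, stdout);
 */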
2459
2460 static int packet_net_init(struct net *net)
2461 {
2462         rwlock_init(&net->packet.sklist_lock);
2463         INIT_HLIST_HEAD(&net->packet.sklist);
2464
2465         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2466                 return -ENOMEM;
2467
2468         return 0;
2469 }
2470
2471 static void packet_net_exit(struct net *net)
2472 {
2473         proc_net_remove(net, "packet");
2474 }
2475
2476 static struct pernet_operations packet_net_ops = {
2477         .init = packet_net_init,
2478         .exit = packet_net_exit,
2479 };
2480
2481
2482 static void __exit packet_exit(void)
2483 {
2484         unregister_netdevice_notifier(&packet_netdev_notifier);
2485         unregister_pernet_subsys(&packet_net_ops);
2486         sock_unregister(PF_PACKET);
2487         proto_unregister(&packet_proto);
2488 }
2489
2490 static int __init packet_init(void)
2491 {
2492         int rc = proto_register(&packet_proto, 0);
2493
2494         if (rc != 0)
2495                 goto out;
2496
2497         sock_register(&packet_family_ops);
2498         register_pernet_subsys(&packet_net_ops);
2499         register_netdevice_notifier(&packet_netdev_notifier);
2500 out:
2501         return rc;
2502 }
2503
2504 module_init(packet_init);
2505 module_exit(packet_exit);
2506 MODULE_LICENSE("GPL");
2507 MODULE_ALIAS_NETPROTO(PF_PACKET);