af_packet: add interframe drop cmsg (v6)
net/packet/af_packet.c [safe/jmp/linux-2.6]
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              PACKET - implements raw packet sockets.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *              Alan Cox        :       verify_area() now used correctly
 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 *              Alan Cox        :       tidied skbuff lists.
 *              Alan Cox        :       Now uses generic datagram routines I
 *                                      added. Also fixed the peek/read crash
 *                                      from all old Linux datagram code.
 *              Alan Cox        :       Uses the improved datagram code.
 *              Alan Cox        :       Added NULL's for socket options.
 *              Alan Cox        :       Re-commented the code.
 *              Alan Cox        :       Use new kernel side addressing
 *              Rob Janssen     :       Correct MTU usage.
 *              Dave Platt      :       Counter leaks caused by incorrect
 *                                      interrupt locking and some slightly
 *                                      dubious gcc output. Can you read
 *                                      compiler: it said _VOLATILE_
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       New buffers. Use sk->mac.raw.
 *              Alan Cox        :       sendmsg/recvmsg support.
 *              Alan Cox        :       Protocol setting support
 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 *      Cyrus Durgin            :       Fixed kerneld for kmod.
 *      Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
 *                                      The convention is that longer addresses
 *                                      will simply extend the hardware address
 *                                      byte arrays at the end of sockaddr_ll
 *                                      and packet_mreq.
 *              Johann Baudy    :       Added TX RING.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - if a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit in the reserved space (tunnel); others are silly (PPP).
   - a packet socket receives packets with the ll header pulled,
     so that SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header != NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header == NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header. PPP does this, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header == NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Resume
  If dev->hard_header == NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

struct packet_mclist {
        struct packet_mclist    *next;
        int                     ifindex;
        int                     count;
        unsigned short          type;
        unsigned short          alen;
        unsigned char           addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
        int             mr_ifindex;
        unsigned short  mr_type;
        unsigned short  mr_alen;
        unsigned char   mr_address[MAX_ADDR_LEN];
};

#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring);

struct packet_ring_buffer {
        char                    **pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
        unsigned int            frame_max;

        unsigned int            pg_vec_order;
        unsigned int            pg_vec_pages;
        unsigned int            pg_vec_len;

        atomic_t                pending;
};
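
/*
 * Editor's sketch (not part of the original file): the geometry above is
 * derived from the tpacket_req a user passes in with PACKET_RX_RING or
 * PACKET_TX_RING. With the illustrative values below,
 * frames_per_block = tp_block_size / tp_frame_size = 2 and
 * frame_max = tp_frame_nr - 1 = 127:
 *
 *      struct tpacket_req req = {
 *              .tp_block_size  = 4096,
 *              .tp_block_nr    = 64,
 *              .tp_frame_size  = 2048,
 *              .tp_frame_nr    = 128,
 *      };
 */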

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
#endif

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
        /* struct sock has to be the first member of packet_sock */
        struct sock             sk;
        struct tpacket_stats    stats;
#ifdef CONFIG_PACKET_MMAP
        struct packet_ring_buffer       rx_ring;
        struct packet_ring_buffer       tx_ring;
        int                     copy_thresh;
#endif
        struct packet_type      prot_hook;
        spinlock_t              bind_lock;
        struct mutex            pg_vec_lock;
        unsigned int            running:1,      /* prot_hook is attached */
                                auxdata:1,
                                origdev:1;
        int                     ifindex;        /* bound device         */
        __be16                  num;
        struct packet_mclist    *mclist;
#ifdef CONFIG_PACKET_MMAP
        atomic_t                mapped;
        enum tpacket_versions   tp_version;
        unsigned int            tp_hdrlen;
        unsigned int            tp_reserve;
        unsigned int            tp_loss:1;
#endif
};

struct packet_skb_cb {
        unsigned int origlen;
        union {
                struct sockaddr_pkt pkt;
                struct sockaddr_ll ll;
        } sa;
};

#define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))

#ifdef CONFIG_PACKET_MMAP

static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_status = status;
                flush_dcache_page(virt_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
                h.h2->tp_status = status;
                flush_dcache_page(virt_to_page(&h.h2->tp_status));
                break;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
        }

        smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        smp_rmb();

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                flush_dcache_page(virt_to_page(&h.h1->tp_status));
                return h.h1->tp_status;
        case TPACKET_V2:
                flush_dcache_page(virt_to_page(&h.h2->tp_status));
                return h.h2->tp_status;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
                return 0;
        }
}

static void *packet_lookup_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                unsigned int position,
                int status)
{
        unsigned int pg_vec_pos, frame_offset;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        pg_vec_pos = position / rb->frames_per_block;
        frame_offset = position % rb->frames_per_block;

        h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);

        if (status != __packet_get_status(po, h.raw))
                return NULL;

        return h.raw;
}

static inline void *packet_current_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        return packet_lookup_frame(po, rb, rb->head, status);
}

static inline void *packet_previous_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
        return packet_lookup_frame(po, rb, previous, status);
}

static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
        buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

#endif
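
/*
 * Editor's sketch (userspace, not part of the original file) of the
 * consumer side of the helpers above, for TPACKET_V1 and the req from
 * the previous example; error handling omitted:
 *
 *      int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *      setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *      void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *                        PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *      struct tpacket_hdr *hdr = ring;          (frame 0)
 *      struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *      poll(&pfd, 1, -1);
 *      if (hdr->tp_status & TP_STATUS_USER) {
 *              (consume tp_snaplen bytes at (char *)hdr + hdr->tp_mac,
 *               then hand the frame back to the kernel:)
 *              hdr->tp_status = TP_STATUS_KERNEL;
 *      }
 */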

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(atomic_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD)) {
                pr_err("Attempt to release alive packet socket: %p\n", sk);
                return;
        }

        sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         *      When we registered the protocol we saved the socket in the data
         *      field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         *      Yank back the headers [hope the device set this
         *      right or kerboom...]
         *
         *      Incoming packets have the ll header pulled,
         *      push it back.
         *
         *      For outgoing ones skb->data == skb_mac_header(skb)
         *      so that this procedure is a no-op.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if (dev_net(dev) != sock_net(sk))
                goto out;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                goto oom;

        /* drop any routing info */
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spkt = &PACKET_SKB_CB(skb)->sa.pkt;

        skb_push(skb, skb->data - skb_mac_header(skb));

        /*
         *      The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         *      Charge the memory to the socket. This is done specifically
         *      to prevent sockets using all the memory up.
         */

        if (sock_queue_rcv_skb(sk, skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}


/*
 *      Output a raw packet to a device layer. This bypasses all the other
 *      protocol layers and you must therefore supply it with a complete frame.
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
                               struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto = 0;
        int err;

        /*
         *      Get and verify the address.
         */

        if (saddr) {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return -EINVAL;
                if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
                        proto = saddr->spkt_protocol;
        } else
                return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */

        /*
         *      Find the device first to size check it.
         */

        saddr->spkt_device[13] = 0;
        dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        /*
         * You may not queue a frame bigger than the mtu. This is the lowest level
         * raw protocol and you must do your own fragmentation at this level.
         */

        err = -EMSGSIZE;
        if (len > dev->mtu + dev->hard_header_len)
                goto out_unlock;

        err = -ENOBUFS;
        skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

        /*
         * If the write buffer is full, then tough. At this level the user
         * gets to deal with the problem - do your own algorithmic backoffs.
         * That's far more flexible.
         */

        if (skb == NULL)
                goto out_unlock;

        /*
         *      Fill it in.
         */

        /* FIXME: Save some space for broken drivers that write a
         * hard header at transmission time by themselves. PPP is the
         * notable one here. This should really be fixed at the driver level.
         */
        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        /* Try to align data part correctly */
        if (dev->header_ops) {
                skb->data -= dev->hard_header_len;
                skb->tail -= dev->hard_header_len;
                if (len < dev->hard_header_len)
                        skb_reset_network_header(skb);
        }

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        if (err)
                goto out_free;

        /*
         *      Now send it.
         */

        dev_queue_xmit(skb);
        dev_put(dev);
        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
        return err;
}
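
/*
 * Editor's sketch (userspace): the path above is driven by a sendto()
 * carrying a sockaddr_pkt naming the device; "frame" must already be a
 * complete link-layer frame, since no header construction or
 * fragmentation happens at this level:
 *
 *      int fd = socket(AF_PACKET, SOCK_PACKET, htons(ETH_P_ALL));
 *      struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET,
 *                                   .spkt_protocol = htons(ETH_P_IP) };
 *      strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *      sendto(fd, frame, frame_len, 0,
 *             (struct sockaddr *)&spkt, sizeof(spkt));
 */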

static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
                                      unsigned int res)
{
        struct sk_filter *filter;

        rcu_read_lock_bh();
        filter = rcu_dereference(sk->sk_filter);
        if (filter != NULL)
                res = sk_run_filter(skb, filter->insns, filter->len);
        rcu_read_unlock_bh();

        return res;
}
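
/*
 * Editor's sketch (userspace): the filter consulted above is installed
 * with SO_ATTACH_FILTER. A one-instruction accept-all program, for
 * illustration; a filter returning 0 drops the packet and a smaller
 * constant truncates snaplen:
 *
 *      struct sock_filter insns[] = {
 *              { BPF_RET | BPF_K, 0, 0, 0xffffffff },
 *      };
 *      struct sock_fprog prog = { .len = 1, .filter = insns };
 *      setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */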

/*
 * If we've lost frames since the last time we queued one to the
 * sk_receive_queue, we need to record it here.
 * This must be called under the protection of the socket lock
 * to prevent racing with other softirqs and user space.
 */
static inline void record_packet_gap(struct sk_buff *skb,
                                        struct packet_sock *po)
{
        /*
         * We overload the mark field here, since we're about
         * to enqueue to a receive queue and nobody else will
         * use this field at this point.
         */
        skb->mark = po->stats.tp_gap;
        po->stats.tp_gap = 0;
}

static inline __u32 check_packet_gap(struct sk_buff *skb)
{
        return skb->mark;
}

/*
   This function performs lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on exit,
   we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_ll *sll;
        struct packet_sock *po;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (dev_net(dev) != sock_net(sk))
                goto drop;

        skb->dev = dev;

        if (dev->header_ops) {
                /* The device has an explicit notion of ll header,
                   exported to higher levels.

                   Otherwise, the device hides details of its frame
                   structure, so that the corresponding packet head is
                   never delivered to the user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf)
                goto drop_n_acct;

        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                kfree_skb(skb);
                skb = nskb;
        }

        BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
                     sizeof(skb->cb));

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

        PACKET_SKB_CB(skb)->origlen = skb->len;

        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_packets++;
        record_packet_gap(skb, po);
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk, skb->len);
        return 0;

drop_n_acct:
        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_drops++;
        po->stats.tp_gap++;
        spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        consume_skb(skb);
        return 0;
}

#ifdef CONFIG_PACKET_MMAP
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
        unsigned short macoff, netoff, hdrlen;
        struct sk_buff *copy_skb = NULL;
        struct timeval tv;
        struct timespec ts;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (dev_net(dev) != sock_net(sk))
                goto drop;

        if (dev->header_ops) {
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL)
                status |= TP_STATUS_CSUMNOTREADY;

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
                                  po->tp_reserve;
        } else {
                unsigned maclen = skb_network_offset(skb);
                netoff = TPACKET_ALIGN(po->tp_hdrlen +
                                       (maclen < 16 ? 16 : maclen)) +
                        po->tp_reserve;
                macoff = netoff - maclen;
        }

        if (macoff + snaplen > po->rx_ring.frame_size) {
                if (po->copy_thresh &&
                    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
                    (unsigned)sk->sk_rcvbuf) {
                        if (skb_shared(skb)) {
                                copy_skb = skb_clone(skb, GFP_ATOMIC);
                        } else {
                                copy_skb = skb_get(skb);
                                skb_head = skb->data;
                        }
                        if (copy_skb)
                                skb_set_owner_r(copy_skb, sk);
                }
                snaplen = po->rx_ring.frame_size - macoff;
                if ((int)snaplen < 0)
                        snaplen = 0;
        }

        spin_lock(&sk->sk_receive_queue.lock);
        h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
        if (!h.raw)
                goto ring_is_full;
        packet_increment_head(&po->rx_ring);
        po->stats.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        if (!po->stats.tp_drops)
                status &= ~TP_STATUS_LOSING;
        spin_unlock(&sk->sk_receive_queue.lock);

        skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_len = skb->len;
                h.h1->tp_snaplen = snaplen;
                h.h1->tp_mac = macoff;
                h.h1->tp_net = netoff;
                if (skb->tstamp.tv64)
                        tv = ktime_to_timeval(skb->tstamp);
                else
                        do_gettimeofday(&tv);
                h.h1->tp_sec = tv.tv_sec;
                h.h1->tp_usec = tv.tv_usec;
                hdrlen = sizeof(*h.h1);
                break;
        case TPACKET_V2:
                h.h2->tp_len = skb->len;
                h.h2->tp_snaplen = snaplen;
                h.h2->tp_mac = macoff;
                h.h2->tp_net = netoff;
                if (skb->tstamp.tv64)
                        ts = ktime_to_timespec(skb->tstamp);
                else
                        getnstimeofday(&ts);
                h.h2->tp_sec = ts.tv_sec;
                h.h2->tp_nsec = ts.tv_nsec;
                h.h2->tp_vlan_tci = skb->vlan_tci;
                hdrlen = sizeof(*h.h2);
                break;
        default:
                BUG();
        }

        sll = h.raw + TPACKET_ALIGN(hdrlen);
        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        __packet_set_status(po, h.raw, status);
        smp_mb();
        {
                struct page *p_start, *p_end;
                u8 *h_end = h.raw + macoff + snaplen - 1;

                p_start = virt_to_page(h.raw);
                p_end = virt_to_page(h_end);
                while (p_start <= p_end) {
                        flush_dcache_page(p_start);
                        p_start++;
                }
        }

        sk->sk_data_ready(sk, 0);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;

ring_is_full:
        po->stats.tp_drops++;
        po->stats.tp_gap++;
        spin_unlock(&sk->sk_receive_queue.lock);

        sk->sk_data_ready(sk, 0);
        kfree_skb(copy_skb);
        goto drop_n_restore;
}

static void tpacket_destruct_skb(struct sk_buff *skb)
{
        struct packet_sock *po = pkt_sk(skb->sk);
        void *ph;

        BUG_ON(skb == NULL);

        if (likely(po->tx_ring.pg_vec)) {
                ph = skb_shinfo(skb)->destructor_arg;
                BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
                BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
                atomic_dec(&po->tx_ring.pending);
                __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
        }

        sock_wfree(skb);
}

static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                void *frame, struct net_device *dev, int size_max,
                __be16 proto, unsigned char *addr)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } ph;
        int to_write, offset, len, tp_len, nr_frags, len_max;
        struct socket *sock = po->sk.sk_socket;
        struct page *page;
        void *data;
        int err;

        ph.raw = frame;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = po->sk.sk_priority;
        skb_shinfo(skb)->destructor_arg = ph.raw;

        switch (po->tp_version) {
        case TPACKET_V2:
                tp_len = ph.h2->tp_len;
                break;
        default:
                tp_len = ph.h1->tp_len;
                break;
        }
        if (unlikely(tp_len > size_max)) {
                pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
                return -EMSGSIZE;
        }

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
        to_write = tp_len;

        if (sock->type == SOCK_DGRAM) {
                err = dev_hard_header(skb, dev, ntohs(proto), addr,
                                NULL, tp_len);
                if (unlikely(err < 0))
                        return -EINVAL;
        } else if (dev->hard_header_len) {
                /* net device doesn't like empty head */
                if (unlikely(tp_len <= dev->hard_header_len)) {
                        pr_err("packet size is too short (%d < %d)\n",
                               tp_len, dev->hard_header_len);
                        return -EINVAL;
                }

                skb_push(skb, dev->hard_header_len);
                err = skb_store_bits(skb, 0, data,
                                dev->hard_header_len);
                if (unlikely(err))
                        return err;

                data += dev->hard_header_len;
                to_write -= dev->hard_header_len;
        }

        err = -EFAULT;
        page = virt_to_page(data);
        offset = offset_in_page(data);
        len_max = PAGE_SIZE - offset;
        len = ((to_write > len_max) ? len_max : to_write);

        skb->data_len = to_write;
        skb->len += to_write;
        skb->truesize += to_write;
        atomic_add(to_write, &po->sk.sk_wmem_alloc);

        while (likely(to_write)) {
                nr_frags = skb_shinfo(skb)->nr_frags;

                if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
                        pr_err("packet exceeds the number of skb frags (%lu)\n",
                               MAX_SKB_FRAGS);
                        return -EFAULT;
                }

                flush_dcache_page(page);
                get_page(page);
                skb_fill_page_desc(skb,
                                nr_frags,
                                page++, offset, len);
                to_write -= len;
                offset = 0;
                len_max = PAGE_SIZE;
                len = ((to_write > len_max) ? len_max : to_write);
        }

        return tp_len;
}

static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
        struct socket *sock;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        int ifindex, err, reserve = 0;
        void *ph;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        int tp_len, size_max;
        unsigned char *addr;
        int len_sum = 0;
        int status = 0;

        sock = po->sk.sk_socket;

        mutex_lock(&po->pg_vec_lock);

        err = -EBUSY;
        if (saddr == NULL) {
                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen
                                        + offsetof(struct sockaddr_ll,
                                                sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }

        dev = dev_get_by_index(sock_net(&po->sk), ifindex);
        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out;

        reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;

        size_max = po->tx_ring.frame_size
                - sizeof(struct skb_shared_info)
                - po->tp_hdrlen
                - LL_ALLOCATED_SPACE(dev)
                - sizeof(struct sockaddr_ll);

        if (size_max > dev->mtu + reserve)
                size_max = dev->mtu + reserve;

        do {
                ph = packet_current_frame(po, &po->tx_ring,
                                TP_STATUS_SEND_REQUEST);

                if (unlikely(ph == NULL)) {
                        schedule();
                        continue;
                }

                status = TP_STATUS_SEND_REQUEST;
                skb = sock_alloc_send_skb(&po->sk,
                                LL_ALLOCATED_SPACE(dev)
                                + sizeof(struct sockaddr_ll),
                                0, &err);

                if (unlikely(skb == NULL))
                        goto out_status;

                tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
                                addr);

                if (unlikely(tp_len < 0)) {
                        if (po->tp_loss) {
                                __packet_set_status(po, ph,
                                                TP_STATUS_AVAILABLE);
                                packet_increment_head(&po->tx_ring);
                                kfree_skb(skb);
                                continue;
                        } else {
                                status = TP_STATUS_WRONG_FORMAT;
                                err = tp_len;
                                goto out_status;
                        }
                }

                skb->destructor = tpacket_destruct_skb;
                __packet_set_status(po, ph, TP_STATUS_SENDING);
                atomic_inc(&po->tx_ring.pending);

                status = TP_STATUS_SEND_REQUEST;
                err = dev_queue_xmit(skb);
                if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
                        goto out_xmit;
                packet_increment_head(&po->tx_ring);
                len_sum += tp_len;
        } while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
                                        && (atomic_read(&po->tx_ring.pending))))
              );

        err = len_sum;
        goto out_put;

out_xmit:
        skb->destructor = sock_wfree;
        atomic_dec(&po->tx_ring.pending);
out_status:
        __packet_set_status(po, ph, status);
        kfree_skb(skb);
out_put:
        dev_put(dev);
out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}
#endif
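
/*
 * Editor's sketch (userspace) of driving tpacket_snd() above through the
 * TX ring (TPACKET_V1 layout; setup mirrors the RX example but uses
 * PACKET_TX_RING, and PACKET_LOSS selects the tp_loss behaviour). The
 * data offset is assumed to match the one tpacket_fill_skb() computes,
 * and next_tx_frame() is a hypothetical helper walking the mmap()ed ring:
 *
 *      struct tpacket_hdr *hdr = next_tx_frame(ring);
 *      char *data = (char *)hdr + TPACKET_HDRLEN
 *                      - sizeof(struct sockaddr_ll);
 *
 *      memcpy(data, frame, frame_len);
 *      hdr->tp_len = frame_len;
 *      hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *      send(fd, NULL, 0, 0);            (kick the transmit loop)
 *
 * On completion tpacket_destruct_skb() flips the frame back to
 * TP_STATUS_AVAILABLE so the producer can reuse it.
 */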

static int packet_snd(struct socket *sock,
                          struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        unsigned char *addr;
        int ifindex, err, reserve = 0;

        /*
         *      Get and verify the address.
         */

        if (saddr == NULL) {
                struct packet_sock *po = pkt_sk(sk);

                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }


        dev = dev_get_by_index(sock_net(sk), ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        err = -EMSGSIZE;
        if (len > dev->mtu+reserve)
                goto out_unlock;

        skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
                                msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto out_unlock;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        err = -EINVAL;
        if (sock->type == SOCK_DGRAM &&
            dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
                goto out_free;

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
        if (err)
                goto out_free;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;

        /*
         *      Now send it.
         */

        err = dev_queue_xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;

        dev_put(dev);

        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
out:
        return err;
}
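
/*
 * Editor's sketch (userspace): on a SOCK_DGRAM packet socket the
 * sockaddr_ll names the device and destination, and dev_hard_header()
 * above builds the link-layer header around the payload:
 *
 *      struct sockaddr_ll sll = {
 *              .sll_family     = AF_PACKET,
 *              .sll_protocol   = htons(ETH_P_IP),
 *              .sll_ifindex    = if_nametoindex("eth0"),
 *              .sll_halen      = ETH_ALEN,
 *      };
 *
 *      memcpy(sll.sll_addr, dest_mac, ETH_ALEN);
 *      sendto(fd, payload, payload_len, 0,
 *             (struct sockaddr *)&sll, sizeof(sll));
 */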

static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
                struct msghdr *msg, size_t len)
{
#ifdef CONFIG_PACKET_MMAP
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        if (po->tx_ring.pg_vec)
                return tpacket_snd(po, msg);
        else
#endif
                return packet_snd(sock, msg, len);
}

/*
 *      Close a PACKET socket. This is fairly simple. We immediately go
 *      to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct net *net;
#ifdef CONFIG_PACKET_MMAP
        struct tpacket_req req;
#endif

        if (!sk)
                return 0;

        net = sock_net(sk);
        po = pkt_sk(sk);

        write_lock_bh(&net->packet.sklist_lock);
        sk_del_node_init(sk);
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        write_unlock_bh(&net->packet.sklist_lock);

        /*
         *      Unhook packet receive handler.
         */

        if (po->running) {
                /*
                 *      Remove the protocol hook
                 */
                dev_remove_pack(&po->prot_hook);
                po->running = 0;
                po->num = 0;
                __sock_put(sk);
        }

        packet_flush_mclist(sk);

#ifdef CONFIG_PACKET_MMAP
        memset(&req, 0, sizeof(req));

        if (po->rx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 0);

        if (po->tx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 1);
#endif

        /*
         *      Now the socket is dead. No more input will appear.
         */

        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);
        sk_refcnt_debug_release(sk);

        sock_put(sk);
        return 0;
}

/*
 *      Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
        struct packet_sock *po = pkt_sk(sk);
        /*
         *      Detach an existing hook if present.
         */

        lock_sock(sk);

        spin_lock(&po->bind_lock);
        if (po->running) {
                __sock_put(sk);
                po->running = 0;
                po->num = 0;
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        po->ifindex = dev ? dev->ifindex : 0;

        if (protocol == 0)
                goto out_unlock;

        if (!dev || (dev->flags & IFF_UP)) {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        } else {
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_error_report(sk);
        }

out_unlock:
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}

/*
 *      Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
                            int addr_len)
{
        struct sock *sk = sock->sk;
        char name[15];
        struct net_device *dev;
        int err = -ENODEV;

        /*
         *      Check legality
         */

        if (addr_len != sizeof(struct sockaddr))
                return -EINVAL;
        strlcpy(name, uaddr->sa_data, sizeof(name));

        dev = dev_get_by_name(sock_net(sk), name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
        struct sock *sk = sock->sk;
        struct net_device *dev = NULL;
        int err;


        /*
         *      Check legality
         */

        if (addr_len < sizeof(struct sockaddr_ll))
                return -EINVAL;
        if (sll->sll_family != AF_PACKET)
                return -EINVAL;

        if (sll->sll_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
                if (dev == NULL)
                        goto out;
        }
        err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
        if (dev)
                dev_put(dev);

out:
        return err;
}
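
/*
 * Editor's sketch (userspace): binding an AF_PACKET socket to a single
 * interface, which takes the packet_do_bind() path above:
 *
 *      struct sockaddr_ll sll = {
 *              .sll_family     = AF_PACKET,
 *              .sll_protocol   = htons(ETH_P_ALL),
 *              .sll_ifindex    = if_nametoindex("eth0"),
 *      };
 *      bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */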

static struct proto packet_proto = {
        .name     = "PACKET",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct packet_sock),
};

/*
 *      Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol)
{
        struct sock *sk;
        struct packet_sock *po;
        __be16 proto = (__force __be16)protocol; /* weird, but documented */
        int err;

        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
        if (sk == NULL)
                goto out;

        sock->ops = &packet_ops;
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;

        sock_init_data(sock, sk);

        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        po->num = proto;

        sk->sk_destruct = packet_sock_destruct;
        sk_refcnt_debug_inc(sk);

        /*
         *      Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        mutex_init(&po->pg_vec_lock);
        po->prot_hook.func = packet_rcv;

        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;

        po->prot_hook.af_packet_priv = sk;

        if (proto) {
                po->prot_hook.type = proto;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

        write_lock_bh(&net->packet.sklist_lock);
        sk_add_node(sk, &net->packet.sklist);
        sock_prot_inuse_add(net, &packet_proto, 1);
        write_unlock_bh(&net->packet.sklist_lock);
        return 0;
out:
        return err;
}

/*
 *      Pull a packet from our receive queue and hand it to the user.
 *      If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len, int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;
        struct sockaddr_ll *sll;
        __u32 gap;

        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        /*
         *      Call the generic datagram receiver. This handles all sorts
         *      of horrible races and re-entrancy so we can forget about it
         *      in the protocol layers.
         *
         *      Now it will return ENETDOWN if the device has just gone down,
         *      but then it will block.
         */

        skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

        /*
         *      An error occurred so return it. Because skb_recv_datagram()
         *      handles the blocking, we don't need to see and worry about
         *      blocking retries.
         */

        if (skb == NULL)
                goto out;

        /*
         *      If the address length field is there to be filled in, we fill
         *      it in now.
         */

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        if (sock->type == SOCK_PACKET)
                msg->msg_namelen = sizeof(struct sockaddr_pkt);
        else
                msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

        /*
         *      You lose any data beyond the buffer you gave. If it worries a
         *      user program they can ask the device for its MTU anyway.
         */

        copied = skb->len;
        if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
        }

        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free;

        sock_recv_timestamp(msg, sk, skb);

        if (msg->msg_name)
                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
                       msg->msg_namelen);

        if (pkt_sk(sk)->auxdata) {
                struct tpacket_auxdata aux;

                aux.tp_status = TP_STATUS_USER;
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        aux.tp_status |= TP_STATUS_CSUMNOTREADY;
                aux.tp_len = PACKET_SKB_CB(skb)->origlen;
                aux.tp_snaplen = skb->len;
                aux.tp_mac = 0;
                aux.tp_net = skb_network_offset(skb);
                aux.tp_vlan_tci = skb->vlan_tci;

                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
        }

        gap = check_packet_gap(skb);
        if (gap)
                put_cmsg(msg, SOL_PACKET, PACKET_GAPDATA, sizeof(__u32), &gap);

        /*
         *      Free or return the buffer as appropriate. Again this
         *      hides all the races and re-entrancy issues from us.
         */
        err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;
}
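
/*
 * Editor's sketch (userspace): reading the PACKET_AUXDATA and
 * PACKET_GAPDATA control messages emitted above. PACKET_AUXDATA must be
 * enabled with setsockopt(); the gap cmsg appears only when frames were
 * dropped since the previous delivery. (msg would also need msg_iov set
 * up to receive the packet itself.)
 *
 *      int one = 1;
 *      char cbuf[CMSG_SPACE(sizeof(struct tpacket_auxdata)) +
 *                CMSG_SPACE(sizeof(__u32))];
 *      struct msghdr msg = { .msg_control = cbuf,
 *                            .msg_controllen = sizeof(cbuf) };
 *      struct cmsghdr *cmsg;
 *      __u32 dropped = 0;
 *
 *      setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *      recvmsg(fd, &msg, 0);
 *      for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
 *           cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *              if (cmsg->cmsg_level == SOL_PACKET &&
 *                  cmsg->cmsg_type == PACKET_GAPDATA)
 *                      dropped = *(__u32 *)CMSG_DATA(cmsg);
 *      }
 */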

static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                               int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;

        if (peer)
                return -EOPNOTSUPP;

        uaddr->sa_family = AF_PACKET;
        dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
        if (dev) {
                strlcpy(uaddr->sa_data, dev->name, 15);
                dev_put(dev);
        } else
                memset(uaddr->sa_data, 0, 14);
        *uaddr_len = sizeof(*uaddr);

        return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
                          int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;

        if (peer)
                return -EOPNOTSUPP;

        sll->sll_family = AF_PACKET;
        sll->sll_ifindex = po->ifindex;
        sll->sll_protocol = po->num;
        dev = dev_get_by_index(sock_net(sk), po->ifindex);
        if (dev) {
                sll->sll_hatype = dev->type;
                sll->sll_halen = dev->addr_len;
                memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
                dev_put(dev);
        } else {
                sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
                sll->sll_halen = 0;
        }
        *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

        return 0;
}

static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
                         int what)
{
        switch (i->type) {
        case PACKET_MR_MULTICAST:
                if (what > 0)
                        return dev_mc_add(dev, i->addr, i->alen, 0);
                else
                        return dev_mc_delete(dev, i->addr, i->alen, 0);
                break;
        case PACKET_MR_PROMISC:
                return dev_set_promiscuity(dev, what);
                break;
        case PACKET_MR_ALLMULTI:
                return dev_set_allmulti(dev, what);
                break;
        case PACKET_MR_UNICAST:
                if (what > 0)
                        return dev_unicast_add(dev, i->addr);
                else
                        return dev_unicast_delete(dev, i->addr);
                break;
        default:
                break;
        }
        return 0;
}

static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
{
        for ( ; i; i = i->next) {
                if (i->ifindex == dev->ifindex)
                        packet_dev_mc(dev, i, what);
        }
}

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
        struct packet_sock *po = pkt_sk(sk);
        struct packet_mclist *ml, *i;
        struct net_device *dev;
        int err;

        rtnl_lock();

        err = -ENODEV;
        dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
        if (!dev)
                goto done;

        err = -EINVAL;
        if (mreq->mr_alen > dev->addr_len)
                goto done;

        err = -ENOBUFS;
        i = kmalloc(sizeof(*i), GFP_KERNEL);
        if (i == NULL)
                goto done;

        err = 0;
        for (ml = po->mclist; ml; ml = ml->next) {
                if (ml->ifindex == mreq->mr_ifindex &&
                    ml->type == mreq->mr_type &&
                    ml->alen == mreq->mr_alen &&
                    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
                        ml->count++;
                        /* Free the new element ... */
                        kfree(i);
                        goto done;
                }
        }

        i->type = mreq->mr_type;
        i->ifindex = mreq->mr_ifindex;
        i->alen = mreq->mr_alen;
        memcpy(i->addr, mreq->mr_address, i->alen);
        i->count = 1;
        i->next = po->mclist;
        po->mclist = i;
        err = packet_dev_mc(dev, i, 1);
        if (err) {
                po->mclist = i->next;
                kfree(i);
        }

done:
        rtnl_unlock();
        return err;
}
1683
1684 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1685 {
1686         struct packet_mclist *ml, **mlp;
1687
1688         rtnl_lock();
1689
1690         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1691                 if (ml->ifindex == mreq->mr_ifindex &&
1692                     ml->type == mreq->mr_type &&
1693                     ml->alen == mreq->mr_alen &&
1694                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1695                         if (--ml->count == 0) {
1696                                 struct net_device *dev;
1697                                 *mlp = ml->next;
1698                                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1699                                 if (dev) {
1700                                         packet_dev_mc(dev, ml, -1);
1701                                         dev_put(dev);
1702                                 }
1703                                 kfree(ml);
1704                         }
1705                         rtnl_unlock();
1706                         return 0;
1707                 }
1708         }
1709         rtnl_unlock();
1710         return -EADDRNOTAVAIL;
1711 }
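/*
 * Illustrative userspace sketch (not kernel code): exercising the add/drop
 * paths above.  PACKET_MR_PROMISC reaches dev_set_promiscuity() through
 * packet_dev_mc(); dropping the membership decrements the same refcount.
 * The interface index 2 is an assumption made for the example.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = 2,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 *	setsockopt(fd, SOL_PACKET, PACKET_DROP_MEMBERSHIP, &mreq, sizeof(mreq));
 */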
1712
1713 static void packet_flush_mclist(struct sock *sk)
1714 {
1715         struct packet_sock *po = pkt_sk(sk);
1716         struct packet_mclist *ml;
1717
1718         if (!po->mclist)
1719                 return;
1720
1721         rtnl_lock();
1722         while ((ml = po->mclist) != NULL) {
1723                 struct net_device *dev;
1724
1725                 po->mclist = ml->next;
1726                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1727                 if (dev != NULL) {
1728                         packet_dev_mc(dev, ml, -1);
1729                         dev_put(dev);
1730                 }
1731                 kfree(ml);
1732         }
1733         rtnl_unlock();
1734 }
1735
1736 static int
1737 packet_setsockopt(struct socket *sock, int level, int optname,
                        char __user *optval, unsigned int optlen)
1738 {
1739         struct sock *sk = sock->sk;
1740         struct packet_sock *po = pkt_sk(sk);
1741         int ret;
1742
1743         if (level != SOL_PACKET)
1744                 return -ENOPROTOOPT;
1745
1746         switch (optname) {
1747         case PACKET_ADD_MEMBERSHIP:
1748         case PACKET_DROP_MEMBERSHIP:
1749         {
1750                 struct packet_mreq_max mreq;
1751                 int len = optlen;
1752                 memset(&mreq, 0, sizeof(mreq));
1753                 if (len < sizeof(struct packet_mreq))
1754                         return -EINVAL;
1755                 if (len > sizeof(mreq))
1756                         len = sizeof(mreq);
1757                 if (copy_from_user(&mreq, optval, len))
1758                         return -EFAULT;
1759                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1760                         return -EINVAL;
1761                 if (optname == PACKET_ADD_MEMBERSHIP)
1762                         ret = packet_mc_add(sk, &mreq);
1763                 else
1764                         ret = packet_mc_drop(sk, &mreq);
1765                 return ret;
1766         }
1767
1768 #ifdef CONFIG_PACKET_MMAP
1769         case PACKET_RX_RING:
1770         case PACKET_TX_RING:
1771         {
1772                 struct tpacket_req req;
1773
1774                 if (optlen < sizeof(req))
1775                         return -EINVAL;
1776                 if (copy_from_user(&req, optval, sizeof(req)))
1777                         return -EFAULT;
1778                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1779         }
1780         case PACKET_COPY_THRESH:
1781         {
1782                 int val;
1783
1784                 if (optlen != sizeof(val))
1785                         return -EINVAL;
1786                 if (copy_from_user(&val, optval, sizeof(val)))
1787                         return -EFAULT;
1788
1789                 pkt_sk(sk)->copy_thresh = val;
1790                 return 0;
1791         }
1792         case PACKET_VERSION:
1793         {
1794                 int val;
1795
1796                 if (optlen != sizeof(val))
1797                         return -EINVAL;
1798                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1799                         return -EBUSY;
1800                 if (copy_from_user(&val, optval, sizeof(val)))
1801                         return -EFAULT;
1802                 switch (val) {
1803                 case TPACKET_V1:
1804                 case TPACKET_V2:
1805                         po->tp_version = val;
1806                         return 0;
1807                 default:
1808                         return -EINVAL;
1809                 }
1810         }
1811         case PACKET_RESERVE:
1812         {
1813                 unsigned int val;
1814
1815                 if (optlen != sizeof(val))
1816                         return -EINVAL;
1817                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1818                         return -EBUSY;
1819                 if (copy_from_user(&val, optval, sizeof(val)))
1820                         return -EFAULT;
1821                 po->tp_reserve = val;
1822                 return 0;
1823         }
1824         case PACKET_LOSS:
1825         {
1826                 unsigned int val;
1827
1828                 if (optlen != sizeof(val))
1829                         return -EINVAL;
1830                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1831                         return -EBUSY;
1832                 if (copy_from_user(&val, optval, sizeof(val)))
1833                         return -EFAULT;
1834                 po->tp_loss = !!val;
1835                 return 0;
1836         }
1837 #endif
1838         case PACKET_AUXDATA:
1839         {
1840                 int val;
1841
1842                 if (optlen < sizeof(val))
1843                         return -EINVAL;
1844                 if (copy_from_user(&val, optval, sizeof(val)))
1845                         return -EFAULT;
1846
1847                 po->auxdata = !!val;
1848                 return 0;
1849         }
1850         case PACKET_ORIGDEV:
1851         {
1852                 int val;
1853
1854                 if (optlen < sizeof(val))
1855                         return -EINVAL;
1856                 if (copy_from_user(&val, optval, sizeof(val)))
1857                         return -EFAULT;
1858
1859                 po->origdev = !!val;
1860                 return 0;
1861         }
1862         default:
1863                 return -ENOPROTOOPT;
1864         }
1865 }
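/*
 * Illustrative userspace sketch (not kernel code): enabling PACKET_AUXDATA
 * and walking the control message that recvmsg() then attaches to each
 * packet.  Buffer sizes are assumptions made for the example.
 *
 *	struct tpacket_auxdata *aux;
 *	struct cmsghdr *c;
 *	char frame[2048], ctrl[CMSG_SPACE(sizeof(*aux))];
 *	struct iovec iov = { .iov_base = frame, .iov_len = sizeof(frame) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
 *	};
 *	int one = 1;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *	recvmsg(fd, &msg, 0);
 *	for (c = CMSG_FIRSTHDR(&msg); c; c = CMSG_NXTHDR(&msg, c))
 *		if (c->cmsg_level == SOL_PACKET && c->cmsg_type == PACKET_AUXDATA)
 *			aux = (struct tpacket_auxdata *)CMSG_DATA(c);
 */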
1866
1867 static int packet_getsockopt(struct socket *sock, int level, int optname,
1868                              char __user *optval, int __user *optlen)
1869 {
1870         int len;
1871         int val;
1872         struct sock *sk = sock->sk;
1873         struct packet_sock *po = pkt_sk(sk);
1874         void *data;
1875         struct tpacket_stats st;
1876
1877         if (level != SOL_PACKET)
1878                 return -ENOPROTOOPT;
1879
1880         if (get_user(len, optlen))
1881                 return -EFAULT;
1882
1883         if (len < 0)
1884                 return -EINVAL;
1885
1886         switch (optname) {
1887         case PACKET_STATISTICS:
1888                 if (len > sizeof(struct tpacket_stats))
1889                         len = sizeof(struct tpacket_stats);
1890                 spin_lock_bh(&sk->sk_receive_queue.lock);
1891                 st = po->stats;
1892                 memset(&po->stats, 0, sizeof(st));
1893                 spin_unlock_bh(&sk->sk_receive_queue.lock);
                     /* ABI: tp_packets reports all frames seen, drops included */
1894                 st.tp_packets += st.tp_drops;
1895
1896                 data = &st;
1897                 break;
1898         case PACKET_AUXDATA:
1899                 if (len > sizeof(int))
1900                         len = sizeof(int);
1901                 val = po->auxdata;
1902
1903                 data = &val;
1904                 break;
1905         case PACKET_ORIGDEV:
1906                 if (len > sizeof(int))
1907                         len = sizeof(int);
1908                 val = po->origdev;
1909
1910                 data = &val;
1911                 break;
1912 #ifdef CONFIG_PACKET_MMAP
1913         case PACKET_VERSION:
1914                 if (len > sizeof(int))
1915                         len = sizeof(int);
1916                 val = po->tp_version;
1917                 data = &val;
1918                 break;
1919         case PACKET_HDRLEN:
1920                 if (len > sizeof(int))
1921                         len = sizeof(int);
                     if (len < sizeof(int))
                             return -EINVAL;
1922                 if (copy_from_user(&val, optval, len))
1923                         return -EFAULT;
1924                 switch (val) {
1925                 case TPACKET_V1:
1926                         val = sizeof(struct tpacket_hdr);
1927                         break;
1928                 case TPACKET_V2:
1929                         val = sizeof(struct tpacket2_hdr);
1930                         break;
1931                 default:
1932                         return -EINVAL;
1933                 }
1934                 data = &val;
1935                 break;
1936         case PACKET_RESERVE:
1937                 if (len > sizeof(unsigned int))
1938                         len = sizeof(unsigned int);
1939                 val = po->tp_reserve;
1940                 data = &val;
1941                 break;
1942         case PACKET_LOSS:
1943                 if (len > sizeof(unsigned int))
1944                         len = sizeof(unsigned int);
1945                 val = po->tp_loss;
1946                 data = &val;
1947                 break;
1948 #endif
1949         default:
1950                 return -ENOPROTOOPT;
1951         }
1952
1953         if (put_user(len, optlen))
1954                 return -EFAULT;
1955         if (copy_to_user(optval, data, len))
1956                 return -EFAULT;
1957         return 0;
1958 }
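/*
 * Illustrative userspace sketch (not kernel code): PACKET_STATISTICS is
 * read-and-clear (the counters are zeroed under the receive-queue lock
 * above), and tp_packets is reported inclusive of tp_drops.
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	printf("seen %u, dropped %u\n", st.tp_packets, st.tp_drops);
 */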
1959
1960
1961 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1962 {
1963         struct sock *sk;
1964         struct hlist_node *node;
1965         struct net_device *dev = data;
1966         struct net *net = dev_net(dev);
1967
1968         read_lock(&net->packet.sklist_lock);
1969         sk_for_each(sk, node, &net->packet.sklist) {
1970                 struct packet_sock *po = pkt_sk(sk);
1971
1972                 switch (msg) {
1973                 case NETDEV_UNREGISTER:
1974                         if (po->mclist)
1975                                 packet_dev_mclist(dev, po->mclist, -1);
1976                         /* fallthrough */
1977
1978                 case NETDEV_DOWN:
1979                         if (dev->ifindex == po->ifindex) {
1980                                 spin_lock(&po->bind_lock);
1981                                 if (po->running) {
1982                                         __dev_remove_pack(&po->prot_hook);
1983                                         __sock_put(sk);
1984                                         po->running = 0;
1985                                         sk->sk_err = ENETDOWN;
1986                                         if (!sock_flag(sk, SOCK_DEAD))
1987                                                 sk->sk_error_report(sk);
1988                                 }
1989                                 if (msg == NETDEV_UNREGISTER) {
1990                                         po->ifindex = -1;
1991                                         po->prot_hook.dev = NULL;
1992                                 }
1993                                 spin_unlock(&po->bind_lock);
1994                         }
1995                         break;
1996                 case NETDEV_UP:
1997                         spin_lock(&po->bind_lock);
1998                         if (dev->ifindex == po->ifindex && po->num &&
1999                             !po->running) {
2000                                 dev_add_pack(&po->prot_hook);
2001                                 sock_hold(sk);
2002                                 po->running = 1;
2003                         }
2004                         spin_unlock(&po->bind_lock);
2005                         break;
2006                 }
2007         }
2008         read_unlock(&net->packet.sklist_lock);
2009         return NOTIFY_DONE;
2010 }
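/*
 * Illustrative note (not kernel code): once NETDEV_DOWN sets sk_err to
 * ENETDOWN above, the next receive on a bound packet socket reports the
 * error, e.g.
 *
 *	ssize_t n = recvmsg(fd, &msg, 0);
 *	(n == -1 with errno == ENETDOWN)
 */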
2011
2012
2013 static int packet_ioctl(struct socket *sock, unsigned int cmd,
2014                         unsigned long arg)
2015 {
2016         struct sock *sk = sock->sk;
2017
2018         switch (cmd) {
2019         case SIOCOUTQ:
2020         {
2021                 int amount = sk_wmem_alloc_get(sk);
2022
2023                 return put_user(amount, (int __user *)arg);
2024         }
2025         case SIOCINQ:
2026         {
2027                 struct sk_buff *skb;
2028                 int amount = 0;
2029
2030                 spin_lock_bh(&sk->sk_receive_queue.lock);
2031                 skb = skb_peek(&sk->sk_receive_queue);
2032                 if (skb)
2033                         amount = skb->len;
2034                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2035                 return put_user(amount, (int __user *)arg);
2036         }
2037         case SIOCGSTAMP:
2038                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2039         case SIOCGSTAMPNS:
2040                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2041
2042 #ifdef CONFIG_INET
2043         case SIOCADDRT:
2044         case SIOCDELRT:
2045         case SIOCDARP:
2046         case SIOCGARP:
2047         case SIOCSARP:
2048         case SIOCGIFADDR:
2049         case SIOCSIFADDR:
2050         case SIOCGIFBRDADDR:
2051         case SIOCSIFBRDADDR:
2052         case SIOCGIFNETMASK:
2053         case SIOCSIFNETMASK:
2054         case SIOCGIFDSTADDR:
2055         case SIOCSIFDSTADDR:
2056         case SIOCSIFFLAGS:
2057                 if (!net_eq(sock_net(sk), &init_net))
2058                         return -ENOIOCTLCMD;
2059                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2060 #endif
2061
2062         default:
2063                 return -ENOIOCTLCMD;
2064         }
2065         return 0;
2066 }
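/*
 * Illustrative userspace sketch (not kernel code): on a packet socket
 * SIOCINQ reports the length of the next queued frame (skb_peek above),
 * not the whole backlog, while SIOCOUTQ reports allocated write memory.
 *
 *	int next_len, unsent;
 *
 *	ioctl(fd, SIOCINQ, &next_len);
 *	ioctl(fd, SIOCOUTQ, &unsent);
 */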
2067
2068 #ifndef CONFIG_PACKET_MMAP
2069 #define packet_mmap sock_no_mmap
2070 #define packet_poll datagram_poll
2071 #else
2072
2073 static unsigned int packet_poll(struct file *file, struct socket *sock,
2074                                 poll_table *wait)
2075 {
2076         struct sock *sk = sock->sk;
2077         struct packet_sock *po = pkt_sk(sk);
2078         unsigned int mask = datagram_poll(file, sock, wait);
2079
2080         spin_lock_bh(&sk->sk_receive_queue.lock);
2081         if (po->rx_ring.pg_vec) {
2082                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2083                         mask |= POLLIN | POLLRDNORM;
2084         }
2085         spin_unlock_bh(&sk->sk_receive_queue.lock);
2086         spin_lock_bh(&sk->sk_write_queue.lock);
2087         if (po->tx_ring.pg_vec) {
2088                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2089                         mask |= POLLOUT | POLLWRNORM;
2090         }
2091         spin_unlock_bh(&sk->sk_write_queue.lock);
2092         return mask;
2093 }
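/*
 * Illustrative userspace sketch (not kernel code): with a ring mapped,
 * poll() returns POLLIN as soon as the previous rx slot is no longer
 * TP_STATUS_KERNEL, and POLLOUT while a tx slot is TP_STATUS_AVAILABLE.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *
 *	poll(&pfd, 1, -1);
 *	(then consume rx frames until tp_status returns to TP_STATUS_KERNEL)
 */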
2094
2095
2096 /* Dirty? Well, I still have not found a better way to account
2097  * for user mmaps.
2098  */
2099
2100 static void packet_mm_open(struct vm_area_struct *vma)
2101 {
2102         struct file *file = vma->vm_file;
2103         struct socket *sock = file->private_data;
2104         struct sock *sk = sock->sk;
2105
2106         if (sk)
2107                 atomic_inc(&pkt_sk(sk)->mapped);
2108 }
2109
2110 static void packet_mm_close(struct vm_area_struct *vma)
2111 {
2112         struct file *file = vma->vm_file;
2113         struct socket *sock = file->private_data;
2114         struct sock *sk = sock->sk;
2115
2116         if (sk)
2117                 atomic_dec(&pkt_sk(sk)->mapped);
2118 }
2119
2120 static const struct vm_operations_struct packet_mmap_ops = {
2121         .open   =       packet_mm_open,
2122         .close  =       packet_mm_close,
2123 };
2124
2125 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2126 {
2127         int i;
2128
2129         for (i = 0; i < len; i++) {
2130                 if (likely(pg_vec[i]))
2131                         free_pages((unsigned long) pg_vec[i], order);
2132         }
2133         kfree(pg_vec);
2134 }
2135
2136 static inline char *alloc_one_pg_vec_page(unsigned long order)
2137 {
2138         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2139
2140         return (char *) __get_free_pages(gfp_flags, order);
2141 }
2142
2143 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2144 {
2145         unsigned int block_nr = req->tp_block_nr;
2146         char **pg_vec;
2147         int i;
2148
2149         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2150         if (unlikely(!pg_vec))
2151                 goto out;
2152
2153         for (i = 0; i < block_nr; i++) {
2154                 pg_vec[i] = alloc_one_pg_vec_page(order);
2155                 if (unlikely(!pg_vec[i]))
2156                         goto out_free_pgvec;
2157         }
2158
2159 out:
2160         return pg_vec;
2161
2162 out_free_pgvec:
2163         free_pg_vec(pg_vec, order, block_nr);
2164         pg_vec = NULL;
2165         goto out;
2166 }
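/*
 * Worked example of the caller's sizing (see packet_set_ring below): with
 * 4 KiB pages and tp_block_size == 8192, get_order(8192) == 1, so each
 * entry in the vector is a single order-1 (two-page) allocation.
 */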
2167
2168 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2169                 int closing, int tx_ring)
2170 {
2171         char **pg_vec = NULL;
2172         struct packet_sock *po = pkt_sk(sk);
2173         int was_running, order = 0;
2174         struct packet_ring_buffer *rb;
2175         struct sk_buff_head *rb_queue;
2176         __be16 num;
2177         int err;
2178
2179         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2180         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2181
2182         err = -EBUSY;
2183         if (!closing) {
2184                 if (atomic_read(&po->mapped))
2185                         goto out;
2186                 if (atomic_read(&rb->pending))
2187                         goto out;
2188         }
2189
2190         if (req->tp_block_nr) {
2191                 /* Sanity tests and some calculations */
2192                 err = -EBUSY;
2193                 if (unlikely(rb->pg_vec))
2194                         goto out;
2195
2196                 switch (po->tp_version) {
2197                 case TPACKET_V1:
2198                         po->tp_hdrlen = TPACKET_HDRLEN;
2199                         break;
2200                 case TPACKET_V2:
2201                         po->tp_hdrlen = TPACKET2_HDRLEN;
2202                         break;
2203                 }
2204
2205                 err = -EINVAL;
2206                 if (unlikely((int)req->tp_block_size <= 0))
2207                         goto out;
2208                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2209                         goto out;
2210                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2211                                         po->tp_reserve))
2212                         goto out;
2213                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2214                         goto out;
2215
2216                 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2217                 if (unlikely(rb->frames_per_block <= 0))
2218                         goto out;
2219                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2220                                         req->tp_frame_nr))
2221                         goto out;
2222
2223                 err = -ENOMEM;
2224                 order = get_order(req->tp_block_size);
2225                 pg_vec = alloc_pg_vec(req, order);
2226                 if (unlikely(!pg_vec))
2227                         goto out;
2228         } else {
2231                 err = -EINVAL;
2232                 if (unlikely(req->tp_frame_nr))
2233                         goto out;
2234         }
2235
2236         lock_sock(sk);
2237
2238         /* Detach socket from network */
2239         spin_lock(&po->bind_lock);
2240         was_running = po->running;
2241         num = po->num;
2242         if (was_running) {
2243                 __dev_remove_pack(&po->prot_hook);
2244                 po->num = 0;
2245                 po->running = 0;
2246                 __sock_put(sk);
2247         }
2248         spin_unlock(&po->bind_lock);
2249
2250         synchronize_net();
2251
2252         err = -EBUSY;
2253         mutex_lock(&po->pg_vec_lock);
2254         if (closing || atomic_read(&po->mapped) == 0) {
2255                 err = 0;
2256 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2257                 spin_lock_bh(&rb_queue->lock);
2258                 pg_vec = XC(rb->pg_vec, pg_vec);
2259                 rb->frame_max = (req->tp_frame_nr - 1);
2260                 rb->head = 0;
2261                 rb->frame_size = req->tp_frame_size;
2262                 spin_unlock_bh(&rb_queue->lock);
2263
2264                 order = XC(rb->pg_vec_order, order);
2265                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2266
2267                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2268                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2269                                                 tpacket_rcv : packet_rcv;
2270                 skb_queue_purge(rb_queue);
2271 #undef XC
2272                 if (atomic_read(&po->mapped))
2273                         pr_err("packet_mmap: vma is busy: %d\n",
2274                                atomic_read(&po->mapped));
2275         }
2276         mutex_unlock(&po->pg_vec_lock);
2277
2278         spin_lock(&po->bind_lock);
2279         if (was_running && !po->running) {
2280                 sock_hold(sk);
2281                 po->running = 1;
2282                 po->num = num;
2283                 dev_add_pack(&po->prot_hook);
2284         }
2285         spin_unlock(&po->bind_lock);
2286
2287         release_sock(sk);
2288
2289         if (pg_vec)
2290                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2291 out:
2292         return err;
2293 }
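/*
 * Illustrative userspace sketch (not kernel code): a tpacket_req that
 * passes every sanity test above, assuming 4 KiB pages -- the block size
 * is page-aligned, the frame size is a multiple of TPACKET_ALIGNMENT and
 * large enough for the header, and tp_frame_nr equals
 * frames_per_block * tp_block_nr.  The sizes themselves are assumptions.
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 64 * (4096 / 2048),
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */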
2294
2295 static int packet_mmap(struct file *file, struct socket *sock,
2296                 struct vm_area_struct *vma)
2297 {
2298         struct sock *sk = sock->sk;
2299         struct packet_sock *po = pkt_sk(sk);
2300         unsigned long size, expected_size;
2301         struct packet_ring_buffer *rb;
2302         unsigned long start;
2303         int err = -EINVAL;
2304         int i;
2305
2306         if (vma->vm_pgoff)
2307                 return -EINVAL;
2308
2309         mutex_lock(&po->pg_vec_lock);
2310
2311         expected_size = 0;
             /* relies on rx_ring and tx_ring being adjacent in struct packet_sock */
2312         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2313                 if (rb->pg_vec) {
2314                         expected_size += rb->pg_vec_len
2315                                                 * rb->pg_vec_pages
2316                                                 * PAGE_SIZE;
2317                 }
2318         }
2319
2320         if (expected_size == 0)
2321                 goto out;
2322
2323         size = vma->vm_end - vma->vm_start;
2324         if (size != expected_size)
2325                 goto out;
2326
2327         start = vma->vm_start;
2328         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2329                 if (rb->pg_vec == NULL)
2330                         continue;
2331
2332                 for (i = 0; i < rb->pg_vec_len; i++) {
2333                         struct page *page = virt_to_page(rb->pg_vec[i]);
2334                         int pg_num;
2335
2336                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2337                                         pg_num++, page++) {
2338                                 err = vm_insert_page(vma, start, page);
2339                                 if (unlikely(err))
2340                                         goto out;
2341                                 start += PAGE_SIZE;
2342                         }
2343                 }
2344         }
2345
2346         atomic_inc(&po->mapped);
2347         vma->vm_ops = &packet_mmap_ops;
2348         err = 0;
2349
2350 out:
2351         mutex_unlock(&po->pg_vec_lock);
2352         return err;
2353 }
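/*
 * Illustrative userspace sketch (not kernel code): the mapping must start
 * at pgoff 0 and cover the configured rings exactly (expected_size above),
 * rx ring first, tx ring immediately after.  With only an rx ring:
 *
 *	size_t sz = req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, sz, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */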
2354 #endif
2355
2356
2357 static const struct proto_ops packet_ops_spkt = {
2358         .family =       PF_PACKET,
2359         .owner =        THIS_MODULE,
2360         .release =      packet_release,
2361         .bind =         packet_bind_spkt,
2362         .connect =      sock_no_connect,
2363         .socketpair =   sock_no_socketpair,
2364         .accept =       sock_no_accept,
2365         .getname =      packet_getname_spkt,
2366         .poll =         datagram_poll,
2367         .ioctl =        packet_ioctl,
2368         .listen =       sock_no_listen,
2369         .shutdown =     sock_no_shutdown,
2370         .setsockopt =   sock_no_setsockopt,
2371         .getsockopt =   sock_no_getsockopt,
2372         .sendmsg =      packet_sendmsg_spkt,
2373         .recvmsg =      packet_recvmsg,
2374         .mmap =         sock_no_mmap,
2375         .sendpage =     sock_no_sendpage,
2376 };
2377
2378 static const struct proto_ops packet_ops = {
2379         .family =       PF_PACKET,
2380         .owner =        THIS_MODULE,
2381         .release =      packet_release,
2382         .bind =         packet_bind,
2383         .connect =      sock_no_connect,
2384         .socketpair =   sock_no_socketpair,
2385         .accept =       sock_no_accept,
2386         .getname =      packet_getname,
2387         .poll =         packet_poll,
2388         .ioctl =        packet_ioctl,
2389         .listen =       sock_no_listen,
2390         .shutdown =     sock_no_shutdown,
2391         .setsockopt =   packet_setsockopt,
2392         .getsockopt =   packet_getsockopt,
2393         .sendmsg =      packet_sendmsg,
2394         .recvmsg =      packet_recvmsg,
2395         .mmap =         packet_mmap,
2396         .sendpage =     sock_no_sendpage,
2397 };
2398
2399 static struct net_proto_family packet_family_ops = {
2400         .family =       PF_PACKET,
2401         .create =       packet_create,
2402         .owner  =       THIS_MODULE,
2403 };
2404
2405 static struct notifier_block packet_netdev_notifier = {
2406         .notifier_call =        packet_notifier,
2407 };
2408
2409 #ifdef CONFIG_PROC_FS
2410 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2411 {
2412         struct sock *s;
2413         struct hlist_node *node;
2414
2415         sk_for_each(s, node, &net->packet.sklist) {
2416                 if (!off--)
2417                         return s;
2418         }
2419         return NULL;
2420 }
2421
2422 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2423         __acquires(seq_file_net(seq)->packet.sklist_lock)
2424 {
2425         struct net *net = seq_file_net(seq);
2426         read_lock(&net->packet.sklist_lock);
2427         return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2428 }
2429
2430 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2431 {
2432         struct net *net = seq_file_net(seq);
2433         ++*pos;
2434         return (v == SEQ_START_TOKEN)
2435                 ? sk_head(&net->packet.sklist)
2436                 : sk_next((struct sock *)v);
2437 }
2438
2439 static void packet_seq_stop(struct seq_file *seq, void *v)
2440         __releases(seq_file_net(seq)->packet.sklist_lock)
2441 {
2442         struct net *net = seq_file_net(seq);
2443         read_unlock(&net->packet.sklist_lock);
2444 }
2445
2446 static int packet_seq_show(struct seq_file *seq, void *v)
2447 {
2448         if (v == SEQ_START_TOKEN)
2449                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2450         else {
2451                 struct sock *s = v;
2452                 const struct packet_sock *po = pkt_sk(s);
2453
2454                 seq_printf(seq,
2455                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2456                            s,
2457                            atomic_read(&s->sk_refcnt),
2458                            s->sk_type,
2459                            ntohs(po->num),
2460                            po->ifindex,
2461                            po->running,
2462                            atomic_read(&s->sk_rmem_alloc),
2463                            sock_i_uid(s),
2464                            sock_i_ino(s));
2465         }
2466
2467         return 0;
2468 }
2469
2470 static const struct seq_operations packet_seq_ops = {
2471         .start  = packet_seq_start,
2472         .next   = packet_seq_next,
2473         .stop   = packet_seq_stop,
2474         .show   = packet_seq_show,
2475 };
2476
2477 static int packet_seq_open(struct inode *inode, struct file *file)
2478 {
2479         return seq_open_net(inode, file, &packet_seq_ops,
2480                             sizeof(struct seq_net_private));
2481 }
2482
2483 static const struct file_operations packet_seq_fops = {
2484         .owner          = THIS_MODULE,
2485         .open           = packet_seq_open,
2486         .read           = seq_read,
2487         .llseek         = seq_lseek,
2488         .release        = seq_release_net,
2489 };
2490
2491 #endif
2492
2493 static int packet_net_init(struct net *net)
2494 {
2495         rwlock_init(&net->packet.sklist_lock);
2496         INIT_HLIST_HEAD(&net->packet.sklist);
2497
2498         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2499                 return -ENOMEM;
2500
2501         return 0;
2502 }
2503
2504 static void packet_net_exit(struct net *net)
2505 {
2506         proc_net_remove(net, "packet");
2507 }
2508
2509 static struct pernet_operations packet_net_ops = {
2510         .init = packet_net_init,
2511         .exit = packet_net_exit,
2512 };
2513
2514
2515 static void __exit packet_exit(void)
2516 {
2517         unregister_netdevice_notifier(&packet_netdev_notifier);
2518         unregister_pernet_subsys(&packet_net_ops);
2519         sock_unregister(PF_PACKET);
2520         proto_unregister(&packet_proto);
2521 }
2522
2523 static int __init packet_init(void)
2524 {
2525         int rc = proto_register(&packet_proto, 0);
2526 
2527         if (rc != 0)
2528                 goto out;
2529 
2530         rc = sock_register(&packet_family_ops);
             if (rc)
                     goto out_proto;
2531         rc = register_pernet_subsys(&packet_net_ops);
             if (rc)
                     goto out_sock;
2532         rc = register_netdevice_notifier(&packet_netdev_notifier);
             if (rc)
                     goto out_pernet;
             return 0;
     out_pernet:
             unregister_pernet_subsys(&packet_net_ops);
     out_sock:
             sock_unregister(PF_PACKET);
     out_proto:
             proto_unregister(&packet_proto);
2533 out:
2534         return rc;
2535 }
2536
2537 module_init(packet_init);
2538 module_exit(packet_exit);
2539 MODULE_LICENSE("GPL");
2540 MODULE_ALIAS_NETPROTO(PF_PACKET);