[safe/jmp/linux-2.6] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *              Johann Baudy    :       Added TX RING.
43  *
44  *              This program is free software; you can redistribute it and/or
45  *              modify it under the terms of the GNU General Public License
46  *              as published by the Free Software Foundation; either version
47  *              2 of the License, or (at your option) any later version.
48  *
49  */
50
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82 #include <linux/if_vlan.h>
83 #include <linux/virtio_net.h>
84
85 #ifdef CONFIG_INET
86 #include <net/inet_common.h>
87 #endif
88
89 /*
90    Assumptions:
91    - if a device has no dev->hard_header routine, it adds and removes the ll
92      header itself. In that case the ll header is invisible outside the device,
93      but higher levels should still reserve dev->hard_header_len.
94      Some devices are clever enough to reallocate the skb when the header
95      does not fit into the reserved space (tunnels); others are not
96      (PPP).
97    - a packet socket receives packets with the ll header already pulled,
98      so SOCK_RAW has to push it back.
99
100 On receive:
101 -----------
102
103 Incoming, dev->hard_header!=NULL
104    mac_header -> ll header
105    data       -> data
106
107 Outgoing, dev->hard_header!=NULL
108    mac_header -> ll header
109    data       -> ll header
110
111 Incoming, dev->hard_header==NULL
112    mac_header -> UNKNOWN position. It very likely points to the ll
113                  header.  PPP does this, which is wrong, because it introduces
114                  asymmetry between the rx and tx paths.
115    data       -> data
116
117 Outgoing, dev->hard_header==NULL
118    mac_header -> data. ll header is still not built!
119    data       -> data
120
121 In summary:
122   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
123
124
125 On transmit:
126 ------------
127
128 dev->hard_header != NULL
129    mac_header -> ll header
130    data       -> ll header
131
132 dev->hard_header == NULL (ll header is added by device, we cannot control it)
133    mac_header -> data
134    data       -> data
135
136    We should set nh.raw on output to the correct position;
137    the packet classifier depends on it.
138  */
139
140 /* Private packet socket structures. */
141
142 struct packet_mclist {
143         struct packet_mclist    *next;
144         int                     ifindex;
145         int                     count;
146         unsigned short          type;
147         unsigned short          alen;
148         unsigned char           addr[MAX_ADDR_LEN];
149 };
150 /* identical to struct packet_mreq except it has
151  * a longer address field.
152  */
153 struct packet_mreq_max {
154         int             mr_ifindex;
155         unsigned short  mr_type;
156         unsigned short  mr_alen;
157         unsigned char   mr_address[MAX_ADDR_LEN];
158 };
159
160 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
161                 int closing, int tx_ring);
162
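/* Descriptor for one mmap()ed ring (RX or TX).  pg_vec is an array of
 * memory blocks, each holding frames_per_block frames of frame_size bytes;
 * head indexes the next frame to use and wraps back to 0 after frame_max.
 * pending counts TX frames handed to the device but not yet destructed.
 */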
163 struct packet_ring_buffer {
164         char                    **pg_vec;
165         unsigned int            head;
166         unsigned int            frames_per_block;
167         unsigned int            frame_size;
168         unsigned int            frame_max;
169
170         unsigned int            pg_vec_order;
171         unsigned int            pg_vec_pages;
172         unsigned int            pg_vec_len;
173
174         atomic_t                pending;
175 };
176
177 struct packet_sock;
178 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
179
180 static void packet_flush_mclist(struct sock *sk);
181
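/* Per-socket state of an AF_PACKET socket.  struct sock is embedded as the
 * first member so that sk and packet_sock pointers are interchangeable
 * (see pkt_sk()).  prot_hook is the packet_type entry registered with
 * dev_add_pack() while 'running' is set; rx_ring and tx_ring back the
 * mmap()ed RX/TX rings when configured (see packet_set_ring()).
 */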
182 struct packet_sock {
183         /* struct sock has to be the first member of packet_sock */
184         struct sock             sk;
185         struct tpacket_stats    stats;
186         struct packet_ring_buffer       rx_ring;
187         struct packet_ring_buffer       tx_ring;
188         int                     copy_thresh;
189         spinlock_t              bind_lock;
190         struct mutex            pg_vec_lock;
191         unsigned int            running:1,      /* prot_hook is attached*/
192                                 auxdata:1,
193                                 origdev:1,
194                                 has_vnet_hdr:1;
195         int                     ifindex;        /* bound device         */
196         __be16                  num;
197         struct packet_mclist    *mclist;
198         atomic_t                mapped;
199         enum tpacket_versions   tp_version;
200         unsigned int            tp_hdrlen;
201         unsigned int            tp_reserve;
202         unsigned int            tp_loss:1;
203         struct packet_type      prot_hook ____cacheline_aligned_in_smp;
204 };
205
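/* Layout of skb->cb while a packet sits on a packet socket's receive
 * queue: the original length plus the address later returned to recvmsg().
 */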
206 struct packet_skb_cb {
207         unsigned int origlen;
208         union {
209                 struct sockaddr_pkt pkt;
210                 struct sockaddr_ll ll;
211         } sa;
212 };
213
214 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
215
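/* Frame status words are the handshake between kernel and user space for
 * the mmap()ed rings.  The setter stores tp_status and then issues a write
 * barrier; the getter issues a read barrier before looking at it.  The
 * dcache flushes keep the shared mapping coherent on architectures with
 * aliasing caches.
 */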
216 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
217 {
218         union {
219                 struct tpacket_hdr *h1;
220                 struct tpacket2_hdr *h2;
221                 void *raw;
222         } h;
223
224         h.raw = frame;
225         switch (po->tp_version) {
226         case TPACKET_V1:
227                 h.h1->tp_status = status;
228                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
229                 break;
230         case TPACKET_V2:
231                 h.h2->tp_status = status;
232                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
233                 break;
234         default:
235                 pr_err("TPACKET version not supported\n");
236                 BUG();
237         }
238
239         smp_wmb();
240 }
241
242 static int __packet_get_status(struct packet_sock *po, void *frame)
243 {
244         union {
245                 struct tpacket_hdr *h1;
246                 struct tpacket2_hdr *h2;
247                 void *raw;
248         } h;
249
250         smp_rmb();
251
252         h.raw = frame;
253         switch (po->tp_version) {
254         case TPACKET_V1:
255                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
256                 return h.h1->tp_status;
257         case TPACKET_V2:
258                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
259                 return h.h2->tp_status;
260         default:
261                 pr_err("TPACKET version not supported\n");
262                 BUG();
263                 return 0;
264         }
265 }
266
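/* Map a frame index onto its address inside the ring and return it only
 * if its status word matches 'status' (e.g. TP_STATUS_KERNEL for a free
 * RX frame); otherwise return NULL.
 */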
267 static void *packet_lookup_frame(struct packet_sock *po,
268                 struct packet_ring_buffer *rb,
269                 unsigned int position,
270                 int status)
271 {
272         unsigned int pg_vec_pos, frame_offset;
273         union {
274                 struct tpacket_hdr *h1;
275                 struct tpacket2_hdr *h2;
276                 void *raw;
277         } h;
278
279         pg_vec_pos = position / rb->frames_per_block;
280         frame_offset = position % rb->frames_per_block;
281
282         h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
283
284         if (status != __packet_get_status(po, h.raw))
285                 return NULL;
286
287         return h.raw;
288 }
289
290 static inline void *packet_current_frame(struct packet_sock *po,
291                 struct packet_ring_buffer *rb,
292                 int status)
293 {
294         return packet_lookup_frame(po, rb, rb->head, status);
295 }
296
297 static inline void *packet_previous_frame(struct packet_sock *po,
298                 struct packet_ring_buffer *rb,
299                 int status)
300 {
301         unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
302         return packet_lookup_frame(po, rb, previous, status);
303 }
304
305 static inline void packet_increment_head(struct packet_ring_buffer *buff)
306 {
307         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
308 }
309
310 static inline struct packet_sock *pkt_sk(struct sock *sk)
311 {
312         return (struct packet_sock *)sk;
313 }
314
315 static void packet_sock_destruct(struct sock *sk)
316 {
317         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
318         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
319
320         if (!sock_flag(sk, SOCK_DEAD)) {
321                 pr_err("Attempt to release alive packet socket: %p\n", sk);
322                 return;
323         }
324
325         sk_refcnt_debug_dec(sk);
326 }
327
328
329 static const struct proto_ops packet_ops;
330
331 static const struct proto_ops packet_ops_spkt;
332
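/* Receive hook for SOCK_PACKET sockets: every frame seen on the bound
 * protocol is pushed back to its link-layer header, tagged with the
 * originating device's name and type, and queued on the socket's
 * receive queue.
 */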
333 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
334                            struct packet_type *pt, struct net_device *orig_dev)
335 {
336         struct sock *sk;
337         struct sockaddr_pkt *spkt;
338
339         /*
340          *      When we registered the protocol we saved the socket in the data
341          *      field for just this event.
342          */
343
344         sk = pt->af_packet_priv;
345
346         /*
347          *      Yank back the headers [hope the device set this
348          *      right or kerboom...]
349          *
350          *      Incoming packets have ll header pulled,
351          *      push it back.
352          *
353          *      For outgoing ones skb->data == skb_mac_header(skb)
354          *      so that this procedure is a no-op.
355          */
356
357         if (skb->pkt_type == PACKET_LOOPBACK)
358                 goto out;
359
360         if (!net_eq(dev_net(dev), sock_net(sk)))
361                 goto out;
362
363         skb = skb_share_check(skb, GFP_ATOMIC);
364         if (skb == NULL)
365                 goto oom;
366
367         /* drop any routing info */
368         skb_dst_drop(skb);
369
370         /* drop conntrack reference */
371         nf_reset(skb);
372
373         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
374
375         skb_push(skb, skb->data - skb_mac_header(skb));
376
377         /*
378          *      The SOCK_PACKET socket receives _all_ frames.
379          */
380
381         spkt->spkt_family = dev->type;
382         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
383         spkt->spkt_protocol = skb->protocol;
384
385         /*
386          *      Charge the memory to the socket. This is done specifically
387          *      to prevent sockets using all the memory up.
388          */
389
390         if (sock_queue_rcv_skb(sk, skb) == 0)
391                 return 0;
392
393 out:
394         kfree_skb(skb);
395 oom:
396         return 0;
397 }
398
399
400 /*
401  *      Output a raw packet to a device layer. This bypasses all the other
402  *      protocol layers and you must therefore supply it with a complete frame
403  */
404
405 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
406                                struct msghdr *msg, size_t len)
407 {
408         struct sock *sk = sock->sk;
409         struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
410         struct sk_buff *skb = NULL;
411         struct net_device *dev;
412         __be16 proto = 0;
413         int err;
414
415         /*
416          *      Get and verify the address.
417          */
418
419         if (saddr) {
420                 if (msg->msg_namelen < sizeof(struct sockaddr))
421                         return -EINVAL;
422                 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
423                         proto = saddr->spkt_protocol;
424         } else
425                 return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
426
427         /*
428          *      Find the device first to size check it
429          */
430
431         saddr->spkt_device[13] = 0;
432 retry:
433         rcu_read_lock();
434         dev = dev_get_by_name_rcu(sock_net(sk), saddr->spkt_device);
435         err = -ENODEV;
436         if (dev == NULL)
437                 goto out_unlock;
438
439         err = -ENETDOWN;
440         if (!(dev->flags & IFF_UP))
441                 goto out_unlock;
442
443         /*
444          * You may not queue a frame bigger than the mtu. This is the lowest level
445          * raw protocol and you must do your own fragmentation at this level.
446          */
447
448         err = -EMSGSIZE;
449         if (len > dev->mtu + dev->hard_header_len)
450                 goto out_unlock;
451
452         if (!skb) {
453                 size_t reserved = LL_RESERVED_SPACE(dev);
454                 unsigned int hhlen = dev->header_ops ? dev->hard_header_len : 0;
455
456                 rcu_read_unlock();
457                 skb = sock_wmalloc(sk, len + reserved, 0, GFP_KERNEL);
458                 if (skb == NULL)
459                         return -ENOBUFS;
460                 /* FIXME: Save some space for broken drivers that write a hard
461                  * header at transmission time by themselves. PPP is the notable
462                  * one here. This should really be fixed at the driver level.
463                  */
464                 skb_reserve(skb, reserved);
465                 skb_reset_network_header(skb);
466
467                 /* Try to align data part correctly */
468                 if (hhlen) {
469                         skb->data -= hhlen;
470                         skb->tail -= hhlen;
471                         if (len < hhlen)
472                                 skb_reset_network_header(skb);
473                 }
474                 err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
475                 if (err)
476                         goto out_free;
477                 goto retry;
478         }
479
480
481         skb->protocol = proto;
482         skb->dev = dev;
483         skb->priority = sk->sk_priority;
484         skb->mark = sk->sk_mark;
485
486         dev_queue_xmit(skb);
487         rcu_read_unlock();
488         return len;
489
490 out_unlock:
491         rcu_read_unlock();
492 out_free:
493         kfree_skb(skb);
494         return err;
495 }
496
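/* Run the socket's attached BPF filter (if any) over the skb and return
 * the number of bytes to keep; 0 means drop the packet.
 */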
497 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
498                                       unsigned int res)
499 {
500         struct sk_filter *filter;
501
502         rcu_read_lock_bh();
503         filter = rcu_dereference(sk->sk_filter);
504         if (filter != NULL)
505                 res = sk_run_filter(skb, filter->insns, filter->len);
506         rcu_read_unlock_bh();
507
508         return res;
509 }
510
511 /*
512    This function does lazy skb cloning in the hope that most packets
513    are discarded by BPF.
514
515    Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
516    and skb->cb are mangled. It works because (and until) packets
517    falling here are owned by the current CPU. Output packets are cloned
518    by dev_queue_xmit_nit(), input packets are processed by net_bh
519    sequentially, so if we return the skb to its original state on exit,
520    we will not harm anyone.
521  */
522
523 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
524                       struct packet_type *pt, struct net_device *orig_dev)
525 {
526         struct sock *sk;
527         struct sockaddr_ll *sll;
528         struct packet_sock *po;
529         u8 *skb_head = skb->data;
530         int skb_len = skb->len;
531         unsigned int snaplen, res;
532
533         if (skb->pkt_type == PACKET_LOOPBACK)
534                 goto drop;
535
536         sk = pt->af_packet_priv;
537         po = pkt_sk(sk);
538
539         if (!net_eq(dev_net(dev), sock_net(sk)))
540                 goto drop;
541
542         skb->dev = dev;
543
544         if (dev->header_ops) {
545                 /* The device has an explicit notion of ll header,
546                    exported to higher levels.
547
548                    Otherwise, the device hides the details of its frame
549                    structure, so that the corresponding packet header is
550                    never delivered to the user.
551                  */
552                 if (sk->sk_type != SOCK_DGRAM)
553                         skb_push(skb, skb->data - skb_mac_header(skb));
554                 else if (skb->pkt_type == PACKET_OUTGOING) {
555                         /* Special case: outgoing packets have ll header at head */
556                         skb_pull(skb, skb_network_offset(skb));
557                 }
558         }
559
560         snaplen = skb->len;
561
562         res = run_filter(skb, sk, snaplen);
563         if (!res)
564                 goto drop_n_restore;
565         if (snaplen > res)
566                 snaplen = res;
567
568         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
569             (unsigned)sk->sk_rcvbuf)
570                 goto drop_n_acct;
571
572         if (skb_shared(skb)) {
573                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
574                 if (nskb == NULL)
575                         goto drop_n_acct;
576
577                 if (skb_head != skb->data) {
578                         skb->data = skb_head;
579                         skb->len = skb_len;
580                 }
581                 kfree_skb(skb);
582                 skb = nskb;
583         }
584
585         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
586                      sizeof(skb->cb));
587
588         sll = &PACKET_SKB_CB(skb)->sa.ll;
589         sll->sll_family = AF_PACKET;
590         sll->sll_hatype = dev->type;
591         sll->sll_protocol = skb->protocol;
592         sll->sll_pkttype = skb->pkt_type;
593         if (unlikely(po->origdev))
594                 sll->sll_ifindex = orig_dev->ifindex;
595         else
596                 sll->sll_ifindex = dev->ifindex;
597
598         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
599
600         PACKET_SKB_CB(skb)->origlen = skb->len;
601
602         if (pskb_trim(skb, snaplen))
603                 goto drop_n_acct;
604
605         skb_set_owner_r(skb, sk);
606         skb->dev = NULL;
607         skb_dst_drop(skb);
608
609         /* drop conntrack reference */
610         nf_reset(skb);
611
612         spin_lock(&sk->sk_receive_queue.lock);
613         po->stats.tp_packets++;
614         skb->dropcount = atomic_read(&sk->sk_drops);
615         __skb_queue_tail(&sk->sk_receive_queue, skb);
616         spin_unlock(&sk->sk_receive_queue.lock);
617         sk->sk_data_ready(sk, skb->len);
618         return 0;
619
620 drop_n_acct:
621         po->stats.tp_drops = atomic_inc_return(&sk->sk_drops);
622
623 drop_n_restore:
624         if (skb_head != skb->data && skb_shared(skb)) {
625                 skb->data = skb_head;
626                 skb->len = skb_len;
627         }
628 drop:
629         consume_skb(skb);
630         return 0;
631 }
632
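/* Receive hook used when an RX ring is mapped: instead of queueing the
 * skb, copy up to one frame's worth of data directly into the next
 * TP_STATUS_KERNEL frame of the ring and flag it TP_STATUS_USER.  If the
 * packet does not fit in a frame and copy_thresh allows it, a clone is
 * also queued so recvmsg() can still deliver the full packet.
 */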
633 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
634                        struct packet_type *pt, struct net_device *orig_dev)
635 {
636         struct sock *sk;
637         struct packet_sock *po;
638         struct sockaddr_ll *sll;
639         union {
640                 struct tpacket_hdr *h1;
641                 struct tpacket2_hdr *h2;
642                 void *raw;
643         } h;
644         u8 *skb_head = skb->data;
645         int skb_len = skb->len;
646         unsigned int snaplen, res;
647         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
648         unsigned short macoff, netoff, hdrlen;
649         struct sk_buff *copy_skb = NULL;
650         struct timeval tv;
651         struct timespec ts;
652
653         if (skb->pkt_type == PACKET_LOOPBACK)
654                 goto drop;
655
656         sk = pt->af_packet_priv;
657         po = pkt_sk(sk);
658
659         if (!net_eq(dev_net(dev), sock_net(sk)))
660                 goto drop;
661
662         if (dev->header_ops) {
663                 if (sk->sk_type != SOCK_DGRAM)
664                         skb_push(skb, skb->data - skb_mac_header(skb));
665                 else if (skb->pkt_type == PACKET_OUTGOING) {
666                         /* Special case: outgoing packets have ll header at head */
667                         skb_pull(skb, skb_network_offset(skb));
668                 }
669         }
670
671         if (skb->ip_summed == CHECKSUM_PARTIAL)
672                 status |= TP_STATUS_CSUMNOTREADY;
673
674         snaplen = skb->len;
675
676         res = run_filter(skb, sk, snaplen);
677         if (!res)
678                 goto drop_n_restore;
679         if (snaplen > res)
680                 snaplen = res;
681
682         if (sk->sk_type == SOCK_DGRAM) {
683                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
684                                   po->tp_reserve;
685         } else {
686                 unsigned maclen = skb_network_offset(skb);
687                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
688                                        (maclen < 16 ? 16 : maclen)) +
689                         po->tp_reserve;
690                 macoff = netoff - maclen;
691         }
692
693         if (macoff + snaplen > po->rx_ring.frame_size) {
694                 if (po->copy_thresh &&
695                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
696                     (unsigned)sk->sk_rcvbuf) {
697                         if (skb_shared(skb)) {
698                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
699                         } else {
700                                 copy_skb = skb_get(skb);
701                                 skb_head = skb->data;
702                         }
703                         if (copy_skb)
704                                 skb_set_owner_r(copy_skb, sk);
705                 }
706                 snaplen = po->rx_ring.frame_size - macoff;
707                 if ((int)snaplen < 0)
708                         snaplen = 0;
709         }
710
711         spin_lock(&sk->sk_receive_queue.lock);
712         h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
713         if (!h.raw)
714                 goto ring_is_full;
715         packet_increment_head(&po->rx_ring);
716         po->stats.tp_packets++;
717         if (copy_skb) {
718                 status |= TP_STATUS_COPY;
719                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
720         }
721         if (!po->stats.tp_drops)
722                 status &= ~TP_STATUS_LOSING;
723         spin_unlock(&sk->sk_receive_queue.lock);
724
725         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
726
727         switch (po->tp_version) {
728         case TPACKET_V1:
729                 h.h1->tp_len = skb->len;
730                 h.h1->tp_snaplen = snaplen;
731                 h.h1->tp_mac = macoff;
732                 h.h1->tp_net = netoff;
733                 if (skb->tstamp.tv64)
734                         tv = ktime_to_timeval(skb->tstamp);
735                 else
736                         do_gettimeofday(&tv);
737                 h.h1->tp_sec = tv.tv_sec;
738                 h.h1->tp_usec = tv.tv_usec;
739                 hdrlen = sizeof(*h.h1);
740                 break;
741         case TPACKET_V2:
742                 h.h2->tp_len = skb->len;
743                 h.h2->tp_snaplen = snaplen;
744                 h.h2->tp_mac = macoff;
745                 h.h2->tp_net = netoff;
746                 if (skb->tstamp.tv64)
747                         ts = ktime_to_timespec(skb->tstamp);
748                 else
749                         getnstimeofday(&ts);
750                 h.h2->tp_sec = ts.tv_sec;
751                 h.h2->tp_nsec = ts.tv_nsec;
752                 h.h2->tp_vlan_tci = vlan_tx_tag_get(skb);
753                 hdrlen = sizeof(*h.h2);
754                 break;
755         default:
756                 BUG();
757         }
758
759         sll = h.raw + TPACKET_ALIGN(hdrlen);
760         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
761         sll->sll_family = AF_PACKET;
762         sll->sll_hatype = dev->type;
763         sll->sll_protocol = skb->protocol;
764         sll->sll_pkttype = skb->pkt_type;
765         if (unlikely(po->origdev))
766                 sll->sll_ifindex = orig_dev->ifindex;
767         else
768                 sll->sll_ifindex = dev->ifindex;
769
770         __packet_set_status(po, h.raw, status);
771         smp_mb();
772         {
773                 struct page *p_start, *p_end;
774                 u8 *h_end = h.raw + macoff + snaplen - 1;
775
776                 p_start = virt_to_page(h.raw);
777                 p_end = virt_to_page(h_end);
778                 while (p_start <= p_end) {
779                         flush_dcache_page(p_start);
780                         p_start++;
781                 }
782         }
783
784         sk->sk_data_ready(sk, 0);
785
786 drop_n_restore:
787         if (skb_head != skb->data && skb_shared(skb)) {
788                 skb->data = skb_head;
789                 skb->len = skb_len;
790         }
791 drop:
792         kfree_skb(skb);
793         return 0;
794
795 ring_is_full:
796         po->stats.tp_drops++;
797         spin_unlock(&sk->sk_receive_queue.lock);
798
799         sk->sk_data_ready(sk, 0);
800         kfree_skb(copy_skb);
801         goto drop_n_restore;
802 }
803
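/* Destructor for skbs built from the TX ring: once the driver has freed
 * the skb, hand the backing frame back to user space by setting it to
 * TP_STATUS_AVAILABLE and drop it from the pending count.
 */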
804 static void tpacket_destruct_skb(struct sk_buff *skb)
805 {
806         struct packet_sock *po = pkt_sk(skb->sk);
807         void *ph;
808
809         BUG_ON(skb == NULL);
810
811         if (likely(po->tx_ring.pg_vec)) {
812                 ph = skb_shinfo(skb)->destructor_arg;
813                 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
814                 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
815                 atomic_dec(&po->tx_ring.pending);
816                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
817         }
818
819         sock_wfree(skb);
820 }
821
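/* Build an skb for one TX ring frame.  The first hard_header_len bytes of
 * frame data are copied into the linear head (or a header is constructed
 * via dev_hard_header() for SOCK_DGRAM); the rest is attached page by
 * page as fragments, so the ring memory itself is handed to the device
 * without copying.
 */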
822 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
823                 void *frame, struct net_device *dev, int size_max,
824                 __be16 proto, unsigned char *addr)
825 {
826         union {
827                 struct tpacket_hdr *h1;
828                 struct tpacket2_hdr *h2;
829                 void *raw;
830         } ph;
831         int to_write, offset, len, tp_len, nr_frags, len_max;
832         struct socket *sock = po->sk.sk_socket;
833         struct page *page;
834         void *data;
835         int err;
836
837         ph.raw = frame;
838
839         skb->protocol = proto;
840         skb->dev = dev;
841         skb->priority = po->sk.sk_priority;
842         skb->mark = po->sk.sk_mark;
843         skb_shinfo(skb)->destructor_arg = ph.raw;
844
845         switch (po->tp_version) {
846         case TPACKET_V2:
847                 tp_len = ph.h2->tp_len;
848                 break;
849         default:
850                 tp_len = ph.h1->tp_len;
851                 break;
852         }
853         if (unlikely(tp_len > size_max)) {
854                 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
855                 return -EMSGSIZE;
856         }
857
858         skb_reserve(skb, LL_RESERVED_SPACE(dev));
859         skb_reset_network_header(skb);
860
861         data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
862         to_write = tp_len;
863
864         if (sock->type == SOCK_DGRAM) {
865                 err = dev_hard_header(skb, dev, ntohs(proto), addr,
866                                 NULL, tp_len);
867                 if (unlikely(err < 0))
868                         return -EINVAL;
869         } else if (dev->hard_header_len) {
870                 /* net device doesn't like empty head */
871                 if (unlikely(tp_len <= dev->hard_header_len)) {
872                         pr_err("packet size is too short (%d < %d)\n",
873                                tp_len, dev->hard_header_len);
874                         return -EINVAL;
875                 }
876
877                 skb_push(skb, dev->hard_header_len);
878                 err = skb_store_bits(skb, 0, data,
879                                 dev->hard_header_len);
880                 if (unlikely(err))
881                         return err;
882
883                 data += dev->hard_header_len;
884                 to_write -= dev->hard_header_len;
885         }
886
887         err = -EFAULT;
888         page = virt_to_page(data);
889         offset = offset_in_page(data);
890         len_max = PAGE_SIZE - offset;
891         len = ((to_write > len_max) ? len_max : to_write);
892
893         skb->data_len = to_write;
894         skb->len += to_write;
895         skb->truesize += to_write;
896         atomic_add(to_write, &po->sk.sk_wmem_alloc);
897
898         while (likely(to_write)) {
899                 nr_frags = skb_shinfo(skb)->nr_frags;
900
901                 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
902                         pr_err("Packet exceeds the number of skb frags (%lu)\n",
903                                MAX_SKB_FRAGS);
904                         return -EFAULT;
905                 }
906
907                 flush_dcache_page(page);
908                 get_page(page);
909                 skb_fill_page_desc(skb,
910                                 nr_frags,
911                                 page++, offset, len);
912                 to_write -= len;
913                 offset = 0;
914                 len_max = PAGE_SIZE;
915                 len = ((to_write > len_max) ? len_max : to_write);
916         }
917
918         return tp_len;
919 }
920
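/* Transmit path for a mapped TX ring: walk the ring, turn every frame
 * marked TP_STATUS_SEND_REQUEST into an skb via tpacket_fill_skb(), mark
 * it TP_STATUS_SENDING and pass it to dev_queue_xmit().  Unless
 * MSG_DONTWAIT is set, keep going until the ring is drained and all
 * pending frames have completed.  Returns the total number of bytes
 * queued.
 */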
921 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
922 {
923         struct socket *sock;
924         struct sk_buff *skb;
925         struct net_device *dev;
926         __be16 proto;
927         int ifindex, err, reserve = 0;
928         void *ph;
929         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
930         int tp_len, size_max;
931         unsigned char *addr;
932         int len_sum = 0;
933         int status = 0;
934
935         sock = po->sk.sk_socket;
936
937         mutex_lock(&po->pg_vec_lock);
938
939         err = -EBUSY;
940         if (saddr == NULL) {
941                 ifindex = po->ifindex;
942                 proto   = po->num;
943                 addr    = NULL;
944         } else {
945                 err = -EINVAL;
946                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
947                         goto out;
948                 if (msg->msg_namelen < (saddr->sll_halen
949                                         + offsetof(struct sockaddr_ll,
950                                                 sll_addr)))
951                         goto out;
952                 ifindex = saddr->sll_ifindex;
953                 proto   = saddr->sll_protocol;
954                 addr    = saddr->sll_addr;
955         }
956
957         dev = dev_get_by_index(sock_net(&po->sk), ifindex);
958         err = -ENXIO;
959         if (unlikely(dev == NULL))
960                 goto out;
961
962         reserve = dev->hard_header_len;
963
964         err = -ENETDOWN;
965         if (unlikely(!(dev->flags & IFF_UP)))
966                 goto out_put;
967
968         size_max = po->tx_ring.frame_size
969                 - (po->tp_hdrlen - sizeof(struct sockaddr_ll));
970
971         if (size_max > dev->mtu + reserve)
972                 size_max = dev->mtu + reserve;
973
974         do {
975                 ph = packet_current_frame(po, &po->tx_ring,
976                                 TP_STATUS_SEND_REQUEST);
977
978                 if (unlikely(ph == NULL)) {
979                         schedule();
980                         continue;
981                 }
982
983                 status = TP_STATUS_SEND_REQUEST;
984                 skb = sock_alloc_send_skb(&po->sk,
985                                 LL_ALLOCATED_SPACE(dev)
986                                 + sizeof(struct sockaddr_ll),
987                                 0, &err);
988
989                 if (unlikely(skb == NULL))
990                         goto out_status;
991
992                 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
993                                 addr);
994
995                 if (unlikely(tp_len < 0)) {
996                         if (po->tp_loss) {
997                                 __packet_set_status(po, ph,
998                                                 TP_STATUS_AVAILABLE);
999                                 packet_increment_head(&po->tx_ring);
1000                                 kfree_skb(skb);
1001                                 continue;
1002                         } else {
1003                                 status = TP_STATUS_WRONG_FORMAT;
1004                                 err = tp_len;
1005                                 goto out_status;
1006                         }
1007                 }
1008
1009                 skb->destructor = tpacket_destruct_skb;
1010                 __packet_set_status(po, ph, TP_STATUS_SENDING);
1011                 atomic_inc(&po->tx_ring.pending);
1012
1013                 status = TP_STATUS_SEND_REQUEST;
1014                 err = dev_queue_xmit(skb);
1015                 if (unlikely(err > 0)) {
1016                         err = net_xmit_errno(err);
1017                         if (err && __packet_get_status(po, ph) ==
1018                                    TP_STATUS_AVAILABLE) {
1019                                 /* skb was destructed already */
1020                                 skb = NULL;
1021                                 goto out_status;
1022                         }
1023                         /*
1024                          * skb was dropped but not destructed yet;
1025                          * let's treat it like congestion or err < 0
1026                          */
1027                         err = 0;
1028                 }
1029                 packet_increment_head(&po->tx_ring);
1030                 len_sum += tp_len;
1031         } while (likely((ph != NULL) ||
1032                         ((!(msg->msg_flags & MSG_DONTWAIT)) &&
1033                          (atomic_read(&po->tx_ring.pending))))
1034                 );
1035
1036         err = len_sum;
1037         goto out_put;
1038
1039 out_status:
1040         __packet_set_status(po, ph, status);
1041         kfree_skb(skb);
1042 out_put:
1043         dev_put(dev);
1044 out:
1045         mutex_unlock(&po->pg_vec_lock);
1046         return err;
1047 }
1048
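/* Allocate an skb with 'reserve' bytes of headroom, 'linear' bytes in the
 * linear area and the remainder as paged data; small packets are made
 * entirely linear.
 */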
1049 static inline struct sk_buff *packet_alloc_skb(struct sock *sk, size_t prepad,
1050                                                size_t reserve, size_t len,
1051                                                size_t linear, int noblock,
1052                                                int *err)
1053 {
1054         struct sk_buff *skb;
1055
1056         /* Under a page?  Don't bother with paged skb. */
1057         if (prepad + len < PAGE_SIZE || !linear)
1058                 linear = len;
1059
1060         skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
1061                                    err);
1062         if (!skb)
1063                 return NULL;
1064
1065         skb_reserve(skb, reserve);
1066         skb_put(skb, linear);
1067         skb->data_len = len - linear;
1068         skb->len += len - linear;
1069
1070         return skb;
1071 }
1072
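/* Ordinary (non-ring) sendmsg path: resolve the target device, copy the
 * user data into a freshly allocated skb, optionally parse a leading
 * virtio_net_hdr for checksum/GSO offload hints (PACKET_VNET_HDR sockets),
 * and hand the result to dev_queue_xmit().
 */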
1073 static int packet_snd(struct socket *sock,
1074                           struct msghdr *msg, size_t len)
1075 {
1076         struct sock *sk = sock->sk;
1077         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1078         struct sk_buff *skb;
1079         struct net_device *dev;
1080         __be16 proto;
1081         unsigned char *addr;
1082         int ifindex, err, reserve = 0;
1083         struct virtio_net_hdr vnet_hdr = { 0 };
1084         int offset = 0;
1085         int vnet_hdr_len;
1086         struct packet_sock *po = pkt_sk(sk);
1087         unsigned short gso_type = 0;
1088
1089         /*
1090          *      Get and verify the address.
1091          */
1092
1093         if (saddr == NULL) {
1094                 ifindex = po->ifindex;
1095                 proto   = po->num;
1096                 addr    = NULL;
1097         } else {
1098                 err = -EINVAL;
1099                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1100                         goto out;
1101                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1102                         goto out;
1103                 ifindex = saddr->sll_ifindex;
1104                 proto   = saddr->sll_protocol;
1105                 addr    = saddr->sll_addr;
1106         }
1107
1108
1109         dev = dev_get_by_index(sock_net(sk), ifindex);
1110         err = -ENXIO;
1111         if (dev == NULL)
1112                 goto out_unlock;
1113         if (sock->type == SOCK_RAW)
1114                 reserve = dev->hard_header_len;
1115
1116         err = -ENETDOWN;
1117         if (!(dev->flags & IFF_UP))
1118                 goto out_unlock;
1119
1120         if (po->has_vnet_hdr) {
1121                 vnet_hdr_len = sizeof(vnet_hdr);
1122
1123                 err = -EINVAL;
1124                 if (len < vnet_hdr_len)
1125                         goto out_unlock;
1126
1127                 len -= vnet_hdr_len;
1128
1129                 err = memcpy_fromiovec((void *)&vnet_hdr, msg->msg_iov,
1130                                        vnet_hdr_len);
1131                 if (err < 0)
1132                         goto out_unlock;
1133
1134                 if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
1135                     (vnet_hdr.csum_start + vnet_hdr.csum_offset + 2 >
1136                       vnet_hdr.hdr_len))
1137                         vnet_hdr.hdr_len = vnet_hdr.csum_start +
1138                                                  vnet_hdr.csum_offset + 2;
1139
1140                 err = -EINVAL;
1141                 if (vnet_hdr.hdr_len > len)
1142                         goto out_unlock;
1143
1144                 if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
1145                         switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
1146                         case VIRTIO_NET_HDR_GSO_TCPV4:
1147                                 gso_type = SKB_GSO_TCPV4;
1148                                 break;
1149                         case VIRTIO_NET_HDR_GSO_TCPV6:
1150                                 gso_type = SKB_GSO_TCPV6;
1151                                 break;
1152                         case VIRTIO_NET_HDR_GSO_UDP:
1153                                 gso_type = SKB_GSO_UDP;
1154                                 break;
1155                         default:
1156                                 goto out_unlock;
1157                         }
1158
1159                         if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
1160                                 gso_type |= SKB_GSO_TCP_ECN;
1161
1162                         if (vnet_hdr.gso_size == 0)
1163                                 goto out_unlock;
1164
1165                 }
1166         }
1167
1168         err = -EMSGSIZE;
1169         if (!gso_type && (len > dev->mtu+reserve))
1170                 goto out_unlock;
1171
1172         err = -ENOBUFS;
1173         skb = packet_alloc_skb(sk, LL_ALLOCATED_SPACE(dev),
1174                                LL_RESERVED_SPACE(dev), len, vnet_hdr.hdr_len,
1175                                msg->msg_flags & MSG_DONTWAIT, &err);
1176         if (skb == NULL)
1177                 goto out_unlock;
1178
1179         skb_set_network_header(skb, reserve);
1180
1181         err = -EINVAL;
1182         if (sock->type == SOCK_DGRAM &&
1183             (offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len)) < 0)
1184                 goto out_free;
1185
1186         /* Returns -EFAULT on error */
1187         err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len);
1188         if (err)
1189                 goto out_free;
1190
1191         skb->protocol = proto;
1192         skb->dev = dev;
1193         skb->priority = sk->sk_priority;
1194         skb->mark = sk->sk_mark;
1195
1196         if (po->has_vnet_hdr) {
1197                 if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
1198                         if (!skb_partial_csum_set(skb, vnet_hdr.csum_start,
1199                                                   vnet_hdr.csum_offset)) {
1200                                 err = -EINVAL;
1201                                 goto out_free;
1202                         }
1203                 }
1204
1205                 skb_shinfo(skb)->gso_size = vnet_hdr.gso_size;
1206                 skb_shinfo(skb)->gso_type = gso_type;
1207
1208                 /* Header must be checked, and gso_segs computed. */
1209                 skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
1210                 skb_shinfo(skb)->gso_segs = 0;
1211
1212                 len += vnet_hdr_len;
1213         }
1214
1215         /*
1216          *      Now send it
1217          */
1218
1219         err = dev_queue_xmit(skb);
1220         if (err > 0 && (err = net_xmit_errno(err)) != 0)
1221                 goto out_unlock;
1222
1223         dev_put(dev);
1224
1225         return len;
1226
1227 out_free:
1228         kfree_skb(skb);
1229 out_unlock:
1230         if (dev)
1231                 dev_put(dev);
1232 out:
1233         return err;
1234 }
1235
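/* sendmsg() entry point: use the TX ring path when one is mapped,
 * otherwise fall back to the ordinary packet_snd() path.
 */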
1236 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1237                 struct msghdr *msg, size_t len)
1238 {
1239         struct sock *sk = sock->sk;
1240         struct packet_sock *po = pkt_sk(sk);
1241         if (po->tx_ring.pg_vec)
1242                 return tpacket_snd(po, msg);
1243         else
1244                 return packet_snd(sock, msg, len);
1245 }
1246
1247 /*
1248  *      Close a PACKET socket. This is fairly simple. We immediately go
1249  *      to 'closed' state and remove our protocol entry in the device list.
1250  */
1251
1252 static int packet_release(struct socket *sock)
1253 {
1254         struct sock *sk = sock->sk;
1255         struct packet_sock *po;
1256         struct net *net;
1257         struct tpacket_req req;
1258
1259         if (!sk)
1260                 return 0;
1261
1262         net = sock_net(sk);
1263         po = pkt_sk(sk);
1264
1265         write_lock_bh(&net->packet.sklist_lock);
1266         sk_del_node_init(sk);
1267         sock_prot_inuse_add(net, sk->sk_prot, -1);
1268         write_unlock_bh(&net->packet.sklist_lock);
1269
1270         /*
1271          *      Unhook packet receive handler.
1272          */
1273
1274         if (po->running) {
1275                 /*
1276                  *      Remove the protocol hook
1277                  */
1278                 dev_remove_pack(&po->prot_hook);
1279                 po->running = 0;
1280                 po->num = 0;
1281                 __sock_put(sk);
1282         }
1283
1284         packet_flush_mclist(sk);
1285
1286         memset(&req, 0, sizeof(req));
1287
1288         if (po->rx_ring.pg_vec)
1289                 packet_set_ring(sk, &req, 1, 0);
1290
1291         if (po->tx_ring.pg_vec)
1292                 packet_set_ring(sk, &req, 1, 1);
1293
1294         /*
1295          *      Now the socket is dead. No more input will appear.
1296          */
1297
1298         sock_orphan(sk);
1299         sock->sk = NULL;
1300
1301         /* Purge queues */
1302
1303         skb_queue_purge(&sk->sk_receive_queue);
1304         sk_refcnt_debug_release(sk);
1305
1306         sock_put(sk);
1307         return 0;
1308 }
1309
1310 /*
1311  *      Attach a packet hook.
1312  */
1313
1314 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1315 {
1316         struct packet_sock *po = pkt_sk(sk);
1317         /*
1318          *      Detach an existing hook if present.
1319          */
1320
1321         lock_sock(sk);
1322
1323         spin_lock(&po->bind_lock);
1324         if (po->running) {
1325                 __sock_put(sk);
1326                 po->running = 0;
1327                 po->num = 0;
1328                 spin_unlock(&po->bind_lock);
1329                 dev_remove_pack(&po->prot_hook);
1330                 spin_lock(&po->bind_lock);
1331         }
1332
1333         po->num = protocol;
1334         po->prot_hook.type = protocol;
1335         po->prot_hook.dev = dev;
1336
1337         po->ifindex = dev ? dev->ifindex : 0;
1338
1339         if (protocol == 0)
1340                 goto out_unlock;
1341
1342         if (!dev || (dev->flags & IFF_UP)) {
1343                 dev_add_pack(&po->prot_hook);
1344                 sock_hold(sk);
1345                 po->running = 1;
1346         } else {
1347                 sk->sk_err = ENETDOWN;
1348                 if (!sock_flag(sk, SOCK_DEAD))
1349                         sk->sk_error_report(sk);
1350         }
1351
1352 out_unlock:
1353         spin_unlock(&po->bind_lock);
1354         release_sock(sk);
1355         return 0;
1356 }
1357
1358 /*
1359  *      Bind a packet socket to a device
1360  */
1361
1362 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1363                             int addr_len)
1364 {
1365         struct sock *sk = sock->sk;
1366         char name[15];
1367         struct net_device *dev;
1368         int err = -ENODEV;
1369
1370         /*
1371          *      Check legality
1372          */
1373
1374         if (addr_len != sizeof(struct sockaddr))
1375                 return -EINVAL;
1376         strlcpy(name, uaddr->sa_data, sizeof(name));
1377
1378         dev = dev_get_by_name(sock_net(sk), name);
1379         if (dev) {
1380                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1381                 dev_put(dev);
1382         }
1383         return err;
1384 }
1385
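/* bind() for AF_PACKET sockets proper: look up the interface by
 * sll_ifindex (0 means any device) and (re)attach the protocol hook,
 * using sll_protocol or, if it is zero, the currently bound protocol.
 */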
1386 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1387 {
1388         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1389         struct sock *sk = sock->sk;
1390         struct net_device *dev = NULL;
1391         int err;
1392
1393
1394         /*
1395          *      Check legality
1396          */
1397
1398         if (addr_len < sizeof(struct sockaddr_ll))
1399                 return -EINVAL;
1400         if (sll->sll_family != AF_PACKET)
1401                 return -EINVAL;
1402
1403         if (sll->sll_ifindex) {
1404                 err = -ENODEV;
1405                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1406                 if (dev == NULL)
1407                         goto out;
1408         }
1409         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1410         if (dev)
1411                 dev_put(dev);
1412
1413 out:
1414         return err;
1415 }
1416
1417 static struct proto packet_proto = {
1418         .name     = "PACKET",
1419         .owner    = THIS_MODULE,
1420         .obj_size = sizeof(struct packet_sock),
1421 };
1422
1423 /*
1424  *      Create a packet of type SOCK_PACKET.
1425  */
1426
1427 static int packet_create(struct net *net, struct socket *sock, int protocol,
1428                          int kern)
1429 {
1430         struct sock *sk;
1431         struct packet_sock *po;
1432         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1433         int err;
1434
1435         if (!capable(CAP_NET_RAW))
1436                 return -EPERM;
1437         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1438             sock->type != SOCK_PACKET)
1439                 return -ESOCKTNOSUPPORT;
1440
1441         sock->state = SS_UNCONNECTED;
1442
1443         err = -ENOBUFS;
1444         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1445         if (sk == NULL)
1446                 goto out;
1447
1448         sock->ops = &packet_ops;
1449         if (sock->type == SOCK_PACKET)
1450                 sock->ops = &packet_ops_spkt;
1451
1452         sock_init_data(sock, sk);
1453
1454         po = pkt_sk(sk);
1455         sk->sk_family = PF_PACKET;
1456         po->num = proto;
1457
1458         sk->sk_destruct = packet_sock_destruct;
1459         sk_refcnt_debug_inc(sk);
1460
1461         /*
1462          *      Attach a protocol block
1463          */
1464
1465         spin_lock_init(&po->bind_lock);
1466         mutex_init(&po->pg_vec_lock);
1467         po->prot_hook.func = packet_rcv;
1468
1469         if (sock->type == SOCK_PACKET)
1470                 po->prot_hook.func = packet_rcv_spkt;
1471
1472         po->prot_hook.af_packet_priv = sk;
1473
1474         if (proto) {
1475                 po->prot_hook.type = proto;
1476                 dev_add_pack(&po->prot_hook);
1477                 sock_hold(sk);
1478                 po->running = 1;
1479         }
1480
1481         write_lock_bh(&net->packet.sklist_lock);
1482         sk_add_node(sk, &net->packet.sklist);
1483         sock_prot_inuse_add(net, &packet_proto, 1);
1484         write_unlock_bh(&net->packet.sklist_lock);
1485         return 0;
1486 out:
1487         return err;
1488 }
1489
1490 /*
1491  *      Pull a packet from our receive queue and hand it to the user.
1492  *      If necessary we block.
1493  */
1494
1495 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1496                           struct msghdr *msg, size_t len, int flags)
1497 {
1498         struct sock *sk = sock->sk;
1499         struct sk_buff *skb;
1500         int copied, err;
1501         struct sockaddr_ll *sll;
1502         int vnet_hdr_len = 0;
1503
1504         err = -EINVAL;
1505         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1506                 goto out;
1507
1508 #if 0
1509         /* What error should we return now? EUNATTACH? */
1510         if (pkt_sk(sk)->ifindex < 0)
1511                 return -ENODEV;
1512 #endif
1513
1514         /*
1515          *      Call the generic datagram receiver. This handles all sorts
1516          *      of horrible races and re-entrancy so we can forget about it
1517          *      in the protocol layers.
1518          *
1519          *      Now it will return ENETDOWN if the device has just gone down,
1520          *      but then it will block.
1521          */
1522
1523         skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1524
1525         /*
1526          *      An error occurred, so return it. Because skb_recv_datagram()
1527          *      handles the blocking, we don't have to see or worry about
1528          *      blocking retries.
1529          */
1530
1531         if (skb == NULL)
1532                 goto out;
1533
1534         if (pkt_sk(sk)->has_vnet_hdr) {
1535                 struct virtio_net_hdr vnet_hdr = { 0 };
1536
1537                 err = -EINVAL;
1538                 vnet_hdr_len = sizeof(vnet_hdr);
1539                 if ((len -= vnet_hdr_len) < 0)
1540                         goto out_free;
1541
1542                 if (skb_is_gso(skb)) {
1543                         struct skb_shared_info *sinfo = skb_shinfo(skb);
1544
1545                         /* This is a hint as to how much should be linear. */
1546                         vnet_hdr.hdr_len = skb_headlen(skb);
1547                         vnet_hdr.gso_size = sinfo->gso_size;
1548                         if (sinfo->gso_type & SKB_GSO_TCPV4)
1549                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
1550                         else if (sinfo->gso_type & SKB_GSO_TCPV6)
1551                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
1552                         else if (sinfo->gso_type & SKB_GSO_UDP)
1553                                 vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
1554                         else if (sinfo->gso_type & SKB_GSO_FCOE)
1555                                 goto out_free;
1556                         else
1557                                 BUG();
1558                         if (sinfo->gso_type & SKB_GSO_TCP_ECN)
1559                                 vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
1560                 } else
1561                         vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
1562
1563                 if (skb->ip_summed == CHECKSUM_PARTIAL) {
1564                         vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
1565                         vnet_hdr.csum_start = skb->csum_start -
1566                                                         skb_headroom(skb);
1567                         vnet_hdr.csum_offset = skb->csum_offset;
1568                 } /* else everything is zero */
1569
1570                 err = memcpy_toiovec(msg->msg_iov, (void *)&vnet_hdr,
1571                                      vnet_hdr_len);
1572                 if (err < 0)
1573                         goto out_free;
1574         }
1575
1576         /*
1577          *      If the address length field is there to be filled in, we fill
1578          *      it in now.
1579          */
1580
1581         sll = &PACKET_SKB_CB(skb)->sa.ll;
1582         if (sock->type == SOCK_PACKET)
1583                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1584         else
1585                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1586
1587         /*
1588          *      Any data beyond the buffer you gave is lost. If this worries
1589          *      a user program, it can ask the device for its MTU anyway.
1590          */
1591
1592         copied = skb->len;
1593         if (copied > len) {
1594                 copied = len;
1595                 msg->msg_flags |= MSG_TRUNC;
1596         }
1597
1598         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1599         if (err)
1600                 goto out_free;
1601
1602         sock_recv_ts_and_drops(msg, sk, skb);
1603
1604         if (msg->msg_name)
1605                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1606                        msg->msg_namelen);
1607
1608         if (pkt_sk(sk)->auxdata) {
1609                 struct tpacket_auxdata aux;
1610
1611                 aux.tp_status = TP_STATUS_USER;
1612                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1613                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1614                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1615                 aux.tp_snaplen = skb->len;
1616                 aux.tp_mac = 0;
1617                 aux.tp_net = skb_network_offset(skb);
1618                 aux.tp_vlan_tci = vlan_tx_tag_get(skb);
1619
1620                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1621         }
1622
1623         /*
1624          *      Free or return the buffer as appropriate. Again this
1625          *      hides all the races and re-entrancy issues from us.
1626          */
1627         err = vnet_hdr_len + ((flags&MSG_TRUNC) ? skb->len : copied);
1628
1629 out_free:
1630         skb_free_datagram(sk, skb);
1631 out:
1632         return err;
1633 }
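
/*
 * Illustrative userspace sketch (not part of the kernel build) of consuming
 * the PACKET_AUXDATA control message and the MSG_TRUNC semantics implemented
 * above.  "fd" is assumed to be an already-bound AF_PACKET socket and the
 * 2048-byte buffer is an arbitrary choice:
 *
 *	int one = 1;
 *	struct cmsghdr *cmsg;
 *	struct tpacket_auxdata *aux;
 *	char frame[2048], ctrl[CMSG_SPACE(sizeof(*aux))];
 *	struct iovec iov = { .iov_base = frame, .iov_len = sizeof(frame) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = ctrl, .msg_controllen = sizeof(ctrl),
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *	ssize_t n = recvmsg(fd, &msg, MSG_TRUNC);
 *	// With MSG_TRUNC the return value is the full on-wire length, even if
 *	// the frame was cut down to fit the supplied buffer.
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA)
 *			aux = (struct tpacket_auxdata *)CMSG_DATA(cmsg);
 */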
1634
1635 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1636                                int *uaddr_len, int peer)
1637 {
1638         struct net_device *dev;
1639         struct sock *sk = sock->sk;
1640
1641         if (peer)
1642                 return -EOPNOTSUPP;
1643
1644         uaddr->sa_family = AF_PACKET;
1645         rcu_read_lock();
1646         dev = dev_get_by_index_rcu(sock_net(sk), pkt_sk(sk)->ifindex);
1647         if (dev)
1648                 strlcpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data));
1649         else
1650                 memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data));
1651         rcu_read_unlock();
1652         *uaddr_len = sizeof(*uaddr);
1653
1654         return 0;
1655 }
1656
1657 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1658                           int *uaddr_len, int peer)
1659 {
1660         struct net_device *dev;
1661         struct sock *sk = sock->sk;
1662         struct packet_sock *po = pkt_sk(sk);
1663         DECLARE_SOCKADDR(struct sockaddr_ll *, sll, uaddr);
1664
1665         if (peer)
1666                 return -EOPNOTSUPP;
1667
1668         sll->sll_family = AF_PACKET;
1669         sll->sll_ifindex = po->ifindex;
1670         sll->sll_protocol = po->num;
1671         rcu_read_lock();
1672         dev = dev_get_by_index_rcu(sock_net(sk), po->ifindex);
1673         if (dev) {
1674                 sll->sll_hatype = dev->type;
1675                 sll->sll_halen = dev->addr_len;
1676                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1677         } else {
1678                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1679                 sll->sll_halen = 0;
1680         }
1681         rcu_read_unlock();
1682         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1683
1684         return 0;
1685 }
1686
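/*
 * Apply (what == 1) or withdraw (what == -1) a single membership entry on
 * the device: a multicast address, promiscuous mode, all-multicast mode, or
 * a secondary unicast address.
 */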
1687 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1688                          int what)
1689 {
1690         switch (i->type) {
1691         case PACKET_MR_MULTICAST:
1692                 if (what > 0)
1693                         return dev_mc_add(dev, i->addr, i->alen, 0);
1694                 else
1695                         return dev_mc_delete(dev, i->addr, i->alen, 0);
1696                 break;
1697         case PACKET_MR_PROMISC:
1698                 return dev_set_promiscuity(dev, what);
1699                 break;
1700         case PACKET_MR_ALLMULTI:
1701                 return dev_set_allmulti(dev, what);
1702                 break;
1703         case PACKET_MR_UNICAST:
1704                 if (what > 0)
1705                         return dev_unicast_add(dev, i->addr);
1706                 else
1707                         return dev_unicast_delete(dev, i->addr);
1708                 break;
1709         default:
1710                 break;
1711         }
1712         return 0;
1713 }
1714
1715 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1716 {
1717         for ( ; i; i = i->next) {
1718                 if (i->ifindex == dev->ifindex)
1719                         packet_dev_mc(dev, i, what);
1720         }
1721 }
1722
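/*
 * Register one membership for this socket under RTNL.  A request identical
 * to an existing entry only bumps that entry's refcount; otherwise the new
 * entry is linked into po->mclist and applied to the device.
 *
 * Illustrative userspace sketch (assumptions: "fd" is an AF_PACKET socket
 * and "ifindex" came from if_nametoindex()):
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */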
1723 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1724 {
1725         struct packet_sock *po = pkt_sk(sk);
1726         struct packet_mclist *ml, *i;
1727         struct net_device *dev;
1728         int err;
1729
1730         rtnl_lock();
1731
1732         err = -ENODEV;
1733         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1734         if (!dev)
1735                 goto done;
1736
1737         err = -EINVAL;
1738         if (mreq->mr_alen > dev->addr_len)
1739                 goto done;
1740
1741         err = -ENOBUFS;
1742         i = kmalloc(sizeof(*i), GFP_KERNEL);
1743         if (i == NULL)
1744                 goto done;
1745
1746         err = 0;
1747         for (ml = po->mclist; ml; ml = ml->next) {
1748                 if (ml->ifindex == mreq->mr_ifindex &&
1749                     ml->type == mreq->mr_type &&
1750                     ml->alen == mreq->mr_alen &&
1751                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1752                         ml->count++;
1753                         /* Free the new element ... */
1754                         kfree(i);
1755                         goto done;
1756                 }
1757         }
1758
1759         i->type = mreq->mr_type;
1760         i->ifindex = mreq->mr_ifindex;
1761         i->alen = mreq->mr_alen;
1762         memcpy(i->addr, mreq->mr_address, i->alen);
1763         i->count = 1;
1764         i->next = po->mclist;
1765         po->mclist = i;
1766         err = packet_dev_mc(dev, i, 1);
1767         if (err) {
1768                 po->mclist = i->next;
1769                 kfree(i);
1770         }
1771
1772 done:
1773         rtnl_unlock();
1774         return err;
1775 }
1776
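/*
 * Drop one reference on a matching membership entry; the last reference
 * unlinks the entry and reverses the device-level change.
 */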
1777 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1778 {
1779         struct packet_mclist *ml, **mlp;
1780
1781         rtnl_lock();
1782
1783         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1784                 if (ml->ifindex == mreq->mr_ifindex &&
1785                     ml->type == mreq->mr_type &&
1786                     ml->alen == mreq->mr_alen &&
1787                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1788                         if (--ml->count == 0) {
1789                                 struct net_device *dev;
1790                                 *mlp = ml->next;
1791                                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1792                                 if (dev)
1793                                         packet_dev_mc(dev, ml, -1);
1794                                 kfree(ml);
1795                         }
1796                         rtnl_unlock();
1797                         return 0;
1798                 }
1799         }
1800         rtnl_unlock();
1801         return -EADDRNOTAVAIL;
1802 }
1803
1804 static void packet_flush_mclist(struct sock *sk)
1805 {
1806         struct packet_sock *po = pkt_sk(sk);
1807         struct packet_mclist *ml;
1808
1809         if (!po->mclist)
1810                 return;
1811
1812         rtnl_lock();
1813         while ((ml = po->mclist) != NULL) {
1814                 struct net_device *dev;
1815
1816                 po->mclist = ml->next;
1817                 dev = __dev_get_by_index(sock_net(sk), ml->ifindex);
1818                 if (dev != NULL)
1819                         packet_dev_mc(dev, ml, -1);
1820                 kfree(ml);
1821         }
1822         rtnl_unlock();
1823 }
1824
1825 static int
1826 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1827 {
1828         struct sock *sk = sock->sk;
1829         struct packet_sock *po = pkt_sk(sk);
1830         int ret;
1831
1832         if (level != SOL_PACKET)
1833                 return -ENOPROTOOPT;
1834
1835         switch (optname) {
1836         case PACKET_ADD_MEMBERSHIP:
1837         case PACKET_DROP_MEMBERSHIP:
1838         {
1839                 struct packet_mreq_max mreq;
1840                 int len = optlen;
1841                 memset(&mreq, 0, sizeof(mreq));
1842                 if (len < sizeof(struct packet_mreq))
1843                         return -EINVAL;
1844                 if (len > sizeof(mreq))
1845                         len = sizeof(mreq);
1846                 if (copy_from_user(&mreq, optval, len))
1847                         return -EFAULT;
1848                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1849                         return -EINVAL;
1850                 if (optname == PACKET_ADD_MEMBERSHIP)
1851                         ret = packet_mc_add(sk, &mreq);
1852                 else
1853                         ret = packet_mc_drop(sk, &mreq);
1854                 return ret;
1855         }
1856
1857         case PACKET_RX_RING:
1858         case PACKET_TX_RING:
1859         {
1860                 struct tpacket_req req;
1861
1862                 if (optlen < sizeof(req))
1863                         return -EINVAL;
1864                 if (pkt_sk(sk)->has_vnet_hdr)
1865                         return -EINVAL;
1866                 if (copy_from_user(&req, optval, sizeof(req)))
1867                         return -EFAULT;
1868                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1869         }
1870         case PACKET_COPY_THRESH:
1871         {
1872                 int val;
1873
1874                 if (optlen != sizeof(val))
1875                         return -EINVAL;
1876                 if (copy_from_user(&val, optval, sizeof(val)))
1877                         return -EFAULT;
1878
1879                 pkt_sk(sk)->copy_thresh = val;
1880                 return 0;
1881         }
1882         case PACKET_VERSION:
1883         {
1884                 int val;
1885
1886                 if (optlen != sizeof(val))
1887                         return -EINVAL;
1888                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1889                         return -EBUSY;
1890                 if (copy_from_user(&val, optval, sizeof(val)))
1891                         return -EFAULT;
1892                 switch (val) {
1893                 case TPACKET_V1:
1894                 case TPACKET_V2:
1895                         po->tp_version = val;
1896                         return 0;
1897                 default:
1898                         return -EINVAL;
1899                 }
1900         }
1901         case PACKET_RESERVE:
1902         {
1903                 unsigned int val;
1904
1905                 if (optlen != sizeof(val))
1906                         return -EINVAL;
1907                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1908                         return -EBUSY;
1909                 if (copy_from_user(&val, optval, sizeof(val)))
1910                         return -EFAULT;
1911                 po->tp_reserve = val;
1912                 return 0;
1913         }
1914         case PACKET_LOSS:
1915         {
1916                 unsigned int val;
1917
1918                 if (optlen != sizeof(val))
1919                         return -EINVAL;
1920                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1921                         return -EBUSY;
1922                 if (copy_from_user(&val, optval, sizeof(val)))
1923                         return -EFAULT;
1924                 po->tp_loss = !!val;
1925                 return 0;
1926         }
1927         case PACKET_AUXDATA:
1928         {
1929                 int val;
1930
1931                 if (optlen < sizeof(val))
1932                         return -EINVAL;
1933                 if (copy_from_user(&val, optval, sizeof(val)))
1934                         return -EFAULT;
1935
1936                 po->auxdata = !!val;
1937                 return 0;
1938         }
1939         case PACKET_ORIGDEV:
1940         {
1941                 int val;
1942
1943                 if (optlen < sizeof(val))
1944                         return -EINVAL;
1945                 if (copy_from_user(&val, optval, sizeof(val)))
1946                         return -EFAULT;
1947
1948                 po->origdev = !!val;
1949                 return 0;
1950         }
1951         case PACKET_VNET_HDR:
1952         {
1953                 int val;
1954
1955                 if (sock->type != SOCK_RAW)
1956                         return -EINVAL;
1957                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1958                         return -EBUSY;
1959                 if (optlen < sizeof(val))
1960                         return -EINVAL;
1961                 if (copy_from_user(&val, optval, sizeof(val)))
1962                         return -EFAULT;
1963
1964                 po->has_vnet_hdr = !!val;
1965                 return 0;
1966         }
1967         default:
1968                 return -ENOPROTOOPT;
1969         }
1970 }
1971
1972 static int packet_getsockopt(struct socket *sock, int level, int optname,
1973                              char __user *optval, int __user *optlen)
1974 {
1975         int len;
1976         int val;
1977         struct sock *sk = sock->sk;
1978         struct packet_sock *po = pkt_sk(sk);
1979         void *data;
1980         struct tpacket_stats st;
1981
1982         if (level != SOL_PACKET)
1983                 return -ENOPROTOOPT;
1984
1985         if (get_user(len, optlen))
1986                 return -EFAULT;
1987
1988         if (len < 0)
1989                 return -EINVAL;
1990
1991         switch (optname) {
1992         case PACKET_STATISTICS:
1993                 if (len > sizeof(struct tpacket_stats))
1994                         len = sizeof(struct tpacket_stats);
1995                 spin_lock_bh(&sk->sk_receive_queue.lock);
1996                 st = po->stats;
1997                 memset(&po->stats, 0, sizeof(st));
1998                 spin_unlock_bh(&sk->sk_receive_queue.lock);
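                /* tp_packets as reported to user space includes drops. */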
1999                 st.tp_packets += st.tp_drops;
2000
2001                 data = &st;
2002                 break;
2003         case PACKET_AUXDATA:
2004                 if (len > sizeof(int))
2005                         len = sizeof(int);
2006                 val = po->auxdata;
2007
2008                 data = &val;
2009                 break;
2010         case PACKET_ORIGDEV:
2011                 if (len > sizeof(int))
2012                         len = sizeof(int);
2013                 val = po->origdev;
2014
2015                 data = &val;
2016                 break;
2017         case PACKET_VNET_HDR:
2018                 if (len > sizeof(int))
2019                         len = sizeof(int);
2020                 val = po->has_vnet_hdr;
2021
2022                 data = &val;
2023                 break;
2024         case PACKET_VERSION:
2025                 if (len > sizeof(int))
2026                         len = sizeof(int);
2027                 val = po->tp_version;
2028                 data = &val;
2029                 break;
2030         case PACKET_HDRLEN:
2031                 if (len > sizeof(int))
2032                         len = sizeof(int);
2033                 if (copy_from_user(&val, optval, len))
2034                         return -EFAULT;
2035                 switch (val) {
2036                 case TPACKET_V1:
2037                         val = sizeof(struct tpacket_hdr);
2038                         break;
2039                 case TPACKET_V2:
2040                         val = sizeof(struct tpacket2_hdr);
2041                         break;
2042                 default:
2043                         return -EINVAL;
2044                 }
2045                 data = &val;
2046                 break;
2047         case PACKET_RESERVE:
2048                 if (len > sizeof(unsigned int))
2049                         len = sizeof(unsigned int);
2050                 val = po->tp_reserve;
2051                 data = &val;
2052                 break;
2053         case PACKET_LOSS:
2054                 if (len > sizeof(unsigned int))
2055                         len = sizeof(unsigned int);
2056                 val = po->tp_loss;
2057                 data = &val;
2058                 break;
2059         default:
2060                 return -ENOPROTOOPT;
2061         }
2062
2063         if (put_user(len, optlen))
2064                 return -EFAULT;
2065         if (copy_to_user(optval, data, len))
2066                 return -EFAULT;
2067         return 0;
2068 }
2069
2070
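/*
 * Netdevice notifier: when a device goes down or unregisters, detach packet
 * sockets bound to it (flagging ENETDOWN) and, on unregister, tear down any
 * memberships held on that device; when the device comes back up, re-attach
 * sockets that are still bound to its ifindex.
 */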
2071 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
2072 {
2073         struct sock *sk;
2074         struct hlist_node *node;
2075         struct net_device *dev = data;
2076         struct net *net = dev_net(dev);
2077
2078         read_lock(&net->packet.sklist_lock);
2079         sk_for_each(sk, node, &net->packet.sklist) {
2080                 struct packet_sock *po = pkt_sk(sk);
2081
2082                 switch (msg) {
2083                 case NETDEV_UNREGISTER:
2084                         if (po->mclist)
2085                                 packet_dev_mclist(dev, po->mclist, -1);
2086                         /* fallthrough */
2087
2088                 case NETDEV_DOWN:
2089                         if (dev->ifindex == po->ifindex) {
2090                                 spin_lock(&po->bind_lock);
2091                                 if (po->running) {
2092                                         __dev_remove_pack(&po->prot_hook);
2093                                         __sock_put(sk);
2094                                         po->running = 0;
2095                                         sk->sk_err = ENETDOWN;
2096                                         if (!sock_flag(sk, SOCK_DEAD))
2097                                                 sk->sk_error_report(sk);
2098                                 }
2099                                 if (msg == NETDEV_UNREGISTER) {
2100                                         po->ifindex = -1;
2101                                         po->prot_hook.dev = NULL;
2102                                 }
2103                                 spin_unlock(&po->bind_lock);
2104                         }
2105                         break;
2106                 case NETDEV_UP:
2107                         spin_lock(&po->bind_lock);
2108                         if (dev->ifindex == po->ifindex && po->num &&
2109                             !po->running) {
2110                                 dev_add_pack(&po->prot_hook);
2111                                 sock_hold(sk);
2112                                 po->running = 1;
2113                         }
2114                         spin_unlock(&po->bind_lock);
2115                         break;
2116                 }
2117         }
2118         read_unlock(&net->packet.sklist_lock);
2119         return NOTIFY_DONE;
2120 }
2121
2122
2123 static int packet_ioctl(struct socket *sock, unsigned int cmd,
2124                         unsigned long arg)
2125 {
2126         struct sock *sk = sock->sk;
2127
2128         switch (cmd) {
2129         case SIOCOUTQ:
2130         {
2131                 int amount = sk_wmem_alloc_get(sk);
2132
2133                 return put_user(amount, (int __user *)arg);
2134         }
2135         case SIOCINQ:
2136         {
2137                 struct sk_buff *skb;
2138                 int amount = 0;
2139
2140                 spin_lock_bh(&sk->sk_receive_queue.lock);
2141                 skb = skb_peek(&sk->sk_receive_queue);
2142                 if (skb)
2143                         amount = skb->len;
2144                 spin_unlock_bh(&sk->sk_receive_queue.lock);
2145                 return put_user(amount, (int __user *)arg);
2146         }
2147         case SIOCGSTAMP:
2148                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2149         case SIOCGSTAMPNS:
2150                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2151
2152 #ifdef CONFIG_INET
2153         case SIOCADDRT:
2154         case SIOCDELRT:
2155         case SIOCDARP:
2156         case SIOCGARP:
2157         case SIOCSARP:
2158         case SIOCGIFADDR:
2159         case SIOCSIFADDR:
2160         case SIOCGIFBRDADDR:
2161         case SIOCSIFBRDADDR:
2162         case SIOCGIFNETMASK:
2163         case SIOCSIFNETMASK:
2164         case SIOCGIFDSTADDR:
2165         case SIOCSIFDSTADDR:
2166         case SIOCSIFFLAGS:
2167                 if (!net_eq(sock_net(sk), &init_net))
2168                         return -ENOIOCTLCMD;
2169                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2170 #endif
2171
2172         default:
2173                 return -ENOIOCTLCMD;
2174         }
2175         return 0;
2176 }
2177
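/*
 * On top of the normal datagram poll state: with an RX ring, the socket is
 * readable once the most recently filled frame has been handed to user
 * space; with a TX ring, it is writable while the current frame is still
 * free for user space to fill.
 */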
2178 static unsigned int packet_poll(struct file *file, struct socket *sock,
2179                                 poll_table *wait)
2180 {
2181         struct sock *sk = sock->sk;
2182         struct packet_sock *po = pkt_sk(sk);
2183         unsigned int mask = datagram_poll(file, sock, wait);
2184
2185         spin_lock_bh(&sk->sk_receive_queue.lock);
2186         if (po->rx_ring.pg_vec) {
2187                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2188                         mask |= POLLIN | POLLRDNORM;
2189         }
2190         spin_unlock_bh(&sk->sk_receive_queue.lock);
2191         spin_lock_bh(&sk->sk_write_queue.lock);
2192         if (po->tx_ring.pg_vec) {
2193                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2194                         mask |= POLLOUT | POLLWRNORM;
2195         }
2196         spin_unlock_bh(&sk->sk_write_queue.lock);
2197         return mask;
2198 }
2199
2200
2201 /* Dirty? Well, I still have not found a better way to account
2202  * for user mmaps.
2203  */
2204
2205 static void packet_mm_open(struct vm_area_struct *vma)
2206 {
2207         struct file *file = vma->vm_file;
2208         struct socket *sock = file->private_data;
2209         struct sock *sk = sock->sk;
2210
2211         if (sk)
2212                 atomic_inc(&pkt_sk(sk)->mapped);
2213 }
2214
2215 static void packet_mm_close(struct vm_area_struct *vma)
2216 {
2217         struct file *file = vma->vm_file;
2218         struct socket *sock = file->private_data;
2219         struct sock *sk = sock->sk;
2220
2221         if (sk)
2222                 atomic_dec(&pkt_sk(sk)->mapped);
2223 }
2224
2225 static const struct vm_operations_struct packet_mmap_ops = {
2226         .open   =       packet_mm_open,
2227         .close  =       packet_mm_close,
2228 };
2229
2230 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2231 {
2232         int i;
2233
2234         for (i = 0; i < len; i++) {
2235                 if (likely(pg_vec[i]))
2236                         free_pages((unsigned long) pg_vec[i], order);
2237         }
2238         kfree(pg_vec);
2239 }
2240
2241 static inline char *alloc_one_pg_vec_page(unsigned long order)
2242 {
2243         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2244
2245         return (char *) __get_free_pages(gfp_flags, order);
2246 }
2247
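/*
 * Allocate tp_block_nr blocks of 2^order pages each; on any failure the
 * partially built vector is freed and NULL is returned.
 */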
2248 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2249 {
2250         unsigned int block_nr = req->tp_block_nr;
2251         char **pg_vec;
2252         int i;
2253
2254         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
2255         if (unlikely(!pg_vec))
2256                 goto out;
2257
2258         for (i = 0; i < block_nr; i++) {
2259                 pg_vec[i] = alloc_one_pg_vec_page(order);
2260                 if (unlikely(!pg_vec[i]))
2261                         goto out_free_pgvec;
2262         }
2263
2264 out:
2265         return pg_vec;
2266
2267 out_free_pgvec:
2268         free_pg_vec(pg_vec, order, block_nr);
2269         pg_vec = NULL;
2270         goto out;
2271 }
2272
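/*
 * Create or tear down an RX/TX ring.  The socket is temporarily detached
 * from its protocol hook, the old page vector is swapped out under
 * pg_vec_lock (refused while the ring is still mmapped, unless the socket
 * is being released), and the hook is then restored.  A request with
 * tp_block_nr == 0 releases the ring.
 */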
2273 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2274                 int closing, int tx_ring)
2275 {
2276         char **pg_vec = NULL;
2277         struct packet_sock *po = pkt_sk(sk);
2278         int was_running, order = 0;
2279         struct packet_ring_buffer *rb;
2280         struct sk_buff_head *rb_queue;
2281         __be16 num;
2282         int err;
2283
2284         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2285         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2286
2287         err = -EBUSY;
2288         if (!closing) {
2289                 if (atomic_read(&po->mapped))
2290                         goto out;
2291                 if (atomic_read(&rb->pending))
2292                         goto out;
2293         }
2294
2295         if (req->tp_block_nr) {
2296                 /* Sanity tests and some calculations */
2297                 err = -EBUSY;
2298                 if (unlikely(rb->pg_vec))
2299                         goto out;
2300
2301                 switch (po->tp_version) {
2302                 case TPACKET_V1:
2303                         po->tp_hdrlen = TPACKET_HDRLEN;
2304                         break;
2305                 case TPACKET_V2:
2306                         po->tp_hdrlen = TPACKET2_HDRLEN;
2307                         break;
2308                 }
2309
2310                 err = -EINVAL;
2311                 if (unlikely((int)req->tp_block_size <= 0))
2312                         goto out;
2313                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2314                         goto out;
2315                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2316                                         po->tp_reserve))
2317                         goto out;
2318                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2319                         goto out;
2320
2321                 rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
2322                 if (unlikely(rb->frames_per_block <= 0))
2323                         goto out;
2324                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2325                                         req->tp_frame_nr))
2326                         goto out;
2327
2328                 err = -ENOMEM;
2329                 order = get_order(req->tp_block_size);
2330                 pg_vec = alloc_pg_vec(req, order);
2331                 if (unlikely(!pg_vec))
2332                         goto out;
2333         }
2334         /* tp_block_nr == 0: nothing to allocate, the ring is being torn down. */
2335         else {
2336                 err = -EINVAL;
2337                 if (unlikely(req->tp_frame_nr))
2338                         goto out;
2339         }
2340
2341         lock_sock(sk);
2342
2343         /* Detach socket from network */
2344         spin_lock(&po->bind_lock);
2345         was_running = po->running;
2346         num = po->num;
2347         if (was_running) {
2348                 __dev_remove_pack(&po->prot_hook);
2349                 po->num = 0;
2350                 po->running = 0;
2351                 __sock_put(sk);
2352         }
2353         spin_unlock(&po->bind_lock);
2354
2355         synchronize_net();
2356
2357         err = -EBUSY;
2358         mutex_lock(&po->pg_vec_lock);
2359         if (closing || atomic_read(&po->mapped) == 0) {
2360                 err = 0;
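                /* XC(a, b): store b in a and return a's previous value. */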
2361 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2362                 spin_lock_bh(&rb_queue->lock);
2363                 pg_vec = XC(rb->pg_vec, pg_vec);
2364                 rb->frame_max = (req->tp_frame_nr - 1);
2365                 rb->head = 0;
2366                 rb->frame_size = req->tp_frame_size;
2367                 spin_unlock_bh(&rb_queue->lock);
2368
2369                 order = XC(rb->pg_vec_order, order);
2370                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2371
2372                 rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
2373                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2374                                                 tpacket_rcv : packet_rcv;
2375                 skb_queue_purge(rb_queue);
2376 #undef XC
2377                 if (atomic_read(&po->mapped))
2378                         pr_err("packet_mmap: vma is busy: %d\n",
2379                                atomic_read(&po->mapped));
2380         }
2381         mutex_unlock(&po->pg_vec_lock);
2382
2383         spin_lock(&po->bind_lock);
2384         if (was_running && !po->running) {
2385                 sock_hold(sk);
2386                 po->running = 1;
2387                 po->num = num;
2388                 dev_add_pack(&po->prot_hook);
2389         }
2390         spin_unlock(&po->bind_lock);
2391
2392         release_sock(sk);
2393
2394         if (pg_vec)
2395                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2396 out:
2397         return err;
2398 }
2399
2400 static int packet_mmap(struct file *file, struct socket *sock,
2401                 struct vm_area_struct *vma)
2402 {
2403         struct sock *sk = sock->sk;
2404         struct packet_sock *po = pkt_sk(sk);
2405         unsigned long size, expected_size;
2406         struct packet_ring_buffer *rb;
2407         unsigned long start;
2408         int err = -EINVAL;
2409         int i;
2410
2411         if (vma->vm_pgoff)
2412                 return -EINVAL;
2413
2414         mutex_lock(&po->pg_vec_lock);
2415
2416         expected_size = 0;
2417         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2418                 if (rb->pg_vec) {
2419                         expected_size += rb->pg_vec_len
2420                                                 * rb->pg_vec_pages
2421                                                 * PAGE_SIZE;
2422                 }
2423         }
2424
2425         if (expected_size == 0)
2426                 goto out;
2427
2428         size = vma->vm_end - vma->vm_start;
2429         if (size != expected_size)
2430                 goto out;
2431
2432         start = vma->vm_start;
2433         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2434                 if (rb->pg_vec == NULL)
2435                         continue;
2436
2437                 for (i = 0; i < rb->pg_vec_len; i++) {
2438                         struct page *page = virt_to_page(rb->pg_vec[i]);
2439                         int pg_num;
2440
2441                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2442                                         pg_num++, page++) {
2443                                 err = vm_insert_page(vma, start, page);
2444                                 if (unlikely(err))
2445                                         goto out;
2446                                 start += PAGE_SIZE;
2447                         }
2448                 }
2449         }
2450
2451         atomic_inc(&po->mapped);
2452         vma->vm_ops = &packet_mmap_ops;
2453         err = 0;
2454
2455 out:
2456         mutex_unlock(&po->pg_vec_lock);
2457         return err;
2458 }
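
/*
 * Illustrative userspace sketch (not part of the kernel build) of setting up
 * and mapping an RX ring; the block/frame geometry below is an arbitrary
 * assumption and "fd" is an already-created AF_PACKET socket:
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 128,	// block_nr * (block_size / frame_size)
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	// Frame i starts at ring + i * tp_frame_size.  A frame belongs to
 *	// user space while tp_status has TP_STATUS_USER set, and is handed
 *	// back to the kernel by writing TP_STATUS_KERNEL.
 */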
2459
2460 static const struct proto_ops packet_ops_spkt = {
2461         .family =       PF_PACKET,
2462         .owner =        THIS_MODULE,
2463         .release =      packet_release,
2464         .bind =         packet_bind_spkt,
2465         .connect =      sock_no_connect,
2466         .socketpair =   sock_no_socketpair,
2467         .accept =       sock_no_accept,
2468         .getname =      packet_getname_spkt,
2469         .poll =         datagram_poll,
2470         .ioctl =        packet_ioctl,
2471         .listen =       sock_no_listen,
2472         .shutdown =     sock_no_shutdown,
2473         .setsockopt =   sock_no_setsockopt,
2474         .getsockopt =   sock_no_getsockopt,
2475         .sendmsg =      packet_sendmsg_spkt,
2476         .recvmsg =      packet_recvmsg,
2477         .mmap =         sock_no_mmap,
2478         .sendpage =     sock_no_sendpage,
2479 };
2480
2481 static const struct proto_ops packet_ops = {
2482         .family =       PF_PACKET,
2483         .owner =        THIS_MODULE,
2484         .release =      packet_release,
2485         .bind =         packet_bind,
2486         .connect =      sock_no_connect,
2487         .socketpair =   sock_no_socketpair,
2488         .accept =       sock_no_accept,
2489         .getname =      packet_getname,
2490         .poll =         packet_poll,
2491         .ioctl =        packet_ioctl,
2492         .listen =       sock_no_listen,
2493         .shutdown =     sock_no_shutdown,
2494         .setsockopt =   packet_setsockopt,
2495         .getsockopt =   packet_getsockopt,
2496         .sendmsg =      packet_sendmsg,
2497         .recvmsg =      packet_recvmsg,
2498         .mmap =         packet_mmap,
2499         .sendpage =     sock_no_sendpage,
2500 };
2501
2502 static const struct net_proto_family packet_family_ops = {
2503         .family =       PF_PACKET,
2504         .create =       packet_create,
2505         .owner  =       THIS_MODULE,
2506 };
2507
2508 static struct notifier_block packet_netdev_notifier = {
2509         .notifier_call =        packet_notifier,
2510 };
2511
2512 #ifdef CONFIG_PROC_FS
2513 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2514 {
2515         struct sock *s;
2516         struct hlist_node *node;
2517
2518         sk_for_each(s, node, &net->packet.sklist) {
2519                 if (!off--)
2520                         return s;
2521         }
2522         return NULL;
2523 }
2524
2525 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2526         __acquires(seq_file_net(seq)->packet.sklist_lock)
2527 {
2528         struct net *net = seq_file_net(seq);
2529         read_lock(&net->packet.sklist_lock);
2530         return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2531 }
2532
2533 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2534 {
2535         struct net *net = seq_file_net(seq);
2536         ++*pos;
2537         return (v == SEQ_START_TOKEN)
2538                 ? sk_head(&net->packet.sklist)
2539                 : sk_next((struct sock *)v);
2540 }
2541
2542 static void packet_seq_stop(struct seq_file *seq, void *v)
2543         __releases(seq_file_net(seq)->packet.sklist_lock)
2544 {
2545         struct net *net = seq_file_net(seq);
2546         read_unlock(&net->packet.sklist_lock);
2547 }
2548
2549 static int packet_seq_show(struct seq_file *seq, void *v)
2550 {
2551         if (v == SEQ_START_TOKEN)
2552                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2553         else {
2554                 struct sock *s = v;
2555                 const struct packet_sock *po = pkt_sk(s);
2556
2557                 seq_printf(seq,
2558                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2559                            s,
2560                            atomic_read(&s->sk_refcnt),
2561                            s->sk_type,
2562                            ntohs(po->num),
2563                            po->ifindex,
2564                            po->running,
2565                            atomic_read(&s->sk_rmem_alloc),
2566                            sock_i_uid(s),
2567                            sock_i_ino(s));
2568         }
2569
2570         return 0;
2571 }
2572
2573 static const struct seq_operations packet_seq_ops = {
2574         .start  = packet_seq_start,
2575         .next   = packet_seq_next,
2576         .stop   = packet_seq_stop,
2577         .show   = packet_seq_show,
2578 };
2579
2580 static int packet_seq_open(struct inode *inode, struct file *file)
2581 {
2582         return seq_open_net(inode, file, &packet_seq_ops,
2583                             sizeof(struct seq_net_private));
2584 }
2585
2586 static const struct file_operations packet_seq_fops = {
2587         .owner          = THIS_MODULE,
2588         .open           = packet_seq_open,
2589         .read           = seq_read,
2590         .llseek         = seq_lseek,
2591         .release        = seq_release_net,
2592 };
2593
2594 #endif
2595
2596 static int __net_init packet_net_init(struct net *net)
2597 {
2598         rwlock_init(&net->packet.sklist_lock);
2599         INIT_HLIST_HEAD(&net->packet.sklist);
2600
2601         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2602                 return -ENOMEM;
2603
2604         return 0;
2605 }
2606
2607 static void __net_exit packet_net_exit(struct net *net)
2608 {
2609         proc_net_remove(net, "packet");
2610 }
2611
2612 static struct pernet_operations packet_net_ops = {
2613         .init = packet_net_init,
2614         .exit = packet_net_exit,
2615 };
2616
2617
2618 static void __exit packet_exit(void)
2619 {
2620         unregister_netdevice_notifier(&packet_netdev_notifier);
2621         unregister_pernet_subsys(&packet_net_ops);
2622         sock_unregister(PF_PACKET);
2623         proto_unregister(&packet_proto);
2624 }
2625
2626 static int __init packet_init(void)
2627 {
2628         int rc = proto_register(&packet_proto, 0);
2629
2630         if (rc != 0)
2631                 goto out;
2632
2633         sock_register(&packet_family_ops);
2634         register_pernet_subsys(&packet_net_ops);
2635         register_netdevice_notifier(&packet_netdev_notifier);
2636 out:
2637         return rc;
2638 }
2639
2640 module_init(packet_init);
2641 module_exit(packet_exit);
2642 MODULE_LICENSE("GPL");
2643 MODULE_ALIAS_NETPROTO(PF_PACKET);