/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              PACKET - implements raw packet sockets.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *
 * Fixes:
 *              Alan Cox        :       verify_area() now used correctly
 *              Alan Cox        :       new skbuff lists, look ma no backlogs!
 *              Alan Cox        :       tidied skbuff lists.
 *              Alan Cox        :       Now uses generic datagram routines I
 *                                      added. Also fixed the peek/read crash
 *                                      from all old Linux datagram code.
 *              Alan Cox        :       Uses the improved datagram code.
 *              Alan Cox        :       Added NULL's for socket options.
 *              Alan Cox        :       Re-commented the code.
 *              Alan Cox        :       Use new kernel side addressing
 *              Rob Janssen     :       Correct MTU usage.
 *              Dave Platt      :       Counter leaks caused by incorrect
 *                                      interrupt locking and some slightly
 *                                      dubious gcc output. Can you read
 *                                      compiler: it said _VOLATILE_
 *      Richard Kooijman        :       Timestamp fixes.
 *              Alan Cox        :       New buffers. Use sk->mac.raw.
 *              Alan Cox        :       sendmsg/recvmsg support.
 *              Alan Cox        :       Protocol setting support
 *      Alexey Kuznetsov        :       Untied from IPv4 stack.
 *      Cyrus Durgin            :       Fixed kerneld for kmod.
 *      Michal Ostrowski        :       Module initialization cleanup.
 *         Ulises Alonso        :       Frame number limit removal and
 *                                      packet_set_ring memory leak.
 *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
 *                                      The convention is that longer addresses
 *                                      will simply extend the hardware address
 *                                      byte arrays at the end of sockaddr_ll
 *                                      and packet_mreq.
 *              Johann Baudy    :       Added TX RING.
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 *
 */

#include <linux/types.h>
#include <linux/mm.h>
#include <linux/capability.h>
#include <linux/fcntl.h>
#include <linux/socket.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/if_packet.h>
#include <linux/wireless.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <net/net_namespace.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <linux/skbuff.h>
#include <net/sock.h>
#include <linux/errno.h>
#include <linux/timer.h>
#include <asm/system.h>
#include <asm/uaccess.h>
#include <asm/ioctls.h>
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/io.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/mutex.h>

#ifdef CONFIG_INET
#include <net/inet_common.h>
#endif

/*
   Assumptions:
   - If a device has no dev->hard_header routine, it adds and removes the ll
     header inside itself. In this case the ll header is invisible outside of
     the device, but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit in the reserved space (tunnels); others are silly
     (PPP).
   - A packet socket receives packets with the ll header pulled,
     so SOCK_RAW should push it back.

On receive:
-----------

Incoming, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> data

Outgoing, dev->hard_header!=NULL
   mac_header -> ll header
   data       -> ll header

Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header.  PPP does this, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data

Outgoing, dev->hard_header==NULL
   mac_header -> data. ll header is still not built!
   data       -> data

Summary
  If dev->hard_header==NULL we are unlikely to restore a sensible ll header.


On transmit:
------------

dev->hard_header != NULL
   mac_header -> ll header
   data       -> ll header

dev->hard_header == NULL (ll header is added by device, we cannot control it)
   mac_header -> data
   data       -> data

   We should set nh.raw on output to the correct position;
   the packet classifier depends on it.
 */

/* Private packet socket structures. */

struct packet_mclist {
        struct packet_mclist    *next;
        int                     ifindex;
        int                     count;
        unsigned short          type;
        unsigned short          alen;
        unsigned char           addr[MAX_ADDR_LEN];
};
/* identical to struct packet_mreq except it has
 * a longer address field.
 */
struct packet_mreq_max {
        int             mr_ifindex;
        unsigned short  mr_type;
        unsigned short  mr_alen;
        unsigned char   mr_address[MAX_ADDR_LEN];
};

#ifdef CONFIG_PACKET_MMAP
static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
                int closing, int tx_ring);

struct packet_ring_buffer {
        char                    **pg_vec;
        unsigned int            head;
        unsigned int            frames_per_block;
        unsigned int            frame_size;
        unsigned int            frame_max;

        unsigned int            pg_vec_order;
        unsigned int            pg_vec_pages;
        unsigned int            pg_vec_len;

        atomic_t                pending;
};

struct packet_sock;
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
#endif

static void packet_flush_mclist(struct sock *sk);

struct packet_sock {
        /* struct sock has to be the first member of packet_sock */
        struct sock             sk;
        struct tpacket_stats    stats;
#ifdef CONFIG_PACKET_MMAP
        struct packet_ring_buffer       rx_ring;
        struct packet_ring_buffer       tx_ring;
        int                     copy_thresh;
#endif
        struct packet_type      prot_hook;
        spinlock_t              bind_lock;
        struct mutex            pg_vec_lock;
        unsigned int            running:1,      /* prot_hook is attached */
                                auxdata:1,
                                origdev:1;
        int                     ifindex;        /* bound device         */
        __be16                  num;
        struct packet_mclist    *mclist;
#ifdef CONFIG_PACKET_MMAP
        atomic_t                mapped;
        enum tpacket_versions   tp_version;
        unsigned int            tp_hdrlen;
        unsigned int            tp_reserve;
        unsigned int            tp_loss:1;
#endif
};

struct packet_skb_cb {
        unsigned int origlen;
        union {
                struct sockaddr_pkt pkt;
                struct sockaddr_ll ll;
        } sa;
};

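/* The address info for each queued skb is stashed in skb->cb until
 * recvmsg() copies it out; PACKET_SKB_CB() gives typed access to it.
 */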
#define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))

#ifdef CONFIG_PACKET_MMAP

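/* Each frame in the mmap()ed ring starts with a tp_status word telling
 * whether the slot currently belongs to the kernel or to user space.
 * The flush_dcache_page() calls keep the user's mapping of the ring
 * coherent on architectures without snooping caches.
 */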
static void __packet_set_status(struct packet_sock *po, void *frame, int status)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_status = status;
                flush_dcache_page(virt_to_page(&h.h1->tp_status));
                break;
        case TPACKET_V2:
                h.h2->tp_status = status;
                flush_dcache_page(virt_to_page(&h.h2->tp_status));
                break;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
        }

        smp_wmb();
}

static int __packet_get_status(struct packet_sock *po, void *frame)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        smp_rmb();

        h.raw = frame;
        switch (po->tp_version) {
        case TPACKET_V1:
                flush_dcache_page(virt_to_page(&h.h1->tp_status));
                return h.h1->tp_status;
        case TPACKET_V2:
                flush_dcache_page(virt_to_page(&h.h2->tp_status));
                return h.h2->tp_status;
        default:
                pr_err("TPACKET version not supported\n");
                BUG();
                return 0;
        }
}

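/* Translate a linear frame number into its page and offset inside the
 * ring's page vector, and return the frame only if its status word
 * matches the expected one (e.g. TP_STATUS_KERNEL for a free rx slot).
 */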
static void *packet_lookup_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                unsigned int position,
                int status)
{
        unsigned int pg_vec_pos, frame_offset;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;

        pg_vec_pos = position / rb->frames_per_block;
        frame_offset = position % rb->frames_per_block;

        h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);

        if (status != __packet_get_status(po, h.raw))
                return NULL;

        return h.raw;
}

static inline void *packet_current_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        return packet_lookup_frame(po, rb, rb->head, status);
}

static inline void *packet_previous_frame(struct packet_sock *po,
                struct packet_ring_buffer *rb,
                int status)
{
        unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
        return packet_lookup_frame(po, rb, previous, status);
}

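/* Advance the ring head by one frame, wrapping around at frame_max. */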
static inline void packet_increment_head(struct packet_ring_buffer *buff)
{
        buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
}

#endif

static inline struct packet_sock *pkt_sk(struct sock *sk)
{
        return (struct packet_sock *)sk;
}

static void packet_sock_destruct(struct sock *sk)
{
        WARN_ON(atomic_read(&sk->sk_rmem_alloc));
        WARN_ON(atomic_read(&sk->sk_wmem_alloc));

        if (!sock_flag(sk, SOCK_DEAD)) {
                pr_err("Attempt to release alive packet socket: %p\n", sk);
                return;
        }

        sk_refcnt_debug_dec(sk);
}


static const struct proto_ops packet_ops;

static const struct proto_ops packet_ops_spkt;

static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
                           struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_pkt *spkt;

        /*
         *      When we registered the protocol we saved the socket in the data
         *      field for just this event.
         */

        sk = pt->af_packet_priv;

        /*
         *      Yank back the headers [hope the device set this
         *      right or kerboom...]
         *
         *      Incoming packets have the ll header pulled,
         *      push it back.
         *
         *      For outgoing ones skb->data == skb_mac_header(skb)
         *      so this procedure is a noop.
         */

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto out;

        if (dev_net(dev) != sock_net(sk))
                goto out;

        skb = skb_share_check(skb, GFP_ATOMIC);
        if (skb == NULL)
                goto oom;

        /* drop any routing info */
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spkt = &PACKET_SKB_CB(skb)->sa.pkt;

        skb_push(skb, skb->data - skb_mac_header(skb));

        /*
         *      The SOCK_PACKET socket receives _all_ frames.
         */

        spkt->spkt_family = dev->type;
        strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
        spkt->spkt_protocol = skb->protocol;

        /*
         *      Charge the memory to the socket. This is done specifically
         *      to prevent sockets using all the memory up.
         */

        if (sock_queue_rcv_skb(sk, skb) == 0)
                return 0;

out:
        kfree_skb(skb);
oom:
        return 0;
}


/*
 *      Output a raw packet to a device layer. This bypasses all the other
 *      protocol layers and you must therefore supply it with a complete frame.
 */

static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
                               struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto = 0;
        int err;

        /*
         *      Get and verify the address.
         */

        if (saddr) {
                if (msg->msg_namelen < sizeof(struct sockaddr))
                        return -EINVAL;
                if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
                        proto = saddr->spkt_protocol;
        } else
                return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */

        /*
         *      Find the device first to size check it
         */

        saddr->spkt_device[13] = 0;
        dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
        err = -ENODEV;
        if (dev == NULL)
                goto out_unlock;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        /*
         * You may not queue a frame bigger than the mtu. This is the lowest level
         * raw protocol and you must do your own fragmentation at this level.
         */

        err = -EMSGSIZE;
        if (len > dev->mtu + dev->hard_header_len)
                goto out_unlock;

        err = -ENOBUFS;
        skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

        /*
         * If the write buffer is full, then tough. At this level the user
         * gets to deal with the problem - do your own algorithmic backoffs.
         * That's far more flexible.
         */

        if (skb == NULL)
                goto out_unlock;

        /*
         *      Fill it in
         */

        /* FIXME: Save some space for broken drivers that write a
         * hard header at transmission time by themselves. PPP is the
         * notable one here. This should really be fixed at the driver level.
         */
        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        /* Try to align data part correctly */
        if (dev->header_ops) {
                skb->data -= dev->hard_header_len;
                skb->tail -= dev->hard_header_len;
                if (len < dev->hard_header_len)
                        skb_reset_network_header(skb);
        }

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;
        if (err)
                goto out_free;

        /*
         *      Now send it
         */

        dev_queue_xmit(skb);
        dev_put(dev);
        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
        return err;
}

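/* Run the socket's attached BPF filter, if any. The result is the
 * number of bytes of the packet to keep (the snap length); zero means
 * the packet should be dropped.
 */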
static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
                                      unsigned int res)
{
        struct sk_filter *filter;

        rcu_read_lock_bh();
        filter = rcu_dereference(sk->sk_filter);
        if (filter != NULL)
                res = sk_run_filter(skb, filter->insns, filter->len);
        rcu_read_unlock_bh();

        return res;
}

/*
   This function does lazy skb cloning in the hope that most packets
   are discarded by BPF.

   Note the tricky part: we DO mangle a shared skb! skb->data, skb->len
   and skb->cb are mangled. It works because (and until) packets
   falling here are owned by the current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on
   exit, we will not harm anyone.
 */

static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
                      struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct sockaddr_ll *sll;
        struct packet_sock *po;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (dev_net(dev) != sock_net(sk))
                goto drop;

        skb->dev = dev;

        if (dev->header_ops) {
                /* The device has an explicit notion of ll header,
                   exported to higher levels.

                   Otherwise, the device hides details of its frame
                   structure, so that the corresponding packet head is
                   never delivered to the user.
                 */
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
            (unsigned)sk->sk_rcvbuf)
                goto drop_n_acct;

        if (skb_shared(skb)) {
                struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
                if (nskb == NULL)
                        goto drop_n_acct;

                if (skb_head != skb->data) {
                        skb->data = skb_head;
                        skb->len = skb_len;
                }
                kfree_skb(skb);
                skb = nskb;
        }

        BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
                     sizeof(skb->cb));

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);

        PACKET_SKB_CB(skb)->origlen = skb->len;

        if (pskb_trim(skb, snaplen))
                goto drop_n_acct;

        skb_set_owner_r(skb, sk);
        skb->dev = NULL;
        skb_dst_drop(skb);

        /* drop conntrack reference */
        nf_reset(skb);

        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_packets++;
        __skb_queue_tail(&sk->sk_receive_queue, skb);
        spin_unlock(&sk->sk_receive_queue.lock);
        sk->sk_data_ready(sk, skb->len);
        return 0;

drop_n_acct:
        spin_lock(&sk->sk_receive_queue.lock);
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        consume_skb(skb);
        return 0;
}

#ifdef CONFIG_PACKET_MMAP
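/* Receive path for mmap()ed sockets: instead of queueing the skb, the
 * packet is copied into the next free frame of the rx ring and the
 * frame's status word is flipped over to user space.
 */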
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
{
        struct sock *sk;
        struct packet_sock *po;
        struct sockaddr_ll *sll;
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } h;
        u8 *skb_head = skb->data;
        int skb_len = skb->len;
        unsigned int snaplen, res;
        unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
        unsigned short macoff, netoff, hdrlen;
        struct sk_buff *copy_skb = NULL;
        struct timeval tv;
        struct timespec ts;

        if (skb->pkt_type == PACKET_LOOPBACK)
                goto drop;

        sk = pt->af_packet_priv;
        po = pkt_sk(sk);

        if (dev_net(dev) != sock_net(sk))
                goto drop;

        if (dev->header_ops) {
                if (sk->sk_type != SOCK_DGRAM)
                        skb_push(skb, skb->data - skb_mac_header(skb));
                else if (skb->pkt_type == PACKET_OUTGOING) {
                        /* Special case: outgoing packets have ll header at head */
                        skb_pull(skb, skb_network_offset(skb));
                }
        }

        if (skb->ip_summed == CHECKSUM_PARTIAL)
                status |= TP_STATUS_CSUMNOTREADY;

        snaplen = skb->len;

        res = run_filter(skb, sk, snaplen);
        if (!res)
                goto drop_n_restore;
        if (snaplen > res)
                snaplen = res;

        if (sk->sk_type == SOCK_DGRAM) {
                macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
                                  po->tp_reserve;
        } else {
                unsigned maclen = skb_network_offset(skb);
                netoff = TPACKET_ALIGN(po->tp_hdrlen +
                                       (maclen < 16 ? 16 : maclen)) +
                        po->tp_reserve;
                macoff = netoff - maclen;
        }

        if (macoff + snaplen > po->rx_ring.frame_size) {
                if (po->copy_thresh &&
                    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
                    (unsigned)sk->sk_rcvbuf) {
                        if (skb_shared(skb)) {
                                copy_skb = skb_clone(skb, GFP_ATOMIC);
                        } else {
                                copy_skb = skb_get(skb);
                                skb_head = skb->data;
                        }
                        if (copy_skb)
                                skb_set_owner_r(copy_skb, sk);
                }
                snaplen = po->rx_ring.frame_size - macoff;
                if ((int)snaplen < 0)
                        snaplen = 0;
        }

        spin_lock(&sk->sk_receive_queue.lock);
        h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
        if (!h.raw)
                goto ring_is_full;
        packet_increment_head(&po->rx_ring);
        po->stats.tp_packets++;
        if (copy_skb) {
                status |= TP_STATUS_COPY;
                __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
        }
        if (!po->stats.tp_drops)
                status &= ~TP_STATUS_LOSING;
        spin_unlock(&sk->sk_receive_queue.lock);

        skb_copy_bits(skb, 0, h.raw + macoff, snaplen);

        switch (po->tp_version) {
        case TPACKET_V1:
                h.h1->tp_len = skb->len;
                h.h1->tp_snaplen = snaplen;
                h.h1->tp_mac = macoff;
                h.h1->tp_net = netoff;
                if (skb->tstamp.tv64)
                        tv = ktime_to_timeval(skb->tstamp);
                else
                        do_gettimeofday(&tv);
                h.h1->tp_sec = tv.tv_sec;
                h.h1->tp_usec = tv.tv_usec;
                hdrlen = sizeof(*h.h1);
                break;
        case TPACKET_V2:
                h.h2->tp_len = skb->len;
                h.h2->tp_snaplen = snaplen;
                h.h2->tp_mac = macoff;
                h.h2->tp_net = netoff;
                if (skb->tstamp.tv64)
                        ts = ktime_to_timespec(skb->tstamp);
                else
                        getnstimeofday(&ts);
                h.h2->tp_sec = ts.tv_sec;
                h.h2->tp_nsec = ts.tv_nsec;
                h.h2->tp_vlan_tci = skb->vlan_tci;
                hdrlen = sizeof(*h.h2);
                break;
        default:
                BUG();
        }

        sll = h.raw + TPACKET_ALIGN(hdrlen);
        sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
        sll->sll_family = AF_PACKET;
        sll->sll_hatype = dev->type;
        sll->sll_protocol = skb->protocol;
        sll->sll_pkttype = skb->pkt_type;
        if (unlikely(po->origdev))
                sll->sll_ifindex = orig_dev->ifindex;
        else
                sll->sll_ifindex = dev->ifindex;

        __packet_set_status(po, h.raw, status);
        smp_mb();
        {
                struct page *p_start, *p_end;
                u8 *h_end = h.raw + macoff + snaplen - 1;

                p_start = virt_to_page(h.raw);
                p_end = virt_to_page(h_end);
                while (p_start <= p_end) {
                        flush_dcache_page(p_start);
                        p_start++;
                }
        }

        sk->sk_data_ready(sk, 0);

drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
                skb->data = skb_head;
                skb->len = skb_len;
        }
drop:
        kfree_skb(skb);
        return 0;

ring_is_full:
        po->stats.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);

        sk->sk_data_ready(sk, 0);
        kfree_skb(copy_skb);
        goto drop_n_restore;
}

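/* skb destructor for TX ring packets: once the driver has consumed the
 * skb, hand the originating ring frame back to user space.
 */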
static void tpacket_destruct_skb(struct sk_buff *skb)
{
        struct packet_sock *po = pkt_sk(skb->sk);
        void *ph;

        BUG_ON(skb == NULL);

        if (likely(po->tx_ring.pg_vec)) {
                ph = skb_shinfo(skb)->destructor_arg;
                BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
                BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
                atomic_dec(&po->tx_ring.pending);
                __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
        }

        sock_wfree(skb);
}

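/* Build an skb around a TX ring frame. The payload is not copied: the
 * frame's pages are attached to the skb as fragments, so the slot must
 * stay untouched until tpacket_destruct_skb() releases it.
 */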
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
                void *frame, struct net_device *dev, int size_max,
                __be16 proto, unsigned char *addr)
{
        union {
                struct tpacket_hdr *h1;
                struct tpacket2_hdr *h2;
                void *raw;
        } ph;
        int to_write, offset, len, tp_len, nr_frags, len_max;
        struct socket *sock = po->sk.sk_socket;
        struct page *page;
        void *data;
        int err;

        ph.raw = frame;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = po->sk.sk_priority;
        skb_shinfo(skb)->destructor_arg = ph.raw;

        switch (po->tp_version) {
        case TPACKET_V2:
                tp_len = ph.h2->tp_len;
                break;
        default:
                tp_len = ph.h1->tp_len;
                break;
        }
        if (unlikely(tp_len > size_max)) {
                pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
                return -EMSGSIZE;
        }

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
        to_write = tp_len;

        if (sock->type == SOCK_DGRAM) {
                err = dev_hard_header(skb, dev, ntohs(proto), addr,
                                NULL, tp_len);
                if (unlikely(err < 0))
                        return -EINVAL;
        } else if (dev->hard_header_len) {
                /* net device doesn't like empty head */
                if (unlikely(tp_len <= dev->hard_header_len)) {
                        pr_err("packet size is too short (%d < %d)\n",
                               tp_len, dev->hard_header_len);
                        return -EINVAL;
                }

                skb_push(skb, dev->hard_header_len);
                err = skb_store_bits(skb, 0, data,
                                dev->hard_header_len);
                if (unlikely(err))
                        return err;

                data += dev->hard_header_len;
                to_write -= dev->hard_header_len;
        }

        err = -EFAULT;
        page = virt_to_page(data);
        offset = offset_in_page(data);
        len_max = PAGE_SIZE - offset;
        len = ((to_write > len_max) ? len_max : to_write);

        skb->data_len = to_write;
        skb->len += to_write;
        skb->truesize += to_write;
        atomic_add(to_write, &po->sk.sk_wmem_alloc);

        while (likely(to_write)) {
                nr_frags = skb_shinfo(skb)->nr_frags;

                if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
                        pr_err("Packet exceeds the number of skb frags (%lu)\n",
                               MAX_SKB_FRAGS);
                        return -EFAULT;
                }

                flush_dcache_page(page);
                get_page(page);
                skb_fill_page_desc(skb,
                                nr_frags,
                                page++, offset, len);
                to_write -= len;
                offset = 0;
                len_max = PAGE_SIZE;
                len = ((to_write > len_max) ? len_max : to_write);
        }

        return tp_len;
}

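/* Send loop for the TX ring: walk the ring and transmit every frame
 * marked TP_STATUS_SEND_REQUEST until the ring is drained (and, unless
 * MSG_DONTWAIT is set, until all pending frames have been consumed).
 */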
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
        struct socket *sock;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        int ifindex, err, reserve = 0;
        void *ph;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        int tp_len, size_max;
        unsigned char *addr;
        int len_sum = 0;
        int status = 0;

        sock = po->sk.sk_socket;

        mutex_lock(&po->pg_vec_lock);

        err = -EBUSY;
        if (saddr == NULL) {
                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen
                                        + offsetof(struct sockaddr_ll,
                                                sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }

        dev = dev_get_by_index(sock_net(&po->sk), ifindex);
        err = -ENXIO;
        if (unlikely(dev == NULL))
                goto out;

        reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (unlikely(!(dev->flags & IFF_UP)))
                goto out_put;

        size_max = po->tx_ring.frame_size
                - (po->tp_hdrlen - sizeof(struct sockaddr_ll));

        if (size_max > dev->mtu + reserve)
                size_max = dev->mtu + reserve;

        do {
                ph = packet_current_frame(po, &po->tx_ring,
                                TP_STATUS_SEND_REQUEST);

                if (unlikely(ph == NULL)) {
                        schedule();
                        continue;
                }

                status = TP_STATUS_SEND_REQUEST;
                skb = sock_alloc_send_skb(&po->sk,
                                LL_ALLOCATED_SPACE(dev)
                                + sizeof(struct sockaddr_ll),
                                0, &err);

                if (unlikely(skb == NULL))
                        goto out_status;

                tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
                                addr);

                if (unlikely(tp_len < 0)) {
                        if (po->tp_loss) {
                                __packet_set_status(po, ph,
                                                TP_STATUS_AVAILABLE);
                                packet_increment_head(&po->tx_ring);
                                kfree_skb(skb);
                                continue;
                        } else {
                                status = TP_STATUS_WRONG_FORMAT;
                                err = tp_len;
                                goto out_status;
                        }
                }

                skb->destructor = tpacket_destruct_skb;
                __packet_set_status(po, ph, TP_STATUS_SENDING);
                atomic_inc(&po->tx_ring.pending);

                status = TP_STATUS_SEND_REQUEST;
                err = dev_queue_xmit(skb);
                if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
                        goto out_xmit;
                packet_increment_head(&po->tx_ring);
                len_sum += tp_len;
        } while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
                                        && (atomic_read(&po->tx_ring.pending))))
              );

        err = len_sum;
        goto out_put;

out_xmit:
        skb->destructor = sock_wfree;
        atomic_dec(&po->tx_ring.pending);
out_status:
        __packet_set_status(po, ph, status);
        kfree_skb(skb);
out_put:
        dev_put(dev);
out:
        mutex_unlock(&po->pg_vec_lock);
        return err;
}
#endif

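/* Ordinary (non-ring) send path: allocate an skb, copy the payload in
 * from user space and queue it straight to the device.
 */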
static int packet_snd(struct socket *sock,
                          struct msghdr *msg, size_t len)
{
        struct sock *sk = sock->sk;
        struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
        struct sk_buff *skb;
        struct net_device *dev;
        __be16 proto;
        unsigned char *addr;
        int ifindex, err, reserve = 0;

        /*
         *      Get and verify the address.
         */

        if (saddr == NULL) {
                struct packet_sock *po = pkt_sk(sk);

                ifindex = po->ifindex;
                proto   = po->num;
                addr    = NULL;
        } else {
                err = -EINVAL;
                if (msg->msg_namelen < sizeof(struct sockaddr_ll))
                        goto out;
                if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
                        goto out;
                ifindex = saddr->sll_ifindex;
                proto   = saddr->sll_protocol;
                addr    = saddr->sll_addr;
        }


        dev = dev_get_by_index(sock_net(sk), ifindex);
        err = -ENXIO;
        if (dev == NULL)
                goto out_unlock;
        if (sock->type == SOCK_RAW)
                reserve = dev->hard_header_len;

        err = -ENETDOWN;
        if (!(dev->flags & IFF_UP))
                goto out_unlock;

        err = -EMSGSIZE;
        if (len > dev->mtu+reserve)
                goto out_unlock;

        skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
                                msg->msg_flags & MSG_DONTWAIT, &err);
        if (skb == NULL)
                goto out_unlock;

        skb_reserve(skb, LL_RESERVED_SPACE(dev));
        skb_reset_network_header(skb);

        err = -EINVAL;
        if (sock->type == SOCK_DGRAM &&
            dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
                goto out_free;

        /* Returns -EFAULT on error */
        err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
        if (err)
                goto out_free;

        skb->protocol = proto;
        skb->dev = dev;
        skb->priority = sk->sk_priority;

        /*
         *      Now send it
         */

        err = dev_queue_xmit(skb);
        if (err > 0 && (err = net_xmit_errno(err)) != 0)
                goto out_unlock;

        dev_put(dev);

        return len;

out_free:
        kfree_skb(skb);
out_unlock:
        if (dev)
                dev_put(dev);
out:
        return err;
}

static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
                struct msghdr *msg, size_t len)
{
#ifdef CONFIG_PACKET_MMAP
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        if (po->tx_ring.pg_vec)
                return tpacket_snd(po, msg);
        else
#endif
                return packet_snd(sock, msg, len);
}

/*
 *      Close a PACKET socket. This is fairly simple. We immediately go
 *      to 'closed' state and remove our protocol entry in the device list.
 */

static int packet_release(struct socket *sock)
{
        struct sock *sk = sock->sk;
        struct packet_sock *po;
        struct net *net;
#ifdef CONFIG_PACKET_MMAP
        struct tpacket_req req;
#endif

        if (!sk)
                return 0;

        net = sock_net(sk);
        po = pkt_sk(sk);

        write_lock_bh(&net->packet.sklist_lock);
        sk_del_node_init(sk);
        sock_prot_inuse_add(net, sk->sk_prot, -1);
        write_unlock_bh(&net->packet.sklist_lock);

        /*
         *      Unhook packet receive handler.
         */

        if (po->running) {
                /*
                 *      Remove the protocol hook
                 */
                dev_remove_pack(&po->prot_hook);
                po->running = 0;
                po->num = 0;
                __sock_put(sk);
        }

        packet_flush_mclist(sk);

#ifdef CONFIG_PACKET_MMAP
        memset(&req, 0, sizeof(req));

        if (po->rx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 0);

        if (po->tx_ring.pg_vec)
                packet_set_ring(sk, &req, 1, 1);
#endif

        /*
         *      Now the socket is dead. No more input will appear.
         */

        sock_orphan(sk);
        sock->sk = NULL;

        /* Purge queues */

        skb_queue_purge(&sk->sk_receive_queue);
        sk_refcnt_debug_release(sk);

        sock_put(sk);
        return 0;
}

/*
 *      Attach a packet hook.
 */

static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
        struct packet_sock *po = pkt_sk(sk);
        /*
         *      Detach an existing hook if present.
         */

        lock_sock(sk);

        spin_lock(&po->bind_lock);
        if (po->running) {
                __sock_put(sk);
                po->running = 0;
                po->num = 0;
                spin_unlock(&po->bind_lock);
                dev_remove_pack(&po->prot_hook);
                spin_lock(&po->bind_lock);
        }

        po->num = protocol;
        po->prot_hook.type = protocol;
        po->prot_hook.dev = dev;

        po->ifindex = dev ? dev->ifindex : 0;

        if (protocol == 0)
                goto out_unlock;

        if (!dev || (dev->flags & IFF_UP)) {
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        } else {
                sk->sk_err = ENETDOWN;
                if (!sock_flag(sk, SOCK_DEAD))
                        sk->sk_error_report(sk);
        }

out_unlock:
        spin_unlock(&po->bind_lock);
        release_sock(sk);
        return 0;
}

/*
 *      Bind a packet socket to a device
 */

static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
                            int addr_len)
{
        struct sock *sk = sock->sk;
        char name[15];
        struct net_device *dev;
        int err = -ENODEV;

        /*
         *      Check legality
         */

        if (addr_len != sizeof(struct sockaddr))
                return -EINVAL;
        strlcpy(name, uaddr->sa_data, sizeof(name));

        dev = dev_get_by_name(sock_net(sk), name);
        if (dev) {
                err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
                dev_put(dev);
        }
        return err;
}

static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
        struct sock *sk = sock->sk;
        struct net_device *dev = NULL;
        int err;


        /*
         *      Check legality
         */

        if (addr_len < sizeof(struct sockaddr_ll))
                return -EINVAL;
        if (sll->sll_family != AF_PACKET)
                return -EINVAL;

        if (sll->sll_ifindex) {
                err = -ENODEV;
                dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
                if (dev == NULL)
                        goto out;
        }
        err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
        if (dev)
                dev_put(dev);

out:
        return err;
}

static struct proto packet_proto = {
        .name     = "PACKET",
        .owner    = THIS_MODULE,
        .obj_size = sizeof(struct packet_sock),
};

/*
 *      Create a packet of type SOCK_PACKET.
 */

static int packet_create(struct net *net, struct socket *sock, int protocol)
{
        struct sock *sk;
        struct packet_sock *po;
        __be16 proto = (__force __be16)protocol; /* weird, but documented */
        int err;

        if (!capable(CAP_NET_RAW))
                return -EPERM;
        if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
            sock->type != SOCK_PACKET)
                return -ESOCKTNOSUPPORT;

        sock->state = SS_UNCONNECTED;

        err = -ENOBUFS;
        sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
        if (sk == NULL)
                goto out;

        sock->ops = &packet_ops;
        if (sock->type == SOCK_PACKET)
                sock->ops = &packet_ops_spkt;

        sock_init_data(sock, sk);

        po = pkt_sk(sk);
        sk->sk_family = PF_PACKET;
        po->num = proto;

        sk->sk_destruct = packet_sock_destruct;
        sk_refcnt_debug_inc(sk);

        /*
         *      Attach a protocol block
         */

        spin_lock_init(&po->bind_lock);
        mutex_init(&po->pg_vec_lock);
        po->prot_hook.func = packet_rcv;

        if (sock->type == SOCK_PACKET)
                po->prot_hook.func = packet_rcv_spkt;

        po->prot_hook.af_packet_priv = sk;

        if (proto) {
                po->prot_hook.type = proto;
                dev_add_pack(&po->prot_hook);
                sock_hold(sk);
                po->running = 1;
        }

        write_lock_bh(&net->packet.sklist_lock);
        sk_add_node(sk, &net->packet.sklist);
        sock_prot_inuse_add(net, &packet_proto, 1);
        write_unlock_bh(&net->packet.sklist_lock);
        return 0;
out:
        return err;
}

/*
 *      Pull a packet from our receive queue and hand it to the user.
 *      If necessary we block.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
                          struct msghdr *msg, size_t len, int flags)
{
        struct sock *sk = sock->sk;
        struct sk_buff *skb;
        int copied, err;
        struct sockaddr_ll *sll;

        err = -EINVAL;
        if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
                goto out;

#if 0
        /* What error should we return now? EUNATTACH? */
        if (pkt_sk(sk)->ifindex < 0)
                return -ENODEV;
#endif

        /*
         *      Call the generic datagram receiver. This handles all sorts
         *      of horrible races and re-entrancy so we can forget about it
         *      in the protocol layers.
         *
         *      Now it will return ENETDOWN if the device has just gone down,
         *      but then it will block.
         */

        skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);

        /*
         *      An error occurred so return it. Because skb_recv_datagram()
         *      handles the blocking we don't have to see or worry about
         *      blocking retries.
         */

        if (skb == NULL)
                goto out;

        /*
         *      If the address length field is there to be filled in, we fill
         *      it in now.
         */

        sll = &PACKET_SKB_CB(skb)->sa.ll;
        if (sock->type == SOCK_PACKET)
                msg->msg_namelen = sizeof(struct sockaddr_pkt);
        else
                msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

        /*
         *      You lose any data beyond the buffer you gave. If it worries a
         *      user program they can ask the device for its MTU anyway.
         */

        copied = skb->len;
        if (copied > len) {
                copied = len;
                msg->msg_flags |= MSG_TRUNC;
        }

        err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
        if (err)
                goto out_free;

        sock_recv_timestamp(msg, sk, skb);

        if (msg->msg_name)
                memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
                       msg->msg_namelen);

        if (pkt_sk(sk)->auxdata) {
                struct tpacket_auxdata aux;

                aux.tp_status = TP_STATUS_USER;
                if (skb->ip_summed == CHECKSUM_PARTIAL)
                        aux.tp_status |= TP_STATUS_CSUMNOTREADY;
                aux.tp_len = PACKET_SKB_CB(skb)->origlen;
                aux.tp_snaplen = skb->len;
                aux.tp_mac = 0;
                aux.tp_net = skb_network_offset(skb);
                aux.tp_vlan_tci = skb->vlan_tci;

                put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
        }

        /*
         *      Free or return the buffer as appropriate. Again this
         *      hides all the races and re-entrancy issues from us.
         */
        err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
        skb_free_datagram(sk, skb);
out:
        return err;
}

static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
                               int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;

        if (peer)
                return -EOPNOTSUPP;

        uaddr->sa_family = AF_PACKET;
        dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
        if (dev) {
                strlcpy(uaddr->sa_data, dev->name, 15);
                dev_put(dev);
        } else
                memset(uaddr->sa_data, 0, 14);
        *uaddr_len = sizeof(*uaddr);

        return 0;
}

static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
                          int *uaddr_len, int peer)
{
        struct net_device *dev;
        struct sock *sk = sock->sk;
        struct packet_sock *po = pkt_sk(sk);
        struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;

        if (peer)
                return -EOPNOTSUPP;

        sll->sll_family = AF_PACKET;
        sll->sll_ifindex = po->ifindex;
        sll->sll_protocol = po->num;
        dev = dev_get_by_index(sock_net(sk), po->ifindex);
        if (dev) {
                sll->sll_hatype = dev->type;
                sll->sll_halen = dev->addr_len;
                memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
                dev_put(dev);
        } else {
                sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
                sll->sll_halen = 0;
        }
        *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;

        return 0;
}

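/* Apply a single membership entry to a device: 'what' is +1 to add the
 * address/mode and -1 to remove it.
 */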
1558 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1559                          int what)
1560 {
1561         switch (i->type) {
1562         case PACKET_MR_MULTICAST:
1563                 if (what > 0)
1564                         return dev_mc_add(dev, i->addr, i->alen, 0);
1565                 else
1566                         return dev_mc_delete(dev, i->addr, i->alen, 0);
1567                 break;
1568         case PACKET_MR_PROMISC:
1569                 return dev_set_promiscuity(dev, what);
1570                 break;
1571         case PACKET_MR_ALLMULTI:
1572                 return dev_set_allmulti(dev, what);
1573                 break;
1574         case PACKET_MR_UNICAST:
1575                 if (what > 0)
1576                         return dev_unicast_add(dev, i->addr);
1577                 else
1578                         return dev_unicast_delete(dev, i->addr);
1579                 break;
1580         default:
1581                 break;
1582         }
1583         return 0;
1584 }
1585
1586 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1587 {
1588         for ( ; i; i = i->next) {
1589                 if (i->ifindex == dev->ifindex)
1590                         packet_dev_mc(dev, i, what);
1591         }
1592 }
1593
1594 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1595 {
1596         struct packet_sock *po = pkt_sk(sk);
1597         struct packet_mclist *ml, *i;
1598         struct net_device *dev;
1599         int err;
1600
1601         rtnl_lock();
1602
1603         err = -ENODEV;
1604         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1605         if (!dev)
1606                 goto done;
1607
1608         err = -EINVAL;
1609         if (mreq->mr_alen > dev->addr_len)
1610                 goto done;
1611
1612         err = -ENOBUFS;
1613         i = kmalloc(sizeof(*i), GFP_KERNEL);
1614         if (i == NULL)
1615                 goto done;
1616
1617         err = 0;
1618         for (ml = po->mclist; ml; ml = ml->next) {
1619                 if (ml->ifindex == mreq->mr_ifindex &&
1620                     ml->type == mreq->mr_type &&
1621                     ml->alen == mreq->mr_alen &&
1622                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1623                         ml->count++;
1624                         /* Free the new element ... */
1625                         kfree(i);
1626                         goto done;
1627                 }
1628         }
1629
1630         i->type = mreq->mr_type;
1631         i->ifindex = mreq->mr_ifindex;
1632         i->alen = mreq->mr_alen;
1633         memcpy(i->addr, mreq->mr_address, i->alen);
1634         i->count = 1;
1635         i->next = po->mclist;
1636         po->mclist = i;
1637         err = packet_dev_mc(dev, i, 1);
1638         if (err) {
1639                 po->mclist = i->next;
1640                 kfree(i);
1641         }
1642
1643 done:
1644         rtnl_unlock();
1645         return err;
1646 }
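/*
 * Example (userspace sketch, not part of this file): joining an Ethernet
 * multicast group, which lands in packet_mc_add() above.  'fd' and
 * 'ifindex' are assumptions; constants come from <linux/if_packet.h>
 * and <linux/if_ether.h>.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_MULTICAST,
 *		.mr_alen    = ETH_ALEN,
 *		.mr_address = { 0x01, 0x00, 0x5e, 0x00, 0x00, 0x01 },
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 *
 * PACKET_MR_PROMISC and PACKET_MR_ALLMULTI use the same call with
 * mr_alen == 0; PACKET_DROP_MEMBERSHIP removes a matching entry.
 */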
1647
1648 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1649 {
1650         struct packet_mclist *ml, **mlp;
1651
1652         rtnl_lock();
1653
1654         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1655                 if (ml->ifindex == mreq->mr_ifindex &&
1656                     ml->type == mreq->mr_type &&
1657                     ml->alen == mreq->mr_alen &&
1658                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1659                         if (--ml->count == 0) {
1660                                 struct net_device *dev;
1661                                 *mlp = ml->next;
1662                                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1663                                 if (dev) {
1664                                         packet_dev_mc(dev, ml, -1);
1665                                         dev_put(dev);
1666                                 }
1667                                 kfree(ml);
1668                         }
1669                         rtnl_unlock();
1670                         return 0;
1671                 }
1672         }
1673         rtnl_unlock();
1674         return -EADDRNOTAVAIL;
1675 }
1676
1677 static void packet_flush_mclist(struct sock *sk)
1678 {
1679         struct packet_sock *po = pkt_sk(sk);
1680         struct packet_mclist *ml;
1681
1682         if (!po->mclist)
1683                 return;
1684
1685         rtnl_lock();
1686         while ((ml = po->mclist) != NULL) {
1687                 struct net_device *dev;
1688
1689                 po->mclist = ml->next;
1690                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1691                 if (dev != NULL) {
1692                         packet_dev_mc(dev, ml, -1);
1693                         dev_put(dev);
1694                 }
1695                 kfree(ml);
1696         }
1697         rtnl_unlock();
1698 }
1699
1700 static int
1701 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
1702 {
1703         struct sock *sk = sock->sk;
1704         struct packet_sock *po = pkt_sk(sk);
1705         int ret;
1706
1707         if (level != SOL_PACKET)
1708                 return -ENOPROTOOPT;
1709
1710         switch (optname) {
1711         case PACKET_ADD_MEMBERSHIP:
1712         case PACKET_DROP_MEMBERSHIP:
1713         {
1714                 struct packet_mreq_max mreq;
1715                 int len = optlen;
1716                 memset(&mreq, 0, sizeof(mreq));
1717                 if (len < sizeof(struct packet_mreq))
1718                         return -EINVAL;
1719                 if (len > sizeof(mreq))
1720                         len = sizeof(mreq);
1721                 if (copy_from_user(&mreq, optval, len))
1722                         return -EFAULT;
1723                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1724                         return -EINVAL;
1725                 if (optname == PACKET_ADD_MEMBERSHIP)
1726                         ret = packet_mc_add(sk, &mreq);
1727                 else
1728                         ret = packet_mc_drop(sk, &mreq);
1729                 return ret;
1730         }
1731
1732 #ifdef CONFIG_PACKET_MMAP
1733         case PACKET_RX_RING:
1734         case PACKET_TX_RING:
1735         {
1736                 struct tpacket_req req;
1737
1738                 if (optlen < sizeof(req))
1739                         return -EINVAL;
1740                 if (copy_from_user(&req, optval, sizeof(req)))
1741                         return -EFAULT;
1742                 return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
1743         }
1744         case PACKET_COPY_THRESH:
1745         {
1746                 int val;
1747
1748                 if (optlen != sizeof(val))
1749                         return -EINVAL;
1750                 if (copy_from_user(&val, optval, sizeof(val)))
1751                         return -EFAULT;
1752
1753                 pkt_sk(sk)->copy_thresh = val;
1754                 return 0;
1755         }
1756         case PACKET_VERSION:
1757         {
1758                 int val;
1759
1760                 if (optlen != sizeof(val))
1761                         return -EINVAL;
1762                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1763                         return -EBUSY;
1764                 if (copy_from_user(&val, optval, sizeof(val)))
1765                         return -EFAULT;
1766                 switch (val) {
1767                 case TPACKET_V1:
1768                 case TPACKET_V2:
1769                         po->tp_version = val;
1770                         return 0;
1771                 default:
1772                         return -EINVAL;
1773                 }
1774         }
1775         case PACKET_RESERVE:
1776         {
1777                 unsigned int val;
1778
1779                 if (optlen != sizeof(val))
1780                         return -EINVAL;
1781                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1782                         return -EBUSY;
1783                 if (copy_from_user(&val, optval, sizeof(val)))
1784                         return -EFAULT;
1785                 po->tp_reserve = val;
1786                 return 0;
1787         }
1788         case PACKET_LOSS:
1789         {
1790                 unsigned int val;
1791
1792                 if (optlen != sizeof(val))
1793                         return -EINVAL;
1794                 if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
1795                         return -EBUSY;
1796                 if (copy_from_user(&val, optval, sizeof(val)))
1797                         return -EFAULT;
1798                 po->tp_loss = !!val;
1799                 return 0;
1800         }
1801 #endif
1802         case PACKET_AUXDATA:
1803         {
1804                 int val;
1805
1806                 if (optlen < sizeof(val))
1807                         return -EINVAL;
1808                 if (copy_from_user(&val, optval, sizeof(val)))
1809                         return -EFAULT;
1810
1811                 po->auxdata = !!val;
1812                 return 0;
1813         }
1814         case PACKET_ORIGDEV:
1815         {
1816                 int val;
1817
1818                 if (optlen < sizeof(val))
1819                         return -EINVAL;
1820                 if (copy_from_user(&val, optval, sizeof(val)))
1821                         return -EFAULT;
1822
1823                 po->origdev = !!val;
1824                 return 0;
1825         }
1826         default:
1827                 return -ENOPROTOOPT;
1828         }
1829 }
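/*
 * Example (userspace sketch, not part of this file): ordering matters for
 * the ring-related options above.  PACKET_VERSION, PACKET_RESERVE and
 * PACKET_LOSS return -EBUSY once a ring exists, so set them before
 * PACKET_RX_RING/PACKET_TX_RING.
 *
 *	int ver = TPACKET_V2;
 *	int on = 1;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &on, sizeof(on));
 *	(then PACKET_RX_RING -- see packet_set_ring() below)
 */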
1830
1831 static int packet_getsockopt(struct socket *sock, int level, int optname,
1832                              char __user *optval, int __user *optlen)
1833 {
1834         int len;
1835         int val;
1836         struct sock *sk = sock->sk;
1837         struct packet_sock *po = pkt_sk(sk);
1838         void *data;
1839         struct tpacket_stats st;
1840
1841         if (level != SOL_PACKET)
1842                 return -ENOPROTOOPT;
1843
1844         if (get_user(len, optlen))
1845                 return -EFAULT;
1846
1847         if (len < 0)
1848                 return -EINVAL;
1849
1850         switch (optname) {
1851         case PACKET_STATISTICS:
1852                 if (len > sizeof(struct tpacket_stats))
1853                         len = sizeof(struct tpacket_stats);
1854                 spin_lock_bh(&sk->sk_receive_queue.lock);
1855                 st = po->stats;
1856                 memset(&po->stats, 0, sizeof(st));
1857                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1858                 st.tp_packets += st.tp_drops;
1859
1860                 data = &st;
1861                 break;
1862         case PACKET_AUXDATA:
1863                 if (len > sizeof(int))
1864                         len = sizeof(int);
1865                 val = po->auxdata;
1866
1867                 data = &val;
1868                 break;
1869         case PACKET_ORIGDEV:
1870                 if (len > sizeof(int))
1871                         len = sizeof(int);
1872                 val = po->origdev;
1873
1874                 data = &val;
1875                 break;
1876 #ifdef CONFIG_PACKET_MMAP
1877         case PACKET_VERSION:
1878                 if (len > sizeof(int))
1879                         len = sizeof(int);
1880                 val = po->tp_version;
1881                 data = &val;
1882                 break;
1883         case PACKET_HDRLEN:
                if (len > sizeof(int))
                        len = sizeof(int);
                if (len < sizeof(int))
                        return -EINVAL;
                if (copy_from_user(&val, optval, len))
                        return -EFAULT;
1888                 switch (val) {
1889                 case TPACKET_V1:
1890                         val = sizeof(struct tpacket_hdr);
1891                         break;
1892                 case TPACKET_V2:
1893                         val = sizeof(struct tpacket2_hdr);
1894                         break;
1895                 default:
1896                         return -EINVAL;
1897                 }
1898                 data = &val;
1899                 break;
1900         case PACKET_RESERVE:
1901                 if (len > sizeof(unsigned int))
1902                         len = sizeof(unsigned int);
1903                 val = po->tp_reserve;
1904                 data = &val;
1905                 break;
1906         case PACKET_LOSS:
1907                 if (len > sizeof(unsigned int))
1908                         len = sizeof(unsigned int);
1909                 val = po->tp_loss;
1910                 data = &val;
1911                 break;
1912 #endif
1913         default:
1914                 return -ENOPROTOOPT;
1915         }
1916
1917         if (put_user(len, optlen))
1918                 return -EFAULT;
1919         if (copy_to_user(optval, data, len))
1920                 return -EFAULT;
1921         return 0;
1922 }
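/*
 * Example (userspace sketch, not part of this file): PACKET_STATISTICS
 * is read-and-reset, and tp_packets comes back as delivered + dropped
 * (note the tp_drops addition above).
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *
 *	if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len) == 0)
 *		printf("%u packets, %u drops\n", st.tp_packets, st.tp_drops);
 *
 * PACKET_HDRLEN is unusual in that it reads its input through optval:
 * the caller passes in a TPACKET_V* value and gets the corresponding
 * header length back.
 */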
1923
1924
1925 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1926 {
1927         struct sock *sk;
1928         struct hlist_node *node;
1929         struct net_device *dev = data;
1930         struct net *net = dev_net(dev);
1931
1932         read_lock(&net->packet.sklist_lock);
1933         sk_for_each(sk, node, &net->packet.sklist) {
1934                 struct packet_sock *po = pkt_sk(sk);
1935
1936                 switch (msg) {
1937                 case NETDEV_UNREGISTER:
1938                         if (po->mclist)
1939                                 packet_dev_mclist(dev, po->mclist, -1);
1940                         /* fallthrough */
1941
1942                 case NETDEV_DOWN:
1943                         if (dev->ifindex == po->ifindex) {
1944                                 spin_lock(&po->bind_lock);
1945                                 if (po->running) {
1946                                         __dev_remove_pack(&po->prot_hook);
1947                                         __sock_put(sk);
1948                                         po->running = 0;
1949                                         sk->sk_err = ENETDOWN;
1950                                         if (!sock_flag(sk, SOCK_DEAD))
1951                                                 sk->sk_error_report(sk);
1952                                 }
1953                                 if (msg == NETDEV_UNREGISTER) {
1954                                         po->ifindex = -1;
1955                                         po->prot_hook.dev = NULL;
1956                                 }
1957                                 spin_unlock(&po->bind_lock);
1958                         }
1959                         break;
1960                 case NETDEV_UP:
1961                         spin_lock(&po->bind_lock);
1962                         if (dev->ifindex == po->ifindex && po->num &&
1963                             !po->running) {
1964                                 dev_add_pack(&po->prot_hook);
1965                                 sock_hold(sk);
1966                                 po->running = 1;
1967                         }
1968                         spin_unlock(&po->bind_lock);
1969                         break;
1970                 }
1971         }
1972         read_unlock(&net->packet.sklist_lock);
1973         return NOTIFY_DONE;
1974 }
1975
1976
1977 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1978                         unsigned long arg)
1979 {
1980         struct sock *sk = sock->sk;
1981
1982         switch (cmd) {
1983         case SIOCOUTQ:
1984         {
1985                 int amount = sk_wmem_alloc_get(sk);
1986
1987                 return put_user(amount, (int __user *)arg);
1988         }
1989         case SIOCINQ:
1990         {
1991                 struct sk_buff *skb;
1992                 int amount = 0;
1993
1994                 spin_lock_bh(&sk->sk_receive_queue.lock);
1995                 skb = skb_peek(&sk->sk_receive_queue);
1996                 if (skb)
1997                         amount = skb->len;
1998                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1999                 return put_user(amount, (int __user *)arg);
2000         }
2001         case SIOCGSTAMP:
2002                 return sock_get_timestamp(sk, (struct timeval __user *)arg);
2003         case SIOCGSTAMPNS:
2004                 return sock_get_timestampns(sk, (struct timespec __user *)arg);
2005
2006 #ifdef CONFIG_INET
2007         case SIOCADDRT:
2008         case SIOCDELRT:
2009         case SIOCDARP:
2010         case SIOCGARP:
2011         case SIOCSARP:
2012         case SIOCGIFADDR:
2013         case SIOCSIFADDR:
2014         case SIOCGIFBRDADDR:
2015         case SIOCSIFBRDADDR:
2016         case SIOCGIFNETMASK:
2017         case SIOCSIFNETMASK:
2018         case SIOCGIFDSTADDR:
2019         case SIOCSIFDSTADDR:
2020         case SIOCSIFFLAGS:
2021                 if (!net_eq(sock_net(sk), &init_net))
2022                         return -ENOIOCTLCMD;
2023                 return inet_dgram_ops.ioctl(sock, cmd, arg);
2024 #endif
2025
2026         default:
2027                 return -ENOIOCTLCMD;
2028         }
2029         return 0;
2030 }
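/*
 * Note: unlike TCP, SIOCINQ here reports the length of the next queued
 * packet (0 if the queue is empty), not the total of all pending bytes.
 * Userspace sketch (assumes an open PF_PACKET socket 'fd'):
 *
 *	int next_len = 0;
 *	ioctl(fd, SIOCINQ, &next_len);
 */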
2031
2032 #ifndef CONFIG_PACKET_MMAP
2033 #define packet_mmap sock_no_mmap
2034 #define packet_poll datagram_poll
2035 #else
2036
2037 static unsigned int packet_poll(struct file *file, struct socket *sock,
2038                                 poll_table *wait)
2039 {
2040         struct sock *sk = sock->sk;
2041         struct packet_sock *po = pkt_sk(sk);
2042         unsigned int mask = datagram_poll(file, sock, wait);
2043
2044         spin_lock_bh(&sk->sk_receive_queue.lock);
2045         if (po->rx_ring.pg_vec) {
2046                 if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
2047                         mask |= POLLIN | POLLRDNORM;
2048         }
2049         spin_unlock_bh(&sk->sk_receive_queue.lock);
2050         spin_lock_bh(&sk->sk_write_queue.lock);
2051         if (po->tx_ring.pg_vec) {
2052                 if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
2053                         mask |= POLLOUT | POLLWRNORM;
2054         }
2055         spin_unlock_bh(&sk->sk_write_queue.lock);
2056         return mask;
2057 }
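/*
 * With rings mapped, poll() reports readiness from the rings themselves:
 * POLLIN when the previous RX frame is no longer owned by the kernel,
 * POLLOUT when the current TX frame is TP_STATUS_AVAILABLE.  Userspace
 * sketch (assumes a V1 RX ring; consume() and next_frame() are
 * hypothetical helpers):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	poll(&pfd, 1, -1);
 *	while (hdr->tp_status & TP_STATUS_USER) {
 *		consume(hdr);
 *		hdr->tp_status = TP_STATUS_KERNEL;
 *		hdr = next_frame();
 *	}
 */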
2058
2059
/* Dirty?  Well, I still have not found a better way to account
 * for user mmaps.
 */
2063
2064 static void packet_mm_open(struct vm_area_struct *vma)
2065 {
2066         struct file *file = vma->vm_file;
2067         struct socket *sock = file->private_data;
2068         struct sock *sk = sock->sk;
2069
2070         if (sk)
2071                 atomic_inc(&pkt_sk(sk)->mapped);
2072 }
2073
2074 static void packet_mm_close(struct vm_area_struct *vma)
2075 {
2076         struct file *file = vma->vm_file;
2077         struct socket *sock = file->private_data;
2078         struct sock *sk = sock->sk;
2079
2080         if (sk)
2081                 atomic_dec(&pkt_sk(sk)->mapped);
2082 }
2083
2084 static const struct vm_operations_struct packet_mmap_ops = {
2085         .open   =       packet_mm_open,
2086         .close  =       packet_mm_close,
2087 };
2088
2089 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
2090 {
2091         int i;
2092
2093         for (i = 0; i < len; i++) {
2094                 if (likely(pg_vec[i]))
2095                         free_pages((unsigned long) pg_vec[i], order);
2096         }
2097         kfree(pg_vec);
2098 }
2099
2100 static inline char *alloc_one_pg_vec_page(unsigned long order)
2101 {
2102         gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;
2103
2104         return (char *) __get_free_pages(gfp_flags, order);
2105 }
2106
2107 static char **alloc_pg_vec(struct tpacket_req *req, int order)
2108 {
2109         unsigned int block_nr = req->tp_block_nr;
2110         char **pg_vec;
2111         int i;
2112
        pg_vec = kcalloc(block_nr, sizeof(char *), GFP_KERNEL);
2114         if (unlikely(!pg_vec))
2115                 goto out;
2116
2117         for (i = 0; i < block_nr; i++) {
2118                 pg_vec[i] = alloc_one_pg_vec_page(order);
2119                 if (unlikely(!pg_vec[i]))
2120                         goto out_free_pgvec;
2121         }
2122
2123 out:
2124         return pg_vec;
2125
2126 out_free_pgvec:
2127         free_pg_vec(pg_vec, order, block_nr);
2128         pg_vec = NULL;
2129         goto out;
2130 }
2131
2132 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
2133                 int closing, int tx_ring)
2134 {
2135         char **pg_vec = NULL;
2136         struct packet_sock *po = pkt_sk(sk);
2137         int was_running, order = 0;
2138         struct packet_ring_buffer *rb;
2139         struct sk_buff_head *rb_queue;
2140         __be16 num;
2141         int err;
2142
2143         rb = tx_ring ? &po->tx_ring : &po->rx_ring;
2144         rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
2145
2146         err = -EBUSY;
2147         if (!closing) {
2148                 if (atomic_read(&po->mapped))
2149                         goto out;
2150                 if (atomic_read(&rb->pending))
2151                         goto out;
2152         }
2153
2154         if (req->tp_block_nr) {
2155                 /* Sanity tests and some calculations */
2156                 err = -EBUSY;
2157                 if (unlikely(rb->pg_vec))
2158                         goto out;
2159
2160                 switch (po->tp_version) {
2161                 case TPACKET_V1:
2162                         po->tp_hdrlen = TPACKET_HDRLEN;
2163                         break;
2164                 case TPACKET_V2:
2165                         po->tp_hdrlen = TPACKET2_HDRLEN;
2166                         break;
2167                 }
2168
2169                 err = -EINVAL;
2170                 if (unlikely((int)req->tp_block_size <= 0))
2171                         goto out;
2172                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
2173                         goto out;
2174                 if (unlikely(req->tp_frame_size < po->tp_hdrlen +
2175                                         po->tp_reserve))
2176                         goto out;
2177                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
2178                         goto out;
2179
                rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
2181                 if (unlikely(rb->frames_per_block <= 0))
2182                         goto out;
2183                 if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
2184                                         req->tp_frame_nr))
2185                         goto out;
2186
2187                 err = -ENOMEM;
2188                 order = get_order(req->tp_block_size);
2189                 pg_vec = alloc_pg_vec(req, order);
2190                 if (unlikely(!pg_vec))
2191                         goto out;
        } else {
                /* tp_block_nr == 0 is a teardown request */
2195                 err = -EINVAL;
2196                 if (unlikely(req->tp_frame_nr))
2197                         goto out;
2198         }
2199
2200         lock_sock(sk);
2201
2202         /* Detach socket from network */
2203         spin_lock(&po->bind_lock);
2204         was_running = po->running;
2205         num = po->num;
2206         if (was_running) {
2207                 __dev_remove_pack(&po->prot_hook);
2208                 po->num = 0;
2209                 po->running = 0;
2210                 __sock_put(sk);
2211         }
2212         spin_unlock(&po->bind_lock);
2213
2214         synchronize_net();
2215
2216         err = -EBUSY;
2217         mutex_lock(&po->pg_vec_lock);
2218         if (closing || atomic_read(&po->mapped) == 0) {
2219                 err = 0;
2220 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
2221                 spin_lock_bh(&rb_queue->lock);
2222                 pg_vec = XC(rb->pg_vec, pg_vec);
2223                 rb->frame_max = (req->tp_frame_nr - 1);
2224                 rb->head = 0;
2225                 rb->frame_size = req->tp_frame_size;
2226                 spin_unlock_bh(&rb_queue->lock);
2227
2228                 order = XC(rb->pg_vec_order, order);
2229                 req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);
2230
                rb->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
2232                 po->prot_hook.func = (po->rx_ring.pg_vec) ?
2233                                                 tpacket_rcv : packet_rcv;
2234                 skb_queue_purge(rb_queue);
2235 #undef XC
2236                 if (atomic_read(&po->mapped))
2237                         pr_err("packet_mmap: vma is busy: %d\n",
2238                                atomic_read(&po->mapped));
2239         }
2240         mutex_unlock(&po->pg_vec_lock);
2241
2242         spin_lock(&po->bind_lock);
2243         if (was_running && !po->running) {
2244                 sock_hold(sk);
2245                 po->running = 1;
2246                 po->num = num;
2247                 dev_add_pack(&po->prot_hook);
2248         }
2249         spin_unlock(&po->bind_lock);
2250
2251         release_sock(sk);
2252
2253         if (pg_vec)
2254                 free_pg_vec(pg_vec, order, req->tp_block_nr);
2255 out:
2256         return err;
2257 }
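/*
 * Example (userspace sketch, not part of this file): a tpacket_req that
 * satisfies the sanity tests above -- block size a multiple of PAGE_SIZE,
 * frame size TPACKET_ALIGNMENT-aligned and at least tp_hdrlen plus
 * tp_reserve, and tp_frame_nr equal to frames-per-block * tp_block_nr.
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,		(PAGE_SIZE on most arches)
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 64 * (4096 / 2048),
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *
 * Passing an all-zero tpacket_req releases an existing ring, provided it
 * is no longer mmap()ed.
 */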
2258
2259 static int packet_mmap(struct file *file, struct socket *sock,
2260                 struct vm_area_struct *vma)
2261 {
2262         struct sock *sk = sock->sk;
2263         struct packet_sock *po = pkt_sk(sk);
2264         unsigned long size, expected_size;
2265         struct packet_ring_buffer *rb;
2266         unsigned long start;
2267         int err = -EINVAL;
2268         int i;
2269
2270         if (vma->vm_pgoff)
2271                 return -EINVAL;
2272
2273         mutex_lock(&po->pg_vec_lock);
2274
2275         expected_size = 0;
2276         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2277                 if (rb->pg_vec) {
2278                         expected_size += rb->pg_vec_len
2279                                                 * rb->pg_vec_pages
2280                                                 * PAGE_SIZE;
2281                 }
2282         }
2283
2284         if (expected_size == 0)
2285                 goto out;
2286
2287         size = vma->vm_end - vma->vm_start;
2288         if (size != expected_size)
2289                 goto out;
2290
2291         start = vma->vm_start;
2292         for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
2293                 if (rb->pg_vec == NULL)
2294                         continue;
2295
2296                 for (i = 0; i < rb->pg_vec_len; i++) {
2297                         struct page *page = virt_to_page(rb->pg_vec[i]);
2298                         int pg_num;
2299
2300                         for (pg_num = 0; pg_num < rb->pg_vec_pages;
2301                                         pg_num++, page++) {
2302                                 err = vm_insert_page(vma, start, page);
2303                                 if (unlikely(err))
2304                                         goto out;
2305                                 start += PAGE_SIZE;
2306                         }
2307                 }
2308         }
2309
2310         atomic_inc(&po->mapped);
2311         vma->vm_ops = &packet_mmap_ops;
2312         err = 0;
2313
2314 out:
2315         mutex_unlock(&po->pg_vec_lock);
2316         return err;
2317 }
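/*
 * Example (userspace sketch, not part of this file): mapping the ring set
 * up through packet_set_ring().  vm_pgoff must be 0 and the length must
 * match the combined size of all configured rings, per the checks above.
 * Assuming only an RX ring described by 'req':
 *
 *	size_t len = (size_t)req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 * When both rings exist, the RX ring occupies the first part of the
 * mapping and the TX ring follows immediately after it.
 */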
2318 #endif
2319
2320
2321 static const struct proto_ops packet_ops_spkt = {
2322         .family =       PF_PACKET,
2323         .owner =        THIS_MODULE,
2324         .release =      packet_release,
2325         .bind =         packet_bind_spkt,
2326         .connect =      sock_no_connect,
2327         .socketpair =   sock_no_socketpair,
2328         .accept =       sock_no_accept,
2329         .getname =      packet_getname_spkt,
2330         .poll =         datagram_poll,
2331         .ioctl =        packet_ioctl,
2332         .listen =       sock_no_listen,
2333         .shutdown =     sock_no_shutdown,
2334         .setsockopt =   sock_no_setsockopt,
2335         .getsockopt =   sock_no_getsockopt,
2336         .sendmsg =      packet_sendmsg_spkt,
2337         .recvmsg =      packet_recvmsg,
2338         .mmap =         sock_no_mmap,
2339         .sendpage =     sock_no_sendpage,
2340 };
2341
2342 static const struct proto_ops packet_ops = {
2343         .family =       PF_PACKET,
2344         .owner =        THIS_MODULE,
2345         .release =      packet_release,
2346         .bind =         packet_bind,
2347         .connect =      sock_no_connect,
2348         .socketpair =   sock_no_socketpair,
2349         .accept =       sock_no_accept,
2350         .getname =      packet_getname,
2351         .poll =         packet_poll,
2352         .ioctl =        packet_ioctl,
2353         .listen =       sock_no_listen,
2354         .shutdown =     sock_no_shutdown,
2355         .setsockopt =   packet_setsockopt,
2356         .getsockopt =   packet_getsockopt,
2357         .sendmsg =      packet_sendmsg,
2358         .recvmsg =      packet_recvmsg,
2359         .mmap =         packet_mmap,
2360         .sendpage =     sock_no_sendpage,
2361 };
2362
2363 static struct net_proto_family packet_family_ops = {
2364         .family =       PF_PACKET,
2365         .create =       packet_create,
2366         .owner  =       THIS_MODULE,
2367 };
2368
2369 static struct notifier_block packet_netdev_notifier = {
2370         .notifier_call =        packet_notifier,
2371 };
2372
2373 #ifdef CONFIG_PROC_FS
2374 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
2375 {
2376         struct sock *s;
2377         struct hlist_node *node;
2378
2379         sk_for_each(s, node, &net->packet.sklist) {
2380                 if (!off--)
2381                         return s;
2382         }
2383         return NULL;
2384 }
2385
2386 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
2387         __acquires(seq_file_net(seq)->packet.sklist_lock)
2388 {
2389         struct net *net = seq_file_net(seq);
2390         read_lock(&net->packet.sklist_lock);
2391         return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
2392 }
2393
2394 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2395 {
2396         struct net *net = seq_file_net(seq);
2397         ++*pos;
        return (v == SEQ_START_TOKEN)
                ? sk_head(&net->packet.sklist)
                : sk_next((struct sock *)v);
2401 }
2402
2403 static void packet_seq_stop(struct seq_file *seq, void *v)
2404         __releases(seq_file_net(seq)->packet.sklist_lock)
2405 {
2406         struct net *net = seq_file_net(seq);
2407         read_unlock(&net->packet.sklist_lock);
2408 }
2409
2410 static int packet_seq_show(struct seq_file *seq, void *v)
2411 {
2412         if (v == SEQ_START_TOKEN)
2413                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2414         else {
2415                 struct sock *s = v;
2416                 const struct packet_sock *po = pkt_sk(s);
2417
2418                 seq_printf(seq,
2419                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2420                            s,
2421                            atomic_read(&s->sk_refcnt),
2422                            s->sk_type,
2423                            ntohs(po->num),
2424                            po->ifindex,
2425                            po->running,
2426                            atomic_read(&s->sk_rmem_alloc),
2427                            sock_i_uid(s),
2428                            sock_i_ino(s));
2429         }
2430
2431         return 0;
2432 }
2433
2434 static const struct seq_operations packet_seq_ops = {
2435         .start  = packet_seq_start,
2436         .next   = packet_seq_next,
2437         .stop   = packet_seq_stop,
2438         .show   = packet_seq_show,
2439 };
2440
2441 static int packet_seq_open(struct inode *inode, struct file *file)
2442 {
2443         return seq_open_net(inode, file, &packet_seq_ops,
2444                             sizeof(struct seq_net_private));
2445 }
2446
2447 static const struct file_operations packet_seq_fops = {
2448         .owner          = THIS_MODULE,
2449         .open           = packet_seq_open,
2450         .read           = seq_read,
2451         .llseek         = seq_lseek,
2452         .release        = seq_release_net,
2453 };
2454
2455 #endif
2456
2457 static int packet_net_init(struct net *net)
2458 {
2459         rwlock_init(&net->packet.sklist_lock);
2460         INIT_HLIST_HEAD(&net->packet.sklist);
2461
2462         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2463                 return -ENOMEM;
2464
2465         return 0;
2466 }
2467
2468 static void packet_net_exit(struct net *net)
2469 {
2470         proc_net_remove(net, "packet");
2471 }
2472
2473 static struct pernet_operations packet_net_ops = {
2474         .init = packet_net_init,
2475         .exit = packet_net_exit,
2476 };
2477
2478
2479 static void __exit packet_exit(void)
2480 {
2481         unregister_netdevice_notifier(&packet_netdev_notifier);
2482         unregister_pernet_subsys(&packet_net_ops);
2483         sock_unregister(PF_PACKET);
2484         proto_unregister(&packet_proto);
2485 }
2486
static int __init packet_init(void)
{
        int rc = proto_register(&packet_proto, 0);

        if (rc != 0)
                return rc;

        rc = sock_register(&packet_family_ops);
        if (rc != 0)
                goto out_proto;
        rc = register_pernet_subsys(&packet_net_ops);
        if (rc != 0)
                goto out_sock;
        rc = register_netdevice_notifier(&packet_netdev_notifier);
        if (rc != 0)
                goto out_pernet;
        return 0;

out_pernet:
        unregister_pernet_subsys(&packet_net_ops);
out_sock:
        sock_unregister(PF_PACKET);
out_proto:
        proto_unregister(&packet_proto);
        return rc;
}
2500
2501 module_init(packet_init);
2502 module_exit(packet_exit);
2503 MODULE_LICENSE("GPL");
2504 MODULE_ALIAS_NETPROTO(PF_PACKET);