Use sk_mark for routing lookup in more places
net/packet/af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *              Johann Baudy    :       Added TX RING.
43  *
44  *              This program is free software; you can redistribute it and/or
45  *              modify it under the terms of the GNU General Public License
46  *              as published by the Free Software Foundation; either version
47  *              2 of the License, or (at your option) any later version.
48  *
49  */
50
51 #include <linux/types.h>
52 #include <linux/mm.h>
53 #include <linux/capability.h>
54 #include <linux/fcntl.h>
55 #include <linux/socket.h>
56 #include <linux/in.h>
57 #include <linux/inet.h>
58 #include <linux/netdevice.h>
59 #include <linux/if_packet.h>
60 #include <linux/wireless.h>
61 #include <linux/kernel.h>
62 #include <linux/kmod.h>
63 #include <net/net_namespace.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81 #include <linux/mutex.h>
82
83 #ifdef CONFIG_INET
84 #include <net/inet_common.h>
85 #endif
86
87 /*
88    Assumptions:
89    - if a device has no dev->hard_header routine, it adds and removes the
90      ll header inside itself. In this case the ll header is invisible
91      outside the device, but higher levels should still reserve
92      dev->hard_header_len.
93      Some devices are clever enough to reallocate the skb when the header
94      will not fit in the reserved space (tunnels); others are not (PPP).
95    - a packet socket receives packets with the ll header pulled,
96      so SOCK_RAW should push it back.
97
98 On receive:
99 -----------
100
101 Incoming, dev->hard_header!=NULL
102    mac_header -> ll header
103    data       -> data
104
105 Outgoing, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> ll header
108
109 Incoming, dev->hard_header==NULL
110    mac_header -> UNKNOWN position. It is very likely that it points to the
111                  ll header. PPP does this, which is wrong, because it
112                  introduces asymmetry between the rx and tx paths.
113    data       -> data
114
115 Outgoing, dev->hard_header==NULL
116    mac_header -> data. ll header is still not built!
117    data       -> data
118
119 Summary
120   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
121
122
123 On transmit:
124 ------------
125
126 dev->hard_header != NULL
127    mac_header -> ll header
128    data       -> ll header
129
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
131    mac_header -> data
132    data       -> data
133
134    We should set nh.raw on output to the correct position;
135    the packet classifier depends on it.
136  */
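/*
 * Illustrative user-space sketch (not part of the original file): the
 * layout rules above surface through the socket type. With SOCK_RAW the
 * ll header is visible to, and supplied by, the caller; with SOCK_DGRAM
 * the kernel strips it on receive and builds it on transmit.
 *
 *     int raw = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *     int dgm = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 */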
137
138 /* Private packet socket structures. */
139
140 struct packet_mclist {
141         struct packet_mclist    *next;
142         int                     ifindex;
143         int                     count;
144         unsigned short          type;
145         unsigned short          alen;
146         unsigned char           addr[MAX_ADDR_LEN];
147 };
148 /* identical to struct packet_mreq except it has
149  * a longer address field.
150  */
151 struct packet_mreq_max {
152         int             mr_ifindex;
153         unsigned short  mr_type;
154         unsigned short  mr_alen;
155         unsigned char   mr_address[MAX_ADDR_LEN];
156 };
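/*
 * Hypothetical user-space counterpart (a sketch, not from this file):
 * memberships are configured with setsockopt() and the plain struct
 * packet_mreq; packet_mreq_max above is the kernel-side superset that
 * also accepts the longer hardware addresses described in the header
 * comment. `fd` and `ifindex` are assumed.
 *
 *     struct packet_mreq mr = {
 *             .mr_ifindex = ifindex,
 *             .mr_type    = PACKET_MR_PROMISC,
 *     };
 *     setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mr, sizeof(mr));
 */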
157
158 #ifdef CONFIG_PACKET_MMAP
159 static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
160                 int closing, int tx_ring);
161
162 struct packet_ring_buffer {
163         char                    **pg_vec;
164         unsigned int            head;
165         unsigned int            frames_per_block;
166         unsigned int            frame_size;
167         unsigned int            frame_max;
168
169         unsigned int            pg_vec_order;
170         unsigned int            pg_vec_pages;
171         unsigned int            pg_vec_len;
172
173         atomic_t                pending;
174 };
175
176 struct packet_sock;
177 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg);
178 #endif
179
180 static void packet_flush_mclist(struct sock *sk);
181
182 struct packet_sock {
183         /* struct sock has to be the first member of packet_sock */
184         struct sock             sk;
185         struct tpacket_stats    stats;
186 #ifdef CONFIG_PACKET_MMAP
187         struct packet_ring_buffer       rx_ring;
188         struct packet_ring_buffer       tx_ring;
189         int                     copy_thresh;
190 #endif
191         struct packet_type      prot_hook;
192         spinlock_t              bind_lock;
193         struct mutex            pg_vec_lock;
194         unsigned int            running:1,      /* prot_hook is attached*/
195                                 auxdata:1,
196                                 origdev:1;
197         int                     ifindex;        /* bound device         */
198         __be16                  num;
199         struct packet_mclist    *mclist;
200 #ifdef CONFIG_PACKET_MMAP
201         atomic_t                mapped;
202         enum tpacket_versions   tp_version;
203         unsigned int            tp_hdrlen;
204         unsigned int            tp_reserve;
205         unsigned int            tp_loss:1;
206 #endif
207 };
208
209 struct packet_skb_cb {
210         unsigned int origlen;
211         union {
212                 struct sockaddr_pkt pkt;
213                 struct sockaddr_ll ll;
214         } sa;
215 };
216
217 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
218
219 #ifdef CONFIG_PACKET_MMAP
220
221 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
222 {
223         union {
224                 struct tpacket_hdr *h1;
225                 struct tpacket2_hdr *h2;
226                 void *raw;
227         } h;
228
229         h.raw = frame;
230         switch (po->tp_version) {
231         case TPACKET_V1:
232                 h.h1->tp_status = status;
233                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
234                 break;
235         case TPACKET_V2:
236                 h.h2->tp_status = status;
237                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
238                 break;
239         default:
240                 pr_err("TPACKET version not supported\n");
241                 BUG();
242         }
243
244         smp_wmb();
245 }
246
247 static int __packet_get_status(struct packet_sock *po, void *frame)
248 {
249         union {
250                 struct tpacket_hdr *h1;
251                 struct tpacket2_hdr *h2;
252                 void *raw;
253         } h;
254
255         smp_rmb();
256
257         h.raw = frame;
258         switch (po->tp_version) {
259         case TPACKET_V1:
260                 flush_dcache_page(virt_to_page(&h.h1->tp_status));
261                 return h.h1->tp_status;
262         case TPACKET_V2:
263                 flush_dcache_page(virt_to_page(&h.h2->tp_status));
264                 return h.h2->tp_status;
265         default:
266                 pr_err("TPACKET version not supported\n");
267                 BUG();
268                 return 0;
269         }
270 }
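/*
 * Sketch of the matching user-space side of this status handshake
 * (assumptions: TPACKET_V1, `hdr` pointing into the mmap'ed RX ring,
 * `pfd` a struct pollfd for the socket). The kernel owns a frame until
 * it sets TP_STATUS_USER; writing TP_STATUS_KERNEL hands the slot back:
 *
 *     while (!(hdr->tp_status & TP_STATUS_USER))
 *             poll(&pfd, 1, -1);
 *     // ... consume the frame ...
 *     hdr->tp_status = TP_STATUS_KERNEL;
 *
 * The smp_rmb()/smp_wmb() pairs above order the status word against the
 * frame contents on the kernel side of this protocol.
 */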
271
272 static void *packet_lookup_frame(struct packet_sock *po,
273                 struct packet_ring_buffer *rb,
274                 unsigned int position,
275                 int status)
276 {
277         unsigned int pg_vec_pos, frame_offset;
278         union {
279                 struct tpacket_hdr *h1;
280                 struct tpacket2_hdr *h2;
281                 void *raw;
282         } h;
283
284         pg_vec_pos = position / rb->frames_per_block;
285         frame_offset = position % rb->frames_per_block;
286
287         h.raw = rb->pg_vec[pg_vec_pos] + (frame_offset * rb->frame_size);
288
289         if (status != __packet_get_status(po, h.raw))
290                 return NULL;
291
292         return h.raw;
293 }
294
295 static inline void *packet_current_frame(struct packet_sock *po,
296                 struct packet_ring_buffer *rb,
297                 int status)
298 {
299         return packet_lookup_frame(po, rb, rb->head, status);
300 }
301
302 static inline void *packet_previous_frame(struct packet_sock *po,
303                 struct packet_ring_buffer *rb,
304                 int status)
305 {
306         unsigned int previous = rb->head ? rb->head - 1 : rb->frame_max;
307         return packet_lookup_frame(po, rb, previous, status);
308 }
309
310 static inline void packet_increment_head(struct packet_ring_buffer *buff)
311 {
312         buff->head = buff->head != buff->frame_max ? buff->head+1 : 0;
313 }
314
315 #endif
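/*
 * A minimal RX ring setup from user space, as a sketch (example sizes,
 * error handling omitted, `fd` assumed). tp_frame_nr must equal
 * frames_per_block * tp_block_nr, which the index arithmetic in
 * packet_lookup_frame() relies on:
 *
 *     struct tpacket_req req = {
 *             .tp_block_size = 4096,
 *             .tp_block_nr   = 64,
 *             .tp_frame_size = 2048,
 *             .tp_frame_nr   = (4096 / 2048) * 64,
 *     };
 *     setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *     void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *                       PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *
 * Frame i then lives in block i / frames_per_block at offset
 * (i % frames_per_block) * tp_frame_size, mirroring the lookup above.
 */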
316
317 static inline struct packet_sock *pkt_sk(struct sock *sk)
318 {
319         return (struct packet_sock *)sk;
320 }
321
322 static void packet_sock_destruct(struct sock *sk)
323 {
324         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
325         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
326
327         if (!sock_flag(sk, SOCK_DEAD)) {
328                 pr_err("Attempt to release alive packet socket: %p\n", sk);
329                 return;
330         }
331
332         sk_refcnt_debug_dec(sk);
333 }
334
335
336 static const struct proto_ops packet_ops;
337
338 static const struct proto_ops packet_ops_spkt;
339
340 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,
341                            struct packet_type *pt, struct net_device *orig_dev)
342 {
343         struct sock *sk;
344         struct sockaddr_pkt *spkt;
345
346         /*
347          *      When we registered the protocol we saved the socket in the data
348          *      field for just this event.
349          */
350
351         sk = pt->af_packet_priv;
352
353         /*
354          *      Yank back the headers [hope the device set this
355          *      right or kerboom...]
356          *
357          *      Incoming packets have ll header pulled,
358          *      push it back.
359          *
360          *      For outgoing ones skb->data == skb_mac_header(skb)
361          *      so that this procedure is noop.
362          */
363
364         if (skb->pkt_type == PACKET_LOOPBACK)
365                 goto out;
366
367         if (dev_net(dev) != sock_net(sk))
368                 goto out;
369
370         skb = skb_share_check(skb, GFP_ATOMIC);
371         if (skb == NULL)
372                 goto oom;
373
374         /* drop any routing info */
375         skb_dst_drop(skb);
376
377         /* drop conntrack reference */
378         nf_reset(skb);
379
380         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
381
382         skb_push(skb, skb->data - skb_mac_header(skb));
383
384         /*
385          *      The SOCK_PACKET socket receives _all_ frames.
386          */
387
388         spkt->spkt_family = dev->type;
389         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
390         spkt->spkt_protocol = skb->protocol;
391
392         /*
393          *      Charge the memory to the socket. This is done specifically
394          *      to prevent a socket from using up all the memory.
395          */
396
397         if (sock_queue_rcv_skb(sk, skb) == 0)
398                 return 0;
399
400 out:
401         kfree_skb(skb);
402 oom:
403         return 0;
404 }
405
406
407 /*
408  *      Output a raw packet to a device layer. This bypasses all the other
409  *      protocol layers and you must therefore supply it with a complete frame
410  */
411
412 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
413                                struct msghdr *msg, size_t len)
414 {
415         struct sock *sk = sock->sk;
416         struct sockaddr_pkt *saddr = (struct sockaddr_pkt *)msg->msg_name;
417         struct sk_buff *skb;
418         struct net_device *dev;
419         __be16 proto = 0;
420         int err;
421
422         /*
423          *      Get and verify the address.
424          */
425
426         if (saddr) {
427                 if (msg->msg_namelen < sizeof(struct sockaddr))
428                         return -EINVAL;
429                 if (msg->msg_namelen == sizeof(struct sockaddr_pkt))
430                         proto = saddr->spkt_protocol;
431         } else
432                 return -ENOTCONN;       /* SOCK_PACKET must be sent giving an address */
433
434         /*
435          *      Find the device first to size check it
436          */
437
438         saddr->spkt_device[13] = 0;
439         dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
440         err = -ENODEV;
441         if (dev == NULL)
442                 goto out_unlock;
443
444         err = -ENETDOWN;
445         if (!(dev->flags & IFF_UP))
446                 goto out_unlock;
447
448         /*
449          * You may not queue a frame bigger than the mtu. This is the lowest level
450          * raw protocol and you must do your own fragmentation at this level.
451          */
452
453         err = -EMSGSIZE;
454         if (len > dev->mtu + dev->hard_header_len)
455                 goto out_unlock;
456
457         err = -ENOBUFS;
458         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
459
460         /*
461          * If the write buffer is full, then tough. At this level the user
462          * gets to deal with the problem - do your own algorithmic backoffs.
463          * That's far more flexible.
464          */
465
466         if (skb == NULL)
467                 goto out_unlock;
468
469         /*
470          *      Fill it in
471          */
472
473         /* FIXME: Save some space for broken drivers that write a
474          * hard header at transmission time by themselves. PPP is the
475          * notable one here. This should really be fixed at the driver level.
476          */
477         skb_reserve(skb, LL_RESERVED_SPACE(dev));
478         skb_reset_network_header(skb);
479
480         /* Try to align data part correctly */
481         if (dev->header_ops) {
482                 skb->data -= dev->hard_header_len;
483                 skb->tail -= dev->hard_header_len;
484                 if (len < dev->hard_header_len)
485                         skb_reset_network_header(skb);
486         }
487
488         /* Returns -EFAULT on error */
489         err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
490         skb->protocol = proto;
491         skb->dev = dev;
492         skb->priority = sk->sk_priority;
493         skb->mark = sk->sk_mark;
494         if (err)
495                 goto out_free;
496
497         /*
498          *      Now send it
499          */
500
501         dev_queue_xmit(skb);
502         dev_put(dev);
503         return len;
504
505 out_free:
506         kfree_skb(skb);
507 out_unlock:
508         if (dev)
509                 dev_put(dev);
510         return err;
511 }
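/*
 * Hypothetical caller of the above (legacy SOCK_PACKET): the address
 * names the device and the buffer must already hold a complete frame.
 * "eth0", `fd`, `frame` and `frame_len` are assumptions.
 *
 *     struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *     strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *     sendto(fd, frame, frame_len, 0,
 *            (struct sockaddr *)&spkt, sizeof(spkt));
 */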
512
513 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
514                                       unsigned int res)
515 {
516         struct sk_filter *filter;
517
518         rcu_read_lock_bh();
519         filter = rcu_dereference(sk->sk_filter);
520         if (filter != NULL)
521                 res = sk_run_filter(skb, filter->insns, filter->len);
522         rcu_read_unlock_bh();
523
524         return res;
525 }
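/*
 * The filters consulted here are classic BPF programs installed with
 * SO_ATTACH_FILTER; the program's return value becomes `res` above and
 * caps the snap length. A one-instruction sketch that snaps every
 * packet to 96 bytes (`fd` assumed):
 *
 *     struct sock_filter code[] = {
 *             { BPF_RET | BPF_K, 0, 0, 96 },
 *     };
 *     struct sock_fprog prog = { .len = 1, .filter = code };
 *     setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */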
526
527 /*
528  * If we've lost frames since the last time we queued one to the
529  * sk_receive_queue, we need to record it here.
530  * This must be called under the protection of the socket lock
531  * to prevent racing with other softirqs and user space
532  */
533 static inline void record_packet_gap(struct sk_buff *skb,
534                                         struct packet_sock *po)
535 {
536         /*
537          * We overload the mark field here, since we're about
538          * to enqueue to a receive queue and nobody else will
539          * use this field at this point.
540          */
541         skb->mark = po->stats.tp_gap;
542         po->stats.tp_gap = 0;
545 }
546
547 static inline __u32 check_packet_gap(struct sk_buff *skb)
548 {
549         return skb->mark;
550 }
551
552 /*
553    This function does lazy skb cloning in the hope that most packets
554    are discarded by BPF.
555
556    Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
557    and skb->cb are mangled. It works because (and until) packets
558    falling here are owned by the current CPU. Output packets are cloned
559    by dev_queue_xmit_nit(), input packets are processed by net_bh
560    sequentially, so if we return the skb to its original state on exit,
561    we will not harm anyone.
562  */
563
564 static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
565                       struct packet_type *pt, struct net_device *orig_dev)
566 {
567         struct sock *sk;
568         struct sockaddr_ll *sll;
569         struct packet_sock *po;
570         u8 *skb_head = skb->data;
571         int skb_len = skb->len;
572         unsigned int snaplen, res;
573
574         if (skb->pkt_type == PACKET_LOOPBACK)
575                 goto drop;
576
577         sk = pt->af_packet_priv;
578         po = pkt_sk(sk);
579
580         if (dev_net(dev) != sock_net(sk))
581                 goto drop;
582
583         skb->dev = dev;
584
585         if (dev->header_ops) {
586                 /* The device has an explicit notion of an ll header,
587                    exported to higher levels.
588
589                    Otherwise, the device hides the details of its frame
590                    structure, so the corresponding packet header is
591                    never delivered to the user.
592                  */
593                 if (sk->sk_type != SOCK_DGRAM)
594                         skb_push(skb, skb->data - skb_mac_header(skb));
595                 else if (skb->pkt_type == PACKET_OUTGOING) {
596                         /* Special case: outgoing packets have ll header at head */
597                         skb_pull(skb, skb_network_offset(skb));
598                 }
599         }
600
601         snaplen = skb->len;
602
603         res = run_filter(skb, sk, snaplen);
604         if (!res)
605                 goto drop_n_restore;
606         if (snaplen > res)
607                 snaplen = res;
608
609         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
610             (unsigned)sk->sk_rcvbuf)
611                 goto drop_n_acct;
612
613         if (skb_shared(skb)) {
614                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
615                 if (nskb == NULL)
616                         goto drop_n_acct;
617
618                 if (skb_head != skb->data) {
619                         skb->data = skb_head;
620                         skb->len = skb_len;
621                 }
622                 kfree_skb(skb);
623                 skb = nskb;
624         }
625
626         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
627                      sizeof(skb->cb));
628
629         sll = &PACKET_SKB_CB(skb)->sa.ll;
630         sll->sll_family = AF_PACKET;
631         sll->sll_hatype = dev->type;
632         sll->sll_protocol = skb->protocol;
633         sll->sll_pkttype = skb->pkt_type;
634         if (unlikely(po->origdev))
635                 sll->sll_ifindex = orig_dev->ifindex;
636         else
637                 sll->sll_ifindex = dev->ifindex;
638
639         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
640
641         PACKET_SKB_CB(skb)->origlen = skb->len;
642
643         if (pskb_trim(skb, snaplen))
644                 goto drop_n_acct;
645
646         skb_set_owner_r(skb, sk);
647         skb->dev = NULL;
648         skb_dst_drop(skb);
649
650         /* drop conntrack reference */
651         nf_reset(skb);
652
653         spin_lock(&sk->sk_receive_queue.lock);
654         po->stats.tp_packets++;
655         record_packet_gap(skb, po);
656         __skb_queue_tail(&sk->sk_receive_queue, skb);
657         spin_unlock(&sk->sk_receive_queue.lock);
658         sk->sk_data_ready(sk, skb->len);
659         return 0;
660
661 drop_n_acct:
662         spin_lock(&sk->sk_receive_queue.lock);
663         po->stats.tp_drops++;
664         po->stats.tp_gap++;
665         spin_unlock(&sk->sk_receive_queue.lock);
666
667 drop_n_restore:
668         if (skb_head != skb->data && skb_shared(skb)) {
669                 skb->data = skb_head;
670                 skb->len = skb_len;
671         }
672 drop:
673         consume_skb(skb);
674         return 0;
675 }
676
677 #ifdef CONFIG_PACKET_MMAP
678 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
679                        struct packet_type *pt, struct net_device *orig_dev)
680 {
681         struct sock *sk;
682         struct packet_sock *po;
683         struct sockaddr_ll *sll;
684         union {
685                 struct tpacket_hdr *h1;
686                 struct tpacket2_hdr *h2;
687                 void *raw;
688         } h;
689         u8 *skb_head = skb->data;
690         int skb_len = skb->len;
691         unsigned int snaplen, res;
692         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
693         unsigned short macoff, netoff, hdrlen;
694         struct sk_buff *copy_skb = NULL;
695         struct timeval tv;
696         struct timespec ts;
697
698         if (skb->pkt_type == PACKET_LOOPBACK)
699                 goto drop;
700
701         sk = pt->af_packet_priv;
702         po = pkt_sk(sk);
703
704         if (dev_net(dev) != sock_net(sk))
705                 goto drop;
706
707         if (dev->header_ops) {
708                 if (sk->sk_type != SOCK_DGRAM)
709                         skb_push(skb, skb->data - skb_mac_header(skb));
710                 else if (skb->pkt_type == PACKET_OUTGOING) {
711                         /* Special case: outgoing packets have ll header at head */
712                         skb_pull(skb, skb_network_offset(skb));
713                 }
714         }
715
716         if (skb->ip_summed == CHECKSUM_PARTIAL)
717                 status |= TP_STATUS_CSUMNOTREADY;
718
719         snaplen = skb->len;
720
721         res = run_filter(skb, sk, snaplen);
722         if (!res)
723                 goto drop_n_restore;
724         if (snaplen > res)
725                 snaplen = res;
726
727         if (sk->sk_type == SOCK_DGRAM) {
728                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
729                                   po->tp_reserve;
730         } else {
731                 unsigned maclen = skb_network_offset(skb);
732                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
733                                        (maclen < 16 ? 16 : maclen)) +
734                         po->tp_reserve;
735                 macoff = netoff - maclen;
736         }
737
738         if (macoff + snaplen > po->rx_ring.frame_size) {
739                 if (po->copy_thresh &&
740                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
741                     (unsigned)sk->sk_rcvbuf) {
742                         if (skb_shared(skb)) {
743                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
744                         } else {
745                                 copy_skb = skb_get(skb);
746                                 skb_head = skb->data;
747                         }
748                         if (copy_skb)
749                                 skb_set_owner_r(copy_skb, sk);
750                 }
751                 snaplen = po->rx_ring.frame_size - macoff;
752                 if ((int)snaplen < 0)
753                         snaplen = 0;
754         }
755
756         spin_lock(&sk->sk_receive_queue.lock);
757         h.raw = packet_current_frame(po, &po->rx_ring, TP_STATUS_KERNEL);
758         if (!h.raw)
759                 goto ring_is_full;
760         packet_increment_head(&po->rx_ring);
761         po->stats.tp_packets++;
762         if (copy_skb) {
763                 status |= TP_STATUS_COPY;
764                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
765         }
766         if (!po->stats.tp_drops)
767                 status &= ~TP_STATUS_LOSING;
768         spin_unlock(&sk->sk_receive_queue.lock);
769
770         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
771
772         switch (po->tp_version) {
773         case TPACKET_V1:
774                 h.h1->tp_len = skb->len;
775                 h.h1->tp_snaplen = snaplen;
776                 h.h1->tp_mac = macoff;
777                 h.h1->tp_net = netoff;
778                 if (skb->tstamp.tv64)
779                         tv = ktime_to_timeval(skb->tstamp);
780                 else
781                         do_gettimeofday(&tv);
782                 h.h1->tp_sec = tv.tv_sec;
783                 h.h1->tp_usec = tv.tv_usec;
784                 hdrlen = sizeof(*h.h1);
785                 break;
786         case TPACKET_V2:
787                 h.h2->tp_len = skb->len;
788                 h.h2->tp_snaplen = snaplen;
789                 h.h2->tp_mac = macoff;
790                 h.h2->tp_net = netoff;
791                 if (skb->tstamp.tv64)
792                         ts = ktime_to_timespec(skb->tstamp);
793                 else
794                         getnstimeofday(&ts);
795                 h.h2->tp_sec = ts.tv_sec;
796                 h.h2->tp_nsec = ts.tv_nsec;
797                 h.h2->tp_vlan_tci = skb->vlan_tci;
798                 hdrlen = sizeof(*h.h2);
799                 break;
800         default:
801                 BUG();
802         }
803
804         sll = h.raw + TPACKET_ALIGN(hdrlen);
805         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
806         sll->sll_family = AF_PACKET;
807         sll->sll_hatype = dev->type;
808         sll->sll_protocol = skb->protocol;
809         sll->sll_pkttype = skb->pkt_type;
810         if (unlikely(po->origdev))
811                 sll->sll_ifindex = orig_dev->ifindex;
812         else
813                 sll->sll_ifindex = dev->ifindex;
814
815         __packet_set_status(po, h.raw, status);
816         smp_mb();
817         {
818                 struct page *p_start, *p_end;
819                 u8 *h_end = h.raw + macoff + snaplen - 1;
820
821                 p_start = virt_to_page(h.raw);
822                 p_end = virt_to_page(h_end);
823                 while (p_start <= p_end) {
824                         flush_dcache_page(p_start);
825                         p_start++;
826                 }
827         }
828
829         sk->sk_data_ready(sk, 0);
830
831 drop_n_restore:
832         if (skb_head != skb->data && skb_shared(skb)) {
833                 skb->data = skb_head;
834                 skb->len = skb_len;
835         }
836 drop:
837         kfree_skb(skb);
838         return 0;
839
840 ring_is_full:
841         po->stats.tp_drops++;
842         po->stats.tp_gap++;
843         spin_unlock(&sk->sk_receive_queue.lock);
844
845         sk->sk_data_ready(sk, 0);
846         kfree_skb(copy_skb);
847         goto drop_n_restore;
848 }
849
850 static void tpacket_destruct_skb(struct sk_buff *skb)
851 {
852         struct packet_sock *po = pkt_sk(skb->sk);
853         void *ph;
854
855         BUG_ON(skb == NULL);
856
857         if (likely(po->tx_ring.pg_vec)) {
858                 ph = skb_shinfo(skb)->destructor_arg;
859                 BUG_ON(__packet_get_status(po, ph) != TP_STATUS_SENDING);
860                 BUG_ON(atomic_read(&po->tx_ring.pending) == 0);
861                 atomic_dec(&po->tx_ring.pending);
862                 __packet_set_status(po, ph, TP_STATUS_AVAILABLE);
863         }
864
865         sock_wfree(skb);
866 }
867
868 static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
869                 void *frame, struct net_device *dev, int size_max,
870                 __be16 proto, unsigned char *addr)
871 {
872         union {
873                 struct tpacket_hdr *h1;
874                 struct tpacket2_hdr *h2;
875                 void *raw;
876         } ph;
877         int to_write, offset, len, tp_len, nr_frags, len_max;
878         struct socket *sock = po->sk.sk_socket;
879         struct page *page;
880         void *data;
881         int err;
882
883         ph.raw = frame;
884
885         skb->protocol = proto;
886         skb->dev = dev;
887         skb->priority = po->sk.sk_priority;
888         skb->mark = po->sk.sk_mark;
889         skb_shinfo(skb)->destructor_arg = ph.raw;
890
891         switch (po->tp_version) {
892         case TPACKET_V2:
893                 tp_len = ph.h2->tp_len;
894                 break;
895         default:
896                 tp_len = ph.h1->tp_len;
897                 break;
898         }
899         if (unlikely(tp_len > size_max)) {
900                 pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
901                 return -EMSGSIZE;
902         }
903
904         skb_reserve(skb, LL_RESERVED_SPACE(dev));
905         skb_reset_network_header(skb);
906
907         data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
908         to_write = tp_len;
909
910         if (sock->type == SOCK_DGRAM) {
911                 err = dev_hard_header(skb, dev, ntohs(proto), addr,
912                                 NULL, tp_len);
913                 if (unlikely(err < 0))
914                         return -EINVAL;
915         } else if (dev->hard_header_len) {
916                 /* net device doesn't like empty head */
917                 if (unlikely(tp_len <= dev->hard_header_len)) {
918                         pr_err("packet size is too short (%d < %d)\n",
919                                tp_len, dev->hard_header_len);
920                         return -EINVAL;
921                 }
922
923                 skb_push(skb, dev->hard_header_len);
924                 err = skb_store_bits(skb, 0, data,
925                                 dev->hard_header_len);
926                 if (unlikely(err))
927                         return err;
928
929                 data += dev->hard_header_len;
930                 to_write -= dev->hard_header_len;
931         }
932
933         err = -EFAULT;
934         page = virt_to_page(data);
935         offset = offset_in_page(data);
936         len_max = PAGE_SIZE - offset;
937         len = ((to_write > len_max) ? len_max : to_write);
938
939         skb->data_len = to_write;
940         skb->len += to_write;
941         skb->truesize += to_write;
942         atomic_add(to_write, &po->sk.sk_wmem_alloc);
943
944         while (likely(to_write)) {
945                 nr_frags = skb_shinfo(skb)->nr_frags;
946
947                 if (unlikely(nr_frags >= MAX_SKB_FRAGS)) {
948                         pr_err("Packet exceed the number of skb frags(%lu)\n",
949                                MAX_SKB_FRAGS);
950                         return -EFAULT;
951                 }
952
953                 flush_dcache_page(page);
954                 get_page(page);
955                 skb_fill_page_desc(skb,
956                                 nr_frags,
957                                 page++, offset, len);
958                 to_write -= len;
959                 offset = 0;
960                 len_max = PAGE_SIZE;
961                 len = ((to_write > len_max) ? len_max : to_write);
962         }
963
964         return tp_len;
965 }
966
967 static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
968 {
969         struct socket *sock;
970         struct sk_buff *skb;
971         struct net_device *dev;
972         __be16 proto;
973         int ifindex, err, reserve = 0;
974         void *ph;
975         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
976         int tp_len, size_max;
977         unsigned char *addr;
978         int len_sum = 0;
979         int status = 0;
980
981         sock = po->sk.sk_socket;
982
983         mutex_lock(&po->pg_vec_lock);
984
985         err = -EBUSY;
986         if (saddr == NULL) {
987                 ifindex = po->ifindex;
988                 proto   = po->num;
989                 addr    = NULL;
990         } else {
991                 err = -EINVAL;
992                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
993                         goto out;
994                 if (msg->msg_namelen < (saddr->sll_halen
995                                         + offsetof(struct sockaddr_ll,
996                                                 sll_addr)))
997                         goto out;
998                 ifindex = saddr->sll_ifindex;
999                 proto   = saddr->sll_protocol;
1000                 addr    = saddr->sll_addr;
1001         }
1002
1003         dev = dev_get_by_index(sock_net(&po->sk), ifindex);
1004         err = -ENXIO;
1005         if (unlikely(dev == NULL))
1006                 goto out;
1007
1008         reserve = dev->hard_header_len;
1009
1010         err = -ENETDOWN;
1011         if (unlikely(!(dev->flags & IFF_UP)))
1012                 goto out_put;
1013
1014         size_max = po->tx_ring.frame_size
1015                 - sizeof(struct skb_shared_info)
1016                 - po->tp_hdrlen
1017                 - LL_ALLOCATED_SPACE(dev)
1018                 - sizeof(struct sockaddr_ll);
1019
1020         if (size_max > dev->mtu + reserve)
1021                 size_max = dev->mtu + reserve;
1022
1023         do {
1024                 ph = packet_current_frame(po, &po->tx_ring,
1025                                 TP_STATUS_SEND_REQUEST);
1026
1027                 if (unlikely(ph == NULL)) {
1028                         schedule();
1029                         continue;
1030                 }
1031
1032                 status = TP_STATUS_SEND_REQUEST;
1033                 skb = sock_alloc_send_skb(&po->sk,
1034                                 LL_ALLOCATED_SPACE(dev)
1035                                 + sizeof(struct sockaddr_ll),
1036                                 0, &err);
1037
1038                 if (unlikely(skb == NULL))
1039                         goto out_status;
1040
1041                 tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
1042                                 addr);
1043
1044                 if (unlikely(tp_len < 0)) {
1045                         if (po->tp_loss) {
1046                                 __packet_set_status(po, ph,
1047                                                 TP_STATUS_AVAILABLE);
1048                                 packet_increment_head(&po->tx_ring);
1049                                 kfree_skb(skb);
1050                                 continue;
1051                         } else {
1052                                 status = TP_STATUS_WRONG_FORMAT;
1053                                 err = tp_len;
1054                                 goto out_status;
1055                         }
1056                 }
1057
1058                 skb->destructor = tpacket_destruct_skb;
1059                 __packet_set_status(po, ph, TP_STATUS_SENDING);
1060                 atomic_inc(&po->tx_ring.pending);
1061
1062                 status = TP_STATUS_SEND_REQUEST;
1063                 err = dev_queue_xmit(skb);
1064                 if (unlikely(err > 0 && (err = net_xmit_errno(err)) != 0))
1065                         goto out_xmit;
1066                 packet_increment_head(&po->tx_ring);
1067                 len_sum += tp_len;
1068         } while (likely((ph != NULL) || ((!(msg->msg_flags & MSG_DONTWAIT))
1069                                         && (atomic_read(&po->tx_ring.pending))))
1070               );
1071
1072         err = len_sum;
1073         goto out_put;
1074
1075 out_xmit:
1076         skb->destructor = sock_wfree;
1077         atomic_dec(&po->tx_ring.pending);
1078 out_status:
1079         __packet_set_status(po, ph, status);
1080         kfree_skb(skb);
1081 out_put:
1082         dev_put(dev);
1083 out:
1084         mutex_unlock(&po->pg_vec_lock);
1085         return err;
1086 }
1087 #endif
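/*
 * User-space side of the TX ring, as a sketch (TPACKET_V1 assumed;
 * `frame` points at a TP_STATUS_AVAILABLE slot of the mmap'ed
 * PACKET_TX_RING, `pkt`/`pkt_len` are the payload). The data offset
 * mirrors the one computed in tpacket_fill_skb():
 *
 *     struct tpacket_hdr *hdr = frame;
 *     memcpy((char *)frame + TPACKET_HDRLEN - sizeof(struct sockaddr_ll),
 *            pkt, pkt_len);
 *     hdr->tp_len = pkt_len;
 *     hdr->tp_status = TP_STATUS_SEND_REQUEST;
 *     send(fd, NULL, 0, 0);   // kicks tpacket_snd()
 */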
1088
1089 static int packet_snd(struct socket *sock,
1090                           struct msghdr *msg, size_t len)
1091 {
1092         struct sock *sk = sock->sk;
1093         struct sockaddr_ll *saddr = (struct sockaddr_ll *)msg->msg_name;
1094         struct sk_buff *skb;
1095         struct net_device *dev;
1096         __be16 proto;
1097         unsigned char *addr;
1098         int ifindex, err, reserve = 0;
1099
1100         /*
1101          *      Get and verify the address.
1102          */
1103
1104         if (saddr == NULL) {
1105                 struct packet_sock *po = pkt_sk(sk);
1106
1107                 ifindex = po->ifindex;
1108                 proto   = po->num;
1109                 addr    = NULL;
1110         } else {
1111                 err = -EINVAL;
1112                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
1113                         goto out;
1114                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
1115                         goto out;
1116                 ifindex = saddr->sll_ifindex;
1117                 proto   = saddr->sll_protocol;
1118                 addr    = saddr->sll_addr;
1119         }
1120
1121
1122         dev = dev_get_by_index(sock_net(sk), ifindex);
1123         err = -ENXIO;
1124         if (dev == NULL)
1125                 goto out_unlock;
1126         if (sock->type == SOCK_RAW)
1127                 reserve = dev->hard_header_len;
1128
1129         err = -ENETDOWN;
1130         if (!(dev->flags & IFF_UP))
1131                 goto out_unlock;
1132
1133         err = -EMSGSIZE;
1134         if (len > dev->mtu+reserve)
1135                 goto out_unlock;
1136
1137         skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
1138                                 msg->msg_flags & MSG_DONTWAIT, &err);
1139         if (skb == NULL)
1140                 goto out_unlock;
1141
1142         skb_reserve(skb, LL_RESERVED_SPACE(dev));
1143         skb_reset_network_header(skb);
1144
1145         err = -EINVAL;
1146         if (sock->type == SOCK_DGRAM &&
1147             dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
1148                 goto out_free;
1149
1150         /* Returns -EFAULT on error */
1151         err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len);
1152         if (err)
1153                 goto out_free;
1154
1155         skb->protocol = proto;
1156         skb->dev = dev;
1157         skb->priority = sk->sk_priority;
1158         skb->mark = sk->sk_mark;
1159
1160         /*
1161          *      Now send it
1162          */
1163
1164         err = dev_queue_xmit(skb);
1165         if (err > 0 && (err = net_xmit_errno(err)) != 0)
1166                 goto out_unlock;
1167
1168         dev_put(dev);
1169
1170         return len;
1171
1172 out_free:
1173         kfree_skb(skb);
1174 out_unlock:
1175         if (dev)
1176                 dev_put(dev);
1177 out:
1178         return err;
1179 }
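/*
 * Illustrative SOCK_DGRAM caller: the kernel builds the ll header from
 * the address via dev_hard_header() above. `fd`, `ifindex`, `dest_mac`
 * and the payload variables are assumptions.
 *
 *     struct sockaddr_ll dst = {
 *             .sll_family   = AF_PACKET,
 *             .sll_protocol = htons(ETH_P_IP),
 *             .sll_ifindex  = ifindex,
 *             .sll_halen    = ETH_ALEN,
 *     };
 *     memcpy(dst.sll_addr, dest_mac, ETH_ALEN);
 *     sendto(fd, payload, payload_len, 0,
 *            (struct sockaddr *)&dst, sizeof(dst));
 */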
1180
1181 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
1182                 struct msghdr *msg, size_t len)
1183 {
1184 #ifdef CONFIG_PACKET_MMAP
1185         struct sock *sk = sock->sk;
1186         struct packet_sock *po = pkt_sk(sk);
1187         if (po->tx_ring.pg_vec)
1188                 return tpacket_snd(po, msg);
1189         else
1190 #endif
1191                 return packet_snd(sock, msg, len);
1192 }
1193
1194 /*
1195  *      Close a PACKET socket. This is fairly simple. We immediately go
1196  *      to 'closed' state and remove our protocol entry in the device list.
1197  */
1198
1199 static int packet_release(struct socket *sock)
1200 {
1201         struct sock *sk = sock->sk;
1202         struct packet_sock *po;
1203         struct net *net;
1204 #ifdef CONFIG_PACKET_MMAP
1205         struct tpacket_req req;
1206 #endif
1207
1208         if (!sk)
1209                 return 0;
1210
1211         net = sock_net(sk);
1212         po = pkt_sk(sk);
1213
1214         write_lock_bh(&net->packet.sklist_lock);
1215         sk_del_node_init(sk);
1216         sock_prot_inuse_add(net, sk->sk_prot, -1);
1217         write_unlock_bh(&net->packet.sklist_lock);
1218
1219         /*
1220          *      Unhook packet receive handler.
1221          */
1222
1223         if (po->running) {
1224                 /*
1225                  *      Remove the protocol hook
1226                  */
1227                 dev_remove_pack(&po->prot_hook);
1228                 po->running = 0;
1229                 po->num = 0;
1230                 __sock_put(sk);
1231         }
1232
1233         packet_flush_mclist(sk);
1234
1235 #ifdef CONFIG_PACKET_MMAP
1236         memset(&req, 0, sizeof(req));
1237
1238         if (po->rx_ring.pg_vec)
1239                 packet_set_ring(sk, &req, 1, 0);
1240
1241         if (po->tx_ring.pg_vec)
1242                 packet_set_ring(sk, &req, 1, 1);
1243 #endif
1244
1245         /*
1246          *      Now the socket is dead. No more input will appear.
1247          */
1248
1249         sock_orphan(sk);
1250         sock->sk = NULL;
1251
1252         /* Purge queues */
1253
1254         skb_queue_purge(&sk->sk_receive_queue);
1255         sk_refcnt_debug_release(sk);
1256
1257         sock_put(sk);
1258         return 0;
1259 }
1260
1261 /*
1262  *      Attach a packet hook.
1263  */
1264
1265 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
1266 {
1267         struct packet_sock *po = pkt_sk(sk);
1268         /*
1269          *      Detach an existing hook if present.
1270          */
1271
1272         lock_sock(sk);
1273
1274         spin_lock(&po->bind_lock);
1275         if (po->running) {
1276                 __sock_put(sk);
1277                 po->running = 0;
1278                 po->num = 0;
1279                 spin_unlock(&po->bind_lock);
1280                 dev_remove_pack(&po->prot_hook);
1281                 spin_lock(&po->bind_lock);
1282         }
1283
1284         po->num = protocol;
1285         po->prot_hook.type = protocol;
1286         po->prot_hook.dev = dev;
1287
1288         po->ifindex = dev ? dev->ifindex : 0;
1289
1290         if (protocol == 0)
1291                 goto out_unlock;
1292
1293         if (!dev || (dev->flags & IFF_UP)) {
1294                 dev_add_pack(&po->prot_hook);
1295                 sock_hold(sk);
1296                 po->running = 1;
1297         } else {
1298                 sk->sk_err = ENETDOWN;
1299                 if (!sock_flag(sk, SOCK_DEAD))
1300                         sk->sk_error_report(sk);
1301         }
1302
1303 out_unlock:
1304         spin_unlock(&po->bind_lock);
1305         release_sock(sk);
1306         return 0;
1307 }
1308
1309 /*
1310  *      Bind a packet socket to a device
1311  */
1312
1313 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
1314                             int addr_len)
1315 {
1316         struct sock *sk = sock->sk;
1317         char name[15];
1318         struct net_device *dev;
1319         int err = -ENODEV;
1320
1321         /*
1322          *      Check legality
1323          */
1324
1325         if (addr_len != sizeof(struct sockaddr))
1326                 return -EINVAL;
1327         strlcpy(name, uaddr->sa_data, sizeof(name));
1328
1329         dev = dev_get_by_name(sock_net(sk), name);
1330         if (dev) {
1331                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
1332                 dev_put(dev);
1333         }
1334         return err;
1335 }
1336
1337 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
1338 {
1339         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1340         struct sock *sk = sock->sk;
1341         struct net_device *dev = NULL;
1342         int err;
1343
1344
1345         /*
1346          *      Check legality
1347          */
1348
1349         if (addr_len < sizeof(struct sockaddr_ll))
1350                 return -EINVAL;
1351         if (sll->sll_family != AF_PACKET)
1352                 return -EINVAL;
1353
1354         if (sll->sll_ifindex) {
1355                 err = -ENODEV;
1356                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1357                 if (dev == NULL)
1358                         goto out;
1359         }
1360         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1361         if (dev)
1362                 dev_put(dev);
1363
1364 out:
1365         return err;
1366 }
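/*
 * A minimal bind from user space, as a sketch ("eth0" and `fd` are
 * assumptions); sll_protocol selects the protocol hook exactly as
 * packet_do_bind() installs it:
 *
 *     struct sockaddr_ll sll = {
 *             .sll_family   = AF_PACKET,
 *             .sll_protocol = htons(ETH_P_ALL),
 *             .sll_ifindex  = if_nametoindex("eth0"),
 *     };
 *     bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */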
1367
1368 static struct proto packet_proto = {
1369         .name     = "PACKET",
1370         .owner    = THIS_MODULE,
1371         .obj_size = sizeof(struct packet_sock),
1372 };
1373
1374 /*
1375  *      Create a packet of type SOCK_PACKET.
1376  */
1377
1378 static int packet_create(struct net *net, struct socket *sock, int protocol)
1379 {
1380         struct sock *sk;
1381         struct packet_sock *po;
1382         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1383         int err;
1384
1385         if (!capable(CAP_NET_RAW))
1386                 return -EPERM;
1387         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1388             sock->type != SOCK_PACKET)
1389                 return -ESOCKTNOSUPPORT;
1390
1391         sock->state = SS_UNCONNECTED;
1392
1393         err = -ENOBUFS;
1394         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1395         if (sk == NULL)
1396                 goto out;
1397
1398         sock->ops = &packet_ops;
1399         if (sock->type == SOCK_PACKET)
1400                 sock->ops = &packet_ops_spkt;
1401
1402         sock_init_data(sock, sk);
1403
1404         po = pkt_sk(sk);
1405         sk->sk_family = PF_PACKET;
1406         po->num = proto;
1407
1408         sk->sk_destruct = packet_sock_destruct;
1409         sk_refcnt_debug_inc(sk);
1410
1411         /*
1412          *      Attach a protocol block
1413          */
1414
1415         spin_lock_init(&po->bind_lock);
1416         mutex_init(&po->pg_vec_lock);
1417         po->prot_hook.func = packet_rcv;
1418
1419         if (sock->type == SOCK_PACKET)
1420                 po->prot_hook.func = packet_rcv_spkt;
1421
1422         po->prot_hook.af_packet_priv = sk;
1423
1424         if (proto) {
1425                 po->prot_hook.type = proto;
1426                 dev_add_pack(&po->prot_hook);
1427                 sock_hold(sk);
1428                 po->running = 1;
1429         }
1430
1431         write_lock_bh(&net->packet.sklist_lock);
1432         sk_add_node(sk, &net->packet.sklist);
1433         sock_prot_inuse_add(net, &packet_proto, 1);
1434         write_unlock_bh(&net->packet.sklist_lock);
1435         return 0;
1436 out:
1437         return err;
1438 }
1439
1440 /*
1441  *      Pull a packet from our receive queue and hand it to the user.
1442  *      If necessary we block.
1443  */
1444
1445 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1446                           struct msghdr *msg, size_t len, int flags)
1447 {
1448         struct sock *sk = sock->sk;
1449         struct sk_buff *skb;
1450         int copied, err;
1451         struct sockaddr_ll *sll;
1452         __u32 gap;
1453
1454         err = -EINVAL;
1455         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1456                 goto out;
1457
1458 #if 0
1459         /* What error should we return now? EUNATTACH? */
1460         if (pkt_sk(sk)->ifindex < 0)
1461                 return -ENODEV;
1462 #endif
1463
1464         /*
1465          *      Call the generic datagram receiver. This handles all sorts
1466          *      of horrible races and re-entrancy so we can forget about it
1467          *      in the protocol layers.
1468          *
1469          *      Now it will return ENETDOWN, if the device has just
1470          *      gone down, but then it will block.
1471          */
1472
1473         skb = skb_recv_datagram(sk, flags, flags & MSG_DONTWAIT, &err);
1474
1475         /*
1476          *      An error occurred so return it. Because skb_recv_datagram()
1477          *      handles the blocking we don't need to see or worry about
1478          *      blocking retries.
1479          */
1480
1481         if (skb == NULL)
1482                 goto out;
1483
1484         /*
1485          *      If the address length field is there to be filled in, we fill
1486          *      it in now.
1487          */
1488
1489         sll = &PACKET_SKB_CB(skb)->sa.ll;
1490         if (sock->type == SOCK_PACKET)
1491                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1492         else
1493                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1494
1495         /*
1496          *      You lose any data beyond the buffer you gave. If it worries a
1497          *      user program they can ask the device for its MTU anyway.
1498          */
1499
1500         copied = skb->len;
1501         if (copied > len) {
1502                 copied = len;
1503                 msg->msg_flags |= MSG_TRUNC;
1504         }
1505
1506         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1507         if (err)
1508                 goto out_free;
1509
1510         sock_recv_timestamp(msg, sk, skb);
1511
1512         if (msg->msg_name)
1513                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1514                        msg->msg_namelen);
1515
1516         if (pkt_sk(sk)->auxdata) {
1517                 struct tpacket_auxdata aux;
1518
1519                 aux.tp_status = TP_STATUS_USER;
1520                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1521                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1522                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1523                 aux.tp_snaplen = skb->len;
1524                 aux.tp_mac = 0;
1525                 aux.tp_net = skb_network_offset(skb);
1526                 aux.tp_vlan_tci = skb->vlan_tci;
1527
1528                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1529         }
1530
1531         gap = check_packet_gap(skb);
1532         if (gap)
1533                 put_cmsg(msg, SOL_PACKET, PACKET_GAPDATA, sizeof(__u32), &gap);
1534
1535         /*
1536          *      Free or return the buffer as appropriate. Again this
1537          *      hides all the races and re-entrancy issues from us.
1538          */
1539         err = (flags&MSG_TRUNC) ? skb->len : copied;
1540
1541 out_free:
1542         skb_free_datagram(sk, skb);
1543 out:
1544         return err;
1545 }
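/*
 * Receiving-side sketch: with the PACKET_AUXDATA option enabled, the
 * control messages queued above can be read back after a recvmsg()
 * call roughly like this (`msg` is a prepared struct msghdr with
 * control-buffer space, an assumption; the PACKET_GAPDATA cmsg is the
 * one filled from check_packet_gap()):
 *
 *     for (struct cmsghdr *c = CMSG_FIRSTHDR(&msg); c;
 *          c = CMSG_NXTHDR(&msg, c)) {
 *             if (c->cmsg_level != SOL_PACKET)
 *                     continue;
 *             if (c->cmsg_type == PACKET_AUXDATA) {
 *                     struct tpacket_auxdata *aux = (void *)CMSG_DATA(c);
 *                     // aux->tp_len is the length before snapping
 *             } else if (c->cmsg_type == PACKET_GAPDATA) {
 *                     __u32 gap = *(__u32 *)CMSG_DATA(c);
 *                     // frames lost since the previous queued packet
 *             }
 *     }
 */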
1546
1547 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1548                                int *uaddr_len, int peer)
1549 {
1550         struct net_device *dev;
1551         struct sock *sk = sock->sk;
1552
1553         if (peer)
1554                 return -EOPNOTSUPP;
1555
1556         uaddr->sa_family = AF_PACKET;
1557         dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1558         if (dev) {
1559                 strlcpy(uaddr->sa_data, dev->name, 15);
1560                 dev_put(dev);
1561         } else
1562                 memset(uaddr->sa_data, 0, 14);
1563         *uaddr_len = sizeof(*uaddr);
1564
1565         return 0;
1566 }
1567
1568 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1569                           int *uaddr_len, int peer)
1570 {
1571         struct net_device *dev;
1572         struct sock *sk = sock->sk;
1573         struct packet_sock *po = pkt_sk(sk);
1574         struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
1575
1576         if (peer)
1577                 return -EOPNOTSUPP;
1578
1579         sll->sll_family = AF_PACKET;
1580         sll->sll_ifindex = po->ifindex;
1581         sll->sll_protocol = po->num;
1582         dev = dev_get_by_index(sock_net(sk), po->ifindex);
1583         if (dev) {
1584                 sll->sll_hatype = dev->type;
1585                 sll->sll_halen = dev->addr_len;
1586                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1587                 dev_put(dev);
1588         } else {
1589                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1590                 sll->sll_halen = 0;
1591         }
1592         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1593
1594         return 0;
1595 }
1596
1597 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1598                          int what)
1599 {
1600         switch (i->type) {
1601         case PACKET_MR_MULTICAST:
1602                 if (what > 0)
1603                         return dev_mc_add(dev, i->addr, i->alen, 0);
1604                 else
1605                         return dev_mc_delete(dev, i->addr, i->alen, 0);
1606                 break;
1607         case PACKET_MR_PROMISC:
1608                 return dev_set_promiscuity(dev, what);
1609                 break;
1610         case PACKET_MR_ALLMULTI:
1611                 return dev_set_allmulti(dev, what);
1612                 break;
1613         case PACKET_MR_UNICAST:
1614                 if (what > 0)
1615                         return dev_unicast_add(dev, i->addr);
1616                 else
1617                         return dev_unicast_delete(dev, i->addr);
1618                 break;
1619         default:
1620                 break;
1621         }
1622         return 0;
1623 }
1624
1625 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1626 {
1627         for ( ; i; i = i->next) {
1628                 if (i->ifindex == dev->ifindex)
1629                         packet_dev_mc(dev, i, what);
1630         }
1631 }

static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	err = packet_dev_mc(dev, i, 1);
	if (err) {
		po->mclist = i->next;
		kfree(i);
	}

done:
	rtnl_unlock();
	return err;
}

static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				dev = dev_get_by_index(sock_net(sk), ml->ifindex);
				if (dev) {
					packet_dev_mc(dev, ml, -1);
					dev_put(dev);
				}
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}
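
/*
 * Illustrative userspace sketch: joining a link-layer multicast group,
 * which ends up in packet_mc_add() above via PACKET_ADD_MEMBERSHIP.
 * The interface index and MAC address are placeholders.
 *
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	int join_group(int fd, int ifindex, const unsigned char mac[6])
 *	{
 *		struct packet_mreq mreq;
 *
 *		memset(&mreq, 0, sizeof(mreq));
 *		mreq.mr_ifindex = ifindex;
 *		mreq.mr_type    = PACKET_MR_MULTICAST;
 *		mreq.mr_alen    = 6;
 *		memcpy(mreq.mr_address, mac, 6);
 *		return setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *				  &mreq, sizeof(mreq));
 *	}
 *
 * PACKET_DROP_MEMBERSHIP with the same tuple decrements the reference
 * count taken above and detaches the address from the device once it
 * reaches zero.
 */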

static void packet_flush_mclist(struct sock *sk)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml;

	if (!po->mclist)
		return;

	rtnl_lock();
	while ((ml = po->mclist) != NULL) {
		struct net_device *dev;

		po->mclist = ml->next;
		dev = dev_get_by_index(sock_net(sk), ml->ifindex);
		if (dev != NULL) {
			packet_dev_mc(dev, ml, -1);
			dev_put(dev);
		}
		kfree(ml);
	}
	rtnl_unlock();
}

static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch (optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq, optval, len))
			return -EFAULT;
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

#ifdef CONFIG_PACKET_MMAP
	case PACKET_RX_RING:
	case PACKET_TX_RING:
	{
		struct tpacket_req req;

		if (optlen < sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req, optval, sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0, optname == PACKET_TX_RING);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
	case PACKET_VERSION:
	{
		int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
		case TPACKET_V2:
			po->tp_version = val;
			return 0;
		default:
			return -EINVAL;
		}
	}
	case PACKET_RESERVE:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_reserve = val;
		return 0;
	}
	case PACKET_LOSS:
	{
		unsigned int val;

		if (optlen != sizeof(val))
			return -EINVAL;
		if (po->rx_ring.pg_vec || po->tx_ring.pg_vec)
			return -EBUSY;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;
		po->tp_loss = !!val;
		return 0;
	}
#endif
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
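
/*
 * Illustrative userspace sketch: configuring a TPACKET_V2 receive ring
 * through the PACKET_VERSION and PACKET_RX_RING cases above.  The
 * geometry is an example only; it merely has to satisfy the sanity
 * checks in packet_set_ring().  Note that PACKET_VERSION must be set
 * before the ring exists, per the -EBUSY checks above.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	int setup_rx_ring(int fd)
 *	{
 *		int ver = TPACKET_V2;
 *		struct tpacket_req req = {
 *			.tp_block_size = 4096,	// one page per block
 *			.tp_block_nr   = 64,
 *			.tp_frame_size = 2048,	// two frames per block
 *			.tp_frame_nr   = 128,	// 64 blocks * 2 frames
 *		};
 *
 *		if (setsockopt(fd, SOL_PACKET, PACKET_VERSION,
 *			       &ver, sizeof(ver)) < 0)
 *			return -1;
 *		return setsockopt(fd, SOL_PACKET, PACKET_RX_RING,
 *				  &req, sizeof(req));
 *	}
 */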

static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data;
	struct tpacket_stats st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch (optname) {
	case PACKET_STATISTICS:
		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		st.tp_packets += st.tp_drops;

		data = &st;
		break;
	case PACKET_AUXDATA:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->auxdata;

		data = &val;
		break;
	case PACKET_ORIGDEV:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->origdev;

		data = &val;
		break;
#ifdef CONFIG_PACKET_MMAP
	case PACKET_VERSION:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->tp_version;
		data = &val;
		break;
	case PACKET_HDRLEN:
		if (len > sizeof(int))
			len = sizeof(int);
		if (copy_from_user(&val, optval, len))
			return -EFAULT;
		switch (val) {
		case TPACKET_V1:
			val = sizeof(struct tpacket_hdr);
			break;
		case TPACKET_V2:
			val = sizeof(struct tpacket2_hdr);
			break;
		default:
			return -EINVAL;
		}
		data = &val;
		break;
	case PACKET_RESERVE:
		if (len > sizeof(unsigned int))
			len = sizeof(unsigned int);
		val = po->tp_reserve;
		data = &val;
		break;
	case PACKET_LOSS:
		if (len > sizeof(unsigned int))
			len = sizeof(unsigned int);
		val = po->tp_loss;
		data = &val;
		break;
#endif
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
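
/*
 * Illustrative userspace sketch: reading the drop counters exposed by
 * the PACKET_STATISTICS case above.  The kernel zeroes the counters on
 * read, so each call reports the deltas since the previous call, and
 * tp_packets already includes tp_drops on return.
 *
 *	#include <stdio.h>
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	void report_stats(int fd)
 *	{
 *		struct tpacket_stats st;
 *		socklen_t len = sizeof(st);
 *
 *		if (getsockopt(fd, SOL_PACKET, PACKET_STATISTICS,
 *			       &st, &len) == 0)
 *			printf("packets %u dropped %u\n",
 *			       st.tp_packets, st.tp_drops);
 *	}
 */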

static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = data;
	struct net *net = dev_net(dev);

	read_lock(&net->packet.sklist_lock);
	sk_for_each(sk, node, &net->packet.sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			spin_lock(&po->bind_lock);
			if (dev->ifindex == po->ifindex && po->num &&
			    !po->running) {
				dev_add_pack(&po->prot_hook);
				sock_hold(sk);
				po->running = 1;
			}
			spin_unlock(&po->bind_lock);
			break;
		}
	}
	read_unlock(&net->packet.sklist_lock);
	return NOTIFY_DONE;
}

static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = sk_wmem_alloc_get(sk);

		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		if (!net_eq(sock_net(sk), &init_net))
			return -ENOIOCTLCMD;
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}
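
/*
 * Illustrative userspace sketch: per the skb_peek() above, SIOCINQ on a
 * packet socket reports the size of the next queued frame, not the
 * total backlog.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/sockios.h>
 *
 *	int next_frame_len(int fd)
 *	{
 *		int n = 0;
 *
 *		if (ioctl(fd, SIOCINQ, &n) < 0)
 *			return -1;
 *		return n;
 *	}
 */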

#ifndef CONFIG_PACKET_MMAP
#define packet_mmap sock_no_mmap
#define packet_poll datagram_poll
#else

static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->rx_ring.pg_vec) {
		if (!packet_previous_frame(po, &po->rx_ring, TP_STATUS_KERNEL))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	spin_lock_bh(&sk->sk_write_queue.lock);
	if (po->tx_ring.pg_vec) {
		if (packet_current_frame(po, &po->tx_ring, TP_STATUS_AVAILABLE))
			mask |= POLLOUT | POLLWRNORM;
	}
	spin_unlock_bh(&sk->sk_write_queue.lock);
	return mask;
}
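
/*
 * Illustrative userspace sketch: a poll()-driven RX loop over a mapped
 * TPACKET_V2 ring, the readiness path handled by packet_poll() above.
 * TP_STATUS_USER marks frames handed to userspace; writing back
 * TP_STATUS_KERNEL is what re-arms POLLIN.  "ring", "frame_nr" and
 * "frame_size" are assumed to describe a mapping made as in
 * packet_mmap() below, with tp_block_size an exact multiple of
 * tp_frame_size so that frame i sits at i * frame_size.
 *
 *	#include <poll.h>
 *	#include <linux/if_packet.h>
 *
 *	void rx_loop(int fd, char *ring, unsigned frame_nr,
 *		     unsigned frame_size)
 *	{
 *		struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *		unsigned i = 0;
 *
 *		for (;;) {
 *			struct tpacket2_hdr *hdr =
 *				(void *)(ring + i * frame_size);
 *
 *			if (!(hdr->tp_status & TP_STATUS_USER)) {
 *				poll(&pfd, 1, -1);
 *				continue;
 *			}
 *			// frame data: tp_len bytes at (char *)hdr + tp_mac
 *			hdr->tp_status = TP_STATUS_KERNEL;
 *			i = (i + 1) % frame_nr;
 *		}
 *	}
 */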

/* Dirty? Well, I have not yet found a better way to account
 * for user mmaps.
 */

static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static const struct vm_operations_struct packet_mmap_ops = {
	.open	=	packet_mm_open,
	.close	=	packet_mm_close,
};

static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i]))
			free_pages((unsigned long) pg_vec[i], order);
	}
	kfree(pg_vec);
}

static inline char *alloc_one_pg_vec_page(unsigned long order)
{
	gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | __GFP_NOWARN;

	return (char *) __get_free_pages(gfp_flags, order);
}

static char **alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	char **pg_vec;
	int i;

	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i] = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i]))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

static int packet_set_ring(struct sock *sk, struct tpacket_req *req,
		int closing, int tx_ring)
{
	char **pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	struct packet_ring_buffer *rb;
	struct sk_buff_head *rb_queue;
	__be16 num;
	int err;

	rb = tx_ring ? &po->tx_ring : &po->rx_ring;
	rb_queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;

	err = -EBUSY;
	if (!closing) {
		if (atomic_read(&po->mapped))
			goto out;
		if (atomic_read(&rb->pending))
			goto out;
	}

	if (req->tp_block_nr) {
		/* Sanity tests and some calculations */
		err = -EBUSY;
		if (unlikely(rb->pg_vec))
			goto out;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		}

		err = -EINVAL;
		if (unlikely((int)req->tp_block_size <= 0))
			goto out;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			goto out;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					po->tp_reserve))
			goto out;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			goto out;

		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(rb->frames_per_block <= 0))
			goto out;
		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
					req->tp_frame_nr))
			goto out;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;
	} else {
		/* Closing the ring: no new blocks may be requested */
		err = -EINVAL;
		if (unlikely(req->tp_frame_nr))
			goto out;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	mutex_lock(&po->pg_vec_lock);
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
		spin_lock_bh(&rb_queue->lock);
		pg_vec = XC(rb->pg_vec, pg_vec);
		rb->frame_max = (req->tp_frame_nr - 1);
		rb->head = 0;
		rb->frame_size = req->tp_frame_size;
		spin_unlock_bh(&rb_queue->lock);

		order = XC(rb->pg_vec_order, order);
		req->tp_block_nr = XC(rb->pg_vec_len, req->tp_block_nr);

		rb->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = (po->rx_ring.pg_vec) ?
						tpacket_rcv : packet_rcv;
		skb_queue_purge(rb_queue);
#undef XC
		if (atomic_read(&po->mapped))
			pr_err("packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}
	mutex_unlock(&po->pg_vec_lock);

	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
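
/*
 * Worked example of the sanity checks above: with tp_block_size = 4096
 * (one page) and tp_frame_size = 2048, frames_per_block is 2, so a
 * request with tp_block_nr = 64 is accepted only if tp_frame_nr is
 * exactly 64 * 2 = 128.  The block size must also be a page multiple
 * and the frame size TPACKET_ALIGNMENT-aligned and large enough for
 * tp_hdrlen plus tp_reserve.
 */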

static int packet_mmap(struct file *file, struct socket *sock,
		struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size, expected_size;
	struct packet_ring_buffer *rb;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	mutex_lock(&po->pg_vec_lock);

	expected_size = 0;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec) {
			expected_size += rb->pg_vec_len
						* rb->pg_vec_pages
						* PAGE_SIZE;
		}
	}

	if (expected_size == 0)
		goto out;

	size = vma->vm_end - vma->vm_start;
	if (size != expected_size)
		goto out;

	start = vma->vm_start;
	for (rb = &po->rx_ring; rb <= &po->tx_ring; rb++) {
		if (rb->pg_vec == NULL)
			continue;

		for (i = 0; i < rb->pg_vec_len; i++) {
			struct page *page = virt_to_page(rb->pg_vec[i]);
			int pg_num;

			for (pg_num = 0; pg_num < rb->pg_vec_pages;
					pg_num++, page++) {
				err = vm_insert_page(vma, start, page);
				if (unlikely(err))
					goto out;
				start += PAGE_SIZE;
			}
		}
	}

	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	mutex_unlock(&po->pg_vec_lock);
	return err;
}
#endif
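
/*
 * Illustrative userspace sketch: mapping the ring configured earlier.
 * packet_mmap() above insists on vm_pgoff == 0 and on a length equal to
 * the combined RX plus TX ring size, so one mmap() covers everything.
 *
 *	#include <sys/mman.h>
 *	#include <stddef.h>
 *
 *	char *map_ring(int fd, size_t total)	// total = blocks * block size
 *	{
 *		void *p = mmap(NULL, total, PROT_READ | PROT_WRITE,
 *			       MAP_SHARED, fd, 0);
 *		return p == MAP_FAILED ? NULL : (char *)p;
 *	}
 */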

static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call =	packet_notifier,
};

#ifdef CONFIG_PROC_FS
static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
{
	struct sock *s;
	struct hlist_node *node;

	sk_for_each(s, node, &net->packet.sklist) {
		if (!off--)
			return s;
	}
	return NULL;
}

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(seq_file_net(seq)->packet.sklist_lock)
{
	struct net *net = seq_file_net(seq);
	read_lock(&net->packet.sklist_lock);
	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	++*pos;
	return (v == SEQ_START_TOKEN)
		? sk_head(&net->packet.sklist)
		: sk_next((struct sock *)v);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(seq_file_net(seq)->packet.sklist_lock)
{
	struct net *net = seq_file_net(seq);
	read_unlock(&net->packet.sklist_lock);
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = v;
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

static int packet_net_init(struct net *net)
{
	rwlock_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void packet_net_exit(struct net *net)
{
	proc_net_remove(net, "packet");
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};

static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);