packet: deliver VLAN TCI to userspace
[safe/jmp/linux-2.6] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *
43  *              This program is free software; you can redistribute it and/or
44  *              modify it under the terms of the GNU General Public License
45  *              as published by the Free Software Foundation; either version
46  *              2 of the License, or (at your option) any later version.
47  *
48  */
49
50 #include <linux/types.h>
51 #include <linux/mm.h>
52 #include <linux/capability.h>
53 #include <linux/fcntl.h>
54 #include <linux/socket.h>
55 #include <linux/in.h>
56 #include <linux/inet.h>
57 #include <linux/netdevice.h>
58 #include <linux/if_packet.h>
59 #include <linux/wireless.h>
60 #include <linux/kernel.h>
61 #include <linux/kmod.h>
62 #include <net/net_namespace.h>
63 #include <net/ip.h>
64 #include <net/protocol.h>
65 #include <linux/skbuff.h>
66 #include <net/sock.h>
67 #include <linux/errno.h>
68 #include <linux/timer.h>
69 #include <asm/system.h>
70 #include <asm/uaccess.h>
71 #include <asm/ioctls.h>
72 #include <asm/page.h>
73 #include <asm/cacheflush.h>
74 #include <asm/io.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77 #include <linux/poll.h>
78 #include <linux/module.h>
79 #include <linux/init.h>
80
81 #ifdef CONFIG_INET
82 #include <net/inet_common.h>
83 #endif
84
85 /*
86    Assumptions:
87    - if the device has no dev->hard_header routine, it adds and removes the
88      ll header itself. In this case the ll header is invisible outside of
89      the device, but higher levels still should reserve dev->hard_header_len.
90      Some devices are clever enough to reallocate the skb when the header
91      will not fit into the reserved space (tunnels); others are silly
92      (PPP).
93    - a packet socket receives packets with the ll header pulled,
94      so SOCK_RAW should push it back.
95
96 On receive:
97 -----------
98
99 Incoming, dev->hard_header!=NULL
100    mac_header -> ll header
101    data       -> data
102
103 Outgoing, dev->hard_header!=NULL
104    mac_header -> ll header
105    data       -> ll header
106
107 Incoming, dev->hard_header==NULL
108    mac_header -> UNKNOWN position. It is very likely that it points to the
109                  ll header. PPP does this, which is wrong, because it
110                  introduces asymmetry between the rx and tx paths.
111    data       -> data
112
113 Outgoing, dev->hard_header==NULL
114    mac_header -> data. ll header is still not built!
115    data       -> data
116
117 In summary:
118   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
119
120
121 On transmit:
122 ------------
123
124 dev->hard_header != NULL
125    mac_header -> ll header
126    data       -> ll header
127
128 dev->hard_header == NULL (ll header is added by device, we cannot control it)
129    mac_header -> data
130    data       -> data
131
132    We should set nh.raw on output to the correct position;
133    the packet classifier depends on it.
134  */
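/*
 * Illustrative userspace sketch (not part of this file; interface names and
 * buffer sizes are assumptions) of the visibility rules above: with SOCK_RAW
 * the link-level header is part of the received data, with SOCK_DGRAM it is
 * stripped and only reported through sockaddr_ll.
 *
 *	int raw = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	int dgr = socket(AF_PACKET, SOCK_DGRAM, htons(ETH_P_ALL));
 *	unsigned char buf[2048];
 *	struct sockaddr_ll from;
 *	socklen_t alen = sizeof(from);
 *
 *	recvfrom(raw, buf, sizeof(buf), 0, NULL, NULL);
 *	// buf starts with the 14-byte Ethernet header on SOCK_RAW
 *	recvfrom(dgr, buf, sizeof(buf), 0, (struct sockaddr *)&from, &alen);
 *	// payload only; from.sll_addr holds the link-level source address
 */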
135
136 /* Private packet socket structures. */
137
138 struct packet_mclist
139 {
140         struct packet_mclist    *next;
141         int                     ifindex;
142         int                     count;
143         unsigned short          type;
144         unsigned short          alen;
145         unsigned char           addr[MAX_ADDR_LEN];
146 };
147 /* identical to struct packet_mreq except it has
148  * a longer address field.
149  */
150 struct packet_mreq_max
151 {
152         int             mr_ifindex;
153         unsigned short  mr_type;
154         unsigned short  mr_alen;
155         unsigned char   mr_address[MAX_ADDR_LEN];
156 };
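/*
 * Illustrative userspace sketch (not part of this file; the interface name
 * is an assumption): joining promiscuous mode through the membership API
 * using the plain packet_mreq layout.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP, &mreq, sizeof(mreq));
 */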
157
158 #ifdef CONFIG_PACKET_MMAP
159 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
160 #endif
161
162 static void packet_flush_mclist(struct sock *sk);
163
164 struct packet_sock {
165         /* struct sock has to be the first member of packet_sock */
166         struct sock             sk;
167         struct tpacket_stats    stats;
168 #ifdef CONFIG_PACKET_MMAP
169         char *                  *pg_vec;
170         unsigned int            head;
171         unsigned int            frames_per_block;
172         unsigned int            frame_size;
173         unsigned int            frame_max;
174         int                     copy_thresh;
175 #endif
176         struct packet_type      prot_hook;
177         spinlock_t              bind_lock;
178         unsigned int            running:1,      /* prot_hook is attached*/
179                                 auxdata:1,
180                                 origdev:1;
181         int                     ifindex;        /* bound device         */
182         __be16                  num;
183         struct packet_mclist    *mclist;
184 #ifdef CONFIG_PACKET_MMAP
185         atomic_t                mapped;
186         unsigned int            pg_vec_order;
187         unsigned int            pg_vec_pages;
188         unsigned int            pg_vec_len;
189         enum tpacket_versions   tp_version;
190         unsigned int            tp_hdrlen;
191 #endif
192 };
193
194 struct packet_skb_cb {
195         unsigned int origlen;
196         union {
197                 struct sockaddr_pkt pkt;
198                 struct sockaddr_ll ll;
199         } sa;
200 };
201
202 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
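/*
 * The address information above is stashed in the 48-byte skb->cb scratch
 * area while the skb sits in the receive queue; the BUILD_BUG_ON() in
 * packet_rcv() verifies that the largest sockaddr_ll layout still fits.
 */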
203
204 #ifdef CONFIG_PACKET_MMAP
205
206 static void *packet_lookup_frame(struct packet_sock *po, unsigned int position,
207                                  int status)
208 {
209         unsigned int pg_vec_pos, frame_offset;
210         union {
211                 struct tpacket_hdr *h1;
212                 struct tpacket2_hdr *h2;
213                 void *raw;
214         } h;
215
216         pg_vec_pos = position / po->frames_per_block;
217         frame_offset = position % po->frames_per_block;
218
219         h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
220         switch (po->tp_version) {
221         case TPACKET_V1:
222                 if (status != (h.h1->tp_status ? TP_STATUS_USER :
223                                                  TP_STATUS_KERNEL))
224                         return NULL;
225                 break;
226         case TPACKET_V2:
227                 if (status != (h.h2->tp_status ? TP_STATUS_USER :
228                                                  TP_STATUS_KERNEL))
229                         return NULL;
230                 break;
231         }
232         return h.raw;
233 }
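/*
 * Frame ownership handshake used by the lookup above: the kernel owns a
 * frame while tp_status is TP_STATUS_KERNEL and hands it to userspace by
 * setting TP_STATUS_USER; userspace writes TP_STATUS_KERNEL back once it
 * has consumed the frame.
 */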
234
235 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
236 {
237         union {
238                 struct tpacket_hdr *h1;
239                 struct tpacket2_hdr *h2;
240                 void *raw;
241         } h;
242
243         h.raw = frame;
244         switch (po->tp_version) {
245         case TPACKET_V1:
246                 h.h1->tp_status = status;
247                 break;
248         case TPACKET_V2:
249                 h.h2->tp_status = status;
250                 break;
251         }
252 }
253 #endif
254
255 static inline struct packet_sock *pkt_sk(struct sock *sk)
256 {
257         return (struct packet_sock *)sk;
258 }
259
260 static void packet_sock_destruct(struct sock *sk)
261 {
262         BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
263         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
264
265         if (!sock_flag(sk, SOCK_DEAD)) {
266                 printk(KERN_ERR "Attempt to release alive packet socket: %p\n", sk);
267                 return;
268         }
269
270         sk_refcnt_debug_dec(sk);
271 }
272
273
274 static const struct proto_ops packet_ops;
275
276 static const struct proto_ops packet_ops_spkt;
277
278 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
279 {
280         struct sock *sk;
281         struct sockaddr_pkt *spkt;
282
283         /*
284          *      When we registered the protocol we saved the socket in the data
285          *      field for just this event.
286          */
287
288         sk = pt->af_packet_priv;
289
290         /*
291          *      Yank back the headers [hope the device set this
292          *      right or kerboom...]
293          *
294          *      Incoming packets have the ll header pulled,
295          *      push it back.
296          *
297          *      For outgoing ones skb->data == skb_mac_header(skb),
298          *      so this procedure is a noop.
299          */
300
301         if (skb->pkt_type == PACKET_LOOPBACK)
302                 goto out;
303
304         if (dev_net(dev) != sock_net(sk))
305                 goto out;
306
307         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
308                 goto oom;
309
310         /* drop any routing info */
311         dst_release(skb->dst);
312         skb->dst = NULL;
313
314         /* drop conntrack reference */
315         nf_reset(skb);
316
317         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
318
319         skb_push(skb, skb->data - skb_mac_header(skb));
320
321         /*
322          *      The SOCK_PACKET socket receives _all_ frames.
323          */
324
325         spkt->spkt_family = dev->type;
326         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
327         spkt->spkt_protocol = skb->protocol;
328
329         /*
330          *      Charge the memory to the socket. This is done specifically
331          *      to prevent sockets using all the memory up.
332          */
333
334         if (sock_queue_rcv_skb(sk,skb) == 0)
335                 return 0;
336
337 out:
338         kfree_skb(skb);
339 oom:
340         return 0;
341 }
342
343
344 /*
345  *      Output a raw packet to a device layer. This bypasses all the other
346  *      protocol layers and you must therefore supply it with a complete frame.
347  */
348
349 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
350                                struct msghdr *msg, size_t len)
351 {
352         struct sock *sk = sock->sk;
353         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
354         struct sk_buff *skb;
355         struct net_device *dev;
356         __be16 proto=0;
357         int err;
358
359         /*
360          *      Get and verify the address.
361          */
362
363         if (saddr)
364         {
365                 if (msg->msg_namelen < sizeof(struct sockaddr))
366                         return(-EINVAL);
367                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
368                         proto=saddr->spkt_protocol;
369         }
370         else
371                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
372
373         /*
374          *      Find the device first to size check it
375          */
376
377         saddr->spkt_device[13] = 0;
378         dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
379         err = -ENODEV;
380         if (dev == NULL)
381                 goto out_unlock;
382
383         err = -ENETDOWN;
384         if (!(dev->flags & IFF_UP))
385                 goto out_unlock;
386
387         /*
388          *      You may not queue a frame bigger than the mtu. This is the lowest level
389          *      raw protocol and you must do your own fragmentation at this level.
390          */
391
392         err = -EMSGSIZE;
393         if (len > dev->mtu + dev->hard_header_len)
394                 goto out_unlock;
395
396         err = -ENOBUFS;
397         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
398
399         /*
400          *      If the write buffer is full, then tough. At this level the user gets to
401          *      deal with the problem - do your own algorithmic backoffs. That's far
402          *      more flexible.
403          */
404
405         if (skb == NULL)
406                 goto out_unlock;
407
408         /*
409          *      Fill it in
410          */
411
412         /* FIXME: Save some space for broken drivers that write a
413          * hard header at transmission time by themselves. PPP is the
414          * notable one here. This should really be fixed at the driver level.
415          */
416         skb_reserve(skb, LL_RESERVED_SPACE(dev));
417         skb_reset_network_header(skb);
418
419         /* Try to align data part correctly */
420         if (dev->header_ops) {
421                 skb->data -= dev->hard_header_len;
422                 skb->tail -= dev->hard_header_len;
423                 if (len < dev->hard_header_len)
424                         skb_reset_network_header(skb);
425         }
426
427         /* Returns -EFAULT on error */
428         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
429         skb->protocol = proto;
430         skb->dev = dev;
431         skb->priority = sk->sk_priority;
432         if (err)
433                 goto out_free;
434
435         /*
436          *      Now send it
437          */
438
439         dev_queue_xmit(skb);
440         dev_put(dev);
441         return(len);
442
443 out_free:
444         kfree_skb(skb);
445 out_unlock:
446         if (dev)
447                 dev_put(dev);
448         return err;
449 }
450
451 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
452                                       unsigned int res)
453 {
454         struct sk_filter *filter;
455
456         rcu_read_lock_bh();
457         filter = rcu_dereference(sk->sk_filter);
458         if (filter != NULL)
459                 res = sk_run_filter(skb, filter->insns, filter->len);
460         rcu_read_unlock_bh();
461
462         return res;
463 }
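/*
 * Illustrative userspace sketch (not part of this file; the snap length is
 * an assumption): installing the socket filter consulted above. A one-line
 * classic BPF program that accepts every packet but truncates it to 96
 * bytes, which run_filter() then applies as the snap length.
 *
 *	struct sock_filter code[] = {
 *		BPF_STMT(BPF_RET | BPF_K, 96),	// accept, snap to 96 bytes
 *	};
 *	struct sock_fprog fprog = {
 *		.len	= sizeof(code) / sizeof(code[0]),
 *		.filter	= code,
 *	};
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &fprog, sizeof(fprog));
 */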
464
465 /*
466    This function does lazy skb cloning in the hope that most of the
467    packets are discarded by BPF.
468
469    Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
470    and skb->cb are mangled. It works because (and until) packets
471    arriving here are owned by the current CPU. Output packets are cloned
472    by dev_queue_xmit_nit(), input packets are processed by net_bh
473    sequentially, so if we return the skb to its original state on exit,
474    we will not harm anyone.
475  */
476
477 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
478 {
479         struct sock *sk;
480         struct sockaddr_ll *sll;
481         struct packet_sock *po;
482         u8 * skb_head = skb->data;
483         int skb_len = skb->len;
484         unsigned int snaplen, res;
485
486         if (skb->pkt_type == PACKET_LOOPBACK)
487                 goto drop;
488
489         sk = pt->af_packet_priv;
490         po = pkt_sk(sk);
491
492         if (dev_net(dev) != sock_net(sk))
493                 goto drop;
494
495         skb->dev = dev;
496
497         if (dev->header_ops) {
498                 /* The device has an explicit notion of ll header,
499                    exported to higher levels.
500
501                    Otherwise, the device hides the details of its frame
502                    structure, so that the corresponding packet head is
503                    never delivered to the user.
504                  */
505                 if (sk->sk_type != SOCK_DGRAM)
506                         skb_push(skb, skb->data - skb_mac_header(skb));
507                 else if (skb->pkt_type == PACKET_OUTGOING) {
508                         /* Special case: outgoing packets have ll header at head */
509                         skb_pull(skb, skb_network_offset(skb));
510                 }
511         }
512
513         snaplen = skb->len;
514
515         res = run_filter(skb, sk, snaplen);
516         if (!res)
517                 goto drop_n_restore;
518         if (snaplen > res)
519                 snaplen = res;
520
521         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
522             (unsigned)sk->sk_rcvbuf)
523                 goto drop_n_acct;
524
525         if (skb_shared(skb)) {
526                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
527                 if (nskb == NULL)
528                         goto drop_n_acct;
529
530                 if (skb_head != skb->data) {
531                         skb->data = skb_head;
532                         skb->len = skb_len;
533                 }
534                 kfree_skb(skb);
535                 skb = nskb;
536         }
537
538         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
539                      sizeof(skb->cb));
540
541         sll = &PACKET_SKB_CB(skb)->sa.ll;
542         sll->sll_family = AF_PACKET;
543         sll->sll_hatype = dev->type;
544         sll->sll_protocol = skb->protocol;
545         sll->sll_pkttype = skb->pkt_type;
546         if (unlikely(po->origdev))
547                 sll->sll_ifindex = orig_dev->ifindex;
548         else
549                 sll->sll_ifindex = dev->ifindex;
550
551         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
552
553         PACKET_SKB_CB(skb)->origlen = skb->len;
554
555         if (pskb_trim(skb, snaplen))
556                 goto drop_n_acct;
557
558         skb_set_owner_r(skb, sk);
559         skb->dev = NULL;
560         dst_release(skb->dst);
561         skb->dst = NULL;
562
563         /* drop conntrack reference */
564         nf_reset(skb);
565
566         spin_lock(&sk->sk_receive_queue.lock);
567         po->stats.tp_packets++;
568         __skb_queue_tail(&sk->sk_receive_queue, skb);
569         spin_unlock(&sk->sk_receive_queue.lock);
570         sk->sk_data_ready(sk, skb->len);
571         return 0;
572
573 drop_n_acct:
574         spin_lock(&sk->sk_receive_queue.lock);
575         po->stats.tp_drops++;
576         spin_unlock(&sk->sk_receive_queue.lock);
577
578 drop_n_restore:
579         if (skb_head != skb->data && skb_shared(skb)) {
580                 skb->data = skb_head;
581                 skb->len = skb_len;
582         }
583 drop:
584         kfree_skb(skb);
585         return 0;
586 }
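/*
 * Illustrative userspace sketch (not part of this file): the po->origdev
 * branch above is enabled with PACKET_ORIGDEV, so the sll_ifindex reported
 * to userspace names the device the packet really arrived on (e.g. a
 * bonding slave) rather than the aggregating device.
 *
 *	int one = 1;
 *	setsockopt(fd, SOL_PACKET, PACKET_ORIGDEV, &one, sizeof(one));
 */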
587
588 #ifdef CONFIG_PACKET_MMAP
589 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
590 {
591         struct sock *sk;
592         struct packet_sock *po;
593         struct sockaddr_ll *sll;
594         union {
595                 struct tpacket_hdr *h1;
596                 struct tpacket2_hdr *h2;
597                 void *raw;
598         } h;
599         u8 * skb_head = skb->data;
600         int skb_len = skb->len;
601         unsigned int snaplen, res;
602         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
603         unsigned short macoff, netoff, hdrlen;
604         struct sk_buff *copy_skb = NULL;
605         struct timeval tv;
606         struct timespec ts;
607
608         if (skb->pkt_type == PACKET_LOOPBACK)
609                 goto drop;
610
611         sk = pt->af_packet_priv;
612         po = pkt_sk(sk);
613
614         if (dev_net(dev) != sock_net(sk))
615                 goto drop;
616
617         if (dev->header_ops) {
618                 if (sk->sk_type != SOCK_DGRAM)
619                         skb_push(skb, skb->data - skb_mac_header(skb));
620                 else if (skb->pkt_type == PACKET_OUTGOING) {
621                         /* Special case: outgoing packets have ll header at head */
622                         skb_pull(skb, skb_network_offset(skb));
623                 }
624         }
625
626         if (skb->ip_summed == CHECKSUM_PARTIAL)
627                 status |= TP_STATUS_CSUMNOTREADY;
628
629         snaplen = skb->len;
630
631         res = run_filter(skb, sk, snaplen);
632         if (!res)
633                 goto drop_n_restore;
634         if (snaplen > res)
635                 snaplen = res;
636
637         if (sk->sk_type == SOCK_DGRAM) {
638                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16;
639         } else {
640                 unsigned maclen = skb_network_offset(skb);
641                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
642                                        (maclen < 16 ? 16 : maclen));
643                 macoff = netoff - maclen;
644         }
645
646         if (macoff + snaplen > po->frame_size) {
647                 if (po->copy_thresh &&
648                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
649                     (unsigned)sk->sk_rcvbuf) {
650                         if (skb_shared(skb)) {
651                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
652                         } else {
653                                 copy_skb = skb_get(skb);
654                                 skb_head = skb->data;
655                         }
656                         if (copy_skb)
657                                 skb_set_owner_r(copy_skb, sk);
658                 }
659                 snaplen = po->frame_size - macoff;
660                 if ((int)snaplen < 0)
661                         snaplen = 0;
662         }
663
664         spin_lock(&sk->sk_receive_queue.lock);
665         h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL);
666         if (!h.raw)
667                 goto ring_is_full;
668         po->head = po->head != po->frame_max ? po->head+1 : 0;
669         po->stats.tp_packets++;
670         if (copy_skb) {
671                 status |= TP_STATUS_COPY;
672                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
673         }
674         if (!po->stats.tp_drops)
675                 status &= ~TP_STATUS_LOSING;
676         spin_unlock(&sk->sk_receive_queue.lock);
677
678         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
679
680         switch (po->tp_version) {
681         case TPACKET_V1:
682                 h.h1->tp_len = skb->len;
683                 h.h1->tp_snaplen = snaplen;
684                 h.h1->tp_mac = macoff;
685                 h.h1->tp_net = netoff;
686                 if (skb->tstamp.tv64)
687                         tv = ktime_to_timeval(skb->tstamp);
688                 else
689                         do_gettimeofday(&tv);
690                 h.h1->tp_sec = tv.tv_sec;
691                 h.h1->tp_usec = tv.tv_usec;
692                 hdrlen = sizeof(*h.h1);
693                 break;
694         case TPACKET_V2:
695                 h.h2->tp_len = skb->len;
696                 h.h2->tp_snaplen = snaplen;
697                 h.h2->tp_mac = macoff;
698                 h.h2->tp_net = netoff;
699                 if (skb->tstamp.tv64)
700                         ts = ktime_to_timespec(skb->tstamp);
701                 else
702                         getnstimeofday(&ts);
703                 h.h2->tp_sec = ts.tv_sec;
704                 h.h2->tp_nsec = ts.tv_nsec;
705                 h.h2->tp_vlan_tci = skb->vlan_tci;
706                 hdrlen = sizeof(*h.h2);
707                 break;
708         default:
709                 BUG();
710         }
711
712         sll = h.raw + TPACKET_ALIGN(hdrlen);
713         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
714         sll->sll_family = AF_PACKET;
715         sll->sll_hatype = dev->type;
716         sll->sll_protocol = skb->protocol;
717         sll->sll_pkttype = skb->pkt_type;
718         if (unlikely(po->origdev))
719                 sll->sll_ifindex = orig_dev->ifindex;
720         else
721                 sll->sll_ifindex = dev->ifindex;
722
723         __packet_set_status(po, h.raw, status);
724         smp_mb();
725
726         {
727                 struct page *p_start, *p_end;
728                 u8 *h_end = h.raw + macoff + snaplen - 1;
729
730                 p_start = virt_to_page(h.raw);
731                 p_end = virt_to_page(h_end);
732                 while (p_start <= p_end) {
733                         flush_dcache_page(p_start);
734                         p_start++;
735                 }
736         }
737
738         sk->sk_data_ready(sk, 0);
739
740 drop_n_restore:
741         if (skb_head != skb->data && skb_shared(skb)) {
742                 skb->data = skb_head;
743                 skb->len = skb_len;
744         }
745 drop:
746         kfree_skb(skb);
747         return 0;
748
749 ring_is_full:
750         po->stats.tp_drops++;
751         spin_unlock(&sk->sk_receive_queue.lock);
752
753         sk->sk_data_ready(sk, 0);
754         if (copy_skb)
755                 kfree_skb(copy_skb);
756         goto drop_n_restore;
757 }
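/*
 * Illustrative userspace sketch (not part of this file; poll()/mmap() setup
 * and the frame pointer are assumed done elsewhere): consuming a TPACKET_V2
 * frame filled in above, including the VLAN TCI this change exports.
 *
 *	struct tpacket2_hdr *hdr = (struct tpacket2_hdr *)frame;
 *
 *	while (!(hdr->tp_status & TP_STATUS_USER))
 *		poll(&pfd, 1, -1);		// wait for the kernel to fill it
 *	printf("len %u snap %u vlan tci 0x%04x\n",
 *	       hdr->tp_len, hdr->tp_snaplen, hdr->tp_vlan_tci);
 *	hdr->tp_status = TP_STATUS_KERNEL;	// hand the frame back
 */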
758
759 #endif
760
761
762 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
763                           struct msghdr *msg, size_t len)
764 {
765         struct sock *sk = sock->sk;
766         struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
767         struct sk_buff *skb;
768         struct net_device *dev;
769         __be16 proto;
770         unsigned char *addr;
771         int ifindex, err, reserve = 0;
772
773         /*
774          *      Get and verify the address.
775          */
776
777         if (saddr == NULL) {
778                 struct packet_sock *po = pkt_sk(sk);
779
780                 ifindex = po->ifindex;
781                 proto   = po->num;
782                 addr    = NULL;
783         } else {
784                 err = -EINVAL;
785                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
786                         goto out;
787                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
788                         goto out;
789                 ifindex = saddr->sll_ifindex;
790                 proto   = saddr->sll_protocol;
791                 addr    = saddr->sll_addr;
792         }
793
794
795         dev = dev_get_by_index(sock_net(sk), ifindex);
796         err = -ENXIO;
797         if (dev == NULL)
798                 goto out_unlock;
799         if (sock->type == SOCK_RAW)
800                 reserve = dev->hard_header_len;
801
802         err = -ENETDOWN;
803         if (!(dev->flags & IFF_UP))
804                 goto out_unlock;
805
806         err = -EMSGSIZE;
807         if (len > dev->mtu+reserve)
808                 goto out_unlock;
809
810         skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
811                                 msg->msg_flags & MSG_DONTWAIT, &err);
812         if (skb==NULL)
813                 goto out_unlock;
814
815         skb_reserve(skb, LL_RESERVED_SPACE(dev));
816         skb_reset_network_header(skb);
817
818         err = -EINVAL;
819         if (sock->type == SOCK_DGRAM &&
820             dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
821                 goto out_free;
822
823         /* Returns -EFAULT on error */
824         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
825         if (err)
826                 goto out_free;
827
828         skb->protocol = proto;
829         skb->dev = dev;
830         skb->priority = sk->sk_priority;
831
832         /*
833          *      Now send it
834          */
835
836         err = dev_queue_xmit(skb);
837         if (err > 0 && (err = net_xmit_errno(err)) != 0)
838                 goto out_unlock;
839
840         dev_put(dev);
841
842         return(len);
843
844 out_free:
845         kfree_skb(skb);
846 out_unlock:
847         if (dev)
848                 dev_put(dev);
849 out:
850         return err;
851 }
852
853 /*
854  *      Close a PACKET socket. This is fairly simple. We immediately go
855  *      to 'closed' state and remove our protocol entry in the device list.
856  */
857
858 static int packet_release(struct socket *sock)
859 {
860         struct sock *sk = sock->sk;
861         struct packet_sock *po;
862         struct net *net;
863
864         if (!sk)
865                 return 0;
866
867         net = sock_net(sk);
868         po = pkt_sk(sk);
869
870         write_lock_bh(&net->packet.sklist_lock);
871         sk_del_node_init(sk);
872         write_unlock_bh(&net->packet.sklist_lock);
873
874         /*
875          *      Unhook packet receive handler.
876          */
877
878         if (po->running) {
879                 /*
880                  *      Remove the protocol hook
881                  */
882                 dev_remove_pack(&po->prot_hook);
883                 po->running = 0;
884                 po->num = 0;
885                 __sock_put(sk);
886         }
887
888         packet_flush_mclist(sk);
889
890 #ifdef CONFIG_PACKET_MMAP
891         if (po->pg_vec) {
892                 struct tpacket_req req;
893                 memset(&req, 0, sizeof(req));
894                 packet_set_ring(sk, &req, 1);
895         }
896 #endif
897
898         /*
899          *      Now the socket is dead. No more input will appear.
900          */
901
902         sock_orphan(sk);
903         sock->sk = NULL;
904
905         /* Purge queues */
906
907         skb_queue_purge(&sk->sk_receive_queue);
908         sk_refcnt_debug_release(sk);
909
910         sock_put(sk);
911         return 0;
912 }
913
914 /*
915  *      Attach a packet hook.
916  */
917
918 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
919 {
920         struct packet_sock *po = pkt_sk(sk);
921         /*
922          *      Detach an existing hook if present.
923          */
924
925         lock_sock(sk);
926
927         spin_lock(&po->bind_lock);
928         if (po->running) {
929                 __sock_put(sk);
930                 po->running = 0;
931                 po->num = 0;
932                 spin_unlock(&po->bind_lock);
933                 dev_remove_pack(&po->prot_hook);
934                 spin_lock(&po->bind_lock);
935         }
936
937         po->num = protocol;
938         po->prot_hook.type = protocol;
939         po->prot_hook.dev = dev;
940
941         po->ifindex = dev ? dev->ifindex : 0;
942
943         if (protocol == 0)
944                 goto out_unlock;
945
946         if (!dev || (dev->flags & IFF_UP)) {
947                 dev_add_pack(&po->prot_hook);
948                 sock_hold(sk);
949                 po->running = 1;
950         } else {
951                 sk->sk_err = ENETDOWN;
952                 if (!sock_flag(sk, SOCK_DEAD))
953                         sk->sk_error_report(sk);
954         }
955
956 out_unlock:
957         spin_unlock(&po->bind_lock);
958         release_sock(sk);
959         return 0;
960 }
961
962 /*
963  *      Bind a packet socket to a device
964  */
965
966 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
967 {
968         struct sock *sk=sock->sk;
969         char name[15];
970         struct net_device *dev;
971         int err = -ENODEV;
972
973         /*
974          *      Check legality
975          */
976
977         if (addr_len != sizeof(struct sockaddr))
978                 return -EINVAL;
979         strlcpy(name,uaddr->sa_data,sizeof(name));
980
981         dev = dev_get_by_name(sock_net(sk), name);
982         if (dev) {
983                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
984                 dev_put(dev);
985         }
986         return err;
987 }
988
989 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
990 {
991         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
992         struct sock *sk=sock->sk;
993         struct net_device *dev = NULL;
994         int err;
995
996
997         /*
998          *      Check legality
999          */
1000
1001         if (addr_len < sizeof(struct sockaddr_ll))
1002                 return -EINVAL;
1003         if (sll->sll_family != AF_PACKET)
1004                 return -EINVAL;
1005
1006         if (sll->sll_ifindex) {
1007                 err = -ENODEV;
1008                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1009                 if (dev == NULL)
1010                         goto out;
1011         }
1012         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1013         if (dev)
1014                 dev_put(dev);
1015
1016 out:
1017         return err;
1018 }
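/*
 * Illustrative userspace sketch (not part of this file; the interface name
 * is an assumption): binding a packet socket to one interface with
 * sockaddr_ll, as handled above.
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */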
1019
1020 static struct proto packet_proto = {
1021         .name     = "PACKET",
1022         .owner    = THIS_MODULE,
1023         .obj_size = sizeof(struct packet_sock),
1024 };
1025
1026 /*
1027  *      Create a packet of type SOCK_PACKET.
1028  */
1029
1030 static int packet_create(struct net *net, struct socket *sock, int protocol)
1031 {
1032         struct sock *sk;
1033         struct packet_sock *po;
1034         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1035         int err;
1036
1037         if (!capable(CAP_NET_RAW))
1038                 return -EPERM;
1039         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1040             sock->type != SOCK_PACKET)
1041                 return -ESOCKTNOSUPPORT;
1042
1043         sock->state = SS_UNCONNECTED;
1044
1045         err = -ENOBUFS;
1046         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1047         if (sk == NULL)
1048                 goto out;
1049
1050         sock->ops = &packet_ops;
1051         if (sock->type == SOCK_PACKET)
1052                 sock->ops = &packet_ops_spkt;
1053
1054         sock_init_data(sock, sk);
1055
1056         po = pkt_sk(sk);
1057         sk->sk_family = PF_PACKET;
1058         po->num = proto;
1059
1060         sk->sk_destruct = packet_sock_destruct;
1061         sk_refcnt_debug_inc(sk);
1062
1063         /*
1064          *      Attach a protocol block
1065          */
1066
1067         spin_lock_init(&po->bind_lock);
1068         po->prot_hook.func = packet_rcv;
1069
1070         if (sock->type == SOCK_PACKET)
1071                 po->prot_hook.func = packet_rcv_spkt;
1072
1073         po->prot_hook.af_packet_priv = sk;
1074
1075         if (proto) {
1076                 po->prot_hook.type = proto;
1077                 dev_add_pack(&po->prot_hook);
1078                 sock_hold(sk);
1079                 po->running = 1;
1080         }
1081
1082         write_lock_bh(&net->packet.sklist_lock);
1083         sk_add_node(sk, &net->packet.sklist);
1084         write_unlock_bh(&net->packet.sklist_lock);
1085         return(0);
1086 out:
1087         return err;
1088 }
1089
1090 /*
1091  *      Pull a packet from our receive queue and hand it to the user.
1092  *      If necessary we block.
1093  */
1094
1095 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1096                           struct msghdr *msg, size_t len, int flags)
1097 {
1098         struct sock *sk = sock->sk;
1099         struct sk_buff *skb;
1100         int copied, err;
1101         struct sockaddr_ll *sll;
1102
1103         err = -EINVAL;
1104         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1105                 goto out;
1106
1107 #if 0
1108         /* What error should we return now? EUNATTACH? */
1109         if (pkt_sk(sk)->ifindex < 0)
1110                 return -ENODEV;
1111 #endif
1112
1113         /*
1114          *      Call the generic datagram receiver. This handles all sorts
1115          *      of horrible races and re-entrancy so we can forget about it
1116          *      in the protocol layers.
1117          *
1118          *      Now it will return ENETDOWN if the device has just gone
1119          *      down, but then it will block.
1120          */
1121
1122         skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1123
1124         /*
1125          *      An error occurred, so return it. Because skb_recv_datagram()
1126          *      handles the blocking, we don't need to see or worry about
1127          *      blocking retries.
1128          */
1129
1130         if (skb == NULL)
1131                 goto out;
1132
1133         /*
1134          *      If the address length field is there to be filled in, we fill
1135          *      it in now.
1136          */
1137
1138         sll = &PACKET_SKB_CB(skb)->sa.ll;
1139         if (sock->type == SOCK_PACKET)
1140                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1141         else
1142                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1143
1144         /*
1145  *      You lose any data beyond the buffer you gave. If this worries a
1146  *      user program, it can ask the device for its MTU anyway.
1147          */
1148
1149         copied = skb->len;
1150         if (copied > len)
1151         {
1152                 copied=len;
1153                 msg->msg_flags|=MSG_TRUNC;
1154         }
1155
1156         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1157         if (err)
1158                 goto out_free;
1159
1160         sock_recv_timestamp(msg, sk, skb);
1161
1162         if (msg->msg_name)
1163                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1164                        msg->msg_namelen);
1165
1166         if (pkt_sk(sk)->auxdata) {
1167                 struct tpacket_auxdata aux;
1168
1169                 aux.tp_status = TP_STATUS_USER;
1170                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1171                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1172                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1173                 aux.tp_snaplen = skb->len;
1174                 aux.tp_mac = 0;
1175                 aux.tp_net = skb_network_offset(skb);
1176                 aux.tp_vlan_tci = skb->vlan_tci;
1177
1178                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1179         }
1180
1181         /*
1182          *      Free or return the buffer as appropriate. Again this
1183          *      hides all the races and re-entrancy issues from us.
1184          */
1185         err = (flags&MSG_TRUNC) ? skb->len : copied;
1186
1187 out_free:
1188         skb_free_datagram(sk, skb);
1189 out:
1190         return err;
1191 }
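/*
 * Illustrative userspace sketch (not part of this file; buffer sizes are
 * assumptions, error handling omitted): reading the tpacket_auxdata control
 * message built above, including the new tp_vlan_tci field.
 *
 *	int one = 1;
 *	char data[2048], ctl[CMSG_SPACE(sizeof(struct tpacket_auxdata))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = {
 *		.msg_iov	= &iov,
 *		.msg_iovlen	= 1,
 *		.msg_control	= ctl,
 *		.msg_controllen	= sizeof(ctl),
 *	};
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_AUXDATA, &one, sizeof(one));
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			printf("vlan tci 0x%04x\n", aux->tp_vlan_tci);
 *		}
 *	}
 */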
1192
1193 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1194                                int *uaddr_len, int peer)
1195 {
1196         struct net_device *dev;
1197         struct sock *sk = sock->sk;
1198
1199         if (peer)
1200                 return -EOPNOTSUPP;
1201
1202         uaddr->sa_family = AF_PACKET;
1203         dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1204         if (dev) {
1205                 strlcpy(uaddr->sa_data, dev->name, 15);
1206                 dev_put(dev);
1207         } else
1208                 memset(uaddr->sa_data, 0, 14);
1209         *uaddr_len = sizeof(*uaddr);
1210
1211         return 0;
1212 }
1213
1214 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1215                           int *uaddr_len, int peer)
1216 {
1217         struct net_device *dev;
1218         struct sock *sk = sock->sk;
1219         struct packet_sock *po = pkt_sk(sk);
1220         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1221
1222         if (peer)
1223                 return -EOPNOTSUPP;
1224
1225         sll->sll_family = AF_PACKET;
1226         sll->sll_ifindex = po->ifindex;
1227         sll->sll_protocol = po->num;
1228         dev = dev_get_by_index(sock_net(sk), po->ifindex);
1229         if (dev) {
1230                 sll->sll_hatype = dev->type;
1231                 sll->sll_halen = dev->addr_len;
1232                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1233                 dev_put(dev);
1234         } else {
1235                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1236                 sll->sll_halen = 0;
1237         }
1238         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1239
1240         return 0;
1241 }
1242
1243 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1244                          int what)
1245 {
1246         switch (i->type) {
1247         case PACKET_MR_MULTICAST:
1248                 if (what > 0)
1249                         dev_mc_add(dev, i->addr, i->alen, 0);
1250                 else
1251                         dev_mc_delete(dev, i->addr, i->alen, 0);
1252                 break;
1253         case PACKET_MR_PROMISC:
1254                 return dev_set_promiscuity(dev, what);
1255                 break;
1256         case PACKET_MR_ALLMULTI:
1257                 return dev_set_allmulti(dev, what);
1258                 break;
1259         default:;
1260         }
1261         return 0;
1262 }
1263
1264 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1265 {
1266         for ( ; i; i=i->next) {
1267                 if (i->ifindex == dev->ifindex)
1268                         packet_dev_mc(dev, i, what);
1269         }
1270 }
1271
1272 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1273 {
1274         struct packet_sock *po = pkt_sk(sk);
1275         struct packet_mclist *ml, *i;
1276         struct net_device *dev;
1277         int err;
1278
1279         rtnl_lock();
1280
1281         err = -ENODEV;
1282         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1283         if (!dev)
1284                 goto done;
1285
1286         err = -EINVAL;
1287         if (mreq->mr_alen > dev->addr_len)
1288                 goto done;
1289
1290         err = -ENOBUFS;
1291         i = kmalloc(sizeof(*i), GFP_KERNEL);
1292         if (i == NULL)
1293                 goto done;
1294
1295         err = 0;
1296         for (ml = po->mclist; ml; ml = ml->next) {
1297                 if (ml->ifindex == mreq->mr_ifindex &&
1298                     ml->type == mreq->mr_type &&
1299                     ml->alen == mreq->mr_alen &&
1300                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1301                         ml->count++;
1302                         /* Free the new element ... */
1303                         kfree(i);
1304                         goto done;
1305                 }
1306         }
1307
1308         i->type = mreq->mr_type;
1309         i->ifindex = mreq->mr_ifindex;
1310         i->alen = mreq->mr_alen;
1311         memcpy(i->addr, mreq->mr_address, i->alen);
1312         i->count = 1;
1313         i->next = po->mclist;
1314         po->mclist = i;
1315         err = packet_dev_mc(dev, i, 1);
1316         if (err) {
1317                 po->mclist = i->next;
1318                 kfree(i);
1319         }
1320
1321 done:
1322         rtnl_unlock();
1323         return err;
1324 }
1325
1326 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1327 {
1328         struct packet_mclist *ml, **mlp;
1329
1330         rtnl_lock();
1331
1332         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1333                 if (ml->ifindex == mreq->mr_ifindex &&
1334                     ml->type == mreq->mr_type &&
1335                     ml->alen == mreq->mr_alen &&
1336                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1337                         if (--ml->count == 0) {
1338                                 struct net_device *dev;
1339                                 *mlp = ml->next;
1340                                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1341                                 if (dev) {
1342                                         packet_dev_mc(dev, ml, -1);
1343                                         dev_put(dev);
1344                                 }
1345                                 kfree(ml);
1346                         }
1347                         rtnl_unlock();
1348                         return 0;
1349                 }
1350         }
1351         rtnl_unlock();
1352         return -EADDRNOTAVAIL;
1353 }
1354
1355 static void packet_flush_mclist(struct sock *sk)
1356 {
1357         struct packet_sock *po = pkt_sk(sk);
1358         struct packet_mclist *ml;
1359
1360         if (!po->mclist)
1361                 return;
1362
1363         rtnl_lock();
1364         while ((ml = po->mclist) != NULL) {
1365                 struct net_device *dev;
1366
1367                 po->mclist = ml->next;
1368                 if ((dev = dev_get_by_index(sock_net(sk), ml->ifindex)) != NULL) {
1369                         packet_dev_mc(dev, ml, -1);
1370                         dev_put(dev);
1371                 }
1372                 kfree(ml);
1373         }
1374         rtnl_unlock();
1375 }
1376
1377 static int
1378 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1379 {
1380         struct sock *sk = sock->sk;
1381         struct packet_sock *po = pkt_sk(sk);
1382         int ret;
1383
1384         if (level != SOL_PACKET)
1385                 return -ENOPROTOOPT;
1386
1387         switch(optname) {
1388         case PACKET_ADD_MEMBERSHIP:
1389         case PACKET_DROP_MEMBERSHIP:
1390         {
1391                 struct packet_mreq_max mreq;
1392                 int len = optlen;
1393                 memset(&mreq, 0, sizeof(mreq));
1394                 if (len < sizeof(struct packet_mreq))
1395                         return -EINVAL;
1396                 if (len > sizeof(mreq))
1397                         len = sizeof(mreq);
1398                 if (copy_from_user(&mreq,optval,len))
1399                         return -EFAULT;
1400                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1401                         return -EINVAL;
1402                 if (optname == PACKET_ADD_MEMBERSHIP)
1403                         ret = packet_mc_add(sk, &mreq);
1404                 else
1405                         ret = packet_mc_drop(sk, &mreq);
1406                 return ret;
1407         }
1408
1409 #ifdef CONFIG_PACKET_MMAP
1410         case PACKET_RX_RING:
1411         {
1412                 struct tpacket_req req;
1413
1414                 if (optlen<sizeof(req))
1415                         return -EINVAL;
1416                 if (copy_from_user(&req,optval,sizeof(req)))
1417                         return -EFAULT;
1418                 return packet_set_ring(sk, &req, 0);
1419         }
1420         case PACKET_COPY_THRESH:
1421         {
1422                 int val;
1423
1424                 if (optlen!=sizeof(val))
1425                         return -EINVAL;
1426                 if (copy_from_user(&val,optval,sizeof(val)))
1427                         return -EFAULT;
1428
1429                 pkt_sk(sk)->copy_thresh = val;
1430                 return 0;
1431         }
1432         case PACKET_VERSION:
1433         {
1434                 int val;
1435
1436                 if (optlen != sizeof(val))
1437                         return -EINVAL;
1438                 if (po->pg_vec)
1439                         return -EBUSY;
1440                 if (copy_from_user(&val, optval, sizeof(val)))
1441                         return -EFAULT;
1442                 switch (val) {
1443                 case TPACKET_V1:
1444                 case TPACKET_V2:
1445                         po->tp_version = val;
1446                         return 0;
1447                 default:
1448                         return -EINVAL;
1449                 }
1450         }
1451 #endif
1452         case PACKET_AUXDATA:
1453         {
1454                 int val;
1455
1456                 if (optlen < sizeof(val))
1457                         return -EINVAL;
1458                 if (copy_from_user(&val, optval, sizeof(val)))
1459                         return -EFAULT;
1460
1461                 po->auxdata = !!val;
1462                 return 0;
1463         }
1464         case PACKET_ORIGDEV:
1465         {
1466                 int val;
1467
1468                 if (optlen < sizeof(val))
1469                         return -EINVAL;
1470                 if (copy_from_user(&val, optval, sizeof(val)))
1471                         return -EFAULT;
1472
1473                 po->origdev = !!val;
1474                 return 0;
1475         }
1476         default:
1477                 return -ENOPROTOOPT;
1478         }
1479 }
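/*
 * Illustrative userspace sketch (not part of this file; the ring geometry
 * is an assumption): PACKET_VERSION must be selected before PACKET_RX_RING,
 * since the po->pg_vec check above refuses to switch afterwards.
 *
 *	int ver = TPACKET_V2;
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 2048,
 *		.tp_frame_nr	= 128,	// 2 frames per block * 64 blocks
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_VERSION, &ver, sizeof(ver));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *		    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */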
1480
1481 static int packet_getsockopt(struct socket *sock, int level, int optname,
1482                              char __user *optval, int __user *optlen)
1483 {
1484         int len;
1485         int val;
1486         struct sock *sk = sock->sk;
1487         struct packet_sock *po = pkt_sk(sk);
1488         void *data;
1489         struct tpacket_stats st;
1490
1491         if (level != SOL_PACKET)
1492                 return -ENOPROTOOPT;
1493
1494         if (get_user(len, optlen))
1495                 return -EFAULT;
1496
1497         if (len < 0)
1498                 return -EINVAL;
1499
1500         switch(optname) {
1501         case PACKET_STATISTICS:
1502                 if (len > sizeof(struct tpacket_stats))
1503                         len = sizeof(struct tpacket_stats);
1504                 spin_lock_bh(&sk->sk_receive_queue.lock);
1505                 st = po->stats;
1506                 memset(&po->stats, 0, sizeof(st));
1507                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1508                 st.tp_packets += st.tp_drops;
1509
1510                 data = &st;
1511                 break;
1512         case PACKET_AUXDATA:
1513                 if (len > sizeof(int))
1514                         len = sizeof(int);
1515                 val = po->auxdata;
1516
1517                 data = &val;
1518                 break;
1519         case PACKET_ORIGDEV:
1520                 if (len > sizeof(int))
1521                         len = sizeof(int);
1522                 val = po->origdev;
1523
1524                 data = &val;
1525                 break;
1526 #ifdef CONFIG_PACKET_MMAP
1527         case PACKET_VERSION:
1528                 if (len > sizeof(int))
1529                         len = sizeof(int);
1530                 val = po->tp_version;
1531                 data = &val;
1532                 break;
1533         case PACKET_HDRLEN:
1534                 if (len > sizeof(int))
1535                         len = sizeof(int);
1536                 if (copy_from_user(&val, optval, len))
1537                         return -EFAULT;
1538                 switch (val) {
1539                 case TPACKET_V1:
1540                         val = sizeof(struct tpacket_hdr);
1541                         break;
1542                 case TPACKET_V2:
1543                         val = sizeof(struct tpacket2_hdr);
1544                         break;
1545                 default:
1546                         return -EINVAL;
1547                 }
1548                 data = &val;
1549                 break;
1550 #endif
1551         default:
1552                 return -ENOPROTOOPT;
1553         }
1554
1555         if (put_user(len, optlen))
1556                 return -EFAULT;
1557         if (copy_to_user(optval, data, len))
1558                 return -EFAULT;
1559         return 0;
1560 }
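/*
 * Illustrative userspace sketch (not part of this file): fetching the
 * counters returned above. Note that the counters reset on every read and
 * that tp_packets includes the drops.
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 *	printf("%u packets, %u dropped\n", st.tp_packets, st.tp_drops);
 */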
1561
1562
1563 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1564 {
1565         struct sock *sk;
1566         struct hlist_node *node;
1567         struct net_device *dev = data;
1568         struct net *net = dev_net(dev);
1569
1570         read_lock(&net->packet.sklist_lock);
1571         sk_for_each(sk, node, &net->packet.sklist) {
1572                 struct packet_sock *po = pkt_sk(sk);
1573
1574                 switch (msg) {
1575                 case NETDEV_UNREGISTER:
1576                         if (po->mclist)
1577                                 packet_dev_mclist(dev, po->mclist, -1);
1578                         /* fallthrough */
1579
1580                 case NETDEV_DOWN:
1581                         if (dev->ifindex == po->ifindex) {
1582                                 spin_lock(&po->bind_lock);
1583                                 if (po->running) {
1584                                         __dev_remove_pack(&po->prot_hook);
1585                                         __sock_put(sk);
1586                                         po->running = 0;
1587                                         sk->sk_err = ENETDOWN;
1588                                         if (!sock_flag(sk, SOCK_DEAD))
1589                                                 sk->sk_error_report(sk);
1590                                 }
1591                                 if (msg == NETDEV_UNREGISTER) {
1592                                         po->ifindex = -1;
1593                                         po->prot_hook.dev = NULL;
1594                                 }
1595                                 spin_unlock(&po->bind_lock);
1596                         }
1597                         break;
1598                 case NETDEV_UP:
1599                         spin_lock(&po->bind_lock);
1600                         if (dev->ifindex == po->ifindex && po->num &&
1601                             !po->running) {
1602                                 dev_add_pack(&po->prot_hook);
1603                                 sock_hold(sk);
1604                                 po->running = 1;
1605                         }
1606                         spin_unlock(&po->bind_lock);
1607                         break;
1608                 }
1609         }
1610         read_unlock(&net->packet.sklist_lock);
1611         return NOTIFY_DONE;
1612 }
1613
1614
1615 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1616                         unsigned long arg)
1617 {
1618         struct sock *sk = sock->sk;
1619
1620         switch(cmd) {
1621                 case SIOCOUTQ:
1622                 {
1623                         int amount = atomic_read(&sk->sk_wmem_alloc);
1624                         return put_user(amount, (int __user *)arg);
1625                 }
1626                 case SIOCINQ:
1627                 {
1628                         struct sk_buff *skb;
1629                         int amount = 0;
1630
1631                         spin_lock_bh(&sk->sk_receive_queue.lock);
1632                         skb = skb_peek(&sk->sk_receive_queue);
1633                         if (skb)
1634                                 amount = skb->len;
1635                         spin_unlock_bh(&sk->sk_receive_queue.lock);
1636                         return put_user(amount, (int __user *)arg);
1637                 }
1638                 case SIOCGSTAMP:
1639                         return sock_get_timestamp(sk, (struct timeval __user *)arg);
1640                 case SIOCGSTAMPNS:
1641                         return sock_get_timestampns(sk, (struct timespec __user *)arg);
1642
1643 #ifdef CONFIG_INET
1644                 case SIOCADDRT:
1645                 case SIOCDELRT:
1646                 case SIOCDARP:
1647                 case SIOCGARP:
1648                 case SIOCSARP:
1649                 case SIOCGIFADDR:
1650                 case SIOCSIFADDR:
1651                 case SIOCGIFBRDADDR:
1652                 case SIOCSIFBRDADDR:
1653                 case SIOCGIFNETMASK:
1654                 case SIOCSIFNETMASK:
1655                 case SIOCGIFDSTADDR:
1656                 case SIOCSIFDSTADDR:
1657                 case SIOCSIFFLAGS:
1658                         if (sock_net(sk) != &init_net)
1659                                 return -ENOIOCTLCMD;
1660                         return inet_dgram_ops.ioctl(sock, cmd, arg);
1661 #endif
1662
1663                 default:
1664                         return -ENOIOCTLCMD;
1665         }
1666         return 0;
1667 }
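/*
 * Illustrative userspace sketch (not part of this file; "fd" is an
 * assumed bound PF_PACKET socket) exercising the queue queries handled
 * above.  Note that SIOCINQ reports the length of the frame at the head
 * of the receive queue (0 if empty), not the total queued bytes, and
 * SIOCOUTQ reports bytes still committed to the send side.
 *
 *	int pending;
 *	if (ioctl(fd, SIOCINQ, &pending) == 0)
 *		printf("next frame: %d bytes\n", pending);
 *	if (ioctl(fd, SIOCOUTQ, &pending) == 0)
 *		printf("unsent: %d bytes\n", pending);
 */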
1668
1669 #ifndef CONFIG_PACKET_MMAP
1670 #define packet_mmap sock_no_mmap
1671 #define packet_poll datagram_poll
1672 #else
1673
1674 static unsigned int packet_poll(struct file *file, struct socket *sock,
1675                                 poll_table *wait)
1676 {
1677         struct sock *sk = sock->sk;
1678         struct packet_sock *po = pkt_sk(sk);
1679         unsigned int mask = datagram_poll(file, sock, wait);
1680
1681         spin_lock_bh(&sk->sk_receive_queue.lock);
1682         if (po->pg_vec) {
1683                 unsigned int last = po->head ? po->head - 1 : po->frame_max;
1684
1685                 if (packet_lookup_frame(po, last, TP_STATUS_USER))
1686                         mask |= POLLIN | POLLRDNORM;
1687         }
1688         spin_unlock_bh(&sk->sk_receive_queue.lock);
1689         return mask;
1690 }
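/*
 * Illustrative userspace sketch (not part of this file; "fd" is an
 * assumed PF_PACKET socket with a PACKET_RX_RING mapped).  As checked
 * above, poll() reports POLLIN once the most recently filled ring slot
 * carries TP_STATUS_USER, i.e. at least one frame awaits userspace.
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	if (poll(&pfd, 1, -1) > 0 && (pfd.revents & POLLIN))
 *		read_ring_frames();
 *
 * where read_ring_frames() stands in for the caller's own consumer loop.
 */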
1691
1692
1693 /* Dirty? Well, I still have not found a better way to account
1694  * for user mmaps.
1695  */
1696
1697 static void packet_mm_open(struct vm_area_struct *vma)
1698 {
1699         struct file *file = vma->vm_file;
1700         struct socket *sock = file->private_data;
1701         struct sock *sk = sock->sk;
1702
1703         if (sk)
1704                 atomic_inc(&pkt_sk(sk)->mapped);
1705 }
1706
1707 static void packet_mm_close(struct vm_area_struct *vma)
1708 {
1709         struct file *file = vma->vm_file;
1710         struct socket *sock = file->private_data;
1711         struct sock *sk = sock->sk;
1712
1713         if (sk)
1714                 atomic_dec(&pkt_sk(sk)->mapped);
1715 }
1716
1717 static struct vm_operations_struct packet_mmap_ops = {
1718         .open = packet_mm_open,
1719         .close = packet_mm_close,
1720 };
1721
1722 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
1723 {
1724         int i;
1725
1726         for (i = 0; i < len; i++) {
1727                 if (likely(pg_vec[i]))
1728                         free_pages((unsigned long) pg_vec[i], order);
1729         }
1730         kfree(pg_vec);
1731 }
1732
1733 static inline char *alloc_one_pg_vec_page(unsigned long order)
1734 {
1735         return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1736                                          order);
1737 }
1738
1739 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1740 {
1741         unsigned int block_nr = req->tp_block_nr;
1742         char **pg_vec;
1743         int i;
1744
1745         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1746         if (unlikely(!pg_vec))
1747                 goto out;
1748
1749         for (i = 0; i < block_nr; i++) {
1750                 pg_vec[i] = alloc_one_pg_vec_page(order);
1751                 if (unlikely(!pg_vec[i]))
1752                         goto out_free_pgvec;
1753         }
1754
1755 out:
1756         return pg_vec;
1757
1758 out_free_pgvec:
1759         free_pg_vec(pg_vec, order, block_nr);
1760         pg_vec = NULL;
1761         goto out;
1762 }
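/*
 * Each block is one physically contiguous allocation of 2^order pages.
 * For example, with 4 KiB pages a tp_block_size of 16384 yields
 * get_order(16384) == 2, i.e. one 4-page allocation per block, and
 * __GFP_ZERO hands the memory back already cleared.
 */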
1763
1764 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1765 {
1766         char **pg_vec = NULL;
1767         struct packet_sock *po = pkt_sk(sk);
1768         int was_running, order = 0;
1769         __be16 num;
1770         int err = 0;
1771
1772         if (req->tp_block_nr) {
1773                 int i;
1774
1775                 /* Sanity tests and some calculations */
1776
1777                 if (unlikely(po->pg_vec))
1778                         return -EBUSY;
1779
1780                 switch (po->tp_version) {
1781                 case TPACKET_V1:
1782                         po->tp_hdrlen = TPACKET_HDRLEN;
1783                         break;
1784                 case TPACKET_V2:
1785                         po->tp_hdrlen = TPACKET2_HDRLEN;
1786                         break;
1787                 }
1788
1789                 if (unlikely((int)req->tp_block_size <= 0))
1790                         return -EINVAL;
1791                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1792                         return -EINVAL;
1793                 if (unlikely(req->tp_frame_size < po->tp_hdrlen))
1794                         return -EINVAL;
1795                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1796                         return -EINVAL;
1797
1798                 po->frames_per_block = req->tp_block_size / req->tp_frame_size;
1799                 if (unlikely(po->frames_per_block <= 0))
1800                         return -EINVAL;
1801                 if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1802                              req->tp_frame_nr))
1803                         return -EINVAL;
1804
1805                 err = -ENOMEM;
1806                 order = get_order(req->tp_block_size);
1807                 pg_vec = alloc_pg_vec(req, order);
1808                 if (unlikely(!pg_vec))
1809                         goto out;
1810
1811                 for (i = 0; i < req->tp_block_nr; i++) {
1812                         void *ptr = pg_vec[i];
1813                         int k;
1814
1815                         for (k = 0; k < po->frames_per_block; k++) {
1816                                 __packet_set_status(po, ptr, TP_STATUS_KERNEL);
1817                                 ptr += req->tp_frame_size;
1818                         }
1819                 }
1820                 /* Done */
1821         } else {
1822                 if (unlikely(req->tp_frame_nr))
1823                         return -EINVAL;
1824         }
1825
1826         lock_sock(sk);
1827
1828         /* Detach socket from network */
1829         spin_lock(&po->bind_lock);
1830         was_running = po->running;
1831         num = po->num;
1832         if (was_running) {
1833                 __dev_remove_pack(&po->prot_hook);
1834                 po->num = 0;
1835                 po->running = 0;
1836                 __sock_put(sk);
1837         }
1838         spin_unlock(&po->bind_lock);
1839
1840         synchronize_net();
1841
1842         err = -EBUSY;
1843         if (closing || atomic_read(&po->mapped) == 0) {
1844                 err = 0;
1845 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; }) /* exchange: set (a) to (b), return old (a) */
1846
1847                 spin_lock_bh(&sk->sk_receive_queue.lock);
1848                 pg_vec = XC(po->pg_vec, pg_vec);
1849                 po->frame_max = (req->tp_frame_nr - 1);
1850                 po->head = 0;
1851                 po->frame_size = req->tp_frame_size;
1852                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1853
1854                 order = XC(po->pg_vec_order, order);
1855                 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1856
1857                 po->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
1858                 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1859                 skb_queue_purge(&sk->sk_receive_queue);
1860 #undef XC
1861                 if (atomic_read(&po->mapped))
1862                         printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1863         }
1864
1865         spin_lock(&po->bind_lock);
1866         if (was_running && !po->running) {
1867                 sock_hold(sk);
1868                 po->running = 1;
1869                 po->num = num;
1870                 dev_add_pack(&po->prot_hook);
1871         }
1872         spin_unlock(&po->bind_lock);
1873
1874         release_sock(sk);
1875
1876         if (pg_vec)
1877                 free_pg_vec(pg_vec, order, req->tp_block_nr);
1878 out:
1879         return err;
1880 }
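/*
 * Illustrative userspace sketch (not part of this file; "fd" is an
 * assumed PF_PACKET socket): a tpacket_req that passes the sanity
 * checks above on a 4 KiB page system.  tp_block_size is a multiple of
 * PAGE_SIZE, tp_frame_size is TPACKET_ALIGNMENT-aligned and at least
 * tp_hdrlen, and tp_frame_nr must equal (4096 / 2048) * 64 == 128.
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_frame_size = 2048,
 *		.tp_block_nr   = 64,
 *		.tp_frame_nr   = 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */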
1881
1882 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1883 {
1884         struct sock *sk = sock->sk;
1885         struct packet_sock *po = pkt_sk(sk);
1886         unsigned long size;
1887         unsigned long start;
1888         int err = -EINVAL;
1889         int i;
1890
1891         if (vma->vm_pgoff)
1892                 return -EINVAL;
1893
1894         size = vma->vm_end - vma->vm_start;
1895
1896         lock_sock(sk);
1897         if (po->pg_vec == NULL)
1898                 goto out;
1899         if (size != po->pg_vec_len * po->pg_vec_pages * PAGE_SIZE)
1900                 goto out;
1901
1902         start = vma->vm_start;
1903         for (i = 0; i < po->pg_vec_len; i++) {
1904                 struct page *page = virt_to_page(po->pg_vec[i]);
1905                 int pg_num;
1906
1907                 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1908                         err = vm_insert_page(vma, start, page);
1909                         if (unlikely(err))
1910                                 goto out;
1911                         start += PAGE_SIZE;
1912                 }
1913         }
1914         atomic_inc(&po->mapped);
1915         vma->vm_ops = &packet_mmap_ops;
1916         err = 0;
1917
1918 out:
1919         release_sock(sk);
1920         return err;
1921 }
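/*
 * Illustrative userspace sketch (not part of this file; "req" is the
 * tpacket_req passed to PACKET_RX_RING and "fd" the same socket).  The
 * mapping must start at offset 0 and cover the whole ring, matching the
 * size check above (pg_vec_len * pg_vec_pages * PAGE_SIZE).
 *
 *	size_t len = (size_t)req.tp_block_size * req.tp_block_nr;
 *	void *ring = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 */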
1922 #endif
1923
1924
1925 static const struct proto_ops packet_ops_spkt = {
1926         .family =       PF_PACKET,
1927         .owner =        THIS_MODULE,
1928         .release =      packet_release,
1929         .bind =         packet_bind_spkt,
1930         .connect =      sock_no_connect,
1931         .socketpair =   sock_no_socketpair,
1932         .accept =       sock_no_accept,
1933         .getname =      packet_getname_spkt,
1934         .poll =         datagram_poll,
1935         .ioctl =        packet_ioctl,
1936         .listen =       sock_no_listen,
1937         .shutdown =     sock_no_shutdown,
1938         .setsockopt =   sock_no_setsockopt,
1939         .getsockopt =   sock_no_getsockopt,
1940         .sendmsg =      packet_sendmsg_spkt,
1941         .recvmsg =      packet_recvmsg,
1942         .mmap =         sock_no_mmap,
1943         .sendpage =     sock_no_sendpage,
1944 };
1945
1946 static const struct proto_ops packet_ops = {
1947         .family =       PF_PACKET,
1948         .owner =        THIS_MODULE,
1949         .release =      packet_release,
1950         .bind =         packet_bind,
1951         .connect =      sock_no_connect,
1952         .socketpair =   sock_no_socketpair,
1953         .accept =       sock_no_accept,
1954         .getname =      packet_getname,
1955         .poll =         packet_poll,
1956         .ioctl =        packet_ioctl,
1957         .listen =       sock_no_listen,
1958         .shutdown =     sock_no_shutdown,
1959         .setsockopt =   packet_setsockopt,
1960         .getsockopt =   packet_getsockopt,
1961         .sendmsg =      packet_sendmsg,
1962         .recvmsg =      packet_recvmsg,
1963         .mmap =         packet_mmap,
1964         .sendpage =     sock_no_sendpage,
1965 };
1966
1967 static struct net_proto_family packet_family_ops = {
1968         .family =       PF_PACKET,
1969         .create =       packet_create,
1970         .owner  =       THIS_MODULE,
1971 };
1972
1973 static struct notifier_block packet_netdev_notifier = {
1974         .notifier_call = packet_notifier,
1975 };
1976
1977 #ifdef CONFIG_PROC_FS
1978 static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
1979 {
1980         struct sock *s;
1981         struct hlist_node *node;
1982
1983         sk_for_each(s, node, &net->packet.sklist) {
1984                 if (!off--)
1985                         return s;
1986         }
1987         return NULL;
1988 }
1989
1990 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1991         __acquires(seq_file_net(seq)->packet.sklist_lock)
1992 {
1993         struct net *net = seq_file_net(seq);
1994         read_lock(&net->packet.sklist_lock);
1995         return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
1996 }
1997
1998 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1999 {
2000         struct net *net = seq_file_net(seq);
2001         ++*pos;
2002         return (v == SEQ_START_TOKEN)
2003                 ? sk_head(&net->packet.sklist)
2004                 : sk_next((struct sock *)v);
2005 }
2006
2007 static void packet_seq_stop(struct seq_file *seq, void *v)
2008         __releases(seq_file_net(seq)->packet.sklist_lock)
2009 {
2010         struct net *net = seq_file_net(seq);
2011         read_unlock(&net->packet.sklist_lock);
2012 }
2013
2014 static int packet_seq_show(struct seq_file *seq, void *v)
2015 {
2016         if (v == SEQ_START_TOKEN)
2017                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
2018         else {
2019                 struct sock *s = v;
2020                 const struct packet_sock *po = pkt_sk(s);
2021
2022                 seq_printf(seq,
2023                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
2024                            s,
2025                            atomic_read(&s->sk_refcnt),
2026                            s->sk_type,
2027                            ntohs(po->num),
2028                            po->ifindex,
2029                            po->running,
2030                            atomic_read(&s->sk_rmem_alloc),
2031                            sock_i_uid(s),
2032                            sock_i_ino(s));
2033         }
2034
2035         return 0;
2036 }
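/*
 * A resulting /proc/net/packet line looks like this (values
 * illustrative, not taken from a real system):
 *
 *	sk       RefCnt Type Proto  Iface R Rmem   User   Inode
 *	f1a2b3c4 3      3    0003   2     1 0      0      12345
 *
 * Proto is the bound ethertype in hex (0003 == ETH_P_ALL) and R is the
 * po->running flag.
 */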
2037
2038 static const struct seq_operations packet_seq_ops = {
2039         .start  = packet_seq_start,
2040         .next   = packet_seq_next,
2041         .stop   = packet_seq_stop,
2042         .show   = packet_seq_show,
2043 };
2044
2045 static int packet_seq_open(struct inode *inode, struct file *file)
2046 {
2047         return seq_open_net(inode, file, &packet_seq_ops,
2048                             sizeof(struct seq_net_private));
2049 }
2050
2051 static const struct file_operations packet_seq_fops = {
2052         .owner          = THIS_MODULE,
2053         .open           = packet_seq_open,
2054         .read           = seq_read,
2055         .llseek         = seq_lseek,
2056         .release        = seq_release_net,
2057 };
2058
2059 #endif
2060
2061 static int packet_net_init(struct net *net)
2062 {
2063         rwlock_init(&net->packet.sklist_lock);
2064         INIT_HLIST_HEAD(&net->packet.sklist);
2065
2066         if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
2067                 return -ENOMEM;
2068
2069         return 0;
2070 }
2071
2072 static void packet_net_exit(struct net *net)
2073 {
2074         proc_net_remove(net, "packet");
2075 }
2076
2077 static struct pernet_operations packet_net_ops = {
2078         .init = packet_net_init,
2079         .exit = packet_net_exit,
2080 };
2081
2082
2083 static void __exit packet_exit(void)
2084 {
2085         unregister_netdevice_notifier(&packet_netdev_notifier);
2086         unregister_pernet_subsys(&packet_net_ops);
2087         sock_unregister(PF_PACKET);
2088         proto_unregister(&packet_proto);
2089 }
2090
2091 static int __init packet_init(void)
2092 {
2093         int rc = proto_register(&packet_proto, 0);
2094
2095         if (rc != 0)
2096                 goto out;
2097
2098         sock_register(&packet_family_ops);
2099         register_pernet_subsys(&packet_net_ops);
2100         register_netdevice_notifier(&packet_netdev_notifier);
2101 out:
2102         return rc;
2103 }
2104
2105 module_init(packet_init);
2106 module_exit(packet_exit);
2107 MODULE_LICENSE("GPL");
2108 MODULE_ALIAS_NETPROTO(PF_PACKET);