net: af_packet should update its inuse counter
net/packet/af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *
12  * Fixes:
13  *              Alan Cox        :       verify_area() now used correctly
14  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
15  *              Alan Cox        :       tidied skbuff lists.
16  *              Alan Cox        :       Now uses generic datagram routines I
17  *                                      added. Also fixed the peek/read crash
18  *                                      from all old Linux datagram code.
19  *              Alan Cox        :       Uses the improved datagram code.
20  *              Alan Cox        :       Added NULL's for socket options.
21  *              Alan Cox        :       Re-commented the code.
22  *              Alan Cox        :       Use new kernel side addressing
23  *              Rob Janssen     :       Correct MTU usage.
24  *              Dave Platt      :       Counter leaks caused by incorrect
25  *                                      interrupt locking and some slightly
26  *                                      dubious gcc output. Can you read
27  *                                      compiler: it said _VOLATILE_
28  *      Richard Kooijman        :       Timestamp fixes.
29  *              Alan Cox        :       New buffers. Use sk->mac.raw.
30  *              Alan Cox        :       sendmsg/recvmsg support.
31  *              Alan Cox        :       Protocol setting support
32  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
33  *      Cyrus Durgin            :       Fixed kerneld for kmod.
34  *      Michal Ostrowski        :       Module initialization cleanup.
35  *         Ulises Alonso        :       Frame number limit removal and
36  *                                      packet_set_ring memory leak.
37  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
38  *                                      The convention is that longer addresses
39  *                                      will simply extend the hardware address
40  *                                      byte arrays at the end of sockaddr_ll
41  *                                      and packet_mreq.
42  *
43  *              This program is free software; you can redistribute it and/or
44  *              modify it under the terms of the GNU General Public License
45  *              as published by the Free Software Foundation; either version
46  *              2 of the License, or (at your option) any later version.
47  *
48  */
49
50 #include <linux/types.h>
51 #include <linux/mm.h>
52 #include <linux/capability.h>
53 #include <linux/fcntl.h>
54 #include <linux/socket.h>
55 #include <linux/in.h>
56 #include <linux/inet.h>
57 #include <linux/netdevice.h>
58 #include <linux/if_packet.h>
59 #include <linux/wireless.h>
60 #include <linux/kernel.h>
61 #include <linux/kmod.h>
62 #include <net/net_namespace.h>
63 #include <net/ip.h>
64 #include <net/protocol.h>
65 #include <linux/skbuff.h>
66 #include <net/sock.h>
67 #include <linux/errno.h>
68 #include <linux/timer.h>
69 #include <asm/system.h>
70 #include <asm/uaccess.h>
71 #include <asm/ioctls.h>
72 #include <asm/page.h>
73 #include <asm/cacheflush.h>
74 #include <asm/io.h>
75 #include <linux/proc_fs.h>
76 #include <linux/seq_file.h>
77 #include <linux/poll.h>
78 #include <linux/module.h>
79 #include <linux/init.h>
80
81 #ifdef CONFIG_INET
82 #include <net/inet_common.h>
83 #endif
84
85 /*
86    Assumptions:
87    - if a device has no dev->hard_header routine, it adds and removes the ll
88      header inside itself. In this case the ll header is invisible outside
89      of the device, but higher levels still should reserve dev->hard_header_len.
90      Some devices are clever enough to reallocate the skb when the header
91      does not fit into the reserved space (tunnels); others are not
92      (PPP).
93    - the packet socket receives packets with the ll header already pulled,
94      so SOCK_RAW should push it back.
95
96 On receive:
97 -----------
98
99 Incoming, dev->hard_header!=NULL
100    mac_header -> ll header
101    data       -> data
102
103 Outgoing, dev->hard_header!=NULL
104    mac_header -> ll header
105    data       -> ll header
106
107 Incoming, dev->hard_header==NULL
108    mac_header -> UNKNOWN position. It very likely points to the ll
109                  header.  PPP does this, which is wrong because it
110                  introduces asymmetry between the rx and tx paths.
111    data       -> data
112
113 Outgoing, dev->hard_header==NULL
114    mac_header -> data. ll header is still not built!
115    data       -> data
116
117 Summary
118   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
119
120
121 On transmit:
122 ------------
123
124 dev->hard_header != NULL
125    mac_header -> ll header
126    data       -> ll header
127
128 dev->hard_header == NULL (ll header is added by device, we cannot control it)
129    mac_header -> data
130    data       -> data
131
132    We should set nh.raw on output to the correct position;
133    the packet classifier depends on it.
134  */
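/* Illustration (a sketch, not part of this file): the two receive flavours
 * described above, seen from userspace. With SOCK_RAW the ll header is
 * pushed back and the read returns the full frame; with SOCK_DGRAM the ll
 * header stays stripped and only the sockaddr_ll metadata is available.
 * The use of ETH_P_ALL and the buffer size are assumptions of the sketch.
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *	#include <linux/if_ether.h>	// ETH_P_ALL
 *	#include <arpa/inet.h>		// htons()
 *
 *	int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	char frame[2048];
 *	struct sockaddr_ll from;
 *	socklen_t fromlen = sizeof(from);
 *	ssize_t n = recvfrom(fd, frame, sizeof(frame), 0,
 *			     (struct sockaddr *)&from, &fromlen);
 *	// frame[0..n) now starts at the link-layer header
 */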
135
136 /* Private packet socket structures. */
137
138 struct packet_mclist
139 {
140         struct packet_mclist    *next;
141         int                     ifindex;
142         int                     count;
143         unsigned short          type;
144         unsigned short          alen;
145         unsigned char           addr[MAX_ADDR_LEN];
146 };
147 /* identical to struct packet_mreq except it has
148  * a longer address field.
149  */
150 struct packet_mreq_max
151 {
152         int             mr_ifindex;
153         unsigned short  mr_type;
154         unsigned short  mr_alen;
155         unsigned char   mr_address[MAX_ADDR_LEN];
156 };
157
158 #ifdef CONFIG_PACKET_MMAP
159 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
160 #endif
161
162 static void packet_flush_mclist(struct sock *sk);
163
164 struct packet_sock {
165         /* struct sock has to be the first member of packet_sock */
166         struct sock             sk;
167         struct tpacket_stats    stats;
168 #ifdef CONFIG_PACKET_MMAP
169         char *                  *pg_vec;
170         unsigned int            head;
171         unsigned int            frames_per_block;
172         unsigned int            frame_size;
173         unsigned int            frame_max;
174         int                     copy_thresh;
175 #endif
176         struct packet_type      prot_hook;
177         spinlock_t              bind_lock;
178         unsigned int            running:1,      /* prot_hook is attached*/
179                                 auxdata:1,
180                                 origdev:1;
181         int                     ifindex;        /* bound device         */
182         __be16                  num;
183         struct packet_mclist    *mclist;
184 #ifdef CONFIG_PACKET_MMAP
185         atomic_t                mapped;
186         unsigned int            pg_vec_order;
187         unsigned int            pg_vec_pages;
188         unsigned int            pg_vec_len;
189         enum tpacket_versions   tp_version;
190         unsigned int            tp_hdrlen;
191         unsigned int            tp_reserve;
192 #endif
193 };
194
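/* Per-skb state kept in skb->cb while the packet sits on the receive
 * queue: the original (untruncated) length and the address that
 * recvmsg() will later copy out to msg_name.
 */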
195 struct packet_skb_cb {
196         unsigned int origlen;
197         union {
198                 struct sockaddr_pkt pkt;
199                 struct sockaddr_ll ll;
200         } sa;
201 };
202
203 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
204
205 #ifdef CONFIG_PACKET_MMAP
206
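/* Return the ring frame at @position, but only if its tp_status word
 * matches @status (TP_STATUS_KERNEL: owned by the kernel, TP_STATUS_USER:
 * handed to userspace); otherwise return NULL.
 */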
207 static void *packet_lookup_frame(struct packet_sock *po, unsigned int position,
208                                  int status)
209 {
210         unsigned int pg_vec_pos, frame_offset;
211         union {
212                 struct tpacket_hdr *h1;
213                 struct tpacket2_hdr *h2;
214                 void *raw;
215         } h;
216
217         pg_vec_pos = position / po->frames_per_block;
218         frame_offset = position % po->frames_per_block;
219
220         h.raw = po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size);
221         switch (po->tp_version) {
222         case TPACKET_V1:
223                 if (status != (h.h1->tp_status ? TP_STATUS_USER :
224                                                  TP_STATUS_KERNEL))
225                         return NULL;
226                 break;
227         case TPACKET_V2:
228                 if (status != (h.h2->tp_status ? TP_STATUS_USER :
229                                                  TP_STATUS_KERNEL))
230                         return NULL;
231                 break;
232         }
233         return h.raw;
234 }
235
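/* Transfer ownership of @frame by writing its tp_status word; userspace
 * polls this word in the mmap'ed ring to learn that a frame is ready.
 */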
236 static void __packet_set_status(struct packet_sock *po, void *frame, int status)
237 {
238         union {
239                 struct tpacket_hdr *h1;
240                 struct tpacket2_hdr *h2;
241                 void *raw;
242         } h;
243
244         h.raw = frame;
245         switch (po->tp_version) {
246         case TPACKET_V1:
247                 h.h1->tp_status = status;
248                 break;
249         case TPACKET_V2:
250                 h.h2->tp_status = status;
251                 break;
252         }
253 }
254 #endif
255
256 static inline struct packet_sock *pkt_sk(struct sock *sk)
257 {
258         return (struct packet_sock *)sk;
259 }
260
261 static void packet_sock_destruct(struct sock *sk)
262 {
263         WARN_ON(atomic_read(&sk->sk_rmem_alloc));
264         WARN_ON(atomic_read(&sk->sk_wmem_alloc));
265
266         if (!sock_flag(sk, SOCK_DEAD)) {
267                 printk("Attempt to release alive packet socket: %p\n", sk);
268                 return;
269         }
270
271         sk_refcnt_debug_dec(sk);
272 }
273
274
275 static const struct proto_ops packet_ops;
276
277 static const struct proto_ops packet_ops_spkt;
278
279 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
280 {
281         struct sock *sk;
282         struct sockaddr_pkt *spkt;
283
284         /*
285          *      When we registered the protocol we saved the socket in the data
286          *      field for just this event.
287          */
288
289         sk = pt->af_packet_priv;
290
291         /*
292          *      Yank back the headers [hope the device set this
293          *      right or kerboom...]
294          *
295          *      Incoming packets have ll header pulled,
296          *      push it back.
297          *
298          *      For outgoing ones skb->data == skb_mac_header(skb)
299          *      so that this procedure is noop.
300          */
301
302         if (skb->pkt_type == PACKET_LOOPBACK)
303                 goto out;
304
305         if (dev_net(dev) != sock_net(sk))
306                 goto out;
307
308         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
309                 goto oom;
310
311         /* drop any routing info */
312         dst_release(skb->dst);
313         skb->dst = NULL;
314
315         /* drop conntrack reference */
316         nf_reset(skb);
317
318         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
319
320         skb_push(skb, skb->data - skb_mac_header(skb));
321
322         /*
323          *      The SOCK_PACKET socket receives _all_ frames.
324          */
325
326         spkt->spkt_family = dev->type;
327         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
328         spkt->spkt_protocol = skb->protocol;
329
330         /*
331          *      Charge the memory to the socket. This is done specifically
332          *      to prevent sockets using all the memory up.
333          */
334
335         if (sock_queue_rcv_skb(sk,skb) == 0)
336                 return 0;
337
338 out:
339         kfree_skb(skb);
340 oom:
341         return 0;
342 }
343
344
345 /*
346  *      Output a raw packet to a device layer. This bypasses all the other
347  *      protocol layers and you must therefore supply it with a complete frame
348  */
349
350 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
351                                struct msghdr *msg, size_t len)
352 {
353         struct sock *sk = sock->sk;
354         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
355         struct sk_buff *skb;
356         struct net_device *dev;
357         __be16 proto=0;
358         int err;
359
360         /*
361          *      Get and verify the address.
362          */
363
364         if (saddr)
365         {
366                 if (msg->msg_namelen < sizeof(struct sockaddr))
367                         return(-EINVAL);
368                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
369                         proto=saddr->spkt_protocol;
370         }
371         else
372                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
373
374         /*
375          *      Find the device first to size check it
376          */
377
378         saddr->spkt_device[13] = 0;
379         dev = dev_get_by_name(sock_net(sk), saddr->spkt_device);
380         err = -ENODEV;
381         if (dev == NULL)
382                 goto out_unlock;
383
384         err = -ENETDOWN;
385         if (!(dev->flags & IFF_UP))
386                 goto out_unlock;
387
388         /*
389          *      You may not queue a frame bigger than the mtu. This is the lowest level
390          *      raw protocol and you must do your own fragmentation at this level.
391          */
392
393         err = -EMSGSIZE;
394         if (len > dev->mtu + dev->hard_header_len)
395                 goto out_unlock;
396
397         err = -ENOBUFS;
398         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
399
400         /*
401          *      If the write buffer is full, then tough. At this level the user gets to
402          *      deal with the problem - do your own algorithmic backoffs. That's far
403          *      more flexible.
404          */
405
406         if (skb == NULL)
407                 goto out_unlock;
408
409         /*
410          *      Fill it in
411          */
412
413         /* FIXME: Save some space for broken drivers that write a
414          * hard header at transmission time by themselves. PPP is the
415          * notable one here. This should really be fixed at the driver level.
416          */
417         skb_reserve(skb, LL_RESERVED_SPACE(dev));
418         skb_reset_network_header(skb);
419
420         /* Try to align data part correctly */
421         if (dev->header_ops) {
422                 skb->data -= dev->hard_header_len;
423                 skb->tail -= dev->hard_header_len;
424                 if (len < dev->hard_header_len)
425                         skb_reset_network_header(skb);
426         }
427
428         /* Returns -EFAULT on error */
429         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
430         skb->protocol = proto;
431         skb->dev = dev;
432         skb->priority = sk->sk_priority;
433         if (err)
434                 goto out_free;
435
436         /*
437          *      Now send it
438          */
439
440         dev_queue_xmit(skb);
441         dev_put(dev);
442         return(len);
443
444 out_free:
445         kfree_skb(skb);
446 out_unlock:
447         if (dev)
448                 dev_put(dev);
449         return err;
450 }
451
452 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
453                                       unsigned int res)
454 {
455         struct sk_filter *filter;
456
457         rcu_read_lock_bh();
458         filter = rcu_dereference(sk->sk_filter);
459         if (filter != NULL)
460                 res = sk_run_filter(skb, filter->insns, filter->len);
461         rcu_read_unlock_bh();
462
463         return res;
464 }
465
466 /*
467    This function does lazy skb cloning in the hope that most packets
468    are discarded by BPF.
469
470    Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
471    and skb->cb are mangled. It works because (and until) packets
472    arriving here are owned by the current CPU. Output packets are cloned
473    by dev_queue_xmit_nit(), input packets are processed by net_bh
474    sequentially, so if we restore the skb to its original state on exit,
475    we will not harm anyone.
476  */
477
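/* Illustration (an assumption, not part of this file): the BPF filter that
 * run_filter() consults is typically installed from userspace with
 * SO_ATTACH_FILTER, e.g.:
 *
 *	struct sock_filter code[] = {
 *		{ 0x06, 0, 0, 0xFFFFFFFF },	// BPF_RET|BPF_K: accept all
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = code };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */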
478 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
479 {
480         struct sock *sk;
481         struct sockaddr_ll *sll;
482         struct packet_sock *po;
483         u8 * skb_head = skb->data;
484         int skb_len = skb->len;
485         unsigned int snaplen, res;
486
487         if (skb->pkt_type == PACKET_LOOPBACK)
488                 goto drop;
489
490         sk = pt->af_packet_priv;
491         po = pkt_sk(sk);
492
493         if (dev_net(dev) != sock_net(sk))
494                 goto drop;
495
496         skb->dev = dev;
497
498         if (dev->header_ops) {
499                 /* The device has an explicit notion of ll header,
500                    exported to higher levels.
501
502                    Otherwise, the device hides the details of its frame
503                    structure, so the corresponding packet head is
504                    never delivered to the user.
505                  */
506                 if (sk->sk_type != SOCK_DGRAM)
507                         skb_push(skb, skb->data - skb_mac_header(skb));
508                 else if (skb->pkt_type == PACKET_OUTGOING) {
509                         /* Special case: outgoing packets have ll header at head */
510                         skb_pull(skb, skb_network_offset(skb));
511                 }
512         }
513
514         snaplen = skb->len;
515
516         res = run_filter(skb, sk, snaplen);
517         if (!res)
518                 goto drop_n_restore;
519         if (snaplen > res)
520                 snaplen = res;
521
522         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
523             (unsigned)sk->sk_rcvbuf)
524                 goto drop_n_acct;
525
526         if (skb_shared(skb)) {
527                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
528                 if (nskb == NULL)
529                         goto drop_n_acct;
530
531                 if (skb_head != skb->data) {
532                         skb->data = skb_head;
533                         skb->len = skb_len;
534                 }
535                 kfree_skb(skb);
536                 skb = nskb;
537         }
538
539         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
540                      sizeof(skb->cb));
541
542         sll = &PACKET_SKB_CB(skb)->sa.ll;
543         sll->sll_family = AF_PACKET;
544         sll->sll_hatype = dev->type;
545         sll->sll_protocol = skb->protocol;
546         sll->sll_pkttype = skb->pkt_type;
547         if (unlikely(po->origdev))
548                 sll->sll_ifindex = orig_dev->ifindex;
549         else
550                 sll->sll_ifindex = dev->ifindex;
551
552         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
553
554         PACKET_SKB_CB(skb)->origlen = skb->len;
555
556         if (pskb_trim(skb, snaplen))
557                 goto drop_n_acct;
558
559         skb_set_owner_r(skb, sk);
560         skb->dev = NULL;
561         dst_release(skb->dst);
562         skb->dst = NULL;
563
564         /* drop conntrack reference */
565         nf_reset(skb);
566
567         spin_lock(&sk->sk_receive_queue.lock);
568         po->stats.tp_packets++;
569         __skb_queue_tail(&sk->sk_receive_queue, skb);
570         spin_unlock(&sk->sk_receive_queue.lock);
571         sk->sk_data_ready(sk, skb->len);
572         return 0;
573
574 drop_n_acct:
575         spin_lock(&sk->sk_receive_queue.lock);
576         po->stats.tp_drops++;
577         spin_unlock(&sk->sk_receive_queue.lock);
578
579 drop_n_restore:
580         if (skb_head != skb->data && skb_shared(skb)) {
581                 skb->data = skb_head;
582                 skb->len = skb_len;
583         }
584 drop:
585         kfree_skb(skb);
586         return 0;
587 }
588
589 #ifdef CONFIG_PACKET_MMAP
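/* Receive path for mmap'ed sockets: copy up to snaplen bytes of the
 * packet into the next free ring frame, fill in the tpacket header and
 * hand the frame to userspace by setting TP_STATUS_USER. If the packet
 * does not fit and copy_thresh allows it, a full copy is additionally
 * queued on the normal receive queue.
 */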
590 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
591 {
592         struct sock *sk;
593         struct packet_sock *po;
594         struct sockaddr_ll *sll;
595         union {
596                 struct tpacket_hdr *h1;
597                 struct tpacket2_hdr *h2;
598                 void *raw;
599         } h;
600         u8 * skb_head = skb->data;
601         int skb_len = skb->len;
602         unsigned int snaplen, res;
603         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
604         unsigned short macoff, netoff, hdrlen;
605         struct sk_buff *copy_skb = NULL;
606         struct timeval tv;
607         struct timespec ts;
608
609         if (skb->pkt_type == PACKET_LOOPBACK)
610                 goto drop;
611
612         sk = pt->af_packet_priv;
613         po = pkt_sk(sk);
614
615         if (dev_net(dev) != sock_net(sk))
616                 goto drop;
617
618         if (dev->header_ops) {
619                 if (sk->sk_type != SOCK_DGRAM)
620                         skb_push(skb, skb->data - skb_mac_header(skb));
621                 else if (skb->pkt_type == PACKET_OUTGOING) {
622                         /* Special case: outgoing packets have ll header at head */
623                         skb_pull(skb, skb_network_offset(skb));
624                 }
625         }
626
627         if (skb->ip_summed == CHECKSUM_PARTIAL)
628                 status |= TP_STATUS_CSUMNOTREADY;
629
630         snaplen = skb->len;
631
632         res = run_filter(skb, sk, snaplen);
633         if (!res)
634                 goto drop_n_restore;
635         if (snaplen > res)
636                 snaplen = res;
637
638         if (sk->sk_type == SOCK_DGRAM) {
639                 macoff = netoff = TPACKET_ALIGN(po->tp_hdrlen) + 16 +
640                                   po->tp_reserve;
641         } else {
642                 unsigned maclen = skb_network_offset(skb);
643                 netoff = TPACKET_ALIGN(po->tp_hdrlen +
644                                        (maclen < 16 ? 16 : maclen)) +
645                         po->tp_reserve;
646                 macoff = netoff - maclen;
647         }
648
649         if (macoff + snaplen > po->frame_size) {
650                 if (po->copy_thresh &&
651                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
652                     (unsigned)sk->sk_rcvbuf) {
653                         if (skb_shared(skb)) {
654                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
655                         } else {
656                                 copy_skb = skb_get(skb);
657                                 skb_head = skb->data;
658                         }
659                         if (copy_skb)
660                                 skb_set_owner_r(copy_skb, sk);
661                 }
662                 snaplen = po->frame_size - macoff;
663                 if ((int)snaplen < 0)
664                         snaplen = 0;
665         }
666
667         spin_lock(&sk->sk_receive_queue.lock);
668         h.raw = packet_lookup_frame(po, po->head, TP_STATUS_KERNEL);
669         if (!h.raw)
670                 goto ring_is_full;
671         po->head = po->head != po->frame_max ? po->head+1 : 0;
672         po->stats.tp_packets++;
673         if (copy_skb) {
674                 status |= TP_STATUS_COPY;
675                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
676         }
677         if (!po->stats.tp_drops)
678                 status &= ~TP_STATUS_LOSING;
679         spin_unlock(&sk->sk_receive_queue.lock);
680
681         skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
682
683         switch (po->tp_version) {
684         case TPACKET_V1:
685                 h.h1->tp_len = skb->len;
686                 h.h1->tp_snaplen = snaplen;
687                 h.h1->tp_mac = macoff;
688                 h.h1->tp_net = netoff;
689                 if (skb->tstamp.tv64)
690                         tv = ktime_to_timeval(skb->tstamp);
691                 else
692                         do_gettimeofday(&tv);
693                 h.h1->tp_sec = tv.tv_sec;
694                 h.h1->tp_usec = tv.tv_usec;
695                 hdrlen = sizeof(*h.h1);
696                 break;
697         case TPACKET_V2:
698                 h.h2->tp_len = skb->len;
699                 h.h2->tp_snaplen = snaplen;
700                 h.h2->tp_mac = macoff;
701                 h.h2->tp_net = netoff;
702                 if (skb->tstamp.tv64)
703                         ts = ktime_to_timespec(skb->tstamp);
704                 else
705                         getnstimeofday(&ts);
706                 h.h2->tp_sec = ts.tv_sec;
707                 h.h2->tp_nsec = ts.tv_nsec;
708                 h.h2->tp_vlan_tci = skb->vlan_tci;
709                 hdrlen = sizeof(*h.h2);
710                 break;
711         default:
712                 BUG();
713         }
714
715         sll = h.raw + TPACKET_ALIGN(hdrlen);
716         sll->sll_halen = dev_parse_header(skb, sll->sll_addr);
717         sll->sll_family = AF_PACKET;
718         sll->sll_hatype = dev->type;
719         sll->sll_protocol = skb->protocol;
720         sll->sll_pkttype = skb->pkt_type;
721         if (unlikely(po->origdev))
722                 sll->sll_ifindex = orig_dev->ifindex;
723         else
724                 sll->sll_ifindex = dev->ifindex;
725
726         __packet_set_status(po, h.raw, status);
727         smp_mb();
728
729         {
730                 struct page *p_start, *p_end;
731                 u8 *h_end = h.raw + macoff + snaplen - 1;
732
733                 p_start = virt_to_page(h.raw);
734                 p_end = virt_to_page(h_end);
735                 while (p_start <= p_end) {
736                         flush_dcache_page(p_start);
737                         p_start++;
738                 }
739         }
740
741         sk->sk_data_ready(sk, 0);
742
743 drop_n_restore:
744         if (skb_head != skb->data && skb_shared(skb)) {
745                 skb->data = skb_head;
746                 skb->len = skb_len;
747         }
748 drop:
749         kfree_skb(skb);
750         return 0;
751
752 ring_is_full:
753         po->stats.tp_drops++;
754         spin_unlock(&sk->sk_receive_queue.lock);
755
756         sk->sk_data_ready(sk, 0);
757         if (copy_skb)
758                 kfree_skb(copy_skb);
759         goto drop_n_restore;
760 }
761
762 #endif
763
764
765 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
766                           struct msghdr *msg, size_t len)
767 {
768         struct sock *sk = sock->sk;
769         struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
770         struct sk_buff *skb;
771         struct net_device *dev;
772         __be16 proto;
773         unsigned char *addr;
774         int ifindex, err, reserve = 0;
775
776         /*
777          *      Get and verify the address.
778          */
779
780         if (saddr == NULL) {
781                 struct packet_sock *po = pkt_sk(sk);
782
783                 ifindex = po->ifindex;
784                 proto   = po->num;
785                 addr    = NULL;
786         } else {
787                 err = -EINVAL;
788                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
789                         goto out;
790                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
791                         goto out;
792                 ifindex = saddr->sll_ifindex;
793                 proto   = saddr->sll_protocol;
794                 addr    = saddr->sll_addr;
795         }
796
797
798         dev = dev_get_by_index(sock_net(sk), ifindex);
799         err = -ENXIO;
800         if (dev == NULL)
801                 goto out_unlock;
802         if (sock->type == SOCK_RAW)
803                 reserve = dev->hard_header_len;
804
805         err = -ENETDOWN;
806         if (!(dev->flags & IFF_UP))
807                 goto out_unlock;
808
809         err = -EMSGSIZE;
810         if (len > dev->mtu+reserve)
811                 goto out_unlock;
812
813         skb = sock_alloc_send_skb(sk, len + LL_ALLOCATED_SPACE(dev),
814                                 msg->msg_flags & MSG_DONTWAIT, &err);
815         if (skb==NULL)
816                 goto out_unlock;
817
818         skb_reserve(skb, LL_RESERVED_SPACE(dev));
819         skb_reset_network_header(skb);
820
821         err = -EINVAL;
822         if (sock->type == SOCK_DGRAM &&
823             dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
824                 goto out_free;
825
826         /* Returns -EFAULT on error */
827         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
828         if (err)
829                 goto out_free;
830
831         skb->protocol = proto;
832         skb->dev = dev;
833         skb->priority = sk->sk_priority;
834
835         /*
836          *      Now send it
837          */
838
839         err = dev_queue_xmit(skb);
840         if (err > 0 && (err = net_xmit_errno(err)) != 0)
841                 goto out_unlock;
842
843         dev_put(dev);
844
845         return(len);
846
847 out_free:
848         kfree_skb(skb);
849 out_unlock:
850         if (dev)
851                 dev_put(dev);
852 out:
853         return err;
854 }
855
856 /*
857  *      Close a PACKET socket. This is fairly simple. We immediately go
858  *      to 'closed' state and remove our protocol entry in the device list.
859  */
860
861 static int packet_release(struct socket *sock)
862 {
863         struct sock *sk = sock->sk;
864         struct packet_sock *po;
865         struct net *net;
866
867         if (!sk)
868                 return 0;
869
870         net = sock_net(sk);
871         po = pkt_sk(sk);
872
873         write_lock_bh(&net->packet.sklist_lock);
874         sk_del_node_init(sk);
875         write_unlock_bh(&net->packet.sklist_lock);
876
877         /*
878          *      Unhook packet receive handler.
879          */
880
881         if (po->running) {
882                 /*
883                  *      Remove the protocol hook
884                  */
885                 dev_remove_pack(&po->prot_hook);
886                 po->running = 0;
887                 po->num = 0;
888                 __sock_put(sk);
889         }
890
891         packet_flush_mclist(sk);
892
893 #ifdef CONFIG_PACKET_MMAP
894         if (po->pg_vec) {
895                 struct tpacket_req req;
896                 memset(&req, 0, sizeof(req));
897                 packet_set_ring(sk, &req, 1);
898         }
899 #endif
900
901         /*
902          *      Now the socket is dead. No more input will appear.
903          */
904
905         sock_orphan(sk);
906         sock->sk = NULL;
907
908         /* Purge queues */
909
910         skb_queue_purge(&sk->sk_receive_queue);
911         sk_refcnt_debug_release(sk);
912
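	/* Undo the per-namespace protocol inuse accounting done in
	 * packet_create(). */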
913         sock_prot_inuse_add(net, sk->sk_prot, -1);
914         sock_put(sk);
915         return 0;
916 }
917
918 /*
919  *      Attach a packet hook.
920  */
921
922 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
923 {
924         struct packet_sock *po = pkt_sk(sk);
925         /*
926          *      Detach an existing hook if present.
927          */
928
929         lock_sock(sk);
930
931         spin_lock(&po->bind_lock);
932         if (po->running) {
933                 __sock_put(sk);
934                 po->running = 0;
935                 po->num = 0;
936                 spin_unlock(&po->bind_lock);
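		/* dev_remove_pack() ends up in synchronize_net() and may
		 * sleep, so the spinlock must be dropped across the call. */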
937                 dev_remove_pack(&po->prot_hook);
938                 spin_lock(&po->bind_lock);
939         }
940
941         po->num = protocol;
942         po->prot_hook.type = protocol;
943         po->prot_hook.dev = dev;
944
945         po->ifindex = dev ? dev->ifindex : 0;
946
947         if (protocol == 0)
948                 goto out_unlock;
949
950         if (!dev || (dev->flags & IFF_UP)) {
951                 dev_add_pack(&po->prot_hook);
952                 sock_hold(sk);
953                 po->running = 1;
954         } else {
955                 sk->sk_err = ENETDOWN;
956                 if (!sock_flag(sk, SOCK_DEAD))
957                         sk->sk_error_report(sk);
958         }
959
960 out_unlock:
961         spin_unlock(&po->bind_lock);
962         release_sock(sk);
963         return 0;
964 }
965
966 /*
967  *      Bind a packet socket to a device
968  */
969
970 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
971 {
972         struct sock *sk=sock->sk;
973         char name[15];
974         struct net_device *dev;
975         int err = -ENODEV;
976
977         /*
978          *      Check legality
979          */
980
981         if (addr_len != sizeof(struct sockaddr))
982                 return -EINVAL;
983         strlcpy(name,uaddr->sa_data,sizeof(name));
984
985         dev = dev_get_by_name(sock_net(sk), name);
986         if (dev) {
987                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
988                 dev_put(dev);
989         }
990         return err;
991 }
992
993 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
994 {
995         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
996         struct sock *sk=sock->sk;
997         struct net_device *dev = NULL;
998         int err;
999
1000
1001         /*
1002          *      Check legality
1003          */
1004
1005         if (addr_len < sizeof(struct sockaddr_ll))
1006                 return -EINVAL;
1007         if (sll->sll_family != AF_PACKET)
1008                 return -EINVAL;
1009
1010         if (sll->sll_ifindex) {
1011                 err = -ENODEV;
1012                 dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
1013                 if (dev == NULL)
1014                         goto out;
1015         }
1016         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
1017         if (dev)
1018                 dev_put(dev);
1019
1020 out:
1021         return err;
1022 }
1023
1024 static struct proto packet_proto = {
1025         .name     = "PACKET",
1026         .owner    = THIS_MODULE,
1027         .obj_size = sizeof(struct packet_sock),
1028 };
1029
1030 /*
1031  *      Create a packet of type SOCK_PACKET.
1032  */
1033
1034 static int packet_create(struct net *net, struct socket *sock, int protocol)
1035 {
1036         struct sock *sk;
1037         struct packet_sock *po;
1038         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1039         int err;
1040
1041         if (!capable(CAP_NET_RAW))
1042                 return -EPERM;
1043         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
1044             sock->type != SOCK_PACKET)
1045                 return -ESOCKTNOSUPPORT;
1046
1047         sock->state = SS_UNCONNECTED;
1048
1049         err = -ENOBUFS;
1050         sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
1051         if (sk == NULL)
1052                 goto out;
1053
1054         sock->ops = &packet_ops;
1055         if (sock->type == SOCK_PACKET)
1056                 sock->ops = &packet_ops_spkt;
1057
1058         sock_init_data(sock, sk);
1059
1060         po = pkt_sk(sk);
1061         sk->sk_family = PF_PACKET;
1062         po->num = proto;
1063
1064         sk->sk_destruct = packet_sock_destruct;
1065         sk_refcnt_debug_inc(sk);
1066
1067         /*
1068          *      Attach a protocol block
1069          */
1070
1071         spin_lock_init(&po->bind_lock);
1072         po->prot_hook.func = packet_rcv;
1073
1074         if (sock->type == SOCK_PACKET)
1075                 po->prot_hook.func = packet_rcv_spkt;
1076
1077         po->prot_hook.af_packet_priv = sk;
1078
1079         if (proto) {
1080                 po->prot_hook.type = proto;
1081                 dev_add_pack(&po->prot_hook);
1082                 sock_hold(sk);
1083                 po->running = 1;
1084         }
1085
1086         write_lock_bh(&net->packet.sklist_lock);
1087         sk_add_node(sk, &net->packet.sklist);
1088         write_unlock_bh(&net->packet.sklist_lock);
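	/* Account the new socket in the per-namespace protocol inuse
	 * counter; packet_release() decrements it again on close. */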
1089         sock_prot_inuse_add(net, &packet_proto, 1);
1090         return(0);
1091 out:
1092         return err;
1093 }
1094
1095 /*
1096  *      Pull a packet from our receive queue and hand it to the user.
1097  *      If necessary we block.
1098  */
1099
1100 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1101                           struct msghdr *msg, size_t len, int flags)
1102 {
1103         struct sock *sk = sock->sk;
1104         struct sk_buff *skb;
1105         int copied, err;
1106         struct sockaddr_ll *sll;
1107
1108         err = -EINVAL;
1109         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1110                 goto out;
1111
1112 #if 0
1113         /* What error should we return now? EUNATTACH? */
1114         if (pkt_sk(sk)->ifindex < 0)
1115                 return -ENODEV;
1116 #endif
1117
1118         /*
1119          *      Call the generic datagram receiver. This handles all sorts
1120          *      of horrible races and re-entrancy so we can forget about it
1121          *      in the protocol layers.
1122          *
1123          *      It will return ENETDOWN if the device has just gone down,
1124          *      but then it will block.
1125          */
1126
1127         skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1128
1129         /*
1130          *      An error occurred so return it. Because skb_recv_datagram()
1131          *      handles the blocking, we don't see or worry about blocking
1132          *      retries.
1133          */
1134
1135         if (skb == NULL)
1136                 goto out;
1137
1138         /*
1139          *      If the address length field is there to be filled in, we fill
1140          *      it in now.
1141          */
1142
1143         sll = &PACKET_SKB_CB(skb)->sa.ll;
1144         if (sock->type == SOCK_PACKET)
1145                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1146         else
1147                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1148
1149         /*
1150          *      You lose any data beyond the buffer you gave. If it worries a
1151          *      user program they can ask the device for its MTU anyway.
1152          */
1153
1154         copied = skb->len;
1155         if (copied > len)
1156         {
1157                 copied=len;
1158                 msg->msg_flags|=MSG_TRUNC;
1159         }
1160
1161         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1162         if (err)
1163                 goto out_free;
1164
1165         sock_recv_timestamp(msg, sk, skb);
1166
1167         if (msg->msg_name)
1168                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1169                        msg->msg_namelen);
1170
1171         if (pkt_sk(sk)->auxdata) {
1172                 struct tpacket_auxdata aux;
1173
1174                 aux.tp_status = TP_STATUS_USER;
1175                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1176                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1177                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1178                 aux.tp_snaplen = skb->len;
1179                 aux.tp_mac = 0;
1180                 aux.tp_net = skb_network_offset(skb);
1181                 aux.tp_vlan_tci = skb->vlan_tci;
1182
1183                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1184         }
1185
1186         /*
1187          *      Free or return the buffer as appropriate. Again this
1188          *      hides all the races and re-entrancy issues from us.
1189          */
1190         err = (flags&MSG_TRUNC) ? skb->len : copied;
1191
1192 out_free:
1193         skb_free_datagram(sk, skb);
1194 out:
1195         return err;
1196 }
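/* Illustration (an assumption, not part of this file): reading the
 * PACKET_AUXDATA control message that packet_recvmsg() emits once the
 * option has been enabled via setsockopt(PACKET_AUXDATA):
 *
 *	char buf[2048], ctl[64];
 *	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
 *	struct msghdr msg = {
 *		.msg_iov = &iov, .msg_iovlen = 1,
 *		.msg_control = ctl, .msg_controllen = sizeof(ctl),
 *	};
 *	recvmsg(fd, &msg, 0);
 *	struct cmsghdr *cmsg;
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg)) {
 *		if (cmsg->cmsg_level == SOL_PACKET &&
 *		    cmsg->cmsg_type == PACKET_AUXDATA) {
 *			struct tpacket_auxdata *aux = (void *)CMSG_DATA(cmsg);
 *			// aux->tp_len is the original, untruncated length
 *		}
 *	}
 */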
1197
1198 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1199                                int *uaddr_len, int peer)
1200 {
1201         struct net_device *dev;
1202         struct sock *sk = sock->sk;
1203
1204         if (peer)
1205                 return -EOPNOTSUPP;
1206
1207         uaddr->sa_family = AF_PACKET;
1208         dev = dev_get_by_index(sock_net(sk), pkt_sk(sk)->ifindex);
1209         if (dev) {
1210                 strlcpy(uaddr->sa_data, dev->name, 15);
1211                 dev_put(dev);
1212         } else
1213                 memset(uaddr->sa_data, 0, 14);
1214         *uaddr_len = sizeof(*uaddr);
1215
1216         return 0;
1217 }
1218
1219 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1220                           int *uaddr_len, int peer)
1221 {
1222         struct net_device *dev;
1223         struct sock *sk = sock->sk;
1224         struct packet_sock *po = pkt_sk(sk);
1225         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1226
1227         if (peer)
1228                 return -EOPNOTSUPP;
1229
1230         sll->sll_family = AF_PACKET;
1231         sll->sll_ifindex = po->ifindex;
1232         sll->sll_protocol = po->num;
1233         dev = dev_get_by_index(sock_net(sk), po->ifindex);
1234         if (dev) {
1235                 sll->sll_hatype = dev->type;
1236                 sll->sll_halen = dev->addr_len;
1237                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1238                 dev_put(dev);
1239         } else {
1240                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1241                 sll->sll_halen = 0;
1242         }
1243         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1244
1245         return 0;
1246 }
1247
1248 static int packet_dev_mc(struct net_device *dev, struct packet_mclist *i,
1249                          int what)
1250 {
1251         switch (i->type) {
1252         case PACKET_MR_MULTICAST:
1253                 if (what > 0)
1254                         dev_mc_add(dev, i->addr, i->alen, 0);
1255                 else
1256                         dev_mc_delete(dev, i->addr, i->alen, 0);
1257                 break;
1258         case PACKET_MR_PROMISC:
1259                 return dev_set_promiscuity(dev, what);
1260                 break;
1261         case PACKET_MR_ALLMULTI:
1262                 return dev_set_allmulti(dev, what);
1263                 break;
1264         default:;
1265         }
1266         return 0;
1267 }
1268
1269 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1270 {
1271         for ( ; i; i=i->next) {
1272                 if (i->ifindex == dev->ifindex)
1273                         packet_dev_mc(dev, i, what);
1274         }
1275 }
1276
1277 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1278 {
1279         struct packet_sock *po = pkt_sk(sk);
1280         struct packet_mclist *ml, *i;
1281         struct net_device *dev;
1282         int err;
1283
1284         rtnl_lock();
1285
1286         err = -ENODEV;
1287         dev = __dev_get_by_index(sock_net(sk), mreq->mr_ifindex);
1288         if (!dev)
1289                 goto done;
1290
1291         err = -EINVAL;
1292         if (mreq->mr_alen > dev->addr_len)
1293                 goto done;
1294
1295         err = -ENOBUFS;
1296         i = kmalloc(sizeof(*i), GFP_KERNEL);
1297         if (i == NULL)
1298                 goto done;
1299
1300         err = 0;
1301         for (ml = po->mclist; ml; ml = ml->next) {
1302                 if (ml->ifindex == mreq->mr_ifindex &&
1303                     ml->type == mreq->mr_type &&
1304                     ml->alen == mreq->mr_alen &&
1305                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1306                         ml->count++;
1307                         /* Free the new element ... */
1308                         kfree(i);
1309                         goto done;
1310                 }
1311         }
1312
1313         i->type = mreq->mr_type;
1314         i->ifindex = mreq->mr_ifindex;
1315         i->alen = mreq->mr_alen;
1316         memcpy(i->addr, mreq->mr_address, i->alen);
1317         i->count = 1;
1318         i->next = po->mclist;
1319         po->mclist = i;
1320         err = packet_dev_mc(dev, i, 1);
1321         if (err) {
1322                 po->mclist = i->next;
1323                 kfree(i);
1324         }
1325
1326 done:
1327         rtnl_unlock();
1328         return err;
1329 }
1330
1331 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1332 {
1333         struct packet_mclist *ml, **mlp;
1334
1335         rtnl_lock();
1336
1337         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1338                 if (ml->ifindex == mreq->mr_ifindex &&
1339                     ml->type == mreq->mr_type &&
1340                     ml->alen == mreq->mr_alen &&
1341                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1342                         if (--ml->count == 0) {
1343                                 struct net_device *dev;
1344                                 *mlp = ml->next;
1345                                 dev = dev_get_by_index(sock_net(sk), ml->ifindex);
1346                                 if (dev) {
1347                                         packet_dev_mc(dev, ml, -1);
1348                                         dev_put(dev);
1349                                 }
1350                                 kfree(ml);
1351                         }
1352                         rtnl_unlock();
1353                         return 0;
1354                 }
1355         }
1356         rtnl_unlock();
1357         return -EADDRNOTAVAIL;
1358 }
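/* Illustration (an assumption, not part of this file): the membership
 * requests handled above are issued from userspace like this, e.g. to put
 * an interface into promiscuous mode via the refcounted API:
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = ifindex,
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */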
1359
1360 static void packet_flush_mclist(struct sock *sk)
1361 {
1362         struct packet_sock *po = pkt_sk(sk);
1363         struct packet_mclist *ml;
1364
1365         if (!po->mclist)
1366                 return;
1367
1368         rtnl_lock();
1369         while ((ml = po->mclist) != NULL) {
1370                 struct net_device *dev;
1371
1372                 po->mclist = ml->next;
1373                 if ((dev = dev_get_by_index(sock_net(sk), ml->ifindex)) != NULL) {
1374                         packet_dev_mc(dev, ml, -1);
1375                         dev_put(dev);
1376                 }
1377                 kfree(ml);
1378         }
1379         rtnl_unlock();
1380 }
1381
1382 static int
1383 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1384 {
1385         struct sock *sk = sock->sk;
1386         struct packet_sock *po = pkt_sk(sk);
1387         int ret;
1388
1389         if (level != SOL_PACKET)
1390                 return -ENOPROTOOPT;
1391
1392         switch(optname) {
1393         case PACKET_ADD_MEMBERSHIP:
1394         case PACKET_DROP_MEMBERSHIP:
1395         {
1396                 struct packet_mreq_max mreq;
1397                 int len = optlen;
1398                 memset(&mreq, 0, sizeof(mreq));
1399                 if (len < sizeof(struct packet_mreq))
1400                         return -EINVAL;
1401                 if (len > sizeof(mreq))
1402                         len = sizeof(mreq);
1403                 if (copy_from_user(&mreq,optval,len))
1404                         return -EFAULT;
1405                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1406                         return -EINVAL;
1407                 if (optname == PACKET_ADD_MEMBERSHIP)
1408                         ret = packet_mc_add(sk, &mreq);
1409                 else
1410                         ret = packet_mc_drop(sk, &mreq);
1411                 return ret;
1412         }
1413
1414 #ifdef CONFIG_PACKET_MMAP
1415         case PACKET_RX_RING:
1416         {
1417                 struct tpacket_req req;
1418
1419                 if (optlen<sizeof(req))
1420                         return -EINVAL;
1421                 if (copy_from_user(&req,optval,sizeof(req)))
1422                         return -EFAULT;
1423                 return packet_set_ring(sk, &req, 0);
1424         }
1425         case PACKET_COPY_THRESH:
1426         {
1427                 int val;
1428
1429                 if (optlen!=sizeof(val))
1430                         return -EINVAL;
1431                 if (copy_from_user(&val,optval,sizeof(val)))
1432                         return -EFAULT;
1433
1434                 pkt_sk(sk)->copy_thresh = val;
1435                 return 0;
1436         }
1437         case PACKET_VERSION:
1438         {
1439                 int val;
1440
1441                 if (optlen != sizeof(val))
1442                         return -EINVAL;
1443                 if (po->pg_vec)
1444                         return -EBUSY;
1445                 if (copy_from_user(&val, optval, sizeof(val)))
1446                         return -EFAULT;
1447                 switch (val) {
1448                 case TPACKET_V1:
1449                 case TPACKET_V2:
1450                         po->tp_version = val;
1451                         return 0;
1452                 default:
1453                         return -EINVAL;
1454                 }
1455         }
1456         case PACKET_RESERVE:
1457         {
1458                 unsigned int val;
1459
1460                 if (optlen != sizeof(val))
1461                         return -EINVAL;
1462                 if (po->pg_vec)
1463                         return -EBUSY;
1464                 if (copy_from_user(&val, optval, sizeof(val)))
1465                         return -EFAULT;
1466                 po->tp_reserve = val;
1467                 return 0;
1468         }
1469 #endif
1470         case PACKET_AUXDATA:
1471         {
1472                 int val;
1473
1474                 if (optlen < sizeof(val))
1475                         return -EINVAL;
1476                 if (copy_from_user(&val, optval, sizeof(val)))
1477                         return -EFAULT;
1478
1479                 po->auxdata = !!val;
1480                 return 0;
1481         }
1482         case PACKET_ORIGDEV:
1483         {
1484                 int val;
1485
1486                 if (optlen < sizeof(val))
1487                         return -EINVAL;
1488                 if (copy_from_user(&val, optval, sizeof(val)))
1489                         return -EFAULT;
1490
1491                 po->origdev = !!val;
1492                 return 0;
1493         }
1494         default:
1495                 return -ENOPROTOOPT;
1496         }
1497 }
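/* Illustration (a sketch with arbitrary sizes, not part of this file):
 * configuring the receive ring handled by PACKET_RX_RING above and
 * mapping it. tp_block_size must be a multiple of the page size, and
 * tp_frame_nr must equal tp_block_nr * (tp_block_size / tp_frame_size).
 *
 *	struct tpacket_req req = {
 *		.tp_block_size = 4096,
 *		.tp_block_nr   = 64,
 *		.tp_frame_size = 2048,
 *		.tp_frame_nr   = 128,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	void *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */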
1498
1499 static int packet_getsockopt(struct socket *sock, int level, int optname,
1500                              char __user *optval, int __user *optlen)
1501 {
1502         int len;
1503         int val;
1504         struct sock *sk = sock->sk;
1505         struct packet_sock *po = pkt_sk(sk);
1506         void *data;
1507         struct tpacket_stats st;
1508
1509         if (level != SOL_PACKET)
1510                 return -ENOPROTOOPT;
1511
1512         if (get_user(len, optlen))
1513                 return -EFAULT;
1514
1515         if (len < 0)
1516                 return -EINVAL;
1517
1518         switch(optname) {
1519         case PACKET_STATISTICS:
1520                 if (len > sizeof(struct tpacket_stats))
1521                         len = sizeof(struct tpacket_stats);
1522                 spin_lock_bh(&sk->sk_receive_queue.lock);
1523                 st = po->stats;
1524                 memset(&po->stats, 0, sizeof(st));
1525                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1526                 st.tp_packets += st.tp_drops;
1527
1528                 data = &st;
1529                 break;
1530         case PACKET_AUXDATA:
1531                 if (len > sizeof(int))
1532                         len = sizeof(int);
1533                 val = po->auxdata;
1534
1535                 data = &val;
1536                 break;
1537         case PACKET_ORIGDEV:
1538                 if (len > sizeof(int))
1539                         len = sizeof(int);
1540                 val = po->origdev;
1541
1542                 data = &val;
1543                 break;
1544 #ifdef CONFIG_PACKET_MMAP
1545         case PACKET_VERSION:
1546                 if (len > sizeof(int))
1547                         len = sizeof(int);
1548                 val = po->tp_version;
1549                 data = &val;
1550                 break;
1551         case PACKET_HDRLEN:
1552                 if (len > sizeof(int))
1553                         len = sizeof(int);
1554                 if (copy_from_user(&val, optval, len))
1555                         return -EFAULT;
1556                 switch (val) {
1557                 case TPACKET_V1:
1558                         val = sizeof(struct tpacket_hdr);
1559                         break;
1560                 case TPACKET_V2:
1561                         val = sizeof(struct tpacket2_hdr);
1562                         break;
1563                 default:
1564                         return -EINVAL;
1565                 }
1566                 data = &val;
1567                 break;
1568         case PACKET_RESERVE:
1569                 if (len > sizeof(unsigned int))
1570                         len = sizeof(unsigned int);
1571                 val = po->tp_reserve;
1572                 data = &val;
1573                 break;
1574 #endif
1575         default:
1576                 return -ENOPROTOOPT;
1577         }
1578
1579         if (put_user(len, optlen))
1580                 return -EFAULT;
1581         if (copy_to_user(optval, data, len))
1582                 return -EFAULT;
1583         return 0;
1584 }
1585
1586
1587 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1588 {
1589         struct sock *sk;
1590         struct hlist_node *node;
1591         struct net_device *dev = data;
1592         struct net *net = dev_net(dev);
1593
1594         read_lock(&net->packet.sklist_lock);
1595         sk_for_each(sk, node, &net->packet.sklist) {
1596                 struct packet_sock *po = pkt_sk(sk);
1597
1598                 switch (msg) {
1599                 case NETDEV_UNREGISTER:
1600                         if (po->mclist)
1601                                 packet_dev_mclist(dev, po->mclist, -1);
1602                         /* fallthrough */
1603
1604                 case NETDEV_DOWN:
1605                         if (dev->ifindex == po->ifindex) {
1606                                 spin_lock(&po->bind_lock);
1607                                 if (po->running) {
1608                                         __dev_remove_pack(&po->prot_hook);
1609                                         __sock_put(sk);
1610                                         po->running = 0;
1611                                         sk->sk_err = ENETDOWN;
1612                                         if (!sock_flag(sk, SOCK_DEAD))
1613                                                 sk->sk_error_report(sk);
1614                                 }
1615                                 if (msg == NETDEV_UNREGISTER) {
1616                                         po->ifindex = -1;
1617                                         po->prot_hook.dev = NULL;
1618                                 }
1619                                 spin_unlock(&po->bind_lock);
1620                         }
1621                         break;
1622                 case NETDEV_UP:
1623                         spin_lock(&po->bind_lock);
1624                         if (dev->ifindex == po->ifindex && po->num &&
1625                             !po->running) {
1626                                 dev_add_pack(&po->prot_hook);
1627                                 sock_hold(sk);
1628                                 po->running = 1;
1629                         }
1630                         spin_unlock(&po->bind_lock);
1631                         break;
1632                 }
1633         }
1634         read_unlock(&net->packet.sklist_lock);
1635         return NOTIFY_DONE;
1636 }
1637
1638
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch (cmd) {
	case SIOCOUTQ:
	{
		int amount = atomic_read(&sk->sk_wmem_alloc);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCINQ:
	{
		struct sk_buff *skb;
		int amount = 0;

		spin_lock_bh(&sk->sk_receive_queue.lock);
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		return put_user(amount, (int __user *)arg);
	}
	case SIOCGSTAMP:
		return sock_get_timestamp(sk, (struct timeval __user *)arg);
	case SIOCGSTAMPNS:
		return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
	case SIOCADDRT:
	case SIOCDELRT:
	case SIOCDARP:
	case SIOCGARP:
	case SIOCSARP:
	case SIOCGIFADDR:
	case SIOCSIFADDR:
	case SIOCGIFBRDADDR:
	case SIOCSIFBRDADDR:
	case SIOCGIFNETMASK:
	case SIOCSIFNETMASK:
	case SIOCGIFDSTADDR:
	case SIOCSIFDSTADDR:
	case SIOCSIFFLAGS:
		if (!net_eq(sock_net(sk), &init_net))
			return -ENOIOCTLCMD;
		return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

	default:
		return -ENOIOCTLCMD;
	}
	return 0;
}

#ifndef CONFIG_PACKET_MMAP
#define packet_mmap sock_no_mmap
#define packet_poll datagram_poll
#else

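/*
 * poll() for sockets with an mmap()ed ring: on top of the usual
 * datagram semantics, report POLLIN whenever the most recently filled
 * frame is still marked TP_STATUS_USER, i.e. at least one frame is
 * waiting to be consumed by user space.
 */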
static unsigned int packet_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->pg_vec) {
		unsigned int last = po->head ? po->head - 1 : po->frame_max;

		if (packet_lookup_frame(po, last, TP_STATUS_USER))
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return mask;
}

/* Dirty? Well, I still have not learned a better way to account
 * for user mmaps.
 */

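/*
 * The vm_operations below keep po->mapped in step with the number of
 * live mappings of the ring; packet_set_ring() refuses to replace or
 * free the ring (except at socket close) while that count is non-zero.
 */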
static void packet_mm_open(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_inc(&pkt_sk(sk)->mapped);
}

static void packet_mm_close(struct vm_area_struct *vma)
{
	struct file *file = vma->vm_file;
	struct socket *sock = file->private_data;
	struct sock *sk = sock->sk;

	if (sk)
		atomic_dec(&pkt_sk(sk)->mapped);
}

static struct vm_operations_struct packet_mmap_ops = {
	.open	= packet_mm_open,
	.close	= packet_mm_close,
};

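/*
 * Ring memory is a vector of blocks, each one a physically contiguous,
 * page-aligned allocation of order get_order(tp_block_size).  The
 * blocks are allocated with __GFP_COMP so that high-order pages can
 * later be handed to vm_insert_page(), and with __GFP_ZERO so that no
 * stale kernel data is exposed through a user-visible frame.
 */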
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	int i;

	for (i = 0; i < len; i++) {
		if (likely(pg_vec[i]))
			free_pages((unsigned long) pg_vec[i], order);
	}
	kfree(pg_vec);
}

static inline char *alloc_one_pg_vec_page(unsigned long order)
{
	return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
					 order);
}

static char **alloc_pg_vec(struct tpacket_req *req, int order)
{
	unsigned int block_nr = req->tp_block_nr;
	char **pg_vec;
	int i;

	pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
	if (unlikely(!pg_vec))
		goto out;

	for (i = 0; i < block_nr; i++) {
		pg_vec[i] = alloc_one_pg_vec_page(order);
		if (unlikely(!pg_vec[i]))
			goto out_free_pgvec;
	}

out:
	return pg_vec;

out_free_pgvec:
	free_pg_vec(pg_vec, order, block_nr);
	pg_vec = NULL;
	goto out;
}

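/*
 * Create or tear down the receive ring described by @req: validate the
 * geometry, allocate the block vector, detach the protocol hook so no
 * packets are delivered meanwhile, swap the old and new rings under the
 * receive-queue lock, then re-attach the hook.
 *
 * A minimal userspace sketch of driving this path (illustrative only;
 * error handling is omitted and the sizes are merely example values).
 * tp_block_size must be a multiple of PAGE_SIZE, tp_frame_size must be
 * TPACKET_ALIGNMENT-aligned, and tp_frame_nr must equal
 * tp_block_nr * (tp_block_size / tp_frame_size):
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 4096,
 *		.tp_block_nr	= 64,
 *		.tp_frame_size	= 2048,
 *		.tp_frame_nr	= 64 * 2,
 *	};
 *	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 *	char *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */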
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
{
	char **pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	__be16 num;
	int err = 0;

	if (req->tp_block_nr) {
		int i;

		/* Sanity tests and some calculations */

		if (unlikely(po->pg_vec))
			return -EBUSY;

		switch (po->tp_version) {
		case TPACKET_V1:
			po->tp_hdrlen = TPACKET_HDRLEN;
			break;
		case TPACKET_V2:
			po->tp_hdrlen = TPACKET2_HDRLEN;
			break;
		}

		if (unlikely((int)req->tp_block_size <= 0))
			return -EINVAL;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			return -EINVAL;
		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
					          po->tp_reserve))
			return -EINVAL;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			return -EINVAL;

		po->frames_per_block = req->tp_block_size / req->tp_frame_size;
		if (unlikely(po->frames_per_block <= 0))
			return -EINVAL;
		if (unlikely((po->frames_per_block * req->tp_block_nr) !=
			     req->tp_frame_nr))
			return -EINVAL;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;

		for (i = 0; i < req->tp_block_nr; i++) {
			void *ptr = pg_vec[i];
			int k;

			for (k = 0; k < po->frames_per_block; k++) {
				__packet_set_status(po, ptr, TP_STATUS_KERNEL);
				ptr += req->tp_frame_size;
			}
		}
		/* Done */
	} else {
		if (unlikely(req->tp_frame_nr))
			return -EINVAL;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	synchronize_net();

	err = -EBUSY;
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })

		spin_lock_bh(&sk->sk_receive_queue.lock);
		pg_vec = XC(po->pg_vec, pg_vec);
		po->frame_max = (req->tp_frame_nr - 1);
		po->head = 0;
		po->frame_size = req->tp_frame_size;
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		order = XC(po->pg_vec_order, order);
		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);

		po->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
		skb_queue_purge(&sk->sk_receive_queue);
#undef XC
		if (atomic_read(&po->mapped))
			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n",
			       atomic_read(&po->mapped));
	}

	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}

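/*
 * Map the whole ring into the caller's address space: the request must
 * start at offset 0 and cover exactly pg_vec_len * pg_vec_pages pages,
 * which are then inserted one at a time with vm_insert_page().
 */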
static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	size = vma->vm_end - vma->vm_start;

	lock_sock(sk);
	if (po->pg_vec == NULL)
		goto out;
	if (size != po->pg_vec_len * po->pg_vec_pages * PAGE_SIZE)
		goto out;

	start = vma->vm_start;
	for (i = 0; i < po->pg_vec_len; i++) {
		struct page *page = virt_to_page(po->pg_vec[i]);
		int pg_num;

		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
			err = vm_insert_page(vma, start, page);
			if (unlikely(err))
				goto out;
			start += PAGE_SIZE;
		}
	}
	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	release_sock(sk);
	return err;
}
#endif

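/*
 * Two ops tables: packet_ops_spkt backs the obsolete SOCK_PACKET
 * interface, which has no ring/mmap or socket-option support, while
 * packet_ops backs SOCK_RAW and SOCK_DGRAM PF_PACKET sockets.
 */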
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};

static const struct proto_ops packet_ops = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname,
	.poll =		packet_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	packet_setsockopt,
	.getsockopt =	packet_getsockopt,
	.sendmsg =	packet_sendmsg,
	.recvmsg =	packet_recvmsg,
	.mmap =		packet_mmap,
	.sendpage =	sock_no_sendpage,
};

static struct net_proto_family packet_family_ops = {
	.family =	PF_PACKET,
	.create =	packet_create,
	.owner	=	THIS_MODULE,
};

static struct notifier_block packet_netdev_notifier = {
	.notifier_call = packet_notifier,
};

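/*
 * /proc/net/packet: one line per packet socket in the namespace,
 * walked under the sklist read lock through the seq_file interface.
 */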
#ifdef CONFIG_PROC_FS
static inline struct sock *packet_seq_idx(struct net *net, loff_t off)
{
	struct sock *s;
	struct hlist_node *node;

	sk_for_each(s, node, &net->packet.sklist) {
		if (!off--)
			return s;
	}
	return NULL;
}

static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
	__acquires(seq_file_net(seq)->packet.sklist_lock)
{
	struct net *net = seq_file_net(seq);
	read_lock(&net->packet.sklist_lock);
	return *pos ? packet_seq_idx(net, *pos - 1) : SEQ_START_TOKEN;
}

static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct net *net = seq_file_net(seq);
	++*pos;
	return (v == SEQ_START_TOKEN)
		? sk_head(&net->packet.sklist)
		: sk_next((struct sock *)v);
}

static void packet_seq_stop(struct seq_file *seq, void *v)
	__releases(seq_file_net(seq)->packet.sklist_lock)
{
	struct net *net = seq_file_net(seq);
	read_unlock(&net->packet.sklist_lock);
}

static int packet_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
	else {
		struct sock *s = v;
		const struct packet_sock *po = pkt_sk(s);

		seq_printf(seq,
			   "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
			   s,
			   atomic_read(&s->sk_refcnt),
			   s->sk_type,
			   ntohs(po->num),
			   po->ifindex,
			   po->running,
			   atomic_read(&s->sk_rmem_alloc),
			   sock_i_uid(s),
			   sock_i_ino(s));
	}

	return 0;
}

static const struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};

static int packet_seq_open(struct inode *inode, struct file *file)
{
	return seq_open_net(inode, file, &packet_seq_ops,
			    sizeof(struct seq_net_private));
}

static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};

#endif

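/*
 * Per-namespace state: each network namespace gets its own socket
 * list, list lock and /proc/net/packet entry.
 */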
static int packet_net_init(struct net *net)
{
	rwlock_init(&net->packet.sklist_lock);
	INIT_HLIST_HEAD(&net->packet.sklist);

	if (!proc_net_fops_create(net, "packet", 0, &packet_seq_fops))
		return -ENOMEM;

	return 0;
}

static void packet_net_exit(struct net *net)
{
	proc_net_remove(net, "packet");
}

static struct pernet_operations packet_net_ops = {
	.init = packet_net_init,
	.exit = packet_net_exit,
};

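/*
 * Module entry points.  proto_register() must succeed before the
 * address family is registered, and packet_exit() unwinds the
 * registrations in reverse order.
 */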
static void __exit packet_exit(void)
{
	unregister_netdevice_notifier(&packet_netdev_notifier);
	unregister_pernet_subsys(&packet_net_ops);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}

static int __init packet_init(void)
{
	int rc = proto_register(&packet_proto, 0);

	if (rc != 0)
		goto out;

	sock_register(&packet_family_ops);
	register_pernet_subsys(&packet_net_ops);
	register_netdevice_notifier(&packet_netdev_notifier);
out:
	return rc;
}

module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);