[AF_PACKET]: Add option to return orig_dev to userspace.
[safe/jmp/linux-2.6] net/packet/af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and
38  *                                      packet_set_ring memory leak.
39  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
40  *                                      The convention is that longer addresses
41  *                                      will simply extend the hardware address
42  *                                      byte arrays at the end of sockaddr_ll
43  *                                      and packet_mreq.
44  *
45  *              This program is free software; you can redistribute it and/or
46  *              modify it under the terms of the GNU General Public License
47  *              as published by the Free Software Foundation; either version
48  *              2 of the License, or (at your option) any later version.
49  *
50  */
51
52 #include <linux/types.h>
53 #include <linux/mm.h>
54 #include <linux/capability.h>
55 #include <linux/fcntl.h>
56 #include <linux/socket.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/if_packet.h>
61 #include <linux/wireless.h>
62 #include <linux/kernel.h>
63 #include <linux/kmod.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81
82 #ifdef CONFIG_INET
83 #include <net/inet_common.h>
84 #endif
85
86 #define CONFIG_SOCK_PACKET      1
87
88 /*
89    Proposed replacement for SIOC{ADD,DEL}MULTI and
90    IFF_PROMISC, IFF_ALLMULTI flags.
91
92    It is more expensive, but I believe
93    it is the really correct solution: reentrant, safe and fault tolerant.
94
95    IFF_PROMISC/IFF_ALLMULTI/SIOC{ADD/DEL}MULTI are faked by keeping a
96    reference count and a global flag, so that the real status is
97    (gflag|(count != 0)) and we can keep the obsolete faulty interface
98    without harming clever users.
99  */
100 #define CONFIG_PACKET_MULTICAST 1
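/*
 * Illustrative userspace sketch (not part of this file; "fd" and "eth0"
 * are placeholders): the refcounted promiscuity described above is
 * driven via PACKET_ADD_MEMBERSHIP with PACKET_MR_PROMISC.
 *
 *	struct packet_mreq mreq = {
 *		.mr_ifindex = if_nametoindex("eth0"),
 *		.mr_type    = PACKET_MR_PROMISC,
 *	};
 *	setsockopt(fd, SOL_PACKET, PACKET_ADD_MEMBERSHIP,
 *		   &mreq, sizeof(mreq));
 */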
101
102 /*
103    Assumptions:
104    - if a device has no dev->hard_header routine, it adds and removes the
105      ll header inside itself. In this case the ll header is invisible
106      outside of the device, but higher levels should still reserve
107      dev->hard_header_len. Some devices are clever enough to reallocate
108      the skb when the header does not fit into the reserved space
109      (tunnels); others are silly (PPP).
110    - the packet socket receives packets with the ll header pulled,
111      so SOCK_RAW should push it back.
112
113 On receive:
114 -----------
115
116 Incoming, dev->hard_header!=NULL
117    mac_header -> ll header
118    data       -> data
119
120 Outgoing, dev->hard_header!=NULL
121    mac_header -> ll header
122    data       -> ll header
123
124 Incoming, dev->hard_header==NULL
125    mac_header -> UNKNOWN position. It is very likely that it points to the
126                  ll header.  PPP does this, which is wrong because it
127                  introduces asymmetry between the rx and tx paths.
128    data       -> data
129
130 Outgoing, dev->hard_header==NULL
131    mac_header -> data. ll header is still not built!
132    data       -> data
133
134 Summary
135   If dev->hard_header==NULL we are unlikely to restore a sensible ll header.
136
137
138 On transmit:
139 ------------
140
141 dev->hard_header != NULL
142    mac_header -> ll header
143    data       -> ll header
144
145 dev->hard_header == NULL (ll header is added by device, we cannot control it)
146    mac_header -> data
147    data       -> data
148
149    We should set nh.raw on output to the correct position;
150    the packet classifier depends on it.
151  */
152
153 /* List of all packet sockets. */
154 static HLIST_HEAD(packet_sklist);
155 static DEFINE_RWLOCK(packet_sklist_lock);
156
157 static atomic_t packet_socks_nr;
158
159
160 /* Private packet socket structures. */
161
162 #ifdef CONFIG_PACKET_MULTICAST
163 struct packet_mclist
164 {
165         struct packet_mclist    *next;
166         int                     ifindex;
167         int                     count;
168         unsigned short          type;
169         unsigned short          alen;
170         unsigned char           addr[MAX_ADDR_LEN];
171 };
172 /* identical to struct packet_mreq except it has
173  * a longer address field.
174  */
175 struct packet_mreq_max
176 {
177         int             mr_ifindex;
178         unsigned short  mr_type;
179         unsigned short  mr_alen;
180         unsigned char   mr_address[MAX_ADDR_LEN];
181 };
182 #endif
183 #ifdef CONFIG_PACKET_MMAP
184 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
185 #endif
186
187 static void packet_flush_mclist(struct sock *sk);
188
189 struct packet_sock {
190         /* struct sock has to be the first member of packet_sock */
191         struct sock             sk;
192         struct tpacket_stats    stats;
193 #ifdef CONFIG_PACKET_MMAP
194         char *                  *pg_vec;
195         unsigned int            head;
196         unsigned int            frames_per_block;
197         unsigned int            frame_size;
198         unsigned int            frame_max;
199         int                     copy_thresh;
200 #endif
201         struct packet_type      prot_hook;
202         spinlock_t              bind_lock;
203         unsigned int            running:1,      /* prot_hook is attached */
204                                 auxdata:1,
205                                 origdev:1;
206         int                     ifindex;        /* bound device         */
207         __be16                  num;
208 #ifdef CONFIG_PACKET_MULTICAST
209         struct packet_mclist    *mclist;
210 #endif
211 #ifdef CONFIG_PACKET_MMAP
212         atomic_t                mapped;
213         unsigned int            pg_vec_order;
214         unsigned int            pg_vec_pages;
215         unsigned int            pg_vec_len;
216 #endif
217 };
218
219 struct packet_skb_cb {
220         unsigned int origlen;
221         union {
222                 struct sockaddr_pkt pkt;
223                 struct sockaddr_ll ll;
224         } sa;
225 };
226
227 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
228
229 #ifdef CONFIG_PACKET_MMAP
230
231 static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
232 {
233         unsigned int pg_vec_pos, frame_offset;
234
235         pg_vec_pos = position / po->frames_per_block;
236         frame_offset = position % po->frames_per_block;
237
238         return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
239 }
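/*
 * Worked example (illustrative; assumes 4 KiB pages): with
 * tp_block_size = 8192 and tp_frame_size = 2048, frames_per_block is 4;
 * frame number 5 gives pg_vec_pos = 1 and frame_offset = 1, i.e. the
 * frame lives at pg_vec[1] + 2048.
 */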
240 #endif
241
242 static inline struct packet_sock *pkt_sk(struct sock *sk)
243 {
244         return (struct packet_sock *)sk;
245 }
246
247 static void packet_sock_destruct(struct sock *sk)
248 {
249         BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
250         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
251
252         if (!sock_flag(sk, SOCK_DEAD)) {
253                 printk("Attempt to release alive packet socket: %p\n", sk);
254                 return;
255         }
256
257         atomic_dec(&packet_socks_nr);
258 #ifdef PACKET_REFCNT_DEBUG
259         printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
260 #endif
261 }
262
263
264 static const struct proto_ops packet_ops;
265
266 #ifdef CONFIG_SOCK_PACKET
267 static const struct proto_ops packet_ops_spkt;
268
269 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
270 {
271         struct sock *sk;
272         struct sockaddr_pkt *spkt;
273
274         /*
275          *      When we registered the protocol we saved the socket in the data
276          *      field for just this event.
277          */
278
279         sk = pt->af_packet_priv;
280
281         /*
282          *      Yank back the headers [hope the device set this
283          *      right or kerboom...]
284          *
285          *      Incoming packets have ll header pulled,
286          *      push it back.
287          *
288          *      For outgoing ones skb->data == skb_mac_header(skb)
289          *      so that this procedure is a no-op.
290          */
291
292         if (skb->pkt_type == PACKET_LOOPBACK)
293                 goto out;
294
295         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
296                 goto oom;
297
298         /* drop any routing info */
299         dst_release(skb->dst);
300         skb->dst = NULL;
301
302         /* drop conntrack reference */
303         nf_reset(skb);
304
305         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
306
307         skb_push(skb, skb->data - skb_mac_header(skb));
308
309         /*
310          *      The SOCK_PACKET socket receives _all_ frames.
311          */
312
313         spkt->spkt_family = dev->type;
314         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
315         spkt->spkt_protocol = skb->protocol;
316
317         /*
318          *      Charge the memory to the socket. This is done specifically
319          *      to prevent sockets using all the memory up.
320          */
321
322         if (sock_queue_rcv_skb(sk,skb) == 0)
323                 return 0;
324
325 out:
326         kfree_skb(skb);
327 oom:
328         return 0;
329 }
330
331
332 /*
333  *      Output a raw packet to a device layer. This bypasses all the other
334  *      protocol layers and you must therefore supply it with a complete frame
335  */
336
337 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
338                                struct msghdr *msg, size_t len)
339 {
340         struct sock *sk = sock->sk;
341         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
342         struct sk_buff *skb;
343         struct net_device *dev;
344         __be16 proto=0;
345         int err;
346
347         /*
348          *      Get and verify the address.
349          */
350
351         if (saddr)
352         {
353                 if (msg->msg_namelen < sizeof(struct sockaddr))
354                         return(-EINVAL);
355                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
356                         proto=saddr->spkt_protocol;
357         }
358         else
359                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
360
361         /*
362          *      Find the device first to size check it
363          */
364
365         saddr->spkt_device[13] = 0;
366         dev = dev_get_by_name(saddr->spkt_device);
367         err = -ENODEV;
368         if (dev == NULL)
369                 goto out_unlock;
370
371         err = -ENETDOWN;
372         if (!(dev->flags & IFF_UP))
373                 goto out_unlock;
374
375         /*
376          *      You may not queue a frame bigger than the mtu. This is the lowest level
377          *      raw protocol and you must do your own fragmentation at this level.
378          */
379
380         err = -EMSGSIZE;
381         if (len > dev->mtu + dev->hard_header_len)
382                 goto out_unlock;
383
384         err = -ENOBUFS;
385         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
386
387         /*
388          *      If the write buffer is full, then tough. At this level the user gets to
389          *      deal with the problem - do your own algorithmic backoffs. That's far
390          *      more flexible.
391          */
392
393         if (skb == NULL)
394                 goto out_unlock;
395
396         /*
397          *      Fill it in
398          */
399
400         /* FIXME: Save some space for broken drivers that write a
401          * hard header at transmission time by themselves. PPP is the
402          * notable one here. This should really be fixed at the driver level.
403          */
404         skb_reserve(skb, LL_RESERVED_SPACE(dev));
405         skb_reset_network_header(skb);
406
407         /* Try to align data part correctly */
408         if (dev->hard_header) {
409                 skb->data -= dev->hard_header_len;
410                 skb->tail -= dev->hard_header_len;
411                 if (len < dev->hard_header_len)
412                         skb_reset_network_header(skb);
413         }
414
415         /* Returns -EFAULT on error */
416         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
417         skb->protocol = proto;
418         skb->dev = dev;
419         skb->priority = sk->sk_priority;
420         if (err)
421                 goto out_free;
422
423         /*
424          *      Now send it
425          */
426
427         dev_queue_xmit(skb);
428         dev_put(dev);
429         return(len);
430
431 out_free:
432         kfree_skb(skb);
433 out_unlock:
434         if (dev)
435                 dev_put(dev);
436         return err;
437 }
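/*
 * Illustrative userspace sketch (not part of this file; "fd", "eth0",
 * "frame" and "frame_len" are placeholders): a legacy SOCK_PACKET send,
 * where the device name and protocol travel in sockaddr_pkt exactly as
 * parsed above.
 *
 *	struct sockaddr_pkt spkt = { .spkt_family = AF_PACKET };
 *	strncpy((char *)spkt.spkt_device, "eth0", sizeof(spkt.spkt_device));
 *	spkt.spkt_protocol = htons(ETH_P_IP);
 *	sendto(fd, frame, frame_len, 0,
 *	       (struct sockaddr *)&spkt, sizeof(spkt));
 */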
438 #endif
439
440 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
441                                       unsigned int res)
442 {
443         struct sk_filter *filter;
444
445         rcu_read_lock_bh();
446         filter = rcu_dereference(sk->sk_filter);
447         if (filter != NULL)
448                 res = sk_run_filter(skb, filter->insns, filter->len);
449         rcu_read_unlock_bh();
450
451         return res;
452 }
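/*
 * Illustrative userspace sketch (not part of this file; "fd" is a
 * placeholder): attaching a classic BPF program that run_filter()
 * will execute; this single instruction accepts up to 0xffff bytes
 * of every packet.
 *
 *	struct sock_filter insns[] = {
 *		{ BPF_RET | BPF_K, 0, 0, 0xffff },
 *	};
 *	struct sock_fprog prog = { .len = 1, .filter = insns };
 *	setsockopt(fd, SOL_SOCKET, SO_ATTACH_FILTER, &prog, sizeof(prog));
 */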
453
454 /*
455    This function makes lazy skb cloning in the hope that most packets
456    are discarded by BPF.
457
458    Note the tricky part: we DO mangle the shared skb! skb->data, skb->len
459    and skb->cb are mangled. It works because (and until) packets
460    falling here are owned by the current CPU. Output packets are cloned
461    by dev_queue_xmit_nit(), input packets are processed by net_bh
462    sequentially, so if we return the skb to its original state on exit,
463    we will not harm anyone.
464  */
465
466 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
467 {
468         struct sock *sk;
469         struct sockaddr_ll *sll;
470         struct packet_sock *po;
471         u8 * skb_head = skb->data;
472         int skb_len = skb->len;
473         unsigned int snaplen, res;
474
475         if (skb->pkt_type == PACKET_LOOPBACK)
476                 goto drop;
477
478         sk = pt->af_packet_priv;
479         po = pkt_sk(sk);
480
481         skb->dev = dev;
482
483         if (dev->hard_header) {
484                 /* The device has an explicit notion of ll header,
485                    exported to higher levels.
486
487                   Otherwise, the device hides the details of its frame
488                   structure, so that the corresponding packet head is
489                   never delivered to the user.
490                  */
491                 if (sk->sk_type != SOCK_DGRAM)
492                         skb_push(skb, skb->data - skb_mac_header(skb));
493                 else if (skb->pkt_type == PACKET_OUTGOING) {
494                         /* Special case: outgoing packets have ll header at head */
495                         skb_pull(skb, skb_network_offset(skb));
496                 }
497         }
498
499         snaplen = skb->len;
500
501         res = run_filter(skb, sk, snaplen);
502         if (!res)
503                 goto drop_n_restore;
504         if (snaplen > res)
505                 snaplen = res;
506
507         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
508             (unsigned)sk->sk_rcvbuf)
509                 goto drop_n_acct;
510
511         if (skb_shared(skb)) {
512                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
513                 if (nskb == NULL)
514                         goto drop_n_acct;
515
516                 if (skb_head != skb->data) {
517                         skb->data = skb_head;
518                         skb->len = skb_len;
519                 }
520                 kfree_skb(skb);
521                 skb = nskb;
522         }
523
524         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
525                      sizeof(skb->cb));
526
527         sll = &PACKET_SKB_CB(skb)->sa.ll;
528         sll->sll_family = AF_PACKET;
529         sll->sll_hatype = dev->type;
530         sll->sll_protocol = skb->protocol;
531         sll->sll_pkttype = skb->pkt_type;
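        /* With PACKET_ORIGDEV set, report the device the frame originally
         * arrived on (e.g. a bonding slave rather than the aggregating
         * master); this only makes sense for frames addressed to this
         * host, hence the PACKET_HOST check.
         */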
532         if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
533                 sll->sll_ifindex = orig_dev->ifindex;
534         else
535                 sll->sll_ifindex = dev->ifindex;
536         sll->sll_halen = 0;
537
538         if (dev->hard_header_parse)
539                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
540
541         PACKET_SKB_CB(skb)->origlen = skb->len;
542
543         if (pskb_trim(skb, snaplen))
544                 goto drop_n_acct;
545
546         skb_set_owner_r(skb, sk);
547         skb->dev = NULL;
548         dst_release(skb->dst);
549         skb->dst = NULL;
550
551         /* drop conntrack reference */
552         nf_reset(skb);
553
554         spin_lock(&sk->sk_receive_queue.lock);
555         po->stats.tp_packets++;
556         __skb_queue_tail(&sk->sk_receive_queue, skb);
557         spin_unlock(&sk->sk_receive_queue.lock);
558         sk->sk_data_ready(sk, skb->len);
559         return 0;
560
561 drop_n_acct:
562         spin_lock(&sk->sk_receive_queue.lock);
563         po->stats.tp_drops++;
564         spin_unlock(&sk->sk_receive_queue.lock);
565
566 drop_n_restore:
567         if (skb_head != skb->data && skb_shared(skb)) {
568                 skb->data = skb_head;
569                 skb->len = skb_len;
570         }
571 drop:
572         kfree_skb(skb);
573         return 0;
574 }
575
576 #ifdef CONFIG_PACKET_MMAP
577 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
578 {
579         struct sock *sk;
580         struct packet_sock *po;
581         struct sockaddr_ll *sll;
582         struct tpacket_hdr *h;
583         u8 * skb_head = skb->data;
584         int skb_len = skb->len;
585         unsigned int snaplen, res;
586         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
587         unsigned short macoff, netoff;
588         struct sk_buff *copy_skb = NULL;
589         struct timeval tv;
590
591         if (skb->pkt_type == PACKET_LOOPBACK)
592                 goto drop;
593
594         sk = pt->af_packet_priv;
595         po = pkt_sk(sk);
596
597         if (dev->hard_header) {
598                 if (sk->sk_type != SOCK_DGRAM)
599                         skb_push(skb, skb->data - skb_mac_header(skb));
600                 else if (skb->pkt_type == PACKET_OUTGOING) {
601                         /* Special case: outgoing packets have ll header at head */
602                         skb_pull(skb, skb_network_offset(skb));
603                 }
604         }
605
606         if (skb->ip_summed == CHECKSUM_PARTIAL)
607                 status |= TP_STATUS_CSUMNOTREADY;
608
609         snaplen = skb->len;
610
611         res = run_filter(skb, sk, snaplen);
612         if (!res)
613                 goto drop_n_restore;
614         if (snaplen > res)
615                 snaplen = res;
616
617         if (sk->sk_type == SOCK_DGRAM) {
618                 macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
619         } else {
620                 unsigned maclen = skb_network_offset(skb);
621                 netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
622                 macoff = netoff - maclen;
623         }
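        /* Example: for SOCK_RAW on Ethernet, maclen is 14 (< 16), so
         * netoff = TPACKET_ALIGN(TPACKET_HDRLEN + 16) and macoff =
         * netoff - 14; padding maclen up to 16 keeps the network header
         * 16-byte aligned within the frame.
         */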
624
625         if (macoff + snaplen > po->frame_size) {
626                 if (po->copy_thresh &&
627                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
628                     (unsigned)sk->sk_rcvbuf) {
629                         if (skb_shared(skb)) {
630                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
631                         } else {
632                                 copy_skb = skb_get(skb);
633                                 skb_head = skb->data;
634                         }
635                         if (copy_skb)
636                                 skb_set_owner_r(copy_skb, sk);
637                 }
638                 snaplen = po->frame_size - macoff;
639                 if ((int)snaplen < 0)
640                         snaplen = 0;
641         }
642
643         spin_lock(&sk->sk_receive_queue.lock);
644         h = packet_lookup_frame(po, po->head);
645
646         if (h->tp_status)
647                 goto ring_is_full;
648         po->head = po->head != po->frame_max ? po->head+1 : 0;
649         po->stats.tp_packets++;
650         if (copy_skb) {
651                 status |= TP_STATUS_COPY;
652                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
653         }
654         if (!po->stats.tp_drops)
655                 status &= ~TP_STATUS_LOSING;
656         spin_unlock(&sk->sk_receive_queue.lock);
657
658         skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
659
660         h->tp_len = skb->len;
661         h->tp_snaplen = snaplen;
662         h->tp_mac = macoff;
663         h->tp_net = netoff;
664         if (skb->tstamp.tv64 == 0) {
665                 __net_timestamp(skb);
666                 sock_enable_timestamp(sk);
667         }
668         tv = ktime_to_timeval(skb->tstamp);
669         h->tp_sec = tv.tv_sec;
670         h->tp_usec = tv.tv_usec;
671
672         sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
673         sll->sll_halen = 0;
674         if (dev->hard_header_parse)
675                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
676         sll->sll_family = AF_PACKET;
677         sll->sll_hatype = dev->type;
678         sll->sll_protocol = skb->protocol;
679         sll->sll_pkttype = skb->pkt_type;
680         if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
681                 sll->sll_ifindex = orig_dev->ifindex;
682         else
683                 sll->sll_ifindex = dev->ifindex;
684
685         h->tp_status = status;
686         smp_mb();
687
688         {
689                 struct page *p_start, *p_end;
690                 u8 *h_end = (u8 *)h + macoff + snaplen - 1;
691
692                 p_start = virt_to_page(h);
693                 p_end = virt_to_page(h_end);
694                 while (p_start <= p_end) {
695                         flush_dcache_page(p_start);
696                         p_start++;
697                 }
698         }
699
700         sk->sk_data_ready(sk, 0);
701
702 drop_n_restore:
703         if (skb_head != skb->data && skb_shared(skb)) {
704                 skb->data = skb_head;
705                 skb->len = skb_len;
706         }
707 drop:
708         kfree_skb(skb);
709         return 0;
710
711 ring_is_full:
712         po->stats.tp_drops++;
713         spin_unlock(&sk->sk_receive_queue.lock);
714
715         sk->sk_data_ready(sk, 0);
716         if (copy_skb)
717                 kfree_skb(copy_skb);
718         goto drop_n_restore;
719 }
720
721 #endif
722
723
724 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
725                           struct msghdr *msg, size_t len)
726 {
727         struct sock *sk = sock->sk;
728         struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
729         struct sk_buff *skb;
730         struct net_device *dev;
731         __be16 proto;
732         unsigned char *addr;
733         int ifindex, err, reserve = 0;
734
735         /*
736          *      Get and verify the address.
737          */
738
739         if (saddr == NULL) {
740                 struct packet_sock *po = pkt_sk(sk);
741
742                 ifindex = po->ifindex;
743                 proto   = po->num;
744                 addr    = NULL;
745         } else {
746                 err = -EINVAL;
747                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
748                         goto out;
749                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
750                         goto out;
751                 ifindex = saddr->sll_ifindex;
752                 proto   = saddr->sll_protocol;
753                 addr    = saddr->sll_addr;
754         }
755
756
757         dev = dev_get_by_index(ifindex);
758         err = -ENXIO;
759         if (dev == NULL)
760                 goto out_unlock;
761         if (sock->type == SOCK_RAW)
762                 reserve = dev->hard_header_len;
763
764         err = -ENETDOWN;
765         if (!(dev->flags & IFF_UP))
766                 goto out_unlock;
767
768         err = -EMSGSIZE;
769         if (len > dev->mtu+reserve)
770                 goto out_unlock;
771
772         skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
773                                 msg->msg_flags & MSG_DONTWAIT, &err);
774         if (skb==NULL)
775                 goto out_unlock;
776
777         skb_reserve(skb, LL_RESERVED_SPACE(dev));
778         skb_reset_network_header(skb);
779
780         if (dev->hard_header) {
781                 int res;
782                 err = -EINVAL;
783                 res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
784                 if (sock->type != SOCK_DGRAM) {
785                         skb_reset_tail_pointer(skb);
786                         skb->len = 0;
787                 } else if (res < 0)
788                         goto out_free;
789         }
790
791         /* Returns -EFAULT on error */
792         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
793         if (err)
794                 goto out_free;
795
796         skb->protocol = proto;
797         skb->dev = dev;
798         skb->priority = sk->sk_priority;
799
800         /*
801          *      Now send it
802          */
803
804         err = dev_queue_xmit(skb);
805         if (err > 0 && (err = net_xmit_errno(err)) != 0)
806                 goto out_unlock;
807
808         dev_put(dev);
809
810         return(len);
811
812 out_free:
813         kfree_skb(skb);
814 out_unlock:
815         if (dev)
816                 dev_put(dev);
817 out:
818         return err;
819 }
820
821 /*
822  *      Close a PACKET socket. This is fairly simple. We immediately go
823  *      to 'closed' state and remove our protocol entry in the device list.
824  */
825
826 static int packet_release(struct socket *sock)
827 {
828         struct sock *sk = sock->sk;
829         struct packet_sock *po;
830
831         if (!sk)
832                 return 0;
833
834         po = pkt_sk(sk);
835
836         write_lock_bh(&packet_sklist_lock);
837         sk_del_node_init(sk);
838         write_unlock_bh(&packet_sklist_lock);
839
840         /*
841          *      Unhook packet receive handler.
842          */
843
844         if (po->running) {
845                 /*
846                  *      Remove the protocol hook
847                  */
848                 dev_remove_pack(&po->prot_hook);
849                 po->running = 0;
850                 po->num = 0;
851                 __sock_put(sk);
852         }
853
854 #ifdef CONFIG_PACKET_MULTICAST
855         packet_flush_mclist(sk);
856 #endif
857
858 #ifdef CONFIG_PACKET_MMAP
859         if (po->pg_vec) {
860                 struct tpacket_req req;
861                 memset(&req, 0, sizeof(req));
862                 packet_set_ring(sk, &req, 1);
863         }
864 #endif
865
866         /*
867          *      Now the socket is dead. No more input will appear.
868          */
869
870         sock_orphan(sk);
871         sock->sk = NULL;
872
873         /* Purge queues */
874
875         skb_queue_purge(&sk->sk_receive_queue);
876
877         sock_put(sk);
878         return 0;
879 }
880
881 /*
882  *      Attach a packet hook.
883  */
884
885 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
886 {
887         struct packet_sock *po = pkt_sk(sk);
888         /*
889          *      Detach an existing hook if present.
890          */
891
892         lock_sock(sk);
893
894         spin_lock(&po->bind_lock);
895         if (po->running) {
896                 __sock_put(sk);
897                 po->running = 0;
898                 po->num = 0;
899                 spin_unlock(&po->bind_lock);
900                 dev_remove_pack(&po->prot_hook);
901                 spin_lock(&po->bind_lock);
902         }
903
904         po->num = protocol;
905         po->prot_hook.type = protocol;
906         po->prot_hook.dev = dev;
907
908         po->ifindex = dev ? dev->ifindex : 0;
909
910         if (protocol == 0)
911                 goto out_unlock;
912
913         if (dev) {
914                 if (dev->flags&IFF_UP) {
915                         dev_add_pack(&po->prot_hook);
916                         sock_hold(sk);
917                         po->running = 1;
918                 } else {
919                         sk->sk_err = ENETDOWN;
920                         if (!sock_flag(sk, SOCK_DEAD))
921                                 sk->sk_error_report(sk);
922                 }
923         } else {
924                 dev_add_pack(&po->prot_hook);
925                 sock_hold(sk);
926                 po->running = 1;
927         }
928
929 out_unlock:
930         spin_unlock(&po->bind_lock);
931         release_sock(sk);
932         return 0;
933 }
934
935 /*
936  *      Bind a packet socket to a device
937  */
938
939 #ifdef CONFIG_SOCK_PACKET
940
941 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
942 {
943         struct sock *sk=sock->sk;
944         char name[15];
945         struct net_device *dev;
946         int err = -ENODEV;
947
948         /*
949          *      Check legality
950          */
951
952         if (addr_len != sizeof(struct sockaddr))
953                 return -EINVAL;
954         strlcpy(name,uaddr->sa_data,sizeof(name));
955
956         dev = dev_get_by_name(name);
957         if (dev) {
958                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
959                 dev_put(dev);
960         }
961         return err;
962 }
963 #endif
964
965 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
966 {
967         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
968         struct sock *sk=sock->sk;
969         struct net_device *dev = NULL;
970         int err;
971
972
973         /*
974          *      Check legality
975          */
976
977         if (addr_len < sizeof(struct sockaddr_ll))
978                 return -EINVAL;
979         if (sll->sll_family != AF_PACKET)
980                 return -EINVAL;
981
982         if (sll->sll_ifindex) {
983                 err = -ENODEV;
984                 dev = dev_get_by_index(sll->sll_ifindex);
985                 if (dev == NULL)
986                         goto out;
987         }
988         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
989         if (dev)
990                 dev_put(dev);
991
992 out:
993         return err;
994 }
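/*
 * Illustrative userspace sketch (not part of this file; "fd" and "eth0"
 * are placeholders): binding a packet socket to one interface for all
 * protocols, satisfying the checks above.
 *
 *	struct sockaddr_ll sll = {
 *		.sll_family   = AF_PACKET,
 *		.sll_protocol = htons(ETH_P_ALL),
 *		.sll_ifindex  = if_nametoindex("eth0"),
 *	};
 *	bind(fd, (struct sockaddr *)&sll, sizeof(sll));
 */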
995
996 static struct proto packet_proto = {
997         .name     = "PACKET",
998         .owner    = THIS_MODULE,
999         .obj_size = sizeof(struct packet_sock),
1000 };
1001
1002 /*
1003  *      Create a packet socket.
1004  */
1005
1006 static int packet_create(struct socket *sock, int protocol)
1007 {
1008         struct sock *sk;
1009         struct packet_sock *po;
1010         __be16 proto = (__force __be16)protocol; /* weird, but documented */
1011         int err;
1012
1013         if (!capable(CAP_NET_RAW))
1014                 return -EPERM;
1015         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
1016 #ifdef CONFIG_SOCK_PACKET
1017             && sock->type != SOCK_PACKET
1018 #endif
1019             )
1020                 return -ESOCKTNOSUPPORT;
1021
1022         sock->state = SS_UNCONNECTED;
1023
1024         err = -ENOBUFS;
1025         sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
1026         if (sk == NULL)
1027                 goto out;
1028
1029         sock->ops = &packet_ops;
1030 #ifdef CONFIG_SOCK_PACKET
1031         if (sock->type == SOCK_PACKET)
1032                 sock->ops = &packet_ops_spkt;
1033 #endif
1034         sock_init_data(sock, sk);
1035
1036         po = pkt_sk(sk);
1037         sk->sk_family = PF_PACKET;
1038         po->num = proto;
1039
1040         sk->sk_destruct = packet_sock_destruct;
1041         atomic_inc(&packet_socks_nr);
1042
1043         /*
1044          *      Attach a protocol block
1045          */
1046
1047         spin_lock_init(&po->bind_lock);
1048         po->prot_hook.func = packet_rcv;
1049 #ifdef CONFIG_SOCK_PACKET
1050         if (sock->type == SOCK_PACKET)
1051                 po->prot_hook.func = packet_rcv_spkt;
1052 #endif
1053         po->prot_hook.af_packet_priv = sk;
1054
1055         if (proto) {
1056                 po->prot_hook.type = proto;
1057                 dev_add_pack(&po->prot_hook);
1058                 sock_hold(sk);
1059                 po->running = 1;
1060         }
1061
1062         write_lock_bh(&packet_sklist_lock);
1063         sk_add_node(sk, &packet_sklist);
1064         write_unlock_bh(&packet_sklist_lock);
1065         return(0);
1066 out:
1067         return err;
1068 }
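/*
 * Illustrative userspace sketch (not part of this file): creating the
 * socket serviced by packet_create(); the caller needs CAP_NET_RAW, and
 * the protocol argument is a big-endian ethertype, hence the htons()
 * ("weird, but documented" above).
 *
 *	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 */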
1069
1070 /*
1071  *      Pull a packet from our receive queue and hand it to the user.
1072  *      If necessary we block.
1073  */
1074
1075 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1076                           struct msghdr *msg, size_t len, int flags)
1077 {
1078         struct sock *sk = sock->sk;
1079         struct sk_buff *skb;
1080         int copied, err;
1081         struct sockaddr_ll *sll;
1082
1083         err = -EINVAL;
1084         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1085                 goto out;
1086
1087 #if 0
1088         /* What error should we return now? EUNATTACH? */
1089         if (pkt_sk(sk)->ifindex < 0)
1090                 return -ENODEV;
1091 #endif
1092
1093         /*
1094          *      Call the generic datagram receiver. This handles all sorts
1095          *      of horrible races and re-entrancy so we can forget about it
1096          *      in the protocol layers.
1097          *
1098          *      Now it will return ENETDOWN if the device has just gone down,
1099          *      but then it will block.
1100          */
1101
1102         skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1103
1104         /*
1105          *      An error occurred, so return it. Because skb_recv_datagram()
1106          *      handles the blocking, we don't need to see or worry about
1107          *      blocking retries.
1108          */
1109
1110         if (skb == NULL)
1111                 goto out;
1112
1113         /*
1114          *      If the address length field is there to be filled in, we fill
1115          *      it in now.
1116          */
1117
1118         sll = &PACKET_SKB_CB(skb)->sa.ll;
1119         if (sock->type == SOCK_PACKET)
1120                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1121         else
1122                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1123
1124         /*
1125          *      You lose any data beyond the buffer you gave. If it worries a
1126          *      user program they can ask the device for its MTU anyway.
1127          */
1128
1129         copied = skb->len;
1130         if (copied > len)
1131         {
1132                 copied=len;
1133                 msg->msg_flags|=MSG_TRUNC;
1134         }
1135
1136         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1137         if (err)
1138                 goto out_free;
1139
1140         sock_recv_timestamp(msg, sk, skb);
1141
1142         if (msg->msg_name)
1143                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1144                        msg->msg_namelen);
1145
1146         if (pkt_sk(sk)->auxdata) {
1147                 struct tpacket_auxdata aux;
1148
1149                 aux.tp_status = TP_STATUS_USER;
1150                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1151                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1152                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1153                 aux.tp_snaplen = skb->len;
1154                 aux.tp_mac = 0;
1155                 aux.tp_net = skb_network_offset(skb);
1156
1157                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1158         }
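        /* Illustrative userspace sketch (not part of this file; "msg" and
         * "aux" are placeholders): the control message queued above is
         * retrieved with the standard cmsg macros:
         *
         *	struct cmsghdr *cmsg;
         *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg;
         *	     cmsg = CMSG_NXTHDR(&msg, cmsg))
         *		if (cmsg->cmsg_level == SOL_PACKET &&
         *		    cmsg->cmsg_type == PACKET_AUXDATA)
         *			aux = (struct tpacket_auxdata *)CMSG_DATA(cmsg);
         */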
1159
1160         /*
1161          *      Free or return the buffer as appropriate. Again this
1162          *      hides all the races and re-entrancy issues from us.
1163          */
1164         err = (flags&MSG_TRUNC) ? skb->len : copied;
1165
1166 out_free:
1167         skb_free_datagram(sk, skb);
1168 out:
1169         return err;
1170 }
1171
1172 #ifdef CONFIG_SOCK_PACKET
1173 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1174                                int *uaddr_len, int peer)
1175 {
1176         struct net_device *dev;
1177         struct sock *sk = sock->sk;
1178
1179         if (peer)
1180                 return -EOPNOTSUPP;
1181
1182         uaddr->sa_family = AF_PACKET;
1183         dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1184         if (dev) {
1185                 strlcpy(uaddr->sa_data, dev->name, 15);
1186                 dev_put(dev);
1187         } else
1188                 memset(uaddr->sa_data, 0, 14);
1189         *uaddr_len = sizeof(*uaddr);
1190
1191         return 0;
1192 }
1193 #endif
1194
1195 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1196                           int *uaddr_len, int peer)
1197 {
1198         struct net_device *dev;
1199         struct sock *sk = sock->sk;
1200         struct packet_sock *po = pkt_sk(sk);
1201         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1202
1203         if (peer)
1204                 return -EOPNOTSUPP;
1205
1206         sll->sll_family = AF_PACKET;
1207         sll->sll_ifindex = po->ifindex;
1208         sll->sll_protocol = po->num;
1209         dev = dev_get_by_index(po->ifindex);
1210         if (dev) {
1211                 sll->sll_hatype = dev->type;
1212                 sll->sll_halen = dev->addr_len;
1213                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1214                 dev_put(dev);
1215         } else {
1216                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1217                 sll->sll_halen = 0;
1218         }
1219         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1220
1221         return 0;
1222 }
1223
1224 #ifdef CONFIG_PACKET_MULTICAST
1225 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1226 {
1227         switch (i->type) {
1228         case PACKET_MR_MULTICAST:
1229                 if (what > 0)
1230                         dev_mc_add(dev, i->addr, i->alen, 0);
1231                 else
1232                         dev_mc_delete(dev, i->addr, i->alen, 0);
1233                 break;
1234         case PACKET_MR_PROMISC:
1235                 dev_set_promiscuity(dev, what);
1236                 break;
1237         case PACKET_MR_ALLMULTI:
1238                 dev_set_allmulti(dev, what);
1239                 break;
1240         default:;
1241         }
1242 }
1243
1244 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1245 {
1246         for ( ; i; i=i->next) {
1247                 if (i->ifindex == dev->ifindex)
1248                         packet_dev_mc(dev, i, what);
1249         }
1250 }
1251
1252 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1253 {
1254         struct packet_sock *po = pkt_sk(sk);
1255         struct packet_mclist *ml, *i;
1256         struct net_device *dev;
1257         int err;
1258
1259         rtnl_lock();
1260
1261         err = -ENODEV;
1262         dev = __dev_get_by_index(mreq->mr_ifindex);
1263         if (!dev)
1264                 goto done;
1265
1266         err = -EINVAL;
1267         if (mreq->mr_alen > dev->addr_len)
1268                 goto done;
1269
1270         err = -ENOBUFS;
1271         i = kmalloc(sizeof(*i), GFP_KERNEL);
1272         if (i == NULL)
1273                 goto done;
1274
1275         err = 0;
1276         for (ml = po->mclist; ml; ml = ml->next) {
1277                 if (ml->ifindex == mreq->mr_ifindex &&
1278                     ml->type == mreq->mr_type &&
1279                     ml->alen == mreq->mr_alen &&
1280                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1281                         ml->count++;
1282                         /* Free the new element ... */
1283                         kfree(i);
1284                         goto done;
1285                 }
1286         }
1287
1288         i->type = mreq->mr_type;
1289         i->ifindex = mreq->mr_ifindex;
1290         i->alen = mreq->mr_alen;
1291         memcpy(i->addr, mreq->mr_address, i->alen);
1292         i->count = 1;
1293         i->next = po->mclist;
1294         po->mclist = i;
1295         packet_dev_mc(dev, i, +1);
1296
1297 done:
1298         rtnl_unlock();
1299         return err;
1300 }
1301
1302 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1303 {
1304         struct packet_mclist *ml, **mlp;
1305
1306         rtnl_lock();
1307
1308         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1309                 if (ml->ifindex == mreq->mr_ifindex &&
1310                     ml->type == mreq->mr_type &&
1311                     ml->alen == mreq->mr_alen &&
1312                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1313                         if (--ml->count == 0) {
1314                                 struct net_device *dev;
1315                                 *mlp = ml->next;
1316                                 dev = dev_get_by_index(ml->ifindex);
1317                                 if (dev) {
1318                                         packet_dev_mc(dev, ml, -1);
1319                                         dev_put(dev);
1320                                 }
1321                                 kfree(ml);
1322                         }
1323                         rtnl_unlock();
1324                         return 0;
1325                 }
1326         }
1327         rtnl_unlock();
1328         return -EADDRNOTAVAIL;
1329 }
1330
1331 static void packet_flush_mclist(struct sock *sk)
1332 {
1333         struct packet_sock *po = pkt_sk(sk);
1334         struct packet_mclist *ml;
1335
1336         if (!po->mclist)
1337                 return;
1338
1339         rtnl_lock();
1340         while ((ml = po->mclist) != NULL) {
1341                 struct net_device *dev;
1342
1343                 po->mclist = ml->next;
1344                 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1345                         packet_dev_mc(dev, ml, -1);
1346                         dev_put(dev);
1347                 }
1348                 kfree(ml);
1349         }
1350         rtnl_unlock();
1351 }
1352 #endif
1353
1354 static int
1355 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1356 {
1357         struct sock *sk = sock->sk;
1358         struct packet_sock *po = pkt_sk(sk);
1359         int ret;
1360
1361         if (level != SOL_PACKET)
1362                 return -ENOPROTOOPT;
1363
1364         switch(optname) {
1365 #ifdef CONFIG_PACKET_MULTICAST
1366         case PACKET_ADD_MEMBERSHIP:
1367         case PACKET_DROP_MEMBERSHIP:
1368         {
1369                 struct packet_mreq_max mreq;
1370                 int len = optlen;
1371                 memset(&mreq, 0, sizeof(mreq));
1372                 if (len < sizeof(struct packet_mreq))
1373                         return -EINVAL;
1374                 if (len > sizeof(mreq))
1375                         len = sizeof(mreq);
1376                 if (copy_from_user(&mreq,optval,len))
1377                         return -EFAULT;
1378                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1379                         return -EINVAL;
1380                 if (optname == PACKET_ADD_MEMBERSHIP)
1381                         ret = packet_mc_add(sk, &mreq);
1382                 else
1383                         ret = packet_mc_drop(sk, &mreq);
1384                 return ret;
1385         }
1386 #endif
1387 #ifdef CONFIG_PACKET_MMAP
1388         case PACKET_RX_RING:
1389         {
1390                 struct tpacket_req req;
1391
1392                 if (optlen<sizeof(req))
1393                         return -EINVAL;
1394                 if (copy_from_user(&req,optval,sizeof(req)))
1395                         return -EFAULT;
1396                 return packet_set_ring(sk, &req, 0);
1397         }
1398         case PACKET_COPY_THRESH:
1399         {
1400                 int val;
1401
1402                 if (optlen!=sizeof(val))
1403                         return -EINVAL;
1404                 if (copy_from_user(&val,optval,sizeof(val)))
1405                         return -EFAULT;
1406
1407                 pkt_sk(sk)->copy_thresh = val;
1408                 return 0;
1409         }
1410 #endif
1411         case PACKET_AUXDATA:
1412         {
1413                 int val;
1414
1415                 if (optlen < sizeof(val))
1416                         return -EINVAL;
1417                 if (copy_from_user(&val, optval, sizeof(val)))
1418                         return -EFAULT;
1419
1420                 po->auxdata = !!val;
1421                 return 0;
1422         }
1423         case PACKET_ORIGDEV:
1424         {
1425                 int val;
1426
1427                 if (optlen < sizeof(val))
1428                         return -EINVAL;
1429                 if (copy_from_user(&val, optval, sizeof(val)))
1430                         return -EFAULT;
1431
1432                 po->origdev = !!val;
1433                 return 0;
1434         }
1435         default:
1436                 return -ENOPROTOOPT;
1437         }
1438 }
1439
1440 static int packet_getsockopt(struct socket *sock, int level, int optname,
1441                              char __user *optval, int __user *optlen)
1442 {
1443         int len;
1444         int val;
1445         struct sock *sk = sock->sk;
1446         struct packet_sock *po = pkt_sk(sk);
1447         void *data;
1448         struct tpacket_stats st;
1449
1450         if (level != SOL_PACKET)
1451                 return -ENOPROTOOPT;
1452
1453         if (get_user(len, optlen))
1454                 return -EFAULT;
1455
1456         if (len < 0)
1457                 return -EINVAL;
1458
1459         switch(optname) {
1460         case PACKET_STATISTICS:
1461                 if (len > sizeof(struct tpacket_stats))
1462                         len = sizeof(struct tpacket_stats);
1463                 spin_lock_bh(&sk->sk_receive_queue.lock);
1464                 st = po->stats;
1465                 memset(&po->stats, 0, sizeof(st));
1466                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1467                 st.tp_packets += st.tp_drops;
1468
1469                 data = &st;
1470                 break;
1471         case PACKET_AUXDATA:
1472                 if (len > sizeof(int))
1473                         len = sizeof(int);
1474                 val = po->auxdata;
1475
1476                 data = &val;
1477                 break;
1478         case PACKET_ORIGDEV:
1479                 if (len > sizeof(int))
1480                         len = sizeof(int);
1481                 val = po->origdev;
1482
1483                 data = &val;
1484                 break;
1485         default:
1486                 return -ENOPROTOOPT;
1487         }
1488
1489         if (put_user(len, optlen))
1490                 return -EFAULT;
1491         if (copy_to_user(optval, data, len))
1492                 return -EFAULT;
1493         return 0;
1494 }
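/*
 * Illustrative userspace sketch (not part of this file; "fd" is a
 * placeholder): reading the statistics returned above. Note that the
 * read resets the counters, and tp_packets includes tp_drops.
 *
 *	struct tpacket_stats st;
 *	socklen_t len = sizeof(st);
 *	getsockopt(fd, SOL_PACKET, PACKET_STATISTICS, &st, &len);
 */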
1495
1496
1497 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1498 {
1499         struct sock *sk;
1500         struct hlist_node *node;
1501         struct net_device *dev = data;
1502
1503         read_lock(&packet_sklist_lock);
1504         sk_for_each(sk, node, &packet_sklist) {
1505                 struct packet_sock *po = pkt_sk(sk);
1506
1507                 switch (msg) {
1508                 case NETDEV_UNREGISTER:
1509 #ifdef CONFIG_PACKET_MULTICAST
1510                         if (po->mclist)
1511                                 packet_dev_mclist(dev, po->mclist, -1);
1512                        /* fallthrough */
1513 #endif
1514                 case NETDEV_DOWN:
1515                         if (dev->ifindex == po->ifindex) {
1516                                 spin_lock(&po->bind_lock);
1517                                 if (po->running) {
1518                                         __dev_remove_pack(&po->prot_hook);
1519                                         __sock_put(sk);
1520                                         po->running = 0;
1521                                         sk->sk_err = ENETDOWN;
1522                                         if (!sock_flag(sk, SOCK_DEAD))
1523                                                 sk->sk_error_report(sk);
1524                                 }
1525                                 if (msg == NETDEV_UNREGISTER) {
1526                                         po->ifindex = -1;
1527                                         po->prot_hook.dev = NULL;
1528                                 }
1529                                 spin_unlock(&po->bind_lock);
1530                         }
1531                         break;
1532                 case NETDEV_UP:
1533                         spin_lock(&po->bind_lock);
1534                         if (dev->ifindex == po->ifindex && po->num &&
1535                             !po->running) {
1536                                 dev_add_pack(&po->prot_hook);
1537                                 sock_hold(sk);
1538                                 po->running = 1;
1539                         }
1540                         spin_unlock(&po->bind_lock);
1541                         break;
1542                 }
1543         }
1544         read_unlock(&packet_sklist_lock);
1545         return NOTIFY_DONE;
1546 }
1547
1548
1549 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1550                         unsigned long arg)
1551 {
1552         struct sock *sk = sock->sk;
1553
1554         switch(cmd) {
1555                 case SIOCOUTQ:
1556                 {
1557                         int amount = atomic_read(&sk->sk_wmem_alloc);
1558                         return put_user(amount, (int __user *)arg);
1559                 }
1560                 case SIOCINQ:
1561                 {
1562                         struct sk_buff *skb;
1563                         int amount = 0;
1564
1565                         spin_lock_bh(&sk->sk_receive_queue.lock);
1566                         skb = skb_peek(&sk->sk_receive_queue);
1567                         if (skb)
1568                                 amount = skb->len;
1569                         spin_unlock_bh(&sk->sk_receive_queue.lock);
1570                         return put_user(amount, (int __user *)arg);
1571                 }
1572                 case SIOCGSTAMP:
1573                         return sock_get_timestamp(sk, (struct timeval __user *)arg);
1574                 case SIOCGSTAMPNS:
1575                         return sock_get_timestampns(sk, (struct timespec __user *)arg);
1576
1577 #ifdef CONFIG_INET
1578                 case SIOCADDRT:
1579                 case SIOCDELRT:
1580                 case SIOCDARP:
1581                 case SIOCGARP:
1582                 case SIOCSARP:
1583                 case SIOCGIFADDR:
1584                 case SIOCSIFADDR:
1585                 case SIOCGIFBRDADDR:
1586                 case SIOCSIFBRDADDR:
1587                 case SIOCGIFNETMASK:
1588                 case SIOCSIFNETMASK:
1589                 case SIOCGIFDSTADDR:
1590                 case SIOCSIFDSTADDR:
1591                 case SIOCSIFFLAGS:
1592                         return inet_dgram_ops.ioctl(sock, cmd, arg);
1593 #endif
1594
1595                 default:
1596                         return -ENOIOCTLCMD;
1597         }
1598         return 0;
1599 }
1600
1601 #ifndef CONFIG_PACKET_MMAP
1602 #define packet_mmap sock_no_mmap
1603 #define packet_poll datagram_poll
1604 #else
1605
1606 static unsigned int packet_poll(struct file * file, struct socket *sock,
1607                                 poll_table *wait)
1608 {
1609         struct sock *sk = sock->sk;
1610         struct packet_sock *po = pkt_sk(sk);
1611         unsigned int mask = datagram_poll(file, sock, wait);
1612
1613         spin_lock_bh(&sk->sk_receive_queue.lock);
1614         if (po->pg_vec) {
1615                 unsigned last = po->head ? po->head-1 : po->frame_max;
1616                 struct tpacket_hdr *h;
1617
1618                 h = packet_lookup_frame(po, last);
1619
1620                 if (h->tp_status)
1621                         mask |= POLLIN | POLLRDNORM;
1622         }
1623         spin_unlock_bh(&sk->sk_receive_queue.lock);
1624         return mask;
1625 }
1626
1627
1628 /* Dirty? Well, I still have not learned a better way to account
1629  * for user mmaps.
1630  */
1631
1632 static void packet_mm_open(struct vm_area_struct *vma)
1633 {
1634         struct file *file = vma->vm_file;
1635         struct socket * sock = file->private_data;
1636         struct sock *sk = sock->sk;
1637
1638         if (sk)
1639                 atomic_inc(&pkt_sk(sk)->mapped);
1640 }
1641
1642 static void packet_mm_close(struct vm_area_struct *vma)
1643 {
1644         struct file *file = vma->vm_file;
1645         struct socket * sock = file->private_data;
1646         struct sock *sk = sock->sk;
1647
1648         if (sk)
1649                 atomic_dec(&pkt_sk(sk)->mapped);
1650 }
1651
1652 static struct vm_operations_struct packet_mmap_ops = {
1653         .open = packet_mm_open,
1654         .close = packet_mm_close,
1655 };
1656
1657 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1658 {
1659         return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1660 }
1661
1662 static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
1663 {
1664         int i;
1665
1666         for (i = 0; i < len; i++) {
1667                 if (likely(pg_vec[i]))
1668                         free_pages((unsigned long) pg_vec[i], order);
1669         }
1670         kfree(pg_vec);
1671 }
1672
1673 static inline char *alloc_one_pg_vec_page(unsigned long order)
1674 {
1675         return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1676                                          order);
1677 }
1678
1679 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1680 {
1681         unsigned int block_nr = req->tp_block_nr;
1682         char **pg_vec;
1683         int i;
1684
1685         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1686         if (unlikely(!pg_vec))
1687                 goto out;
1688
1689         for (i = 0; i < block_nr; i++) {
1690                 pg_vec[i] = alloc_one_pg_vec_page(order);
1691                 if (unlikely(!pg_vec[i]))
1692                         goto out_free_pgvec;
1693         }
1694
1695 out:
1696         return pg_vec;
1697
1698 out_free_pgvec:
1699         free_pg_vec(pg_vec, order, block_nr);
1700         pg_vec = NULL;
1701         goto out;
1702 }
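/*
 * Worked example (illustrative): with tp_block_size = 8192 on a 4 KiB
 * page system, get_order(8192) is 1, so each pg_vec[i] above is one
 * physically contiguous, zeroed pair of pages; tp_block_nr such blocks
 * are allocated, and free_pg_vec() undoes them with the same order.
 */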
1703
1704 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1705 {
1706         char **pg_vec = NULL;
1707         struct packet_sock *po = pkt_sk(sk);
1708         int was_running, order = 0;
1709         __be16 num;
1710         int err = 0;
1711
1712         if (req->tp_block_nr) {
1713                 int i;
1714
1715                 /* Sanity tests and some calculations */
1716
1717                 if (unlikely(po->pg_vec))
1718                         return -EBUSY;
1719
1720                 if (unlikely((int)req->tp_block_size <= 0))
1721                         return -EINVAL;
1722                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1723                         return -EINVAL;
1724                 if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
1725                         return -EINVAL;
1726                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1727                         return -EINVAL;
1728
1729                 po->frames_per_block = req->tp_block_size / req->tp_frame_size;
1730                 if (unlikely(po->frames_per_block <= 0))
1731                         return -EINVAL;
1732                 if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1733                              req->tp_frame_nr))
1734                         return -EINVAL;
1735
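		/* Example of a request that passes the checks above on a
		 * 4 KiB page system (values illustrative): tp_block_size =
		 * 8192, tp_frame_size = 2048, tp_block_nr = 4 gives
		 * frames_per_block = 4, so tp_frame_nr must be 16. */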
1736                 err = -ENOMEM;
1737                 order = get_order(req->tp_block_size);
1738                 pg_vec = alloc_pg_vec(req, order);
1739                 if (unlikely(!pg_vec))
1740                         goto out;
1741
1743                 for (i = 0; i < req->tp_block_nr; i++) {
1744                         char *ptr = pg_vec[i];
1745                         struct tpacket_hdr *header;
1746                         int k;
1747
1748                         for (k = 0; k < po->frames_per_block; k++) {
1749                                 header = (struct tpacket_hdr *) ptr;
1750                                 header->tp_status = TP_STATUS_KERNEL;
1751                                 ptr += req->tp_frame_size;
1752                         }
1753                 }
1754                 /* Done */
1755         } else {
1756                 if (unlikely(req->tp_frame_nr))
1757                         return -EINVAL;
1758         }
1759
1760         lock_sock(sk);
1761
1762         /* Detach socket from network */
1763         spin_lock(&po->bind_lock);
1764         was_running = po->running;
1765         num = po->num;
1766         if (was_running) {
1767                 __dev_remove_pack(&po->prot_hook);
1768                 po->num = 0;
1769                 po->running = 0;
1770                 __sock_put(sk);
1771         }
1772         spin_unlock(&po->bind_lock);
1773
1774         synchronize_net();
1775
1776         err = -EBUSY;
1777         if (closing || atomic_read(&po->mapped) == 0) {
1778                 err = 0;
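		/* XC() swaps two lvalues and evaluates to the old value of
		 * the first, so the previous ring state drops into the
		 * locals here and is freed below. */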
1779 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1780
1781                 spin_lock_bh(&sk->sk_receive_queue.lock);
1782                 pg_vec = XC(po->pg_vec, pg_vec);
1783                 po->frame_max = (req->tp_frame_nr - 1);
1784                 po->head = 0;
1785                 po->frame_size = req->tp_frame_size;
1786                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1787
1788                 order = XC(po->pg_vec_order, order);
1789                 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1790
1791                 po->pg_vec_pages = req->tp_block_size / PAGE_SIZE;
1792                 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1793                 skb_queue_purge(&sk->sk_receive_queue);
1794 #undef XC
1795                 if (atomic_read(&po->mapped))
1796                         printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1797         }
1798
1799         spin_lock(&po->bind_lock);
1800         if (was_running && !po->running) {
1801                 sock_hold(sk);
1802                 po->running = 1;
1803                 po->num = num;
1804                 dev_add_pack(&po->prot_hook);
1805         }
1806         spin_unlock(&po->bind_lock);
1807
1808         release_sock(sk);
1809
1810         if (pg_vec)
1811                 free_pg_vec(pg_vec, order, req->tp_block_nr);
1812 out:
1813         return err;
1814 }
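/*
 * This function is reached from setsockopt(PACKET_RX_RING) (and, with
 * "closing" set, from socket teardown).  A minimal userspace sketch,
 * error handling omitted and "fd" assumed as before:
 *
 *	#include <sys/socket.h>
 *	#include <linux/if_packet.h>
 *
 *	struct tpacket_req req = {
 *		.tp_block_size	= 8192,
 *		.tp_block_nr	= 4,
 *		.tp_frame_size	= 2048,
 *		.tp_frame_nr	= 16,
 *	};
 *
 *	setsockopt(fd, SOL_PACKET, PACKET_RX_RING, &req, sizeof(req));
 */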
1815
1816 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1817 {
1818         struct sock *sk = sock->sk;
1819         struct packet_sock *po = pkt_sk(sk);
1820         unsigned long size;
1821         unsigned long start;
1822         int err = -EINVAL;
1823         int i;
1824
1825         if (vma->vm_pgoff)
1826                 return -EINVAL;
1827
1828         size = vma->vm_end - vma->vm_start;
1829
1830         lock_sock(sk);
1831         if (po->pg_vec == NULL)
1832                 goto out;
1833         if (size != po->pg_vec_len * po->pg_vec_pages * PAGE_SIZE)
1834                 goto out;
1835
1836         start = vma->vm_start;
1837         for (i = 0; i < po->pg_vec_len; i++) {
1838                 struct page *page = virt_to_page(po->pg_vec[i]);
1839                 int pg_num;
1840
1841                 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1842                         err = vm_insert_page(vma, start, page);
1843                         if (unlikely(err))
1844                                 goto out;
1845                         start += PAGE_SIZE;
1846                 }
1847         }
1848         atomic_inc(&po->mapped);
1849         vma->vm_ops = &packet_mmap_ops;
1850         err = 0;
1851
1852 out:
1853         release_sock(sk);
1854         return err;
1855 }
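/*
 * Userspace counterpart (sketch): the whole ring is mapped in a single
 * call at offset 0, with a length of tp_block_size * tp_block_nr to
 * satisfy the size check above ("req" and "fd" as in the earlier
 * sketch):
 *
 *	#include <sys/mman.h>
 *
 *	char *ring = mmap(NULL, req.tp_block_size * req.tp_block_nr,
 *			  PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 */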
1856 #endif
1857
1858
1859 #ifdef CONFIG_SOCK_PACKET
1860 static const struct proto_ops packet_ops_spkt = {
1861         .family =       PF_PACKET,
1862         .owner =        THIS_MODULE,
1863         .release =      packet_release,
1864         .bind =         packet_bind_spkt,
1865         .connect =      sock_no_connect,
1866         .socketpair =   sock_no_socketpair,
1867         .accept =       sock_no_accept,
1868         .getname =      packet_getname_spkt,
1869         .poll =         datagram_poll,
1870         .ioctl =        packet_ioctl,
1871         .listen =       sock_no_listen,
1872         .shutdown =     sock_no_shutdown,
1873         .setsockopt =   sock_no_setsockopt,
1874         .getsockopt =   sock_no_getsockopt,
1875         .sendmsg =      packet_sendmsg_spkt,
1876         .recvmsg =      packet_recvmsg,
1877         .mmap =         sock_no_mmap,
1878         .sendpage =     sock_no_sendpage,
1879 };
1880 #endif
1881
1882 static const struct proto_ops packet_ops = {
1883         .family =       PF_PACKET,
1884         .owner =        THIS_MODULE,
1885         .release =      packet_release,
1886         .bind =         packet_bind,
1887         .connect =      sock_no_connect,
1888         .socketpair =   sock_no_socketpair,
1889         .accept =       sock_no_accept,
1890         .getname =      packet_getname,
1891         .poll =         packet_poll,
1892         .ioctl =        packet_ioctl,
1893         .listen =       sock_no_listen,
1894         .shutdown =     sock_no_shutdown,
1895         .setsockopt =   packet_setsockopt,
1896         .getsockopt =   packet_getsockopt,
1897         .sendmsg =      packet_sendmsg,
1898         .recvmsg =      packet_recvmsg,
1899         .mmap =         packet_mmap,
1900         .sendpage =     sock_no_sendpage,
1901 };
1902
1903 static struct net_proto_family packet_family_ops = {
1904         .family =       PF_PACKET,
1905         .create =       packet_create,
1906         .owner  =       THIS_MODULE,
1907 };
1908
1909 static struct notifier_block packet_netdev_notifier = {
1910         .notifier_call = packet_notifier,
1911 };
1912
1913 #ifdef CONFIG_PROC_FS
1914 static inline struct sock *packet_seq_idx(loff_t off)
1915 {
1916         struct sock *s;
1917         struct hlist_node *node;
1918
1919         sk_for_each(s, node, &packet_sklist) {
1920                 if (!off--)
1921                         return s;
1922         }
1923         return NULL;
1924 }
1925
1926 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1927 {
1928         read_lock(&packet_sklist_lock);
1929         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1930 }
1931
1932 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1933 {
1934         ++*pos;
1935         return (v == SEQ_START_TOKEN)
1936                 ? sk_head(&packet_sklist)
1937                 : sk_next((struct sock *)v);
1938 }
1939
1940 static void packet_seq_stop(struct seq_file *seq, void *v)
1941 {
1942         read_unlock(&packet_sklist_lock);
1943 }
1944
1945 static int packet_seq_show(struct seq_file *seq, void *v)
1946 {
1947         if (v == SEQ_START_TOKEN)
1948                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1949         else {
1950                 struct sock *s = v;
1951                 const struct packet_sock *po = pkt_sk(s);
1952
1953                 seq_printf(seq,
1954                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1955                            s,
1956                            atomic_read(&s->sk_refcnt),
1957                            s->sk_type,
1958                            ntohs(po->num),
1959                            po->ifindex,
1960                            po->running,
1961                            atomic_read(&s->sk_rmem_alloc),
1962                            sock_i_uid(s),
1963                            sock_i_ino(s));
1964         }
1965
1966         return 0;
1967 }
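/*
 * One line of /proc/net/packet produced above then looks like (values
 * illustrative; type 3 is SOCK_RAW, proto 0003 is ETH_P_ALL, pointer
 * width depends on the architecture):
 *
 *	sk       RefCnt Type Proto  Iface R Rmem   User   Inode
 *	f7a1c800 3      3    0003   2     1 0      0      8123
 */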
1968
1969 static struct seq_operations packet_seq_ops = {
1970         .start  = packet_seq_start,
1971         .next   = packet_seq_next,
1972         .stop   = packet_seq_stop,
1973         .show   = packet_seq_show,
1974 };
1975
1976 static int packet_seq_open(struct inode *inode, struct file *file)
1977 {
1978         return seq_open(file, &packet_seq_ops);
1979 }
1980
1981 static const struct file_operations packet_seq_fops = {
1982         .owner          = THIS_MODULE,
1983         .open           = packet_seq_open,
1984         .read           = seq_read,
1985         .llseek         = seq_lseek,
1986         .release        = seq_release,
1987 };
1988
1989 #endif
1990
1991 static void __exit packet_exit(void)
1992 {
1993         proc_net_remove("packet");
1994         unregister_netdevice_notifier(&packet_netdev_notifier);
1995         sock_unregister(PF_PACKET);
1996         proto_unregister(&packet_proto);
1997 }
1998
1999 static int __init packet_init(void)
2000 {
2001         int rc = proto_register(&packet_proto, 0);
2002
2003         if (rc != 0)
2004                 goto out;
2005
2006         sock_register(&packet_family_ops);
2007         register_netdevice_notifier(&packet_netdev_notifier);
2008         proc_net_fops_create("packet", 0, &packet_seq_fops);
2009 out:
2010         return rc;
2011 }
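/*
 * Once registration succeeds, the family is reachable from userspace
 * as (sketch; requires CAP_NET_RAW):
 *
 *	int fd = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
 */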
2012
2013 module_init(packet_init);
2014 module_exit(packet_exit);
2015 MODULE_LICENSE("GPL");
2016 MODULE_ALIAS_NETPROTO(PF_PACKET);