[NET]: Make /proc/net per network namespace
[safe/jmp/linux-2.6] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and
38  *                                      packet_set_ring memory leak.
39  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
40  *                                      The convention is that longer addresses
41  *                                      will simply extend the hardware address
42  *                                      byte arrays at the end of sockaddr_ll
43  *                                      and packet_mreq.
44  *
45  *              This program is free software; you can redistribute it and/or
46  *              modify it under the terms of the GNU General Public License
47  *              as published by the Free Software Foundation; either version
48  *              2 of the License, or (at your option) any later version.
49  *
50  */
51
52 #include <linux/types.h>
53 #include <linux/mm.h>
54 #include <linux/capability.h>
55 #include <linux/fcntl.h>
56 #include <linux/socket.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/if_packet.h>
61 #include <linux/wireless.h>
62 #include <linux/kernel.h>
63 #include <linux/kmod.h>
64 #include <net/net_namespace.h>
65 #include <net/ip.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
68 #include <net/sock.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
74 #include <asm/page.h>
75 #include <asm/cacheflush.h>
76 #include <asm/io.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/poll.h>
80 #include <linux/module.h>
81 #include <linux/init.h>
82
83 #ifdef CONFIG_INET
84 #include <net/inet_common.h>
85 #endif
86
87 /*
88    Assumptions:
89    - if device has no dev->hard_header routine, it adds and removes ll header
90      inside itself. In this case ll header is invisible outside of device,
91      but higher levels still should reserve dev->hard_header_len.
92      Some devices are enough clever to reallocate skb, when header
93      will not fit to reserved space (tunnel), another ones are silly
94      (PPP).
95    - packet socket receives packets with pulled ll header,
96      so that SOCK_RAW should push it back.
97
98 On receive:
99 -----------
100
101 Incoming, dev->hard_header!=NULL
102    mac_header -> ll header
103    data       -> data
104
105 Outgoing, dev->hard_header!=NULL
106    mac_header -> ll header
107    data       -> ll header
108
109 Incoming, dev->hard_header==NULL
110    mac_header -> UNKNOWN position. It is very likely that it points to the ll
111                  header.  PPP does this, which is wrong, because it introduces
112                  asymmetry between the rx and tx paths.
113    data       -> data
114
115 Outgoing, dev->hard_header==NULL
116    mac_header -> data. ll header is still not built!
117    data       -> data
118
119 Resume
120   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
121
122
123 On transmit:
124 ------------
125
126 dev->hard_header != NULL
127    mac_header -> ll header
128    data       -> ll header
129
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
131    mac_header -> data
132    data       -> data
133
134    We should set nh.raw on output to the correct position;
135    the packet classifier depends on it.
136  */
137
138 /* List of all packet sockets. */
139 static HLIST_HEAD(packet_sklist);
/* Protects packet_sklist; packet_release() takes it for writing (with BHs off) to unlink a socket. */
140 static DEFINE_RWLOCK(packet_sklist_lock);
141
/* Number of packet sockets still alive; packet_sock_destruct() decrements it. */
142 static atomic_t packet_socks_nr;
143
144
145 /* Private packet socket structures. */
146
/*
 * Node of the singly linked list (chained through ->next) that hangs off
 * struct packet_sock::mclist.  The hardware address is stored inline in
 * addr[], which can hold up to MAX_ADDR_LEN bytes.
 * NOTE(review): ifindex/count/type/alen look like "interface, reference
 * count, membership type, address length" - confirm against the code that
 * adds and drops entries, which is outside this excerpt.
 */
147 struct packet_mclist
148 {
149         struct packet_mclist    *next;
150         int                     ifindex;
151         int                     count;
152         unsigned short          type;
153         unsigned short          alen;
154         unsigned char           addr[MAX_ADDR_LEN];
155 };
156 /* Identical to struct packet_mreq except that mr_address is
157  * MAX_ADDR_LEN bytes long, so longer hardware addresses fit.
158  */
159 struct packet_mreq_max
160 {
161         int             mr_ifindex;
162         unsigned short  mr_type;
163         unsigned short  mr_alen;
164         unsigned char   mr_address[MAX_ADDR_LEN];
165 };
166
167 #ifdef CONFIG_PACKET_MMAP
/* Forward declaration: packet_release() calls this with an all-zero request and closing=1. */
168 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
169 #endif
170
/* Forward declaration: packet_release() calls this before orphaning the socket. */
171 static void packet_flush_mclist(struct sock *sk);
172
/*
 * Per-socket state of a packet socket.  pkt_sk() converts a struct sock
 * pointer to this type with a plain cast, which is why sk must stay the
 * first member.
 */
173 struct packet_sock {
174         /* struct sock has to be the first member of packet_sock */
175         struct sock             sk;
            /* tp_packets/tp_drops are bumped by the receive hooks under sk_receive_queue.lock */
176         struct tpacket_stats    stats;
177 #ifdef CONFIG_PACKET_MMAP
            /* Ring storage: per-block base addresses, indexed by packet_lookup_frame() */
178         char *                  *pg_vec;
            /* Index of the ring frame tpacket_rcv() will try to fill next */
179         unsigned int            head;
180         unsigned int            frames_per_block;
181         unsigned int            frame_size;
            /* Last valid frame index; head wraps back to 0 after reaching it */
182         unsigned int            frame_max;
            /* Non-zero: tpacket_rcv() may also queue a full copy of frames that do not fit a ring slot */
183         int                     copy_thresh;
184 #endif
            /* Receive hook registered/unregistered with dev_add_pack()/dev_remove_pack() */
185         struct packet_type      prot_hook;
            /* Taken by packet_do_bind() while it changes the hook and the running flag */
186         spinlock_t              bind_lock;
187         unsigned int            running:1,      /* prot_hook is attached*/
188                                 auxdata:1,
                                    /* set: report orig_dev's ifindex for PACKET_HOST frames */
189                                 origdev:1;
190         int                     ifindex;        /* bound device         */
            /* Bound protocol; written by packet_do_bind(), cleared when the hook is detached */
191         __be16                  num;
192         struct packet_mclist    *mclist;
193 #ifdef CONFIG_PACKET_MMAP
194         atomic_t                mapped;
195         unsigned int            pg_vec_order;
196         unsigned int            pg_vec_pages;
197         unsigned int            pg_vec_len;
198 #endif
199 };
200
/*
 * Per-packet data kept in skb->cb by the receive hooks.
 * packet_rcv() stores skb->len in origlen before trimming to the snap length
 * and fills sa.ll; packet_rcv_spkt() fills sa.pkt.
 */
201 struct packet_skb_cb {
202         unsigned int origlen;
203         union {
204                 struct sockaddr_pkt pkt;
205                 struct sockaddr_ll ll;
206         } sa;
207 };
208
/* View an skb's control buffer as a struct packet_skb_cb. */
209 #define PACKET_SKB_CB(__skb)    ((struct packet_skb_cb *)((__skb)->cb))
210
#ifdef CONFIG_PACKET_MMAP

/*
 * Map a linear ring frame index to the header of that frame.
 *
 * The ring is stored as po->pg_vec[] blocks, each holding
 * po->frames_per_block frames of po->frame_size bytes, so the index is
 * split into a block number and a slot within that block.
 */
static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
{
	unsigned int block = position / po->frames_per_block;
	unsigned int slot = position % po->frames_per_block;
	char *frame = po->pg_vec[block] + (slot * po->frame_size);

	return (struct tpacket_hdr *)frame;
}
#endif
223
/*
 * Convert a generic sock pointer into the packet socket that contains it.
 * A plain cast is enough because struct sock is the first member of
 * struct packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	struct packet_sock *po = (struct packet_sock *)sk;

	return po;
}
228
229 static void packet_sock_destruct(struct sock *sk)
230 {
231         BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
232         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
233
234         if (!sock_flag(sk, SOCK_DEAD)) {
235                 printk("Attempt to release alive packet socket: %p\n", sk);
236                 return;
237         }
238
239         atomic_dec(&packet_socks_nr);
240 #ifdef PACKET_REFCNT_DEBUG
241         printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
242 #endif
243 }
244
245
/* Operation tables, declared here ahead of use; packet_create() installs packet_ops by default. */
246 static const struct proto_ops packet_ops;
247
/* packet_create() installs this table instead when the socket type is SOCK_PACKET. */
248 static const struct proto_ops packet_ops_spkt;
249
250 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
251 {
252         struct sock *sk;
253         struct sockaddr_pkt *spkt;
254
255         /*
256          *      When we registered the protocol we saved the socket in the data
257          *      field for just this event.
258          */
259
260         sk = pt->af_packet_priv;
261
262         /*
263          *      Yank back the headers [hope the device set this
264          *      right or kerboom...]
265          *
266          *      Incoming packets have ll header pulled,
267          *      push it back.
268          *
269          *      For outgoing ones skb->data == skb_mac_header(skb)
270          *      so that this procedure is noop.
271          */
272
273         if (skb->pkt_type == PACKET_LOOPBACK)
274                 goto out;
275
276         if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
277                 goto oom;
278
279         /* drop any routing info */
280         dst_release(skb->dst);
281         skb->dst = NULL;
282
283         /* drop conntrack reference */
284         nf_reset(skb);
285
286         spkt = &PACKET_SKB_CB(skb)->sa.pkt;
287
288         skb_push(skb, skb->data - skb_mac_header(skb));
289
290         /*
291          *      The SOCK_PACKET socket receives _all_ frames.
292          */
293
294         spkt->spkt_family = dev->type;
295         strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
296         spkt->spkt_protocol = skb->protocol;
297
298         /*
299          *      Charge the memory to the socket. This is done specifically
300          *      to prevent sockets using all the memory up.
301          */
302
303         if (sock_queue_rcv_skb(sk,skb) == 0)
304                 return 0;
305
306 out:
307         kfree_skb(skb);
308 oom:
309         return 0;
310 }
311
312
313 /*
314  *      Output a raw packet to a device layer. This bypasses all the other
315  *      protocol layers and you must therefore supply it with a complete frame
316  */
317
318 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
319                                struct msghdr *msg, size_t len)
320 {
321         struct sock *sk = sock->sk;
322         struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
323         struct sk_buff *skb;
324         struct net_device *dev;
325         __be16 proto=0;
326         int err;
327
328         /*
329          *      Get and verify the address.
330          */
331
332         if (saddr)
333         {
334                 if (msg->msg_namelen < sizeof(struct sockaddr))
335                         return(-EINVAL);
336                 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
337                         proto=saddr->spkt_protocol;
338         }
339         else
340                 return(-ENOTCONN);      /* SOCK_PACKET must be sent giving an address */
341
342         /*
343          *      Find the device first to size check it
344          */
345
346         saddr->spkt_device[13] = 0;
347         dev = dev_get_by_name(saddr->spkt_device);
348         err = -ENODEV;
349         if (dev == NULL)
350                 goto out_unlock;
351
352         err = -ENETDOWN;
353         if (!(dev->flags & IFF_UP))
354                 goto out_unlock;
355
356         /*
357          *      You may not queue a frame bigger than the mtu. This is the lowest level
358          *      raw protocol and you must do your own fragmentation at this level.
359          */
360
361         err = -EMSGSIZE;
362         if (len > dev->mtu + dev->hard_header_len)
363                 goto out_unlock;
364
365         err = -ENOBUFS;
366         skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
367
368         /*
369          *      If the write buffer is full, then tough. At this level the user gets to
370          *      deal with the problem - do your own algorithmic backoffs. That's far
371          *      more flexible.
372          */
373
374         if (skb == NULL)
375                 goto out_unlock;
376
377         /*
378          *      Fill it in
379          */
380
381         /* FIXME: Save some space for broken drivers that write a
382          * hard header at transmission time by themselves. PPP is the
383          * notable one here. This should really be fixed at the driver level.
384          */
385         skb_reserve(skb, LL_RESERVED_SPACE(dev));
386         skb_reset_network_header(skb);
387
388         /* Try to align data part correctly */
389         if (dev->hard_header) {
390                 skb->data -= dev->hard_header_len;
391                 skb->tail -= dev->hard_header_len;
392                 if (len < dev->hard_header_len)
393                         skb_reset_network_header(skb);
394         }
395
396         /* Returns -EFAULT on error */
397         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
398         skb->protocol = proto;
399         skb->dev = dev;
400         skb->priority = sk->sk_priority;
401         if (err)
402                 goto out_free;
403
404         /*
405          *      Now send it
406          */
407
408         dev_queue_xmit(skb);
409         dev_put(dev);
410         return(len);
411
412 out_free:
413         kfree_skb(skb);
414 out_unlock:
415         if (dev)
416                 dev_put(dev);
417         return err;
418 }
419
420 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
421                                       unsigned int res)
422 {
423         struct sk_filter *filter;
424
425         rcu_read_lock_bh();
426         filter = rcu_dereference(sk->sk_filter);
427         if (filter != NULL)
428                 res = sk_run_filter(skb, filter->insns, filter->len);
429         rcu_read_unlock_bh();
430
431         return res;
432 }
433
434 /*
435    This function makes lazy skb cloning in hope that most of packets
436    are discarded by BPF.
437
438    Note tricky part: we DO mangle shared skb! skb->data, skb->len
439    and skb->cb are mangled. It works because (and until) packets
440    falling here are owned by current CPU. Output packets are cloned
441    by dev_queue_xmit_nit(), input packets are processed by net_bh
442    sequentially, so that if we return skb to original state on exit,
443    we will not harm anyone.

   Receive hook for non-ring packet sockets.  Always returns 0 and always
   consumes the reference it was given: the skb is either queued on the
   socket's receive queue (possibly as a clone) or freed.  Drops caused by
   a full receive buffer, a failed clone or a failed trim are counted in
   stats.tp_drops; frames rejected by the filter and loopback frames are
   dropped without being counted.
444  */
445
446 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
447 {
448         struct sock *sk;
449         struct sockaddr_ll *sll;
450         struct packet_sock *po;
            /* Snapshot of data/len so a shared skb can be put back as we found it. */
451         u8 * skb_head = skb->data;
452         int skb_len = skb->len;
453         unsigned int snaplen, res;
454
455         if (skb->pkt_type == PACKET_LOOPBACK)
456                 goto drop;
457
458         sk = pt->af_packet_priv;
459         po = pkt_sk(sk);
460
461         skb->dev = dev;
462
463         if (dev->hard_header) {
464                 /* The device has an explicit notion of ll header,
465                    exported to higher levels.
466
467                    Otherwise, the device hides details of its frame
468                    structure, so that corresponding packet head
469                    never delivered to user.
470                  */
                    /* Non-DGRAM sockets see the ll header: move data back to the mac header. */
471                 if (sk->sk_type != SOCK_DGRAM)
472                         skb_push(skb, skb->data - skb_mac_header(skb));
473                 else if (skb->pkt_type == PACKET_OUTGOING) {
474                         /* Special case: outgoing packets have ll header at head */
475                         skb_pull(skb, skb_network_offset(skb));
476                 }
477         }
478
479         snaplen = skb->len;
480
            /* A zero verdict rejects the frame; otherwise the verdict caps the snap length. */
481         res = run_filter(skb, sk, snaplen);
482         if (!res)
483                 goto drop_n_restore;
484         if (snaplen > res)
485                 snaplen = res;
486
            /* Receive buffer would be exceeded: count the drop. */
487         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
488             (unsigned)sk->sk_rcvbuf)
489                 goto drop_n_acct;
490
            /*
             * The frame will be kept, so stop working on a shared skb: clone
             * it, restore the original's data/len if we moved them, release
             * our reference to the original and carry on with the clone.
             */
491         if (skb_shared(skb)) {
492                 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
493                 if (nskb == NULL)
494                         goto drop_n_acct;
495
496                 if (skb_head != skb->data) {
497                         skb->data = skb_head;
498                         skb->len = skb_len;
499                 }
500                 kfree_skb(skb);
501                 skb = nskb;
502         }
503
            /* Compile-time check that skb->cb can hold the control block plus MAX_ADDR_LEN - 8 extra address bytes. */
504         BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
505                      sizeof(skb->cb));
506
            /* Build the source address kept alongside the packet in skb->cb. */
507         sll = &PACKET_SKB_CB(skb)->sa.ll;
508         sll->sll_family = AF_PACKET;
509         sll->sll_hatype = dev->type;
510         sll->sll_protocol = skb->protocol;
511         sll->sll_pkttype = skb->pkt_type;
512         if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
513                 sll->sll_ifindex = orig_dev->ifindex;
514         else
515                 sll->sll_ifindex = dev->ifindex;
516         sll->sll_halen = 0;
517
518         if (dev->hard_header_parse)
519                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
520
            /* Remember the length before trimming to the snap length. */
521         PACKET_SKB_CB(skb)->origlen = skb->len;
522
523         if (pskb_trim(skb, snaplen))
524                 goto drop_n_acct;
525
526         skb_set_owner_r(skb, sk);
527         skb->dev = NULL;
528         dst_release(skb->dst);
529         skb->dst = NULL;
530
531         /* drop conntrack reference */
532         nf_reset(skb);
533
            /* Counter update and enqueue happen together under the queue lock. */
534         spin_lock(&sk->sk_receive_queue.lock);
535         po->stats.tp_packets++;
536         __skb_queue_tail(&sk->sk_receive_queue, skb);
537         spin_unlock(&sk->sk_receive_queue.lock);
538         sk->sk_data_ready(sk, skb->len);
539         return 0;
540
541 drop_n_acct:
542         spin_lock(&sk->sk_receive_queue.lock);
543         po->stats.tp_drops++;
544         spin_unlock(&sk->sk_receive_queue.lock);
545
546 drop_n_restore:
            /* Undo our push/pull on an skb that other holders still reference. */
547         if (skb_head != skb->data && skb_shared(skb)) {
548                 skb->data = skb_head;
549                 skb->len = skb_len;
550         }
551 drop:
552         kfree_skb(skb);
553         return 0;
554 }
555
556 #ifdef CONFIG_PACKET_MMAP
/*
 * Receive hook for packet sockets using the frame ring.
 *
 * Copies up to one ring frame's worth of the packet into the frame at
 * po->head, fills in the tpacket_hdr and the sockaddr_ll that follows it,
 * then publishes the frame by writing tp_status.  The skb itself is always
 * released here (the success path falls through to drop_n_restore).
 * If the frame at po->head is still in use the packet is counted in
 * stats.tp_drops.  Always returns 0.
 */
557 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
558 {
559         struct sock *sk;
560         struct packet_sock *po;
561         struct sockaddr_ll *sll;
562         struct tpacket_hdr *h;
            /* Snapshot of data/len so a shared skb can be restored before release. */
563         u8 * skb_head = skb->data;
564         int skb_len = skb->len;
565         unsigned int snaplen, res;
            /* LOSING is cleared further down if no drops have been recorded. */
566         unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
567         unsigned short macoff, netoff;
568         struct sk_buff *copy_skb = NULL;
569         struct timeval tv;
570
571         if (skb->pkt_type == PACKET_LOOPBACK)
572                 goto drop;
573
574         sk = pt->af_packet_priv;
575         po = pkt_sk(sk);
576
577         if (dev->hard_header) {
578                 if (sk->sk_type != SOCK_DGRAM)
579                         skb_push(skb, skb->data - skb_mac_header(skb));
580                 else if (skb->pkt_type == PACKET_OUTGOING) {
581                         /* Special case: outgoing packets have ll header at head */
582                         skb_pull(skb, skb_network_offset(skb));
583                 }
584         }
585
586         if (skb->ip_summed == CHECKSUM_PARTIAL)
587                 status |= TP_STATUS_CSUMNOTREADY;
588
589         snaplen = skb->len;
590
            /* A zero verdict rejects the frame; otherwise the verdict caps the snap length. */
591         res = run_filter(skb, sk, snaplen);
592         if (!res)
593                 goto drop_n_restore;
594         if (snaplen > res)
595                 snaplen = res;
596
            /*
             * Offsets of the mac and network headers inside the ring frame.
             * Both leave at least 16 bytes after the tpacket header.
             */
597         if (sk->sk_type == SOCK_DGRAM) {
598                 macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
599         } else {
600                 unsigned maclen = skb_network_offset(skb);
601                 netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
602                 macoff = netoff - maclen;
603         }
604
            /*
             * Packet does not fit in one frame.  If copy_thresh is set and the
             * receive buffer has room, keep a reference/clone of the whole
             * packet to queue on sk_receive_queue; either way truncate what
             * goes into the ring to the space available (never below zero).
             */
605         if (macoff + snaplen > po->frame_size) {
606                 if (po->copy_thresh &&
607                     atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
608                     (unsigned)sk->sk_rcvbuf) {
609                         if (skb_shared(skb)) {
610                                 copy_skb = skb_clone(skb, GFP_ATOMIC);
611                         } else {
                                    /*
                                     * We keep this very skb, so make the restore
                                     * check at drop_n_restore a no-op.
                                     */
612                                 copy_skb = skb_get(skb);
613                                 skb_head = skb->data;
614                         }
615                         if (copy_skb)
616                                 skb_set_owner_r(copy_skb, sk);
617                 }
618                 snaplen = po->frame_size - macoff;
619                 if ((int)snaplen < 0)
620                         snaplen = 0;
621         }
622
            /* Claim the next frame under the queue lock; a non-zero status means it is still in use. */
623         spin_lock(&sk->sk_receive_queue.lock);
624         h = packet_lookup_frame(po, po->head);
625
626         if (h->tp_status)
627                 goto ring_is_full;
            /* Advance head, wrapping to 0 after frame_max. */
628         po->head = po->head != po->frame_max ? po->head+1 : 0;
629         po->stats.tp_packets++;
630         if (copy_skb) {
631                 status |= TP_STATUS_COPY;
632                 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
633         }
634         if (!po->stats.tp_drops)
635                 status &= ~TP_STATUS_LOSING;
636         spin_unlock(&sk->sk_receive_queue.lock);
637
            /* Copy the (possibly truncated) packet into the frame and fill in its header. */
638         skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
639
640         h->tp_len = skb->len;
641         h->tp_snaplen = snaplen;
642         h->tp_mac = macoff;
643         h->tp_net = netoff;
            /* Use the skb's timestamp when it has one, otherwise the current time. */
644         if (skb->tstamp.tv64)
645                 tv = ktime_to_timeval(skb->tstamp);
646         else
647                 do_gettimeofday(&tv);
648         h->tp_sec = tv.tv_sec;
649         h->tp_usec = tv.tv_usec;
650
            /* The source address lives right after the aligned tpacket header. */
651         sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
652         sll->sll_halen = 0;
653         if (dev->hard_header_parse)
654                 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
655         sll->sll_family = AF_PACKET;
656         sll->sll_hatype = dev->type;
657         sll->sll_protocol = skb->protocol;
658         sll->sll_pkttype = skb->pkt_type;
659         if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
660                 sll->sll_ifindex = orig_dev->ifindex;
661         else
662                 sll->sll_ifindex = dev->ifindex;
663
            /* Status is written last, after the frame contents, followed by a full barrier. */
664         h->tp_status = status;
665         smp_mb();
666
            /* Flush every page touched, from the frame header to the last copied byte. */
667         {
668                 struct page *p_start, *p_end;
669                 u8 *h_end = (u8 *)h + macoff + snaplen - 1;
670
671                 p_start = virt_to_page(h);
672                 p_end = virt_to_page(h_end);
673                 while (p_start <= p_end) {
674                         flush_dcache_page(p_start);
675                         p_start++;
676                 }
677         }
678
679         sk->sk_data_ready(sk, 0);
680
            /* Success falls through: the data was copied, so the skb is released here too. */
681 drop_n_restore:
682         if (skb_head != skb->data && skb_shared(skb)) {
683                 skb->data = skb_head;
684                 skb->len = skb_len;
685         }
686 drop:
687         kfree_skb(skb);
688         return 0;
689
690 ring_is_full:
            /* Entered with the queue lock held: count the drop, then unlock. */
691         po->stats.tp_drops++;
692         spin_unlock(&sk->sk_receive_queue.lock);
693
694         sk->sk_data_ready(sk, 0);
695         if (copy_skb)
696                 kfree_skb(copy_skb);
697         goto drop_n_restore;
698 }
699
700 #endif
701
702
703 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
704                           struct msghdr *msg, size_t len)
705 {
706         struct sock *sk = sock->sk;
707         struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
708         struct sk_buff *skb;
709         struct net_device *dev;
710         __be16 proto;
711         unsigned char *addr;
712         int ifindex, err, reserve = 0;
713
714         /*
715          *      Get and verify the address.
716          */
717
718         if (saddr == NULL) {
719                 struct packet_sock *po = pkt_sk(sk);
720
721                 ifindex = po->ifindex;
722                 proto   = po->num;
723                 addr    = NULL;
724         } else {
725                 err = -EINVAL;
726                 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
727                         goto out;
728                 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
729                         goto out;
730                 ifindex = saddr->sll_ifindex;
731                 proto   = saddr->sll_protocol;
732                 addr    = saddr->sll_addr;
733         }
734
735
736         dev = dev_get_by_index(ifindex);
737         err = -ENXIO;
738         if (dev == NULL)
739                 goto out_unlock;
740         if (sock->type == SOCK_RAW)
741                 reserve = dev->hard_header_len;
742
743         err = -ENETDOWN;
744         if (!(dev->flags & IFF_UP))
745                 goto out_unlock;
746
747         err = -EMSGSIZE;
748         if (len > dev->mtu+reserve)
749                 goto out_unlock;
750
751         skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
752                                 msg->msg_flags & MSG_DONTWAIT, &err);
753         if (skb==NULL)
754                 goto out_unlock;
755
756         skb_reserve(skb, LL_RESERVED_SPACE(dev));
757         skb_reset_network_header(skb);
758
759         if (dev->hard_header) {
760                 int res;
761                 err = -EINVAL;
762                 res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
763                 if (sock->type != SOCK_DGRAM) {
764                         skb_reset_tail_pointer(skb);
765                         skb->len = 0;
766                 } else if (res < 0)
767                         goto out_free;
768         }
769
770         /* Returns -EFAULT on error */
771         err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
772         if (err)
773                 goto out_free;
774
775         skb->protocol = proto;
776         skb->dev = dev;
777         skb->priority = sk->sk_priority;
778
779         /*
780          *      Now send it
781          */
782
783         err = dev_queue_xmit(skb);
784         if (err > 0 && (err = net_xmit_errno(err)) != 0)
785                 goto out_unlock;
786
787         dev_put(dev);
788
789         return(len);
790
791 out_free:
792         kfree_skb(skb);
793 out_unlock:
794         if (dev)
795                 dev_put(dev);
796 out:
797         return err;
798 }
799
800 /*
801  *      Close a PACKET socket. This is fairly simple. We immediately go
802  *      to 'closed' state and remove our protocol entry in the device list.
803  */
804
805 static int packet_release(struct socket *sock)
806 {
807         struct sock *sk = sock->sk;
808         struct packet_sock *po;
809
810         if (!sk)
811                 return 0;
812
813         po = pkt_sk(sk);
814
815         write_lock_bh(&packet_sklist_lock);
816         sk_del_node_init(sk);
817         write_unlock_bh(&packet_sklist_lock);
818
819         /*
820          *      Unhook packet receive handler.
821          */
822
823         if (po->running) {
824                 /*
825                  *      Remove the protocol hook
826                  */
827                 dev_remove_pack(&po->prot_hook);
828                 po->running = 0;
829                 po->num = 0;
830                 __sock_put(sk);
831         }
832
833         packet_flush_mclist(sk);
834
835 #ifdef CONFIG_PACKET_MMAP
836         if (po->pg_vec) {
837                 struct tpacket_req req;
838                 memset(&req, 0, sizeof(req));
839                 packet_set_ring(sk, &req, 1);
840         }
841 #endif
842
843         /*
844          *      Now the socket is dead. No more input will appear.
845          */
846
847         sock_orphan(sk);
848         sock->sk = NULL;
849
850         /* Purge queues */
851
852         skb_queue_purge(&sk->sk_receive_queue);
853
854         sock_put(sk);
855         return 0;
856 }
857
858 /*
859  *      Attach a packet hook.
860  */
861
862 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
863 {
864         struct packet_sock *po = pkt_sk(sk);
865         /*
866          *      Detach an existing hook if present.
867          */
868
869         lock_sock(sk);
870
871         spin_lock(&po->bind_lock);
872         if (po->running) {
873                 __sock_put(sk);
874                 po->running = 0;
875                 po->num = 0;
876                 spin_unlock(&po->bind_lock);
877                 dev_remove_pack(&po->prot_hook);
878                 spin_lock(&po->bind_lock);
879         }
880
881         po->num = protocol;
882         po->prot_hook.type = protocol;
883         po->prot_hook.dev = dev;
884
885         po->ifindex = dev ? dev->ifindex : 0;
886
887         if (protocol == 0)
888                 goto out_unlock;
889
890         if (dev) {
891                 if (dev->flags&IFF_UP) {
892                         dev_add_pack(&po->prot_hook);
893                         sock_hold(sk);
894                         po->running = 1;
895                 } else {
896                         sk->sk_err = ENETDOWN;
897                         if (!sock_flag(sk, SOCK_DEAD))
898                                 sk->sk_error_report(sk);
899                 }
900         } else {
901                 dev_add_pack(&po->prot_hook);
902                 sock_hold(sk);
903                 po->running = 1;
904         }
905
906 out_unlock:
907         spin_unlock(&po->bind_lock);
908         release_sock(sk);
909         return 0;
910 }
911
912 /*
913  *      Bind a packet socket to a device
914  */
915
916 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
917 {
918         struct sock *sk=sock->sk;
919         char name[15];
920         struct net_device *dev;
921         int err = -ENODEV;
922
923         /*
924          *      Check legality
925          */
926
927         if (addr_len != sizeof(struct sockaddr))
928                 return -EINVAL;
929         strlcpy(name,uaddr->sa_data,sizeof(name));
930
931         dev = dev_get_by_name(name);
932         if (dev) {
933                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
934                 dev_put(dev);
935         }
936         return err;
937 }
938
939 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
940 {
941         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
942         struct sock *sk=sock->sk;
943         struct net_device *dev = NULL;
944         int err;
945
946
947         /*
948          *      Check legality
949          */
950
951         if (addr_len < sizeof(struct sockaddr_ll))
952                 return -EINVAL;
953         if (sll->sll_family != AF_PACKET)
954                 return -EINVAL;
955
956         if (sll->sll_ifindex) {
957                 err = -ENODEV;
958                 dev = dev_get_by_index(sll->sll_ifindex);
959                 if (dev == NULL)
960                         goto out;
961         }
962         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
963         if (dev)
964                 dev_put(dev);
965
966 out:
967         return err;
968 }
969
970 static struct proto packet_proto = {
971         .name     = "PACKET",
972         .owner    = THIS_MODULE,
973         .obj_size = sizeof(struct packet_sock),
974 };
975
976 /*
977  *      Create a packet of type SOCK_PACKET.
978  */
979
980 static int packet_create(struct socket *sock, int protocol)
981 {
982         struct sock *sk;
983         struct packet_sock *po;
984         __be16 proto = (__force __be16)protocol; /* weird, but documented */
985         int err;
986
987         if (!capable(CAP_NET_RAW))
988                 return -EPERM;
989         if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
990             sock->type != SOCK_PACKET)
991                 return -ESOCKTNOSUPPORT;
992
993         sock->state = SS_UNCONNECTED;
994
995         err = -ENOBUFS;
996         sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
997         if (sk == NULL)
998                 goto out;
999
1000         sock->ops = &packet_ops;
1001         if (sock->type == SOCK_PACKET)
1002                 sock->ops = &packet_ops_spkt;
1003
1004         sock_init_data(sock, sk);
1005
1006         po = pkt_sk(sk);
1007         sk->sk_family = PF_PACKET;
1008         po->num = proto;
1009
1010         sk->sk_destruct = packet_sock_destruct;
1011         atomic_inc(&packet_socks_nr);
1012
1013         /*
1014          *      Attach a protocol block
1015          */
1016
1017         spin_lock_init(&po->bind_lock);
1018         po->prot_hook.func = packet_rcv;
1019
1020         if (sock->type == SOCK_PACKET)
1021                 po->prot_hook.func = packet_rcv_spkt;
1022
1023         po->prot_hook.af_packet_priv = sk;
1024
1025         if (proto) {
1026                 po->prot_hook.type = proto;
1027                 dev_add_pack(&po->prot_hook);
1028                 sock_hold(sk);
1029                 po->running = 1;
1030         }
1031
1032         write_lock_bh(&packet_sklist_lock);
1033         sk_add_node(sk, &packet_sklist);
1034         write_unlock_bh(&packet_sklist_lock);
1035         return(0);
1036 out:
1037         return err;
1038 }
1039
1040 /*
1041  *      Pull a packet from our receive queue and hand it to the user.
1042  *      If necessary we block.
1043  */
1044
1045 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1046                           struct msghdr *msg, size_t len, int flags)
1047 {
1048         struct sock *sk = sock->sk;
1049         struct sk_buff *skb;
1050         int copied, err;
1051         struct sockaddr_ll *sll;
1052
1053         err = -EINVAL;
1054         if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1055                 goto out;
1056
1057 #if 0
1058         /* What error should we return now? EUNATTACH? */
1059         if (pkt_sk(sk)->ifindex < 0)
1060                 return -ENODEV;
1061 #endif
1062
1063         /*
1064          *      Call the generic datagram receiver. This handles all sorts
1065          *      of horrible races and re-entrancy so we can forget about it
1066          *      in the protocol layers.
1067          *
1068          *      Now it will return ENETDOWN, if device have just gone down,
1069          *      but then it will block.
1070          */
1071
1072         skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1073
1074         /*
1075          *      An error occurred so return it. Because skb_recv_datagram()
1076          *      handles the blocking we don't see and worry about blocking
1077          *      retries.
1078          */
1079
1080         if (skb == NULL)
1081                 goto out;
1082
1083         /*
1084          *      If the address length field is there to be filled in, we fill
1085          *      it in now.
1086          */
1087
1088         sll = &PACKET_SKB_CB(skb)->sa.ll;
1089         if (sock->type == SOCK_PACKET)
1090                 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1091         else
1092                 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1093
1094         /*
1095          *      You lose any data beyond the buffer you gave. If it worries a
1096          *      user program they can ask the device for its MTU anyway.
1097          */
1098
1099         copied = skb->len;
1100         if (copied > len)
1101         {
1102                 copied=len;
1103                 msg->msg_flags|=MSG_TRUNC;
1104         }
1105
1106         err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1107         if (err)
1108                 goto out_free;
1109
1110         sock_recv_timestamp(msg, sk, skb);
1111
1112         if (msg->msg_name)
1113                 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1114                        msg->msg_namelen);
1115
1116         if (pkt_sk(sk)->auxdata) {
1117                 struct tpacket_auxdata aux;
1118
1119                 aux.tp_status = TP_STATUS_USER;
1120                 if (skb->ip_summed == CHECKSUM_PARTIAL)
1121                         aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1122                 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1123                 aux.tp_snaplen = skb->len;
1124                 aux.tp_mac = 0;
1125                 aux.tp_net = skb_network_offset(skb);
1126
1127                 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1128         }
1129
1130         /*
1131          *      Free or return the buffer as appropriate. Again this
1132          *      hides all the races and re-entrancy issues from us.
1133          */
1134         err = (flags&MSG_TRUNC) ? skb->len : copied;
1135
1136 out_free:
1137         skb_free_datagram(sk, skb);
1138 out:
1139         return err;
1140 }
1141
1142 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1143                                int *uaddr_len, int peer)
1144 {
1145         struct net_device *dev;
1146         struct sock *sk = sock->sk;
1147
1148         if (peer)
1149                 return -EOPNOTSUPP;
1150
1151         uaddr->sa_family = AF_PACKET;
1152         dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1153         if (dev) {
1154                 strlcpy(uaddr->sa_data, dev->name, 15);
1155                 dev_put(dev);
1156         } else
1157                 memset(uaddr->sa_data, 0, 14);
1158         *uaddr_len = sizeof(*uaddr);
1159
1160         return 0;
1161 }
1162
1163 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1164                           int *uaddr_len, int peer)
1165 {
1166         struct net_device *dev;
1167         struct sock *sk = sock->sk;
1168         struct packet_sock *po = pkt_sk(sk);
1169         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1170
1171         if (peer)
1172                 return -EOPNOTSUPP;
1173
1174         sll->sll_family = AF_PACKET;
1175         sll->sll_ifindex = po->ifindex;
1176         sll->sll_protocol = po->num;
1177         dev = dev_get_by_index(po->ifindex);
1178         if (dev) {
1179                 sll->sll_hatype = dev->type;
1180                 sll->sll_halen = dev->addr_len;
1181                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1182                 dev_put(dev);
1183         } else {
1184                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1185                 sll->sll_halen = 0;
1186         }
1187         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1188
1189         return 0;
1190 }
1191
1192 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1193 {
1194         switch (i->type) {
1195         case PACKET_MR_MULTICAST:
1196                 if (what > 0)
1197                         dev_mc_add(dev, i->addr, i->alen, 0);
1198                 else
1199                         dev_mc_delete(dev, i->addr, i->alen, 0);
1200                 break;
1201         case PACKET_MR_PROMISC:
1202                 dev_set_promiscuity(dev, what);
1203                 break;
1204         case PACKET_MR_ALLMULTI:
1205                 dev_set_allmulti(dev, what);
1206                 break;
1207         default:;
1208         }
1209 }
1210
1211 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1212 {
1213         for ( ; i; i=i->next) {
1214                 if (i->ifindex == dev->ifindex)
1215                         packet_dev_mc(dev, i, what);
1216         }
1217 }
1218
1219 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1220 {
1221         struct packet_sock *po = pkt_sk(sk);
1222         struct packet_mclist *ml, *i;
1223         struct net_device *dev;
1224         int err;
1225
1226         rtnl_lock();
1227
1228         err = -ENODEV;
1229         dev = __dev_get_by_index(mreq->mr_ifindex);
1230         if (!dev)
1231                 goto done;
1232
1233         err = -EINVAL;
1234         if (mreq->mr_alen > dev->addr_len)
1235                 goto done;
1236
1237         err = -ENOBUFS;
1238         i = kmalloc(sizeof(*i), GFP_KERNEL);
1239         if (i == NULL)
1240                 goto done;
1241
1242         err = 0;
1243         for (ml = po->mclist; ml; ml = ml->next) {
1244                 if (ml->ifindex == mreq->mr_ifindex &&
1245                     ml->type == mreq->mr_type &&
1246                     ml->alen == mreq->mr_alen &&
1247                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1248                         ml->count++;
1249                         /* Free the new element ... */
1250                         kfree(i);
1251                         goto done;
1252                 }
1253         }
1254
1255         i->type = mreq->mr_type;
1256         i->ifindex = mreq->mr_ifindex;
1257         i->alen = mreq->mr_alen;
1258         memcpy(i->addr, mreq->mr_address, i->alen);
1259         i->count = 1;
1260         i->next = po->mclist;
1261         po->mclist = i;
1262         packet_dev_mc(dev, i, +1);
1263
1264 done:
1265         rtnl_unlock();
1266         return err;
1267 }
1268
1269 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1270 {
1271         struct packet_mclist *ml, **mlp;
1272
1273         rtnl_lock();
1274
1275         for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1276                 if (ml->ifindex == mreq->mr_ifindex &&
1277                     ml->type == mreq->mr_type &&
1278                     ml->alen == mreq->mr_alen &&
1279                     memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1280                         if (--ml->count == 0) {
1281                                 struct net_device *dev;
1282                                 *mlp = ml->next;
1283                                 dev = dev_get_by_index(ml->ifindex);
1284                                 if (dev) {
1285                                         packet_dev_mc(dev, ml, -1);
1286                                         dev_put(dev);
1287                                 }
1288                                 kfree(ml);
1289                         }
1290                         rtnl_unlock();
1291                         return 0;
1292                 }
1293         }
1294         rtnl_unlock();
1295         return -EADDRNOTAVAIL;
1296 }
1297
1298 static void packet_flush_mclist(struct sock *sk)
1299 {
1300         struct packet_sock *po = pkt_sk(sk);
1301         struct packet_mclist *ml;
1302
1303         if (!po->mclist)
1304                 return;
1305
1306         rtnl_lock();
1307         while ((ml = po->mclist) != NULL) {
1308                 struct net_device *dev;
1309
1310                 po->mclist = ml->next;
1311                 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1312                         packet_dev_mc(dev, ml, -1);
1313                         dev_put(dev);
1314                 }
1315                 kfree(ml);
1316         }
1317         rtnl_unlock();
1318 }
1319
1320 static int
1321 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1322 {
1323         struct sock *sk = sock->sk;
1324         struct packet_sock *po = pkt_sk(sk);
1325         int ret;
1326
1327         if (level != SOL_PACKET)
1328                 return -ENOPROTOOPT;
1329
1330         switch(optname) {
1331         case PACKET_ADD_MEMBERSHIP:
1332         case PACKET_DROP_MEMBERSHIP:
1333         {
1334                 struct packet_mreq_max mreq;
1335                 int len = optlen;
1336                 memset(&mreq, 0, sizeof(mreq));
1337                 if (len < sizeof(struct packet_mreq))
1338                         return -EINVAL;
1339                 if (len > sizeof(mreq))
1340                         len = sizeof(mreq);
1341                 if (copy_from_user(&mreq,optval,len))
1342                         return -EFAULT;
1343                 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1344                         return -EINVAL;
1345                 if (optname == PACKET_ADD_MEMBERSHIP)
1346                         ret = packet_mc_add(sk, &mreq);
1347                 else
1348                         ret = packet_mc_drop(sk, &mreq);
1349                 return ret;
1350         }
1351
1352 #ifdef CONFIG_PACKET_MMAP
1353         case PACKET_RX_RING:
1354         {
1355                 struct tpacket_req req;
1356
1357                 if (optlen<sizeof(req))
1358                         return -EINVAL;
1359                 if (copy_from_user(&req,optval,sizeof(req)))
1360                         return -EFAULT;
1361                 return packet_set_ring(sk, &req, 0);
1362         }
1363         case PACKET_COPY_THRESH:
1364         {
1365                 int val;
1366
1367                 if (optlen!=sizeof(val))
1368                         return -EINVAL;
1369                 if (copy_from_user(&val,optval,sizeof(val)))
1370                         return -EFAULT;
1371
1372                 pkt_sk(sk)->copy_thresh = val;
1373                 return 0;
1374         }
1375 #endif
1376         case PACKET_AUXDATA:
1377         {
1378                 int val;
1379
1380                 if (optlen < sizeof(val))
1381                         return -EINVAL;
1382                 if (copy_from_user(&val, optval, sizeof(val)))
1383                         return -EFAULT;
1384
1385                 po->auxdata = !!val;
1386                 return 0;
1387         }
1388         case PACKET_ORIGDEV:
1389         {
1390                 int val;
1391
1392                 if (optlen < sizeof(val))
1393                         return -EINVAL;
1394                 if (copy_from_user(&val, optval, sizeof(val)))
1395                         return -EFAULT;
1396
1397                 po->origdev = !!val;
1398                 return 0;
1399         }
1400         default:
1401                 return -ENOPROTOOPT;
1402         }
1403 }
1404
1405 static int packet_getsockopt(struct socket *sock, int level, int optname,
1406                              char __user *optval, int __user *optlen)
1407 {
1408         int len;
1409         int val;
1410         struct sock *sk = sock->sk;
1411         struct packet_sock *po = pkt_sk(sk);
1412         void *data;
1413         struct tpacket_stats st;
1414
1415         if (level != SOL_PACKET)
1416                 return -ENOPROTOOPT;
1417
1418         if (get_user(len, optlen))
1419                 return -EFAULT;
1420
1421         if (len < 0)
1422                 return -EINVAL;
1423
1424         switch(optname) {
1425         case PACKET_STATISTICS:
1426                 if (len > sizeof(struct tpacket_stats))
1427                         len = sizeof(struct tpacket_stats);
1428                 spin_lock_bh(&sk->sk_receive_queue.lock);
1429                 st = po->stats;
1430                 memset(&po->stats, 0, sizeof(st));
1431                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1432                 st.tp_packets += st.tp_drops;
1433
1434                 data = &st;
1435                 break;
1436         case PACKET_AUXDATA:
1437                 if (len > sizeof(int))
1438                         len = sizeof(int);
1439                 val = po->auxdata;
1440
1441                 data = &val;
1442                 break;
1443         case PACKET_ORIGDEV:
1444                 if (len > sizeof(int))
1445                         len = sizeof(int);
1446                 val = po->origdev;
1447
1448                 data = &val;
1449                 break;
1450         default:
1451                 return -ENOPROTOOPT;
1452         }
1453
1454         if (put_user(len, optlen))
1455                 return -EFAULT;
1456         if (copy_to_user(optval, data, len))
1457                 return -EFAULT;
1458         return 0;
1459 }
1460
1461
1462 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1463 {
1464         struct sock *sk;
1465         struct hlist_node *node;
1466         struct net_device *dev = data;
1467
1468         read_lock(&packet_sklist_lock);
1469         sk_for_each(sk, node, &packet_sklist) {
1470                 struct packet_sock *po = pkt_sk(sk);
1471
1472                 switch (msg) {
1473                 case NETDEV_UNREGISTER:
1474                         if (po->mclist)
1475                                 packet_dev_mclist(dev, po->mclist, -1);
1476                         /* fallthrough */
1477
1478                 case NETDEV_DOWN:
1479                         if (dev->ifindex == po->ifindex) {
1480                                 spin_lock(&po->bind_lock);
1481                                 if (po->running) {
1482                                         __dev_remove_pack(&po->prot_hook);
1483                                         __sock_put(sk);
1484                                         po->running = 0;
1485                                         sk->sk_err = ENETDOWN;
1486                                         if (!sock_flag(sk, SOCK_DEAD))
1487                                                 sk->sk_error_report(sk);
1488                                 }
1489                                 if (msg == NETDEV_UNREGISTER) {
1490                                         po->ifindex = -1;
1491                                         po->prot_hook.dev = NULL;
1492                                 }
1493                                 spin_unlock(&po->bind_lock);
1494                         }
1495                         break;
1496                 case NETDEV_UP:
1497                         spin_lock(&po->bind_lock);
1498                         if (dev->ifindex == po->ifindex && po->num &&
1499                             !po->running) {
1500                                 dev_add_pack(&po->prot_hook);
1501                                 sock_hold(sk);
1502                                 po->running = 1;
1503                         }
1504                         spin_unlock(&po->bind_lock);
1505                         break;
1506                 }
1507         }
1508         read_unlock(&packet_sklist_lock);
1509         return NOTIFY_DONE;
1510 }
1511
1512
1513 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1514                         unsigned long arg)
1515 {
1516         struct sock *sk = sock->sk;
1517
1518         switch(cmd) {
1519                 case SIOCOUTQ:
1520                 {
1521                         int amount = atomic_read(&sk->sk_wmem_alloc);
1522                         return put_user(amount, (int __user *)arg);
1523                 }
1524                 case SIOCINQ:
1525                 {
1526                         struct sk_buff *skb;
1527                         int amount = 0;
1528
1529                         spin_lock_bh(&sk->sk_receive_queue.lock);
1530                         skb = skb_peek(&sk->sk_receive_queue);
1531                         if (skb)
1532                                 amount = skb->len;
1533                         spin_unlock_bh(&sk->sk_receive_queue.lock);
1534                         return put_user(amount, (int __user *)arg);
1535                 }
1536                 case SIOCGSTAMP:
1537                         return sock_get_timestamp(sk, (struct timeval __user *)arg);
1538                 case SIOCGSTAMPNS:
1539                         return sock_get_timestampns(sk, (struct timespec __user *)arg);
1540
1541 #ifdef CONFIG_INET
1542                 case SIOCADDRT:
1543                 case SIOCDELRT:
1544                 case SIOCDARP:
1545                 case SIOCGARP:
1546                 case SIOCSARP:
1547                 case SIOCGIFADDR:
1548                 case SIOCSIFADDR:
1549                 case SIOCGIFBRDADDR:
1550                 case SIOCSIFBRDADDR:
1551                 case SIOCGIFNETMASK:
1552                 case SIOCSIFNETMASK:
1553                 case SIOCGIFDSTADDR:
1554                 case SIOCSIFDSTADDR:
1555                 case SIOCSIFFLAGS:
1556                         return inet_dgram_ops.ioctl(sock, cmd, arg);
1557 #endif
1558
1559                 default:
1560                         return -ENOIOCTLCMD;
1561         }
1562         return 0;
1563 }
1564
1565 #ifndef CONFIG_PACKET_MMAP
1566 #define packet_mmap sock_no_mmap
1567 #define packet_poll datagram_poll
1568 #else
1569
1570 static unsigned int packet_poll(struct file * file, struct socket *sock,
1571                                 poll_table *wait)
1572 {
1573         struct sock *sk = sock->sk;
1574         struct packet_sock *po = pkt_sk(sk);
1575         unsigned int mask = datagram_poll(file, sock, wait);
1576
1577         spin_lock_bh(&sk->sk_receive_queue.lock);
1578         if (po->pg_vec) {
1579                 unsigned last = po->head ? po->head-1 : po->frame_max;
1580                 struct tpacket_hdr *h;
1581
1582                 h = packet_lookup_frame(po, last);
1583
1584                 if (h->tp_status)
1585                         mask |= POLLIN | POLLRDNORM;
1586         }
1587         spin_unlock_bh(&sk->sk_receive_queue.lock);
1588         return mask;
1589 }
1590
1591
1592 /* Dirty? Well, I still did not learn better way to account
1593  * for user mmaps.
1594  */
1595
1596 static void packet_mm_open(struct vm_area_struct *vma)
1597 {
1598         struct file *file = vma->vm_file;
1599         struct socket * sock = file->private_data;
1600         struct sock *sk = sock->sk;
1601
1602         if (sk)
1603                 atomic_inc(&pkt_sk(sk)->mapped);
1604 }
1605
1606 static void packet_mm_close(struct vm_area_struct *vma)
1607 {
1608         struct file *file = vma->vm_file;
1609         struct socket * sock = file->private_data;
1610         struct sock *sk = sock->sk;
1611
1612         if (sk)
1613                 atomic_dec(&pkt_sk(sk)->mapped);
1614 }
1615
1616 static struct vm_operations_struct packet_mmap_ops = {
1617         .open = packet_mm_open,
1618         .close =packet_mm_close,
1619 };
1620
1621 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1622 {
1623         return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1624 }
1625
/*
 * Free a block vector: every non-NULL block of the given order, then
 * the vector itself.  NULL slots are skipped, so a partially filled
 * vector can be passed.
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	int idx = 0;

	while (idx < len) {
		char *block = pg_vec[idx];

		if (likely(block))
			free_pages((unsigned long) block, order);
		idx++;
	}
	kfree(pg_vec);
}
1636
1637 static inline char *alloc_one_pg_vec_page(unsigned long order)
1638 {
1639         return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1640                                          order);
1641 }
1642
1643 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1644 {
1645         unsigned int block_nr = req->tp_block_nr;
1646         char **pg_vec;
1647         int i;
1648
1649         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1650         if (unlikely(!pg_vec))
1651                 goto out;
1652
1653         for (i = 0; i < block_nr; i++) {
1654                 pg_vec[i] = alloc_one_pg_vec_page(order);
1655                 if (unlikely(!pg_vec[i]))
1656                         goto out_free_pgvec;
1657         }
1658
1659 out:
1660         return pg_vec;
1661
1662 out_free_pgvec:
1663         free_pg_vec(pg_vec, order, block_nr);
1664         pg_vec = NULL;
1665         goto out;
1666 }
1667
1668 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1669 {
1670         char **pg_vec = NULL;
1671         struct packet_sock *po = pkt_sk(sk);
1672         int was_running, order = 0;
1673         __be16 num;
1674         int err = 0;
1675
1676         if (req->tp_block_nr) {
1677                 int i, l;
1678
1679                 /* Sanity tests and some calculations */
1680
1681                 if (unlikely(po->pg_vec))
1682                         return -EBUSY;
1683
1684                 if (unlikely((int)req->tp_block_size <= 0))
1685                         return -EINVAL;
1686                 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
1687                         return -EINVAL;
1688                 if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
1689                         return -EINVAL;
1690                 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
1691                         return -EINVAL;
1692
1693                 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1694                 if (unlikely(po->frames_per_block <= 0))
1695                         return -EINVAL;
1696                 if (unlikely((po->frames_per_block * req->tp_block_nr) !=
1697                              req->tp_frame_nr))
1698                         return -EINVAL;
1699
1700                 err = -ENOMEM;
1701                 order = get_order(req->tp_block_size);
1702                 pg_vec = alloc_pg_vec(req, order);
1703                 if (unlikely(!pg_vec))
1704                         goto out;
1705
1706                 l = 0;
1707                 for (i = 0; i < req->tp_block_nr; i++) {
1708                         char *ptr = pg_vec[i];
1709                         struct tpacket_hdr *header;
1710                         int k;
1711
1712                         for (k = 0; k < po->frames_per_block; k++) {
1713                                 header = (struct tpacket_hdr *) ptr;
1714                                 header->tp_status = TP_STATUS_KERNEL;
1715                                 ptr += req->tp_frame_size;
1716                         }
1717                 }
1718                 /* Done */
1719         } else {
1720                 if (unlikely(req->tp_frame_nr))
1721                         return -EINVAL;
1722         }
1723
1724         lock_sock(sk);
1725
1726         /* Detach socket from network */
1727         spin_lock(&po->bind_lock);
1728         was_running = po->running;
1729         num = po->num;
1730         if (was_running) {
1731                 __dev_remove_pack(&po->prot_hook);
1732                 po->num = 0;
1733                 po->running = 0;
1734                 __sock_put(sk);
1735         }
1736         spin_unlock(&po->bind_lock);
1737
1738         synchronize_net();
1739
1740         err = -EBUSY;
1741         if (closing || atomic_read(&po->mapped) == 0) {
1742                 err = 0;
1743 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
1744
1745                 spin_lock_bh(&sk->sk_receive_queue.lock);
1746                 pg_vec = XC(po->pg_vec, pg_vec);
1747                 po->frame_max = (req->tp_frame_nr - 1);
1748                 po->head = 0;
1749                 po->frame_size = req->tp_frame_size;
1750                 spin_unlock_bh(&sk->sk_receive_queue.lock);
1751
1752                 order = XC(po->pg_vec_order, order);
1753                 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1754
1755                 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
1756                 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1757                 skb_queue_purge(&sk->sk_receive_queue);
1758 #undef XC
1759                 if (atomic_read(&po->mapped))
1760                         printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
1761         }
1762
1763         spin_lock(&po->bind_lock);
1764         if (was_running && !po->running) {
1765                 sock_hold(sk);
1766                 po->running = 1;
1767                 po->num = num;
1768                 dev_add_pack(&po->prot_hook);
1769         }
1770         spin_unlock(&po->bind_lock);
1771
1772         release_sock(sk);
1773
1774         if (pg_vec)
1775                 free_pg_vec(pg_vec, order, req->tp_block_nr);
1776 out:
1777         return err;
1778 }
1779
1780 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1781 {
1782         struct sock *sk = sock->sk;
1783         struct packet_sock *po = pkt_sk(sk);
1784         unsigned long size;
1785         unsigned long start;
1786         int err = -EINVAL;
1787         int i;
1788
1789         if (vma->vm_pgoff)
1790                 return -EINVAL;
1791
1792         size = vma->vm_end - vma->vm_start;
1793
1794         lock_sock(sk);
1795         if (po->pg_vec == NULL)
1796                 goto out;
1797         if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1798                 goto out;
1799
1800         start = vma->vm_start;
1801         for (i = 0; i < po->pg_vec_len; i++) {
1802                 struct page *page = virt_to_page(po->pg_vec[i]);
1803                 int pg_num;
1804
1805                 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1806                         err = vm_insert_page(vma, start, page);
1807                         if (unlikely(err))
1808                                 goto out;
1809                         start += PAGE_SIZE;
1810                 }
1811         }
1812         atomic_inc(&po->mapped);
1813         vma->vm_ops = &packet_mmap_ops;
1814         err = 0;
1815
1816 out:
1817         release_sock(sk);
1818         return err;
1819 }
1820 #endif
1821
1822
1823 static const struct proto_ops packet_ops_spkt = {
1824         .family =       PF_PACKET,
1825         .owner =        THIS_MODULE,
1826         .release =      packet_release,
1827         .bind =         packet_bind_spkt,
1828         .connect =      sock_no_connect,
1829         .socketpair =   sock_no_socketpair,
1830         .accept =       sock_no_accept,
1831         .getname =      packet_getname_spkt,
1832         .poll =         datagram_poll,
1833         .ioctl =        packet_ioctl,
1834         .listen =       sock_no_listen,
1835         .shutdown =     sock_no_shutdown,
1836         .setsockopt =   sock_no_setsockopt,
1837         .getsockopt =   sock_no_getsockopt,
1838         .sendmsg =      packet_sendmsg_spkt,
1839         .recvmsg =      packet_recvmsg,
1840         .mmap =         sock_no_mmap,
1841         .sendpage =     sock_no_sendpage,
1842 };
1843
1844 static const struct proto_ops packet_ops = {
1845         .family =       PF_PACKET,
1846         .owner =        THIS_MODULE,
1847         .release =      packet_release,
1848         .bind =         packet_bind,
1849         .connect =      sock_no_connect,
1850         .socketpair =   sock_no_socketpair,
1851         .accept =       sock_no_accept,
1852         .getname =      packet_getname,
1853         .poll =         packet_poll,
1854         .ioctl =        packet_ioctl,
1855         .listen =       sock_no_listen,
1856         .shutdown =     sock_no_shutdown,
1857         .setsockopt =   packet_setsockopt,
1858         .getsockopt =   packet_getsockopt,
1859         .sendmsg =      packet_sendmsg,
1860         .recvmsg =      packet_recvmsg,
1861         .mmap =         packet_mmap,
1862         .sendpage =     sock_no_sendpage,
1863 };
1864
1865 static struct net_proto_family packet_family_ops = {
1866         .family =       PF_PACKET,
1867         .create =       packet_create,
1868         .owner  =       THIS_MODULE,
1869 };
1870
1871 static struct notifier_block packet_netdev_notifier = {
1872         .notifier_call =packet_notifier,
1873 };
1874
1875 #ifdef CONFIG_PROC_FS
1876 static inline struct sock *packet_seq_idx(loff_t off)
1877 {
1878         struct sock *s;
1879         struct hlist_node *node;
1880
1881         sk_for_each(s, node, &packet_sklist) {
1882                 if (!off--)
1883                         return s;
1884         }
1885         return NULL;
1886 }
1887
1888 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1889 {
1890         read_lock(&packet_sklist_lock);
1891         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1892 }
1893
1894 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1895 {
1896         ++*pos;
1897         return  (v == SEQ_START_TOKEN)
1898                 ? sk_head(&packet_sklist)
1899                 : sk_next((struct sock*)v) ;
1900 }
1901
1902 static void packet_seq_stop(struct seq_file *seq, void *v)
1903 {
1904         read_unlock(&packet_sklist_lock);
1905 }
1906
1907 static int packet_seq_show(struct seq_file *seq, void *v)
1908 {
1909         if (v == SEQ_START_TOKEN)
1910                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1911         else {
1912                 struct sock *s = v;
1913                 const struct packet_sock *po = pkt_sk(s);
1914
1915                 seq_printf(seq,
1916                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1917                            s,
1918                            atomic_read(&s->sk_refcnt),
1919                            s->sk_type,
1920                            ntohs(po->num),
1921                            po->ifindex,
1922                            po->running,
1923                            atomic_read(&s->sk_rmem_alloc),
1924                            sock_i_uid(s),
1925                            sock_i_ino(s) );
1926         }
1927
1928         return 0;
1929 }
1930
1931 static const struct seq_operations packet_seq_ops = {
1932         .start  = packet_seq_start,
1933         .next   = packet_seq_next,
1934         .stop   = packet_seq_stop,
1935         .show   = packet_seq_show,
1936 };
1937
1938 static int packet_seq_open(struct inode *inode, struct file *file)
1939 {
1940         return seq_open(file, &packet_seq_ops);
1941 }
1942
1943 static const struct file_operations packet_seq_fops = {
1944         .owner          = THIS_MODULE,
1945         .open           = packet_seq_open,
1946         .read           = seq_read,
1947         .llseek         = seq_lseek,
1948         .release        = seq_release,
1949 };
1950
1951 #endif
1952
1953 static void __exit packet_exit(void)
1954 {
1955         proc_net_remove(&init_net, "packet");
1956         unregister_netdevice_notifier(&packet_netdev_notifier);
1957         sock_unregister(PF_PACKET);
1958         proto_unregister(&packet_proto);
1959 }
1960
1961 static int __init packet_init(void)
1962 {
1963         int rc = proto_register(&packet_proto, 0);
1964
1965         if (rc != 0)
1966                 goto out;
1967
1968         sock_register(&packet_family_ops);
1969         register_netdevice_notifier(&packet_netdev_notifier);
1970         proc_net_fops_create(&init_net, "packet", 0, &packet_seq_fops);
1971 out:
1972         return rc;
1973 }
1974
/*
 * Module boilerplate: packet_init()/packet_exit() are the load and
 * unload entry points and the code is licensed under the GPL.
 * NOTE(review): MODULE_ALIAS_NETPROTO(PF_PACKET) presumably lets the
 * module be loaded on demand when a PF_PACKET socket is requested --
 * confirm against the macro's definition.
 */
module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_PACKET);