[AF_PACKET]: Kill bogus CONFIG_PACKET_MULTICAST
[safe/jmp/linux-2.6] / net / packet / af_packet.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              PACKET - implements raw packet sockets.
7  *
8  * Version:     $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
13  *
14  * Fixes:
15  *              Alan Cox        :       verify_area() now used correctly
16  *              Alan Cox        :       new skbuff lists, look ma no backlogs!
17  *              Alan Cox        :       tidied skbuff lists.
18  *              Alan Cox        :       Now uses generic datagram routines I
19  *                                      added. Also fixed the peek/read crash
20  *                                      from all old Linux datagram code.
21  *              Alan Cox        :       Uses the improved datagram code.
22  *              Alan Cox        :       Added NULL's for socket options.
23  *              Alan Cox        :       Re-commented the code.
24  *              Alan Cox        :       Use new kernel side addressing
25  *              Rob Janssen     :       Correct MTU usage.
26  *              Dave Platt      :       Counter leaks caused by incorrect
27  *                                      interrupt locking and some slightly
28  *                                      dubious gcc output. Can you read
29  *                                      compiler: it said _VOLATILE_
30  *      Richard Kooijman        :       Timestamp fixes.
31  *              Alan Cox        :       New buffers. Use sk->mac.raw.
32  *              Alan Cox        :       sendmsg/recvmsg support.
33  *              Alan Cox        :       Protocol setting support
34  *      Alexey Kuznetsov        :       Untied from IPv4 stack.
35  *      Cyrus Durgin            :       Fixed kerneld for kmod.
36  *      Michal Ostrowski        :       Module initialization cleanup.
37  *         Ulises Alonso        :       Frame number limit removal and
38  *                                      packet_set_ring memory leak.
39  *              Eric Biederman  :       Allow for > 8 byte hardware addresses.
40  *                                      The convention is that longer addresses
41  *                                      will simply extend the hardware address
42  *                                      byte arrays at the end of sockaddr_ll
43  *                                      and packet_mreq.
44  *
45  *              This program is free software; you can redistribute it and/or
46  *              modify it under the terms of the GNU General Public License
47  *              as published by the Free Software Foundation; either version
48  *              2 of the License, or (at your option) any later version.
49  *
50  */
51
52 #include <linux/types.h>
53 #include <linux/mm.h>
54 #include <linux/capability.h>
55 #include <linux/fcntl.h>
56 #include <linux/socket.h>
57 #include <linux/in.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/if_packet.h>
61 #include <linux/wireless.h>
62 #include <linux/kernel.h>
63 #include <linux/kmod.h>
64 #include <net/ip.h>
65 #include <net/protocol.h>
66 #include <linux/skbuff.h>
67 #include <net/sock.h>
68 #include <linux/errno.h>
69 #include <linux/timer.h>
70 #include <asm/system.h>
71 #include <asm/uaccess.h>
72 #include <asm/ioctls.h>
73 #include <asm/page.h>
74 #include <asm/cacheflush.h>
75 #include <asm/io.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
78 #include <linux/poll.h>
79 #include <linux/module.h>
80 #include <linux/init.h>
81
82 #ifdef CONFIG_INET
83 #include <net/inet_common.h>
84 #endif
85
86 #define CONFIG_SOCK_PACKET      1
87
88 /*
89    Assumptions:
90    - if device has no dev->hard_header routine, it adds and removes ll header
91      inside itself. In this case ll header is invisible outside of device,
92      but higher levels still should reserve dev->hard_header_len.
     Some devices are clever enough to reallocate the skb when the header
     will not fit into the reserved space (tunnel); other ones are silly
     (PPP).
96    - packet socket receives packets with pulled ll header,
97      so that SOCK_RAW should push it back.
98
99 On receive:
100 -----------
101
102 Incoming, dev->hard_header!=NULL
103    mac_header -> ll header
104    data       -> data
105
106 Outgoing, dev->hard_header!=NULL
107    mac_header -> ll header
108    data       -> ll header
109
110 Incoming, dev->hard_header==NULL
   mac_header -> UNKNOWN position. It is very likely that it points to the ll
                 header.  PPP does this, which is wrong, because it introduces
                 asymmetry between the rx and tx paths.
   data       -> data
115
116 Outgoing, dev->hard_header==NULL
117    mac_header -> data. ll header is still not built!
118    data       -> data
119
120 Resume
121   If dev->hard_header==NULL we are unlikely to restore sensible ll header.
122
123
124 On transmit:
125 ------------
126
127 dev->hard_header != NULL
128    mac_header -> ll header
129    data       -> ll header
130
131 dev->hard_header == NULL (ll header is added by device, we cannot control it)
132    mac_header -> data
133    data       -> data
134
   We should set nh.raw on output to the correct position,
   the packet classifier depends on it.
137  */
138
139 /* List of all packet sockets. */
140 static HLIST_HEAD(packet_sklist);
141 static DEFINE_RWLOCK(packet_sklist_lock);
142
143 static atomic_t packet_socks_nr;
144
145
146 /* Private packet socket structures. */
147
/* One multicast/promiscuous membership held by a packet socket on a
 * device (presumably set up via the PACKET_ADD_MEMBERSHIP socket
 * option — TODO confirm against the setsockopt path).  Chained off
 * packet_sock->mclist and torn down by packet_flush_mclist().
 */
struct packet_mclist
{
	struct packet_mclist	*next;			/* next membership; list head in packet_sock */
	int			ifindex;		/* device the membership was taken on */
	int			count;			/* reference count for duplicate requests */
	unsigned short		type;			/* membership type (PACKET_MR_* — confirm) */
	unsigned short		alen;			/* number of valid bytes in addr[] */
	unsigned char		addr[MAX_ADDR_LEN];	/* hardware address for multicast memberships */
};
157 /* identical to struct packet_mreq except it has
158  * a longer address field.
159  */
struct packet_mreq_max
{
	int		mr_ifindex;			/* interface index the request applies to */
	unsigned short	mr_type;			/* membership type */
	unsigned short	mr_alen;			/* number of valid bytes in mr_address[] */
	unsigned char	mr_address[MAX_ADDR_LEN];	/* hardware address (extended vs. packet_mreq) */
};
167
168 #ifdef CONFIG_PACKET_MMAP
169 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
170 #endif
171
172 static void packet_flush_mclist(struct sock *sk);
173
/* Per-socket state of an AF_PACKET socket.  Recovered from a generic
 * struct sock by pkt_sk(), which is only valid because struct sock is
 * the first member.
 */
struct packet_sock {
	/* struct sock has to be the first member of packet_sock */
	struct sock		sk;
	struct tpacket_stats	stats;		/* tp_packets/tp_drops; guarded by sk_receive_queue.lock */
#ifdef CONFIG_PACKET_MMAP
	char *			*pg_vec;	/* rx ring: array of contiguous blocks holding frames */
	unsigned int		head;		/* index of the next ring frame to fill */
	unsigned int		frames_per_block; /* frames laid out back to back per block */
	unsigned int		frame_size;	/* bytes per ring frame */
	unsigned int		frame_max;	/* highest valid frame index (head wraps past it) */
	int			copy_thresh;	/* if set, keep a full copy of truncated packets */
#endif
	struct packet_type	prot_hook;	/* our entry in the device packet handler list */
	spinlock_t		bind_lock;	/* protects running/num/prot_hook registration */
	unsigned int		running:1,	/* prot_hook is attached*/
				auxdata:1,	/* PACKET_AUXDATA requested — usage not in this chunk, confirm */
				origdev:1;	/* report orig_dev->ifindex instead of dev->ifindex */
	int			ifindex;	/* bound device		*/
	__be16			num;		/* protocol number the socket is bound to */
	struct packet_mclist	*mclist;	/* multicast memberships (see struct packet_mclist) */
#ifdef CONFIG_PACKET_MMAP
	atomic_t		mapped;		/* count of live user mappings of the ring */
	unsigned int		pg_vec_order;	/* page allocation order of each pg_vec block */
	unsigned int		pg_vec_pages;	/* pages per pg_vec block */
	unsigned int		pg_vec_len;	/* number of blocks in pg_vec */
#endif
};
201
/* Per-packet scratch state kept in skb->cb while the skb sits on the
 * receive queue: the original (pre-trim) length and the address that
 * recvmsg() will hand back to user space.
 */
struct packet_skb_cb {
	unsigned int origlen;		/* skb->len before pskb_trim() to snaplen */
	union {
		struct sockaddr_pkt pkt;	/* SOCK_PACKET sockets */
		struct sockaddr_ll ll;		/* AF_PACKET sockets */
	} sa;
};

/* Overlays skb->cb; the fit (including extended hw addresses) is
 * checked by the BUILD_BUG_ON in packet_rcv(). */
#define PACKET_SKB_CB(__skb)	((struct packet_skb_cb *)((__skb)->cb))
211
212 #ifdef CONFIG_PACKET_MMAP
213
214 static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
215 {
216         unsigned int pg_vec_pos, frame_offset;
217
218         pg_vec_pos = position / po->frames_per_block;
219         frame_offset = position % po->frames_per_block;
220
221         return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
222 }
223 #endif
224
/* Convert a generic struct sock into its containing packet_sock.
 * The plain cast is valid only because struct sock is the first
 * member of struct packet_sock (see the comment on that struct).
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	return (struct packet_sock *)sk;
}
229
230 static void packet_sock_destruct(struct sock *sk)
231 {
232         BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
233         BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
234
235         if (!sock_flag(sk, SOCK_DEAD)) {
236                 printk("Attempt to release alive packet socket: %p\n", sk);
237                 return;
238         }
239
240         atomic_dec(&packet_socks_nr);
241 #ifdef PACKET_REFCNT_DEBUG
242         printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
243 #endif
244 }
245
246
247 static const struct proto_ops packet_ops;
248
249 #ifdef CONFIG_SOCK_PACKET
250 static const struct proto_ops packet_ops_spkt;
251
/*
 *	Receive hook for SOCK_PACKET sockets: every frame seen on the
 *	device is queued to the owning socket.  Always returns 0; the
 *	skb is either queued or freed here.
 */
static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev,  struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_pkt *spkt;

	/*
	 *	When we registered the protocol we saved the socket in the data
	 *	field for just this event.
	 */

	sk = pt->af_packet_priv;

	/*
	 *	Yank back the headers [hope the device set this
	 *	right or kerboom...]
	 *
	 *	Incoming packets have ll header pulled,
	 *	push it back.
	 *
	 *	For outgoing ones skb->data == skb_mac_header(skb)
	 *	so that this procedure is noop.
	 */

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto out;

	/* Get a private copy if the skb is shared; we are about to
	 * mangle skb->cb, data and dst below. */
	if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
		goto oom;

	/* drop any routing info */
	dst_release(skb->dst);
	skb->dst = NULL;

	/* drop conntrack reference */
	nf_reset(skb);

	/* Address for recvmsg() lives in skb->cb; safe to write only
	 * after skb_share_check() above gave us an exclusive skb. */
	spkt = &PACKET_SKB_CB(skb)->sa.pkt;

	skb_push(skb, skb->data - skb_mac_header(skb));

	/*
	 *	The SOCK_PACKET socket receives _all_ frames.
	 */

	spkt->spkt_family = dev->type;
	strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
	spkt->spkt_protocol = skb->protocol;

	/*
	 *	Charge the memory to the socket. This is done specifically
	 *	to prevent sockets using all the memory up.
	 */

	if (sock_queue_rcv_skb(sk,skb) == 0)
		return 0;

out:
	kfree_skb(skb);
oom:
	return 0;
}
313
314
315 /*
316  *      Output a raw packet to a device layer. This bypasses all the other
317  *      protocol layers and you must therefore supply it with a complete frame
318  */
319
static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
			       struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto=0;	/* on-wire protocol; 0 unless caller supplied a full sockaddr_pkt */
	int err;

	/*
	 *	Get and verify the address.
	 */

	if (saddr)
	{
		if (msg->msg_namelen < sizeof(struct sockaddr))
			return(-EINVAL);
		if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
			proto=saddr->spkt_protocol;
	}
	else
		return(-ENOTCONN);	/* SOCK_PACKET must be sent giving an address */

	/*
	 *	Find the device first to size check it
	 */

	/* Force NUL termination of the device name; 13 is presumably
	 * sizeof(spkt_device) - 1 — TODO confirm against sockaddr_pkt
	 * and replace the magic number. */
	saddr->spkt_device[13] = 0;
	dev = dev_get_by_name(saddr->spkt_device);
	err = -ENODEV;
	if (dev == NULL)
		goto out_unlock;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	/*
	 *	You may not queue a frame bigger than the mtu. This is the lowest level
	 *	raw protocol and you must do your own fragmentation at this level.
	 */

	err = -EMSGSIZE;
	if (len > dev->mtu + dev->hard_header_len)
		goto out_unlock;

	err = -ENOBUFS;
	skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);

	/*
	 *	If the write buffer is full, then tough. At this level the user gets to
	 *	deal with the problem - do your own algorithmic backoffs. That's far
	 *	more flexible.
	 */

	if (skb == NULL)
		goto out_unlock;

	/*
	 *	Fill it in
	 */

	/* FIXME: Save some space for broken drivers that write a
	 * hard header at transmission time by themselves. PPP is the
	 * notable one here. This should really be fixed at the driver level.
	 */
	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	/* Try to align data part correctly */
	if (dev->hard_header) {
		/* Rewind data/tail by the header length: the caller
		 * supplies the complete frame including the ll header,
		 * so the copy below must start at the header position. */
		skb->data -= dev->hard_header_len;
		skb->tail -= dev->hard_header_len;
		if (len < dev->hard_header_len)
			skb_reset_network_header(skb);
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;
	if (err)
		goto out_free;

	/*
	 *	Now send it
	 */

	dev_queue_xmit(skb);
	dev_put(dev);
	return(len);

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
	return err;
}
421 #endif
422
423 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
424                                       unsigned int res)
425 {
426         struct sk_filter *filter;
427
428         rcu_read_lock_bh();
429         filter = rcu_dereference(sk->sk_filter);
430         if (filter != NULL)
431                 res = sk_run_filter(skb, filter->insns, filter->len);
432         rcu_read_unlock_bh();
433
434         return res;
435 }
436
437 /*
438    This function makes lazy skb cloning in hope that most of packets
439    are discarded by BPF.
440
441    Note tricky part: we DO mangle shared skb! skb->data, skb->len
442    and skb->cb are mangled. It works because (and until) packets
443    falling here are owned by current CPU. Output packets are cloned
   by dev_queue_xmit_nit(), input packets are processed by net_bh
   sequentially, so that if we return the skb to its original state on exit,
   we will not harm anyone.
447  */
448
/* Receive hook for regular AF_PACKET sockets.  See the lazy-cloning
 * comment above: a possibly shared skb is mangled in place and must be
 * restored on every drop path (skb_head/skb_len hold the originals).
 */
static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct sockaddr_ll *sll;
	struct packet_sock *po;
	u8 * skb_head = skb->data;	/* saved for restore on drop paths */
	int skb_len = skb->len;		/* saved for restore on drop paths */
	unsigned int snaplen, res;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	skb->dev = dev;

	if (dev->hard_header) {
		/* The device has an explicit notion of ll header,
		   exported to higher levels.

		   Otherwise, the device hides details of its frame
		   structure, so that the corresponding packet head is
		   never delivered to user.
		 */
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	snaplen = skb->len;

	/* BPF verdict: 0 drops, otherwise caps the bytes we keep. */
	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
	    (unsigned)sk->sk_rcvbuf)
		goto drop_n_acct;

	/* Only now that the packet will really be queued do we pay for
	 * a clone of a shared skb (the "lazy cloning" above). */
	if (skb_shared(skb)) {
		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
		if (nskb == NULL)
			goto drop_n_acct;

		if (skb_head != skb->data) {
			skb->data = skb_head;
			skb->len = skb_len;
		}
		kfree_skb(skb);
		skb = nskb;
	}

	/* The sockaddr_ll below (plus extended hw address bytes) must
	 * fit into skb->cb. */
	BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
		     sizeof(skb->cb));

	sll = &PACKET_SKB_CB(skb)->sa.ll;
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;
	sll->sll_halen = 0;

	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);

	/* Remember the pre-trim length for PACKET_AUXDATA reporting. */
	PACKET_SKB_CB(skb)->origlen = skb->len;

	if (pskb_trim(skb, snaplen))
		goto drop_n_acct;

	skb_set_owner_r(skb, sk);
	skb->dev = NULL;
	dst_release(skb->dst);
	skb->dst = NULL;

	/* drop conntrack reference */
	nf_reset(skb);

	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_packets++;
	__skb_queue_tail(&sk->sk_receive_queue, skb);
	spin_unlock(&sk->sk_receive_queue.lock);
	sk->sk_data_ready(sk, skb->len);
	return 0;

drop_n_acct:
	spin_lock(&sk->sk_receive_queue.lock);
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

drop_n_restore:
	/* Undo our mangling if others still hold a reference. */
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;
}
558
559 #ifdef CONFIG_PACKET_MMAP
/* Receive hook for mmap()ed (rx ring) AF_PACKET sockets.  Copies each
 * accepted frame into the next free ring slot and publishes it to user
 * space via tp_status.  If the slot is still owned by user space the
 * packet is accounted as a drop.
 */
static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
{
	struct sock *sk;
	struct packet_sock *po;
	struct sockaddr_ll *sll;
	struct tpacket_hdr *h;
	u8 * skb_head = skb->data;	/* saved for restore on drop paths */
	int skb_len = skb->len;		/* saved for restore on drop paths */
	unsigned int snaplen, res;
	unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
	unsigned short macoff, netoff;	/* frame offsets of mac/net headers in the slot */
	struct sk_buff *copy_skb = NULL;
	struct timeval tv;

	if (skb->pkt_type == PACKET_LOOPBACK)
		goto drop;

	sk = pt->af_packet_priv;
	po = pkt_sk(sk);

	if (dev->hard_header) {
		if (sk->sk_type != SOCK_DGRAM)
			skb_push(skb, skb->data - skb_mac_header(skb));
		else if (skb->pkt_type == PACKET_OUTGOING) {
			/* Special case: outgoing packets have ll header at head */
			skb_pull(skb, skb_network_offset(skb));
		}
	}

	/* Tell user space the hardware has not checksummed this yet. */
	if (skb->ip_summed == CHECKSUM_PARTIAL)
		status |= TP_STATUS_CSUMNOTREADY;

	snaplen = skb->len;

	res = run_filter(skb, sk, snaplen);
	if (!res)
		goto drop_n_restore;
	if (snaplen > res)
		snaplen = res;

	if (sk->sk_type == SOCK_DGRAM) {
		macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
	} else {
		unsigned maclen = skb_network_offset(skb);
		netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
		macoff = netoff - maclen;
	}

	/* Frame won't fit in a ring slot: optionally keep a full copy
	 * on the receive queue (copy_thresh), then truncate to fit. */
	if (macoff + snaplen > po->frame_size) {
		if (po->copy_thresh &&
		    atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
		    (unsigned)sk->sk_rcvbuf) {
			if (skb_shared(skb)) {
				copy_skb = skb_clone(skb, GFP_ATOMIC);
			} else {
				copy_skb = skb_get(skb);
				skb_head = skb->data;
			}
			if (copy_skb)
				skb_set_owner_r(copy_skb, sk);
		}
		snaplen = po->frame_size - macoff;
		if ((int)snaplen < 0)
			snaplen = 0;
	}

	spin_lock(&sk->sk_receive_queue.lock);
	h = packet_lookup_frame(po, po->head);

	/* Non-zero tp_status means user space still owns this slot. */
	if (h->tp_status)
		goto ring_is_full;
	po->head = po->head != po->frame_max ? po->head+1 : 0;
	po->stats.tp_packets++;
	if (copy_skb) {
		status |= TP_STATUS_COPY;
		__skb_queue_tail(&sk->sk_receive_queue, copy_skb);
	}
	if (!po->stats.tp_drops)
		status &= ~TP_STATUS_LOSING;
	spin_unlock(&sk->sk_receive_queue.lock);

	skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);

	h->tp_len = skb->len;
	h->tp_snaplen = snaplen;
	h->tp_mac = macoff;
	h->tp_net = netoff;
	if (skb->tstamp.tv64 == 0) {
		__net_timestamp(skb);
		sock_enable_timestamp(sk);
	}
	tv = ktime_to_timeval(skb->tstamp);
	h->tp_sec = tv.tv_sec;
	h->tp_usec = tv.tv_usec;

	sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
	sll->sll_halen = 0;
	if (dev->hard_header_parse)
		sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
	sll->sll_family = AF_PACKET;
	sll->sll_hatype = dev->type;
	sll->sll_protocol = skb->protocol;
	sll->sll_pkttype = skb->pkt_type;
	if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
		sll->sll_ifindex = orig_dev->ifindex;
	else
		sll->sll_ifindex = dev->ifindex;

	/* Publish the slot to user space.  NOTE(review): the barrier
	 * here comes after the tp_status store, so user space could in
	 * principle observe tp_status before the barrier takes effect —
	 * confirm whether the intended ordering is data-before-status
	 * (which would want the barrier before this store). */
	h->tp_status = status;
	smp_mb();

	{
		/* Keep the user-visible mapping coherent on D-cache
		 * aliasing architectures: flush every page the frame
		 * touches. */
		struct page *p_start, *p_end;
		u8 *h_end = (u8 *)h + macoff + snaplen - 1;

		p_start = virt_to_page(h);
		p_end = virt_to_page(h_end);
		while (p_start <= p_end) {
			flush_dcache_page(p_start);
			p_start++;
		}
	}

	sk->sk_data_ready(sk, 0);

drop_n_restore:
	/* Undo our mangling if others still hold a reference. */
	if (skb_head != skb->data && skb_shared(skb)) {
		skb->data = skb_head;
		skb->len = skb_len;
	}
drop:
	kfree_skb(skb);
	return 0;

ring_is_full:
	po->stats.tp_drops++;
	spin_unlock(&sk->sk_receive_queue.lock);

	/* Wake the reader anyway so it can drain the ring. */
	sk->sk_data_ready(sk, 0);
	if (copy_skb)
		kfree_skb(copy_skb);
	goto drop_n_restore;
}
703
704 #endif
705
706
/*
 *	Transmit path for AF_PACKET sockets.  The destination comes
 *	either from msg_name (a sockaddr_ll) or, when absent, from the
 *	device/protocol the socket is bound to.
 */
static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len)
{
	struct sock *sk = sock->sk;
	struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
	struct sk_buff *skb;
	struct net_device *dev;
	__be16 proto;
	unsigned char *addr;	/* destination hw address, NULL when unaddressed */
	int ifindex, err, reserve = 0;

	/*
	 *	Get and verify the address.
	 */

	if (saddr == NULL) {
		struct packet_sock *po = pkt_sk(sk);

		/* No address given: fall back to the bound device/protocol. */
		ifindex	= po->ifindex;
		proto	= po->num;
		addr	= NULL;
	} else {
		err = -EINVAL;
		if (msg->msg_namelen < sizeof(struct sockaddr_ll))
			goto out;
		/* The hw address must fit inside what the caller passed. */
		if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
			goto out;
		ifindex	= saddr->sll_ifindex;
		proto	= saddr->sll_protocol;
		addr	= saddr->sll_addr;
	}


	dev = dev_get_by_index(ifindex);
	err = -ENXIO;
	if (dev == NULL)
		goto out_unlock;
	/* SOCK_RAW callers supply the ll header themselves, so allow
	 * that many extra bytes beyond the MTU. */
	if (sock->type == SOCK_RAW)
		reserve = dev->hard_header_len;

	err = -ENETDOWN;
	if (!(dev->flags & IFF_UP))
		goto out_unlock;

	err = -EMSGSIZE;
	if (len > dev->mtu+reserve)
		goto out_unlock;

	skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
				msg->msg_flags & MSG_DONTWAIT, &err);
	if (skb==NULL)
		goto out_unlock;

	skb_reserve(skb, LL_RESERVED_SPACE(dev));
	skb_reset_network_header(skb);

	if (dev->hard_header) {
		int res;
		err = -EINVAL;
		res = dev->hard_header(skb, dev, ntohs(proto), addr, NULL, len);
		if (sock->type != SOCK_DGRAM) {
			/* SOCK_RAW: the user data already contains the
			 * ll header, so discard the one just built and
			 * let the copy below start at the header. */
			skb_reset_tail_pointer(skb);
			skb->len = 0;
		} else if (res < 0)
			goto out_free;
	}

	/* Returns -EFAULT on error */
	err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
	if (err)
		goto out_free;

	skb->protocol = proto;
	skb->dev = dev;
	skb->priority = sk->sk_priority;

	/*
	 *	Now send it
	 */

	err = dev_queue_xmit(skb);
	if (err > 0 && (err = net_xmit_errno(err)) != 0)
		goto out_unlock;

	dev_put(dev);

	return(len);

out_free:
	kfree_skb(skb);
out_unlock:
	if (dev)
		dev_put(dev);
out:
	return err;
}
803
804 /*
805  *      Close a PACKET socket. This is fairly simple. We immediately go
806  *      to 'closed' state and remove our protocol entry in the device list.
807  */
808
static int packet_release(struct socket *sock)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po;

	if (!sk)
		return 0;

	po = pkt_sk(sk);

	/* Remove us from the global packet socket list first. */
	write_lock_bh(&packet_sklist_lock);
	sk_del_node_init(sk);
	write_unlock_bh(&packet_sklist_lock);

	/*
	 *	Unhook packet receive handler.
	 */

	if (po->running) {
		/*
		 *	Remove the protocol hook
		 */
		dev_remove_pack(&po->prot_hook);
		po->running = 0;
		po->num = 0;
		/* Drop the reference the prot_hook held (see packet_do_bind). */
		__sock_put(sk);
	}

	packet_flush_mclist(sk);

#ifdef CONFIG_PACKET_MMAP
	/* A zeroed tpacket_req tells packet_set_ring to tear the ring down. */
	if (po->pg_vec) {
		struct tpacket_req req;
		memset(&req, 0, sizeof(req));
		packet_set_ring(sk, &req, 1);
	}
#endif

	/*
	 *	Now the socket is dead. No more input will appear.
	 */

	sock_orphan(sk);
	sock->sk = NULL;

	/* Purge queues */

	skb_queue_purge(&sk->sk_receive_queue);

	sock_put(sk);
	return 0;
}
861
862 /*
863  *      Attach a packet hook.
864  */
865
/*
 *	Attach the socket's protocol hook to @dev (or to all devices when
 *	@dev is NULL) for @protocol.  A protocol of 0 only detaches.
 *	While the hook is attached the socket holds an extra reference
 *	(sock_hold/__sock_put pairs guarded by po->running).
 */
static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
{
	struct packet_sock *po = pkt_sk(sk);
	/*
	 *	Detach an existing hook if present.
	 */

	lock_sock(sk);

	spin_lock(&po->bind_lock);
	if (po->running) {
		__sock_put(sk);
		po->running = 0;
		po->num = 0;
		/* bind_lock is dropped around dev_remove_pack() —
		 * NOTE(review): presumably because removal must not run
		 * under this spinlock; confirm against dev_remove_pack()
		 * requirements. */
		spin_unlock(&po->bind_lock);
		dev_remove_pack(&po->prot_hook);
		spin_lock(&po->bind_lock);
	}

	po->num = protocol;
	po->prot_hook.type = protocol;
	po->prot_hook.dev = dev;

	po->ifindex = dev ? dev->ifindex : 0;

	if (protocol == 0)
		goto out_unlock;

	if (dev) {
		if (dev->flags&IFF_UP) {
			dev_add_pack(&po->prot_hook);
			sock_hold(sk);
			po->running = 1;
		} else {
			/* Device is down: report ENETDOWN instead of binding. */
			sk->sk_err = ENETDOWN;
			if (!sock_flag(sk, SOCK_DEAD))
				sk->sk_error_report(sk);
		}
	} else {
		/* No device: listen on all interfaces. */
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

out_unlock:
	spin_unlock(&po->bind_lock);
	release_sock(sk);
	return 0;
}
915
916 /*
917  *      Bind a packet socket to a device
918  */
919
920 #ifdef CONFIG_SOCK_PACKET
921
922 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
923 {
924         struct sock *sk=sock->sk;
925         char name[15];
926         struct net_device *dev;
927         int err = -ENODEV;
928
929         /*
930          *      Check legality
931          */
932
933         if (addr_len != sizeof(struct sockaddr))
934                 return -EINVAL;
935         strlcpy(name,uaddr->sa_data,sizeof(name));
936
937         dev = dev_get_by_name(name);
938         if (dev) {
939                 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
940                 dev_put(dev);
941         }
942         return err;
943 }
944 #endif
945
946 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
947 {
948         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
949         struct sock *sk=sock->sk;
950         struct net_device *dev = NULL;
951         int err;
952
953
954         /*
955          *      Check legality
956          */
957
958         if (addr_len < sizeof(struct sockaddr_ll))
959                 return -EINVAL;
960         if (sll->sll_family != AF_PACKET)
961                 return -EINVAL;
962
963         if (sll->sll_ifindex) {
964                 err = -ENODEV;
965                 dev = dev_get_by_index(sll->sll_ifindex);
966                 if (dev == NULL)
967                         goto out;
968         }
969         err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
970         if (dev)
971                 dev_put(dev);
972
973 out:
974         return err;
975 }
976
/* Proto descriptor handed to sk_alloc(); obj_size tells the core how much
 * to allocate per socket so each sock is really a struct packet_sock. */
static struct proto packet_proto = {
	.name     = "PACKET",
	.owner    = THIS_MODULE,
	.obj_size = sizeof(struct packet_sock),
};
982
/*
 *	Create a packet of type SOCK_PACKET.
 *
 *	socket(PF_PACKET, ...) backend.  Accepts SOCK_RAW and SOCK_DGRAM
 *	(plus the obsolete SOCK_PACKET when CONFIG_SOCK_PACKET is set) and
 *	requires CAP_NET_RAW.  @protocol is a network-byte-order ethertype;
 *	if non-zero, the protocol hook is registered immediately so the
 *	socket starts receiving without an explicit bind().
 */

static int packet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct packet_sock *po;
	__be16 proto = (__force __be16)protocol; /* weird, but documented */
	int err;

	if (!capable(CAP_NET_RAW))
		return -EPERM;
	if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW
#ifdef CONFIG_SOCK_PACKET
	    && sock->type != SOCK_PACKET
#endif
	    )
		return -ESOCKTNOSUPPORT;

	sock->state = SS_UNCONNECTED;

	err = -ENOBUFS;
	sk = sk_alloc(PF_PACKET, GFP_KERNEL, &packet_proto, 1);
	if (sk == NULL)
		goto out;

	/* SOCK_PACKET sockets get the legacy ops/receive path. */
	sock->ops = &packet_ops;
#ifdef CONFIG_SOCK_PACKET
	if (sock->type == SOCK_PACKET)
		sock->ops = &packet_ops_spkt;
#endif
	sock_init_data(sock, sk);

	po = pkt_sk(sk);
	sk->sk_family = PF_PACKET;
	po->num = proto;

	sk->sk_destruct = packet_sock_destruct;
	atomic_inc(&packet_socks_nr);

	/*
	 *	Attach a protocol block
	 */

	spin_lock_init(&po->bind_lock);
	po->prot_hook.func = packet_rcv;
#ifdef CONFIG_SOCK_PACKET
	if (sock->type == SOCK_PACKET)
		po->prot_hook.func = packet_rcv_spkt;
#endif
	po->prot_hook.af_packet_priv = sk;

	/* Non-zero protocol: hook up right away.  The extra sock_hold()
	 * is dropped when the hook is unregistered (po->running -> 0). */
	if (proto) {
		po->prot_hook.type = proto;
		dev_add_pack(&po->prot_hook);
		sock_hold(sk);
		po->running = 1;
	}

	write_lock_bh(&packet_sklist_lock);
	sk_add_node(sk, &packet_sklist);
	write_unlock_bh(&packet_sklist_lock);
	return(0);
out:
	return err;
}
1050
/*
 *	Pull a packet from our receive queue and hand it to the user.
 *	If necessary we block.
 *
 *	Returns the number of bytes copied (or the full packet length if
 *	MSG_TRUNC was requested), or a negative errno.
 */

static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
			  struct msghdr *msg, size_t len, int flags)
{
	struct sock *sk = sock->sk;
	struct sk_buff *skb;
	int copied, err;
	struct sockaddr_ll *sll;

	err = -EINVAL;
	if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
		goto out;

#if 0
	/* What error should we return now? EUNATTACH? */
	if (pkt_sk(sk)->ifindex < 0)
		return -ENODEV;
#endif

	/*
	 *	Call the generic datagram receiver. This handles all sorts
	 *	of horrible races and re-entrancy so we can forget about it
	 *	in the protocol layers.
	 *
	 *	Now it will return ENETDOWN, if device have just gone down,
	 *	but then it will block.
	 */

	skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);

	/*
	 *	An error occurred so return it. Because skb_recv_datagram()
	 *	handles the blocking we don't see and worry about blocking
	 *	retries.
	 */

	if (skb == NULL)
		goto out;

	/*
	 *	If the address length field is there to be filled in, we fill
	 *	it in now.
	 */

	/* The receive path stashed the link-level source address in the
	 * skb control buffer; its variable length decides msg_namelen. */
	sll = &PACKET_SKB_CB(skb)->sa.ll;
	if (sock->type == SOCK_PACKET)
		msg->msg_namelen = sizeof(struct sockaddr_pkt);
	else
		msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);

	/*
	 *	You lose any data beyond the buffer you gave. If it worries a
	 *	user program they can ask the device for its MTU anyway.
	 */

	copied = skb->len;
	if (copied > len)
	{
		copied=len;
		msg->msg_flags|=MSG_TRUNC;
	}

	err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
	if (err)
		goto out_free;

	sock_recv_timestamp(msg, sk, skb);

	if (msg->msg_name)
		memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
		       msg->msg_namelen);

	/* PACKET_AUXDATA was enabled: deliver per-packet metadata as an
	 * ancillary cmsg (status, original length, snap length, offsets). */
	if (pkt_sk(sk)->auxdata) {
		struct tpacket_auxdata aux;

		aux.tp_status = TP_STATUS_USER;
		if (skb->ip_summed == CHECKSUM_PARTIAL)
			aux.tp_status |= TP_STATUS_CSUMNOTREADY;
		aux.tp_len = PACKET_SKB_CB(skb)->origlen;
		aux.tp_snaplen = skb->len;
		aux.tp_mac = 0;
		aux.tp_net = skb_network_offset(skb);

		put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
	}

	/*
	 *	Free or return the buffer as appropriate. Again this
	 *	hides all the races and re-entrancy issues from us.
	 */
	err = (flags&MSG_TRUNC) ? skb->len : copied;

out_free:
	skb_free_datagram(sk, skb);
out:
	return err;
}
1152
1153 #ifdef CONFIG_SOCK_PACKET
1154 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1155                                int *uaddr_len, int peer)
1156 {
1157         struct net_device *dev;
1158         struct sock *sk = sock->sk;
1159
1160         if (peer)
1161                 return -EOPNOTSUPP;
1162
1163         uaddr->sa_family = AF_PACKET;
1164         dev = dev_get_by_index(pkt_sk(sk)->ifindex);
1165         if (dev) {
1166                 strlcpy(uaddr->sa_data, dev->name, 15);
1167                 dev_put(dev);
1168         } else
1169                 memset(uaddr->sa_data, 0, 14);
1170         *uaddr_len = sizeof(*uaddr);
1171
1172         return 0;
1173 }
1174 #endif
1175
1176 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1177                           int *uaddr_len, int peer)
1178 {
1179         struct net_device *dev;
1180         struct sock *sk = sock->sk;
1181         struct packet_sock *po = pkt_sk(sk);
1182         struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1183
1184         if (peer)
1185                 return -EOPNOTSUPP;
1186
1187         sll->sll_family = AF_PACKET;
1188         sll->sll_ifindex = po->ifindex;
1189         sll->sll_protocol = po->num;
1190         dev = dev_get_by_index(po->ifindex);
1191         if (dev) {
1192                 sll->sll_hatype = dev->type;
1193                 sll->sll_halen = dev->addr_len;
1194                 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1195                 dev_put(dev);
1196         } else {
1197                 sll->sll_hatype = 0;    /* Bad: we have no ARPHRD_UNSPEC */
1198                 sll->sll_halen = 0;
1199         }
1200         *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1201
1202         return 0;
1203 }
1204
1205 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1206 {
1207         switch (i->type) {
1208         case PACKET_MR_MULTICAST:
1209                 if (what > 0)
1210                         dev_mc_add(dev, i->addr, i->alen, 0);
1211                 else
1212                         dev_mc_delete(dev, i->addr, i->alen, 0);
1213                 break;
1214         case PACKET_MR_PROMISC:
1215                 dev_set_promiscuity(dev, what);
1216                 break;
1217         case PACKET_MR_ALLMULTI:
1218                 dev_set_allmulti(dev, what);
1219                 break;
1220         default:;
1221         }
1222 }
1223
1224 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1225 {
1226         for ( ; i; i=i->next) {
1227                 if (i->ifindex == dev->ifindex)
1228                         packet_dev_mc(dev, i, what);
1229         }
1230 }
1231
/*
 *	PACKET_ADD_MEMBERSHIP: join a multicast group / enable promisc or
 *	allmulti on the given interface for this socket.  Duplicate
 *	requests are reference-counted rather than re-applied.
 *	Runs under the RTNL so the device cannot disappear while the
 *	membership is being installed.
 */
static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_sock *po = pkt_sk(sk);
	struct packet_mclist *ml, *i;
	struct net_device *dev;
	int err;

	rtnl_lock();

	err = -ENODEV;
	dev = __dev_get_by_index(mreq->mr_ifindex);
	if (!dev)
		goto done;

	err = -EINVAL;
	if (mreq->mr_alen > dev->addr_len)
		goto done;

	/* Allocate before scanning so the list is not left half-updated
	 * if memory is tight. */
	err = -ENOBUFS;
	i = kmalloc(sizeof(*i), GFP_KERNEL);
	if (i == NULL)
		goto done;

	err = 0;
	/* Already a member?  Just bump the refcount. */
	for (ml = po->mclist; ml; ml = ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			ml->count++;
			/* Free the new element ... */
			kfree(i);
			goto done;
		}
	}

	i->type = mreq->mr_type;
	i->ifindex = mreq->mr_ifindex;
	i->alen = mreq->mr_alen;
	memcpy(i->addr, mreq->mr_address, i->alen);
	i->count = 1;
	i->next = po->mclist;
	po->mclist = i;
	/* Push the new membership down to the device. */
	packet_dev_mc(dev, i, +1);

done:
	rtnl_unlock();
	return err;
}
1281
/*
 *	PACKET_DROP_MEMBERSHIP: drop one reference on a matching
 *	membership entry; when the count hits zero, unlink it and revert
 *	the device state.  Returns -EADDRNOTAVAIL if no entry matches.
 */
static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
{
	struct packet_mclist *ml, **mlp;

	rtnl_lock();

	for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
		if (ml->ifindex == mreq->mr_ifindex &&
		    ml->type == mreq->mr_type &&
		    ml->alen == mreq->mr_alen &&
		    memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
			if (--ml->count == 0) {
				struct net_device *dev;
				*mlp = ml->next;
				/* Device may already be gone; only revert if
				 * it still exists. */
				dev = dev_get_by_index(ml->ifindex);
				if (dev) {
					packet_dev_mc(dev, ml, -1);
					dev_put(dev);
				}
				kfree(ml);
			}
			rtnl_unlock();
			return 0;
		}
	}
	rtnl_unlock();
	return -EADDRNOTAVAIL;
}
1310
1311 static void packet_flush_mclist(struct sock *sk)
1312 {
1313         struct packet_sock *po = pkt_sk(sk);
1314         struct packet_mclist *ml;
1315
1316         if (!po->mclist)
1317                 return;
1318
1319         rtnl_lock();
1320         while ((ml = po->mclist) != NULL) {
1321                 struct net_device *dev;
1322
1323                 po->mclist = ml->next;
1324                 if ((dev = dev_get_by_index(ml->ifindex)) != NULL) {
1325                         packet_dev_mc(dev, ml, -1);
1326                         dev_put(dev);
1327                 }
1328                 kfree(ml);
1329         }
1330         rtnl_unlock();
1331 }
1332
/*
 *	SOL_PACKET setsockopt() handler.
 *
 *	Options:
 *	  PACKET_ADD_MEMBERSHIP / PACKET_DROP_MEMBERSHIP
 *		join/leave multicast, promisc or allmulti on a device;
 *		accepts both the classic and the extended packet_mreq.
 *	  PACKET_RX_RING (CONFIG_PACKET_MMAP)
 *		configure the shared mmap()ed receive ring.
 *	  PACKET_COPY_THRESH (CONFIG_PACKET_MMAP)
 *		copy-threshold for ring receive.
 *	  PACKET_AUXDATA
 *		deliver tpacket_auxdata as a cmsg on recvmsg().
 *	  PACKET_ORIGDEV
 *		report the original ingress device.
 */
static int
packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	int ret;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	switch(optname) {
	case PACKET_ADD_MEMBERSHIP:
	case PACKET_DROP_MEMBERSHIP:
	{
		struct packet_mreq_max mreq;
		int len = optlen;
		memset(&mreq, 0, sizeof(mreq));
		if (len < sizeof(struct packet_mreq))
			return -EINVAL;
		if (len > sizeof(mreq))
			len = sizeof(mreq);
		if (copy_from_user(&mreq,optval,len))
			return -EFAULT;
		/* The hardware address must lie entirely within the bytes
		 * userspace actually supplied. */
		if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
			return -EINVAL;
		if (optname == PACKET_ADD_MEMBERSHIP)
			ret = packet_mc_add(sk, &mreq);
		else
			ret = packet_mc_drop(sk, &mreq);
		return ret;
	}

#ifdef CONFIG_PACKET_MMAP
	case PACKET_RX_RING:
	{
		struct tpacket_req req;

		if (optlen<sizeof(req))
			return -EINVAL;
		if (copy_from_user(&req,optval,sizeof(req)))
			return -EFAULT;
		return packet_set_ring(sk, &req, 0);
	}
	case PACKET_COPY_THRESH:
	{
		int val;

		if (optlen!=sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val,optval,sizeof(val)))
			return -EFAULT;

		pkt_sk(sk)->copy_thresh = val;
		return 0;
	}
#endif
	case PACKET_AUXDATA:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->auxdata = !!val;
		return 0;
	}
	case PACKET_ORIGDEV:
	{
		int val;

		if (optlen < sizeof(val))
			return -EINVAL;
		if (copy_from_user(&val, optval, sizeof(val)))
			return -EFAULT;

		po->origdev = !!val;
		return 0;
	}
	default:
		return -ENOPROTOOPT;
	}
}
1417
/*
 *	SOL_PACKET getsockopt() handler.
 *
 *	PACKET_STATISTICS returns-and-resets the packet/drop counters
 *	(read-and-clear semantics); PACKET_AUXDATA and PACKET_ORIGDEV
 *	report the corresponding boolean flags.
 */
static int packet_getsockopt(struct socket *sock, int level, int optname,
			     char __user *optval, int __user *optlen)
{
	int len;
	int val;
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	void *data;
	struct tpacket_stats st;

	if (level != SOL_PACKET)
		return -ENOPROTOOPT;

	if (get_user(len, optlen))
		return -EFAULT;

	if (len < 0)
		return -EINVAL;

	switch(optname) {
	case PACKET_STATISTICS:
		if (len > sizeof(struct tpacket_stats))
			len = sizeof(struct tpacket_stats);
		/* Snapshot and clear atomically with respect to the
		 * receive path (which updates stats under this lock). */
		spin_lock_bh(&sk->sk_receive_queue.lock);
		st = po->stats;
		memset(&po->stats, 0, sizeof(st));
		spin_unlock_bh(&sk->sk_receive_queue.lock);
		/* tp_packets is reported inclusive of drops. */
		st.tp_packets += st.tp_drops;

		data = &st;
		break;
	case PACKET_AUXDATA:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->auxdata;

		data = &val;
		break;
	case PACKET_ORIGDEV:
		if (len > sizeof(int))
			len = sizeof(int);
		val = po->origdev;

		data = &val;
		break;
	default:
		return -ENOPROTOOPT;
	}

	if (put_user(len, optlen))
		return -EFAULT;
	if (copy_to_user(optval, data, len))
		return -EFAULT;
	return 0;
}
1473
1474
/*
 *	Netdevice notifier: keep every packet socket consistent with
 *	device state changes.
 *
 *	NETDEV_UNREGISTER - revert memberships, unhook, and forget the
 *		device (ifindex/prot_hook.dev cleared under bind_lock).
 *	NETDEV_DOWN - unhook and signal ENETDOWN, but remember the
 *		binding so NETDEV_UP can rearm it.
 *	NETDEV_UP - re-register the protocol hook for sockets that were
 *		bound to this device and still have a protocol set.
 */
static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
{
	struct sock *sk;
	struct hlist_node *node;
	struct net_device *dev = data;

	read_lock(&packet_sklist_lock);
	sk_for_each(sk, node, &packet_sklist) {
		struct packet_sock *po = pkt_sk(sk);

		switch (msg) {
		case NETDEV_UNREGISTER:
			if (po->mclist)
				packet_dev_mclist(dev, po->mclist, -1);
			/* fallthrough */

		case NETDEV_DOWN:
			if (dev->ifindex == po->ifindex) {
				spin_lock(&po->bind_lock);
				if (po->running) {
					/* __sock_put pairs with the sock_hold
					 * taken when the hook was armed. */
					__dev_remove_pack(&po->prot_hook);
					__sock_put(sk);
					po->running = 0;
					sk->sk_err = ENETDOWN;
					if (!sock_flag(sk, SOCK_DEAD))
						sk->sk_error_report(sk);
				}
				if (msg == NETDEV_UNREGISTER) {
					po->ifindex = -1;
					po->prot_hook.dev = NULL;
				}
				spin_unlock(&po->bind_lock);
			}
			break;
		case NETDEV_UP:
			spin_lock(&po->bind_lock);
			if (dev->ifindex == po->ifindex && po->num &&
			    !po->running) {
				dev_add_pack(&po->prot_hook);
				sock_hold(sk);
				po->running = 1;
			}
			spin_unlock(&po->bind_lock);
			break;
		}
	}
	read_unlock(&packet_sklist_lock);
	return NOTIFY_DONE;
}
1524
1525
/*
 *	ioctl() handler for packet sockets: queue occupancy (SIOCOUTQ /
 *	SIOCINQ), receive timestamps, and - when CONFIG_INET is set -
 *	pass-through of the common interface/routing/ARP ioctls to the
 *	inet datagram implementation.
 */
static int packet_ioctl(struct socket *sock, unsigned int cmd,
			unsigned long arg)
{
	struct sock *sk = sock->sk;

	switch(cmd) {
		case SIOCOUTQ:
		{
			/* Bytes of queued, not-yet-sent data. */
			int amount = atomic_read(&sk->sk_wmem_alloc);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCINQ:
		{
			/* Size of the next packet to be read (0 if none). */
			struct sk_buff *skb;
			int amount = 0;

			spin_lock_bh(&sk->sk_receive_queue.lock);
			skb = skb_peek(&sk->sk_receive_queue);
			if (skb)
				amount = skb->len;
			spin_unlock_bh(&sk->sk_receive_queue.lock);
			return put_user(amount, (int __user *)arg);
		}
		case SIOCGSTAMP:
			return sock_get_timestamp(sk, (struct timeval __user *)arg);
		case SIOCGSTAMPNS:
			return sock_get_timestampns(sk, (struct timespec __user *)arg);

#ifdef CONFIG_INET
		case SIOCADDRT:
		case SIOCDELRT:
		case SIOCDARP:
		case SIOCGARP:
		case SIOCSARP:
		case SIOCGIFADDR:
		case SIOCSIFADDR:
		case SIOCGIFBRDADDR:
		case SIOCSIFBRDADDR:
		case SIOCGIFNETMASK:
		case SIOCSIFNETMASK:
		case SIOCGIFDSTADDR:
		case SIOCSIFDSTADDR:
		case SIOCSIFFLAGS:
			return inet_dgram_ops.ioctl(sock, cmd, arg);
#endif

		default:
			return -ENOIOCTLCMD;
	}
	/* Not reached: every case above returns. */
	return 0;
}
1577
1578 #ifndef CONFIG_PACKET_MMAP
1579 #define packet_mmap sock_no_mmap
1580 #define packet_poll datagram_poll
1581 #else
1582
/*
 *	poll() for mmap()ed packet sockets: in addition to the normal
 *	datagram readiness, report POLLIN when the most recently filled
 *	ring frame has been handed to userspace (tp_status != 0), since
 *	ring traffic never appears on the receive queue.
 */
static unsigned int packet_poll(struct file * file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned int mask = datagram_poll(file, sock, wait);

	spin_lock_bh(&sk->sk_receive_queue.lock);
	if (po->pg_vec) {
		/* Frame before po->head is the last one the kernel wrote
		 * (po->head wraps at frame_max). */
		unsigned last = po->head ? po->head-1 : po->frame_max;
		struct tpacket_hdr *h;

		h = packet_lookup_frame(po, last);

		if (h->tp_status)
			mask |= POLLIN | POLLRDNORM;
	}
	spin_unlock_bh(&sk->sk_receive_queue.lock);
	return mask;
}
1603
1604
1605 /* Dirty? Well, I still did not learn better way to account
1606  * for user mmaps.
1607  */
1608
1609 static void packet_mm_open(struct vm_area_struct *vma)
1610 {
1611         struct file *file = vma->vm_file;
1612         struct socket * sock = file->private_data;
1613         struct sock *sk = sock->sk;
1614
1615         if (sk)
1616                 atomic_inc(&pkt_sk(sk)->mapped);
1617 }
1618
1619 static void packet_mm_close(struct vm_area_struct *vma)
1620 {
1621         struct file *file = vma->vm_file;
1622         struct socket * sock = file->private_data;
1623         struct sock *sk = sock->sk;
1624
1625         if (sk)
1626                 atomic_dec(&pkt_sk(sk)->mapped);
1627 }
1628
/* VMA callbacks used by packet_mmap() to track live ring mappings. */
static struct vm_operations_struct packet_mmap_ops = {
	.open = packet_mm_open,
	.close =packet_mm_close,
};
1633
1634 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1635 {
1636         return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
1637 }
1638
/* Release a ring block vector: free each allocated block (NULL slots are
 * possible after a partially failed alloc_pg_vec()), then the vector. */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	unsigned int idx;

	for (idx = 0; idx < len; idx++) {
		if (likely(pg_vec[idx]))
			free_pages((unsigned long) pg_vec[idx], order);
	}
	kfree(pg_vec);
}
1649
1650 static inline char *alloc_one_pg_vec_page(unsigned long order)
1651 {
1652         return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1653                                          order);
1654 }
1655
1656 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1657 {
1658         unsigned int block_nr = req->tp_block_nr;
1659         char **pg_vec;
1660         int i;
1661
1662         pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1663         if (unlikely(!pg_vec))
1664                 goto out;
1665
1666         for (i = 0; i < block_nr; i++) {
1667                 pg_vec[i] = alloc_one_pg_vec_page(order);
1668                 if (unlikely(!pg_vec[i]))
1669                         goto out_free_pgvec;
1670         }
1671
1672 out:
1673         return pg_vec;
1674
1675 out_free_pgvec:
1676         free_pg_vec(pg_vec, order, block_nr);
1677         pg_vec = NULL;
1678         goto out;
1679 }
1680
/*
 *	Install (tp_block_nr != 0) or tear down (all-zero request) the
 *	mmap()ed receive ring.
 *
 *	Sequence: validate + allocate the new ring, detach the socket
 *	from the network (so no receive path touches the ring), swap the
 *	old and new rings under the receive-queue lock, then reattach.
 *	Fails with -EBUSY if the old ring is still mapped by userspace
 *	(unless we are closing).  Whatever ring is left in the local
 *	pg_vec after the swap - old or unused new - is freed at the end.
 */
static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
{
	char **pg_vec = NULL;
	struct packet_sock *po = pkt_sk(sk);
	int was_running, order = 0;
	__be16 num;
	int err = 0;

	if (req->tp_block_nr) {
		int i, l;

		/* Sanity tests and some calculations */

		if (unlikely(po->pg_vec))
			return -EBUSY;

		/* Blocks must be positive, page-aligned multiples; frames
		 * must hold at least a header and be properly aligned. */
		if (unlikely((int)req->tp_block_size <= 0))
			return -EINVAL;
		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
			return -EINVAL;
		if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
			return -EINVAL;
		if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
			return -EINVAL;

		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
		if (unlikely(po->frames_per_block <= 0))
			return -EINVAL;
		if (unlikely((po->frames_per_block * req->tp_block_nr) !=
			     req->tp_frame_nr))
			return -EINVAL;

		err = -ENOMEM;
		order = get_order(req->tp_block_size);
		pg_vec = alloc_pg_vec(req, order);
		if (unlikely(!pg_vec))
			goto out;

		/* Mark every frame as owned by the kernel (writable). */
		l = 0;
		for (i = 0; i < req->tp_block_nr; i++) {
			char *ptr = pg_vec[i];
			struct tpacket_hdr *header;
			int k;

			for (k = 0; k < po->frames_per_block; k++) {
				header = (struct tpacket_hdr *) ptr;
				header->tp_status = TP_STATUS_KERNEL;
				ptr += req->tp_frame_size;
			}
		}
		/* Done */
	} else {
		/* Teardown request must be fully zero. */
		if (unlikely(req->tp_frame_nr))
			return -EINVAL;
	}

	lock_sock(sk);

	/* Detach socket from network */
	spin_lock(&po->bind_lock);
	was_running = po->running;
	num = po->num;
	if (was_running) {
		__dev_remove_pack(&po->prot_hook);
		po->num = 0;
		po->running = 0;
		__sock_put(sk);
	}
	spin_unlock(&po->bind_lock);

	/* Wait for in-flight receive handlers to drain before swapping. */
	synchronize_net();

	err = -EBUSY;
	if (closing || atomic_read(&po->mapped) == 0) {
		err = 0;
#define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })

		/* Swap in the new ring under the queue lock; the old ring
		 * (or the unused new one on teardown) ends up in pg_vec. */
		spin_lock_bh(&sk->sk_receive_queue.lock);
		pg_vec = XC(po->pg_vec, pg_vec);
		po->frame_max = (req->tp_frame_nr - 1);
		po->head = 0;
		po->frame_size = req->tp_frame_size;
		spin_unlock_bh(&sk->sk_receive_queue.lock);

		order = XC(po->pg_vec_order, order);
		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);

		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
		skb_queue_purge(&sk->sk_receive_queue);
#undef XC
		if (atomic_read(&po->mapped))
			printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
	}

	/* Reattach the protocol hook if the socket was running before. */
	spin_lock(&po->bind_lock);
	if (was_running && !po->running) {
		sock_hold(sk);
		po->running = 1;
		po->num = num;
		dev_add_pack(&po->prot_hook);
	}
	spin_unlock(&po->bind_lock);

	release_sock(sk);

	if (pg_vec)
		free_pg_vec(pg_vec, order, req->tp_block_nr);
out:
	return err;
}
1792
/*
 *	mmap() the receive ring into userspace: the mapping must start at
 *	offset 0 and cover the whole ring exactly.  Each physical ring
 *	page is inserted into the VMA in order, and the mapping count is
 *	bumped so packet_set_ring() cannot free the pages underneath us.
 */
static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
{
	struct sock *sk = sock->sk;
	struct packet_sock *po = pkt_sk(sk);
	unsigned long size;
	unsigned long start;
	int err = -EINVAL;
	int i;

	if (vma->vm_pgoff)
		return -EINVAL;

	size = vma->vm_end - vma->vm_start;

	lock_sock(sk);
	if (po->pg_vec == NULL)
		goto out;
	/* Mapping must cover the entire ring, no more, no less. */
	if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
		goto out;

	start = vma->vm_start;
	for (i = 0; i < po->pg_vec_len; i++) {
		struct page *page = virt_to_page(po->pg_vec[i]);
		int pg_num;

		for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
			err = vm_insert_page(vma, start, page);
			if (unlikely(err))
				goto out;
			start += PAGE_SIZE;
		}
	}
	atomic_inc(&po->mapped);
	vma->vm_ops = &packet_mmap_ops;
	err = 0;

out:
	release_sock(sk);
	return err;
}
1833 #endif
1834
1835
1836 #ifdef CONFIG_SOCK_PACKET
/* proto_ops for the obsolete SOCK_PACKET type: name-based bind, no
 * setsockopt/getsockopt or mmap support. */
static const struct proto_ops packet_ops_spkt = {
	.family =	PF_PACKET,
	.owner =	THIS_MODULE,
	.release =	packet_release,
	.bind =		packet_bind_spkt,
	.connect =	sock_no_connect,
	.socketpair =	sock_no_socketpair,
	.accept =	sock_no_accept,
	.getname =	packet_getname_spkt,
	.poll =		datagram_poll,
	.ioctl =	packet_ioctl,
	.listen =	sock_no_listen,
	.shutdown =	sock_no_shutdown,
	.setsockopt =	sock_no_setsockopt,
	.getsockopt =	sock_no_getsockopt,
	.sendmsg =	packet_sendmsg_spkt,
	.recvmsg =	packet_recvmsg,
	.mmap =		sock_no_mmap,
	.sendpage =	sock_no_sendpage,
};
1857 #endif
1858
1859 static const struct proto_ops packet_ops = {
1860         .family =       PF_PACKET,
1861         .owner =        THIS_MODULE,
1862         .release =      packet_release,
1863         .bind =         packet_bind,
1864         .connect =      sock_no_connect,
1865         .socketpair =   sock_no_socketpair,
1866         .accept =       sock_no_accept,
1867         .getname =      packet_getname,
1868         .poll =         packet_poll,
1869         .ioctl =        packet_ioctl,
1870         .listen =       sock_no_listen,
1871         .shutdown =     sock_no_shutdown,
1872         .setsockopt =   packet_setsockopt,
1873         .getsockopt =   packet_getsockopt,
1874         .sendmsg =      packet_sendmsg,
1875         .recvmsg =      packet_recvmsg,
1876         .mmap =         packet_mmap,
1877         .sendpage =     sock_no_sendpage,
1878 };
1879
1880 static struct net_proto_family packet_family_ops = {
1881         .family =       PF_PACKET,
1882         .create =       packet_create,
1883         .owner  =       THIS_MODULE,
1884 };
1885
1886 static struct notifier_block packet_netdev_notifier = {
1887         .notifier_call =packet_notifier,
1888 };
1889
1890 #ifdef CONFIG_PROC_FS
1891 static inline struct sock *packet_seq_idx(loff_t off)
1892 {
1893         struct sock *s;
1894         struct hlist_node *node;
1895
1896         sk_for_each(s, node, &packet_sklist) {
1897                 if (!off--)
1898                         return s;
1899         }
1900         return NULL;
1901 }
1902
1903 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1904 {
1905         read_lock(&packet_sklist_lock);
1906         return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
1907 }
1908
1909 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1910 {
1911         ++*pos;
1912         return  (v == SEQ_START_TOKEN)
1913                 ? sk_head(&packet_sklist)
1914                 : sk_next((struct sock*)v) ;
1915 }
1916
/* seq_file stop: drop the lock taken in packet_seq_start(). */
static void packet_seq_stop(struct seq_file *seq, void *v)
{
	read_unlock(&packet_sklist_lock);
}
1921
1922 static int packet_seq_show(struct seq_file *seq, void *v)
1923 {
1924         if (v == SEQ_START_TOKEN)
1925                 seq_puts(seq, "sk       RefCnt Type Proto  Iface R Rmem   User   Inode\n");
1926         else {
1927                 struct sock *s = v;
1928                 const struct packet_sock *po = pkt_sk(s);
1929
1930                 seq_printf(seq,
1931                            "%p %-6d %-4d %04x   %-5d %1d %-6u %-6u %-6lu\n",
1932                            s,
1933                            atomic_read(&s->sk_refcnt),
1934                            s->sk_type,
1935                            ntohs(po->num),
1936                            po->ifindex,
1937                            po->running,
1938                            atomic_read(&s->sk_rmem_alloc),
1939                            sock_i_uid(s),
1940                            sock_i_ino(s) );
1941         }
1942
1943         return 0;
1944 }
1945
/* Iterator callbacks for /proc/net/packet. */
static struct seq_operations packet_seq_ops = {
	.start	= packet_seq_start,
	.next	= packet_seq_next,
	.stop	= packet_seq_stop,
	.show	= packet_seq_show,
};
1952
1953 static int packet_seq_open(struct inode *inode, struct file *file)
1954 {
1955         return seq_open(file, &packet_seq_ops);
1956 }
1957
/* file_operations for /proc/net/packet, backed by the seq_file helpers. */
static const struct file_operations packet_seq_fops = {
	.owner		= THIS_MODULE,
	.open		= packet_seq_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release,
};
1965
1966 #endif
1967
/* Module unload: tear everything down in reverse order of packet_init(). */
static void __exit packet_exit(void)
{
	proc_net_remove("packet");
	unregister_netdevice_notifier(&packet_netdev_notifier);
	sock_unregister(PF_PACKET);
	proto_unregister(&packet_proto);
}
1975
1976 static int __init packet_init(void)
1977 {
1978         int rc = proto_register(&packet_proto, 0);
1979
1980         if (rc != 0)
1981                 goto out;
1982
1983         sock_register(&packet_family_ops);
1984         register_netdevice_notifier(&packet_netdev_notifier);
1985         proc_net_fops_create("packet", 0, &packet_seq_fops);
1986 out:
1987         return rc;
1988 }
1989
module_init(packet_init);
module_exit(packet_exit);
MODULE_LICENSE("GPL");
/* Alias so socket(PF_PACKET, ...) can auto-load this module. */
MODULE_ALIAS_NETPROTO(PF_PACKET);