2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * PACKET - implements raw packet sockets.
8 * Version: $Id: af_packet.c,v 1.61 2002/02/08 03:57:19 davem Exp $
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Alan Cox, <gw4pts@gw4pts.ampr.org>
15 * Alan Cox : verify_area() now used correctly
16 * Alan Cox : new skbuff lists, look ma no backlogs!
17 * Alan Cox : tidied skbuff lists.
18 * Alan Cox : Now uses generic datagram routines I
19 * added. Also fixed the peek/read crash
20 * from all old Linux datagram code.
21 * Alan Cox : Uses the improved datagram code.
22 * Alan Cox : Added NULL's for socket options.
23 * Alan Cox : Re-commented the code.
24 * Alan Cox : Use new kernel side addressing
25 * Rob Janssen : Correct MTU usage.
26 * Dave Platt : Counter leaks caused by incorrect
27 * interrupt locking and some slightly
28 * dubious gcc output. Can you read
29 * compiler: it said _VOLATILE_
30 * Richard Kooijman : Timestamp fixes.
31 * Alan Cox : New buffers. Use sk->mac.raw.
32 * Alan Cox : sendmsg/recvmsg support.
33 * Alan Cox : Protocol setting support
34 * Alexey Kuznetsov : Untied from IPv4 stack.
35 * Cyrus Durgin : Fixed kerneld for kmod.
36 * Michal Ostrowski : Module initialization cleanup.
37 * Ulises Alonso : Frame number limit removal and
38 * packet_set_ring memory leak.
39 * Eric Biederman : Allow for > 8 byte hardware addresses.
40 * The convention is that longer addresses
41 * will simply extend the hardware address
42 * byte arrays at the end of sockaddr_ll
45 * This program is free software; you can redistribute it and/or
46 * modify it under the terms of the GNU General Public License
47 * as published by the Free Software Foundation; either version
48 * 2 of the License, or (at your option) any later version.
52 #include <linux/types.h>
54 #include <linux/capability.h>
55 #include <linux/fcntl.h>
56 #include <linux/socket.h>
58 #include <linux/inet.h>
59 #include <linux/netdevice.h>
60 #include <linux/if_packet.h>
61 #include <linux/wireless.h>
62 #include <linux/kernel.h>
63 #include <linux/kmod.h>
64 #include <net/net_namespace.h>
66 #include <net/protocol.h>
67 #include <linux/skbuff.h>
69 #include <linux/errno.h>
70 #include <linux/timer.h>
71 #include <asm/system.h>
72 #include <asm/uaccess.h>
73 #include <asm/ioctls.h>
75 #include <asm/cacheflush.h>
77 #include <linux/proc_fs.h>
78 #include <linux/seq_file.h>
79 #include <linux/poll.h>
80 #include <linux/module.h>
81 #include <linux/init.h>
84 #include <net/inet_common.h>
89 - if device has no dev->hard_header routine, it adds and removes ll header
90 inside itself. In this case ll header is invisible outside of device,
91 but higher levels still should reserve dev->hard_header_len.
92 Some devices are clever enough to reallocate the skb when the header
93 does not fit in the reserved space (tunnel); others are silly
95 - packet socket receives packets with pulled ll header,
96 so that SOCK_RAW should push it back.
101 Incoming, dev->hard_header!=NULL
102 mac_header -> ll header
105 Outgoing, dev->hard_header!=NULL
106 mac_header -> ll header
109 Incoming, dev->hard_header==NULL
110 mac_header -> UNKNOWN position. It is very likely that it points to the ll
111 header. PPP does this, which is wrong because it introduces
112 asymmetry between the rx and tx paths.
115 Outgoing, dev->hard_header==NULL
116 mac_header -> data. ll header is still not built!
120 If dev->hard_header==NULL we are unlikely to restore sensible ll header.
126 dev->hard_header != NULL
127 mac_header -> ll header
130 dev->hard_header == NULL (ll header is added by device, we cannot control it)
134 We should set nh.raw on output to the correct position,
135 packet classifier depends on it.
138 /* List of all packet sockets. */
139 static HLIST_HEAD(packet_sklist);
140 static DEFINE_RWLOCK(packet_sklist_lock);
142 static atomic_t packet_socks_nr;
145 /* Private packet socket structures. */
149 struct packet_mclist *next;
154 unsigned char addr[MAX_ADDR_LEN];
156 /* identical to struct packet_mreq except it has
157 * a longer address field.
159 struct packet_mreq_max
162 unsigned short mr_type;
163 unsigned short mr_alen;
164 unsigned char mr_address[MAX_ADDR_LEN];
167 #ifdef CONFIG_PACKET_MMAP
168 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing);
171 static void packet_flush_mclist(struct sock *sk);
174 /* struct sock has to be the first member of packet_sock */
176 struct tpacket_stats stats;
177 #ifdef CONFIG_PACKET_MMAP
180 unsigned int frames_per_block;
181 unsigned int frame_size;
182 unsigned int frame_max;
185 struct packet_type prot_hook;
186 spinlock_t bind_lock;
187 unsigned int running:1, /* prot_hook is attached*/
190 int ifindex; /* bound device */
192 struct packet_mclist *mclist;
193 #ifdef CONFIG_PACKET_MMAP
195 unsigned int pg_vec_order;
196 unsigned int pg_vec_pages;
197 unsigned int pg_vec_len;
201 struct packet_skb_cb {
202 unsigned int origlen;
204 struct sockaddr_pkt pkt;
205 struct sockaddr_ll ll;
209 #define PACKET_SKB_CB(__skb) ((struct packet_skb_cb *)((__skb)->cb))
211 #ifdef CONFIG_PACKET_MMAP
213 static inline struct tpacket_hdr *packet_lookup_frame(struct packet_sock *po, unsigned int position)
215 unsigned int pg_vec_pos, frame_offset;
217 pg_vec_pos = position / po->frames_per_block;
218 frame_offset = position % po->frames_per_block;
220 return (struct tpacket_hdr *)(po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
/*
 * View a generic socket as the packet socket that contains it.  The plain
 * cast relies on struct sock being the first member of struct packet_sock.
 */
static inline struct packet_sock *pkt_sk(struct sock *sk)
{
	struct packet_sock *po = (struct packet_sock *)sk;

	return po;
}
229 static void packet_sock_destruct(struct sock *sk)
231 BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
232 BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
234 if (!sock_flag(sk, SOCK_DEAD)) {
235 printk("Attempt to release alive packet socket: %p\n", sk);
239 atomic_dec(&packet_socks_nr);
240 #ifdef PACKET_REFCNT_DEBUG
241 printk(KERN_DEBUG "PACKET socket %p is free, %d are alive\n", sk, atomic_read(&packet_socks_nr));
246 static const struct proto_ops packet_ops;
248 static const struct proto_ops packet_ops_spkt;
250 static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
253 struct sockaddr_pkt *spkt;
255 if (dev->nd_net != &init_net)
259 * When we registered the protocol we saved the socket in the data
260 * field for just this event.
263 sk = pt->af_packet_priv;
266 * Yank back the headers [hope the device set this
267 * right or kerboom...]
269 * Incoming packets have ll header pulled,
272 * For outgoing ones skb->data == skb_mac_header(skb)
273 * so that this procedure is noop.
276 if (skb->pkt_type == PACKET_LOOPBACK)
279 if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)
282 /* drop any routing info */
283 dst_release(skb->dst);
286 /* drop conntrack reference */
289 spkt = &PACKET_SKB_CB(skb)->sa.pkt;
291 skb_push(skb, skb->data - skb_mac_header(skb));
294 * The SOCK_PACKET socket receives _all_ frames.
297 spkt->spkt_family = dev->type;
298 strlcpy(spkt->spkt_device, dev->name, sizeof(spkt->spkt_device));
299 spkt->spkt_protocol = skb->protocol;
302 * Charge the memory to the socket. This is done specifically
303 * to prevent sockets using all the memory up.
306 if (sock_queue_rcv_skb(sk,skb) == 0)
317 * Output a raw packet to a device layer. This bypasses all the other
318 * protocol layers and you must therefore supply it with a complete frame
321 static int packet_sendmsg_spkt(struct kiocb *iocb, struct socket *sock,
322 struct msghdr *msg, size_t len)
324 struct sock *sk = sock->sk;
325 struct sockaddr_pkt *saddr=(struct sockaddr_pkt *)msg->msg_name;
327 struct net_device *dev;
332 * Get and verify the address.
337 if (msg->msg_namelen < sizeof(struct sockaddr))
339 if (msg->msg_namelen==sizeof(struct sockaddr_pkt))
340 proto=saddr->spkt_protocol;
343 return(-ENOTCONN); /* SOCK_PACKET must be sent giving an address */
346 * Find the device first to size check it
349 saddr->spkt_device[13] = 0;
350 dev = dev_get_by_name(&init_net, saddr->spkt_device);
356 if (!(dev->flags & IFF_UP))
360 * You may not queue a frame bigger than the mtu. This is the lowest level
361 * raw protocol and you must do your own fragmentation at this level.
365 if (len > dev->mtu + dev->hard_header_len)
369 skb = sock_wmalloc(sk, len + LL_RESERVED_SPACE(dev), 0, GFP_KERNEL);
372 * If the write buffer is full, then tough. At this level the user gets to
373 * deal with the problem - do your own algorithmic backoffs. That's far
384 /* FIXME: Save some space for broken drivers that write a
385 * hard header at transmission time by themselves. PPP is the
386 * notable one here. This should really be fixed at the driver level.
388 skb_reserve(skb, LL_RESERVED_SPACE(dev));
389 skb_reset_network_header(skb);
391 /* Try to align data part correctly */
392 if (dev->hard_header) {
393 skb->data -= dev->hard_header_len;
394 skb->tail -= dev->hard_header_len;
395 if (len < dev->hard_header_len)
396 skb_reset_network_header(skb);
399 /* Returns -EFAULT on error */
400 err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
401 skb->protocol = proto;
403 skb->priority = sk->sk_priority;
423 static inline unsigned int run_filter(struct sk_buff *skb, struct sock *sk,
426 struct sk_filter *filter;
429 filter = rcu_dereference(sk->sk_filter);
431 res = sk_run_filter(skb, filter->insns, filter->len);
432 rcu_read_unlock_bh();
438 This function makes lazy skb cloning in hope that most of packets
439 are discarded by BPF.
441 Note tricky part: we DO mangle shared skb! skb->data, skb->len
442 and skb->cb are mangled. It works because (and until) packets
443 falling here are owned by current CPU. Output packets are cloned
444 by dev_queue_xmit_nit(), input packets are processed by net_bh
445 sequentially, so that if we return skb to original state on exit,
446 we will not harm anyone.
449 static int packet_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
452 struct sockaddr_ll *sll;
453 struct packet_sock *po;
454 u8 * skb_head = skb->data;
455 int skb_len = skb->len;
456 unsigned int snaplen, res;
458 if (dev->nd_net != &init_net)
461 if (skb->pkt_type == PACKET_LOOPBACK)
464 sk = pt->af_packet_priv;
469 if (dev->hard_header) {
470 /* The device has an explicit notion of ll header,
471 exported to higher levels.
473 Otherwise, the device hides datails of it frame
474 structure, so that corresponding packet head
475 never delivered to user.
477 if (sk->sk_type != SOCK_DGRAM)
478 skb_push(skb, skb->data - skb_mac_header(skb));
479 else if (skb->pkt_type == PACKET_OUTGOING) {
480 /* Special case: outgoing packets have ll header at head */
481 skb_pull(skb, skb_network_offset(skb));
487 res = run_filter(skb, sk, snaplen);
493 if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
494 (unsigned)sk->sk_rcvbuf)
497 if (skb_shared(skb)) {
498 struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
502 if (skb_head != skb->data) {
503 skb->data = skb_head;
510 BUILD_BUG_ON(sizeof(*PACKET_SKB_CB(skb)) + MAX_ADDR_LEN - 8 >
513 sll = &PACKET_SKB_CB(skb)->sa.ll;
514 sll->sll_family = AF_PACKET;
515 sll->sll_hatype = dev->type;
516 sll->sll_protocol = skb->protocol;
517 sll->sll_pkttype = skb->pkt_type;
518 if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
519 sll->sll_ifindex = orig_dev->ifindex;
521 sll->sll_ifindex = dev->ifindex;
524 if (dev->hard_header_parse)
525 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
527 PACKET_SKB_CB(skb)->origlen = skb->len;
529 if (pskb_trim(skb, snaplen))
532 skb_set_owner_r(skb, sk);
534 dst_release(skb->dst);
537 /* drop conntrack reference */
540 spin_lock(&sk->sk_receive_queue.lock);
541 po->stats.tp_packets++;
542 __skb_queue_tail(&sk->sk_receive_queue, skb);
543 spin_unlock(&sk->sk_receive_queue.lock);
544 sk->sk_data_ready(sk, skb->len);
548 spin_lock(&sk->sk_receive_queue.lock);
549 po->stats.tp_drops++;
550 spin_unlock(&sk->sk_receive_queue.lock);
553 if (skb_head != skb->data && skb_shared(skb)) {
554 skb->data = skb_head;
562 #ifdef CONFIG_PACKET_MMAP
563 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
566 struct packet_sock *po;
567 struct sockaddr_ll *sll;
568 struct tpacket_hdr *h;
569 u8 * skb_head = skb->data;
570 int skb_len = skb->len;
571 unsigned int snaplen, res;
572 unsigned long status = TP_STATUS_LOSING|TP_STATUS_USER;
573 unsigned short macoff, netoff;
574 struct sk_buff *copy_skb = NULL;
577 if (dev->nd_net != &init_net)
580 if (skb->pkt_type == PACKET_LOOPBACK)
583 sk = pt->af_packet_priv;
586 if (dev->hard_header) {
587 if (sk->sk_type != SOCK_DGRAM)
588 skb_push(skb, skb->data - skb_mac_header(skb));
589 else if (skb->pkt_type == PACKET_OUTGOING) {
590 /* Special case: outgoing packets have ll header at head */
591 skb_pull(skb, skb_network_offset(skb));
595 if (skb->ip_summed == CHECKSUM_PARTIAL)
596 status |= TP_STATUS_CSUMNOTREADY;
600 res = run_filter(skb, sk, snaplen);
606 if (sk->sk_type == SOCK_DGRAM) {
607 macoff = netoff = TPACKET_ALIGN(TPACKET_HDRLEN) + 16;
609 unsigned maclen = skb_network_offset(skb);
610 netoff = TPACKET_ALIGN(TPACKET_HDRLEN + (maclen < 16 ? 16 : maclen));
611 macoff = netoff - maclen;
614 if (macoff + snaplen > po->frame_size) {
615 if (po->copy_thresh &&
616 atomic_read(&sk->sk_rmem_alloc) + skb->truesize <
617 (unsigned)sk->sk_rcvbuf) {
618 if (skb_shared(skb)) {
619 copy_skb = skb_clone(skb, GFP_ATOMIC);
621 copy_skb = skb_get(skb);
622 skb_head = skb->data;
625 skb_set_owner_r(copy_skb, sk);
627 snaplen = po->frame_size - macoff;
628 if ((int)snaplen < 0)
632 spin_lock(&sk->sk_receive_queue.lock);
633 h = packet_lookup_frame(po, po->head);
637 po->head = po->head != po->frame_max ? po->head+1 : 0;
638 po->stats.tp_packets++;
640 status |= TP_STATUS_COPY;
641 __skb_queue_tail(&sk->sk_receive_queue, copy_skb);
643 if (!po->stats.tp_drops)
644 status &= ~TP_STATUS_LOSING;
645 spin_unlock(&sk->sk_receive_queue.lock);
647 skb_copy_bits(skb, 0, (u8*)h + macoff, snaplen);
649 h->tp_len = skb->len;
650 h->tp_snaplen = snaplen;
653 if (skb->tstamp.tv64)
654 tv = ktime_to_timeval(skb->tstamp);
656 do_gettimeofday(&tv);
657 h->tp_sec = tv.tv_sec;
658 h->tp_usec = tv.tv_usec;
660 sll = (struct sockaddr_ll*)((u8*)h + TPACKET_ALIGN(sizeof(*h)));
662 if (dev->hard_header_parse)
663 sll->sll_halen = dev->hard_header_parse(skb, sll->sll_addr);
664 sll->sll_family = AF_PACKET;
665 sll->sll_hatype = dev->type;
666 sll->sll_protocol = skb->protocol;
667 sll->sll_pkttype = skb->pkt_type;
668 if (unlikely(po->origdev) && skb->pkt_type == PACKET_HOST)
669 sll->sll_ifindex = orig_dev->ifindex;
671 sll->sll_ifindex = dev->ifindex;
673 h->tp_status = status;
677 struct page *p_start, *p_end;
678 u8 *h_end = (u8 *)h + macoff + snaplen - 1;
680 p_start = virt_to_page(h);
681 p_end = virt_to_page(h_end);
682 while (p_start <= p_end) {
683 flush_dcache_page(p_start);
688 sk->sk_data_ready(sk, 0);
691 if (skb_head != skb->data && skb_shared(skb)) {
692 skb->data = skb_head;
700 po->stats.tp_drops++;
701 spin_unlock(&sk->sk_receive_queue.lock);
703 sk->sk_data_ready(sk, 0);
712 static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
713 struct msghdr *msg, size_t len)
715 struct sock *sk = sock->sk;
716 struct sockaddr_ll *saddr=(struct sockaddr_ll *)msg->msg_name;
718 struct net_device *dev;
721 int ifindex, err, reserve = 0;
724 * Get and verify the address.
728 struct packet_sock *po = pkt_sk(sk);
730 ifindex = po->ifindex;
735 if (msg->msg_namelen < sizeof(struct sockaddr_ll))
737 if (msg->msg_namelen < (saddr->sll_halen + offsetof(struct sockaddr_ll, sll_addr)))
739 ifindex = saddr->sll_ifindex;
740 proto = saddr->sll_protocol;
741 addr = saddr->sll_addr;
745 dev = dev_get_by_index(&init_net, ifindex);
749 if (sock->type == SOCK_RAW)
750 reserve = dev->hard_header_len;
753 if (!(dev->flags & IFF_UP))
757 if (len > dev->mtu+reserve)
760 skb = sock_alloc_send_skb(sk, len + LL_RESERVED_SPACE(dev),
761 msg->msg_flags & MSG_DONTWAIT, &err);
765 skb_reserve(skb, LL_RESERVED_SPACE(dev));
766 skb_reset_network_header(skb);
769 if (sock->type == SOCK_DGRAM &&
770 dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len) < 0)
773 /* Returns -EFAULT on error */
774 err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
778 skb->protocol = proto;
780 skb->priority = sk->sk_priority;
786 err = dev_queue_xmit(skb);
787 if (err > 0 && (err = net_xmit_errno(err)) != 0)
804 * Close a PACKET socket. This is fairly simple. We immediately go
805 * to 'closed' state and remove our protocol entry in the device list.
808 static int packet_release(struct socket *sock)
810 struct sock *sk = sock->sk;
811 struct packet_sock *po;
818 write_lock_bh(&packet_sklist_lock);
819 sk_del_node_init(sk);
820 write_unlock_bh(&packet_sklist_lock);
823 * Unhook packet receive handler.
828 * Remove the protocol hook
830 dev_remove_pack(&po->prot_hook);
836 packet_flush_mclist(sk);
838 #ifdef CONFIG_PACKET_MMAP
840 struct tpacket_req req;
841 memset(&req, 0, sizeof(req));
842 packet_set_ring(sk, &req, 1);
847 * Now the socket is dead. No more input will appear.
855 skb_queue_purge(&sk->sk_receive_queue);
862 * Attach a packet hook.
865 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 protocol)
867 struct packet_sock *po = pkt_sk(sk);
869 * Detach an existing hook if present.
874 spin_lock(&po->bind_lock);
879 spin_unlock(&po->bind_lock);
880 dev_remove_pack(&po->prot_hook);
881 spin_lock(&po->bind_lock);
885 po->prot_hook.type = protocol;
886 po->prot_hook.dev = dev;
888 po->ifindex = dev ? dev->ifindex : 0;
894 if (dev->flags&IFF_UP) {
895 dev_add_pack(&po->prot_hook);
899 sk->sk_err = ENETDOWN;
900 if (!sock_flag(sk, SOCK_DEAD))
901 sk->sk_error_report(sk);
904 dev_add_pack(&po->prot_hook);
910 spin_unlock(&po->bind_lock);
916 * Bind a packet socket to a device
919 static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len)
921 struct sock *sk=sock->sk;
923 struct net_device *dev;
930 if (addr_len != sizeof(struct sockaddr))
932 strlcpy(name,uaddr->sa_data,sizeof(name));
934 dev = dev_get_by_name(&init_net, name);
936 err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
942 static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
944 struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
945 struct sock *sk=sock->sk;
946 struct net_device *dev = NULL;
954 if (addr_len < sizeof(struct sockaddr_ll))
956 if (sll->sll_family != AF_PACKET)
959 if (sll->sll_ifindex) {
961 dev = dev_get_by_index(&init_net, sll->sll_ifindex);
965 err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
973 static struct proto packet_proto = {
975 .owner = THIS_MODULE,
976 .obj_size = sizeof(struct packet_sock),
980 * Create a packet of type SOCK_PACKET.
983 static int packet_create(struct net *net, struct socket *sock, int protocol)
986 struct packet_sock *po;
987 __be16 proto = (__force __be16)protocol; /* weird, but documented */
990 if (net != &init_net)
991 return -EAFNOSUPPORT;
993 if (!capable(CAP_NET_RAW))
995 if (sock->type != SOCK_DGRAM && sock->type != SOCK_RAW &&
996 sock->type != SOCK_PACKET)
997 return -ESOCKTNOSUPPORT;
999 sock->state = SS_UNCONNECTED;
1002 sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto, 1);
1006 sock->ops = &packet_ops;
1007 if (sock->type == SOCK_PACKET)
1008 sock->ops = &packet_ops_spkt;
1010 sock_init_data(sock, sk);
1013 sk->sk_family = PF_PACKET;
1016 sk->sk_destruct = packet_sock_destruct;
1017 atomic_inc(&packet_socks_nr);
1020 * Attach a protocol block
1023 spin_lock_init(&po->bind_lock);
1024 po->prot_hook.func = packet_rcv;
1026 if (sock->type == SOCK_PACKET)
1027 po->prot_hook.func = packet_rcv_spkt;
1029 po->prot_hook.af_packet_priv = sk;
1032 po->prot_hook.type = proto;
1033 dev_add_pack(&po->prot_hook);
1038 write_lock_bh(&packet_sklist_lock);
1039 sk_add_node(sk, &packet_sklist);
1040 write_unlock_bh(&packet_sklist_lock);
1047 * Pull a packet from our receive queue and hand it to the user.
1048 * If necessary we block.
1051 static int packet_recvmsg(struct kiocb *iocb, struct socket *sock,
1052 struct msghdr *msg, size_t len, int flags)
1054 struct sock *sk = sock->sk;
1055 struct sk_buff *skb;
1057 struct sockaddr_ll *sll;
1060 if (flags & ~(MSG_PEEK|MSG_DONTWAIT|MSG_TRUNC|MSG_CMSG_COMPAT))
1064 /* What error should we return now? EUNATTACH? */
1065 if (pkt_sk(sk)->ifindex < 0)
1070 * Call the generic datagram receiver. This handles all sorts
1071 * of horrible races and re-entrancy so we can forget about it
1072 * in the protocol layers.
1074 * Now it will return ENETDOWN, if device have just gone down,
1075 * but then it will block.
1078 skb=skb_recv_datagram(sk,flags,flags&MSG_DONTWAIT,&err);
1081 * An error occurred so return it. Because skb_recv_datagram()
1082 * handles the blocking we don't see and worry about blocking
1090 * If the address length field is there to be filled in, we fill
1094 sll = &PACKET_SKB_CB(skb)->sa.ll;
1095 if (sock->type == SOCK_PACKET)
1096 msg->msg_namelen = sizeof(struct sockaddr_pkt);
1098 msg->msg_namelen = sll->sll_halen + offsetof(struct sockaddr_ll, sll_addr);
1101 * You lose any data beyond the buffer you gave. If it worries a
1102 * user program they can ask the device for its MTU anyway.
1109 msg->msg_flags|=MSG_TRUNC;
1112 err = skb_copy_datagram_iovec(skb, 0, msg->msg_iov, copied);
1116 sock_recv_timestamp(msg, sk, skb);
1119 memcpy(msg->msg_name, &PACKET_SKB_CB(skb)->sa,
1122 if (pkt_sk(sk)->auxdata) {
1123 struct tpacket_auxdata aux;
1125 aux.tp_status = TP_STATUS_USER;
1126 if (skb->ip_summed == CHECKSUM_PARTIAL)
1127 aux.tp_status |= TP_STATUS_CSUMNOTREADY;
1128 aux.tp_len = PACKET_SKB_CB(skb)->origlen;
1129 aux.tp_snaplen = skb->len;
1131 aux.tp_net = skb_network_offset(skb);
1133 put_cmsg(msg, SOL_PACKET, PACKET_AUXDATA, sizeof(aux), &aux);
1137 * Free or return the buffer as appropriate. Again this
1138 * hides all the races and re-entrancy issues from us.
1140 err = (flags&MSG_TRUNC) ? skb->len : copied;
1143 skb_free_datagram(sk, skb);
1148 static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr,
1149 int *uaddr_len, int peer)
1151 struct net_device *dev;
1152 struct sock *sk = sock->sk;
1157 uaddr->sa_family = AF_PACKET;
1158 dev = dev_get_by_index(&init_net, pkt_sk(sk)->ifindex);
1160 strlcpy(uaddr->sa_data, dev->name, 15);
1163 memset(uaddr->sa_data, 0, 14);
1164 *uaddr_len = sizeof(*uaddr);
1169 static int packet_getname(struct socket *sock, struct sockaddr *uaddr,
1170 int *uaddr_len, int peer)
1172 struct net_device *dev;
1173 struct sock *sk = sock->sk;
1174 struct packet_sock *po = pkt_sk(sk);
1175 struct sockaddr_ll *sll = (struct sockaddr_ll*)uaddr;
1180 sll->sll_family = AF_PACKET;
1181 sll->sll_ifindex = po->ifindex;
1182 sll->sll_protocol = po->num;
1183 dev = dev_get_by_index(&init_net, po->ifindex);
1185 sll->sll_hatype = dev->type;
1186 sll->sll_halen = dev->addr_len;
1187 memcpy(sll->sll_addr, dev->dev_addr, dev->addr_len);
1190 sll->sll_hatype = 0; /* Bad: we have no ARPHRD_UNSPEC */
1193 *uaddr_len = offsetof(struct sockaddr_ll, sll_addr) + sll->sll_halen;
1198 static void packet_dev_mc(struct net_device *dev, struct packet_mclist *i, int what)
1201 case PACKET_MR_MULTICAST:
1203 dev_mc_add(dev, i->addr, i->alen, 0);
1205 dev_mc_delete(dev, i->addr, i->alen, 0);
1207 case PACKET_MR_PROMISC:
1208 dev_set_promiscuity(dev, what);
1210 case PACKET_MR_ALLMULTI:
1211 dev_set_allmulti(dev, what);
1217 static void packet_dev_mclist(struct net_device *dev, struct packet_mclist *i, int what)
1219 for ( ; i; i=i->next) {
1220 if (i->ifindex == dev->ifindex)
1221 packet_dev_mc(dev, i, what);
1225 static int packet_mc_add(struct sock *sk, struct packet_mreq_max *mreq)
1227 struct packet_sock *po = pkt_sk(sk);
1228 struct packet_mclist *ml, *i;
1229 struct net_device *dev;
1235 dev = __dev_get_by_index(&init_net, mreq->mr_ifindex);
1240 if (mreq->mr_alen > dev->addr_len)
1244 i = kmalloc(sizeof(*i), GFP_KERNEL);
1249 for (ml = po->mclist; ml; ml = ml->next) {
1250 if (ml->ifindex == mreq->mr_ifindex &&
1251 ml->type == mreq->mr_type &&
1252 ml->alen == mreq->mr_alen &&
1253 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1255 /* Free the new element ... */
1261 i->type = mreq->mr_type;
1262 i->ifindex = mreq->mr_ifindex;
1263 i->alen = mreq->mr_alen;
1264 memcpy(i->addr, mreq->mr_address, i->alen);
1266 i->next = po->mclist;
1268 packet_dev_mc(dev, i, +1);
1275 static int packet_mc_drop(struct sock *sk, struct packet_mreq_max *mreq)
1277 struct packet_mclist *ml, **mlp;
1281 for (mlp = &pkt_sk(sk)->mclist; (ml = *mlp) != NULL; mlp = &ml->next) {
1282 if (ml->ifindex == mreq->mr_ifindex &&
1283 ml->type == mreq->mr_type &&
1284 ml->alen == mreq->mr_alen &&
1285 memcmp(ml->addr, mreq->mr_address, ml->alen) == 0) {
1286 if (--ml->count == 0) {
1287 struct net_device *dev;
1289 dev = dev_get_by_index(&init_net, ml->ifindex);
1291 packet_dev_mc(dev, ml, -1);
1301 return -EADDRNOTAVAIL;
1304 static void packet_flush_mclist(struct sock *sk)
1306 struct packet_sock *po = pkt_sk(sk);
1307 struct packet_mclist *ml;
1313 while ((ml = po->mclist) != NULL) {
1314 struct net_device *dev;
1316 po->mclist = ml->next;
1317 if ((dev = dev_get_by_index(&init_net, ml->ifindex)) != NULL) {
1318 packet_dev_mc(dev, ml, -1);
1327 packet_setsockopt(struct socket *sock, int level, int optname, char __user *optval, int optlen)
1329 struct sock *sk = sock->sk;
1330 struct packet_sock *po = pkt_sk(sk);
1333 if (level != SOL_PACKET)
1334 return -ENOPROTOOPT;
1337 case PACKET_ADD_MEMBERSHIP:
1338 case PACKET_DROP_MEMBERSHIP:
1340 struct packet_mreq_max mreq;
1342 memset(&mreq, 0, sizeof(mreq));
1343 if (len < sizeof(struct packet_mreq))
1345 if (len > sizeof(mreq))
1347 if (copy_from_user(&mreq,optval,len))
1349 if (len < (mreq.mr_alen + offsetof(struct packet_mreq, mr_address)))
1351 if (optname == PACKET_ADD_MEMBERSHIP)
1352 ret = packet_mc_add(sk, &mreq);
1354 ret = packet_mc_drop(sk, &mreq);
1358 #ifdef CONFIG_PACKET_MMAP
1359 case PACKET_RX_RING:
1361 struct tpacket_req req;
1363 if (optlen<sizeof(req))
1365 if (copy_from_user(&req,optval,sizeof(req)))
1367 return packet_set_ring(sk, &req, 0);
1369 case PACKET_COPY_THRESH:
1373 if (optlen!=sizeof(val))
1375 if (copy_from_user(&val,optval,sizeof(val)))
1378 pkt_sk(sk)->copy_thresh = val;
1382 case PACKET_AUXDATA:
1386 if (optlen < sizeof(val))
1388 if (copy_from_user(&val, optval, sizeof(val)))
1391 po->auxdata = !!val;
1394 case PACKET_ORIGDEV:
1398 if (optlen < sizeof(val))
1400 if (copy_from_user(&val, optval, sizeof(val)))
1403 po->origdev = !!val;
1407 return -ENOPROTOOPT;
1411 static int packet_getsockopt(struct socket *sock, int level, int optname,
1412 char __user *optval, int __user *optlen)
1416 struct sock *sk = sock->sk;
1417 struct packet_sock *po = pkt_sk(sk);
1419 struct tpacket_stats st;
1421 if (level != SOL_PACKET)
1422 return -ENOPROTOOPT;
1424 if (get_user(len, optlen))
1431 case PACKET_STATISTICS:
1432 if (len > sizeof(struct tpacket_stats))
1433 len = sizeof(struct tpacket_stats);
1434 spin_lock_bh(&sk->sk_receive_queue.lock);
1436 memset(&po->stats, 0, sizeof(st));
1437 spin_unlock_bh(&sk->sk_receive_queue.lock);
1438 st.tp_packets += st.tp_drops;
1442 case PACKET_AUXDATA:
1443 if (len > sizeof(int))
1449 case PACKET_ORIGDEV:
1450 if (len > sizeof(int))
1457 return -ENOPROTOOPT;
1460 if (put_user(len, optlen))
1462 if (copy_to_user(optval, data, len))
1468 static int packet_notifier(struct notifier_block *this, unsigned long msg, void *data)
1471 struct hlist_node *node;
1472 struct net_device *dev = data;
1474 if (dev->nd_net != &init_net)
1477 read_lock(&packet_sklist_lock);
1478 sk_for_each(sk, node, &packet_sklist) {
1479 struct packet_sock *po = pkt_sk(sk);
1482 case NETDEV_UNREGISTER:
1484 packet_dev_mclist(dev, po->mclist, -1);
1488 if (dev->ifindex == po->ifindex) {
1489 spin_lock(&po->bind_lock);
1491 __dev_remove_pack(&po->prot_hook);
1494 sk->sk_err = ENETDOWN;
1495 if (!sock_flag(sk, SOCK_DEAD))
1496 sk->sk_error_report(sk);
1498 if (msg == NETDEV_UNREGISTER) {
1500 po->prot_hook.dev = NULL;
1502 spin_unlock(&po->bind_lock);
1506 spin_lock(&po->bind_lock);
1507 if (dev->ifindex == po->ifindex && po->num &&
1509 dev_add_pack(&po->prot_hook);
1513 spin_unlock(&po->bind_lock);
1517 read_unlock(&packet_sklist_lock);
1522 static int packet_ioctl(struct socket *sock, unsigned int cmd,
1525 struct sock *sk = sock->sk;
1530 int amount = atomic_read(&sk->sk_wmem_alloc);
1531 return put_user(amount, (int __user *)arg);
1535 struct sk_buff *skb;
1538 spin_lock_bh(&sk->sk_receive_queue.lock);
1539 skb = skb_peek(&sk->sk_receive_queue);
1542 spin_unlock_bh(&sk->sk_receive_queue.lock);
1543 return put_user(amount, (int __user *)arg);
1546 return sock_get_timestamp(sk, (struct timeval __user *)arg);
1548 return sock_get_timestampns(sk, (struct timespec __user *)arg);
1558 case SIOCGIFBRDADDR:
1559 case SIOCSIFBRDADDR:
1560 case SIOCGIFNETMASK:
1561 case SIOCSIFNETMASK:
1562 case SIOCGIFDSTADDR:
1563 case SIOCSIFDSTADDR:
1565 return inet_dgram_ops.ioctl(sock, cmd, arg);
1569 return -ENOIOCTLCMD;
1574 #ifndef CONFIG_PACKET_MMAP
1575 #define packet_mmap sock_no_mmap
1576 #define packet_poll datagram_poll
1579 static unsigned int packet_poll(struct file * file, struct socket *sock,
1582 struct sock *sk = sock->sk;
1583 struct packet_sock *po = pkt_sk(sk);
1584 unsigned int mask = datagram_poll(file, sock, wait);
1586 spin_lock_bh(&sk->sk_receive_queue.lock);
1588 unsigned last = po->head ? po->head-1 : po->frame_max;
1589 struct tpacket_hdr *h;
1591 h = packet_lookup_frame(po, last);
1594 mask |= POLLIN | POLLRDNORM;
1596 spin_unlock_bh(&sk->sk_receive_queue.lock);
1601 /* Dirty? Well, I still did not learn better way to account
1605 static void packet_mm_open(struct vm_area_struct *vma)
1607 struct file *file = vma->vm_file;
1608 struct socket * sock = file->private_data;
1609 struct sock *sk = sock->sk;
1612 atomic_inc(&pkt_sk(sk)->mapped);
1615 static void packet_mm_close(struct vm_area_struct *vma)
1617 struct file *file = vma->vm_file;
1618 struct socket * sock = file->private_data;
1619 struct sock *sk = sock->sk;
1622 atomic_dec(&pkt_sk(sk)->mapped);
1625 static struct vm_operations_struct packet_mmap_ops = {
1626 .open = packet_mm_open,
1627 .close =packet_mm_close,
1630 static inline struct page *pg_vec_endpage(char *one_pg_vec, unsigned int order)
1632 return virt_to_page(one_pg_vec + (PAGE_SIZE << order) - 1);
/*
 * Release a block vector: free every populated entry with free_pages() at
 * the given order, then free the pointer array itself.  Entries that are
 * NULL are skipped.
 */
static void free_pg_vec(char **pg_vec, unsigned int order, unsigned int len)
{
	unsigned int idx = 0;

	while (idx < len) {
		char *block = pg_vec[idx++];

		if (likely(block))
			free_pages((unsigned long) block, order);
	}
	kfree(pg_vec);
}
1646 static inline char *alloc_one_pg_vec_page(unsigned long order)
1648 return (char *) __get_free_pages(GFP_KERNEL | __GFP_COMP | __GFP_ZERO,
1652 static char **alloc_pg_vec(struct tpacket_req *req, int order)
1654 unsigned int block_nr = req->tp_block_nr;
1658 pg_vec = kzalloc(block_nr * sizeof(char *), GFP_KERNEL);
1659 if (unlikely(!pg_vec))
1662 for (i = 0; i < block_nr; i++) {
1663 pg_vec[i] = alloc_one_pg_vec_page(order);
1664 if (unlikely(!pg_vec[i]))
1665 goto out_free_pgvec;
1672 free_pg_vec(pg_vec, order, block_nr);
/*
 * Install, replace or tear down the mmap ring of a packet socket.
 *
 * sk:      socket whose packet_sock is reconfigured
 * req:     requested geometry (tp_block_size, tp_block_nr, tp_frame_size,
 *          tp_frame_nr); req->tp_block_nr is overwritten during the swap
 * closing: when nonzero the ring is swapped even if po->mapped is nonzero
 *
 * With req->tp_block_nr == 0 no new vector is allocated (pg_vec stays NULL),
 * so the swap below removes the ring and the receive hook becomes
 * packet_rcv.
 *
 * NOTE(review): the statements executed when a sanity check fails are not
 * part of this listing; presumably each one bails out with an error --
 * confirm.
 */
1677 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
1679 char **pg_vec = NULL;
1680 struct packet_sock *po = pkt_sk(sk);
1681 int was_running, order = 0;
1685 if (req->tp_block_nr) {
1688 /* Sanity tests and some calculations */
/* A ring is already installed on this socket. */
1690 if (unlikely(po->pg_vec))
/* Block size must be positive when viewed as int and page aligned. */
1693 if (unlikely((int)req->tp_block_size <= 0))
1695 if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
/* Frame size must hold at least TPACKET_HDRLEN bytes and be TPACKET_ALIGNMENT aligned. */
1697 if (unlikely(req->tp_frame_size < TPACKET_HDRLEN))
1699 if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
/* Number of whole frames that fit into one block. */
1702 po->frames_per_block = req->tp_block_size/req->tp_frame_size;
1703 if (unlikely(po->frames_per_block <= 0))
1705 if (unlikely((po->frames_per_block * req->tp_block_nr) !=
/* Allocate the new block vector; order is derived from the block size. */
1710 order = get_order(req->tp_block_size);
1711 pg_vec = alloc_pg_vec(req, order);
1712 if (unlikely(!pg_vec))
/* Set tp_status of every frame header in every block to TP_STATUS_KERNEL. */
1716 for (i = 0; i < req->tp_block_nr; i++) {
1717 char *ptr = pg_vec[i];
1718 struct tpacket_hdr *header;
1721 for (k = 0; k < po->frames_per_block; k++) {
1722 header = (struct tpacket_hdr *) ptr;
1723 header->tp_status = TP_STATUS_KERNEL;
1724 ptr += req->tp_frame_size;
/*
 * NOTE(review): presumably the tp_block_nr == 0 branch, where a nonzero
 * tp_frame_nr would be inconsistent -- confirm against the full function.
 */
1729 if (unlikely(req->tp_frame_nr))
1735 /* Detach socket from network */
/* bind_lock protects po->running and the protocol hook registration. */
1736 spin_lock(&po->bind_lock);
1737 was_running = po->running;
1740 __dev_remove_pack(&po->prot_hook);
1745 spin_unlock(&po->bind_lock);
/* Swap rings only when closing or when no mapping is accounted in po->mapped. */
1750 if (closing || atomic_read(&po->mapped) == 0) {
/* XC(a, b): store b into a and evaluate to the previous value of a. */
1752 #define XC(a, b) ({ __typeof__ ((a)) __t; __t = (a); (a) = (b); __t; })
/* The ring pointer and frame geometry change under the receive queue lock. */
1754 spin_lock_bh(&sk->sk_receive_queue.lock);
1755 pg_vec = XC(po->pg_vec, pg_vec);
1756 po->frame_max = (req->tp_frame_nr - 1);
1758 po->frame_size = req->tp_frame_size;
1759 spin_unlock_bh(&sk->sk_receive_queue.lock);
/* After these exchanges order and req->tp_block_nr describe the previous ring. */
1761 order = XC(po->pg_vec_order, order);
1762 req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
1764 po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
/* Ring present: deliver through tpacket_rcv; otherwise through packet_rcv. */
1765 po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
1766 skb_queue_purge(&sk->sk_receive_queue);
1768 if (atomic_read(&po->mapped))
1769 printk(KERN_DEBUG "packet_mmap: vma is busy: %d\n", atomic_read(&po->mapped));
/* Re-register the protocol hook if it was running before and is not running now. */
1772 spin_lock(&po->bind_lock);
1773 if (was_running && !po->running) {
1777 dev_add_pack(&po->prot_hook);
1779 spin_unlock(&po->bind_lock);
/*
 * pg_vec, order and req->tp_block_nr hold what the exchanges left behind:
 * the previous ring after a swap, or the freshly allocated vector when the
 * swap was not performed.
 */
1784 free_pg_vec(pg_vec, order, req->tp_block_nr);
/*
 * mmap handler of packet_ops: maps the socket's ring blocks into a VMA.
 *
 * file: file being mapped
 * sock: packet socket that owns the ring
 * vma:  target area; its length is compared against the ring size
 *
 * The checks below test that a ring exists (po->pg_vec) and that the VMA
 * length equals pg_vec_len * pg_vec_pages * PAGE_SIZE.  Every page of every
 * block is then inserted with vm_insert_page(); afterwards po->mapped is
 * incremented and packet_mmap_ops is installed on the VMA.
 *
 * NOTE(review): the error paths and the advance of 'start' are not part of
 * this listing -- confirm against the full function.
 */
1789 static int packet_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1791 struct sock *sk = sock->sk;
1792 struct packet_sock *po = pkt_sk(sk);
1794 unsigned long start;
1801 size = vma->vm_end - vma->vm_start;
1804 if (po->pg_vec == NULL)
1806 if (size != po->pg_vec_len*po->pg_vec_pages*PAGE_SIZE)
1809 start = vma->vm_start;
/* Outer loop: one iteration per ring block; inner loop: one per page of the block. */
1810 for (i = 0; i < po->pg_vec_len; i++) {
1811 struct page *page = virt_to_page(po->pg_vec[i]);
1814 for (pg_num = 0; pg_num < po->pg_vec_pages; pg_num++, page++) {
1815 err = vm_insert_page(vma, start, page);
/* Account the new mapping; the counter is consulted by packet_set_ring(). */
1821 atomic_inc(&po->mapped);
1822 vma->vm_ops = &packet_mmap_ops;
/*
 * Socket operations table for the "_spkt" flavour of PF_PACKET sockets.
 * It uses the *_spkt variants of bind, getname and sendmsg, the generic
 * datagram_poll, and the sock_no_* stubs for setsockopt, getsockopt and
 * mmap, so no ring mapping is offered through this table.
 * NOTE(review): presumably selected for the legacy SOCK_PACKET socket type
 * -- confirm in packet_create().
 */
1832 static const struct proto_ops packet_ops_spkt = {
1833 .family = PF_PACKET,
1834 .owner = THIS_MODULE,
1835 .release = packet_release,
1836 .bind = packet_bind_spkt,
1837 .connect = sock_no_connect,
1838 .socketpair = sock_no_socketpair,
1839 .accept = sock_no_accept,
1840 .getname = packet_getname_spkt,
1841 .poll = datagram_poll,
1842 .ioctl = packet_ioctl,
1843 .listen = sock_no_listen,
1844 .shutdown = sock_no_shutdown,
1845 .setsockopt = sock_no_setsockopt,
1846 .getsockopt = sock_no_getsockopt,
1847 .sendmsg = packet_sendmsg_spkt,
1848 .recvmsg = packet_recvmsg,
1849 .mmap = sock_no_mmap,
1850 .sendpage = sock_no_sendpage,
/*
 * Socket operations table for PF_PACKET sockets with the full feature set:
 * unlike packet_ops_spkt it provides packet_poll, packet_setsockopt,
 * packet_getsockopt and packet_mmap.  connect, socketpair, accept, listen,
 * shutdown and sendpage are wired to the sock_no_* stubs.
 */
1853 static const struct proto_ops packet_ops = {
1854 .family = PF_PACKET,
1855 .owner = THIS_MODULE,
1856 .release = packet_release,
1857 .bind = packet_bind,
1858 .connect = sock_no_connect,
1859 .socketpair = sock_no_socketpair,
1860 .accept = sock_no_accept,
1861 .getname = packet_getname,
1862 .poll = packet_poll,
1863 .ioctl = packet_ioctl,
1864 .listen = sock_no_listen,
1865 .shutdown = sock_no_shutdown,
1866 .setsockopt = packet_setsockopt,
1867 .getsockopt = packet_getsockopt,
1868 .sendmsg = packet_sendmsg,
1869 .recvmsg = packet_recvmsg,
1870 .mmap = packet_mmap,
1871 .sendpage = sock_no_sendpage,
/*
 * Protocol family descriptor for PF_PACKET; packet_init() passes it to
 * sock_register().  Socket creation is routed to packet_create().
 */
1874 static struct net_proto_family packet_family_ops = {
1875 .family = PF_PACKET,
1876 .create = packet_create,
1877 .owner = THIS_MODULE,
/*
 * Notifier block whose callback is packet_notifier(); registered with
 * register_netdevice_notifier() in packet_init() and unregistered in
 * packet_exit().
 */
1880 static struct notifier_block packet_netdev_notifier = {
1881 .notifier_call =packet_notifier,
1884 #ifdef CONFIG_PROC_FS
/*
 * Helper for the /proc seq_file iterator: walks packet_sklist with
 * sk_for_each().  packet_seq_start() calls it with (*pos - 1).
 * NOTE(review): presumably returns the socket at index 'off' in the list,
 * or NULL when the list is shorter -- the loop body is not shown here.
 */
1885 static inline struct sock *packet_seq_idx(loff_t off)
1888 struct hlist_node *node;
1890 sk_for_each(s, node, &packet_sklist) {
/*
 * seq_file .start: takes packet_sklist_lock for reading (released again in
 * packet_seq_stop()) and returns the first element to show.  Position 0
 * yields SEQ_START_TOKEN, which packet_seq_show() turns into the header
 * line; any other position is looked up with packet_seq_idx(*pos - 1).
 */
1897 static void *packet_seq_start(struct seq_file *seq, loff_t *pos)
1899 read_lock(&packet_sklist_lock);
1900 return *pos ? packet_seq_idx(*pos - 1) : SEQ_START_TOKEN;
/*
 * seq_file .next: after the start token the iteration continues with the
 * head of packet_sklist; after a socket it continues with sk_next() of
 * that socket.
 */
1903 static void *packet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1906 return (v == SEQ_START_TOKEN)
1907 ? sk_head(&packet_sklist)
1908 : sk_next((struct sock*)v) ;
/* seq_file .stop: drops the read lock taken in packet_seq_start(). */
1911 static void packet_seq_stop(struct seq_file *seq, void *v)
1913 read_unlock(&packet_sklist_lock);
/*
 * seq_file .show: emits the column header when v is SEQ_START_TOKEN and
 * otherwise one formatted row for a socket.  The row's format string has
 * nine conversions matching the nine header columns; the values visible
 * here include the socket reference count (sk_refcnt) and the receive
 * memory counter (sk_rmem_alloc).
 */
1916 static int packet_seq_show(struct seq_file *seq, void *v)
1918 if (v == SEQ_START_TOKEN)
1919 seq_puts(seq, "sk RefCnt Type Proto Iface R Rmem User Inode\n");
1922 const struct packet_sock *po = pkt_sk(s);
1925 "%p %-6d %-4d %04x %-5d %1d %-6u %-6u %-6lu\n",
1927 atomic_read(&s->sk_refcnt),
1932 atomic_read(&s->sk_rmem_alloc),
/* Iterator callbacks handed to seq_open() by packet_seq_open(). */
1940 static const struct seq_operations packet_seq_ops = {
1941 .start = packet_seq_start,
1942 .next = packet_seq_next,
1943 .stop = packet_seq_stop,
1944 .show = packet_seq_show,
/*
 * open handler of packet_seq_fops: attaches the packet_seq_ops iterator to
 * the file via seq_open() and returns seq_open()'s result.
 */
1947 static int packet_seq_open(struct inode *inode, struct file *file)
1949 return seq_open(file, &packet_seq_ops);
/*
 * File operations of the "packet" proc entry that packet_init() creates
 * with proc_net_fops_create(); seeking and release use the generic
 * seq_file helpers.
 */
1952 static const struct file_operations packet_seq_fops = {
1953 .owner = THIS_MODULE,
1954 .open = packet_seq_open,
1956 .llseek = seq_lseek,
1957 .release = seq_release,
/*
 * Module exit: undoes packet_init() in reverse order -- removes the
 * "packet" proc entry, unregisters the netdevice notifier, unregisters the
 * PF_PACKET family and finally unregisters packet_proto.
 */
1962 static void __exit packet_exit(void)
1964 proc_net_remove(&init_net, "packet");
1965 unregister_netdevice_notifier(&packet_netdev_notifier);
1966 sock_unregister(PF_PACKET);
1967 proto_unregister(&packet_proto);
/*
 * Module init: registers packet_proto, then the PF_PACKET socket family,
 * the netdevice notifier and the "packet" proc entry.
 *
 * NOTE(review): only the result of proto_register() is captured (rc); the
 * return values of sock_register(), register_netdevice_notifier() and
 * proc_net_fops_create() are discarded, so a failure of any of them goes
 * unnoticed here -- confirm whether that is intended.
 */
1970 static int __init packet_init(void)
1972 int rc = proto_register(&packet_proto, 0);
1977 sock_register(&packet_family_ops);
1978 register_netdevice_notifier(&packet_netdev_notifier);
1979 proc_net_fops_create(&init_net, "packet", 0, &packet_seq_fops);
/*
 * Module glue: entry and exit points, the GPL license tag and the alias
 * declared for the PF_PACKET protocol family.
 */
1984 module_init(packet_init);
1985 module_exit(packet_exit);
1986 MODULE_LICENSE("GPL");
1987 MODULE_ALIAS_NETPROTO(PF_PACKET);