#include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * A macvtap queue is the central object of this driver, it connects * an open character device to a macvlan interface. There can be * multiple queues on one interface, which map back to queues * implemented in hardware on the underlying device. * * macvtap_proto is used to allocate queues through the sock allocation * mechanism. * * TODO: multiqueue support is currently not implemented, even though * macvtap is basically prepared for that. We will need to add this * here as well as in virtio-net and qemu to get line rate on 10gbit * adapters from a guest. */ struct macvtap_queue { struct sock sk; struct socket sock; struct macvlan_dev *vlan; struct file *file; }; static struct proto macvtap_proto = { .name = "macvtap", .owner = THIS_MODULE, .obj_size = sizeof (struct macvtap_queue), }; /* * Minor number matches netdev->ifindex, so need a potentially * large value. This also makes it possible to split the * tap functionality out again in the future by offering it * from other drivers besides macvtap. As long as every device * only has one tap, the interface numbers assure that the * device nodes are unique. */ static unsigned int macvtap_major; #define MACVTAP_NUM_DEVS 65536 static struct class *macvtap_class; static struct cdev macvtap_cdev; /* * RCU usage: * The macvtap_queue is referenced both from the chardev struct file * and from the struct macvlan_dev using rcu_read_lock. * * We never actually update the contents of a macvtap_queue atomically * with RCU but it is used for race-free destruction of a queue when * either the file or the macvlan_dev goes away. Pointers back to * the dev and the file are implicitly valid as long as the queue * exists. * * The callbacks from macvlan are always done with rcu_read_lock held * already. For calls from file_operations, we use the rcu_read_lock_bh * to get a reference count on the socket and the device. * * When destroying a queue, we remove the pointers from the file and * from the dev and then synchronize_rcu to make sure no thread is * still using the queue. There may still be references to the struct * sock inside of the queue from outbound SKBs, but these never * reference back to the file or the dev. The data structure is freed * through __sk_free when both our references and any pending SKBs * are gone. * * macvtap_lock is only used to prevent multiple concurrent open() * calls to assign a new vlan->tap pointer. It could be moved into * the macvlan_dev itself but is extremely rarely used. */ static DEFINE_SPINLOCK(macvtap_lock); /* * Choose the next free queue, for now there is only one */ static int macvtap_set_queue(struct net_device *dev, struct file *file, struct macvtap_queue *q) { struct macvlan_dev *vlan = netdev_priv(dev); int err = -EBUSY; spin_lock(&macvtap_lock); if (rcu_dereference(vlan->tap)) goto out; err = 0; q->vlan = vlan; rcu_assign_pointer(vlan->tap, q); q->file = file; rcu_assign_pointer(file->private_data, q); out: spin_unlock(&macvtap_lock); return err; } /* * We must destroy each queue exactly once, when either * the netdev or the file go away. * * Using the spinlock makes sure that we don't get * to the queue again after destroying it. * * synchronize_rcu serializes with the packet flow * that uses rcu_read_lock. */ static void macvtap_del_queue(struct macvtap_queue **qp) { struct macvtap_queue *q; spin_lock(&macvtap_lock); q = rcu_dereference(*qp); if (!q) { spin_unlock(&macvtap_lock); return; } rcu_assign_pointer(q->vlan->tap, NULL); rcu_assign_pointer(q->file->private_data, NULL); spin_unlock(&macvtap_lock); synchronize_rcu(); sock_put(&q->sk); } /* * Since we only support one queue, just dereference the pointer. */ static struct macvtap_queue *macvtap_get_queue(struct net_device *dev, struct sk_buff *skb) { struct macvlan_dev *vlan = netdev_priv(dev); return rcu_dereference(vlan->tap); } static void macvtap_del_queues(struct net_device *dev) { struct macvlan_dev *vlan = netdev_priv(dev); macvtap_del_queue(&vlan->tap); } static inline struct macvtap_queue *macvtap_file_get_queue(struct file *file) { struct macvtap_queue *q; rcu_read_lock_bh(); q = rcu_dereference(file->private_data); if (q) { sock_hold(&q->sk); dev_hold(q->vlan->dev); } rcu_read_unlock_bh(); return q; } static inline void macvtap_file_put_queue(struct macvtap_queue *q) { sock_put(&q->sk); dev_put(q->vlan->dev); } /* * Forward happens for data that gets sent from one macvlan * endpoint to another one in bridge mode. We just take * the skb and put it into the receive queue. */ static int macvtap_forward(struct net_device *dev, struct sk_buff *skb) { struct macvtap_queue *q = macvtap_get_queue(dev, skb); if (!q) return -ENOLINK; skb_queue_tail(&q->sk.sk_receive_queue, skb); wake_up(q->sk.sk_sleep); return 0; } /* * Receive is for data from the external interface (lowerdev), * in case of macvtap, we can treat that the same way as * forward, which macvlan cannot. */ static int macvtap_receive(struct sk_buff *skb) { skb_push(skb, ETH_HLEN); return macvtap_forward(skb->dev, skb); } static int macvtap_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[]) { struct device *classdev; dev_t devt; int err; err = macvlan_common_newlink(src_net, dev, tb, data, macvtap_receive, macvtap_forward); if (err) goto out; devt = MKDEV(MAJOR(macvtap_major), dev->ifindex); classdev = device_create(macvtap_class, &dev->dev, devt, dev, "tap%d", dev->ifindex); if (IS_ERR(classdev)) { err = PTR_ERR(classdev); macvtap_del_queues(dev); } out: return err; } static void macvtap_dellink(struct net_device *dev, struct list_head *head) { device_destroy(macvtap_class, MKDEV(MAJOR(macvtap_major), dev->ifindex)); macvtap_del_queues(dev); macvlan_dellink(dev, head); } static struct rtnl_link_ops macvtap_link_ops __read_mostly = { .kind = "macvtap", .newlink = macvtap_newlink, .dellink = macvtap_dellink, }; static void macvtap_sock_write_space(struct sock *sk) { if (!sock_writeable(sk) || !test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags)) return; if (sk->sk_sleep && waitqueue_active(sk->sk_sleep)) wake_up_interruptible_sync(sk->sk_sleep); } static int macvtap_open(struct inode *inode, struct file *file) { struct net *net = current->nsproxy->net_ns; struct net_device *dev = dev_get_by_index(net, iminor(inode)); struct macvtap_queue *q; int err; err = -ENODEV; if (!dev) goto out; /* check if this is a macvtap device */ err = -EINVAL; if (dev->rtnl_link_ops != &macvtap_link_ops) goto out; err = -ENOMEM; q = (struct macvtap_queue *)sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &macvtap_proto); if (!q) goto out; init_waitqueue_head(&q->sock.wait); q->sock.type = SOCK_RAW; q->sock.state = SS_CONNECTED; sock_init_data(&q->sock, &q->sk); q->sk.sk_allocation = GFP_ATOMIC; /* for now */ q->sk.sk_write_space = macvtap_sock_write_space; err = macvtap_set_queue(dev, file, q); if (err) sock_put(&q->sk); out: if (dev) dev_put(dev); return err; } static int macvtap_release(struct inode *inode, struct file *file) { macvtap_del_queue((struct macvtap_queue **)&file->private_data); return 0; } static unsigned int macvtap_poll(struct file *file, poll_table * wait) { struct macvtap_queue *q = macvtap_file_get_queue(file); unsigned int mask = POLLERR; if (!q) goto out; mask = 0; poll_wait(file, &q->sock.wait, wait); if (!skb_queue_empty(&q->sk.sk_receive_queue)) mask |= POLLIN | POLLRDNORM; if (sock_writeable(&q->sk) || (!test_and_set_bit(SOCK_ASYNC_NOSPACE, &q->sock.flags) && sock_writeable(&q->sk))) mask |= POLLOUT | POLLWRNORM; macvtap_file_put_queue(q); out: return mask; } /* Get packet from user space buffer */ static ssize_t macvtap_get_user(struct macvtap_queue *q, const struct iovec *iv, size_t count, int noblock) { struct sk_buff *skb; size_t len = count; int err; if (unlikely(len < ETH_HLEN)) return -EINVAL; skb = sock_alloc_send_skb(&q->sk, NET_IP_ALIGN + len, noblock, &err); if (!skb) { macvlan_count_rx(q->vlan, 0, false, false); return err; } skb_reserve(skb, NET_IP_ALIGN); skb_put(skb, count); if (skb_copy_datagram_from_iovec(skb, 0, iv, 0, len)) { macvlan_count_rx(q->vlan, 0, false, false); kfree_skb(skb); return -EFAULT; } skb_set_network_header(skb, ETH_HLEN); macvlan_start_xmit(skb, q->vlan->dev); return count; } static ssize_t macvtap_aio_write(struct kiocb *iocb, const struct iovec *iv, unsigned long count, loff_t pos) { struct file *file = iocb->ki_filp; ssize_t result = -ENOLINK; struct macvtap_queue *q = macvtap_file_get_queue(file); if (!q) goto out; result = macvtap_get_user(q, iv, iov_length(iv, count), file->f_flags & O_NONBLOCK); macvtap_file_put_queue(q); out: return result; } /* Put packet to the user space buffer */ static ssize_t macvtap_put_user(struct macvtap_queue *q, const struct sk_buff *skb, const struct iovec *iv, int len) { struct macvlan_dev *vlan = q->vlan; int ret; len = min_t(int, skb->len, len); ret = skb_copy_datagram_const_iovec(skb, 0, iv, 0, len); macvlan_count_rx(vlan, len, ret == 0, 0); return ret ? ret : len; } static ssize_t macvtap_aio_read(struct kiocb *iocb, const struct iovec *iv, unsigned long count, loff_t pos) { struct file *file = iocb->ki_filp; struct macvtap_queue *q = macvtap_file_get_queue(file); DECLARE_WAITQUEUE(wait, current); struct sk_buff *skb; ssize_t len, ret = 0; if (!q) return -ENOLINK; len = iov_length(iv, count); if (len < 0) { ret = -EINVAL; goto out; } add_wait_queue(q->sk.sk_sleep, &wait); while (len) { current->state = TASK_INTERRUPTIBLE; /* Read frames from the queue */ skb = skb_dequeue(&q->sk.sk_receive_queue); if (!skb) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; break; } if (signal_pending(current)) { ret = -ERESTARTSYS; break; } /* Nothing to read, let's sleep */ schedule(); continue; } ret = macvtap_put_user(q, skb, iv, len); kfree_skb(skb); break; } current->state = TASK_RUNNING; remove_wait_queue(q->sk.sk_sleep, &wait); out: macvtap_file_put_queue(q); return ret; } /* * provide compatibility with generic tun/tap interface */ static long macvtap_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct macvtap_queue *q; void __user *argp = (void __user *)arg; struct ifreq __user *ifr = argp; unsigned int __user *up = argp; unsigned int u; char devname[IFNAMSIZ]; switch (cmd) { case TUNSETIFF: /* ignore the name, just look at flags */ if (get_user(u, &ifr->ifr_flags)) return -EFAULT; if (u != (IFF_TAP | IFF_NO_PI)) return -EINVAL; return 0; case TUNGETIFF: q = macvtap_file_get_queue(file); if (!q) return -ENOLINK; memcpy(devname, q->vlan->dev->name, sizeof(devname)); macvtap_file_put_queue(q); if (copy_to_user(&ifr->ifr_name, q->vlan->dev->name, IFNAMSIZ) || put_user((TUN_TAP_DEV | TUN_NO_PI), &ifr->ifr_flags)) return -EFAULT; return 0; case TUNGETFEATURES: if (put_user((IFF_TAP | IFF_NO_PI), up)) return -EFAULT; return 0; case TUNSETSNDBUF: if (get_user(u, up)) return -EFAULT; q = macvtap_file_get_queue(file); if (!q) return -ENOLINK; q->sk.sk_sndbuf = u; macvtap_file_put_queue(q); return 0; case TUNSETOFFLOAD: /* let the user check for future flags */ if (arg & ~(TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN | TUN_F_UFO)) return -EINVAL; /* TODO: add support for these, so far we don't support any offload */ if (arg & (TUN_F_CSUM | TUN_F_TSO4 | TUN_F_TSO6 | TUN_F_TSO_ECN | TUN_F_UFO)) return -EINVAL; return 0; default: return -EINVAL; } } #ifdef CONFIG_COMPAT static long macvtap_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { return macvtap_ioctl(file, cmd, (unsigned long)compat_ptr(arg)); } #endif static const struct file_operations macvtap_fops = { .owner = THIS_MODULE, .open = macvtap_open, .release = macvtap_release, .aio_read = macvtap_aio_read, .aio_write = macvtap_aio_write, .poll = macvtap_poll, .llseek = no_llseek, .unlocked_ioctl = macvtap_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = macvtap_compat_ioctl, #endif }; static int macvtap_init(void) { int err; err = alloc_chrdev_region(&macvtap_major, 0, MACVTAP_NUM_DEVS, "macvtap"); if (err) goto out1; cdev_init(&macvtap_cdev, &macvtap_fops); err = cdev_add(&macvtap_cdev, macvtap_major, MACVTAP_NUM_DEVS); if (err) goto out2; macvtap_class = class_create(THIS_MODULE, "macvtap"); if (IS_ERR(macvtap_class)) { err = PTR_ERR(macvtap_class); goto out3; } err = macvlan_link_register(&macvtap_link_ops); if (err) goto out4; return 0; out4: class_unregister(macvtap_class); out3: cdev_del(&macvtap_cdev); out2: unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); out1: return err; } module_init(macvtap_init); static void macvtap_exit(void) { rtnl_link_unregister(&macvtap_link_ops); class_unregister(macvtap_class); cdev_del(&macvtap_cdev); unregister_chrdev_region(macvtap_major, MACVTAP_NUM_DEVS); } module_exit(macvtap_exit); MODULE_ALIAS_RTNL_LINK("macvtap"); MODULE_AUTHOR("Arnd Bergmann "); MODULE_LICENSE("GPL");