*
* PF_INET protocol family socket handler.
*
- * Version: $Id: af_inet.c,v 1.137 2002/02/01 22:01:03 davem Exp $
- *
* Authors: Ross Biro
* Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
* Florian La Roche, <flla@stud.uni-sb.de>
#include <linux/poll.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
+#include <linux/slab.h>
#include <asm/uaccess.h>
#include <asm/system.h>
#include <linux/igmp.h>
#include <linux/inetdevice.h>
#include <linux/netdevice.h>
+#include <net/checksum.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <net/arp.h>
#include <net/ipip.h>
#include <net/inet_common.h>
#include <net/xfrm.h>
+#include <net/net_namespace.h>
#ifdef CONFIG_IP_MROUTE
#include <linux/mroute.h>
#endif
-DEFINE_SNMP_STAT(struct linux_mib, net_statistics) __read_mostly;
-
-extern void ip_mc_drop_socket(struct sock *sk);
/* The inetsw table contains everything that inet_create needs to
* build a new socket.
static DEFINE_SPINLOCK(inetsw_lock);
struct ipv4_config ipv4_config;
-
EXPORT_SYMBOL(ipv4_config);
/* New destruction routine */
__skb_queue_purge(&sk->sk_receive_queue);
__skb_queue_purge(&sk->sk_error_queue);
+ sk_mem_reclaim(sk);
+
if (sk->sk_type == SOCK_STREAM && sk->sk_state != TCP_CLOSE) {
- printk("Attempt to release TCP socket in state %d %p\n",
+ pr_err("Attempt to release TCP socket in state %d %p\n",
sk->sk_state, sk);
return;
}
if (!sock_flag(sk, SOCK_DEAD)) {
- printk("Attempt to release alive inet socket %p\n", sk);
+ pr_err("Attempt to release alive inet socket %p\n", sk);
return;
}
- BUG_TRAP(!atomic_read(&sk->sk_rmem_alloc));
- BUG_TRAP(!atomic_read(&sk->sk_wmem_alloc));
- BUG_TRAP(!sk->sk_wmem_queued);
- BUG_TRAP(!sk->sk_forward_alloc);
+ WARN_ON(atomic_read(&sk->sk_rmem_alloc));
+ WARN_ON(atomic_read(&sk->sk_wmem_alloc));
+ WARN_ON(sk->sk_wmem_queued);
+ WARN_ON(sk->sk_forward_alloc);
kfree(inet->opt);
- dst_release(sk->sk_dst_cache);
+ dst_release(rcu_dereference_check(sk->sk_dst_cache, 1));
sk_refcnt_debug_dec(sk);
}
+EXPORT_SYMBOL(inet_sock_destruct);
/*
* The routines beyond this point handle the behaviour of an AF_INET
/* We may need to bind the socket. */
lock_sock(sk);
inet = inet_sk(sk);
- if (!inet->num) {
+ if (!inet->inet_num) {
if (sk->sk_prot->get_port(sk, 0)) {
release_sock(sk);
return -EAGAIN;
}
- inet->sport = htons(inet->num);
+ inet->inet_sport = htons(inet->inet_num);
}
release_sock(sk);
return 0;
release_sock(sk);
return err;
}
+EXPORT_SYMBOL(inet_listen);
u32 inet_ehash_secret __read_mostly;
EXPORT_SYMBOL(inet_ehash_secret);
}
EXPORT_SYMBOL(build_ehash_secret);
+static inline int inet_netns_ok(struct net *net, int protocol)
+{
+ int hash;
+ const struct net_protocol *ipprot;
+
+ if (net_eq(net, &init_net))
+ return 1;
+
+ hash = protocol & (MAX_INET_PROTOS - 1);
+ ipprot = rcu_dereference(inet_protos[hash]);
+
+ if (ipprot == NULL)
+ /* raw IP is OK */
+ return 1;
+ return ipprot->netns_ok;
+}
+
/*
* Create an inet socket.
*/
-static int inet_create(struct net *net, struct socket *sock, int protocol)
+static int inet_create(struct net *net, struct socket *sock, int protocol,
+ int kern)
{
struct sock *sk;
- struct list_head *p;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
int try_loading_module = 0;
int err;
- if (net != &init_net)
- return -EAFNOSUPPORT;
-
- if (sock->type != SOCK_RAW &&
- sock->type != SOCK_DGRAM &&
- !inet_ehash_secret)
- build_ehash_secret();
+ if (unlikely(!inet_ehash_secret))
+ if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
+ build_ehash_secret();
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
- answer = NULL;
lookup_protocol:
err = -ESOCKTNOSUPPORT;
rcu_read_lock();
- list_for_each_rcu(p, &inetsw[sock->type]) {
- answer = list_entry(p, struct inet_protosw, list);
+ list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
+ err = 0;
/* Check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != IPPROTO_IP)
break;
}
err = -EPROTONOSUPPORT;
- answer = NULL;
}
- if (unlikely(answer == NULL)) {
+ if (unlikely(err)) {
if (try_loading_module < 2) {
rcu_read_unlock();
/*
}
err = -EPERM;
- if (answer->capability > 0 && !capable(answer->capability))
+ if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
+ goto out_rcu_unlock;
+
+ err = -EAFNOSUPPORT;
+ if (!inet_netns_ok(net, protocol))
goto out_rcu_unlock;
sock->ops = answer->ops;
answer_flags = answer->flags;
rcu_read_unlock();
- BUG_TRAP(answer_prot->slab != NULL);
+ WARN_ON(answer_prot->slab == NULL);
err = -ENOBUFS;
sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot);
inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;
if (SOCK_RAW == sock->type) {
- inet->num = protocol;
+ inet->inet_num = protocol;
if (IPPROTO_RAW == protocol)
inet->hdrincl = 1;
}
else
inet->pmtudisc = IP_PMTUDISC_WANT;
- inet->id = 0;
+ inet->inet_id = 0;
sock_init_data(sock, sk);
sk->sk_destruct = inet_sock_destruct;
- sk->sk_family = PF_INET;
sk->sk_protocol = protocol;
sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;
inet->uc_ttl = -1;
inet->mc_loop = 1;
inet->mc_ttl = 1;
+ inet->mc_all = 1;
inet->mc_index = 0;
inet->mc_list = NULL;
sk_refcnt_debug_inc(sk);
- if (inet->num) {
+ if (inet->inet_num) {
/* It assumes that any protocol which allows
* the user to assign a number at socket
* creation time automatically
* shares.
*/
- inet->sport = htons(inet->num);
+ inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
sk->sk_prot->hash(sk);
}
if (sk) {
long timeout;
+ sock_rps_reset_flow(sk);
+
/* Applications forget to leave groups before exiting */
ip_mc_drop_socket(sk);
}
return 0;
}
+EXPORT_SYMBOL(inet_release);
/* It is off by default, see below. */
int sysctl_ip_nonlocal_bind __read_mostly;
+EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
if (addr_len < sizeof(struct sockaddr_in))
goto out;
- chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
+ chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
/* Not specified by any standard per-se, however it breaks too
* many applications when removed. It is unfortunate since
*/
err = -EADDRNOTAVAIL;
if (!sysctl_ip_nonlocal_bind &&
- !inet->freebind &&
- addr->sin_addr.s_addr != INADDR_ANY &&
+ !(inet->freebind || inet->transparent) &&
+ addr->sin_addr.s_addr != htonl(INADDR_ANY) &&
chk_addr_ret != RTN_LOCAL &&
chk_addr_ret != RTN_MULTICAST &&
chk_addr_ret != RTN_BROADCAST)
/* Check these errors (active socket, double bind). */
err = -EINVAL;
- if (sk->sk_state != TCP_CLOSE || inet->num)
+ if (sk->sk_state != TCP_CLOSE || inet->inet_num)
goto out_release_sock;
- inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr;
+ inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
- inet->saddr = 0; /* Use device */
+ inet->inet_saddr = 0; /* Use device */
/* Make sure we are allowed to bind here. */
if (sk->sk_prot->get_port(sk, snum)) {
- inet->saddr = inet->rcv_saddr = 0;
+ inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -EADDRINUSE;
goto out_release_sock;
}
- if (inet->rcv_saddr)
+ if (inet->inet_rcv_saddr)
sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
if (snum)
sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
- inet->sport = htons(inet->num);
- inet->daddr = 0;
- inet->dport = 0;
+ inet->inet_sport = htons(inet->inet_num);
+ inet->inet_daddr = 0;
+ inet->inet_dport = 0;
sk_dst_reset(sk);
err = 0;
out_release_sock:
out:
return err;
}
+EXPORT_SYMBOL(inet_bind);
int inet_dgram_connect(struct socket *sock, struct sockaddr * uaddr,
int addr_len, int flags)
{
struct sock *sk = sock->sk;
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
if (uaddr->sa_family == AF_UNSPEC)
return sk->sk_prot->disconnect(sk, flags);
- if (!inet_sk(sk)->num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
return sk->sk_prot->connect(sk, (struct sockaddr *)uaddr, addr_len);
}
+EXPORT_SYMBOL(inet_dgram_connect);
static long inet_wait_for_connect(struct sock *sk, long timeo)
{
DEFINE_WAIT(wait);
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
/* Basic assumption: if someone sets sk->sk_err, he _must_
* change state of the socket from TCP_SYN_*.
lock_sock(sk);
if (signal_pending(current) || !timeo)
break;
- prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
}
- finish_wait(sk->sk_sleep, &wait);
+ finish_wait(sk_sleep(sk), &wait);
return timeo;
}
int err;
long timeo;
+ if (addr_len < sizeof(uaddr->sa_family))
+ return -EINVAL;
+
lock_sock(sk);
if (uaddr->sa_family == AF_UNSPEC) {
sock->state = SS_DISCONNECTING;
goto out;
}
+EXPORT_SYMBOL(inet_stream_connect);
/*
* Accept a pending connection. The TCP layer now gives BSD semantics.
lock_sock(sk2);
- BUG_TRAP((1 << sk2->sk_state) &
- (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
+ WARN_ON(!((1 << sk2->sk_state) &
+ (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE)));
sock_graft(sk2, newsock);
do_err:
return err;
}
+EXPORT_SYMBOL(inet_accept);
/*
{
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
- struct sockaddr_in *sin = (struct sockaddr_in *)uaddr;
+ DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
sin->sin_family = AF_INET;
if (peer) {
- if (!inet->dport ||
+ if (!inet->inet_dport ||
(((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_SYN_SENT)) &&
peer == 1))
return -ENOTCONN;
- sin->sin_port = inet->dport;
- sin->sin_addr.s_addr = inet->daddr;
+ sin->sin_port = inet->inet_dport;
+ sin->sin_addr.s_addr = inet->inet_daddr;
} else {
- __be32 addr = inet->rcv_saddr;
+ __be32 addr = inet->inet_rcv_saddr;
if (!addr)
- addr = inet->saddr;
- sin->sin_port = inet->sport;
+ addr = inet->inet_saddr;
+ sin->sin_port = inet->inet_sport;
sin->sin_addr.s_addr = addr;
}
memset(sin->sin_zero, 0, sizeof(sin->sin_zero));
*uaddr_len = sizeof(*sin);
return 0;
}
+EXPORT_SYMBOL(inet_getname);
int inet_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
size_t size)
{
struct sock *sk = sock->sk;
+ sock_rps_record_flow(sk);
+
/* We may need to bind the socket. */
- if (!inet_sk(sk)->num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
return sk->sk_prot->sendmsg(iocb, sk, msg, size);
}
+EXPORT_SYMBOL(inet_sendmsg);
-
-static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
+static ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset,
+ size_t size, int flags)
{
struct sock *sk = sock->sk;
+ sock_rps_record_flow(sk);
+
/* We may need to bind the socket. */
- if (!inet_sk(sk)->num && inet_autobind(sk))
+ if (!inet_sk(sk)->inet_num && inet_autobind(sk))
return -EAGAIN;
if (sk->sk_prot->sendpage)
return sock_no_sendpage(sock, page, offset, size, flags);
}
+int inet_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg,
+ size_t size, int flags)
+{
+ struct sock *sk = sock->sk;
+ int addr_len = 0;
+ int err;
+
+ sock_rps_record_flow(sk);
+
+ err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
+ flags & ~MSG_DONTWAIT, &addr_len);
+ if (err >= 0)
+ msg->msg_namelen = addr_len;
+ return err;
+}
+EXPORT_SYMBOL(inet_recvmsg);
int inet_shutdown(struct socket *sock, int how)
{
release_sock(sk);
return err;
}
+EXPORT_SYMBOL(inet_shutdown);
/*
* ioctl() calls you can issue on an INET socket. Most of these are
{
struct sock *sk = sock->sk;
int err = 0;
+ struct net *net = sock_net(sk);
switch (cmd) {
- case SIOCGSTAMP:
- err = sock_get_timestamp(sk, (struct timeval __user *)arg);
- break;
- case SIOCGSTAMPNS:
- err = sock_get_timestampns(sk, (struct timespec __user *)arg);
- break;
- case SIOCADDRT:
- case SIOCDELRT:
- case SIOCRTMSG:
- err = ip_rt_ioctl(cmd, (void __user *)arg);
- break;
- case SIOCDARP:
- case SIOCGARP:
- case SIOCSARP:
- err = arp_ioctl(sk->sk_net, cmd, (void __user *)arg);
- break;
- case SIOCGIFADDR:
- case SIOCSIFADDR:
- case SIOCGIFBRDADDR:
- case SIOCSIFBRDADDR:
- case SIOCGIFNETMASK:
- case SIOCSIFNETMASK:
- case SIOCGIFDSTADDR:
- case SIOCSIFDSTADDR:
- case SIOCSIFPFLAGS:
- case SIOCGIFPFLAGS:
- case SIOCSIFFLAGS:
- err = devinet_ioctl(cmd, (void __user *)arg);
- break;
- default:
- if (sk->sk_prot->ioctl)
- err = sk->sk_prot->ioctl(sk, cmd, arg);
- else
- err = -ENOIOCTLCMD;
- break;
+ case SIOCGSTAMP:
+ err = sock_get_timestamp(sk, (struct timeval __user *)arg);
+ break;
+ case SIOCGSTAMPNS:
+ err = sock_get_timestampns(sk, (struct timespec __user *)arg);
+ break;
+ case SIOCADDRT:
+ case SIOCDELRT:
+ case SIOCRTMSG:
+ err = ip_rt_ioctl(net, cmd, (void __user *)arg);
+ break;
+ case SIOCDARP:
+ case SIOCGARP:
+ case SIOCSARP:
+ err = arp_ioctl(net, cmd, (void __user *)arg);
+ break;
+ case SIOCGIFADDR:
+ case SIOCSIFADDR:
+ case SIOCGIFBRDADDR:
+ case SIOCSIFBRDADDR:
+ case SIOCGIFNETMASK:
+ case SIOCSIFNETMASK:
+ case SIOCGIFDSTADDR:
+ case SIOCSIFDSTADDR:
+ case SIOCSIFPFLAGS:
+ case SIOCGIFPFLAGS:
+ case SIOCSIFFLAGS:
+ err = devinet_ioctl(net, cmd, (void __user *)arg);
+ break;
+ default:
+ if (sk->sk_prot->ioctl)
+ err = sk->sk_prot->ioctl(sk, cmd, arg);
+ else
+ err = -ENOIOCTLCMD;
+ break;
}
return err;
}
+EXPORT_SYMBOL(inet_ioctl);
const struct proto_ops inet_stream_ops = {
.family = PF_INET,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = tcp_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = tcp_sendpage,
.splice_read = tcp_splice_read,
.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
+EXPORT_SYMBOL(inet_stream_ops);
const struct proto_ops inet_dgram_ops = {
.family = PF_INET,
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
.compat_getsockopt = compat_sock_common_getsockopt,
#endif
};
+EXPORT_SYMBOL(inet_dgram_ops);
/*
* For SOCK_RAW sockets; should be the same as inet_dgram_ops but without
.setsockopt = sock_common_setsockopt,
.getsockopt = sock_common_getsockopt,
.sendmsg = inet_sendmsg,
- .recvmsg = sock_common_recvmsg,
+ .recvmsg = inet_recvmsg,
.mmap = sock_no_mmap,
.sendpage = inet_sendpage,
#ifdef CONFIG_COMPAT
#endif
};
-static struct net_proto_family inet_family_ops = {
+static const struct net_proto_family inet_family_ops = {
.family = PF_INET,
.create = inet_create,
.owner = THIS_MODULE,
.protocol = IPPROTO_TCP,
.prot = &tcp_prot,
.ops = &inet_stream_ops,
- .capability = -1,
.no_check = 0,
.flags = INET_PROTOSW_PERMANENT |
INET_PROTOSW_ICSK,
.protocol = IPPROTO_UDP,
.prot = &udp_prot,
.ops = &inet_dgram_ops,
- .capability = -1,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_PERMANENT,
},
.protocol = IPPROTO_IP, /* wild card */
.prot = &raw_prot,
.ops = &inet_sockraw_ops,
- .capability = CAP_NET_RAW,
.no_check = UDP_CSUM_DEFAULT,
.flags = INET_PROTOSW_REUSE,
}
out:
spin_unlock_bh(&inetsw_lock);
- synchronize_net();
-
return;
out_permanent:
p->type);
goto out;
}
+EXPORT_SYMBOL(inet_register_protosw);
void inet_unregister_protosw(struct inet_protosw *p)
{
synchronize_net();
}
}
+EXPORT_SYMBOL(inet_unregister_protosw);
/*
* Shall we try to damage output packets if routing dev changes?
struct inet_sock *inet = inet_sk(sk);
int err;
struct rtable *rt;
- __be32 old_saddr = inet->saddr;
+ __be32 old_saddr = inet->inet_saddr;
__be32 new_saddr;
- __be32 daddr = inet->daddr;
+ __be32 daddr = inet->inet_daddr;
if (inet->opt && inet->opt->srr)
daddr = inet->opt->faddr;
RT_CONN_FLAGS(sk),
sk->sk_bound_dev_if,
sk->sk_protocol,
- inet->sport, inet->dport, sk, 0);
+ inet->inet_sport, inet->inet_dport, sk, 0);
if (err)
return err;
return 0;
if (sysctl_ip_dynaddr > 1) {
- printk(KERN_INFO "%s(): shifting inet->"
- "saddr from %d.%d.%d.%d to %d.%d.%d.%d\n",
- __FUNCTION__,
- NIPQUAD(old_saddr),
- NIPQUAD(new_saddr));
+ printk(KERN_INFO "%s(): shifting inet->saddr from %pI4 to %pI4\n",
+ __func__, &old_saddr, &new_saddr);
}
- inet->saddr = inet->rcv_saddr = new_saddr;
+ inet->inet_saddr = inet->inet_rcv_saddr = new_saddr;
/*
* XXX The only one ugly spot where we need to
return 0;
/* Reroute. */
- daddr = inet->daddr;
+ daddr = inet->inet_daddr;
if (inet->opt && inet->opt->srr)
daddr = inet->opt->faddr;
{
struct flowi fl = {
.oif = sk->sk_bound_dev_if,
+ .mark = sk->sk_mark,
.nl_u = {
.ip4_u = {
.daddr = daddr,
- .saddr = inet->saddr,
+ .saddr = inet->inet_saddr,
.tos = RT_CONN_FLAGS(sk),
},
},
.proto = sk->sk_protocol,
+ .flags = inet_sk_flowi_flags(sk),
.uli_u = {
.ports = {
- .sport = inet->sport,
- .dport = inet->dport,
+ .sport = inet->inet_sport,
+ .dport = inet->inet_dport,
},
},
};
security_sk_classify_flow(sk, &fl);
- err = ip_route_output_flow(&rt, &fl, sk, 0);
+ err = ip_route_output_flow(sock_net(sk), &rt, &fl, sk, 0);
}
if (!err)
sk_setup_caps(sk, &rt->u.dst);
return err;
}
-
EXPORT_SYMBOL(inet_sk_rebuild_header);
static int inet_gso_send_check(struct sk_buff *skb)
{
struct iphdr *iph;
- struct net_protocol *ops;
+ const struct net_protocol *ops;
int proto;
int ihl;
int err = -EINVAL;
{
struct sk_buff *segs = ERR_PTR(-EINVAL);
struct iphdr *iph;
- struct net_protocol *ops;
+ const struct net_protocol *ops;
int proto;
int ihl;
int id;
+ unsigned int offset = 0;
if (!(features & NETIF_F_V4_CSUM))
features &= ~NETIF_F_SG;
segs = ops->gso_segment(skb, features);
rcu_read_unlock();
- if (!segs || unlikely(IS_ERR(segs)))
+ if (!segs || IS_ERR(segs))
goto out;
skb = segs;
do {
iph = ip_hdr(skb);
- iph->id = htons(id++);
+ if (proto == IPPROTO_UDP) {
+ iph->id = htons(id);
+ iph->frag_off = htons(offset >> 3);
+ if (skb->next != NULL)
+ iph->frag_off |= htons(IP_MF);
+ offset += (skb->len - skb->mac_len - iph->ihl * 4);
+ } else
+ iph->id = htons(id++);
iph->tot_len = htons(skb->len - skb->mac_len);
iph->check = 0;
iph->check = ip_fast_csum(skb_network_header(skb), iph->ihl);
return segs;
}
-unsigned long snmp_fold_field(void *mib[], int offt)
+static struct sk_buff **inet_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ const struct net_protocol *ops;
+ struct sk_buff **pp = NULL;
+ struct sk_buff *p;
+ struct iphdr *iph;
+ unsigned int hlen;
+ unsigned int off;
+ unsigned int id;
+ int flush = 1;
+ int proto;
+
+ off = skb_gro_offset(skb);
+ hlen = off + sizeof(*iph);
+ iph = skb_gro_header_fast(skb, off);
+ if (skb_gro_header_hard(skb, hlen)) {
+ iph = skb_gro_header_slow(skb, hlen, off);
+ if (unlikely(!iph))
+ goto out;
+ }
+
+ proto = iph->protocol & (MAX_INET_PROTOS - 1);
+
+ rcu_read_lock();
+ ops = rcu_dereference(inet_protos[proto]);
+ if (!ops || !ops->gro_receive)
+ goto out_unlock;
+
+ if (*(u8 *)iph != 0x45)
+ goto out_unlock;
+
+ if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+ goto out_unlock;
+
+ id = ntohl(*(__be32 *)&iph->id);
+ flush = (u16)((ntohl(*(__be32 *)iph) ^ skb_gro_len(skb)) | (id ^ IP_DF));
+ id >>= 16;
+
+ for (p = *head; p; p = p->next) {
+ struct iphdr *iph2;
+
+ if (!NAPI_GRO_CB(p)->same_flow)
+ continue;
+
+ iph2 = ip_hdr(p);
+
+ if ((iph->protocol ^ iph2->protocol) |
+ (iph->tos ^ iph2->tos) |
+ ((__force u32)iph->saddr ^ (__force u32)iph2->saddr) |
+ ((__force u32)iph->daddr ^ (__force u32)iph2->daddr)) {
+ NAPI_GRO_CB(p)->same_flow = 0;
+ continue;
+ }
+
+ /* All fields must match except length and checksum. */
+ NAPI_GRO_CB(p)->flush |=
+ (iph->ttl ^ iph2->ttl) |
+ ((u16)(ntohs(iph2->id) + NAPI_GRO_CB(p)->count) ^ id);
+
+ NAPI_GRO_CB(p)->flush |= flush;
+ }
+
+ NAPI_GRO_CB(skb)->flush |= flush;
+ skb_gro_pull(skb, sizeof(*iph));
+ skb_set_transport_header(skb, skb_gro_offset(skb));
+
+ pp = ops->gro_receive(head, skb);
+
+out_unlock:
+ rcu_read_unlock();
+
+out:
+ NAPI_GRO_CB(skb)->flush |= flush;
+
+ return pp;
+}
+
+static int inet_gro_complete(struct sk_buff *skb)
+{
+ const struct net_protocol *ops;
+ struct iphdr *iph = ip_hdr(skb);
+ int proto = iph->protocol & (MAX_INET_PROTOS - 1);
+ int err = -ENOSYS;
+ __be16 newlen = htons(skb->len - skb_network_offset(skb));
+
+ csum_replace2(&iph->check, iph->tot_len, newlen);
+ iph->tot_len = newlen;
+
+ rcu_read_lock();
+ ops = rcu_dereference(inet_protos[proto]);
+ if (WARN_ON(!ops || !ops->gro_complete))
+ goto out_unlock;
+
+ err = ops->gro_complete(skb);
+
+out_unlock:
+ rcu_read_unlock();
+
+ return err;
+}
+
+int inet_ctl_sock_create(struct sock **sk, unsigned short family,
+ unsigned short type, unsigned char protocol,
+ struct net *net)
+{
+ struct socket *sock;
+ int rc = sock_create_kern(family, type, protocol, &sock);
+
+ if (rc == 0) {
+ *sk = sock->sk;
+ (*sk)->sk_allocation = GFP_ATOMIC;
+ /*
+ * Unhash it so that IP input processing does not even see it,
+ * we do not wish this socket to see incoming packets.
+ */
+ (*sk)->sk_prot->unhash(*sk);
+
+ sk_change_net(*sk, net);
+ }
+ return rc;
+}
+EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
+
+unsigned long snmp_fold_field(void __percpu *mib[], int offt)
{
unsigned long res = 0;
int i;
}
EXPORT_SYMBOL_GPL(snmp_fold_field);
-int snmp_mib_init(void *ptr[2], size_t mibsize)
+int snmp_mib_init(void __percpu *ptr[2], size_t mibsize)
{
BUG_ON(ptr == NULL);
- ptr[0] = __alloc_percpu(mibsize);
+ ptr[0] = __alloc_percpu(mibsize, __alignof__(unsigned long));
if (!ptr[0])
goto err0;
- ptr[1] = __alloc_percpu(mibsize);
+ ptr[1] = __alloc_percpu(mibsize, __alignof__(unsigned long));
if (!ptr[1])
goto err1;
return 0;
}
EXPORT_SYMBOL_GPL(snmp_mib_init);
-void snmp_mib_free(void *ptr[2])
+void snmp_mib_free(void __percpu *ptr[2])
{
BUG_ON(ptr == NULL);
free_percpu(ptr[0]);
EXPORT_SYMBOL_GPL(snmp_mib_free);
#ifdef CONFIG_IP_MULTICAST
-static struct net_protocol igmp_protocol = {
+static const struct net_protocol igmp_protocol = {
.handler = igmp_rcv,
+ .netns_ok = 1,
};
#endif
-static struct net_protocol tcp_protocol = {
+static const struct net_protocol tcp_protocol = {
.handler = tcp_v4_rcv,
.err_handler = tcp_v4_err,
.gso_send_check = tcp_v4_gso_send_check,
.gso_segment = tcp_tso_segment,
+ .gro_receive = tcp4_gro_receive,
+ .gro_complete = tcp4_gro_complete,
.no_policy = 1,
+ .netns_ok = 1,
};
-static struct net_protocol udp_protocol = {
+static const struct net_protocol udp_protocol = {
.handler = udp_rcv,
.err_handler = udp_err,
+ .gso_send_check = udp4_ufo_send_check,
+ .gso_segment = udp4_ufo_fragment,
.no_policy = 1,
+ .netns_ok = 1,
};
-static struct net_protocol icmp_protocol = {
+static const struct net_protocol icmp_protocol = {
.handler = icmp_rcv,
.no_policy = 1,
+ .netns_ok = 1,
};
-static int __init init_ipv4_mibs(void)
+static __net_init int ipv4_mib_init_net(struct net *net)
{
- if (snmp_mib_init((void **)net_statistics,
- sizeof(struct linux_mib)) < 0)
- goto err_net_mib;
- if (snmp_mib_init((void **)ip_statistics,
- sizeof(struct ipstats_mib)) < 0)
- goto err_ip_mib;
- if (snmp_mib_init((void **)icmp_statistics,
- sizeof(struct icmp_mib)) < 0)
- goto err_icmp_mib;
- if (snmp_mib_init((void **)icmpmsg_statistics,
- sizeof(struct icmpmsg_mib)) < 0)
- goto err_icmpmsg_mib;
- if (snmp_mib_init((void **)tcp_statistics,
+ if (snmp_mib_init((void __percpu **)net->mib.tcp_statistics,
sizeof(struct tcp_mib)) < 0)
goto err_tcp_mib;
- if (snmp_mib_init((void **)udp_statistics,
+ if (snmp_mib_init((void __percpu **)net->mib.ip_statistics,
+ sizeof(struct ipstats_mib)) < 0)
+ goto err_ip_mib;
+ if (snmp_mib_init((void __percpu **)net->mib.net_statistics,
+ sizeof(struct linux_mib)) < 0)
+ goto err_net_mib;
+ if (snmp_mib_init((void __percpu **)net->mib.udp_statistics,
sizeof(struct udp_mib)) < 0)
goto err_udp_mib;
- if (snmp_mib_init((void **)udplite_statistics,
+ if (snmp_mib_init((void __percpu **)net->mib.udplite_statistics,
sizeof(struct udp_mib)) < 0)
goto err_udplite_mib;
+ if (snmp_mib_init((void __percpu **)net->mib.icmp_statistics,
+ sizeof(struct icmp_mib)) < 0)
+ goto err_icmp_mib;
+ if (snmp_mib_init((void __percpu **)net->mib.icmpmsg_statistics,
+ sizeof(struct icmpmsg_mib)) < 0)
+ goto err_icmpmsg_mib;
- tcp_mib_init();
-
+ tcp_mib_init(net);
return 0;
-err_udplite_mib:
- snmp_mib_free((void **)udp_statistics);
-err_udp_mib:
- snmp_mib_free((void **)tcp_statistics);
-err_tcp_mib:
- snmp_mib_free((void **)icmpmsg_statistics);
err_icmpmsg_mib:
- snmp_mib_free((void **)icmp_statistics);
+ snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
err_icmp_mib:
- snmp_mib_free((void **)ip_statistics);
-err_ip_mib:
- snmp_mib_free((void **)net_statistics);
+ snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+err_udplite_mib:
+ snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+err_udp_mib:
+ snmp_mib_free((void __percpu **)net->mib.net_statistics);
err_net_mib:
+ snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+err_ip_mib:
+ snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+err_tcp_mib:
return -ENOMEM;
}
+static __net_exit void ipv4_mib_exit_net(struct net *net)
+{
+ snmp_mib_free((void __percpu **)net->mib.icmpmsg_statistics);
+ snmp_mib_free((void __percpu **)net->mib.icmp_statistics);
+ snmp_mib_free((void __percpu **)net->mib.udplite_statistics);
+ snmp_mib_free((void __percpu **)net->mib.udp_statistics);
+ snmp_mib_free((void __percpu **)net->mib.net_statistics);
+ snmp_mib_free((void __percpu **)net->mib.ip_statistics);
+ snmp_mib_free((void __percpu **)net->mib.tcp_statistics);
+}
+
+static __net_initdata struct pernet_operations ipv4_mib_ops = {
+ .init = ipv4_mib_init_net,
+ .exit = ipv4_mib_exit_net,
+};
+
+static int __init init_ipv4_mibs(void)
+{
+ return register_pernet_subsys(&ipv4_mib_ops);
+}
+
static int ipv4_proc_init(void);
/*
* IP protocol layer initialiser
*/
-static struct packet_type ip_packet_type = {
- .type = __constant_htons(ETH_P_IP),
+static struct packet_type ip_packet_type __read_mostly = {
+ .type = cpu_to_be16(ETH_P_IP),
.func = ip_rcv,
.gso_send_check = inet_gso_send_check,
.gso_segment = inet_gso_segment,
+ .gro_receive = inet_gro_receive,
+ .gro_complete = inet_gro_complete,
};
static int __init inet_init(void)
BUILD_BUG_ON(sizeof(struct inet_skb_parm) > sizeof(dummy_skb->cb));
+ sysctl_local_reserved_ports = kzalloc(65536 / 8, GFP_KERNEL);
+ if (!sysctl_local_reserved_ports)
+ goto out;
+
rc = proto_register(&tcp_prot, 1);
if (rc)
- goto out;
+ goto out_free_reserved_ports;
rc = proto_register(&udp_prot, 1);
if (rc)
(void)sock_register(&inet_family_ops);
+#ifdef CONFIG_SYSCTL
+ ip_static_sysctl_init();
+#endif
+
/*
* Add all the base protocols.
*/
ip_init();
- tcp_v4_init(&inet_family_ops);
+ tcp_v4_init();
/* Setup TCP slab cache for open requests. */
tcp_init();
+ /* Setup UDP memory threshold */
+ udp_init();
+
/* Add UDP-Lite (RFC 3828) */
udplite4_register();
* Set the ICMP layer up
*/
- icmp_init(&inet_family_ops);
+ if (icmp_init() < 0)
+ panic("Failed to create the ICMP control socket.\n");
/*
* Initialise the multicast router
*/
#if defined(CONFIG_IP_MROUTE)
- ip_mr_init();
+ if (ip_mr_init())
+ printk(KERN_CRIT "inet_init: Cannot init ipv4 mroute\n");
#endif
/*
* Initialise per-cpu ipv4 mibs
*/
if (init_ipv4_mibs())
- printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n"); ;
+ printk(KERN_CRIT "inet_init: Cannot init ipv4 mibs\n");
ipv4_proc_init();
proto_unregister(&udp_prot);
out_unregister_tcp_proto:
proto_unregister(&tcp_prot);
+out_free_reserved_ports:
+ kfree(sysctl_local_reserved_ports);
goto out;
}
goto out_tcp;
if (udp4_proc_init())
goto out_udp;
- if (fib_proc_init())
- goto out_fib;
if (ip_misc_proc_init())
goto out_misc;
out:
return rc;
out_misc:
- fib_proc_exit();
-out_fib:
udp4_proc_exit();
out_udp:
tcp4_proc_exit();
MODULE_ALIAS_NETPROTO(PF_INET);
-EXPORT_SYMBOL(inet_accept);
-EXPORT_SYMBOL(inet_bind);
-EXPORT_SYMBOL(inet_dgram_connect);
-EXPORT_SYMBOL(inet_dgram_ops);
-EXPORT_SYMBOL(inet_getname);
-EXPORT_SYMBOL(inet_ioctl);
-EXPORT_SYMBOL(inet_listen);
-EXPORT_SYMBOL(inet_register_protosw);
-EXPORT_SYMBOL(inet_release);
-EXPORT_SYMBOL(inet_sendmsg);
-EXPORT_SYMBOL(inet_shutdown);
-EXPORT_SYMBOL(inet_sock_destruct);
-EXPORT_SYMBOL(inet_stream_connect);
-EXPORT_SYMBOL(inet_stream_ops);
-EXPORT_SYMBOL(inet_unregister_protosw);
-EXPORT_SYMBOL(net_statistics);
-EXPORT_SYMBOL(sysctl_ip_nonlocal_bind);