X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fipv4%2Finet_connection_sock.c;h=ee16475f8fc33619eea9c9e247f06501e6aad45b;hb=a83d8e8d099fc373a5ca7112ad08c553bb2c180f;hp=274b0b846c25cd5fed68b91333a884933e8080e3;hpb=6b72977bd6c6fefc6497d4f0275079f539eaf0ac;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 274b0b8..ee16475 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -30,20 +30,40 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); #endif /* - * This array holds the first and last local port number. - * For high-usage systems, use sysctl to change this to - * 32768-61000 + * This struct holds the first and last local port number. */ -int sysctl_local_port_range[2] = { 1024, 4999 }; +struct local_ports sysctl_local_ports __read_mostly = { + .lock = SEQLOCK_UNLOCKED, + .range = { 32768, 61000 }, +}; + +void inet_get_local_port_range(int *low, int *high) +{ + unsigned seq; + do { + seq = read_seqbegin(&sysctl_local_ports.lock); + + *low = sysctl_local_ports.range[0]; + *high = sysctl_local_ports.range[1]; + } while (read_seqretry(&sysctl_local_ports.lock, seq)); +} +EXPORT_SYMBOL(inet_get_local_port_range); int inet_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb) { - const u32 sk_rcv_saddr = inet_rcv_saddr(sk); + const __be32 sk_rcv_saddr = inet_rcv_saddr(sk); struct sock *sk2; struct hlist_node *node; int reuse = sk->sk_reuse; + /* + * Unlike other sk lookup places we do not check + * for sk_net here, since _all_ the socks listed + * in tb->owners list belong to the same net - the + * one this bucket belongs to. + */ + sk_for_each_bound(sk2, node, &tb->owners) { if (sk != sk2 && !inet_v6_ipv6only(sk2) && @@ -52,7 +72,7 @@ int inet_csk_bind_conflict(const struct sock *sk, sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { if (!reuse || !sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) { - const u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); + const __be32 sk2_rcv_saddr = inet_rcv_saddr(sk2); if (!sk2_rcv_saddr || !sk_rcv_saddr || sk2_rcv_saddr == sk_rcv_saddr) break; @@ -67,29 +87,46 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. */ -int inet_csk_get_port(struct inet_hashinfo *hashinfo, - struct sock *sk, unsigned short snum, - int (*bind_conflict)(const struct sock *sk, - const struct inet_bind_bucket *tb)) +int inet_csk_get_port(struct sock *sk, unsigned short snum) { + struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_bind_hashbucket *head; struct hlist_node *node; struct inet_bind_bucket *tb; - int ret; + int ret, attempts = 5; + struct net *net = sock_net(sk); + int smallest_size = -1, smallest_rover; local_bh_disable(); if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; + int remaining, rover, low, high; + +again: + inet_get_local_port_range(&low, &high); + remaining = (high - low) + 1; + smallest_rover = rover = net_random() % remaining + low; + smallest_size = -1; do { - head = &hashinfo->bhash[inet_bhashfn(rover, hashinfo->bhash_size)]; + head = &hashinfo->bhash[inet_bhashfn(net, rover, + hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == rover) + if (net_eq(ib_net(tb), net) && tb->port == rover) { + if (tb->fastreuse > 0 && + sk->sk_reuse && + sk->sk_state != TCP_LISTEN && + (tb->num_owners < smallest_size || smallest_size == -1)) { + smallest_size = tb->num_owners; + smallest_rover = rover; + if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { + spin_unlock(&head->lock); + snum = smallest_rover; + goto have_snum; + } + } goto next; + } break; next: spin_unlock(&head->lock); @@ -104,38 +141,50 @@ int inet_csk_get_port(struct inet_hashinfo *hashinfo, * the top level, not from the 'break;' statement. */ ret = 1; - if (remaining <= 0) + if (remaining <= 0) { + if (smallest_size != -1) { + snum = smallest_rover; + goto have_snum; + } goto fail; - + } /* OK, here is the one we will use. HEAD is * non-NULL and we hold it's mutex. */ snum = rover; } else { - head = &hashinfo->bhash[inet_bhashfn(snum, hashinfo->bhash_size)]; +have_snum: + head = &hashinfo->bhash[inet_bhashfn(net, snum, + hashinfo->bhash_size)]; spin_lock(&head->lock); inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == snum) + if (net_eq(ib_net(tb), net) && tb->port == snum) goto tb_found; } tb = NULL; goto tb_not_found; tb_found: if (!hlist_empty(&tb->owners)) { - if (sk->sk_reuse > 1) - goto success; if (tb->fastreuse > 0 && - sk->sk_reuse && sk->sk_state != TCP_LISTEN) { + sk->sk_reuse && sk->sk_state != TCP_LISTEN && + smallest_size == -1) { goto success; } else { ret = 1; - if (bind_conflict(sk, tb)) + if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { + if (sk->sk_reuse && sk->sk_state != TCP_LISTEN && + smallest_size != -1 && --attempts >= 0) { + spin_unlock(&head->lock); + goto again; + } goto fail_unlock; + } } } tb_not_found: ret = 1; - if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, head, snum)) == NULL) + if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep, + net, head, snum)) == NULL) goto fail_unlock; if (hlist_empty(&tb->owners)) { if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) @@ -148,8 +197,8 @@ tb_not_found: success: if (!inet_csk(sk)->icsk_bind_hash) inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); - ret = 0; + WARN_ON(inet_csk(sk)->icsk_bind_hash != tb); + ret = 0; fail_unlock: spin_unlock(&head->lock); @@ -241,7 +290,7 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) } newsk = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk); - BUG_TRAP(newsk->sk_state != TCP_SYN_RECV); + WARN_ON(newsk->sk_state == TCP_SYN_RECV); out: release_sock(sk); return newsk; @@ -255,7 +304,7 @@ EXPORT_SYMBOL(inet_csk_accept); /* * Using different timers for retransmit, delayed acks and probes - * We may wish use just one timer maintaining a list of expire jiffies + * We may wish use just one timer maintaining a list of expire jiffies * to optimize. */ void inet_csk_init_xmit_timers(struct sock *sk, @@ -265,18 +314,11 @@ void inet_csk_init_xmit_timers(struct sock *sk, { struct inet_connection_sock *icsk = inet_csk(sk); - init_timer(&icsk->icsk_retransmit_timer); - init_timer(&icsk->icsk_delack_timer); - init_timer(&sk->sk_timer); - - icsk->icsk_retransmit_timer.function = retransmit_handler; - icsk->icsk_delack_timer.function = delack_handler; - sk->sk_timer.function = keepalive_handler; - - icsk->icsk_retransmit_timer.data = - icsk->icsk_delack_timer.data = - sk->sk_timer.data = (unsigned long)sk; - + setup_timer(&icsk->icsk_retransmit_timer, retransmit_handler, + (unsigned long)sk); + setup_timer(&icsk->icsk_delack_timer, delack_handler, + (unsigned long)sk); + setup_timer(&sk->sk_timer, keepalive_handler, (unsigned long)sk); icsk->icsk_pending = icsk->icsk_ack.pending = 0; } @@ -309,13 +351,14 @@ void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len) EXPORT_SYMBOL(inet_csk_reset_keepalive_timer); -struct dst_entry* inet_csk_route_req(struct sock *sk, +struct dst_entry *inet_csk_route_req(struct sock *sk, const struct request_sock *req) { struct rtable *rt; const struct inet_request_sock *ireq = inet_rsk(req); struct ip_options *opt = inet_rsk(req)->opt; struct flowi fl = { .oif = sk->sk_bound_dev_if, + .mark = sk->sk_mark, .nl_u = { .ip4_u = { .daddr = ((opt && opt->srr) ? opt->faddr : @@ -323,27 +366,30 @@ struct dst_entry* inet_csk_route_req(struct sock *sk, .saddr = ireq->loc_addr, .tos = RT_CONN_FLAGS(sk) } }, .proto = sk->sk_protocol, + .flags = inet_sk_flowi_flags(sk), .uli_u = { .ports = - { .sport = inet_sk(sk)->sport, + { .sport = inet_sk(sk)->inet_sport, .dport = ireq->rmt_port } } }; + struct net *net = sock_net(sk); security_req_classify_flow(req, &fl); - if (ip_route_output_flow(&rt, &fl, sk, 0)) { - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); - return NULL; - } - if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) { - ip_rt_put(rt); - IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); - return NULL; - } + if (ip_route_output_flow(net, &rt, &fl, sk, 0)) + goto no_route; + if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) + goto route_err; return &rt->u.dst; + +route_err: + ip_rt_put(rt); +no_route: + IP_INC_STATS_BH(net, IPSTATS_MIB_OUTNOROUTES); + return NULL; } EXPORT_SYMBOL_GPL(inet_csk_route_req); static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport, - const u32 rnd, const u16 synq_hsize) + const u32 rnd, const u32 synq_hsize) { return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1); } @@ -373,7 +419,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, ireq->rmt_addr == raddr && ireq->loc_addr == laddr && AF_INET_FAMILY(req->rsk_ops->family)) { - BUG_TRAP(!req->sk); + WARN_ON(req->sk); *prevp = prev; break; } @@ -401,6 +447,28 @@ extern int sysctl_tcp_synack_retries; EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add); +/* Decide when to expire the request and when to resend SYN-ACK */ +static inline void syn_ack_recalc(struct request_sock *req, const int thresh, + const int max_retries, + const u8 rskq_defer_accept, + int *expire, int *resend) +{ + if (!rskq_defer_accept) { + *expire = req->retrans >= thresh; + *resend = 1; + return; + } + *expire = req->retrans >= thresh && + (!inet_rsk(req)->acked || req->retrans >= max_retries); + /* + * Do not resend while waiting for data after ACK, + * start to resend on end of deferring period to give + * last chance for data or ACK to create established socket. + */ + *resend = !inet_rsk(req)->acked || + req->retrans >= rskq_defer_accept - 1; +} + void inet_csk_reqsk_queue_prune(struct sock *parent, const unsigned long interval, const unsigned long timeout, @@ -456,9 +524,15 @@ void inet_csk_reqsk_queue_prune(struct sock *parent, reqp=&lopt->syn_table[i]; while ((req = *reqp) != NULL) { if (time_after_eq(now, req->expires)) { - if ((req->retrans < thresh || - (inet_rsk(req)->acked && req->retrans < max_retries)) - && !req->rsk_ops->rtx_syn_ack(parent, req, NULL)) { + int expire = 0, resend = 0; + + syn_ack_recalc(req, thresh, max_retries, + queue->rskq_defer_accept, + &expire, &resend); + if (!expire && + (!resend || + !req->rsk_ops->rtx_syn_ack(parent, req, NULL) || + inet_rsk(req)->acked)) { unsigned long timeo; if (req->retrans++ == 0) @@ -501,7 +575,9 @@ struct sock *inet_csk_clone(struct sock *sk, const struct request_sock *req, newsk->sk_state = TCP_SYN_RECV; newicsk->icsk_bind_hash = NULL; - inet_sk(newsk)->dport = inet_rsk(req)->rmt_port; + inet_sk(newsk)->inet_dport = inet_rsk(req)->rmt_port; + inet_sk(newsk)->inet_num = ntohs(inet_rsk(req)->loc_port); + inet_sk(newsk)->inet_sport = inet_rsk(req)->loc_port; newsk->sk_write_space = sk_stream_write_space; newicsk->icsk_retransmits = 0; @@ -526,14 +602,14 @@ EXPORT_SYMBOL_GPL(inet_csk_clone); */ void inet_csk_destroy_sock(struct sock *sk) { - BUG_TRAP(sk->sk_state == TCP_CLOSE); - BUG_TRAP(sock_flag(sk, SOCK_DEAD)); + WARN_ON(sk->sk_state != TCP_CLOSE); + WARN_ON(!sock_flag(sk, SOCK_DEAD)); /* It cannot be in hash table! */ - BUG_TRAP(sk_unhashed(sk)); + WARN_ON(!sk_unhashed(sk)); - /* If it has not 0 inet_sk(sk)->num, it must be bound */ - BUG_TRAP(!inet_sk(sk)->num || inet_csk(sk)->icsk_bind_hash); + /* If it has not 0 inet_sk(sk)->inet_num, it must be bound */ + WARN_ON(inet_sk(sk)->inet_num && !inet_csk(sk)->icsk_bind_hash); sk->sk_prot->destroy(sk); @@ -543,7 +619,7 @@ void inet_csk_destroy_sock(struct sock *sk) sk_refcnt_debug_release(sk); - atomic_dec(sk->sk_prot->orphan_count); + percpu_counter_dec(sk->sk_prot->orphan_count); sock_put(sk); } @@ -568,8 +644,8 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) * after validation is complete. */ sk->sk_state = TCP_LISTEN; - if (!sk->sk_prot->get_port(sk, inet->num)) { - inet->sport = htons(inet->num); + if (!sk->sk_prot->get_port(sk, inet->inet_num)) { + inet->inet_sport = htons(inet->inet_num); sk_dst_reset(sk); sk->sk_prot->hash(sk); @@ -616,14 +692,14 @@ void inet_csk_listen_stop(struct sock *sk) local_bh_disable(); bh_lock_sock(child); - BUG_TRAP(!sock_owned_by_user(child)); + WARN_ON(sock_owned_by_user(child)); sock_hold(child); sk->sk_prot->disconnect(child, O_NONBLOCK); sock_orphan(child); - atomic_inc(sk->sk_prot->orphan_count); + percpu_counter_inc(sk->sk_prot->orphan_count); inet_csk_destroy_sock(child); @@ -634,7 +710,7 @@ void inet_csk_listen_stop(struct sock *sk) sk_acceptq_removed(sk); __reqsk_free(req); } - BUG_TRAP(!sk->sk_ack_backlog); + WARN_ON(sk->sk_ack_backlog); } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); @@ -645,31 +721,12 @@ void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) const struct inet_sock *inet = inet_sk(sk); sin->sin_family = AF_INET; - sin->sin_addr.s_addr = inet->daddr; - sin->sin_port = inet->dport; + sin->sin_addr.s_addr = inet->inet_daddr; + sin->sin_port = inet->inet_dport; } EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); -int inet_csk_ctl_sock_create(struct socket **sock, unsigned short family, - unsigned short type, unsigned char protocol) -{ - int rc = sock_create_kern(family, type, protocol, sock); - - if (rc == 0) { - (*sock)->sk->sk_allocation = GFP_ATOMIC; - inet_sk((*sock)->sk)->uc_ttl = -1; - /* - * Unhash it so that IP input processing does not even see it, - * we do not wish this socket to see incoming packets. - */ - (*sock)->sk->sk_prot->unhash((*sock)->sk); - } - return rc; -} - -EXPORT_SYMBOL_GPL(inet_csk_ctl_sock_create); - #ifdef CONFIG_COMPAT int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen) @@ -686,7 +743,7 @@ int inet_csk_compat_getsockopt(struct sock *sk, int level, int optname, EXPORT_SYMBOL_GPL(inet_csk_compat_getsockopt); int inet_csk_compat_setsockopt(struct sock *sk, int level, int optname, - char __user *optval, int optlen) + char __user *optval, unsigned int optlen) { const struct inet_connection_sock *icsk = inet_csk(sk);