X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fipv4%2Ftcp.c;h=7a0f0b27bf1f0affc18e57c96b7c002aa5376fcc;hb=0975ecba3b670df7c488a5e0e6fe9f1f370a8ad8;hp=6b35ab841db2a5523a9e76af0817101b491913b5;hpb=f561d0f27d6283c49359bb96048f8ac3728c812c;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 6b35ab8..7a0f0b2 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Mark Evans, @@ -255,11 +253,14 @@ #include #include #include +#include #include #include #include #include #include +#include +#include #include #include #include @@ -276,10 +277,7 @@ int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT; -DEFINE_SNMP_STAT(struct tcp_mib, tcp_statistics) __read_mostly; - -atomic_t tcp_orphan_count = ATOMIC_INIT(0); - +struct percpu_counter tcp_orphan_count; EXPORT_SYMBOL_GPL(tcp_orphan_count); int sysctl_tcp_mem[3] __read_mostly; @@ -291,9 +289,12 @@ EXPORT_SYMBOL(sysctl_tcp_rmem); EXPORT_SYMBOL(sysctl_tcp_wmem); atomic_t tcp_memory_allocated; /* Current allocated memory. */ -atomic_t tcp_sockets_allocated; /* Current number of TCP sockets. */ - EXPORT_SYMBOL(tcp_memory_allocated); + +/* + * Current number of TCP sockets. + */ +struct percpu_counter tcp_sockets_allocated; EXPORT_SYMBOL(tcp_sockets_allocated); /* @@ -308,17 +309,17 @@ struct tcp_splice_state { /* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. - * All the sk_stream_mem_schedule() is of this nature: accounting + * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ int tcp_memory_pressure __read_mostly; EXPORT_SYMBOL(tcp_memory_pressure); -void tcp_enter_memory_pressure(void) +void tcp_enter_memory_pressure(struct sock *sk) { if (!tcp_memory_pressure) { - NET_INC_STATS(LINUX_MIB_TCPMEMORYPRESSURES); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMEMORYPRESSURES); tcp_memory_pressure = 1; } } @@ -343,8 +344,8 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) return inet_csk_listen_poll(sk); /* Socket is not locked. We are protected from async events - by poll logic and correct handling of state changes - made by another threads is impossible in any case. + * by poll logic and correct handling of state changes + * made by other threads is impossible in any case. */ mask = 0; @@ -370,10 +371,10 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) * in state CLOSE_WAIT. One solution is evident --- to set POLLHUP * if and only if shutdown has been made in both directions. * Actually, it is interesting to look how Solaris and DUX - * solve this dilemma. I would prefer, if PULLHUP were maskable, + * solve this dilemma. I would prefer, if POLLHUP were maskable, * then we could set it on SND_SHUTDOWN. BTW examples given * in Stevens' books assume exactly this behaviour, it explains - * why PULLHUP is incompatible with POLLOUT. --ANK + * why POLLHUP is incompatible with POLLOUT. --ANK * * NOTE. Check for TCP_CLOSE is added. The goal is to prevent * blocking on fresh not-connected or disconnected socket. --ANK @@ -385,13 +386,17 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait) /* Connected? */ if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) { + int target = sock_rcvlowat(sk, 0, INT_MAX); + + if (tp->urg_seq == tp->copied_seq && + !sock_flag(sk, SOCK_URGINLINE) && + tp->urg_data) + target--; + /* Potential race condition. If read of tp below will * escape above sk->sk_state, we can be illegally awaken * in SYN_* states. */ - if ((tp->rcv_nxt != tp->copied_seq) && - (tp->urg_seq != tp->copied_seq || - tp->rcv_nxt != tp->copied_seq + 1 || - sock_flag(sk, SOCK_URGINLINE) || !tp->urg_data)) + if (tp->rcv_nxt - tp->copied_seq >= target) mask |= POLLIN | POLLRDNORM; if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { @@ -485,7 +490,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) tcb->sacked = 0; skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk_charge_skb(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) tp->nonagle &= ~TCP_NAGLE_PUSH; } @@ -493,11 +499,8 @@ static inline void skb_entail(struct sock *sk, struct sk_buff *skb) static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, struct sk_buff *skb) { - if (flags & MSG_OOB) { - tp->urg_mode = 1; + if (flags & MSG_OOB) tp->snd_up = tp->write_seq; - TCP_SKB_CB(skb)->sacked |= TCPCB_URG; - } } static inline void tcp_push(struct sock *sk, int flags, int mss_now, @@ -519,8 +522,13 @@ static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) { struct tcp_splice_state *tss = rd_desc->arg.data; + int ret; - return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags); + ret = skb_splice_bits(skb, offset, tss->pipe, min(rd_desc->count, len), + tss->flags); + if (ret > 0) + rd_desc->count -= ret; + return ret; } static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) @@ -528,6 +536,7 @@ static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) /* Store TCP splice context information in read_descriptor_t. */ read_descriptor_t rd_desc = { .arg.data = tss, + .count = tss->len, }; return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); @@ -577,10 +586,6 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, else if (!ret) { if (spliced) break; - if (flags & SPLICE_F_NONBLOCK) { - ret = -EAGAIN; - break; - } if (sock_flag(sk, SOCK_DONE)) break; if (sk->sk_err) { @@ -612,11 +617,13 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, tss.len -= ret; spliced += ret; + if (!timeo) + break; release_sock(sk); lock_sock(sk); if (sk->sk_err || sk->sk_state == TCP_CLOSE || - (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || + (sk->sk_shutdown & RCV_SHUTDOWN) || signal_pending(current)) break; } @@ -629,8 +636,7 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, return ret; } -struct sk_buff *sk_stream_alloc_pskb(struct sock *sk, - int size, int mem, gfp_t gfp) +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) { struct sk_buff *skb; @@ -639,8 +645,7 @@ struct sk_buff *sk_stream_alloc_pskb(struct sock *sk, skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); if (skb) { - skb->truesize += mem; - if (sk_stream_wmem_schedule(sk, skb->truesize)) { + if (sk_wmem_schedule(sk, skb->truesize)) { /* * Make sure that we have exactly size bytes * available to the caller, no more, no less. @@ -650,12 +655,53 @@ struct sk_buff *sk_stream_alloc_pskb(struct sock *sk, } __kfree_skb(skb); } else { - sk->sk_prot->enter_memory_pressure(); + sk->sk_prot->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return NULL; } +static unsigned int tcp_xmit_size_goal(struct sock *sk, u32 mss_now, + int large_allowed) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 xmit_size_goal, old_size_goal; + + xmit_size_goal = mss_now; + + if (large_allowed && sk_can_gso(sk)) { + xmit_size_goal = ((sk->sk_gso_max_size - 1) - + inet_csk(sk)->icsk_af_ops->net_header_len - + inet_csk(sk)->icsk_ext_hdr_len - + tp->tcp_header_len); + + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); + + /* We try hard to avoid divides here */ + old_size_goal = tp->xmit_size_goal_segs * mss_now; + + if (likely(old_size_goal <= xmit_size_goal && + old_size_goal + mss_now > xmit_size_goal)) { + xmit_size_goal = old_size_goal; + } else { + tp->xmit_size_goal_segs = xmit_size_goal / mss_now; + xmit_size_goal = tp->xmit_size_goal_segs * mss_now; + } + } + + return max(xmit_size_goal, mss_now); +} + +static int tcp_send_mss(struct sock *sk, int *size_goal, int flags) +{ + int mss_now; + + mss_now = tcp_current_mss(sk); + *size_goal = tcp_xmit_size_goal(sk, mss_now, !(flags & MSG_OOB)); + + return mss_now; +} + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) { @@ -672,13 +718,12 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); copied = 0; err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto do_error; + goto out_err; while (psize > 0) { struct sk_buff *skb = tcp_write_queue_tail(sk); @@ -692,8 +737,7 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_pskb(sk, 0, 0, - sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); if (!skb) goto wait_for_memory; @@ -710,7 +754,7 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } - if (!sk_stream_wmem_schedule(sk, copy)) + if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; if (can_coalesce) { @@ -724,7 +768,7 @@ new_segment: skb->data_len += copy; skb->truesize += copy; sk->sk_wmem_queued += copy; - sk->sk_forward_alloc -= copy; + sk_mem_charge(sk, copy); skb->ip_summed = CHECKSUM_PARTIAL; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; @@ -738,7 +782,7 @@ new_segment: if (!(psize -= copy)) goto out; - if (skb->len < mss_now || (flags & MSG_OOB)) + if (skb->len < size_goal || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -757,8 +801,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); } out: @@ -840,8 +883,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, /* This should be in poll */ clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags); - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); /* Ok commence sending. */ iovlen = msg->msg_iovlen; @@ -850,7 +892,7 @@ int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto do_error; + goto out_err; while (--iovlen >= 0) { int seglen = iov->iov_len; @@ -873,8 +915,8 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_pskb(sk, select_size(sk), - 0, sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, select_size(sk), + sk->sk_allocation); if (!skb) goto wait_for_memory; @@ -931,7 +973,7 @@ new_segment: if (copy > PAGE_SIZE - off) copy = PAGE_SIZE - off; - if (!sk_stream_wmem_schedule(sk, copy)) + if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; if (!page) { @@ -984,7 +1026,7 @@ new_segment: if ((seglen -= copy) == 0 && iovlen == 0) goto out; - if (skb->len < mss_now || (flags & MSG_OOB)) + if (skb->len < size_goal || (flags & MSG_OOB)) continue; if (forced_push(tp)) { @@ -1003,8 +1045,7 @@ wait_for_memory: if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; - mss_now = tcp_current_mss(sk, !(flags&MSG_OOB)); - size_goal = tp->xmit_size_goal; + mss_now = tcp_send_mss(sk, &size_goal, flags); } } @@ -1022,7 +1063,7 @@ do_fault: * reset, where we can be unlinking the send_head. */ tcp_check_send_head(sk, skb); - sk_stream_free_skb(sk, skb); + sk_wmem_free_skb(sk, skb); } do_error: @@ -1040,9 +1081,7 @@ out_err: * this, no blocking and very strange errors 8) */ -static int tcp_recv_urg(struct sock *sk, long timeo, - struct msghdr *msg, int len, int flags, - int *addr_len) +static int tcp_recv_urg(struct sock *sk, struct msghdr *msg, int len, int flags) { struct tcp_sock *tp = tcp_sk(sk); @@ -1100,7 +1139,7 @@ void tcp_cleanup_rbuf(struct sock *sk, int copied) #if TCP_DEBUG struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); - BUG_TRAP(!skb || before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); + WARN_ON(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq)); #endif if (inet_csk_ack_scheduled(sk)) { @@ -1155,13 +1194,13 @@ static void tcp_prequeue_process(struct sock *sk) struct sk_buff *skb; struct tcp_sock *tp = tcp_sk(sk); - NET_INC_STATS_USER(LINUX_MIB_TCPPREQUEUED); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPPREQUEUED); /* RX process wants to run with disabled BHs, though it is not * necessary */ local_bh_disable(); while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL) - sk->sk_backlog_rcv(sk, skb); + sk_backlog_rcv(sk, skb); local_bh_enable(); /* Clear memory counter. */ @@ -1209,7 +1248,8 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, return -ENOTCONN; while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { if (offset < skb->len) { - size_t used, len; + int used; + size_t len; len = skb->len - offset; /* Stop reading if we hit a patch of urgent data */ @@ -1230,7 +1270,14 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, copied += used; offset += used; } - if (offset != skb->len) + /* + * If recv_actor drops the lock (e.g. TCP splice + * receive) the skb pointer might be invalid when + * getting here: tcp_collapse might have deleted it + * while aggregating skbs from the socket queue. + */ + skb = tcp_recv_skb(sk, seq-1, &offset); + if (!skb || (offset+1 != skb->len)) break; } if (tcp_hdr(skb)->fin) { @@ -1274,6 +1321,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, struct task_struct *user_recv = NULL; int copied_early = 0; struct sk_buff *skb; + u32 urg_hole = 0; lock_sock(sk); @@ -1309,7 +1357,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if ((available < target) && (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && - __get_cpu_var(softnet_data).net_dma) { + dma_find_channel(DMA_MEMCPY)) { preempt_enable_no_resched(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); @@ -1354,7 +1402,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, goto found_ok_skb; if (tcp_hdr(skb)->fin) goto found_fin_ok; - BUG_TRAP(flags & MSG_PEEK); + WARN_ON(!(flags & MSG_PEEK)); skb = skb->next; } while (skb != (struct sk_buff *)&sk->sk_receive_queue); @@ -1368,8 +1416,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, sk->sk_state == TCP_CLOSE || (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || - signal_pending(current) || - (flags & MSG_PEEK)) + signal_pending(current)) break; } else { if (sock_flag(sk, SOCK_DONE)) @@ -1417,8 +1464,8 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, tp->ucopy.len = len; - BUG_TRAP(tp->copied_seq == tp->rcv_nxt || - (flags & (MSG_PEEK | MSG_TRUNC))); + WARN_ON(tp->copied_seq != tp->rcv_nxt && + !(flags & (MSG_PEEK | MSG_TRUNC))); /* Ugly... If prequeue is not empty, we have to * process it before releasing socket, otherwise @@ -1469,7 +1516,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, /* __ Restore normal policy in scheduler __ */ if ((chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMBACKLOG, chunk); len -= chunk; copied += chunk; } @@ -1480,13 +1527,14 @@ do_prequeue: tcp_prequeue_process(sk); if ((chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; } } } - if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { + if ((flags & MSG_PEEK) && + (peek_seq - copied - urg_hole != tp->copied_seq)) { if (net_ratelimit()) printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", current->comm, task_pid_nr(current)); @@ -1507,6 +1555,7 @@ do_prequeue: if (!urg_offset) { if (!sock_flag(sk, SOCK_URGINLINE)) { ++*seq; + urg_hole++; offset++; used--; if (!used) @@ -1520,7 +1569,7 @@ do_prequeue: if (!(flags & MSG_TRUNC)) { #ifdef CONFIG_NET_DMA if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = get_softnet_dma(); + tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); if (tp->ucopy.dma_chan) { tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( @@ -1595,7 +1644,7 @@ skip_copy: tcp_prequeue_process(sk); if (copied > 0 && (chunk = len - tp->ucopy.len) != 0) { - NET_ADD_STATS_USER(LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); + NET_ADD_STATS_USER(sock_net(sk), LINUX_MIB_TCPDIRECTCOPYFROMPREQUEUE, chunk); len -= chunk; copied += chunk; } @@ -1625,7 +1674,6 @@ skip_copy: /* Safe to free early-copied skbs now */ __skb_queue_purge(&sk->sk_async_wait_queue); - dma_chan_put(tp->ucopy.dma_chan); tp->ucopy.dma_chan = NULL; } if (tp->ucopy.pinned_list) { @@ -1651,10 +1699,45 @@ out: return err; recv_urg: - err = tcp_recv_urg(sk, timeo, msg, len, flags, addr_len); + err = tcp_recv_urg(sk, msg, len, flags); goto out; } +void tcp_set_state(struct sock *sk, int state) +{ + int oldstate = sk->sk_state; + + switch (state) { + case TCP_ESTABLISHED: + if (oldstate != TCP_ESTABLISHED) + TCP_INC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); + break; + + case TCP_CLOSE: + if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) + TCP_INC_STATS(sock_net(sk), TCP_MIB_ESTABRESETS); + + sk->sk_prot->unhash(sk); + if (inet_csk(sk)->icsk_bind_hash && + !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) + inet_put_port(sk); + /* fall through */ + default: + if (oldstate == TCP_ESTABLISHED) + TCP_DEC_STATS(sock_net(sk), TCP_MIB_CURRESTAB); + } + + /* Change state AFTER socket is unhashed to avoid closed + * socket sitting in hash tables. + */ + sk->sk_state = state; + +#ifdef STATE_TRACE + SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]); +#endif +} +EXPORT_SYMBOL_GPL(tcp_set_state); + /* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some @@ -1690,7 +1773,7 @@ static int tcp_close_state(struct sock *sk) /* * Shutdown the sending side of a connection. Much like close except - * that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD). + * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). */ void tcp_shutdown(struct sock *sk, int how) @@ -1741,7 +1824,7 @@ void tcp_close(struct sock *sk, long timeout) __kfree_skb(skb); } - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); /* As outlined in RFC 2525, section 2.17, we send a RST here because * data was lost. To witness the awful effects of the old behavior of @@ -1752,13 +1835,13 @@ void tcp_close(struct sock *sk, long timeout) */ if (data_was_unread) { /* Unread data was tossed, zap the connection. */ - NET_INC_STATS_USER(LINUX_MIB_TCPABORTONCLOSE); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONCLOSE); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_KERNEL); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { /* Check zero linger _after_ checking for unread data. */ sk->sk_prot->disconnect(sk, 0); - NET_INC_STATS_USER(LINUX_MIB_TCPABORTONDATA); + NET_INC_STATS_USER(sock_net(sk), LINUX_MIB_TCPABORTONDATA); } else if (tcp_close_state(sk)) { /* We FIN if the application ate all the data before * zapping the connection. @@ -1794,7 +1877,6 @@ adjudge_to_death: state = sk->sk_state; sock_hold(sk); sock_orphan(sk); - atomic_inc(sk->sk_prot->orphan_count); /* It is the last release_sock in its life. It will remove backlog. */ release_sock(sk); @@ -1805,7 +1887,9 @@ adjudge_to_death: */ local_bh_disable(); bh_lock_sock(sk); - BUG_TRAP(!sock_owned_by_user(sk)); + WARN_ON(sock_owned_by_user(sk)); + + percpu_counter_inc(sk->sk_prot->orphan_count); /* Have we already been destroyed by a softirq or backlog? */ if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE) @@ -1830,7 +1914,8 @@ adjudge_to_death: if (tp->linger2 < 0) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONLINGER); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPABORTONLINGER); } else { const int tmo = tcp_fin_time(sk); @@ -1844,15 +1929,18 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { - sk_stream_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, - atomic_read(sk->sk_prot->orphan_count))) { + int orphan_count = percpu_counter_read_positive( + sk->sk_prot->orphan_count); + + sk_mem_reclaim(sk); + if (tcp_too_many_orphans(sk, orphan_count)) { if (net_ratelimit()) printk(KERN_INFO "TCP: too many of orphaned " "sockets\n"); tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); - NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY); + NET_INC_STATS_BH(sock_net(sk), + LINUX_MIB_TCPABORTONMEMORY); } } @@ -1932,7 +2020,7 @@ int tcp_disconnect(struct sock *sk, int flags) memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); - BUG_TRAP(!inet->num || icsk->icsk_bind_hash); + WARN_ON(inet->num && !icsk->icsk_bind_hash); sk->sk_error_report(sk); return err; @@ -2340,7 +2428,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) unsigned int seq; __be32 delta; unsigned int oldlen; - unsigned int len; + unsigned int mss; if (!pskb_may_pull(skb, sizeof(*th))) goto out; @@ -2356,10 +2444,13 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) oldlen = (u16)~skb->len; __skb_pull(skb, thlen); + mss = skb_shinfo(skb)->gso_size; + if (unlikely(skb->len <= mss)) + goto out; + if (skb_gso_ok(skb, features | NETIF_F_GSO_ROBUST)) { /* Packet is from an untrusted source, reset gso_segs. */ int type = skb_shinfo(skb)->gso_type; - int mss; if (unlikely(type & ~(SKB_GSO_TCPV4 | @@ -2370,7 +2461,6 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) !(type & (SKB_GSO_TCPV4 | SKB_GSO_TCPV6)))) goto out; - mss = skb_shinfo(skb)->gso_size; skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); segs = NULL; @@ -2381,8 +2471,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) if (IS_ERR(segs)) goto out; - len = skb_shinfo(skb)->gso_size; - delta = htonl(oldlen + (thlen + len)); + delta = htonl(oldlen + (thlen + mss)); skb = segs; th = tcp_hdr(skb); @@ -2398,7 +2487,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) csum_fold(csum_partial(skb_transport_header(skb), thlen, skb->csum)); - seq += len; + seq += mss; skb = skb->next; th = tcp_hdr(skb); @@ -2419,6 +2508,108 @@ out: } EXPORT_SYMBOL(tcp_tso_segment); +struct sk_buff **tcp_gro_receive(struct sk_buff **head, struct sk_buff *skb) +{ + struct sk_buff **pp = NULL; + struct sk_buff *p; + struct tcphdr *th; + struct tcphdr *th2; + unsigned int len; + unsigned int thlen; + unsigned int flags; + unsigned int mss = 1; + int flush = 1; + int i; + + th = skb_gro_header(skb, sizeof(*th)); + if (unlikely(!th)) + goto out; + + thlen = th->doff * 4; + if (thlen < sizeof(*th)) + goto out; + + th = skb_gro_header(skb, thlen); + if (unlikely(!th)) + goto out; + + skb_gro_pull(skb, thlen); + + len = skb_gro_len(skb); + flags = tcp_flag_word(th); + + for (; (p = *head); head = &p->next) { + if (!NAPI_GRO_CB(p)->same_flow) + continue; + + th2 = tcp_hdr(p); + + if ((th->source ^ th2->source) | (th->dest ^ th2->dest)) { + NAPI_GRO_CB(p)->same_flow = 0; + continue; + } + + goto found; + } + + goto out_check_final; + +found: + flush = NAPI_GRO_CB(p)->flush; + flush |= flags & TCP_FLAG_CWR; + flush |= (flags ^ tcp_flag_word(th2)) & + ~(TCP_FLAG_CWR | TCP_FLAG_FIN | TCP_FLAG_PSH); + flush |= (th->ack_seq ^ th2->ack_seq) | (th->window ^ th2->window); + for (i = sizeof(*th); !flush && i < thlen; i += 4) + flush |= *(u32 *)((u8 *)th + i) ^ + *(u32 *)((u8 *)th2 + i); + + mss = skb_shinfo(p)->gso_size; + + flush |= (len > mss) | !len; + flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); + + if (flush || skb_gro_receive(head, skb)) { + mss = 1; + goto out_check_final; + } + + p = *head; + th2 = tcp_hdr(p); + tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); + +out_check_final: + flush = len < mss; + flush |= flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | + TCP_FLAG_SYN | TCP_FLAG_FIN); + + if (p && (!NAPI_GRO_CB(skb)->same_flow || flush)) + pp = head; + +out: + NAPI_GRO_CB(skb)->flush |= flush; + + return pp; +} +EXPORT_SYMBOL(tcp_gro_receive); + +int tcp_gro_complete(struct sk_buff *skb) +{ + struct tcphdr *th = tcp_hdr(skb); + + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + skb->ip_summed = CHECKSUM_PARTIAL; + + skb_shinfo(skb)->gso_segs = NAPI_GRO_CB(skb)->count; + + if (th->cwr) + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + + return 0; +} +EXPORT_SYMBOL(tcp_gro_complete); + #ifdef CONFIG_TCP_MD5SIG static unsigned long tcp_md5sig_users; static struct tcp_md5sig_pool **tcp_md5sig_pool; @@ -2547,12 +2738,69 @@ void __tcp_put_md5sig_pool(void) } EXPORT_SYMBOL(__tcp_put_md5sig_pool); + +int tcp_md5_hash_header(struct tcp_md5sig_pool *hp, + struct tcphdr *th) +{ + struct scatterlist sg; + int err; + + __sum16 old_checksum = th->check; + th->check = 0; + /* options aren't included in the hash */ + sg_init_one(&sg, th, sizeof(struct tcphdr)); + err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(struct tcphdr)); + th->check = old_checksum; + return err; +} + +EXPORT_SYMBOL(tcp_md5_hash_header); + +int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp, + struct sk_buff *skb, unsigned header_len) +{ + struct scatterlist sg; + const struct tcphdr *tp = tcp_hdr(skb); + struct hash_desc *desc = &hp->md5_desc; + unsigned i; + const unsigned head_data_len = skb_headlen(skb) > header_len ? + skb_headlen(skb) - header_len : 0; + const struct skb_shared_info *shi = skb_shinfo(skb); + + sg_init_table(&sg, 1); + + sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len); + if (crypto_hash_update(desc, &sg, head_data_len)) + return 1; + + for (i = 0; i < shi->nr_frags; ++i) { + const struct skb_frag_struct *f = &shi->frags[i]; + sg_set_page(&sg, f->page, f->size, f->page_offset); + if (crypto_hash_update(desc, &sg, f->size)) + return 1; + } + + return 0; +} + +EXPORT_SYMBOL(tcp_md5_hash_skb_data); + +int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, struct tcp_md5sig_key *key) +{ + struct scatterlist sg; + + sg_init_one(&sg, key->key, key->keylen); + return crypto_hash_update(&hp->md5_desc, &sg, key->keylen); +} + +EXPORT_SYMBOL(tcp_md5_hash_key); + #endif void tcp_done(struct sock *sk) { - if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) - TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) + TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_ATTEMPTFAILS); tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); @@ -2566,7 +2814,6 @@ void tcp_done(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_done); -extern void __skb_cb_too_small_for_tcp(int, int); extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; @@ -2582,13 +2829,13 @@ __setup("thash_entries=", set_thash_entries); void __init tcp_init(void) { struct sk_buff *skb = NULL; - unsigned long limit; + unsigned long nr_pages, limit; int order, i, max_share; - if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) - __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), - sizeof(skb->cb)); + BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); + percpu_counter_init(&tcp_sockets_allocated, 0); + percpu_counter_init(&tcp_orphan_count, 0); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, @@ -2611,8 +2858,8 @@ void __init tcp_init(void) thash_entries ? 0 : 512 * 1024); tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; for (i = 0; i < tcp_hashinfo.ehash_size; i++) { - INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); - INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); + INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].chain, i); + INIT_HLIST_NULLS_HEAD(&tcp_hashinfo.ehash[i].twchain, i); } if (inet_ehash_locks_alloc(&tcp_hashinfo)) panic("TCP: failed to alloc ehash_locks"); @@ -2653,8 +2900,9 @@ void __init tcp_init(void) * is up to 1/2 at 256 MB, decreasing toward zero with the amount of * memory, with a floor of 128 pages. */ - limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); - limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); + nr_pages = totalram_pages - totalhigh_pages; + limit = min(nr_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); + limit = (limit * (nr_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); limit = max(limit, 128UL); sysctl_tcp_mem[0] = limit / 4 * 3; sysctl_tcp_mem[1] = limit; @@ -2664,11 +2912,11 @@ void __init tcp_init(void) limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); - sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM; + sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; sysctl_tcp_wmem[1] = 16*1024; sysctl_tcp_wmem[2] = max(64*1024, max_share); - sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM; + sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; sysctl_tcp_rmem[1] = 87380; sysctl_tcp_rmem[2] = max(87380, max_share); @@ -2691,4 +2939,3 @@ EXPORT_SYMBOL(tcp_splice_read); EXPORT_SYMBOL(tcp_sendpage); EXPORT_SYMBOL(tcp_setsockopt); EXPORT_SYMBOL(tcp_shutdown); -EXPORT_SYMBOL(tcp_statistics);