X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fipv4%2Ftcp.c;h=cf0850c068f5488856a642e9bc588c3b8900c536;hb=4ae127d1b6c71f9240dd4245f240e6dd8fc98014;hp=090c690627e5d12c9540119de37fb14f813e8592;hpb=f5b99bcdddfb2338227faad3489c24907f37ee8e;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 090c690..cf0850c 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp.c,v 1.216 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Mark Evans, @@ -247,13 +245,17 @@ * TCP_CLOSE socket is finished */ +#include #include #include #include #include #include -#include #include +#include +#include +#include +#include #include #include #include @@ -265,6 +267,7 @@ #include #include #include +#include #include #include @@ -292,12 +295,21 @@ EXPORT_SYMBOL(tcp_memory_allocated); EXPORT_SYMBOL(tcp_sockets_allocated); /* + * TCP splice context + */ +struct tcp_splice_state { + struct pipe_inode_info *pipe; + size_t len; + unsigned int flags; +}; + +/* * Pressure flag: try to collapse. * Technical note: it is used by multiple contexts non atomically. - * All the sk_stream_mem_schedule() is of this nature: accounting + * All the __sk_mem_schedule() is of this nature: accounting * is strict, actions are advisory and have some latency. */ -int tcp_memory_pressure; +int tcp_memory_pressure __read_mostly; EXPORT_SYMBOL(tcp_memory_pressure); @@ -425,7 +437,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) /* Subtract 1, if FIN is in queue. */ if (answ && !skb_queue_empty(&sk->sk_receive_queue)) answ -= - ((struct sk_buff *)sk->sk_receive_queue.prev)->h.th->fin; + tcp_hdr((struct sk_buff *)sk->sk_receive_queue.prev)->fin; } else answ = tp->urg_seq - tp->copied_seq; release_sock(sk); @@ -444,7 +456,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg) break; default: return -ENOIOCTLCMD; - }; + } return put_user(answ, (int __user *)arg); } @@ -460,9 +472,9 @@ static inline int forced_push(struct tcp_sock *tp) return after(tp->write_seq, tp->pushed_seq + (tp->max_window >> 1)); } -static inline void skb_entail(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static inline void skb_entail(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); skb->csum = 0; @@ -470,12 +482,11 @@ static inline void skb_entail(struct sock *sk, struct tcp_sock *tp, tcb->flags = TCPCB_FLAG_ACK; tcb->sacked = 0; skb_header_release(skb); - __skb_queue_tail(&sk->sk_write_queue, skb); - sk_charge_skb(sk, skb); - if (!sk->sk_send_head) - sk->sk_send_head = skb; + tcp_add_write_queue_tail(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); if (tp->nonagle & TCP_NAGLE_PUSH) - tp->nonagle &= ~TCP_NAGLE_PUSH; + tp->nonagle &= ~TCP_NAGLE_PUSH; } static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, @@ -484,23 +495,163 @@ static inline void tcp_mark_urg(struct tcp_sock *tp, int flags, if (flags & MSG_OOB) { tp->urg_mode = 1; tp->snd_up = tp->write_seq; - TCP_SKB_CB(skb)->sacked |= TCPCB_URG; } } -static inline void tcp_push(struct sock *sk, struct tcp_sock *tp, int flags, - int mss_now, int nonagle) +static inline void tcp_push(struct sock *sk, int flags, int mss_now, + int nonagle) { - if (sk->sk_send_head) { - struct sk_buff *skb = sk->sk_write_queue.prev; + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_send_head(sk)) { + struct sk_buff *skb = 
tcp_write_queue_tail(sk); if (!(flags & MSG_MORE) || forced_push(tp)) tcp_mark_push(tp, skb); tcp_mark_urg(tp, flags, skb); - __tcp_push_pending_frames(sk, tp, mss_now, + __tcp_push_pending_frames(sk, mss_now, (flags & MSG_MORE) ? TCP_NAGLE_CORK : nonagle); } } +static int tcp_splice_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, + unsigned int offset, size_t len) +{ + struct tcp_splice_state *tss = rd_desc->arg.data; + + return skb_splice_bits(skb, offset, tss->pipe, tss->len, tss->flags); +} + +static int __tcp_splice_read(struct sock *sk, struct tcp_splice_state *tss) +{ + /* Store TCP splice context information in read_descriptor_t. */ + read_descriptor_t rd_desc = { + .arg.data = tss, + }; + + return tcp_read_sock(sk, &rd_desc, tcp_splice_data_recv); +} + +/** + * tcp_splice_read - splice data from TCP socket to a pipe + * @sock: socket to splice from + * @ppos: position (not valid) + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Description: + * Will read pages from given socket and fill them into a pipe. + * + **/ +ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct sock *sk = sock->sk; + struct tcp_splice_state tss = { + .pipe = pipe, + .len = len, + .flags = flags, + }; + long timeo; + ssize_t spliced; + int ret; + + /* + * We can't seek on a socket input + */ + if (unlikely(*ppos)) + return -ESPIPE; + + ret = spliced = 0; + + lock_sock(sk); + + timeo = sock_rcvtimeo(sk, flags & SPLICE_F_NONBLOCK); + while (tss.len) { + ret = __tcp_splice_read(sk, &tss); + if (ret < 0) + break; + else if (!ret) { + if (spliced) + break; + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + if (sock_flag(sk, SOCK_DONE)) + break; + if (sk->sk_err) { + ret = sock_error(sk); + break; + } + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + if (sk->sk_state == TCP_CLOSE) { + /* + * This occurs when user tries to read + * from never connected socket. + */ + if (!sock_flag(sk, SOCK_DONE)) + ret = -ENOTCONN; + break; + } + if (!timeo) { + ret = -EAGAIN; + break; + } + sk_wait_data(sk, &timeo); + if (signal_pending(current)) { + ret = sock_intr_errno(timeo); + break; + } + continue; + } + tss.len -= ret; + spliced += ret; + + release_sock(sk); + lock_sock(sk); + + if (sk->sk_err || sk->sk_state == TCP_CLOSE || + (sk->sk_shutdown & RCV_SHUTDOWN) || !timeo || + signal_pending(current)) + break; + } + + release_sock(sk); + + if (spliced) + return spliced; + + return ret; +} + +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp) +{ + struct sk_buff *skb; + + /* The TCP header must be at least 32-bit aligned. */ + size = ALIGN(size, 4); + + skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp); + if (skb) { + if (sk_wmem_schedule(sk, skb->truesize)) { + /* + * Make sure that we have exactly size bytes + * available to the caller, no more, no less. 
+ */ + skb_reserve(skb, skb_tailroom(skb) - size); + return skb; + } + __kfree_skb(skb); + } else { + sk->sk_prot->enter_memory_pressure(); + sk_stream_moderate_sndbuf(sk); + } + return NULL; +} + static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffset, size_t psize, int flags) { @@ -526,23 +677,22 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page **pages, int poffse goto do_error; while (psize > 0) { - struct sk_buff *skb = sk->sk_write_queue.prev; + struct sk_buff *skb = tcp_write_queue_tail(sk); struct page *page = pages[poffset / PAGE_SIZE]; int copy, i, can_coalesce; int offset = poffset % PAGE_SIZE; int size = min_t(size_t, psize, PAGE_SIZE - offset); - if (!sk->sk_send_head || (copy = size_goal - skb->len) <= 0) { + if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_pskb(sk, 0, 0, - sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation); if (!skb) goto wait_for_memory; - skb_entail(sk, tp, skb); + skb_entail(sk, skb); copy = size_goal; } @@ -555,9 +705,9 @@ new_segment: tcp_mark_push(tp, skb); goto new_segment; } - if (!sk_stream_wmem_schedule(sk, copy)) + if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; - + if (can_coalesce) { skb_shinfo(skb)->frags[i - 1].size += copy; } else { @@ -569,7 +719,7 @@ new_segment: skb->data_len += copy; skb->truesize += copy; sk->sk_wmem_queued += copy; - sk->sk_forward_alloc -= copy; + sk_mem_charge(sk, copy); skb->ip_summed = CHECKSUM_PARTIAL; tp->write_seq += copy; TCP_SKB_CB(skb)->end_seq += copy; @@ -583,13 +733,13 @@ new_segment: if (!(psize -= copy)) goto out; - if (skb->len < mss_now || (flags & MSG_OOB)) + if (skb->len < size_goal || (flags & MSG_OOB)) continue; if (forced_push(tp)) { tcp_mark_push(tp, skb); - __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); - } else if (skb == sk->sk_send_head) + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); + } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; @@ -597,7 +747,7 @@ wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: if (copied) - tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -608,7 +758,7 @@ wait_for_memory: out: if (copied) - tcp_push(sk, tp, flags, mss_now, tp->nonagle); + tcp_push(sk, flags, mss_now, tp->nonagle); return copied; do_error: @@ -639,8 +789,9 @@ ssize_t tcp_sendpage(struct socket *sock, struct page *page, int offset, #define TCP_PAGE(sk) (sk->sk_sndmsg_page) #define TCP_OFF(sk) (sk->sk_sndmsg_off) -static inline int select_size(struct sock *sk, struct tcp_sock *tp) +static inline int select_size(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); int tmp = tp->mss_cache; if (sk->sk_route_caps & NETIF_F_SG) { @@ -658,9 +809,10 @@ static inline int select_size(struct sock *sk, struct tcp_sock *tp) return tmp; } -int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, +int tcp_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size) { + struct sock *sk = sock->sk; struct iovec *iov; struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -704,9 +856,9 @@ int tcp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, while (seglen > 0) { int copy; - skb = sk->sk_write_queue.prev; + skb = tcp_write_queue_tail(sk); - if (!sk->sk_send_head || + if 
(!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0) { new_segment: @@ -716,8 +868,8 @@ new_segment: if (!sk_stream_memory_free(sk)) goto wait_for_sndbuf; - skb = sk_stream_alloc_pskb(sk, select_size(sk, tp), - 0, sk->sk_allocation); + skb = sk_stream_alloc_skb(sk, select_size(sk), + sk->sk_allocation); if (!skb) goto wait_for_memory; @@ -727,7 +879,7 @@ new_segment: if (sk->sk_route_caps & NETIF_F_ALL_CSUM) skb->ip_summed = CHECKSUM_PARTIAL; - skb_entail(sk, tp, skb); + skb_entail(sk, skb); copy = size_goal; } @@ -774,7 +926,7 @@ new_segment: if (copy > PAGE_SIZE - off) copy = PAGE_SIZE - off; - if (!sk_stream_wmem_schedule(sk, copy)) + if (!sk_wmem_schedule(sk, copy)) goto wait_for_memory; if (!page) { @@ -827,13 +979,13 @@ new_segment: if ((seglen -= copy) == 0 && iovlen == 0) goto out; - if (skb->len < mss_now || (flags & MSG_OOB)) + if (skb->len < size_goal || (flags & MSG_OOB)) continue; if (forced_push(tp)) { tcp_mark_push(tp, skb); - __tcp_push_pending_frames(sk, tp, mss_now, TCP_NAGLE_PUSH); - } else if (skb == sk->sk_send_head) + __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_PUSH); + } else if (skb == tcp_send_head(sk)) tcp_push_one(sk, mss_now); continue; @@ -841,7 +993,7 @@ wait_for_sndbuf: set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); wait_for_memory: if (copied) - tcp_push(sk, tp, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); + tcp_push(sk, flags & ~MSG_MORE, mss_now, TCP_NAGLE_PUSH); if ((err = sk_stream_wait_memory(sk, &timeo)) != 0) goto do_error; @@ -853,17 +1005,19 @@ wait_for_memory: out: if (copied) - tcp_push(sk, tp, flags, mss_now, tp->nonagle); + tcp_push(sk, flags, mss_now, tp->nonagle); TCP_CHECK_TIMER(sk); release_sock(sk); return copied; do_fault: if (!skb->len) { - if (sk->sk_send_head == skb) - sk->sk_send_head = NULL; - __skb_unlink(skb, &sk->sk_write_queue); - sk_stream_free_skb(sk, skb); + tcp_unlink_write_queue(skb, sk); + /* It is the one place in all of TCP, except connection + * reset, where we can be unlinking the send_head. + */ + tcp_check_send_head(sk, skb); + sk_wmem_free_skb(sk, skb); } do_error: @@ -1016,9 +1170,9 @@ static inline struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off) skb_queue_walk(&sk->sk_receive_queue, skb) { offset = seq - TCP_SKB_CB(skb)->seq; - if (skb->h.th->syn) + if (tcp_hdr(skb)->syn) offset--; - if (offset < skb->len || skb->h.th->fin) { + if (offset < skb->len || tcp_hdr(skb)->fin) { *off = offset; return skb; } @@ -1062,15 +1216,26 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, break; } used = recv_actor(desc, skb, offset, len); - if (used <= len) { + if (used < 0) { + if (!copied) + copied = used; + break; + } else if (used <= len) { seq += used; copied += used; offset += used; } - if (offset != skb->len) + /* + * If recv_actor drops the lock (e.g. TCP splice + * receive) the skb pointer might be invalid when + * getting here: tcp_collapse might have deleted it + * while aggregating skbs from the socket queue. + */ + skb = tcp_recv_skb(sk, seq-1, &offset); + if (!skb || (offset+1 != skb->len)) break; } - if (skb->h.th->fin) { + if (tcp_hdr(skb)->fin) { sk_eat_skb(sk, skb, 0); ++seq; break; @@ -1084,7 +1249,7 @@ int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, tcp_rcv_space_adjust(sk); /* Clean up data we have read: This will do ACK frames. 
*/ - if (copied) + if (copied > 0) tcp_cleanup_rbuf(sk, copied); return copied; } @@ -1110,6 +1275,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, long timeo; struct task_struct *user_recv = NULL; int copied_early = 0; + struct sk_buff *skb; lock_sock(sk); @@ -1136,16 +1302,26 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, #ifdef CONFIG_NET_DMA tp->ucopy.dma_chan = NULL; preempt_disable(); - if ((len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && - !sysctl_tcp_low_latency && __get_cpu_var(softnet_data).net_dma) { - preempt_enable_no_resched(); - tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); - } else - preempt_enable_no_resched(); + skb = skb_peek_tail(&sk->sk_receive_queue); + { + int available = 0; + + if (skb) + available = TCP_SKB_CB(skb)->seq + skb->len - (*seq); + if ((available < target) && + (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && + !sysctl_tcp_low_latency && + __get_cpu_var(softnet_data).net_dma) { + preempt_enable_no_resched(); + tp->ucopy.pinned_list = + dma_pin_iovec_pages(msg->msg_iov, len); + } else { + preempt_enable_no_resched(); + } + } #endif do { - struct sk_buff *skb; u32 offset; /* Are we at urgent data? Stop if we have read anything or have SIGURG pending. */ @@ -1174,11 +1350,11 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, break; } offset = *seq - TCP_SKB_CB(skb)->seq; - if (skb->h.th->syn) + if (tcp_hdr(skb)->syn) offset--; if (offset < skb->len) goto found_ok_skb; - if (skb->h.th->fin) + if (tcp_hdr(skb)->fin) goto found_fin_ok; BUG_TRAP(flags & MSG_PEEK); skb = skb->next; @@ -1315,7 +1491,7 @@ do_prequeue: if ((flags & MSG_PEEK) && peek_seq != tp->copied_seq) { if (net_ratelimit()) printk(KERN_DEBUG "TCP(%s:%d): Application bug, race in MSG_PEEK.\n", - current->comm, current->pid); + current->comm, task_pid_nr(current)); peek_seq = tp->copied_seq; } continue; @@ -1389,12 +1565,12 @@ do_prequeue: skip_copy: if (tp->urg_data && after(tp->copied_seq, tp->urg_seq)) { tp->urg_data = 0; - tcp_fast_path_check(sk, tp); + tcp_fast_path_check(sk); } if (used + offset < skb->len) continue; - if (skb->h.th->fin) + if (tcp_hdr(skb)->fin) goto found_fin_ok; if (!(flags & MSG_PEEK)) { sk_eat_skb(sk, skb, copied_early); @@ -1433,18 +1609,17 @@ skip_copy: #ifdef CONFIG_NET_DMA if (tp->ucopy.dma_chan) { - struct sk_buff *skb; dma_cookie_t done, used; dma_async_memcpy_issue_pending(tp->ucopy.dma_chan); while (dma_async_memcpy_complete(tp->ucopy.dma_chan, - tp->ucopy.dma_cookie, &done, - &used) == DMA_IN_PROGRESS) { + tp->ucopy.dma_cookie, &done, + &used) == DMA_IN_PROGRESS) { /* do partial cleanup of sk_async_wait_queue */ while ((skb = skb_peek(&sk->sk_async_wait_queue)) && (dma_async_is_complete(skb->dma_cookie, done, - used) == DMA_SUCCESS)) { + used) == DMA_SUCCESS)) { __skb_dequeue(&sk->sk_async_wait_queue); kfree_skb(skb); } @@ -1482,6 +1657,41 @@ recv_urg: goto out; } +void tcp_set_state(struct sock *sk, int state) +{ + int oldstate = sk->sk_state; + + switch (state) { + case TCP_ESTABLISHED: + if (oldstate != TCP_ESTABLISHED) + TCP_INC_STATS(TCP_MIB_CURRESTAB); + break; + + case TCP_CLOSE: + if (oldstate == TCP_CLOSE_WAIT || oldstate == TCP_ESTABLISHED) + TCP_INC_STATS(TCP_MIB_ESTABRESETS); + + sk->sk_prot->unhash(sk); + if (inet_csk(sk)->icsk_bind_hash && + !(sk->sk_userlocks & SOCK_BINDPORT_LOCK)) + inet_put_port(sk); + /* fall through */ + default: + if (oldstate==TCP_ESTABLISHED) + TCP_DEC_STATS(TCP_MIB_CURRESTAB); + } + + /* Change state 
AFTER socket is unhashed to avoid closed + * socket sitting in hash tables. + */ + sk->sk_state = state; + +#ifdef STATE_TRACE + SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n",sk, statename[oldstate],statename[state]); +#endif +} +EXPORT_SYMBOL_GPL(tcp_set_state); + /* * State processing on a close. This implements the state shift for * sending our FIN frame. Note that we only send a FIN for some @@ -1517,7 +1727,7 @@ static int tcp_close_state(struct sock *sk) /* * Shutdown the sending side of a connection. Much like close except - * that we don't receive shut down or set_sock_flag(sk, SOCK_DEAD). + * that we don't receive shut down or sock_set_flag(sk, SOCK_DEAD). */ void tcp_shutdown(struct sock *sk, int how) @@ -1563,21 +1773,19 @@ void tcp_close(struct sock *sk, long timeout) */ while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) { u32 len = TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq - - skb->h.th->fin; + tcp_hdr(skb)->fin; data_was_unread += len; __kfree_skb(skb); } - sk_stream_mem_reclaim(sk); + sk_mem_reclaim(sk); - /* As outlined in draft-ietf-tcpimpl-prob-03.txt, section - * 3.10, we send a RST here because data was lost. To - * witness the awful effects of the old behavior of always - * doing a FIN, run an older 2.1.x kernel or 2.0.x, start - * a bulk GET in an FTP client, suspend the process, wait - * for the client to advertise a zero window, then kill -9 - * the FTP client, wheee... Note: timeout is always zero - * in such a case. + /* As outlined in RFC 2525, section 2.17, we send a RST here because + * data was lost. To witness the awful effects of the old behavior of + * always doing a FIN, run an older 2.1.x kernel or 2.0.x, start a bulk + * GET in an FTP client, suspend the process, wait for the client to + * advertise a zero window, then kill -9 the FTP client, wheee... + * Note: timeout is always zero in such a case. */ if (data_was_unread) { /* Unread data was tossed, zap the connection. */ @@ -1673,10 +1881,9 @@ adjudge_to_death: } } if (sk->sk_state != TCP_CLOSE) { - sk_stream_mem_reclaim(sk); - if (atomic_read(sk->sk_prot->orphan_count) > sysctl_tcp_max_orphans || - (sk->sk_wmem_queued > SOCK_MIN_SNDBUF && - atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) { + sk_mem_reclaim(sk); + if (tcp_too_many_orphans(sk, + atomic_read(sk->sk_prot->orphan_count))) { if (net_ratelimit()) printk(KERN_INFO "TCP: too many of orphaned " "sockets\n"); @@ -1732,7 +1939,7 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_clear_xmit_timers(sk); __skb_queue_purge(&sk->sk_receive_queue); - sk_stream_writequeue_purge(sk); + tcp_write_queue_purge(sk); __skb_queue_purge(&tp->out_of_order_queue); #ifdef CONFIG_NET_DMA __skb_queue_purge(&sk->sk_async_wait_queue); @@ -1758,9 +1965,8 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); - sk->sk_send_head = NULL; - tp->rx_opt.saw_tstamp = 0; - tcp_sack_reset(&tp->rx_opt); + tcp_init_send_head(sk); + memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); BUG_TRAP(!inet->num || icsk->icsk_bind_hash); @@ -1830,7 +2036,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, * for currently queued segments. 
*/ tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk, tp); + tcp_push_pending_frames(sk); } else { tp->nonagle &= ~TCP_NAGLE_OFF; } @@ -1854,7 +2060,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, tp->nonagle &= ~TCP_NAGLE_CORK; if (tp->nonagle&TCP_NAGLE_OFF) tp->nonagle |= TCP_NAGLE_PUSH; - tcp_push_pending_frames(sk, tp); + tcp_push_pending_frames(sk); } break; @@ -1954,7 +2160,8 @@ static int do_tcp_setsockopt(struct sock *sk, int level, default: err = -ENOPROTOOPT; break; - }; + } + release_sock(sk); return err; } @@ -2000,13 +2207,13 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) if (tp->rx_opt.tstamp_ok) info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) info->tcpi_options |= TCPI_OPT_SACK; if (tp->rx_opt.wscale_ok) { info->tcpi_options |= TCPI_OPT_WSCALE; info->tcpi_snd_wscale = tp->rx_opt.snd_wscale; info->tcpi_rcv_wscale = tp->rx_opt.rcv_wscale; - } + } if (tp->ecn_flags&TCP_ECN_OK) info->tcpi_options |= TCPI_OPT_ECN; @@ -2016,8 +2223,13 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_snd_mss = tp->mss_cache; info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss; - info->tcpi_unacked = tp->packets_out; - info->tcpi_sacked = tp->sacked_out; + if (sk->sk_state == TCP_LISTEN) { + info->tcpi_unacked = sk->sk_ack_backlog; + info->tcpi_sacked = sk->sk_max_ack_backlog; + } else { + info->tcpi_unacked = tp->packets_out; + info->tcpi_sacked = tp->sacked_out; + } info->tcpi_lost = tp->lost_out; info->tcpi_retrans = tp->retrans_out; info->tcpi_fackets = tp->fackets_out; @@ -2124,7 +2336,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, return 0; default: return -ENOPROTOOPT; - }; + } if (put_user(len, optlen)) return -EFAULT; @@ -2170,7 +2382,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) if (!pskb_may_pull(skb, sizeof(*th))) goto out; - th = skb->h.th; + th = tcp_hdr(skb); thlen = th->doff * 4; if (thlen < sizeof(*th)) goto out; @@ -2196,7 +2408,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) goto out; mss = skb_shinfo(skb)->gso_size; - skb_shinfo(skb)->gso_segs = (skb->len + mss - 1) / mss; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(skb->len, mss); segs = NULL; goto out; @@ -2210,7 +2422,7 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) delta = htonl(oldlen + (thlen + len)); skb = segs; - th = skb->h.th; + th = tcp_hdr(skb); seq = ntohl(th->seq); do { @@ -2219,23 +2431,25 @@ struct sk_buff *tcp_tso_segment(struct sk_buff *skb, int features) th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = csum_fold(csum_partial(skb->h.raw, thlen, - skb->csum)); + th->check = + csum_fold(csum_partial(skb_transport_header(skb), + thlen, skb->csum)); seq += len; skb = skb->next; - th = skb->h.th; + th = tcp_hdr(skb); th->seq = htonl(seq); th->cwr = 0; } while (skb->next); - delta = htonl(oldlen + (skb->tail - skb->h.raw) + skb->data_len); + delta = htonl(oldlen + (skb->tail - skb->transport_header) + + skb->data_len); th->check = ~csum_fold((__force __wsum)((__force u32)th->check + (__force u32)delta)); if (skb->ip_summed != CHECKSUM_PARTIAL) - th->check = csum_fold(csum_partial(skb->h.raw, thlen, - skb->csum)); + th->check = csum_fold(csum_partial(skb_transport_header(skb), + thlen, skb->csum)); out: return segs; @@ -2247,6 +2461,76 @@ static unsigned long tcp_md5sig_users; static struct tcp_md5sig_pool **tcp_md5sig_pool; static 
DEFINE_SPINLOCK(tcp_md5sig_pool_lock); +int tcp_calc_md5_hash(char *md5_hash, struct tcp_md5sig_key *key, + int bplen, + struct tcphdr *th, unsigned int tcplen, + struct tcp_md5sig_pool *hp) +{ + struct scatterlist sg[4]; + __u16 data_len; + int block = 0; + __sum16 cksum; + struct hash_desc *desc = &hp->md5_desc; + int err; + unsigned int nbytes = 0; + + sg_init_table(sg, 4); + + /* 1. The TCP pseudo-header */ + sg_set_buf(&sg[block++], &hp->md5_blk, bplen); + nbytes += bplen; + + /* 2. The TCP header, excluding options, and assuming a + * checksum of zero + */ + cksum = th->check; + th->check = 0; + sg_set_buf(&sg[block++], th, sizeof(*th)); + nbytes += sizeof(*th); + + /* 3. The TCP segment data (if any) */ + data_len = tcplen - (th->doff << 2); + if (data_len > 0) { + u8 *data = (u8 *)th + (th->doff << 2); + sg_set_buf(&sg[block++], data, data_len); + nbytes += data_len; + } + + /* 4. an independently-specified key or password, known to both + * TCPs and presumably connection-specific + */ + sg_set_buf(&sg[block++], key->key, key->keylen); + nbytes += key->keylen; + + sg_mark_end(&sg[block - 1]); + + /* Now store the hash into the packet */ + err = crypto_hash_init(desc); + if (err) { + if (net_ratelimit()) + printk(KERN_WARNING "%s(): hash_init failed\n", __func__); + return -1; + } + err = crypto_hash_update(desc, sg, nbytes); + if (err) { + if (net_ratelimit()) + printk(KERN_WARNING "%s(): hash_update failed\n", __func__); + return -1; + } + err = crypto_hash_final(desc, md5_hash); + if (err) { + if (net_ratelimit()) + printk(KERN_WARNING "%s(): hash_final failed\n", __func__); + return -1; + } + + /* Reset header */ + th->check = cksum; + + return 0; +} +EXPORT_SYMBOL(tcp_calc_md5_hash); + static void __tcp_free_md5sig_pool(struct tcp_md5sig_pool **pool) { int cpu; @@ -2266,12 +2550,12 @@ void tcp_free_md5sig_pool(void) { struct tcp_md5sig_pool **pool = NULL; - spin_lock(&tcp_md5sig_pool_lock); + spin_lock_bh(&tcp_md5sig_pool_lock); if (--tcp_md5sig_users == 0) { pool = tcp_md5sig_pool; tcp_md5sig_pool = NULL; } - spin_unlock(&tcp_md5sig_pool_lock); + spin_unlock_bh(&tcp_md5sig_pool_lock); if (pool) __tcp_free_md5sig_pool(pool); } @@ -2314,36 +2598,36 @@ struct tcp_md5sig_pool **tcp_alloc_md5sig_pool(void) int alloc = 0; retry: - spin_lock(&tcp_md5sig_pool_lock); + spin_lock_bh(&tcp_md5sig_pool_lock); pool = tcp_md5sig_pool; if (tcp_md5sig_users++ == 0) { alloc = 1; - spin_unlock(&tcp_md5sig_pool_lock); + spin_unlock_bh(&tcp_md5sig_pool_lock); } else if (!pool) { tcp_md5sig_users--; - spin_unlock(&tcp_md5sig_pool_lock); + spin_unlock_bh(&tcp_md5sig_pool_lock); cpu_relax(); goto retry; } else - spin_unlock(&tcp_md5sig_pool_lock); + spin_unlock_bh(&tcp_md5sig_pool_lock); if (alloc) { /* we cannot hold spinlock here because this may sleep. */ struct tcp_md5sig_pool **p = __tcp_alloc_md5sig_pool(); - spin_lock(&tcp_md5sig_pool_lock); + spin_lock_bh(&tcp_md5sig_pool_lock); if (!p) { tcp_md5sig_users--; - spin_unlock(&tcp_md5sig_pool_lock); + spin_unlock_bh(&tcp_md5sig_pool_lock); return NULL; } pool = tcp_md5sig_pool; if (pool) { /* oops, it has already been assigned. 
*/ - spin_unlock(&tcp_md5sig_pool_lock); + spin_unlock_bh(&tcp_md5sig_pool_lock); __tcp_free_md5sig_pool(p); } else { tcp_md5sig_pool = pool = p; - spin_unlock(&tcp_md5sig_pool_lock); + spin_unlock_bh(&tcp_md5sig_pool_lock); } } return pool; @@ -2354,24 +2638,41 @@ EXPORT_SYMBOL(tcp_alloc_md5sig_pool); struct tcp_md5sig_pool *__tcp_get_md5sig_pool(int cpu) { struct tcp_md5sig_pool **p; - spin_lock(&tcp_md5sig_pool_lock); + spin_lock_bh(&tcp_md5sig_pool_lock); p = tcp_md5sig_pool; if (p) tcp_md5sig_users++; - spin_unlock(&tcp_md5sig_pool_lock); + spin_unlock_bh(&tcp_md5sig_pool_lock); return (p ? *per_cpu_ptr(p, cpu) : NULL); } EXPORT_SYMBOL(__tcp_get_md5sig_pool); -void __tcp_put_md5sig_pool(void) { - __tcp_free_md5sig_pool(tcp_md5sig_pool); +void __tcp_put_md5sig_pool(void) +{ + tcp_free_md5sig_pool(); } EXPORT_SYMBOL(__tcp_put_md5sig_pool); #endif -extern void __skb_cb_too_small_for_tcp(int, int); +void tcp_done(struct sock *sk) +{ + if(sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV) + TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS); + + tcp_set_state(sk, TCP_CLOSE); + tcp_clear_xmit_timers(sk); + + sk->sk_shutdown = SHUTDOWN_MASK; + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); + else + inet_csk_destroy_sock(sk); +} +EXPORT_SYMBOL_GPL(tcp_done); + extern struct tcp_congestion_ops tcp_reno; static __initdata unsigned long thash_entries; @@ -2390,14 +2691,12 @@ void __init tcp_init(void) unsigned long limit; int order, i, max_share; - if (sizeof(struct tcp_skb_cb) > sizeof(skb->cb)) - __skb_cb_too_small_for_tcp(sizeof(struct tcp_skb_cb), - sizeof(skb->cb)); + BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); tcp_hashinfo.bind_bucket_cachep = kmem_cache_create("tcp_bind_bucket", sizeof(struct inet_bind_bucket), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); /* Size and allocate the main established and bind bucket * hash tables. @@ -2413,13 +2712,14 @@ void __init tcp_init(void) 0, &tcp_hashinfo.ehash_size, NULL, - 0); - tcp_hashinfo.ehash_size = (1 << tcp_hashinfo.ehash_size) >> 1; - for (i = 0; i < (tcp_hashinfo.ehash_size << 1); i++) { - rwlock_init(&tcp_hashinfo.ehash[i].lock); + thash_entries ? 0 : 512 * 1024); + tcp_hashinfo.ehash_size = 1 << tcp_hashinfo.ehash_size; + for (i = 0; i < tcp_hashinfo.ehash_size; i++) { INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].chain); + INIT_HLIST_HEAD(&tcp_hashinfo.ehash[i].twchain); } - + if (inet_ehash_locks_alloc(&tcp_hashinfo)) + panic("TCP: failed to alloc ehash_locks"); tcp_hashinfo.bhash = alloc_large_system_hash("TCP bind", sizeof(struct inet_bind_hashbucket), @@ -2444,37 +2744,41 @@ void __init tcp_init(void) order++) ; if (order >= 4) { - sysctl_local_port_range[0] = 32768; - sysctl_local_port_range[1] = 61000; tcp_death_row.sysctl_max_tw_buckets = 180000; sysctl_tcp_max_orphans = 4096 << (order - 4); sysctl_max_syn_backlog = 1024; } else if (order < 3) { - sysctl_local_port_range[0] = 1024 * (3 - order); tcp_death_row.sysctl_max_tw_buckets >>= (3 - order); sysctl_tcp_max_orphans >>= (3 - order); sysctl_max_syn_backlog = 128; } - /* Allow no more than 3/4 kernel memory (usually less) allocated to TCP */ - sysctl_tcp_mem[0] = (1536 / sizeof (struct inet_bind_hashbucket)) << order; - sysctl_tcp_mem[1] = sysctl_tcp_mem[0] * 4 / 3; + /* Set the pressure threshold to be a fraction of global memory that + * is up to 1/2 at 256 MB, decreasing toward zero with the amount of + * memory, with a floor of 128 pages. 
+ */ + limit = min(nr_all_pages, 1UL<<(28-PAGE_SHIFT)) >> (20-PAGE_SHIFT); + limit = (limit * (nr_all_pages >> (20-PAGE_SHIFT))) >> (PAGE_SHIFT-11); + limit = max(limit, 128UL); + sysctl_tcp_mem[0] = limit / 4 * 3; + sysctl_tcp_mem[1] = limit; sysctl_tcp_mem[2] = sysctl_tcp_mem[0] * 2; + /* Set per-socket limits to no more than 1/128 the pressure threshold */ limit = ((unsigned long)sysctl_tcp_mem[1]) << (PAGE_SHIFT - 7); max_share = min(4UL*1024*1024, limit); - sysctl_tcp_wmem[0] = SK_STREAM_MEM_QUANTUM; + sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; sysctl_tcp_wmem[1] = 16*1024; sysctl_tcp_wmem[2] = max(64*1024, max_share); - sysctl_tcp_rmem[0] = SK_STREAM_MEM_QUANTUM; + sysctl_tcp_rmem[0] = SK_MEM_QUANTUM; sysctl_tcp_rmem[1] = 87380; sysctl_tcp_rmem[2] = max(87380, max_share); printk(KERN_INFO "TCP: Hash tables configured " "(established %d bind %d)\n", - tcp_hashinfo.ehash_size << 1, tcp_hashinfo.bhash_size); + tcp_hashinfo.ehash_size, tcp_hashinfo.bhash_size); tcp_register_congestion_control(&tcp_reno); } @@ -2487,6 +2791,7 @@ EXPORT_SYMBOL(tcp_poll); EXPORT_SYMBOL(tcp_read_sock); EXPORT_SYMBOL(tcp_recvmsg); EXPORT_SYMBOL(tcp_sendmsg); +EXPORT_SYMBOL(tcp_splice_read); EXPORT_SYMBOL(tcp_sendpage); EXPORT_SYMBOL(tcp_setsockopt); EXPORT_SYMBOL(tcp_shutdown);
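
The tcp_splice_read() path exported above is what services splice(2) on a TCP socket after this patch: skb pages from the receive queue are handed to skb_splice_bits() and linked into the pipe rather than copied through userspace. A minimal userspace sketch of driving that path follows; it is illustrative only — the descriptor names, the 64 KB chunk size and the error handling are assumptions, not part of the patch.

/*
 * Illustrative only: splice data arriving on a connected TCP socket
 * into a file through a pipe, which exercises tcp_splice_read() on
 * the socket side.  "sock_fd" and "out_fd" are assumed to be an
 * already-connected TCP socket and an open output file.
 */
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

static int splice_socket_to_file(int sock_fd, int out_fd)
{
	int pipefd[2];
	ssize_t in, out;

	if (pipe(pipefd) < 0)
		return -1;

	for (;;) {
		/* socket -> pipe: served by tcp_splice_read() */
		in = splice(sock_fd, NULL, pipefd[1], NULL,
			    65536, SPLICE_F_MOVE | SPLICE_F_MORE);
		if (in <= 0)
			break;			/* 0: peer closed, <0: error */

		/* pipe -> file */
		while (in > 0) {
			out = splice(pipefd[0], NULL, out_fd, NULL,
				     in, SPLICE_F_MOVE);
			if (out <= 0) {
				in = -1;
				break;
			}
			in -= out;
		}
		if (in < 0)
			break;
	}

	close(pipefd[0]);
	close(pipefd[1]);
	return in < 0 ? -1 : 0;
}

No offset is passed for the socket side because tcp_splice_read() rejects a non-zero *ppos with -ESPIPE, as seen in the hunk above.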
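
Similarly, the tcp_get_info() change gives listening sockets meaningful values in tcpi_unacked and tcpi_sacked — the current and maximum accept backlog — instead of the sender-side packets_out/sacked_out counters, which never advance on a listener and previously read back as zero. A short, hedged userspace check is sketched below; it assumes glibc's struct tcp_info in <netinet/tcp.h> mirrors the kernel layout for those two fields, and the loopback listener and backlog of 16 are arbitrary.

/*
 * Illustrative only: after this patch, TCP_INFO on a socket in the
 * LISTEN state reports the accept-queue occupancy in tcpi_unacked and
 * its limit in tcpi_sacked.  Error handling is minimal.
 */
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

int main(void)
{
	struct sockaddr_in addr;
	struct tcp_info info;
	socklen_t len = sizeof(info);
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	memset(&addr, 0, sizeof(addr));
	addr.sin_family = AF_INET;
	addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	addr.sin_port = 0;			/* any free port */

	if (fd < 0 ||
	    bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
	    listen(fd, 16) < 0)
		return 1;

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
		return 1;

	/* With this patch: queued connections / configured backlog. */
	printf("accept queue: %u / %u\n", info.tcpi_unacked, info.tcpi_sacked);
	return 0;
}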