X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fipv4%2Ftcp_output.c;h=da2c3b8794f2b39d3c5e4948b08cd78167f5ae2e;hb=76a67ec6fb79ff3570dcb5342142c16098299911;hp=9a9510acb147ff555320e7b460612708a00dcad1;hpb=b92edbe0b8a36a833c16b0cbafb6e899b81ffc08;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 9a9510a..da2c3b8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -5,8 +5,6 @@ * * Implementation of the Transmission Control Protocol(TCP). * - * Version: $Id: tcp_output.c,v 1.146 2002/02/01 22:01:04 davem Exp $ - * * Authors: Ross Biro * Fred N. van Kempen, * Mark Evans, @@ -44,7 +42,7 @@ /* People can turn this off for buggy TCP's found in printers etc. */ int sysctl_tcp_retrans_collapse __read_mostly = 1; -/* People can turn this on to work with those rare, broken TCPs that +/* People can turn this on to work with those rare, broken TCPs that * interpret the window field as a signed quantity. */ int sysctl_tcp_workaround_signed_windows __read_mostly = 0; @@ -61,29 +59,22 @@ int sysctl_tcp_base_mss __read_mostly = 512; /* By default, RFC2861 behavior. */ int sysctl_tcp_slow_start_after_idle __read_mostly = 1; -static inline void tcp_packets_out_inc(struct sock *sk, - const struct sk_buff *skb) -{ - struct tcp_sock *tp = tcp_sk(sk); - int orig = tp->packets_out; - - tp->packets_out += tcp_skb_pcount(skb); - if (!orig) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, TCP_RTO_MAX); -} - -static void update_send_head(struct sock *sk, struct sk_buff *skb) +static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + unsigned int prior_packets = tp->packets_out; tcp_advance_send_head(sk, skb); tp->snd_nxt = TCP_SKB_CB(skb)->end_seq; - tcp_packets_out_inc(sk, skb); /* Don't override Nagle indefinately with F-RTO */ if (tp->frto_counter == 2) tp->frto_counter = 3; + + tp->packets_out += tcp_skb_pcount(skb); + if (!prior_packets) + inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, + inet_csk(sk)->icsk_rto, TCP_RTO_MAX); } /* SND.NXT, if window was not shrunk. @@ -96,10 +87,10 @@ static inline __u32 tcp_acceptable_seq(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (!before(tp->snd_una+tp->snd_wnd, tp->snd_nxt)) + if (!before(tcp_wnd_end(tp), tp->snd_nxt)) return tp->snd_nxt; else - return tp->snd_una+tp->snd_wnd; + return tcp_wnd_end(tp); } /* Calculate mss to advertise in SYN segment. @@ -228,14 +219,14 @@ void tcp_select_initial_window(int __space, __u32 mss, * following RFC2414. Senders, not following this RFC, * will be satisfied with 2. */ - if (mss > (1<<*rcv_wscale)) { + if (mss > (1 << *rcv_wscale)) { int init_cwnd = 4; - if (mss > 1460*3) + if (mss > 1460 * 3) init_cwnd = 2; else if (mss > 1460) init_cwnd = 3; - if (*rcv_wnd > init_cwnd*mss) - *rcv_wnd = init_cwnd*mss; + if (*rcv_wnd > init_cwnd * mss) + *rcv_wnd = init_cwnd * mss; } /* Set the clamp no higher than max representable value */ @@ -262,7 +253,7 @@ static u16 tcp_select_window(struct sock *sk) * * Relax Will Robinson. */ - new_win = cur_win; + new_win = ALIGN(cur_win, 1 << tp->rx_opt.rcv_wscale); } tp->rcv_wnd = new_win; tp->rcv_wup = tp->rcv_nxt; @@ -285,11 +276,10 @@ static u16 tcp_select_window(struct sock *sk) return new_win; } -static inline void TCP_ECN_send_synack(struct tcp_sock *tp, - struct sk_buff *skb) +static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb) { TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR; - if (!(tp->ecn_flags&TCP_ECN_OK)) + if (!(tp->ecn_flags & TCP_ECN_OK)) TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE; } @@ -299,7 +289,7 @@ static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb) tp->ecn_flags = 0; if (sysctl_tcp_ecn) { - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE|TCPCB_FLAG_CWR; + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR; tp->ecn_flags = TCP_ECN_OK; } } @@ -321,7 +311,7 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, if (skb->len != tcp_header_len && !before(TCP_SKB_CB(skb)->seq, tp->snd_nxt)) { INET_ECN_xmit(sk); - if (tp->ecn_flags&TCP_ECN_QUEUE_CWR) { + if (tp->ecn_flags & TCP_ECN_QUEUE_CWR) { tp->ecn_flags &= ~TCP_ECN_QUEUE_CWR; tcp_hdr(skb)->cwr = 1; skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; @@ -335,112 +325,260 @@ static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb, } } -static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp, - __u32 tstamp, __u8 **md5_hash) +/* Constructs common control bits of non-data skb. If SYN/FIN is present, + * auto increment end seqno. + */ +static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags) { - if (tp->rx_opt.tstamp_ok) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | - TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); - *ptr++ = htonl(tp->rx_opt.ts_recent); - } - if (tp->rx_opt.eff_sacks) { - struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; - int this_sack; + skb->csum = 0; - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_SACK << 8) | - (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * - TCPOLEN_SACK_PERBLOCK))); + TCP_SKB_CB(skb)->flags = flags; + TCP_SKB_CB(skb)->sacked = 0; - for (this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { - *ptr++ = htonl(sp[this_sack].start_seq); - *ptr++ = htonl(sp[this_sack].end_seq); - } + skb_shinfo(skb)->gso_segs = 1; + skb_shinfo(skb)->gso_size = 0; + skb_shinfo(skb)->gso_type = 0; - if (tp->rx_opt.dsack) { - tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks--; - } - } -#ifdef CONFIG_TCP_MD5SIG - if (md5_hash) { + TCP_SKB_CB(skb)->seq = seq; + if (flags & (TCPCB_FLAG_SYN | TCPCB_FLAG_FIN)) + seq++; + TCP_SKB_CB(skb)->end_seq = seq; +} + +static inline int tcp_urg_mode(const struct tcp_sock *tp) +{ + return tp->snd_una != tp->snd_up; +} + +#define OPTION_SACK_ADVERTISE (1 << 0) +#define OPTION_TS (1 << 1) +#define OPTION_MD5 (1 << 2) + +struct tcp_out_options { + u8 options; /* bit field of OPTION_* */ + u8 ws; /* window scale, 0 to disable */ + u8 num_sack_blocks; /* number of SACK blocks to include */ + u16 mss; /* 0 to disable */ + __u32 tsval, tsecr; /* need to include OPTION_TS */ +}; + +/* Beware: Something in the Internet is very sensitive to the ordering of + * TCP options, we learned this through the hard way, so be careful here. + * Luckily we can at least blame others for their non-compliance but from + * inter-operatibility perspective it seems that we're somewhat stuck with + * the ordering which we have been using if we want to keep working with + * those broken things (not that it currently hurts anybody as there isn't + * particular reason why the ordering would need to be changed). + * + * At least SACK_PERM as the first option is known to lead to a disaster + * (but it may well be that other scenarios fail similarly). + */ +static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp, + const struct tcp_out_options *opts, + __u8 **md5_hash) { + if (unlikely(OPTION_MD5 & opts->options)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_MD5SIG << 8) | TCPOLEN_MD5SIG); *md5_hash = (__u8 *)ptr; + ptr += 4; + } else { + *md5_hash = NULL; } -#endif -} -/* Construct a tcp options header for a SYN or SYN_ACK packet. - * If this is every changed make sure to change the definition of - * MAX_SYN_SIZE to match the new maximum number of options that you - * can generate. - * - * Note - that with the RFC2385 TCP option, we make room for the - * 16 byte MD5 hash. This will be filled in later, so the pointer for the - * location to be filled is passed back up. - */ -static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, - int offer_wscale, int wscale, __u32 tstamp, - __u32 ts_recent, __u8 **md5_hash) -{ - /* We always get an MSS option. - * The option bytes which will be seen in normal data - * packets should timestamps be used, must be in the MSS - * advertised. But we subtract them from tp->mss_cache so - * that calculations in tcp_sendmsg are simpler etc. - * So account for this fact here if necessary. If we - * don't do this correctly, as a receiver we won't - * recognize data packets as being full sized when we - * should, and thus we won't abide by the delayed ACK - * rules correctly. - * SACKs don't matter, we never delay an ACK when we - * have any of those going out. - */ - *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); - if (ts) { - if (sack) + if (unlikely(opts->mss)) { + *ptr++ = htonl((TCPOPT_MSS << 24) | + (TCPOLEN_MSS << 16) | + opts->mss); + } + + if (likely(OPTION_TS & opts->options)) { + if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) { *ptr++ = htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - else + } else { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); /* TSVAL */ - *ptr++ = htonl(ts_recent); /* TSECR */ - } else if (sack) + } + *ptr++ = htonl(opts->tsval); + *ptr++ = htonl(opts->tsecr); + } + + if (unlikely(OPTION_SACK_ADVERTISE & opts->options && + !(OPTION_TS & opts->options))) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); - if (offer_wscale) + } + + if (unlikely(opts->ws)) { *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | - (wscale)); + opts->ws); + } + + if (unlikely(opts->num_sack_blocks)) { + struct tcp_sack_block *sp = tp->rx_opt.dsack ? + tp->duplicate_sack : tp->selective_acks; + int this_sack; + + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK << 8) | + (TCPOLEN_SACK_BASE + (opts->num_sack_blocks * + TCPOLEN_SACK_PERBLOCK))); + + for (this_sack = 0; this_sack < opts->num_sack_blocks; + ++this_sack) { + *ptr++ = htonl(sp[this_sack].start_seq); + *ptr++ = htonl(sp[this_sack].end_seq); + } + + if (tp->rx_opt.dsack) { + tp->rx_opt.dsack = 0; + tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks; + } + } +} + +static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + struct tcp_sock *tp = tcp_sk(sk); + unsigned size = 0; + #ifdef CONFIG_TCP_MD5SIG - /* - * If MD5 is enabled, then we set the option, and include the size - * (always 18). The actual MD5 hash is added just before the - * packet is sent. - */ - if (md5_hash) { - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_MD5SIG << 8) | - TCPOLEN_MD5SIG); - *md5_hash = (__u8 *) ptr; + *md5 = tp->af_specific->md5_lookup(sk, sk); + if (*md5) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; + } +#else + *md5 = NULL; +#endif + + /* We always get an MSS option. The option bytes which will be seen in + * normal data packets should timestamps be used, must be in the MSS + * advertised. But we subtract them from tp->mss_cache so that + * calculations in tcp_sendmsg are simpler etc. So account for this + * fact here if necessary. If we don't do this correctly, as a + * receiver we won't recognize data packets as being full sized when we + * should, and thus we won't abide by the delayed ACK rules correctly. + * SACKs don't matter, we never delay an ACK when we have any of those + * going out. */ + opts->mss = tcp_advertise_mss(sk); + size += TCPOLEN_MSS_ALIGNED; + + if (likely(sysctl_tcp_timestamps && *md5 == NULL)) { + opts->options |= OPTION_TS; + opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsecr = tp->rx_opt.ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + if (likely(sysctl_tcp_window_scaling)) { + opts->ws = tp->rx_opt.rcv_wscale; + if (likely(opts->ws)) + size += TCPOLEN_WSCALE_ALIGNED; + } + if (likely(sysctl_tcp_sack)) { + opts->options |= OPTION_SACK_ADVERTISE; + if (unlikely(!(OPTION_TS & opts->options))) + size += TCPOLEN_SACKPERM_ALIGNED; + } + + return size; +} + +static unsigned tcp_synack_options(struct sock *sk, + struct request_sock *req, + unsigned mss, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + unsigned size = 0; + struct inet_request_sock *ireq = inet_rsk(req); + char doing_ts; + +#ifdef CONFIG_TCP_MD5SIG + *md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); + if (*md5) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; + } +#else + *md5 = NULL; +#endif + + /* we can't fit any SACK blocks in a packet with MD5 + TS + options. There was discussion about disabling SACK rather than TS in + order to fit in better with old, buggy kernels, but that was deemed + to be unnecessary. */ + doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok); + + opts->mss = mss; + size += TCPOLEN_MSS_ALIGNED; + + if (likely(ireq->wscale_ok)) { + opts->ws = ireq->rcv_wscale; + if (likely(opts->ws)) + size += TCPOLEN_WSCALE_ALIGNED; + } + if (likely(doing_ts)) { + opts->options |= OPTION_TS; + opts->tsval = TCP_SKB_CB(skb)->when; + opts->tsecr = req->ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + if (likely(ireq->sack_ok)) { + opts->options |= OPTION_SACK_ADVERTISE; + if (unlikely(!doing_ts)) + size += TCPOLEN_SACKPERM_ALIGNED; + } + + return size; +} + +static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb, + struct tcp_out_options *opts, + struct tcp_md5sig_key **md5) { + struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL; + struct tcp_sock *tp = tcp_sk(sk); + unsigned size = 0; + +#ifdef CONFIG_TCP_MD5SIG + *md5 = tp->af_specific->md5_lookup(sk, sk); + if (unlikely(*md5)) { + opts->options |= OPTION_MD5; + size += TCPOLEN_MD5SIG_ALIGNED; } +#else + *md5 = NULL; #endif + + if (likely(tp->rx_opt.tstamp_ok)) { + opts->options |= OPTION_TS; + opts->tsval = tcb ? tcb->when : 0; + opts->tsecr = tp->rx_opt.ts_recent; + size += TCPOLEN_TSTAMP_ALIGNED; + } + + if (unlikely(tp->rx_opt.eff_sacks)) { + const unsigned remaining = MAX_TCP_OPTION_SPACE - size; + opts->num_sack_blocks = + min_t(unsigned, tp->rx_opt.eff_sacks, + (remaining - TCPOLEN_SACK_BASE_ALIGNED) / + TCPOLEN_SACK_PERBLOCK); + size += TCPOLEN_SACK_BASE_ALIGNED + + opts->num_sack_blocks * TCPOLEN_SACK_PERBLOCK; + } + + return size; } /* This routine actually transmits TCP packets queued in by @@ -454,19 +592,18 @@ static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack, * We are working here with either a clone of the original * SKB, or a fresh unique copy made by the retransmit engine. */ -static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask) +static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, + gfp_t gfp_mask) { const struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet; struct tcp_sock *tp; struct tcp_skb_cb *tcb; - int tcp_header_size; -#ifdef CONFIG_TCP_MD5SIG + struct tcp_out_options opts; + unsigned tcp_options_size, tcp_header_size; struct tcp_md5sig_key *md5; __u8 *md5_hash_location; -#endif struct tcphdr *th; - int sysctl_flags; int err; BUG_ON(!skb || !tcp_skb_pcount(skb)); @@ -489,50 +626,18 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, inet = inet_sk(sk); tp = tcp_sk(sk); tcb = TCP_SKB_CB(skb); - tcp_header_size = tp->tcp_header_len; + memset(&opts, 0, sizeof(opts)); -#define SYSCTL_FLAG_TSTAMPS 0x1 -#define SYSCTL_FLAG_WSCALE 0x2 -#define SYSCTL_FLAG_SACK 0x4 - - sysctl_flags = 0; - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { - tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS; - if (sysctl_tcp_timestamps) { - tcp_header_size += TCPOLEN_TSTAMP_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_TSTAMPS; - } - if (sysctl_tcp_window_scaling) { - tcp_header_size += TCPOLEN_WSCALE_ALIGNED; - sysctl_flags |= SYSCTL_FLAG_WSCALE; - } - if (sysctl_tcp_sack) { - sysctl_flags |= SYSCTL_FLAG_SACK; - if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS)) - tcp_header_size += TCPOLEN_SACKPERM_ALIGNED; - } - } else if (unlikely(tp->rx_opt.eff_sacks)) { - /* A SACK is 2 pad bytes, a 2 byte header, plus - * 2 32-bit sequence numbers for each SACK block. - */ - tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * - TCPOLEN_SACK_PERBLOCK)); - } + if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) + tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5); + else + tcp_options_size = tcp_established_options(sk, skb, &opts, + &md5); + tcp_header_size = tcp_options_size + sizeof(struct tcphdr); if (tcp_packets_in_flight(tp) == 0) tcp_ca_event(sk, CA_EVENT_TX_START); -#ifdef CONFIG_TCP_MD5SIG - /* - * Are we doing MD5 on this segment? If so - make - * room for it. - */ - md5 = tp->af_specific->md5_lookup(sk, sk); - if (md5) - tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; -#endif - skb_push(skb, tcp_header_size); skb_reset_transport_header(skb); skb_set_owner_w(skb, sk); @@ -557,45 +662,23 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, th->check = 0; th->urg_ptr = 0; - if (unlikely(tp->urg_mode && - between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) { - th->urg_ptr = htons(tp->snd_up-tcb->seq); + /* The urg_mode check is necessary during a below snd_una win probe */ + if (unlikely(tcp_urg_mode(tp) && + between(tp->snd_up, tcb->seq + 1, tcb->seq + 0xFFFF))) { + th->urg_ptr = htons(tp->snd_up - tcb->seq); th->urg = 1; } - if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) { - tcp_syn_build_options((__be32 *)(th + 1), - tcp_advertise_mss(sk), - (sysctl_flags & SYSCTL_FLAG_TSTAMPS), - (sysctl_flags & SYSCTL_FLAG_SACK), - (sysctl_flags & SYSCTL_FLAG_WSCALE), - tp->rx_opt.rcv_wscale, - tcb->when, - tp->rx_opt.ts_recent, - -#ifdef CONFIG_TCP_MD5SIG - md5 ? &md5_hash_location : -#endif - NULL); - } else { - tcp_build_and_update_options((__be32 *)(th + 1), - tp, tcb->when, -#ifdef CONFIG_TCP_MD5SIG - md5 ? &md5_hash_location : -#endif - NULL); + tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); + if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0)) TCP_ECN_send(sk, skb, tcp_header_size); - } #ifdef CONFIG_TCP_MD5SIG /* Calculate the MD5 hash, as we have all we need now */ if (md5) { + sk->sk_route_caps &= ~NETIF_F_GSO_MASK; tp->af_specific->calc_md5_hash(md5_hash_location, - md5, - sk, NULL, NULL, - tcp_hdr(skb), - sk->sk_protocol, - skb->len); + md5, sk, NULL, skb); } #endif @@ -608,7 +691,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_event_data_sent(tp, skb, sk); if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq) - TCP_INC_STATS(TCP_MIB_OUTSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (likely(err <= 0)) @@ -617,13 +700,8 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, tcp_enter_cwr(sk, 1); return net_xmit_eval(err); - -#undef SYSCTL_FLAG_TSTAMPS -#undef SYSCTL_FLAG_WSCALE -#undef SYSCTL_FLAG_SACK } - /* This routine just queue's the buffer * * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, @@ -637,10 +715,12 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb) tp->write_seq = TCP_SKB_CB(skb)->end_seq; skb_header_release(skb); tcp_add_write_queue_tail(sk, skb); - sk_charge_skb(sk, skb); + sk->sk_wmem_queued += skb->truesize; + sk_mem_charge(sk, skb->truesize); } -static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) +static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, + unsigned int mss_now) { if (skb->len <= mss_now || !sk_can_gso(sk)) { /* Avoid the costly divide in the normal @@ -676,7 +756,8 @@ static void tcp_adjust_fackets_out(struct sock *sk, struct sk_buff *skb, * packet to the list. This won't be called frequently, I hope. * Remember, these are still headerless SKBs at this point. */ -int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now) +int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, + unsigned int mss_now) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *buff; @@ -701,7 +782,8 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss if (buff == NULL) return -ENOMEM; /* We'll just try again later. */ - sk_charge_skb(sk, buff); + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); nlen = skb->len - len - nsize; buff->truesize += nlen; skb->truesize -= nlen; @@ -713,14 +795,14 @@ int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); TCP_SKB_CB(buff)->flags = flags; TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked; - TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL; if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) { /* Copy and checksum data tail into the new buffer. */ - buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize), + buff->csum = csum_partial_copy_nocheck(skb->data + len, + skb_put(buff, nsize), nsize, 0); skb_trim(skb, len); @@ -787,7 +869,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) eat = len; k = 0; - for (i=0; inr_frags; i++) { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { if (skb_shinfo(skb)->frags[i].size <= eat) { put_page(skb_shinfo(skb)->frags[i].page); eat -= skb_shinfo(skb)->frags[i].size; @@ -810,8 +892,7 @@ static void __pskb_trim_head(struct sk_buff *skb, int len) int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) { - if (skb_cloned(skb) && - pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) return -ENOMEM; /* If len == headlen, we avoid __skb_pull to preserve alignment. */ @@ -825,7 +906,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) skb->truesize -= len; sk->sk_wmem_queued -= len; - sk->sk_forward_alloc += len; + sk_mem_uncharge(sk, len); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); /* Any change of skb->len requires recalculation of tso @@ -893,6 +974,15 @@ void tcp_mtup_init(struct sock *sk) icsk->icsk_mtup.probe_size = 0; } +/* Bound MSS / TSO packet size with the half of the window */ +static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize) +{ + if (tp->max_window && pktsize > (tp->max_window >> 1)) + return max(tp->max_window >> 1, 68U - tp->tcp_header_len); + else + return pktsize; +} + /* This function synchronize snd mss to current pmtu/exthdr set. tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts @@ -915,7 +1005,6 @@ void tcp_mtup_init(struct sock *sk) NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache are READ ONLY outside this function. --ANK (980731) */ - unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); @@ -926,10 +1015,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) icsk->icsk_mtup.search_high = pmtu; mss_now = tcp_mtu_to_mss(sk, pmtu); - - /* Bound mss with half of window */ - if (tp->max_window && mss_now > (tp->max_window>>1)) - mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); + mss_now = tcp_bound_to_half_wnd(tp, mss_now); /* And store cached results */ icsk->icsk_pmtu_cookie = pmtu; @@ -942,10 +1028,6 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) /* Compute the current effective MSS, taking SACKs and IP options, * and even PMTU discovery events into account. - * - * LARGESEND note: !urg_mode is overkill, only frames up to snd_up - * cannot be large. However, taking into account rare use of URG, this - * is not a big flaw. */ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) { @@ -954,10 +1036,13 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) u32 mss_now; u16 xmit_size_goal; int doing_tso = 0; + unsigned header_len; + struct tcp_out_options opts; + struct tcp_md5sig_key *md5; mss_now = tp->mss_cache; - if (large_allowed && sk_can_gso(sk) && !tp->urg_mode) + if (large_allowed && sk_can_gso(sk)) doing_tso = 1; if (dst) { @@ -966,28 +1051,26 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) mss_now = tcp_sync_mss(sk, mtu); } - if (tp->rx_opt.eff_sacks) - mss_now -= (TCPOLEN_SACK_BASE_ALIGNED + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK)); - -#ifdef CONFIG_TCP_MD5SIG - if (tp->af_specific->md5_lookup(sk, sk)) - mss_now -= TCPOLEN_MD5SIG_ALIGNED; -#endif + header_len = tcp_established_options(sk, NULL, &opts, &md5) + + sizeof(struct tcphdr); + /* The mss_cache is sized based on tp->tcp_header_len, which assumes + * some common options. If this is an odd packet (because we have SACK + * blocks etc) then our calculated header_len will be different, and + * we have to adjust mss_now correspondingly */ + if (header_len != tp->tcp_header_len) { + int delta = (int) header_len - tp->tcp_header_len; + mss_now -= delta; + } xmit_size_goal = mss_now; if (doing_tso) { - xmit_size_goal = (65535 - + xmit_size_goal = ((sk->sk_gso_max_size - 1) - inet_csk(sk)->icsk_af_ops->net_header_len - inet_csk(sk)->icsk_ext_hdr_len - tp->tcp_header_len); - if (tp->max_window && - (xmit_size_goal > (tp->max_window >> 1))) - xmit_size_goal = max((tp->max_window >> 1), - 68U - tp->tcp_header_len); - + xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal); xmit_size_goal -= (xmit_size_goal % mss_now); } tp->xmit_size_goal = xmit_size_goal; @@ -996,13 +1079,11 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) } /* Congestion window validation. (RFC2861) */ - static void tcp_cwnd_validate(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - __u32 packets_out = tp->packets_out; - if (packets_out >= tp->snd_cwnd) { + if (tp->packets_out >= tp->snd_cwnd) { /* Network is feed fully. */ tp->snd_cwnd_used = 0; tp->snd_cwnd_stamp = tcp_time_stamp; @@ -1017,19 +1098,43 @@ static void tcp_cwnd_validate(struct sock *sk) } } -static unsigned int tcp_window_allows(struct tcp_sock *tp, struct sk_buff *skb, unsigned int mss_now, unsigned int cwnd) +/* Returns the portion of skb which can be sent right away without + * introducing MSS oddities to segment boundaries. In rare cases where + * mss_now != mss_cache, we will request caller to create a small skb + * per input skb which could be mostly avoided here (if desired). + * + * We explicitly want to create a request for splitting write queue tail + * to a small skb for Nagle purposes while avoiding unnecessary modulos, + * thus all the complexity (cwnd_len is always MSS multiple which we + * return whenever allowed by the other factors). Basically we need the + * modulo only when the receiver window alone is the limiting factor or + * when we would be allowed to send the split-due-to-Nagle skb fully. + */ +static unsigned int tcp_mss_split_point(struct sock *sk, struct sk_buff *skb, + unsigned int mss_now, unsigned int cwnd) { - u32 window, cwnd_len; + struct tcp_sock *tp = tcp_sk(sk); + u32 needed, window, cwnd_len; - window = (tp->snd_una + tp->snd_wnd - TCP_SKB_CB(skb)->seq); + window = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; cwnd_len = mss_now * cwnd; - return min(window, cwnd_len); + + if (likely(cwnd_len <= window && skb != tcp_write_queue_tail(sk))) + return cwnd_len; + + needed = min(skb->len, window); + + if (cwnd_len <= needed) + return cwnd_len; + + return needed - needed % mss_now; } /* Can at least one segment of SKB be sent right now, according to the * congestion window rules? If so, return how many segments are allowed. */ -static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *skb) +static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, + struct sk_buff *skb) { u32 in_flight, cwnd; @@ -1049,13 +1154,12 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk /* This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) +static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, + unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); - if (!tso_segs || - (tso_segs > 1 && - tcp_skb_mss(skb) != mss_now)) { + if (!tso_segs || (tso_segs > 1 && tcp_skb_mss(skb) != mss_now)) { tcp_set_skb_tso_segs(sk, skb, mss_now); tso_segs = tcp_skb_pcount(skb); } @@ -1064,7 +1168,7 @@ static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int static inline int tcp_minshall_check(const struct tcp_sock *tp) { - return after(tp->snd_sml,tp->snd_una) && + return after(tp->snd_sml, tp->snd_una) && !after(tp->snd_sml, tp->snd_nxt); } @@ -1075,16 +1179,13 @@ static inline int tcp_minshall_check(const struct tcp_sock *tp) * 4. Or TCP_CORK is not set, and all sent packets are ACKed. * With Minshall's modification: all sent small packets are ACKed. */ - static inline int tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned mss_now, int nonagle) { return (skb->len < mss_now && - ((nonagle&TCP_NAGLE_CORK) || - (!nonagle && - tp->packets_out && - tcp_minshall_check(tp)))); + ((nonagle & TCP_NAGLE_CORK) || + (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); } /* Return non-zero if the Nagle test allows this packet to be @@ -1105,7 +1206,7 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, /* Don't use the nagle rule for urgent data (or for the final FIN). * Nagle can be ignored during F-RTO too (see RFC4138). */ - if (tp->urg_mode || (tp->frto_counter == 2) || + if (tcp_urg_mode(tp) || (tp->frto_counter == 2) || (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)) return 1; @@ -1116,14 +1217,15 @@ static inline int tcp_nagle_test(struct tcp_sock *tp, struct sk_buff *skb, } /* Does at least the first segment of SKB fit into the send window? */ -static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, unsigned int cur_mss) +static inline int tcp_snd_wnd_test(struct tcp_sock *tp, struct sk_buff *skb, + unsigned int cur_mss) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (skb->len > cur_mss) end_seq = TCP_SKB_CB(skb)->seq + cur_mss; - return !after(end_seq, tp->snd_una + tp->snd_wnd); + return !after(end_seq, tcp_wnd_end(tp)); } /* This checks if the data bearing packet SKB (usually tcp_send_head(sk)) @@ -1142,8 +1244,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb, return 0; cwnd_quota = tcp_cwnd_test(tp, skb); - if (cwnd_quota && - !tcp_snd_wnd_test(tp, skb, cur_mss)) + if (cwnd_quota && !tcp_snd_wnd_test(tp, skb, cur_mss)) cwnd_quota = 0; return cwnd_quota; @@ -1167,7 +1268,8 @@ int tcp_may_send_now(struct sock *sk) * know that all the data is in scatter-gather pages, and that the * packet has never been sent out before (and thus is not cloned). */ -static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, unsigned int mss_now) +static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, + unsigned int mss_now) { struct sk_buff *buff; int nlen = skb->len - len; @@ -1181,7 +1283,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, if (unlikely(buff == NULL)) return -ENOMEM; - sk_charge_skb(sk, buff); + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); buff->truesize += nlen; skb->truesize -= nlen; @@ -1192,7 +1295,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len, /* PSH and FIN should only be set in the second packet. */ flags = TCP_SKB_CB(skb)->flags; - TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); + TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN | TCPCB_FLAG_PSH); TCP_SKB_CB(buff)->flags = flags; /* This packet was never sent out yet, so no SACK bits. */ @@ -1231,15 +1334,14 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) /* Defer for less than two clock ticks. */ if (tp->tso_deferred && - ((jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1) + (((u32)jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1) goto send_now; in_flight = tcp_packets_in_flight(tp); - BUG_ON(tcp_skb_pcount(skb) <= 1 || - (tp->snd_cwnd <= in_flight)); + BUG_ON(tcp_skb_pcount(skb) <= 1 || (tp->snd_cwnd <= in_flight)); - send_win = (tp->snd_una + tp->snd_wnd) - TCP_SKB_CB(skb)->seq; + send_win = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; /* From in_flight test above, we know that cwnd > in_flight. */ cong_win = (tp->snd_cwnd - in_flight) * tp->mss_cache; @@ -1247,7 +1349,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) limit = min(send_win, cong_win); /* If a full-sized TSO skb can be sent, do it. */ - if (limit >= 65536) + if (limit >= sk->sk_gso_max_size) goto send_now; if (sysctl_tcp_tso_win_divisor) { @@ -1270,7 +1372,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct sk_buff *skb) } /* Ok, it looks like it is advisable to defer. */ - tp->tso_deferred = 1 | (jiffies<<1); + tp->tso_deferred = 1 | (jiffies << 1); return 1; @@ -1282,7 +1384,8 @@ send_now: /* Create a new MTU probe if we are ready. * Returns 0 if we should wait to probe (no cwnd available), * 1 if a probe was sent, - * -1 otherwise */ + * -1 otherwise + */ static int tcp_mtu_probe(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); @@ -1307,7 +1410,7 @@ static int tcp_mtu_probe(struct sock *sk) /* Very simple search strategy: just double the MSS. */ mss_now = tcp_current_mss(sk, 0); - probe_size = 2*tp->mss_cache; + probe_size = 2 * tp->mss_cache; size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache; if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) { /* TODO: set timer for probe_converge_event */ @@ -1320,7 +1423,7 @@ static int tcp_mtu_probe(struct sock *sk) if (tp->snd_wnd < size_needed) return -1; - if (after(tp->snd_nxt + size_needed, tp->snd_una + tp->snd_wnd)) + if (after(tp->snd_nxt + size_needed, tcp_wnd_end(tp))) return 0; /* Do we need to wait to drain cwnd? With none in flight, don't stall */ @@ -1334,7 +1437,8 @@ static int tcp_mtu_probe(struct sock *sk) /* We're allowed to probe. Build it now. */ if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL) return -1; - sk_charge_skb(sk, nskb); + sk->sk_wmem_queued += nskb->truesize; + sk_mem_charge(sk, nskb->truesize); skb = tcp_send_head(sk); @@ -1354,21 +1458,23 @@ static int tcp_mtu_probe(struct sock *sk) skb_copy_bits(skb, 0, skb_put(nskb, copy), copy); else nskb->csum = skb_copy_and_csum_bits(skb, 0, - skb_put(nskb, copy), copy, nskb->csum); + skb_put(nskb, copy), + copy, nskb->csum); if (skb->len <= copy) { /* We've eaten all the data from this skb. * Throw it away. */ TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags; tcp_unlink_write_queue(skb, sk); - sk_stream_free_skb(sk, skb); + sk_wmem_free_skb(sk, skb); } else { TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH); if (!skb_shinfo(skb)->nr_frags) { skb_pull(skb, copy); if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->csum = csum_partial(skb->data, skb->len, 0); + skb->csum = csum_partial(skb->data, + skb->len, 0); } else { __pskb_trim_head(skb, copy); tcp_set_skb_tso_segs(sk, skb, mss_now); @@ -1388,9 +1494,9 @@ static int tcp_mtu_probe(struct sock *sk) TCP_SKB_CB(nskb)->when = tcp_time_stamp; if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) { /* Decrement cwnd here because we are sending - * effectively two packets. */ + * effectively two packets. */ tp->snd_cwnd--; - update_send_head(sk, nskb); + tcp_event_new_data_sent(sk, nskb); icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len); tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq; @@ -1402,15 +1508,19 @@ static int tcp_mtu_probe(struct sock *sk) return -1; } - /* This routine writes packets to the network. It advances the * send_head. This happens as incoming acks open up the remote * window for us. * + * LARGESEND note: !tcp_urg_mode is overkill, only frames between + * snd_up-64k-mss .. snd_up cannot be large. However, taking into + * account rare use of URG, this is not a big flaw. + * * Returns 1, if no segments are in flight and we have queued segments, but * cannot send anything now because of SWS or another problem. */ -static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) +static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + int push_one, gfp_t gfp) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -1418,20 +1528,16 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) int cwnd_quota; int result; - /* If we are closed, the bytes will have to remain here. - * In time closedown will finish, we empty the write queue and all - * will be happy. - */ - if (unlikely(sk->sk_state == TCP_CLOSE)) - return 0; - sent_pkts = 0; - /* Do MTU probing. */ - if ((result = tcp_mtu_probe(sk)) == 0) { - return 0; - } else if (result > 0) { - sent_pkts = 1; + if (!push_one) { + /* Do MTU probing. */ + result = tcp_mtu_probe(sk); + if (!result) { + return 0; + } else if (result > 0) { + sent_pkts = 1; + } } while ((skb = tcp_send_head(sk))) { @@ -1453,22 +1559,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) nonagle : TCP_NAGLE_PUSH)))) break; } else { - if (tcp_tso_should_defer(sk, skb)) + if (!push_one && tcp_tso_should_defer(sk, skb)) break; } limit = mss_now; - if (tso_segs > 1) { - limit = tcp_window_allows(tp, skb, - mss_now, cwnd_quota); - - if (skb->len < limit) { - unsigned int trim = skb->len % mss_now; - - if (trim) - limit = skb->len - trim; - } - } + if (tso_segs > 1 && !tcp_urg_mode(tp)) + limit = tcp_mss_split_point(sk, skb, mss_now, + cwnd_quota); if (skb->len > limit && unlikely(tso_fragment(sk, skb, limit, mss_now))) @@ -1476,16 +1574,19 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle) TCP_SKB_CB(skb)->when = tcp_time_stamp; - if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC))) + if (unlikely(tcp_transmit_skb(sk, skb, 1, gfp))) break; /* Advance the send_head. This one is sent out. * This call will increment packets_out. */ - update_send_head(sk, skb); + tcp_event_new_data_sent(sk, skb); tcp_minshall_update(tp, mss_now, skb); sent_pkts++; + + if (push_one) + break; } if (likely(sent_pkts)) { @@ -1504,10 +1605,18 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, { struct sk_buff *skb = tcp_send_head(sk); - if (skb) { - if (tcp_write_xmit(sk, cur_mss, nonagle)) - tcp_check_probe_timer(sk); - } + if (!skb) + return; + + /* If we are closed, the bytes will have to remain here. + * In time closedown will finish, we empty the write queue and + * all will be happy. + */ + if (unlikely(sk->sk_state == TCP_CLOSE)) + return; + + if (tcp_write_xmit(sk, cur_mss, nonagle, 0, GFP_ATOMIC)) + tcp_check_probe_timer(sk); } /* Send _single_ skb sitting at the send head. This function requires @@ -1515,46 +1624,11 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss, */ void tcp_push_one(struct sock *sk, unsigned int mss_now) { - struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = tcp_send_head(sk); - unsigned int tso_segs, cwnd_quota; BUG_ON(!skb || skb->len < mss_now); - tso_segs = tcp_init_tso_segs(sk, skb, mss_now); - cwnd_quota = tcp_snd_test(sk, skb, mss_now, TCP_NAGLE_PUSH); - - if (likely(cwnd_quota)) { - unsigned int limit; - - BUG_ON(!tso_segs); - - limit = mss_now; - if (tso_segs > 1) { - limit = tcp_window_allows(tp, skb, - mss_now, cwnd_quota); - - if (skb->len < limit) { - unsigned int trim = skb->len % mss_now; - - if (trim) - limit = skb->len - trim; - } - } - - if (skb->len > limit && - unlikely(tso_fragment(sk, skb, limit, mss_now))) - return; - - /* Send it out now. */ - TCP_SKB_CB(skb)->when = tcp_time_stamp; - - if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) { - update_send_head(sk, skb); - tcp_cwnd_validate(sk); - return; - } - } + tcp_write_xmit(sk, mss_now, TCP_NAGLE_PUSH, 1, sk->sk_allocation); } /* This function returns the amount that we can raise the @@ -1631,7 +1705,8 @@ u32 __tcp_select_window(struct sock *sk) icsk->icsk_ack.quick = 0; if (tcp_memory_pressure) - tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss); + tp->rcv_ssthresh = min(tp->rcv_ssthresh, + 4U * tp->advmss); if (free_space < mss) return 0; @@ -1664,7 +1739,7 @@ u32 __tcp_select_window(struct sock *sk) * is too small. */ if (window <= free_space - mss || window > free_space) - window = (free_space/mss)*mss; + window = (free_space / mss) * mss; else if (mss == full_space && free_space > window + (full_space >> 1)) window = free_space; @@ -1673,131 +1748,117 @@ u32 __tcp_select_window(struct sock *sk) return window; } -/* Attempt to collapse two adjacent SKB's during retransmission. */ -static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int mss_now) +/* Collapses two adjacent SKB's during retransmission. */ +static void tcp_collapse_retrans(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *next_skb = tcp_write_queue_next(sk, skb); + int skb_size, next_skb_size; + u16 flags; - /* The first test we must make is that neither of these two - * SKB's are still referenced by someone else. - */ - if (!skb_cloned(skb) && !skb_cloned(next_skb)) { - int skb_size = skb->len, next_skb_size = next_skb->len; - u16 flags = TCP_SKB_CB(skb)->flags; - - /* Also punt if next skb has been SACK'd. */ - if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_ACKED) - return; - - /* Next skb is out of window. */ - if (after(TCP_SKB_CB(next_skb)->end_seq, tp->snd_una+tp->snd_wnd)) - return; - - /* Punt if not enough space exists in the first SKB for - * the data in the second, or the total combined payload - * would exceed the MSS. - */ - if ((next_skb_size > skb_tailroom(skb)) || - ((skb_size + next_skb_size) > mss_now)) - return; + skb_size = skb->len; + next_skb_size = next_skb->len; + flags = TCP_SKB_CB(skb)->flags; - BUG_ON(tcp_skb_pcount(skb) != 1 || - tcp_skb_pcount(next_skb) != 1); + BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1); - tcp_highest_sack_combine(sk, next_skb, skb); + tcp_highest_sack_combine(sk, next_skb, skb); - /* Ok. We will be able to collapse the packet. */ - tcp_unlink_write_queue(next_skb, sk); + tcp_unlink_write_queue(next_skb, sk); - skb_copy_from_linear_data(next_skb, - skb_put(skb, next_skb_size), - next_skb_size); + skb_copy_from_linear_data(next_skb, skb_put(skb, next_skb_size), + next_skb_size); - if (next_skb->ip_summed == CHECKSUM_PARTIAL) - skb->ip_summed = CHECKSUM_PARTIAL; + if (next_skb->ip_summed == CHECKSUM_PARTIAL) + skb->ip_summed = CHECKSUM_PARTIAL; - if (skb->ip_summed != CHECKSUM_PARTIAL) - skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); + if (skb->ip_summed != CHECKSUM_PARTIAL) + skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size); - /* Update sequence range on original skb. */ - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; + /* Update sequence range on original skb. */ + TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq; - /* Merge over control information. */ - flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ - TCP_SKB_CB(skb)->flags = flags; + /* Merge over control information. */ + flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */ + TCP_SKB_CB(skb)->flags = flags; - /* All done, get rid of second SKB and account for it so - * packet counting does not break. - */ - TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked&(TCPCB_EVER_RETRANS|TCPCB_AT_TAIL); - if (TCP_SKB_CB(next_skb)->sacked&TCPCB_SACKED_RETRANS) - tp->retrans_out -= tcp_skb_pcount(next_skb); - if (TCP_SKB_CB(next_skb)->sacked&TCPCB_LOST) - tp->lost_out -= tcp_skb_pcount(next_skb); - /* Reno case is special. Sigh... */ - if (tcp_is_reno(tp) && tp->sacked_out) - tcp_dec_pcount_approx(&tp->sacked_out, next_skb); + /* All done, get rid of second SKB and account for it so + * packet counting does not break. + */ + TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS; + if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= tcp_skb_pcount(next_skb); + if (TCP_SKB_CB(next_skb)->sacked & TCPCB_LOST) + tp->lost_out -= tcp_skb_pcount(next_skb); + /* Reno case is special. Sigh... */ + if (tcp_is_reno(tp) && tp->sacked_out) + tcp_dec_pcount_approx(&tp->sacked_out, next_skb); + + tcp_adjust_fackets_out(sk, next_skb, tcp_skb_pcount(next_skb)); + tp->packets_out -= tcp_skb_pcount(next_skb); + + /* changed transmit queue under us so clear hints */ + tcp_clear_retrans_hints_partial(tp); + if (next_skb == tp->retransmit_skb_hint) + tp->retransmit_skb_hint = skb; - tcp_adjust_fackets_out(sk, next_skb, tcp_skb_pcount(next_skb)); - tp->packets_out -= tcp_skb_pcount(next_skb); + sk_wmem_free_skb(sk, next_skb); +} - /* changed transmit queue under us so clear hints */ - tcp_clear_retrans_hints_partial(tp); +static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb) +{ + if (tcp_skb_pcount(skb) > 1) + return 0; + /* TODO: SACK collapsing could be used to remove this condition */ + if (skb_shinfo(skb)->nr_frags != 0) + return 0; + if (skb_cloned(skb)) + return 0; + if (skb == tcp_send_head(sk)) + return 0; + /* Some heurestics for collapsing over SACK'd could be invented */ + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) + return 0; - sk_stream_free_skb(sk, next_skb); - } + return 1; } -/* Do a simple retransmit without using the backoff mechanisms in - * tcp_timer. This is used for path mtu discovery. - * The socket is already locked here. - */ -void tcp_simple_retransmit(struct sock *sk) +static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to, + int space) { - const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - unsigned int mss = tcp_current_mss(sk, 0); - int lost = 0; + struct sk_buff *skb = to, *tmp; + int first = 1; - tcp_for_write_queue(skb, sk) { - if (skb == tcp_send_head(sk)) + if (!sysctl_tcp_retrans_collapse) + return; + if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) + return; + + tcp_for_write_queue_from_safe(skb, tmp, sk) { + if (!tcp_can_collapse(sk, skb)) break; - if (skb->len > mss && - !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { - if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { - TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; - tp->retrans_out -= tcp_skb_pcount(skb); - } - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_LOST)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - lost = 1; - } - } - } - tcp_clear_all_retrans_hints(tp); + space -= skb->len; - if (!lost) - return; + if (first) { + first = 0; + continue; + } - tcp_verify_left_out(tp); + if (space < 0) + break; + /* Punt if not enough space exists in the first SKB for + * the data in the second + */ + if (skb->len > skb_tailroom(to)) + break; - /* Don't muck with the congestion window here. - * Reason is that we do not increase amount of _data_ - * in network, but units changed and effective - * cwnd/ssthresh really reduced now. - */ - if (icsk->icsk_ca_state != TCP_CA_Loss) { - tp->high_seq = tp->snd_nxt; - tp->snd_ssthresh = tcp_current_ssthresh(sk); - tp->prior_ssthresh = 0; - tp->undo_marker = 0; - tcp_set_ca_state(sk, TCP_CA_Loss); + if (after(TCP_SKB_CB(skb)->end_seq, tcp_wnd_end(tp))) + break; + + tcp_collapse_retrans(sk, to); } - tcp_xmit_retransmit_queue(sk); } /* This retransmits one SKB. Policy decisions and retransmit queue @@ -1808,7 +1869,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); - unsigned int cur_mss = tcp_current_mss(sk, 0); + unsigned int cur_mss; int err; /* Inconslusive MTU probe */ @@ -1830,12 +1891,17 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return -ENOMEM; } + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) + return -EHOSTUNREACH; /* Routing failure or similar. */ + + cur_mss = tcp_current_mss(sk, 0); + /* If receiver has shrunk his window, and skb is out of * new window, do not retransmit it. The exception is the * case, when window is shrunk to zero. In this case * our retransmit serves as a zero window probe. */ - if (!before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd) + if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) && TCP_SKB_CB(skb)->seq != tp->snd_una) return -EAGAIN; @@ -1844,18 +1910,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return -ENOMEM; /* We'll try again later. */ } - /* Collapse two adjacent packets if worthwhile and we can. */ - if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN) && - (skb->len < (cur_mss >> 1)) && - (tcp_write_queue_next(sk, skb) != tcp_send_head(sk)) && - (!tcp_skb_is_last(sk, skb)) && - (skb_shinfo(skb)->nr_frags == 0 && skb_shinfo(tcp_write_queue_next(sk, skb))->nr_frags == 0) && - (tcp_skb_pcount(skb) == 1 && tcp_skb_pcount(tcp_write_queue_next(sk, skb)) == 1) && - (sysctl_tcp_retrans_collapse != 0)) - tcp_retrans_try_collapse(sk, skb, cur_mss); - - if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) - return -EHOSTUNREACH; /* Routing failure or similar. */ + tcp_retrans_try_collapse(sk, skb, cur_mss); /* Some Solaris stacks overoptimize and ignore the FIN on a * retransmit when old data is attached. So strip it off @@ -1865,12 +1920,10 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) && tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) { if (!pskb_trim(skb, 0)) { - TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; + /* Reuse, even though it does some unnecessary work */ + tcp_init_nondata_skb(skb, TCP_SKB_CB(skb)->end_seq - 1, + TCP_SKB_CB(skb)->flags); skb->ip_summed = CHECKSUM_NONE; - skb->csum = 0; } } @@ -1883,12 +1936,12 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) if (err == 0) { /* Update global TCP statistics. */ - TCP_INC_STATS(TCP_MIB_RETRANSSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_RETRANSSEGS); tp->total_retrans++; #if FASTRETRANS_DEBUG > 0 - if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) { + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { if (net_ratelimit()) printk(KERN_DEBUG "retrans_out leaked.\n"); } @@ -1912,83 +1965,18 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) return err; } -/* This gets called after a retransmit timeout, and the initially - * retransmitted data is acknowledged. It tries to continue - * resending the rest of the retransmit queue, until either - * we've sent it all or the congestion window limit is reached. - * If doing SACK, the first ACK which comes back for a timeout - * based retransmit packet might feed us FACK information again. - * If so, we use it to avoid unnecessarily retransmissions. - */ -void tcp_xmit_retransmit_queue(struct sock *sk) +static int tcp_can_forward_retransmit(struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - int packet_cnt; - - if (tp->retransmit_skb_hint) { - skb = tp->retransmit_skb_hint; - packet_cnt = tp->retransmit_cnt_hint; - }else{ - skb = tcp_write_queue_head(sk); - packet_cnt = 0; - } - - /* First pass: retransmit lost packets. */ - if (tp->lost_out) { - tcp_for_write_queue_from(skb, sk) { - __u8 sacked = TCP_SKB_CB(skb)->sacked; - - if (skb == tcp_send_head(sk)) - break; - /* we could do better than to assign each time */ - tp->retransmit_skb_hint = skb; - tp->retransmit_cnt_hint = packet_cnt; - - /* Assume this retransmit will generate - * only one packet for congestion window - * calculation purposes. This works because - * tcp_retransmit_skb() will chop up the - * packet to be MSS sized and all the - * packet counting works out. - */ - if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) - return; - - if (sacked & TCPCB_LOST) { - if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) { - if (tcp_retransmit_skb(sk, skb)) { - tp->retransmit_skb_hint = NULL; - return; - } - if (icsk->icsk_ca_state != TCP_CA_Loss) - NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS); - else - NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS); - - if (skb == tcp_write_queue_head(sk)) - inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, - inet_csk(sk)->icsk_rto, - TCP_RTO_MAX); - } - - packet_cnt += tcp_skb_pcount(skb); - if (packet_cnt >= tp->lost_out) - break; - } - } - } - - /* OK, demanded retransmission is finished. */ /* Forward retransmissions are possible only during Recovery. */ if (icsk->icsk_ca_state != TCP_CA_Recovery) - return; + return 0; /* No forward retransmissions in Reno are possible. */ if (tcp_is_reno(tp)) - return; + return 0; /* Yeah, we have to make difficult choice between forward transmission * and retransmission... Both ways have their merits... @@ -1999,47 +1987,106 @@ void tcp_xmit_retransmit_queue(struct sock *sk) */ if (tcp_may_send_now(sk)) - return; + return 0; - /* If nothing is SACKed, highest_sack in the loop won't be valid */ - if (!tp->sacked_out) - return; + return 1; +} - if (tp->forward_skb_hint) - skb = tp->forward_skb_hint; - else +/* This gets called after a retransmit timeout, and the initially + * retransmitted data is acknowledged. It tries to continue + * resending the rest of the retransmit queue, until either + * we've sent it all or the congestion window limit is reached. + * If doing SACK, the first ACK which comes back for a timeout + * based retransmit packet might feed us FACK information again. + * If so, we use it to avoid unnecessarily retransmissions. + */ +void tcp_xmit_retransmit_queue(struct sock *sk) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + struct sk_buff *hole = NULL; + u32 last_lost; + int mib_idx; + int fwd_rexmitting = 0; + + if (!tp->lost_out) + tp->retransmit_high = tp->snd_una; + + if (tp->retransmit_skb_hint) { + skb = tp->retransmit_skb_hint; + last_lost = TCP_SKB_CB(skb)->end_seq; + if (after(last_lost, tp->retransmit_high)) + last_lost = tp->retransmit_high; + } else { skb = tcp_write_queue_head(sk); + last_lost = tp->snd_una; + } tcp_for_write_queue_from(skb, sk) { - if (skb == tcp_send_head(sk)) - break; - tp->forward_skb_hint = skb; + __u8 sacked = TCP_SKB_CB(skb)->sacked; - if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + if (skb == tcp_send_head(sk)) break; + /* we could do better than to assign each time */ + if (hole == NULL) + tp->retransmit_skb_hint = skb; + /* Assume this retransmit will generate + * only one packet for congestion window + * calculation purposes. This works because + * tcp_retransmit_skb() will chop up the + * packet to be MSS sized and all the + * packet counting works out. + */ if (tcp_packets_in_flight(tp) >= tp->snd_cwnd) - break; + return; + + if (fwd_rexmitting) { +begin_fwd: + if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) + break; + mib_idx = LINUX_MIB_TCPFORWARDRETRANS; - if (TCP_SKB_CB(skb)->sacked & TCPCB_TAGBITS) + } else if (!before(TCP_SKB_CB(skb)->seq, tp->retransmit_high)) { + tp->retransmit_high = last_lost; + if (!tcp_can_forward_retransmit(sk)) + break; + /* Backtrack if necessary to non-L'ed skb */ + if (hole != NULL) { + skb = hole; + hole = NULL; + } + fwd_rexmitting = 1; + goto begin_fwd; + + } else if (!(sacked & TCPCB_LOST)) { + if (hole == NULL && !(sacked & TCPCB_SACKED_RETRANS)) + hole = skb; continue; - /* Ok, retransmit it. */ - if (tcp_retransmit_skb(sk, skb)) { - tp->forward_skb_hint = NULL; - break; + } else { + last_lost = TCP_SKB_CB(skb)->end_seq; + if (icsk->icsk_ca_state != TCP_CA_Loss) + mib_idx = LINUX_MIB_TCPFASTRETRANS; + else + mib_idx = LINUX_MIB_TCPSLOWSTARTRETRANS; } + if (sacked & (TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS)) + continue; + + if (tcp_retransmit_skb(sk, skb)) + return; + NET_INC_STATS_BH(sock_net(sk), mib_idx); + if (skb == tcp_write_queue_head(sk)) inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, inet_csk(sk)->icsk_rto, TCP_RTO_MAX); - - NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS); } } - /* Send a fin. The caller locks the socket for us. This cannot be * allowed to fail queueing a FIN frame under any circumstances. */ @@ -2070,16 +2117,9 @@ void tcp_send_fin(struct sock *sk) /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); - TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */ - TCP_SKB_CB(skb)->seq = tp->write_seq; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; + tcp_init_nondata_skb(skb, tp->write_seq, + TCPCB_FLAG_ACK | TCPCB_FLAG_FIN); tcp_queue_skb(sk, skb); } __tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF); @@ -2097,25 +2137,20 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) /* NOTE: No TCP options attached and we never retransmit this. */ skb = alloc_skb(MAX_TCP_HEADER, priority); if (!skb) { - NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); return; } /* Reserve space for headers and prepare control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST); - TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - + tcp_init_nondata_skb(skb, tcp_acceptable_seq(sk), + TCPCB_FLAG_ACK | TCPCB_FLAG_RST); /* Send it off. */ - TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk); - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; if (tcp_transmit_skb(sk, skb, 0, priority)) - NET_INC_STATS(LINUX_MIB_TCPABORTFAILED); + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPABORTFAILED); + + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS); } /* WARNING: This routine must only be called when we have already sent @@ -2125,14 +2160,14 @@ void tcp_send_active_reset(struct sock *sk, gfp_t priority) */ int tcp_send_synack(struct sock *sk) { - struct sk_buff* skb; + struct sk_buff *skb; skb = tcp_write_queue_head(sk); - if (skb == NULL || !(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_SYN)) { + if (skb == NULL || !(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_SYN)) { printk(KERN_DEBUG "tcp_send_synack: wrong queue state\n"); return -EFAULT; } - if (!(TCP_SKB_CB(skb)->flags&TCPCB_FLAG_ACK)) { + if (!(TCP_SKB_CB(skb)->flags & TCPCB_FLAG_ACK)) { if (skb_cloned(skb)) { struct sk_buff *nskb = skb_copy(skb, GFP_ATOMIC); if (nskb == NULL) @@ -2140,8 +2175,9 @@ int tcp_send_synack(struct sock *sk) tcp_unlink_write_queue(skb, sk); skb_header_release(nskb); __tcp_add_write_queue_head(sk, nskb); - sk_stream_free_skb(sk, skb); - sk_charge_skb(sk, nskb); + sk_wmem_free_skb(sk, skb); + sk->sk_wmem_queued += nskb->truesize; + sk_mem_charge(sk, nskb->truesize); skb = nskb; } @@ -2155,18 +2191,18 @@ int tcp_send_synack(struct sock *sk) /* * Prepare a SYN-ACK. */ -struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, - struct request_sock *req) +struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, + struct request_sock *req) { struct inet_request_sock *ireq = inet_rsk(req); struct tcp_sock *tp = tcp_sk(sk); struct tcphdr *th; int tcp_header_size; + struct tcp_out_options opts; struct sk_buff *skb; -#ifdef CONFIG_TCP_MD5SIG struct tcp_md5sig_key *md5; __u8 *md5_hash_location; -#endif + int mss; skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC); if (skb == NULL) @@ -2177,43 +2213,17 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, skb->dst = dst_clone(dst); - tcp_header_size = (sizeof(struct tcphdr) + TCPOLEN_MSS + - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0) + - (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) + - /* SACK_PERM is in the place of NOP NOP of TS */ - ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0)); + mss = dst_metric(dst, RTAX_ADVMSS); + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss) + mss = tp->rx_opt.user_mss; -#ifdef CONFIG_TCP_MD5SIG - /* Are we doing MD5 on this segment? If so - make room for it */ - md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req); - if (md5) - tcp_header_size += TCPOLEN_MD5SIG_ALIGNED; -#endif - skb_push(skb, tcp_header_size); - skb_reset_transport_header(skb); - - th = tcp_hdr(skb); - memset(th, 0, sizeof(struct tcphdr)); - th->syn = 1; - th->ack = 1; - TCP_ECN_make_synack(req, th); - th->source = inet_sk(sk)->sport; - th->dest = ireq->rmt_port; - TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1; - TCP_SKB_CB(skb)->sacked = 0; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - th->seq = htonl(TCP_SKB_CB(skb)->seq); - th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */ __u8 rcv_wscale; /* Set this up on the first call only */ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), - dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), + mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), &req->rcv_wnd, &req->window_clamp, ireq->wscale_ok, @@ -2221,33 +2231,46 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, ireq->rcv_wscale = rcv_wscale; } - /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ - th->window = htons(min(req->rcv_wnd, 65535U)); - - TCP_SKB_CB(skb)->when = tcp_time_stamp; - tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok, - ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale, - TCP_SKB_CB(skb)->when, - req->ts_recent, - ( -#ifdef CONFIG_TCP_MD5SIG - md5 ? &md5_hash_location : + memset(&opts, 0, sizeof(opts)); +#ifdef CONFIG_SYN_COOKIES + if (unlikely(req->cookie_ts)) + TCP_SKB_CB(skb)->when = cookie_init_timestamp(req); + else #endif - NULL) - ); + TCP_SKB_CB(skb)->when = tcp_time_stamp; + tcp_header_size = tcp_synack_options(sk, req, mss, + skb, &opts, &md5) + + sizeof(struct tcphdr); - skb->csum = 0; + skb_push(skb, tcp_header_size); + skb_reset_transport_header(skb); + + th = tcp_hdr(skb); + memset(th, 0, sizeof(struct tcphdr)); + th->syn = 1; + th->ack = 1; + TCP_ECN_make_synack(req, th); + th->source = ireq->loc_port; + th->dest = ireq->rmt_port; + /* Setting of flags are superfluous here for callers (and ECE is + * not even correctly set) + */ + tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn, + TCPCB_FLAG_SYN | TCPCB_FLAG_ACK); + th->seq = htonl(TCP_SKB_CB(skb)->seq); + th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1); + + /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */ + th->window = htons(min(req->rcv_wnd, 65535U)); + tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location); th->doff = (tcp_header_size >> 2); - TCP_INC_STATS(TCP_MIB_OUTSEGS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS); #ifdef CONFIG_TCP_MD5SIG /* Okay, we have all we need - do the md5 hash if needed */ if (md5) { tp->af_specific->calc_md5_hash(md5_hash_location, - md5, - NULL, dst, req, - tcp_hdr(skb), sk->sk_protocol, - skb->len); + md5, NULL, req, skb); } #endif @@ -2284,6 +2307,9 @@ static void tcp_connect_init(struct sock *sk) if (!tp->window_clamp) tp->window_clamp = dst_metric(dst, RTAX_WINDOW); tp->advmss = dst_metric(dst, RTAX_ADVMSS); + if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < tp->advmss) + tp->advmss = tp->rx_opt.user_mss; + tcp_initialize_rcv_mss(sk); tcp_select_initial_window(tcp_full_space(sk), @@ -2302,6 +2328,7 @@ static void tcp_connect_init(struct sock *sk) tcp_init_wl(tp, tp->write_seq, 0); tp->snd_una = tp->write_seq; tp->snd_sml = tp->write_seq; + tp->snd_up = tp->write_seq; tp->rcv_nxt = 0; tp->rcv_wup = 0; tp->copied_seq = 0; @@ -2328,23 +2355,17 @@ int tcp_connect(struct sock *sk) /* Reserve space for headers. */ skb_reserve(buff, MAX_TCP_HEADER); - TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN; - TCP_ECN_send_syn(sk, buff); - TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->gso_segs = 1; - skb_shinfo(buff)->gso_size = 0; - skb_shinfo(buff)->gso_type = 0; - buff->csum = 0; tp->snd_nxt = tp->write_seq; - TCP_SKB_CB(buff)->seq = tp->write_seq++; - TCP_SKB_CB(buff)->end_seq = tp->write_seq; + tcp_init_nondata_skb(buff, tp->write_seq++, TCPCB_FLAG_SYN); + TCP_ECN_send_syn(sk, buff); /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; tp->retrans_stamp = TCP_SKB_CB(buff)->when; skb_header_release(buff); __tcp_add_write_queue_tail(sk, buff); - sk_charge_skb(sk, buff); + sk->sk_wmem_queued += buff->truesize; + sk_mem_charge(sk, buff->truesize); tp->packets_out += tcp_skb_pcount(buff); tcp_transmit_skb(sk, buff, 1, GFP_KERNEL); @@ -2353,7 +2374,7 @@ int tcp_connect(struct sock *sk) */ tp->snd_nxt = tp->write_seq; tp->pushed_seq = tp->write_seq; - TCP_INC_STATS(TCP_MIB_ACTIVEOPENS); + TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS); /* Timer for repeating the SYN until an answer. */ inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, @@ -2373,9 +2394,10 @@ void tcp_send_delayed_ack(struct sock *sk) if (ato > TCP_DELACK_MIN) { const struct tcp_sock *tp = tcp_sk(sk); - int max_ato = HZ/2; + int max_ato = HZ / 2; - if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) + if (icsk->icsk_ack.pingpong || + (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)) max_ato = TCP_DELACK_MAX; /* Slow path, intersegment interval is "high". */ @@ -2385,7 +2407,7 @@ void tcp_send_delayed_ack(struct sock *sk) * directly. */ if (tp->srtt) { - int rtt = max(tp->srtt>>3, TCP_DELACK_MIN); + int rtt = max(tp->srtt >> 3, TCP_DELACK_MIN); if (rtt < max_ato) max_ato = rtt; @@ -2419,37 +2441,32 @@ void tcp_send_delayed_ack(struct sock *sk) /* This routine sends an ack and also updates the window. */ void tcp_send_ack(struct sock *sk) { + struct sk_buff *buff; + /* If we have been reset, we may not send again. */ - if (sk->sk_state != TCP_CLOSE) { - struct sk_buff *buff; + if (sk->sk_state == TCP_CLOSE) + return; - /* We are not putting this on the write queue, so - * tcp_transmit_skb() will set the ownership to this - * sock. - */ - buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); - if (buff == NULL) { - inet_csk_schedule_ack(sk); - inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; - inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, - TCP_DELACK_MAX, TCP_RTO_MAX); - return; - } + /* We are not putting this on the write queue, so + * tcp_transmit_skb() will set the ownership to this + * sock. + */ + buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC); + if (buff == NULL) { + inet_csk_schedule_ack(sk); + inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN; + inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK, + TCP_DELACK_MAX, TCP_RTO_MAX); + return; + } - /* Reserve space for headers and prepare control bits. */ - skb_reserve(buff, MAX_TCP_HEADER); - buff->csum = 0; - TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK; - TCP_SKB_CB(buff)->sacked = 0; - skb_shinfo(buff)->gso_segs = 1; - skb_shinfo(buff)->gso_size = 0; - skb_shinfo(buff)->gso_type = 0; + /* Reserve space for headers and prepare control bits. */ + skb_reserve(buff, MAX_TCP_HEADER); + tcp_init_nondata_skb(buff, tcp_acceptable_seq(sk), TCPCB_FLAG_ACK); - /* Send it off, this clears delayed acks for us. */ - TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk); - TCP_SKB_CB(buff)->when = tcp_time_stamp; - tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); - } + /* Send it off, this clears delayed acks for us. */ + TCP_SKB_CB(buff)->when = tcp_time_stamp; + tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC); } /* This routine sends a packet with an out of date sequence @@ -2475,66 +2492,56 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent) /* Reserve space for headers and set control bits. */ skb_reserve(skb, MAX_TCP_HEADER); - skb->csum = 0; - TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK; - TCP_SKB_CB(skb)->sacked = urgent; - skb_shinfo(skb)->gso_segs = 1; - skb_shinfo(skb)->gso_size = 0; - skb_shinfo(skb)->gso_type = 0; - /* Use a previous sequence. This should cause the other * end to send an ack. Don't queue or clone SKB, just * send it. */ - TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1; - TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; + tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPCB_FLAG_ACK); TCP_SKB_CB(skb)->when = tcp_time_stamp; return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC); } int tcp_write_wakeup(struct sock *sk) { - if (sk->sk_state != TCP_CLOSE) { - struct tcp_sock *tp = tcp_sk(sk); - struct sk_buff *skb; - - if ((skb = tcp_send_head(sk)) != NULL && - before(TCP_SKB_CB(skb)->seq, tp->snd_una+tp->snd_wnd)) { - int err; - unsigned int mss = tcp_current_mss(sk, 0); - unsigned int seg_size = tp->snd_una+tp->snd_wnd-TCP_SKB_CB(skb)->seq; - - if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) - tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; - - /* We are probing the opening of a window - * but the window size is != 0 - * must have been a result SWS avoidance ( sender ) - */ - if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || - skb->len > mss) { - seg_size = min(seg_size, mss); - TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; - if (tcp_fragment(sk, skb, seg_size, mss)) - return -1; - } else if (!tcp_skb_pcount(skb)) - tcp_set_skb_tso_segs(sk, skb, mss); + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *skb; + if (sk->sk_state == TCP_CLOSE) + return -1; + + if ((skb = tcp_send_head(sk)) != NULL && + before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) { + int err; + unsigned int mss = tcp_current_mss(sk, 0); + unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq; + + if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq)) + tp->pushed_seq = TCP_SKB_CB(skb)->end_seq; + + /* We are probing the opening of a window + * but the window size is != 0 + * must have been a result SWS avoidance ( sender ) + */ + if (seg_size < TCP_SKB_CB(skb)->end_seq - TCP_SKB_CB(skb)->seq || + skb->len > mss) { + seg_size = min(seg_size, mss); TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; - TCP_SKB_CB(skb)->when = tcp_time_stamp; - err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); - if (!err) { - update_send_head(sk, skb); - } - return err; - } else { - if (tp->urg_mode && - between(tp->snd_up, tp->snd_una+1, tp->snd_una+0xFFFF)) - tcp_xmit_probe_skb(sk, TCPCB_URG); - return tcp_xmit_probe_skb(sk, 0); - } + if (tcp_fragment(sk, skb, seg_size, mss)) + return -1; + } else if (!tcp_skb_pcount(skb)) + tcp_set_skb_tso_segs(sk, skb, mss); + + TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; + TCP_SKB_CB(skb)->when = tcp_time_stamp; + err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC); + if (!err) + tcp_event_new_data_sent(sk, skb); + return err; + } else { + if (between(tp->snd_up, tp->snd_una + 1, tp->snd_una + 0xFFFF)) + tcp_xmit_probe_skb(sk, 1); + return tcp_xmit_probe_skb(sk, 0); } - return -1; } /* A window probe timeout has occurred. If window is not closed send @@ -2578,9 +2585,9 @@ void tcp_send_probe0(struct sock *sk) } } +EXPORT_SYMBOL(tcp_select_initial_window); EXPORT_SYMBOL(tcp_connect); EXPORT_SYMBOL(tcp_make_synack); EXPORT_SYMBOL(tcp_simple_retransmit); EXPORT_SYMBOL(tcp_sync_mss); -EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor); EXPORT_SYMBOL(tcp_mtup_init);