X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fipv4%2Ftcp_input.c;h=4c10d9cad20f594e2bd6a3a15b0c6d45554ece38;hb=4c70f383e0c0273c4092c4efdb414be0966978b7;hp=cb715eadf8f59bb48b689608c6519d180744dd10;hpb=e01f9d7793be82e6c252efbd52c399d3eb65abe4;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index cb715ea..4c10d9c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -85,7 +85,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 2; int sysctl_tcp_stdurg __read_mostly; int sysctl_tcp_rfc1337 __read_mostly; int sysctl_tcp_max_orphans __read_mostly = NR_FILE; -int sysctl_tcp_frto __read_mostly; +int sysctl_tcp_frto __read_mostly = 2; int sysctl_tcp_frto_response __read_mostly; int sysctl_tcp_nometrics_save __read_mostly; @@ -102,19 +102,20 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ +#define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ +#define FLAG_DSACKING_ACK 0x800 /* SACK blocks contained DSACK info */ +#define FLAG_NONHEAD_RETRANS_ACKED 0x1000 /* Non-head rexmitted data was ACKed */ #define FLAG_ACKED (FLAG_DATA_ACKED|FLAG_SYN_ACKED) #define FLAG_NOT_DUP (FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED) #define FLAG_CA_ALERT (FLAG_DATA_SACKED|FLAG_ECE) #define FLAG_FORWARD_PROGRESS (FLAG_ACKED|FLAG_DATA_SACKED) - -#define IsReno(tp) ((tp)->rx_opt.sack_ok == 0) -#define IsFack(tp) ((tp)->rx_opt.sack_ok & 2) -#define IsDSack(tp) ((tp)->rx_opt.sack_ok & 4) +#define FLAG_ANY_PROGRESS (FLAG_FORWARD_PROGRESS|FLAG_SND_UNA_ADVANCED) #define IsSackFrto() (sysctl_tcp_frto == 0x2) #define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH) +#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH)) /* Adapt the MSS value used to make delayed ack decision to the * real world. @@ -140,7 +141,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, * * "len" is invariant segment length, including TCP header. */ - len += skb->data - skb->h.raw; + len += skb->data - skb_transport_header(skb); if (len >= TCP_MIN_RCVMSS + sizeof(struct tcphdr) || /* If PSH is not set, packet should be * full sized, provided peer TCP is not badly broken. @@ -148,7 +149,7 @@ static void tcp_measure_rcv_mss(struct sock *sk, * to handle super-low mtu links fairly. */ (len >= TCP_MIN_MSS + sizeof(struct tcphdr) && - !(tcp_flag_word(skb->h.th)&TCP_REMNANT))) { + !(tcp_flag_word(tcp_hdr(skb)) & TCP_REMNANT))) { /* Subtract also invariant (if peer is RFC compliant), * tcp header plus fixed timestamp option length. * Resulting "len" is MSS free of SACK jitter. 
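(Illustrative aside, not part of the patch.) Several hunks in this diff replace open-coded tests on tp->rx_opt.sack_ok — the IsReno()/IsFack()/IsDSack() macros removed above — with the tcp_is_reno()/tcp_is_fack()/tcp_dsack_seen()/tcp_disable_fack() helpers introduced further down. A minimal user-space sketch of the bit encoding those tests imply follows; the SACK_OK_* names are invented here for illustration, the bit values come straight from the removed macros and the new helpers.

#include <stdio.h>

/* Bit layout of tp->rx_opt.sack_ok, as implied by the removed IsReno()/
 * IsFack()/IsDSack() macros above and by the tcp_disable_fack()/
 * tcp_dsack_seen() helpers added later in this diff.  The SACK_OK_* names
 * are invented for illustration; the kernel open-codes the constants.
 */
#define SACK_OK_SACK	1	/* bit 0: peer negotiated SACK             */
#define SACK_OK_FACK	2	/* bit 1: FACK heuristics may be used      */
#define SACK_OK_DSACK	4	/* bit 2: a D-SACK has been seen from peer */

int main(void)
{
	unsigned int sack_ok = SACK_OK_SACK | SACK_OK_FACK;

	/* IsReno(tp) was (sack_ok == 0), IsFack(tp) was (sack_ok & 2),
	 * IsDSack(tp) was (sack_ok & 4).
	 */
	printf("reno=%d fack=%d dsack=%d\n",
	       sack_ok == 0,
	       !!(sack_ok & SACK_OK_FACK),
	       !!(sack_ok & SACK_OK_DSACK));

	sack_ok &= ~SACK_OK_FACK;	/* what tcp_disable_fack() does */
	sack_ok |= SACK_OK_DSACK;	/* what tcp_dsack_seen() does   */

	printf("after reordering + D-SACK: fack=%d dsack=%d\n",
	       !!(sack_ok & SACK_OK_FACK),
	       !!(sack_ok & SACK_OK_DSACK));
	return 0;
}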
@@ -195,6 +196,55 @@ static inline int tcp_in_quickack_mode(const struct sock *sk) return icsk->icsk_ack.quick && !icsk->icsk_ack.pingpong; } +static inline void TCP_ECN_queue_cwr(struct tcp_sock *tp) +{ + if (tp->ecn_flags&TCP_ECN_OK) + tp->ecn_flags |= TCP_ECN_QUEUE_CWR; +} + +static inline void TCP_ECN_accept_cwr(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (tcp_hdr(skb)->cwr) + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static inline void TCP_ECN_withdraw_cwr(struct tcp_sock *tp) +{ + tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +} + +static inline void TCP_ECN_check_ce(struct tcp_sock *tp, struct sk_buff *skb) +{ + if (tp->ecn_flags&TCP_ECN_OK) { + if (INET_ECN_is_ce(TCP_SKB_CB(skb)->flags)) + tp->ecn_flags |= TCP_ECN_DEMAND_CWR; + /* Funny extension: if ECT is not set on a segment, + * it is surely retransmit. It is not in ECN RFC, + * but Linux follows this rule. */ + else if (INET_ECN_is_not_ect((TCP_SKB_CB(skb)->flags))) + tcp_enter_quickack_mode((struct sock *)tp); + } +} + +static inline void TCP_ECN_rcv_synack(struct tcp_sock *tp, struct tcphdr *th) +{ + if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || th->cwr)) + tp->ecn_flags &= ~TCP_ECN_OK; +} + +static inline void TCP_ECN_rcv_syn(struct tcp_sock *tp, struct tcphdr *th) +{ + if ((tp->ecn_flags&TCP_ECN_OK) && (!th->ece || !th->cwr)) + tp->ecn_flags &= ~TCP_ECN_OK; +} + +static inline int TCP_ECN_rcv_ecn_echo(struct tcp_sock *tp, struct tcphdr *th) +{ + if (th->ece && !th->syn && (tp->ecn_flags&TCP_ECN_OK)) + return 1; + return 0; +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -235,9 +285,9 @@ static void tcp_fixup_sndbuf(struct sock *sk) */ /* Slow part of check#2. */ -static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, - const struct sk_buff *skb) +static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); /* Optimize this! */ int truesize = tcp_win_from_space(skb->truesize)/2; int window = tcp_win_from_space(sysctl_tcp_rmem[2])/2; @@ -252,9 +302,11 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, return 0; } -static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, +static void tcp_grow_window(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); + /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && (int)tp->rcv_ssthresh < tcp_space(sk) && @@ -267,7 +319,7 @@ static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, if (tcp_win_from_space(skb->truesize) <= skb->len) incr = 2*tp->advmss; else - incr = __tcp_grow_window(sk, tp, skb); + incr = __tcp_grow_window(sk, skb); if (incr) { tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, tp->window_clamp); @@ -330,8 +382,9 @@ static void tcp_init_buffer_space(struct sock *sk) } /* 5. Recalculate window clamp after socket hit its memory bounds. */ -static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) +static void tcp_clamp_window(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ack.quick = 0; @@ -503,8 +556,9 @@ new_measure: * each ACK we send, he increments snd_cwnd and transmits more of his * queue. 
-DaveM */ -static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb) +static void tcp_event_data_recv(struct sock *sk, struct sk_buff *skb) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); u32 now; @@ -545,7 +599,17 @@ static void tcp_event_data_recv(struct sock *sk, struct tcp_sock *tp, struct sk_ TCP_ECN_check_ce(tp, skb); if (skb->len >= 128) - tcp_grow_window(sk, tp, skb); + tcp_grow_window(sk, skb); +} + +static u32 tcp_rto_min(struct sock *sk) +{ + struct dst_entry *dst = __sk_dst_get(sk); + u32 rto_min = TCP_RTO_MIN; + + if (dst && dst_metric_locked(dst, RTAX_RTO_MIN)) + rto_min = dst->metrics[RTAX_RTO_MIN-1]; + return rto_min; } /* Called to compute a smoothed rtt estimate. The data fed to this @@ -578,7 +642,7 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) * does not matter how to _calculate_ it. Seems, it was trap * that VJ failed to avoid. 8) */ - if(m == 0) + if (m == 0) m = 1; if (tp->srtt != 0) { m -= (tp->srtt >> 3); /* m is now error in rtt est */ @@ -609,13 +673,13 @@ static void tcp_rtt_estimator(struct sock *sk, const __u32 mrtt) if (tp->mdev_max < tp->rttvar) tp->rttvar -= (tp->rttvar-tp->mdev_max)>>2; tp->rtt_seq = tp->snd_nxt; - tp->mdev_max = TCP_RTO_MIN; + tp->mdev_max = tcp_rto_min(sk); } } else { /* no previous measure. */ tp->srtt = m<<3; /* take the measured time to be rtt */ tp->mdev = m<<1; /* make sure rto = 3*rtt */ - tp->mdev_max = tp->rttvar = max(tp->mdev, TCP_RTO_MIN); + tp->mdev_max = tp->rttvar = max(tp->mdev, tcp_rto_min(sk)); tp->rtt_seq = tp->snd_nxt; } } @@ -748,7 +812,15 @@ void tcp_update_metrics(struct sock *sk) } } -/* Numbers are taken from RFC2414. */ +/* Numbers are taken from RFC3390. + * + * John Heffner states: + * + * The RFC specifies a window of no more than 4380 bytes + * unless 2*MSS > 4380. Reading the pseudocode in the RFC + * is a bit misleading because they use a clamp at 4380 bytes + * rather than use a multiplier in the relevant range. + */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); @@ -785,6 +857,21 @@ void tcp_enter_cwr(struct sock *sk, const int set_ssthresh) } } +/* + * Packet counting of FACK is based on in-order assumptions, therefore TCP + * disables it when reordering is detected + */ +static void tcp_disable_fack(struct tcp_sock *tp) +{ + tp->rx_opt.sack_ok &= ~2; +} + +/* Take a notice that peer is sending DSACKs */ +static void tcp_dsack_seen(struct tcp_sock *tp) +{ + tp->rx_opt.sack_ok |= 4; +} + /* Initialize metrics on socket. */ static void tcp_init_metrics(struct sock *sk) @@ -806,7 +893,7 @@ static void tcp_init_metrics(struct sock *sk) } if (dst_metric(dst, RTAX_REORDERING) && tp->reordering != dst_metric(dst, RTAX_REORDERING)) { - tp->rx_opt.sack_ok &= ~2; + tcp_disable_fack(tp); tp->reordering = dst_metric(dst, RTAX_REORDERING); } @@ -868,9 +955,9 @@ static void tcp_update_reordering(struct sock *sk, const int metric, /* This exciting event is worth to be remembered. 8) */ if (ts) NET_INC_STATS_BH(LINUX_MIB_TCPTSREORDER); - else if (IsReno(tp)) + else if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENOREORDER); - else if (IsFack(tp)) + else if (tcp_is_fack(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPFACKREORDER); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKREORDER); @@ -882,8 +969,7 @@ static void tcp_update_reordering(struct sock *sk, const int metric, tp->sacked_out, tp->undo_marker ? tp->undo_retrans : 0); #endif - /* Disable FACK yet. 
*/ - tp->rx_opt.sack_ok &= ~2; + tcp_disable_fack(tp); } } @@ -934,13 +1020,126 @@ static void tcp_update_reordering(struct sock *sk, const int metric, * for retransmitted and already SACKed segment -> reordering.. * Both of these heuristics are not used in Loss state, when we cannot * account for retransmits accurately. + * + * SACK block validation. + * ---------------------- + * + * SACK block range validation checks that the received SACK block fits to + * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT. + * Note that SND.UNA is not included to the range though being valid because + * it means that the receiver is rather inconsistent with itself (reports + * SACK reneging when it should advance SND.UNA). + * + * Implements also blockage to start_seq wrap-around. Problem lies in the + * fact that though start_seq (s) is before end_seq (i.e., not reversed), + * there's no guarantee that it will be before snd_nxt (n). The problem + * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt + * wrap (s_w): + * + * <- outs wnd -> <- wrapzone -> + * u e n u_w e_w s n_w + * | | | | | | | + * |<------------+------+----- TCP seqno space --------------+---------->| + * ...-- <2^31 ->| |<--------... + * ...---- >2^31 ------>| |<--------... + * + * Current code wouldn't be vulnerable but it's better still to discard such + * crazy SACK blocks. Doing this check for start_seq alone closes somewhat + * similar case (end_seq after snd_nxt wrap) as earlier reversed check in + * snd_nxt wrap -> snd_una region will then become "well defined", i.e., + * equal to the ideal case (infinite seqno space without wrap caused issues). + * + * With D-SACK the lower bound is extended to cover sequence space below + * SND.UNA down to undo_marker, which is the last point of interest. Yet + * again, DSACK block must not to go across snd_una (for the same reason as + * for the normal SACK blocks, explained above). But there all simplicity + * ends, TCP might receive valid D-SACKs below that. As long as they reside + * fully below undo_marker they do not affect behavior in anyway and can + * therefore be safely ignored. In rare cases (which are more or less + * theoretical ones), the D-SACK will nicely cross that boundary due to skb + * fragmentation and packet reordering past skb's retransmission. To consider + * them correctly, the acceptable range must be extended even more though + * the exact amount is rather hard to quantify. However, tp->max_window can + * be used as an exaggerated estimate. */ +static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, + u32 start_seq, u32 end_seq) +{ + /* Too far in future, or reversed (interpretation is ambiguous) */ + if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq)) + return 0; + + /* Nasty start_seq wrap-around check (see comments above) */ + if (!before(start_seq, tp->snd_nxt)) + return 0; + + /* In outstanding window? ...This is valid exit for DSACKs too. + * start_seq == snd_una is non-sensical (see comments above) + */ + if (after(start_seq, tp->snd_una)) + return 1; + + if (!is_dsack || !tp->undo_marker) + return 0; + + /* ...Then it's D-SACK, and must reside below snd_una completely */ + if (!after(end_seq, tp->snd_una)) + return 0; + + if (!before(start_seq, tp->undo_marker)) + return 1; + + /* Too old */ + if (!after(end_seq, tp->undo_marker)) + return 0; + + /* Undo_marker boundary crossing (overestimates a lot). Known already: + * start_seq < undo_marker and end_seq >= undo_marker. 
+ */ + return !before(start_seq, end_seq - tp->max_window); +} + + +static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb, + struct tcp_sack_block_wire *sp, int num_sacks, + u32 prior_snd_una) +{ + u32 start_seq_0 = ntohl(get_unaligned(&sp[0].start_seq)); + u32 end_seq_0 = ntohl(get_unaligned(&sp[0].end_seq)); + int dup_sack = 0; + + if (before(start_seq_0, TCP_SKB_CB(ack_skb)->ack_seq)) { + dup_sack = 1; + tcp_dsack_seen(tp); + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); + } else if (num_sacks > 1) { + u32 end_seq_1 = ntohl(get_unaligned(&sp[1].end_seq)); + u32 start_seq_1 = ntohl(get_unaligned(&sp[1].start_seq)); + + if (!after(end_seq_0, end_seq_1) && + !before(start_seq_0, start_seq_1)) { + dup_sack = 1; + tcp_dsack_seen(tp); + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); + } + } + + /* D-SACK for already forgotten data... Do dumb counting. */ + if (dup_sack && + !after(end_seq_0, prior_snd_una) && + after(end_seq_0, tp->undo_marker)) + tp->undo_retrans--; + + return dup_sack; +} + static int tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_una) { const struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked; + unsigned char *ptr = (skb_transport_header(ack_skb) + + TCP_SKB_CB(ack_skb)->sacked); struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2); struct sk_buff *cached_skb; int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3; @@ -948,34 +1147,21 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ int prior_fackets; u32 lost_retrans = 0; int flag = 0; - int dup_sack = 0; + int found_dup_sack = 0; int cached_fack_count; int i; int first_sack_index; - if (!tp->sacked_out) + if (!tp->sacked_out) { tp->fackets_out = 0; + tp->highest_sack = tp->snd_una; + } prior_fackets = tp->fackets_out; - /* Check for D-SACK. */ - if (before(ntohl(sp[0].start_seq), TCP_SKB_CB(ack_skb)->ack_seq)) { - dup_sack = 1; - tp->rx_opt.sack_ok |= 4; - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKRECV); - } else if (num_sacks > 1 && - !after(ntohl(sp[0].end_seq), ntohl(sp[1].end_seq)) && - !before(ntohl(sp[0].start_seq), ntohl(sp[1].start_seq))) { - dup_sack = 1; - tp->rx_opt.sack_ok |= 4; - NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOFORECV); - } - - /* D-SACK for already forgotten data... - * Do dumb counting. 
*/ - if (dup_sack && - !after(ntohl(sp[0].end_seq), prior_snd_una) && - after(ntohl(sp[0].end_seq), tp->undo_marker)) - tp->undo_retrans--; + found_dup_sack = tcp_check_dsack(tp, ack_skb, sp, + num_sacks, prior_snd_una); + if (found_dup_sack) + flag |= FLAG_DSACKING_ACK; /* Eliminate too old ACKs, but take into * account more or less fresh ones, they can @@ -1044,7 +1230,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ cached_skb = tp->fastpath_skb_hint; cached_fack_count = tp->fastpath_cnt_hint; if (!cached_skb) { - cached_skb = sk->sk_write_queue.next; + cached_skb = tcp_write_queue_head(sk); cached_fack_count = 0; } @@ -1053,6 +1239,18 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ __u32 start_seq = ntohl(sp->start_seq); __u32 end_seq = ntohl(sp->end_seq); int fack_count; + int dup_sack = (found_dup_sack && (i == first_sack_index)); + + if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) { + if (dup_sack) { + if (!tp->undo_marker) + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO); + else + NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD); + } else + NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD); + continue; + } skb = cached_skb; fack_count = cached_fack_count; @@ -1061,10 +1259,13 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (after(end_seq, tp->high_seq)) flag |= FLAG_DATA_LOST; - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { int in_sack, pcount; u8 sacked; + if (skb == tcp_send_head(sk)) + break; + cached_skb = skb; cached_fack_count = fack_count; if (i == first_sack_index) { @@ -1185,6 +1386,10 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (fack_count > tp->fackets_out) tp->fackets_out = fack_count; + + if (after(TCP_SKB_CB(skb)->seq, + tp->highest_sack)) + tp->highest_sack = TCP_SKB_CB(skb)->seq; } else { if (dup_sack && (sacked&TCPCB_RETRANS)) reord = min(fack_count, reord); @@ -1213,14 +1418,16 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ if (lost_retrans && icsk->icsk_ca_state == TCP_CA_Recovery) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (after(TCP_SKB_CB(skb)->seq, lost_retrans)) break; if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) continue; if ((TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) && after(lost_retrans, TCP_SKB_CB(skb)->ack_seq) && - (IsFack(tp) || + (tcp_is_fack(tp) || !before(lost_retrans, TCP_SKB_CB(skb)->ack_seq + tp->reordering * tp->mss_cache))) { @@ -1240,7 +1447,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ } } - tp->left_out = tp->sacked_out + tp->lost_out; + tcp_verify_left_out(tp); if ((reord < tp->fackets_out) && icsk->icsk_ca_state != TCP_CA_Loss && (!tp->frto_highmark || after(tp->snd_una, tp->frto_highmark))) @@ -1255,20 +1462,61 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_ return flag; } -/* F-RTO can only be used if these conditions are satisfied: - * - there must be some unsent new data - * - the advertised window should allow sending it - * - TCP has never retransmitted anything other than head (SACK enhanced - * variant from Appendix B of RFC4138 is more robust here) +/* F-RTO can only be used if TCP has never retransmitted anything other than + * head (SACK enhanced variant from Appendix B of RFC4138 is more robust here) */ +static 
void tcp_check_reno_reordering(struct sock *sk, const int addend) +{ + struct tcp_sock *tp = tcp_sk(sk); + u32 holes; + + holes = max(tp->lost_out, 1U); + holes = min(holes, tp->packets_out); + + if ((tp->sacked_out + holes) > tp->packets_out) { + tp->sacked_out = tp->packets_out - holes; + tcp_update_reordering(sk, tp->packets_out + addend, 0); + } +} + +/* Emulate SACKs for SACKless connection: account for a new dupack. */ + +static void tcp_add_reno_sack(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + tp->sacked_out++; + tcp_check_reno_reordering(sk, 0); + tcp_verify_left_out(tp); +} + +/* Account for ACK, ACKing some data in Reno Recovery phase. */ + +static void tcp_remove_reno_sacks(struct sock *sk, int acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (acked > 0) { + /* One ACK acked hole. The rest eat duplicate ACKs. */ + if (acked-1 >= tp->sacked_out) + tp->sacked_out = 0; + else + tp->sacked_out -= acked-1; + } + tcp_check_reno_reordering(sk, acked); + tcp_verify_left_out(tp); +} + +static inline void tcp_reset_reno_sack(struct tcp_sock *tp) +{ + tp->sacked_out = 0; +} + int tcp_use_frto(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - if (!sysctl_tcp_frto || !sk->sk_send_head || - after(TCP_SKB_CB(sk->sk_send_head)->end_seq, - tp->snd_una + tp->snd_wnd)) + if (!sysctl_tcp_frto) return 0; if (IsSackFrto()) @@ -1278,8 +1526,11 @@ int tcp_use_frto(struct sock *sk) if (tp->retrans_out > 1) return 0; - skb = skb_peek(&sk->sk_write_queue)->next; /* Skips head */ - sk_stream_for_retrans_queue_from(skb, sk) { + skb = tcp_write_queue_head(sk); + skb = tcp_write_queue_next(sk, skb); /* Skips head */ + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) return 0; /* Short-circuit when first non-SACKed skb has been checked */ @@ -1343,12 +1594,14 @@ void tcp_enter_frto(struct sock *sk) tp->undo_marker = tp->snd_una; tp->undo_retrans = 0; - skb = skb_peek(&sk->sk_write_queue); + skb = tcp_write_queue_head(sk); + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + tp->undo_marker = 0; if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) { TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS; tp->retrans_out -= tcp_skb_pcount(skb); } - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); /* Earlier loss recovery underway (see RFC4138; Appendix B). * The last condition is necessary at least in tp->frto_counter case. @@ -1373,47 +1626,43 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; - int cnt = 0; - tp->sacked_out = 0; tp->lost_out = 0; - tp->fackets_out = 0; tp->retrans_out = 0; + if (tcp_is_reno(tp)) + tcp_reset_reno_sack(tp); - sk_stream_for_retrans_queue(skb, sk) { - cnt += tcp_skb_pcount(skb); + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; /* * Count the retransmission made on RTO correctly (only when * waiting for the first ACK and did not get it)... */ if ((tp->frto_counter == 1) && !(flag&FLAG_DATA_ACKED)) { - tp->retrans_out += tcp_skb_pcount(skb); + /* For some reason this R-bit might get cleared? 
*/ + if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out += tcp_skb_pcount(skb); /* ...enter this if branch just for the first segment */ flag |= FLAG_DATA_ACKED; } else { + if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS) + tp->undo_marker = 0; TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS); } - if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) { - /* Do not mark those segments lost that were - * forward transmitted after RTO - */ - if (!after(TCP_SKB_CB(skb)->end_seq, - tp->frto_highmark)) { - TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; - tp->lost_out += tcp_skb_pcount(skb); - } - } else { - tp->sacked_out += tcp_skb_pcount(skb); - tp->fackets_out = cnt; + /* Don't lost mark skbs that were fwd transmitted after RTO */ + if (!(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED) && + !after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) { + TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; + tp->lost_out += tcp_skb_pcount(skb); } } - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tp->undo_marker = 0; tp->frto_counter = 0; tp->reordering = min_t(unsigned int, tp->reordering, @@ -1422,12 +1671,11 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag) tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); - clear_all_retrans_hints(tp); + tcp_clear_retrans_hints_partial(tp); } void tcp_clear_retrans(struct tcp_sock *tp) { - tp->left_out = 0; tp->retrans_out = 0; tp->fackets_out = 0; @@ -1463,12 +1711,18 @@ void tcp_enter_loss(struct sock *sk, int how) tp->bytes_acked = 0; tcp_clear_retrans(tp); - /* Push undo marker, if it was plain RTO and nothing - * was retransmitted. */ - if (!how) + if (!how) { + /* Push undo marker, if it was plain RTO and nothing + * was retransmitted. */ tp->undo_marker = tp->snd_una; + tcp_clear_retrans_hints_partial(tp); + } else { + tcp_clear_all_retrans_hints(tp); + } - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; cnt += tcp_skb_pcount(skb); if (TCP_SKB_CB(skb)->sacked&TCPCB_RETRANS) tp->undo_marker = 0; @@ -1482,15 +1736,15 @@ void tcp_enter_loss(struct sock *sk, int how) tp->fackets_out = cnt; } } - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); tcp_set_ca_state(sk, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); - - clear_all_retrans_hints(tp); + /* Abort FRTO algorithm if one is in progress */ + tp->frto_counter = 0; } static int tcp_check_sack_reneging(struct sock *sk) @@ -1503,14 +1757,14 @@ static int tcp_check_sack_reneging(struct sock *sk) * receiver _host_ is heavily congested (or buggy). * Do processing similar to RTO timeout. */ - if ((skb = skb_peek(&sk->sk_write_queue)) != NULL && + if ((skb = tcp_write_queue_head(sk)) != NULL && (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)) { struct inet_connection_sock *icsk = inet_csk(sk); NET_INC_STATS_BH(LINUX_MIB_TCPSACKRENEGING); tcp_enter_loss(sk, 1); icsk->icsk_retransmits++; - tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)); + tcp_retransmit_skb(sk, tcp_write_queue_head(sk)); inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); return 1; @@ -1520,7 +1774,7 @@ static int tcp_check_sack_reneging(struct sock *sk) static inline int tcp_fackets_out(struct tcp_sock *tp) { - return IsReno(tp) ? tp->sacked_out+1 : tp->fackets_out; + return tcp_is_reno(tp) ? 
tp->sacked_out+1 : tp->fackets_out; } static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) @@ -1528,10 +1782,12 @@ static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); } -static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) +static inline int tcp_head_timedout(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + return tp->packets_out && - tcp_skb_timedout(sk, skb_peek(&sk->sk_write_queue)); + tcp_skb_timedout(sk, tcp_write_queue_head(sk)); } /* Linux NewReno/SACK/FACK/ECN state machine. @@ -1627,8 +1883,9 @@ static inline int tcp_head_timedout(struct sock *sk, struct tcp_sock *tp) * Main question: may we further continue forward transmission * with the same cwnd? */ -static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) +static int tcp_time_to_recover(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); __u32 packets_out; /* Do not perform any recovery during FRTO algorithm */ @@ -1646,7 +1903,7 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) /* Trick#3 : when we use RFC2988 timer restart, fast * retransmit can be triggered by timeout of queue head. */ - if (tcp_head_timedout(sk, tp)) + if (tcp_head_timedout(sk)) return 1; /* Trick#4: It is still not OK... But will it be useful to delay @@ -1655,7 +1912,7 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) packets_out = tp->packets_out; if (packets_out <= tp->reordering && tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) && - !tcp_may_send_now(sk, tp)) { + !tcp_may_send_now(sk)) { /* We have nothing to send. This connection is limited * either by receiver window or by application. */ @@ -1665,59 +1922,25 @@ static int tcp_time_to_recover(struct sock *sk, struct tcp_sock *tp) return 0; } -/* If we receive more dupacks than we expected counting segments - * in assumption of absent reordering, interpret this as reordering. - * The only another reason could be bug in receiver TCP. +/* RFC: This is from the original, I doubt that this is necessary at all: + * clear xmit_retrans hint if seq of this skb is beyond hint. How could we + * retransmitted past LOST markings in the first place? I'm not fully sure + * about undo and end of connection cases, which can cause R without L? */ -static void tcp_check_reno_reordering(struct sock *sk, const int addend) -{ - struct tcp_sock *tp = tcp_sk(sk); - u32 holes; - - holes = max(tp->lost_out, 1U); - holes = min(holes, tp->packets_out); - - if ((tp->sacked_out + holes) > tp->packets_out) { - tp->sacked_out = tp->packets_out - holes; - tcp_update_reordering(sk, tp->packets_out + addend, 0); - } -} - -/* Emulate SACKs for SACKless connection: account for a new dupack. */ - -static void tcp_add_reno_sack(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - tp->sacked_out++; - tcp_check_reno_reordering(sk, 0); - tcp_sync_left_out(tp); -} - -/* Account for ACK, ACKing some data in Reno Recovery phase. */ - -static void tcp_remove_reno_sacks(struct sock *sk, struct tcp_sock *tp, int acked) -{ - if (acked > 0) { - /* One ACK acked hole. The rest eat duplicate ACKs. 
*/ - if (acked-1 >= tp->sacked_out) - tp->sacked_out = 0; - else - tp->sacked_out -= acked-1; - } - tcp_check_reno_reordering(sk, acked); - tcp_sync_left_out(tp); -} - -static inline void tcp_reset_reno_sack(struct tcp_sock *tp) +static void tcp_verify_retransmit_hint(struct tcp_sock *tp, + struct sk_buff *skb) { - tp->sacked_out = 0; - tp->left_out = tp->lost_out; + if ((tp->retransmit_skb_hint != NULL) && + before(TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) + tp->retransmit_skb_hint = NULL; } /* Mark head of queue up as lost. */ -static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, +static void tcp_mark_head_lost(struct sock *sk, int packets, u32 high_seq) { + struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; int cnt; @@ -1726,11 +1949,13 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, skb = tp->lost_skb_hint; cnt = tp->lost_cnt_hint; } else { - skb = sk->sk_write_queue.next; + skb = tcp_write_queue_head(sk); cnt = 0; } - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; /* TODO: do this better */ /* this is not the most efficient way to do this... */ tp->lost_skb_hint = skb; @@ -1741,31 +1966,25 @@ static void tcp_mark_head_lost(struct sock *sk, struct tcp_sock *tp, if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - - /* clear xmit_retransmit_queue hints - * if this is beyond hint */ - if(tp->retransmit_skb_hint != NULL && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) { - - tp->retransmit_skb_hint = NULL; - } + tcp_verify_retransmit_hint(tp, skb); } } - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); } /* Account newly detected lost packet(s) */ -static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) +static void tcp_update_scoreboard(struct sock *sk) { - if (IsFack(tp)) { + struct tcp_sock *tp = tcp_sk(sk); + + if (tcp_is_fack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; - tcp_mark_head_lost(sk, tp, lost, tp->high_seq); + tcp_mark_head_lost(sk, lost, tp->high_seq); } else { - tcp_mark_head_lost(sk, tp, 1, tp->high_seq); + tcp_mark_head_lost(sk, 1, tp->high_seq); } /* New heuristics: it is possible only after we switched @@ -1773,32 +1992,28 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp) * Hence, we can detect timed out packets during fast * retransmit without falling to slow start. */ - if (!IsReno(tp) && tcp_head_timedout(sk, tp)) { + if (!tcp_is_reno(tp) && tcp_head_timedout(sk)) { struct sk_buff *skb; skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint - : sk->sk_write_queue.next; + : tcp_write_queue_head(sk); - sk_stream_for_retrans_queue_from(skb, sk) { + tcp_for_write_queue_from(skb, sk) { + if (skb == tcp_send_head(sk)) + break; if (!tcp_skb_timedout(sk, skb)) break; if (!(TCP_SKB_CB(skb)->sacked&TCPCB_TAGBITS)) { TCP_SKB_CB(skb)->sacked |= TCPCB_LOST; tp->lost_out += tcp_skb_pcount(skb); - - /* clear xmit_retrans hint */ - if (tp->retransmit_skb_hint && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->retransmit_skb_hint)->seq)) - - tp->retransmit_skb_hint = NULL; + tcp_verify_retransmit_hint(tp, skb); } } tp->scoreboard_skb_hint = skb; - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); } } @@ -1823,19 +2038,22 @@ static inline u32 tcp_cwnd_min(const struct sock *sk) } /* Decrease cwnd each second ack. 
*/ -static void tcp_cwnd_down(struct sock *sk) +static void tcp_cwnd_down(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); int decr = tp->snd_cwnd_cnt + 1; - tp->snd_cwnd_cnt = decr&1; - decr >>= 1; + if ((flag&(FLAG_ANY_PROGRESS|FLAG_DSACKING_ACK)) || + (tcp_is_reno(tp) && !(flag&FLAG_NOT_DUP))) { + tp->snd_cwnd_cnt = decr&1; + decr >>= 1; - if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) - tp->snd_cwnd -= decr; + if (decr && tp->snd_cwnd > tcp_cwnd_min(sk)) + tp->snd_cwnd -= decr; - tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); - tp->snd_cwnd_stamp = tcp_time_stamp; + tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1); + tp->snd_cwnd_stamp = tcp_time_stamp; + } } /* Nothing was retransmitted or returned timestamp is less @@ -1851,13 +2069,15 @@ static inline int tcp_packet_delayed(struct tcp_sock *tp) /* Undo procedures. */ #if FASTRETRANS_DEBUG > 1 -static void DBGUNDO(struct sock *sk, struct tcp_sock *tp, const char *msg) +static void DBGUNDO(struct sock *sk, const char *msg) { + struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); + printk(KERN_DEBUG "Undo %s %u.%u.%u.%u/%u c%u l%u ss%u/%u p%u\n", msg, NIPQUAD(inet->daddr), ntohs(inet->dport), - tp->snd_cwnd, tp->left_out, + tp->snd_cwnd, tcp_left_out(tp), tp->snd_ssthresh, tp->prior_ssthresh, tp->packets_out); } @@ -1889,7 +2109,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo) /* There is something screwy going on with the retrans hints after an undo */ - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } static inline int tcp_may_undo(struct tcp_sock *tp) @@ -1899,13 +2119,15 @@ static inline int tcp_may_undo(struct tcp_sock *tp) } /* People celebrate: "We love our President!" */ -static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) +static int tcp_try_undo_recovery(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_may_undo(tp)) { /* Happy end! We did not retransmit anything * or our original transmission succeeded. */ - DBGUNDO(sk, tp, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); + DBGUNDO(sk, inet_csk(sk)->icsk_ca_state == TCP_CA_Loss ? "loss" : "retrans"); tcp_undo_cwr(sk, 1); if (inet_csk(sk)->icsk_ca_state == TCP_CA_Loss) NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); @@ -1913,7 +2135,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) NET_INC_STATS_BH(LINUX_MIB_TCPFULLUNDO); tp->undo_marker = 0; } - if (tp->snd_una == tp->high_seq && IsReno(tp)) { + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { /* Hold old state until something *above* high_seq * is ACKed. For Reno it is MUST to prevent false * fast retransmits (RFC2582). SACK TCP is safe. */ @@ -1925,10 +2147,12 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_sock *tp) } /* Try to undo cwnd reduction, because D-SACKs acked all retransmitted data */ -static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) +static void tcp_try_undo_dsack(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tp->undo_marker && !tp->undo_retrans) { - DBGUNDO(sk, tp, "D-SACK"); + DBGUNDO(sk, "D-SACK"); tcp_undo_cwr(sk, 1); tp->undo_marker = 0; NET_INC_STATS_BH(LINUX_MIB_TCPDSACKUNDO); @@ -1937,11 +2161,11 @@ static void tcp_try_undo_dsack(struct sock *sk, struct tcp_sock *tp) /* Undo during fast recovery after partial ACK. 
*/ -static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, - int acked) +static int tcp_try_undo_partial(struct sock *sk, int acked) { + struct tcp_sock *tp = tcp_sk(sk); /* Partial ACK arrived. Force Hoe's retransmit. */ - int failed = IsReno(tp) || tp->fackets_out>tp->reordering; + int failed = tcp_is_reno(tp) || tp->fackets_out>tp->reordering; if (tcp_may_undo(tp)) { /* Plain luck! Hole if filled with delayed @@ -1952,7 +2176,7 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, tcp_update_reordering(sk, tcp_fackets_out(tp) + acked, 1); - DBGUNDO(sk, tp, "Hoe"); + DBGUNDO(sk, "Hoe"); tcp_undo_cwr(sk, 0); NET_INC_STATS_BH(LINUX_MIB_TCPPARTIALUNDO); @@ -1966,24 +2190,27 @@ static int tcp_try_undo_partial(struct sock *sk, struct tcp_sock *tp, } /* Undo during loss recovery after partial ACK. */ -static int tcp_try_undo_loss(struct sock *sk, struct tcp_sock *tp) +static int tcp_try_undo_loss(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (tcp_may_undo(tp)) { struct sk_buff *skb; - sk_stream_for_retrans_queue(skb, sk) { + tcp_for_write_queue(skb, sk) { + if (skb == tcp_send_head(sk)) + break; TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST; } - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); - DBGUNDO(sk, tp, "partial loss"); + DBGUNDO(sk, "partial loss"); tp->lost_out = 0; - tp->left_out = tp->sacked_out; tcp_undo_cwr(sk, 1); NET_INC_STATS_BH(LINUX_MIB_TCPLOSSUNDO); inet_csk(sk)->icsk_retransmits = 0; tp->undo_marker = 0; - if (!IsReno(tp)) + if (tcp_is_sack(tp)) tcp_set_ca_state(sk, TCP_CA_Open); return 1; } @@ -1998,9 +2225,11 @@ static inline void tcp_complete_cwr(struct sock *sk) tcp_ca_event(sk, CA_EVENT_COMPLETE_CWR); } -static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) +static void tcp_try_to_open(struct sock *sk, int flag) { - tp->left_out = tp->sacked_out; + struct tcp_sock *tp = tcp_sk(sk); + + tcp_verify_left_out(tp); if (tp->retrans_out == 0) tp->retrans_stamp = 0; @@ -2011,7 +2240,7 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; - if (tp->left_out || tp->retrans_out || tp->undo_marker) + if (tcp_left_out(tp) || tp->retrans_out || tp->undo_marker) state = TCP_CA_Disorder; if (inet_csk(sk)->icsk_ca_state != state) { @@ -2020,7 +2249,7 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_sock *tp, int flag) } tcp_moderate_cwnd(tp); } else { - tcp_cwnd_down(sk); + tcp_cwnd_down(sk, flag); } } @@ -2064,19 +2293,20 @@ static void tcp_mtup_probe_success(struct sock *sk, struct sk_buff *skb) * tcp_xmit_retransmit_queue(). */ static void -tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, - int prior_packets, int flag) +tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); - int is_dupack = (tp->snd_una == prior_snd_una && !(flag&FLAG_NOT_DUP)); + int is_dupack = !(flag&(FLAG_SND_UNA_ADVANCED|FLAG_NOT_DUP)); + int do_lost = is_dupack || ((flag&FLAG_DATA_SACKED) && + (tp->fackets_out > tp->reordering)); /* Some technical things: * 1. Reno does not count dupacks (sacked_out) automatically. */ if (!tp->packets_out) tp->sacked_out = 0; - /* 2. SACK counts snd_fack in packets inaccurately. */ - if (tp->sacked_out == 0) + + if (WARN_ON(!tp->sacked_out && tp->fackets_out)) tp->fackets_out = 0; /* Now state machine starts. 
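(Illustrative aside, not part of the patch.) The hunk above rewrites the duplicate-ACK test in tcp_fastretrans_alert() from a direct comparison with prior_snd_una into a pure flag test; the two forms agree because tcp_ack(), changed further down in this diff, sets FLAG_SND_UNA_ADVANCED exactly when the ACK moved snd_una forward. A user-space sketch of that equivalence follows; the low-order FLAG_* values are not visible in these hunks and are reproduced here from tcp_input.c.

#include <stdio.h>
#include <stdint.h>

#define FLAG_DATA		0x01
#define FLAG_WIN_UPDATE		0x02
#define FLAG_DATA_ACKED		0x04
#define FLAG_SYN_ACKED		0x10
#define FLAG_SND_UNA_ADVANCED	0x400
#define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
#define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)

/* after(seq1, seq2): is seq1 later than seq2 in sequence space? */
static int after(uint32_t seq1, uint32_t seq2)
{
	return (int32_t)(seq1 - seq2) > 0;
}

int main(void)
{
	uint32_t prior_snd_una = 1000;
	uint32_t acks[] = { 1000, 1001, 2000 };		/* dup, then two advancing ACKs */
	unsigned int extra[] = { 0, FLAG_WIN_UPDATE };	/* plain dup vs. window update  */
	int i, j;

	for (i = 0; i < 3; i++) {
		for (j = 0; j < 2; j++) {
			uint32_t snd_una = acks[i];	/* what tcp_ack() stores in tp->snd_una */
			unsigned int flag = extra[j];

			if (after(acks[i], prior_snd_una))
				flag |= FLAG_SND_UNA_ADVANCED;

			/* old test in tcp_fastretrans_alert() */
			int old_dup = (snd_una == prior_snd_una) && !(flag & FLAG_NOT_DUP);
			/* new test, expressed purely in flag bits */
			int new_dup = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));

			printf("ack=%u flag=0x%03x old=%d new=%d\n",
			       acks[i], flag, old_dup, new_dup);
		}
	}
	return 0;
}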
@@ -2093,12 +2323,12 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp, tp->fackets_out-tp->reordering, tp->high_seq); + tcp_mark_head_lost(sk, tp->fackets_out-tp->reordering, tp->high_seq); NET_INC_STATS_BH(LINUX_MIB_TCPLOSS); } - /* D. Synchronize left_out to current state. */ - tcp_sync_left_out(tp); + /* D. Check consistency of the current state. */ + tcp_verify_left_out(tp); /* E. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ @@ -2109,7 +2339,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, switch (icsk->icsk_ca_state) { case TCP_CA_Loss: icsk->icsk_retransmits = 0; - if (tcp_try_undo_recovery(sk, tp)) + if (tcp_try_undo_recovery(sk)) return; break; @@ -2123,20 +2353,20 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, break; case TCP_CA_Disorder: - tcp_try_undo_dsack(sk, tp); + tcp_try_undo_dsack(sk); if (!tp->undo_marker || /* For SACK case do not Open to allow to undo * catching for all duplicate ACKs. */ - IsReno(tp) || tp->snd_una != tp->high_seq) { + tcp_is_reno(tp) || tp->snd_una != tp->high_seq) { tp->undo_marker = 0; tcp_set_ca_state(sk, TCP_CA_Open); } break; case TCP_CA_Recovery: - if (IsReno(tp)) + if (tcp_is_reno(tp)) tcp_reset_reno_sack(tp); - if (tcp_try_undo_recovery(sk, tp)) + if (tcp_try_undo_recovery(sk)) return; tcp_complete_cwr(sk); break; @@ -2146,20 +2376,16 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* F. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: - if (prior_snd_una == tp->snd_una) { - if (IsReno(tp) && is_dupack) + if (!(flag & FLAG_SND_UNA_ADVANCED)) { + if (tcp_is_reno(tp) && is_dupack) tcp_add_reno_sack(sk); - } else { - int acked = prior_packets - tp->packets_out; - if (IsReno(tp)) - tcp_remove_reno_sacks(sk, tp, acked); - is_dupack = tcp_try_undo_partial(sk, tp, acked); - } + } else + do_lost = tcp_try_undo_partial(sk, pkts_acked); break; case TCP_CA_Loss: if (flag&FLAG_DATA_ACKED) icsk->icsk_retransmits = 0; - if (!tcp_try_undo_loss(sk, tp)) { + if (!tcp_try_undo_loss(sk)) { tcp_moderate_cwnd(tp); tcp_xmit_retransmit_queue(sk); return; @@ -2168,18 +2394,18 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, return; /* Loss is undone; fall through to processing in Open state. 
*/ default: - if (IsReno(tp)) { - if (tp->snd_una != prior_snd_una) + if (tcp_is_reno(tp)) { + if (flag & FLAG_SND_UNA_ADVANCED) tcp_reset_reno_sack(tp); if (is_dupack) tcp_add_reno_sack(sk); } if (icsk->icsk_ca_state == TCP_CA_Disorder) - tcp_try_undo_dsack(sk, tp); + tcp_try_undo_dsack(sk); - if (!tcp_time_to_recover(sk, tp)) { - tcp_try_to_open(sk, tp, flag); + if (!tcp_time_to_recover(sk)) { + tcp_try_to_open(sk, flag); return; } @@ -2196,7 +2422,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, /* Otherwise enter Recovery state */ - if (IsReno(tp)) + if (tcp_is_reno(tp)) NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERY); else NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERY); @@ -2218,9 +2444,9 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una, tcp_set_ca_state(sk, TCP_CA_Recovery); } - if (is_dupack || tcp_head_timedout(sk, tp)) - tcp_update_scoreboard(sk, tp); - tcp_cwnd_down(sk); + if (do_lost || tcp_head_timedout(sk)) + tcp_update_scoreboard(sk); + tcp_cwnd_down(sk, flag); tcp_xmit_retransmit_queue(sk); } @@ -2283,20 +2509,21 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, flag); } -static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight, int good) { const struct inet_connection_sock *icsk = inet_csk(sk); - icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); + icsk->icsk_ca_ops->cong_avoid(sk, ack, in_flight, good); tcp_sk(sk)->snd_cwnd_stamp = tcp_time_stamp; } /* Restart timer after forward progress on connection. * RFC2988 recommends to restart timer to now+rto. */ - -static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) +static void tcp_rearm_rto(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); } else { @@ -2304,168 +2531,162 @@ static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) } } -static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, - __u32 now, __s32 *seq_rtt) +/* If we get here, the whole TSO packet has not been acked. */ +static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - __u32 seq = tp->snd_una; - __u32 packets_acked; - int acked = 0; + u32 packets_acked; - /* If we get here, the whole TSO packet has not been - * acked. 
- */ - BUG_ON(!after(scb->end_seq, seq)); + BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)); packets_acked = tcp_skb_pcount(skb); - if (tcp_trim_head(sk, skb, seq - scb->seq)) + if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq)) return 0; packets_acked -= tcp_skb_pcount(skb); if (packets_acked) { - __u8 sacked = scb->sacked; - - acked |= FLAG_DATA_ACKED; - if (sacked) { - if (sacked & TCPCB_RETRANS) { - if (sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= packets_acked; - acked |= FLAG_RETRANS_DATA_ACKED; - *seq_rtt = -1; - } else if (*seq_rtt < 0) - *seq_rtt = now - scb->when; - if (sacked & TCPCB_SACKED_ACKED) - tp->sacked_out -= packets_acked; - if (sacked & TCPCB_LOST) - tp->lost_out -= packets_acked; - if (sacked & TCPCB_URG) { - if (tp->urg_mode && - !before(seq, tp->snd_up)) - tp->urg_mode = 0; - } - } else if (*seq_rtt < 0) - *seq_rtt = now - scb->when; - - if (tp->fackets_out) { - __u32 dval = min(tp->fackets_out, packets_acked); - tp->fackets_out -= dval; - } - tp->packets_out -= packets_acked; - BUG_ON(tcp_skb_pcount(skb) == 0); - BUG_ON(!before(scb->seq, scb->end_seq)); + BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq)); } - return acked; -} - -static u32 tcp_usrtt(struct timeval *tv) -{ - struct timeval now; - - do_gettimeofday(&now); - return (now.tv_sec - tv->tv_sec) * 1000000 + (now.tv_usec - tv->tv_usec); + return packets_acked; } -/* Remove acknowledged frames from the retransmission queue. */ -static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) +/* Remove acknowledged frames from the retransmission queue. If our packet + * is before the ack sequence we can discard it as it's confirmed to have + * arrived at the other end. + */ +static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p) { struct tcp_sock *tp = tcp_sk(sk); const struct inet_connection_sock *icsk = inet_csk(sk); struct sk_buff *skb; - __u32 now = tcp_time_stamp; - int acked = 0; - __s32 seq_rtt = -1; - u32 pkts_acked = 0; - void (*rtt_sample)(struct sock *sk, u32 usrtt) - = icsk->icsk_ca_ops->rtt_sample; - struct timeval tv = { .tv_sec = 0, .tv_usec = 0 }; - - while ((skb = skb_peek(&sk->sk_write_queue)) && - skb != sk->sk_send_head) { + u32 now = tcp_time_stamp; + int fully_acked = 1; + int flag = 0; + int prior_packets = tp->packets_out; + s32 seq_rtt = -1; + ktime_t last_ackt = net_invalid_timestamp(); + + while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) { struct tcp_skb_cb *scb = TCP_SKB_CB(skb); - __u8 sacked = scb->sacked; + u32 end_seq; + u32 packets_acked; + u8 sacked = scb->sacked; - /* If our packet is before the ack sequence we can - * discard it as it's confirmed to have arrived at - * the other end. - */ if (after(scb->end_seq, tp->snd_una)) { - if (tcp_skb_pcount(skb) > 1 && - after(tp->snd_una, scb->seq)) - acked |= tcp_tso_acked(sk, skb, - now, &seq_rtt); - break; - } + if (tcp_skb_pcount(skb) == 1 || + !after(tp->snd_una, scb->seq)) + break; - /* Initial outgoing SYN's get put onto the write_queue - * just like anything else we transmit. It is not - * true data, and if we misinform our callers that - * this ACK acks real data, we will erroneously exit - * connection startup slow start one packet too - * quickly. This is severely frowned upon behavior. 
- */ - if (!(scb->flags & TCPCB_FLAG_SYN)) { - acked |= FLAG_DATA_ACKED; - ++pkts_acked; + packets_acked = tcp_tso_acked(sk, skb); + if (!packets_acked) + break; + + fully_acked = 0; + end_seq = tp->snd_una; } else { - acked |= FLAG_SYN_ACKED; - tp->retrans_stamp = 0; + packets_acked = tcp_skb_pcount(skb); + end_seq = scb->end_seq; } /* MTU probing checks */ - if (icsk->icsk_mtup.probe_size) { - if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) { - tcp_mtup_probe_success(sk, skb); - } + if (fully_acked && icsk->icsk_mtup.probe_size && + !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) { + tcp_mtup_probe_success(sk, skb); } if (sacked) { if (sacked & TCPCB_RETRANS) { - if(sacked & TCPCB_SACKED_RETRANS) - tp->retrans_out -= tcp_skb_pcount(skb); - acked |= FLAG_RETRANS_DATA_ACKED; + if (sacked & TCPCB_SACKED_RETRANS) + tp->retrans_out -= packets_acked; + flag |= FLAG_RETRANS_DATA_ACKED; seq_rtt = -1; + if ((flag & FLAG_DATA_ACKED) || + (packets_acked > 1)) + flag |= FLAG_NONHEAD_RETRANS_ACKED; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - skb_get_timestamp(skb, &tv); + if (fully_acked) + last_ackt = skb->tstamp; } + if (sacked & TCPCB_SACKED_ACKED) - tp->sacked_out -= tcp_skb_pcount(skb); + tp->sacked_out -= packets_acked; if (sacked & TCPCB_LOST) - tp->lost_out -= tcp_skb_pcount(skb); - if (sacked & TCPCB_URG) { - if (tp->urg_mode && - !before(scb->end_seq, tp->snd_up)) - tp->urg_mode = 0; - } + tp->lost_out -= packets_acked; + + if ((sacked & TCPCB_URG) && tp->urg_mode && + !before(end_seq, tp->snd_up)) + tp->urg_mode = 0; } else if (seq_rtt < 0) { seq_rtt = now - scb->when; - skb_get_timestamp(skb, &tv); + if (fully_acked) + last_ackt = skb->tstamp; } - tcp_dec_pcount_approx(&tp->fackets_out, skb); - tcp_packets_out_dec(tp, skb); - __skb_unlink(skb, &sk->sk_write_queue); + tp->packets_out -= packets_acked; + + /* Initial outgoing SYN's get put onto the write_queue + * just like anything else we transmit. It is not + * true data, and if we misinform our callers that + * this ACK acks real data, we will erroneously exit + * connection startup slow start one packet too + * quickly. This is severely frowned upon behavior. + */ + if (!(scb->flags & TCPCB_FLAG_SYN)) { + flag |= FLAG_DATA_ACKED; + } else { + flag |= FLAG_SYN_ACKED; + tp->retrans_stamp = 0; + } + + if (!fully_acked) + break; + + tcp_unlink_write_queue(skb, sk); sk_stream_free_skb(sk, skb); - clear_all_retrans_hints(tp); + tcp_clear_all_retrans_hints(tp); } - if (acked&FLAG_ACKED) { - tcp_ack_update_rtt(sk, acked, seq_rtt); - tcp_ack_packets_out(sk, tp); - if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED)) - (*rtt_sample)(sk, tcp_usrtt(&tv)); + if (flag & FLAG_ACKED) { + u32 pkts_acked = prior_packets - tp->packets_out; + const struct tcp_congestion_ops *ca_ops + = inet_csk(sk)->icsk_ca_ops; + + tcp_ack_update_rtt(sk, flag, seq_rtt); + tcp_rearm_rto(sk); + + tp->fackets_out -= min(pkts_acked, tp->fackets_out); + /* hint's skb might be NULL but we don't need to care */ + tp->fastpath_cnt_hint -= min_t(u32, pkts_acked, + tp->fastpath_cnt_hint); + if (tcp_is_reno(tp)) + tcp_remove_reno_sacks(sk, pkts_acked); + + if (ca_ops->pkts_acked) { + s32 rtt_us = -1; + + /* Is the ACK triggering packet unambiguous? */ + if (!(flag & FLAG_RETRANS_DATA_ACKED)) { + /* High resolution needed and available? 
*/ + if (ca_ops->flags & TCP_CONG_RTT_STAMP && + !ktime_equal(last_ackt, + net_invalid_timestamp())) + rtt_us = ktime_us_delta(ktime_get_real(), + last_ackt); + else if (seq_rtt > 0) + rtt_us = jiffies_to_usecs(seq_rtt); + } - if (icsk->icsk_ca_ops->pkts_acked) - icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked); + ca_ops->pkts_acked(sk, pkts_acked, rtt_us); + } } #if FASTRETRANS_DEBUG > 0 BUG_TRAP((int)tp->sacked_out >= 0); BUG_TRAP((int)tp->lost_out >= 0); BUG_TRAP((int)tp->retrans_out >= 0); - if (!tp->packets_out && tp->rx_opt.sack_ok) { + if (!tp->packets_out && tcp_is_sack(tp)) { const struct inet_connection_sock *icsk = inet_csk(sk); if (tp->lost_out) { printk(KERN_DEBUG "Leak l=%u %d\n", @@ -2485,7 +2706,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p) } #endif *seq_rtt_p = seq_rtt; - return acked; + return flag; } static void tcp_ack_probe(struct sock *sk) @@ -2495,7 +2716,7 @@ static void tcp_ack_probe(struct sock *sk) /* Was it a usable window open? */ - if (!after(TCP_SKB_CB(sk->sk_send_head)->end_seq, + if (!after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, tp->snd_una + tp->snd_wnd)) { icsk->icsk_backoff = 0; inet_csk_clear_xmit_timer(sk, ICSK_TIME_PROBE0); @@ -2538,13 +2759,14 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack * Window update algorithm, described in RFC793/RFC1122 (used in linux-2.2 * and in FreeBSD. NetBSD's one is even worse.) is wrong. */ -static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb, u32 ack, u32 ack_seq) +static int tcp_ack_update_window(struct sock *sk, struct sk_buff *skb, u32 ack, + u32 ack_seq) { + struct tcp_sock *tp = tcp_sk(sk); int flag = 0; - u32 nwin = ntohs(skb->h.th->window); + u32 nwin = ntohs(tcp_hdr(skb)->window); - if (likely(!skb->h.th->syn)) + if (likely(!tcp_hdr(skb)->syn)) nwin <<= tp->rx_opt.snd_wscale; if (tcp_may_update_window(tp, ack, ack_seq, nwin)) { @@ -2558,7 +2780,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, * fast path is recovered for sending TCP. */ tp->pred_flags = 0; - tcp_fast_path_check(sk, tp); + tcp_fast_path_check(sk); if (nwin > tp->max_window) { tp->max_window = nwin; @@ -2579,6 +2801,7 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) { tp->snd_cwnd = min(tp->snd_cwnd, tp->snd_ssthresh); tp->snd_cwnd_cnt = 0; + TCP_ECN_queue_cwr(tp); tcp_moderate_cwnd(tp); } @@ -2587,14 +2810,15 @@ static void tcp_conservative_spur_to_response(struct tcp_sock *tp) */ static void tcp_ratehalving_spur_to_response(struct sock *sk) { - struct tcp_sock *tp = tcp_sk(sk); tcp_enter_cwr(sk, 0); - tp->high_seq = tp->frto_highmark; /* Smoother w/o this? - ij */ } -static void tcp_undo_spur_to_response(struct sock *sk) +static void tcp_undo_spur_to_response(struct sock *sk, int flag) { - tcp_undo_cwr(sk, 1); + if (flag&FLAG_ECE) + tcp_ratehalving_spur_to_response(sk); + else + tcp_undo_cwr(sk, 1); } /* F-RTO spurious RTO detection algorithm (RFC4138) @@ -2607,7 +2831,9 @@ static void tcp_undo_spur_to_response(struct sock *sk) * algorithm is not part of the F-RTO detection algorithm * given in RFC4138 but can be selected separately). * Otherwise (basically on duplicate ACK), RTO was (likely) caused by a loss - * and TCP falls back to conventional RTO recovery. + * and TCP falls back to conventional RTO recovery. F-RTO allows overriding + * of Nagle, this is done using frto_counter states 2 and 3, when a new data + * segment of any size sent during F-RTO, state 2 is upgraded to 3. 
* * Rationale: if the RTO was spurious, new ACKs should arrive from the * original window even after we transmit two new data segments. @@ -2625,28 +2851,31 @@ static void tcp_undo_spur_to_response(struct sock *sk) * to prove that the RTO is indeed spurious. It transfers the control * from F-RTO to the conventional RTO recovery */ -static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) +static int tcp_process_frto(struct sock *sk, int flag) { struct tcp_sock *tp = tcp_sk(sk); - tcp_sync_left_out(tp); + tcp_verify_left_out(tp); /* Duplicate the behavior from Loss state (fastretrans_alert) */ if (flag&FLAG_DATA_ACKED) inet_csk(sk)->icsk_retransmits = 0; + if ((flag & FLAG_NONHEAD_RETRANS_ACKED) || + ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED))) + tp->undo_marker = 0; + if (!before(tp->snd_una, tp->frto_highmark)) { - tcp_enter_frto_loss(sk, tp->frto_counter + 1, flag); + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag); return 1; } - if (!IsSackFrto() || IsReno(tp)) { + if (!IsSackFrto() || tcp_is_reno(tp)) { /* RFC4138 shortcoming in step 2; should also have case c): * ACK isn't duplicate nor advances window, e.g., opposite dir * data, winupdate */ - if ((tp->snd_una == prior_snd_una) && (flag&FLAG_NOT_DUP) && - !(flag&FLAG_FORWARD_PROGRESS)) + if (!(flag&FLAG_ANY_PROGRESS) && (flag&FLAG_NOT_DUP)) return 1; if (!(flag&FLAG_DATA_ACKED)) { @@ -2662,7 +2891,7 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) return 1; } - if ((tp->frto_counter == 2) && + if ((tp->frto_counter >= 2) && (!(flag&FLAG_FORWARD_PROGRESS) || ((flag&FLAG_DATA_SACKED) && !(flag&FLAG_ONLY_ORIG_SACKED)))) { /* RFC4138 shortcoming (see comment above) */ @@ -2675,13 +2904,22 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) } if (tp->frto_counter == 1) { + /* Sending of the next skb must be allowed or no FRTO */ + if (!tcp_send_head(sk) || + after(TCP_SKB_CB(tcp_send_head(sk))->end_seq, + tp->snd_una + tp->snd_wnd)) { + tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 
2 : 3), + flag); + return 1; + } + tp->snd_cwnd = tcp_packets_in_flight(tp) + 2; tp->frto_counter = 2; return 1; - } else /* frto_counter == 2 */ { + } else { switch (sysctl_tcp_frto_response) { case 2: - tcp_undo_spur_to_response(sk); + tcp_undo_spur_to_response(sk, flag); break; case 1: tcp_conservative_spur_to_response(tp); @@ -2689,8 +2927,9 @@ static int tcp_process_frto(struct sock *sk, u32 prior_snd_una, int flag) default: tcp_ratehalving_spur_to_response(sk); break; - }; + } tp->frto_counter = 0; + tp->undo_marker = 0; } return 0; } @@ -2717,6 +2956,9 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) if (before(ack, prior_snd_una)) goto old_ack; + if (after(ack, prior_snd_una)) + flag |= FLAG_SND_UNA_ADVANCED; + if (sysctl_tcp_abc) { if (icsk->icsk_ca_state < TCP_CA_CWR) tp->bytes_acked += ack - prior_snd_una; @@ -2743,12 +2985,12 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) else NET_INC_STATS_BH(LINUX_MIB_TCPPUREACKS); - flag |= tcp_ack_update_window(sk, tp, skb, ack, ack_seq); + flag |= tcp_ack_update_window(sk, skb, ack, ack_seq); if (TCP_SKB_CB(skb)->sacked) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una); - if (TCP_ECN_rcv_ecn_echo(tp, skb->h.th)) + if (TCP_ECN_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; tcp_ca_event(sk, CA_EVENT_SLOW_ACK); @@ -2769,17 +3011,17 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag) flag |= tcp_clean_rtx_queue(sk, &seq_rtt); if (tp->frto_counter) - frto_cwnd = tcp_process_frto(sk, prior_snd_una, flag); + frto_cwnd = tcp_process_frto(sk, flag); if (tcp_ack_is_dubious(sk, flag)) { /* Advance CWND, if state allows this. */ if ((flag & FLAG_DATA_ACKED) && !frto_cwnd && tcp_may_raise_cwnd(sk, flag)) - tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 0); - tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag); + tcp_cong_avoid(sk, ack, prior_in_flight, 0); + tcp_fastretrans_alert(sk, prior_packets - tp->packets_out, flag); } else { if ((flag & FLAG_DATA_ACKED) && !frto_cwnd) - tcp_cong_avoid(sk, ack, seq_rtt, prior_in_flight, 1); + tcp_cong_avoid(sk, ack, prior_in_flight, 1); } if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP)) @@ -2794,7 +3036,7 @@ no_queue: * being used to time the probes, and is probably far higher than * it needs to be for normal retransmission. 
*/ - if (sk->sk_send_head) + if (tcp_send_head(sk)) tcp_ack_probe(sk); return 1; @@ -2815,13 +3057,13 @@ uninteresting_ack: void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, int estab) { unsigned char *ptr; - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); int length=(th->doff*4)-sizeof(struct tcphdr); ptr = (unsigned char *)(th + 1); opt_rx->saw_tstamp = 0; - while(length>0) { + while (length > 0) { int opcode=*ptr++; int opsize; @@ -2837,9 +3079,9 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, return; if (opsize > length) return; /* don't parse partial options */ - switch(opcode) { + switch (opcode) { case TCPOPT_MSS: - if(opsize==TCPOLEN_MSS && th->syn && !estab) { + if (opsize==TCPOLEN_MSS && th->syn && !estab) { u16 in_mss = ntohs(get_unaligned((__be16 *)ptr)); if (in_mss) { if (opt_rx->user_mss && opt_rx->user_mss < in_mss) @@ -2849,12 +3091,12 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_WINDOW: - if(opsize==TCPOLEN_WINDOW && th->syn && !estab) + if (opsize==TCPOLEN_WINDOW && th->syn && !estab) if (sysctl_tcp_window_scaling) { __u8 snd_wscale = *(__u8 *) ptr; opt_rx->wscale_ok = 1; if (snd_wscale > 14) { - if(net_ratelimit()) + if (net_ratelimit()) printk(KERN_INFO "tcp_parse_options: Illegal window " "scaling value %d >14 received.\n", snd_wscale); @@ -2864,7 +3106,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_TIMESTAMP: - if(opsize==TCPOLEN_TIMESTAMP) { + if (opsize==TCPOLEN_TIMESTAMP) { if ((estab && opt_rx->tstamp_ok) || (!estab && sysctl_tcp_timestamps)) { opt_rx->saw_tstamp = 1; @@ -2874,7 +3116,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, } break; case TCPOPT_SACK_PERM: - if(opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { + if (opsize==TCPOLEN_SACK_PERM && th->syn && !estab) { if (sysctl_tcp_sack) { opt_rx->sack_ok = 1; tcp_sack_reset(opt_rx); @@ -2883,11 +3125,12 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, break; case TCPOPT_SACK: - if((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && + if ((opsize >= (TCPOLEN_SACK_BASE + TCPOLEN_SACK_PERBLOCK)) && !((opsize - TCPOLEN_SACK_BASE) % TCPOLEN_SACK_PERBLOCK) && opt_rx->sack_ok) { TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th; } + break; #ifdef CONFIG_TCP_MD5SIG case TCPOPT_MD5SIG: /* @@ -2896,10 +3139,11 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, */ break; #endif - }; + } + ptr+=opsize-2; length-=opsize; - }; + } } } @@ -2932,7 +3176,7 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, static inline void tcp_store_ts_recent(struct tcp_sock *tp) { tp->rx_opt.ts_recent = tp->rx_opt.rcv_tsval; - tp->rx_opt.ts_recent_stamp = xtime.tv_sec; + tp->rx_opt.ts_recent_stamp = get_seconds(); } static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) @@ -2945,8 +3189,8 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) * Not only, also it occurs for expired timestamps. 
*/ - if((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || - xtime.tv_sec >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) + if ((s32)(tp->rx_opt.rcv_tsval - tp->rx_opt.ts_recent) >= 0 || + get_seconds() >= tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS) tcp_store_ts_recent(tp); } } @@ -2977,7 +3221,7 @@ static inline void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) static int tcp_disordered_ack(const struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); u32 seq = TCP_SKB_CB(skb)->seq; u32 ack = TCP_SKB_CB(skb)->ack_seq; @@ -2998,7 +3242,7 @@ static inline int tcp_paws_discard(const struct sock *sk, const struct sk_buff * { const struct tcp_sock *tp = tcp_sk(sk); return ((s32)(tp->rx_opt.ts_recent - tp->rx_opt.rcv_tsval) > TCP_PAWS_WINDOW && - xtime.tv_sec < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && + get_seconds() < tp->rx_opt.ts_recent_stamp + TCP_PAWS_24DAYS && !tcp_disordered_ack(sk, skb)); } @@ -3105,13 +3349,13 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) printk(KERN_ERR "%s: Impossible, sk->sk_state=%d\n", __FUNCTION__, sk->sk_state); break; - }; + } /* It _is_ possible, that we have something out-of-order _after_ FIN. * Probably, we should reset in this case. For now drop them. */ __skb_queue_purge(&tp->out_of_order_queue); - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_stream_mem_reclaim(sk); @@ -3141,7 +3385,7 @@ static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_se static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) { - if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { if (before(seq, tp->rcv_nxt)) NET_INC_STATS_BH(LINUX_MIB_TCPDSACKOLDSENT); else @@ -3171,7 +3415,7 @@ static void tcp_send_dupack(struct sock *sk, struct sk_buff *skb) NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOST); tcp_enter_quickack_mode(sk); - if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { + if (tcp_is_sack(tp) && sysctl_tcp_dsack) { u32 end_seq = TCP_SKB_CB(skb)->end_seq; if (after(TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt)) @@ -3204,7 +3448,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) */ tp->rx_opt.num_sacks--; tp->rx_opt.eff_sacks = min(tp->rx_opt.num_sacks + tp->rx_opt.dsack, 4 - tp->rx_opt.tstamp_ok); - for(i=this_sack; i < tp->rx_opt.num_sacks; i++) + for (i=this_sack; i < tp->rx_opt.num_sacks; i++) sp[i] = sp[i+1]; continue; } @@ -3257,7 +3501,7 @@ static void tcp_sack_new_ofo_skb(struct sock *sk, u32 seq, u32 end_seq) tp->rx_opt.num_sacks--; sp--; } - for(; this_sack > 0; this_sack--, sp--) + for (; this_sack > 0; this_sack--, sp--) *sp = *(sp-1); new_sack: @@ -3283,7 +3527,7 @@ static void tcp_sack_remove(struct tcp_sock *tp) return; } - for(this_sack = 0; this_sack < num_sacks; ) { + for (this_sack = 0; this_sack < num_sacks; ) { /* Check if the start of the sack is covered by RCV.NXT. 
*/ if (!before(tp->rcv_nxt, sp->start_seq)) { int i; @@ -3339,8 +3583,8 @@ static void tcp_ofo_queue(struct sock *sk) __skb_unlink(skb, &tp->out_of_order_queue); __skb_queue_tail(&sk->sk_receive_queue, skb); tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->h.th->fin) - tcp_fin(skb, sk, skb->h.th); + if (tcp_hdr(skb)->fin) + tcp_fin(skb, sk, tcp_hdr(skb)); } } @@ -3348,7 +3592,7 @@ static int tcp_prune_queue(struct sock *sk); static void tcp_data_queue(struct sock *sk, struct sk_buff *skb) { - struct tcphdr *th = skb->h.th; + struct tcphdr *th = tcp_hdr(skb); struct tcp_sock *tp = tcp_sk(sk); int eaten = -1; @@ -3405,9 +3649,9 @@ queue_and_out: __skb_queue_tail(&sk->sk_receive_queue, skb); } tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; - if(skb->len) - tcp_event_data_recv(sk, tp, skb); - if(th->fin) + if (skb->len) + tcp_event_data_recv(sk, skb); + if (th->fin) tcp_fin(skb, sk, th); if (!skb_queue_empty(&tp->out_of_order_queue)) { @@ -3423,7 +3667,7 @@ queue_and_out: if (tp->rx_opt.num_sacks) tcp_sack_remove(tp); - tcp_fast_path_check(sk, tp); + tcp_fast_path_check(sk); if (eaten > 0) __kfree_skb(skb); @@ -3487,7 +3731,7 @@ drop: if (!skb_peek(&tp->out_of_order_queue)) { /* Initial out of order segment, build 1 SACK. */ - if (tp->rx_opt.sack_ok) { + if (tcp_is_sack(tp)) { tp->rx_opt.num_sacks = 1; tp->rx_opt.dsack = 0; tp->rx_opt.eff_sacks = 1; @@ -3552,7 +3796,7 @@ drop: } add_sack: - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) tcp_sack_new_ofo_skb(sk, seq, end_seq); } } @@ -3587,7 +3831,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, * - bloated or contains data before "start" or * overlaps to the next one. */ - if (!skb->h.th->syn && !skb->h.th->fin && + if (!tcp_hdr(skb)->syn && !tcp_hdr(skb)->fin && (tcp_win_from_space(skb->truesize) > skb->len || before(TCP_SKB_CB(skb)->seq, start) || (skb->next != tail && @@ -3598,7 +3842,7 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, start = TCP_SKB_CB(skb)->end_seq; skb = skb->next; } - if (skb == tail || skb->h.th->syn || skb->h.th->fin) + if (skb == tail || tcp_hdr(skb)->syn || tcp_hdr(skb)->fin) return; while (before(start, end)) { @@ -3614,11 +3858,14 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, nskb = alloc_skb(copy+header, GFP_ATOMIC); if (!nskb) return; + + skb_set_mac_header(nskb, skb_mac_header(skb) - skb->head); + skb_set_network_header(nskb, (skb_network_header(skb) - + skb->head)); + skb_set_transport_header(nskb, (skb_transport_header(skb) - + skb->head)); skb_reserve(nskb, header); memcpy(nskb->head, skb->head, header); - nskb->nh.raw = nskb->head + (skb->nh.raw-skb->head); - nskb->h.raw = nskb->head + (skb->h.raw-skb->head); - nskb->mac.raw = nskb->head + (skb->mac.raw-skb->head); memcpy(nskb->cb, skb->cb, sizeof(skb->cb)); TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(nskb)->end_seq = start; __skb_insert(nskb, skb->prev, skb, list); @@ -3644,7 +3891,9 @@ tcp_collapse(struct sock *sk, struct sk_buff_head *list, __kfree_skb(skb); NET_INC_STATS_BH(LINUX_MIB_TCPRCVCOLLAPSED); skb = next; - if (skb == tail || skb->h.th->syn || skb->h.th->fin) + if (skb == tail || + tcp_hdr(skb)->syn || + tcp_hdr(skb)->fin) return; } } @@ -3709,7 +3958,7 @@ static int tcp_prune_queue(struct sock *sk) NET_INC_STATS_BH(LINUX_MIB_PRUNECALLED); if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) - tcp_clamp_window(sk, tp); + tcp_clamp_window(sk); else if (tcp_memory_pressure) tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); @@ -3736,7 +3985,7 @@ static int tcp_prune_queue(struct sock *sk) * is in a sad state like 
this, we care only about integrity * of the connection not performance. */ - if (tp->rx_opt.sack_ok) + if (tcp_is_sack(tp)) tcp_sack_reset(&tp->rx_opt); sk_stream_mem_reclaim(sk); } @@ -3778,8 +4027,10 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +static int tcp_should_expand_sndbuf(struct sock *sk) { + struct tcp_sock *tp = tcp_sk(sk); + /* If the user specified a specific send buffer setting, do * not modify it. */ @@ -3811,7 +4062,7 @@ static void tcp_new_space(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); - if (tcp_should_expand_sndbuf(sk, tp)) { + if (tcp_should_expand_sndbuf(sk)) { int sndmem = max_t(u32, tp->rx_opt.mss_clamp, tp->mss_cache) + MAX_TCP_HEADER + 16 + sizeof(struct sk_buff), demanded = max_t(unsigned int, tp->snd_cwnd, @@ -3835,9 +4086,9 @@ static void tcp_check_space(struct sock *sk) } } -static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_data_snd_check(struct sock *sk) { - tcp_push_pending_frames(sk, tp); + tcp_push_pending_frames(sk); tcp_check_space(sk); } @@ -3985,7 +4236,7 @@ static int tcp_copy_to_iovec(struct sock *sk, struct sk_buff *skb, int hlen) int err; local_bh_enable(); - if (skb->ip_summed==CHECKSUM_UNNECESSARY) + if (skb_csum_unnecessary(skb)) err = skb_copy_datagram_iovec(skb, hlen, tp->ucopy.iov, chunk); else err = skb_copy_and_csum_datagram_iovec(skb, hlen, @@ -4017,7 +4268,7 @@ static __sum16 __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { - return skb->ip_summed != CHECKSUM_UNNECESSARY && + return !skb_csum_unnecessary(skb) && __tcp_checksum_complete_user(sk, skb); } @@ -4035,7 +4286,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) tp->ucopy.dma_chan = get_softnet_dma(); - if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) { + if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) { dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan, skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list); @@ -4051,7 +4302,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen tcp_rcv_space_adjust(sk); if ((tp->ucopy.len == 0) || - (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) || + (tcp_flag_word(tcp_hdr(skb)) & TCP_FLAG_PSH) || (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) { tp->ucopy.wakeup = 1; sk->sk_data_ready(sk, 0); @@ -4171,7 +4422,7 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, */ tcp_ack(sk, skb, 0); __kfree_skb(skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); return 0; } else { /* Header too small */ TCP_INC_STATS_BH(TCP_MIB_INERRS); @@ -4242,12 +4493,12 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb, tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq; } - tcp_event_data_recv(sk, tp, skb); + tcp_event_data_recv(sk, skb); if (TCP_SKB_CB(skb)->ack_seq != tp->snd_una) { /* Well, only one small jumplet in fast path... 
*/ tcp_ack(sk, skb, FLAG_DATA); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); if (!inet_csk_ack_scheduled(sk)) goto no_ack; } @@ -4304,7 +4555,7 @@ slow_path: goto discard; } - if(th->rst) { + if (th->rst) { tcp_reset(sk); goto discard; } @@ -4319,7 +4570,7 @@ slow_path: } step5: - if(th->ack) + if (th->ack) tcp_ack(sk, skb, FLAG_SLOWPATH); tcp_rcv_rtt_measure_ts(sk, skb); @@ -4330,7 +4581,7 @@ step5: /* step 7: process the segment text */ tcp_data_queue(sk, skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); tcp_ack_snd_check(sk); return 0; @@ -4435,8 +4686,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, tp->tcp_header_len = sizeof(struct tcphdr); } - if (tp->rx_opt.sack_ok && sysctl_tcp_fack) - tp->rx_opt.sack_ok |= 2; + if (tcp_is_sack(tp) && sysctl_tcp_fack) + tcp_enable_fack(tp); tcp_mtup_init(sk); tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -4607,13 +4858,13 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; case TCP_LISTEN: - if(th->ack) + if (th->ack) return 1; - if(th->rst) + if (th->rst) goto discard; - if(th->syn) { + if (th->syn) { if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; @@ -4647,7 +4898,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Do step6 onward by hand. */ tcp_urg(sk, skb, th); __kfree_skb(skb); - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); return 0; } @@ -4669,7 +4920,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, } /* step 2: check RST bit */ - if(th->rst) { + if (th->rst) { tcp_reset(sk); goto discard; } @@ -4692,7 +4943,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (th->ack) { int acceptable = tcp_ack(sk, skb, FLAG_SLOWPATH); - switch(sk->sk_state) { + switch (sk->sk_state) { case TCP_SYN_RECV: if (acceptable) { tp->copied_seq = tp->rcv_nxt; @@ -4839,7 +5090,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* tcp_data could move socket to TIME-WAIT */ if (sk->sk_state != TCP_CLOSE) { - tcp_data_snd_check(sk, tp); + tcp_data_snd_check(sk); tcp_ack_snd_check(sk); }
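
The tcp_process_frto() changes above implement the decision step of F-RTO (RFC 4138): after a retransmission timeout the sender transmits previously unsent data, and only if the following ACKs come from the original window without looking like duplicates is the timeout judged spurious. Below is a minimal sketch of that decision over a reduced connection state; the struct, enum and helper names are illustrative, not the kernel's.

#include <stdbool.h>
#include <stdint.h>

enum frto_verdict {
	FRTO_UNDECIDED,		/* keep probing, verdict after the next ACK */
	FRTO_SPURIOUS,		/* RTO judged spurious, undo the cwnd reduction */
	FRTO_FALLBACK,		/* behave as a conventional timeout */
};

/* Reduced per-connection state; only what the decision needs. */
struct frto_state {
	int frto_counter;	/* 1 after the first new segment, 2 after the second */
	uint32_t frto_highmark;	/* snd_nxt at the moment the timer fired */
};

/* Wrap-safe "a comes before b" in 32-bit sequence space. */
static bool seq_before(uint32_t a, uint32_t b)
{
	return (int32_t)(a - b) < 0;
}

/*
 * Called for every ACK while frto_counter != 0.  'snd_una_advanced' means
 * the cumulative ACK moved forward; 'dupack_like' means it carried neither
 * data, a window update nor new SACK information.
 */
static enum frto_verdict frto_process_ack(struct frto_state *st, uint32_t ack,
					  bool snd_una_advanced, bool dupack_like)
{
	/* Everything sent before the timeout is acknowledged: the decisive
	 * ACKs never arrived, so fall back to normal loss recovery. */
	if (!seq_before(ack, st->frto_highmark)) {
		st->frto_counter = 0;
		return FRTO_FALLBACK;
	}

	/* A pure duplicate ACK while probing hints at real loss. */
	if (!snd_una_advanced && dupack_like) {
		st->frto_counter = 0;
		return FRTO_FALLBACK;
	}

	if (st->frto_counter == 1) {
		/* First ACK for the original window: transmit one more
		 * previously unsent segment and wait for the second ACK
		 * (the kernel additionally checks that the send window
		 * actually allows sending it). */
		st->frto_counter = 2;
		return FRTO_UNDECIDED;
	}

	/* Second new ACK from the original window: the timeout was spurious. */
	st->frto_counter = 0;
	return FRTO_SPURIOUS;
}

The real function additionally distinguishes the SACK-enhanced variant (IsSackFrto()), clears undo_marker once retransmitted data is ACKed, and selects one of the three sysctl_tcp_frto_response behaviours after the verdict.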
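
tcp_parse_options() above walks the option bytes between the fixed header and th->doff*4 as (opcode, opsize) pairs, stopping at end-of-list and refusing truncated options. A self-contained sketch of the same walk, extracting only MSS and window scale (option constants per RFC 793/1323; the output struct is illustrative):

#include <stdint.h>
#include <string.h>

#define TCPOPT_EOL	0	/* end of option list */
#define TCPOPT_NOP	1	/* padding */
#define TCPOPT_MSS	2
#define TCPOPT_WINDOW	3

#define TCPOLEN_MSS	4
#define TCPOLEN_WINDOW	3

struct parsed_opts {
	uint16_t mss;		/* 0 if the option was absent */
	uint8_t snd_wscale;	/* valid only when wscale_ok != 0 */
	int wscale_ok;
};

/* 'ptr' points just past the 20-byte fixed header, 'length' is doff*4 - 20. */
static void parse_tcp_options(const uint8_t *ptr, int length, struct parsed_opts *out)
{
	memset(out, 0, sizeof(*out));

	while (length > 0) {
		int opcode = *ptr++;
		int opsize;

		switch (opcode) {
		case TCPOPT_EOL:
			return;
		case TCPOPT_NOP:
			length--;
			continue;
		default:
			if (length < 2)
				return;
			opsize = *ptr++;
			if (opsize < 2)		/* "silly options" */
				return;
			if (opsize > length)
				return;		/* don't parse partial options */

			if (opcode == TCPOPT_MSS && opsize == TCPOLEN_MSS)
				out->mss = ((uint16_t)ptr[0] << 8) | ptr[1];
			else if (opcode == TCPOPT_WINDOW && opsize == TCPOLEN_WINDOW) {
				out->wscale_ok = 1;
				out->snd_wscale = ptr[0] > 14 ? 14 : ptr[0];
			}
			ptr += opsize - 2;
			length -= opsize;
		}
	}
}

Clamping the shift count to 14 mirrors the check in the hunk above that logs "Illegal window scaling value".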
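
The timestamp hunks replace xtime.tv_sec with get_seconds() but keep the PAWS rules of RFC 1323: drop a segment whose timestamp is older than the last accepted one, unless the stored value has not been refreshed for 24 days and can no longer be trusted. A compact restatement with shortened field names:

#include <stdbool.h>
#include <stdint.h>
#include <time.h>

#define TCP_PAWS_24DAYS (60 * 60 * 24 * 24)	/* seconds */
#define TCP_PAWS_WINDOW 1			/* tolerated replay window, in ticks */

struct ts_state {
	uint32_t ts_recent;		/* last timestamp accepted from the peer */
	time_t ts_recent_stamp;		/* wall-clock seconds when it was stored */
};

/* true -> the segment fails PAWS and should be dropped. */
static bool paws_discard(const struct ts_state *st, uint32_t rcv_tsval, time_t now)
{
	/* Signed 32-bit difference gives a wrap-safe "older than" test. */
	bool ts_went_backwards =
		(int32_t)(st->ts_recent - rcv_tsval) > TCP_PAWS_WINDOW;

	/* If ts_recent has not been refreshed for 24 days it cannot be
	 * trusted any more, so PAWS must not reject the segment. */
	bool ts_recent_fresh = now < st->ts_recent_stamp + TCP_PAWS_24DAYS;

	return ts_went_backwards && ts_recent_fresh;
}

/* Counterpart of tcp_store_ts_recent()/tcp_replace_ts_recent(). */
static void store_ts_recent(struct ts_state *st, uint32_t rcv_tsval, time_t now)
{
	if ((int32_t)(rcv_tsval - st->ts_recent) >= 0 ||
	    now >= st->ts_recent_stamp + TCP_PAWS_24DAYS) {
		st->ts_recent = rcv_tsval;
		st->ts_recent_stamp = now;
	}
}

tcp_paws_discard() in the diff additionally exempts "disordered ACKs" via tcp_disordered_ack(); that refinement is omitted here.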
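
tcp_sack_new_ofo_skb(), tcp_sack_maybe_coalesce() and tcp_sack_remove() above maintain the small array of SACK blocks echoed back to the sender: at most four ranges, most recently changed first, merged when they touch and dropped once rcv_nxt has advanced past them. A toy version of that bookkeeping over plain sequence ranges (array size four as in TCP, everything else illustrative):

#include <stdbool.h>
#include <stdint.h>

#define MAX_SACKS 4

struct sack_block { uint32_t start_seq, end_seq; };

struct sack_state {
	struct sack_block sp[MAX_SACKS];
	int num_sacks;
};

static bool sb_before(uint32_t a, uint32_t b) { return (int32_t)(a - b) < 0; }
static bool sb_after(uint32_t a, uint32_t b)  { return sb_before(b, a); }

/* Grow an existing block if [seq, end_seq) touches or overlaps it. */
static bool sack_extend(struct sack_block *sp, uint32_t seq, uint32_t end_seq)
{
	if (!sb_after(seq, sp->end_seq) && !sb_after(sp->start_seq, end_seq)) {
		if (sb_before(seq, sp->start_seq))
			sp->start_seq = seq;
		if (sb_after(end_seq, sp->end_seq))
			sp->end_seq = end_seq;
		return true;
	}
	return false;
}

/* Record a newly received out-of-order range, newest block first. */
static void sack_new_range(struct sack_state *st, uint32_t seq, uint32_t end_seq)
{
	int i;

	for (i = 0; i < st->num_sacks; i++) {
		if (sack_extend(&st->sp[i], seq, end_seq)) {
			/* RFC 2018: the most recently changed block is
			 * reported first, so rotate it to the front. */
			struct sack_block b = st->sp[i];
			for (; i > 0; i--)
				st->sp[i] = st->sp[i - 1];
			st->sp[0] = b;
			return;
		}
	}

	/* New block: push the others down, dropping the oldest if full. */
	if (st->num_sacks < MAX_SACKS)
		st->num_sacks++;
	for (i = st->num_sacks - 1; i > 0; i--)
		st->sp[i] = st->sp[i - 1];
	st->sp[0].start_seq = seq;
	st->sp[0].end_seq = end_seq;
}

/* Drop blocks fully covered by the advancing rcv_nxt (tcp_sack_remove()). */
static void sack_remove_acked(struct sack_state *st, uint32_t rcv_nxt)
{
	int i, j = 0;

	for (i = 0; i < st->num_sacks; i++)
		if (sb_before(rcv_nxt, st->sp[i].end_seq))	/* still useful */
			st->sp[j++] = st->sp[i];
	st->num_sacks = j;
}

The kernel version also limits the usable slots to 4 - tstamp_ok when timestamps consume option space, and accounts for a DSACK block separately; both are left out of the sketch.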
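
The tcp_collapse() hunk stops patching nskb->nh.raw, nskb->h.raw and nskb->mac.raw by hand and instead carries the MAC, network and transport header positions over as offsets from skb->head via skb_set_*_header(). Independent of struct sk_buff details, the point is that offsets survive a copy or reallocation while raw pointers do not; a bare-bones illustration with an invented buffer type:

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Minimal stand-in for a packet buffer: 'head' may move when the
 * buffer is copied or resized, so headers are remembered as offsets. */
struct pktbuf {
	uint8_t *head;
	size_t len;
	size_t mac_off, net_off, transport_off;
};

static uint8_t *pkt_transport_header(const struct pktbuf *p)
{
	return p->head + p->transport_off;
}

/* Clone the buffer; only offsets need to be carried over, never pointers. */
static int pkt_clone(const struct pktbuf *src, struct pktbuf *dst)
{
	dst->head = malloc(src->len);
	if (!dst->head)
		return -1;
	memcpy(dst->head, src->head, src->len);
	dst->len = src->len;
	dst->mac_off = src->mac_off;
	dst->net_off = src->net_off;
	dst->transport_off = src->transport_off;
	return 0;
}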
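
The fast path in tcp_rcv_established() above compares the fourth 32-bit word of the incoming header, masked with TCP_HP_BITS, against tp->pred_flags: a precomputed image of the header it expects (the usual data offset, only ACK set among the flags that matter, and the window value the peer is expected to keep advertising). Sequence and acknowledgement numbers are still checked separately. A sketch of building and testing such a prediction word, in host byte order and with illustrative names:

#include <stdbool.h>
#include <stdint.h>

/* Host-order view of the 4th 32-bit word of the TCP header:
 * |doff(4)|reserved(4)|CWR ECE URG ACK PSH RST SYN FIN|window(16)| */
#define HDR_FLAG_ACK	0x00100000u
#define HDR_FLAG_PSH	0x00080000u
#define HDR_RESERVED	0x0f000000u

/* Bits that must match the prediction; PSH and the reserved bits may vary. */
#define HDR_HP_BITS	(~(HDR_RESERVED | HDR_FLAG_PSH))

/* Build the expected word: plain ACK, fixed header length, expected window. */
static uint32_t build_pred_flags(unsigned int header_len_bytes, uint16_t window)
{
	return ((uint32_t)(header_len_bytes / 4) << 28) | HDR_FLAG_ACK | window;
}

/* Fast-path test: one masked compare instead of inspecting each flag. */
static bool header_predicted(uint32_t hdr_word_host_order, uint32_t pred_flags)
{
	return (hdr_word_host_order & HDR_HP_BITS) == pred_flags;
}

Masking out PSH and the reserved bits is what lets an ordinary data-carrying ACK, with or without PSH, take the fast path.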