X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fipv4%2Ftcp_input.c;h=4c10d9cad20f594e2bd6a3a15b0c6d45554ece38;hb=4c70f383e0c0273c4092c4efdb414be0966978b7;hp=593960d66ed968e8a8499cf272bd96acc137d350;hpb=e9144bd8da80f3136b23c615609798e371e885ac;p=safe%2Fjmp%2Flinux-2.6

diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 593960d..4c10d9c 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -85,7 +85,7 @@ int sysctl_tcp_adv_win_scale __read_mostly = 2;
 int sysctl_tcp_stdurg __read_mostly;
 int sysctl_tcp_rfc1337 __read_mostly;
 int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
-int sysctl_tcp_frto __read_mostly;
+int sysctl_tcp_frto __read_mostly = 2;
 int sysctl_tcp_frto_response __read_mostly;
 int sysctl_tcp_nometrics_save __read_mostly;
 
@@ -104,6 +104,7 @@ int sysctl_tcp_abc __read_mostly;
 #define FLAG_ONLY_ORIG_SACKED	0x200 /* SACKs only non-rexmit sent before RTO */
 #define FLAG_SND_UNA_ADVANCED	0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */
 #define FLAG_DSACKING_ACK	0x800 /* SACK blocks contained DSACK info */
+#define FLAG_NONHEAD_RETRANS_ACKED	0x1000 /* Non-head rexmitted data was ACKed */
 
 #define FLAG_ACKED		(FLAG_DATA_ACKED|FLAG_SYN_ACKED)
 #define FLAG_NOT_DUP		(FLAG_DATA|FLAG_WIN_UPDATE|FLAG_ACKED)
@@ -1019,7 +1020,86 @@ static void tcp_update_reordering(struct sock *sk, const int metric,
  *    for retransmitted and already SACKed segment -> reordering..
  * Both of these heuristics are not used in Loss state, when we cannot
  * account for retransmits accurately.
+ *
+ * SACK block validation.
+ * ----------------------
+ *
+ * SACK block range validation checks that the received SACK block fits to
+ * the expected sequence limits, i.e., it is between SND.UNA and SND.NXT.
+ * Note that SND.UNA is not included to the range though being valid because
+ * it means that the receiver is rather inconsistent with itself (reports
+ * SACK reneging when it should advance SND.UNA).
+ *
+ * Implements also blockage to start_seq wrap-around. Problem lies in the
+ * fact that though start_seq (s) is before end_seq (i.e., not reversed),
+ * there's no guarantee that it will be before snd_nxt (n). The problem
+ * happens when start_seq resides between end_seq wrap (e_w) and snd_nxt
+ * wrap (s_w):
+ *
+ *         <- outs wnd ->                          <- wrapzone ->
+ *         u     e      n                         u_w   e_w  s n_w
+ *         |     |      |                          |     |   |  |
+ * |<------------+------+----- TCP seqno space --------------+---------->|
+ * ...-- <2^31 ->|                                           |<--------...
+ * ...---- >2^31 ------>|                                    |<--------...
+ *
+ * Current code wouldn't be vulnerable but it's better still to discard such
+ * crazy SACK blocks. Doing this check for start_seq alone closes somewhat
+ * similar case (end_seq after snd_nxt wrap) as earlier reversed check in
+ * snd_nxt wrap -> snd_una region will then become "well defined", i.e.,
+ * equal to the ideal case (infinite seqno space without wrap caused issues).
+ *
+ * With D-SACK the lower bound is extended to cover sequence space below
+ * SND.UNA down to undo_marker, which is the last point of interest. Yet
+ * again, DSACK block must not to go across snd_una (for the same reason as
+ * for the normal SACK blocks, explained above). But there all simplicity
+ * ends, TCP might receive valid D-SACKs below that. As long as they reside
+ * fully below undo_marker they do not affect behavior in anyway and can
+ * therefore be safely ignored. In rare cases (which are more or less
+ * theoretical ones), the D-SACK will nicely cross that boundary due to skb
+ * fragmentation and packet reordering past skb's retransmission. To consider
+ * them correctly, the acceptable range must be extended even more though
+ * the exact amount is rather hard to quantify. However, tp->max_window can
+ * be used as an exaggerated estimate.
  */
+static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack,
+				  u32 start_seq, u32 end_seq)
+{
+	/* Too far in future, or reversed (interpretation is ambiguous) */
+	if (after(end_seq, tp->snd_nxt) || !before(start_seq, end_seq))
+		return 0;
+
+	/* Nasty start_seq wrap-around check (see comments above) */
+	if (!before(start_seq, tp->snd_nxt))
+		return 0;
+
+	/* In outstanding window? ...This is valid exit for DSACKs too.
+	 * start_seq == snd_una is non-sensical (see comments above)
+	 */
+	if (after(start_seq, tp->snd_una))
+		return 1;
+
+	if (!is_dsack || !tp->undo_marker)
+		return 0;
+
+	/* ...Then it's D-SACK, and must reside below snd_una completely */
+	if (!after(end_seq, tp->snd_una))
+		return 0;
+
+	if (!before(start_seq, tp->undo_marker))
+		return 1;
+
+	/* Too old */
+	if (!after(end_seq, tp->undo_marker))
+		return 0;
+
+	/* Undo_marker boundary crossing (overestimates a lot). Known already:
+	 *   start_seq < undo_marker and end_seq >= undo_marker.
+	 */
+	return !before(start_seq, end_seq - tp->max_window);
+}
+
+
 static int tcp_check_dsack(struct tcp_sock *tp, struct sk_buff *ack_skb,
 			   struct tcp_sack_block_wire *sp, int num_sacks,
 			   u32 prior_snd_una)
@@ -1161,6 +1241,17 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
 		int fack_count;
 		int dup_sack = (found_dup_sack && (i == first_sack_index));
 
+		if (!tcp_is_sackblock_valid(tp, dup_sack, start_seq, end_seq)) {
+			if (dup_sack) {
+				if (!tp->undo_marker)
+					NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDNOUNDO);
+				else
+					NET_INC_STATS_BH(LINUX_MIB_TCPDSACKIGNOREDOLD);
+			} else
+				NET_INC_STATS_BH(LINUX_MIB_TCPSACKDISCARD);
+			continue;
+		}
+
 		skb = cached_skb;
 		fack_count = cached_fack_count;
 
@@ -1504,6 +1595,8 @@ void tcp_enter_frto(struct sock *sk)
 	tp->undo_retrans = 0;
 
 	skb = tcp_write_queue_head(sk);
+	if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
+		tp->undo_marker = 0;
 	if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS) {
 		TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
 		tp->retrans_out -= tcp_skb_pcount(skb);
@@ -1553,6 +1646,8 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
 			/* ...enter this if branch just for the first segment */
 			flag |= FLAG_DATA_ACKED;
 		} else {
+			if (TCP_SKB_CB(skb)->sacked & TCPCB_RETRANS)
+				tp->undo_marker = 0;
 			TCP_SKB_CB(skb)->sacked &= ~(TCPCB_LOST|TCPCB_SACKED_RETRANS);
 		}
 
@@ -1568,7 +1663,6 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
 	tp->snd_cwnd = tcp_packets_in_flight(tp) + allowed_segments;
 	tp->snd_cwnd_cnt = 0;
 	tp->snd_cwnd_stamp = tcp_time_stamp;
-	tp->undo_marker = 0;
 	tp->frto_counter = 0;
 
 	tp->reordering = min_t(unsigned int, tp->reordering,
@@ -1577,7 +1671,7 @@ static void tcp_enter_frto_loss(struct sock *sk, int allowed_segments, int flag)
 	tp->high_seq = tp->frto_highmark;
 	TCP_ECN_queue_cwr(tp);
 
-	clear_all_retrans_hints(tp);
+	tcp_clear_retrans_hints_partial(tp);
 }
 
 void tcp_clear_retrans(struct tcp_sock *tp)
@@ -1617,10 +1711,14 @@ void tcp_enter_loss(struct sock *sk, int how)
 	tp->bytes_acked = 0;
 	tcp_clear_retrans(tp);
 
-	/* Push undo marker, if it was plain RTO and nothing
-	 * was retransmitted. */
-	if (!how)
+	if (!how) {
+		/* Push undo marker, if it was plain RTO and nothing
+		 * was retransmitted. */
 		tp->undo_marker = tp->snd_una;
+		tcp_clear_retrans_hints_partial(tp);
+	} else {
+		tcp_clear_all_retrans_hints(tp);
+	}
 
 	tcp_for_write_queue(skb, sk) {
 		if (skb == tcp_send_head(sk))
@@ -1647,8 +1745,6 @@ void tcp_enter_loss(struct sock *sk, int how)
 	TCP_ECN_queue_cwr(tp);
 	/* Abort FRTO algorithm if one is in progress */
 	tp->frto_counter = 0;
-
-	clear_all_retrans_hints(tp);
 }
 
 static int tcp_check_sack_reneging(struct sock *sk)
@@ -2013,7 +2109,7 @@ static void tcp_undo_cwr(struct sock *sk, const int undo)
 
 	/* There is something screwy going on with the retrans hints after
 	   an undo */
-	clear_all_retrans_hints(tp);
+	tcp_clear_all_retrans_hints(tp);
 }
 
 static inline int tcp_may_undo(struct tcp_sock *tp)
@@ -2106,7 +2202,7 @@ static int tcp_try_undo_loss(struct sock *sk)
 			TCP_SKB_CB(skb)->sacked &= ~TCPCB_LOST;
 		}
 
-		clear_all_retrans_hints(tp);
+		tcp_clear_all_retrans_hints(tp);
 
 		DBGUNDO(sk, "partial loss");
 		tp->lost_out = 0;
@@ -2209,8 +2305,8 @@ tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag)
 	 * 1. Reno does not count dupacks (sacked_out) automatically. */
 	if (!tp->packets_out)
 		tp->sacked_out = 0;
-	/* 2. SACK counts snd_fack in packets inaccurately. */
-	if (tp->sacked_out == 0)
+
+	if (WARN_ON(!tp->sacked_out && tp->fackets_out))
 		tp->fackets_out = 0;
 
 	/* Now state machine starts.
@@ -2424,8 +2520,7 @@ static void tcp_cong_avoid(struct sock *sk, u32 ack,
 /* Restart timer after forward progress on connection.
  * RFC2988 recommends to restart timer to now+rto.
  */
-
-static void tcp_ack_packets_out(struct sock *sk)
+static void tcp_rearm_rto(struct sock *sk)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 
@@ -2436,153 +2531,135 @@ static void tcp_ack_packets_out(struct sock *sk)
 	}
 }
 
-static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
-			 __u32 now, __s32 *seq_rtt)
+/* If we get here, the whole TSO packet has not been acked. */
+static u32 tcp_tso_acked(struct sock *sk, struct sk_buff *skb)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
-	__u32 seq = tp->snd_una;
-	__u32 packets_acked;
-	int acked = 0;
+	u32 packets_acked;
 
-	/* If we get here, the whole TSO packet has not been
-	 * acked.
-	 */
-	BUG_ON(!after(scb->end_seq, seq));
+	BUG_ON(!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una));
 
 	packets_acked = tcp_skb_pcount(skb);
-	if (tcp_trim_head(sk, skb, seq - scb->seq))
+	if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
 		return 0;
 	packets_acked -= tcp_skb_pcount(skb);
 
 	if (packets_acked) {
-		__u8 sacked = scb->sacked;
-
-		acked |= FLAG_DATA_ACKED;
-		if (sacked) {
-			if (sacked & TCPCB_RETRANS) {
-				if (sacked & TCPCB_SACKED_RETRANS)
-					tp->retrans_out -= packets_acked;
-				acked |= FLAG_RETRANS_DATA_ACKED;
-				*seq_rtt = -1;
-			} else if (*seq_rtt < 0)
-				*seq_rtt = now - scb->when;
-			if (sacked & TCPCB_SACKED_ACKED)
-				tp->sacked_out -= packets_acked;
-			if (sacked & TCPCB_LOST)
-				tp->lost_out -= packets_acked;
-			if (sacked & TCPCB_URG) {
-				if (tp->urg_mode &&
-				    !before(seq, tp->snd_up))
-					tp->urg_mode = 0;
-			}
-		} else if (*seq_rtt < 0)
-			*seq_rtt = now - scb->when;
-
-		if (tp->fackets_out) {
-			__u32 dval = min(tp->fackets_out, packets_acked);
-			tp->fackets_out -= dval;
-		}
-		/* hint's skb might be NULL but we don't need to care */
-		tp->fastpath_cnt_hint -= min_t(u32, packets_acked,
-					       tp->fastpath_cnt_hint);
-		tp->packets_out -= packets_acked;
-
 		BUG_ON(tcp_skb_pcount(skb) == 0);
-		BUG_ON(!before(scb->seq, scb->end_seq));
+		BUG_ON(!before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq));
 	}
 
-	return acked;
+	return packets_acked;
 }
 
-/* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
+/* Remove acknowledged frames from the retransmission queue. If our packet
+ * is before the ack sequence we can discard it as it's confirmed to have
+ * arrived at the other end.
+ */
+static int tcp_clean_rtx_queue(struct sock *sk, s32 *seq_rtt_p)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	const struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sk_buff *skb;
-	__u32 now = tcp_time_stamp;
-	int acked = 0;
+	u32 now = tcp_time_stamp;
+	int fully_acked = 1;
+	int flag = 0;
 	int prior_packets = tp->packets_out;
-	__s32 seq_rtt = -1;
+	s32 seq_rtt = -1;
 	ktime_t last_ackt = net_invalid_timestamp();
 
-	while ((skb = tcp_write_queue_head(sk)) &&
-	       skb != tcp_send_head(sk)) {
+	while ((skb = tcp_write_queue_head(sk)) && skb != tcp_send_head(sk)) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
-		__u8 sacked = scb->sacked;
+		u32 end_seq;
+		u32 packets_acked;
+		u8 sacked = scb->sacked;
 
-		/* If our packet is before the ack sequence we can
-		 * discard it as it's confirmed to have arrived at
-		 * the other end.
-		 */
 		if (after(scb->end_seq, tp->snd_una)) {
-			if (tcp_skb_pcount(skb) > 1 &&
-			    after(tp->snd_una, scb->seq))
-				acked |= tcp_tso_acked(sk, skb,
-						       now, &seq_rtt);
-			break;
-		}
+			if (tcp_skb_pcount(skb) == 1 ||
+			    !after(tp->snd_una, scb->seq))
+				break;
 
-		/* Initial outgoing SYN's get put onto the write_queue
-		 * just like anything else we transmit.  It is not
-		 * true data, and if we misinform our callers that
-		 * this ACK acks real data, we will erroneously exit
-		 * connection startup slow start one packet too
-		 * quickly.  This is severely frowned upon behavior.
-		 */
-		if (!(scb->flags & TCPCB_FLAG_SYN)) {
-			acked |= FLAG_DATA_ACKED;
+			packets_acked = tcp_tso_acked(sk, skb);
+			if (!packets_acked)
+				break;
+
+			fully_acked = 0;
+			end_seq = tp->snd_una;
 		} else {
-			acked |= FLAG_SYN_ACKED;
-			tp->retrans_stamp = 0;
+			packets_acked = tcp_skb_pcount(skb);
+			end_seq = scb->end_seq;
 		}
 
 		/* MTU probing checks */
-		if (icsk->icsk_mtup.probe_size) {
-			if (!after(tp->mtu_probe.probe_seq_end, TCP_SKB_CB(skb)->end_seq)) {
-				tcp_mtup_probe_success(sk, skb);
-			}
+		if (fully_acked && icsk->icsk_mtup.probe_size &&
+		    !after(tp->mtu_probe.probe_seq_end, scb->end_seq)) {
+			tcp_mtup_probe_success(sk, skb);
 		}
 
 		if (sacked) {
 			if (sacked & TCPCB_RETRANS) {
 				if (sacked & TCPCB_SACKED_RETRANS)
-					tp->retrans_out -= tcp_skb_pcount(skb);
-				acked |= FLAG_RETRANS_DATA_ACKED;
+					tp->retrans_out -= packets_acked;
+				flag |= FLAG_RETRANS_DATA_ACKED;
 				seq_rtt = -1;
+				if ((flag & FLAG_DATA_ACKED) ||
+				    (packets_acked > 1))
+					flag |= FLAG_NONHEAD_RETRANS_ACKED;
 			} else if (seq_rtt < 0) {
 				seq_rtt = now - scb->when;
-				last_ackt = skb->tstamp;
+				if (fully_acked)
+					last_ackt = skb->tstamp;
 			}
+
 			if (sacked & TCPCB_SACKED_ACKED)
-				tp->sacked_out -= tcp_skb_pcount(skb);
+				tp->sacked_out -= packets_acked;
 			if (sacked & TCPCB_LOST)
-				tp->lost_out -= tcp_skb_pcount(skb);
-			if (sacked & TCPCB_URG) {
-				if (tp->urg_mode &&
-				    !before(scb->end_seq, tp->snd_up))
-					tp->urg_mode = 0;
-			}
+				tp->lost_out -= packets_acked;
+
+			if ((sacked & TCPCB_URG) && tp->urg_mode &&
+			    !before(end_seq, tp->snd_up))
+				tp->urg_mode = 0;
 		} else if (seq_rtt < 0) {
 			seq_rtt = now - scb->when;
-			last_ackt = skb->tstamp;
+			if (fully_acked)
+				last_ackt = skb->tstamp;
+		}
+		tp->packets_out -= packets_acked;
+
+		/* Initial outgoing SYN's get put onto the write_queue
+		 * just like anything else we transmit.  It is not
+		 * true data, and if we misinform our callers that
+		 * this ACK acks real data, we will erroneously exit
+		 * connection startup slow start one packet too
+		 * quickly.  This is severely frowned upon behavior.
+		 */
+		if (!(scb->flags & TCPCB_FLAG_SYN)) {
+			flag |= FLAG_DATA_ACKED;
+		} else {
+			flag |= FLAG_SYN_ACKED;
+			tp->retrans_stamp = 0;
 		}
-		tcp_dec_pcount_approx(&tp->fackets_out, skb);
-		tp->packets_out -= tcp_skb_pcount(skb);
+
+		if (!fully_acked)
+			break;
+
 		tcp_unlink_write_queue(skb, sk);
 		sk_stream_free_skb(sk, skb);
-		clear_all_retrans_hints(tp);
+		tcp_clear_all_retrans_hints(tp);
 	}
 
-	if (acked&FLAG_ACKED) {
+	if (flag & FLAG_ACKED) {
 		u32 pkts_acked = prior_packets - tp->packets_out;
 		const struct tcp_congestion_ops *ca_ops
 			= inet_csk(sk)->icsk_ca_ops;
 
-		tcp_ack_update_rtt(sk, acked, seq_rtt);
-		tcp_ack_packets_out(sk);
+		tcp_ack_update_rtt(sk, flag, seq_rtt);
+		tcp_rearm_rto(sk);
 
+		tp->fackets_out -= min(pkts_acked, tp->fackets_out);
+		/* hint's skb might be NULL but we don't need to care */
+		tp->fastpath_cnt_hint -= min_t(u32, pkts_acked,
+					       tp->fastpath_cnt_hint);
 		if (tcp_is_reno(tp))
 			tcp_remove_reno_sacks(sk, pkts_acked);
 
@@ -2590,7 +2667,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 			s32 rtt_us = -1;
 
 			/* Is the ACK triggering packet unambiguous? */
-			if (!(acked & FLAG_RETRANS_DATA_ACKED)) {
+			if (!(flag & FLAG_RETRANS_DATA_ACKED)) {
 				/* High resolution needed and available? */
 				if (ca_ops->flags & TCP_CONG_RTT_STAMP &&
 				    !ktime_equal(last_ackt,
@@ -2629,7 +2706,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 	}
 #endif
 	*seq_rtt_p = seq_rtt;
-	return acked;
+	return flag;
 }
 
 static void tcp_ack_probe(struct sock *sk)
@@ -2784,6 +2861,10 @@ static int tcp_process_frto(struct sock *sk, int flag)
 	if (flag&FLAG_DATA_ACKED)
 		inet_csk(sk)->icsk_retransmits = 0;
 
+	if ((flag & FLAG_NONHEAD_RETRANS_ACKED) ||
+	    ((tp->frto_counter >= 2) && (flag & FLAG_RETRANS_DATA_ACKED)))
+		tp->undo_marker = 0;
+
 	if (!before(tp->snd_una, tp->frto_highmark)) {
 		tcp_enter_frto_loss(sk, (tp->frto_counter == 1 ? 2 : 3), flag);
 		return 1;
@@ -2848,6 +2929,7 @@ static int tcp_process_frto(struct sock *sk, int flag)
 			break;
 		}
 		tp->frto_counter = 0;
+		tp->undo_marker = 0;
 	}
 	return 0;
 }