[TCP]: MD5 Signature Option (RFC2385) support.
[safe/jmp/linux-2.6] net/ipv4/tcp_input.c
index 195d835..6ab3423 100644
@@ -63,7 +63,6 @@
  *             Pasi Sarolahti:         F-RTO for dealing with spurious RTOs
  */
 
-#include <linux/config.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/sysctl.h>
 #include <net/inet_common.h>
 #include <linux/ipsec.h>
 #include <asm/unaligned.h>
-
-int sysctl_tcp_timestamps = 1;
-int sysctl_tcp_window_scaling = 1;
-int sysctl_tcp_sack = 1;
-int sysctl_tcp_fack = 1;
-int sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
-int sysctl_tcp_ecn;
-int sysctl_tcp_dsack = 1;
-int sysctl_tcp_app_win = 31;
-int sysctl_tcp_adv_win_scale = 2;
-
-int sysctl_tcp_stdurg;
-int sysctl_tcp_rfc1337;
-int sysctl_tcp_max_orphans = NR_FILE;
-int sysctl_tcp_frto;
-int sysctl_tcp_nometrics_save;
-
-int sysctl_tcp_moderate_rcvbuf = 1;
-int sysctl_tcp_abc = 1;
+#include <net/netdma.h>
+
+int sysctl_tcp_timestamps __read_mostly = 1;
+int sysctl_tcp_window_scaling __read_mostly = 1;
+int sysctl_tcp_sack __read_mostly = 1;
+int sysctl_tcp_fack __read_mostly = 1;
+int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
+int sysctl_tcp_ecn __read_mostly;
+int sysctl_tcp_dsack __read_mostly = 1;
+int sysctl_tcp_app_win __read_mostly = 31;
+int sysctl_tcp_adv_win_scale __read_mostly = 2;
+
+int sysctl_tcp_stdurg __read_mostly;
+int sysctl_tcp_rfc1337 __read_mostly;
+int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
+int sysctl_tcp_frto __read_mostly;
+int sysctl_tcp_nometrics_save __read_mostly;
+
+int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
+int sysctl_tcp_abc __read_mostly;
 
 #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
 #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
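
The sysctls come back tagged __read_mostly, which places rarely-written globals in their own linker section so write-hot data elsewhere cannot false-share cache lines with them. A rough userspace approximation of the idea, using a GCC section attribute (the macro and section name are illustrative, not the kernel's definition):

    /* Userspace sketch of the __read_mostly idea: segregate
     * rarely-written globals into a dedicated section so they do
     * not share cache lines with frequently-written data.
     */
    #define READ_MOSTLY __attribute__((section(".data.read_mostly")))

    int tcp_timestamps_enabled READ_MOSTLY = 1;
    int tcp_sack_enabled       READ_MOSTLY = 1;
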
@@ -127,7 +127,7 @@ static void tcp_measure_rcv_mss(struct sock *sk,
        /* skb->len may jitter because of SACKs, even if peer
         * sends good full-sized frames.
         */
-       len = skb->len;
+       len = skb_shinfo(skb)->gso_size ?: skb->len;
        if (len >= icsk->icsk_ack.rcv_mss) {
                icsk->icsk_ack.rcv_mss = len;
        } else {
@@ -156,6 +156,8 @@ static void tcp_measure_rcv_mss(struct sock *sk,
                                return;
                        }
                }
+               if (icsk->icsk_ack.pending & ICSK_ACK_PUSHED)
+                       icsk->icsk_ack.pending |= ICSK_ACK_PUSHED2;
                icsk->icsk_ack.pending |= ICSK_ACK_PUSHED;
        }
 }
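
The PUSHED/PUSHED2 pair lets the delayed-ACK machinery tell the first undersized-but-pushed segment from a repeat: the first occurrence sets ICSK_ACK_PUSHED, and a second occurrence escalates to ICSK_ACK_PUSHED2. A minimal sketch of the promotion, with an illustrative flags word standing in for icsk->icsk_ack.pending (flag values here are made up):

    #define ACK_PUSHED  0x04   /* illustrative, not the kernel's values */
    #define ACK_PUSHED2 0x08

    static unsigned int mark_pushed(unsigned int pending)
    {
            if (pending & ACK_PUSHED)       /* seen before: escalate */
                    pending |= ACK_PUSHED2;
            pending |= ACK_PUSHED;          /* remember the first hit */
            return pending;
    }
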
@@ -933,7 +935,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
        const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        unsigned char *ptr = ack_skb->h.raw + TCP_SKB_CB(ack_skb)->sacked;
-       struct tcp_sack_block *sp = (struct tcp_sack_block *)(ptr+2);
+       struct tcp_sack_block_wire *sp = (struct tcp_sack_block_wire *)(ptr+2);
        int num_sacks = (ptr[1] - TCPOLEN_SACK_BASE)>>3;
        int reord = tp->packets_out;
        int prior_fackets;
@@ -1072,7 +1074,7 @@ tcp_sacktag_write_queue(struct sock *sk, struct sk_buff *ack_skb, u32 prior_snd_
                                else
                                        pkt_len = (end_seq -
                                                   TCP_SKB_CB(skb)->seq);
-                               if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->tso_size))
+                               if (tcp_fragment(sk, skb, pkt_len, skb_shinfo(skb)->gso_size))
                                        break;
                                pcount = tcp_skb_pcount(skb);
                        }
@@ -1649,7 +1651,7 @@ static void tcp_update_scoreboard(struct sock *sk, struct tcp_sock *tp)
         * Hence, we can detect timed out packets during fast
         * retransmit without falling to slow start.
         */
-       if (tcp_head_timedout(sk, tp)) {
+       if (!IsReno(tp) && tcp_head_timedout(sk, tp)) {
                struct sk_buff *skb;
 
                skb = tp->scoreboard_skb_hint ? tp->scoreboard_skb_hint
@@ -1688,17 +1690,26 @@ static inline void tcp_moderate_cwnd(struct tcp_sock *tp)
        tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
+/* Lower bound on congestion window is the slow start threshold
+ * unless the congestion avoidance module decides to override it.
+ */
+static inline u32 tcp_cwnd_min(const struct sock *sk)
+{
+       const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops;
+
+       return ca_ops->min_cwnd ? ca_ops->min_cwnd(sk) : tcp_sk(sk)->snd_ssthresh;
+}
+
 /* Decrease cwnd each second ack. */
 static void tcp_cwnd_down(struct sock *sk)
 {
-       const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int decr = tp->snd_cwnd_cnt + 1;
 
        tp->snd_cwnd_cnt = decr&1;
        decr >>= 1;
 
-       if (decr && tp->snd_cwnd > icsk->icsk_ca_ops->min_cwnd(sk))
+       if (decr && tp->snd_cwnd > tcp_cwnd_min(sk))
                tp->snd_cwnd -= decr;
 
        tp->snd_cwnd = min(tp->snd_cwnd, tcp_packets_in_flight(tp)+1);
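
tcp_cwnd_down() used to call icsk_ca_ops->min_cwnd unconditionally, forcing every congestion control module to supply one. tcp_cwnd_min() makes the callback optional, defaulting to snd_ssthresh. The optional-method pattern, sketched with hypothetical names:

    /* Sketch of the optional-callback pattern behind tcp_cwnd_min();
     * all names are illustrative.
     */
    struct ca_ops {
            unsigned int (*min_cwnd)(const void *sk);  /* NULL => default */
    };

    static unsigned int cwnd_floor(const struct ca_ops *ops, const void *sk,
                                   unsigned int ssthresh)
    {
            return ops->min_cwnd ? ops->min_cwnd(sk) : ssthresh;
    }
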
@@ -2228,13 +2239,12 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb,
        return acked;
 }
 
-static u32 tcp_usrtt(const struct sk_buff *skb)
+static u32 tcp_usrtt(struct timeval *tv)
 {
-       struct timeval tv, now;
+       struct timeval now;
 
        do_gettimeofday(&now);
-       skb_get_timestamp(skb, &tv);
-       return (now.tv_sec - tv.tv_sec) * 1000000 + (now.tv_usec - tv.tv_usec);
+       return (now.tv_sec - tv->tv_sec) * 1000000 + (now.tv_usec - tv->tv_usec);
 }
 
 /* Remove acknowledged frames from the retransmission queue. */
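
tcp_usrtt() now receives the transmit timestamp by pointer, so the caller can stash it during the loop and take one sample per ACK instead of one per skb. The computation is ordinary timeval subtraction in microseconds; a self-contained userspace sketch of the same arithmetic:

    #include <sys/time.h>

    /* Elapsed microseconds between a stored transmit time and now;
     * mirrors the tcp_usrtt() computation (userspace sketch).
     */
    static long elapsed_usec(const struct timeval *sent)
    {
            struct timeval now;

            gettimeofday(&now, NULL);
            return (now.tv_sec - sent->tv_sec) * 1000000L +
                   (now.tv_usec - sent->tv_usec);
    }
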
@@ -2249,6 +2259,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
        u32 pkts_acked = 0;
        void (*rtt_sample)(struct sock *sk, u32 usrtt)
                = icsk->icsk_ca_ops->rtt_sample;
+       struct timeval tv = { .tv_sec = 0, .tv_usec = 0 };
 
        while ((skb = skb_peek(&sk->sk_write_queue)) &&
               skb != sk->sk_send_head) {
@@ -2297,8 +2308,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
                                seq_rtt = -1;
                        } else if (seq_rtt < 0) {
                                seq_rtt = now - scb->when;
-                               if (rtt_sample)
-                                       (*rtt_sample)(sk, tcp_usrtt(skb));
+                               skb_get_timestamp(skb, &tv);
                        }
                        if (sacked & TCPCB_SACKED_ACKED)
                                tp->sacked_out -= tcp_skb_pcount(skb);
@@ -2311,8 +2321,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
                        }
                } else if (seq_rtt < 0) {
                        seq_rtt = now - scb->when;
-                       if (rtt_sample)
-                               (*rtt_sample)(sk, tcp_usrtt(skb));
+                       skb_get_timestamp(skb, &tv);
                }
                tcp_dec_pcount_approx(&tp->fackets_out, skb);
                tcp_packets_out_dec(tp, skb);
@@ -2324,6 +2333,8 @@ static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
        if (acked&FLAG_ACKED) {
                tcp_ack_update_rtt(sk, acked, seq_rtt);
                tcp_ack_packets_out(sk, tp);
+               if (rtt_sample && !(acked & FLAG_RETRANS_DATA_ACKED))
+                       (*rtt_sample)(sk, tcp_usrtt(&tv));
 
                if (icsk->icsk_ca_ops->pkts_acked)
                        icsk->icsk_ca_ops->pkts_acked(sk, pkts_acked);
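
The sample is taken only when FLAG_RETRANS_DATA_ACKED is clear, which is Karn's rule: an ACK that covers retransmitted data is ambiguous, since it may answer either the original transmission or the retransmission, so it must not feed the RTT estimator. A sketch of the guard, with illustrative flag values:

    /* Karn's rule: sample RTT only from unambiguous ACKs.
     * Flag values are illustrative.
     */
    #define F_ACKED          0x01
    #define F_RETRANS_ACKED  0x02

    static int rtt_sample_allowed(int flags)
    {
            return (flags & F_ACKED) && !(flags & F_RETRANS_ACKED);
    }
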
@@ -2496,8 +2507,13 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
        if (before(ack, prior_snd_una))
                goto old_ack;
 
-       if (sysctl_tcp_abc && icsk->icsk_ca_state < TCP_CA_CWR)
-               tp->bytes_acked += ack - prior_snd_una;
+       if (sysctl_tcp_abc) {
+               if (icsk->icsk_ca_state < TCP_CA_CWR)
+                       tp->bytes_acked += ack - prior_snd_una;
+               else if (icsk->icsk_ca_state == TCP_CA_Loss)
+                       /* we assume just one segment left the network */
+                       tp->bytes_acked += min(ack - prior_snd_una, tp->mss_cache);
+       }
 
        if (!(flag&FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
                /* Window is constant, pure forward advance.
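
Appropriate Byte Counting (RFC 3465) normally credits bytes_acked with the whole newly-ACKed range, but in TCP_CA_Loss one cumulative ACK can cover a long retransmitted stretch and would overinflate the credit. The patch caps it at one MSS per ACK in that state. A sketch, assuming ack and prior_una are u32 sequence numbers and mss is the cached segment size:

    #include <stdint.h>

    /* ABC credit (RFC 3465) sketch: full credit normally, at most
     * one MSS per ACK while in loss recovery.
     */
    static uint32_t abc_credit(uint32_t ack, uint32_t prior_una,
                               uint32_t mss, int in_loss)
    {
            uint32_t acked = ack - prior_una;  /* u32 wraparound is fine */

            return in_loss && acked > mss ? mss : acked;
    }
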
@@ -2613,7 +2629,7 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
                                switch(opcode) {
                                case TCPOPT_MSS:
                                        if(opsize==TCPOLEN_MSS && th->syn && !estab) {
-                                               u16 in_mss = ntohs(get_unaligned((__u16 *)ptr));
+                                               u16 in_mss = ntohs(get_unaligned((__be16 *)ptr));
                                                if (in_mss) {
                                                        if (opt_rx->user_mss && opt_rx->user_mss < in_mss)
                                                                in_mss = opt_rx->user_mss;
@@ -2641,8 +2657,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
                                                if ((estab && opt_rx->tstamp_ok) ||
                                                    (!estab && sysctl_tcp_timestamps)) {
                                                        opt_rx->saw_tstamp = 1;
-                                                       opt_rx->rcv_tsval = ntohl(get_unaligned((__u32 *)ptr));
-                                                       opt_rx->rcv_tsecr = ntohl(get_unaligned((__u32 *)(ptr+4)));
+                                                       opt_rx->rcv_tsval = ntohl(get_unaligned((__be32 *)ptr));
+                                                       opt_rx->rcv_tsecr = ntohl(get_unaligned((__be32 *)(ptr+4)));
                                                }
                                        }
                                        break;
@@ -2661,6 +2677,14 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx,
                                           opt_rx->sack_ok) {
                                                TCP_SKB_CB(skb)->sacked = (ptr - 2) - (unsigned char *)th;
                                        }
+#ifdef CONFIG_TCP_MD5SIG
+                               case TCPOPT_MD5SIG:
+                                       /*
+                                        * The MD5 Hash has already been
+                                        * checked (see tcp_v{4,6}_do_rcv()).
+                                        */
+                                       break;
+#endif
                                };
                                ptr+=opsize-2;
                                length-=opsize;
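
The parser can ignore TCPOPT_MD5SIG here because tcp_v4_do_rcv()/tcp_v6_do_rcv() have already verified the digest before the segment got this far. On the wire the option is kind 19, length 18: a two-byte option header followed by the 16-byte MD5 digest (RFC 2385). A sketch of the layout:

    #include <stdint.h>

    /* RFC 2385 TCP MD5 signature option (kind 19, length 18). */
    struct tcp_md5sig_opt {
            uint8_t kind;        /* TCPOPT_MD5SIG == 19 */
            uint8_t len;         /* TCPOLEN_MD5SIG == 18 */
            uint8_t digest[16];  /* MD5 over pseudo-header, TCP header
                                  * (checksum zeroed, options excluded),
                                  * data, and the connection key */
    };
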
@@ -2679,8 +2703,8 @@ static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th,
                return 0;
        } else if (tp->rx_opt.tstamp_ok &&
                   th->doff == (sizeof(struct tcphdr)>>2)+(TCPOLEN_TSTAMP_ALIGNED>>2)) {
-               __u32 *ptr = (__u32 *)(th + 1);
-               if (*ptr == ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+               __be32 *ptr = (__be32 *)(th + 1);
+               if (*ptr == htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
                                  | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP)) {
                        tp->rx_opt.saw_tstamp = 1;
                        ++ptr;
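
htonl() and ntohl() are the same byte permutation on every platform, so the comparison's result is unchanged; switching to htonl() with a __be32 pointer simply makes the types honest for sparse's endianness checking, since *ptr holds network-order data. The predicted word packs NOP(1), NOP(1), TIMESTAMP(8), TCPOLEN_TIMESTAMP(10): (1<<24) | (1<<16) | (8<<8) | 10 = 0x0101080a. A sketch of the check:

    #include <arpa/inet.h>
    #include <stdint.h>

    /* Fast-path test for the aligned timestamp option:
     * NOP, NOP, TIMESTAMP, len 10 => 0x0101080a on the wire.
     */
    static int is_aligned_tsopt(const uint32_t *wire_word)
    {
            return *wire_word == htonl(0x0101080a);
    }
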
@@ -3532,7 +3556,8 @@ void tcp_cwnd_application_limited(struct sock *sk)
        if (inet_csk(sk)->icsk_ca_state == TCP_CA_Open &&
            sk->sk_socket && !test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
                /* Limited by application or receiver window. */
-               u32 win_used = max(tp->snd_cwnd_used, 2U);
+               u32 init_win = tcp_init_cwnd(tp, __sk_dst_get(sk));
+               u32 win_used = max(tp->snd_cwnd_used, init_win);
                if (win_used < tp->snd_cwnd) {
                        tp->snd_ssthresh = tcp_current_ssthresh(sk);
                        tp->snd_cwnd = (tp->snd_cwnd + win_used) >> 1;
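
When the connection is application-limited, cwnd is moderated halfway toward what was actually used; the floor rises from a hard-coded 2 segments to tcp_init_cwnd() for the path, so moderation can never squeeze cwnd below the restart window. The step, sketched with illustrative names:

    /* Sketch: pull an underused cwnd halfway toward actual usage,
     * never below the initial window. Names are illustrative.
     */
    static unsigned int moderate_cwnd(unsigned int cwnd, unsigned int used,
                                      unsigned int init_win)
    {
            unsigned int win_used = used > init_win ? used : init_win;

            return win_used < cwnd ? (cwnd + win_used) / 2 : cwnd;
    }
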
@@ -3785,6 +3810,50 @@ static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *sk
                __tcp_checksum_complete_user(sk, skb);
 }
 
+#ifdef CONFIG_NET_DMA
+static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb, int hlen)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       int chunk = skb->len - hlen;
+       int dma_cookie;
+       int copied_early = 0;
+
+       if (tp->ucopy.wakeup)
+               return 0;
+
+       if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
+               tp->ucopy.dma_chan = get_softnet_dma();
+
+       if (tp->ucopy.dma_chan && skb->ip_summed == CHECKSUM_UNNECESSARY) {
+
+               dma_cookie = dma_skb_copy_datagram_iovec(tp->ucopy.dma_chan,
+                       skb, hlen, tp->ucopy.iov, chunk, tp->ucopy.pinned_list);
+
+               if (dma_cookie < 0)
+                       goto out;
+
+               tp->ucopy.dma_cookie = dma_cookie;
+               copied_early = 1;
+
+               tp->ucopy.len -= chunk;
+               tp->copied_seq += chunk;
+               tcp_rcv_space_adjust(sk);
+
+               if ((tp->ucopy.len == 0) ||
+                   (tcp_flag_word(skb->h.th) & TCP_FLAG_PSH) ||
+                   (atomic_read(&sk->sk_rmem_alloc) > (sk->sk_rcvbuf >> 1))) {
+                       tp->ucopy.wakeup = 1;
+                       sk->sk_data_ready(sk, 0);
+               }
+       } else if (chunk > 0) {
+               tp->ucopy.wakeup = 1;
+               sk->sk_data_ready(sk, 0);
+       }
+out:
+       return copied_early;
+}
+#endif /* CONFIG_NET_DMA */
+
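
tcp_dma_try_early_copy() hands the copy to user space off to an I/OAT DMA channel so it can overlap with further protocol processing; it only runs when the NIC already verified the checksum (CHECKSUM_UNNECESSARY). The sleeping reader is woken once the iovec is full, the sender set PSH, or the receive buffer is more than half consumed, and early-copied skbs are later parked on sk_async_wait_queue (see the no_ack path below) rather than freed, since the DMA engine may still be reading them. The wakeup predicate, sketched with illustrative names:

    /* Sketch of the early-copy wakeup test: wake the reader when the
     * iovec is exhausted, the peer pushed, or rcvbuf is half full.
     */
    static int should_wake(int iov_left, int psh_seen,
                           int rmem_alloc, int rcvbuf)
    {
            return iov_left == 0 || psh_seen || rmem_alloc > (rcvbuf >> 1);
    }
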
 /*
  *     TCP receive function for the ESTABLISHED state. 
  *
@@ -3850,10 +3919,10 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 
                /* Check timestamp */
                if (tcp_header_len == sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED) {
-                       __u32 *ptr = (__u32 *)(th + 1);
+                       __be32 *ptr = (__be32 *)(th + 1);
 
                        /* No? Slow path! */
-                       if (*ptr != ntohl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
+                       if (*ptr != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
                                          | (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP))
                                goto slow_path;
 
@@ -3886,8 +3955,6 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                    tp->rcv_nxt == tp->rcv_wup)
                                        tcp_store_ts_recent(tp);
 
-                               tcp_rcv_rtt_measure_ts(sk, skb);
-
                                /* We know that such packets are checksummed
                                 * on entry.
                                 */
@@ -3901,14 +3968,23 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                        }
                } else {
                        int eaten = 0;
+                       int copied_early = 0;
 
-                       if (tp->ucopy.task == current &&
-                           tp->copied_seq == tp->rcv_nxt &&
-                           len - tcp_header_len <= tp->ucopy.len &&
-                           sock_owned_by_user(sk)) {
-                               __set_current_state(TASK_RUNNING);
+                       if (tp->copied_seq == tp->rcv_nxt &&
+                           len - tcp_header_len <= tp->ucopy.len) {
+#ifdef CONFIG_NET_DMA
+                               if (tcp_dma_try_early_copy(sk, skb, tcp_header_len)) {
+                                       copied_early = 1;
+                                       eaten = 1;
+                               }
+#endif
+                               if (tp->ucopy.task == current && sock_owned_by_user(sk) && !copied_early) {
+                                       __set_current_state(TASK_RUNNING);
 
-                               if (!tcp_copy_to_iovec(sk, skb, tcp_header_len)) {
+                                       if (!tcp_copy_to_iovec(sk, skb, tcp_header_len))
+                                               eaten = 1;
+                               }
+                               if (eaten) {
                                        /* Predicted packet is in window by definition.
                                         * seq == rcv_nxt and rcv_wup <= rcv_nxt.
                                         * Hence, check seq<=rcv_wup reduces to:
@@ -3924,8 +4000,9 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
                                        __skb_pull(skb, tcp_header_len);
                                        tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                                        NET_INC_STATS_BH(LINUX_MIB_TCPHPHITSTOUSER);
-                                       eaten = 1;
                                }
+                               if (copied_early)
+                                       tcp_cleanup_rbuf(sk, skb->len);
                        }
                        if (!eaten) {
                                if (tcp_checksum_complete_user(sk, skb))
@@ -3966,6 +4043,11 @@ int tcp_rcv_established(struct sock *sk, struct sk_buff *skb,
 
                        __tcp_ack_snd_check(sk, 0);
 no_ack:
+#ifdef CONFIG_NET_DMA
+                       if (copied_early)
+                               __skb_queue_tail(&sk->sk_async_wait_queue, skb);
+                       else
+#endif
                        if (eaten)
                                __kfree_skb(skb);
                        else
@@ -4111,8 +4193,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                 */
 
                TCP_ECN_rcv_synack(tp, th);
-               if (tp->ecn_flags&TCP_ECN_OK)
-                       sock_set_flag(sk, SOCK_NO_LARGESEND);
 
                tp->snd_wl1 = TCP_SKB_CB(skb)->seq;
                tcp_ack(sk, skb, FLAG_SLOWPATH);
@@ -4158,6 +4238,8 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb,
                mb();
                tcp_set_state(sk, TCP_ESTABLISHED);
 
+               security_inet_conn_established(sk, skb);
+
                /* Make sure socket is routed, for correct metrics.  */
                icsk->icsk_af_ops->rebuild_header(sk);
 
@@ -4255,8 +4337,6 @@ discard:
                tp->max_window = tp->snd_wnd;
 
                TCP_ECN_rcv_syn(tp, th);
-               if (tp->ecn_flags&TCP_ECN_OK)
-                       sock_set_flag(sk, SOCK_NO_LARGESEND);
 
                tcp_mtup_init(sk);
                tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
@@ -4559,7 +4639,6 @@ discard:
 
 EXPORT_SYMBOL(sysctl_tcp_ecn);
 EXPORT_SYMBOL(sysctl_tcp_reordering);
-EXPORT_SYMBOL(sysctl_tcp_abc);
 EXPORT_SYMBOL(tcp_parse_options);
 EXPORT_SYMBOL(tcp_rcv_established);
 EXPORT_SYMBOL(tcp_rcv_state_process);