[NET] IPV4: Fix whitespace errors.
[safe/jmp/linux-2.6] / net / ipv4 / tcp_output.c
index 566045e..cebe9aa 100644 (file)
 #include <linux/smp_lock.h>
 
 /* People can turn this off for buggy TCP's found in printers etc. */
 #include <linux/smp_lock.h>
 
 /* People can turn this off for buggy TCP's found in printers etc. */
-int sysctl_tcp_retrans_collapse = 1;
+int sysctl_tcp_retrans_collapse __read_mostly = 1;
+
+/* People can turn this on to work with those rare, broken TCPs that
+ * interpret the window field as a signed quantity.
+ */
+int sysctl_tcp_workaround_signed_windows __read_mostly = 0;
 
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
  */
 
 /* This limits the percentage of the congestion window which we
  * will allow a single TSO frame to consume.  Building TSO frames
  * which are too large can cause TCP streams to be bursty.
  */
-int sysctl_tcp_tso_win_divisor = 3;
+int sysctl_tcp_tso_win_divisor __read_mostly = 3;
+
+int sysctl_tcp_mtu_probing __read_mostly = 0;
+int sysctl_tcp_base_mss __read_mostly = 512;
+
+/* By default, RFC2861 behavior.  */
+int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
 
 
-static inline void update_send_head(struct sock *sk, struct tcp_sock *tp,
-                                   struct sk_buff *skb)
+static void update_send_head(struct sock *sk, struct tcp_sock *tp,
+                            struct sk_buff *skb)
 {
        sk->sk_send_head = skb->next;
        if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
 {
        sk->sk_send_head = skb->next;
        if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue)
@@ -105,47 +116,48 @@ static __u16 tcp_advertise_mss(struct sock *sk)
 
 /* RFC2861. Reset CWND after idle period longer than RTO to "restart window".
  * This is the first part of cwnd validation mechanism. */
 
 /* RFC2861. Reset CWND after idle period longer than RTO to "restart window".
  * This is the first part of cwnd validation mechanism. */
-static void tcp_cwnd_restart(struct tcp_sock *tp, struct dst_entry *dst)
+static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst)
 {
 {
+       struct tcp_sock *tp = tcp_sk(sk);
        s32 delta = tcp_time_stamp - tp->lsndtime;
        u32 restart_cwnd = tcp_init_cwnd(tp, dst);
        u32 cwnd = tp->snd_cwnd;
 
        s32 delta = tcp_time_stamp - tp->lsndtime;
        u32 restart_cwnd = tcp_init_cwnd(tp, dst);
        u32 cwnd = tp->snd_cwnd;
 
-       tcp_ca_event(tp, CA_EVENT_CWND_RESTART);
+       tcp_ca_event(sk, CA_EVENT_CWND_RESTART);
 
 
-       tp->snd_ssthresh = tcp_current_ssthresh(tp);
+       tp->snd_ssthresh = tcp_current_ssthresh(sk);
        restart_cwnd = min(restart_cwnd, cwnd);
 
        restart_cwnd = min(restart_cwnd, cwnd);
 
-       while ((delta -= tp->rto) > 0 && cwnd > restart_cwnd)
+       while ((delta -= inet_csk(sk)->icsk_rto) > 0 && cwnd > restart_cwnd)
                cwnd >>= 1;
        tp->snd_cwnd = max(cwnd, restart_cwnd);
        tp->snd_cwnd_stamp = tcp_time_stamp;
        tp->snd_cwnd_used = 0;
 }
 
                cwnd >>= 1;
        tp->snd_cwnd = max(cwnd, restart_cwnd);
        tp->snd_cwnd_stamp = tcp_time_stamp;
        tp->snd_cwnd_used = 0;
 }
 
-static inline void tcp_event_data_sent(struct tcp_sock *tp,
-                                      struct sk_buff *skb, struct sock *sk)
+static void tcp_event_data_sent(struct tcp_sock *tp,
+                               struct sk_buff *skb, struct sock *sk)
 {
 {
-       u32 now = tcp_time_stamp;
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       const u32 now = tcp_time_stamp;
 
 
-       if (!tp->packets_out && (s32)(now - tp->lsndtime) > tp->rto)
-               tcp_cwnd_restart(tp, __sk_dst_get(sk));
+       if (sysctl_tcp_slow_start_after_idle &&
+           (!tp->packets_out && (s32)(now - tp->lsndtime) > icsk->icsk_rto))
+               tcp_cwnd_restart(sk, __sk_dst_get(sk));
 
        tp->lsndtime = now;
 
        /* If it is a reply for ato after last received
         * packet, enter pingpong mode.
         */
 
        tp->lsndtime = now;
 
        /* If it is a reply for ato after last received
         * packet, enter pingpong mode.
         */
-       if ((u32)(now - tp->ack.lrcvtime) < tp->ack.ato)
-               tp->ack.pingpong = 1;
+       if ((u32)(now - icsk->icsk_ack.lrcvtime) < icsk->icsk_ack.ato)
+               icsk->icsk_ack.pingpong = 1;
 }
 
 }
 
-static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
+static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
 {
 {
-       struct tcp_sock *tp = tcp_sk(sk);
-
-       tcp_dec_quickack_mode(tp, pkts);
-       tcp_clear_xmit_timer(sk, TCP_TIME_DACK);
+       tcp_dec_quickack_mode(sk, pkts);
+       inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
 }
 
 /* Determine a window scaling and initial window to offer.
 }
 
 /* Determine a window scaling and initial window to offer.
@@ -171,18 +183,25 @@ void tcp_select_initial_window(int __space, __u32 mss,
                space = (space / mss) * mss;
 
        /* NOTE: offering an initial window larger than 32767
                space = (space / mss) * mss;
 
        /* NOTE: offering an initial window larger than 32767
-        * will break some buggy TCP stacks. We try to be nice.
-        * If we are not window scaling, then this truncates
-        * our initial window offering to 32k. There should also
-        * be a sysctl option to stop being nice.
+        * will break some buggy TCP stacks. If the admin tells us
+        * it is likely we could be speaking with such a buggy stack
+        * we will truncate our initial window offering to 32K-1
+        * unless the remote has sent us a window scaling option,
+        * which we interpret as a sign the remote TCP is not
+        * misinterpreting the window field as a signed quantity.
         */
         */
-       (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+       if (sysctl_tcp_workaround_signed_windows)
+               (*rcv_wnd) = min(space, MAX_TCP_WINDOW);
+       else
+               (*rcv_wnd) = space;
+
        (*rcv_wscale) = 0;
        if (wscale_ok) {
                /* Set window scaling on max possible window
        (*rcv_wscale) = 0;
        if (wscale_ok) {
                /* Set window scaling on max possible window
-                * See RFC1323 for an explanation of the limit to 14 
+                * See RFC1323 for an explanation of the limit to 14
                 */
                space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
                 */
                space = max_t(u32, sysctl_tcp_rmem[2], sysctl_rmem_max);
+               space = min_t(u32, space, *window_clamp);
                while (space > 65535 && (*rcv_wscale) < 14) {
                        space >>= 1;
                        (*rcv_wscale)++;
                while (space > 65535 && (*rcv_wscale) < 14) {
                        space >>= 1;
                        (*rcv_wscale)++;
@@ -190,7 +209,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
        }
 
        /* Set initial window to value enough for senders,
        }
 
        /* Set initial window to value enough for senders,
-        * following RFC1414. Senders, not following this RFC,
+        * following RFC2414. Senders, not following this RFC,
         * will be satisfied with 2.
         */
        if (mss > (1<<*rcv_wscale)) {
         * will be satisfied with 2.
         */
        if (mss > (1<<*rcv_wscale)) {
@@ -212,7 +231,7 @@ void tcp_select_initial_window(int __space, __u32 mss,
  * value can be stuffed directly into th->window for an outgoing
  * frame.
  */
  * value can be stuffed directly into th->window for an outgoing
  * frame.
  */
-static __inline__ u16 tcp_select_window(struct sock *sk)
+static u16 tcp_select_window(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        u32 cur_win = tcp_receive_window(tp);
 {
        struct tcp_sock *tp = tcp_sk(sk);
        u32 cur_win = tcp_receive_window(tp);
@@ -235,7 +254,7 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
        /* Make sure we do not exceed the maximum possible
         * scaled window.
         */
        /* Make sure we do not exceed the maximum possible
         * scaled window.
         */
-       if (!tp->rx_opt.rcv_wscale)
+       if (!tp->rx_opt.rcv_wscale && sysctl_tcp_workaround_signed_windows)
                new_win = min(new_win, MAX_TCP_WINDOW);
        else
                new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
                new_win = min(new_win, MAX_TCP_WINDOW);
        else
                new_win = min(new_win, (65535U << tp->rx_opt.rcv_wscale));
@@ -250,6 +269,111 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
        return new_win;
 }
 
        return new_win;
 }
 
+static void tcp_build_and_update_options(__be32 *ptr, struct tcp_sock *tp,
+                                        __u32 tstamp, __u8 **md5_hash)
+{      /* Write the TCP options for a non-SYN segment at ptr: timestamp, SACK blocks, optional MD5 header. */
+       if (tp->rx_opt.tstamp_ok) {     /* one header word (NOP, NOP, kind, length), then tstamp and ts_recent */
+               *ptr++ = htonl((TCPOPT_NOP << 24) |
+                              (TCPOPT_NOP << 16) |
+                              (TCPOPT_TIMESTAMP << 8) |
+                              TCPOLEN_TIMESTAMP);
+               *ptr++ = htonl(tstamp);
+               *ptr++ = htonl(tp->rx_opt.ts_recent);
+       }
+       if (tp->rx_opt.eff_sacks) {     /* SACK option carrying eff_sacks blocks */
+               struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks;
+               int this_sack;  /* index of the block being written */
+
+               *ptr++ = htonl((TCPOPT_NOP  << 24) |
+                              (TCPOPT_NOP  << 16) |
+                              (TCPOPT_SACK <<  8) |
+                              (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks *
+                                                    TCPOLEN_SACK_PERBLOCK)));
+               for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) {
+                       *ptr++ = htonl(sp[this_sack].start_seq);        /* two 32-bit words per block */
+                       *ptr++ = htonl(sp[this_sack].end_seq);
+               }
+               if (tp->rx_opt.dsack) { /* the "update" part: clear the dsack flag and drop one from eff_sacks */
+                       tp->rx_opt.dsack = 0;
+                       tp->rx_opt.eff_sacks--;
+               }
+       }
+#ifdef CONFIG_TCP_MD5SIG
+       if (md5_hash) { /* caller wants an MD5 signature option: emit only its header word here */
+               *ptr++ = htonl((TCPOPT_NOP << 24) |
+                              (TCPOPT_NOP << 16) |
+                              (TCPOPT_MD5SIG << 8) |
+                              TCPOLEN_MD5SIG);
+               *md5_hash = (__u8 *)ptr;        /* hand back the address right after the header word */
+       }
+#endif
+}
+
+/* Construct a tcp options header for a SYN or SYN_ACK packet.
+ * If this is ever changed make sure to change the definition of
+ * MAX_SYN_SIZE to match the new maximum number of options that you
+ * can generate.
+ *
+ * Note - that with the RFC2385 TCP option, we make room for the
+ * 16 byte MD5 hash. This will be filled in later, so the pointer for the
+ * location to be filled is passed back up.
+ */
+static void tcp_syn_build_options(__be32 *ptr, int mss, int ts, int sack,
+                                 int offer_wscale, int wscale, __u32 tstamp,
+                                 __u32 ts_recent, __u8 **md5_hash)
+{
+       /* We always get an MSS option.
+        * The option bytes which will be seen in normal data
+        * packets should timestamps be used, must be in the MSS
+        * advertised.  But we subtract them from tp->mss_cache so
+        * that calculations in tcp_sendmsg are simpler etc.
+        * So account for this fact here if necessary.  If we
+        * don't do this correctly, as a receiver we won't
+        * recognize data packets as being full sized when we
+        * should, and thus we won't abide by the delayed ACK
+        * rules correctly.
+        * SACKs don't matter, we never delay an ACK when we
+        * have any of those going out.
+        */
+       *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); /* kind, length and MSS value in one word */
+       if (ts) {       /* timestamps offered: SACK_PERM, when set, takes the place of the two NOP bytes */
+               if(sack)
+                       *ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
+                                      (TCPOLEN_SACK_PERM << 16) |
+                                      (TCPOPT_TIMESTAMP << 8) |
+                                      TCPOLEN_TIMESTAMP);
+               else
+                       *ptr++ = htonl((TCPOPT_NOP << 24) |
+                                      (TCPOPT_NOP << 16) |
+                                      (TCPOPT_TIMESTAMP << 8) |
+                                      TCPOLEN_TIMESTAMP);
+               *ptr++ = htonl(tstamp);         /* TSVAL */
+               *ptr++ = htonl(ts_recent);      /* TSECR */
+       } else if(sack) /* no timestamps: SACK_PERM gets its own word behind two NOPs */
+               *ptr++ = htonl((TCPOPT_NOP << 24) |
+                              (TCPOPT_NOP << 16) |
+                              (TCPOPT_SACK_PERM << 8) |
+                              TCPOLEN_SACK_PERM);
+       if (offer_wscale)       /* one NOP, then the window scale option carrying wscale */
+               *ptr++ = htonl((TCPOPT_NOP << 24) |
+                              (TCPOPT_WINDOW << 16) |
+                              (TCPOLEN_WINDOW << 8) |
+                              (wscale));
+#ifdef CONFIG_TCP_MD5SIG
+       /*
+        * If MD5 is enabled, then we set the option, and include the size
+        * (always 18). The actual MD5 hash is added just before the
+        * packet is sent.
+        */
+       if (md5_hash) {
+               *ptr++ = htonl((TCPOPT_NOP << 24) |
+                              (TCPOPT_NOP << 16) |
+                              (TCPOPT_MD5SIG << 8) |
+                              TCPOLEN_MD5SIG);
+               *md5_hash = (__u8 *) ptr;       /* address right after the header word, returned to the caller */
+       }
+#endif
+}
 
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
 
 /* This routine actually transmits TCP packets queued in by
  * tcp_do_sendmsg().  This is used by both the initial
@@ -262,128 +386,176 @@ static __inline__ u16 tcp_select_window(struct sock *sk)
  * We are working here with either a clone of the original
  * SKB, or a fresh unique copy made by the retransmit engine.
  */
  * We are working here with either a clone of the original
  * SKB, or a fresh unique copy made by the retransmit engine.
  */
-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb)
+static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, gfp_t gfp_mask)
 {
 {
-       if (skb != NULL) {
-               struct inet_sock *inet = inet_sk(sk);
-               struct tcp_sock *tp = tcp_sk(sk);
-               struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);
-               int tcp_header_size = tp->tcp_header_len;
-               struct tcphdr *th;
-               int sysctl_flags;
-               int err;
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+       struct inet_sock *inet;
+       struct tcp_sock *tp;
+       struct tcp_skb_cb *tcb;
+       int tcp_header_size;
+#ifdef CONFIG_TCP_MD5SIG
+       struct tcp_md5sig_key *md5;
+       __u8 *md5_hash_location;
+#endif
+       struct tcphdr *th;
+       int sysctl_flags;
+       int err;
 
 
-               BUG_ON(!tcp_skb_pcount(skb));
+       BUG_ON(!skb || !tcp_skb_pcount(skb));
+
+       /* If congestion control is doing timestamping, we must
+        * take such a timestamp before we potentially clone/copy.
+        */
+       if (icsk->icsk_ca_ops->rtt_sample)
+               __net_timestamp(skb);
+
+       if (likely(clone_it)) {
+               if (unlikely(skb_cloned(skb)))
+                       skb = pskb_copy(skb, gfp_mask);
+               else
+                       skb = skb_clone(skb, gfp_mask);
+               if (unlikely(!skb))
+                       return -ENOBUFS;
+       }
+
+       inet = inet_sk(sk);
+       tp = tcp_sk(sk);
+       tcb = TCP_SKB_CB(skb);
+       tcp_header_size = tp->tcp_header_len;
 
 #define SYSCTL_FLAG_TSTAMPS    0x1
 #define SYSCTL_FLAG_WSCALE     0x2
 #define SYSCTL_FLAG_SACK       0x4
 
 
 #define SYSCTL_FLAG_TSTAMPS    0x1
 #define SYSCTL_FLAG_WSCALE     0x2
 #define SYSCTL_FLAG_SACK       0x4
 
-               /* If congestion control is doing timestamping */
-               if (tp->ca_ops->rtt_sample)
-                       do_gettimeofday(&skb->stamp);
-
-               sysctl_flags = 0;
-               if (tcb->flags & TCPCB_FLAG_SYN) {
-                       tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
-                       if(sysctl_tcp_timestamps) {
-                               tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
-                               sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
-                       }
-                       if(sysctl_tcp_window_scaling) {
-                               tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
-                               sysctl_flags |= SYSCTL_FLAG_WSCALE;
-                       }
-                       if(sysctl_tcp_sack) {
-                               sysctl_flags |= SYSCTL_FLAG_SACK;
-                               if(!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
-                                       tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
-                       }
-               } else if (tp->rx_opt.eff_sacks) {
-                       /* A SACK is 2 pad bytes, a 2 byte header, plus
-                        * 2 32-bit sequence numbers for each SACK block.
-                        */
-                       tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
-                                           (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
+       sysctl_flags = 0;
+       if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+               tcp_header_size = sizeof(struct tcphdr) + TCPOLEN_MSS;
+               if(sysctl_tcp_timestamps) {
+                       tcp_header_size += TCPOLEN_TSTAMP_ALIGNED;
+                       sysctl_flags |= SYSCTL_FLAG_TSTAMPS;
                }
                }
-               
-               if (tcp_packets_in_flight(tp) == 0)
-                       tcp_ca_event(tp, CA_EVENT_TX_START);
-
-               th = (struct tcphdr *) skb_push(skb, tcp_header_size);
-               skb->h.th = th;
-               skb_set_owner_w(skb, sk);
-
-               /* Build TCP header and checksum it. */
-               th->source              = inet->sport;
-               th->dest                = inet->dport;
-               th->seq                 = htonl(tcb->seq);
-               th->ack_seq             = htonl(tp->rcv_nxt);
-               *(((__u16 *)th) + 6)    = htons(((tcp_header_size >> 2) << 12) | tcb->flags);
-               if (tcb->flags & TCPCB_FLAG_SYN) {
-                       /* RFC1323: The window in SYN & SYN/ACK segments
-                        * is never scaled.
-                        */
-                       th->window      = htons(tp->rcv_wnd);
-               } else {
-                       th->window      = htons(tcp_select_window(sk));
+               if (sysctl_tcp_window_scaling) {
+                       tcp_header_size += TCPOLEN_WSCALE_ALIGNED;
+                       sysctl_flags |= SYSCTL_FLAG_WSCALE;
                }
                }
-               th->check               = 0;
-               th->urg_ptr             = 0;
-
-               if (tp->urg_mode &&
-                   between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF)) {
-                       th->urg_ptr             = htons(tp->snd_up-tcb->seq);
-                       th->urg                 = 1;
+               if (sysctl_tcp_sack) {
+                       sysctl_flags |= SYSCTL_FLAG_SACK;
+                       if (!(sysctl_flags & SYSCTL_FLAG_TSTAMPS))
+                               tcp_header_size += TCPOLEN_SACKPERM_ALIGNED;
                }
                }
+       } else if (unlikely(tp->rx_opt.eff_sacks)) {
+               /* A SACK is 2 pad bytes, a 2 byte header, plus
+                * 2 32-bit sequence numbers for each SACK block.
+                */
+               tcp_header_size += (TCPOLEN_SACK_BASE_ALIGNED +
+                                   (tp->rx_opt.eff_sacks *
+                                    TCPOLEN_SACK_PERBLOCK));
+       }
 
 
-               if (tcb->flags & TCPCB_FLAG_SYN) {
-                       tcp_syn_build_options((__u32 *)(th + 1),
-                                             tcp_advertise_mss(sk),
-                                             (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
-                                             (sysctl_flags & SYSCTL_FLAG_SACK),
-                                             (sysctl_flags & SYSCTL_FLAG_WSCALE),
-                                             tp->rx_opt.rcv_wscale,
-                                             tcb->when,
-                                             tp->rx_opt.ts_recent);
-               } else {
-                       tcp_build_and_update_options((__u32 *)(th + 1),
-                                                    tp, tcb->when);
+       if (tcp_packets_in_flight(tp) == 0)
+               tcp_ca_event(sk, CA_EVENT_TX_START);
 
 
-                       TCP_ECN_send(sk, tp, skb, tcp_header_size);
-               }
-               tp->af_specific->send_check(sk, th, skb->len, skb);
+#ifdef CONFIG_TCP_MD5SIG
+       /*
+        * Are we doing MD5 on this segment? If so - make
+        * room for it.
+        */
+       md5 = tp->af_specific->md5_lookup(sk, sk);
+       if (md5)
+               tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
+#endif
+
+       th = (struct tcphdr *) skb_push(skb, tcp_header_size);
+       skb->h.th = th;
+       skb_set_owner_w(skb, sk);
+
+       /* Build TCP header and checksum it. */
+       th->source              = inet->sport;
+       th->dest                = inet->dport;
+       th->seq                 = htonl(tcb->seq);
+       th->ack_seq             = htonl(tp->rcv_nxt);
+       *(((__be16 *)th) + 6)   = htons(((tcp_header_size >> 2) << 12) |
+                                       tcb->flags);
+
+       if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+               /* RFC1323: The window in SYN & SYN/ACK segments
+                * is never scaled.
+                */
+               th->window      = htons(tp->rcv_wnd);
+       } else {
+               th->window      = htons(tcp_select_window(sk));
+       }
+       th->check               = 0;
+       th->urg_ptr             = 0;
+
+       if (unlikely(tp->urg_mode &&
+                    between(tp->snd_up, tcb->seq+1, tcb->seq+0xFFFF))) {
+               th->urg_ptr             = htons(tp->snd_up-tcb->seq);
+               th->urg                 = 1;
+       }
 
 
-               if (tcb->flags & TCPCB_FLAG_ACK)
-                       tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+       if (unlikely(tcb->flags & TCPCB_FLAG_SYN)) {
+               tcp_syn_build_options((__be32 *)(th + 1),
+                                     tcp_advertise_mss(sk),
+                                     (sysctl_flags & SYSCTL_FLAG_TSTAMPS),
+                                     (sysctl_flags & SYSCTL_FLAG_SACK),
+                                     (sysctl_flags & SYSCTL_FLAG_WSCALE),
+                                     tp->rx_opt.rcv_wscale,
+                                     tcb->when,
+                                     tp->rx_opt.ts_recent,
+
+#ifdef CONFIG_TCP_MD5SIG
+                                     md5 ? &md5_hash_location :
+#endif
+                                     NULL);
+       } else {
+               tcp_build_and_update_options((__be32 *)(th + 1),
+                                            tp, tcb->when,
+#ifdef CONFIG_TCP_MD5SIG
+                                            md5 ? &md5_hash_location :
+#endif
+                                            NULL);
+               TCP_ECN_send(sk, tp, skb, tcp_header_size);
+       }
+
+#ifdef CONFIG_TCP_MD5SIG
+       /* Calculate the MD5 hash, as we have all we need now */
+       if (md5) {
+               tp->af_specific->calc_md5_hash(md5_hash_location,
+                                              md5,
+                                              sk, NULL, NULL,
+                                              skb->h.th,
+                                              sk->sk_protocol,
+                                              skb->len);
+       }
+#endif
 
 
-               if (skb->len != tcp_header_size)
-                       tcp_event_data_sent(tp, skb, sk);
+       icsk->icsk_af_ops->send_check(sk, skb->len, skb);
 
 
+       if (likely(tcb->flags & TCPCB_FLAG_ACK))
+               tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
+
+       if (skb->len != tcp_header_size)
+               tcp_event_data_sent(tp, skb, sk);
+
+       if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
                TCP_INC_STATS(TCP_MIB_OUTSEGS);
 
                TCP_INC_STATS(TCP_MIB_OUTSEGS);
 
-               err = tp->af_specific->queue_xmit(skb, 0);
-               if (err <= 0)
-                       return err;
+       err = icsk->icsk_af_ops->queue_xmit(skb, 0);
+       if (likely(err <= 0))
+               return err;
 
 
-               tcp_enter_cwr(tp);
+       tcp_enter_cwr(sk);
+
+       return net_xmit_eval(err);
 
 
-               /* NET_XMIT_CN is special. It does not guarantee,
-                * that this packet is lost. It tells that device
-                * is about to start to drop packets or already
-                * drops some packets of the same priority and
-                * invokes us to send less aggressively.
-                */
-               return err == NET_XMIT_CN ? 0 : err;
-       }
-       return -ENOBUFS;
 #undef SYSCTL_FLAG_TSTAMPS
 #undef SYSCTL_FLAG_WSCALE
 #undef SYSCTL_FLAG_SACK
 }
 
 
 #undef SYSCTL_FLAG_TSTAMPS
 #undef SYSCTL_FLAG_WSCALE
 #undef SYSCTL_FLAG_SACK
 }
 
 
-/* This routine just queue's the buffer 
+/* This routine just queue's the buffer
  *
  * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
  * otherwise socket can stall.
  *
  * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
  * otherwise socket can stall.
@@ -405,35 +577,40 @@ static void tcp_queue_skb(struct sock *sk, struct sk_buff *skb)
 
 static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
 
 static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
-       if (skb->len <= mss_now ||
-           !(sk->sk_route_caps & NETIF_F_TSO)) {
+       if (skb->len <= mss_now || !sk_can_gso(sk)) {
                /* Avoid the costly divide in the normal
                 * non-TSO case.
                 */
                /* Avoid the costly divide in the normal
                 * non-TSO case.
                 */
-               skb_shinfo(skb)->tso_segs = 1;
-               skb_shinfo(skb)->tso_size = 0;
+               skb_shinfo(skb)->gso_segs = 1;
+               skb_shinfo(skb)->gso_size = 0;
+               skb_shinfo(skb)->gso_type = 0;
        } else {
                unsigned int factor;
 
                factor = skb->len + (mss_now - 1);
                factor /= mss_now;
        } else {
                unsigned int factor;
 
                factor = skb->len + (mss_now - 1);
                factor /= mss_now;
-               skb_shinfo(skb)->tso_segs = factor;
-               skb_shinfo(skb)->tso_size = mss_now;
+               skb_shinfo(skb)->gso_segs = factor;
+               skb_shinfo(skb)->gso_size = mss_now;
+               skb_shinfo(skb)->gso_type = sk->sk_gso_type;
        }
 }
 
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
        }
 }
 
 /* Function to create two new TCP segments.  Shrinks the given segment
  * to the specified size and appends a new segment with the rest of the
- * packet to the list.  This won't be called frequently, I hope. 
+ * packet to the list.  This won't be called frequently, I hope.
  * Remember, these are still headerless SKBs at this point.
  */
  * Remember, these are still headerless SKBs at this point.
  */
-static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
+int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned int mss_now)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *buff;
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *buff;
-       int nsize;
+       int nsize, old_factor;
+       int nlen;
        u16 flags;
 
        u16 flags;
 
+       BUG_ON(len > skb->len);
+
+       clear_all_retrans_hints(tp);
        nsize = skb_headlen(skb) - len;
        if (nsize < 0)
                nsize = 0;
        nsize = skb_headlen(skb) - len;
        if (nsize < 0)
                nsize = 0;
@@ -447,7 +624,11 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned
        buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
        if (buff == NULL)
                return -ENOMEM; /* We'll just try again later. */
        buff = sk_stream_alloc_skb(sk, nsize, GFP_ATOMIC);
        if (buff == NULL)
                return -ENOMEM; /* We'll just try again later. */
+
        sk_charge_skb(sk, buff);
        sk_charge_skb(sk, buff);
+       nlen = skb->len - len - nsize;
+       buff->truesize += nlen;
+       skb->truesize -= nlen;
 
        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
 
        /* Correct the sequence numbers. */
        TCP_SKB_CB(buff)->seq = TCP_SKB_CB(skb)->seq + len;
@@ -458,12 +639,10 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
        TCP_SKB_CB(buff)->flags = flags;
        flags = TCP_SKB_CB(skb)->flags;
        TCP_SKB_CB(skb)->flags = flags & ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
        TCP_SKB_CB(buff)->flags = flags;
-       TCP_SKB_CB(buff)->sacked =
-               (TCP_SKB_CB(skb)->sacked &
-                (TCPCB_LOST | TCPCB_EVER_RETRANS | TCPCB_AT_TAIL));
+       TCP_SKB_CB(buff)->sacked = TCP_SKB_CB(skb)->sacked;
        TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
 
        TCP_SKB_CB(skb)->sacked &= ~TCPCB_AT_TAIL;
 
-       if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_HW) {
+       if (!skb_shinfo(skb)->nr_frags && skb->ip_summed != CHECKSUM_PARTIAL) {
                /* Copy and checksum data tail into the new buffer. */
                buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
                                                       nsize, 0);
                /* Copy and checksum data tail into the new buffer. */
                buff->csum = csum_partial_copy_nocheck(skb->data + len, skb_put(buff, nsize),
                                                       nsize, 0);
@@ -472,7 +651,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned
 
                skb->csum = csum_block_sub(skb->csum, buff->csum, len);
        } else {
 
                skb->csum = csum_block_sub(skb->csum, buff->csum, len);
        } else {
-               skb->ip_summed = CHECKSUM_HW;
+               skb->ip_summed = CHECKSUM_PARTIAL;
                skb_split(skb, buff, len);
        }
 
                skb_split(skb, buff, len);
        }
 
@@ -482,30 +661,51 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned
         * skbs, which it never sent before. --ANK
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
         * skbs, which it never sent before. --ANK
         */
        TCP_SKB_CB(buff)->when = TCP_SKB_CB(skb)->when;
-       buff->stamp = skb->stamp;
+       buff->tstamp = skb->tstamp;
 
 
-       if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
-               tp->lost_out -= tcp_skb_pcount(skb);
-               tp->left_out -= tcp_skb_pcount(skb);
-       }
+       old_factor = tcp_skb_pcount(skb);
 
        /* Fix up tso_factor for both original and new SKB.  */
        tcp_set_skb_tso_segs(sk, skb, mss_now);
        tcp_set_skb_tso_segs(sk, buff, mss_now);
 
 
        /* Fix up tso_factor for both original and new SKB.  */
        tcp_set_skb_tso_segs(sk, skb, mss_now);
        tcp_set_skb_tso_segs(sk, buff, mss_now);
 
-       if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
-               tp->lost_out += tcp_skb_pcount(skb);
-               tp->left_out += tcp_skb_pcount(skb);
-       }
+       /* If this packet has been sent out already, we must
+        * adjust the various packet counters.
+        */
+       if (!before(tp->snd_nxt, TCP_SKB_CB(buff)->end_seq)) {
+               int diff = old_factor - tcp_skb_pcount(skb) -
+                       tcp_skb_pcount(buff);
 
 
-       if (TCP_SKB_CB(buff)->sacked&TCPCB_LOST) {
-               tp->lost_out += tcp_skb_pcount(buff);
-               tp->left_out += tcp_skb_pcount(buff);
+               tp->packets_out -= diff;
+
+               if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+                       tp->sacked_out -= diff;
+               if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+                       tp->retrans_out -= diff;
+
+               if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST) {
+                       tp->lost_out -= diff;
+                       tp->left_out -= diff;
+               }
+
+               if (diff > 0) {
+                       /* Adjust Reno SACK estimate. */
+                       if (!tp->rx_opt.sack_ok) {
+                               tp->sacked_out -= diff;
+                               if ((int)tp->sacked_out < 0)
+                                       tp->sacked_out = 0;
+                               tcp_sync_left_out(tp);
+                       }
+
+                       tp->fackets_out -= diff;
+                       if ((int)tp->fackets_out < 0)
+                               tp->fackets_out = 0;
+               }
        }
 
        /* Link BUFF into the send queue. */
        skb_header_release(buff);
        }
 
        /* Link BUFF into the send queue. */
        skb_header_release(buff);
-       __skb_append(skb, buff);
+       __skb_append(skb, buff, &sk->sk_write_queue);
 
        return 0;
 }
 
        return 0;
 }
@@ -514,7 +714,7 @@ static int tcp_fragment(struct sock *sk, struct sk_buff *skb, u32 len, unsigned
  * eventually). The difference is that pulled data not copied, but
  * immediately discarded.
  */
  * eventually). The difference is that pulled data not copied, but
  * immediately discarded.
  */
-static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
+static void __pskb_trim_head(struct sk_buff *skb, int len)
 {
        int i, k, eat;
 
 {
        int i, k, eat;
 
@@ -539,7 +739,6 @@ static unsigned char *__pskb_trim_head(struct sk_buff *skb, int len)
        skb->tail = skb->data;
        skb->data_len -= len;
        skb->len = skb->data_len;
        skb->tail = skb->data;
        skb->data_len -= len;
        skb->len = skb->data_len;
-       return skb->tail;
 }
 
 int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
 }
 
 int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
@@ -548,15 +747,14 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
                return -ENOMEM;
 
            pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
                return -ENOMEM;
 
-       if (len <= skb_headlen(skb)) {
+       /* If len == headlen, we avoid __skb_pull to preserve alignment. */
+       if (unlikely(len < skb_headlen(skb)))
                __skb_pull(skb, len);
                __skb_pull(skb, len);
-       } else {
-               if (__pskb_trim_head(skb, len-skb_headlen(skb)) == NULL)
-                       return -ENOMEM;
-       }
+       else
+               __pskb_trim_head(skb, len - skb_headlen(skb));
 
        TCP_SKB_CB(skb)->seq += len;
 
        TCP_SKB_CB(skb)->seq += len;
-       skb->ip_summed = CHECKSUM_HW;
+       skb->ip_summed = CHECKSUM_PARTIAL;
 
        skb->truesize        -= len;
        sk->sk_wmem_queued   -= len;
 
        skb->truesize        -= len;
        sk->sk_wmem_queued   -= len;
@@ -572,16 +770,72 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
        return 0;
 }
 
        return 0;
 }
 
+/* Convert a path MTU into the MSS usable for sending on this socket.
+ *
+ * Starting from pmtu, the network header length and the bare TCP header
+ * are subtracted, the result is clamped to tp->rx_opt.mss_clamp, the
+ * extension header length is subtracted, a floor of 48 is applied and
+ * finally the TCP option bytes that are part of tp->tcp_header_len are
+ * removed.  The value returned is the resulting mss_now.
+ *
+ * Not accounting for SACKs here.
+ */
+int tcp_mtu_to_mss(struct sock *sk, int pmtu)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       int mss_now;
+
+       /* Calculate base mss without TCP options:
+          It is MMS_S - sizeof(tcphdr) of rfc1122
+        */
+       mss_now = pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr);
+
+       /* Clamp it (mss_clamp does not include tcp options) */
+       if (mss_now > tp->rx_opt.mss_clamp)
+               mss_now = tp->rx_opt.mss_clamp;
+
+       /* Now subtract optional transport overhead */
+       mss_now -= icsk->icsk_ext_hdr_len;
+
+       /* Then reserve room for full set of TCP options and 8 bytes of data.
+        * The floor is applied before the option bytes are subtracted
+        * below, so the returned value can be smaller than 48.
+        */
+       if (mss_now < 48)
+               mss_now = 48;
+
+       /* Now subtract TCP options size, not including SACKs */
+       mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+
+       return mss_now;
+}
+
+/* Counterpart of tcp_mtu_to_mss(): rebuild an MTU from an MSS by adding
+ * back the TCP header length (tp->tcp_header_len, options included),
+ * the extension header length and the network header length.
+ * No clamping is performed in this direction.
+ */
+int tcp_mss_to_mtu(struct sock *sk, int mss)
+{
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+       const struct tcp_sock *tp = tcp_sk(sk);
+
+       return mss +
+              tp->tcp_header_len +
+              icsk->icsk_ext_hdr_len +
+              icsk->icsk_af_ops->net_header_len;
+}
+
+/* Reset the MTU probing state kept in the connection socket:
+ * no probe outstanding, probing enabled only when the
+ * sysctl_tcp_mtu_probing value is greater than 1, and a search
+ * interval of [search_low, search_high].
+ */
+void tcp_mtup_init(struct sock *sk)
+{
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+
+       /* No probe is in flight yet. */
+       icsk->icsk_mtup.probe_size = 0;
+       icsk->icsk_mtup.enabled = (sysctl_tcp_mtu_probing > 1);
+
+       /* Lower bound: the MTU corresponding to the base MSS sysctl. */
+       icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, sysctl_tcp_base_mss);
+
+       /* Upper bound: MSS clamp plus bare TCP header and network header. */
+       icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+                                     sizeof(struct tcphdr) +
+                                     icsk->icsk_af_ops->net_header_len;
+}
+
 /* This function synchronize snd mss to current pmtu/exthdr set.
 
    tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
    for TCP options, but includes only bare TCP header.
 
    tp->rx_opt.mss_clamp is mss negotiated at connection setup.
 /* This function synchronize snd mss to current pmtu/exthdr set.
 
    tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
    for TCP options, but includes only bare TCP header.
 
    tp->rx_opt.mss_clamp is mss negotiated at connection setup.
-   It is minumum of user_mss and mss received with SYN.
+   It is minimum of user_mss and mss received with SYN.
    It also does not include TCP options.
 
    It also does not include TCP options.
 
-   tp->pmtu_cookie is last pmtu, seen by this function.
+   inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function.
 
    tp->mss_cache is current effective sending mss, including
    all tcp options except for SACKs. It is evaluated,
 
    tp->mss_cache is current effective sending mss, including
    all tcp options except for SACKs. It is evaluated,
@@ -591,40 +845,29 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
    NOTE1. rfc1122 clearly states that advertised MSS
    DOES NOT include either tcp or ip options.
 
    NOTE1. rfc1122 clearly states that advertised MSS
    DOES NOT include either tcp or ip options.
 
-   NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside
-   this function.                      --ANK (980731)
+   NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache
+   are READ ONLY outside this function.                --ANK (980731)
  */
 
 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 {
        struct tcp_sock *tp = tcp_sk(sk);
  */
 
 unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu)
 {
        struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
        int mss_now;
 
        int mss_now;
 
-       /* Calculate base mss without TCP options:
-          It is MMS_S - sizeof(tcphdr) of rfc1122
-        */
-       mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr);
-
-       /* Clamp it (mss_clamp does not include tcp options) */
-       if (mss_now > tp->rx_opt.mss_clamp)
-               mss_now = tp->rx_opt.mss_clamp;
-
-       /* Now subtract optional transport overhead */
-       mss_now -= tp->ext_header_len;
+       if (icsk->icsk_mtup.search_high > pmtu)
+               icsk->icsk_mtup.search_high = pmtu;
 
 
-       /* Then reserve room for full set of TCP options and 8 bytes of data */
-       if (mss_now < 48)
-               mss_now = 48;
-
-       /* Now subtract TCP options size, not including SACKs */
-       mss_now -= tp->tcp_header_len - sizeof(struct tcphdr);
+       mss_now = tcp_mtu_to_mss(sk, pmtu);
 
        /* Bound mss with half of window */
        if (tp->max_window && mss_now > (tp->max_window>>1))
                mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
 
        /* And store cached results */
 
        /* Bound mss with half of window */
        if (tp->max_window && mss_now > (tp->max_window>>1))
                mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len);
 
        /* And store cached results */
-       tp->pmtu_cookie = pmtu;
+       icsk->icsk_pmtu_cookie = pmtu;
+       if (icsk->icsk_mtup.enabled)
+               mss_now = min(mss_now, tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_low));
        tp->mss_cache = mss_now;
 
        return mss_now;
        tp->mss_cache = mss_now;
 
        return mss_now;
@@ -647,14 +890,12 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 
        mss_now = tp->mss_cache;
 
 
        mss_now = tp->mss_cache;
 
-       if (large_allowed &&
-           (sk->sk_route_caps & NETIF_F_TSO) &&
-           !tp->urg_mode)
+       if (large_allowed && sk_can_gso(sk) && !tp->urg_mode)
                doing_tso = 1;
 
        if (dst) {
                u32 mtu = dst_mtu(dst);
                doing_tso = 1;
 
        if (dst) {
                u32 mtu = dst_mtu(dst);
-               if (mtu != tp->pmtu_cookie)
+               if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
                        mss_now = tcp_sync_mss(sk, mtu);
        }
 
                        mss_now = tcp_sync_mss(sk, mtu);
        }
 
@@ -662,12 +903,18 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
                mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
                            (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
 
                mss_now -= (TCPOLEN_SACK_BASE_ALIGNED +
                            (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK));
 
+#ifdef CONFIG_TCP_MD5SIG
+       if (tp->af_specific->md5_lookup(sk, sk))
+               mss_now -= TCPOLEN_MD5SIG_ALIGNED;
+#endif
+
        xmit_size_goal = mss_now;
 
        if (doing_tso) {
        xmit_size_goal = mss_now;
 
        if (doing_tso) {
-               xmit_size_goal = 65535 -
-                       tp->af_specific->net_header_len -
-                       tp->ext_header_len - tp->tcp_header_len;
+               xmit_size_goal = (65535 -
+                                 inet_csk(sk)->icsk_af_ops->net_header_len -
+                                 inet_csk(sk)->icsk_ext_hdr_len -
+                                 tp->tcp_header_len);
 
                if (tp->max_window &&
                    (xmit_size_goal > (tp->max_window >> 1)))
 
                if (tp->max_window &&
                    (xmit_size_goal > (tp->max_window >> 1)))
@@ -683,7 +930,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
 
 /* Congestion window validation. (RFC2861) */
 
 
 /* Congestion window validation. (RFC2861) */
 
-static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
+static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
 {
        __u32 packets_out = tp->packets_out;
 
 {
        __u32 packets_out = tp->packets_out;
 
@@ -696,7 +943,7 @@ static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp)
                if (tp->packets_out > tp->snd_cwnd_used)
                        tp->snd_cwnd_used = tp->packets_out;
 
                if (tp->packets_out > tp->snd_cwnd_used)
                        tp->snd_cwnd_used = tp->packets_out;
 
-               if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= tp->rto)
+               if ((s32)(tcp_time_stamp - tp->snd_cwnd_stamp) >= inet_csk(sk)->icsk_rto)
                        tcp_cwnd_application_limited(sk);
        }
 }
                        tcp_cwnd_application_limited(sk);
        }
 }
@@ -718,7 +965,8 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
        u32 in_flight, cwnd;
 
        /* Don't be strict about the congestion window for the final FIN.  */
        u32 in_flight, cwnd;
 
        /* Don't be strict about the congestion window for the final FIN.  */
-       if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
+       if ((TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN) &&
+           tcp_skb_pcount(skb) == 1)
                return 1;
 
        in_flight = tcp_packets_in_flight(tp);
                return 1;
 
        in_flight = tcp_packets_in_flight(tp);
@@ -732,13 +980,13 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk
 /* This must be invoked the first time we consider transmitting
  * SKB onto the wire.
  */
 /* This must be invoked the first time we consider transmitting
  * SKB onto the wire.
  */
-static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
+static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now)
 {
        int tso_segs = tcp_skb_pcount(skb);
 
        if (!tso_segs ||
            (tso_segs > 1 &&
 {
        int tso_segs = tcp_skb_pcount(skb);
 
        if (!tso_segs ||
            (tso_segs > 1 &&
-            skb_shinfo(skb)->tso_size != mss_now)) {
+            tcp_skb_mss(skb) != mss_now)) {
                tcp_set_skb_tso_segs(sk, skb, mss_now);
                tso_segs = tcp_skb_pcount(skb);
        }
                tcp_set_skb_tso_segs(sk, skb, mss_now);
                tso_segs = tcp_skb_pcount(skb);
        }
@@ -760,7 +1008,7 @@ static inline int tcp_minshall_check(const struct tcp_sock *tp)
  */
 
 static inline int tcp_nagle_check(const struct tcp_sock *tp,
  */
 
 static inline int tcp_nagle_check(const struct tcp_sock *tp,
-                                 const struct sk_buff *skb, 
+                                 const struct sk_buff *skb,
                                  unsigned mss_now, int nonagle)
 {
        return (skb->len < mss_now &&
                                  unsigned mss_now, int nonagle)
 {
        return (skb->len < mss_now &&
@@ -830,7 +1078,7 @@ static unsigned int tcp_snd_test(struct sock *sk, struct sk_buff *skb,
        return cwnd_quota;
 }
 
        return cwnd_quota;
 }
 
-static inline int tcp_skb_is_last(const struct sock *sk, 
+static inline int tcp_skb_is_last(const struct sock *sk,
                                  const struct sk_buff *skb)
 {
        return skb->next == (struct sk_buff *)&sk->sk_write_queue;
                                  const struct sk_buff *skb)
 {
        return skb->next == (struct sk_buff *)&sk->sk_write_queue;
@@ -868,7 +1116,8 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
        if (unlikely(buff == NULL))
                return -ENOMEM;
 
        if (unlikely(buff == NULL))
                return -ENOMEM;
 
-       buff->truesize = nlen;
+       sk_charge_skb(sk, buff);
+       buff->truesize += nlen;
        skb->truesize -= nlen;
 
        /* Correct the sequence numbers. */
        skb->truesize -= nlen;
 
        /* Correct the sequence numbers. */
@@ -884,7 +1133,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
        /* This packet was never sent out yet, so no SACK bits. */
        TCP_SKB_CB(buff)->sacked = 0;
 
        /* This packet was never sent out yet, so no SACK bits. */
        TCP_SKB_CB(buff)->sacked = 0;
 
-       buff->ip_summed = skb->ip_summed = CHECKSUM_HW;
+       buff->ip_summed = skb->ip_summed = CHECKSUM_PARTIAL;
        skb_split(skb, buff, len);
 
        /* Fix up tso_factor for both original and new SKB.  */
        skb_split(skb, buff, len);
 
        /* Fix up tso_factor for both original and new SKB.  */
@@ -893,7 +1142,7 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
 
        /* Link BUFF into the send queue. */
        skb_header_release(buff);
 
        /* Link BUFF into the send queue. */
        skb_header_release(buff);
-       __skb_append(skb, buff);
+       __skb_append(skb, buff, &sk->sk_write_queue);
 
        return 0;
 }
 
        return 0;
 }
@@ -905,13 +1154,18 @@ static int tso_fragment(struct sock *sk, struct sk_buff *skb, unsigned int len,
  */
 static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
 {
  */
 static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_buff *skb)
 {
+       const struct inet_connection_sock *icsk = inet_csk(sk);
        u32 send_win, cong_win, limit, in_flight;
 
        if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
        u32 send_win, cong_win, limit, in_flight;
 
        if (TCP_SKB_CB(skb)->flags & TCPCB_FLAG_FIN)
-               return 0;
+               goto send_now;
 
 
-       if (tp->ca_state != TCP_CA_Open)
-               return 0;
+       if (icsk->icsk_ca_state != TCP_CA_Open)
+               goto send_now;
+
+       /* Defer for less than two clock ticks.
+        *
+        * tp->tso_deferred is 0 while no deferral is in progress and
+        * holds 1 | (jiffies << 1) from the moment deferring started,
+        * so the elapsed time can only be measured while it is non-zero.
+        * Testing "!tp->tso_deferred" here would compare jiffies against
+        * a zero stamp and send immediately almost every time.
+        */
+       if (tp->tso_deferred &&
+           ((jiffies << 1) >> 1) - (tp->tso_deferred >> 1) > 1)
+               goto send_now;
 
        in_flight = tcp_packets_in_flight(tp);
 
 
        in_flight = tcp_packets_in_flight(tp);
 
@@ -925,9 +1179,9 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
 
        limit = min(send_win, cong_win);
 
 
        limit = min(send_win, cong_win);
 
-       /* If sk_send_head can be sent fully now, just do it.  */
-       if (skb->len <= limit)
-               return 0;
+       /* If a full-sized TSO skb can be sent, do it. */
+       if (limit >= 65536)
+               goto send_now;
 
        if (sysctl_tcp_tso_win_divisor) {
                u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
 
        if (sysctl_tcp_tso_win_divisor) {
                u32 chunk = min(tp->snd_wnd, tp->snd_cwnd * tp->mss_cache);
@@ -937,7 +1191,7 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
                 */
                chunk /= sysctl_tcp_tso_win_divisor;
                if (limit >= chunk)
                 */
                chunk /= sysctl_tcp_tso_win_divisor;
                if (limit >= chunk)
-                       return 0;
+                       goto send_now;
        } else {
                /* Different approach, try not to defer past a single
                 * ACK.  Receiver should ACK every other full sized
        } else {
                /* Different approach, try not to defer past a single
                 * ACK.  Receiver should ACK every other full sized
@@ -945,13 +1199,152 @@ static int tcp_tso_should_defer(struct sock *sk, struct tcp_sock *tp, struct sk_
                 * then send now.
                 */
                if (limit > tcp_max_burst(tp) * tp->mss_cache)
                 * then send now.
                 */
                if (limit > tcp_max_burst(tp) * tp->mss_cache)
-                       return 0;
+                       goto send_now;
        }
 
        /* Ok, it looks like it is advisable to defer.  */
        }
 
        /* Ok, it looks like it is advisable to defer.  */
+       tp->tso_deferred = 1 | (jiffies<<1);
+
        return 1;
        return 1;
+
+send_now:
+       tp->tso_deferred = 0;
+       return 0;
+}
+
+/* Create a new MTU probe if we are ready.
+ *
+ * The probe is one segment of twice the current tp->mss_cache.  It is
+ * assembled by copying data out of the skbs starting at
+ * sk->sk_send_head into a freshly allocated skb.  Source skbs that are
+ * consumed completely are unlinked and freed; a partially consumed one
+ * has its head trimmed and its sequence number advanced.
+ *
+ * Returns 0 if we should wait to probe (the probe does not currently
+ *           fit into the receive window or the congestion window),
+ *         1 if a probe was sent,
+ *         -1 otherwise (probing disabled or already in progress, not
+ *           enough queued data, allocation or transmit failure, ...) */
+static int tcp_mtu_probe(struct sock *sk)
+{
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct sk_buff *skb, *nskb, *next;
+       int len;
+       int probe_size;
+       unsigned int pif;
+       int copy;
+       int mss_now;
+
+       /* Not currently probing/verifying,
+        * not in recovery,
+        * have enough cwnd, and
+        * not SACKing (the variable headers throw things off) */
+       if (!icsk->icsk_mtup.enabled ||
+           icsk->icsk_mtup.probe_size ||
+           inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
+           tp->snd_cwnd < 11 ||
+           tp->rx_opt.eff_sacks)
+               return -1;
+
+       /* Very simple search strategy: just double the MSS.
+        * mss_now is only used further down, to recompute the TSO
+        * segment count of a partially consumed paged skb.  Give up
+        * when the doubled size exceeds the MSS that corresponds to
+        * search_high. */
+       mss_now = tcp_current_mss(sk, 0);
+       probe_size = 2*tp->mss_cache;
+       if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
+               /* TODO: set timer for probe_converge_event */
+               return -1;
+       }
+
+       /* Have enough data in the send queue to probe?
+        * Walk forward from the send head, summing skb lengths, until
+        * probe_size bytes are covered or the end of the queue is hit. */
+       len = 0;
+       if ((skb = sk->sk_send_head) == NULL)
+               return -1;
+       while ((len += skb->len) < probe_size && !tcp_skb_is_last(sk, skb))
+               skb = skb->next;
+       if (len < probe_size)
+               return -1;
+
+       /* Receive window check.
+        * NOTE(review): skb is the last skb visited by the walk above,
+        * not sk->sk_send_head, so the sequence number tested here is
+        * not the one the probe will start at when the walk advanced.
+        * Confirm this is intended. */
+       if (after(TCP_SKB_CB(skb)->seq + probe_size, tp->snd_una + tp->snd_wnd)) {
+               if (tp->snd_wnd < probe_size)
+                       return -1;
+               else
+                       return 0;
+       }
+
+       /* Do we need to wait to drain cwnd? */
+       pif = tcp_packets_in_flight(tp);
+       if (pif + 2 > tp->snd_cwnd) {
+               /* With no packets in flight, don't stall. */
+               if (pif == 0)
+                       return -1;
+               else
+                       return 0;
+       }
+
+       /* We're allowed to probe.  Build it now.
+        * The new skb is linked in directly in front of the current
+        * send head and then becomes the send head itself. */
+       if ((nskb = sk_stream_alloc_skb(sk, probe_size, GFP_ATOMIC)) == NULL)
+               return -1;
+       sk_charge_skb(sk, nskb);
+
+       skb = sk->sk_send_head;
+       __skb_insert(nskb, skb->prev, skb, &sk->sk_write_queue);
+       sk->sk_send_head = nskb;
+
+       TCP_SKB_CB(nskb)->seq = TCP_SKB_CB(skb)->seq;
+       TCP_SKB_CB(nskb)->end_seq = TCP_SKB_CB(skb)->seq + probe_size;
+       TCP_SKB_CB(nskb)->flags = TCPCB_FLAG_ACK;
+       TCP_SKB_CB(nskb)->sacked = 0;
+       nskb->csum = 0;
+       nskb->ip_summed = skb->ip_summed;
+
+       len = 0;
+       while (len < probe_size) {
+               next = skb->next;
+
+               copy = min_t(int, skb->len, probe_size - len);
+               if (nskb->ip_summed)
+                       skb_copy_bits(skb, 0, skb_put(nskb, copy), copy);
+               else
+                       nskb->csum = skb_copy_and_csum_bits(skb, 0,
+                                        skb_put(nskb, copy), copy, nskb->csum);
+
+               if (skb->len <= copy) {
+                       /* We've eaten all the data from this skb.
+                        * Throw it away.  All of its flags are carried
+                        * over to the probe. */
+                       TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags;
+                       __skb_unlink(skb, &sk->sk_write_queue);
+                       sk_stream_free_skb(sk, skb);
+               } else {
+                       TCP_SKB_CB(nskb)->flags |= TCP_SKB_CB(skb)->flags &
+                                                  ~(TCPCB_FLAG_FIN|TCPCB_FLAG_PSH);
+                       if (!skb_shinfo(skb)->nr_frags) {
+                               skb_pull(skb, copy);
+                               if (skb->ip_summed != CHECKSUM_PARTIAL)
+                                       skb->csum = csum_partial(skb->data, skb->len, 0);
+                       } else {
+                               __pskb_trim_head(skb, copy);
+                               tcp_set_skb_tso_segs(sk, skb, mss_now);
+                       }
+                       TCP_SKB_CB(skb)->seq += copy;
+               }
+
+               len += copy;
+               skb = next;
+       }
+       tcp_init_tso_segs(sk, nskb, nskb->len);
+
+       /* We're ready to send.  If this fails, the probe will
+        * be resegmented into mss-sized pieces by tcp_write_xmit(). */
+       TCP_SKB_CB(nskb)->when = tcp_time_stamp;
+       if (!tcp_transmit_skb(sk, nskb, 1, GFP_ATOMIC)) {
+               /* Decrement cwnd here because we are sending
+               * effectively two packets. */
+               tp->snd_cwnd--;
+               update_send_head(sk, tp, nskb);
+
+               icsk->icsk_mtup.probe_size = tcp_mss_to_mtu(sk, nskb->len);
+               tp->mtu_probe.probe_seq_start = TCP_SKB_CB(nskb)->seq;
+               tp->mtu_probe.probe_seq_end = TCP_SKB_CB(nskb)->end_seq;
+
+               return 1;
+       }
+
+       return -1;
 }
 
 }
 
+
 /* This routine writes packets to the network.  It advances the
  * send_head.  This happens as incoming acks open up the remote
  * window for us.
 /* This routine writes packets to the network.  It advances the
  * send_head.  This happens as incoming acks open up the remote
  * window for us.
@@ -965,6 +1358,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
        struct sk_buff *skb;
        unsigned int tso_segs, sent_pkts;
        int cwnd_quota;
        struct sk_buff *skb;
        unsigned int tso_segs, sent_pkts;
        int cwnd_quota;
+       int result;
 
        /* If we are closed, the bytes will have to remain here.
         * In time closedown will finish, we empty the write queue and all
 
        /* If we are closed, the bytes will have to remain here.
         * In time closedown will finish, we empty the write queue and all
@@ -974,6 +1368,14 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
                return 0;
 
        sent_pkts = 0;
                return 0;
 
        sent_pkts = 0;
+
+       /* Do MTU probing. */
+       if ((result = tcp_mtu_probe(sk)) == 0) {
+               return 0;
+       } else if (result > 0) {
+               sent_pkts = 1;
+       }
+
        while ((skb = sk->sk_send_head)) {
                unsigned int limit;
 
        while ((skb = sk->sk_send_head)) {
                unsigned int limit;
 
@@ -1016,7 +1418,7 @@ static int tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle)
 
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
 
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
-               if (unlikely(tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))))
+               if (unlikely(tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC)))
                        break;
 
                /* Advance the send_head.  This one is sent out.
                        break;
 
                /* Advance the send_head.  This one is sent out.
@@ -1089,7 +1491,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
                /* Send it out now. */
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
                /* Send it out now. */
                TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
-               if (likely(!tcp_transmit_skb(sk, skb_clone(skb, sk->sk_allocation)))) {
+               if (likely(!tcp_transmit_skb(sk, skb, 1, sk->sk_allocation))) {
                        update_send_head(sk, tp, skb);
                        tcp_cwnd_validate(sk, tp);
                        return;
                        update_send_head(sk, tp, skb);
                        tcp_cwnd_validate(sk, tp);
                        return;
@@ -1099,7 +1501,7 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
 
 /* This function returns the amount that we can raise the
  * usable window based on the following constraints
 
 /* This function returns the amount that we can raise the
  * usable window based on the following constraints
- *  
+ *
  * 1. The window can never be shrunk once it is offered (RFC 793)
  * 2. We limit memory per socket
  *
  * 1. The window can never be shrunk once it is offered (RFC 793)
  * 2. We limit memory per socket
  *
@@ -1118,12 +1520,12 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
  * side SWS prevention criteria. The problem is that under this rule
  * a stream of single byte packets will cause the right side of the
  * window to always advance by a single byte.
  * side SWS prevention criteria. The problem is that under this rule
  * a stream of single byte packets will cause the right side of the
  * window to always advance by a single byte.
- * 
+ *
  * Of course, if the sender implements sender side SWS prevention
  * then this will not be a problem.
  * Of course, if the sender implements sender side SWS prevention
  * then this will not be a problem.
- * 
+ *
  * BSD seems to make the following compromise:
  * BSD seems to make the following compromise:
- * 
+ *
  *     If the free space is less than the 1/4 of the maximum
  *     space available and the free space is less than 1/2 mss,
  *     then set the window to 0.
  *     If the free space is less than the 1/4 of the maximum
  *     space available and the free space is less than 1/2 mss,
  *     then set the window to 0.
@@ -1151,23 +1553,24 @@ void tcp_push_one(struct sock *sk, unsigned int mss_now)
  */
 u32 __tcp_select_window(struct sock *sk)
 {
  */
 u32 __tcp_select_window(struct sock *sk)
 {
+       struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
-       /* MSS for the peer's data.  Previous verions used mss_clamp
+       /* MSS for the peer's data.  Previous versions used mss_clamp
         * here.  I don't know if the value based on our guesses
         * of peer's MSS is better for the performance.  It's more correct
         * but may be worse for the performance because of rcv_mss
         * fluctuations.  --SAW  1998/11/1
         */
         * here.  I don't know if the value based on our guesses
         * of peer's MSS is better for the performance.  It's more correct
         * but may be worse for the performance because of rcv_mss
         * fluctuations.  --SAW  1998/11/1
         */
-       int mss = tp->ack.rcv_mss;
+       int mss = icsk->icsk_ack.rcv_mss;
        int free_space = tcp_space(sk);
        int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
        int window;
 
        if (mss > full_space)
        int free_space = tcp_space(sk);
        int full_space = min_t(int, tp->window_clamp, tcp_full_space(sk));
        int window;
 
        if (mss > full_space)
-               mss = full_space; 
+               mss = full_space;
 
        if (free_space < full_space/2) {
 
        if (free_space < full_space/2) {
-               tp->ack.quick = 0;
+               icsk->icsk_ack.quick = 0;
 
                if (tcp_memory_pressure)
                        tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
 
                if (tcp_memory_pressure)
                        tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U*tp->advmss);
@@ -1241,15 +1644,18 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
                BUG_ON(tcp_skb_pcount(skb) != 1 ||
                       tcp_skb_pcount(next_skb) != 1);
 
                BUG_ON(tcp_skb_pcount(skb) != 1 ||
                       tcp_skb_pcount(next_skb) != 1);
 
-               /* Ok.  We will be able to collapse the packet. */
-               __skb_unlink(next_skb, next_skb->list);
+               /* changing transmit queue under us so clear hints */
+               clear_all_retrans_hints(tp);
+
+               /* Ok.  We will be able to collapse the packet. */
+               __skb_unlink(next_skb, &sk->sk_write_queue);
 
                memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
 
 
                memcpy(skb_put(skb, next_skb_size), next_skb->data, next_skb_size);
 
-               if (next_skb->ip_summed == CHECKSUM_HW)
-                       skb->ip_summed = CHECKSUM_HW;
+               if (next_skb->ip_summed == CHECKSUM_PARTIAL)
+                       skb->ip_summed = CHECKSUM_PARTIAL;
 
 
-               if (skb->ip_summed != CHECKSUM_HW)
+               if (skb->ip_summed != CHECKSUM_PARTIAL)
                        skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
 
                /* Update sequence range on original skb. */
                        skb->csum = csum_block_add(skb->csum, next_skb->csum, skb_size);
 
                /* Update sequence range on original skb. */
@@ -1285,18 +1691,19 @@ static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *skb, int m
 }
 
 /* Do a simple retransmit without using the backoff mechanisms in
 }
 
 /* Do a simple retransmit without using the backoff mechanisms in
- * tcp_timer. This is used for path mtu discovery. 
+ * tcp_timer. This is used for path mtu discovery.
  * The socket is already locked here.
  * The socket is already locked here.
- */ 
+ */
 void tcp_simple_retransmit(struct sock *sk)
 {
 void tcp_simple_retransmit(struct sock *sk)
 {
+       const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        unsigned int mss = tcp_current_mss(sk, 0);
        int lost = 0;
 
        sk_stream_for_retrans_queue(skb, sk) {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        unsigned int mss = tcp_current_mss(sk, 0);
        int lost = 0;
 
        sk_stream_for_retrans_queue(skb, sk) {
-               if (skb->len > mss && 
+               if (skb->len > mss &&
                    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
                        if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
                                TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
                    !(TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_ACKED)) {
                        if (TCP_SKB_CB(skb)->sacked&TCPCB_SACKED_RETRANS) {
                                TCP_SKB_CB(skb)->sacked &= ~TCPCB_SACKED_RETRANS;
@@ -1310,22 +1717,24 @@ void tcp_simple_retransmit(struct sock *sk)
                }
        }
 
                }
        }
 
+       clear_all_retrans_hints(tp);
+
        if (!lost)
                return;
 
        tcp_sync_left_out(tp);
 
        if (!lost)
                return;
 
        tcp_sync_left_out(tp);
 
-       /* Don't muck with the congestion window here.
+       /* Don't muck with the congestion window here.
         * Reason is that we do not increase amount of _data_
         * in network, but units changed and effective
         * cwnd/ssthresh really reduced now.
         */
         * Reason is that we do not increase amount of _data_
         * in network, but units changed and effective
         * cwnd/ssthresh really reduced now.
         */
-       if (tp->ca_state != TCP_CA_Loss) {
+       if (icsk->icsk_ca_state != TCP_CA_Loss) {
                tp->high_seq = tp->snd_nxt;
                tp->high_seq = tp->snd_nxt;
-               tp->snd_ssthresh = tcp_current_ssthresh(tp);
+               tp->snd_ssthresh = tcp_current_ssthresh(sk);
                tp->prior_ssthresh = 0;
                tp->undo_marker = 0;
                tp->prior_ssthresh = 0;
                tp->undo_marker = 0;
-               tcp_set_ca_state(tp, TCP_CA_Loss);
+               tcp_set_ca_state(sk, TCP_CA_Loss);
        }
        tcp_xmit_retransmit_queue(sk);
 }
        }
        tcp_xmit_retransmit_queue(sk);
 }
@@ -1337,11 +1746,17 @@ void tcp_simple_retransmit(struct sock *sk)
 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
 {
        struct tcp_sock *tp = tcp_sk(sk);
-       unsigned int cur_mss = tcp_current_mss(sk, 0);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       unsigned int cur_mss = tcp_current_mss(sk, 0);
        int err;
 
        int err;
 
+       /* Inconclusive MTU probe */
+       if (icsk->icsk_mtup.probe_size) {
+               icsk->icsk_mtup.probe_size = 0;
+       }
+
        /* Do not send more than we queued. 1/4 is reserved for possible
        /* Do not send more than we queued. 1/4 is reserved for possible
-        * copying overhead: frgagmentation, tunneling, mangling etc.
+        * copying overhead: fragmentation, tunneling, mangling etc.
         */
        if (atomic_read(&sk->sk_wmem_alloc) >
            min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
         */
        if (atomic_read(&sk->sk_wmem_alloc) >
            min(sk->sk_wmem_queued + (sk->sk_wmem_queued >> 2), sk->sk_sndbuf))
@@ -1350,12 +1765,6 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
        if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
                if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
                        BUG();
        if (before(TCP_SKB_CB(skb)->seq, tp->snd_una)) {
                if (before(TCP_SKB_CB(skb)->end_seq, tp->snd_una))
                        BUG();
-
-               if (sk->sk_route_caps & NETIF_F_TSO) {
-                       sk->sk_route_caps &= ~NETIF_F_TSO;
-                       sock_set_flag(sk, SOCK_NO_LARGESEND);
-               }
-
                if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
                        return -ENOMEM;
        }
                if (tcp_trim_head(sk, skb, tp->snd_una - TCP_SKB_CB(skb)->seq))
                        return -ENOMEM;
        }
@@ -1370,22 +1779,8 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
                return -EAGAIN;
 
        if (skb->len > cur_mss) {
                return -EAGAIN;
 
        if (skb->len > cur_mss) {
-               int old_factor = tcp_skb_pcount(skb);
-               int diff;
-
                if (tcp_fragment(sk, skb, cur_mss, cur_mss))
                        return -ENOMEM; /* We'll try again later. */
                if (tcp_fragment(sk, skb, cur_mss, cur_mss))
                        return -ENOMEM; /* We'll try again later. */
-
-               /* New SKB created, account for it. */
-               diff = old_factor - tcp_skb_pcount(skb) -
-                      tcp_skb_pcount(skb->next);
-               tp->packets_out -= diff;
-
-               if (diff > 0) {
-                       tp->fackets_out -= diff;
-                       if ((int)tp->fackets_out < 0)
-                               tp->fackets_out = 0;
-               }
        }
 
        /* Collapse two adjacent packets if worthwhile and we can. */
        }
 
        /* Collapse two adjacent packets if worthwhile and we can. */
@@ -1398,7 +1793,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
           (sysctl_tcp_retrans_collapse != 0))
                tcp_retrans_try_collapse(sk, skb, cur_mss);
 
           (sysctl_tcp_retrans_collapse != 0))
                tcp_retrans_try_collapse(sk, skb, cur_mss);
 
-       if(tp->af_specific->rebuild_header(sk))
+       if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
                return -EHOSTUNREACH; /* Routing failure or similar. */
 
        /* Some Solaris stacks overoptimize and ignore the FIN on a
                return -EHOSTUNREACH; /* Routing failure or similar. */
 
        /* Some Solaris stacks overoptimize and ignore the FIN on a
@@ -1410,8 +1805,9 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
           tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
                if (!pskb_trim(skb, 0)) {
                        TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
           tp->snd_una == (TCP_SKB_CB(skb)->end_seq - 1)) {
                if (!pskb_trim(skb, 0)) {
                        TCP_SKB_CB(skb)->seq = TCP_SKB_CB(skb)->end_seq - 1;
-                       skb_shinfo(skb)->tso_segs = 1;
-                       skb_shinfo(skb)->tso_size = 0;
+                       skb_shinfo(skb)->gso_segs = 1;
+                       skb_shinfo(skb)->gso_size = 0;
+                       skb_shinfo(skb)->gso_type = 0;
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                }
                        skb->ip_summed = CHECKSUM_NONE;
                        skb->csum = 0;
                }
@@ -1422,9 +1818,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
         */
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
         */
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
-       err = tcp_transmit_skb(sk, (skb_cloned(skb) ?
-                                   pskb_copy(skb, GFP_ATOMIC):
-                                   skb_clone(skb, GFP_ATOMIC)));
+       err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 
        if (err == 0) {
                /* Update global TCP statistics. */
 
        if (err == 0) {
                /* Update global TCP statistics. */
@@ -1465,15 +1859,28 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb)
  */
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
  */
 void tcp_xmit_retransmit_queue(struct sock *sk)
 {
+       const struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
-       int packet_cnt = tp->lost_out;
+       int packet_cnt;
+
+       if (tp->retransmit_skb_hint) {
+               skb = tp->retransmit_skb_hint;
+               packet_cnt = tp->retransmit_cnt_hint;
+       }else{
+               skb = sk->sk_write_queue.next;
+               packet_cnt = 0;
+       }
 
        /* First pass: retransmit lost packets. */
 
        /* First pass: retransmit lost packets. */
-       if (packet_cnt) {
-               sk_stream_for_retrans_queue(skb, sk) {
+       if (tp->lost_out) {
+               sk_stream_for_retrans_queue_from(skb, sk) {
                        __u8 sacked = TCP_SKB_CB(skb)->sacked;
 
                        __u8 sacked = TCP_SKB_CB(skb)->sacked;
 
+                       /* we could do better than to assign each time */
+                       tp->retransmit_skb_hint = skb;
+                       tp->retransmit_cnt_hint = packet_cnt;
+
                        /* Assume this retransmit will generate
                         * only one packet for congestion window
                         * calculation purposes.  This works because
                        /* Assume this retransmit will generate
                         * only one packet for congestion window
                         * calculation purposes.  This works because
@@ -1484,22 +1891,26 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                        if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                                return;
 
                        if (tcp_packets_in_flight(tp) >= tp->snd_cwnd)
                                return;
 
-                       if (sacked&TCPCB_LOST) {
+                       if (sacked & TCPCB_LOST) {
                                if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
                                if (!(sacked&(TCPCB_SACKED_ACKED|TCPCB_SACKED_RETRANS))) {
-                                       if (tcp_retransmit_skb(sk, skb))
+                                       if (tcp_retransmit_skb(sk, skb)) {
+                                               tp->retransmit_skb_hint = NULL;
                                                return;
                                                return;
-                                       if (tp->ca_state != TCP_CA_Loss)
+                                       }
+                                       if (icsk->icsk_ca_state != TCP_CA_Loss)
                                                NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
                                        else
                                                NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
 
                                        if (skb ==
                                            skb_peek(&sk->sk_write_queue))
                                                NET_INC_STATS_BH(LINUX_MIB_TCPFASTRETRANS);
                                        else
                                                NET_INC_STATS_BH(LINUX_MIB_TCPSLOWSTARTRETRANS);
 
                                        if (skb ==
                                            skb_peek(&sk->sk_write_queue))
-                                               tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+                                               inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+                                                                         inet_csk(sk)->icsk_rto,
+                                                                         TCP_RTO_MAX);
                                }
 
                                }
 
-                               packet_cnt -= tcp_skb_pcount(skb);
-                               if (packet_cnt <= 0)
+                               packet_cnt += tcp_skb_pcount(skb);
+                               if (packet_cnt >= tp->lost_out)
                                        break;
                        }
                }
                                        break;
                        }
                }
@@ -1508,7 +1919,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
        /* OK, demanded retransmission is finished. */
 
        /* Forward retransmissions are possible only during Recovery. */
        /* OK, demanded retransmission is finished. */
 
        /* Forward retransmissions are possible only during Recovery. */
-       if (tp->ca_state != TCP_CA_Recovery)
+       if (icsk->icsk_ca_state != TCP_CA_Recovery)
                return;
 
        /* No forward retransmissions in Reno are possible. */
                return;
 
        /* No forward retransmissions in Reno are possible. */
@@ -1525,9 +1936,18 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
        if (tcp_may_send_now(sk, tp))
                return;
 
        if (tcp_may_send_now(sk, tp))
                return;
 
-       packet_cnt = 0;
+       if (tp->forward_skb_hint) {
+               skb = tp->forward_skb_hint;
+               packet_cnt = tp->forward_cnt_hint;
+       } else{
+               skb = sk->sk_write_queue.next;
+               packet_cnt = 0;
+       }
+
+       sk_stream_for_retrans_queue_from(skb, sk) {
+               tp->forward_cnt_hint = packet_cnt;
+               tp->forward_skb_hint = skb;
 
 
-       sk_stream_for_retrans_queue(skb, sk) {
                /* Similar to the retransmit loop above we
                 * can pretend that the retransmitted SKB
                 * we send out here will be composed of one
                /* Similar to the retransmit loop above we
                 * can pretend that the retransmitted SKB
                 * we send out here will be composed of one
@@ -1544,11 +1964,15 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
                        continue;
 
                /* Ok, retransmit it. */
                        continue;
 
                /* Ok, retransmit it. */
-               if (tcp_retransmit_skb(sk, skb))
+               if (tcp_retransmit_skb(sk, skb)) {
+                       tp->forward_skb_hint = NULL;
                        break;
                        break;
+               }
 
                if (skb == skb_peek(&sk->sk_write_queue))
 
                if (skb == skb_peek(&sk->sk_write_queue))
-                       tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+                       inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+                                                 inet_csk(sk)->icsk_rto,
+                                                 TCP_RTO_MAX);
 
                NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
        }
 
                NET_INC_STATS_BH(LINUX_MIB_TCPFORWARDRETRANS);
        }
@@ -1560,10 +1984,10 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
  */
 void tcp_send_fin(struct sock *sk)
 {
  */
 void tcp_send_fin(struct sock *sk)
 {
-       struct tcp_sock *tp = tcp_sk(sk);       
+       struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
        int mss_now;
        struct sk_buff *skb = skb_peek_tail(&sk->sk_write_queue);
        int mss_now;
-       
+
        /* Optimization, tack on the FIN if we have a queue of
         * unsent frames.  But be careful about outgoing SACKS
         * and IP options.
        /* Optimization, tack on the FIN if we have a queue of
         * unsent frames.  But be careful about outgoing SACKS
         * and IP options.
@@ -1577,7 +2001,7 @@ void tcp_send_fin(struct sock *sk)
        } else {
                /* Socket is locked, keep trying until memory is available. */
                for (;;) {
        } else {
                /* Socket is locked, keep trying until memory is available. */
                for (;;) {
-                       skb = alloc_skb(MAX_TCP_HEADER, GFP_KERNEL);
+                       skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
                        if (skb)
                                break;
                        yield();
                        if (skb)
                                break;
                        yield();
@@ -1588,8 +2012,9 @@ void tcp_send_fin(struct sock *sk)
                skb->csum = 0;
                TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
                TCP_SKB_CB(skb)->sacked = 0;
                skb->csum = 0;
                TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_FIN);
                TCP_SKB_CB(skb)->sacked = 0;
-               skb_shinfo(skb)->tso_segs = 1;
-               skb_shinfo(skb)->tso_size = 0;
+               skb_shinfo(skb)->gso_segs = 1;
+               skb_shinfo(skb)->gso_size = 0;
+               skb_shinfo(skb)->gso_type = 0;
 
                /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
                TCP_SKB_CB(skb)->seq = tp->write_seq;
 
                /* FIN eats a sequence byte, write_seq advanced by tcp_queue_skb(). */
                TCP_SKB_CB(skb)->seq = tp->write_seq;
@@ -1604,7 +2029,7 @@ void tcp_send_fin(struct sock *sk)
  * was unread data in the receive queue.  This behavior is recommended
  * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
  */
  * was unread data in the receive queue.  This behavior is recommended
  * by draft-ietf-tcpimpl-prob-03.txt section 3.10.  -DaveM
  */
-void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
+void tcp_send_active_reset(struct sock *sk, gfp_t priority)
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
 {
        struct tcp_sock *tp = tcp_sk(sk);
        struct sk_buff *skb;
@@ -1621,14 +2046,15 @@ void tcp_send_active_reset(struct sock *sk, unsigned int __nocast priority)
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
        TCP_SKB_CB(skb)->sacked = 0;
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = (TCPCB_FLAG_ACK | TCPCB_FLAG_RST);
        TCP_SKB_CB(skb)->sacked = 0;
-       skb_shinfo(skb)->tso_segs = 1;
-       skb_shinfo(skb)->tso_size = 0;
+       skb_shinfo(skb)->gso_segs = 1;
+       skb_shinfo(skb)->gso_size = 0;
+       skb_shinfo(skb)->gso_type = 0;
 
        /* Send it off. */
        TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
 
        /* Send it off. */
        TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp);
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
-       if (tcp_transmit_skb(sk, skb))
+       if (tcp_transmit_skb(sk, skb, 0, priority))
                NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
 }
 
                NET_INC_STATS(LINUX_MIB_TCPABORTFAILED);
 }
 
@@ -1663,7 +2089,7 @@ int tcp_send_synack(struct sock *sk)
                TCP_ECN_send_synack(tcp_sk(sk), skb);
        }
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                TCP_ECN_send_synack(tcp_sk(sk), skb);
        }
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
-       return tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+       return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
 }
 
 /*
 }
 
 /*
@@ -1677,6 +2103,10 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
        struct tcphdr *th;
        int tcp_header_size;
        struct sk_buff *skb;
        struct tcphdr *th;
        int tcp_header_size;
        struct sk_buff *skb;
+#ifdef CONFIG_TCP_MD5SIG
+       struct tcp_md5sig_key *md5;
+       __u8 *md5_hash_location;
+#endif
 
        skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
        if (skb == NULL)
 
        skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
        if (skb == NULL)
@@ -1692,56 +2122,80 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst,
                           (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
                           /* SACK_PERM is in the place of NOP NOP of TS */
                           ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
                           (ireq->wscale_ok ? TCPOLEN_WSCALE_ALIGNED : 0) +
                           /* SACK_PERM is in the place of NOP NOP of TS */
                           ((ireq->sack_ok && !ireq->tstamp_ok) ? TCPOLEN_SACKPERM_ALIGNED : 0));
+
+#ifdef CONFIG_TCP_MD5SIG
+       /* Are we doing MD5 on this segment? If so - make room for it */
+       md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
+       if (md5)
+               tcp_header_size += TCPOLEN_MD5SIG_ALIGNED;
+#endif
        skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 
        memset(th, 0, sizeof(struct tcphdr));
        th->syn = 1;
        th->ack = 1;
        skb->h.th = th = (struct tcphdr *) skb_push(skb, tcp_header_size);
 
        memset(th, 0, sizeof(struct tcphdr));
        th->syn = 1;
        th->ack = 1;
-       if (dst->dev->features&NETIF_F_TSO)
-               ireq->ecn_ok = 0;
        TCP_ECN_make_synack(req, th);
        th->source = inet_sk(sk)->sport;
        th->dest = ireq->rmt_port;
        TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
        TCP_SKB_CB(skb)->sacked = 0;
        TCP_ECN_make_synack(req, th);
        th->source = inet_sk(sk)->sport;
        th->dest = ireq->rmt_port;
        TCP_SKB_CB(skb)->seq = tcp_rsk(req)->snt_isn;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq + 1;
        TCP_SKB_CB(skb)->sacked = 0;
-       skb_shinfo(skb)->tso_segs = 1;
-       skb_shinfo(skb)->tso_size = 0;
+       skb_shinfo(skb)->gso_segs = 1;
+       skb_shinfo(skb)->gso_size = 0;
+       skb_shinfo(skb)->gso_type = 0;
        th->seq = htonl(TCP_SKB_CB(skb)->seq);
        th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
        th->seq = htonl(TCP_SKB_CB(skb)->seq);
        th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
        if (req->rcv_wnd == 0) { /* ignored for retransmitted syns */
-               __u8 rcv_wscale; 
+               __u8 rcv_wscale;
                /* Set this up on the first call only */
                req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
                /* tcp_full_space because it is guaranteed to be the first packet */
                /* Set this up on the first call only */
                req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW);
                /* tcp_full_space because it is guaranteed to be the first packet */
-               tcp_select_initial_window(tcp_full_space(sk), 
+               tcp_select_initial_window(tcp_full_space(sk),
                        dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
                        &req->rcv_wnd,
                        &req->window_clamp,
                        ireq->wscale_ok,
                        &rcv_wscale);
                        dst_metric(dst, RTAX_ADVMSS) - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0),
                        &req->rcv_wnd,
                        &req->window_clamp,
                        ireq->wscale_ok,
                        &rcv_wscale);
-               ireq->rcv_wscale = rcv_wscale; 
+               ireq->rcv_wscale = rcv_wscale;
        }
 
        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(req->rcv_wnd);
 
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        }
 
        /* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
        th->window = htons(req->rcv_wnd);
 
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
-       tcp_syn_build_options((__u32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
+       tcp_syn_build_options((__be32 *)(th + 1), dst_metric(dst, RTAX_ADVMSS), ireq->tstamp_ok,
                              ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
                              TCP_SKB_CB(skb)->when,
                              ireq->sack_ok, ireq->wscale_ok, ireq->rcv_wscale,
                              TCP_SKB_CB(skb)->when,
-                             req->ts_recent);
+                             req->ts_recent,
+                             (
+#ifdef CONFIG_TCP_MD5SIG
+                              md5 ? &md5_hash_location :
+#endif
+                              NULL)
+                             );
 
        skb->csum = 0;
        th->doff = (tcp_header_size >> 2);
        TCP_INC_STATS(TCP_MIB_OUTSEGS);
 
        skb->csum = 0;
        th->doff = (tcp_header_size >> 2);
        TCP_INC_STATS(TCP_MIB_OUTSEGS);
+
+#ifdef CONFIG_TCP_MD5SIG
+       /* Okay, we have all we need - do the md5 hash if needed */
+       if (md5) {
+               tp->af_specific->calc_md5_hash(md5_hash_location,
+                                              md5,
+                                              NULL, dst, req,
+                                              skb->h.th, sk->sk_protocol,
+                                              skb->len);
+       }
+#endif
+
        return skb;
 }
 
        return skb;
 }
 
-/* 
+/*
  * Do all connect socket setups that can be done AF independent.
  * Do all connect socket setups that can be done AF independent.
- */ 
-static inline void tcp_connect_init(struct sock *sk)
+ */
+static void tcp_connect_init(struct sock *sk)
 {
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
 {
        struct dst_entry *dst = __sk_dst_get(sk);
        struct tcp_sock *tp = tcp_sk(sk);
@@ -1753,10 +2207,16 @@ static inline void tcp_connect_init(struct sock *sk)
        tp->tcp_header_len = sizeof(struct tcphdr) +
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
 
        tp->tcp_header_len = sizeof(struct tcphdr) +
                (sysctl_tcp_timestamps ? TCPOLEN_TSTAMP_ALIGNED : 0);
 
+#ifdef CONFIG_TCP_MD5SIG
+       if (tp->af_specific->md5_lookup(sk, sk) != NULL)
+               tp->tcp_header_len += TCPOLEN_MD5SIG_ALIGNED;
+#endif
+
        /* If user gave his TCP_MAXSEG, record it to clamp */
        if (tp->rx_opt.user_mss)
                tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
        tp->max_window = 0;
        /* If user gave his TCP_MAXSEG, record it to clamp */
        if (tp->rx_opt.user_mss)
                tp->rx_opt.mss_clamp = tp->rx_opt.user_mss;
        tp->max_window = 0;
+       tcp_mtup_init(sk);
        tcp_sync_mss(sk, dst_mtu(dst));
 
        if (!tp->window_clamp)
        tcp_sync_mss(sk, dst_mtu(dst));
 
        if (!tp->window_clamp)
@@ -1784,14 +2244,14 @@ static inline void tcp_connect_init(struct sock *sk)
        tp->rcv_wup = 0;
        tp->copied_seq = 0;
 
        tp->rcv_wup = 0;
        tp->copied_seq = 0;
 
-       tp->rto = TCP_TIMEOUT_INIT;
-       tp->retransmits = 0;
+       inet_csk(sk)->icsk_rto = TCP_TIMEOUT_INIT;
+       inet_csk(sk)->icsk_retransmits = 0;
        tcp_clear_retrans(tp);
 }
 
 /*
  * Build a SYN and send it off.
        tcp_clear_retrans(tp);
 }
 
 /*
  * Build a SYN and send it off.
- */ 
+ */
 int tcp_connect(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
 int tcp_connect(struct sock *sk)
 {
        struct tcp_sock *tp = tcp_sk(sk);
@@ -1799,7 +2259,7 @@ int tcp_connect(struct sock *sk)
 
        tcp_connect_init(sk);
 
 
        tcp_connect_init(sk);
 
-       buff = alloc_skb(MAX_TCP_HEADER + 15, sk->sk_allocation);
+       buff = alloc_skb_fclone(MAX_TCP_HEADER + 15, sk->sk_allocation);
        if (unlikely(buff == NULL))
                return -ENOBUFS;
 
        if (unlikely(buff == NULL))
                return -ENOBUFS;
 
@@ -1809,13 +2269,13 @@ int tcp_connect(struct sock *sk)
        TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
        TCP_ECN_send_syn(sk, tp, buff);
        TCP_SKB_CB(buff)->sacked = 0;
        TCP_SKB_CB(buff)->flags = TCPCB_FLAG_SYN;
        TCP_ECN_send_syn(sk, tp, buff);
        TCP_SKB_CB(buff)->sacked = 0;
-       skb_shinfo(buff)->tso_segs = 1;
-       skb_shinfo(buff)->tso_size = 0;
+       skb_shinfo(buff)->gso_segs = 1;
+       skb_shinfo(buff)->gso_size = 0;
+       skb_shinfo(buff)->gso_type = 0;
        buff->csum = 0;
        buff->csum = 0;
+       tp->snd_nxt = tp->write_seq;
        TCP_SKB_CB(buff)->seq = tp->write_seq++;
        TCP_SKB_CB(buff)->end_seq = tp->write_seq;
        TCP_SKB_CB(buff)->seq = tp->write_seq++;
        TCP_SKB_CB(buff)->end_seq = tp->write_seq;
-       tp->snd_nxt = tp->write_seq;
-       tp->pushed_seq = tp->write_seq;
 
        /* Send it off. */
        TCP_SKB_CB(buff)->when = tcp_time_stamp;
 
        /* Send it off. */
        TCP_SKB_CB(buff)->when = tcp_time_stamp;
@@ -1824,11 +2284,18 @@ int tcp_connect(struct sock *sk)
        __skb_queue_tail(&sk->sk_write_queue, buff);
        sk_charge_skb(sk, buff);
        tp->packets_out += tcp_skb_pcount(buff);
        __skb_queue_tail(&sk->sk_write_queue, buff);
        sk_charge_skb(sk, buff);
        tp->packets_out += tcp_skb_pcount(buff);
-       tcp_transmit_skb(sk, skb_clone(buff, GFP_KERNEL));
+       tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
+
+       /* We change tp->snd_nxt after the tcp_transmit_skb() call
+        * in order to make this packet get counted in tcpOutSegs.
+        */
+       tp->snd_nxt = tp->write_seq;
+       tp->pushed_seq = tp->write_seq;
        TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
 
        /* Timer for repeating the SYN until an answer. */
        TCP_INC_STATS(TCP_MIB_ACTIVEOPENS);
 
        /* Timer for repeating the SYN until an answer. */
-       tcp_reset_xmit_timer(sk, TCP_TIME_RETRANS, tp->rto);
+       inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
+                                 inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
        return 0;
 }
 
        return 0;
 }
 
@@ -1838,20 +2305,21 @@ int tcp_connect(struct sock *sk)
  */
 void tcp_send_delayed_ack(struct sock *sk)
 {
  */
 void tcp_send_delayed_ack(struct sock *sk)
 {
-       struct tcp_sock *tp = tcp_sk(sk);
-       int ato = tp->ack.ato;
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       int ato = icsk->icsk_ack.ato;
        unsigned long timeout;
 
        if (ato > TCP_DELACK_MIN) {
        unsigned long timeout;
 
        if (ato > TCP_DELACK_MIN) {
+               const struct tcp_sock *tp = tcp_sk(sk);
                int max_ato = HZ/2;
 
                int max_ato = HZ/2;
 
-               if (tp->ack.pingpong || (tp->ack.pending&TCP_ACK_PUSHED))
+               if (icsk->icsk_ack.pingpong || (icsk->icsk_ack.pending & ICSK_ACK_PUSHED))
                        max_ato = TCP_DELACK_MAX;
 
                /* Slow path, intersegment interval is "high". */
 
                /* If some rtt estimate is known, use it to bound delayed ack.
                        max_ato = TCP_DELACK_MAX;
 
                /* Slow path, intersegment interval is "high". */
 
                /* If some rtt estimate is known, use it to bound delayed ack.
-                * Do not use tp->rto here, use results of rtt measurements
+                * Do not use inet_csk(sk)->icsk_rto here, use results of rtt measurements
                 * directly.
                 */
                if (tp->srtt) {
                 * directly.
                 */
                if (tp->srtt) {
@@ -1868,21 +2336,22 @@ void tcp_send_delayed_ack(struct sock *sk)
        timeout = jiffies + ato;
 
        /* Use new timeout only if there wasn't an older one earlier. */
        timeout = jiffies + ato;
 
        /* Use new timeout only if there wasn't an older one earlier. */
-       if (tp->ack.pending&TCP_ACK_TIMER) {
+       if (icsk->icsk_ack.pending & ICSK_ACK_TIMER) {
                /* If delack timer was blocked or is about to expire,
                 * send ACK now.
                 */
                /* If delack timer was blocked or is about to expire,
                 * send ACK now.
                 */
-               if (tp->ack.blocked || time_before_eq(tp->ack.timeout, jiffies+(ato>>2))) {
+               if (icsk->icsk_ack.blocked ||
+                   time_before_eq(icsk->icsk_ack.timeout, jiffies + (ato >> 2))) {
                        tcp_send_ack(sk);
                        return;
                }
 
                        tcp_send_ack(sk);
                        return;
                }
 
-               if (!time_before(timeout, tp->ack.timeout))
-                       timeout = tp->ack.timeout;
+               if (!time_before(timeout, icsk->icsk_ack.timeout))
+                       timeout = icsk->icsk_ack.timeout;
        }
        }
-       tp->ack.pending |= TCP_ACK_SCHED|TCP_ACK_TIMER;
-       tp->ack.timeout = timeout;
-       sk_reset_timer(sk, &tp->delack_timer, timeout);
+       icsk->icsk_ack.pending |= ICSK_ACK_SCHED | ICSK_ACK_TIMER;
+       icsk->icsk_ack.timeout = timeout;
+       sk_reset_timer(sk, &icsk->icsk_delack_timer, timeout);
 }
 
 /* This routine sends an ack and also updates the window. */
 }
 
 /* This routine sends an ack and also updates the window. */
@@ -1899,9 +2368,10 @@ void tcp_send_ack(struct sock *sk)
                 */
                buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
                if (buff == NULL) {
                 */
                buff = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
                if (buff == NULL) {
-                       tcp_schedule_ack(tp);
-                       tp->ack.ato = TCP_ATO_MIN;
-                       tcp_reset_xmit_timer(sk, TCP_TIME_DACK, TCP_DELACK_MAX);
+                       inet_csk_schedule_ack(sk);
+                       inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
+                       inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
+                                                 TCP_DELACK_MAX, TCP_RTO_MAX);
                        return;
                }
 
                        return;
                }
 
@@ -1910,13 +2380,14 @@ void tcp_send_ack(struct sock *sk)
                buff->csum = 0;
                TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
                TCP_SKB_CB(buff)->sacked = 0;
                buff->csum = 0;
                TCP_SKB_CB(buff)->flags = TCPCB_FLAG_ACK;
                TCP_SKB_CB(buff)->sacked = 0;
-               skb_shinfo(buff)->tso_segs = 1;
-               skb_shinfo(buff)->tso_size = 0;
+               skb_shinfo(buff)->gso_segs = 1;
+               skb_shinfo(buff)->gso_size = 0;
+               skb_shinfo(buff)->gso_type = 0;
 
                /* Send it off, this clears delayed acks for us. */
                TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
                TCP_SKB_CB(buff)->when = tcp_time_stamp;
 
                /* Send it off, this clears delayed acks for us. */
                TCP_SKB_CB(buff)->seq = TCP_SKB_CB(buff)->end_seq = tcp_acceptable_seq(sk, tp);
                TCP_SKB_CB(buff)->when = tcp_time_stamp;
-               tcp_transmit_skb(sk, buff);
+               tcp_transmit_skb(sk, buff, 0, GFP_ATOMIC);
        }
 }
 
        }
 }
 
@@ -1938,7 +2409,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
 
        /* We don't queue it, tcp_transmit_skb() sets ownership. */
        skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
 
        /* We don't queue it, tcp_transmit_skb() sets ownership. */
        skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
-       if (skb == NULL) 
+       if (skb == NULL)
                return -1;
 
        /* Reserve space for headers and set control bits. */
                return -1;
 
        /* Reserve space for headers and set control bits. */
@@ -1946,8 +2417,9 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
        TCP_SKB_CB(skb)->sacked = urgent;
        skb->csum = 0;
        TCP_SKB_CB(skb)->flags = TCPCB_FLAG_ACK;
        TCP_SKB_CB(skb)->sacked = urgent;
-       skb_shinfo(skb)->tso_segs = 1;
-       skb_shinfo(skb)->tso_size = 0;
+       skb_shinfo(skb)->gso_segs = 1;
+       skb_shinfo(skb)->gso_size = 0;
+       skb_shinfo(skb)->gso_type = 0;
 
        /* Use a previous sequence.  This should cause the other
         * end to send an ack.  Don't queue or clone SKB, just
 
        /* Use a previous sequence.  This should cause the other
         * end to send an ack.  Don't queue or clone SKB, just
@@ -1956,7 +2428,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent)
        TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
        TCP_SKB_CB(skb)->seq = urgent ? tp->snd_una : tp->snd_una - 1;
        TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq;
        TCP_SKB_CB(skb)->when = tcp_time_stamp;
-       return tcp_transmit_skb(sk, skb);
+       return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
 }
 
 int tcp_write_wakeup(struct sock *sk)
 }
 
 int tcp_write_wakeup(struct sock *sk)
@@ -1984,18 +2456,12 @@ int tcp_write_wakeup(struct sock *sk)
                                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                                if (tcp_fragment(sk, skb, seg_size, mss))
                                        return -1;
                                TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                                if (tcp_fragment(sk, skb, seg_size, mss))
                                        return -1;
-                               /* SWS override triggered forced fragmentation.
-                                * Disable TSO, the connection is too sick. */
-                               if (sk->sk_route_caps & NETIF_F_TSO) {
-                                       sock_set_flag(sk, SOCK_NO_LARGESEND);
-                                       sk->sk_route_caps &= ~NETIF_F_TSO;
-                               }
                        } else if (!tcp_skb_pcount(skb))
                                tcp_set_skb_tso_segs(sk, skb, mss);
 
                        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
                        } else if (!tcp_skb_pcount(skb))
                                tcp_set_skb_tso_segs(sk, skb, mss);
 
                        TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH;
                        TCP_SKB_CB(skb)->when = tcp_time_stamp;
-                       err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC));
+                       err = tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
                        if (!err) {
                                update_send_head(sk, tp, skb);
                        }
                        if (!err) {
                                update_send_head(sk, tp, skb);
                        }
@@ -2015,6 +2481,7 @@ int tcp_write_wakeup(struct sock *sk)
  */
 void tcp_send_probe0(struct sock *sk)
 {
  */
 void tcp_send_probe0(struct sock *sk)
 {
+       struct inet_connection_sock *icsk = inet_csk(sk);
        struct tcp_sock *tp = tcp_sk(sk);
        int err;
 
        struct tcp_sock *tp = tcp_sk(sk);
        int err;
 
@@ -2022,28 +2489,31 @@ void tcp_send_probe0(struct sock *sk)
 
        if (tp->packets_out || !sk->sk_send_head) {
                /* Cancel probe timer, if it is not required. */
 
        if (tp->packets_out || !sk->sk_send_head) {
                /* Cancel probe timer, if it is not required. */
-               tp->probes_out = 0;
-               tp->backoff = 0;
+               icsk->icsk_probes_out = 0;
+               icsk->icsk_backoff = 0;
                return;
        }
 
        if (err <= 0) {
                return;
        }
 
        if (err <= 0) {
-               if (tp->backoff < sysctl_tcp_retries2)
-                       tp->backoff++;
-               tp->probes_out++;
-               tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
-                                     min(tp->rto << tp->backoff, TCP_RTO_MAX));
+               if (icsk->icsk_backoff < sysctl_tcp_retries2)
+                       icsk->icsk_backoff++;
+               icsk->icsk_probes_out++;
+               inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+                                         min(icsk->icsk_rto << icsk->icsk_backoff, TCP_RTO_MAX),
+                                         TCP_RTO_MAX);
        } else {
                /* If packet was not sent due to local congestion,
        } else {
                /* If packet was not sent due to local congestion,
-                * do not backoff and do not remember probes_out.
+                * do not backoff and do not remember icsk_probes_out.
                 * Let local senders fight for local resources.
                 *
                 * Still use the accumulated backoff.
                 */
                 * Let local senders fight for local resources.
                 *
                 * Still use the accumulated backoff.
                 */
-               if (!tp->probes_out)
-                       tp->probes_out=1;
-               tcp_reset_xmit_timer (sk, TCP_TIME_PROBE0, 
-                                     min(tp->rto << tp->backoff, TCP_RESOURCE_PROBE_INTERVAL));
+               if (!icsk->icsk_probes_out)
+                       icsk->icsk_probes_out = 1;
+               inet_csk_reset_xmit_timer(sk, ICSK_TIME_PROBE0,
+                                         min(icsk->icsk_rto << icsk->icsk_backoff,
+                                             TCP_RESOURCE_PROBE_INTERVAL),
+                                         TCP_RTO_MAX);
        }
 }
 
        }
 }
 
@@ -2051,3 +2521,5 @@ EXPORT_SYMBOL(tcp_connect);
 EXPORT_SYMBOL(tcp_make_synack);
 EXPORT_SYMBOL(tcp_simple_retransmit);
 EXPORT_SYMBOL(tcp_sync_mss);
 EXPORT_SYMBOL(tcp_make_synack);
 EXPORT_SYMBOL(tcp_simple_retransmit);
 EXPORT_SYMBOL(tcp_sync_mss);
+EXPORT_SYMBOL(sysctl_tcp_tso_win_divisor);
+EXPORT_SYMBOL(tcp_mtup_init);