/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
+int sysctl_tcp_cookie_size __read_mostly = 0; /* TCP_COOKIE_MAX */
+EXPORT_SYMBOL_GPL(sysctl_tcp_cookie_size);
+
+
+/* Account for new data that has been sent to the network. */
static void tcp_event_new_data_sent(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
tp->snd_cwnd_used = 0;
}
+/* Congestion state accounting after a packet has been sent. */
static void tcp_event_data_sent(struct tcp_sock *tp,
struct sk_buff *skb, struct sock *sk)
{
icsk->icsk_ack.pingpong = 1;
}
+/* Account for an ACK we sent. */
static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
{
tcp_dec_quickack_mode(sk, pkts);
return new_win;
}
+/* Packet ECN state for a SYN-ACK */
static inline void TCP_ECN_send_synack(struct tcp_sock *tp, struct sk_buff *skb)
{
TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_CWR;
TCP_SKB_CB(skb)->flags &= ~TCPCB_FLAG_ECE;
}
+/* Packet ECN state for a SYN. */
static inline void TCP_ECN_send_syn(struct sock *sk, struct sk_buff *skb)
{
struct tcp_sock *tp = tcp_sk(sk);
tp->ecn_flags = 0;
- if (sysctl_tcp_ecn) {
+ if (sysctl_tcp_ecn == 1) {
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_ECE | TCPCB_FLAG_CWR;
tp->ecn_flags = TCP_ECN_OK;
}
th->ece = 1;
}
+/* Set up ECN state for a packet on a ESTABLISHED socket that is about to
+ * be sent.
+ */
static inline void TCP_ECN_send(struct sock *sk, struct sk_buff *skb,
int tcp_header_len)
{
#define OPTION_SACK_ADVERTISE (1 << 0)
#define OPTION_TS (1 << 1)
#define OPTION_MD5 (1 << 2)
+#define OPTION_WSCALE (1 << 3)
+#define OPTION_COOKIE_EXTENSION (1 << 4)
struct tcp_out_options {
u8 options; /* bit field of OPTION_* */
u8 ws; /* window scale, 0 to disable */
u8 num_sack_blocks; /* number of SACK blocks to include */
+ u8 hash_size; /* bytes in hash_location */
u16 mss; /* 0 to disable */
__u32 tsval, tsecr; /* need to include OPTION_TS */
+ __u8 *hash_location; /* temporary pointer, overloaded */
};
-/* Beware: Something in the Internet is very sensitive to the ordering of
+/* The sysctl int routines are generic, so check consistency here.
+ */
+static u8 tcp_cookie_size_check(u8 desired)
+{
+ if (desired > 0) {
+ /* previously specified */
+ return desired;
+ }
+ if (sysctl_tcp_cookie_size <= 0) {
+ /* no default specified */
+ return 0;
+ }
+ if (sysctl_tcp_cookie_size <= TCP_COOKIE_MIN) {
+ /* value too small, specify minimum */
+ return TCP_COOKIE_MIN;
+ }
+ if (sysctl_tcp_cookie_size >= TCP_COOKIE_MAX) {
+ /* value too large, specify maximum */
+ return TCP_COOKIE_MAX;
+ }
+ if (0x1 & sysctl_tcp_cookie_size) {
+ /* 8-bit multiple, illegal, fix it */
+ return (u8)(sysctl_tcp_cookie_size + 0x1);
+ }
+ return (u8)sysctl_tcp_cookie_size;
+}
+
+/* Write previously computed TCP options to the packet.
+ *
+ * Beware: Something in the Internet is very sensitive to the ordering of
* TCP options, we learned this through the hard way, so be careful here.
* Luckily we can at least blame others for their non-compliance but from
* inter-operatibility perspective it seems that we're somewhat stuck with
* (but it may well be that other scenarios fail similarly).
*/
static void tcp_options_write(__be32 *ptr, struct tcp_sock *tp,
- const struct tcp_out_options *opts,
- __u8 **md5_hash) {
- if (unlikely(OPTION_MD5 & opts->options)) {
- *ptr++ = htonl((TCPOPT_NOP << 24) |
- (TCPOPT_NOP << 16) |
- (TCPOPT_MD5SIG << 8) |
- TCPOLEN_MD5SIG);
- *md5_hash = (__u8 *)ptr;
+ struct tcp_out_options *opts)
+{
+ u8 options = opts->options; /* mungable copy */
+
+ /* Having both authentication and cookies for security is redundant,
+ * and there's certainly not enough room. Instead, the cookie-less
+ * extension variant is proposed.
+ *
+ * Consider the pessimal case with authentication. The options
+ * could look like:
+ * COOKIE|MD5(20) + MSS(4) + SACK|TS(12) + WSCALE(4) == 40
+ */
+ if (unlikely(OPTION_MD5 & options)) {
+ if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
+ *ptr++ = htonl((TCPOPT_COOKIE << 24) |
+ (TCPOLEN_COOKIE_BASE << 16) |
+ (TCPOPT_MD5SIG << 8) |
+ TCPOLEN_MD5SIG);
+ } else {
+ *ptr++ = htonl((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_MD5SIG << 8) |
+ TCPOLEN_MD5SIG);
+ }
+ options &= ~OPTION_COOKIE_EXTENSION;
+ /* overload cookie hash location */
+ opts->hash_location = (__u8 *)ptr;
ptr += 4;
- } else {
- *md5_hash = NULL;
}
if (unlikely(opts->mss)) {
opts->mss);
}
- if (likely(OPTION_TS & opts->options)) {
- if (unlikely(OPTION_SACK_ADVERTISE & opts->options)) {
+ if (likely(OPTION_TS & options)) {
+ if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_SACK_PERM << 24) |
(TCPOLEN_SACK_PERM << 16) |
(TCPOPT_TIMESTAMP << 8) |
TCPOLEN_TIMESTAMP);
+ options &= ~OPTION_SACK_ADVERTISE;
} else {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
*ptr++ = htonl(opts->tsecr);
}
- if (unlikely(OPTION_SACK_ADVERTISE & opts->options &&
- !(OPTION_TS & opts->options))) {
+ /* Specification requires after timestamp, so do it now.
+ *
+ * Consider the pessimal case without authentication. The options
+ * could look like:
+ * MSS(4) + SACK|TS(12) + COOKIE(20) + WSCALE(4) == 40
+ */
+ if (unlikely(OPTION_COOKIE_EXTENSION & options)) {
+ __u8 *cookie_copy = opts->hash_location;
+ u8 cookie_size = opts->hash_size;
+
+ /* 8-bit multiple handled in tcp_cookie_size_check() above,
+ * and elsewhere.
+ */
+ if (0x2 & cookie_size) {
+ __u8 *p = (__u8 *)ptr;
+
+ /* 16-bit multiple */
+ *p++ = TCPOPT_COOKIE;
+ *p++ = TCPOLEN_COOKIE_BASE + cookie_size;
+ *p++ = *cookie_copy++;
+ *p++ = *cookie_copy++;
+ ptr++;
+ cookie_size -= 2;
+ } else {
+ /* 32-bit multiple */
+ *ptr++ = htonl(((TCPOPT_NOP << 24) |
+ (TCPOPT_NOP << 16) |
+ (TCPOPT_COOKIE << 8) |
+ TCPOLEN_COOKIE_BASE) +
+ cookie_size);
+ }
+
+ if (cookie_size > 0) {
+ memcpy(ptr, cookie_copy, cookie_size);
+ ptr += (cookie_size / 4);
+ }
+ }
+
+ if (unlikely(OPTION_SACK_ADVERTISE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_NOP << 16) |
(TCPOPT_SACK_PERM << 8) |
TCPOLEN_SACK_PERM);
}
- if (unlikely(opts->ws)) {
+ if (unlikely(OPTION_WSCALE & options)) {
*ptr++ = htonl((TCPOPT_NOP << 24) |
(TCPOPT_WINDOW << 16) |
(TCPOLEN_WINDOW << 8) |
*ptr++ = htonl(sp[this_sack].end_seq);
}
- if (tp->rx_opt.dsack) {
- tp->rx_opt.dsack = 0;
- tp->rx_opt.eff_sacks = tp->rx_opt.num_sacks;
- }
+ tp->rx_opt.dsack = 0;
}
}
+/* Compute TCP options for SYN packets. This is not the final
+ * network wire format yet.
+ */
static unsigned tcp_syn_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5) {
struct tcp_sock *tp = tcp_sk(sk);
- unsigned size = 0;
+ struct tcp_cookie_values *cvp = tp->cookie_values;
+ unsigned remaining = MAX_TCP_OPTION_SPACE;
+ u8 cookie_size = (!tp->rx_opt.cookie_out_never && cvp != NULL) ?
+ tcp_cookie_size_check(cvp->cookie_desired) :
+ 0;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
if (*md5) {
opts->options |= OPTION_MD5;
- size += TCPOLEN_MD5SIG_ALIGNED;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
}
#else
*md5 = NULL;
* SACKs don't matter, we never delay an ACK when we have any of those
* going out. */
opts->mss = tcp_advertise_mss(sk);
- size += TCPOLEN_MSS_ALIGNED;
+ remaining -= TCPOLEN_MSS_ALIGNED;
if (likely(sysctl_tcp_timestamps && *md5 == NULL)) {
opts->options |= OPTION_TS;
opts->tsval = TCP_SKB_CB(skb)->when;
opts->tsecr = tp->rx_opt.ts_recent;
- size += TCPOLEN_TSTAMP_ALIGNED;
+ remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
if (likely(sysctl_tcp_window_scaling)) {
opts->ws = tp->rx_opt.rcv_wscale;
- if (likely(opts->ws))
- size += TCPOLEN_WSCALE_ALIGNED;
+ opts->options |= OPTION_WSCALE;
+ remaining -= TCPOLEN_WSCALE_ALIGNED;
}
if (likely(sysctl_tcp_sack)) {
opts->options |= OPTION_SACK_ADVERTISE;
if (unlikely(!(OPTION_TS & opts->options)))
- size += TCPOLEN_SACKPERM_ALIGNED;
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
- return size;
+ /* Note that timestamps are required by the specification.
+ *
+ * Odd numbers of bytes are prohibited by the specification, ensuring
+ * that the cookie is 16-bit aligned, and the resulting cookie pair is
+ * 32-bit aligned.
+ */
+ if (*md5 == NULL &&
+ (OPTION_TS & opts->options) &&
+ cookie_size > 0) {
+ int need = TCPOLEN_COOKIE_BASE + cookie_size;
+
+ if (0x2 & need) {
+ /* 32-bit multiple */
+ need += 2; /* NOPs */
+
+ if (need > remaining) {
+ /* try shrinking cookie to fit */
+ cookie_size -= 2;
+ need -= 4;
+ }
+ }
+ while (need > remaining && TCP_COOKIE_MIN <= cookie_size) {
+ cookie_size -= 4;
+ need -= 4;
+ }
+ if (TCP_COOKIE_MIN <= cookie_size) {
+ opts->options |= OPTION_COOKIE_EXTENSION;
+ opts->hash_location = (__u8 *)&cvp->cookie_pair[0];
+ opts->hash_size = cookie_size;
+
+ /* Remember for future incarnations. */
+ cvp->cookie_desired = cookie_size;
+
+ if (cvp->cookie_desired != cvp->cookie_pair_size) {
+ /* Currently use random bytes as a nonce,
+ * assuming these are completely unpredictable
+ * by hostile users of the same system.
+ */
+ get_random_bytes(&cvp->cookie_pair[0],
+ cookie_size);
+ cvp->cookie_pair_size = cookie_size;
+ }
+
+ remaining -= need;
+ }
+ }
+ return MAX_TCP_OPTION_SPACE - remaining;
}
+/* Set up TCP options for SYN-ACKs. */
static unsigned tcp_synack_options(struct sock *sk,
struct request_sock *req,
unsigned mss, struct sk_buff *skb,
struct tcp_out_options *opts,
- struct tcp_md5sig_key **md5) {
- unsigned size = 0;
+ struct tcp_md5sig_key **md5,
+ struct tcp_extend_values *xvp)
+{
struct inet_request_sock *ireq = inet_rsk(req);
- char doing_ts;
+ unsigned remaining = MAX_TCP_OPTION_SPACE;
+ u8 cookie_plus = (xvp != NULL && !xvp->cookie_out_never) ?
+ xvp->cookie_plus :
+ 0;
+ bool doing_ts = ireq->tstamp_ok;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tcp_rsk(req)->af_specific->md5_lookup(sk, req);
if (*md5) {
opts->options |= OPTION_MD5;
- size += TCPOLEN_MD5SIG_ALIGNED;
+ remaining -= TCPOLEN_MD5SIG_ALIGNED;
+
+ /* We can't fit any SACK blocks in a packet with MD5 + TS
+ * options. There was discussion about disabling SACK
+ * rather than TS in order to fit in better with old,
+ * buggy kernels, but that was deemed to be unnecessary.
+ */
+ doing_ts &= !ireq->sack_ok;
}
#else
*md5 = NULL;
#endif
- /* we can't fit any SACK blocks in a packet with MD5 + TS
- options. There was discussion about disabling SACK rather than TS in
- order to fit in better with old, buggy kernels, but that was deemed
- to be unnecessary. */
- doing_ts = ireq->tstamp_ok && !(*md5 && ireq->sack_ok);
-
+ /* We always send an MSS option. */
opts->mss = mss;
- size += TCPOLEN_MSS_ALIGNED;
+ remaining -= TCPOLEN_MSS_ALIGNED;
if (likely(ireq->wscale_ok)) {
opts->ws = ireq->rcv_wscale;
- if (likely(opts->ws))
- size += TCPOLEN_WSCALE_ALIGNED;
+ opts->options |= OPTION_WSCALE;
+ remaining -= TCPOLEN_WSCALE_ALIGNED;
}
if (likely(doing_ts)) {
opts->options |= OPTION_TS;
opts->tsval = TCP_SKB_CB(skb)->when;
opts->tsecr = req->ts_recent;
- size += TCPOLEN_TSTAMP_ALIGNED;
+ remaining -= TCPOLEN_TSTAMP_ALIGNED;
}
if (likely(ireq->sack_ok)) {
opts->options |= OPTION_SACK_ADVERTISE;
if (unlikely(!doing_ts))
- size += TCPOLEN_SACKPERM_ALIGNED;
+ remaining -= TCPOLEN_SACKPERM_ALIGNED;
}
- return size;
+ /* Similar rationale to tcp_syn_options() applies here, too.
+ * If the <SYN> options fit, the same options should fit now!
+ */
+ if (*md5 == NULL &&
+ doing_ts &&
+ cookie_plus > TCPOLEN_COOKIE_BASE) {
+ int need = cookie_plus; /* has TCPOLEN_COOKIE_BASE */
+
+ if (0x2 & need) {
+ /* 32-bit multiple */
+ need += 2; /* NOPs */
+ }
+ if (need <= remaining) {
+ opts->options |= OPTION_COOKIE_EXTENSION;
+ opts->hash_size = cookie_plus - TCPOLEN_COOKIE_BASE;
+ remaining -= need;
+ } else {
+ /* There's no error return, so flag it. */
+ xvp->cookie_out_never = 1; /* true */
+ opts->hash_size = 0;
+ }
+ }
+ return MAX_TCP_OPTION_SPACE - remaining;
}
+/* Compute TCP options for ESTABLISHED sockets. This is not the
+ * final wire format yet.
+ */
static unsigned tcp_established_options(struct sock *sk, struct sk_buff *skb,
struct tcp_out_options *opts,
struct tcp_md5sig_key **md5) {
struct tcp_skb_cb *tcb = skb ? TCP_SKB_CB(skb) : NULL;
struct tcp_sock *tp = tcp_sk(sk);
unsigned size = 0;
+ unsigned int eff_sacks;
#ifdef CONFIG_TCP_MD5SIG
*md5 = tp->af_specific->md5_lookup(sk, sk);
size += TCPOLEN_TSTAMP_ALIGNED;
}
- if (unlikely(tp->rx_opt.eff_sacks)) {
+ eff_sacks = tp->rx_opt.num_sacks + tp->rx_opt.dsack;
+ if (unlikely(eff_sacks)) {
const unsigned remaining = MAX_TCP_OPTION_SPACE - size;
opts->num_sack_blocks =
- min_t(unsigned, tp->rx_opt.eff_sacks,
+ min_t(unsigned, eff_sacks,
(remaining - TCPOLEN_SACK_BASE_ALIGNED) /
TCPOLEN_SACK_PERBLOCK);
size += TCPOLEN_SACK_BASE_ALIGNED +
struct tcp_out_options opts;
unsigned tcp_options_size, tcp_header_size;
struct tcp_md5sig_key *md5;
- __u8 *md5_hash_location;
struct tcphdr *th;
int err;
/* Build TCP header and checksum it. */
th = tcp_hdr(skb);
- th->source = inet->sport;
- th->dest = inet->dport;
+ th->source = inet->inet_sport;
+ th->dest = inet->inet_dport;
th->seq = htonl(tcb->seq);
th->ack_seq = htonl(tp->rcv_nxt);
*(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
}
}
- tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
+ tcp_options_write((__be32 *)(th + 1), tp, &opts);
if (likely((tcb->flags & TCPCB_FLAG_SYN) == 0))
TCP_ECN_send(sk, skb, tcp_header_size);
/* Calculate the MD5 hash, as we have all we need now */
if (md5) {
sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
- tp->af_specific->calc_md5_hash(md5_hash_location,
+ tp->af_specific->calc_md5_hash(opts.hash_location,
md5, sk, NULL, skb);
}
#endif
return net_xmit_eval(err);
}
-/* This routine just queue's the buffer
+/* This routine just queues the buffer for sending.
*
* NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
* otherwise socket can stall.
sk_mem_charge(sk, skb->truesize);
}
+/* Initialize TSO segments for a packet. */
static void tcp_set_skb_tso_segs(struct sock *sk, struct sk_buff *skb,
unsigned int mss_now)
{
- if (skb->len <= mss_now || !sk_can_gso(sk)) {
+ if (skb->len <= mss_now || !sk_can_gso(sk) ||
+ skb->ip_summed == CHECKSUM_NONE) {
/* Avoid the costly divide in the normal
* non-TSO case.
*/
tp->fackets_out -= decr;
}
+/* Pcount in the middle of the write queue got changed, we need to do various
+ * tweaks to fix counters
+ */
+static void tcp_adjust_pcount(struct sock *sk, struct sk_buff *skb, int decr)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ tp->packets_out -= decr;
+
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+ tp->sacked_out -= decr;
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
+ tp->retrans_out -= decr;
+ if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
+ tp->lost_out -= decr;
+
+ /* Reno case is special. Sigh... */
+ if (tcp_is_reno(tp) && decr > 0)
+ tp->sacked_out -= min_t(u32, tp->sacked_out, decr);
+
+ tcp_adjust_fackets_out(sk, skb, decr);
+
+ if (tp->lost_skb_hint &&
+ before(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(tp->lost_skb_hint)->seq) &&
+ (tcp_is_fack(tp) || (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)))
+ tp->lost_cnt_hint -= decr;
+
+ tcp_verify_left_out(tp);
+}
+
/* Function to create two new TCP segments. Shrinks the given segment
* to the specified size and appends a new segment with the rest of the
* packet to the list. This won't be called frequently, I hope.
struct sk_buff *buff;
int nsize, old_factor;
int nlen;
- u16 flags;
+ u8 flags;
BUG_ON(len > skb->len);
- tcp_clear_retrans_hints_partial(tp);
nsize = skb_headlen(skb) - len;
if (nsize < 0)
nsize = 0;
int diff = old_factor - tcp_skb_pcount(skb) -
tcp_skb_pcount(buff);
- tp->packets_out -= diff;
-
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
- tp->sacked_out -= diff;
- if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out -= diff;
-
- if (TCP_SKB_CB(skb)->sacked & TCPCB_LOST)
- tp->lost_out -= diff;
-
- /* Adjust Reno SACK estimate. */
- if (tcp_is_reno(tp) && diff > 0) {
- tcp_dec_pcount_approx_int(&tp->sacked_out, diff);
- tcp_verify_left_out(tp);
- }
- tcp_adjust_fackets_out(sk, skb, diff);
+ if (diff)
+ tcp_adjust_pcount(sk, skb, diff);
}
/* Link BUFF into the send queue. */
skb->len = skb->data_len;
}
+/* Remove acked data from a packet in the transmit queue. */
int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len)
{
if (skb_cloned(skb) && pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
* factor and mss.
*/
if (tcp_skb_pcount(skb) > 1)
- tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk, 1));
+ tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk));
return 0;
}
-/* Not accounting for SACKs here. */
+/* Calculate MSS. Not accounting for SACKs here. */
int tcp_mtu_to_mss(struct sock *sk, int pmtu)
{
struct tcp_sock *tp = tcp_sk(sk);
return mtu;
}
+/* MTU probing init per socket */
void tcp_mtup_init(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
icsk->icsk_mtup.probe_size = 0;
}
-/* Bound MSS / TSO packet size with the half of the window */
-static int tcp_bound_to_half_wnd(struct tcp_sock *tp, int pktsize)
-{
- if (tp->max_window && pktsize > (tp->max_window >> 1))
- return max(tp->max_window >> 1, 68U - tp->tcp_header_len);
- else
- return pktsize;
-}
-
/* This function synchronize snd mss to current pmtu/exthdr set.
tp->rx_opt.user_mss is mss set by user by TCP_MAXSEG. It does NOT counts
/* Compute the current effective MSS, taking SACKs and IP options,
* and even PMTU discovery events into account.
*/
-unsigned int tcp_current_mss(struct sock *sk, int large_allowed)
+unsigned int tcp_current_mss(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
u32 mss_now;
- u16 xmit_size_goal;
- int doing_tso = 0;
unsigned header_len;
struct tcp_out_options opts;
struct tcp_md5sig_key *md5;
mss_now = tp->mss_cache;
- if (large_allowed && sk_can_gso(sk))
- doing_tso = 1;
-
if (dst) {
u32 mtu = dst_mtu(dst);
if (mtu != inet_csk(sk)->icsk_pmtu_cookie)
mss_now -= delta;
}
- xmit_size_goal = mss_now;
-
- if (doing_tso) {
- xmit_size_goal = ((sk->sk_gso_max_size - 1) -
- inet_csk(sk)->icsk_af_ops->net_header_len -
- inet_csk(sk)->icsk_ext_hdr_len -
- tp->tcp_header_len);
-
- xmit_size_goal = tcp_bound_to_half_wnd(tp, xmit_size_goal);
- xmit_size_goal -= (xmit_size_goal % mss_now);
- }
- tp->xmit_size_goal = xmit_size_goal;
-
return mss_now;
}
return 0;
}
-/* This must be invoked the first time we consider transmitting
+/* Intialize TSO state of a skb.
+ * This must be invoked the first time we consider transmitting
* SKB onto the wire.
*/
static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb,
return tso_segs;
}
+/* Minshall's variant of the Nagle send check. */
static inline int tcp_minshall_check(const struct tcp_sock *tp)
{
return after(tp->snd_sml, tp->snd_una) &&
return cwnd_quota;
}
+/* Test if sending is allowed right now. */
int tcp_may_send_now(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb = tcp_send_head(sk);
return (skb &&
- tcp_snd_test(sk, skb, tcp_current_mss(sk, 1),
+ tcp_snd_test(sk, skb, tcp_current_mss(sk),
(tcp_skb_is_last(sk, skb) ?
tp->nonagle : TCP_NAGLE_PUSH)));
}
{
struct sk_buff *buff;
int nlen = skb->len - len;
- u16 flags;
+ u8 flags;
/* All of a TSO frame must be composed of paged data. */
if (skb->len != skb->data_len)
}
/* Create a new MTU probe if we are ready.
+ * MTU probe is regularly attempting to increase the path MTU by
+ * deliberately sending larger packets. This discovers routing
+ * changes resulting in larger path MTUs.
+ *
* Returns 0 if we should wait to probe (no cwnd available),
* 1 if a probe was sent,
* -1 otherwise
icsk->icsk_mtup.probe_size ||
inet_csk(sk)->icsk_ca_state != TCP_CA_Open ||
tp->snd_cwnd < 11 ||
- tp->rx_opt.eff_sacks)
+ tp->rx_opt.num_sacks || tp->rx_opt.dsack)
return -1;
/* Very simple search strategy: just double the MSS. */
- mss_now = tcp_current_mss(sk, 0);
+ mss_now = tcp_current_mss(sk);
probe_size = 2 * tp->mss_cache;
size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *next_skb = tcp_write_queue_next(sk, skb);
int skb_size, next_skb_size;
- u16 flags;
skb_size = skb->len;
next_skb_size = next_skb->len;
- flags = TCP_SKB_CB(skb)->flags;
BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
/* Update sequence range on original skb. */
TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(next_skb)->end_seq;
- /* Merge over control information. */
- flags |= TCP_SKB_CB(next_skb)->flags; /* This moves PSH/FIN etc. over */
- TCP_SKB_CB(skb)->flags = flags;
+ /* Merge over control information. This moves PSH/FIN etc. over */
+ TCP_SKB_CB(skb)->flags |= TCP_SKB_CB(next_skb)->flags;
/* All done, get rid of second SKB and account for it so
* packet counting does not break.
*/
TCP_SKB_CB(skb)->sacked |= TCP_SKB_CB(next_skb)->sacked & TCPCB_EVER_RETRANS;
- if (TCP_SKB_CB(next_skb)->sacked & TCPCB_SACKED_RETRANS)
- tp->retrans_out -= tcp_skb_pcount(next_skb);
- if (TCP_SKB_CB(next_skb)->sacked & TCPCB_LOST)
- tp->lost_out -= tcp_skb_pcount(next_skb);
- /* Reno case is special. Sigh... */
- if (tcp_is_reno(tp) && tp->sacked_out)
- tcp_dec_pcount_approx(&tp->sacked_out, next_skb);
-
- tcp_adjust_fackets_out(sk, next_skb, tcp_skb_pcount(next_skb));
- tp->packets_out -= tcp_skb_pcount(next_skb);
/* changed transmit queue under us so clear hints */
tcp_clear_retrans_hints_partial(tp);
if (next_skb == tp->retransmit_skb_hint)
tp->retransmit_skb_hint = skb;
+ tcp_adjust_pcount(sk, next_skb, tcp_skb_pcount(next_skb));
+
sk_wmem_free_skb(sk, next_skb);
}
+/* Check if coalescing SKBs is legal. */
static int tcp_can_collapse(struct sock *sk, struct sk_buff *skb)
{
if (tcp_skb_pcount(skb) > 1)
return 1;
}
+/* Collapse packets in the retransmit queue to make to create
+ * less packets on the wire. This is only done on retransmission.
+ */
static void tcp_retrans_try_collapse(struct sock *sk, struct sk_buff *to,
int space)
{
if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
return -EHOSTUNREACH; /* Routing failure or similar. */
- cur_mss = tcp_current_mss(sk, 0);
+ cur_mss = tcp_current_mss(sk);
/* If receiver has shrunk his window, and skb is out of
* new window, do not retransmit it. The exception is the
* case, when window is shrunk to zero. In this case
* our retransmit serves as a zero window probe.
*/
- if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))
- && TCP_SKB_CB(skb)->seq != tp->snd_una)
+ if (!before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp)) &&
+ TCP_SKB_CB(skb)->seq != tp->snd_una)
return -EAGAIN;
if (skb->len > cur_mss) {
if (tcp_fragment(sk, skb, cur_mss, cur_mss))
return -ENOMEM; /* We'll try again later. */
+ } else {
+ int oldpcount = tcp_skb_pcount(skb);
+
+ if (unlikely(oldpcount > 1)) {
+ tcp_init_tso_segs(sk, skb, cur_mss);
+ tcp_adjust_pcount(sk, skb, oldpcount - tcp_skb_pcount(skb));
+ }
}
tcp_retrans_try_collapse(sk, skb, cur_mss);
return err;
}
+/* Check if we forward retransmits are possible in the current
+ * window/congestion state.
+ */
static int tcp_can_forward_retransmit(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
* unsent frames. But be careful about outgoing SACKS
* and IP options.
*/
- mss_now = tcp_current_mss(sk, 1);
+ mss_now = tcp_current_mss(sk);
if (tcp_send_head(sk) != NULL) {
TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_FIN;
} else {
/* Socket is locked, keep trying until memory is available. */
for (;;) {
- skb = alloc_skb_fclone(MAX_TCP_HEADER, GFP_KERNEL);
+ skb = alloc_skb_fclone(MAX_TCP_HEADER,
+ sk->sk_allocation);
if (skb)
break;
yield();
TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTRSTS);
}
-/* WARNING: This routine must only be called when we have already sent
+/* Send a crossed SYN-ACK during socket establishment.
+ * WARNING: This routine must only be called when we have already sent
* a SYN packet that crossed the incoming SYN that caused this routine
* to get called. If this assumption fails then the initial rcv_wnd
* and rcv_wscale values will not be correct.
return tcp_transmit_skb(sk, skb, 1, GFP_ATOMIC);
}
-/*
- * Prepare a SYN-ACK.
- */
+/* Prepare a SYN-ACK. */
struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst,
- struct request_sock *req)
+ struct request_sock *req,
+ struct request_values *rvp)
{
+ struct tcp_out_options opts;
+ struct tcp_extend_values *xvp = tcp_xv(rvp);
struct inet_request_sock *ireq = inet_rsk(req);
struct tcp_sock *tp = tcp_sk(sk);
struct tcphdr *th;
- int tcp_header_size;
- struct tcp_out_options opts;
struct sk_buff *skb;
struct tcp_md5sig_key *md5;
- __u8 *md5_hash_location;
+ int tcp_header_size;
int mss;
skb = sock_wmalloc(sk, MAX_TCP_HEADER + 15, 1, GFP_ATOMIC);
/* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER);
- skb->dst = dst_clone(dst);
+ skb_dst_set(skb, dst_clone(dst));
mss = dst_metric(dst, RTAX_ADVMSS);
if (tp->rx_opt.user_mss && tp->rx_opt.user_mss < mss)
#endif
TCP_SKB_CB(skb)->when = tcp_time_stamp;
tcp_header_size = tcp_synack_options(sk, req, mss,
- skb, &opts, &md5) +
- sizeof(struct tcphdr);
+ skb, &opts, &md5, xvp)
+ + sizeof(*th);
skb_push(skb, tcp_header_size);
skb_reset_transport_header(skb);
*/
tcp_init_nondata_skb(skb, tcp_rsk(req)->snt_isn,
TCPCB_FLAG_SYN | TCPCB_FLAG_ACK);
+
+ if (OPTION_COOKIE_EXTENSION & opts.options) {
+ const struct tcp_cookie_values *cvp = tp->cookie_values;
+
+ if (cvp != NULL &&
+ cvp->s_data_constant &&
+ cvp->s_data_desired > 0) {
+ u8 *buf = skb_put(skb, cvp->s_data_desired);
+
+ /* copy data directly from the listening socket. */
+ memcpy(buf, cvp->s_data_payload, cvp->s_data_desired);
+ TCP_SKB_CB(skb)->end_seq += cvp->s_data_desired;
+ }
+
+ if (opts.hash_size > 0) {
+ __u32 workspace[SHA_WORKSPACE_WORDS];
+ u32 *mess = &xvp->cookie_bakery[COOKIE_DIGEST_WORDS];
+ u32 *tail = &mess[COOKIE_MESSAGE_WORDS-1];
+
+ /* Secret recipe depends on the Timestamp, (future)
+ * Sequence and Acknowledgment Numbers, Initiator
+ * Cookie, and others handled by IP variant caller.
+ */
+ *tail-- ^= opts.tsval;
+ *tail-- ^= tcp_rsk(req)->rcv_isn + 1;
+ *tail-- ^= TCP_SKB_CB(skb)->seq + 1;
+
+ /* recommended */
+ *tail-- ^= ((th->dest << 16) | th->source);
+ *tail-- ^= (u32)(unsigned long)cvp; /* per sockopt */
+
+ sha_transform((__u32 *)&xvp->cookie_bakery[0],
+ (char *)mess,
+ &workspace[0]);
+ opts.hash_location =
+ (__u8 *)&xvp->cookie_bakery[0];
+ }
+ }
+
th->seq = htonl(TCP_SKB_CB(skb)->seq);
th->ack_seq = htonl(tcp_rsk(req)->rcv_isn + 1);
/* RFC1323: The window in SYN & SYN/ACK segments is never scaled. */
th->window = htons(min(req->rcv_wnd, 65535U));
- tcp_options_write((__be32 *)(th + 1), tp, &opts, &md5_hash_location);
+ tcp_options_write((__be32 *)(th + 1), tp, &opts);
th->doff = (tcp_header_size >> 2);
TCP_INC_STATS(sock_net(sk), TCP_MIB_OUTSEGS);
#ifdef CONFIG_TCP_MD5SIG
/* Okay, we have all we need - do the md5 hash if needed */
if (md5) {
- tp->af_specific->calc_md5_hash(md5_hash_location,
+ tcp_rsk(req)->af_specific->calc_md5_hash(opts.hash_location,
md5, NULL, req, skb);
}
#endif
return skb;
}
-/*
- * Do all connect socket setups that can be done AF independent.
- */
+/* Do all connect socket setups that can be done AF independent. */
static void tcp_connect_init(struct sock *sk)
{
struct dst_entry *dst = __sk_dst_get(sk);
sk->sk_err = 0;
sock_reset_flag(sk, SOCK_DONE);
tp->snd_wnd = 0;
- tcp_init_wl(tp, tp->write_seq, 0);
+ tcp_init_wl(tp, 0);
tp->snd_una = tp->write_seq;
tp->snd_sml = tp->write_seq;
tp->snd_up = tp->write_seq;
tcp_clear_retrans(tp);
}
-/*
- * Build a SYN and send it off.
- */
+/* Build a SYN and send it off. */
int tcp_connect(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
sk->sk_wmem_queued += buff->truesize;
sk_mem_charge(sk, buff->truesize);
tp->packets_out += tcp_skb_pcount(buff);
- tcp_transmit_skb(sk, buff, 1, GFP_KERNEL);
+ tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
/* We change tp->snd_nxt after the tcp_transmit_skb() call
* in order to make this packet get counted in tcpOutSegs.
return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
}
+/* Initiate keepalive or window probe from timer. */
int tcp_write_wakeup(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
if ((skb = tcp_send_head(sk)) != NULL &&
before(TCP_SKB_CB(skb)->seq, tcp_wnd_end(tp))) {
int err;
- unsigned int mss = tcp_current_mss(sk, 0);
+ unsigned int mss = tcp_current_mss(sk);
unsigned int seg_size = tcp_wnd_end(tp) - TCP_SKB_CB(skb)->seq;
if (before(tp->pushed_seq, TCP_SKB_CB(skb)->end_seq))