X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fnetfilter%2Fnf_conntrack_proto_tcp.c;h=ad118053971ab27bfe8aa101938258c9e4f3718a;hb=9ab99d5a43e9f283738fd9fd365539306d13eaac;hp=7a3f64c1aca6f346581c0ef0bc54427f2fb75d14;hpb=a57793651ff1a09ef18bade998632435ca2dc13f;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 7a3f64c..ad11805 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -24,9 +25,9 @@ #include #include #include - -/* Protects conntrack->proto.tcp */ -static DEFINE_RWLOCK(tcp_lock); +#include +#include +#include /* "Be conservative in what you do, be liberal in what you accept from others." @@ -45,7 +46,7 @@ static int nf_ct_tcp_max_retrans __read_mostly = 3; /* FIXME: Examine ipfilter's timeouts and conntrack transitions more closely. They're more complex. --RR */ -static const char *tcp_conntrack_names[] = { +static const char *const tcp_conntrack_names[] = { "NONE", "SYN_SENT", "SYN_RECV", @@ -55,7 +56,7 @@ static const char *tcp_conntrack_names[] = { "LAST_ACK", "TIME_WAIT", "CLOSE", - "LISTEN" + "SYN_SENT2", }; #define SECS * HZ @@ -63,32 +64,23 @@ static const char *tcp_conntrack_names[] = { #define HOURS * 60 MINS #define DAYS * 24 HOURS -static unsigned int nf_ct_tcp_timeout_syn_sent __read_mostly = 2 MINS; -static unsigned int nf_ct_tcp_timeout_syn_recv __read_mostly = 60 SECS; -static unsigned int nf_ct_tcp_timeout_established __read_mostly = 5 DAYS; -static unsigned int nf_ct_tcp_timeout_fin_wait __read_mostly = 2 MINS; -static unsigned int nf_ct_tcp_timeout_close_wait __read_mostly = 60 SECS; -static unsigned int nf_ct_tcp_timeout_last_ack __read_mostly = 30 SECS; -static unsigned int nf_ct_tcp_timeout_time_wait __read_mostly = 2 MINS; -static unsigned int nf_ct_tcp_timeout_close __read_mostly = 10 SECS; - /* RFC1122 says the R2 limit should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO. */ -static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS; - -static unsigned int * tcp_timeouts[] = { - NULL, /* TCP_CONNTRACK_NONE */ - &nf_ct_tcp_timeout_syn_sent, /* TCP_CONNTRACK_SYN_SENT, */ - &nf_ct_tcp_timeout_syn_recv, /* TCP_CONNTRACK_SYN_RECV, */ - &nf_ct_tcp_timeout_established, /* TCP_CONNTRACK_ESTABLISHED, */ - &nf_ct_tcp_timeout_fin_wait, /* TCP_CONNTRACK_FIN_WAIT, */ - &nf_ct_tcp_timeout_close_wait, /* TCP_CONNTRACK_CLOSE_WAIT, */ - &nf_ct_tcp_timeout_last_ack, /* TCP_CONNTRACK_LAST_ACK, */ - &nf_ct_tcp_timeout_time_wait, /* TCP_CONNTRACK_TIME_WAIT, */ - &nf_ct_tcp_timeout_close, /* TCP_CONNTRACK_CLOSE, */ - NULL, /* TCP_CONNTRACK_LISTEN */ - }; +static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS; +static unsigned int nf_ct_tcp_timeout_unacknowledged __read_mostly = 5 MINS; + +static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = { + [TCP_CONNTRACK_SYN_SENT] = 2 MINS, + [TCP_CONNTRACK_SYN_RECV] = 60 SECS, + [TCP_CONNTRACK_ESTABLISHED] = 5 DAYS, + [TCP_CONNTRACK_FIN_WAIT] = 2 MINS, + [TCP_CONNTRACK_CLOSE_WAIT] = 60 SECS, + [TCP_CONNTRACK_LAST_ACK] = 30 SECS, + [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, + [TCP_CONNTRACK_CLOSE] = 10 SECS, + [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, +}; #define sNO TCP_CONNTRACK_NONE #define sSS TCP_CONNTRACK_SYN_SENT @@ -99,7 +91,7 @@ static unsigned int * tcp_timeouts[] = { #define sLA TCP_CONNTRACK_LAST_ACK #define sTW TCP_CONNTRACK_TIME_WAIT #define sCL TCP_CONNTRACK_CLOSE -#define sLI TCP_CONNTRACK_LISTEN +#define sS2 TCP_CONNTRACK_SYN_SENT2 #define sIV TCP_CONNTRACK_MAX #define sIG TCP_CONNTRACK_IGNORE @@ -129,15 +121,14 @@ enum tcp_bit_set { * * NONE: initial state * SYN_SENT: SYN-only packet seen + * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open * SYN_RECV: SYN-ACK packet seen * ESTABLISHED: ACK packet seen * FIN_WAIT: FIN packet seen * CLOSE_WAIT: ACK seen (after FIN) * LAST_ACK: FIN seen (after FIN) * TIME_WAIT: last ACK seen - * CLOSE: closed connection - * - * LISTEN state is not used. + * CLOSE: closed connection (RST) * * Packets marked as IGNORED (sIG): * if they may be either invalid or valid @@ -145,18 +136,18 @@ enum tcp_bit_set { * closing RST or a SYN/ACK. * * Packets marked as INVALID (sIV): - * if they are invalid - * or we do not support the request (simultaneous open) + * if we regard them as truly invalid packets */ -static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { +static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 }, /* * sNO -> sSS Initialize a new connection * sSS -> sSS Retransmitted SYN - * sSR -> sIG Late retransmitted SYN? + * sS2 -> sS2 Late retransmitted SYN + * sSR -> sIG * sES -> sIG Error: SYNs in window outside the SYN_SENT state * are errors. Receiver will reply with RST * and close the connection. @@ -167,22 +158,30 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sSS Reopened connection (RFC 1122). * sCL -> sSS */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* - * A SYN/ACK from the client is always invalid: - * - either it tries to set up a simultaneous open, which is - * not supported; - * - or the firewall has just been inserted between the two hosts - * during the session set-up. The SYN will be retransmitted - * by the true client (or it'll time out). + * sNO -> sIV Too late and no reason to do anything + * sSS -> sIV Client can't send SYN and then SYN/ACK + * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open + * sSR -> sIG + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sNO -> sIV Too late and no reason to do anything... * sSS -> sIV Client migth not send FIN in this state: * we enforce waiting for a SYN/ACK reply first. + * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions, waiting for @@ -193,11 +192,12 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* * sNO -> sES Assumed. * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sS2 -> sIV * sSR -> sES Established state is reached. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -206,29 +206,31 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } }, { /* REPLY */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sS2 }, /* * sNO -> sIV Never reached. - * sSS -> sIV Simultaneous open, not supported - * sSR -> sIV Simultaneous open, not supported. - * sES -> sIV Server may not initiate a connection. + * sSS -> sS2 Simultaneous open + * sS2 -> sS2 Retransmitted simultaneous SYN + * sSR -> sIV Invalid SYN packets sent by the server + * sES -> sIV * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sIV Reopened connection, but server may not do it. * sCL -> sIV */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* * sSS -> sSR Standard open. + * sS2 -> sSR Simultaneous open * sSR -> sSR Retransmitted SYN/ACK. * sES -> sIG Late retransmitted SYN/ACK? * sFW -> sIG Might be SYN/ACK answering ignored SYN @@ -237,10 +239,11 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sIG * sCL -> sIG */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sSS -> sIV Server might not send FIN in this state. + * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions. @@ -249,10 +252,11 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG }, /* * sSS -> sIG Might be a half-open connection. + * sS2 -> sIG * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -261,35 +265,35 @@ static enum tcp_conntrack tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } } }; -static int tcp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct nf_conntrack_tuple *tuple) +static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) { - struct tcphdr _hdr, *hp; + const struct tcphdr *hp; + struct tcphdr _hdr; /* Actually only need first 8 bytes. */ hp = skb_header_pointer(skb, dataoff, 8, &_hdr); if (hp == NULL) - return 0; + return false; tuple->src.u.tcp.port = hp->source; tuple->dst.u.tcp.port = hp->dest; - return 1; + return true; } -static int tcp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { tuple->src.u.tcp.port = orig->dst.u.tcp.port; tuple->dst.u.tcp.port = orig->src.u.tcp.port; - return 1; + return true; } /* Print out the per-protocol part of the tuple. */ @@ -302,14 +306,13 @@ static int tcp_print_tuple(struct seq_file *s, } /* Print out the private part of the conntrack. */ -static int tcp_print_conntrack(struct seq_file *s, - const struct nf_conn *conntrack) +static int tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { enum tcp_conntrack state; - read_lock_bh(&tcp_lock); - state = conntrack->proto.tcp.state; - read_unlock_bh(&tcp_lock); + spin_lock_bh(&ct->lock); + state = ct->proto.tcp.state; + spin_unlock_bh(&ct->lock); return seq_printf(s, "%s ", tcp_conntrack_names[state]); } @@ -342,19 +345,20 @@ static unsigned int get_conntrack_index(const struct tcphdr *tcph) I. Upper bound for valid data: seq <= sender.td_maxend II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin - III. Upper bound for valid ack: sack <= receiver.td_end - IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW + III. Upper bound for valid (s)ack: sack <= receiver.td_end + IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW - where sack is the highest right edge of sack block found in the packet. + where sack is the highest right edge of sack block found in the packet + or ack in the case of packet without SACK option. - The upper bound limit for a valid ack is not ignored - + The upper bound limit for a valid (s)ack is not ignored - we doesn't have to deal with fragments. */ static inline __u32 segment_seq_plus_len(__u32 seq, size_t len, unsigned int dataoff, - struct tcphdr *tcph) + const struct tcphdr *tcph) { /* XXX Should I use payload length field in IP/IPv6 header ? * - YK */ @@ -373,11 +377,11 @@ static inline __u32 segment_seq_plus_len(__u32 seq, */ static void tcp_options(const struct sk_buff *skb, unsigned int dataoff, - struct tcphdr *tcph, + const struct tcphdr *tcph, struct ip_ct_tcp_state *state) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; - unsigned char *ptr; + const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); if (!length) @@ -428,10 +432,10 @@ static void tcp_options(const struct sk_buff *skb, } static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, - struct tcphdr *tcph, __u32 *sack) + const struct tcphdr *tcph, __u32 *sack) { unsigned char buff[(15 * 4) - sizeof(struct tcphdr)]; - unsigned char *ptr; + const unsigned char *ptr; int length = (tcph->doff*4) - sizeof(struct tcphdr); __u32 tmp; @@ -475,7 +479,7 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, for (i = 0; i < (opsize - TCPOLEN_SACK_BASE); i += TCPOLEN_SACK_PERBLOCK) { - tmp = ntohl(*((__be32 *)(ptr+i)+1)); + tmp = get_unaligned_be32((__be32 *)(ptr+i)+1); if (after(tmp, *sack)) *sack = tmp; @@ -488,20 +492,37 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } -static int tcp_in_window(struct nf_conn *ct, - struct ip_ct_tcp *state, - enum ip_conntrack_dir dir, - unsigned int index, - const struct sk_buff *skb, - unsigned int dataoff, - struct tcphdr *tcph, - int pf) +#ifdef CONFIG_NF_NAT_NEEDED +static inline s16 nat_offset(const struct nf_conn *ct, + enum ip_conntrack_dir dir, + u32 seq) +{ + typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset); + + return get_offset != NULL ? get_offset(ct, dir, seq) : 0; +} +#define NAT_OFFSET(pf, ct, dir, seq) \ + (pf == NFPROTO_IPV4 ? nat_offset(ct, dir, seq) : 0) +#else +#define NAT_OFFSET(pf, ct, dir, seq) 0 +#endif + +static bool tcp_in_window(const struct nf_conn *ct, + struct ip_ct_tcp *state, + enum ip_conntrack_dir dir, + unsigned int index, + const struct sk_buff *skb, + unsigned int dataoff, + const struct tcphdr *tcph, + u_int8_t pf) { + struct net *net = nf_ct_net(ct); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; - struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; + const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; - int res; + s16 receiver_offset; + bool res; /* * Get the required data from the packet. @@ -514,11 +535,16 @@ static int tcp_in_window(struct nf_conn *ct, if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM) tcp_sack(skb, dataoff, tcph, &sack); + /* Take into account NAT sequence number mangling */ + receiver_offset = NAT_OFFSET(pf, ct, !dir, ack - 1); + ack -= receiver_offset; + sack -= receiver_offset; + pr_debug("tcp_in_window: START\n"); pr_debug("tcp_in_window: "); - NF_CT_DUMP_TUPLE(tuple); - pr_debug("seq=%u ack=%u sack=%u win=%u end=%u\n", - seq, ack, sack, win, end); + nf_ct_dump_tuple(tuple); + pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", + seq, ack, receiver_offset, sack, receiver_offset, win, end); pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, @@ -526,13 +552,14 @@ static int tcp_in_window(struct nf_conn *ct, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); - if (sender->td_end == 0) { + if (sender->td_maxwin == 0) { /* * Initialize sender data. */ - if (tcph->syn && tcph->ack) { + if (tcph->syn) { /* - * Outgoing SYN-ACK in reply to a SYN. + * SYN-ACK in reply to a SYN + * or SYN from reply direction in simultaneous open. */ sender->td_end = sender->td_maxend = end; @@ -548,6 +575,9 @@ static int tcp_in_window(struct nf_conn *ct, && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) sender->td_scale = receiver->td_scale = 0; + if (!tcph->ack) + /* Simultaneous open */ + return true; } else { /* * We are in the middle of a connection, @@ -603,9 +633,9 @@ static int tcp_in_window(struct nf_conn *ct, seq = end = sender->td_end; pr_debug("tcp_in_window: "); - NF_CT_DUMP_TUPLE(tuple); - pr_debug("seq=%u ack=%u sack =%u win=%u end=%u\n", - seq, ack, sack, win, end); + nf_ct_dump_tuple(tuple); + pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n", + seq, ack, receiver_offset, sack, receiver_offset, win, end); pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, @@ -617,12 +647,12 @@ static int tcp_in_window(struct nf_conn *ct, before(seq, sender->td_maxend + 1), after(end, sender->td_end - receiver->td_maxwin - 1), before(sack, receiver->td_end + 1), - after(ack, receiver->td_end - MAXACKWINDOW(sender))); + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); if (before(seq, sender->td_maxend + 1) && after(end, sender->td_end - receiver->td_maxwin - 1) && before(sack, receiver->td_end + 1) && - after(ack, receiver->td_end - MAXACKWINDOW(sender))) { + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { /* * Take into account window scaling (RFC 1323). */ @@ -635,8 +665,18 @@ static int tcp_in_window(struct nf_conn *ct, swin = win + (sack - ack); if (sender->td_maxwin < swin) sender->td_maxwin = swin; - if (after(end, sender->td_end)) + if (after(end, sender->td_end)) { sender->td_end = end; + sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + } + if (tcph->ack) { + if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { + sender->td_maxack = ack; + sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; + } else if (after(ack, sender->td_maxack)) + sender->td_maxack = ack; + } + /* * Update receiver data. */ @@ -647,6 +687,8 @@ static int tcp_in_window(struct nf_conn *ct, if (win == 0) receiver->td_maxend++; } + if (ack == receiver->td_end) + receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; /* * Check retransmissions. @@ -667,26 +709,26 @@ static int tcp_in_window(struct nf_conn *ct, state->retrans = 0; } } - res = 1; + res = true; } else { - res = 0; + res = false; if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || nf_ct_tcp_be_liberal) - res = 1; - if (!res && LOG_INVALID(IPPROTO_TCP)) + res = true; + if (!res && LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? after(end, sender->td_end - receiver->td_maxwin - 1) ? before(sack, receiver->td_end + 1) ? - after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG" + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" : "ACK is under the lower bound (possible overly delayed ACK)" : "ACK is over the upper bound (ACKed data not seen yet)" : "SEQ is under the lower bound (already ACKed data retransmitted)" : "SEQ is over the upper bound (over the window of the receiver)"); } - pr_debug("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " + pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " "receiver end=%u maxend=%u maxwin=%u\n", res, sender->td_end, sender->td_maxend, sender->td_maxwin, receiver->td_end, receiver->td_maxend, receiver->td_maxwin); @@ -694,39 +736,6 @@ static int tcp_in_window(struct nf_conn *ct, return res; } -#ifdef CONFIG_NF_NAT_NEEDED -/* Update sender->td_end after NAT successfully mangled the packet */ -/* Caller must linearize skb at tcp header. */ -void nf_conntrack_tcp_update(struct sk_buff *skb, - unsigned int dataoff, - struct nf_conn *conntrack, - int dir) -{ - struct tcphdr *tcph = (void *)skb->data + dataoff; - struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[dir]; - struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[!dir]; - __u32 end; - - end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph); - - write_lock_bh(&tcp_lock); - /* - * We have to worry for the ack in the reply packet only... - */ - if (after(end, conntrack->proto.tcp.seen[dir].td_end)) - conntrack->proto.tcp.seen[dir].td_end = end; - conntrack->proto.tcp.last_end = end; - write_unlock_bh(&tcp_lock); - pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " - "receiver end=%u maxend=%u maxwin=%u scale=%i\n", - sender->td_end, sender->td_maxend, sender->td_maxwin, - sender->td_scale, - receiver->td_end, receiver->td_maxend, receiver->td_maxwin, - receiver->td_scale); -} -EXPORT_SYMBOL_GPL(nf_conntrack_tcp_update); -#endif - #define TH_FIN 0x01 #define TH_SYN 0x02 #define TH_RST 0x04 @@ -737,7 +746,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_tcp_update); #define TH_CWR 0x80 /* table of valid flag combinations - PUSH, ECE and CWR are always valid */ -static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] = +static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] = { [TH_SYN] = 1, [TH_SYN|TH_URG] = 1, @@ -751,20 +760,22 @@ static u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] = }; /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ -static int tcp_error(struct sk_buff *skb, +static int tcp_error(struct net *net, + struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { - struct tcphdr _tcph, *th; + const struct tcphdr *th; + struct tcphdr _tcph; unsigned int tcplen = skb->len - dataoff; u_int8_t tcpflags; /* Smaller that minimal TCP header? */ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: short packet "); return -NF_ACCEPT; @@ -772,7 +783,7 @@ static int tcp_error(struct sk_buff *skb, /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; @@ -783,11 +794,9 @@ static int tcp_error(struct sk_buff *skb, * because the checksum is assumed to be correct. */ /* FIXME: Source route IP option packets --RR */ - if (nf_conntrack_checksum && - ((pf == PF_INET && hooknum == NF_IP_PRE_ROUTING) || - (pf == PF_INET6 && hooknum == NF_IP6_PRE_ROUTING)) && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; @@ -796,7 +805,7 @@ static int tcp_error(struct sk_buff *skb, /* Check TCP flags. */ tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR|TH_PUSH)); if (!tcp_valid_flags[tcpflags]) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; @@ -806,102 +815,166 @@ static int tcp_error(struct sk_buff *skb, } /* Returns verdict for packet, or -1 for invalid. */ -static int tcp_packet(struct nf_conn *conntrack, +static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; enum ip_conntrack_dir dir; - struct tcphdr *th, _tcph; + const struct tcphdr *th; + struct tcphdr _tcph; unsigned long timeout; unsigned int index; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); - write_lock_bh(&tcp_lock); - old_state = conntrack->proto.tcp.state; + spin_lock_bh(&ct->lock); + old_state = ct->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); new_state = tcp_conntracks[dir][index][old_state]; - tuple = &conntrack->tuplehash[dir].tuple; + tuple = &ct->tuplehash[dir].tuple; switch (new_state) { case TCP_CONNTRACK_SYN_SENT: if (old_state < TCP_CONNTRACK_TIME_WAIT) break; - if ((conntrack->proto.tcp.seen[!dir].flags & - IP_CT_TCP_FLAG_CLOSE_INIT) - || (conntrack->proto.tcp.last_dir == dir - && conntrack->proto.tcp.last_index == TCP_RST_SET)) { + /* RFC 1122: "When a connection is closed actively, + * it MUST linger in TIME-WAIT state for a time 2xMSL + * (Maximum Segment Lifetime). However, it MAY accept + * a new SYN from the remote TCP to reopen the connection + * directly from TIME-WAIT state, if..." + * We ignore the conditions because we are in the + * TIME-WAIT state anyway. + * + * Handle aborted connections: we and the server + * think there is an existing connection but the client + * aborts it and starts a new one. + */ + if (((ct->proto.tcp.seen[dir].flags + | ct->proto.tcp.seen[!dir].flags) + & IP_CT_TCP_FLAG_CLOSE_INIT) + || (ct->proto.tcp.last_dir == dir + && ct->proto.tcp.last_index == TCP_RST_SET)) { /* Attempt to reopen a closed/aborted connection. * Delete this connection and look up again. */ - write_unlock_bh(&tcp_lock); - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long) - conntrack); - return -NF_REPEAT; + spin_unlock_bh(&ct->lock); + + /* Only repeat if we can actually remove the timer. + * Destruction may already be in progress in process + * context and we must give it a chance to terminate. + */ + if (nf_ct_kill(ct)) + return -NF_REPEAT; + return NF_DROP; } /* Fall through */ case TCP_CONNTRACK_IGNORE: /* Ignored packets: * + * Our connection entry may be out of sync, so ignore + * packets which may signal the real connection between + * the client and the server. + * * a) SYN in ORIGINAL * b) SYN/ACK in REPLY * c) ACK in reply direction after initial SYN in original. + * + * If the ignored packet is invalid, the receiver will send + * a RST we'll catch below. */ if (index == TCP_SYNACK_SET - && conntrack->proto.tcp.last_index == TCP_SYN_SET - && conntrack->proto.tcp.last_dir != dir - && ntohl(th->ack_seq) == - conntrack->proto.tcp.last_end) { - /* This SYN/ACK acknowledges a SYN that we earlier + && ct->proto.tcp.last_index == TCP_SYN_SET + && ct->proto.tcp.last_dir != dir + && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { + /* b) This SYN/ACK acknowledges a SYN that we earlier * ignored as invalid. This means that the client and * the server are both in sync, while the firewall is - * not. We kill this session and block the SYN/ACK so - * that the client cannot but retransmit its SYN and - * thus initiate a clean new session. + * not. We get in sync from the previously annotated + * values. */ - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) - nf_log_packet(pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: killing out of sync session "); - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long) - conntrack); - return -NF_DROP; + old_state = TCP_CONNTRACK_SYN_SENT; + new_state = TCP_CONNTRACK_SYN_RECV; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end = + ct->proto.tcp.last_end; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend = + ct->proto.tcp.last_end; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin = + ct->proto.tcp.last_win == 0 ? + 1 : ct->proto.tcp.last_win; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale = + ct->proto.tcp.last_wscale; + ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags = + ct->proto.tcp.last_flags; + memset(&ct->proto.tcp.seen[dir], 0, + sizeof(struct ip_ct_tcp_state)); + break; } - conntrack->proto.tcp.last_index = index; - conntrack->proto.tcp.last_dir = dir; - conntrack->proto.tcp.last_seq = ntohl(th->seq); - conntrack->proto.tcp.last_end = + ct->proto.tcp.last_index = index; + ct->proto.tcp.last_dir = dir; + ct->proto.tcp.last_seq = ntohl(th->seq); + ct->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); - - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) + ct->proto.tcp.last_win = ntohs(th->window); + + /* a) This is a SYN in ORIGINAL. The client and the server + * may be in sync but we are not. In that case, we annotate + * the TCP options and let the packet go through. If it is a + * valid SYN packet, the server will reply with a SYN/ACK, and + * then we'll get in sync. Otherwise, the server ignores it. */ + if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) { + struct ip_ct_tcp_state seen = {}; + + ct->proto.tcp.last_flags = + ct->proto.tcp.last_wscale = 0; + tcp_options(skb, dataoff, th, &seen); + if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { + ct->proto.tcp.last_flags |= + IP_CT_TCP_FLAG_WINDOW_SCALE; + ct->proto.tcp.last_wscale = seen.td_scale; + } + if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) { + ct->proto.tcp.last_flags |= + IP_CT_TCP_FLAG_SACK_PERM; + } + } + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, - "nf_ct_tcp: invalid packed ignored "); + "nf_ct_tcp: invalid packet ignored "); return NF_ACCEPT; case TCP_CONNTRACK_MAX: /* Invalid packet */ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET - && ((test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_SYN_SET) - || (!test_bit(IPS_ASSURED_BIT, &conntrack->status) - && conntrack->proto.tcp.last_index == TCP_ACK_SET)) - && ntohl(th->ack_seq) == conntrack->proto.tcp.last_end) { + && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) + && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) { + /* Invalid RST */ + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid RST "); + return -NF_ACCEPT; + } + if (index == TCP_RST_SET + && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) + && ct->proto.tcp.last_index == TCP_SYN_SET) + || (!test_bit(IPS_ASSURED_BIT, &ct->status) + && ct->proto.tcp.last_index == TCP_ACK_SET)) + && ntohl(th->ack_seq) == ct->proto.tcp.last_end) { /* RST sent to invalid SYN or ACK we had let through * at a) and c) above: * @@ -919,72 +992,75 @@ static int tcp_packet(struct nf_conn *conntrack, break; } - if (!tcp_in_window(conntrack, &conntrack->proto.tcp, dir, index, + if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, skb, dataoff, th, pf)) { - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->lock); return -NF_ACCEPT; } in_window: /* From now on we have got in-window packets */ - conntrack->proto.tcp.last_index = index; - conntrack->proto.tcp.last_dir = dir; + ct->proto.tcp.last_index = index; + ct->proto.tcp.last_dir = dir; pr_debug("tcp_conntracks: "); - NF_CT_DUMP_TUPLE(tuple); + nf_ct_dump_tuple(tuple); pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", (th->syn ? 1 : 0), (th->ack ? 1 : 0), (th->fin ? 1 : 0), (th->rst ? 1 : 0), old_state, new_state); - conntrack->proto.tcp.state = new_state; + ct->proto.tcp.state = new_state; if (old_state != new_state - && (new_state == TCP_CONNTRACK_FIN_WAIT - || new_state == TCP_CONNTRACK_CLOSE)) - conntrack->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; - timeout = conntrack->proto.tcp.retrans >= nf_ct_tcp_max_retrans - && *tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans - ? nf_ct_tcp_timeout_max_retrans : *tcp_timeouts[new_state]; - write_unlock_bh(&tcp_lock); - - nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + && new_state == TCP_CONNTRACK_FIN_WAIT) + ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; + + if (ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans && + tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans) + timeout = nf_ct_tcp_timeout_max_retrans; + else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) & + IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && + tcp_timeouts[new_state] > nf_ct_tcp_timeout_unacknowledged) + timeout = nf_ct_tcp_timeout_unacknowledged; + else + timeout = tcp_timeouts[new_state]; + spin_unlock_bh(&ct->lock); + if (new_state != old_state) - nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); - if (!test_bit(IPS_SEEN_REPLY_BIT, &conntrack->status)) { + if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { /* If only reply is a RST, we can consider ourselves not to have an established connection: this is a fairly common problem case, so we can delete the conntrack immediately. --RR */ if (th->rst) { - if (del_timer(&conntrack->timeout)) - conntrack->timeout.function((unsigned long) - conntrack); + nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } - } else if (!test_bit(IPS_ASSURED_BIT, &conntrack->status) + } else if (!test_bit(IPS_ASSURED_BIT, &ct->status) && (old_state == TCP_CONNTRACK_SYN_RECV || old_state == TCP_CONNTRACK_ESTABLISHED) && new_state == TCP_CONNTRACK_ESTABLISHED) { /* Set ASSURED if we see see valid ack in ESTABLISHED after SYN_RECV or a valid answer for a picked up connection. */ - set_bit(IPS_ASSURED_BIT, &conntrack->status); - nf_conntrack_event_cache(IPCT_STATUS, skb); + set_bit(IPS_ASSURED_BIT, &ct->status); + nf_conntrack_event_cache(IPCT_ASSURED, ct); } - nf_ct_refresh_acct(conntrack, ctinfo, skb, timeout); + nf_ct_refresh_acct(ct, ctinfo, skb, timeout); return NF_ACCEPT; } /* Called when a new connection for this protocol found. */ -static int tcp_new(struct nf_conn *conntrack, - const struct sk_buff *skb, - unsigned int dataoff) +static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) { enum tcp_conntrack new_state; - struct tcphdr *th, _tcph; - struct ip_ct_tcp_state *sender = &conntrack->proto.tcp.seen[0]; - struct ip_ct_tcp_state *receiver = &conntrack->proto.tcp.seen[1]; + const struct tcphdr *th; + struct tcphdr _tcph; + const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0]; + const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1]; th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); @@ -997,57 +1073,57 @@ static int tcp_new(struct nf_conn *conntrack, /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { pr_debug("nf_ct_tcp: invalid new deleting.\n"); - return 0; + return false; } if (new_state == TCP_CONNTRACK_SYN_SENT) { /* SYN packet */ - conntrack->proto.tcp.seen[0].td_end = + ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); - conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); - if (conntrack->proto.tcp.seen[0].td_maxwin == 0) - conntrack->proto.tcp.seen[0].td_maxwin = 1; - conntrack->proto.tcp.seen[0].td_maxend = - conntrack->proto.tcp.seen[0].td_end; - - tcp_options(skb, dataoff, th, &conntrack->proto.tcp.seen[0]); - conntrack->proto.tcp.seen[1].flags = 0; + ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (ct->proto.tcp.seen[0].td_maxwin == 0) + ct->proto.tcp.seen[0].td_maxwin = 1; + ct->proto.tcp.seen[0].td_maxend = + ct->proto.tcp.seen[0].td_end; + + tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]); + ct->proto.tcp.seen[1].flags = 0; } else if (nf_ct_tcp_loose == 0) { /* Don't try to pick up connections. */ - return 0; + return false; } else { /* * We are in the middle of a connection, * its history is lost for us. * Let's try to use the data from the packet. */ - conntrack->proto.tcp.seen[0].td_end = + ct->proto.tcp.seen[0].td_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); - conntrack->proto.tcp.seen[0].td_maxwin = ntohs(th->window); - if (conntrack->proto.tcp.seen[0].td_maxwin == 0) - conntrack->proto.tcp.seen[0].td_maxwin = 1; - conntrack->proto.tcp.seen[0].td_maxend = - conntrack->proto.tcp.seen[0].td_end + - conntrack->proto.tcp.seen[0].td_maxwin; - conntrack->proto.tcp.seen[0].td_scale = 0; + ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window); + if (ct->proto.tcp.seen[0].td_maxwin == 0) + ct->proto.tcp.seen[0].td_maxwin = 1; + ct->proto.tcp.seen[0].td_maxend = + ct->proto.tcp.seen[0].td_end + + ct->proto.tcp.seen[0].td_maxwin; + ct->proto.tcp.seen[0].td_scale = 0; /* We assume SACK and liberal window checking to handle * window scaling */ - conntrack->proto.tcp.seen[0].flags = - conntrack->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | - IP_CT_TCP_FLAG_BE_LIBERAL; + ct->proto.tcp.seen[0].flags = + ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM | + IP_CT_TCP_FLAG_BE_LIBERAL; } - conntrack->proto.tcp.seen[1].td_end = 0; - conntrack->proto.tcp.seen[1].td_maxend = 0; - conntrack->proto.tcp.seen[1].td_maxwin = 1; - conntrack->proto.tcp.seen[1].td_scale = 0; + ct->proto.tcp.seen[1].td_end = 0; + ct->proto.tcp.seen[1].td_maxend = 0; + ct->proto.tcp.seen[1].td_maxwin = 0; + ct->proto.tcp.seen[1].td_scale = 0; /* tcp_packet will set them */ - conntrack->proto.tcp.state = TCP_CONNTRACK_NONE; - conntrack->proto.tcp.last_index = TCP_NONE_SET; + ct->proto.tcp.state = TCP_CONNTRACK_NONE; + ct->proto.tcp.last_index = TCP_NONE_SET; pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", @@ -1055,7 +1131,7 @@ static int tcp_new(struct nf_conn *conntrack, sender->td_scale, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); - return 1; + return true; } #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) @@ -1064,24 +1140,23 @@ static int tcp_new(struct nf_conn *conntrack, #include static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; - read_lock_bh(&tcp_lock); + spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; - NLA_PUT(skb, CTA_PROTOINFO_TCP_STATE, sizeof(u_int8_t), - &ct->proto.tcp.state); + NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state); - NLA_PUT(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, sizeof(u_int8_t), - &ct->proto.tcp.seen[0].td_scale); + NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL, + ct->proto.tcp.seen[0].td_scale); - NLA_PUT(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, sizeof(u_int8_t), - &ct->proto.tcp.seen[1].td_scale); + NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY, + ct->proto.tcp.seen[1].td_scale); tmp.flags = ct->proto.tcp.seen[0].flags; NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL, @@ -1090,14 +1165,14 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, tmp.flags = ct->proto.tcp.seen[1].flags; NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp); - read_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: - read_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->lock); return -1; } @@ -1111,25 +1186,26 @@ static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = { static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) { - struct nlattr *attr = cda[CTA_PROTOINFO_TCP]; + struct nlattr *pattr = cda[CTA_PROTOINFO_TCP]; struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1]; int err; /* updates could not contain anything about the private * protocol info, in that case skip the parsing */ - if (!attr) + if (!pattr) return 0; - err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, attr, tcp_nla_policy); + err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy); if (err < 0) return err; - if (!tb[CTA_PROTOINFO_TCP_STATE]) + if (tb[CTA_PROTOINFO_TCP_STATE] && + nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) return -EINVAL; - write_lock_bh(&tcp_lock); - ct->proto.tcp.state = - *(u_int8_t *)nla_data(tb[CTA_PROTOINFO_TCP_STATE]); + spin_lock_bh(&ct->lock); + if (tb[CTA_PROTOINFO_TCP_STATE]) + ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { struct nf_ct_tcp_flags *attr = @@ -1149,15 +1225,26 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] && ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE && ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) { - ct->proto.tcp.seen[0].td_scale = *(u_int8_t *) - nla_data(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]); - ct->proto.tcp.seen[1].td_scale = *(u_int8_t *) - nla_data(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); + ct->proto.tcp.seen[0].td_scale = + nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]); + ct->proto.tcp.seen[1].td_scale = + nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); } - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->lock); return 0; } + +static int tcp_nlattr_size(void) +{ + return nla_total_size(0) /* CTA_PROTOINFO_TCP */ + + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1); +} + +static int tcp_nlattr_tuple_size(void) +{ + return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); +} #endif #ifdef CONFIG_SYSCTL @@ -1166,188 +1253,192 @@ static struct ctl_table_header *tcp_sysctl_header; static struct ctl_table tcp_sysctl_table[] = { { .procname = "nf_conntrack_tcp_timeout_syn_sent", - .data = &nf_ct_tcp_timeout_syn_sent, + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_syn_recv", - .data = &nf_ct_tcp_timeout_syn_recv, + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_established", - .data = &nf_ct_tcp_timeout_established, + .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_fin_wait", - .data = &nf_ct_tcp_timeout_fin_wait, + .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_close_wait", - .data = &nf_ct_tcp_timeout_close_wait, + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_last_ack", - .data = &nf_ct_tcp_timeout_last_ack, + .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_time_wait", - .data = &nf_ct_tcp_timeout_time_wait, + .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_close", - .data = &nf_ct_tcp_timeout_close, + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_max_retrans", .data = &nf_ct_tcp_timeout_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_unacknowledged", + .data = &nf_ct_tcp_timeout_unacknowledged, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_LOOSE, .procname = "nf_conntrack_tcp_loose", .data = &nf_ct_tcp_loose, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_BE_LIBERAL, .procname = "nf_conntrack_tcp_be_liberal", .data = &nf_ct_tcp_be_liberal, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_NF_CONNTRACK_TCP_MAX_RETRANS, .procname = "nf_conntrack_tcp_max_retrans", .data = &nf_ct_tcp_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { - .ctl_name = 0 - } + { } }; #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT static struct ctl_table tcp_compat_sysctl_table[] = { { .procname = "ip_conntrack_tcp_timeout_syn_sent", - .data = &nf_ct_tcp_timeout_syn_sent, + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_syn_sent2", + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT2], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_syn_recv", - .data = &nf_ct_tcp_timeout_syn_recv, + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_established", - .data = &nf_ct_tcp_timeout_established, + .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_fin_wait", - .data = &nf_ct_tcp_timeout_fin_wait, + .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_close_wait", - .data = &nf_ct_tcp_timeout_close_wait, + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_last_ack", - .data = &nf_ct_tcp_timeout_last_ack, + .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_time_wait", - .data = &nf_ct_tcp_timeout_time_wait, + .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_close", - .data = &nf_ct_tcp_timeout_close, + .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_max_retrans", .data = &nf_ct_tcp_timeout_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE, .procname = "ip_conntrack_tcp_loose", .data = &nf_ct_tcp_loose, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, .procname = "ip_conntrack_tcp_be_liberal", .data = &nf_ct_tcp_be_liberal, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { - .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, .procname = "ip_conntrack_tcp_max_retrans", .data = &nf_ct_tcp_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, - { - .ctl_name = 0 - } + { } }; #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */ #endif /* CONFIG_SYSCTL */ @@ -1366,9 +1457,11 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = .error = tcp_error, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .to_nlattr = tcp_to_nlattr, + .nlattr_size = tcp_nlattr_size, .from_nlattr = nlattr_to_tcp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = tcp_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_SYSCTL @@ -1396,9 +1489,11 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = .error = tcp_error, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .to_nlattr = tcp_to_nlattr, + .nlattr_size = tcp_nlattr_size, .from_nlattr = nlattr_to_tcp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = tcp_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_SYSCTL