X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fnetfilter%2Fnf_conntrack_proto_tcp.c;h=5142e60af5402237642163758a4665d707438a26;hb=36432dae73cf2c90a59b39c8df9fd8219272b005;hp=202d7fa09483841ac91143f1d1c53dab9d6a6d50;hpb=b2155e7f70b3f058efe94c0c459db023b05057bd;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index 202d7fa..5142e60 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -15,6 +15,7 @@ #include #include #include +#include #include @@ -25,9 +26,8 @@ #include #include #include - -/* Protects ct->proto.tcp */ -static DEFINE_RWLOCK(tcp_lock); +#include +#include /* "Be conservative in what you do, be liberal in what you accept from others." @@ -56,7 +56,7 @@ static const char *const tcp_conntrack_names[] = { "LAST_ACK", "TIME_WAIT", "CLOSE", - "LISTEN" + "SYN_SENT2", }; #define SECS * HZ @@ -67,7 +67,8 @@ static const char *const tcp_conntrack_names[] = { /* RFC1122 says the R2 limit should be at least 100 seconds. Linux uses 15 packets as limit, which corresponds to ~13-30min depending on RTO. */ -static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS; +static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly = 5 MINS; +static unsigned int nf_ct_tcp_timeout_unacknowledged __read_mostly = 5 MINS; static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = { [TCP_CONNTRACK_SYN_SENT] = 2 MINS, @@ -78,6 +79,7 @@ static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = { [TCP_CONNTRACK_LAST_ACK] = 30 SECS, [TCP_CONNTRACK_TIME_WAIT] = 2 MINS, [TCP_CONNTRACK_CLOSE] = 10 SECS, + [TCP_CONNTRACK_SYN_SENT2] = 2 MINS, }; #define sNO TCP_CONNTRACK_NONE @@ -89,7 +91,7 @@ static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = { #define sLA TCP_CONNTRACK_LAST_ACK #define sTW TCP_CONNTRACK_TIME_WAIT #define sCL TCP_CONNTRACK_CLOSE -#define sLI TCP_CONNTRACK_LISTEN +#define sS2 TCP_CONNTRACK_SYN_SENT2 #define sIV TCP_CONNTRACK_MAX #define sIG TCP_CONNTRACK_IGNORE @@ -119,6 +121,7 @@ enum tcp_bit_set { * * NONE: initial state * SYN_SENT: SYN-only packet seen + * SYN_SENT2: SYN-only packet seen from reply dir, simultaneous open * SYN_RECV: SYN-ACK packet seen * ESTABLISHED: ACK packet seen * FIN_WAIT: FIN packet seen @@ -127,26 +130,24 @@ enum tcp_bit_set { * TIME_WAIT: last ACK seen * CLOSE: closed connection (RST) * - * LISTEN state is not used. - * * Packets marked as IGNORED (sIG): * if they may be either invalid or valid * and the receiver may send back a connection * closing RST or a SYN/ACK. * * Packets marked as INVALID (sIV): - * if they are invalid - * or we do not support the request (simultaneous open) + * if we regard them as truly invalid packets */ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { { /* ORIGINAL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*syn*/ { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 }, /* * sNO -> sSS Initialize a new connection * sSS -> sSS Retransmitted SYN - * sSR -> sIG Late retransmitted SYN? + * sS2 -> sS2 Late retransmitted SYN + * sSR -> sIG * sES -> sIG Error: SYNs in window outside the SYN_SENT state * are errors. Receiver will reply with RST * and close the connection. @@ -157,22 +158,30 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sSS Reopened connection (RFC 1122). * sCL -> sSS */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*synack*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* - * A SYN/ACK from the client is always invalid: - * - either it tries to set up a simultaneous open, which is - * not supported; - * - or the firewall has just been inserted between the two hosts - * during the session set-up. The SYN will be retransmitted - * by the true client (or it'll time out). + * sNO -> sIV Too late and no reason to do anything + * sSS -> sIV Client can't send SYN and then SYN/ACK + * sS2 -> sSR SYN/ACK sent to SYN2 in simultaneous open + * sSR -> sIG + * sES -> sIG Error: SYNs in window outside the SYN_SENT state + * are errors. Receiver will reply with RST + * and close the connection. + * Or we are not in sync and hold a dead connection. + * sFW -> sIG + * sCW -> sIG + * sLA -> sIG + * sTW -> sIG + * sCL -> sIG */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sNO -> sIV Too late and no reason to do anything... * sSS -> sIV Client migth not send FIN in this state: * we enforce waiting for a SYN/ACK reply first. + * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions, waiting for @@ -183,11 +192,12 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*ack*/ { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV }, /* * sNO -> sES Assumed. * sSS -> sIV ACK is invalid: we haven't seen a SYN/ACK yet. + * sS2 -> sIV * sSR -> sES Established state is reached. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -196,29 +206,31 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW Retransmitted last ACK. Remain in the same state. * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } }, { /* REPLY */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*syn*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*syn*/ { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sS2 }, /* * sNO -> sIV Never reached. - * sSS -> sIV Simultaneous open, not supported - * sSR -> sIV Simultaneous open, not supported. - * sES -> sIV Server may not initiate a connection. + * sSS -> sS2 Simultaneous open + * sS2 -> sS2 Retransmitted simultaneous SYN + * sSR -> sIV Invalid SYN packets sent by the server + * sES -> sIV * sFW -> sIV * sCW -> sIV * sLA -> sIV * sTW -> sIV Reopened connection, but server may not do it. * sCL -> sIV */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sSR }, /* * sSS -> sSR Standard open. + * sS2 -> sSR Simultaneous open * sSR -> sSR Retransmitted SYN/ACK. * sES -> sIG Late retransmitted SYN/ACK? * sFW -> sIG Might be SYN/ACK answering ignored SYN @@ -227,10 +239,11 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sIG * sCL -> sIG */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ /*fin*/ { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV }, /* * sSS -> sIV Server might not send FIN in this state. + * sS2 -> sIV * sSR -> sFW Close started. * sES -> sFW * sFW -> sLA FIN seen in both directions. @@ -239,10 +252,11 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*ack*/ { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG }, /* * sSS -> sIG Might be a half-open connection. + * sS2 -> sIG * sSR -> sSR Might answer late resent SYN. * sES -> sES :-) * sFW -> sCW Normal close request answered by ACK. @@ -251,15 +265,14 @@ static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = { * sTW -> sTW Retransmitted last ACK. * sCL -> sCL */ -/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sLI */ -/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sIV }, +/* sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2 */ +/*rst*/ { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL }, /*none*/ { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV } } }; -static int tcp_pkt_to_tuple(const struct sk_buff *skb, - unsigned int dataoff, - struct nf_conntrack_tuple *tuple) +static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff, + struct nf_conntrack_tuple *tuple) { const struct tcphdr *hp; struct tcphdr _hdr; @@ -267,20 +280,20 @@ static int tcp_pkt_to_tuple(const struct sk_buff *skb, /* Actually only need first 8 bytes. */ hp = skb_header_pointer(skb, dataoff, 8, &_hdr); if (hp == NULL) - return 0; + return false; tuple->src.u.tcp.port = hp->source; tuple->dst.u.tcp.port = hp->dest; - return 1; + return true; } -static int tcp_invert_tuple(struct nf_conntrack_tuple *tuple, - const struct nf_conntrack_tuple *orig) +static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_tuple *orig) { tuple->src.u.tcp.port = orig->dst.u.tcp.port; tuple->dst.u.tcp.port = orig->src.u.tcp.port; - return 1; + return true; } /* Print out the per-protocol part of the tuple. */ @@ -293,13 +306,13 @@ static int tcp_print_tuple(struct seq_file *s, } /* Print out the private part of the conntrack. */ -static int tcp_print_conntrack(struct seq_file *s, const struct nf_conn *ct) +static int tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct) { enum tcp_conntrack state; - read_lock_bh(&tcp_lock); + spin_lock_bh(&ct->lock); state = ct->proto.tcp.state; - read_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->lock); return seq_printf(s, "%s ", tcp_conntrack_names[state]); } @@ -332,12 +345,13 @@ static unsigned int get_conntrack_index(const struct tcphdr *tcph) I. Upper bound for valid data: seq <= sender.td_maxend II. Lower bound for valid data: seq + len >= sender.td_end - receiver.td_maxwin - III. Upper bound for valid ack: sack <= receiver.td_end - IV. Lower bound for valid ack: ack >= receiver.td_end - MAXACKWINDOW + III. Upper bound for valid (s)ack: sack <= receiver.td_end + IV. Lower bound for valid (s)ack: sack >= receiver.td_end - MAXACKWINDOW - where sack is the highest right edge of sack block found in the packet. + where sack is the highest right edge of sack block found in the packet + or ack in the case of packet without SACK option. - The upper bound limit for a valid ack is not ignored - + The upper bound limit for a valid (s)ack is not ignored - we doesn't have to deal with fragments. */ @@ -465,7 +479,7 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, for (i = 0; i < (opsize - TCPOLEN_SACK_BASE); i += TCPOLEN_SACK_PERBLOCK) { - tmp = ntohl(*((__be32 *)(ptr+i)+1)); + tmp = get_unaligned_be32((__be32 *)(ptr+i)+1); if (after(tmp, *sack)) *sack = tmp; @@ -478,20 +492,21 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff, } } -static int tcp_in_window(const struct nf_conn *ct, - struct ip_ct_tcp *state, - enum ip_conntrack_dir dir, - unsigned int index, - const struct sk_buff *skb, - unsigned int dataoff, - const struct tcphdr *tcph, - int pf) +static bool tcp_in_window(const struct nf_conn *ct, + struct ip_ct_tcp *state, + enum ip_conntrack_dir dir, + unsigned int index, + const struct sk_buff *skb, + unsigned int dataoff, + const struct tcphdr *tcph, + u_int8_t pf) { + struct net *net = nf_ct_net(ct); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple; __u32 seq, ack, sack, end, win, swin; - int res; + bool res; /* * Get the required data from the packet. @@ -506,7 +521,7 @@ static int tcp_in_window(const struct nf_conn *ct, pr_debug("tcp_in_window: START\n"); pr_debug("tcp_in_window: "); - NF_CT_DUMP_TUPLE(tuple); + nf_ct_dump_tuple(tuple); pr_debug("seq=%u ack=%u sack=%u win=%u end=%u\n", seq, ack, sack, win, end); pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " @@ -516,13 +531,14 @@ static int tcp_in_window(const struct nf_conn *ct, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); - if (sender->td_end == 0) { + if (sender->td_maxwin == 0) { /* * Initialize sender data. */ - if (tcph->syn && tcph->ack) { + if (tcph->syn) { /* - * Outgoing SYN-ACK in reply to a SYN. + * SYN-ACK in reply to a SYN + * or SYN from reply direction in simultaneous open. */ sender->td_end = sender->td_maxend = end; @@ -538,6 +554,9 @@ static int tcp_in_window(const struct nf_conn *ct, && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE)) sender->td_scale = receiver->td_scale = 0; + if (!tcph->ack) + /* Simultaneous open */ + return true; } else { /* * We are in the middle of a connection, @@ -593,7 +612,7 @@ static int tcp_in_window(const struct nf_conn *ct, seq = end = sender->td_end; pr_debug("tcp_in_window: "); - NF_CT_DUMP_TUPLE(tuple); + nf_ct_dump_tuple(tuple); pr_debug("seq=%u ack=%u sack =%u win=%u end=%u\n", seq, ack, sack, win, end); pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i " @@ -607,12 +626,12 @@ static int tcp_in_window(const struct nf_conn *ct, before(seq, sender->td_maxend + 1), after(end, sender->td_end - receiver->td_maxwin - 1), before(sack, receiver->td_end + 1), - after(ack, receiver->td_end - MAXACKWINDOW(sender))); + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)); if (before(seq, sender->td_maxend + 1) && after(end, sender->td_end - receiver->td_maxwin - 1) && before(sack, receiver->td_end + 1) && - after(ack, receiver->td_end - MAXACKWINDOW(sender))) { + after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { /* * Take into account window scaling (RFC 1323). */ @@ -625,8 +644,18 @@ static int tcp_in_window(const struct nf_conn *ct, swin = win + (sack - ack); if (sender->td_maxwin < swin) sender->td_maxwin = swin; - if (after(end, sender->td_end)) + if (after(end, sender->td_end)) { sender->td_end = end; + sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + } + if (tcph->ack) { + if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { + sender->td_maxack = ack; + sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; + } else if (after(ack, sender->td_maxack)) + sender->td_maxack = ack; + } + /* * Update receiver data. */ @@ -637,6 +666,8 @@ static int tcp_in_window(const struct nf_conn *ct, if (win == 0) receiver->td_maxend++; } + if (ack == receiver->td_end) + receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; /* * Check retransmissions. @@ -657,13 +688,13 @@ static int tcp_in_window(const struct nf_conn *ct, state->retrans = 0; } } - res = 1; + res = true; } else { - res = 0; + res = false; if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || nf_ct_tcp_be_liberal) - res = 1; - if (!res && LOG_INVALID(IPPROTO_TCP)) + res = true; + if (!res && LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: %s ", before(seq, sender->td_maxend + 1) ? @@ -676,7 +707,7 @@ static int tcp_in_window(const struct nf_conn *ct, : "SEQ is over the upper bound (over the window of the receiver)"); } - pr_debug("tcp_in_window: res=%i sender end=%u maxend=%u maxwin=%u " + pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u " "receiver end=%u maxend=%u maxwin=%u\n", res, sender->td_end, sender->td_maxend, sender->td_maxwin, receiver->td_end, receiver->td_maxend, receiver->td_maxwin); @@ -699,14 +730,14 @@ void nf_conntrack_tcp_update(const struct sk_buff *skb, end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph); - write_lock_bh(&tcp_lock); + spin_lock_bh(&ct->lock); /* * We have to worry for the ack in the reply packet only... */ if (after(end, ct->proto.tcp.seen[dir].td_end)) ct->proto.tcp.seen[dir].td_end = end; ct->proto.tcp.last_end = end; - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->lock); pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i " "receiver end=%u maxend=%u maxwin=%u scale=%i\n", sender->td_end, sender->td_maxend, sender->td_maxwin, @@ -741,10 +772,11 @@ static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] = }; /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c. */ -static int tcp_error(struct sk_buff *skb, +static int tcp_error(struct net *net, + struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info *ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { const struct tcphdr *th; @@ -755,7 +787,7 @@ static int tcp_error(struct sk_buff *skb, /* Smaller that minimal TCP header? */ th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); if (th == NULL) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: short packet "); return -NF_ACCEPT; @@ -763,7 +795,7 @@ static int tcp_error(struct sk_buff *skb, /* Not whole TCP header or malformed packet */ if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: truncated/malformed packet "); return -NF_ACCEPT; @@ -774,9 +806,9 @@ static int tcp_error(struct sk_buff *skb, * because the checksum is assumed to be correct. */ /* FIXME: Source route IP option packets --RR */ - if (nf_conntrack_checksum && hooknum == NF_INET_PRE_ROUTING && + if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING && nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: bad TCP checksum "); return -NF_ACCEPT; @@ -785,7 +817,7 @@ static int tcp_error(struct sk_buff *skb, /* Check TCP flags. */ tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR|TH_PUSH)); if (!tcp_valid_flags[tcpflags]) { - if (LOG_INVALID(IPPROTO_TCP)) + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid TCP flag combination "); return -NF_ACCEPT; @@ -799,9 +831,10 @@ static int tcp_packet(struct nf_conn *ct, const struct sk_buff *skb, unsigned int dataoff, enum ip_conntrack_info ctinfo, - int pf, + u_int8_t pf, unsigned int hooknum) { + struct net *net = nf_ct_net(ct); struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; enum ip_conntrack_dir dir; @@ -813,7 +846,7 @@ static int tcp_packet(struct nf_conn *ct, th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph); BUG_ON(th == NULL); - write_lock_bh(&tcp_lock); + spin_lock_bh(&ct->lock); old_state = ct->proto.tcp.state; dir = CTINFO2DIR(ctinfo); index = get_conntrack_index(th); @@ -843,10 +876,15 @@ static int tcp_packet(struct nf_conn *ct, && ct->proto.tcp.last_index == TCP_RST_SET)) { /* Attempt to reopen a closed/aborted connection. * Delete this connection and look up again. */ - write_unlock_bh(&tcp_lock); - if (del_timer(&ct->timeout)) - ct->timeout.function((unsigned long)ct); - return -NF_REPEAT; + spin_unlock_bh(&ct->lock); + + /* Only repeat if we can actually remove the timer. + * Destruction may already be in progress in process + * context and we must give it a chance to terminate. + */ + if (nf_ct_kill(ct)) + return -NF_REPEAT; + return NF_DROP; } /* Fall through */ case TCP_CONNTRACK_IGNORE: @@ -874,13 +912,12 @@ static int tcp_packet(struct nf_conn *ct, * that the client cannot but retransmit its SYN and * thus initiate a clean new session. */ - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: killing out of sync session "); - if (del_timer(&ct->timeout)) - ct->timeout.function((unsigned long)ct); - return -NF_DROP; + nf_ct_kill(ct); + return NF_DROP; } ct->proto.tcp.last_index = index; ct->proto.tcp.last_dir = dir; @@ -888,8 +925,8 @@ static int tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_end = segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th); - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid packet ignored "); return NF_ACCEPT; @@ -897,13 +934,23 @@ static int tcp_packet(struct nf_conn *ct, /* Invalid packet */ pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n", dir, get_conntrack_index(th), old_state); - write_unlock_bh(&tcp_lock); - if (LOG_INVALID(IPPROTO_TCP)) + spin_unlock_bh(&ct->lock); + if (LOG_INVALID(net, IPPROTO_TCP)) nf_log_packet(pf, 0, skb, NULL, NULL, NULL, "nf_ct_tcp: invalid state "); return -NF_ACCEPT; case TCP_CONNTRACK_CLOSE: if (index == TCP_RST_SET + && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET) + && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) { + /* Invalid RST */ + write_unlock_bh(&tcp_lock); + if (LOG_INVALID(net, IPPROTO_TCP)) + nf_log_packet(pf, 0, skb, NULL, NULL, NULL, + "nf_ct_tcp: invalid RST "); + return -NF_ACCEPT; + } + if (index == TCP_RST_SET && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status) && ct->proto.tcp.last_index == TCP_SYN_SET) || (!test_bit(IPS_ASSURED_BIT, &ct->status) @@ -928,7 +975,7 @@ static int tcp_packet(struct nf_conn *ct, if (!tcp_in_window(ct, &ct->proto.tcp, dir, index, skb, dataoff, th, pf)) { - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->lock); return -NF_ACCEPT; } in_window: @@ -937,7 +984,7 @@ static int tcp_packet(struct nf_conn *ct, ct->proto.tcp.last_dir = dir; pr_debug("tcp_conntracks: "); - NF_CT_DUMP_TUPLE(tuple); + nf_ct_dump_tuple(tuple); pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n", (th->syn ? 1 : 0), (th->ack ? 1 : 0), (th->fin ? 1 : 0), (th->rst ? 1 : 0), @@ -945,16 +992,22 @@ static int tcp_packet(struct nf_conn *ct, ct->proto.tcp.state = new_state; if (old_state != new_state - && new_state == TCP_CONNTRACK_CLOSE) + && new_state == TCP_CONNTRACK_FIN_WAIT) ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT; - timeout = ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans - && tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans - ? nf_ct_tcp_timeout_max_retrans : tcp_timeouts[new_state]; - write_unlock_bh(&tcp_lock); - nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, skb); + if (ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans && + tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans) + timeout = nf_ct_tcp_timeout_max_retrans; + else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) & + IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED && + tcp_timeouts[new_state] > nf_ct_tcp_timeout_unacknowledged) + timeout = nf_ct_tcp_timeout_unacknowledged; + else + timeout = tcp_timeouts[new_state]; + spin_unlock_bh(&ct->lock); + if (new_state != old_state) - nf_conntrack_event_cache(IPCT_PROTOINFO, skb); + nf_conntrack_event_cache(IPCT_PROTOINFO, ct); if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) { /* If only reply is a RST, we can consider ourselves not to @@ -962,8 +1015,7 @@ static int tcp_packet(struct nf_conn *ct, problem case, so we can delete the conntrack immediately. --RR */ if (th->rst) { - if (del_timer(&ct->timeout)) - ct->timeout.function((unsigned long)ct); + nf_ct_kill_acct(ct, ctinfo, skb); return NF_ACCEPT; } } else if (!test_bit(IPS_ASSURED_BIT, &ct->status) @@ -974,7 +1026,7 @@ static int tcp_packet(struct nf_conn *ct, after SYN_RECV or a valid answer for a picked up connection. */ set_bit(IPS_ASSURED_BIT, &ct->status); - nf_conntrack_event_cache(IPCT_STATUS, skb); + nf_conntrack_event_cache(IPCT_STATUS, ct); } nf_ct_refresh_acct(ct, ctinfo, skb, timeout); @@ -982,9 +1034,8 @@ static int tcp_packet(struct nf_conn *ct, } /* Called when a new connection for this protocol found. */ -static int tcp_new(struct nf_conn *ct, - const struct sk_buff *skb, - unsigned int dataoff) +static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb, + unsigned int dataoff) { enum tcp_conntrack new_state; const struct tcphdr *th; @@ -1003,7 +1054,7 @@ static int tcp_new(struct nf_conn *ct, /* Invalid: delete conntrack */ if (new_state >= TCP_CONNTRACK_MAX) { pr_debug("nf_ct_tcp: invalid new deleting.\n"); - return 0; + return false; } if (new_state == TCP_CONNTRACK_SYN_SENT) { @@ -1021,7 +1072,7 @@ static int tcp_new(struct nf_conn *ct, ct->proto.tcp.seen[1].flags = 0; } else if (nf_ct_tcp_loose == 0) { /* Don't try to pick up connections. */ - return 0; + return false; } else { /* * We are in the middle of a connection, @@ -1048,7 +1099,7 @@ static int tcp_new(struct nf_conn *ct, ct->proto.tcp.seen[1].td_end = 0; ct->proto.tcp.seen[1].td_maxend = 0; - ct->proto.tcp.seen[1].td_maxwin = 1; + ct->proto.tcp.seen[1].td_maxwin = 0; ct->proto.tcp.seen[1].td_scale = 0; /* tcp_packet will set them */ @@ -1061,7 +1112,7 @@ static int tcp_new(struct nf_conn *ct, sender->td_scale, receiver->td_end, receiver->td_maxend, receiver->td_maxwin, receiver->td_scale); - return 1; + return true; } #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) @@ -1070,12 +1121,12 @@ static int tcp_new(struct nf_conn *ct, #include static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, - const struct nf_conn *ct) + struct nf_conn *ct) { struct nlattr *nest_parms; struct nf_ct_tcp_flags tmp = {}; - read_lock_bh(&tcp_lock); + spin_lock_bh(&ct->lock); nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED); if (!nest_parms) goto nla_put_failure; @@ -1095,14 +1146,14 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla, tmp.flags = ct->proto.tcp.seen[1].flags; NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY, sizeof(struct nf_ct_tcp_flags), &tmp); - read_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->lock); nla_nest_end(skb, nest_parms); return 0; nla_put_failure: - read_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->lock); return -1; } @@ -1129,11 +1180,13 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) if (err < 0) return err; - if (!tb[CTA_PROTOINFO_TCP_STATE]) + if (tb[CTA_PROTOINFO_TCP_STATE] && + nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX) return -EINVAL; - write_lock_bh(&tcp_lock); - ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); + spin_lock_bh(&ct->lock); + if (tb[CTA_PROTOINFO_TCP_STATE]) + ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]); if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) { struct nf_ct_tcp_flags *attr = @@ -1158,10 +1211,21 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct) ct->proto.tcp.seen[1].td_scale = nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]); } - write_unlock_bh(&tcp_lock); + spin_unlock_bh(&ct->lock); return 0; } + +static int tcp_nlattr_size(void) +{ + return nla_total_size(0) /* CTA_PROTOINFO_TCP */ + + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1); +} + +static int tcp_nlattr_tuple_size(void) +{ + return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1); +} #endif #ifdef CONFIG_SYSCTL @@ -1173,63 +1237,70 @@ static struct ctl_table tcp_sysctl_table[] = { .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_syn_recv", .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_established", .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_fin_wait", .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_close_wait", .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_last_ack", .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_time_wait", .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_close", .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "nf_conntrack_tcp_timeout_max_retrans", .data = &nf_ct_tcp_timeout_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "nf_conntrack_tcp_timeout_unacknowledged", + .data = &nf_ct_tcp_timeout_unacknowledged, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, }, { .ctl_name = NET_NF_CONNTRACK_TCP_LOOSE, @@ -1237,7 +1308,7 @@ static struct ctl_table tcp_sysctl_table[] = { .data = &nf_ct_tcp_loose, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_NF_CONNTRACK_TCP_BE_LIBERAL, @@ -1245,7 +1316,7 @@ static struct ctl_table tcp_sysctl_table[] = { .data = &nf_ct_tcp_be_liberal, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_NF_CONNTRACK_TCP_MAX_RETRANS, @@ -1253,7 +1324,7 @@ static struct ctl_table tcp_sysctl_table[] = { .data = &nf_ct_tcp_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = 0 @@ -1267,63 +1338,70 @@ static struct ctl_table tcp_compat_sysctl_table[] = { .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ip_conntrack_tcp_timeout_syn_sent2", + .data = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT2], + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_syn_recv", .data = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_established", .data = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_fin_wait", .data = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_close_wait", .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_last_ack", .data = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_time_wait", .data = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_close", .data = &tcp_timeouts[TCP_CONNTRACK_CLOSE], .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .procname = "ip_conntrack_tcp_timeout_max_retrans", .data = &nf_ct_tcp_timeout_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec_jiffies, + .proc_handler = proc_dointvec_jiffies, }, { .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_LOOSE, @@ -1331,7 +1409,7 @@ static struct ctl_table tcp_compat_sysctl_table[] = { .data = &nf_ct_tcp_loose, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL, @@ -1339,7 +1417,7 @@ static struct ctl_table tcp_compat_sysctl_table[] = { .data = &nf_ct_tcp_be_liberal, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS, @@ -1347,7 +1425,7 @@ static struct ctl_table tcp_compat_sysctl_table[] = { .data = &nf_ct_tcp_max_retrans, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = &proc_dointvec, + .proc_handler = proc_dointvec, }, { .ctl_name = 0 @@ -1370,9 +1448,11 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly = .error = tcp_error, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .to_nlattr = tcp_to_nlattr, + .nlattr_size = tcp_nlattr_size, .from_nlattr = nlattr_to_tcp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = tcp_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_SYSCTL @@ -1400,9 +1480,11 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly = .error = tcp_error, #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE) .to_nlattr = tcp_to_nlattr, + .nlattr_size = tcp_nlattr_size, .from_nlattr = nlattr_to_tcp, .tuple_to_nlattr = nf_ct_port_tuple_to_nlattr, .nlattr_to_tuple = nf_ct_port_nlattr_to_tuple, + .nlattr_tuple_size = tcp_nlattr_tuple_size, .nla_policy = nf_ct_port_nla_policy, #endif #ifdef CONFIG_SYSCTL