b7e8a825efed451f84320df913356034be932150
[safe/jmp/linux-2.6] / net / netfilter / nf_conntrack_proto_tcp.c
1 /* (C) 1999-2001 Paul `Rusty' Russell
2  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
3  *
4  * This program is free software; you can redistribute it and/or modify
5  * it under the terms of the GNU General Public License version 2 as
6  * published by the Free Software Foundation.
7  */
8
9 #include <linux/types.h>
10 #include <linux/timer.h>
11 #include <linux/module.h>
12 #include <linux/in.h>
13 #include <linux/tcp.h>
14 #include <linux/spinlock.h>
15 #include <linux/skbuff.h>
16 #include <linux/ipv6.h>
17 #include <net/ip6_checksum.h>
18 #include <asm/unaligned.h>
19
20 #include <net/tcp.h>
21
22 #include <linux/netfilter.h>
23 #include <linux/netfilter_ipv4.h>
24 #include <linux/netfilter_ipv6.h>
25 #include <net/netfilter/nf_conntrack.h>
26 #include <net/netfilter/nf_conntrack_l4proto.h>
27 #include <net/netfilter/nf_conntrack_ecache.h>
28 #include <net/netfilter/nf_log.h>
29 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
30 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
31
32 /* Reader/writer lock protecting ct->proto.tcp; this file takes it with
      the _bh lock variants. */
33 static DEFINE_RWLOCK(tcp_lock);
34
35 /* "Be conservative in what you do,
36     be liberal in what you accept from others."
37     If it's non-zero, we mark only out of window RST segments as INVALID.
       When set, tcp_in_window() accepts segments that fail its window
       tests instead of rejecting them. */
38 static int nf_ct_tcp_be_liberal __read_mostly = 0;
39
40 /* If it is set to zero, we disable picking up already established
41    connections. */
42 static int nf_ct_tcp_loose __read_mostly = 1;
43
44 /* Max number of the retransmitted packets without receiving an (acceptable)
45    ACK from the destination. If this number is reached, a shorter timer
46    will be started. */
47 static int nf_ct_tcp_max_retrans __read_mostly = 3;
48
49   /* FIXME: Examine ipfilter's timeouts and conntrack transitions more
50      closely.  They're more complex. --RR */
51
52 static const char *const tcp_conntrack_names[] = {
53         "NONE",
54         "SYN_SENT",
55         "SYN_RECV",
56         "ESTABLISHED",
57         "FIN_WAIT",
58         "CLOSE_WAIT",
59         "LAST_ACK",
60         "TIME_WAIT",
61         "CLOSE",
62         "SYN_SENT2",
63 };
64
/*
 * Postfix unit macros: written after a number, e.g. "5 MINS" expands to
 * "5 * 60 * HZ". They are only meaningful directly after a multiplicand.
 */
65 #define SECS * HZ
66 #define MINS * 60 SECS
67 #define HOURS * 60 MINS
68 #define DAYS * 24 HOURS
69
70 /* RFC1122 says the R2 limit should be at least 100 seconds.
71    Linux uses 15 packets as limit, which corresponds
72    to ~13-30min depending on RTO. */
73 static unsigned int nf_ct_tcp_timeout_max_retrans __read_mostly    =   5 MINS;
74 static unsigned int nf_ct_tcp_timeout_unacknowledged __read_mostly =   5 MINS;
75
/* Timeout for each conntrack state, indexed by TCP_CONNTRACK_* value.
   States not listed here are left zero-initialized. */
76 static unsigned int tcp_timeouts[TCP_CONNTRACK_MAX] __read_mostly = {
77         [TCP_CONNTRACK_SYN_SENT]        = 2 MINS,
78         [TCP_CONNTRACK_SYN_RECV]        = 60 SECS,
79         [TCP_CONNTRACK_ESTABLISHED]     = 5 DAYS,
80         [TCP_CONNTRACK_FIN_WAIT]        = 2 MINS,
81         [TCP_CONNTRACK_CLOSE_WAIT]      = 60 SECS,
82         [TCP_CONNTRACK_LAST_ACK]        = 30 SECS,
83         [TCP_CONNTRACK_TIME_WAIT]       = 2 MINS,
84         [TCP_CONNTRACK_CLOSE]           = 10 SECS,
85         [TCP_CONNTRACK_SYN_SENT2]       = 2 MINS,
86 };
87
/*
 * Three-letter aliases for the conntrack states, used to keep the
 * tcp_conntracks transition table readable. sIV (TCP_CONNTRACK_MAX) and
 * sIG (TCP_CONNTRACK_IGNORE) are not real states: the table uses them to
 * mark a segment as invalid or as one to be ignored.
 */
88 #define sNO TCP_CONNTRACK_NONE
89 #define sSS TCP_CONNTRACK_SYN_SENT
90 #define sSR TCP_CONNTRACK_SYN_RECV
91 #define sES TCP_CONNTRACK_ESTABLISHED
92 #define sFW TCP_CONNTRACK_FIN_WAIT
93 #define sCW TCP_CONNTRACK_CLOSE_WAIT
94 #define sLA TCP_CONNTRACK_LAST_ACK
95 #define sTW TCP_CONNTRACK_TIME_WAIT
96 #define sCL TCP_CONNTRACK_CLOSE
97 #define sS2 TCP_CONNTRACK_SYN_SENT2
98 #define sIV TCP_CONNTRACK_MAX
99 #define sIG TCP_CONNTRACK_IGNORE
100
101 /* What TCP flags are set from RST/SYN/FIN/ACK.
        This is the classification returned by get_conntrack_index(); the
        enumerator order matches the row order of tcp_conntracks. */
102 enum tcp_bit_set {
103         TCP_SYN_SET,
104         TCP_SYNACK_SET,
105         TCP_FIN_SET,
106         TCP_ACK_SET,
107         TCP_RST_SET,
108         TCP_NONE_SET,
109 };
110
111 /*
112  * The TCP state transition table needs a few words...
113  *
114  * We are the man in the middle. All the packets go through us
115  * but might get lost in transit to the destination.
116  * It is assumed that the destinations can't receive segments
117  * we haven't seen.
118  *
119  * The checked segment is in window, but our windows are *not*
120  * equivalent with the ones of the sender/receiver. We always
121  * try to guess the state of the current sender.
122  *
123  * The meanings of the states are:
124  *
125  * NONE:        initial state
126  * SYN_SENT:    SYN-only packet seen
127  * SYN_SENT2:   SYN-only packet seen from reply dir, simultaneous open
128  * SYN_RECV:    SYN-ACK packet seen
129  * ESTABLISHED: ACK packet seen
130  * FIN_WAIT:    FIN packet seen
131  * CLOSE_WAIT:  ACK seen (after FIN)
132  * LAST_ACK:    FIN seen (after FIN)
133  * TIME_WAIT:   last ACK seen
134  * CLOSE:       closed connection (RST)
135  *
136  * Packets marked as IGNORED (sIG):
137  *      if they may be either invalid or valid
138  *      and the receiver may send back a connection
139  *      closing RST or a SYN/ACK.
140  *
141  * Packets marked as INVALID (sIV):
142  *      if we regard them as truly invalid packets
143  */
/*
 * State transition table, laid out as [direction][flag class][current state].
 * The first block is for segments in the original direction, the second
 * for the reply direction; rows follow enum tcp_bit_set and columns follow
 * the state order given in each header comment. An entry is the new state,
 * or sIV / sIG for segments regarded as invalid / to be ignored.
 */
144 static const u8 tcp_conntracks[2][6][TCP_CONNTRACK_MAX] = {
145         {
146 /* ORIGINAL */
147 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
148 /*syn*/    { sSS, sSS, sIG, sIG, sIG, sIG, sIG, sSS, sSS, sS2 },
149 /*
150  *      sNO -> sSS      Initialize a new connection
151  *      sSS -> sSS      Retransmitted SYN
152  *      sS2 -> sS2      Late retransmitted SYN
153  *      sSR -> sIG
154  *      sES -> sIG      Error: SYNs in window outside the SYN_SENT state
155  *                      are errors. Receiver will reply with RST
156  *                      and close the connection.
157  *                      Or we are not in sync and hold a dead connection.
158  *      sFW -> sIG
159  *      sCW -> sIG
160  *      sLA -> sIG
161  *      sTW -> sSS      Reopened connection (RFC 1122).
162  *      sCL -> sSS
163  */
164 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
165 /*synack*/ { sIV, sIV, sIG, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
166 /*
167  *      sNO -> sIV      Too late and no reason to do anything
168  *      sSS -> sIV      Client can't send SYN and then SYN/ACK
169  *      sS2 -> sSR      SYN/ACK sent to SYN2 in simultaneous open
170  *      sSR -> sIG
171  *      sES -> sIG      Error: SYNs in window outside the SYN_SENT state
172  *                      are errors. Receiver will reply with RST
173  *                      and close the connection.
174  *                      Or we are not in sync and hold a dead connection.
175  *      sFW -> sIG
176  *      sCW -> sIG
177  *      sLA -> sIG
178  *      sTW -> sIG
179  *      sCL -> sIG
180  */
181 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
182 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
183 /*
184  *      sNO -> sIV      Too late and no reason to do anything...
185  *      sSS -> sIV      Client might not send FIN in this state:
186  *                      we enforce waiting for a SYN/ACK reply first.
187  *      sS2 -> sIV
188  *      sSR -> sFW      Close started.
189  *      sES -> sFW
190  *      sFW -> sLA      FIN seen in both directions, waiting for
191  *                      the last ACK.
192  *                      Might be a retransmitted FIN as well...
193  *      sCW -> sLA
194  *      sLA -> sLA      Retransmitted FIN. Remain in the same state.
195  *      sTW -> sTW
196  *      sCL -> sCL
197  */
198 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
199 /*ack*/    { sES, sIV, sES, sES, sCW, sCW, sTW, sTW, sCL, sIV },
200 /*
201  *      sNO -> sES      Assumed.
202  *      sSS -> sIV      ACK is invalid: we haven't seen a SYN/ACK yet.
203  *      sS2 -> sIV
204  *      sSR -> sES      Established state is reached.
205  *      sES -> sES      :-)
206  *      sFW -> sCW      Normal close request answered by ACK.
207  *      sCW -> sCW
208  *      sLA -> sTW      Last ACK detected.
209  *      sTW -> sTW      Retransmitted last ACK. Remain in the same state.
210  *      sCL -> sCL
211  */
212 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
213 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
214 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
215         },
216         {
217 /* REPLY */
218 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
219 /*syn*/    { sIV, sS2, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sS2 },
220 /*
221  *      sNO -> sIV      Never reached.
222  *      sSS -> sS2      Simultaneous open
223  *      sS2 -> sS2      Retransmitted simultaneous SYN
224  *      sSR -> sIV      Invalid SYN packets sent by the server
225  *      sES -> sIV
226  *      sFW -> sIV
227  *      sCW -> sIV
228  *      sLA -> sIV
229  *      sTW -> sIV      Reopened connection, but server may not do it.
230  *      sCL -> sIV
231  */
232 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
233 /*synack*/ { sIV, sSR, sSR, sIG, sIG, sIG, sIG, sIG, sIG, sSR },
234 /*
235  *      sSS -> sSR      Standard open.
236  *      sS2 -> sSR      Simultaneous open
237  *      sSR -> sSR      Retransmitted SYN/ACK.
238  *      sES -> sIG      Late retransmitted SYN/ACK?
239  *      sFW -> sIG      Might be SYN/ACK answering ignored SYN
240  *      sCW -> sIG
241  *      sLA -> sIG
242  *      sTW -> sIG
243  *      sCL -> sIG
244  */
245 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
246 /*fin*/    { sIV, sIV, sFW, sFW, sLA, sLA, sLA, sTW, sCL, sIV },
247 /*
248  *      sSS -> sIV      Server might not send FIN in this state.
249  *      sS2 -> sIV
250  *      sSR -> sFW      Close started.
251  *      sES -> sFW
252  *      sFW -> sLA      FIN seen in both directions.
253  *      sCW -> sLA
254  *      sLA -> sLA      Retransmitted FIN.
255  *      sTW -> sTW
256  *      sCL -> sCL
257  */
258 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
259 /*ack*/    { sIV, sIG, sSR, sES, sCW, sCW, sTW, sTW, sCL, sIG },
260 /*
261  *      sSS -> sIG      Might be a half-open connection.
262  *      sS2 -> sIG
263  *      sSR -> sSR      Might answer late resent SYN.
264  *      sES -> sES      :-)
265  *      sFW -> sCW      Normal close request answered by ACK.
266  *      sCW -> sCW
267  *      sLA -> sTW      Last ACK detected.
268  *      sTW -> sTW      Retransmitted last ACK.
269  *      sCL -> sCL
270  */
271 /*           sNO, sSS, sSR, sES, sFW, sCW, sLA, sTW, sCL, sS2   */
272 /*rst*/    { sIV, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL, sCL },
273 /*none*/   { sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV, sIV }
274         }
275 };
276
277 static bool tcp_pkt_to_tuple(const struct sk_buff *skb, unsigned int dataoff,
278                              struct nf_conntrack_tuple *tuple)
279 {
280         const struct tcphdr *hp;
281         struct tcphdr _hdr;
282
283         /* Actually only need first 8 bytes. */
284         hp = skb_header_pointer(skb, dataoff, 8, &_hdr);
285         if (hp == NULL)
286                 return false;
287
288         tuple->src.u.tcp.port = hp->source;
289         tuple->dst.u.tcp.port = hp->dest;
290
291         return true;
292 }
293
294 static bool tcp_invert_tuple(struct nf_conntrack_tuple *tuple,
295                              const struct nf_conntrack_tuple *orig)
296 {
297         tuple->src.u.tcp.port = orig->dst.u.tcp.port;
298         tuple->dst.u.tcp.port = orig->src.u.tcp.port;
299         return true;
300 }
301
302 /* Print out the per-protocol part of the tuple. */
303 static int tcp_print_tuple(struct seq_file *s,
304                            const struct nf_conntrack_tuple *tuple)
305 {
306         return seq_printf(s, "sport=%hu dport=%hu ",
307                           ntohs(tuple->src.u.tcp.port),
308                           ntohs(tuple->dst.u.tcp.port));
309 }
310
311 /* Print out the private part of the conntrack. */
312 static int tcp_print_conntrack(struct seq_file *s, const struct nf_conn *ct)
313 {
314         enum tcp_conntrack state;
315
316         read_lock_bh(&tcp_lock);
317         state = ct->proto.tcp.state;
318         read_unlock_bh(&tcp_lock);
319
320         return seq_printf(s, "%s ", tcp_conntrack_names[state]);
321 }
322
323 static unsigned int get_conntrack_index(const struct tcphdr *tcph)
324 {
325         if (tcph->rst) return TCP_RST_SET;
326         else if (tcph->syn) return (tcph->ack ? TCP_SYNACK_SET : TCP_SYN_SET);
327         else if (tcph->fin) return TCP_FIN_SET;
328         else if (tcph->ack) return TCP_ACK_SET;
329         else return TCP_NONE_SET;
330 }
331
332 /* TCP connection tracking based on 'Real Stateful TCP Packet Filtering
333    in IP Filter' by Guido van Rooij.
334
335    http://www.nluug.nl/events/sane2000/papers.html
336    http://www.iae.nl/users/guido/papers/tcp_filtering.ps.gz
337
338    The boundaries and the conditions are changed according to RFC793:
339    the packet must intersect the window (i.e. segments may be
340    after the right or before the left edge) and thus receivers may ACK
341    segments after the right edge of the window.
342
343         td_maxend = max(sack + max(win,1)) seen in reply packets
344         td_maxwin = max(max(win, 1)) + (sack - ack) seen in sent packets
345         td_maxwin += seq + len - sender.td_maxend
346                         if seq + len > sender.td_maxend
347         td_end    = max(seq + len) seen in sent packets
348
349    I.   Upper bound for valid data:     seq <= sender.td_maxend
350    II.  Lower bound for valid data:     seq + len >= sender.td_end - receiver.td_maxwin
351    III. Upper bound for valid (s)ack:   sack <= receiver.td_end
352    IV.  Lower bound for valid (s)ack:   sack >= receiver.td_end - MAXACKWINDOW
353
354    where sack is the highest right edge of sack block found in the packet
355    or ack in the case of packet without SACK option.
356
357    The upper bound limit for a valid (s)ack is not ignored -
358    we don't have to deal with fragments.
359 */
360
361 static inline __u32 segment_seq_plus_len(__u32 seq,
362                                          size_t len,
363                                          unsigned int dataoff,
364                                          const struct tcphdr *tcph)
365 {
366         /* XXX Should I use payload length field in IP/IPv6 header ?
367          * - YK */
368         return (seq + len - dataoff - tcph->doff*4
369                 + (tcph->syn ? 1 : 0) + (tcph->fin ? 1 : 0));
370 }
371
372 /* Fixme: what about big packets? */
/* MAXACKWINDOW(sender) is the larger of sender->td_maxwin and
   MAXACKWINCONST. tcp_in_window() uses it as the furthest an acceptable
   (s)ack may lie below receiver->td_end (lower bound IV). The argument
   is evaluated twice. */
373 #define MAXACKWINCONST                  66000
374 #define MAXACKWINDOW(sender)                                            \
375         ((sender)->td_maxwin > MAXACKWINCONST ? (sender)->td_maxwin     \
376                                               : MAXACKWINCONST)
377
378 /*
379  * Simplified tcp_parse_options routine from tcp_input.c
380  */
381 static void tcp_options(const struct sk_buff *skb,
382                         unsigned int dataoff,
383                         const struct tcphdr *tcph,
384                         struct ip_ct_tcp_state *state)
385 {
386         unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
387         const unsigned char *ptr;
388         int length = (tcph->doff*4) - sizeof(struct tcphdr);
389
390         if (!length)
391                 return;
392
393         ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
394                                  length, buff);
395         BUG_ON(ptr == NULL);
396
397         state->td_scale =
398         state->flags = 0;
399
400         while (length > 0) {
401                 int opcode=*ptr++;
402                 int opsize;
403
404                 switch (opcode) {
405                 case TCPOPT_EOL:
406                         return;
407                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
408                         length--;
409                         continue;
410                 default:
411                         opsize=*ptr++;
412                         if (opsize < 2) /* "silly options" */
413                                 return;
414                         if (opsize > length)
415                                 break;  /* don't parse partial options */
416
417                         if (opcode == TCPOPT_SACK_PERM
418                             && opsize == TCPOLEN_SACK_PERM)
419                                 state->flags |= IP_CT_TCP_FLAG_SACK_PERM;
420                         else if (opcode == TCPOPT_WINDOW
421                                  && opsize == TCPOLEN_WINDOW) {
422                                 state->td_scale = *(u_int8_t *)ptr;
423
424                                 if (state->td_scale > 14) {
425                                         /* See RFC1323 */
426                                         state->td_scale = 14;
427                                 }
428                                 state->flags |=
429                                         IP_CT_TCP_FLAG_WINDOW_SCALE;
430                         }
431                         ptr += opsize - 2;
432                         length -= opsize;
433                 }
434         }
435 }
436
437 static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
438                      const struct tcphdr *tcph, __u32 *sack)
439 {
440         unsigned char buff[(15 * 4) - sizeof(struct tcphdr)];
441         const unsigned char *ptr;
442         int length = (tcph->doff*4) - sizeof(struct tcphdr);
443         __u32 tmp;
444
445         if (!length)
446                 return;
447
448         ptr = skb_header_pointer(skb, dataoff + sizeof(struct tcphdr),
449                                  length, buff);
450         BUG_ON(ptr == NULL);
451
452         /* Fast path for timestamp-only option */
453         if (length == TCPOLEN_TSTAMP_ALIGNED*4
454             && *(__be32 *)ptr == htonl((TCPOPT_NOP << 24)
455                                        | (TCPOPT_NOP << 16)
456                                        | (TCPOPT_TIMESTAMP << 8)
457                                        | TCPOLEN_TIMESTAMP))
458                 return;
459
460         while (length > 0) {
461                 int opcode = *ptr++;
462                 int opsize, i;
463
464                 switch (opcode) {
465                 case TCPOPT_EOL:
466                         return;
467                 case TCPOPT_NOP:        /* Ref: RFC 793 section 3.1 */
468                         length--;
469                         continue;
470                 default:
471                         opsize = *ptr++;
472                         if (opsize < 2) /* "silly options" */
473                                 return;
474                         if (opsize > length)
475                                 break;  /* don't parse partial options */
476
477                         if (opcode == TCPOPT_SACK
478                             && opsize >= (TCPOLEN_SACK_BASE
479                                           + TCPOLEN_SACK_PERBLOCK)
480                             && !((opsize - TCPOLEN_SACK_BASE)
481                                  % TCPOLEN_SACK_PERBLOCK)) {
482                                 for (i = 0;
483                                      i < (opsize - TCPOLEN_SACK_BASE);
484                                      i += TCPOLEN_SACK_PERBLOCK) {
485                                         tmp = get_unaligned_be32((__be32 *)(ptr+i)+1);
486
487                                         if (after(tmp, *sack))
488                                                 *sack = tmp;
489                                 }
490                                 return;
491                         }
492                         ptr += opsize - 2;
493                         length -= opsize;
494                 }
495         }
496 }
497
498 static bool tcp_in_window(const struct nf_conn *ct,
499                           struct ip_ct_tcp *state,
500                           enum ip_conntrack_dir dir,
501                           unsigned int index,
502                           const struct sk_buff *skb,
503                           unsigned int dataoff,
504                           const struct tcphdr *tcph,
505                           u_int8_t pf)
506 {
507         struct net *net = nf_ct_net(ct);
508         struct ip_ct_tcp_state *sender = &state->seen[dir];
509         struct ip_ct_tcp_state *receiver = &state->seen[!dir];
510         const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
511         __u32 seq, ack, sack, end, win, swin;
512         bool res;
513
514         /*
515          * Get the required data from the packet.
516          */
517         seq = ntohl(tcph->seq);
518         ack = sack = ntohl(tcph->ack_seq);
519         win = ntohs(tcph->window);
520         end = segment_seq_plus_len(seq, skb->len, dataoff, tcph);
521
522         if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
523                 tcp_sack(skb, dataoff, tcph, &sack);
524
525         pr_debug("tcp_in_window: START\n");
526         pr_debug("tcp_in_window: ");
527         nf_ct_dump_tuple(tuple);
528         pr_debug("seq=%u ack=%u sack=%u win=%u end=%u\n",
529                  seq, ack, sack, win, end);
530         pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
531                  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
532                  sender->td_end, sender->td_maxend, sender->td_maxwin,
533                  sender->td_scale,
534                  receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
535                  receiver->td_scale);
536
537         if (sender->td_maxwin == 0) {
538                 /*
539                  * Initialize sender data.
540                  */
541                 if (tcph->syn) {
542                         /*
543                          * SYN-ACK in reply to a SYN
544                          * or SYN from reply direction in simultaneous open.
545                          */
546                         sender->td_end =
547                         sender->td_maxend = end;
548                         sender->td_maxwin = (win == 0 ? 1 : win);
549
550                         tcp_options(skb, dataoff, tcph, sender);
551                         /*
552                          * RFC 1323:
553                          * Both sides must send the Window Scale option
554                          * to enable window scaling in either direction.
555                          */
556                         if (!(sender->flags & IP_CT_TCP_FLAG_WINDOW_SCALE
557                               && receiver->flags & IP_CT_TCP_FLAG_WINDOW_SCALE))
558                                 sender->td_scale =
559                                 receiver->td_scale = 0;
560                         if (!tcph->ack)
561                                 /* Simultaneous open */
562                                 return true;
563                 } else {
564                         /*
565                          * We are in the middle of a connection,
566                          * its history is lost for us.
567                          * Let's try to use the data from the packet.
568                          */
569                         sender->td_end = end;
570                         sender->td_maxwin = (win == 0 ? 1 : win);
571                         sender->td_maxend = end + sender->td_maxwin;
572                 }
573         } else if (((state->state == TCP_CONNTRACK_SYN_SENT
574                      && dir == IP_CT_DIR_ORIGINAL)
575                    || (state->state == TCP_CONNTRACK_SYN_RECV
576                      && dir == IP_CT_DIR_REPLY))
577                    && after(end, sender->td_end)) {
578                 /*
579                  * RFC 793: "if a TCP is reinitialized ... then it need
580                  * not wait at all; it must only be sure to use sequence
581                  * numbers larger than those recently used."
582                  */
583                 sender->td_end =
584                 sender->td_maxend = end;
585                 sender->td_maxwin = (win == 0 ? 1 : win);
586
587                 tcp_options(skb, dataoff, tcph, sender);
588         }
589
590         if (!(tcph->ack)) {
591                 /*
592                  * If there is no ACK, just pretend it was set and OK.
593                  */
594                 ack = sack = receiver->td_end;
595         } else if (((tcp_flag_word(tcph) & (TCP_FLAG_ACK|TCP_FLAG_RST)) ==
596                     (TCP_FLAG_ACK|TCP_FLAG_RST))
597                    && (ack == 0)) {
598                 /*
599                  * Broken TCP stacks, that set ACK in RST packets as well
600                  * with zero ack value.
601                  */
602                 ack = sack = receiver->td_end;
603         }
604
605         if (seq == end
606             && (!tcph->rst
607                 || (seq == 0 && state->state == TCP_CONNTRACK_SYN_SENT)))
608                 /*
609                  * Packets contains no data: we assume it is valid
610                  * and check the ack value only.
611                  * However RST segments are always validated by their
612                  * SEQ number, except when seq == 0 (reset sent answering
613                  * SYN.
614                  */
615                 seq = end = sender->td_end;
616
617         pr_debug("tcp_in_window: ");
618         nf_ct_dump_tuple(tuple);
619         pr_debug("seq=%u ack=%u sack =%u win=%u end=%u\n",
620                  seq, ack, sack, win, end);
621         pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
622                  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
623                  sender->td_end, sender->td_maxend, sender->td_maxwin,
624                  sender->td_scale,
625                  receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
626                  receiver->td_scale);
627
628         pr_debug("tcp_in_window: I=%i II=%i III=%i IV=%i\n",
629                  before(seq, sender->td_maxend + 1),
630                  after(end, sender->td_end - receiver->td_maxwin - 1),
631                  before(sack, receiver->td_end + 1),
632                  after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1));
633
634         if (before(seq, sender->td_maxend + 1) &&
635             after(end, sender->td_end - receiver->td_maxwin - 1) &&
636             before(sack, receiver->td_end + 1) &&
637             after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) {
638                 /*
639                  * Take into account window scaling (RFC 1323).
640                  */
641                 if (!tcph->syn)
642                         win <<= sender->td_scale;
643
644                 /*
645                  * Update sender data.
646                  */
647                 swin = win + (sack - ack);
648                 if (sender->td_maxwin < swin)
649                         sender->td_maxwin = swin;
650                 if (after(end, sender->td_end)) {
651                         sender->td_end = end;
652                         sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
653                 }
654                 /*
655                  * Update receiver data.
656                  */
657                 if (after(end, sender->td_maxend))
658                         receiver->td_maxwin += end - sender->td_maxend;
659                 if (after(sack + win, receiver->td_maxend - 1)) {
660                         receiver->td_maxend = sack + win;
661                         if (win == 0)
662                                 receiver->td_maxend++;
663                 }
664                 if (ack == receiver->td_end)
665                         receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
666
667                 /*
668                  * Check retransmissions.
669                  */
670                 if (index == TCP_ACK_SET) {
671                         if (state->last_dir == dir
672                             && state->last_seq == seq
673                             && state->last_ack == ack
674                             && state->last_end == end
675                             && state->last_win == win)
676                                 state->retrans++;
677                         else {
678                                 state->last_dir = dir;
679                                 state->last_seq = seq;
680                                 state->last_ack = ack;
681                                 state->last_end = end;
682                                 state->last_win = win;
683                                 state->retrans = 0;
684                         }
685                 }
686                 res = true;
687         } else {
688                 res = false;
689                 if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL ||
690                     nf_ct_tcp_be_liberal)
691                         res = true;
692                 if (!res && LOG_INVALID(net, IPPROTO_TCP))
693                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
694                         "nf_ct_tcp: %s ",
695                         before(seq, sender->td_maxend + 1) ?
696                         after(end, sender->td_end - receiver->td_maxwin - 1) ?
697                         before(sack, receiver->td_end + 1) ?
698                         after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
699                         : "ACK is under the lower bound (possible overly delayed ACK)"
700                         : "ACK is over the upper bound (ACKed data not seen yet)"
701                         : "SEQ is under the lower bound (already ACKed data retransmitted)"
702                         : "SEQ is over the upper bound (over the window of the receiver)");
703         }
704
705         pr_debug("tcp_in_window: res=%u sender end=%u maxend=%u maxwin=%u "
706                  "receiver end=%u maxend=%u maxwin=%u\n",
707                  res, sender->td_end, sender->td_maxend, sender->td_maxwin,
708                  receiver->td_end, receiver->td_maxend, receiver->td_maxwin);
709
710         return res;
711 }
712
#ifdef CONFIG_NF_NAT_NEEDED
/*
 * Update sender->td_end after NAT successfully mangled the packet.
 *
 * Recomputes the end sequence number of the segment and, under tcp_lock,
 * advances the sending side's td_end if the new value is ahead of it;
 * last_end is always set to the new value.
 *
 * Caller must linearize skb at tcp header.
 */
void nf_conntrack_tcp_update(const struct sk_buff *skb,
                             unsigned int dataoff,
                             struct nf_conn *ct,
                             int dir)
{
        const struct tcphdr *tcph = (const void *)skb->data + dataoff;
        struct ip_ct_tcp *tcp = &ct->proto.tcp;
        struct ip_ct_tcp_state *sender = &tcp->seen[dir];
        const struct ip_ct_tcp_state *receiver = &tcp->seen[!dir];
        __u32 end = segment_seq_plus_len(ntohl(tcph->seq), skb->len,
                                         dataoff, tcph);

        write_lock_bh(&tcp_lock);
        /*
         * We have to worry for the ack in the reply packet only...
         */
        if (after(end, sender->td_end))
                sender->td_end = end;
        tcp->last_end = end;
        write_unlock_bh(&tcp_lock);

        pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
                 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
                 sender->td_end, sender->td_maxend, sender->td_maxwin,
                 sender->td_scale,
                 receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
                 receiver->td_scale);
}
EXPORT_SYMBOL_GPL(nf_conntrack_tcp_update);
#endif
745
/* Bit masks for the individual TCP header flags. */
746 #define TH_FIN  0x01
747 #define TH_SYN  0x02
748 #define TH_RST  0x04
749 #define TH_PUSH 0x08
750 #define TH_ACK  0x10
751 #define TH_URG  0x20
752 #define TH_ECE  0x40
753 #define TH_CWR  0x80
754
755 /* table of valid flag combinations - PUSH, ECE and CWR are always valid
        The array is sized to hold every combination of FIN, SYN, RST, ACK
        and URG. Only the combinations listed are set to 1; all others are
        zero-initialized, i.e. not valid.
        NOTE(review): presumably indexed by the flag byte with PUSH, ECE
        and CWR masked off - confirm against the lookup site. */
756 static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] =
757 {
758         [TH_SYN]                        = 1,
759         [TH_SYN|TH_URG]                 = 1,
760         [TH_SYN|TH_ACK]                 = 1,
761         [TH_RST]                        = 1,
762         [TH_RST|TH_ACK]                 = 1,
763         [TH_FIN|TH_ACK]                 = 1,
764         [TH_FIN|TH_ACK|TH_URG]          = 1,
765         [TH_ACK]                        = 1,
766         [TH_ACK|TH_URG]                 = 1,
767 };
768
769 /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
770 static int tcp_error(struct net *net,
771                      struct sk_buff *skb,
772                      unsigned int dataoff,
773                      enum ip_conntrack_info *ctinfo,
774                      u_int8_t pf,
775                      unsigned int hooknum)
776 {
777         const struct tcphdr *th;
778         struct tcphdr _tcph;
779         unsigned int tcplen = skb->len - dataoff;
780         u_int8_t tcpflags;
781
782         /* Smaller that minimal TCP header? */
783         th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
784         if (th == NULL) {
785                 if (LOG_INVALID(net, IPPROTO_TCP))
786                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
787                                 "nf_ct_tcp: short packet ");
788                 return -NF_ACCEPT;
789         }
790
791         /* Not whole TCP header or malformed packet */
792         if (th->doff*4 < sizeof(struct tcphdr) || tcplen < th->doff*4) {
793                 if (LOG_INVALID(net, IPPROTO_TCP))
794                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
795                                 "nf_ct_tcp: truncated/malformed packet ");
796                 return -NF_ACCEPT;
797         }
798
799         /* Checksum invalid? Ignore.
800          * We skip checking packets on the outgoing path
801          * because the checksum is assumed to be correct.
802          */
803         /* FIXME: Source route IP option packets --RR */
804         if (net->ct.sysctl_checksum && hooknum == NF_INET_PRE_ROUTING &&
805             nf_checksum(skb, hooknum, dataoff, IPPROTO_TCP, pf)) {
806                 if (LOG_INVALID(net, IPPROTO_TCP))
807                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
808                                   "nf_ct_tcp: bad TCP checksum ");
809                 return -NF_ACCEPT;
810         }
811
812         /* Check TCP flags. */
813         tcpflags = (((u_int8_t *)th)[13] & ~(TH_ECE|TH_CWR|TH_PUSH));
814         if (!tcp_valid_flags[tcpflags]) {
815                 if (LOG_INVALID(net, IPPROTO_TCP))
816                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
817                                   "nf_ct_tcp: invalid TCP flag combination ");
818                 return -NF_ACCEPT;
819         }
820
821         return NF_ACCEPT;
822 }
823
/*
 * Run one packet of an already-tracked connection through the TCP state
 * machine.
 *
 * The candidate next state is looked up in tcp_conntracks[] from the packet
 * direction, the flag class returned by get_conntrack_index() and the current
 * state.  Unless one of the special cases in the switch returns early, the
 * packet is window-checked with tcp_in_window(), the new state is stored and
 * the conntrack timeout is refreshed.
 *
 * tcp_lock is write-held while ct->proto.tcp is read or modified, and it is
 * released on every path before nf_log_packet(), nf_ct_kill(),
 * nf_conntrack_event_cache() or nf_ct_refresh_acct() is called.
 *
 * Return values produced here:
 *   NF_ACCEPT   - packet accepted (this includes "ignored" packets);
 *   -NF_ACCEPT  - invalid state transition or tcp_in_window() rejected it;
 *   -NF_REPEAT  - reopen attempt and nf_ct_kill() succeeded: look up again;
 *   NF_DROP     - reopen attempt where nf_ct_kill() failed, or a SYN/ACK
 *                 that revealed an out-of-sync session.
 */
824 /* Returns verdict for packet, or -1 for invalid. */
825 static int tcp_packet(struct nf_conn *ct,
826                       const struct sk_buff *skb,
827                       unsigned int dataoff,
828                       enum ip_conntrack_info ctinfo,
829                       u_int8_t pf,
830                       unsigned int hooknum)
831 {
832         struct net *net = nf_ct_net(ct);
833         struct nf_conntrack_tuple *tuple;
834         enum tcp_conntrack new_state, old_state;
835         enum ip_conntrack_dir dir;
836         const struct tcphdr *th;
837         struct tcphdr _tcph;
838         unsigned long timeout;
839         unsigned int index;
840
841         th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
842         BUG_ON(th == NULL);
843
844         write_lock_bh(&tcp_lock);
845         old_state = ct->proto.tcp.state;
846         dir = CTINFO2DIR(ctinfo);
847         index = get_conntrack_index(th);
848         new_state = tcp_conntracks[dir][index][old_state];
849         tuple = &ct->tuplehash[dir].tuple;
850
851         switch (new_state) {
852         case TCP_CONNTRACK_SYN_SENT:
            /* A SYN seen before TIME_WAIT needs no special handling. */
853                 if (old_state < TCP_CONNTRACK_TIME_WAIT)
854                         break;
855                 /* RFC 1122: "When a connection is closed actively,
856                  * it MUST linger in TIME-WAIT state for a time 2xMSL
857                  * (Maximum Segment Lifetime). However, it MAY accept
858                  * a new SYN from the remote TCP to reopen the connection
859                  * directly from TIME-WAIT state, if..."
860                  * We ignore the conditions because we are in the
861                  * TIME-WAIT state anyway.
862                  *
863                  * Handle aborted connections: we and the server
864                  * think there is an existing connection but the client
865                  * aborts it and starts a new one.
866                  */
867                 if (((ct->proto.tcp.seen[dir].flags
868                       | ct->proto.tcp.seen[!dir].flags)
869                      & IP_CT_TCP_FLAG_CLOSE_INIT)
870                     || (ct->proto.tcp.last_dir == dir
871                         && ct->proto.tcp.last_index == TCP_RST_SET)) {
872                         /* Attempt to reopen a closed/aborted connection.
873                          * Delete this connection and look up again. */
874                         write_unlock_bh(&tcp_lock);
875
876                         /* Only repeat if we can actually remove the timer.
877                          * Destruction may already be in progress in process
878                          * context and we must give it a chance to terminate.
879                          */
880                         if (nf_ct_kill(ct))
881                                 return -NF_REPEAT;
882                         return NF_DROP;
883                 }
884                 /* Fall through */
885         case TCP_CONNTRACK_IGNORE:
886                 /* Ignored packets:
887                  *
888                  * Our connection entry may be out of sync, so ignore
889                  * packets which may signal the real connection between
890                  * the client and the server.
891                  *
892                  * a) SYN in ORIGINAL
893                  * b) SYN/ACK in REPLY
894                  * c) ACK in reply direction after initial SYN in original.
895                  *
896                  * If the ignored packet is invalid, the receiver will send
897                  * a RST we'll catch below.
898                  */
899                 if (index == TCP_SYNACK_SET
900                     && ct->proto.tcp.last_index == TCP_SYN_SET
901                     && ct->proto.tcp.last_dir != dir
902                     && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
903                         /* b) This SYN/ACK acknowledges a SYN that we earlier
904                          * ignored as invalid. This means that the client and
905                          * the server are both in sync, while the firewall is
906                          * not. We kill this session and block the SYN/ACK so
907                          * that the client cannot but retransmit its SYN and
908                          * thus initiate a clean new session.
909                          */
910                         write_unlock_bh(&tcp_lock);
911                         if (LOG_INVALID(net, IPPROTO_TCP))
912                                 nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
913                                           "nf_ct_tcp: killing out of sync session ");
914                         nf_ct_kill(ct);
915                         return NF_DROP;
916                 }
            /* Record the ignored packet so that a later SYN/ACK or RST
             * answering it can be matched against last_end (see the
             * checks above and in TCP_CONNTRACK_CLOSE below). */
917                 ct->proto.tcp.last_index = index;
918                 ct->proto.tcp.last_dir = dir;
919                 ct->proto.tcp.last_seq = ntohl(th->seq);
920                 ct->proto.tcp.last_end =
921                     segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
922
923                 write_unlock_bh(&tcp_lock);
924                 if (LOG_INVALID(net, IPPROTO_TCP))
925                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
926                                   "nf_ct_tcp: invalid packet ignored ");
927                 return NF_ACCEPT;
928         case TCP_CONNTRACK_MAX:
929                 /* Invalid packet */
930                 pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
931                          dir, get_conntrack_index(th), old_state);
932                 write_unlock_bh(&tcp_lock);
933                 if (LOG_INVALID(net, IPPROTO_TCP))
934                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
935                                   "nf_ct_tcp: invalid state ");
936                 return -NF_ACCEPT;
937         case TCP_CONNTRACK_CLOSE:
938                 if (index == TCP_RST_SET
939                     && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
940                          && ct->proto.tcp.last_index == TCP_SYN_SET)
941                         || (!test_bit(IPS_ASSURED_BIT, &ct->status)
942                             && ct->proto.tcp.last_index == TCP_ACK_SET))
943                     && ntohl(th->ack_seq) == ct->proto.tcp.last_end) {
944                         /* RST sent to invalid SYN or ACK we had let through
945                          * at a) and c) above:
946                          *
947                          * a) SYN was in window then
948                          * c) we hold a half-open connection.
949                          *
950                          * Delete our connection entry.
951                          * We skip window checking, because packet might ACK
952                          * segments we ignored. */
953                         goto in_window;
954                 }
955                 /* Just fall through */
956         default:
957                 /* Keep compilers happy. */
958                 break;
959         }
960
961         if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
962                            skb, dataoff, th, pf)) {
963                 write_unlock_bh(&tcp_lock);
964                 return -NF_ACCEPT;
965         }
966      in_window:
967         /* From now on we have got in-window packets */
968         ct->proto.tcp.last_index = index;
969         ct->proto.tcp.last_dir = dir;
970
971         pr_debug("tcp_conntracks: ");
972         nf_ct_dump_tuple(tuple);
973         pr_debug("syn=%i ack=%i fin=%i rst=%i old=%i new=%i\n",
974                  (th->syn ? 1 : 0), (th->ack ? 1 : 0),
975                  (th->fin ? 1 : 0), (th->rst ? 1 : 0),
976                  old_state, new_state);
977
978         ct->proto.tcp.state = new_state;
            /* Remember which direction moved the connection into FIN_WAIT;
             * the reopen check in the SYN_SENT case tests this flag. */
979         if (old_state != new_state
980             && new_state == TCP_CONNTRACK_FIN_WAIT)
981                 ct->proto.tcp.seen[dir].flags |= IP_CT_TCP_FLAG_CLOSE_INIT;
982
            /* Pick the timeout: the max_retrans and unacknowledged values
             * replace the per-state timeout only when they are shorter. */
983         if (ct->proto.tcp.retrans >= nf_ct_tcp_max_retrans &&
984             tcp_timeouts[new_state] > nf_ct_tcp_timeout_max_retrans)
985                 timeout = nf_ct_tcp_timeout_max_retrans;
986         else if ((ct->proto.tcp.seen[0].flags | ct->proto.tcp.seen[1].flags) &
987                  IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED &&
988                  tcp_timeouts[new_state] > nf_ct_tcp_timeout_unacknowledged)
989                 timeout = nf_ct_tcp_timeout_unacknowledged;
990         else
991                 timeout = tcp_timeouts[new_state];
992         write_unlock_bh(&tcp_lock);
993
994         if (new_state != old_state)
995                 nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
996
997         if (!test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
998                 /* If only reply is a RST, we can consider ourselves not to
999                    have an established connection: this is a fairly common
1000                    problem case, so we can delete the conntrack
1001                    immediately.  --RR */
1002                 if (th->rst) {
1003                         nf_ct_kill_acct(ct, ctinfo, skb);
1004                         return NF_ACCEPT;
1005                 }
1006         } else if (!test_bit(IPS_ASSURED_BIT, &ct->status)
1007                    && (old_state == TCP_CONNTRACK_SYN_RECV
1008                        || old_state == TCP_CONNTRACK_ESTABLISHED)
1009                    && new_state == TCP_CONNTRACK_ESTABLISHED) {
1010                 /* Set ASSURED if we see valid ack in ESTABLISHED
1011                    after SYN_RECV or a valid answer for a picked up
1012                    connection. */
1013                 set_bit(IPS_ASSURED_BIT, &ct->status);
1014                 nf_conntrack_event_cache(IPCT_STATUS, ct);
1015         }
1016         nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
1017
1018         return NF_ACCEPT;
1019 }
1020
1021 /* Called when a new connection for this protocol found. */
1022 static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
1023                     unsigned int dataoff)
1024 {
1025         enum tcp_conntrack new_state;
1026         const struct tcphdr *th;
1027         struct tcphdr _tcph;
1028         const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[0];
1029         const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[1];
1030
1031         th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
1032         BUG_ON(th == NULL);
1033
1034         /* Don't need lock here: this conntrack not in circulation yet */
1035         new_state
1036                 = tcp_conntracks[0][get_conntrack_index(th)]
1037                 [TCP_CONNTRACK_NONE];
1038
1039         /* Invalid: delete conntrack */
1040         if (new_state >= TCP_CONNTRACK_MAX) {
1041                 pr_debug("nf_ct_tcp: invalid new deleting.\n");
1042                 return false;
1043         }
1044
1045         if (new_state == TCP_CONNTRACK_SYN_SENT) {
1046                 /* SYN packet */
1047                 ct->proto.tcp.seen[0].td_end =
1048                         segment_seq_plus_len(ntohl(th->seq), skb->len,
1049                                              dataoff, th);
1050                 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1051                 if (ct->proto.tcp.seen[0].td_maxwin == 0)
1052                         ct->proto.tcp.seen[0].td_maxwin = 1;
1053                 ct->proto.tcp.seen[0].td_maxend =
1054                         ct->proto.tcp.seen[0].td_end;
1055
1056                 tcp_options(skb, dataoff, th, &ct->proto.tcp.seen[0]);
1057                 ct->proto.tcp.seen[1].flags = 0;
1058         } else if (nf_ct_tcp_loose == 0) {
1059                 /* Don't try to pick up connections. */
1060                 return false;
1061         } else {
1062                 /*
1063                  * We are in the middle of a connection,
1064                  * its history is lost for us.
1065                  * Let's try to use the data from the packet.
1066                  */
1067                 ct->proto.tcp.seen[0].td_end =
1068                         segment_seq_plus_len(ntohl(th->seq), skb->len,
1069                                              dataoff, th);
1070                 ct->proto.tcp.seen[0].td_maxwin = ntohs(th->window);
1071                 if (ct->proto.tcp.seen[0].td_maxwin == 0)
1072                         ct->proto.tcp.seen[0].td_maxwin = 1;
1073                 ct->proto.tcp.seen[0].td_maxend =
1074                         ct->proto.tcp.seen[0].td_end +
1075                         ct->proto.tcp.seen[0].td_maxwin;
1076                 ct->proto.tcp.seen[0].td_scale = 0;
1077
1078                 /* We assume SACK and liberal window checking to handle
1079                  * window scaling */
1080                 ct->proto.tcp.seen[0].flags =
1081                 ct->proto.tcp.seen[1].flags = IP_CT_TCP_FLAG_SACK_PERM |
1082                                               IP_CT_TCP_FLAG_BE_LIBERAL;
1083         }
1084
1085         ct->proto.tcp.seen[1].td_end = 0;
1086         ct->proto.tcp.seen[1].td_maxend = 0;
1087         ct->proto.tcp.seen[1].td_maxwin = 0;
1088         ct->proto.tcp.seen[1].td_scale = 0;
1089
1090         /* tcp_packet will set them */
1091         ct->proto.tcp.state = TCP_CONNTRACK_NONE;
1092         ct->proto.tcp.last_index = TCP_NONE_SET;
1093
1094         pr_debug("tcp_new: sender end=%u maxend=%u maxwin=%u scale=%i "
1095                  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
1096                  sender->td_end, sender->td_maxend, sender->td_maxwin,
1097                  sender->td_scale,
1098                  receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
1099                  receiver->td_scale);
1100         return true;
1101 }
1102
1103 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
1104
1105 #include <linux/netfilter/nfnetlink.h>
1106 #include <linux/netfilter/nfnetlink_conntrack.h>
1107
1108 static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
1109                          const struct nf_conn *ct)
1110 {
1111         struct nlattr *nest_parms;
1112         struct nf_ct_tcp_flags tmp = {};
1113
1114         read_lock_bh(&tcp_lock);
1115         nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
1116         if (!nest_parms)
1117                 goto nla_put_failure;
1118
1119         NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_STATE, ct->proto.tcp.state);
1120
1121         NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_ORIGINAL,
1122                    ct->proto.tcp.seen[0].td_scale);
1123
1124         NLA_PUT_U8(skb, CTA_PROTOINFO_TCP_WSCALE_REPLY,
1125                    ct->proto.tcp.seen[1].td_scale);
1126
1127         tmp.flags = ct->proto.tcp.seen[0].flags;
1128         NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_ORIGINAL,
1129                 sizeof(struct nf_ct_tcp_flags), &tmp);
1130
1131         tmp.flags = ct->proto.tcp.seen[1].flags;
1132         NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
1133                 sizeof(struct nf_ct_tcp_flags), &tmp);
1134         read_unlock_bh(&tcp_lock);
1135
1136         nla_nest_end(skb, nest_parms);
1137
1138         return 0;
1139
1140 nla_put_failure:
1141         read_unlock_bh(&tcp_lock);
1142         return -1;
1143 }
1144
/*
 * Policy for the attributes nested inside CTA_PROTOINFO_TCP.  It is handed
 * to nla_parse_nested() by nlattr_to_tcp() and to nla_policy_len() by
 * tcp_nlattr_size().  State and both window-scale attributes are single
 * bytes; the two flag attributes are sized to struct nf_ct_tcp_flags.
 */
1145 static const struct nla_policy tcp_nla_policy[CTA_PROTOINFO_TCP_MAX+1] = {
1146         [CTA_PROTOINFO_TCP_STATE]           = { .type = NLA_U8 },
1147         [CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] = { .type = NLA_U8 },
1148         [CTA_PROTOINFO_TCP_WSCALE_REPLY]    = { .type = NLA_U8 },
1149         [CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]  = { .len = sizeof(struct nf_ct_tcp_flags) },
1150         [CTA_PROTOINFO_TCP_FLAGS_REPLY]     = { .len =  sizeof(struct nf_ct_tcp_flags) },
1151 };
1152
1153 static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
1154 {
1155         struct nlattr *pattr = cda[CTA_PROTOINFO_TCP];
1156         struct nlattr *tb[CTA_PROTOINFO_TCP_MAX+1];
1157         int err;
1158
1159         /* updates could not contain anything about the private
1160          * protocol info, in that case skip the parsing */
1161         if (!pattr)
1162                 return 0;
1163
1164         err = nla_parse_nested(tb, CTA_PROTOINFO_TCP_MAX, pattr, tcp_nla_policy);
1165         if (err < 0)
1166                 return err;
1167
1168         if (tb[CTA_PROTOINFO_TCP_STATE] &&
1169             nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
1170                 return -EINVAL;
1171
1172         write_lock_bh(&tcp_lock);
1173         if (tb[CTA_PROTOINFO_TCP_STATE])
1174                 ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
1175
1176         if (tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]) {
1177                 struct nf_ct_tcp_flags *attr =
1178                         nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_ORIGINAL]);
1179                 ct->proto.tcp.seen[0].flags &= ~attr->mask;
1180                 ct->proto.tcp.seen[0].flags |= attr->flags & attr->mask;
1181         }
1182
1183         if (tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]) {
1184                 struct nf_ct_tcp_flags *attr =
1185                         nla_data(tb[CTA_PROTOINFO_TCP_FLAGS_REPLY]);
1186                 ct->proto.tcp.seen[1].flags &= ~attr->mask;
1187                 ct->proto.tcp.seen[1].flags |= attr->flags & attr->mask;
1188         }
1189
1190         if (tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL] &&
1191             tb[CTA_PROTOINFO_TCP_WSCALE_REPLY] &&
1192             ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_WINDOW_SCALE &&
1193             ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
1194                 ct->proto.tcp.seen[0].td_scale =
1195                         nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_ORIGINAL]);
1196                 ct->proto.tcp.seen[1].td_scale =
1197                         nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
1198         }
1199         write_unlock_bh(&tcp_lock);
1200
1201         return 0;
1202 }
1203
1204 static int tcp_nlattr_size(void)
1205 {
1206         return nla_total_size(0)           /* CTA_PROTOINFO_TCP */
1207                 + nla_policy_len(tcp_nla_policy, CTA_PROTOINFO_TCP_MAX + 1);
1208 }
1209
1210 static int tcp_nlattr_tuple_size(void)
1211 {
1212         return nla_policy_len(nf_ct_port_nla_policy, CTA_PROTO_MAX + 1);
1213 }
1214 #endif
1215
1216 #ifdef CONFIG_SYSCTL
/*
 * Sysctl bookkeeping referenced from the l4proto registration records
 * (.ctl_table_users / .ctl_table_header).
 */
1217 static unsigned int tcp_sysctl_table_users;
1218 static struct ctl_table_header *tcp_sysctl_header;
/*
 * nf_conntrack_tcp_* tunables.  Every entry is mode 0644.  The timeout
 * entries go through proc_dointvec_jiffies and the remaining integer knobs
 * through proc_dointvec.
 *
 * NOTE(review): there is no nf_conntrack_tcp_timeout_syn_sent2 entry here
 * although tcp_compat_sysctl_table exposes
 * ip_conntrack_tcp_timeout_syn_sent2 - confirm whether that is intended.
 */
1219 static struct ctl_table tcp_sysctl_table[] = {
            /* Per-state timeouts, backed by tcp_timeouts[]. */
1220         {
1221                 .procname       = "nf_conntrack_tcp_timeout_syn_sent",
1222                 .data           = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT],
1223                 .maxlen         = sizeof(unsigned int),
1224                 .mode           = 0644,
1225                 .proc_handler   = proc_dointvec_jiffies,
1226         },
1227         {
1228                 .procname       = "nf_conntrack_tcp_timeout_syn_recv",
1229                 .data           = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV],
1230                 .maxlen         = sizeof(unsigned int),
1231                 .mode           = 0644,
1232                 .proc_handler   = proc_dointvec_jiffies,
1233         },
1234         {
1235                 .procname       = "nf_conntrack_tcp_timeout_established",
1236                 .data           = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED],
1237                 .maxlen         = sizeof(unsigned int),
1238                 .mode           = 0644,
1239                 .proc_handler   = proc_dointvec_jiffies,
1240         },
1241         {
1242                 .procname       = "nf_conntrack_tcp_timeout_fin_wait",
1243                 .data           = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT],
1244                 .maxlen         = sizeof(unsigned int),
1245                 .mode           = 0644,
1246                 .proc_handler   = proc_dointvec_jiffies,
1247         },
1248         {
1249                 .procname       = "nf_conntrack_tcp_timeout_close_wait",
1250                 .data           = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT],
1251                 .maxlen         = sizeof(unsigned int),
1252                 .mode           = 0644,
1253                 .proc_handler   = proc_dointvec_jiffies,
1254         },
1255         {
1256                 .procname       = "nf_conntrack_tcp_timeout_last_ack",
1257                 .data           = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK],
1258                 .maxlen         = sizeof(unsigned int),
1259                 .mode           = 0644,
1260                 .proc_handler   = proc_dointvec_jiffies,
1261         },
1262         {
1263                 .procname       = "nf_conntrack_tcp_timeout_time_wait",
1264                 .data           = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT],
1265                 .maxlen         = sizeof(unsigned int),
1266                 .mode           = 0644,
1267                 .proc_handler   = proc_dointvec_jiffies,
1268         },
1269         {
1270                 .procname       = "nf_conntrack_tcp_timeout_close",
1271                 .data           = &tcp_timeouts[TCP_CONNTRACK_CLOSE],
1272                 .maxlen         = sizeof(unsigned int),
1273                 .mode           = 0644,
1274                 .proc_handler   = proc_dointvec_jiffies,
1275         },
            /* Timeouts tcp_packet() may use in place of the per-state one. */
1276         {
1277                 .procname       = "nf_conntrack_tcp_timeout_max_retrans",
1278                 .data           = &nf_ct_tcp_timeout_max_retrans,
1279                 .maxlen         = sizeof(unsigned int),
1280                 .mode           = 0644,
1281                 .proc_handler   = proc_dointvec_jiffies,
1282         },
1283         {
1284                 .procname       = "nf_conntrack_tcp_timeout_unacknowledged",
1285                 .data           = &nf_ct_tcp_timeout_unacknowledged,
1286                 .maxlen         = sizeof(unsigned int),
1287                 .mode           = 0644,
1288                 .proc_handler   = proc_dointvec_jiffies,
1289         },
            /* Plain integer knobs; only these entries carry a .ctl_name. */
1290         {
1291                 .ctl_name       = NET_NF_CONNTRACK_TCP_LOOSE,
1292                 .procname       = "nf_conntrack_tcp_loose",
1293                 .data           = &nf_ct_tcp_loose,
1294                 .maxlen         = sizeof(unsigned int),
1295                 .mode           = 0644,
1296                 .proc_handler   = proc_dointvec,
1297         },
1298         {
1299                 .ctl_name       = NET_NF_CONNTRACK_TCP_BE_LIBERAL,
1300                 .procname       = "nf_conntrack_tcp_be_liberal",
1301                 .data           = &nf_ct_tcp_be_liberal,
1302                 .maxlen         = sizeof(unsigned int),
1303                 .mode           = 0644,
1304                 .proc_handler   = proc_dointvec,
1305         },
1306         {
1307                 .ctl_name       = NET_NF_CONNTRACK_TCP_MAX_RETRANS,
1308                 .procname       = "nf_conntrack_tcp_max_retrans",
1309                 .data           = &nf_ct_tcp_max_retrans,
1310                 .maxlen         = sizeof(unsigned int),
1311                 .mode           = 0644,
1312                 .proc_handler   = proc_dointvec,
1313         },
            /* Last entry; presumably the table terminator - TODO confirm. */
1314         {
1315                 .ctl_name       = 0
1316         }
1317 };
1318
1319 #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
/*
 * ip_conntrack_tcp_* names for the same variables that tcp_sysctl_table
 * exposes, built only with CONFIG_NF_CONNTRACK_PROC_COMPAT.
 *
 * Differences from tcp_sysctl_table visible in this file: this table has an
 * ip_conntrack_tcp_timeout_syn_sent2 entry and has no counterpart of
 * nf_conntrack_tcp_timeout_unacknowledged.
 */
1320 static struct ctl_table tcp_compat_sysctl_table[] = {
            /* Per-state timeouts, backed by tcp_timeouts[]. */
1321         {
1322                 .procname       = "ip_conntrack_tcp_timeout_syn_sent",
1323                 .data           = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT],
1324                 .maxlen         = sizeof(unsigned int),
1325                 .mode           = 0644,
1326                 .proc_handler   = proc_dointvec_jiffies,
1327         },
1328         {
1329                 .procname       = "ip_conntrack_tcp_timeout_syn_sent2",
1330                 .data           = &tcp_timeouts[TCP_CONNTRACK_SYN_SENT2],
1331                 .maxlen         = sizeof(unsigned int),
1332                 .mode           = 0644,
1333                 .proc_handler   = proc_dointvec_jiffies,
1334         },
1335         {
1336                 .procname       = "ip_conntrack_tcp_timeout_syn_recv",
1337                 .data           = &tcp_timeouts[TCP_CONNTRACK_SYN_RECV],
1338                 .maxlen         = sizeof(unsigned int),
1339                 .mode           = 0644,
1340                 .proc_handler   = proc_dointvec_jiffies,
1341         },
1342         {
1343                 .procname       = "ip_conntrack_tcp_timeout_established",
1344                 .data           = &tcp_timeouts[TCP_CONNTRACK_ESTABLISHED],
1345                 .maxlen         = sizeof(unsigned int),
1346                 .mode           = 0644,
1347                 .proc_handler   = proc_dointvec_jiffies,
1348         },
1349         {
1350                 .procname       = "ip_conntrack_tcp_timeout_fin_wait",
1351                 .data           = &tcp_timeouts[TCP_CONNTRACK_FIN_WAIT],
1352                 .maxlen         = sizeof(unsigned int),
1353                 .mode           = 0644,
1354                 .proc_handler   = proc_dointvec_jiffies,
1355         },
1356         {
1357                 .procname       = "ip_conntrack_tcp_timeout_close_wait",
1358                 .data           = &tcp_timeouts[TCP_CONNTRACK_CLOSE_WAIT],
1359                 .maxlen         = sizeof(unsigned int),
1360                 .mode           = 0644,
1361                 .proc_handler   = proc_dointvec_jiffies,
1362         },
1363         {
1364                 .procname       = "ip_conntrack_tcp_timeout_last_ack",
1365                 .data           = &tcp_timeouts[TCP_CONNTRACK_LAST_ACK],
1366                 .maxlen         = sizeof(unsigned int),
1367                 .mode           = 0644,
1368                 .proc_handler   = proc_dointvec_jiffies,
1369         },
1370         {
1371                 .procname       = "ip_conntrack_tcp_timeout_time_wait",
1372                 .data           = &tcp_timeouts[TCP_CONNTRACK_TIME_WAIT],
1373                 .maxlen         = sizeof(unsigned int),
1374                 .mode           = 0644,
1375                 .proc_handler   = proc_dointvec_jiffies,
1376         },
1377         {
1378                 .procname       = "ip_conntrack_tcp_timeout_close",
1379                 .data           = &tcp_timeouts[TCP_CONNTRACK_CLOSE],
1380                 .maxlen         = sizeof(unsigned int),
1381                 .mode           = 0644,
1382                 .proc_handler   = proc_dointvec_jiffies,
1383         },
            /* Timeout tcp_packet() may use in place of the per-state one. */
1384         {
1385                 .procname       = "ip_conntrack_tcp_timeout_max_retrans",
1386                 .data           = &nf_ct_tcp_timeout_max_retrans,
1387                 .maxlen         = sizeof(unsigned int),
1388                 .mode           = 0644,
1389                 .proc_handler   = proc_dointvec_jiffies,
1390         },
            /* Plain integer knobs; only these entries carry a .ctl_name. */
1391         {
1392                 .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_LOOSE,
1393                 .procname       = "ip_conntrack_tcp_loose",
1394                 .data           = &nf_ct_tcp_loose,
1395                 .maxlen         = sizeof(unsigned int),
1396                 .mode           = 0644,
1397                 .proc_handler   = proc_dointvec,
1398         },
1399         {
1400                 .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,
1401                 .procname       = "ip_conntrack_tcp_be_liberal",
1402                 .data           = &nf_ct_tcp_be_liberal,
1403                 .maxlen         = sizeof(unsigned int),
1404                 .mode           = 0644,
1405                 .proc_handler   = proc_dointvec,
1406         },
1407         {
1408                 .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,
1409                 .procname       = "ip_conntrack_tcp_max_retrans",
1410                 .data           = &nf_ct_tcp_max_retrans,
1411                 .maxlen         = sizeof(unsigned int),
1412                 .mode           = 0644,
1413                 .proc_handler   = proc_dointvec,
1414         },
            /* Last entry; presumably the table terminator - TODO confirm. */
1415         {
1416                 .ctl_name       = 0
1417         }
1418 };
1419 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
1420 #endif /* CONFIG_SYSCTL */
1421
/*
 * Layer-4 protocol record for TCP over PF_INET.  It points at the tuple,
 * print, packet, new and error handlers defined in this file.  The netlink
 * callbacks are present only when CONFIG_NF_CT_NETLINK is built in or as a
 * module, the sysctl tables only with CONFIG_SYSCTL, and the compat table
 * additionally needs CONFIG_NF_CONNTRACK_PROC_COMPAT.  Exported GPL-only.
 */
1422 struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp4 __read_mostly =
1423 {
1424         .l3proto                = PF_INET,
1425         .l4proto                = IPPROTO_TCP,
1426         .name                   = "tcp",
1427         .pkt_to_tuple           = tcp_pkt_to_tuple,
1428         .invert_tuple           = tcp_invert_tuple,
1429         .print_tuple            = tcp_print_tuple,
1430         .print_conntrack        = tcp_print_conntrack,
1431         .packet                 = tcp_packet,
1432         .new                    = tcp_new,
1433         .error                  = tcp_error,
1434 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
1435         .to_nlattr              = tcp_to_nlattr,
1436         .nlattr_size            = tcp_nlattr_size,
1437         .from_nlattr            = nlattr_to_tcp,
            /* Tuple (port) handling is shared: the nf_ct_port_* helpers. */
1438         .tuple_to_nlattr        = nf_ct_port_tuple_to_nlattr,
1439         .nlattr_to_tuple        = nf_ct_port_nlattr_to_tuple,
1440         .nlattr_tuple_size      = tcp_nlattr_tuple_size,
1441         .nla_policy             = nf_ct_port_nla_policy,
1442 #endif
1443 #ifdef CONFIG_SYSCTL
1444         .ctl_table_users        = &tcp_sysctl_table_users,
1445         .ctl_table_header       = &tcp_sysctl_header,
1446         .ctl_table              = tcp_sysctl_table,
1447 #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
1448         .ctl_compat_table       = tcp_compat_sysctl_table,
1449 #endif
1450 #endif
1451 };
1452 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp4);
1453
/*
 * Layer-4 protocol descriptor for TCP with l3proto PF_INET6.
 *
 * Uses the same handlers, netlink callbacks and sysctl objects as
 * nf_conntrack_l4proto_tcp4; it differs only in .l3proto and in not
 * setting .ctl_compat_table.
 * NOTE(review): the compat table is presumably omitted because its
 * entries are the legacy ip_conntrack_* (IPv4) names -- confirm.
 *
 * The symbol is exported GPL-only right after the definition.
 */
1454 struct nf_conntrack_l4proto nf_conntrack_l4proto_tcp6 __read_mostly =
1455 {
1456         .l3proto                = PF_INET6,
1457         .l4proto                = IPPROTO_TCP,
1458         .name                   = "tcp",
        /* Tuple extraction/inversion and printing callbacks. */
1459         .pkt_to_tuple           = tcp_pkt_to_tuple,
1460         .invert_tuple           = tcp_invert_tuple,
1461         .print_tuple            = tcp_print_tuple,
1462         .print_conntrack        = tcp_print_conntrack,
        /* Per-packet, new-connection and error callbacks. */
1463         .packet                 = tcp_packet,
1464         .new                    = tcp_new,
1465         .error                  = tcp_error,
1466 #if defined(CONFIG_NF_CT_NETLINK) || defined(CONFIG_NF_CT_NETLINK_MODULE)
        /* Netlink attribute conversion; only set when ctnetlink is enabled. */
1467         .to_nlattr              = tcp_to_nlattr,
1468         .nlattr_size            = tcp_nlattr_size,
1469         .from_nlattr            = nlattr_to_tcp,
1470         .tuple_to_nlattr        = nf_ct_port_tuple_to_nlattr,
1471         .nlattr_to_tuple        = nf_ct_port_nlattr_to_tuple,
1472         .nlattr_tuple_size      = tcp_nlattr_tuple_size,
1473         .nla_policy             = nf_ct_port_nla_policy,
1474 #endif
1475 #ifdef CONFIG_SYSCTL
        /* Same users counter, header pointer and table as the tcp4 descriptor. */
1476         .ctl_table_users        = &tcp_sysctl_table_users,
1477         .ctl_table_header       = &tcp_sysctl_header,
1478         .ctl_table              = tcp_sysctl_table,
1479 #endif
1480 };
1481 EXPORT_SYMBOL_GPL(nf_conntrack_l4proto_tcp6);