Merge branch 'master' of /repos/git/net-next-2.6
[safe/jmp/linux-2.6] / net / netfilter / nf_conntrack_proto_tcp.c
index b7e8a82..ad11805 100644 (file)
@@ -29,9 +29,6 @@
 #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
 #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
 
-/* Protects ct->proto.tcp */
-static DEFINE_RWLOCK(tcp_lock);
-
 /* "Be conservative in what you do,
     be liberal in what you accept from others."
     If it's non-zero, we mark only out of window RST segments as INVALID. */
@@ -309,13 +306,13 @@ static int tcp_print_tuple(struct seq_file *s,
 }
 
 /* Print out the private part of the conntrack. */
-static int tcp_print_conntrack(struct seq_file *s, const struct nf_conn *ct)
+static int tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
 {
        enum tcp_conntrack state;
 
-       read_lock_bh(&tcp_lock);
+       spin_lock_bh(&ct->lock);
        state = ct->proto.tcp.state;
-       read_unlock_bh(&tcp_lock);
+       spin_unlock_bh(&ct->lock);
 
        return seq_printf(s, "%s ", tcp_conntrack_names[state]);
 }
@@ -495,6 +492,21 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
        }
 }
 
+#ifdef CONFIG_NF_NAT_NEEDED
+static inline s16 nat_offset(const struct nf_conn *ct,
+                            enum ip_conntrack_dir dir,
+                            u32 seq)
+{
+       typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset);
+
+       return get_offset != NULL ? get_offset(ct, dir, seq) : 0;
+}
+#define NAT_OFFSET(pf, ct, dir, seq) \
+       (pf == NFPROTO_IPV4 ? nat_offset(ct, dir, seq) : 0)
+#else
+#define NAT_OFFSET(pf, ct, dir, seq)   0
+#endif
+
 static bool tcp_in_window(const struct nf_conn *ct,
                          struct ip_ct_tcp *state,
                          enum ip_conntrack_dir dir,
@@ -509,6 +521,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
        struct ip_ct_tcp_state *receiver = &state->seen[!dir];
        const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
        __u32 seq, ack, sack, end, win, swin;
+       s16 receiver_offset;
        bool res;
 
        /*
@@ -522,11 +535,16 @@ static bool tcp_in_window(const struct nf_conn *ct,
        if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
                tcp_sack(skb, dataoff, tcph, &sack);
 
+       /* Take into account NAT sequence number mangling */
+       receiver_offset = NAT_OFFSET(pf, ct, !dir, ack - 1);
+       ack -= receiver_offset;
+       sack -= receiver_offset;
+
        pr_debug("tcp_in_window: START\n");
        pr_debug("tcp_in_window: ");
        nf_ct_dump_tuple(tuple);
-       pr_debug("seq=%u ack=%u sack=%u win=%u end=%u\n",
-                seq, ack, sack, win, end);
+       pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
+                seq, ack, receiver_offset, sack, receiver_offset, win, end);
        pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
                 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
                 sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -616,8 +634,8 @@ static bool tcp_in_window(const struct nf_conn *ct,
 
        pr_debug("tcp_in_window: ");
        nf_ct_dump_tuple(tuple);
-       pr_debug("seq=%u ack=%u sack =%u win=%u end=%u\n",
-                seq, ack, sack, win, end);
+       pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
+                seq, ack, receiver_offset, sack, receiver_offset, win, end);
        pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
                 "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
                 sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -651,6 +669,14 @@ static bool tcp_in_window(const struct nf_conn *ct,
                        sender->td_end = end;
                        sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
                }
+               if (tcph->ack) {
+                       if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
+                               sender->td_maxack = ack;
+                               sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
+                       } else if (after(ack, sender->td_maxack))
+                               sender->td_maxack = ack;
+               }
+
                /*
                 * Update receiver data.
                 */
@@ -695,7 +721,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
                        before(seq, sender->td_maxend + 1) ?
                        after(end, sender->td_end - receiver->td_maxwin - 1) ?
                        before(sack, receiver->td_end + 1) ?
-                       after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
+                       after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
                        : "ACK is under the lower bound (possible overly delayed ACK)"
                        : "ACK is over the upper bound (ACKed data not seen yet)"
                        : "SEQ is under the lower bound (already ACKed data retransmitted)"
@@ -710,39 +736,6 @@ static bool tcp_in_window(const struct nf_conn *ct,
        return res;
 }
 
-#ifdef CONFIG_NF_NAT_NEEDED
-/* Update sender->td_end after NAT successfully mangled the packet */
-/* Caller must linearize skb at tcp header. */
-void nf_conntrack_tcp_update(const struct sk_buff *skb,
-                            unsigned int dataoff,
-                            struct nf_conn *ct,
-                            int dir)
-{
-       const struct tcphdr *tcph = (const void *)skb->data + dataoff;
-       const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[dir];
-       const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[!dir];
-       __u32 end;
-
-       end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph);
-
-       write_lock_bh(&tcp_lock);
-       /*
-        * We have to worry for the ack in the reply packet only...
-        */
-       if (after(end, ct->proto.tcp.seen[dir].td_end))
-               ct->proto.tcp.seen[dir].td_end = end;
-       ct->proto.tcp.last_end = end;
-       write_unlock_bh(&tcp_lock);
-       pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
-                "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
-                sender->td_end, sender->td_maxend, sender->td_maxwin,
-                sender->td_scale,
-                receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
-                receiver->td_scale);
-}
-EXPORT_SYMBOL_GPL(nf_conntrack_tcp_update);
-#endif
-
 #define        TH_FIN  0x01
 #define        TH_SYN  0x02
 #define        TH_RST  0x04
@@ -841,7 +834,7 @@ static int tcp_packet(struct nf_conn *ct,
        th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
        BUG_ON(th == NULL);
 
-       write_lock_bh(&tcp_lock);
+       spin_lock_bh(&ct->lock);
        old_state = ct->proto.tcp.state;
        dir = CTINFO2DIR(ctinfo);
        index = get_conntrack_index(th);
@@ -871,7 +864,7 @@ static int tcp_packet(struct nf_conn *ct,
                        && ct->proto.tcp.last_index == TCP_RST_SET)) {
                        /* Attempt to reopen a closed/aborted connection.
                         * Delete this connection and look up again. */
-                       write_unlock_bh(&tcp_lock);
+                       spin_unlock_bh(&ct->lock);
 
                        /* Only repeat if we can actually remove the timer.
                         * Destruction may already be in progress in process
@@ -903,24 +896,55 @@ static int tcp_packet(struct nf_conn *ct,
                        /* b) This SYN/ACK acknowledges a SYN that we earlier
                         * ignored as invalid. This means that the client and
                         * the server are both in sync, while the firewall is
-                        * not. We kill this session and block the SYN/ACK so
-                        * that the client cannot but retransmit its SYN and
-                        * thus initiate a clean new session.
+                        * not. We get in sync from the previously annotated
+                        * values.
                         */
-                       write_unlock_bh(&tcp_lock);
-                       if (LOG_INVALID(net, IPPROTO_TCP))
-                               nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
-                                         "nf_ct_tcp: killing out of sync session ");
-                       nf_ct_kill(ct);
-                       return NF_DROP;
+                       old_state = TCP_CONNTRACK_SYN_SENT;
+                       new_state = TCP_CONNTRACK_SYN_RECV;
+                       ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
+                               ct->proto.tcp.last_end;
+                       ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
+                               ct->proto.tcp.last_end;
+                       ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
+                               ct->proto.tcp.last_win == 0 ?
+                                       1 : ct->proto.tcp.last_win;
+                       ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
+                               ct->proto.tcp.last_wscale;
+                       ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
+                               ct->proto.tcp.last_flags;
+                       memset(&ct->proto.tcp.seen[dir], 0,
+                              sizeof(struct ip_ct_tcp_state));
+                       break;
                }
                ct->proto.tcp.last_index = index;
                ct->proto.tcp.last_dir = dir;
                ct->proto.tcp.last_seq = ntohl(th->seq);
                ct->proto.tcp.last_end =
                    segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
-
-               write_unlock_bh(&tcp_lock);
+               ct->proto.tcp.last_win = ntohs(th->window);
+
+               /* a) This is a SYN in ORIGINAL. The client and the server
+                * may be in sync but we are not. In that case, we annotate
+                * the TCP options and let the packet go through. If it is a
+                * valid SYN packet, the server will reply with a SYN/ACK, and
+                * then we'll get in sync. Otherwise, the server ignores it. */
+               if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
+                       struct ip_ct_tcp_state seen = {};
+
+                       ct->proto.tcp.last_flags =
+                       ct->proto.tcp.last_wscale = 0;
+                       tcp_options(skb, dataoff, th, &seen);
+                       if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
+                               ct->proto.tcp.last_flags |=
+                                       IP_CT_TCP_FLAG_WINDOW_SCALE;
+                               ct->proto.tcp.last_wscale = seen.td_scale;
+                       }
+                       if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
+                               ct->proto.tcp.last_flags |=
+                                       IP_CT_TCP_FLAG_SACK_PERM;
+                       }
+               }
+               spin_unlock_bh(&ct->lock);
                if (LOG_INVALID(net, IPPROTO_TCP))
                        nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
                                  "nf_ct_tcp: invalid packet ignored ");
@@ -929,13 +953,23 @@ static int tcp_packet(struct nf_conn *ct,
                /* Invalid packet */
                pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
                         dir, get_conntrack_index(th), old_state);
-               write_unlock_bh(&tcp_lock);
+               spin_unlock_bh(&ct->lock);
                if (LOG_INVALID(net, IPPROTO_TCP))
                        nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
                                  "nf_ct_tcp: invalid state ");
                return -NF_ACCEPT;
        case TCP_CONNTRACK_CLOSE:
                if (index == TCP_RST_SET
+                   && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)
+                   && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) {
+                       /* Invalid RST  */
+                       spin_unlock_bh(&ct->lock);
+                       if (LOG_INVALID(net, IPPROTO_TCP))
+                               nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+                                         "nf_ct_tcp: invalid RST ");
+                       return -NF_ACCEPT;
+               }
+               if (index == TCP_RST_SET
                    && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
                         && ct->proto.tcp.last_index == TCP_SYN_SET)
                        || (!test_bit(IPS_ASSURED_BIT, &ct->status)
@@ -960,7 +994,7 @@ static int tcp_packet(struct nf_conn *ct,
 
        if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
                           skb, dataoff, th, pf)) {
-               write_unlock_bh(&tcp_lock);
+               spin_unlock_bh(&ct->lock);
                return -NF_ACCEPT;
        }
      in_window:
@@ -989,7 +1023,7 @@ static int tcp_packet(struct nf_conn *ct,
                timeout = nf_ct_tcp_timeout_unacknowledged;
        else
                timeout = tcp_timeouts[new_state];
-       write_unlock_bh(&tcp_lock);
+       spin_unlock_bh(&ct->lock);
 
        if (new_state != old_state)
                nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
@@ -1011,7 +1045,7 @@ static int tcp_packet(struct nf_conn *ct,
                   after SYN_RECV or a valid answer for a picked up
                   connection. */
                set_bit(IPS_ASSURED_BIT, &ct->status);
-               nf_conntrack_event_cache(IPCT_STATUS, ct);
+               nf_conntrack_event_cache(IPCT_ASSURED, ct);
        }
        nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
 
@@ -1106,12 +1140,12 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
 #include <linux/netfilter/nfnetlink_conntrack.h>
 
 static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
-                        const struct nf_conn *ct)
+                        struct nf_conn *ct)
 {
        struct nlattr *nest_parms;
        struct nf_ct_tcp_flags tmp = {};
 
-       read_lock_bh(&tcp_lock);
+       spin_lock_bh(&ct->lock);
        nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
        if (!nest_parms)
                goto nla_put_failure;
@@ -1131,14 +1165,14 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
        tmp.flags = ct->proto.tcp.seen[1].flags;
        NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
                sizeof(struct nf_ct_tcp_flags), &tmp);
-       read_unlock_bh(&tcp_lock);
+       spin_unlock_bh(&ct->lock);
 
        nla_nest_end(skb, nest_parms);
 
        return 0;
 
 nla_put_failure:
-       read_unlock_bh(&tcp_lock);
+       spin_unlock_bh(&ct->lock);
        return -1;
 }
 
@@ -1169,7 +1203,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
            nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
                return -EINVAL;
 
-       write_lock_bh(&tcp_lock);
+       spin_lock_bh(&ct->lock);
        if (tb[CTA_PROTOINFO_TCP_STATE])
                ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
 
@@ -1196,7 +1230,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
                ct->proto.tcp.seen[1].td_scale =
                        nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
        }
-       write_unlock_bh(&tcp_lock);
+       spin_unlock_bh(&ct->lock);
 
        return 0;
 }
@@ -1288,7 +1322,6 @@ static struct ctl_table tcp_sysctl_table[] = {
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
-               .ctl_name       = NET_NF_CONNTRACK_TCP_LOOSE,
                .procname       = "nf_conntrack_tcp_loose",
                .data           = &nf_ct_tcp_loose,
                .maxlen         = sizeof(unsigned int),
@@ -1296,7 +1329,6 @@ static struct ctl_table tcp_sysctl_table[] = {
                .proc_handler   = proc_dointvec,
        },
        {
-               .ctl_name       = NET_NF_CONNTRACK_TCP_BE_LIBERAL,
                .procname       = "nf_conntrack_tcp_be_liberal",
                .data           = &nf_ct_tcp_be_liberal,
                .maxlen         = sizeof(unsigned int),
@@ -1304,16 +1336,13 @@ static struct ctl_table tcp_sysctl_table[] = {
                .proc_handler   = proc_dointvec,
        },
        {
-               .ctl_name       = NET_NF_CONNTRACK_TCP_MAX_RETRANS,
                .procname       = "nf_conntrack_tcp_max_retrans",
                .data           = &nf_ct_tcp_max_retrans,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
-       {
-               .ctl_name       = 0
-       }
+       { }
 };
 
 #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
@@ -1389,7 +1418,6 @@ static struct ctl_table tcp_compat_sysctl_table[] = {
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
-               .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_LOOSE,
                .procname       = "ip_conntrack_tcp_loose",
                .data           = &nf_ct_tcp_loose,
                .maxlen         = sizeof(unsigned int),
@@ -1397,7 +1425,6 @@ static struct ctl_table tcp_compat_sysctl_table[] = {
                .proc_handler   = proc_dointvec,
        },
        {
-               .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,
                .procname       = "ip_conntrack_tcp_be_liberal",
                .data           = &nf_ct_tcp_be_liberal,
                .maxlen         = sizeof(unsigned int),
@@ -1405,16 +1432,13 @@ static struct ctl_table tcp_compat_sysctl_table[] = {
                .proc_handler   = proc_dointvec,
        },
        {
-               .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,
                .procname       = "ip_conntrack_tcp_max_retrans",
                .data           = &nf_ct_tcp_max_retrans,
                .maxlen         = sizeof(unsigned int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
-       {
-               .ctl_name       = 0
-       }
+       { }
 };
 #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
 #endif /* CONFIG_SYSCTL */