netfilter: ebtables: abort if next_offset is too small

[safe/jmp/linux-2.6] / net / netfilter / nf_conntrack_proto_tcp.c
diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
index 4c7f6f0..9dd8cd4 100644
--- a/net/netfilter/nf_conntrack_proto_tcp.c
+++ b/net/netfilter/nf_conntrack_proto_tcp.c
@@ -29,9 +29,6 @@
  #include <net/netfilter/ipv4/nf_conntrack_ipv4.h>
  #include <net/netfilter/ipv6/nf_conntrack_ipv6.h>
  
-/* Protects ct->proto.tcp */
-static DEFINE_RWLOCK(tcp_lock);
-
  /* "Be conservative in what you do,
      be liberal in what you accept from others."
      If it's non-zero, we mark only out of window RST segments as INVALID. */
@@ -309,13 +306,13 @@ static int tcp_print_tuple(struct seq_file *s,
  }
  
  /* Print out the private part of the conntrack. */
-static int tcp_print_conntrack(struct seq_file *s, const struct nf_conn *ct)
+static int tcp_print_conntrack(struct seq_file *s, struct nf_conn *ct)
  {
         enum tcp_conntrack state;
  
-       read_lock_bh(&tcp_lock);
+       spin_lock_bh(&ct->lock);
         state = ct->proto.tcp.state;
-       read_unlock_bh(&tcp_lock);
+       spin_unlock_bh(&ct->lock);
  
         return seq_printf(s, "%s ", tcp_conntrack_names[state]);
  }
@@ -495,6 +492,21 @@ static void tcp_sack(const struct sk_buff *skb, unsigned int dataoff,
         }
  }
  
+#ifdef CONFIG_NF_NAT_NEEDED
+static inline s16 nat_offset(const struct nf_conn *ct,
+                            enum ip_conntrack_dir dir,
+                            u32 seq)
+{
+       typeof(nf_ct_nat_offset) get_offset = rcu_dereference(nf_ct_nat_offset);
+
+       return get_offset != NULL ? get_offset(ct, dir, seq) : 0;
+}
+#define NAT_OFFSET(pf, ct, dir, seq) \
+       (pf == NFPROTO_IPV4 ? nat_offset(ct, dir, seq) : 0)
+#else
+#define NAT_OFFSET(pf, ct, dir, seq)   0
+#endif
+
  static bool tcp_in_window(const struct nf_conn *ct,
                           struct ip_ct_tcp *state,
                           enum ip_conntrack_dir dir,
@@ -509,6 +521,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
         struct ip_ct_tcp_state *receiver = &state->seen[!dir];
         const struct nf_conntrack_tuple *tuple = &ct->tuplehash[dir].tuple;
         __u32 seq, ack, sack, end, win, swin;
+       s16 receiver_offset;
         bool res;
  
         /*
@@ -522,11 +535,16 @@ static bool tcp_in_window(const struct nf_conn *ct,
         if (receiver->flags & IP_CT_TCP_FLAG_SACK_PERM)
                 tcp_sack(skb, dataoff, tcph, &sack);
  
+       /* Take into account NAT sequence number mangling */
+       receiver_offset = NAT_OFFSET(pf, ct, !dir, ack - 1);
+       ack -= receiver_offset;
+       sack -= receiver_offset;
+
         pr_debug("tcp_in_window: START\n");
         pr_debug("tcp_in_window: ");
         nf_ct_dump_tuple(tuple);
-       pr_debug("seq=%u ack=%u sack=%u win=%u end=%u\n",
-                seq, ack, sack, win, end);
+       pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
+                seq, ack, receiver_offset, sack, receiver_offset, win, end);
         pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
                  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
                  sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -616,8 +634,8 @@ static bool tcp_in_window(const struct nf_conn *ct,
  
         pr_debug("tcp_in_window: ");
         nf_ct_dump_tuple(tuple);
-       pr_debug("seq=%u ack=%u sack =%u win=%u end=%u\n",
-                seq, ack, sack, win, end);
+       pr_debug("seq=%u ack=%u+(%d) sack=%u+(%d) win=%u end=%u\n",
+                seq, ack, receiver_offset, sack, receiver_offset, win, end);
         pr_debug("tcp_in_window: sender end=%u maxend=%u maxwin=%u scale=%i "
                  "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
                  sender->td_end, sender->td_maxend, sender->td_maxwin,
@@ -651,6 +669,14 @@ static bool tcp_in_window(const struct nf_conn *ct,
                         sender->td_end = end;
                         sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED;
                 }
+               if (tcph->ack) {
+                       if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) {
+                               sender->td_maxack = ack;
+                               sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET;
+                       } else if (after(ack, sender->td_maxack))
+                               sender->td_maxack = ack;
+               }
+
                 /*
                  * Update receiver data.
                  */
@@ -695,7 +721,7 @@ static bool tcp_in_window(const struct nf_conn *ct,
                         before(seq, sender->td_maxend + 1) ?
                         after(end, sender->td_end - receiver->td_maxwin - 1) ?
                         before(sack, receiver->td_end + 1) ?
-                       after(ack, receiver->td_end - MAXACKWINDOW(sender)) ? "BUG"
+                       after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG"
                         : "ACK is under the lower bound (possible overly delayed ACK)"
                         : "ACK is over the upper bound (ACKed data not seen yet)"
                         : "SEQ is under the lower bound (already ACKed data retransmitted)"
@@ -710,39 +736,6 @@ static bool tcp_in_window(const struct nf_conn *ct,
         return res;
  }
  
-#ifdef CONFIG_NF_NAT_NEEDED
-/* Update sender->td_end after NAT successfully mangled the packet */
-/* Caller must linearize skb at tcp header. */
-void nf_conntrack_tcp_update(const struct sk_buff *skb,
-                            unsigned int dataoff,
-                            struct nf_conn *ct,
-                            int dir)
-{
-       const struct tcphdr *tcph = (const void *)skb->data + dataoff;
-       const struct ip_ct_tcp_state *sender = &ct->proto.tcp.seen[dir];
-       const struct ip_ct_tcp_state *receiver = &ct->proto.tcp.seen[!dir];
-       __u32 end;
-
-       end = segment_seq_plus_len(ntohl(tcph->seq), skb->len, dataoff, tcph);
-
-       write_lock_bh(&tcp_lock);
-       /*
-        * We have to worry for the ack in the reply packet only...
-        */
-       if (after(end, ct->proto.tcp.seen[dir].td_end))
-               ct->proto.tcp.seen[dir].td_end = end;
-       ct->proto.tcp.last_end = end;
-       write_unlock_bh(&tcp_lock);
-       pr_debug("tcp_update: sender end=%u maxend=%u maxwin=%u scale=%i "
-                "receiver end=%u maxend=%u maxwin=%u scale=%i\n",
-                sender->td_end, sender->td_maxend, sender->td_maxwin,
-                sender->td_scale,
-                receiver->td_end, receiver->td_maxend, receiver->td_maxwin,
-                receiver->td_scale);
-}
-EXPORT_SYMBOL_GPL(nf_conntrack_tcp_update);
-#endif
-
  #define        TH_FIN  0x01
  #define        TH_SYN  0x02
  #define        TH_RST  0x04
@@ -767,7 +760,7 @@ static const u8 tcp_valid_flags[(TH_FIN|TH_SYN|TH_RST|TH_ACK|TH_URG) + 1] =
  };
  
  /* Protect conntrack agaist broken packets. Code taken from ipt_unclean.c.  */
-static int tcp_error(struct net *net,
+static int tcp_error(struct net *net, struct nf_conn *tmpl,
                      struct sk_buff *skb,
                      unsigned int dataoff,
                      enum ip_conntrack_info *ctinfo,
@@ -841,7 +834,7 @@ static int tcp_packet(struct nf_conn *ct,
         th = skb_header_pointer(skb, dataoff, sizeof(_tcph), &_tcph);
         BUG_ON(th == NULL);
  
-       write_lock_bh(&tcp_lock);
+       spin_lock_bh(&ct->lock);
         old_state = ct->proto.tcp.state;
         dir = CTINFO2DIR(ctinfo);
         index = get_conntrack_index(th);
@@ -871,7 +864,7 @@ static int tcp_packet(struct nf_conn *ct,
                         && ct->proto.tcp.last_index == TCP_RST_SET)) {
                         /* Attempt to reopen a closed/aborted connection.
                          * Delete this connection and look up again. */
-                       write_unlock_bh(&tcp_lock);
+                       spin_unlock_bh(&ct->lock);
  
                         /* Only repeat if we can actually remove the timer.
                          * Destruction may already be in progress in process
@@ -903,24 +896,55 @@ static int tcp_packet(struct nf_conn *ct,
                         /* b) This SYN/ACK acknowledges a SYN that we earlier
                          * ignored as invalid. This means that the client and
                          * the server are both in sync, while the firewall is
-                        * not. We kill this session and block the SYN/ACK so
-                        * that the client cannot but retransmit its SYN and
-                        * thus initiate a clean new session.
+                        * not. We get in sync from the previously annotated
+                        * values.
                          */
-                       write_unlock_bh(&tcp_lock);
-                       if (LOG_INVALID(net, IPPROTO_TCP))
-                               nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
-                                         "nf_ct_tcp: killing out of sync session ");
-                       nf_ct_kill(ct);
-                       return NF_DROP;
+                       old_state = TCP_CONNTRACK_SYN_SENT;
+                       new_state = TCP_CONNTRACK_SYN_RECV;
+                       ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_end =
+                               ct->proto.tcp.last_end;
+                       ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxend =
+                               ct->proto.tcp.last_end;
+                       ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_maxwin =
+                               ct->proto.tcp.last_win == 0 ?
+                                       1 : ct->proto.tcp.last_win;
+                       ct->proto.tcp.seen[ct->proto.tcp.last_dir].td_scale =
+                               ct->proto.tcp.last_wscale;
+                       ct->proto.tcp.seen[ct->proto.tcp.last_dir].flags =
+                               ct->proto.tcp.last_flags;
+                       memset(&ct->proto.tcp.seen[dir], 0,
+                              sizeof(struct ip_ct_tcp_state));
+                       break;
                 }
                 ct->proto.tcp.last_index = index;
                 ct->proto.tcp.last_dir = dir;
                 ct->proto.tcp.last_seq = ntohl(th->seq);
                 ct->proto.tcp.last_end =
                     segment_seq_plus_len(ntohl(th->seq), skb->len, dataoff, th);
-
-               write_unlock_bh(&tcp_lock);
+               ct->proto.tcp.last_win = ntohs(th->window);
+
+               /* a) This is a SYN in ORIGINAL. The client and the server
+                * may be in sync but we are not. In that case, we annotate
+                * the TCP options and let the packet go through. If it is a
+                * valid SYN packet, the server will reply with a SYN/ACK, and
+                * then we'll get in sync. Otherwise, the server ignores it. */
+               if (index == TCP_SYN_SET && dir == IP_CT_DIR_ORIGINAL) {
+                       struct ip_ct_tcp_state seen = {};
+
+                       ct->proto.tcp.last_flags =
+                       ct->proto.tcp.last_wscale = 0;
+                       tcp_options(skb, dataoff, th, &seen);
+                       if (seen.flags & IP_CT_TCP_FLAG_WINDOW_SCALE) {
+                               ct->proto.tcp.last_flags |=
+                                       IP_CT_TCP_FLAG_WINDOW_SCALE;
+                               ct->proto.tcp.last_wscale = seen.td_scale;
+                       }
+                       if (seen.flags & IP_CT_TCP_FLAG_SACK_PERM) {
+                               ct->proto.tcp.last_flags |=
+                                       IP_CT_TCP_FLAG_SACK_PERM;
+                       }
+               }
+               spin_unlock_bh(&ct->lock);
                 if (LOG_INVALID(net, IPPROTO_TCP))
                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
                                   "nf_ct_tcp: invalid packet ignored ");
@@ -929,13 +953,23 @@ static int tcp_packet(struct nf_conn *ct,
                 /* Invalid packet */
                 pr_debug("nf_ct_tcp: Invalid dir=%i index=%u ostate=%u\n",
                          dir, get_conntrack_index(th), old_state);
-               write_unlock_bh(&tcp_lock);
+               spin_unlock_bh(&ct->lock);
                 if (LOG_INVALID(net, IPPROTO_TCP))
                         nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
                                   "nf_ct_tcp: invalid state ");
                 return -NF_ACCEPT;
         case TCP_CONNTRACK_CLOSE:
                 if (index == TCP_RST_SET
+                   && (ct->proto.tcp.seen[!dir].flags & IP_CT_TCP_FLAG_MAXACK_SET)
+                   && before(ntohl(th->seq), ct->proto.tcp.seen[!dir].td_maxack)) {
+                       /* Invalid RST: seq is before peer's max ACK */
+                       spin_unlock_bh(&ct->lock);
+                       if (LOG_INVALID(net, IPPROTO_TCP))
+                               nf_log_packet(pf, 0, skb, NULL, NULL, NULL,
+                                         "nf_ct_tcp: invalid RST ");
+                       return -NF_ACCEPT;
+               }
+               if (index == TCP_RST_SET
                     && ((test_bit(IPS_SEEN_REPLY_BIT, &ct->status)
                          && ct->proto.tcp.last_index == TCP_SYN_SET)
                         || (!test_bit(IPS_ASSURED_BIT, &ct->status)
@@ -960,7 +994,7 @@ static int tcp_packet(struct nf_conn *ct,
  
         if (!tcp_in_window(ct, &ct->proto.tcp, dir, index,
                            skb, dataoff, th, pf)) {
-               write_unlock_bh(&tcp_lock);
+               spin_unlock_bh(&ct->lock);
                 return -NF_ACCEPT;
         }
       in_window:
@@ -989,9 +1023,8 @@ static int tcp_packet(struct nf_conn *ct,
                 timeout = nf_ct_tcp_timeout_unacknowledged;
         else
                 timeout = tcp_timeouts[new_state];
-       write_unlock_bh(&tcp_lock);
+       spin_unlock_bh(&ct->lock);
  
-       nf_conntrack_event_cache(IPCT_PROTOINFO_VOLATILE, ct);
         if (new_state != old_state)
                 nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
  
@@ -1012,7 +1045,7 @@ static int tcp_packet(struct nf_conn *ct,
                    after SYN_RECV or a valid answer for a picked up
                    connection. */
                 set_bit(IPS_ASSURED_BIT, &ct->status);
-               nf_conntrack_event_cache(IPCT_STATUS, ct);
+               nf_conntrack_event_cache(IPCT_ASSURED, ct);
         }
         nf_ct_refresh_acct(ct, ctinfo, skb, timeout);
  
@@ -1107,12 +1140,12 @@ static bool tcp_new(struct nf_conn *ct, const struct sk_buff *skb,
  #include <linux/netfilter/nfnetlink_conntrack.h>
  
  static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
-                        const struct nf_conn *ct)
+                        struct nf_conn *ct)
  {
         struct nlattr *nest_parms;
         struct nf_ct_tcp_flags tmp = {};
  
-       read_lock_bh(&tcp_lock);
+       spin_lock_bh(&ct->lock);
         nest_parms = nla_nest_start(skb, CTA_PROTOINFO_TCP | NLA_F_NESTED);
         if (!nest_parms)
                 goto nla_put_failure;
@@ -1132,14 +1165,14 @@ static int tcp_to_nlattr(struct sk_buff *skb, struct nlattr *nla,
         tmp.flags = ct->proto.tcp.seen[1].flags;
         NLA_PUT(skb, CTA_PROTOINFO_TCP_FLAGS_REPLY,
                 sizeof(struct nf_ct_tcp_flags), &tmp);
-       read_unlock_bh(&tcp_lock);
+       spin_unlock_bh(&ct->lock);
  
         nla_nest_end(skb, nest_parms);
  
         return 0;
  
  nla_put_failure:
-       read_unlock_bh(&tcp_lock);
+       spin_unlock_bh(&ct->lock);
         return -1;
  }
  
@@ -1170,7 +1203,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
             nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]) >= TCP_CONNTRACK_MAX)
                 return -EINVAL;
  
-       write_lock_bh(&tcp_lock);
+       spin_lock_bh(&ct->lock);
         if (tb[CTA_PROTOINFO_TCP_STATE])
                 ct->proto.tcp.state = nla_get_u8(tb[CTA_PROTOINFO_TCP_STATE]);
  
@@ -1197,7 +1230,7 @@ static int nlattr_to_tcp(struct nlattr *cda[], struct nf_conn *ct)
                 ct->proto.tcp.seen[1].td_scale =
                         nla_get_u8(tb[CTA_PROTOINFO_TCP_WSCALE_REPLY]);
         }
-       write_unlock_bh(&tcp_lock);
+       spin_unlock_bh(&ct->lock);
  
         return 0;
  }
@@ -1289,7 +1322,6 @@ static struct ctl_table tcp_sysctl_table[] = {
                 .proc_handler   = proc_dointvec_jiffies,
         },
         {
-               .ctl_name       = NET_NF_CONNTRACK_TCP_LOOSE,
                 .procname       = "nf_conntrack_tcp_loose",
                 .data           = &nf_ct_tcp_loose,
                 .maxlen         = sizeof(unsigned int),
@@ -1297,7 +1329,6 @@ static struct ctl_table tcp_sysctl_table[] = {
                 .proc_handler   = proc_dointvec,
         },
         {
-               .ctl_name       = NET_NF_CONNTRACK_TCP_BE_LIBERAL,
                 .procname       = "nf_conntrack_tcp_be_liberal",
                 .data           = &nf_ct_tcp_be_liberal,
                 .maxlen         = sizeof(unsigned int),
@@ -1305,16 +1336,13 @@ static struct ctl_table tcp_sysctl_table[] = {
                 .proc_handler   = proc_dointvec,
         },
         {
-               .ctl_name       = NET_NF_CONNTRACK_TCP_MAX_RETRANS,
                 .procname       = "nf_conntrack_tcp_max_retrans",
                 .data           = &nf_ct_tcp_max_retrans,
                 .maxlen         = sizeof(unsigned int),
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec,
         },
-       {
-               .ctl_name       = 0
-       }
+       { }
  };
  
  #ifdef CONFIG_NF_CONNTRACK_PROC_COMPAT
@@ -1390,7 +1418,6 @@ static struct ctl_table tcp_compat_sysctl_table[] = {
                 .proc_handler   = proc_dointvec_jiffies,
         },
         {
-               .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_LOOSE,
                 .procname       = "ip_conntrack_tcp_loose",
                 .data           = &nf_ct_tcp_loose,
                 .maxlen         = sizeof(unsigned int),
@@ -1398,7 +1425,6 @@ static struct ctl_table tcp_compat_sysctl_table[] = {
                 .proc_handler   = proc_dointvec,
         },
         {
-               .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_BE_LIBERAL,
                 .procname       = "ip_conntrack_tcp_be_liberal",
                 .data           = &nf_ct_tcp_be_liberal,
                 .maxlen         = sizeof(unsigned int),
@@ -1406,16 +1432,13 @@ static struct ctl_table tcp_compat_sysctl_table[] = {
                 .proc_handler   = proc_dointvec,
         },
         {
-               .ctl_name       = NET_IPV4_NF_CONNTRACK_TCP_MAX_RETRANS,
                 .procname       = "ip_conntrack_tcp_max_retrans",
                 .data           = &nf_ct_tcp_max_retrans,
                 .maxlen         = sizeof(unsigned int),
                 .mode           = 0644,
                 .proc_handler   = proc_dointvec,
         },
-       {
-               .ctl_name       = 0
-       }
+       { }
  };
  #endif /* CONFIG_NF_CONNTRACK_PROC_COMPAT */
  #endif /* CONFIG_SYSCTL */