Merge branch 'master' of /home/davem/src/GIT/linux-2.6/
[safe/jmp/linux-2.6] / net / core / filter.c
index 3a10e0b..d38ef7f 100644 (file)
  * 2 of the License, or (at your option) any later version.
  *
  * Andi Kleen - Fix a few bad bugs and races.
+ * Kris Katterjohn - Added many additional checks in sk_chk_filter()
  */
 
 #include <linux/module.h>
 #include <linux/types.h>
-#include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/fcntl.h>
 #include <linux/socket.h>
 #include <linux/if_packet.h>
 #include <net/ip.h>
 #include <net/protocol.h>
+#include <net/netlink.h>
 #include <linux/skbuff.h>
 #include <net/sock.h>
 #include <linux/errno.h>
 #include <linux/timer.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
+#include <asm/unaligned.h>
 #include <linux/filter.h>
 
 /* No hurry in this branch */
@@ -41,17 +43,17 @@ static void *__load_pointer(struct sk_buff *skb, int k)
        u8 *ptr = NULL;
 
        if (k >= SKF_NET_OFF)
-               ptr = skb->nh.raw + k - SKF_NET_OFF;
+               ptr = skb_network_header(skb) + k - SKF_NET_OFF;
        else if (k >= SKF_LL_OFF)
-               ptr = skb->mac.raw + k - SKF_LL_OFF;
+               ptr = skb_mac_header(skb) + k - SKF_LL_OFF;
 
-       if (ptr >= skb->head && ptr < skb->tail)
+       if (ptr >= skb->head && ptr < skb_tail_pointer(skb))
                return ptr;
        return NULL;
 }
 
 static inline void *load_pointer(struct sk_buff *skb, int k,
-                                 unsigned int size, void *buffer)
+                                unsigned int size, void *buffer)
 {
        if (k >= 0)
                return skb_header_pointer(skb, k, size, buffer);
@@ -63,7 +65,41 @@ static inline void *load_pointer(struct sk_buff *skb, int k,
 }
 
 /**
- *     sk_run_filter   -       run a filter on a socket
+ *     sk_filter - run a packet through a socket filter
+ *     @sk: sock associated with &sk_buff
+ *     @skb: buffer to filter
+ *
+ * Run the filter code and then cut skb->data to correct size returned by
+ * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
+ * than pkt_len we keep whole skb->data. This is the socket level
+ * wrapper to sk_run_filter. It returns 0 if the packet should
+ * be accepted or -EPERM if the packet should be tossed.
+ *
+ */
+int sk_filter(struct sock *sk, struct sk_buff *skb)
+{
+       struct sk_filter *filter;
+       unsigned int pkt_len;
+       int err = security_sock_rcv_skb(sk, skb);
+
+       /* The security hook gets the first say; its error is final. */
+       if (err)
+               return err;
+
+       rcu_read_lock_bh();
+       filter = rcu_dereference_bh(sk->sk_filter);
+       if (filter) {
+               pkt_len = sk_run_filter(skb, filter->insns, filter->len);
+               /* A zero length from the filter means "drop the packet". */
+               if (pkt_len)
+                       err = pskb_trim(skb, pkt_len);
+               else
+                       err = -EPERM;
+       }
+       rcu_read_unlock_bh();
+
+       return err;
+}
+EXPORT_SYMBOL(sk_filter);
+
+/**
+ *     sk_run_filter - run a filter on a socket
  *     @skb: buffer to run the filter on
  *     @filter: filter to apply
  *     @flen: length of filter
@@ -73,13 +109,12 @@ static inline void *load_pointer(struct sk_buff *skb, int k,
  * filtering, filter is the array of filter instructions, and
  * flen is the number of filter blocks in the array.
  */
-int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
+unsigned int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
 {
        struct sock_filter *fentry;     /* We walk down these */
        void *ptr;
-       u32 A = 0;                      /* Accumulator */
-       u32 X = 0;                      /* Index Register */
+       u32 A = 0;                      /* Accumulator */
+       u32 X = 0;                      /* Index Register */
        u32 mem[BPF_MEMWORDS];          /* Scratch Memory Store */
        u32 tmp;
        int k;
@@ -90,7 +125,7 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
         */
        for (pc = 0; pc < flen; pc++) {
                fentry = &filter[pc];
-                       
+
                switch (fentry->code) {
                case BPF_ALU|BPF_ADD|BPF_X:
                        A += X;
@@ -174,19 +209,19 @@ int sk_run_filter(struct sk_buff *skb, struct sock_filter *filter, int flen)
                        continue;
                case BPF_LD|BPF_W|BPF_ABS:
                        k = fentry->k;
- load_w:
+load_w:
                        ptr = load_pointer(skb, k, 4, &tmp);
                        if (ptr != NULL) {
-                               A = ntohl(*(u32 *)ptr);
+                               A = get_unaligned_be32(ptr);
                                continue;
                        }
                        break;
                case BPF_LD|BPF_H|BPF_ABS:
                        k = fentry->k;
- load_h:
+load_h:
                        ptr = load_pointer(skb, k, 2, &tmp);
                        if (ptr != NULL) {
-                               A = ntohs(*(u16 *)ptr);
+                               A = get_unaligned_be16(ptr);
                                continue;
                        }
                        break;
@@ -240,9 +275,9 @@ load_b:
                        A = X;
                        continue;
                case BPF_RET|BPF_K:
-                       return ((unsigned int)fentry->k);
+                       return fentry->k;
                case BPF_RET|BPF_A:
-                       return ((unsigned int)A);
+                       return A;
                case BPF_ST:
                        mem[fentry->k] = A;
                        continue;
@@ -250,7 +285,7 @@ load_b:
                        mem[fentry->k] = X;
                        continue;
                default:
-                       /* Invalid instruction counts as RET */
+                       WARN_ON(1);
                        return 0;
                }
 
@@ -260,7 +295,7 @@ load_b:
                 */
                switch (k-SKF_AD_OFF) {
                case SKF_AD_PROTOCOL:
-                       A = htons(skb->protocol);
+                       A = ntohs(skb->protocol);
                        continue;
                case SKF_AD_PKTTYPE:
                        A = skb->pkt_type;
@@ -268,6 +303,47 @@ load_b:
                case SKF_AD_IFINDEX:
                        A = skb->dev->ifindex;
                        continue;
+               case SKF_AD_MARK:
+                       A = skb->mark;
+                       continue;
+               case SKF_AD_QUEUE:
+                       A = skb->queue_mapping;
+                       continue;
+               case SKF_AD_NLATTR: {
+                       struct nlattr *nla;
+
+                       /*
+                        * Search the packet for a netlink attribute, starting
+                        * at byte offset A and passing X to nla_find() as the
+                        * attribute type. On a match A becomes the offset of
+                        * the attribute from skb->data, otherwise A is set
+                        * to 0. Packets that fail a sanity check are dropped.
+                        */
+                       if (skb_is_nonlinear(skb))
+                               return 0;
+                       /*
+                        * Reject packets shorter than one attribute header
+                        * first: the subtraction below is done in unsigned
+                        * arithmetic and would otherwise wrap around, letting
+                        * any value of A pass the bounds check.
+                        */
+                       if (skb->len < sizeof(struct nlattr))
+                               return 0;
+                       if (A > skb->len - sizeof(struct nlattr))
+                               return 0;
+
+                       nla = nla_find((struct nlattr *)&skb->data[A],
+                                      skb->len - A, X);
+                       if (nla)
+                               A = (void *)nla - (void *)skb->data;
+                       else
+                               A = 0;
+                       continue;
+               }
+               case SKF_AD_NLATTR_NEST: {
+                       struct nlattr *nla;
+
+                       /*
+                        * Treat the attribute at byte offset A as a container
+                        * and search inside it, passing X to
+                        * nla_find_nested() as the attribute type. On a match
+                        * A becomes the offset of the nested attribute from
+                        * skb->data, otherwise A is set to 0. Packets that
+                        * fail a sanity check are dropped.
+                        */
+                       if (skb_is_nonlinear(skb))
+                               return 0;
+                       /*
+                        * Reject packets shorter than one attribute header
+                        * first: the subtraction below is done in unsigned
+                        * arithmetic and would otherwise wrap around, letting
+                        * any value of A pass the bounds check.
+                        */
+                       if (skb->len < sizeof(struct nlattr))
+                               return 0;
+                       if (A > skb->len - sizeof(struct nlattr))
+                               return 0;
+
+                       nla = (struct nlattr *)&skb->data[A];
+                       /*
+                        * The container must fit in the bytes that remain
+                        * after offset A. A <= skb->len is guaranteed by the
+                        * checks above, so this subtraction cannot wrap.
+                        */
+                       if (nla->nla_len > skb->len - A)
+                               return 0;
+
+                       nla = nla_find_nested(nla, X);
+                       if (nla)
+                               A = (void *)nla - (void *)skb->data;
+                       else
+                               A = 0;
+                       continue;
+               }
                default:
                        return 0;
                }
@@ -275,6 +351,7 @@ load_b:
 
        return 0;
 }
+EXPORT_SYMBOL(sk_run_filter);
 
 /**
  *     sk_chk_filter - verify socket filter code
@@ -283,10 +360,12 @@ load_b:
  *
  * Check the user's filter code. If we let some ugly
  * filter code slip through kaboom! The filter must contain
- * no references or jumps that are out of range, no illegal instructions
- * and no backward jumps. It must end with a RET instruction
+ * no references or jumps that are out of range, no illegal
+ * instructions, and must end with a RET instruction.
+ *
+ * All jumps are forward as they are not signed.
  *
- * Returns 0 if the rule set is legal or a negative errno code if not.
+ * Returns 0 if the rule set is legal or -EINVAL if not.
  */
 int sk_chk_filter(struct sock_filter *filter, int flen)
 {
@@ -298,49 +377,110 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
 
        /* check the filter code now */
        for (pc = 0; pc < flen; pc++) {
-               /* all jumps are forward as they are not signed */
                ftest = &filter[pc];
-               if (BPF_CLASS(ftest->code) == BPF_JMP) {
-                       /* but they mustn't jump off the end */
-                       if (BPF_OP(ftest->code) == BPF_JA) {
-                               /*
-                                * Note, the large ftest->k might cause loops.
-                                * Compare this with conditional jumps below,
-                                * where offsets are limited. --ANK (981016)
-                                */
-                               if (ftest->k >= (unsigned)(flen-pc-1))
-                                       return -EINVAL;
-                       } else {
-                               /* for conditionals both must be safe */
-                               if (pc + ftest->jt +1 >= flen ||
-                                   pc + ftest->jf +1 >= flen)
-                                       return -EINVAL;
-                       }
-               }
 
-               /* check for division by zero   -Kris Katterjohn 2005-10-30 */
-               if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0)
-                       return -EINVAL;
+               /* Only allow valid instructions */
+               switch (ftest->code) {
+               case BPF_ALU|BPF_ADD|BPF_K:
+               case BPF_ALU|BPF_ADD|BPF_X:
+               case BPF_ALU|BPF_SUB|BPF_K:
+               case BPF_ALU|BPF_SUB|BPF_X:
+               case BPF_ALU|BPF_MUL|BPF_K:
+               case BPF_ALU|BPF_MUL|BPF_X:
+               case BPF_ALU|BPF_DIV|BPF_X:
+               case BPF_ALU|BPF_AND|BPF_K:
+               case BPF_ALU|BPF_AND|BPF_X:
+               case BPF_ALU|BPF_OR|BPF_K:
+               case BPF_ALU|BPF_OR|BPF_X:
+               case BPF_ALU|BPF_LSH|BPF_K:
+               case BPF_ALU|BPF_LSH|BPF_X:
+               case BPF_ALU|BPF_RSH|BPF_K:
+               case BPF_ALU|BPF_RSH|BPF_X:
+               case BPF_ALU|BPF_NEG:
+               case BPF_LD|BPF_W|BPF_ABS:
+               case BPF_LD|BPF_H|BPF_ABS:
+               case BPF_LD|BPF_B|BPF_ABS:
+               case BPF_LD|BPF_W|BPF_LEN:
+               case BPF_LD|BPF_W|BPF_IND:
+               case BPF_LD|BPF_H|BPF_IND:
+               case BPF_LD|BPF_B|BPF_IND:
+               case BPF_LD|BPF_IMM:
+               case BPF_LDX|BPF_W|BPF_LEN:
+               case BPF_LDX|BPF_B|BPF_MSH:
+               case BPF_LDX|BPF_IMM:
+               case BPF_MISC|BPF_TAX:
+               case BPF_MISC|BPF_TXA:
+               case BPF_RET|BPF_K:
+               case BPF_RET|BPF_A:
+                       break;
+
+               /* Some instructions need special checks */
 
-               /* check that memory operations use valid addresses. */
-               if (ftest->k >= BPF_MEMWORDS) {
-                       /* but it might not be a memory operation... */
-                       switch (ftest->code) {
-                       case BPF_ST:    
-                       case BPF_STX:   
-                       case BPF_LD|BPF_MEM:    
-                       case BPF_LDX|BPF_MEM:   
+               case BPF_ALU|BPF_DIV|BPF_K:
+                       /* check for division by zero */
+                       if (ftest->k == 0)
                                return -EINVAL;
-                       }
+                       break;
+
+               case BPF_LD|BPF_MEM:
+               case BPF_LDX|BPF_MEM:
+               case BPF_ST:
+               case BPF_STX:
+                       /* check for invalid memory addresses */
+                       if (ftest->k >= BPF_MEMWORDS)
+                               return -EINVAL;
+                       break;
+
+               case BPF_JMP|BPF_JA:
+                       /*
+                        * Note, the large ftest->k might cause loops.
+                        * Compare this with conditional jumps below,
+                        * where offsets are limited. --ANK (981016)
+                        */
+                       if (ftest->k >= (unsigned)(flen-pc-1))
+                               return -EINVAL;
+                       break;
+
+               case BPF_JMP|BPF_JEQ|BPF_K:
+               case BPF_JMP|BPF_JEQ|BPF_X:
+               case BPF_JMP|BPF_JGE|BPF_K:
+               case BPF_JMP|BPF_JGE|BPF_X:
+               case BPF_JMP|BPF_JGT|BPF_K:
+               case BPF_JMP|BPF_JGT|BPF_X:
+               case BPF_JMP|BPF_JSET|BPF_K:
+               case BPF_JMP|BPF_JSET|BPF_X:
+                       /* for conditionals both must be safe */
+                       if (pc + ftest->jt + 1 >= flen ||
+                           pc + ftest->jf + 1 >= flen)
+                               return -EINVAL;
+                       break;
+
+               default:
+                       return -EINVAL;
                }
        }
 
-       /*
-        * The program must end with a return. We don't care where they
-        * jumped within the script (its always forwards) but in the end
-        * they _will_ hit this.
-        */
-        return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL;
+       return (BPF_CLASS(filter[flen - 1].code) == BPF_RET) ? 0 : -EINVAL;
+}
+EXPORT_SYMBOL(sk_chk_filter);
+
+/**
+ *     sk_filter_rcu_release - release the socket filter that owns an rcu_head
+ *     @rcu: rcu_head embedded in the sk_filter to release
+ *
+ * Recovers the enclosing sk_filter from @rcu and passes it to
+ * sk_filter_release().
+ */
+static void sk_filter_rcu_release(struct rcu_head *rcu)
+{
+       sk_filter_release(container_of(rcu, struct sk_filter, rcu));
+}
+
+/*
+ * Subtract the size of @fp from @sk's sk_omem_alloc counter, then queue
+ * sk_filter_rcu_release() on the filter's rcu_head via call_rcu_bh().
+ */
+static void sk_filter_delayed_uncharge(struct sock *sk, struct sk_filter *fp)
+{
+       atomic_sub(sk_filter_len(fp), &sk->sk_omem_alloc);
+       call_rcu_bh(&fp->rcu, sk_filter_rcu_release);
 }
 
 /**
@@ -355,19 +495,19 @@ int sk_chk_filter(struct sock_filter *filter, int flen)
  */
 int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 {
-       struct sk_filter *fp;
+       struct sk_filter *fp, *old_fp;
        unsigned int fsize = sizeof(struct sock_filter) * fprog->len;
        int err;
 
        /* Make sure new filter is there and in the right amounts. */
-        if (fprog->filter == NULL)
-                return -EINVAL;
+       if (fprog->filter == NULL)
+               return -EINVAL;
 
        fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
        if (!fp)
                return -ENOMEM;
        if (copy_from_user(fp->insns, fprog->filter, fsize)) {
-               sock_kfree_s(sk, fp, fsize+sizeof(*fp)); 
+               sock_kfree_s(sk, fp, fsize+sizeof(*fp));
                return -EFAULT;
        }
 
@@ -375,20 +515,35 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
        fp->len = fprog->len;
 
        err = sk_chk_filter(fp->insns, fp->len);
-       if (!err) {
-               struct sk_filter *old_fp;
-
-               spin_lock_bh(&sk->sk_lock.slock);
-               old_fp = sk->sk_filter;
-               sk->sk_filter = fp;
-               spin_unlock_bh(&sk->sk_lock.slock);
-               fp = old_fp;
+       if (err) {
+               sk_filter_uncharge(sk, fp);
+               return err;
        }
 
-       if (fp)
-               sk_filter_release(sk, fp);
-       return err;
+       rcu_read_lock_bh();
+       old_fp = rcu_dereference_bh(sk->sk_filter);
+       rcu_assign_pointer(sk->sk_filter, fp);
+       rcu_read_unlock_bh();
+
+       if (old_fp)
+               sk_filter_delayed_uncharge(sk, old_fp);
+       return 0;
 }
+EXPORT_SYMBOL_GPL(sk_attach_filter);
 
-EXPORT_SYMBOL(sk_chk_filter);
-EXPORT_SYMBOL(sk_run_filter);
+/*
+ * sk_detach_filter - remove the filter currently attached to a socket
+ * @sk: socket to detach the filter from
+ *
+ * Clears sk->sk_filter and passes the old filter to
+ * sk_filter_delayed_uncharge(). Returns 0 if a filter was detached, or
+ * -ENOENT if the socket had none.
+ */
+int sk_detach_filter(struct sock *sk)
+{
+       struct sk_filter *old_fp;
+       int ret = 0;
+
+       rcu_read_lock_bh();
+       old_fp = rcu_dereference_bh(sk->sk_filter);
+       if (!old_fp) {
+               ret = -ENOENT;
+       } else {
+               rcu_assign_pointer(sk->sk_filter, NULL);
+               sk_filter_delayed_uncharge(sk, old_fp);
+       }
+       rcu_read_unlock_bh();
+       return ret;
+}
+EXPORT_SYMBOL_GPL(sk_detach_filter);