X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fipv4%2Fip_fragment.c;h=75347ea70ea071d6fef5223535b6bb83973b602b;hb=ebda37c27d0c768947e9b058332d7ea798210cf8;hp=0d6cff1de5a32544cd90529cecfafbafc15df344;hpb=c6fda282294da882f8d8cc4c513940277dd380f5;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 0d6cff1..75347ea 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -5,10 +5,8 @@ * * The IP fragmentation functionality. * - * Version: $Id: ip_fragment.c,v 1.59 2002/01/12 07:54:56 davem Exp $ - * * Authors: Fred N. van Kempen - * Alan Cox + * Alan Cox * * Fixes: * Alan Cox : Split from ip.c , see ip_input.c for history. @@ -34,6 +32,9 @@ #include #include #include +#include +#include +#include #include #include #include @@ -50,7 +51,7 @@ * as well. Or notify me, at least. --ANK */ -int sysctl_ipfrag_max_dist __read_mostly = 64; +static int sysctl_ipfrag_max_dist __read_mostly = 64; struct ipfrag_skb_cb { @@ -58,7 +59,7 @@ struct ipfrag_skb_cb int offset; }; -#define FRAG_CB(skb) ((struct ipfrag_skb_cb*)((skb)->cb)) +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { @@ -74,35 +75,16 @@ struct ipq { struct inet_peer *peer; }; -struct inet_frags_ctl ip4_frags_ctl __read_mostly = { - /* - * Fragment cache limits. We will commit 256K at one time. Should we - * cross that limit we will prune down to 192K. This should cope with - * even the most extreme cases without allowing an attacker to - * measurably harm machine performance. - */ - .high_thresh = 256 * 1024, - .low_thresh = 192 * 1024, - - /* - * Important NOTE! Fragment queue must be destroyed before MSL expires. - * RFC791 is wrong proposing to prolongate timer each fragment arrival - * by TTL. - */ - .timeout = IP_FRAG_TIME, - .secret_interval = 10 * 60 * HZ, -}; - static struct inet_frags ip4_frags; -int ip_frag_nqueues(void) +int ip_frag_nqueues(struct net *net) { - return ip4_frags.nqueues; + return net->ipv4.frags.nqueues; } -int ip_frag_mem(void) +int ip_frag_mem(struct net *net) { - return atomic_read(&ip4_frags.mem); + return atomic_read(&net->ipv4.frags.mem); } static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, @@ -128,26 +110,26 @@ static unsigned int ip4_hashfn(struct inet_frag_queue *q) return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); } -static int ip4_frag_equal(struct inet_frag_queue *q1, - struct inet_frag_queue *q2) +static int ip4_frag_match(struct inet_frag_queue *q, void *a) { - struct ipq *qp1, *qp2; - - qp1 = container_of(q1, struct ipq, q); - qp2 = container_of(q2, struct ipq, q); - return (qp1->id == qp2->id && - qp1->saddr == qp2->saddr && - qp1->daddr == qp2->daddr && - qp1->protocol == qp2->protocol && - qp1->user == qp2->user); + struct ipq *qp; + struct ip4_create_arg *arg = a; + + qp = container_of(q, struct ipq, q); + return (qp->id == arg->iph->id && + qp->saddr == arg->iph->saddr && + qp->daddr == arg->iph->daddr && + qp->protocol == arg->iph->protocol && + qp->user == arg->user); } /* Memory Tracking Functions. */ -static __inline__ void frag_kfree_skb(struct sk_buff *skb, int *work) +static __inline__ void frag_kfree_skb(struct netns_frags *nf, + struct sk_buff *skb, int *work) { if (work) *work -= skb->truesize; - atomic_sub(skb->truesize, &ip4_frags.mem); + atomic_sub(skb->truesize, &nf->mem); kfree_skb(skb); } @@ -172,7 +154,6 @@ static __inline__ void ip4_frag_free(struct inet_frag_queue *q) qp = container_of(q, struct ipq, q); if (qp->peer) inet_putpeer(qp->peer); - kfree(qp); } @@ -194,13 +175,13 @@ static void ipq_kill(struct ipq *ipq) /* Memory limiting on fragments. Evictor trashes the oldest * fragment queue until we are back under the threshold. */ -static void ip_evictor(void) +static void ip_evictor(struct net *net) { int evicted; - evicted = inet_frag_evictor(&ip4_frags); + evicted = inet_frag_evictor(&net->ipv4.frags, &ip4_frags); if (evicted) - IP_ADD_STATS_BH(IPSTATS_MIB_REASMFAILS, evicted); + IP_ADD_STATS_BH(net, IPSTATS_MIB_REASMFAILS, evicted); } /* @@ -209,44 +190,75 @@ static void ip_evictor(void) static void ip_expire(unsigned long arg) { struct ipq *qp; + struct net *net; qp = container_of((struct inet_frag_queue *) arg, struct ipq, q); + net = container_of(qp->q.net, struct net, ipv4.frags); spin_lock(&qp->q.lock); - if (qp->q.last_in & COMPLETE) + if (qp->q.last_in & INET_FRAG_COMPLETE) goto out; ipq_kill(qp); - IP_INC_STATS_BH(IPSTATS_MIB_REASMTIMEOUT); - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMTIMEOUT); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); - if ((qp->q.last_in&FIRST_IN) && qp->q.fragments != NULL) { + if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; - /* Send an ICMP "Fragment Reassembly Timeout" message. */ - if ((head->dev = dev_get_by_index(&init_net, qp->iif)) != NULL) { - icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); - dev_put(head->dev); + + rcu_read_lock(); + head->dev = dev_get_by_index_rcu(net, qp->iif); + if (!head->dev) + goto out_rcu_unlock; + + /* + * Only search router table for the head fragment, + * when defraging timeout at PRE_ROUTING HOOK. + */ + if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { + const struct iphdr *iph = ip_hdr(head); + int err = ip_route_input(head, iph->daddr, iph->saddr, + iph->tos, head->dev); + if (unlikely(err)) + goto out_rcu_unlock; + + /* + * Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. + */ + if (skb_rtable(head)->rt_type != RTN_LOCAL) + goto out_rcu_unlock; + } + + /* Send an ICMP "Fragment Reassembly Timeout" message. */ + icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); +out_rcu_unlock: + rcu_read_unlock(); } out: spin_unlock(&qp->q.lock); ipq_put(qp); } -/* Creation primitives. */ - -/* Add an entry to the 'ipq' queue for a newly received IP datagram. */ -static struct ipq *ip_frag_create(struct iphdr *iph, u32 user, unsigned int h) +/* Find the correct entry in the "incomplete datagrams" queue for + * this IP datagram, and create new one, if nothing is found. + */ +static inline struct ipq *ip_find(struct net *net, struct iphdr *iph, u32 user) { struct inet_frag_queue *q; struct ip4_create_arg arg; + unsigned int hash; arg.iph = iph; arg.user = user; - q = inet_frag_create(&ip4_frags, &arg, h); + read_lock(&ip4_frags.lock); + hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); + + q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); if (q == NULL) goto out_nomem; @@ -257,37 +269,6 @@ out_nomem: return NULL; } -/* Find the correct entry in the "incomplete datagrams" queue for - * this IP datagram, and create new one, if nothing is found. - */ -static inline struct ipq *ip_find(struct iphdr *iph, u32 user) -{ - __be16 id = iph->id; - __be32 saddr = iph->saddr; - __be32 daddr = iph->daddr; - __u8 protocol = iph->protocol; - unsigned int hash; - struct ipq *qp; - struct hlist_node *n; - - read_lock(&ip4_frags.lock); - hash = ipqhashfn(id, saddr, daddr, protocol); - hlist_for_each_entry(qp, n, &ip4_frags.hash[hash], q.list) { - if (qp->id == id && - qp->saddr == saddr && - qp->daddr == daddr && - qp->protocol == protocol && - qp->user == user) { - atomic_inc(&qp->q.refcnt); - read_unlock(&ip4_frags.lock); - return qp; - } - } - read_unlock(&ip4_frags.lock); - - return ip_frag_create(iph, user, hash); -} - /* Is the fragment too far ahead to be part of ipq? */ static inline int ip_frag_too_far(struct ipq *qp) { @@ -307,7 +288,10 @@ static inline int ip_frag_too_far(struct ipq *qp) rc = qp->q.fragments && (end - start) > max; if (rc) { - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + struct net *net; + + net = container_of(qp->q.net, struct net, ipv4.frags); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); } return rc; @@ -317,7 +301,7 @@ static int ip_frag_reinit(struct ipq *qp) { struct sk_buff *fp; - if (!mod_timer(&qp->q.timer, jiffies + ip4_frags_ctl.timeout)) { + if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) { atomic_inc(&qp->q.refcnt); return -ETIMEDOUT; } @@ -325,7 +309,7 @@ static int ip_frag_reinit(struct ipq *qp) fp = qp->q.fragments; do { struct sk_buff *xp = fp->next; - frag_kfree_skb(fp, NULL); + frag_kfree_skb(qp->q.net, fp, NULL); fp = xp; } while (fp); @@ -347,7 +331,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) int ihl, end; int err = -ENOENT; - if (qp->q.last_in & COMPLETE) + if (qp->q.last_in & INET_FRAG_COMPLETE) goto err; if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && @@ -373,9 +357,9 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) * or have different end, the segment is corrrupted. */ if (end < qp->q.len || - ((qp->q.last_in & LAST_IN) && end != qp->q.len)) + ((qp->q.last_in & INET_FRAG_LAST_IN) && end != qp->q.len)) goto err; - qp->q.last_in |= LAST_IN; + qp->q.last_in |= INET_FRAG_LAST_IN; qp->q.len = end; } else { if (end&7) { @@ -385,7 +369,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } if (end > qp->q.len) { /* Some bits beyond end -> corruption. */ - if (qp->q.last_in & LAST_IN) + if (qp->q.last_in & INET_FRAG_LAST_IN) goto err; qp->q.len = end; } @@ -462,7 +446,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) qp->q.fragments = next; qp->q.meat -= free_it->len; - frag_kfree_skb(free_it, NULL); + frag_kfree_skb(qp->q.net, free_it, NULL); } } @@ -482,15 +466,16 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; - atomic_add(skb->truesize, &ip4_frags.mem); + atomic_add(skb->truesize, &qp->q.net->mem); if (offset == 0) - qp->q.last_in |= FIRST_IN; + qp->q.last_in |= INET_FRAG_FIRST_IN; - if (qp->q.last_in == (FIRST_IN | LAST_IN) && qp->q.meat == qp->q.len) + if (qp->q.last_in == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + qp->q.meat == qp->q.len) return ip_frag_reasm(qp, prev, dev); write_lock(&ip4_frags.lock); - list_move_tail(&qp->q.lru_list, &ip4_frags.lru_list); + list_move_tail(&qp->q.lru_list, &qp->q.net->lru_list); write_unlock(&ip4_frags.lock); return -EINPROGRESS; @@ -505,6 +490,7 @@ err: static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, struct net_device *dev) { + struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; struct sk_buff *fp, *head = qp->q.fragments; int len; @@ -517,7 +503,6 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, if (prev) { head = prev->next; fp = skb_clone(head, GFP_ATOMIC); - if (!fp) goto out_nomem; @@ -531,8 +516,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, qp->q.fragments = head; } - BUG_TRAP(head != NULL); - BUG_TRAP(FRAG_CB(head)->offset == 0); + WARN_ON(head == NULL); + WARN_ON(FRAG_CB(head)->offset != 0); /* Allocate a new buffer for the datagram. */ ihlen = ip_hdrlen(head); @@ -543,14 +528,13 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, goto out_oversize; /* Head of list must not be cloned. */ - err = -ENOMEM; if (skb_cloned(head) && pskb_expand_head(head, 0, 0, GFP_ATOMIC)) goto out_nomem; /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_shinfo(head)->frag_list) { + if (skb_has_frags(head)) { struct sk_buff *clone; int i, plen = 0; @@ -559,7 +543,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, clone->next = head->next; head->next = clone; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_shinfo(head)->frag_list = NULL; + skb_frag_list_init(head); for (i=0; inr_frags; i++) plen += skb_shinfo(head)->frags[i].size; clone->len = clone->data_len = head->data_len - plen; @@ -567,12 +551,12 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, head->len -= clone->len; clone->csum = 0; clone->ip_summed = head->ip_summed; - atomic_add(clone->truesize, &ip4_frags.mem); + atomic_add(clone->truesize, &qp->q.net->mem); } skb_shinfo(head)->frag_list = head->next; skb_push(head, head->data - skb_network_header(head)); - atomic_sub(head->truesize, &ip4_frags.mem); + atomic_sub(head->truesize, &qp->q.net->mem); for (fp=head->next; fp; fp = fp->next) { head->data_len += fp->len; @@ -582,7 +566,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, else if (head->ip_summed == CHECKSUM_COMPLETE) head->csum = csum_add(head->csum, fp->csum); head->truesize += fp->truesize; - atomic_sub(fp->truesize, &ip4_frags.mem); + atomic_sub(fp->truesize, &qp->q.net->mem); } head->next = NULL; @@ -592,21 +576,21 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, iph = ip_hdr(head); iph->frag_off = 0; iph->tot_len = htons(len); - IP_INC_STATS_BH(IPSTATS_MIB_REASMOKS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMOKS); qp->q.fragments = NULL; return 0; out_nomem: LIMIT_NETDEBUG(KERN_ERR "IP: queue_glue: no memory for gluing " "queue %p\n", qp); + err = -ENOMEM; goto out_fail; out_oversize: if (net_ratelimit()) - printk(KERN_INFO - "Oversized IP packet from %d.%d.%d.%d.\n", - NIPQUAD(qp->saddr)); + printk(KERN_INFO "Oversized IP packet from %pI4.\n", + &qp->saddr); out_fail: - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); return err; } @@ -614,15 +598,17 @@ out_fail: int ip_defrag(struct sk_buff *skb, u32 user) { struct ipq *qp; + struct net *net; - IP_INC_STATS_BH(IPSTATS_MIB_REASMREQDS); + net = skb->dev ? dev_net(skb->dev) : dev_net(skb_dst(skb)->dev); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS); /* Start by cleaning up the memory. */ - if (atomic_read(&ip4_frags.mem) > ip4_frags_ctl.high_thresh) - ip_evictor(); + if (atomic_read(&net->ipv4.frags.mem) > net->ipv4.frags.high_thresh) + ip_evictor(net); /* Lookup (or create) queue header */ - if ((qp = ip_find(ip_hdr(skb), user)) != NULL) { + if ((qp = ip_find(net, ip_hdr(skb), user)) != NULL) { int ret; spin_lock(&qp->q.lock); @@ -634,21 +620,161 @@ int ip_defrag(struct sk_buff *skb, u32 user) return ret; } - IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + IP_INC_STATS_BH(net, IPSTATS_MIB_REASMFAILS); kfree_skb(skb); return -ENOMEM; } +#ifdef CONFIG_SYSCTL +static int zero; + +static struct ctl_table ip4_frags_ns_ctl_table[] = { + { + .procname = "ipfrag_high_thresh", + .data = &init_net.ipv4.frags.high_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ipfrag_low_thresh", + .data = &init_net.ipv4.frags.low_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec + }, + { + .procname = "ipfrag_time", + .data = &init_net.ipv4.frags.timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { } +}; + +static struct ctl_table ip4_frags_ctl_table[] = { + { + .procname = "ipfrag_secret_interval", + .data = &ip4_frags.secret_interval, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "ipfrag_max_dist", + .data = &sysctl_ipfrag_max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero + }, + { } +}; + +static int __net_init ip4_frags_ns_ctl_register(struct net *net) +{ + struct ctl_table *table; + struct ctl_table_header *hdr; + + table = ip4_frags_ns_ctl_table; + if (!net_eq(net, &init_net)) { + table = kmemdup(table, sizeof(ip4_frags_ns_ctl_table), GFP_KERNEL); + if (table == NULL) + goto err_alloc; + + table[0].data = &net->ipv4.frags.high_thresh; + table[1].data = &net->ipv4.frags.low_thresh; + table[2].data = &net->ipv4.frags.timeout; + } + + hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); + if (hdr == NULL) + goto err_reg; + + net->ipv4.frags_hdr = hdr; + return 0; + +err_reg: + if (!net_eq(net, &init_net)) + kfree(table); +err_alloc: + return -ENOMEM; +} + +static void __net_exit ip4_frags_ns_ctl_unregister(struct net *net) +{ + struct ctl_table *table; + + table = net->ipv4.frags_hdr->ctl_table_arg; + unregister_net_sysctl_table(net->ipv4.frags_hdr); + kfree(table); +} + +static void ip4_frags_ctl_register(void) +{ + register_net_sysctl_rotable(net_ipv4_ctl_path, ip4_frags_ctl_table); +} +#else +static inline int ip4_frags_ns_ctl_register(struct net *net) +{ + return 0; +} + +static inline void ip4_frags_ns_ctl_unregister(struct net *net) +{ +} + +static inline void ip4_frags_ctl_register(void) +{ +} +#endif + +static int __net_init ipv4_frags_init_net(struct net *net) +{ + /* + * Fragment cache limits. We will commit 256K at one time. Should we + * cross that limit we will prune down to 192K. This should cope with + * even the most extreme cases without allowing an attacker to + * measurably harm machine performance. + */ + net->ipv4.frags.high_thresh = 256 * 1024; + net->ipv4.frags.low_thresh = 192 * 1024; + /* + * Important NOTE! Fragment queue must be destroyed before MSL expires. + * RFC791 is wrong proposing to prolongate timer each fragment arrival + * by TTL. + */ + net->ipv4.frags.timeout = IP_FRAG_TIME; + + inet_frags_init_net(&net->ipv4.frags); + + return ip4_frags_ns_ctl_register(net); +} + +static void __net_exit ipv4_frags_exit_net(struct net *net) +{ + ip4_frags_ns_ctl_unregister(net); + inet_frags_exit_net(&net->ipv4.frags, &ip4_frags); +} + +static struct pernet_operations ip4_frags_ops = { + .init = ipv4_frags_init_net, + .exit = ipv4_frags_exit_net, +}; + void __init ipfrag_init(void) { - ip4_frags.ctl = &ip4_frags_ctl; + ip4_frags_ctl_register(); + register_pernet_subsys(&ip4_frags_ops); ip4_frags.hashfn = ip4_hashfn; ip4_frags.constructor = ip4_frag_init; ip4_frags.destructor = ip4_frag_free; ip4_frags.skb_free = NULL; ip4_frags.qsize = sizeof(struct ipq); - ip4_frags.equal = ip4_frag_equal; + ip4_frags.match = ip4_frag_match; ip4_frags.frag_expire = ip_expire; + ip4_frags.secret_interval = 10 * 60 * HZ; inet_frags_init(&ip4_frags); }