[NET] IPV4: Fix whitespace errors.
net/ipv4/netfilter/ip_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  */
19
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
40
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42    registrations, conntrack timers. */
43 #include <linux/netfilter_ipv4/ip_conntrack.h>
44 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
45 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
46 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
47
48 #define IP_CONNTRACK_VERSION    "2.4"
49
50 #if 0
51 #define DEBUGP printk
52 #else
53 #define DEBUGP(format, args...)
54 #endif
55
56 DEFINE_RWLOCK(ip_conntrack_lock);
57
58 /* ip_conntrack_standalone needs this */
59 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
60
61 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
62 LIST_HEAD(ip_conntrack_expect_list);
63 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
64 static LIST_HEAD(helpers);
65 unsigned int ip_conntrack_htable_size __read_mostly = 0;
66 int ip_conntrack_max __read_mostly;
67 struct list_head *ip_conntrack_hash __read_mostly;
68 static struct kmem_cache *ip_conntrack_cachep __read_mostly;
69 static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly;
70 struct ip_conntrack ip_conntrack_untracked;
71 unsigned int ip_ct_log_invalid __read_mostly;
72 static LIST_HEAD(unconfirmed);
73 static int ip_conntrack_vmalloc __read_mostly;
74
75 static unsigned int ip_conntrack_next_id;
76 static unsigned int ip_conntrack_expect_next_id;
77 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
78 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
79 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
80
81 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
82
83 /* deliver cached events and clear cache entry - must be called with locally
84  * disabled softirqs */
85 static inline void
86 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
87 {
88         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
89         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
90                 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
91                                     ecache->ct);
92         ecache->events = 0;
93         ip_conntrack_put(ecache->ct);
94         ecache->ct = NULL;
95 }
96
97 /* Deliver all cached events for a particular conntrack. This is called
98  * by code prior to async packet handling or freeing the skb */
99 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
100 {
101         struct ip_conntrack_ecache *ecache;
102
103         local_bh_disable();
104         ecache = &__get_cpu_var(ip_conntrack_ecache);
105         if (ecache->ct == ct)
106                 __ip_ct_deliver_cached_events(ecache);
107         local_bh_enable();
108 }
109
110 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
111 {
112         struct ip_conntrack_ecache *ecache;
113
114         /* take care of delivering potentially old events */
115         ecache = &__get_cpu_var(ip_conntrack_ecache);
116         BUG_ON(ecache->ct == ct);
117         if (ecache->ct)
118                 __ip_ct_deliver_cached_events(ecache);
119         /* initialize for this conntrack/packet */
120         ecache->ct = ct;
121         nf_conntrack_get(&ct->ct_general);
122 }
123
124 /* flush the event cache - touches other CPUs' data and must not be called while
125  * packets are still passing through the code */
126 static void ip_ct_event_cache_flush(void)
127 {
128         struct ip_conntrack_ecache *ecache;
129         int cpu;
130
131         for_each_possible_cpu(cpu) {
132                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
133                 if (ecache->ct)
134                         ip_conntrack_put(ecache->ct);
135         }
136 }
137 #else
138 static inline void ip_ct_event_cache_flush(void) {}
139 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
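
The block above implements a small per-CPU event cache: notifications for the conntrack currently being processed are OR'd together and delivered as one batch when another conntrack takes the cache slot or the cache is flushed. Below is a minimal userspace sketch of that pattern, not kernel code; the struct names and the deliver() routine are illustrative stand-ins.

    #include <stdio.h>

    struct conn { int id; };

    struct ecache {
            struct conn *ct;        /* connection the cached events belong to */
            unsigned int events;    /* bitmask of pending event types */
    };

    /* deliver whatever is cached, then clear the slot */
    static void deliver(struct ecache *ec)
    {
            if (ec->ct && ec->events)
                    printf("conn %d: delivering events 0x%x\n",
                           ec->ct->id, ec->events);
            ec->events = 0;
            ec->ct = NULL;
    }

    /* analogue of caching an event: flush the old owner, then accumulate */
    static void cache_event(struct ecache *ec, struct conn *ct, unsigned int ev)
    {
            if (ec->ct != ct) {
                    deliver(ec);    /* deliver the previous connection's events */
                    ec->ct = ct;
            }
            ec->events |= ev;
    }

    int main(void)
    {
            struct ecache ec = { 0 };
            struct conn a = { 1 }, b = { 2 };

            cache_event(&ec, &a, 0x1);      /* e.g. a "new" event */
            cache_event(&ec, &a, 0x8);      /* e.g. a status change */
            cache_event(&ec, &b, 0x1);      /* switching owner flushes conn 1 */
            deliver(&ec);                   /* flush conn 2 explicitly */
            return 0;
    }
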
140
141 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
142
143 static int ip_conntrack_hash_rnd_initted;
144 static unsigned int ip_conntrack_hash_rnd;
145
146 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
147                             unsigned int size, unsigned int rnd)
148 {
149         return (jhash_3words((__force u32)tuple->src.ip,
150                              ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
151                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
152                              rnd) % size);
153 }
154
155 static u_int32_t
156 hash_conntrack(const struct ip_conntrack_tuple *tuple)
157 {
158         return __hash_conntrack(tuple, ip_conntrack_htable_size,
159                                 ip_conntrack_hash_rnd);
160 }
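
For orientation, a standalone sketch of how these two functions reduce a tuple to a bucket index: the addresses, protocol number and ports are packed into three 32-bit words, mixed with a random seed, and reduced modulo the table size. mix3() below is a toy stand-in for the kernel's jhash_3words(), so only the packing and the final modulo mirror the code above.

    #include <stdint.h>
    #include <stdio.h>

    struct tuple {
            uint32_t src_ip, dst_ip;
            uint16_t src_port, dst_port;
            uint8_t  protonum;
    };

    /* toy mixer, NOT jhash_3words() */
    static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
    {
            uint32_t h = seed;
            h = (h ^ a) * 2654435761u;
            h = (h ^ b) * 2654435761u;
            h = (h ^ c) * 2654435761u;
            return h;
    }

    static uint32_t bucket_of(const struct tuple *t, unsigned int size, uint32_t rnd)
    {
            return mix3(t->src_ip,
                        t->dst_ip ^ t->protonum,
                        t->src_port | ((uint32_t)t->dst_port << 16),
                        rnd) % size;
    }

    int main(void)
    {
            struct tuple t = { 0x0a000001, 0x0a000002, 1025, 80, 6 /* TCP */ };

            printf("bucket = %u\n", (unsigned)bucket_of(&t, 8192, 0xdeadbeef));
            return 0;
    }
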
161
162 int
163 ip_ct_get_tuple(const struct iphdr *iph,
164                 const struct sk_buff *skb,
165                 unsigned int dataoff,
166                 struct ip_conntrack_tuple *tuple,
167                 const struct ip_conntrack_protocol *protocol)
168 {
169         /* Never happens */
170         if (iph->frag_off & htons(IP_OFFSET)) {
171                 printk("ip_conntrack_core: Frag of proto %u.\n",
172                        iph->protocol);
173                 return 0;
174         }
175
176         tuple->src.ip = iph->saddr;
177         tuple->dst.ip = iph->daddr;
178         tuple->dst.protonum = iph->protocol;
179         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
180
181         return protocol->pkt_to_tuple(skb, dataoff, tuple);
182 }
183
184 int
185 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
186                    const struct ip_conntrack_tuple *orig,
187                    const struct ip_conntrack_protocol *protocol)
188 {
189         inverse->src.ip = orig->dst.ip;
190         inverse->dst.ip = orig->src.ip;
191         inverse->dst.protonum = orig->dst.protonum;
192         inverse->dst.dir = !orig->dst.dir;
193
194         return protocol->invert_tuple(inverse, orig);
195 }
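
A short illustration of what "inverting" a tuple means for the fields handled here: the reply-direction tuple is the original with source and destination swapped, while the per-protocol invert_tuple() callback does the same for the protocol-specific parts (ports, ICMP ids). The types below are simplified stand-ins, not the kernel structures.

    #include <stdint.h>
    #include <stdio.h>

    struct mini_tuple {
            uint32_t src_ip, dst_ip;
            uint16_t src_port, dst_port;
            uint8_t  protonum;
    };

    static void invert(struct mini_tuple *reply, const struct mini_tuple *orig)
    {
            reply->src_ip   = orig->dst_ip;
            reply->dst_ip   = orig->src_ip;
            reply->src_port = orig->dst_port;   /* what the protocol callback does */
            reply->dst_port = orig->src_port;
            reply->protonum = orig->protonum;
    }

    int main(void)
    {
            struct mini_tuple orig = { 0x0a000001, 0x0a000002, 1025, 80, 6 };
            struct mini_tuple reply;

            invert(&reply, &orig);
            /* reply packets from 10.0.0.2:80 back to 10.0.0.1:1025 match this tuple */
            printf("reply: %08x:%u -> %08x:%u\n",
                   (unsigned)reply.src_ip, (unsigned)reply.src_port,
                   (unsigned)reply.dst_ip, (unsigned)reply.dst_port);
            return 0;
    }
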
196
197
198 /* ip_conntrack_expect helper functions */
199 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
200 {
201         IP_NF_ASSERT(!timer_pending(&exp->timeout));
202         list_del(&exp->list);
203         CONNTRACK_STAT_INC(expect_delete);
204         exp->master->expecting--;
205         ip_conntrack_expect_put(exp);
206 }
207
208 static void expectation_timed_out(unsigned long ul_expect)
209 {
210         struct ip_conntrack_expect *exp = (void *)ul_expect;
211
212         write_lock_bh(&ip_conntrack_lock);
213         ip_ct_unlink_expect(exp);
214         write_unlock_bh(&ip_conntrack_lock);
215         ip_conntrack_expect_put(exp);
216 }
217
218 struct ip_conntrack_expect *
219 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
220 {
221         struct ip_conntrack_expect *i;
222
223         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
224                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
225                         return i;
226         }
227         return NULL;
228 }
229
230 /* Just find an expectation corresponding to a tuple. */
231 struct ip_conntrack_expect *
232 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
233 {
234         struct ip_conntrack_expect *i;
235
236         read_lock_bh(&ip_conntrack_lock);
237         i = __ip_conntrack_expect_find(tuple);
238         if (i)
239                 atomic_inc(&i->use);
240         read_unlock_bh(&ip_conntrack_lock);
241
242         return i;
243 }
244
245 /* If an expectation for this connection is found, it gets deleted from
246  * the global list, then returned. */
247 static struct ip_conntrack_expect *
248 find_expectation(const struct ip_conntrack_tuple *tuple)
249 {
250         struct ip_conntrack_expect *i;
251
252         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
253                 /* If master is not in hash table yet (i.e. the packet hasn't
254                    left this machine yet), how can the other end know about the
255                    expectation?  Hence these are not the droids you are looking
256                    for (if master ct never got confirmed, we'd hold a reference
257                    to it and weird things would happen to future packets). */
258                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
259                     && is_confirmed(i->master)) {
260                         if (i->flags & IP_CT_EXPECT_PERMANENT) {
261                                 atomic_inc(&i->use);
262                                 return i;
263                         } else if (del_timer(&i->timeout)) {
264                                 ip_ct_unlink_expect(i);
265                                 return i;
266                         }
267                 }
268         }
269         return NULL;
270 }
271
272 /* delete all expectations for this conntrack */
273 void ip_ct_remove_expectations(struct ip_conntrack *ct)
274 {
275         struct ip_conntrack_expect *i, *tmp;
276
277         /* Optimization: most connections never expect any others. */
278         if (ct->expecting == 0)
279                 return;
280
281         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
282                 if (i->master == ct && del_timer(&i->timeout)) {
283                         ip_ct_unlink_expect(i);
284                         ip_conntrack_expect_put(i);
285                 }
286         }
287 }
288
289 static void
290 clean_from_lists(struct ip_conntrack *ct)
291 {
292         DEBUGP("clean_from_lists(%p)\n", ct);
293         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
294         list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
295
296         /* Destroy all pending expectations */
297         ip_ct_remove_expectations(ct);
298 }
299
300 static void
301 destroy_conntrack(struct nf_conntrack *nfct)
302 {
303         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
304         struct ip_conntrack_protocol *proto;
305         struct ip_conntrack_helper *helper;
306
307         DEBUGP("destroy_conntrack(%p)\n", ct);
308         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
309         IP_NF_ASSERT(!timer_pending(&ct->timeout));
310
311         ip_conntrack_event(IPCT_DESTROY, ct);
312         set_bit(IPS_DYING_BIT, &ct->status);
313
314         helper = ct->helper;
315         if (helper && helper->destroy)
316                 helper->destroy(ct);
317
318         /* To make sure we don't get any weird locking issues here:
319          * destroy_conntrack() MUST NOT be called with a write lock
320          * to ip_conntrack_lock!!! -HW */
321         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
322         if (proto && proto->destroy)
323                 proto->destroy(ct);
324
325         if (ip_conntrack_destroyed)
326                 ip_conntrack_destroyed(ct);
327
328         write_lock_bh(&ip_conntrack_lock);
329         /* Expectations will have been removed in clean_from_lists,
330          * except TFTP can create an expectation on the first packet,
331          * before the connection is in the list, so we need to clean here,
332          * too. */
333         ip_ct_remove_expectations(ct);
334
335         /* We overload first tuple to link into unconfirmed list. */
336         if (!is_confirmed(ct)) {
337                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
338                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
339         }
340
341         CONNTRACK_STAT_INC(delete);
342         write_unlock_bh(&ip_conntrack_lock);
343
344         if (ct->master)
345                 ip_conntrack_put(ct->master);
346
347         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
348         ip_conntrack_free(ct);
349 }
350
351 static void death_by_timeout(unsigned long ul_conntrack)
352 {
353         struct ip_conntrack *ct = (void *)ul_conntrack;
354
355         write_lock_bh(&ip_conntrack_lock);
356         /* Inside lock so preempt is disabled on module removal path.
357          * Otherwise we can get spurious warnings. */
358         CONNTRACK_STAT_INC(delete_list);
359         clean_from_lists(ct);
360         write_unlock_bh(&ip_conntrack_lock);
361         ip_conntrack_put(ct);
362 }
363
364 struct ip_conntrack_tuple_hash *
365 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
366                     const struct ip_conntrack *ignored_conntrack)
367 {
368         struct ip_conntrack_tuple_hash *h;
369         unsigned int hash = hash_conntrack(tuple);
370
371         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
372                 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
373                     ip_ct_tuple_equal(tuple, &h->tuple)) {
374                         CONNTRACK_STAT_INC(found);
375                         return h;
376                 }
377                 CONNTRACK_STAT_INC(searched);
378         }
379
380         return NULL;
381 }
382
383 /* Find a connection corresponding to a tuple. */
384 struct ip_conntrack_tuple_hash *
385 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
386                       const struct ip_conntrack *ignored_conntrack)
387 {
388         struct ip_conntrack_tuple_hash *h;
389
390         read_lock_bh(&ip_conntrack_lock);
391         h = __ip_conntrack_find(tuple, ignored_conntrack);
392         if (h)
393                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
394         read_unlock_bh(&ip_conntrack_lock);
395
396         return h;
397 }
398
399 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
400                                         unsigned int hash,
401                                         unsigned int repl_hash)
402 {
403         ct->id = ++ip_conntrack_next_id;
404         list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
405                  &ip_conntrack_hash[hash]);
406         list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
407                  &ip_conntrack_hash[repl_hash]);
408 }
409
410 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
411 {
412         unsigned int hash, repl_hash;
413
414         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
415         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
416
417         write_lock_bh(&ip_conntrack_lock);
418         __ip_conntrack_hash_insert(ct, hash, repl_hash);
419         write_unlock_bh(&ip_conntrack_lock);
420 }
421
422 /* Confirm a connection given skb; places it in hash table */
423 int
424 __ip_conntrack_confirm(struct sk_buff **pskb)
425 {
426         unsigned int hash, repl_hash;
427         struct ip_conntrack_tuple_hash *h;
428         struct ip_conntrack *ct;
429         enum ip_conntrack_info ctinfo;
430
431         ct = ip_conntrack_get(*pskb, &ctinfo);
432
433         /* ipt_REJECT uses ip_conntrack_attach to attach related
434            ICMP/TCP RST packets in the other direction.  The actual
435            packet which created the connection will be IP_CT_NEW, or
436            IP_CT_RELATED for an expected connection. */
437         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
438                 return NF_ACCEPT;
439
440         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
441         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
442
443         /* We're not in hash table, and we refuse to set up related
444            connections for unconfirmed conns.  But packet copies and
445            REJECT will give spurious warnings here. */
446         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
447
448         /* No external references means no one else could have
449            confirmed us. */
450         IP_NF_ASSERT(!is_confirmed(ct));
451         DEBUGP("Confirming conntrack %p\n", ct);
452
453         write_lock_bh(&ip_conntrack_lock);
454
455         /* See if there's one in the list already, including reverse:
456            NAT could have grabbed it without realizing, since we're
457            not in the hash.  If there is, we lost the race. */
458         list_for_each_entry(h, &ip_conntrack_hash[hash], list)
459                 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
460                                       &h->tuple))
461                         goto out;
462         list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
463                 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
464                                       &h->tuple))
465                         goto out;
466
467         /* Remove from unconfirmed list */
468         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
469
470         __ip_conntrack_hash_insert(ct, hash, repl_hash);
471         /* Timer relative to confirmation time, not original
472            setting time, otherwise we'd get timer wrap in
473            weird delay cases. */
474         ct->timeout.expires += jiffies;
475         add_timer(&ct->timeout);
476         atomic_inc(&ct->ct_general.use);
477         set_bit(IPS_CONFIRMED_BIT, &ct->status);
478         CONNTRACK_STAT_INC(insert);
479         write_unlock_bh(&ip_conntrack_lock);
480         if (ct->helper)
481                 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
482 #ifdef CONFIG_IP_NF_NAT_NEEDED
483         if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
484             test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
485                 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
486 #endif
487         ip_conntrack_event_cache(master_ct(ct) ?
488                                  IPCT_RELATED : IPCT_NEW, *pskb);
489
490         return NF_ACCEPT;
491
492 out:
493         CONNTRACK_STAT_INC(insert_failed);
494         write_unlock_bh(&ip_conntrack_lock);
495         return NF_DROP;
496 }
497
498 /* Returns true if a connection corresponds to the tuple (required
499    for NAT). */
500 int
501 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
502                          const struct ip_conntrack *ignored_conntrack)
503 {
504         struct ip_conntrack_tuple_hash *h;
505
506         read_lock_bh(&ip_conntrack_lock);
507         h = __ip_conntrack_find(tuple, ignored_conntrack);
508         read_unlock_bh(&ip_conntrack_lock);
509
510         return h != NULL;
511 }
512
513 /* There's a small race here where we may free a just-assured
514    connection.  Too bad: we're in trouble anyway. */
515 static int early_drop(struct list_head *chain)
516 {
517         /* Traverse backwards: gives us oldest, which is roughly LRU */
518         struct ip_conntrack_tuple_hash *h;
519         struct ip_conntrack *ct = NULL, *tmp;
520         int dropped = 0;
521
522         read_lock_bh(&ip_conntrack_lock);
523         list_for_each_entry_reverse(h, chain, list) {
524                 tmp = tuplehash_to_ctrack(h);
525                 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
526                         ct = tmp;
527                         atomic_inc(&ct->ct_general.use);
528                         break;
529                 }
530         }
531         read_unlock_bh(&ip_conntrack_lock);
532
533         if (!ct)
534                 return dropped;
535
536         if (del_timer(&ct->timeout)) {
537                 death_by_timeout((unsigned long)ct);
538                 dropped = 1;
539                 CONNTRACK_STAT_INC(early_drop);
540         }
541         ip_conntrack_put(ct);
542         return dropped;
543 }
544
545 static struct ip_conntrack_helper *
546 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
547 {
548         struct ip_conntrack_helper *h;
549
550         list_for_each_entry(h, &helpers, list) {
551                 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
552                         return h;
553         }
554         return NULL;
555 }
556
557 struct ip_conntrack_helper *
558 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
559 {
560         struct ip_conntrack_helper *helper;
561
562         /* need ip_conntrack_lock to assure that helper exists until
563          * try_module_get() is called */
564         read_lock_bh(&ip_conntrack_lock);
565
566         helper = __ip_conntrack_helper_find(tuple);
567         if (helper) {
568                 /* need to increase module usage count to assure helper will
569                  * not go away while the caller is e.g. busy putting a
570                  * conntrack in the hash that uses the helper */
571                 if (!try_module_get(helper->me))
572                         helper = NULL;
573         }
574
575         read_unlock_bh(&ip_conntrack_lock);
576
577         return helper;
578 }
579
580 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
581 {
582         module_put(helper->me);
583 }
584
585 struct ip_conntrack_protocol *
586 __ip_conntrack_proto_find(u_int8_t protocol)
587 {
588         return ip_ct_protos[protocol];
589 }
590
591 /* this is guaranteed to always return a valid protocol helper, since
592  * it falls back to generic_protocol */
593 struct ip_conntrack_protocol *
594 ip_conntrack_proto_find_get(u_int8_t protocol)
595 {
596         struct ip_conntrack_protocol *p;
597
598         preempt_disable();
599         p = __ip_conntrack_proto_find(protocol);
600         if (p) {
601                 if (!try_module_get(p->me))
602                         p = &ip_conntrack_generic_protocol;
603         }
604         preempt_enable();
605
606         return p;
607 }
608
609 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
610 {
611         module_put(p->me);
612 }
613
614 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
615                                         struct ip_conntrack_tuple *repl)
616 {
617         struct ip_conntrack *conntrack;
618
619         if (!ip_conntrack_hash_rnd_initted) {
620                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
621                 ip_conntrack_hash_rnd_initted = 1;
622         }
623
624         /* We don't want any race condition at early drop stage */
625         atomic_inc(&ip_conntrack_count);
626
627         if (ip_conntrack_max
628             && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
629                 unsigned int hash = hash_conntrack(orig);
630                 /* Try dropping from this hash chain. */
631                 if (!early_drop(&ip_conntrack_hash[hash])) {
632                         atomic_dec(&ip_conntrack_count);
633                         if (net_ratelimit())
634                                 printk(KERN_WARNING
635                                        "ip_conntrack: table full, dropping"
636                                        " packet.\n");
637                         return ERR_PTR(-ENOMEM);
638                 }
639         }
640
641         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
642         if (!conntrack) {
643                 DEBUGP("Can't allocate conntrack.\n");
644                 atomic_dec(&ip_conntrack_count);
645                 return ERR_PTR(-ENOMEM);
646         }
647
648         memset(conntrack, 0, sizeof(*conntrack));
649         atomic_set(&conntrack->ct_general.use, 1);
650         conntrack->ct_general.destroy = destroy_conntrack;
651         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
652         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
653         /* Don't set timer yet: wait for confirmation */
654         init_timer(&conntrack->timeout);
655         conntrack->timeout.data = (unsigned long)conntrack;
656         conntrack->timeout.function = death_by_timeout;
657
658         return conntrack;
659 }
660
661 void
662 ip_conntrack_free(struct ip_conntrack *conntrack)
663 {
664         atomic_dec(&ip_conntrack_count);
665         kmem_cache_free(ip_conntrack_cachep, conntrack);
666 }
667
668 /* Allocate a new conntrack: we return -ENOMEM if classification
669  * failed due to stress.   Otherwise it really is unclassifiable */
670 static struct ip_conntrack_tuple_hash *
671 init_conntrack(struct ip_conntrack_tuple *tuple,
672                struct ip_conntrack_protocol *protocol,
673                struct sk_buff *skb)
674 {
675         struct ip_conntrack *conntrack;
676         struct ip_conntrack_tuple repl_tuple;
677         struct ip_conntrack_expect *exp;
678
679         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
680                 DEBUGP("Can't invert tuple.\n");
681                 return NULL;
682         }
683
684         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
685         if (conntrack == NULL || IS_ERR(conntrack))
686                 return (struct ip_conntrack_tuple_hash *)conntrack;
687
688         if (!protocol->new(conntrack, skb)) {
689                 ip_conntrack_free(conntrack);
690                 return NULL;
691         }
692
693         write_lock_bh(&ip_conntrack_lock);
694         exp = find_expectation(tuple);
695
696         if (exp) {
697                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
698                         conntrack, exp);
699                 /* Welcome, Mr. Bond.  We've been expecting you... */
700                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
701                 conntrack->master = exp->master;
702 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
703                 conntrack->mark = exp->master->mark;
704 #endif
705 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
706     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
707                 /* this is ugly, but there is no other place to put it */
708                 conntrack->nat.masq_index = exp->master->nat.masq_index;
709 #endif
710 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
711                 conntrack->secmark = exp->master->secmark;
712 #endif
713                 nf_conntrack_get(&conntrack->master->ct_general);
714                 CONNTRACK_STAT_INC(expect_new);
715         } else {
716                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
717
718                 CONNTRACK_STAT_INC(new);
719         }
720
721         /* Overload tuple linked list to put us in unconfirmed list. */
722         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
723
724         write_unlock_bh(&ip_conntrack_lock);
725
726         if (exp) {
727                 if (exp->expectfn)
728                         exp->expectfn(conntrack, exp);
729                 ip_conntrack_expect_put(exp);
730         }
731
732         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
733 }
734
735 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
736 static inline struct ip_conntrack *
737 resolve_normal_ct(struct sk_buff *skb,
738                   struct ip_conntrack_protocol *proto,
739                   int *set_reply,
740                   unsigned int hooknum,
741                   enum ip_conntrack_info *ctinfo)
742 {
743         struct ip_conntrack_tuple tuple;
744         struct ip_conntrack_tuple_hash *h;
745         struct ip_conntrack *ct;
746
747         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
748
749         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
750                                 &tuple, proto))
751                 return NULL;
752
753         /* look for tuple match */
754         h = ip_conntrack_find_get(&tuple, NULL);
755         if (!h) {
756                 h = init_conntrack(&tuple, proto, skb);
757                 if (!h)
758                         return NULL;
759                 if (IS_ERR(h))
760                         return (void *)h;
761         }
762         ct = tuplehash_to_ctrack(h);
763
764         /* It exists; we have (non-exclusive) reference. */
765         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
766                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
767                 /* Please set reply bit if this packet OK */
768                 *set_reply = 1;
769         } else {
770                 /* Once we've had two way comms, always ESTABLISHED. */
771                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
772                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
773                                ct);
774                         *ctinfo = IP_CT_ESTABLISHED;
775                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
776                         DEBUGP("ip_conntrack_in: related packet for %p\n",
777                                ct);
778                         *ctinfo = IP_CT_RELATED;
779                 } else {
780                         DEBUGP("ip_conntrack_in: new packet for %p\n",
781                                ct);
782                         *ctinfo = IP_CT_NEW;
783                 }
784                 *set_reply = 0;
785         }
786         skb->nfct = &ct->ct_general;
787         skb->nfctinfo = *ctinfo;
788         return ct;
789 }
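
The ctinfo chosen by resolve_normal_ct() depends only on the direction of the matching tuplehash and on the SEEN_REPLY/EXPECTED status bits. The sketch below condenses that decision into a standalone function with illustrative enum values (the kernel additionally folds IP_CT_IS_REPLY into the value for reply-direction packets).

    #include <stdio.h>

    enum dir  { DIR_ORIGINAL, DIR_REPLY };
    enum info { INFO_NEW, INFO_RELATED, INFO_ESTABLISHED, INFO_ESTABLISHED_REPLY };

    static enum info classify(enum dir dir, int seen_reply, int expected)
    {
            if (dir == DIR_REPLY)
                    return INFO_ESTABLISHED_REPLY;  /* reply direction */
            if (seen_reply)
                    return INFO_ESTABLISHED;        /* two-way traffic already seen */
            if (expected)
                    return INFO_RELATED;            /* first packet of an expected flow */
            return INFO_NEW;                        /* first packet of a new flow */
    }

    int main(void)
    {
            printf("%d %d %d %d\n",
                   classify(DIR_ORIGINAL, 0, 0),    /* NEW */
                   classify(DIR_ORIGINAL, 0, 1),    /* RELATED */
                   classify(DIR_ORIGINAL, 1, 0),    /* ESTABLISHED */
                   classify(DIR_REPLY, 1, 0));      /* ESTABLISHED, reply direction */
            return 0;
    }
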
790
791 /* Netfilter hook itself. */
792 unsigned int ip_conntrack_in(unsigned int hooknum,
793                              struct sk_buff **pskb,
794                              const struct net_device *in,
795                              const struct net_device *out,
796                              int (*okfn)(struct sk_buff *))
797 {
798         struct ip_conntrack *ct;
799         enum ip_conntrack_info ctinfo;
800         struct ip_conntrack_protocol *proto;
801         int set_reply = 0;
802         int ret;
803
804         /* Previously seen (loopback or untracked)?  Ignore. */
805         if ((*pskb)->nfct) {
806                 CONNTRACK_STAT_INC(ignore);
807                 return NF_ACCEPT;
808         }
809
810         /* Never happens */
811         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
812                 if (net_ratelimit()) {
813                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
814                                (*pskb)->nh.iph->protocol, hooknum);
815                 }
816                 return NF_DROP;
817         }
818
819 /* Doesn't cover locally-generated broadcast, so not worth it. */
820 #if 0
821         /* Ignore broadcast: no `connection'. */
822         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
823                 printk("Broadcast packet!\n");
824                 return NF_ACCEPT;
825         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF))
826                    == htonl(0x000000FF)) {
827                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
828                        NIPQUAD((*pskb)->nh.iph->saddr),
829                        NIPQUAD((*pskb)->nh.iph->daddr),
830                        (*pskb)->sk, (*pskb)->pkt_type);
831         }
832 #endif
833
834         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
835
836         /* It may be a special packet, error, unclean...
837          * inverse of the return code tells the netfilter
838          * core what to do with the packet. */
839         if (proto->error != NULL
840             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
841                 CONNTRACK_STAT_INC(error);
842                 CONNTRACK_STAT_INC(invalid);
843                 return -ret;
844         }
845
846         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
847                 /* Not valid part of a connection */
848                 CONNTRACK_STAT_INC(invalid);
849                 return NF_ACCEPT;
850         }
851
852         if (IS_ERR(ct)) {
853                 /* Too stressed to deal. */
854                 CONNTRACK_STAT_INC(drop);
855                 return NF_DROP;
856         }
857
858         IP_NF_ASSERT((*pskb)->nfct);
859
860         ret = proto->packet(ct, *pskb, ctinfo);
861         if (ret < 0) {
862                 /* Invalid: inverse of the return code tells
863                  * the netfilter core what to do*/
864                 nf_conntrack_put((*pskb)->nfct);
865                 (*pskb)->nfct = NULL;
866                 CONNTRACK_STAT_INC(invalid);
867                 return -ret;
868         }
869
870         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
871                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
872
873         return ret;
874 }
875
876 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
877                    const struct ip_conntrack_tuple *orig)
878 {
879         return ip_ct_invert_tuple(inverse, orig,
880                                   __ip_conntrack_proto_find(orig->dst.protonum));
881 }
882
883 /* Would two expected things clash? */
884 static inline int expect_clash(const struct ip_conntrack_expect *a,
885                                const struct ip_conntrack_expect *b)
886 {
887         /* Part covered by intersection of masks must be unequal,
888            otherwise they clash */
889         struct ip_conntrack_tuple intersect_mask
890                 = { { a->mask.src.ip & b->mask.src.ip,
891                       { a->mask.src.u.all & b->mask.src.u.all } },
892                     { a->mask.dst.ip & b->mask.dst.ip,
893                       { a->mask.dst.u.all & b->mask.dst.u.all },
894                       a->mask.dst.protonum & b->mask.dst.protonum } };
895
896         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
897 }
898
899 static inline int expect_matches(const struct ip_conntrack_expect *a,
900                                  const struct ip_conntrack_expect *b)
901 {
902         return a->master == b->master
903                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
904                 && ip_ct_tuple_equal(&a->mask, &b->mask);
905 }
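
The clash rule above can be read as: two expectations clash when the fields covered by the intersection of their masks are equal in their tuples, because a single incoming packet could then satisfy either one. A toy single-field illustration, not the kernel structures:

    #include <stdint.h>
    #include <stdio.h>

    struct mini_exp {
            uint32_t tuple;   /* e.g. an expected destination address */
            uint32_t mask;    /* which bits of the tuple are significant */
    };

    static int clash(const struct mini_exp *a, const struct mini_exp *b)
    {
            uint32_t both = a->mask & b->mask;      /* intersection of the masks */

            return (a->tuple & both) == (b->tuple & both);
    }

    int main(void)
    {
            struct mini_exp a = { 0x0a000001, 0xffffffff };  /* exactly 10.0.0.1 */
            struct mini_exp b = { 0x0a000099, 0xffffff00 };  /* any 10.0.0.x     */
            struct mini_exp c = { 0x0a000101, 0xffffff00 };  /* any 10.0.1.x     */

            printf("a/b clash: %d\n", clash(&a, &b));        /* 1: they overlap  */
            printf("a/c clash: %d\n", clash(&a, &c));        /* 0: disjoint      */
            return 0;
    }
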
906
907 /* Generally a bad idea to call this: could have matched already. */
908 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
909 {
910         struct ip_conntrack_expect *i;
911
912         write_lock_bh(&ip_conntrack_lock);
913         /* choose the oldest expectation to evict */
914         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
915                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
916                         ip_ct_unlink_expect(i);
917                         write_unlock_bh(&ip_conntrack_lock);
918                         ip_conntrack_expect_put(i);
919                         return;
920                 }
921         }
922         write_unlock_bh(&ip_conntrack_lock);
923 }
924
925 /* We don't increase the master conntrack refcount for non-fulfilled
926  * expectations. During the conntrack destruction, the expectations are
927  * always killed before the conntrack itself */
928 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
929 {
930         struct ip_conntrack_expect *new;
931
932         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
933         if (!new) {
934                 DEBUGP("expect_related: OOM allocating expect\n");
935                 return NULL;
936         }
937         new->master = me;
938         atomic_set(&new->use, 1);
939         return new;
940 }
941
942 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
943 {
944         if (atomic_dec_and_test(&exp->use))
945                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
946 }
947
948 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
949 {
950         atomic_inc(&exp->use);
951         exp->master->expecting++;
952         list_add(&exp->list, &ip_conntrack_expect_list);
953
954         init_timer(&exp->timeout);
955         exp->timeout.data = (unsigned long)exp;
956         exp->timeout.function = expectation_timed_out;
957         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
958         add_timer(&exp->timeout);
959
960         exp->id = ++ip_conntrack_expect_next_id;
961         atomic_inc(&exp->use);
962         CONNTRACK_STAT_INC(expect_create);
963 }
964
965 /* Race with expectations being used means we could have none to find; OK. */
966 static void evict_oldest_expect(struct ip_conntrack *master)
967 {
968         struct ip_conntrack_expect *i;
969
970         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
971                 if (i->master == master) {
972                         if (del_timer(&i->timeout)) {
973                                 ip_ct_unlink_expect(i);
974                                 ip_conntrack_expect_put(i);
975                         }
976                         break;
977                 }
978         }
979 }
980
981 static inline int refresh_timer(struct ip_conntrack_expect *i)
982 {
983         if (!del_timer(&i->timeout))
984                 return 0;
985
986         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
987         add_timer(&i->timeout);
988         return 1;
989 }
990
991 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
992 {
993         struct ip_conntrack_expect *i;
994         int ret;
995
996         DEBUGP("ip_conntrack_expect_related %p\n", expect);
997         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
998         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
999
1000         write_lock_bh(&ip_conntrack_lock);
1001         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1002                 if (expect_matches(i, expect)) {
1003                         /* Refresh timer: if it's dying, ignore.. */
1004                         if (refresh_timer(i)) {
1005                                 ret = 0;
1006                                 goto out;
1007                         }
1008                 } else if (expect_clash(i, expect)) {
1009                         ret = -EBUSY;
1010                         goto out;
1011                 }
1012         }
1013
1014         /* Will be over limit? */
1015         if (expect->master->helper->max_expected &&
1016             expect->master->expecting >= expect->master->helper->max_expected)
1017                 evict_oldest_expect(expect->master);
1018
1019         ip_conntrack_expect_insert(expect);
1020         ip_conntrack_expect_event(IPEXP_NEW, expect);
1021         ret = 0;
1022 out:
1023         write_unlock_bh(&ip_conntrack_lock);
1024         return ret;
1025 }
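
For context, a hedged sketch of how a conntrack helper might drive the expectation API defined in this file: allocate, fill in the tuple and mask, register, and drop the local reference. It is modelled loosely on what helpers such as the FTP helper do; the function name and the peer address/port parameters are hypothetical, and error handling is trimmed.

    static int example_expect_data_channel(struct ip_conntrack *ct,
                                           __be32 peer_ip, __be16 peer_port)
    {
            struct ip_conntrack_expect *exp;
            int ret;

            exp = ip_conntrack_expect_alloc(ct);
            if (exp == NULL)
                    return -ENOMEM;

            /* expect a TCP connection from the original destination host, any
             * source port, to the advertised address/port of the data channel */
            exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.ip;
            exp->tuple.src.u.tcp.port = 0;
            exp->tuple.dst.ip = peer_ip;
            exp->tuple.dst.u.tcp.port = peer_port;
            exp->tuple.dst.protonum = IPPROTO_TCP;

            exp->mask.src.ip = htonl(0xffffffff);
            exp->mask.src.u.tcp.port = 0;           /* wildcard source port */
            exp->mask.dst.ip = htonl(0xffffffff);
            exp->mask.dst.u.tcp.port = htons(0xffff);
            exp->mask.dst.protonum = 0xff;

            exp->expectfn = NULL;
            exp->flags = 0;

            ret = ip_conntrack_expect_related(exp);
            ip_conntrack_expect_put(exp);           /* drop the allocation reference */
            return ret;
    }
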
1026
1027 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1028    implicitly racy: see __ip_conntrack_confirm */
1029 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1030                               const struct ip_conntrack_tuple *newreply)
1031 {
1032         write_lock_bh(&ip_conntrack_lock);
1033         /* Should be unconfirmed, so not in hash table yet */
1034         IP_NF_ASSERT(!is_confirmed(conntrack));
1035
1036         DEBUGP("Altering reply tuple of %p to ", conntrack);
1037         DUMP_TUPLE(newreply);
1038
1039         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1040         if (!conntrack->master && conntrack->expecting == 0)
1041                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1042         write_unlock_bh(&ip_conntrack_lock);
1043 }
1044
1045 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1046 {
1047         BUG_ON(me->timeout == 0);
1048         write_lock_bh(&ip_conntrack_lock);
1049         list_add(&me->list, &helpers);
1050         write_unlock_bh(&ip_conntrack_lock);
1051
1052         return 0;
1053 }
1054
1055 struct ip_conntrack_helper *
1056 __ip_conntrack_helper_find_byname(const char *name)
1057 {
1058         struct ip_conntrack_helper *h;
1059
1060         list_for_each_entry(h, &helpers, list) {
1061                 if (!strcmp(h->name, name))
1062                         return h;
1063         }
1064
1065         return NULL;
1066 }
1067
1068 static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1069                           const struct ip_conntrack_helper *me)
1070 {
1071         if (tuplehash_to_ctrack(i)->helper == me) {
1072                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1073                 tuplehash_to_ctrack(i)->helper = NULL;
1074         }
1075 }
1076
1077 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1078 {
1079         unsigned int i;
1080         struct ip_conntrack_tuple_hash *h;
1081         struct ip_conntrack_expect *exp, *tmp;
1082
1083         /* Need write lock here, to delete helper. */
1084         write_lock_bh(&ip_conntrack_lock);
1085         list_del(&me->list);
1086
1087         /* Get rid of expectations */
1088         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1089                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1090                         ip_ct_unlink_expect(exp);
1091                         ip_conntrack_expect_put(exp);
1092                 }
1093         }
1094         /* Get rid of expecteds, set helpers to NULL. */
1095         list_for_each_entry(h, &unconfirmed, list)
1096                 unhelp(h, me);
1097         for (i = 0; i < ip_conntrack_htable_size; i++) {
1098                 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1099                         unhelp(h, me);
1100         }
1101         write_unlock_bh(&ip_conntrack_lock);
1102
1103         /* Someone could be still looking at the helper in a bh. */
1104         synchronize_net();
1105 }
1106
1107 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1108 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1109                         enum ip_conntrack_info ctinfo,
1110                         const struct sk_buff *skb,
1111                         unsigned long extra_jiffies,
1112                         int do_acct)
1113 {
1114         int event = 0;
1115
1116         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1117         IP_NF_ASSERT(skb);
1118
1119         write_lock_bh(&ip_conntrack_lock);
1120
1121         /* Only update if this is not a fixed timeout */
1122         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1123                 write_unlock_bh(&ip_conntrack_lock);
1124                 return;
1125         }
1126
1127         /* If not in hash table, timer will not be active yet */
1128         if (!is_confirmed(ct)) {
1129                 ct->timeout.expires = extra_jiffies;
1130                 event = IPCT_REFRESH;
1131         } else {
1132                 /* Need del_timer for race avoidance (may already be dying). */
1133                 if (del_timer(&ct->timeout)) {
1134                         ct->timeout.expires = jiffies + extra_jiffies;
1135                         add_timer(&ct->timeout);
1136                         event = IPCT_REFRESH;
1137                 }
1138         }
1139
1140 #ifdef CONFIG_IP_NF_CT_ACCT
1141         if (do_acct) {
1142                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1143                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1144                                                 ntohs(skb->nh.iph->tot_len);
1145                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1146                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1147                         event |= IPCT_COUNTER_FILLING;
1148         }
1149 #endif
1150
1151         write_unlock_bh(&ip_conntrack_lock);
1152
1153         /* must be unlocked when calling event cache */
1154         if (event)
1155                 ip_conntrack_event_cache(event, skb);
1156 }
1157
1158 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1159     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1160 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1161  * in ip_conntrack_core, since we don't want the protocols to autoload
1162  * or depend on ctnetlink */
1163 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1164                                const struct ip_conntrack_tuple *tuple)
1165 {
1166         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
1167                 &tuple->src.u.tcp.port);
1168         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
1169                 &tuple->dst.u.tcp.port);
1170         return 0;
1171
1172 nfattr_failure:
1173         return -1;
1174 }
1175
1176 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1177                                struct ip_conntrack_tuple *t)
1178 {
1179         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1180                 return -EINVAL;
1181
1182         t->src.u.tcp.port =
1183                 *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1184         t->dst.u.tcp.port =
1185                 *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1186
1187         return 0;
1188 }
1189 #endif
1190
1191 /* Returns new sk_buff, or NULL */
1192 struct sk_buff *
1193 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1194 {
1195         skb_orphan(skb);
1196
1197         local_bh_disable();
1198         skb = ip_defrag(skb, user);
1199         local_bh_enable();
1200
1201         if (skb)
1202                 ip_send_check(skb->nh.iph);
1203         return skb;
1204 }
1205
1206 /* Used by ipt_REJECT. */
1207 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1208 {
1209         struct ip_conntrack *ct;
1210         enum ip_conntrack_info ctinfo;
1211
1212         /* This ICMP is in reverse direction to the packet which caused it */
1213         ct = ip_conntrack_get(skb, &ctinfo);
1214
1215         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1216                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1217         else
1218                 ctinfo = IP_CT_RELATED;
1219
1220         /* Attach to new skbuff, and increment count */
1221         nskb->nfct = &ct->ct_general;
1222         nskb->nfctinfo = ctinfo;
1223         nf_conntrack_get(nskb->nfct);
1224 }
1225
1226 /* Bring out ya dead! */
1227 static struct ip_conntrack *
1228 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1229                 void *data, unsigned int *bucket)
1230 {
1231         struct ip_conntrack_tuple_hash *h;
1232         struct ip_conntrack *ct;
1233
1234         write_lock_bh(&ip_conntrack_lock);
1235         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1236                 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1237                         ct = tuplehash_to_ctrack(h);
1238                         if (iter(ct, data))
1239                                 goto found;
1240                 }
1241         }
1242         list_for_each_entry(h, &unconfirmed, list) {
1243                 ct = tuplehash_to_ctrack(h);
1244                 if (iter(ct, data))
1245                         goto found;
1246         }
1247         write_unlock_bh(&ip_conntrack_lock);
1248         return NULL;
1249
1250 found:
1251         atomic_inc(&ct->ct_general.use);
1252         write_unlock_bh(&ip_conntrack_lock);
1253         return ct;
1254 }
1255
1256 void
1257 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1258 {
1259         struct ip_conntrack *ct;
1260         unsigned int bucket = 0;
1261
1262         while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1263                 /* Time to push up daisies... */
1264                 if (del_timer(&ct->timeout))
1265                         death_by_timeout((unsigned long)ct);
1266                 /* ... else the timer will get him soon. */
1267
1268                 ip_conntrack_put(ct);
1269         }
1270 }
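
A sketch of a caller-supplied predicate for ip_ct_iterate_cleanup(): the callback returns non-zero for every entry that should be killed, exactly like kill_all() further down. The helper names below are hypothetical.

    /* kill every conntrack whose original source address matches 'data' */
    static int kill_matching_saddr(struct ip_conntrack *i, void *data)
    {
            __be32 saddr = *(__be32 *)data;

            return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.ip == saddr;
    }

    /* usage: drop all entries originating from 10.0.0.1 */
    static void flush_from_host(void)
    {
            __be32 saddr = htonl(0x0a000001);

            ip_ct_iterate_cleanup(kill_matching_saddr, &saddr);
    }
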
1271
1272 /* Fast function for those who don't want to parse /proc (and I don't
1273    blame them). */
1274 /* Reversing the socket's dst/src point of view gives us the reply
1275    mapping. */
1276 static int
1277 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1278 {
1279         struct inet_sock *inet = inet_sk(sk);
1280         struct ip_conntrack_tuple_hash *h;
1281         struct ip_conntrack_tuple tuple;
1282
1283         IP_CT_TUPLE_U_BLANK(&tuple);
1284         tuple.src.ip = inet->rcv_saddr;
1285         tuple.src.u.tcp.port = inet->sport;
1286         tuple.dst.ip = inet->daddr;
1287         tuple.dst.u.tcp.port = inet->dport;
1288         tuple.dst.protonum = IPPROTO_TCP;
1289
1290         /* We only do TCP at the moment: is there a better way? */
1291         if (strcmp(sk->sk_prot->name, "TCP")) {
1292                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1293                 return -ENOPROTOOPT;
1294         }
1295
1296         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1297                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1298                        *len, sizeof(struct sockaddr_in));
1299                 return -EINVAL;
1300         }
1301
1302         h = ip_conntrack_find_get(&tuple, NULL);
1303         if (h) {
1304                 struct sockaddr_in sin;
1305                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1306
1307                 sin.sin_family = AF_INET;
1308                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1309                         .tuple.dst.u.tcp.port;
1310                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1311                         .tuple.dst.ip;
1312                 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1313
1314                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1315                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1316                 ip_conntrack_put(ct);
1317                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1318                         return -EFAULT;
1319                 else
1320                         return 0;
1321         }
1322         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1323                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1324                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1325         return -ENOENT;
1326 }
1327
1328 static struct nf_sockopt_ops so_getorigdst = {
1329         .pf             = PF_INET,
1330         .get_optmin     = SO_ORIGINAL_DST,
1331         .get_optmax     = SO_ORIGINAL_DST+1,
1332         .get            = &getorigdst,
1333 };
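
The sockopt registered above is what userspace transparent proxies query to recover the pre-NAT destination of a REDIRECTed TCP connection. A minimal userspace sketch of the consumer side (fd is an accepted socket; error handling trimmed):

    #include <stdio.h>
    #include <sys/socket.h>
    #include <netinet/in.h>
    #include <arpa/inet.h>
    #include <linux/netfilter_ipv4.h>   /* SO_ORIGINAL_DST */

    static int print_original_dst(int fd)
    {
            struct sockaddr_in orig;
            socklen_t len = sizeof(orig);

            /* ask conntrack for the original (pre-NAT) destination */
            if (getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) != 0) {
                    perror("SO_ORIGINAL_DST");
                    return -1;
            }
            printf("original destination: %s:%u\n",
                   inet_ntoa(orig.sin_addr), ntohs(orig.sin_port));
            return 0;
    }
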
1334
1335 static int kill_all(struct ip_conntrack *i, void *data)
1336 {
1337         return 1;
1338 }
1339
1340 void ip_conntrack_flush(void)
1341 {
1342         ip_ct_iterate_cleanup(kill_all, NULL);
1343 }
1344
1345 static void free_conntrack_hash(struct list_head *hash, int vmalloced, int size)
1346 {
1347         if (vmalloced)
1348                 vfree(hash);
1349         else
1350                 free_pages((unsigned long)hash,
1351                            get_order(sizeof(struct list_head) * size));
1352 }
1353
1354 /* Mishearing the voices in his head, our hero wonders how he's
1355    supposed to kill the mall. */
1356 void ip_conntrack_cleanup(void)
1357 {
1358         ip_ct_attach = NULL;
1359
1360         /* This makes sure all current packets have passed through
1361            netfilter framework.  Roll on, two-stage module
1362            delete... */
1363         synchronize_net();
1364
1365         ip_ct_event_cache_flush();
1366  i_see_dead_people:
1367         ip_conntrack_flush();
1368         if (atomic_read(&ip_conntrack_count) != 0) {
1369                 schedule();
1370                 goto i_see_dead_people;
1371         }
1372         /* wait until all references to ip_conntrack_untracked are dropped */
1373         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1374                 schedule();
1375
1376         kmem_cache_destroy(ip_conntrack_cachep);
1377         kmem_cache_destroy(ip_conntrack_expect_cachep);
1378         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1379                             ip_conntrack_htable_size);
1380         nf_unregister_sockopt(&so_getorigdst);
1381 }
1382
1383 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1384 {
1385         struct list_head *hash;
1386         unsigned int i;
1387
1388         *vmalloced = 0;
1389         hash = (void*)__get_free_pages(GFP_KERNEL,
1390                                        get_order(sizeof(struct list_head)
1391                                                  * size));
1392         if (!hash) {
1393                 *vmalloced = 1;
1394                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1395                 hash = vmalloc(sizeof(struct list_head) * size);
1396         }
1397
1398         if (hash)
1399                 for (i = 0; i < size; i++)
1400                         INIT_LIST_HEAD(&hash[i]);
1401
1402         return hash;
1403 }
1404
1405 static int set_hashsize(const char *val, struct kernel_param *kp)
1406 {
1407         int i, bucket, hashsize, vmalloced;
1408         int old_vmalloced, old_size;
1409         int rnd;
1410         struct list_head *hash, *old_hash;
1411         struct ip_conntrack_tuple_hash *h;
1412
1413         /* On boot, we can set this without any fancy locking. */
1414         if (!ip_conntrack_htable_size)
1415                 return param_set_int(val, kp);
1416
1417         hashsize = simple_strtol(val, NULL, 0);
1418         if (!hashsize)
1419                 return -EINVAL;
1420
1421         hash = alloc_hashtable(hashsize, &vmalloced);
1422         if (!hash)
1423                 return -ENOMEM;
1424
1425         /* We have to rehash for the new table anyway, so we can also
1426          * use a new random seed */
1427         get_random_bytes(&rnd, 4);
1428
1429         write_lock_bh(&ip_conntrack_lock);
1430         for (i = 0; i < ip_conntrack_htable_size; i++) {
1431                 while (!list_empty(&ip_conntrack_hash[i])) {
1432                         h = list_entry(ip_conntrack_hash[i].next,
1433                                        struct ip_conntrack_tuple_hash, list);
1434                         list_del(&h->list);
1435                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1436                         list_add_tail(&h->list, &hash[bucket]);
1437                 }
1438         }
1439         old_size = ip_conntrack_htable_size;
1440         old_vmalloced = ip_conntrack_vmalloc;
1441         old_hash = ip_conntrack_hash;
1442
1443         ip_conntrack_htable_size = hashsize;
1444         ip_conntrack_vmalloc = vmalloced;
1445         ip_conntrack_hash = hash;
1446         ip_conntrack_hash_rnd = rnd;
1447         write_unlock_bh(&ip_conntrack_lock);
1448
1449         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1450         return 0;
1451 }
1452
1453 module_param_call(hashsize, set_hashsize, param_get_uint,
1454                   &ip_conntrack_htable_size, 0600);
1455
1456 int __init ip_conntrack_init(void)
1457 {
1458         unsigned int i;
1459         int ret;
1460
1461         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1462          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1463         if (!ip_conntrack_htable_size) {
1464                 ip_conntrack_htable_size
1465                         = (((num_physpages << PAGE_SHIFT) / 16384)
1466                            / sizeof(struct list_head));
1467                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1468                         ip_conntrack_htable_size = 8192;
1469                 if (ip_conntrack_htable_size < 16)
1470                         ip_conntrack_htable_size = 16;
1471         }
1472         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1473
1474         printk("ip_conntrack version %s (%u buckets, %d max)"
1475                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1476                ip_conntrack_htable_size, ip_conntrack_max,
1477                sizeof(struct ip_conntrack));
1478
1479         ret = nf_register_sockopt(&so_getorigdst);
1480         if (ret != 0) {
1481                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1482                 return ret;
1483         }
1484
1485         ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1486                                             &ip_conntrack_vmalloc);
1487         if (!ip_conntrack_hash) {
1488                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1489                 goto err_unreg_sockopt;
1490         }
1491
1492         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1493                                                 sizeof(struct ip_conntrack), 0,
1494                                                 0, NULL, NULL);
1495         if (!ip_conntrack_cachep) {
1496                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1497                 goto err_free_hash;
1498         }
1499
1500         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1501                                         sizeof(struct ip_conntrack_expect),
1502                                         0, 0, NULL, NULL);
1503         if (!ip_conntrack_expect_cachep) {
1504                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1505                 goto err_free_conntrack_slab;
1506         }
1507
1508         /* Don't NEED lock here, but good form anyway. */
1509         write_lock_bh(&ip_conntrack_lock);
1510         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1511                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1512         /* Sew in builtin protocols. */
1513         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1514         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1515         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1516         write_unlock_bh(&ip_conntrack_lock);
1517
1518         /* For use by ipt_REJECT */
1519         ip_ct_attach = ip_conntrack_attach;
1520
1521         /* Set up fake conntrack:
1522             - to never be deleted, not in any hashes */
1523         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1524         /*  - and to look like a confirmed connection */
1525         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1526
1527         return ret;
1528
1529 err_free_conntrack_slab:
1530         kmem_cache_destroy(ip_conntrack_cachep);
1531 err_free_hash:
1532         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1533                             ip_conntrack_htable_size);
1534 err_unreg_sockopt:
1535         nf_unregister_sockopt(&so_getorigdst);
1536
1537         return -ENOMEM;
1538 }