net/ipv4/netfilter/ip_conntrack_core.c
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell  
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
40
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expectation
42    registrations, and the conntrack timers. */
43 #define ASSERT_READ_LOCK(x)
44 #define ASSERT_WRITE_LOCK(x)
45
46 #include <linux/netfilter_ipv4/ip_conntrack.h>
47 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
48 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
49 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
50 #include <linux/netfilter_ipv4/listhelp.h>
51
52 #define IP_CONNTRACK_VERSION    "2.4"
53
54 #if 0
55 #define DEBUGP printk
56 #else
57 #define DEBUGP(format, args...)
58 #endif
59
60 DEFINE_RWLOCK(ip_conntrack_lock);
61
62 /* ip_conntrack_standalone needs this */
63 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
64
65 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
66 LIST_HEAD(ip_conntrack_expect_list);
67 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO];
68 static LIST_HEAD(helpers);
69 unsigned int ip_conntrack_htable_size __read_mostly = 0;
70 int ip_conntrack_max __read_mostly;
71 struct list_head *ip_conntrack_hash;
72 static kmem_cache_t *ip_conntrack_cachep __read_mostly;
73 static kmem_cache_t *ip_conntrack_expect_cachep __read_mostly;
74 struct ip_conntrack ip_conntrack_untracked;
75 unsigned int ip_ct_log_invalid __read_mostly;
76 static LIST_HEAD(unconfirmed);
77 static int ip_conntrack_vmalloc;
78
79 static unsigned int ip_conntrack_next_id;
80 static unsigned int ip_conntrack_expect_next_id;
81 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
82 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
83 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
84
85 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
86
87 /* Deliver cached events and clear the cache entry - must be called with
88  * softirqs disabled locally */
89 static inline void
90 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
91 {
92         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
93         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
94                 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
95                                     ecache->ct);
96         ecache->events = 0;
97         ip_conntrack_put(ecache->ct);
98         ecache->ct = NULL;
99 }
100
101 /* Deliver all cached events for a particular conntrack. This is called
102  * by code prior to async packet handling or freeing the skb */
103 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
104 {
105         struct ip_conntrack_ecache *ecache;
106         
107         local_bh_disable();
108         ecache = &__get_cpu_var(ip_conntrack_ecache);
109         if (ecache->ct == ct)
110                 __ip_ct_deliver_cached_events(ecache);
111         local_bh_enable();
112 }
113
114 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
115 {
116         struct ip_conntrack_ecache *ecache;
117
118         /* take care of delivering potentially old events */
119         ecache = &__get_cpu_var(ip_conntrack_ecache);
120         BUG_ON(ecache->ct == ct);
121         if (ecache->ct)
122                 __ip_ct_deliver_cached_events(ecache);
123         /* initialize for this conntrack/packet */
124         ecache->ct = ct;
125         nf_conntrack_get(&ct->ct_general);
126 }
127
128 /* Flush the event cache - touches other CPUs' data and must not be called
129  * while packets are still passing through the code */
130 static void ip_ct_event_cache_flush(void)
131 {
132         struct ip_conntrack_ecache *ecache;
133         int cpu;
134
135         for_each_possible_cpu(cpu) {
136                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
137                 if (ecache->ct)
138                         ip_conntrack_put(ecache->ct);
139         }
140 }
141 #else
142 static inline void ip_ct_event_cache_flush(void) {}
143 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
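/* Usage sketch (illustrative only, not compiled in): producers mark
 * events on the conntrack attached to the current skb through the
 * ip_conntrack_event_cache() wrapper from ip_conntrack.h, and the
 * cached bits are delivered in one batch once the packet is done. */
#if 0
	ip_conntrack_event_cache(IPCT_PROTOINFO, skb);	/* cache an event */
	/* ... further processing may cache more events ... */
	ip_ct_deliver_cached_events(ct);	/* deliver everything for ct */
#endif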
144
145 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
146
147 static int ip_conntrack_hash_rnd_initted;
148 static unsigned int ip_conntrack_hash_rnd;
149
150 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
151                             unsigned int size, unsigned int rnd)
152 {
153         return (jhash_3words(tuple->src.ip,
154                              (tuple->dst.ip ^ tuple->dst.protonum),
155                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
156                              rnd) % size);
157 }
158
159 static u_int32_t
160 hash_conntrack(const struct ip_conntrack_tuple *tuple)
161 {
162         return __hash_conntrack(tuple, ip_conntrack_htable_size,
163                                 ip_conntrack_hash_rnd);
164 }
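/* Bucket-selection sketch (illustrative only, not compiled in; the
 * addresses and ports are made-up values): each direction of a
 * connection has its own tuple, hashed independently into
 * ip_conntrack_hash[]. */
#if 0
	struct ip_conntrack_tuple t = {
		.src = { .ip = htonl(0x0a000001),		/* 10.0.0.1 */
			 .u = { .tcp = { .port = htons(12345) } } },
		.dst = { .ip = htonl(0x0a000002),		/* 10.0.0.2 */
			 .u = { .tcp = { .port = htons(80) } },
			 .protonum = IPPROTO_TCP },
	};
	unsigned int bucket = hash_conntrack(&t);
	/* &ip_conntrack_hash[bucket] is the chain this tuple would land on */
#endif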
165
166 int
167 ip_ct_get_tuple(const struct iphdr *iph,
168                 const struct sk_buff *skb,
169                 unsigned int dataoff,
170                 struct ip_conntrack_tuple *tuple,
171                 const struct ip_conntrack_protocol *protocol)
172 {
173         /* Should never happen */
174         if (iph->frag_off & htons(IP_OFFSET)) {
175                 printk("ip_conntrack_core: Frag of proto %u.\n",
176                        iph->protocol);
177                 return 0;
178         }
179
180         tuple->src.ip = iph->saddr;
181         tuple->dst.ip = iph->daddr;
182         tuple->dst.protonum = iph->protocol;
183         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
184
185         return protocol->pkt_to_tuple(skb, dataoff, tuple);
186 }
187
188 int
189 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
190                    const struct ip_conntrack_tuple *orig,
191                    const struct ip_conntrack_protocol *protocol)
192 {
193         inverse->src.ip = orig->dst.ip;
194         inverse->dst.ip = orig->src.ip;
195         inverse->dst.protonum = orig->dst.protonum;
196         inverse->dst.dir = !orig->dst.dir;
197
198         return protocol->invert_tuple(inverse, orig);
199 }
200
201
202 /* ip_conntrack_expect helper functions */
203 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
204 {
205         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
206         IP_NF_ASSERT(!timer_pending(&exp->timeout));
207         list_del(&exp->list);
208         CONNTRACK_STAT_INC(expect_delete);
209         exp->master->expecting--;
210         ip_conntrack_expect_put(exp);
211 }
212
213 static void expectation_timed_out(unsigned long ul_expect)
214 {
215         struct ip_conntrack_expect *exp = (void *)ul_expect;
216
217         write_lock_bh(&ip_conntrack_lock);
218         ip_ct_unlink_expect(exp);
219         write_unlock_bh(&ip_conntrack_lock);
220         ip_conntrack_expect_put(exp);
221 }
222
223 struct ip_conntrack_expect *
224 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
225 {
226         struct ip_conntrack_expect *i;
227         
228         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
229                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)) {
230                         atomic_inc(&i->use);
231                         return i;
232                 }
233         }
234         return NULL;
235 }
236
237 /* Just find an expectation corresponding to a tuple. */
238 struct ip_conntrack_expect *
239 ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
240 {
241         struct ip_conntrack_expect *i;
242         
243         read_lock_bh(&ip_conntrack_lock);
244         i = __ip_conntrack_expect_find(tuple);
245         read_unlock_bh(&ip_conntrack_lock);
246
247         return i;
248 }
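/* Reference discipline for callers, sketched (not compiled in): a
 * successful find returns the expectation with its use count raised,
 * so the caller must drop it again when done. */
#if 0
	struct ip_conntrack_expect *exp = ip_conntrack_expect_find(&tuple);

	if (exp) {
		/* ... inspect exp->tuple, exp->master ... */
		ip_conntrack_expect_put(exp);
	}
#endif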
249
250 /* If an expectation for this connection is found, it is deleted from
251  * the global list and then returned. */
252 static struct ip_conntrack_expect *
253 find_expectation(const struct ip_conntrack_tuple *tuple)
254 {
255         struct ip_conntrack_expect *i;
256
257         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
258                 /* If the master is not in the hash table yet (ie. the packet
259                    hasn't left this machine yet), how can the other end know
260                    about the expected connection?  These are not the droids
261                    you're looking for: if the master ct never got confirmed,
262                    we'd hold a reference and weird things would happen. */
263                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
264                     && is_confirmed(i->master)) {
265                         if (i->flags & IP_CT_EXPECT_PERMANENT) {
266                                 atomic_inc(&i->use);
267                                 return i;
268                         } else if (del_timer(&i->timeout)) {
269                                 ip_ct_unlink_expect(i);
270                                 return i;
271                         }
272                 }
273         }
274         return NULL;
275 }
276
277 /* delete all expectations for this conntrack */
278 void ip_ct_remove_expectations(struct ip_conntrack *ct)
279 {
280         struct ip_conntrack_expect *i, *tmp;
281
282         /* Optimization: most connections never expect any others. */
283         if (ct->expecting == 0)
284                 return;
285
286         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
287                 if (i->master == ct && del_timer(&i->timeout)) {
288                         ip_ct_unlink_expect(i);
289                         ip_conntrack_expect_put(i);
290                 }
291         }
292 }
293
294 static void
295 clean_from_lists(struct ip_conntrack *ct)
296 {
297         unsigned int ho, hr;
298         
299         DEBUGP("clean_from_lists(%p)\n", ct);
300         ASSERT_WRITE_LOCK(&ip_conntrack_lock);
301
302         ho = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
303         hr = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
304         LIST_DELETE(&ip_conntrack_hash[ho], &ct->tuplehash[IP_CT_DIR_ORIGINAL]);
305         LIST_DELETE(&ip_conntrack_hash[hr], &ct->tuplehash[IP_CT_DIR_REPLY]);
306
307         /* Destroy all pending expectations */
308         ip_ct_remove_expectations(ct);
309 }
310
311 static void
312 destroy_conntrack(struct nf_conntrack *nfct)
313 {
314         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
315         struct ip_conntrack_protocol *proto;
316
317         DEBUGP("destroy_conntrack(%p)\n", ct);
318         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
319         IP_NF_ASSERT(!timer_pending(&ct->timeout));
320
321         ip_conntrack_event(IPCT_DESTROY, ct);
322         set_bit(IPS_DYING_BIT, &ct->status);
323
324         /* To make sure we don't get any weird locking issues here:
325          * destroy_conntrack() MUST NOT be called with a write lock
326          * to ip_conntrack_lock!!! -HW */
327         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
328         if (proto && proto->destroy)
329                 proto->destroy(ct);
330
331         if (ip_conntrack_destroyed)
332                 ip_conntrack_destroyed(ct);
333
334         write_lock_bh(&ip_conntrack_lock);
335         /* Expectations will have been removed in clean_from_lists,
336          * except TFTP can create an expectation on the first packet,
337          * before connection is in the list, so we need to clean here,
338          * too. */
339         ip_ct_remove_expectations(ct);
340
341         /* We overload the first tuple to link into the unconfirmed list. */
342         if (!is_confirmed(ct)) {
343                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
344                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
345         }
346
347         CONNTRACK_STAT_INC(delete);
348         write_unlock_bh(&ip_conntrack_lock);
349
350         if (ct->master)
351                 ip_conntrack_put(ct->master);
352
353         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
354         ip_conntrack_free(ct);
355 }
356
357 static void death_by_timeout(unsigned long ul_conntrack)
358 {
359         struct ip_conntrack *ct = (void *)ul_conntrack;
360
361         write_lock_bh(&ip_conntrack_lock);
362         /* Inside lock so preempt is disabled on module removal path.
363          * Otherwise we can get spurious warnings. */
364         CONNTRACK_STAT_INC(delete_list);
365         clean_from_lists(ct);
366         write_unlock_bh(&ip_conntrack_lock);
367         ip_conntrack_put(ct);
368 }
369
370 static inline int
371 conntrack_tuple_cmp(const struct ip_conntrack_tuple_hash *i,
372                     const struct ip_conntrack_tuple *tuple,
373                     const struct ip_conntrack *ignored_conntrack)
374 {
375         ASSERT_READ_LOCK(&ip_conntrack_lock);
376         return tuplehash_to_ctrack(i) != ignored_conntrack
377                 && ip_ct_tuple_equal(tuple, &i->tuple);
378 }
379
380 struct ip_conntrack_tuple_hash *
381 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
382                     const struct ip_conntrack *ignored_conntrack)
383 {
384         struct ip_conntrack_tuple_hash *h;
385         unsigned int hash = hash_conntrack(tuple);
386
387         ASSERT_READ_LOCK(&ip_conntrack_lock);
388         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
389                 if (conntrack_tuple_cmp(h, tuple, ignored_conntrack)) {
390                         CONNTRACK_STAT_INC(found);
391                         return h;
392                 }
393                 CONNTRACK_STAT_INC(searched);
394         }
395
396         return NULL;
397 }
398
399 /* Find a connection corresponding to a tuple. */
400 struct ip_conntrack_tuple_hash *
401 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
402                       const struct ip_conntrack *ignored_conntrack)
403 {
404         struct ip_conntrack_tuple_hash *h;
405
406         read_lock_bh(&ip_conntrack_lock);
407         h = __ip_conntrack_find(tuple, ignored_conntrack);
408         if (h)
409                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
410         read_unlock_bh(&ip_conntrack_lock);
411
412         return h;
413 }
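/* Lookup sketch (illustrative, not compiled in): find_get returns the
 * tuple hash with a reference held on its conntrack, released with
 * ip_conntrack_put() when the caller is done. */
#if 0
	struct ip_conntrack_tuple_hash *h = ip_conntrack_find_get(&tuple, NULL);

	if (h) {
		struct ip_conntrack *ct = tuplehash_to_ctrack(h);
		/* ... read or update ct ... */
		ip_conntrack_put(ct);
	}
#endif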
414
415 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
416                                         unsigned int hash,
417                                         unsigned int repl_hash) 
418 {
419         ct->id = ++ip_conntrack_next_id;
420         list_prepend(&ip_conntrack_hash[hash],
421                      &ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
422         list_prepend(&ip_conntrack_hash[repl_hash],
423                      &ct->tuplehash[IP_CT_DIR_REPLY].list);
424 }
425
426 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
427 {
428         unsigned int hash, repl_hash;
429
430         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
431         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
432
433         write_lock_bh(&ip_conntrack_lock);
434         __ip_conntrack_hash_insert(ct, hash, repl_hash);
435         write_unlock_bh(&ip_conntrack_lock);
436 }
437
438 /* Confirm a connection given skb; places it in hash table */
439 int
440 __ip_conntrack_confirm(struct sk_buff **pskb)
441 {
442         unsigned int hash, repl_hash;
443         struct ip_conntrack *ct;
444         enum ip_conntrack_info ctinfo;
445
446         ct = ip_conntrack_get(*pskb, &ctinfo);
447
448         /* ipt_REJECT uses ip_conntrack_attach to attach related
449            ICMP/TCP RST packets in other direction.  Actual packet
450            which created connection will be IP_CT_NEW or for an
451            expected connection, IP_CT_RELATED. */
452         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
453                 return NF_ACCEPT;
454
455         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
456         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
457
458         /* We're not in hash table, and we refuse to set up related
459            connections for unconfirmed conns.  But packet copies and
460            REJECT will give spurious warnings here. */
461         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
462
463         /* No external references means no one else could have
464            confirmed us. */
465         IP_NF_ASSERT(!is_confirmed(ct));
466         DEBUGP("Confirming conntrack %p\n", ct);
467
468         write_lock_bh(&ip_conntrack_lock);
469
470         /* See if there's one in the list already, including reverse:
471            NAT could have grabbed it without realizing, since we're
472            not in the hash.  If there is, we lost the race. */
473         if (!LIST_FIND(&ip_conntrack_hash[hash],
474                        conntrack_tuple_cmp,
475                        struct ip_conntrack_tuple_hash *,
476                        &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, NULL)
477             && !LIST_FIND(&ip_conntrack_hash[repl_hash],
478                           conntrack_tuple_cmp,
479                           struct ip_conntrack_tuple_hash *,
480                           &ct->tuplehash[IP_CT_DIR_REPLY].tuple, NULL)) {
481                 /* Remove from unconfirmed list */
482                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
483
484                 __ip_conntrack_hash_insert(ct, hash, repl_hash);
485                 /* Timer relative to confirmation time, not original
486                    setting time, otherwise we'd get timer wrap in
487                    weird delay cases. */
488                 ct->timeout.expires += jiffies;
489                 add_timer(&ct->timeout);
490                 atomic_inc(&ct->ct_general.use);
491                 set_bit(IPS_CONFIRMED_BIT, &ct->status);
492                 CONNTRACK_STAT_INC(insert);
493                 write_unlock_bh(&ip_conntrack_lock);
494                 if (ct->helper)
495                         ip_conntrack_event_cache(IPCT_HELPER, *pskb);
496 #ifdef CONFIG_IP_NF_NAT_NEEDED
497                 if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
498                     test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
499                         ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
500 #endif
501                 ip_conntrack_event_cache(master_ct(ct) ?
502                                          IPCT_RELATED : IPCT_NEW, *pskb);
503
504                 return NF_ACCEPT;
505         }
506
507         CONNTRACK_STAT_INC(insert_failed);
508         write_unlock_bh(&ip_conntrack_lock);
509
510         return NF_DROP;
511 }
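/* For orientation, the hook placement (registered in
 * ip_conntrack_standalone.c) is roughly:
 *
 *	PRE_ROUTING / LOCAL_OUT -> ip_conntrack_in()       ct created,
 *	                                                   still unconfirmed
 *	LOCAL_IN / POST_ROUTING -> ip_conntrack_confirm()  wrapper around
 *	                           __ip_conntrack_confirm() above
 *
 * so a conntrack only enters the hash table once its first packet has
 * survived all the intermediate hooks. */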
512
513 /* Returns true if a connection corresponds to the tuple (required
514    for NAT). */
515 int
516 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
517                          const struct ip_conntrack *ignored_conntrack)
518 {
519         struct ip_conntrack_tuple_hash *h;
520
521         read_lock_bh(&ip_conntrack_lock);
522         h = __ip_conntrack_find(tuple, ignored_conntrack);
523         read_unlock_bh(&ip_conntrack_lock);
524
525         return h != NULL;
526 }
527
528 /* There's a small race here where we may free a just-assured
529    connection.  Too bad: we're in trouble anyway. */
530 static inline int unreplied(const struct ip_conntrack_tuple_hash *i)
531 {
532         return !(test_bit(IPS_ASSURED_BIT, &tuplehash_to_ctrack(i)->status));
533 }
534
535 static int early_drop(struct list_head *chain)
536 {
537         /* Traverse backwards: gives us oldest, which is roughly LRU */
538         struct ip_conntrack_tuple_hash *h;
539         struct ip_conntrack *ct = NULL;
540         int dropped = 0;
541
542         read_lock_bh(&ip_conntrack_lock);
543         h = LIST_FIND_B(chain, unreplied, struct ip_conntrack_tuple_hash *);
544         if (h) {
545                 ct = tuplehash_to_ctrack(h);
546                 atomic_inc(&ct->ct_general.use);
547         }
548         read_unlock_bh(&ip_conntrack_lock);
549
550         if (!ct)
551                 return dropped;
552
553         if (del_timer(&ct->timeout)) {
554                 death_by_timeout((unsigned long)ct);
555                 dropped = 1;
556                 CONNTRACK_STAT_INC(early_drop);
557         }
558         ip_conntrack_put(ct);
559         return dropped;
560 }
561
562 static inline int helper_cmp(const struct ip_conntrack_helper *i,
563                              const struct ip_conntrack_tuple *rtuple)
564 {
565         return ip_ct_tuple_mask_cmp(rtuple, &i->tuple, &i->mask);
566 }
567
568 static struct ip_conntrack_helper *
569 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
570 {
571         return LIST_FIND(&helpers, helper_cmp,
572                          struct ip_conntrack_helper *,
573                          tuple);
574 }
575
576 struct ip_conntrack_helper *
577 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
578 {
579         struct ip_conntrack_helper *helper;
580
581         /* need ip_conntrack_lock to ensure that the helper exists until
582          * try_module_get() is called */
583         read_lock_bh(&ip_conntrack_lock);
584
585         helper = __ip_conntrack_helper_find(tuple);
586         if (helper) {
587                 /* need to increase the module usage count to ensure the
588                  * helper will not go away while the caller is e.g. busy
589                  * putting a conntrack that uses it into the hash */
590                 if (!try_module_get(helper->me))
591                         helper = NULL;
592         }
593
594         read_unlock_bh(&ip_conntrack_lock);
595
596         return helper;
597 }
598
599 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
600 {
601         module_put(helper->me);
602 }
603
604 struct ip_conntrack_protocol *
605 __ip_conntrack_proto_find(u_int8_t protocol)
606 {
607         return ip_ct_protos[protocol];
608 }
609
610 /* this is guaranteed to always return a valid protocol helper, since
611  * it falls back to generic_protocol */
612 struct ip_conntrack_protocol *
613 ip_conntrack_proto_find_get(u_int8_t protocol)
614 {
615         struct ip_conntrack_protocol *p;
616
617         preempt_disable();
618         p = __ip_conntrack_proto_find(protocol);
619         if (p) {
620                 if (!try_module_get(p->me))
621                         p = &ip_conntrack_generic_protocol;
622         }
623         preempt_enable();
624         
625         return p;
626 }
627
628 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
629 {
630         module_put(p->me);
631 }
632
633 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
634                                         struct ip_conntrack_tuple *repl)
635 {
636         struct ip_conntrack *conntrack;
637
638         if (!ip_conntrack_hash_rnd_initted) {
639                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
640                 ip_conntrack_hash_rnd_initted = 1;
641         }
642
643         if (ip_conntrack_max
644             && atomic_read(&ip_conntrack_count) >= ip_conntrack_max) {
645                 unsigned int hash = hash_conntrack(orig);
646                 /* Try dropping from this hash chain. */
647                 if (!early_drop(&ip_conntrack_hash[hash])) {
648                         if (net_ratelimit())
649                                 printk(KERN_WARNING
650                                        "ip_conntrack: table full, dropping"
651                                        " packet.\n");
652                         return ERR_PTR(-ENOMEM);
653                 }
654         }
655
656         conntrack = kmem_cache_alloc(ip_conntrack_cachep, GFP_ATOMIC);
657         if (!conntrack) {
658                 DEBUGP("Can't allocate conntrack.\n");
659                 return ERR_PTR(-ENOMEM);
660         }
661
662         memset(conntrack, 0, sizeof(*conntrack));
663         atomic_set(&conntrack->ct_general.use, 1);
664         conntrack->ct_general.destroy = destroy_conntrack;
665         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
666         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
667         /* Don't set timer yet: wait for confirmation */
668         init_timer(&conntrack->timeout);
669         conntrack->timeout.data = (unsigned long)conntrack;
670         conntrack->timeout.function = death_by_timeout;
671
672         atomic_inc(&ip_conntrack_count);
673
674         return conntrack;
675 }
676
677 void
678 ip_conntrack_free(struct ip_conntrack *conntrack)
679 {
680         atomic_dec(&ip_conntrack_count);
681         kmem_cache_free(ip_conntrack_cachep, conntrack);
682 }
683
684 /* Allocate a new conntrack: we return -ENOMEM if classification
685  * failed due to stress.  Otherwise the packet really is unclassifiable */
686 static struct ip_conntrack_tuple_hash *
687 init_conntrack(struct ip_conntrack_tuple *tuple,
688                struct ip_conntrack_protocol *protocol,
689                struct sk_buff *skb)
690 {
691         struct ip_conntrack *conntrack;
692         struct ip_conntrack_tuple repl_tuple;
693         struct ip_conntrack_expect *exp;
694
695         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
696                 DEBUGP("Can't invert tuple.\n");
697                 return NULL;
698         }
699
700         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
701         if (conntrack == NULL || IS_ERR(conntrack))
702                 return (struct ip_conntrack_tuple_hash *)conntrack;
703
704         if (!protocol->new(conntrack, skb)) {
705                 ip_conntrack_free(conntrack);
706                 return NULL;
707         }
708
709         write_lock_bh(&ip_conntrack_lock);
710         exp = find_expectation(tuple);
711
712         if (exp) {
713                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
714                         conntrack, exp);
715                 /* Welcome, Mr. Bond.  We've been expecting you... */
716                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
717                 conntrack->master = exp->master;
718 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
719                 conntrack->mark = exp->master->mark;
720 #endif
721 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
722     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
723                 /* this is ugly, but there is no other place to put it */
724                 conntrack->nat.masq_index = exp->master->nat.masq_index;
725 #endif
726 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
727                 conntrack->secmark = exp->master->secmark;
728 #endif
729                 nf_conntrack_get(&conntrack->master->ct_general);
730                 CONNTRACK_STAT_INC(expect_new);
731         } else {
732                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
733
734                 CONNTRACK_STAT_INC(new);
735         }
736
737         /* Overload the tuple linked list to put us on the unconfirmed list. */
738         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
739
740         write_unlock_bh(&ip_conntrack_lock);
741
742         if (exp) {
743                 if (exp->expectfn)
744                         exp->expectfn(conntrack, exp);
745                 ip_conntrack_expect_put(exp);
746         }
747
748         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
749 }
750
751 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
752 static inline struct ip_conntrack *
753 resolve_normal_ct(struct sk_buff *skb,
754                   struct ip_conntrack_protocol *proto,
755                   int *set_reply,
756                   unsigned int hooknum,
757                   enum ip_conntrack_info *ctinfo)
758 {
759         struct ip_conntrack_tuple tuple;
760         struct ip_conntrack_tuple_hash *h;
761         struct ip_conntrack *ct;
762
763         IP_NF_ASSERT((skb->nh.iph->frag_off & htons(IP_OFFSET)) == 0);
764
765         if (!ip_ct_get_tuple(skb->nh.iph, skb, skb->nh.iph->ihl*4,
766                              &tuple, proto))
767                 return NULL;
768
769         /* look for tuple match */
770         h = ip_conntrack_find_get(&tuple, NULL);
771         if (!h) {
772                 h = init_conntrack(&tuple, proto, skb);
773                 if (!h)
774                         return NULL;
775                 if (IS_ERR(h))
776                         return (void *)h;
777         }
778         ct = tuplehash_to_ctrack(h);
779
780         /* It exists; we have (non-exclusive) reference. */
781         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
782                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
783                 /* Please set the reply bit if this packet is OK */
784                 *set_reply = 1;
785         } else {
786                 /* Once we've had two way comms, always ESTABLISHED. */
787                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
788                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
789                                ct);
790                         *ctinfo = IP_CT_ESTABLISHED;
791                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
792                         DEBUGP("ip_conntrack_in: related packet for %p\n",
793                                ct);
794                         *ctinfo = IP_CT_RELATED;
795                 } else {
796                         DEBUGP("ip_conntrack_in: new packet for %p\n",
797                                ct);
798                         *ctinfo = IP_CT_NEW;
799                 }
800                 *set_reply = 0;
801         }
802         skb->nfct = &ct->ct_general;
803         skb->nfctinfo = *ctinfo;
804         return ct;
805 }
806
807 /* Netfilter hook itself. */
808 unsigned int ip_conntrack_in(unsigned int hooknum,
809                              struct sk_buff **pskb,
810                              const struct net_device *in,
811                              const struct net_device *out,
812                              int (*okfn)(struct sk_buff *))
813 {
814         struct ip_conntrack *ct;
815         enum ip_conntrack_info ctinfo;
816         struct ip_conntrack_protocol *proto;
817         int set_reply = 0;
818         int ret;
819
820         /* Previously seen (loopback or untracked)?  Ignore. */
821         if ((*pskb)->nfct) {
822                 CONNTRACK_STAT_INC(ignore);
823                 return NF_ACCEPT;
824         }
825
826         /* Should never happen */
827         if ((*pskb)->nh.iph->frag_off & htons(IP_OFFSET)) {
828                 if (net_ratelimit()) {
829                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
830                                (*pskb)->nh.iph->protocol, hooknum);
831                 }
832                 return NF_DROP;
833         }
834
835 /* Doesn't cover locally-generated broadcast, so not worth it. */
836 #if 0
837         /* Ignore broadcast: no `connection'. */
838         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
839                 printk("Broadcast packet!\n");
840                 return NF_ACCEPT;
841         } else if (((*pskb)->nh.iph->daddr & htonl(0x000000FF)) 
842                    == htonl(0x000000FF)) {
843                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
844                        NIPQUAD((*pskb)->nh.iph->saddr),
845                        NIPQUAD((*pskb)->nh.iph->daddr),
846                        (*pskb)->sk, (*pskb)->pkt_type);
847         }
848 #endif
849
850         proto = __ip_conntrack_proto_find((*pskb)->nh.iph->protocol);
851
852         /* It may be a special packet: error, unclean...  The
853          * inverse of the return code tells the netfilter
854          * core what to do with the packet. */
855         if (proto->error != NULL 
856             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
857                 CONNTRACK_STAT_INC(error);
858                 CONNTRACK_STAT_INC(invalid);
859                 return -ret;
860         }
861
862         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
863                 /* Not valid part of a connection */
864                 CONNTRACK_STAT_INC(invalid);
865                 return NF_ACCEPT;
866         }
867
868         if (IS_ERR(ct)) {
869                 /* Too stressed to deal. */
870                 CONNTRACK_STAT_INC(drop);
871                 return NF_DROP;
872         }
873
874         IP_NF_ASSERT((*pskb)->nfct);
875
876         ret = proto->packet(ct, *pskb, ctinfo);
877         if (ret < 0) {
878                 /* Invalid: inverse of the return code tells
879                  * the netfilter core what to do */
880                 nf_conntrack_put((*pskb)->nfct);
881                 (*pskb)->nfct = NULL;
882                 CONNTRACK_STAT_INC(invalid);
883                 return -ret;
884         }
885
886         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
887                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
888
889         return ret;
890 }
891
892 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
893                    const struct ip_conntrack_tuple *orig)
894 {
895         return ip_ct_invert_tuple(inverse, orig, 
896                                   __ip_conntrack_proto_find(orig->dst.protonum));
897 }
898
899 /* Would two expected things clash? */
900 static inline int expect_clash(const struct ip_conntrack_expect *a,
901                                const struct ip_conntrack_expect *b)
902 {
903         /* Part covered by intersection of masks must be unequal,
904            otherwise they clash */
905         struct ip_conntrack_tuple intersect_mask
906                 = { { a->mask.src.ip & b->mask.src.ip,
907                       { a->mask.src.u.all & b->mask.src.u.all } },
908                     { a->mask.dst.ip & b->mask.dst.ip,
909                       { a->mask.dst.u.all & b->mask.dst.u.all },
910                       a->mask.dst.protonum & b->mask.dst.protonum } };
911
912         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
913 }
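/* Worked example: expectation A covers 10.0.0.1 -> 10.0.0.2 with the
 * destination port wildcarded (port mask 0); expectation B covers
 * 10.0.0.1 -> 10.0.0.2:2021 (port mask 0xFFFF).  The intersected port
 * mask is 0, under which both tuples compare equal, so A and B clash. */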
914
915 static inline int expect_matches(const struct ip_conntrack_expect *a,
916                                  const struct ip_conntrack_expect *b)
917 {
918         return a->master == b->master
919                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
920                 && ip_ct_tuple_equal(&a->mask, &b->mask);
921 }
922
923 /* Generally a bad idea to call this: could have matched already. */
924 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
925 {
926         struct ip_conntrack_expect *i;
927
928         write_lock_bh(&ip_conntrack_lock);
929         /* choose the oldest expectation to evict */
930         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
931                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
932                         ip_ct_unlink_expect(i);
933                         write_unlock_bh(&ip_conntrack_lock);
934                         ip_conntrack_expect_put(i);
935                         return;
936                 }
937         }
938         write_unlock_bh(&ip_conntrack_lock);
939 }
940
941 /* We don't increase the master conntrack refcount for unfulfilled
942  * expectations.  During conntrack destruction, the expectations are
943  * always killed before the conntrack itself */
944 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
945 {
946         struct ip_conntrack_expect *new;
947
948         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
949         if (!new) {
950                 DEBUGP("expect_related: OOM allocating expect\n");
951                 return NULL;
952         }
953         new->master = me;
954         atomic_set(&new->use, 1);
955         return new;
956 }
957
958 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
959 {
960         if (atomic_dec_and_test(&exp->use))
961                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
962 }
963
964 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
965 {
966         atomic_inc(&exp->use);
967         exp->master->expecting++;
968         list_add(&exp->list, &ip_conntrack_expect_list);
969
970         init_timer(&exp->timeout);
971         exp->timeout.data = (unsigned long)exp;
972         exp->timeout.function = expectation_timed_out;
973         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
974         add_timer(&exp->timeout);
975
976         exp->id = ++ip_conntrack_expect_next_id;
977         atomic_inc(&exp->use);
978         CONNTRACK_STAT_INC(expect_create);
979 }
980
981 /* Race with expectations being used means we could have none to find; OK. */
982 static void evict_oldest_expect(struct ip_conntrack *master)
983 {
984         struct ip_conntrack_expect *i;
985
986         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
987                 if (i->master == master) {
988                         if (del_timer(&i->timeout)) {
989                                 ip_ct_unlink_expect(i);
990                                 ip_conntrack_expect_put(i);
991                         }
992                         break;
993                 }
994         }
995 }
996
997 static inline int refresh_timer(struct ip_conntrack_expect *i)
998 {
999         if (!del_timer(&i->timeout))
1000                 return 0;
1001
1002         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
1003         add_timer(&i->timeout);
1004         return 1;
1005 }
1006
1007 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1008 {
1009         struct ip_conntrack_expect *i;
1010         int ret;
1011
1012         DEBUGP("ip_conntrack_expect_related %p\n", expect);
1013         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1014         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1015
1016         write_lock_bh(&ip_conntrack_lock);
1017         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1018                 if (expect_matches(i, expect)) {
1019                         /* Refresh timer: if it's dying, ignore. */
1020                         if (refresh_timer(i)) {
1021                                 ret = 0;
1022                                 goto out;
1023                         }
1024                 } else if (expect_clash(i, expect)) {
1025                         ret = -EBUSY;
1026                         goto out;
1027                 }
1028         }
1029
1030         /* Will we be over the limit? */
1031         if (expect->master->helper->max_expected && 
1032             expect->master->expecting >= expect->master->helper->max_expected)
1033                 evict_oldest_expect(expect->master);
1034
1035         ip_conntrack_expect_insert(expect);
1036         ip_conntrack_expect_event(IPEXP_NEW, expect);
1037         ret = 0;
1038 out:
1039         write_unlock_bh(&ip_conntrack_lock);
1040         return ret;
1041 }
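/* How a helper typically builds and registers an expectation, sketched
 * loosely after the FTP helper (illustrative, not compiled in;
 * "data_port" stands for whatever the helper parsed out of the
 * payload): */
#if 0
	struct ip_conntrack_expect *exp;
	int ret;

	exp = ip_conntrack_expect_alloc(ct);
	if (exp != NULL) {
		exp->tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple;
		exp->tuple.dst.u.tcp.port = htons(data_port);
		exp->mask.src.ip = 0xFFFFFFFF;
		exp->mask.src.u.all = 0;
		exp->mask.dst.ip = 0xFFFFFFFF;
		exp->mask.dst.u.tcp.port = 0xFFFF;
		exp->mask.dst.protonum = 0xFF;
		exp->expectfn = NULL;
		exp->flags = 0;
		ret = ip_conntrack_expect_related(exp);	/* 0 or -EBUSY */
		ip_conntrack_expect_put(exp);
	}
#endif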
1042
1043 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1044    implicitly racy: see __ip_conntrack_confirm */
1045 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1046                               const struct ip_conntrack_tuple *newreply)
1047 {
1048         write_lock_bh(&ip_conntrack_lock);
1049         /* Should be unconfirmed, so not in hash table yet */
1050         IP_NF_ASSERT(!is_confirmed(conntrack));
1051
1052         DEBUGP("Altering reply tuple of %p to ", conntrack);
1053         DUMP_TUPLE(newreply);
1054
1055         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1056         if (!conntrack->master && conntrack->expecting == 0)
1057                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1058         write_unlock_bh(&ip_conntrack_lock);
1059 }
1060
1061 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1062 {
1063         BUG_ON(me->timeout == 0);
1064         write_lock_bh(&ip_conntrack_lock);
1065         list_prepend(&helpers, me);
1066         write_unlock_bh(&ip_conntrack_lock);
1067
1068         return 0;
1069 }
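/* Registration sketch for a helper module (illustrative; every value
 * below is an assumption, not taken from a real helper).  Note that the
 * tuple/mask are matched against the *reply* tuple, so the service
 * port goes into src: */
#if 0
static struct ip_conntrack_helper example_helper = {
	.name		= "example",
	.me		= THIS_MODULE,
	.max_expected	= 1,
	.timeout	= 5 * 60,	/* expectation timeout in seconds */
	.tuple		= { .src = { .u = { .tcp = { .port = __constant_htons(1234) } } },
			    .dst = { .protonum = IPPROTO_TCP } },
	.mask		= { .src = { .u = { .tcp = { .port = 0xFFFF } } },
			    .dst = { .protonum = 0xFF } },
	.help		= example_help,	/* the packet-inspection callback */
};
	/* the module's init function then calls
	 * ip_conntrack_helper_register(&example_helper); */
#endif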
1070
1071 struct ip_conntrack_helper *
1072 __ip_conntrack_helper_find_byname(const char *name)
1073 {
1074         struct ip_conntrack_helper *h;
1075
1076         list_for_each_entry(h, &helpers, list) {
1077                 if (!strcmp(h->name, name))
1078                         return h;
1079         }
1080
1081         return NULL;
1082 }
1083
1084 static inline int unhelp(struct ip_conntrack_tuple_hash *i,
1085                          const struct ip_conntrack_helper *me)
1086 {
1087         if (tuplehash_to_ctrack(i)->helper == me) {
1088                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1089                 tuplehash_to_ctrack(i)->helper = NULL;
1090         }
1091         return 0;
1092 }
1093
1094 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1095 {
1096         unsigned int i;
1097         struct ip_conntrack_expect *exp, *tmp;
1098
1099         /* Need write lock here, to delete helper. */
1100         write_lock_bh(&ip_conntrack_lock);
1101         LIST_DELETE(&helpers, me);
1102
1103         /* Get rid of expectations */
1104         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1105                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1106                         ip_ct_unlink_expect(exp);
1107                         ip_conntrack_expect_put(exp);
1108                 }
1109         }
1110         /* Get rid of expecteds, set helpers to NULL. */
1111         LIST_FIND_W(&unconfirmed, unhelp, struct ip_conntrack_tuple_hash*, me);
1112         for (i = 0; i < ip_conntrack_htable_size; i++)
1113                 LIST_FIND_W(&ip_conntrack_hash[i], unhelp,
1114                             struct ip_conntrack_tuple_hash *, me);
1115         write_unlock_bh(&ip_conntrack_lock);
1116
1117         /* Someone could still be looking at the helper in a bh. */
1118         synchronize_net();
1119 }
1120
1121 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1122 void __ip_ct_refresh_acct(struct ip_conntrack *ct, 
1123                         enum ip_conntrack_info ctinfo,
1124                         const struct sk_buff *skb,
1125                         unsigned long extra_jiffies,
1126                         int do_acct)
1127 {
1128         int event = 0;
1129
1130         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1131         IP_NF_ASSERT(skb);
1132
1133         write_lock_bh(&ip_conntrack_lock);
1134
1135         /* Only update if this is not a fixed timeout */
1136         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1137                 write_unlock_bh(&ip_conntrack_lock);
1138                 return;
1139         }
1140
1141         /* If not in hash table, timer will not be active yet */
1142         if (!is_confirmed(ct)) {
1143                 ct->timeout.expires = extra_jiffies;
1144                 event = IPCT_REFRESH;
1145         } else {
1146                 /* Need del_timer for race avoidance (may already be dying). */
1147                 if (del_timer(&ct->timeout)) {
1148                         ct->timeout.expires = jiffies + extra_jiffies;
1149                         add_timer(&ct->timeout);
1150                         event = IPCT_REFRESH;
1151                 }
1152         }
1153
1154 #ifdef CONFIG_IP_NF_CT_ACCT
1155         if (do_acct) {
1156                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1157                 ct->counters[CTINFO2DIR(ctinfo)].bytes += 
1158                                                 ntohs(skb->nh.iph->tot_len);
1159                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1160                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1161                         event |= IPCT_COUNTER_FILLING;
1162         }
1163 #endif
1164
1165         write_unlock_bh(&ip_conntrack_lock);
1166
1167         /* must be unlocked when calling event cache */
1168         if (event)
1169                 ip_conntrack_event_cache(event, skb);
1170 }
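/* Typical call site in a protocol module, sketched (assumes the
 * ip_ct_refresh_acct() wrapper from ip_conntrack.h, which passes
 * do_acct = 1, and the tcp module's tunable established timeout): */
#if 0
	ip_ct_refresh_acct(ct, ctinfo, skb, ip_ct_tcp_timeout_established);
#endif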
1171
1172 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1173     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1174 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1175  * in ip_conntrack_core, since we don't want the protocols to autoload
1176  * or depend on ctnetlink */
1177 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1178                                const struct ip_conntrack_tuple *tuple)
1179 {
1180         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(u_int16_t),
1181                 &tuple->src.u.tcp.port);
1182         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(u_int16_t),
1183                 &tuple->dst.u.tcp.port);
1184         return 0;
1185
1186 nfattr_failure:
1187         return -1;
1188 }
1189
1190 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1191                                struct ip_conntrack_tuple *t)
1192 {
1193         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1194                 return -EINVAL;
1195
1196         t->src.u.tcp.port =
1197                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1198         t->dst.u.tcp.port =
1199                 *(u_int16_t *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1200
1201         return 0;
1202 }
1203 #endif
1204
1205 /* Returns new sk_buff, or NULL */
1206 struct sk_buff *
1207 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1208 {
1209         skb_orphan(skb);
1210
1211         local_bh_disable(); 
1212         skb = ip_defrag(skb, user);
1213         local_bh_enable();
1214
1215         if (skb)
1216                 ip_send_check(skb->nh.iph);
1217         return skb;
1218 }
1219
1220 /* Used by ipt_REJECT. */
1221 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1222 {
1223         struct ip_conntrack *ct;
1224         enum ip_conntrack_info ctinfo;
1225
1226         /* This ICMP is in reverse direction to the packet which caused it */
1227         ct = ip_conntrack_get(skb, &ctinfo);
1228         
1229         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1230                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1231         else
1232                 ctinfo = IP_CT_RELATED;
1233
1234         /* Attach to new skbuff, and increment count */
1235         nskb->nfct = &ct->ct_general;
1236         nskb->nfctinfo = ctinfo;
1237         nf_conntrack_get(nskb->nfct);
1238 }
1239
1240 static inline int
1241 do_iter(const struct ip_conntrack_tuple_hash *i,
1242         int (*iter)(struct ip_conntrack *i, void *data),
1243         void *data)
1244 {
1245         return iter(tuplehash_to_ctrack(i), data);
1246 }
1247
1248 /* Bring out ya dead! */
1249 static struct ip_conntrack_tuple_hash *
1250 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1251                 void *data, unsigned int *bucket)
1252 {
1253         struct ip_conntrack_tuple_hash *h = NULL;
1254
1255         write_lock_bh(&ip_conntrack_lock);
1256         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1257                 h = LIST_FIND_W(&ip_conntrack_hash[*bucket], do_iter,
1258                                 struct ip_conntrack_tuple_hash *, iter, data);
1259                 if (h)
1260                         break;
1261         }
1262         if (!h)
1263                 h = LIST_FIND_W(&unconfirmed, do_iter,
1264                                 struct ip_conntrack_tuple_hash *, iter, data);
1265         if (h)
1266                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
1267         write_unlock_bh(&ip_conntrack_lock);
1268
1269         return h;
1270 }
1271
1272 void
1273 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1274 {
1275         struct ip_conntrack_tuple_hash *h;
1276         unsigned int bucket = 0;
1277
1278         while ((h = get_next_corpse(iter, data, &bucket)) != NULL) {
1279                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1280                 /* Time to push up daisies... */
1281                 if (del_timer(&ct->timeout))
1282                         death_by_timeout((unsigned long)ct);
1283                 /* ... else the timer will get him soon. */
1284
1285                 ip_conntrack_put(ct);
1286         }
1287 }
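/* Iterator sketch (illustrative, not compiled in): the callback returns
 * nonzero for every entry that should die, so flushing one protocol
 * could look like this: */
#if 0
static int kill_proto(struct ip_conntrack *i, void *data)
{
	return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum ==
		*(u_int8_t *)data;
}
	/* then:  u_int8_t proto = IPPROTO_UDP;
	 *        ip_ct_iterate_cleanup(kill_proto, &proto); */
#endif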
1288
1289 /* Fast function for those who don't want to parse /proc (and I don't
1290    blame them). */
1291 /* Reversing the socket's dst/src point of view gives us the reply
1292    mapping. */
1293 static int
1294 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1295 {
1296         struct inet_sock *inet = inet_sk(sk);
1297         struct ip_conntrack_tuple_hash *h;
1298         struct ip_conntrack_tuple tuple;
1299         
1300         IP_CT_TUPLE_U_BLANK(&tuple);
1301         tuple.src.ip = inet->rcv_saddr;
1302         tuple.src.u.tcp.port = inet->sport;
1303         tuple.dst.ip = inet->daddr;
1304         tuple.dst.u.tcp.port = inet->dport;
1305         tuple.dst.protonum = IPPROTO_TCP;
1306
1307         /* We only do TCP at the moment: is there a better way? */
1308         if (strcmp(sk->sk_prot->name, "TCP")) {
1309                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1310                 return -ENOPROTOOPT;
1311         }
1312
1313         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1314                 DEBUGP("SO_ORIGINAL_DST: len %u not %Zu\n",
1315                        *len, sizeof(struct sockaddr_in));
1316                 return -EINVAL;
1317         }
1318
1319         h = ip_conntrack_find_get(&tuple, NULL);
1320         if (h) {
1321                 struct sockaddr_in sin;
1322                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1323
1324                 sin.sin_family = AF_INET;
1325                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1326                         .tuple.dst.u.tcp.port;
1327                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1328                         .tuple.dst.ip;
1329                 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1330
1331                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1332                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1333                 ip_conntrack_put(ct);
1334                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1335                         return -EFAULT;
1336                 else
1337                         return 0;
1338         }
1339         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1340                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1341                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1342         return -ENOENT;
1343 }
1344
1345 static struct nf_sockopt_ops so_getorigdst = {
1346         .pf             = PF_INET,
1347         .get_optmin     = SO_ORIGINAL_DST,
1348         .get_optmax     = SO_ORIGINAL_DST+1,
1349         .get            = &getorigdst,
1350 };
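/* From userspace this is the classic transparent-proxy query (sketch,
 * error handling omitted):
 *
 *	struct sockaddr_in dst;
 *	socklen_t len = sizeof(dst);
 *
 *	getsockopt(fd, SOL_IP, SO_ORIGINAL_DST, &dst, &len);
 *
 * after which dst holds the pre-NAT destination address and port. */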
1351
1352 static int kill_all(struct ip_conntrack *i, void *data)
1353 {
1354         return 1;
1355 }
1356
1357 void ip_conntrack_flush(void)
1358 {
1359         ip_ct_iterate_cleanup(kill_all, NULL);
1360 }
1361
1362 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1363 {
1364         if (vmalloced)
1365                 vfree(hash);
1366         else
1367                 free_pages((unsigned long)hash, 
1368                            get_order(sizeof(struct list_head) * size));
1369 }
1370
1371 /* Mishearing the voices in his head, our hero wonders how he's
1372    supposed to kill the mall. */
1373 void ip_conntrack_cleanup(void)
1374 {
1375         ip_ct_attach = NULL;
1376
1377         /* This makes sure all current packets have passed through
1378            the netfilter framework.  Roll on, two-stage module
1379            delete... */
1380         synchronize_net();
1381
1382         ip_ct_event_cache_flush();
1383  i_see_dead_people:
1384         ip_conntrack_flush();
1385         if (atomic_read(&ip_conntrack_count) != 0) {
1386                 schedule();
1387                 goto i_see_dead_people;
1388         }
1389         /* wait until all references to ip_conntrack_untracked are dropped */
1390         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1391                 schedule();
1392
1393         kmem_cache_destroy(ip_conntrack_cachep);
1394         kmem_cache_destroy(ip_conntrack_expect_cachep);
1395         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1396                             ip_conntrack_htable_size);
1397         nf_unregister_sockopt(&so_getorigdst);
1398 }
1399
1400 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1401 {
1402         struct list_head *hash;
1403         unsigned int i;
1404
1405         *vmalloced = 0; 
1406         hash = (void*)__get_free_pages(GFP_KERNEL, 
1407                                        get_order(sizeof(struct list_head)
1408                                                  * size));
1409         if (!hash) { 
1410                 *vmalloced = 1;
1411                 printk(KERN_WARNING "ip_conntrack: falling back to vmalloc.\n");
1412                 hash = vmalloc(sizeof(struct list_head) * size);
1413         }
1414
1415         if (hash)
1416                 for (i = 0; i < size; i++)
1417                         INIT_LIST_HEAD(&hash[i]);
1418
1419         return hash;
1420 }
1421
1422 static int set_hashsize(const char *val, struct kernel_param *kp)
1423 {
1424         int i, bucket, hashsize, vmalloced;
1425         int old_vmalloced, old_size;
1426         int rnd;
1427         struct list_head *hash, *old_hash;
1428         struct ip_conntrack_tuple_hash *h;
1429
1430         /* On boot, we can set this without any fancy locking. */
1431         if (!ip_conntrack_htable_size)
1432                 return param_set_int(val, kp);
1433
1434         hashsize = simple_strtol(val, NULL, 0);
1435         if (!hashsize)
1436                 return -EINVAL;
1437
1438         hash = alloc_hashtable(hashsize, &vmalloced);
1439         if (!hash)
1440                 return -ENOMEM;
1441
1442         /* We have to rehash for the new table anyway, so we can also
1443          * use a new random seed */
1444         get_random_bytes(&rnd, 4);
1445
1446         write_lock_bh(&ip_conntrack_lock);
1447         for (i = 0; i < ip_conntrack_htable_size; i++) {
1448                 while (!list_empty(&ip_conntrack_hash[i])) {
1449                         h = list_entry(ip_conntrack_hash[i].next,
1450                                        struct ip_conntrack_tuple_hash, list);
1451                         list_del(&h->list);
1452                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1453                         list_add_tail(&h->list, &hash[bucket]);
1454                 }
1455         }
1456         old_size = ip_conntrack_htable_size;
1457         old_vmalloced = ip_conntrack_vmalloc;
1458         old_hash = ip_conntrack_hash;
1459
1460         ip_conntrack_htable_size = hashsize;
1461         ip_conntrack_vmalloc = vmalloced;
1462         ip_conntrack_hash = hash;
1463         ip_conntrack_hash_rnd = rnd;
1464         write_unlock_bh(&ip_conntrack_lock);
1465
1466         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1467         return 0;
1468 }
1469
1470 module_param_call(hashsize, set_hashsize, param_get_uint,
1471                   &ip_conntrack_htable_size, 0600);
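/* Because the parameter is writable (mode 0600), the table can be
 * resized at runtime; with the usual sysfs module-parameter layout
 * that is something like (illustrative):
 *
 *	echo 16384 > /sys/module/ip_conntrack/parameters/hashsize
 */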
1472
1473 int __init ip_conntrack_init(void)
1474 {
1475         unsigned int i;
1476         int ret;
1477
1478         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1479          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1480         if (!ip_conntrack_htable_size) {
1481                 ip_conntrack_htable_size
1482                         = (((num_physpages << PAGE_SHIFT) / 16384)
1483                            / sizeof(struct list_head));
1484                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1485                         ip_conntrack_htable_size = 8192;
1486                 if (ip_conntrack_htable_size < 16)
1487                         ip_conntrack_htable_size = 16;
1488         }
1489         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1490
1491         printk("ip_conntrack version %s (%u buckets, %d max)"
1492                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1493                ip_conntrack_htable_size, ip_conntrack_max,
1494                sizeof(struct ip_conntrack));
1495
1496         ret = nf_register_sockopt(&so_getorigdst);
1497         if (ret != 0) {
1498                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1499                 return ret;
1500         }
1501
1502         ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1503                                             &ip_conntrack_vmalloc);
1504         if (!ip_conntrack_hash) {
1505                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1506                 goto err_unreg_sockopt;
1507         }
1508
1509         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1510                                                 sizeof(struct ip_conntrack), 0,
1511                                                 0, NULL, NULL);
1512         if (!ip_conntrack_cachep) {
1513                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1514                 goto err_free_hash;
1515         }
1516
1517         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1518                                         sizeof(struct ip_conntrack_expect),
1519                                         0, 0, NULL, NULL);
1520         if (!ip_conntrack_expect_cachep) {
1521                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1522                 goto err_free_conntrack_slab;
1523         }
1524
1525         /* Don't NEED lock here, but good form anyway. */
1526         write_lock_bh(&ip_conntrack_lock);
1527         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1528                 ip_ct_protos[i] = &ip_conntrack_generic_protocol;
1529         /* Sew in builtin protocols. */
1530         ip_ct_protos[IPPROTO_TCP] = &ip_conntrack_protocol_tcp;
1531         ip_ct_protos[IPPROTO_UDP] = &ip_conntrack_protocol_udp;
1532         ip_ct_protos[IPPROTO_ICMP] = &ip_conntrack_protocol_icmp;
1533         write_unlock_bh(&ip_conntrack_lock);
1534
1535         /* For use by ipt_REJECT */
1536         ip_ct_attach = ip_conntrack_attach;
1537
1538         /* Set up fake conntrack:
1539             - to never be deleted, not in any hashes */
1540         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1541         /*  - and make it look like a confirmed connection */
1542         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1543
1544         return ret;
1545
1546 err_free_conntrack_slab:
1547         kmem_cache_destroy(ip_conntrack_cachep);
1548 err_free_hash:
1549         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1550                             ip_conntrack_htable_size);
1551 err_unreg_sockopt:
1552         nf_unregister_sockopt(&so_getorigdst);
1553
1554         return -ENOMEM;
1555 }