[SK_BUFF]: Introduce ip_hdr(), remove skb->nh.iph
[safe/jmp/linux-2.6] net/ipv4/netfilter/ip_conntrack_core.c
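
The file below has been converted from direct skb->nh.iph dereferences to the
ip_hdr() accessor (see e.g. resolve_normal_ct() and ip_conntrack_in()).  A
minimal before/after sketch of the pattern; the local variables are
illustrative only, the accessors come from the generic IP headers already
included by this file:

	/* before: reach into sk_buff's header-pointer union directly */
	struct iphdr *iph = skb->nh.iph;

	/* after: go through the accessor; ip_hdrlen(skb) builds on it */
	struct iphdr *iph = ip_hdr(skb);
	unsigned int ihl = ip_hdrlen(skb);	/* header length in bytes */
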
1 /* Connection state tracking for netfilter.  This is separated from,
2    but required by, the NAT layer; it can also be used by an iptables
3    extension. */
4
5 /* (C) 1999-2001 Paul `Rusty' Russell
6  * (C) 2002-2004 Netfilter Core Team <coreteam@netfilter.org>
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License version 2 as
10  * published by the Free Software Foundation.
11  *
12  * 23 Apr 2001: Harald Welte <laforge@gnumonks.org>
13  *      - new API and handling of conntrack/nat helpers
14  *      - now capable of multiple expectations for one master
15  * 16 Jul 2002: Harald Welte <laforge@gnumonks.org>
16  *      - add usage/reference counts to ip_conntrack_expect
17  *      - export ip_conntrack[_expect]_{find_get,put} functions
18  * */
19
20 #include <linux/types.h>
21 #include <linux/icmp.h>
22 #include <linux/ip.h>
23 #include <linux/netfilter.h>
24 #include <linux/netfilter_ipv4.h>
25 #include <linux/module.h>
26 #include <linux/skbuff.h>
27 #include <linux/proc_fs.h>
28 #include <linux/vmalloc.h>
29 #include <net/checksum.h>
30 #include <net/ip.h>
31 #include <linux/stddef.h>
32 #include <linux/sysctl.h>
33 #include <linux/slab.h>
34 #include <linux/random.h>
35 #include <linux/jhash.h>
36 #include <linux/err.h>
37 #include <linux/percpu.h>
38 #include <linux/moduleparam.h>
39 #include <linux/notifier.h>
40
41 /* ip_conntrack_lock protects the main hash table, protocol/helper/expected
42    registrations, conntrack timers*/
43 #include <linux/netfilter_ipv4/ip_conntrack.h>
44 #include <linux/netfilter_ipv4/ip_conntrack_protocol.h>
45 #include <linux/netfilter_ipv4/ip_conntrack_helper.h>
46 #include <linux/netfilter_ipv4/ip_conntrack_core.h>
47
48 #define IP_CONNTRACK_VERSION    "2.4"
49
50 #if 0
51 #define DEBUGP printk
52 #else
53 #define DEBUGP(format, args...)
54 #endif
55
56 DEFINE_RWLOCK(ip_conntrack_lock);
57
58 /* ip_conntrack_standalone needs this */
59 atomic_t ip_conntrack_count = ATOMIC_INIT(0);
60
61 void (*ip_conntrack_destroyed)(struct ip_conntrack *conntrack) = NULL;
62 LIST_HEAD(ip_conntrack_expect_list);
63 struct ip_conntrack_protocol *ip_ct_protos[MAX_IP_CT_PROTO] __read_mostly;
64 static LIST_HEAD(helpers);
65 unsigned int ip_conntrack_htable_size __read_mostly = 0;
66 int ip_conntrack_max __read_mostly;
67 struct list_head *ip_conntrack_hash __read_mostly;
68 static struct kmem_cache *ip_conntrack_cachep __read_mostly;
69 static struct kmem_cache *ip_conntrack_expect_cachep __read_mostly;
70 struct ip_conntrack ip_conntrack_untracked;
71 unsigned int ip_ct_log_invalid __read_mostly;
72 static LIST_HEAD(unconfirmed);
73 static int ip_conntrack_vmalloc __read_mostly;
74
75 static unsigned int ip_conntrack_next_id;
76 static unsigned int ip_conntrack_expect_next_id;
77 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
78 ATOMIC_NOTIFIER_HEAD(ip_conntrack_chain);
79 ATOMIC_NOTIFIER_HEAD(ip_conntrack_expect_chain);
80
81 DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
82
83 /* deliver cached events and clear cache entry - must be called with locally
84  * disabled softirqs */
85 static inline void
86 __ip_ct_deliver_cached_events(struct ip_conntrack_ecache *ecache)
87 {
88         DEBUGP("ecache: delivering events for %p\n", ecache->ct);
89         if (is_confirmed(ecache->ct) && !is_dying(ecache->ct) && ecache->events)
90                 atomic_notifier_call_chain(&ip_conntrack_chain, ecache->events,
91                                     ecache->ct);
92         ecache->events = 0;
93         ip_conntrack_put(ecache->ct);
94         ecache->ct = NULL;
95 }
96
97 /* Deliver all cached events for a particular conntrack. This is called
98  * by code prior to async packet handling or freeing the skb */
99 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
100 {
101         struct ip_conntrack_ecache *ecache;
102
103         local_bh_disable();
104         ecache = &__get_cpu_var(ip_conntrack_ecache);
105         if (ecache->ct == ct)
106                 __ip_ct_deliver_cached_events(ecache);
107         local_bh_enable();
108 }
109
110 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
111 {
112         struct ip_conntrack_ecache *ecache;
113
114         /* take care of delivering potentially old events */
115         ecache = &__get_cpu_var(ip_conntrack_ecache);
116         BUG_ON(ecache->ct == ct);
117         if (ecache->ct)
118                 __ip_ct_deliver_cached_events(ecache);
119         /* initialize for this conntrack/packet */
120         ecache->ct = ct;
121         nf_conntrack_get(&ct->ct_general);
122 }
123
124 /* flush the event cache - touches other CPUs' data and must not be called while
125  * packets are still passing through the code */
126 static void ip_ct_event_cache_flush(void)
127 {
128         struct ip_conntrack_ecache *ecache;
129         int cpu;
130
131         for_each_possible_cpu(cpu) {
132                 ecache = &per_cpu(ip_conntrack_ecache, cpu);
133                 if (ecache->ct)
134                         ip_conntrack_put(ecache->ct);
135         }
136 }
137 #else
138 static inline void ip_ct_event_cache_flush(void) {}
139 #endif /* CONFIG_IP_NF_CONNTRACK_EVENTS */
140
141 DEFINE_PER_CPU(struct ip_conntrack_stat, ip_conntrack_stat);
142
143 static int ip_conntrack_hash_rnd_initted;
144 static unsigned int ip_conntrack_hash_rnd;
145
146 static u_int32_t __hash_conntrack(const struct ip_conntrack_tuple *tuple,
147                             unsigned int size, unsigned int rnd)
148 {
149         return (jhash_3words((__force u32)tuple->src.ip,
150                              ((__force u32)tuple->dst.ip ^ tuple->dst.protonum),
151                              (tuple->src.u.all | (tuple->dst.u.all << 16)),
152                              rnd) % size);
153 }
154
155 static u_int32_t
156 hash_conntrack(const struct ip_conntrack_tuple *tuple)
157 {
158         return __hash_conntrack(tuple, ip_conntrack_htable_size,
159                                 ip_conntrack_hash_rnd);
160 }
161
162 int
163 ip_ct_get_tuple(const struct iphdr *iph,
164                 const struct sk_buff *skb,
165                 unsigned int dataoff,
166                 struct ip_conntrack_tuple *tuple,
167                 const struct ip_conntrack_protocol *protocol)
168 {
169         /* Should never happen */
170         if (iph->frag_off & htons(IP_OFFSET)) {
171                 printk("ip_conntrack_core: Frag of proto %u.\n",
172                        iph->protocol);
173                 return 0;
174         }
175
176         tuple->src.ip = iph->saddr;
177         tuple->dst.ip = iph->daddr;
178         tuple->dst.protonum = iph->protocol;
179         tuple->dst.dir = IP_CT_DIR_ORIGINAL;
180
181         return protocol->pkt_to_tuple(skb, dataoff, tuple);
182 }
183
184 int
185 ip_ct_invert_tuple(struct ip_conntrack_tuple *inverse,
186                    const struct ip_conntrack_tuple *orig,
187                    const struct ip_conntrack_protocol *protocol)
188 {
189         inverse->src.ip = orig->dst.ip;
190         inverse->dst.ip = orig->src.ip;
191         inverse->dst.protonum = orig->dst.protonum;
192         inverse->dst.dir = !orig->dst.dir;
193
194         return protocol->invert_tuple(inverse, orig);
195 }
196
197
198 /* ip_conntrack_expect helper functions */
199 void ip_ct_unlink_expect(struct ip_conntrack_expect *exp)
200 {
201         IP_NF_ASSERT(!timer_pending(&exp->timeout));
202         list_del(&exp->list);
203         CONNTRACK_STAT_INC(expect_delete);
204         exp->master->expecting--;
205         ip_conntrack_expect_put(exp);
206 }
207
208 static void expectation_timed_out(unsigned long ul_expect)
209 {
210         struct ip_conntrack_expect *exp = (void *)ul_expect;
211
212         write_lock_bh(&ip_conntrack_lock);
213         ip_ct_unlink_expect(exp);
214         write_unlock_bh(&ip_conntrack_lock);
215         ip_conntrack_expect_put(exp);
216 }
217
218 struct ip_conntrack_expect *
219 __ip_conntrack_expect_find(const struct ip_conntrack_tuple *tuple)
220 {
221         struct ip_conntrack_expect *i;
222
223         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
224                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask))
225                         return i;
226         }
227         return NULL;
228 }
229
230 /* Just find an expectation corresponding to a tuple. */
231 struct ip_conntrack_expect *
232 ip_conntrack_expect_find_get(const struct ip_conntrack_tuple *tuple)
233 {
234         struct ip_conntrack_expect *i;
235
236         read_lock_bh(&ip_conntrack_lock);
237         i = __ip_conntrack_expect_find(tuple);
238         if (i)
239                 atomic_inc(&i->use);
240         read_unlock_bh(&ip_conntrack_lock);
241
242         return i;
243 }
244
245 /* If an expectation for this connection is found, it gets deleted from
246  * the global list, then returned. */
247 static struct ip_conntrack_expect *
248 find_expectation(const struct ip_conntrack_tuple *tuple)
249 {
250         struct ip_conntrack_expect *i;
251
252         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
253                 /* If master is not in hash table yet (i.e. packet hasn't left
254                    this machine yet), how can the other end know about the expectation?
255                    Hence these are not the droids you are looking for (if
256                    master ct never got confirmed, we'd hold a reference to it
257                    and weird things would happen to future packets). */
258                 if (ip_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask)
259                     && is_confirmed(i->master)) {
260                         if (i->flags & IP_CT_EXPECT_PERMANENT) {
261                                 atomic_inc(&i->use);
262                                 return i;
263                         } else if (del_timer(&i->timeout)) {
264                                 ip_ct_unlink_expect(i);
265                                 return i;
266                         }
267                 }
268         }
269         return NULL;
270 }
271
272 /* delete all expectations for this conntrack */
273 void ip_ct_remove_expectations(struct ip_conntrack *ct)
274 {
275         struct ip_conntrack_expect *i, *tmp;
276
277         /* Optimization: most connections never expect any others. */
278         if (ct->expecting == 0)
279                 return;
280
281         list_for_each_entry_safe(i, tmp, &ip_conntrack_expect_list, list) {
282                 if (i->master == ct && del_timer(&i->timeout)) {
283                         ip_ct_unlink_expect(i);
284                         ip_conntrack_expect_put(i);
285                 }
286         }
287 }
288
289 static void
290 clean_from_lists(struct ip_conntrack *ct)
291 {
292         DEBUGP("clean_from_lists(%p)\n", ct);
293         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
294         list_del(&ct->tuplehash[IP_CT_DIR_REPLY].list);
295
296         /* Destroy all pending expectations */
297         ip_ct_remove_expectations(ct);
298 }
299
300 static void
301 destroy_conntrack(struct nf_conntrack *nfct)
302 {
303         struct ip_conntrack *ct = (struct ip_conntrack *)nfct;
304         struct ip_conntrack_protocol *proto;
305         struct ip_conntrack_helper *helper;
306         typeof(ip_conntrack_destroyed) destroyed;
307
308         DEBUGP("destroy_conntrack(%p)\n", ct);
309         IP_NF_ASSERT(atomic_read(&nfct->use) == 0);
310         IP_NF_ASSERT(!timer_pending(&ct->timeout));
311
312         ip_conntrack_event(IPCT_DESTROY, ct);
313         set_bit(IPS_DYING_BIT, &ct->status);
314
315         helper = ct->helper;
316         if (helper && helper->destroy)
317                 helper->destroy(ct);
318
319         /* To make sure we don't get any weird locking issues here:
320          * destroy_conntrack() MUST NOT be called with a write lock
321          * to ip_conntrack_lock!!! -HW */
322         rcu_read_lock();
323         proto = __ip_conntrack_proto_find(ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.protonum);
324         if (proto && proto->destroy)
325                 proto->destroy(ct);
326
327         destroyed = rcu_dereference(ip_conntrack_destroyed);
328         if (destroyed)
329                 destroyed(ct);
330
331         rcu_read_unlock();
332
333         write_lock_bh(&ip_conntrack_lock);
334         /* Expectations will have been removed in clean_from_lists,
335          * except TFTP can create an expectation on the first packet,
336          * before connection is in the list, so we need to clean here,
337          * too. */
338         ip_ct_remove_expectations(ct);
339
340         /* We overload first tuple to link into unconfirmed list. */
341         if (!is_confirmed(ct)) {
342                 BUG_ON(list_empty(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list));
343                 list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
344         }
345
346         CONNTRACK_STAT_INC(delete);
347         write_unlock_bh(&ip_conntrack_lock);
348
349         if (ct->master)
350                 ip_conntrack_put(ct->master);
351
352         DEBUGP("destroy_conntrack: returning ct=%p to slab\n", ct);
353         ip_conntrack_free(ct);
354 }
355
356 static void death_by_timeout(unsigned long ul_conntrack)
357 {
358         struct ip_conntrack *ct = (void *)ul_conntrack;
359
360         write_lock_bh(&ip_conntrack_lock);
361         /* Inside lock so preempt is disabled on module removal path.
362          * Otherwise we can get spurious warnings. */
363         CONNTRACK_STAT_INC(delete_list);
364         clean_from_lists(ct);
365         write_unlock_bh(&ip_conntrack_lock);
366         ip_conntrack_put(ct);
367 }
368
369 struct ip_conntrack_tuple_hash *
370 __ip_conntrack_find(const struct ip_conntrack_tuple *tuple,
371                     const struct ip_conntrack *ignored_conntrack)
372 {
373         struct ip_conntrack_tuple_hash *h;
374         unsigned int hash = hash_conntrack(tuple);
375
376         list_for_each_entry(h, &ip_conntrack_hash[hash], list) {
377                 if (tuplehash_to_ctrack(h) != ignored_conntrack &&
378                     ip_ct_tuple_equal(tuple, &h->tuple)) {
379                         CONNTRACK_STAT_INC(found);
380                         return h;
381                 }
382                 CONNTRACK_STAT_INC(searched);
383         }
384
385         return NULL;
386 }
387
388 /* Find a connection corresponding to a tuple. */
389 struct ip_conntrack_tuple_hash *
390 ip_conntrack_find_get(const struct ip_conntrack_tuple *tuple,
391                       const struct ip_conntrack *ignored_conntrack)
392 {
393         struct ip_conntrack_tuple_hash *h;
394
395         read_lock_bh(&ip_conntrack_lock);
396         h = __ip_conntrack_find(tuple, ignored_conntrack);
397         if (h)
398                 atomic_inc(&tuplehash_to_ctrack(h)->ct_general.use);
399         read_unlock_bh(&ip_conntrack_lock);
400
401         return h;
402 }
403
404 static void __ip_conntrack_hash_insert(struct ip_conntrack *ct,
405                                         unsigned int hash,
406                                         unsigned int repl_hash)
407 {
408         ct->id = ++ip_conntrack_next_id;
409         list_add(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list,
410                  &ip_conntrack_hash[hash]);
411         list_add(&ct->tuplehash[IP_CT_DIR_REPLY].list,
412                  &ip_conntrack_hash[repl_hash]);
413 }
414
415 void ip_conntrack_hash_insert(struct ip_conntrack *ct)
416 {
417         unsigned int hash, repl_hash;
418
419         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
420         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
421
422         write_lock_bh(&ip_conntrack_lock);
423         __ip_conntrack_hash_insert(ct, hash, repl_hash);
424         write_unlock_bh(&ip_conntrack_lock);
425 }
426
427 /* Confirm a connection given skb; places it in hash table */
428 int
429 __ip_conntrack_confirm(struct sk_buff **pskb)
430 {
431         unsigned int hash, repl_hash;
432         struct ip_conntrack_tuple_hash *h;
433         struct ip_conntrack *ct;
434         enum ip_conntrack_info ctinfo;
435
436         ct = ip_conntrack_get(*pskb, &ctinfo);
437
438         /* ipt_REJECT uses ip_conntrack_attach to attach related
439            ICMP/TCP RST packets in other direction.  Actual packet
440            which created connection will be IP_CT_NEW or for an
441            expected connection, IP_CT_RELATED. */
442         if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
443                 return NF_ACCEPT;
444
445         hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple);
446         repl_hash = hash_conntrack(&ct->tuplehash[IP_CT_DIR_REPLY].tuple);
447
448         /* We're not in hash table, and we refuse to set up related
449            connections for unconfirmed conns.  But packet copies and
450            REJECT will give spurious warnings here. */
451         /* IP_NF_ASSERT(atomic_read(&ct->ct_general.use) == 1); */
452
453         /* No external references means no one else could have
454            confirmed us. */
455         IP_NF_ASSERT(!is_confirmed(ct));
456         DEBUGP("Confirming conntrack %p\n", ct);
457
458         write_lock_bh(&ip_conntrack_lock);
459
460         /* See if there's one in the list already, including reverse:
461            NAT could have grabbed it without realizing, since we're
462            not in the hash.  If there is, we lost the race. */
463         list_for_each_entry(h, &ip_conntrack_hash[hash], list)
464                 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple,
465                                       &h->tuple))
466                         goto out;
467         list_for_each_entry(h, &ip_conntrack_hash[repl_hash], list)
468                 if (ip_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple,
469                                       &h->tuple))
470                         goto out;
471
472         /* Remove from unconfirmed list */
473         list_del(&ct->tuplehash[IP_CT_DIR_ORIGINAL].list);
474
475         __ip_conntrack_hash_insert(ct, hash, repl_hash);
476         /* Timer relative to confirmation time, not original
477            setting time, otherwise we'd get timer wrap in
478            weird delay cases. */
479         ct->timeout.expires += jiffies;
480         add_timer(&ct->timeout);
481         atomic_inc(&ct->ct_general.use);
482         set_bit(IPS_CONFIRMED_BIT, &ct->status);
483         CONNTRACK_STAT_INC(insert);
484         write_unlock_bh(&ip_conntrack_lock);
485         if (ct->helper)
486                 ip_conntrack_event_cache(IPCT_HELPER, *pskb);
487 #ifdef CONFIG_IP_NF_NAT_NEEDED
488         if (test_bit(IPS_SRC_NAT_DONE_BIT, &ct->status) ||
489             test_bit(IPS_DST_NAT_DONE_BIT, &ct->status))
490                 ip_conntrack_event_cache(IPCT_NATINFO, *pskb);
491 #endif
492         ip_conntrack_event_cache(master_ct(ct) ?
493                                  IPCT_RELATED : IPCT_NEW, *pskb);
494
495         return NF_ACCEPT;
496
497 out:
498         CONNTRACK_STAT_INC(insert_failed);
499         write_unlock_bh(&ip_conntrack_lock);
500         return NF_DROP;
501 }
502
503 /* Returns true if a connection corresponds to the tuple (required
504    for NAT). */
505 int
506 ip_conntrack_tuple_taken(const struct ip_conntrack_tuple *tuple,
507                          const struct ip_conntrack *ignored_conntrack)
508 {
509         struct ip_conntrack_tuple_hash *h;
510
511         read_lock_bh(&ip_conntrack_lock);
512         h = __ip_conntrack_find(tuple, ignored_conntrack);
513         read_unlock_bh(&ip_conntrack_lock);
514
515         return h != NULL;
516 }
517
518 /* There's a small race here where we may free a just-assured
519    connection.  Too bad: we're in trouble anyway. */
520 static int early_drop(struct list_head *chain)
521 {
522         /* Traverse backwards: gives us oldest, which is roughly LRU */
523         struct ip_conntrack_tuple_hash *h;
524         struct ip_conntrack *ct = NULL, *tmp;
525         int dropped = 0;
526
527         read_lock_bh(&ip_conntrack_lock);
528         list_for_each_entry_reverse(h, chain, list) {
529                 tmp = tuplehash_to_ctrack(h);
530                 if (!test_bit(IPS_ASSURED_BIT, &tmp->status)) {
531                         ct = tmp;
532                         atomic_inc(&ct->ct_general.use);
533                         break;
534                 }
535         }
536         read_unlock_bh(&ip_conntrack_lock);
537
538         if (!ct)
539                 return dropped;
540
541         if (del_timer(&ct->timeout)) {
542                 death_by_timeout((unsigned long)ct);
543                 dropped = 1;
544                 CONNTRACK_STAT_INC_ATOMIC(early_drop);
545         }
546         ip_conntrack_put(ct);
547         return dropped;
548 }
549
550 static struct ip_conntrack_helper *
551 __ip_conntrack_helper_find(const struct ip_conntrack_tuple *tuple)
552 {
553         struct ip_conntrack_helper *h;
554
555         list_for_each_entry(h, &helpers, list) {
556                 if (ip_ct_tuple_mask_cmp(tuple, &h->tuple, &h->mask))
557                         return h;
558         }
559         return NULL;
560 }
561
562 struct ip_conntrack_helper *
563 ip_conntrack_helper_find_get(const struct ip_conntrack_tuple *tuple)
564 {
565         struct ip_conntrack_helper *helper;
566
567         /* need ip_conntrack_lock to assure that helper exists until
568          * try_module_get() is called */
569         read_lock_bh(&ip_conntrack_lock);
570
571         helper = __ip_conntrack_helper_find(tuple);
572         if (helper) {
573                 /* need to increase module usage count to assure helper will
574                  * not go away while the caller is e.g. busy putting a
575                  * conntrack in the hash that uses the helper */
576                 if (!try_module_get(helper->me))
577                         helper = NULL;
578         }
579
580         read_unlock_bh(&ip_conntrack_lock);
581
582         return helper;
583 }
584
585 void ip_conntrack_helper_put(struct ip_conntrack_helper *helper)
586 {
587         module_put(helper->me);
588 }
589
590 struct ip_conntrack_protocol *
591 __ip_conntrack_proto_find(u_int8_t protocol)
592 {
593         return ip_ct_protos[protocol];
594 }
595
596 /* this is guaranteed to always return a valid protocol helper, since
597  * it falls back to generic_protocol */
598 struct ip_conntrack_protocol *
599 ip_conntrack_proto_find_get(u_int8_t protocol)
600 {
601         struct ip_conntrack_protocol *p;
602
603         rcu_read_lock();
604         p = __ip_conntrack_proto_find(protocol);
605         if (p) {
606                 if (!try_module_get(p->me))
607                         p = &ip_conntrack_generic_protocol;
608         }
609         rcu_read_unlock();
610
611         return p;
612 }
613
614 void ip_conntrack_proto_put(struct ip_conntrack_protocol *p)
615 {
616         module_put(p->me);
617 }
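
/*
 * [Editor's sketch]  Callers such as ctnetlink pair the two functions above
 * around any use of the returned structure, so the protocol module cannot be
 * unloaded underneath them.  Rough usage (variable names illustrative):
 */
#if 0	/* illustration only */
	struct ip_conntrack_protocol *proto;

	proto = ip_conntrack_proto_find_get(tuple.dst.protonum);
	/* ... use proto's per-protocol callbacks ... */
	ip_conntrack_proto_put(proto);
#endif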
618
619 struct ip_conntrack *ip_conntrack_alloc(struct ip_conntrack_tuple *orig,
620                                         struct ip_conntrack_tuple *repl)
621 {
622         struct ip_conntrack *conntrack;
623
624         if (!ip_conntrack_hash_rnd_initted) {
625                 get_random_bytes(&ip_conntrack_hash_rnd, 4);
626                 ip_conntrack_hash_rnd_initted = 1;
627         }
628
629         /* We don't want any race condition at early drop stage */
630         atomic_inc(&ip_conntrack_count);
631
632         if (ip_conntrack_max
633             && atomic_read(&ip_conntrack_count) > ip_conntrack_max) {
634                 unsigned int hash = hash_conntrack(orig);
635                 /* Try dropping from this hash chain. */
636                 if (!early_drop(&ip_conntrack_hash[hash])) {
637                         atomic_dec(&ip_conntrack_count);
638                         if (net_ratelimit())
639                                 printk(KERN_WARNING
640                                        "ip_conntrack: table full, dropping"
641                                        " packet.\n");
642                         return ERR_PTR(-ENOMEM);
643                 }
644         }
645
646         conntrack = kmem_cache_zalloc(ip_conntrack_cachep, GFP_ATOMIC);
647         if (!conntrack) {
648                 DEBUGP("Can't allocate conntrack.\n");
649                 atomic_dec(&ip_conntrack_count);
650                 return ERR_PTR(-ENOMEM);
651         }
652
653         atomic_set(&conntrack->ct_general.use, 1);
654         conntrack->ct_general.destroy = destroy_conntrack;
655         conntrack->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig;
656         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *repl;
657         /* Don't set timer yet: wait for confirmation */
658         init_timer(&conntrack->timeout);
659         conntrack->timeout.data = (unsigned long)conntrack;
660         conntrack->timeout.function = death_by_timeout;
661
662         return conntrack;
663 }
664
665 void
666 ip_conntrack_free(struct ip_conntrack *conntrack)
667 {
668         atomic_dec(&ip_conntrack_count);
669         kmem_cache_free(ip_conntrack_cachep, conntrack);
670 }
671
672 /* Allocate a new conntrack: we return -ENOMEM if classification
673  * failed due to stress.   Otherwise it really is unclassifiable */
674 static struct ip_conntrack_tuple_hash *
675 init_conntrack(struct ip_conntrack_tuple *tuple,
676                struct ip_conntrack_protocol *protocol,
677                struct sk_buff *skb)
678 {
679         struct ip_conntrack *conntrack;
680         struct ip_conntrack_tuple repl_tuple;
681         struct ip_conntrack_expect *exp;
682
683         if (!ip_ct_invert_tuple(&repl_tuple, tuple, protocol)) {
684                 DEBUGP("Can't invert tuple.\n");
685                 return NULL;
686         }
687
688         conntrack = ip_conntrack_alloc(tuple, &repl_tuple);
689         if (conntrack == NULL || IS_ERR(conntrack))
690                 return (struct ip_conntrack_tuple_hash *)conntrack;
691
692         if (!protocol->new(conntrack, skb)) {
693                 ip_conntrack_free(conntrack);
694                 return NULL;
695         }
696
697         write_lock_bh(&ip_conntrack_lock);
698         exp = find_expectation(tuple);
699
700         if (exp) {
701                 DEBUGP("conntrack: expectation arrives ct=%p exp=%p\n",
702                         conntrack, exp);
703                 /* Welcome, Mr. Bond.  We've been expecting you... */
704                 __set_bit(IPS_EXPECTED_BIT, &conntrack->status);
705                 conntrack->master = exp->master;
706 #ifdef CONFIG_IP_NF_CONNTRACK_MARK
707                 conntrack->mark = exp->master->mark;
708 #endif
709 #if defined(CONFIG_IP_NF_TARGET_MASQUERADE) || \
710     defined(CONFIG_IP_NF_TARGET_MASQUERADE_MODULE)
711                 /* this is ugly, but there is no other place to put it */
712                 conntrack->nat.masq_index = exp->master->nat.masq_index;
713 #endif
714 #ifdef CONFIG_IP_NF_CONNTRACK_SECMARK
715                 conntrack->secmark = exp->master->secmark;
716 #endif
717                 nf_conntrack_get(&conntrack->master->ct_general);
718                 CONNTRACK_STAT_INC(expect_new);
719         } else {
720                 conntrack->helper = __ip_conntrack_helper_find(&repl_tuple);
721
722                 CONNTRACK_STAT_INC(new);
723         }
724
725         /* Overload tuple linked list to put us in unconfirmed list. */
726         list_add(&conntrack->tuplehash[IP_CT_DIR_ORIGINAL].list, &unconfirmed);
727
728         write_unlock_bh(&ip_conntrack_lock);
729
730         if (exp) {
731                 if (exp->expectfn)
732                         exp->expectfn(conntrack, exp);
733                 ip_conntrack_expect_put(exp);
734         }
735
736         return &conntrack->tuplehash[IP_CT_DIR_ORIGINAL];
737 }
738
739 /* On success, returns conntrack ptr, sets skb->nfct and ctinfo */
740 static inline struct ip_conntrack *
741 resolve_normal_ct(struct sk_buff *skb,
742                   struct ip_conntrack_protocol *proto,
743                   int *set_reply,
744                   unsigned int hooknum,
745                   enum ip_conntrack_info *ctinfo)
746 {
747         struct ip_conntrack_tuple tuple;
748         struct ip_conntrack_tuple_hash *h;
749         struct ip_conntrack *ct;
750
751         IP_NF_ASSERT((ip_hdr(skb)->frag_off & htons(IP_OFFSET)) == 0);
752
753         if (!ip_ct_get_tuple(ip_hdr(skb), skb, ip_hdrlen(skb), &tuple, proto))
754                 return NULL;
755
756         /* look for tuple match */
757         h = ip_conntrack_find_get(&tuple, NULL);
758         if (!h) {
759                 h = init_conntrack(&tuple, proto, skb);
760                 if (!h)
761                         return NULL;
762                 if (IS_ERR(h))
763                         return (void *)h;
764         }
765         ct = tuplehash_to_ctrack(h);
766
767         /* It exists; we have (non-exclusive) reference. */
768         if (DIRECTION(h) == IP_CT_DIR_REPLY) {
769                 *ctinfo = IP_CT_ESTABLISHED + IP_CT_IS_REPLY;
770                 /* Please set reply bit if this packet OK */
771                 *set_reply = 1;
772         } else {
773                 /* Once we've had two way comms, always ESTABLISHED. */
774                 if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status)) {
775                         DEBUGP("ip_conntrack_in: normal packet for %p\n",
776                                ct);
777                         *ctinfo = IP_CT_ESTABLISHED;
778                 } else if (test_bit(IPS_EXPECTED_BIT, &ct->status)) {
779                         DEBUGP("ip_conntrack_in: related packet for %p\n",
780                                ct);
781                         *ctinfo = IP_CT_RELATED;
782                 } else {
783                         DEBUGP("ip_conntrack_in: new packet for %p\n",
784                                ct);
785                         *ctinfo = IP_CT_NEW;
786                 }
787                 *set_reply = 0;
788         }
789         skb->nfct = &ct->ct_general;
790         skb->nfctinfo = *ctinfo;
791         return ct;
792 }
793
794 /* Netfilter hook itself. */
795 unsigned int ip_conntrack_in(unsigned int hooknum,
796                              struct sk_buff **pskb,
797                              const struct net_device *in,
798                              const struct net_device *out,
799                              int (*okfn)(struct sk_buff *))
800 {
801         struct ip_conntrack *ct;
802         enum ip_conntrack_info ctinfo;
803         struct ip_conntrack_protocol *proto;
804         int set_reply = 0;
805         int ret;
806
807         /* Previously seen (loopback or untracked)?  Ignore. */
808         if ((*pskb)->nfct) {
809                 CONNTRACK_STAT_INC_ATOMIC(ignore);
810                 return NF_ACCEPT;
811         }
812
813         /* Should never happen */
814         if (ip_hdr(*pskb)->frag_off & htons(IP_OFFSET)) {
815                 if (net_ratelimit()) {
816                         printk(KERN_ERR "ip_conntrack_in: Frag of proto %u (hook=%u)\n",
817                                ip_hdr(*pskb)->protocol, hooknum);
818                 }
819                 return NF_DROP;
820         }
821
822 /* Doesn't cover locally-generated broadcast, so not worth it. */
823 #if 0
824         /* Ignore broadcast: no `connection'. */
825         if ((*pskb)->pkt_type == PACKET_BROADCAST) {
826                 printk("Broadcast packet!\n");
827                 return NF_ACCEPT;
828         } else if ((ip_hdr(*pskb)->daddr & htonl(0x000000FF))
829                    == htonl(0x000000FF)) {
830                 printk("Should bcast: %u.%u.%u.%u->%u.%u.%u.%u (sk=%p, ptype=%u)\n",
831                        NIPQUAD(ip_hdr(*pskb)->saddr),
832                        NIPQUAD(ip_hdr(*pskb)->daddr),
833                        (*pskb)->sk, (*pskb)->pkt_type);
834         }
835 #endif
836
837         /* rcu_read_lock()ed by nf_hook_slow */
838         proto = __ip_conntrack_proto_find(ip_hdr(*pskb)->protocol);
839
840         /* It may be a special packet: error, unclean...  The
841          * inverse of the return code tells the netfilter
842          * core what to do with the packet. */
843         if (proto->error != NULL
844             && (ret = proto->error(*pskb, &ctinfo, hooknum)) <= 0) {
845                 CONNTRACK_STAT_INC_ATOMIC(error);
846                 CONNTRACK_STAT_INC_ATOMIC(invalid);
847                 return -ret;
848         }
849
850         if (!(ct = resolve_normal_ct(*pskb, proto, &set_reply, hooknum, &ctinfo))) {
851                 /* Not valid part of a connection */
852                 CONNTRACK_STAT_INC_ATOMIC(invalid);
853                 return NF_ACCEPT;
854         }
855
856         if (IS_ERR(ct)) {
857                 /* Too stressed to deal. */
858                 CONNTRACK_STAT_INC_ATOMIC(drop);
859                 return NF_DROP;
860         }
861
862         IP_NF_ASSERT((*pskb)->nfct);
863
864         ret = proto->packet(ct, *pskb, ctinfo);
865         if (ret < 0) {
866                 /* Invalid: inverse of the return code tells
867                  * the netfilter core what to do*/
868                 nf_conntrack_put((*pskb)->nfct);
869                 (*pskb)->nfct = NULL;
870                 CONNTRACK_STAT_INC_ATOMIC(invalid);
871                 return -ret;
872         }
873
874         if (set_reply && !test_and_set_bit(IPS_SEEN_REPLY_BIT, &ct->status))
875                 ip_conntrack_event_cache(IPCT_STATUS, *pskb);
876
877         return ret;
878 }
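
/*
 * [Editor's sketch]  ip_conntrack_in() is a netfilter hook function; the
 * actual registration lives in ip_conntrack_standalone.c, not here.  It is
 * hooked into PRE_ROUTING (and LOCAL_OUT) roughly as follows; the ops name
 * is illustrative.
 */
#if 0	/* illustration only */
static struct nf_hook_ops example_conntrack_in_ops = {
	.hook		= ip_conntrack_in,
	.owner		= THIS_MODULE,
	.pf		= PF_INET,
	.hooknum	= NF_IP_PRE_ROUTING,
	.priority	= NF_IP_PRI_CONNTRACK,
};
	/* ... nf_register_hook(&example_conntrack_in_ops); ... */
#endif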
879
880 int invert_tuplepr(struct ip_conntrack_tuple *inverse,
881                    const struct ip_conntrack_tuple *orig)
882 {
883         struct ip_conntrack_protocol *proto;
884         int ret;
885
886         rcu_read_lock();
887         proto = __ip_conntrack_proto_find(orig->dst.protonum);
888         ret = ip_ct_invert_tuple(inverse, orig, proto);
889         rcu_read_unlock();
890
891         return ret;
892 }
893
894 /* Would two expected things clash? */
895 static inline int expect_clash(const struct ip_conntrack_expect *a,
896                                const struct ip_conntrack_expect *b)
897 {
898         /* Part covered by intersection of masks must be unequal,
899            otherwise they clash */
900         struct ip_conntrack_tuple intersect_mask
901                 = { { a->mask.src.ip & b->mask.src.ip,
902                       { a->mask.src.u.all & b->mask.src.u.all } },
903                     { a->mask.dst.ip & b->mask.dst.ip,
904                       { a->mask.dst.u.all & b->mask.dst.u.all },
905                       a->mask.dst.protonum & b->mask.dst.protonum } };
906
907         return ip_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask);
908 }
909
910 static inline int expect_matches(const struct ip_conntrack_expect *a,
911                                  const struct ip_conntrack_expect *b)
912 {
913         return a->master == b->master
914                 && ip_ct_tuple_equal(&a->tuple, &b->tuple)
915                 && ip_ct_tuple_equal(&a->mask, &b->mask);
916 }
917
918 /* Generally a bad idea to call this: could have matched already. */
919 void ip_conntrack_unexpect_related(struct ip_conntrack_expect *exp)
920 {
921         struct ip_conntrack_expect *i;
922
923         write_lock_bh(&ip_conntrack_lock);
924         /* choose the oldest expectation to evict */
925         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
926                 if (expect_matches(i, exp) && del_timer(&i->timeout)) {
927                         ip_ct_unlink_expect(i);
928                         write_unlock_bh(&ip_conntrack_lock);
929                         ip_conntrack_expect_put(i);
930                         return;
931                 }
932         }
933         write_unlock_bh(&ip_conntrack_lock);
934 }
935
936 /* We don't increase the master conntrack refcount for unfulfilled
937  * expectations. During conntrack destruction, the expectations are
938  * always killed before the conntrack itself */
939 struct ip_conntrack_expect *ip_conntrack_expect_alloc(struct ip_conntrack *me)
940 {
941         struct ip_conntrack_expect *new;
942
943         new = kmem_cache_alloc(ip_conntrack_expect_cachep, GFP_ATOMIC);
944         if (!new) {
945                 DEBUGP("expect_related: OOM allocating expect\n");
946                 return NULL;
947         }
948         new->master = me;
949         atomic_set(&new->use, 1);
950         return new;
951 }
952
953 void ip_conntrack_expect_put(struct ip_conntrack_expect *exp)
954 {
955         if (atomic_dec_and_test(&exp->use))
956                 kmem_cache_free(ip_conntrack_expect_cachep, exp);
957 }
958
959 static void ip_conntrack_expect_insert(struct ip_conntrack_expect *exp)
960 {
961         atomic_inc(&exp->use);
962         exp->master->expecting++;
963         list_add(&exp->list, &ip_conntrack_expect_list);
964
965         init_timer(&exp->timeout);
966         exp->timeout.data = (unsigned long)exp;
967         exp->timeout.function = expectation_timed_out;
968         exp->timeout.expires = jiffies + exp->master->helper->timeout * HZ;
969         add_timer(&exp->timeout);
970
971         exp->id = ++ip_conntrack_expect_next_id;
972         atomic_inc(&exp->use);
973         CONNTRACK_STAT_INC(expect_create);
974 }
975
976 /* Race with expectations being used means we could have none to find; OK. */
977 static void evict_oldest_expect(struct ip_conntrack *master)
978 {
979         struct ip_conntrack_expect *i;
980
981         list_for_each_entry_reverse(i, &ip_conntrack_expect_list, list) {
982                 if (i->master == master) {
983                         if (del_timer(&i->timeout)) {
984                                 ip_ct_unlink_expect(i);
985                                 ip_conntrack_expect_put(i);
986                         }
987                         break;
988                 }
989         }
990 }
991
992 static inline int refresh_timer(struct ip_conntrack_expect *i)
993 {
994         if (!del_timer(&i->timeout))
995                 return 0;
996
997         i->timeout.expires = jiffies + i->master->helper->timeout*HZ;
998         add_timer(&i->timeout);
999         return 1;
1000 }
1001
1002 int ip_conntrack_expect_related(struct ip_conntrack_expect *expect)
1003 {
1004         struct ip_conntrack_expect *i;
1005         int ret;
1006
1007         DEBUGP("ip_conntrack_expect_related %p\n", expect);
1008         DEBUGP("tuple: "); DUMP_TUPLE(&expect->tuple);
1009         DEBUGP("mask:  "); DUMP_TUPLE(&expect->mask);
1010
1011         write_lock_bh(&ip_conntrack_lock);
1012         list_for_each_entry(i, &ip_conntrack_expect_list, list) {
1013                 if (expect_matches(i, expect)) {
1014                         /* Refresh timer: if it's dying, ignore.. */
1015                         if (refresh_timer(i)) {
1016                                 ret = 0;
1017                                 goto out;
1018                         }
1019                 } else if (expect_clash(i, expect)) {
1020                         ret = -EBUSY;
1021                         goto out;
1022                 }
1023         }
1024
1025         /* Will be over limit? */
1026         if (expect->master->helper->max_expected &&
1027             expect->master->expecting >= expect->master->helper->max_expected)
1028                 evict_oldest_expect(expect->master);
1029
1030         ip_conntrack_expect_insert(expect);
1031         ip_conntrack_expect_event(IPEXP_NEW, expect);
1032         ret = 0;
1033 out:
1034         write_unlock_bh(&ip_conntrack_lock);
1035         return ret;
1036 }
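
/*
 * [Editor's sketch]  Typical caller side, loosely modelled on helpers such
 * as ip_conntrack_ftp: allocate an expectation, describe the expected data
 * connection with a tuple/mask pair, register it, then drop the local
 * reference.  example_expect_data_channel() and data_port are placeholders,
 * not names from this file.
 */
#if 0	/* illustration only */
static int example_expect_data_channel(struct ip_conntrack *ct,
				       __be16 data_port)
{
	struct ip_conntrack_expect *exp;
	int ret = NF_ACCEPT;

	exp = ip_conntrack_expect_alloc(ct);
	if (exp == NULL)
		return NF_DROP;

	memset(&exp->tuple, 0, sizeof(exp->tuple));
	memset(&exp->mask, 0, sizeof(exp->mask));

	/* expect a connection from the peer towards the advertised port */
	exp->tuple.src.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.ip;
	exp->tuple.dst.ip = ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.ip;
	exp->tuple.dst.u.tcp.port = data_port;
	exp->tuple.dst.protonum = IPPROTO_TCP;

	/* match both addresses, the destination port and the protocol */
	exp->mask.src.ip = htonl(0xFFFFFFFF);
	exp->mask.dst.ip = htonl(0xFFFFFFFF);
	exp->mask.dst.u.tcp.port = htons(0xFFFF);
	exp->mask.dst.protonum = 0xFF;

	exp->expectfn = NULL;
	exp->flags = 0;

	if (ip_conntrack_expect_related(exp) != 0)
		ret = NF_DROP;	/* clashed with an existing expectation */
	ip_conntrack_expect_put(exp);
	return ret;
}
#endif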
1037
1038 /* Alter reply tuple (maybe alter helper).  This is for NAT, and is
1039    implicitly racy: see __ip_conntrack_confirm */
1040 void ip_conntrack_alter_reply(struct ip_conntrack *conntrack,
1041                               const struct ip_conntrack_tuple *newreply)
1042 {
1043         write_lock_bh(&ip_conntrack_lock);
1044         /* Should be unconfirmed, so not in hash table yet */
1045         IP_NF_ASSERT(!is_confirmed(conntrack));
1046
1047         DEBUGP("Altering reply tuple of %p to ", conntrack);
1048         DUMP_TUPLE(newreply);
1049
1050         conntrack->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply;
1051         if (!conntrack->master && conntrack->expecting == 0)
1052                 conntrack->helper = __ip_conntrack_helper_find(newreply);
1053         write_unlock_bh(&ip_conntrack_lock);
1054 }
1055
1056 int ip_conntrack_helper_register(struct ip_conntrack_helper *me)
1057 {
1058         BUG_ON(me->timeout == 0);
1059         write_lock_bh(&ip_conntrack_lock);
1060         list_add(&me->list, &helpers);
1061         write_unlock_bh(&ip_conntrack_lock);
1062
1063         return 0;
1064 }
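
/*
 * [Editor's sketch]  Registration as seen from a helper module; the names
 * and values below (example_helper, example_help, port 2121) are
 * illustrative, not taken from this file.  The tuple/mask are matched
 * against the *reply* tuple (see init_conntrack() and
 * ip_conntrack_alter_reply()), which is why the well-known port goes into
 * tuple.src.
 */
#if 0	/* illustration only */
static int example_help(struct sk_buff **pskb, struct ip_conntrack *ct,
			enum ip_conntrack_info ctinfo)
{
	/* parse the payload here; possibly set up expectations as in the
	 * sketch after ip_conntrack_expect_related() above */
	return NF_ACCEPT;
}

static struct ip_conntrack_helper example_helper;

static int __init example_helper_init(void)
{
	example_helper.name = "example";
	example_helper.me = THIS_MODULE;
	example_helper.max_expected = 1;
	example_helper.timeout = 5 * 60;		/* seconds */
	example_helper.tuple.src.u.tcp.port = htons(2121);
	example_helper.tuple.dst.protonum = IPPROTO_TCP;
	example_helper.mask.src.u.tcp.port = htons(0xFFFF);
	example_helper.mask.dst.protonum = 0xFF;
	example_helper.help = example_help;

	return ip_conntrack_helper_register(&example_helper);
}
#endif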
1065
1066 struct ip_conntrack_helper *
1067 __ip_conntrack_helper_find_byname(const char *name)
1068 {
1069         struct ip_conntrack_helper *h;
1070
1071         list_for_each_entry(h, &helpers, list) {
1072                 if (!strcmp(h->name, name))
1073                         return h;
1074         }
1075
1076         return NULL;
1077 }
1078
1079 static inline void unhelp(struct ip_conntrack_tuple_hash *i,
1080                           const struct ip_conntrack_helper *me)
1081 {
1082         if (tuplehash_to_ctrack(i)->helper == me) {
1083                 ip_conntrack_event(IPCT_HELPER, tuplehash_to_ctrack(i));
1084                 tuplehash_to_ctrack(i)->helper = NULL;
1085         }
1086 }
1087
1088 void ip_conntrack_helper_unregister(struct ip_conntrack_helper *me)
1089 {
1090         unsigned int i;
1091         struct ip_conntrack_tuple_hash *h;
1092         struct ip_conntrack_expect *exp, *tmp;
1093
1094         /* Need write lock here, to delete helper. */
1095         write_lock_bh(&ip_conntrack_lock);
1096         list_del(&me->list);
1097
1098         /* Get rid of expectations */
1099         list_for_each_entry_safe(exp, tmp, &ip_conntrack_expect_list, list) {
1100                 if (exp->master->helper == me && del_timer(&exp->timeout)) {
1101                         ip_ct_unlink_expect(exp);
1102                         ip_conntrack_expect_put(exp);
1103                 }
1104         }
1105         /* Get rid of expecteds, set helpers to NULL. */
1106         list_for_each_entry(h, &unconfirmed, list)
1107                 unhelp(h, me);
1108         for (i = 0; i < ip_conntrack_htable_size; i++) {
1109                 list_for_each_entry(h, &ip_conntrack_hash[i], list)
1110                         unhelp(h, me);
1111         }
1112         write_unlock_bh(&ip_conntrack_lock);
1113
1114         /* Someone could still be looking at the helper in a bh. */
1115         synchronize_net();
1116 }
1117
1118 /* Refresh conntrack for this many jiffies and do accounting if do_acct is 1 */
1119 void __ip_ct_refresh_acct(struct ip_conntrack *ct,
1120                         enum ip_conntrack_info ctinfo,
1121                         const struct sk_buff *skb,
1122                         unsigned long extra_jiffies,
1123                         int do_acct)
1124 {
1125         int event = 0;
1126
1127         IP_NF_ASSERT(ct->timeout.data == (unsigned long)ct);
1128         IP_NF_ASSERT(skb);
1129
1130         write_lock_bh(&ip_conntrack_lock);
1131
1132         /* Only update if this is not a fixed timeout */
1133         if (test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
1134                 write_unlock_bh(&ip_conntrack_lock);
1135                 return;
1136         }
1137
1138         /* If not in hash table, timer will not be active yet */
1139         if (!is_confirmed(ct)) {
1140                 ct->timeout.expires = extra_jiffies;
1141                 event = IPCT_REFRESH;
1142         } else {
1143                 /* Need del_timer for race avoidance (may already be dying). */
1144                 if (del_timer(&ct->timeout)) {
1145                         ct->timeout.expires = jiffies + extra_jiffies;
1146                         add_timer(&ct->timeout);
1147                         event = IPCT_REFRESH;
1148                 }
1149         }
1150
1151 #ifdef CONFIG_IP_NF_CT_ACCT
1152         if (do_acct) {
1153                 ct->counters[CTINFO2DIR(ctinfo)].packets++;
1154                 ct->counters[CTINFO2DIR(ctinfo)].bytes +=
1155                                                 ntohs(ip_hdr(skb)->tot_len);
1156                 if ((ct->counters[CTINFO2DIR(ctinfo)].packets & 0x80000000)
1157                     || (ct->counters[CTINFO2DIR(ctinfo)].bytes & 0x80000000))
1158                         event |= IPCT_COUNTER_FILLING;
1159         }
1160 #endif
1161
1162         write_unlock_bh(&ip_conntrack_lock);
1163
1164         /* must be unlocked when calling event cache */
1165         if (event)
1166                 ip_conntrack_event_cache(event, skb);
1167 }
1168
1169 #if defined(CONFIG_IP_NF_CONNTRACK_NETLINK) || \
1170     defined(CONFIG_IP_NF_CONNTRACK_NETLINK_MODULE)
1171 /* Generic function for tcp/udp/sctp/dccp and the like. This needs to be
1172  * in ip_conntrack_core, since we don't want the protocols to autoload
1173  * or depend on ctnetlink */
1174 int ip_ct_port_tuple_to_nfattr(struct sk_buff *skb,
1175                                const struct ip_conntrack_tuple *tuple)
1176 {
1177         NFA_PUT(skb, CTA_PROTO_SRC_PORT, sizeof(__be16),
1178                 &tuple->src.u.tcp.port);
1179         NFA_PUT(skb, CTA_PROTO_DST_PORT, sizeof(__be16),
1180                 &tuple->dst.u.tcp.port);
1181         return 0;
1182
1183 nfattr_failure:
1184         return -1;
1185 }
1186
1187 int ip_ct_port_nfattr_to_tuple(struct nfattr *tb[],
1188                                struct ip_conntrack_tuple *t)
1189 {
1190         if (!tb[CTA_PROTO_SRC_PORT-1] || !tb[CTA_PROTO_DST_PORT-1])
1191                 return -EINVAL;
1192
1193         t->src.u.tcp.port =
1194                 *(__be16 *)NFA_DATA(tb[CTA_PROTO_SRC_PORT-1]);
1195         t->dst.u.tcp.port =
1196                 *(__be16 *)NFA_DATA(tb[CTA_PROTO_DST_PORT-1]);
1197
1198         return 0;
1199 }
1200 #endif
1201
1202 /* Returns new sk_buff, or NULL */
1203 struct sk_buff *
1204 ip_ct_gather_frags(struct sk_buff *skb, u_int32_t user)
1205 {
1206         skb_orphan(skb);
1207
1208         local_bh_disable();
1209         skb = ip_defrag(skb, user);
1210         local_bh_enable();
1211
1212         if (skb)
1213                 ip_send_check(ip_hdr(skb));
1214         return skb;
1215 }
1216
1217 /* Used by ipt_REJECT. */
1218 static void ip_conntrack_attach(struct sk_buff *nskb, struct sk_buff *skb)
1219 {
1220         struct ip_conntrack *ct;
1221         enum ip_conntrack_info ctinfo;
1222
1223         /* This ICMP is in reverse direction to the packet which caused it */
1224         ct = ip_conntrack_get(skb, &ctinfo);
1225
1226         if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL)
1227                 ctinfo = IP_CT_RELATED + IP_CT_IS_REPLY;
1228         else
1229                 ctinfo = IP_CT_RELATED;
1230
1231         /* Attach to new skbuff, and increment count */
1232         nskb->nfct = &ct->ct_general;
1233         nskb->nfctinfo = ctinfo;
1234         nf_conntrack_get(nskb->nfct);
1235 }
1236
1237 /* Bring out ya dead! */
1238 static struct ip_conntrack *
1239 get_next_corpse(int (*iter)(struct ip_conntrack *i, void *data),
1240                 void *data, unsigned int *bucket)
1241 {
1242         struct ip_conntrack_tuple_hash *h;
1243         struct ip_conntrack *ct;
1244
1245         write_lock_bh(&ip_conntrack_lock);
1246         for (; *bucket < ip_conntrack_htable_size; (*bucket)++) {
1247                 list_for_each_entry(h, &ip_conntrack_hash[*bucket], list) {
1248                         ct = tuplehash_to_ctrack(h);
1249                         if (iter(ct, data))
1250                                 goto found;
1251                 }
1252         }
1253         list_for_each_entry(h, &unconfirmed, list) {
1254                 ct = tuplehash_to_ctrack(h);
1255                 if (iter(ct, data))
1256                         set_bit(IPS_DYING_BIT, &ct->status);
1257         }
1258         write_unlock_bh(&ip_conntrack_lock);
1259         return NULL;
1260
1261 found:
1262         atomic_inc(&ct->ct_general.use);
1263         write_unlock_bh(&ip_conntrack_lock);
1264         return ct;
1265 }
1266
1267 void
1268 ip_ct_iterate_cleanup(int (*iter)(struct ip_conntrack *i, void *), void *data)
1269 {
1270         struct ip_conntrack *ct;
1271         unsigned int bucket = 0;
1272
1273         while ((ct = get_next_corpse(iter, data, &bucket)) != NULL) {
1274                 /* Time to push up daisies... */
1275                 if (del_timer(&ct->timeout))
1276                         death_by_timeout((unsigned long)ct);
1277                 /* ... else the timer will get him soon. */
1278
1279                 ip_conntrack_put(ct);
1280         }
1281 }
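
/*
 * [Editor's sketch]  iter() is an arbitrary predicate.  For example, when a
 * protocol tracker is unregistered, all of its connections can be flushed
 * with something roughly like the following (kill_proto here is a
 * placeholder; the real one lives outside this file):
 */
#if 0	/* illustration only */
static int kill_proto(struct ip_conntrack *i, void *data)
{
	return i->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.protonum ==
			*((u_int8_t *)data);
}

	/* ... ip_ct_iterate_cleanup(kill_proto, &protonum); ... */
#endif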
1282
1283 /* Fast function for those who don't want to parse /proc (and I don't
1284    blame them). */
1285 /* Reversing the socket's dst/src point of view gives us the reply
1286    mapping. */
1287 static int
1288 getorigdst(struct sock *sk, int optval, void __user *user, int *len)
1289 {
1290         struct inet_sock *inet = inet_sk(sk);
1291         struct ip_conntrack_tuple_hash *h;
1292         struct ip_conntrack_tuple tuple;
1293
1294         IP_CT_TUPLE_U_BLANK(&tuple);
1295         tuple.src.ip = inet->rcv_saddr;
1296         tuple.src.u.tcp.port = inet->sport;
1297         tuple.dst.ip = inet->daddr;
1298         tuple.dst.u.tcp.port = inet->dport;
1299         tuple.dst.protonum = IPPROTO_TCP;
1300
1301         /* We only do TCP at the moment: is there a better way? */
1302         if (strcmp(sk->sk_prot->name, "TCP")) {
1303                 DEBUGP("SO_ORIGINAL_DST: Not a TCP socket\n");
1304                 return -ENOPROTOOPT;
1305         }
1306
1307         if ((unsigned int) *len < sizeof(struct sockaddr_in)) {
1308                 DEBUGP("SO_ORIGINAL_DST: len %u not %u\n",
1309                        *len, sizeof(struct sockaddr_in));
1310                 return -EINVAL;
1311         }
1312
1313         h = ip_conntrack_find_get(&tuple, NULL);
1314         if (h) {
1315                 struct sockaddr_in sin;
1316                 struct ip_conntrack *ct = tuplehash_to_ctrack(h);
1317
1318                 sin.sin_family = AF_INET;
1319                 sin.sin_port = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1320                         .tuple.dst.u.tcp.port;
1321                 sin.sin_addr.s_addr = ct->tuplehash[IP_CT_DIR_ORIGINAL]
1322                         .tuple.dst.ip;
1323                 memset(sin.sin_zero, 0, sizeof(sin.sin_zero));
1324
1325                 DEBUGP("SO_ORIGINAL_DST: %u.%u.%u.%u %u\n",
1326                        NIPQUAD(sin.sin_addr.s_addr), ntohs(sin.sin_port));
1327                 ip_conntrack_put(ct);
1328                 if (copy_to_user(user, &sin, sizeof(sin)) != 0)
1329                         return -EFAULT;
1330                 else
1331                         return 0;
1332         }
1333         DEBUGP("SO_ORIGINAL_DST: Can't find %u.%u.%u.%u/%u-%u.%u.%u.%u/%u.\n",
1334                NIPQUAD(tuple.src.ip), ntohs(tuple.src.u.tcp.port),
1335                NIPQUAD(tuple.dst.ip), ntohs(tuple.dst.u.tcp.port));
1336         return -ENOENT;
1337 }
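
/*
 * [Editor's sketch]  The userspace side of the sockopt above, e.g. from a
 * transparent proxy that accepted a REDIRECTed TCP connection.  This is a
 * stand-alone userspace fragment (not kernel code); print_original_dst is a
 * placeholder name.
 */
#if 0	/* illustration only (userspace) */
#include <stdio.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <linux/netfilter_ipv4.h>	/* SO_ORIGINAL_DST */

static int print_original_dst(int connfd)
{
	struct sockaddr_in orig;
	socklen_t len = sizeof(orig);

	if (getsockopt(connfd, SOL_IP, SO_ORIGINAL_DST, &orig, &len) < 0)
		return -1;
	printf("original destination: %s:%u\n",
	       inet_ntoa(orig.sin_addr), ntohs(orig.sin_port));
	return 0;
}
#endif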
1338
1339 static struct nf_sockopt_ops so_getorigdst = {
1340         .pf             = PF_INET,
1341         .get_optmin     = SO_ORIGINAL_DST,
1342         .get_optmax     = SO_ORIGINAL_DST+1,
1343         .get            = &getorigdst,
1344 };
1345
1346 static int kill_all(struct ip_conntrack *i, void *data)
1347 {
1348         return 1;
1349 }
1350
1351 void ip_conntrack_flush(void)
1352 {
1353         ip_ct_iterate_cleanup(kill_all, NULL);
1354 }
1355
1356 static void free_conntrack_hash(struct list_head *hash, int vmalloced,int size)
1357 {
1358         if (vmalloced)
1359                 vfree(hash);
1360         else
1361                 free_pages((unsigned long)hash,
1362                            get_order(sizeof(struct list_head) * size));
1363 }
1364
1365 /* Mishearing the voices in his head, our hero wonders how he's
1366    supposed to kill the mall. */
1367 void ip_conntrack_cleanup(void)
1368 {
1369         rcu_assign_pointer(ip_ct_attach, NULL);
1370
1371         /* This makes sure all current packets have passed through
1372            netfilter framework.  Roll on, two-stage module
1373            delete... */
1374         synchronize_net();
1375
1376         ip_ct_event_cache_flush();
1377  i_see_dead_people:
1378         ip_conntrack_flush();
1379         if (atomic_read(&ip_conntrack_count) != 0) {
1380                 schedule();
1381                 goto i_see_dead_people;
1382         }
1383         /* wait until all references to ip_conntrack_untracked are dropped */
1384         while (atomic_read(&ip_conntrack_untracked.ct_general.use) > 1)
1385                 schedule();
1386
1387         kmem_cache_destroy(ip_conntrack_cachep);
1388         kmem_cache_destroy(ip_conntrack_expect_cachep);
1389         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1390                             ip_conntrack_htable_size);
1391         nf_unregister_sockopt(&so_getorigdst);
1392 }
1393
1394 static struct list_head *alloc_hashtable(int size, int *vmalloced)
1395 {
1396         struct list_head *hash;
1397         unsigned int i;
1398
1399         *vmalloced = 0;
1400         hash = (void*)__get_free_pages(GFP_KERNEL,
1401                                        get_order(sizeof(struct list_head)
1402                                                  * size));
1403         if (!hash) {
1404                 *vmalloced = 1;
1405                 printk(KERN_WARNING"ip_conntrack: falling back to vmalloc.\n");
1406                 hash = vmalloc(sizeof(struct list_head) * size);
1407         }
1408
1409         if (hash)
1410                 for (i = 0; i < size; i++)
1411                         INIT_LIST_HEAD(&hash[i]);
1412
1413         return hash;
1414 }
1415
1416 static int set_hashsize(const char *val, struct kernel_param *kp)
1417 {
1418         int i, bucket, hashsize, vmalloced;
1419         int old_vmalloced, old_size;
1420         int rnd;
1421         struct list_head *hash, *old_hash;
1422         struct ip_conntrack_tuple_hash *h;
1423
1424         /* On boot, we can set this without any fancy locking. */
1425         if (!ip_conntrack_htable_size)
1426                 return param_set_int(val, kp);
1427
1428         hashsize = simple_strtol(val, NULL, 0);
1429         if (!hashsize)
1430                 return -EINVAL;
1431
1432         hash = alloc_hashtable(hashsize, &vmalloced);
1433         if (!hash)
1434                 return -ENOMEM;
1435
1436         /* We have to rehash for the new table anyway, so we can also
1437          * use a new random seed */
1438         get_random_bytes(&rnd, 4);
1439
1440         write_lock_bh(&ip_conntrack_lock);
1441         for (i = 0; i < ip_conntrack_htable_size; i++) {
1442                 while (!list_empty(&ip_conntrack_hash[i])) {
1443                         h = list_entry(ip_conntrack_hash[i].next,
1444                                        struct ip_conntrack_tuple_hash, list);
1445                         list_del(&h->list);
1446                         bucket = __hash_conntrack(&h->tuple, hashsize, rnd);
1447                         list_add_tail(&h->list, &hash[bucket]);
1448                 }
1449         }
1450         old_size = ip_conntrack_htable_size;
1451         old_vmalloced = ip_conntrack_vmalloc;
1452         old_hash = ip_conntrack_hash;
1453
1454         ip_conntrack_htable_size = hashsize;
1455         ip_conntrack_vmalloc = vmalloced;
1456         ip_conntrack_hash = hash;
1457         ip_conntrack_hash_rnd = rnd;
1458         write_unlock_bh(&ip_conntrack_lock);
1459
1460         free_conntrack_hash(old_hash, old_vmalloced, old_size);
1461         return 0;
1462 }
1463
1464 module_param_call(hashsize, set_hashsize, param_get_uint,
1465                   &ip_conntrack_htable_size, 0600);
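
/*
 * [Editor's note]  The 0600 permission above also exposes the parameter via
 * sysfs, so (assuming sysfs is mounted) the table can be resized at runtime
 * by writing a new bucket count to
 * /sys/module/ip_conntrack/parameters/hashsize; set_hashsize() then rehashes
 * every entry under ip_conntrack_lock.
 */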
1466
1467 int __init ip_conntrack_init(void)
1468 {
1469         unsigned int i;
1470         int ret;
1471
1472         /* Idea from tcp.c: use 1/16384 of memory.  On i386: 32MB
1473          * machine has 256 buckets.  >= 1GB machines have 8192 buckets. */
1474         if (!ip_conntrack_htable_size) {
1475                 ip_conntrack_htable_size
1476                         = (((num_physpages << PAGE_SHIFT) / 16384)
1477                            / sizeof(struct list_head));
1478                 if (num_physpages > (1024 * 1024 * 1024 / PAGE_SIZE))
1479                         ip_conntrack_htable_size = 8192;
1480                 if (ip_conntrack_htable_size < 16)
1481                         ip_conntrack_htable_size = 16;
1482         }
1483         ip_conntrack_max = 8 * ip_conntrack_htable_size;
1484
1485         printk("ip_conntrack version %s (%u buckets, %d max)"
1486                " - %Zd bytes per conntrack\n", IP_CONNTRACK_VERSION,
1487                ip_conntrack_htable_size, ip_conntrack_max,
1488                sizeof(struct ip_conntrack));
1489
1490         ret = nf_register_sockopt(&so_getorigdst);
1491         if (ret != 0) {
1492                 printk(KERN_ERR "Unable to register netfilter socket option\n");
1493                 return ret;
1494         }
1495
1496         ip_conntrack_hash = alloc_hashtable(ip_conntrack_htable_size,
1497                                             &ip_conntrack_vmalloc);
1498         if (!ip_conntrack_hash) {
1499                 printk(KERN_ERR "Unable to create ip_conntrack_hash\n");
1500                 goto err_unreg_sockopt;
1501         }
1502
1503         ip_conntrack_cachep = kmem_cache_create("ip_conntrack",
1504                                                 sizeof(struct ip_conntrack), 0,
1505                                                 0, NULL, NULL);
1506         if (!ip_conntrack_cachep) {
1507                 printk(KERN_ERR "Unable to create ip_conntrack slab cache\n");
1508                 goto err_free_hash;
1509         }
1510
1511         ip_conntrack_expect_cachep = kmem_cache_create("ip_conntrack_expect",
1512                                         sizeof(struct ip_conntrack_expect),
1513                                         0, 0, NULL, NULL);
1514         if (!ip_conntrack_expect_cachep) {
1515                 printk(KERN_ERR "Unable to create ip_expect slab cache\n");
1516                 goto err_free_conntrack_slab;
1517         }
1518
1519         /* Don't NEED lock here, but good form anyway. */
1520         write_lock_bh(&ip_conntrack_lock);
1521         for (i = 0; i < MAX_IP_CT_PROTO; i++)
1522                 rcu_assign_pointer(ip_ct_protos[i], &ip_conntrack_generic_protocol);
1523         /* Sew in builtin protocols. */
1524         rcu_assign_pointer(ip_ct_protos[IPPROTO_TCP], &ip_conntrack_protocol_tcp);
1525         rcu_assign_pointer(ip_ct_protos[IPPROTO_UDP], &ip_conntrack_protocol_udp);
1526         rcu_assign_pointer(ip_ct_protos[IPPROTO_ICMP], &ip_conntrack_protocol_icmp);
1527         write_unlock_bh(&ip_conntrack_lock);
1528
1529         /* For use by ipt_REJECT */
1530         rcu_assign_pointer(ip_ct_attach, ip_conntrack_attach);
1531
1532         /* Set up fake conntrack:
1533             - to never be deleted, not in any hashes */
1534         atomic_set(&ip_conntrack_untracked.ct_general.use, 1);
1535         /*  - and make it look like a confirmed connection */
1536         set_bit(IPS_CONFIRMED_BIT, &ip_conntrack_untracked.status);
1537
1538         return ret;
1539
1540 err_free_conntrack_slab:
1541         kmem_cache_destroy(ip_conntrack_cachep);
1542 err_free_hash:
1543         free_conntrack_hash(ip_conntrack_hash, ip_conntrack_vmalloc,
1544                             ip_conntrack_htable_size);
1545 err_unreg_sockopt:
1546         nf_unregister_sockopt(&so_getorigdst);
1547
1548         return -ENOMEM;
1549 }