net/core/netfilter.c
/* netfilter.c: look after the filters for various protocols.
 * Heavily influenced by the old firewall.c by David Bonn and Alan Cox.
 *
 * Thanks to Rob `CmdrTaco' Malda for not influencing this code in any
 * way.
 *
 * Rusty Russell (C)2000 -- This code is GPL.
 *
 * February 2000: Modified by James Morris to have 1 queue per protocol.
 * 15-Mar-2000:   Added NF_REPEAT --RR.
 * 08-May-2003:   Internal logging interface added by Jozsef Kadlecsik.
 */
#include <linux/config.h>
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <net/protocol.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/wait.h>
#include <linux/module.h>
#include <linux/interrupt.h>
#include <linux/if.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/icmp.h>
#include <net/sock.h>
#include <net/route.h>
#include <linux/ip.h>

/* In this code, we can be waiting indefinitely for userspace to
 * service a packet if a hook returns NF_QUEUE.  We could keep a count
 * of skbuffs queued for userspace, and not deregister a hook unless
 * this is zero, but that sucks.  Now, we simply check when the
 * packets come back: if the hook is gone, the packet is discarded. */
#ifdef CONFIG_NETFILTER_DEBUG
#define NFDEBUG(format, args...)  printk(format , ## args)
#else
#define NFDEBUG(format, args...)
#endif

/* Sockopts only registered and called from user context, so
   net locking would be overkill.  Also, [gs]etsockopt calls may
   sleep. */
static DECLARE_MUTEX(nf_sockopt_mutex);

struct list_head nf_hooks[NPROTO][NF_MAX_HOOKS];
static LIST_HEAD(nf_sockopts);
static DEFINE_SPINLOCK(nf_hook_lock);

/*
 * A queue handler may be registered for each protocol.  Each is protected by
 * a long-term mutex.  The handler must provide an outfn() to accept packets
 * for queueing and must reinject all packets it receives, no matter what.
 */
static struct nf_queue_handler_t {
        nf_queue_outfn_t outfn;
        void *data;
} queue_handler[NPROTO];
static DEFINE_RWLOCK(queue_handler_lock);

int nf_register_hook(struct nf_hook_ops *reg)
{
        struct list_head *i;

        spin_lock_bh(&nf_hook_lock);
        list_for_each(i, &nf_hooks[reg->pf][reg->hooknum]) {
                if (reg->priority < ((struct nf_hook_ops *)i)->priority)
                        break;
        }
        list_add_rcu(&reg->list, i->prev);
        spin_unlock_bh(&nf_hook_lock);

        synchronize_net();
        return 0;
}

void nf_unregister_hook(struct nf_hook_ops *reg)
{
        spin_lock_bh(&nf_hook_lock);
        list_del_rcu(&reg->list);
        spin_unlock_bh(&nf_hook_lock);

        synchronize_net();
}
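
/*
 * Example (illustrative sketch, not part of this file): a minimal hook
 * registration.  The names example_hook/example_ops are hypothetical, and
 * the NF_IP_* constants assume <linux/netfilter_ipv4.h>; the nf_hookfn
 * signature and nf_hook_ops fields are those declared in <linux/netfilter.h>
 * and used by the code above (hook, owner, pf, hooknum, priority).
 */
static unsigned int example_hook(unsigned int hooknum,
                                 struct sk_buff **pskb,
                                 const struct net_device *in,
                                 const struct net_device *out,
                                 int (*okfn)(struct sk_buff *))
{
        return NF_ACCEPT;       /* let every packet continue down the chain */
}

static struct nf_hook_ops example_ops = {
        .hook           = example_hook,
        .owner          = THIS_MODULE,
        .pf             = PF_INET,
        .hooknum        = NF_IP_PRE_ROUTING,
        .priority       = NF_IP_PRI_FILTER,
};

/* A module would call nf_register_hook(&example_ops) from its init routine
 * and nf_unregister_hook(&example_ops) from its exit routine; hooks on the
 * same (pf, hooknum) chain run in ascending .priority order. */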

/* Do exclusive ranges overlap? */
static inline int overlap(int min1, int max1, int min2, int max2)
{
        return max1 > min2 && min1 < max2;
}
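
/* Worked example for overlap() above: with exclusive upper bounds, the
 * ranges [64,66) and [65,68) share the value 65, so
 * overlap(64, 66, 65, 68) == 1, while [64,65) and [65,68) touch without
 * sharing a value, so overlap(64, 65, 65, 68) == 0. */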

/* Functions to register sockopt ranges (exclusive). */
int nf_register_sockopt(struct nf_sockopt_ops *reg)
{
        struct list_head *i;
        int ret = 0;

        if (down_interruptible(&nf_sockopt_mutex) != 0)
                return -EINTR;

        list_for_each(i, &nf_sockopts) {
                struct nf_sockopt_ops *ops = (struct nf_sockopt_ops *)i;
                if (ops->pf == reg->pf
                    && (overlap(ops->set_optmin, ops->set_optmax,
                                reg->set_optmin, reg->set_optmax)
                        || overlap(ops->get_optmin, ops->get_optmax,
                                   reg->get_optmin, reg->get_optmax))) {
                        NFDEBUG("nf_sock overlap: %u-%u/%u-%u v %u-%u/%u-%u\n",
                                ops->set_optmin, ops->set_optmax,
                                ops->get_optmin, ops->get_optmax,
                                reg->set_optmin, reg->set_optmax,
                                reg->get_optmin, reg->get_optmax);
                        ret = -EBUSY;
                        goto out;
                }
        }

        list_add(&reg->list, &nf_sockopts);
out:
        up(&nf_sockopt_mutex);
        return ret;
}

void nf_unregister_sockopt(struct nf_sockopt_ops *reg)
{
        /* No point being interruptible: we're probably in cleanup_module() */
 restart:
        down(&nf_sockopt_mutex);
        if (reg->use != 0) {
                /* To be woken by nf_sockopt call... */
                /* FIXME: Stuart Young's name appears gratuitously. */
                set_current_state(TASK_UNINTERRUPTIBLE);
                reg->cleanup_task = current;
                up(&nf_sockopt_mutex);
                schedule();
                goto restart;
        }
        list_del(&reg->list);
        up(&nf_sockopt_mutex);
}
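
/*
 * Example (illustrative sketch, not part of this file): claiming an exclusive
 * [set_optmin, set_optmax) / [get_optmin, get_optmax) window for PF_INET.
 * All names here (example_set/example_get/EXAMPLE_SOCKOPT_BASE) are
 * hypothetical, and the callback signatures are assumed from the
 * nf_sockopt_ops declaration in <linux/netfilter.h> for this kernel.
 */
#define EXAMPLE_SOCKOPT_BASE    96      /* hypothetical option number */

static int example_set(struct sock *sk, int optval, void __user *user,
                       unsigned int len)
{
        return 0;       /* accept and ignore the option data */
}

static int example_get(struct sock *sk, int optval, void __user *user,
                       int *len)
{
        *len = 0;       /* report an empty result */
        return 0;
}

static struct nf_sockopt_ops example_sockopts = {
        .pf             = PF_INET,
        .set_optmin     = EXAMPLE_SOCKOPT_BASE,
        .set_optmax     = EXAMPLE_SOCKOPT_BASE + 1,     /* exclusive bound */
        .set            = example_set,
        .get_optmin     = EXAMPLE_SOCKOPT_BASE,
        .get_optmax     = EXAMPLE_SOCKOPT_BASE + 1,
        .get            = example_get,
};

/* nf_register_sockopt(&example_sockopts) in module init (fails with -EBUSY if
 * the range overlaps an existing one); nf_unregister_sockopt() in module exit. */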

/* Call get/setsockopt() */
static int nf_sockopt(struct sock *sk, int pf, int val,
                      char __user *opt, int *len, int get)
{
        struct list_head *i;
        struct nf_sockopt_ops *ops;
        int ret;

        if (down_interruptible(&nf_sockopt_mutex) != 0)
                return -EINTR;

        list_for_each(i, &nf_sockopts) {
                ops = (struct nf_sockopt_ops *)i;
                if (ops->pf == pf) {
                        if (get) {
                                if (val >= ops->get_optmin
                                    && val < ops->get_optmax) {
                                        ops->use++;
                                        up(&nf_sockopt_mutex);
                                        ret = ops->get(sk, val, opt, len);
                                        goto out;
                                }
                        } else {
                                if (val >= ops->set_optmin
                                    && val < ops->set_optmax) {
                                        ops->use++;
                                        up(&nf_sockopt_mutex);
                                        ret = ops->set(sk, val, opt, *len);
                                        goto out;
                                }
                        }
                }
        }
        up(&nf_sockopt_mutex);
        return -ENOPROTOOPT;

 out:
        down(&nf_sockopt_mutex);
        ops->use--;
        if (ops->cleanup_task)
                wake_up_process(ops->cleanup_task);
        up(&nf_sockopt_mutex);
        return ret;
}

int nf_setsockopt(struct sock *sk, int pf, int val, char __user *opt,
                  int len)
{
        return nf_sockopt(sk, pf, val, opt, &len, 0);
}

int nf_getsockopt(struct sock *sk, int pf, int val, char __user *opt, int *len)
{
        return nf_sockopt(sk, pf, val, opt, len, 1);
}

static unsigned int nf_iterate(struct list_head *head,
                               struct sk_buff **skb,
                               int hook,
                               const struct net_device *indev,
                               const struct net_device *outdev,
                               struct list_head **i,
                               int (*okfn)(struct sk_buff *),
                               int hook_thresh)
{
        unsigned int verdict;

        /*
         * The caller must not block between calls to this
         * function because of risk of continuing from deleted element.
         */
        list_for_each_continue_rcu(*i, head) {
                struct nf_hook_ops *elem = (struct nf_hook_ops *)*i;

                if (hook_thresh > elem->priority)
                        continue;

                /* Optimization: we don't need to hold module
                   reference here, since function can't sleep. --RR */
                verdict = elem->hook(hook, skb, indev, outdev, okfn);
                if (verdict != NF_ACCEPT) {
#ifdef CONFIG_NETFILTER_DEBUG
                        if (unlikely(verdict > NF_MAX_VERDICT)) {
                                NFDEBUG("Evil return from %p(%u).\n",
                                        elem->hook, hook);
                                continue;
                        }
#endif
                        if (verdict != NF_REPEAT)
                                return verdict;
                        *i = (*i)->prev;
                }
        }
        return NF_ACCEPT;
}

int nf_register_queue_handler(int pf, nf_queue_outfn_t outfn, void *data)
{
        int ret;

        write_lock_bh(&queue_handler_lock);
        if (queue_handler[pf].outfn)
                ret = -EBUSY;
        else {
                queue_handler[pf].outfn = outfn;
                queue_handler[pf].data = data;
                ret = 0;
        }
        write_unlock_bh(&queue_handler_lock);

        return ret;
}

/* The caller must flush their queue before this */
int nf_unregister_queue_handler(int pf)
{
        write_lock_bh(&queue_handler_lock);
        queue_handler[pf].outfn = NULL;
        queue_handler[pf].data = NULL;
        write_unlock_bh(&queue_handler_lock);

        return 0;
}
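
/*
 * Example (illustrative sketch, not part of this file): the smallest queue
 * handler that still honours the contract above.  A real handler such as
 * ip_queue ships the packet to userspace and calls nf_reinject() later from
 * process context; this hypothetical one re-injects immediately with
 * NF_ACCEPT.
 */
static int example_outfn(struct sk_buff *skb, struct nf_info *info, void *data)
{
        nf_reinject(skb, info, NF_ACCEPT);      /* every queued packet must come back */
        return 0;       /* a negative return makes nf_queue() free skb and info */
}

/* nf_register_queue_handler(PF_INET, example_outfn, NULL) claims the IPv4
 * slot (-EBUSY if already taken); nf_unregister_queue_handler(PF_INET)
 * releases it after the handler's queue has been flushed. */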

/*
 * Any packet that leaves via this function must come back
 * through nf_reinject().
 */
static int nf_queue(struct sk_buff *skb,
                    struct list_head *elem,
                    int pf, unsigned int hook,
                    struct net_device *indev,
                    struct net_device *outdev,
                    int (*okfn)(struct sk_buff *))
{
        int status;
        struct nf_info *info;
#ifdef CONFIG_BRIDGE_NETFILTER
        struct net_device *physindev = NULL;
        struct net_device *physoutdev = NULL;
#endif

        /* QUEUE == DROP if no one is waiting, to be safe. */
        read_lock(&queue_handler_lock);
        if (!queue_handler[pf].outfn) {
                read_unlock(&queue_handler_lock);
                kfree_skb(skb);
                return 1;
        }

        info = kmalloc(sizeof(*info), GFP_ATOMIC);
        if (!info) {
                if (net_ratelimit())
                        printk(KERN_ERR "OOM queueing packet %p\n",
                               skb);
                read_unlock(&queue_handler_lock);
                kfree_skb(skb);
                return 1;
        }

        *info = (struct nf_info) {
                (struct nf_hook_ops *)elem, pf, hook, indev, outdev, okfn };

        /* If it's going away, ignore hook. */
        if (!try_module_get(info->elem->owner)) {
                read_unlock(&queue_handler_lock);
                kfree(info);
                return 0;
        }

        /* Bump dev refs so they don't vanish while packet is out */
        if (indev) dev_hold(indev);
        if (outdev) dev_hold(outdev);

#ifdef CONFIG_BRIDGE_NETFILTER
        if (skb->nf_bridge) {
                physindev = skb->nf_bridge->physindev;
                if (physindev) dev_hold(physindev);
                physoutdev = skb->nf_bridge->physoutdev;
                if (physoutdev) dev_hold(physoutdev);
        }
#endif

        status = queue_handler[pf].outfn(skb, info, queue_handler[pf].data);
        read_unlock(&queue_handler_lock);

        if (status < 0) {
                /* Queueing failed: drop the references taken above. */
                if (indev) dev_put(indev);
                if (outdev) dev_put(outdev);
#ifdef CONFIG_BRIDGE_NETFILTER
                if (physindev) dev_put(physindev);
                if (physoutdev) dev_put(physoutdev);
#endif
                module_put(info->elem->owner);
                kfree(info);
                kfree_skb(skb);
                return 1;
        }
        return 1;
}

/* Returns 1 if okfn() needs to be executed by the caller,
 * -EPERM for NF_DROP, 0 otherwise. */
int nf_hook_slow(int pf, unsigned int hook, struct sk_buff **pskb,
                 struct net_device *indev,
                 struct net_device *outdev,
                 int (*okfn)(struct sk_buff *),
                 int hook_thresh)
{
        struct list_head *elem;
        unsigned int verdict;
        int ret = 0;

        /* We may already have this, but read-locks nest anyway */
        rcu_read_lock();

        elem = &nf_hooks[pf][hook];
next_hook:
        verdict = nf_iterate(&nf_hooks[pf][hook], pskb, hook, indev,
                             outdev, &elem, okfn, hook_thresh);
        if (verdict == NF_ACCEPT || verdict == NF_STOP) {
                ret = 1;
                goto unlock;
        } else if (verdict == NF_DROP) {
                kfree_skb(*pskb);
                ret = -EPERM;
        } else if (verdict == NF_QUEUE) {
                NFDEBUG("nf_hook: Verdict = QUEUE.\n");
                if (!nf_queue(*pskb, elem, pf, hook, indev, outdev, okfn))
                        goto next_hook;
        }
unlock:
        rcu_read_unlock();
        return ret;
}
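
/*
 * Example (illustrative sketch, not part of this file): how a caller uses the
 * return convention documented above.  This is roughly what the NF_HOOK()
 * wrapper in <linux/netfilter.h> does; example_okfn/example_deliver are
 * hypothetical, and NF_IP_PRE_ROUTING assumes <linux/netfilter_ipv4.h>.
 */
static int example_okfn(struct sk_buff *skb)
{
        kfree_skb(skb);         /* stand-in continuation: just consume the skb */
        return 0;
}

static int example_deliver(struct sk_buff *skb, struct net_device *dev)
{
        int ret;

        ret = nf_hook_slow(PF_INET, NF_IP_PRE_ROUTING, &skb, dev, NULL,
                           example_okfn, INT_MIN);
        if (ret == 1)           /* every hook said NF_ACCEPT */
                ret = example_okfn(skb);
        return ret;             /* 0: queued or stolen, -EPERM: dropped */
}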

void nf_reinject(struct sk_buff *skb, struct nf_info *info,
                 unsigned int verdict)
{
        struct list_head *elem = &info->elem->list;
        struct list_head *i;

        rcu_read_lock();

        /* Release those devices we held, or Alexey will kill me. */
        if (info->indev) dev_put(info->indev);
        if (info->outdev) dev_put(info->outdev);
#ifdef CONFIG_BRIDGE_NETFILTER
        if (skb->nf_bridge) {
                if (skb->nf_bridge->physindev)
                        dev_put(skb->nf_bridge->physindev);
                if (skb->nf_bridge->physoutdev)
                        dev_put(skb->nf_bridge->physoutdev);
        }
#endif

        /* Drop reference to owner of hook which queued us. */
        module_put(info->elem->owner);

        list_for_each_rcu(i, &nf_hooks[info->pf][info->hook]) {
                if (i == elem)
                        break;
        }

        if (elem == &nf_hooks[info->pf][info->hook]) {
                /* The module which sent it to userspace is gone. */
                NFDEBUG("%s: module disappeared, dropping packet.\n",
                        __FUNCTION__);
                verdict = NF_DROP;
        }

        /* Continue traversal iff userspace said ok... */
        if (verdict == NF_REPEAT) {
                elem = elem->prev;
                verdict = NF_ACCEPT;
        }

        if (verdict == NF_ACCEPT) {
        next_hook:
                verdict = nf_iterate(&nf_hooks[info->pf][info->hook],
                                     &skb, info->hook,
                                     info->indev, info->outdev, &elem,
                                     info->okfn, INT_MIN);
        }

        switch (verdict) {
        case NF_ACCEPT:
                info->okfn(skb);
                break;

        case NF_QUEUE:
                if (!nf_queue(skb, elem, info->pf, info->hook,
                              info->indev, info->outdev, info->okfn))
                        goto next_hook;
                break;
        }
        rcu_read_unlock();

        if (verdict == NF_DROP)
                kfree_skb(skb);

        kfree(info);
        return;
}

#ifdef CONFIG_INET
/* route_me_harder function, used by iptable_nat, iptable_mangle + ip_queue */
int ip_route_me_harder(struct sk_buff **pskb)
{
        struct iphdr *iph = (*pskb)->nh.iph;
        struct rtable *rt;
        struct flowi fl = {};
        struct dst_entry *odst;
        unsigned int hh_len;

        /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause
         * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook.
         */
        if (inet_addr_type(iph->saddr) == RTN_LOCAL) {
                fl.nl_u.ip4_u.daddr = iph->daddr;
                fl.nl_u.ip4_u.saddr = iph->saddr;
                fl.nl_u.ip4_u.tos = RT_TOS(iph->tos);
                fl.oif = (*pskb)->sk ? (*pskb)->sk->sk_bound_dev_if : 0;
#ifdef CONFIG_IP_ROUTE_FWMARK
                fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark;
#endif
                fl.proto = iph->protocol;
                if (ip_route_output_key(&rt, &fl) != 0)
                        return -1;

                /* Drop old route. */
                dst_release((*pskb)->dst);
                (*pskb)->dst = &rt->u.dst;
        } else {
                /* non-local src, find valid iif to satisfy
                 * rp-filter when calling ip_route_input. */
                fl.nl_u.ip4_u.daddr = iph->saddr;
                if (ip_route_output_key(&rt, &fl) != 0)
                        return -1;

                odst = (*pskb)->dst;
                if (ip_route_input(*pskb, iph->daddr, iph->saddr,
                                   RT_TOS(iph->tos), rt->u.dst.dev) != 0) {
                        dst_release(&rt->u.dst);
                        return -1;
                }
                dst_release(&rt->u.dst);
                dst_release(odst);
        }

        if ((*pskb)->dst->error)
                return -1;

        /* Change in oif may mean change in hh_len. */
        hh_len = (*pskb)->dst->dev->hard_header_len;
        if (skb_headroom(*pskb) < hh_len) {
                struct sk_buff *nskb;

                nskb = skb_realloc_headroom(*pskb, hh_len);
                if (!nskb)
                        return -1;
                if ((*pskb)->sk)
                        skb_set_owner_w(nskb, (*pskb)->sk);
                kfree_skb(*pskb);
                *pskb = nskb;
        }

        return 0;
}
EXPORT_SYMBOL(ip_route_me_harder);
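
/*
 * Example (illustrative sketch, not part of this file): code that changes a
 * field the routing decision depends on, such as the nfmark used by
 * fwmark-based routing, must re-run routing so skb->dst matches the new
 * value; iptable_mangle does this on LOCAL_OUT.  example_reroute_for_mark and
 * its mark argument are hypothetical.
 */
static unsigned int example_reroute_for_mark(struct sk_buff **pskb,
                                             unsigned long mark)
{
        if ((*pskb)->nfmark != mark) {
                (*pskb)->nfmark = mark;         /* may select a different route */
                if (ip_route_me_harder(pskb) != 0)
                        return NF_DROP;
        }
        return NF_ACCEPT;
}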

int skb_ip_make_writable(struct sk_buff **pskb, unsigned int writable_len)
{
        struct sk_buff *nskb;

        if (writable_len > (*pskb)->len)
                return 0;

        /* Not exclusive use of packet?  Must copy. */
        if (skb_shared(*pskb) || skb_cloned(*pskb))
                goto copy_skb;

        return pskb_may_pull(*pskb, writable_len);

copy_skb:
        nskb = skb_copy(*pskb, GFP_ATOMIC);
        if (!nskb)
                return 0;
        BUG_ON(skb_is_nonlinear(nskb));

        /* Rest of kernel will get very unhappy if we pass it a
           suddenly-orphaned skbuff */
        if ((*pskb)->sk)
                skb_set_owner_w(nskb, (*pskb)->sk);
        kfree_skb(*pskb);
        *pskb = nskb;
        return 1;
}
EXPORT_SYMBOL(skb_ip_make_writable);
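
/*
 * Example (illustrative sketch, not part of this file): before modifying
 * headers in place, a hook must own a private, linear copy of the bytes it
 * touches.  example_clear_urg is hypothetical, assumes the caller already
 * verified the packet carries TCP, and omits the TCP checksum update a real
 * target would have to perform.
 */
static int example_clear_urg(struct sk_buff **pskb)
{
        struct iphdr *iph;
        struct tcphdr *tcph;

        if (!skb_ip_make_writable(pskb, (*pskb)->nh.iph->ihl * 4 +
                                  sizeof(struct tcphdr)))
                return 0;

        iph = (*pskb)->nh.iph;  /* re-read: the skb may have been replaced */
        tcph = (struct tcphdr *)((u32 *)iph + iph->ihl);
        tcph->urg = 0;
        tcph->urg_ptr = 0;      /* checksum update omitted in this sketch */
        return 1;
}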
#endif /*CONFIG_INET*/

/* Internal logging interface, which relies on the real
   LOG target modules */

#define NF_LOG_PREFIXLEN                128

static nf_logfn *nf_logging[NPROTO]; /* = NULL */
static int reported = 0;
static DEFINE_SPINLOCK(nf_log_lock);

int nf_log_register(int pf, nf_logfn *logfn)
{
        int ret = -EBUSY;

        /* Any setup of logging members must be done before
         * substituting pointer. */
        spin_lock(&nf_log_lock);
        if (!nf_logging[pf]) {
                rcu_assign_pointer(nf_logging[pf], logfn);
                ret = 0;
        }
        spin_unlock(&nf_log_lock);
        return ret;
}

void nf_log_unregister(int pf, nf_logfn *logfn)
{
        spin_lock(&nf_log_lock);
        if (nf_logging[pf] == logfn)
                nf_logging[pf] = NULL;
        spin_unlock(&nf_log_lock);

        /* Give time to concurrent readers. */
        synchronize_net();
}

void nf_log_packet(int pf,
                   unsigned int hooknum,
                   const struct sk_buff *skb,
                   const struct net_device *in,
                   const struct net_device *out,
                   const char *fmt, ...)
{
        va_list args;
        char prefix[NF_LOG_PREFIXLEN];
        nf_logfn *logfn;

        rcu_read_lock();
        logfn = rcu_dereference(nf_logging[pf]);
        if (logfn) {
                va_start(args, fmt);
                vsnprintf(prefix, sizeof(prefix), fmt, args);
                va_end(args);
                /* We must read logging before nf_logfn[pf] */
                logfn(hooknum, skb, in, out, prefix);
        } else if (!reported) {
                printk(KERN_WARNING "nf_log_packet: can\'t log yet, "
                       "no backend logging module loaded in!\n");
                reported++;
        }
        rcu_read_unlock();
}
EXPORT_SYMBOL(nf_log_register);
EXPORT_SYMBOL(nf_log_unregister);
EXPORT_SYMBOL(nf_log_packet);
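
/*
 * Example (illustrative sketch, not part of this file): how the two halves of
 * the logging interface meet.  A backend such as ipt_LOG registers a
 * per-protocol nf_logfn; any hook can then emit a message through
 * nf_log_packet() without caring which backend is loaded.  example_logfn and
 * the prefix text are hypothetical.
 */
static void example_logfn(unsigned int hooknum, const struct sk_buff *skb,
                          const struct net_device *in,
                          const struct net_device *out, const char *prefix)
{
        printk(KERN_INFO "%spacket on hook %u (in=%s out=%s)\n",
               prefix, hooknum, in ? in->name : "?", out ? out->name : "?");
}

/* Backend side:  nf_log_register(PF_INET, example_logfn);
 * Hook side:     nf_log_packet(PF_INET, hooknum, skb, in, out,
 *                              "example-match: ");                    */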

/* This does not belong here, but locally generated errors need it if
   connection tracking is in use: without this, the connection may not be in
   the hash table, and hence manufactured ICMP or RST packets will not be
   associated with it. */
void (*ip_ct_attach)(struct sk_buff *, struct sk_buff *);

void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb)
{
        void (*attach)(struct sk_buff *, struct sk_buff *);

        if (skb->nfct && (attach = ip_ct_attach) != NULL) {
                mb(); /* Just to be sure: must be read before executing this */
                attach(new, skb);
        }
}
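
/*
 * Typical (hypothetical) call site: code such as ipt_REJECT that builds a
 * reply packet nskb on behalf of an original oldskb attaches the conntrack
 * association before handing nskb to the output path:
 *
 *      nf_ct_attach(nskb, oldskb);
 */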

void __init netfilter_init(void)
{
        int i, h;

        for (i = 0; i < NPROTO; i++) {
                for (h = 0; h < NF_MAX_HOOKS; h++)
                        INIT_LIST_HEAD(&nf_hooks[i][h]);
        }
}

EXPORT_SYMBOL(ip_ct_attach);
EXPORT_SYMBOL(nf_ct_attach);
EXPORT_SYMBOL(nf_getsockopt);
EXPORT_SYMBOL(nf_hook_slow);
EXPORT_SYMBOL(nf_hooks);
EXPORT_SYMBOL(nf_register_hook);
EXPORT_SYMBOL(nf_register_queue_handler);
EXPORT_SYMBOL(nf_register_sockopt);
EXPORT_SYMBOL(nf_reinject);
EXPORT_SYMBOL(nf_setsockopt);
EXPORT_SYMBOL(nf_unregister_hook);
EXPORT_SYMBOL(nf_unregister_queue_handler);
EXPORT_SYMBOL(nf_unregister_sockopt);