[SK_BUFF]: Introduce ip_hdr(), remove skb->nh.iph
[safe/jmp/linux-2.6] / net / ipv4 / ipvs / ip_vs_xmit.c
1 /*
2  * ip_vs_xmit.c: various packet transmitters for IPVS
3  *
4  * Version:     $Id: ip_vs_xmit.c,v 1.2 2002/11/30 01:50:35 wensong Exp $
5  *
6  * Authors:     Wensong Zhang <wensong@linuxvirtualserver.org>
7  *              Julian Anastasov <ja@ssi.bg>
8  *
9  *              This program is free software; you can redistribute it and/or
10  *              modify it under the terms of the GNU General Public License
11  *              as published by the Free Software Foundation; either version
12  *              2 of the License, or (at your option) any later version.
13  *
14  * Changes:
15  *
16  */
17
18 #include <linux/kernel.h>
19 #include <linux/ip.h>
20 #include <linux/tcp.h>                  /* for tcphdr */
21 #include <net/tcp.h>                    /* for csum_tcpudp_magic */
22 #include <net/udp.h>
23 #include <net/icmp.h>                   /* for icmp_send */
24 #include <net/route.h>                  /* for ip_route_output */
25 #include <linux/netfilter.h>
26 #include <linux/netfilter_ipv4.h>
27
28 #include <net/ip_vs.h>
29
30
31 /*
32  *      Destination cache to speed up outgoing route lookup
33  */
34 static inline void
35 __ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst)
36 {
37         struct dst_entry *old_dst;
38
39         old_dst = dest->dst_cache;
40         dest->dst_cache = dst;
41         dest->dst_rtos = rtos;
42         dst_release(old_dst);
43 }
44
/*
 *      Return the cached route of @dest with a reference held for the
 *      caller, or NULL when there is no cached entry or the entry is
 *      stale.  A stale entry (obsolete, or cached under a different TOS,
 *      and rejected by dst->ops->check()) is removed from the cache and
 *      released.
 */
static inline struct dst_entry *
__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie)
{
        struct dst_entry *dst = dest->dst_cache;

        if (!dst)
                return NULL;
        /* Revalidate only when the entry is obsolete or was looked up
         * with a different TOS; ops->check() returning NULL means the
         * route can no longer be used. */
        if ((dst->obsolete || rtos != dest->dst_rtos) &&
            dst->ops->check(dst, cookie) == NULL) {
                dest->dst_cache = NULL;
                dst_release(dst);
                return NULL;
        }
        dst_hold(dst);          /* reference handed to the caller */
        return dst;
}
61
/*
 *      Look up the route towards the other end of connection @cp.
 *
 *      When the connection is bound to a real server (cp->dest), the
 *      per-destination route cache is consulted and refilled under
 *      dest->dst_lock; otherwise a one-off routing lookup on cp->daddr
 *      is done.  Returns a route with a reference held, or NULL on
 *      routing failure.
 */
static inline struct rtable *
__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos)
{
        struct rtable *rt;                      /* Route to the other host */
        struct ip_vs_dest *dest = cp->dest;

        if (dest) {
                spin_lock(&dest->dst_lock);
                if (!(rt = (struct rtable *)
                      __ip_vs_dst_check(dest, rtos, 0))) {
                        struct flowi fl = {
                                .oif = 0,
                                .nl_u = {
                                        .ip4_u = {
                                                .daddr = dest->addr,
                                                .saddr = 0,
                                                .tos = rtos, } },
                        };

                        if (ip_route_output_key(&rt, &fl)) {
                                spin_unlock(&dest->dst_lock);
                                IP_VS_DBG_RL("ip_route_output error, "
                                             "dest: %u.%u.%u.%u\n",
                                             NIPQUAD(dest->addr));
                                return NULL;
                        }
                        /* Cache the fresh route; dst_clone() takes the
                         * cache's reference so the caller keeps its own. */
                        __ip_vs_dst_set(dest, rtos, dst_clone(&rt->u.dst));
                        IP_VS_DBG(10, "new dst %u.%u.%u.%u, refcnt=%d, rtos=%X\n",
                                  NIPQUAD(dest->addr),
                                  atomic_read(&rt->u.dst.__refcnt), rtos);
                }
                spin_unlock(&dest->dst_lock);
        } else {
                /* No real-server destination attached: uncached lookup
                 * straight to the connection's destination address. */
                struct flowi fl = {
                        .oif = 0,
                        .nl_u = {
                                .ip4_u = {
                                        .daddr = cp->daddr,
                                        .saddr = 0,
                                        .tos = rtos, } },
                };

                if (ip_route_output_key(&rt, &fl)) {
                        IP_VS_DBG_RL("ip_route_output error, dest: "
                                     "%u.%u.%u.%u\n", NIPQUAD(cp->daddr));
                        return NULL;
                }
        }

        return rt;
}
113
114
115 /*
116  *      Release dest->dst_cache before a dest is removed
117  */
118 void
119 ip_vs_dst_reset(struct ip_vs_dest *dest)
120 {
121         struct dst_entry *old_dst;
122
123         old_dst = dest->dst_cache;
124         dest->dst_cache = NULL;
125         dst_release(old_dst);
126 }
127
/*
 *      Transmit the skb through the NF_IP_LOCAL_OUT hook to dst_output()
 *      on the device of route @rt.  Sets skb->ipvs_property (presumably
 *      so later netfilter/IPVS hooks skip this packet -- confirm against
 *      the hook functions) and resets ip_summed since the packet may
 *      have been mangled by the caller.
 */
#define IP_VS_XMIT(skb, rt)                             \
do {                                                    \
        (skb)->ipvs_property = 1;                       \
        (skb)->ip_summed = CHECKSUM_NONE;               \
        NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL,  \
                (rt)->u.dst.dev, dst_output);           \
} while (0)
135
136
137 /*
138  *      NULL transmitter (do nothing except return NF_ACCEPT)
139  */
140 int
141 ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
142                 struct ip_vs_protocol *pp)
143 {
144         /* we do not touch skb and do not need pskb ptr */
145         return NF_ACCEPT;
146 }
147
148
/*
 *      Bypass transmitter
 *      Let packets bypass the destination when the destination is not
 *      available, it may be only used in transparent cache cluster.
 *
 *      Routes the packet to its own original destination (iph->daddr),
 *      not to a real server, and hands it to IP_VS_XMIT.  Returns
 *      NF_STOLEN on every path: the skb is either transmitted or freed.
 */
int
ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                  struct ip_vs_protocol *pp)
{
        struct rtable *rt;                      /* Route to the other host */
        struct iphdr  *iph = ip_hdr(skb);
        u8     tos = iph->tos;
        int    mtu;
        struct flowi fl = {
                .oif = 0,
                .nl_u = {
                        .ip4_u = {
                                .daddr = iph->daddr,
                                .saddr = 0,
                                .tos = RT_TOS(tos), } },
        };

        EnterFunction(10);

        if (ip_route_output_key(&rt, &fl)) {
                IP_VS_DBG_RL("ip_vs_bypass_xmit(): ip_route_output error, "
                             "dest: %u.%u.%u.%u\n", NIPQUAD(iph->daddr));
                goto tx_error_icmp;
        }

        /* MTU checking: with DF set, an oversized packet must bounce back
         * an ICMP "fragmentation needed" instead of being forwarded. */
        mtu = dst_mtu(&rt->u.dst);
        if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
                ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
                IP_VS_DBG_RL("ip_vs_bypass_xmit(): frag needed\n");
                goto tx_error;
        }

        /*
         * Call ip_send_check because we are not sure it is called
         * after ip_defrag. Is copy-on-write needed?
         */
        if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
                /* NF_STOLEN without kfree_skb here: skb_share_check()
                 * presumably consumed the original skb on failure --
                 * verify against its definition. */
                ip_rt_put(rt);
                return NF_STOLEN;
        }
        ip_send_check(ip_hdr(skb));

        /* drop old route */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        IP_VS_XMIT(skb, rt);

        LeaveFunction(10);
        return NF_STOLEN;

 tx_error_icmp:
        dst_link_failure(skb);
 tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
}
217
218
/*
 *      NAT transmitter (only for outside-to-inside nat forwarding)
 *      Not used for related ICMP
 *
 *      Rewrites the destination address to the real server (cp->daddr),
 *      lets the protocol's dnat_handler mangle ports/payload, and sends
 *      the packet via the cached route to the real server.  Returns
 *      NF_STOLEN on every path.
 */
int
ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
               struct ip_vs_protocol *pp)
{
        struct rtable *rt;              /* Route to the other host */
        int mtu;
        struct iphdr *iph = ip_hdr(skb);

        EnterFunction(10);

        /* check if it is a connection of no-client-port */
        if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
                __be16 _pt, *p;
                /* Read the first 16 bits past the IP header (the
                 * transport source port) and record it as the
                 * connection's client port. */
                p = skb_header_pointer(skb, iph->ihl*4, sizeof(_pt), &_pt);
                if (p == NULL)
                        goto tx_error;
                ip_vs_conn_fill_cport(cp, *p);
                IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p));
        }

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
                goto tx_error_icmp;

        /* MTU checking */
        mtu = dst_mtu(&rt->u.dst);
        if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) {
                ip_rt_put(rt);
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
                IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
                goto tx_error;
        }

        /* copy-on-write the packet before mangling it */
        if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
                goto tx_error_put;

        if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
                goto tx_error_put;

        /* drop old route */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /* mangle the packet */
        if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
                goto tx_error;
        /* ip_hdr() is re-read here: the writable/cow/dnat steps above
         * may have replaced the skb, so `iph` can be stale. */
        ip_hdr(skb)->daddr = cp->daddr;
        ip_send_check(ip_hdr(skb));

        IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");

        /* FIXME: when application helper enlarges the packet and the length
           is larger than the MTU of outgoing device, there will be still
           MTU problem. */

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        IP_VS_XMIT(skb, rt);

        LeaveFunction(10);
        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        LeaveFunction(10);
        kfree_skb(skb);
        return NF_STOLEN;
  tx_error_put:
        ip_rt_put(rt);
        goto tx_error;
}
296
297
/*
 *   IP Tunneling transmitter
 *
 *   This function encapsulates the packet in a new IP packet, its
 *   destination will be set to cp->daddr. Most code of this function
 *   is taken from ipip.c.
 *
 *   It is used in VS/TUN cluster. The load balancer selects a real
 *   server from a cluster based on a scheduling algorithm,
 *   encapsulates the request packet and forwards it to the selected
 *   server. For example, all real servers are configured with
 *   "ifconfig tunl0 <Virtual IP Address> up". When the server receives
 *   the encapsulated packet, it will decapsulate the packet, process
 *   the request and return the response packets directly to the client
 *   without passing the load balancer. This can greatly increase the
 *   scalability of virtual server.
 *
 *   Used for ANY protocol
 */
int
ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
                  struct ip_vs_protocol *pp)
{
        struct rtable *rt;                      /* Route to the other host */
        struct net_device *tdev;                /* Device to other host */
        struct iphdr  *old_iph = ip_hdr(skb);
        u8     tos = old_iph->tos;
        __be16 df = old_iph->frag_off;
        struct iphdr  *iph;                     /* Our new IP header */
        int    max_headroom;                    /* The extra header space needed */
        int    mtu;

        EnterFunction(10);

        /* Only IPv4 payloads can be IPIP-encapsulated here */
        if (skb->protocol != htons(ETH_P_IP)) {
                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): protocol error, "
                             "ETH_P_IP: %d, skb protocol: %d\n",
                             htons(ETH_P_IP), skb->protocol);
                goto tx_error;
        }

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos))))
                goto tx_error_icmp;

        tdev = rt->u.dst.dev;

        /* Effective MTU for the inner packet: leave room for the outer
         * IP header.  68 is the minimum IPv4 MTU (RFC 791). */
        mtu = dst_mtu(&rt->u.dst) - sizeof(struct iphdr);
        if (mtu < 68) {
                ip_rt_put(rt);
                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): mtu less than 68\n");
                goto tx_error;
        }
        if (skb->dst)
                skb->dst->ops->update_pmtu(skb->dst, mtu);

        /* NOTE(review): df already holds old_iph->frag_off, so this OR
         * is a no-op; the outer header later inherits the whole inner
         * frag_off.  Kept as-is (behavior-preserving doc pass). */
        df |= (old_iph->frag_off & htons(IP_DF));

        if ((old_iph->frag_off & htons(IP_DF))
            && mtu < ntohs(old_iph->tot_len)) {
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
                ip_rt_put(rt);
                IP_VS_DBG_RL("ip_vs_tunnel_xmit(): frag needed\n");
                goto tx_error;
        }

        /*
         * Okay, now see if we can stuff it in the buffer as-is.
         */
        max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr);

        if (skb_headroom(skb) < max_headroom
            || skb_cloned(skb) || skb_shared(skb)) {
                struct sk_buff *new_skb =
                        skb_realloc_headroom(skb, max_headroom);
                if (!new_skb) {
                        ip_rt_put(rt);
                        kfree_skb(skb);
                        IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
                        return NF_STOLEN;
                }
                kfree_skb(skb);
                skb = new_skb;
                old_iph = ip_hdr(skb);
        }

        /* The inner IP header becomes the transport header of the
         * encapsulated packet */
        skb->h.raw = (void *) old_iph;

        /* fix old IP header checksum */
        ip_send_check(old_iph);

        skb_push(skb, sizeof(struct iphdr));
        skb_reset_network_header(skb);
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));

        /* drop old route */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /*
         *      Push down and install the IPIP header.
         */
        iph                     =       ip_hdr(skb);
        iph->version            =       4;
        iph->ihl                =       sizeof(struct iphdr)>>2;
        iph->frag_off           =       df;
        iph->protocol           =       IPPROTO_IPIP;
        iph->tos                =       tos;
        iph->daddr              =       rt->rt_dst;
        iph->saddr              =       rt->rt_src;
        iph->ttl                =       old_iph->ttl;
        iph->tot_len            =       htons(skb->len);
        ip_select_ident(iph, &rt->u.dst, NULL);
        ip_send_check(iph);

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        IP_VS_XMIT(skb, rt);

        LeaveFunction(10);

        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
}
428
429
/*
 *      Direct Routing transmitter
 *      Used for ANY protocol
 *
 *      Sends the packet unmodified (addresses untouched) out through the
 *      route to the real server; the real server must accept the virtual
 *      IP locally.  Returns NF_STOLEN on every path.
 */
int
ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
              struct ip_vs_protocol *pp)
{
        struct rtable *rt;                      /* Route to the other host */
        struct iphdr  *iph = ip_hdr(skb);
        int    mtu;

        EnterFunction(10);

        if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
                goto tx_error_icmp;

        /* MTU checking */
        mtu = dst_mtu(&rt->u.dst);
        if ((iph->frag_off & htons(IP_DF)) && skb->len > mtu) {
                icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
                ip_rt_put(rt);
                IP_VS_DBG_RL("ip_vs_dr_xmit(): frag needed\n");
                goto tx_error;
        }

        /*
         * Call ip_send_check because we are not sure it is called
         * after ip_defrag. Is copy-on-write needed?
         */
        if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
                /* NF_STOLEN without kfree_skb here: skb_share_check()
                 * presumably consumed the original skb on failure --
                 * verify against its definition. */
                ip_rt_put(rt);
                return NF_STOLEN;
        }
        ip_send_check(ip_hdr(skb));

        /* drop old route */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        IP_VS_XMIT(skb, rt);

        LeaveFunction(10);
        return NF_STOLEN;

  tx_error_icmp:
        dst_link_failure(skb);
  tx_error:
        kfree_skb(skb);
        LeaveFunction(10);
        return NF_STOLEN;
}
485
486
487 /*
488  *      ICMP packet transmitter
489  *      called by the ip_vs_in_icmp
490  */
491 int
492 ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
493                 struct ip_vs_protocol *pp, int offset)
494 {
495         struct rtable   *rt;    /* Route to the other host */
496         int mtu;
497         int rc;
498
499         EnterFunction(10);
500
501         /* The ICMP packet for VS/TUN, VS/DR and LOCALNODE will be
502            forwarded directly here, because there is no need to
503            translate address/port back */
504         if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) {
505                 if (cp->packet_xmit)
506                         rc = cp->packet_xmit(skb, cp, pp);
507                 else
508                         rc = NF_ACCEPT;
509                 /* do not touch skb anymore */
510                 atomic_inc(&cp->in_pkts);
511                 goto out;
512         }
513
514         /*
515          * mangle and send the packet here (only for VS/NAT)
516          */
517
518         if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos))))
519                 goto tx_error_icmp;
520
521         /* MTU checking */
522         mtu = dst_mtu(&rt->u.dst);
523         if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) {
524                 ip_rt_put(rt);
525                 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
526                 IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
527                 goto tx_error;
528         }
529
530         /* copy-on-write the packet before mangling it */
531         if (!ip_vs_make_skb_writable(&skb, offset))
532                 goto tx_error_put;
533
534         if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
535                 goto tx_error_put;
536
537         /* drop the old route when skb is not shared */
538         dst_release(skb->dst);
539         skb->dst = &rt->u.dst;
540
541         ip_vs_nat_icmp(skb, pp, cp, 0);
542
543         /* Another hack: avoid icmp_send in ip_fragment */
544         skb->local_df = 1;
545
546         IP_VS_XMIT(skb, rt);
547
548         rc = NF_STOLEN;
549         goto out;
550
551   tx_error_icmp:
552         dst_link_failure(skb);
553   tx_error:
554         dev_kfree_skb(skb);
555         rc = NF_STOLEN;
556   out:
557         LeaveFunction(10);
558         return rc;
559   tx_error_put:
560         ip_rt_put(rt);
561         goto tx_error;
562 }