#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
-EXPORT_SYMBOL(ip_vs_make_skb_writable);
/* ID used in ICMP lookups */
}
-int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len)
-{
- struct sk_buff *skb = *pskb;
-
- /* skb is already used, better copy skb and its payload */
- if (unlikely(skb_shared(skb) || skb->sk))
- goto copy_skb;
-
- /* skb data is already used, copy it */
- if (unlikely(skb_cloned(skb)))
- goto copy_data;
-
- return pskb_may_pull(skb, writable_len);
-
- copy_data:
- if (unlikely(writable_len > skb->len))
- return 0;
- return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
-
- copy_skb:
- if (unlikely(writable_len > skb->len))
- return 0;
- skb = skb_copy(skb, GFP_ATOMIC);
- if (!skb)
- return 0;
- BUG_ON(skb_is_nonlinear(skb));
-
- /* Rest of kernel will get very unhappy if we pass it a
- suddenly-orphaned skbuff */
- if ((*pskb)->sk)
- skb_set_owner_w(skb, (*pskb)->sk);
- kfree_skb(*pskb);
- *pskb = skb;
- return 1;
-}
-
/*
* IPVS persistent scheduling function
* It creates a connection entry according to its template if exists,
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc,
const struct sk_buff *skb,
- __u16 ports[2])
+ __be16 ports[2])
{
struct ip_vs_conn *cp = NULL;
- struct iphdr *iph = skb->nh.iph;
+ struct iphdr *iph = ip_hdr(skb);
struct ip_vs_dest *dest;
struct ip_vs_conn *ct;
- __u16 dport; /* destination port to forward */
- __u32 snet; /* source network of the client, after masking */
+ __be16 dport; /* destination port to forward */
+ __be32 snet; /* source network of the client, after masking */
/* Mask saddr with the netmask to adjust template granularity */
snet = iph->saddr & svc->netmask;
ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_conn *cp = NULL;
- struct iphdr *iph = skb->nh.iph;
+ struct iphdr *iph = ip_hdr(skb);
struct ip_vs_dest *dest;
- __u16 _ports[2], *pptr;
+ __be16 _ports[2], *pptr;
pptr = skb_header_pointer(skb, iph->ihl*4,
sizeof(_ports), _ports);
return NULL;
IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u "
- "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n",
+ "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n",
ip_vs_fwd_tag(cp),
NIPQUAD(cp->caddr), ntohs(cp->cport),
NIPQUAD(cp->vaddr), ntohs(cp->vport),
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_protocol *pp)
{
- __u16 _ports[2], *pptr;
- struct iphdr *iph = skb->nh.iph;
+ __be16 _ports[2], *pptr;
+ struct iphdr *iph = ip_hdr(skb);
pptr = skb_header_pointer(skb, iph->ihl*4,
sizeof(_ports), _ports);
and the destination is RTN_UNICAST (and not local), then create
a cache_bypass connection entry */
if (sysctl_ip_vs_cache_bypass && svc->fwmark
- && (inet_addr_type(iph->daddr) == RTN_UNICAST)) {
+ && (inet_addr_type(&init_net, iph->daddr) == RTN_UNICAST)) {
int ret, cs;
struct ip_vs_conn *cp;
/*
- * It is hooked before NF_IP_PRI_NAT_SRC at the NF_IP_POST_ROUTING
+ * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING
* chain, and is used for VS/NAT.
* It detects packets for VS/NAT connections and sends the packets
* immediately. This can avoid that iptable_nat mangles the packets
* for VS/NAT.
*/
static unsigned int ip_vs_post_routing(unsigned int hooknum,
- struct sk_buff **pskb,
+ struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- if (!((*pskb)->ipvs_property))
+ if (!skb->ipvs_property)
return NF_ACCEPT;
-
/* The packet was sent from IPVS, exit this chain */
- (*okfn)(*pskb);
-
- return NF_STOLEN;
+ return NF_STOP;
}
-u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
+__sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
{
- return (u16) csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
+ return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
}
-static inline struct sk_buff *
-ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
+static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user)
{
- skb = ip_defrag(skb, user);
- if (skb)
- ip_send_check(skb->nh.iph);
- return skb;
+ int err = ip_defrag(skb, user);
+
+ if (!err)
+ ip_send_check(ip_hdr(skb));
+
+ return err;
}
/*
void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, int inout)
{
- struct iphdr *iph = skb->nh.iph;
+ struct iphdr *iph = ip_hdr(skb);
unsigned int icmp_offset = iph->ihl*4;
- struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset);
+ struct icmphdr *icmph = (struct icmphdr *)(skb_network_header(skb) +
+ icmp_offset);
struct iphdr *ciph = (struct iphdr *)(icmph + 1);
if (inout) {
/* the TCP/UDP port */
if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
- __u16 *ports = (void *)ciph + ciph->ihl*4;
+ __be16 *ports = (void *)ciph + ciph->ihl*4;
if (inout)
ports[1] = cp->vport;
* Currently handles error types - unreachable, quench, ttl exceeded.
* (Only used in VS/NAT)
*/
-static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
+static int ip_vs_out_icmp(struct sk_buff *skb, int *related)
{
- struct sk_buff *skb = *pskb;
struct iphdr *iph;
struct icmphdr _icmph, *ic;
struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
*related = 1;
/* reassemble IP fragments */
- if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
- skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
- if (!skb)
+ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+ if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
return NF_STOLEN;
- *pskb = skb;
}
- iph = skb->nh.iph;
+ iph = ip_hdr(skb);
offset = ihl = iph->ihl * 4;
ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
if (ic == NULL)
return NF_ACCEPT;
/* Is the embedded protocol header present? */
- if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
+ if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
pp->dont_defrag))
return NF_ACCEPT;
verdict = NF_DROP;
if (IP_VS_FWD_METHOD(cp) != 0) {
- IP_VS_ERR("shouldn't reach here, because the box is on the"
+ IP_VS_ERR("shouldn't reach here, because the box is on the "
"half connection in the tun/dr module.\n");
}
/* Ensure the checksum is correct */
- if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
- ip_vs_checksum_complete(skb, ihl)) {
+ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
/* Failed checksum! */
IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
NIPQUAD(iph->saddr));
if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol)
offset += 2 * sizeof(__u16);
- if (!ip_vs_make_skb_writable(pskb, offset))
+ if (!skb_make_writable(skb, offset))
goto out;
- skb = *pskb;
ip_vs_nat_icmp(skb, pp, cp, 1);
{
struct tcphdr _tcph, *th;
- th = skb_header_pointer(skb, skb->nh.iph->ihl * 4,
- sizeof(_tcph), &_tcph);
+ th = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_tcph), &_tcph);
if (th == NULL)
return 0;
return th->rst;
}
/*
- * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
+ * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT.
* Check if outgoing packet belongs to the established ip_vs_conn,
* rewrite addresses of the packet and send it on its way...
*/
static unsigned int
-ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
+ip_vs_out(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- struct sk_buff *skb = *pskb;
struct iphdr *iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
if (skb->ipvs_property)
return NF_ACCEPT;
- iph = skb->nh.iph;
+ iph = ip_hdr(skb);
if (unlikely(iph->protocol == IPPROTO_ICMP)) {
- int related, verdict = ip_vs_out_icmp(pskb, &related);
+ int related, verdict = ip_vs_out_icmp(skb, &related);
if (related)
return verdict;
- skb = *pskb;
- iph = skb->nh.iph;
+ iph = ip_hdr(skb);
}
pp = ip_vs_proto_get(iph->protocol);
return NF_ACCEPT;
/* reassemble IP fragments */
- if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
+ if (unlikely(iph->frag_off & htons(IP_MF|IP_OFFSET) &&
!pp->dont_defrag)) {
- skb = ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT);
- if (!skb)
+ if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT))
return NF_STOLEN;
- iph = skb->nh.iph;
- *pskb = skb;
+ iph = ip_hdr(skb);
}
ihl = iph->ihl << 2;
if (sysctl_ip_vs_nat_icmp_send &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP)) {
- __u16 _ports[2], *pptr;
+ __be16 _ports[2], *pptr;
pptr = skb_header_pointer(skb, ihl,
sizeof(_ports), _ports);
IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
- if (!ip_vs_make_skb_writable(pskb, ihl))
+ if (!skb_make_writable(skb, ihl))
goto drop;
/* mangle the packet */
- if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
+ if (pp->snat_handler && !pp->snat_handler(skb, pp, cp))
+ goto drop;
+ ip_hdr(skb)->saddr = cp->vaddr;
+ ip_send_check(ip_hdr(skb));
+
+ /* For policy routing, packets originating from this
+ * machine itself may be routed differently to packets
+ * passing through. We want this packet to be routed as
+ * if it came from this machine itself. So re-compute
+ * the routing information.
+ */
+ if (ip_route_me_harder(skb, RTN_LOCAL) != 0)
goto drop;
- skb = *pskb;
- skb->nh.iph->saddr = cp->vaddr;
- ip_send_check(skb->nh.iph);
IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
drop:
ip_vs_conn_put(cp);
- kfree_skb(*pskb);
+ kfree_skb(skb);
return NF_STOLEN;
}
* forward to the right destination host if relevant.
* Currently handles error types - unreachable, quench, ttl exceeded.
*/
-static int
-ip_vs_in_icmp(struct sk_buff **pskb, int *related, unsigned int hooknum)
+static int
+ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum)
{
- struct sk_buff *skb = *pskb;
struct iphdr *iph;
struct icmphdr _icmph, *ic;
struct iphdr _ciph, *cih; /* The ip header contained within the ICMP */
*related = 1;
/* reassemble IP fragments */
- if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
- skb = ip_vs_gather_frags(skb,
- hooknum == NF_IP_LOCAL_IN ?
- IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD);
- if (!skb)
+ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) {
+ if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ?
+ IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD))
return NF_STOLEN;
- *pskb = skb;
}
- iph = skb->nh.iph;
+ iph = ip_hdr(skb);
offset = ihl = iph->ihl * 4;
ic = skb_header_pointer(skb, offset, sizeof(_icmph), &_icmph);
if (ic == NULL)
return NF_ACCEPT;
/* Is the embedded protocol header present? */
- if (unlikely(cih->frag_off & __constant_htons(IP_OFFSET) &&
+ if (unlikely(cih->frag_off & htons(IP_OFFSET) &&
pp->dont_defrag))
return NF_ACCEPT;
verdict = NF_DROP;
/* Ensure the checksum is correct */
- if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
- ip_vs_checksum_complete(skb, ihl)) {
+ if (!skb_csum_unnecessary(skb) && ip_vs_checksum_complete(skb, ihl)) {
/* Failed checksum! */
IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
NIPQUAD(iph->saddr));
* and send it on its way...
*/
static unsigned int
-ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
+ip_vs_in(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
- struct sk_buff *skb = *pskb;
struct iphdr *iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
* ... don't know why 1st test DOES NOT include 2nd (?)
*/
if (unlikely(skb->pkt_type != PACKET_HOST
- || skb->dev == &loopback_dev || skb->sk)) {
+ || skb->dev->flags & IFF_LOOPBACK || skb->sk)) {
IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
skb->pkt_type,
- skb->nh.iph->protocol,
- NIPQUAD(skb->nh.iph->daddr));
+ ip_hdr(skb)->protocol,
+ NIPQUAD(ip_hdr(skb)->daddr));
return NF_ACCEPT;
}
- iph = skb->nh.iph;
+ iph = ip_hdr(skb);
if (unlikely(iph->protocol == IPPROTO_ICMP)) {
- int related, verdict = ip_vs_in_icmp(pskb, &related, hooknum);
+ int related, verdict = ip_vs_in_icmp(skb, &related, hooknum);
if (related)
return verdict;
- skb = *pskb;
- iph = skb->nh.iph;
+ iph = ip_hdr(skb);
}
/* Protocol supported? */
ret = NF_ACCEPT;
}
- /* increase its packet counter and check if it is needed
- to be synchronized */
+ /* Increase its packet counter and check whether the connection
+ * needs to be synchronized
+ *
+ * Sync connection if it is about to close to
+ * encourage the standby servers to update the connection's timeout
+ */
atomic_inc(&cp->in_pkts);
if ((ip_vs_sync_state & IP_VS_STATE_MASTER) &&
- (cp->protocol != IPPROTO_TCP ||
- cp->state == IP_VS_TCP_S_ESTABLISHED) &&
- (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
- == sysctl_ip_vs_sync_threshold[0]))
+ (((cp->protocol != IPPROTO_TCP ||
+ cp->state == IP_VS_TCP_S_ESTABLISHED) &&
+ (atomic_read(&cp->in_pkts) % sysctl_ip_vs_sync_threshold[1]
+ == sysctl_ip_vs_sync_threshold[0])) ||
+ ((cp->protocol == IPPROTO_TCP) && (cp->old_state != cp->state) &&
+ ((cp->state == IP_VS_TCP_S_FIN_WAIT) ||
+ (cp->state == IP_VS_TCP_S_CLOSE)))))
ip_vs_sync_conn(cp);
+ cp->old_state = cp->state;
ip_vs_conn_put(cp);
return ret;
/*
- * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
+ * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP
* related packets destined for 0.0.0.0/0.
* When fwmark-based virtual service is used, such as transparent
* cache cluster, TCP packets can be marked and routed to ip_vs_in,
* but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
- * sent to ip_vs_in_icmp. So, catch them at the NF_IP_FORWARD chain
+ * sent to ip_vs_in_icmp. So, catch them at the NF_INET_FORWARD chain
* and send them to ip_vs_in_icmp.
*/
static unsigned int
-ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
+ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
int r;
- if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
+ if (ip_hdr(skb)->protocol != IPPROTO_ICMP)
return NF_ACCEPT;
- return ip_vs_in_icmp(pskb, &r, hooknum);
+ return ip_vs_in_icmp(skb, &r, hooknum);
}
-/* After packet filtering, forward packet through VS/DR, VS/TUN,
- or VS/NAT(change destination), so that filtering rules can be
- applied to IPVS. */
-static struct nf_hook_ops ip_vs_in_ops = {
- .hook = ip_vs_in,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_LOCAL_IN,
- .priority = 100,
-};
-
-/* After packet filtering, change source only for VS/NAT */
-static struct nf_hook_ops ip_vs_out_ops = {
- .hook = ip_vs_out,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_FORWARD,
- .priority = 100,
-};
-
-/* After packet filtering (but before ip_vs_out_icmp), catch icmp
- destined for 0.0.0.0/0, which is for incoming IPVS connections */
-static struct nf_hook_ops ip_vs_forward_icmp_ops = {
- .hook = ip_vs_forward_icmp,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_FORWARD,
- .priority = 99,
-};
-
-/* Before the netfilter connection tracking, exit from POST_ROUTING */
-static struct nf_hook_ops ip_vs_post_routing_ops = {
- .hook = ip_vs_post_routing,
- .owner = THIS_MODULE,
- .pf = PF_INET,
- .hooknum = NF_IP_POST_ROUTING,
- .priority = NF_IP_PRI_NAT_SRC-1,
+static struct nf_hook_ops ip_vs_ops[] __read_mostly = {
+ /* After packet filtering, forward packet through VS/DR, VS/TUN,
+ * or VS/NAT(change destination), so that filtering rules can be
+ * applied to IPVS. */
+ {
+ .hook = ip_vs_in,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_LOCAL_IN,
+ .priority = 100,
+ },
+ /* After packet filtering, change source only for VS/NAT */
+ {
+ .hook = ip_vs_out,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 100,
+ },
+ /* After packet filtering (but before ip_vs_out_icmp), catch icmp
+ * destined for 0.0.0.0/0, which is for incoming IPVS connections */
+ {
+ .hook = ip_vs_forward_icmp,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_FORWARD,
+ .priority = 99,
+ },
+ /* Before the netfilter connection tracking, exit from POST_ROUTING */
+ {
+ .hook = ip_vs_post_routing,
+ .owner = THIS_MODULE,
+ .pf = PF_INET,
+ .hooknum = NF_INET_POST_ROUTING,
+ .priority = NF_IP_PRI_NAT_SRC-1,
+ },
};
goto cleanup_app;
}
- ret = nf_register_hook(&ip_vs_in_ops);
+ ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
if (ret < 0) {
- IP_VS_ERR("can't register in hook.\n");
+ IP_VS_ERR("can't register hooks.\n");
goto cleanup_conn;
}
- ret = nf_register_hook(&ip_vs_out_ops);
- if (ret < 0) {
- IP_VS_ERR("can't register out hook.\n");
- goto cleanup_inops;
- }
- ret = nf_register_hook(&ip_vs_post_routing_ops);
- if (ret < 0) {
- IP_VS_ERR("can't register post_routing hook.\n");
- goto cleanup_outops;
- }
- ret = nf_register_hook(&ip_vs_forward_icmp_ops);
- if (ret < 0) {
- IP_VS_ERR("can't register forward_icmp hook.\n");
- goto cleanup_postroutingops;
- }
-
IP_VS_INFO("ipvs loaded.\n");
return ret;
- cleanup_postroutingops:
- nf_unregister_hook(&ip_vs_post_routing_ops);
- cleanup_outops:
- nf_unregister_hook(&ip_vs_out_ops);
- cleanup_inops:
- nf_unregister_hook(&ip_vs_in_ops);
cleanup_conn:
ip_vs_conn_cleanup();
cleanup_app:
static void __exit ip_vs_cleanup(void)
{
- nf_unregister_hook(&ip_vs_forward_icmp_ops);
- nf_unregister_hook(&ip_vs_post_routing_ops);
- nf_unregister_hook(&ip_vs_out_ops);
- nf_unregister_hook(&ip_vs_in_ops);
+ nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops));
ip_vs_conn_cleanup();
ip_vs_app_cleanup();
ip_vs_protocol_cleanup();