net: Introduce sk_route_nocaps
[safe/jmp/linux-2.6] / net / ipv4 / tcp_ipv4.c
index 657ae33..202cf09 100644 (file)
@@ -60,6 +60,7 @@
 #include <linux/jhash.h>
 #include <linux/init.h>
 #include <linux/times.h>
+#include <linux/slab.h>
 
 #include <net/net_namespace.h>
 #include <net/icmp.h>
@@ -204,7 +205,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
                 * when trying new connection.
                 */
                if (peer != NULL &&
-                   peer->tcp_ts_stamp + TCP_PAWS_MSL >= get_seconds()) {
+                   (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
                        tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
                        tp->rx_opt.ts_recent = peer->tcp_ts;
                }
@@ -217,7 +218,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
        if (inet->opt)
                inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 
-       tp->rx_opt.mss_clamp = 536;
+       tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 
        /* Socket identity is still unknown (sport may be zero).
         * However we set state to SYN-SENT and not releasing socket
@@ -370,6 +371,11 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
        if (sk->sk_state == TCP_CLOSE)
                goto out;
 
+       if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+               NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+               goto out;
+       }
+
        icsk = inet_csk(sk);
        tp = tcp_sk(sk);
        seq = ntohl(th->seq);
@@ -513,26 +519,31 @@ out:
        sock_put(sk);
 }
 
-/* This routine computes an IPv4 TCP checksum. */
-void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
+static void __tcp_v4_send_check(struct sk_buff *skb,
+                               __be32 saddr, __be32 daddr)
 {
-       struct inet_sock *inet = inet_sk(sk);
        struct tcphdr *th = tcp_hdr(skb);
 
        if (skb->ip_summed == CHECKSUM_PARTIAL) {
-               th->check = ~tcp_v4_check(len, inet->inet_saddr,
-                                         inet->inet_daddr, 0);
+               th->check = ~tcp_v4_check(skb->len, saddr, daddr, 0);
                skb->csum_start = skb_transport_header(skb) - skb->head;
                skb->csum_offset = offsetof(struct tcphdr, check);
        } else {
-               th->check = tcp_v4_check(len, inet->inet_saddr,
-                                        inet->inet_daddr,
+               th->check = tcp_v4_check(skb->len, saddr, daddr,
                                         csum_partial(th,
                                                      th->doff << 2,
                                                      skb->csum));
        }
 }
 
+/* This routine computes an IPv4 TCP checksum. */
+void tcp_v4_send_check(struct sock *sk, struct sk_buff *skb)
+{
+       struct inet_sock *inet = inet_sk(sk);
+
+       __tcp_v4_send_check(skb, inet->inet_saddr, inet->inet_daddr);
+}
+
 int tcp_v4_gso_send_check(struct sk_buff *skb)
 {
        const struct iphdr *iph;
@@ -545,10 +556,8 @@ int tcp_v4_gso_send_check(struct sk_buff *skb)
        th = tcp_hdr(skb);
 
        th->check = 0;
-       th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
-       skb->csum_start = skb_transport_header(skb) - skb->head;
-       skb->csum_offset = offsetof(struct tcphdr, check);
        skb->ip_summed = CHECKSUM_PARTIAL;
+       __tcp_v4_send_check(skb, iph->saddr, iph->daddr);
        return 0;
 }
 
@@ -742,8 +751,9 @@ static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
  *     This still operates on a request_sock only, not on a big
  *     socket.
  */
-static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
-                               struct dst_entry *dst)
+static int tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
+                             struct request_sock *req,
+                             struct request_values *rvp)
 {
        const struct inet_request_sock *ireq = inet_rsk(req);
        int err = -1;
@@ -753,16 +763,10 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
        if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
                return -1;
 
-       skb = tcp_make_synack(sk, dst, req);
+       skb = tcp_make_synack(sk, dst, req, rvp);
 
        if (skb) {
-               struct tcphdr *th = tcp_hdr(skb);
-
-               th->check = tcp_v4_check(skb->len,
-                                        ireq->loc_addr,
-                                        ireq->rmt_addr,
-                                        csum_partial(th, skb->len,
-                                                     skb->csum));
+               __tcp_v4_send_check(skb, ireq->loc_addr, ireq->rmt_addr);
 
                err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
                                            ireq->rmt_addr,
@@ -774,9 +778,11 @@ static int __tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
        return err;
 }
 
-static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req)
+static int tcp_v4_rtx_synack(struct sock *sk, struct request_sock *req,
+                             struct request_values *rvp)
 {
-       return __tcp_v4_send_synack(sk, req, NULL);
+       TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
+       return tcp_v4_send_synack(sk, NULL, req, rvp);
 }
 
 /*
@@ -885,7 +891,7 @@ int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
                                kfree(newkey);
                                return -ENOMEM;
                        }
-                       sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+                       sk_nocaps_add(sk, NETIF_F_GSO_MASK);
                }
                if (tcp_alloc_md5sig_pool(sk) == NULL) {
                        kfree(newkey);
@@ -1015,7 +1021,7 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
                        return -EINVAL;
 
                tp->md5sig_info = p;
-               sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+               sk_nocaps_add(sk, NETIF_F_GSO_MASK);
        }
 
        newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
@@ -1190,10 +1196,11 @@ static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
        .family         =       PF_INET,
        .obj_size       =       sizeof(struct tcp_request_sock),
-       .rtx_syn_ack    =       tcp_v4_send_synack,
+       .rtx_syn_ack    =       tcp_v4_rtx_synack,
        .send_ack       =       tcp_v4_reqsk_send_ack,
        .destructor     =       tcp_v4_reqsk_destructor,
        .send_reset     =       tcp_v4_send_reset,
+       .syn_ack_timeout =      tcp_syn_ack_timeout,
 };
 
 #ifdef CONFIG_TCP_MD5SIG
@@ -1211,13 +1218,16 @@ static struct timewait_sock_ops tcp_timewait_sock_ops = {
 
 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
 {
-       struct inet_request_sock *ireq;
+       struct tcp_extend_values tmp_ext;
        struct tcp_options_received tmp_opt;
+       u8 *hash_location;
        struct request_sock *req;
+       struct inet_request_sock *ireq;
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct dst_entry *dst = NULL;
        __be32 saddr = ip_hdr(skb)->saddr;
        __be32 daddr = ip_hdr(skb)->daddr;
        __u32 isn = TCP_SKB_CB(skb)->when;
-       struct dst_entry *dst = NULL;
 #ifdef CONFIG_SYN_COOKIES
        int want_cookie = 0;
 #else
@@ -1257,31 +1267,61 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
        tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
 #endif
 
-       ireq = inet_rsk(req);
-       ireq->loc_addr = daddr;
-       ireq->rmt_addr = saddr;
-       ireq->no_srccheck = inet_sk(sk)->transparent;
-       ireq->opt = tcp_v4_save_options(sk, skb);
+       tcp_clear_options(&tmp_opt);
+       tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
+       tmp_opt.user_mss  = tp->rx_opt.user_mss;
+       tcp_parse_options(skb, &tmp_opt, &hash_location, 0);
+
+       if (tmp_opt.cookie_plus > 0 &&
+           tmp_opt.saw_tstamp &&
+           !tp->rx_opt.cookie_out_never &&
+           (sysctl_tcp_cookie_size > 0 ||
+            (tp->cookie_values != NULL &&
+             tp->cookie_values->cookie_desired > 0))) {
+               u8 *c;
+               u32 *mess = &tmp_ext.cookie_bakery[COOKIE_DIGEST_WORDS];
+               int l = tmp_opt.cookie_plus - TCPOLEN_COOKIE_BASE;
+
+               if (tcp_cookie_generator(&tmp_ext.cookie_bakery[0]) != 0)
+                       goto drop_and_release;
 
-       dst = inet_csk_route_req(sk, req);
-       if(!dst)
-               goto drop_and_free;
+               /* Secret recipe starts with IP addresses */
+               *mess++ ^= (__force u32)daddr;
+               *mess++ ^= (__force u32)saddr;
 
-       tcp_clear_options(&tmp_opt);
-       tmp_opt.mss_clamp = 536;
-       tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
+               /* plus variable length Initiator Cookie */
+               c = (u8 *)mess;
+               while (l-- > 0)
+                       *c++ ^= *hash_location++;
 
-       tcp_parse_options(skb, &tmp_opt, 0, dst);
+#ifdef CONFIG_SYN_COOKIES
+               want_cookie = 0;        /* not our kind of cookie */
+#endif
+               tmp_ext.cookie_out_never = 0; /* false */
+               tmp_ext.cookie_plus = tmp_opt.cookie_plus;
+       } else if (!tp->rx_opt.cookie_in_always) {
+               /* redundant indications, but ensure initialization. */
+               tmp_ext.cookie_out_never = 1; /* true */
+               tmp_ext.cookie_plus = 0;
+       } else {
+               goto drop_and_release;
+       }
+       tmp_ext.cookie_in_always = tp->rx_opt.cookie_in_always;
 
        if (want_cookie && !tmp_opt.saw_tstamp)
                tcp_clear_options(&tmp_opt);
 
        tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
-
        tcp_openreq_init(req, &tmp_opt, skb);
 
+       ireq = inet_rsk(req);
+       ireq->loc_addr = daddr;
+       ireq->rmt_addr = saddr;
+       ireq->no_srccheck = inet_sk(sk)->transparent;
+       ireq->opt = tcp_v4_save_options(sk, skb);
+
        if (security_inet_conn_request(sk, skb, req))
-               goto drop_and_release;
+               goto drop_and_free;
 
        if (!want_cookie)
                TCP_ECN_create_request(req, tcp_hdr(skb));
@@ -1306,9 +1346,10 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
                 */
                if (tmp_opt.saw_tstamp &&
                    tcp_death_row.sysctl_tw_recycle &&
+                   (dst = inet_csk_route_req(sk, req)) != NULL &&
                    (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
                    peer->v4daddr == saddr) {
-                       if (get_seconds() < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
+                       if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
                            (s32)(peer->tcp_ts - req->ts_recent) >
                                                        TCP_PAWS_WINDOW) {
                                NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
@@ -1337,7 +1378,9 @@ int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
        }
        tcp_rsk(req)->snt_isn = isn;
 
-       if (__tcp_v4_send_synack(sk, req, dst) || want_cookie)
+       if (tcp_v4_send_synack(sk, dst, req,
+                              (struct request_values *)&tmp_ext) ||
+           want_cookie)
                goto drop_and_free;
 
        inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
@@ -1419,11 +1462,11 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
                if (newkey != NULL)
                        tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
                                          newkey, key->keylen);
-               newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
+               sk_nocaps_add(newsk, NETIF_F_GSO_MASK);
        }
 #endif
 
-       __inet_hash_nolisten(newsk);
+       __inet_hash_nolisten(newsk, NULL);
        __inet_inherit_port(sk, newsk);
 
        return newsk;
@@ -1615,6 +1658,11 @@ process:
        if (sk->sk_state == TCP_TIME_WAIT)
                goto do_time_wait;
 
+       if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
+               NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
+               goto discard_and_relse;
+       }
+
        if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
                goto discard_and_relse;
        nf_reset(skb);
@@ -1624,6 +1672,8 @@ process:
 
        skb->dev = NULL;
 
+       sock_rps_save_rxhash(sk, skb->rxhash);
+
        bh_lock_sock_nested(sk);
        ret = 0;
        if (!sock_owned_by_user(sk)) {
@@ -1639,8 +1689,11 @@ process:
                        if (!tcp_prequeue(sk, skb))
                                ret = tcp_v4_do_rcv(sk, skb);
                }
-       } else
-               sk_add_backlog(sk, skb);
+       } else if (unlikely(sk_add_backlog(sk, skb))) {
+               bh_unlock_sock(sk);
+               NET_INC_STATS_BH(net, LINUX_MIB_TCPBACKLOGDROP);
+               goto discard_and_relse;
+       }
        bh_unlock_sock(sk);
 
        sock_put(sk);
@@ -1727,9 +1780,9 @@ int tcp_v4_remember_stamp(struct sock *sk)
 
        if (peer) {
                if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
-                   (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
-                    peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
-                       peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
+                   ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
+                    peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
+                       peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
                        peer->tcp_ts = tp->rx_opt.ts_recent;
                }
                if (release_it)
@@ -1748,9 +1801,9 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
                const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
 
                if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
-                   (peer->tcp_ts_stamp + TCP_PAWS_MSL < get_seconds() &&
-                    peer->tcp_ts_stamp <= tcptw->tw_ts_recent_stamp)) {
-                       peer->tcp_ts_stamp = tcptw->tw_ts_recent_stamp;
+                   ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
+                    peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
+                       peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
                        peer->tcp_ts       = tcptw->tw_ts_recent;
                }
                inet_putpeer(peer);
@@ -1815,7 +1868,7 @@ static int tcp_v4_init_sock(struct sock *sk)
         */
        tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
        tp->snd_cwnd_clamp = ~0;
-       tp->mss_cache = 536;
+       tp->mss_cache = TCP_MSS_DEFAULT;
 
        tp->reordering = sysctl_tcp_reordering;
        icsk->icsk_ca_ops = &tcp_init_congestion_ops;
@@ -1831,6 +1884,19 @@ static int tcp_v4_init_sock(struct sock *sk)
        tp->af_specific = &tcp_sock_ipv4_specific;
 #endif
 
+       /* TCP Cookie Transactions */
+       if (sysctl_tcp_cookie_size > 0) {
+               /* Default, cookies without s_data_payload. */
+               tp->cookie_values =
+                       kzalloc(sizeof(*tp->cookie_values),
+                               sk->sk_allocation);
+               if (tp->cookie_values != NULL)
+                       kref_init(&tp->cookie_values->kref);
+       }
+       /* Presumed zeroed, in order of appearance:
+        *      cookie_in_always, cookie_out_never,
+        *      s_data_constant, s_data_in, s_data_out
+        */
        sk->sk_sndbuf = sysctl_tcp_wmem[1];
        sk->sk_rcvbuf = sysctl_tcp_rmem[1];
 
@@ -1884,6 +1950,13 @@ void tcp_v4_destroy_sock(struct sock *sk)
                sk->sk_sndmsg_page = NULL;
        }
 
+       /* TCP Cookie Transactions */
+       if (tp->cookie_values != NULL) {
+               kref_put(&tp->cookie_values->kref,
+                        tcp_cookie_values_release);
+               tp->cookie_values = NULL;
+       }
+
        percpu_counter_dec(&tcp_sockets_allocated);
 }
 
@@ -2257,6 +2330,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
        __be32 src = inet->inet_rcv_saddr;
        __u16 destp = ntohs(inet->inet_dport);
        __u16 srcp = ntohs(inet->inet_sport);
+       int rx_queue;
 
        if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
                timer_active    = 1;
@@ -2272,12 +2346,19 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
                timer_expires = jiffies;
        }
 
+       if (sk->sk_state == TCP_LISTEN)
+               rx_queue = sk->sk_ack_backlog;
+       else
+               /*
+                * because we dont lock socket, we might find a transient negative value
+                */
+               rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+
        seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
                        "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
                i, src, srcp, dest, destp, sk->sk_state,
                tp->write_seq - tp->snd_una,
-               sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
-                                            (tp->rcv_nxt - tp->copied_seq),
+               rx_queue,
                timer_active,
                jiffies_to_clock_t(timer_expires - jiffies),
                icsk->icsk_retransmits,
@@ -2359,12 +2440,12 @@ static struct tcp_seq_afinfo tcp4_seq_afinfo = {
        },
 };
 
-static int tcp4_proc_init_net(struct net *net)
+static int __net_init tcp4_proc_init_net(struct net *net)
 {
        return tcp_proc_register(net, &tcp4_seq_afinfo);
 }
 
-static void tcp4_proc_exit_net(struct net *net)
+static void __net_exit tcp4_proc_exit_net(struct net *net)
 {
        tcp_proc_unregister(net, &tcp4_seq_afinfo);
 }
@@ -2468,12 +2549,17 @@ static int __net_init tcp_sk_init(struct net *net)
 static void __net_exit tcp_sk_exit(struct net *net)
 {
        inet_ctl_sock_destroy(net->ipv4.tcp_sock);
-       inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
+}
+
+static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+{
+       inet_twsk_purge(&tcp_hashinfo, &tcp_death_row, AF_INET);
 }
 
 static struct pernet_operations __net_initdata tcp_sk_ops = {
-       .init = tcp_sk_init,
-       .exit = tcp_sk_exit,
+       .init      = tcp_sk_init,
+       .exit      = tcp_sk_exit,
+       .exit_batch = tcp_sk_exit_batch,
 };
 
 void __init tcp_v4_init(void)