SAFE public projects git trees. - safe/jmp/linux-2.6/blob - net/ipv4/tcp_ipv4.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              Implementation of the Transmission Control Protocol(TCP).
   7  *
   8  *              IPv4 specific functions
   9  *
  10  *
  11  *              code split from:
  12  *              linux/ipv4/tcp.c
  13  *              linux/ipv4/tcp_input.c
  14  *              linux/ipv4/tcp_output.c
  15  *
  16  *              See tcp.c for author information
  17  *
  18  *      This program is free software; you can redistribute it and/or
  19  *      modify it under the terms of the GNU General Public License
  20  *      as published by the Free Software Foundation; either version
  21  *      2 of the License, or (at your option) any later version.
  22  */
  23
  24 /*
  25  * Changes:
  26  *              David S. Miller :       New socket lookup architecture.
  27  *                                      This code is dedicated to John Dyson.
  28  *              David S. Miller :       Change semantics of established hash,
  29  *                                      half is devoted to TIME_WAIT sockets
  30  *                                      and the rest go in the other half.
  31  *              Andi Kleen :            Add support for syncookies and fixed
  32  *                                      some bugs: ip options weren't passed to
  33  *                                      the TCP layer, missed a check for an
  34  *                                      ACK bit.
  35  *              Andi Kleen :            Implemented fast path mtu discovery.
  36  *                                      Fixed many serious bugs in the
  37  *                                      request_sock handling and moved
  38  *                                      most of it into the af independent code.
  39  *                                      Added tail drop and some other bugfixes.
  40  *                                      Added new listen semantics.
  41  *              Mike McLagan    :       Routing by source
  42  *      Juan Jose Ciarlante:            ip_dynaddr bits
  43  *              Andi Kleen:             various fixes.
  44  *      Vitaly E. Lavrov        :       Transparent proxy revived after year
  45  *                                      coma.
  46  *      Andi Kleen              :       Fix new listen.
  47  *      Andi Kleen              :       Fix accept error reporting.
  48  *      YOSHIFUJI Hideaki @USAGI and:   Support IPV6_V6ONLY socket option, which
  49  *      Alexey Kuznetsov                allow both IPv4 and IPv6 sockets to bind
  50  *                                      a single port at the same time.
  51  */
  52
  53
  54 #include <linux/bottom_half.h>
  55 #include <linux/types.h>
  56 #include <linux/fcntl.h>
  57 #include <linux/module.h>
  58 #include <linux/random.h>
  59 #include <linux/cache.h>
  60 #include <linux/jhash.h>
  61 #include <linux/init.h>
  62 #include <linux/times.h>
  63
  64 #include <net/net_namespace.h>
  65 #include <net/icmp.h>
  66 #include <net/inet_hashtables.h>
  67 #include <net/tcp.h>
  68 #include <net/transp_v6.h>
  69 #include <net/ipv6.h>
  70 #include <net/inet_common.h>
  71 #include <net/timewait_sock.h>
  72 #include <net/xfrm.h>
  73 #include <net/netdma.h>
  74
  75 #include <linux/inet.h>
  76 #include <linux/ipv6.h>
  77 #include <linux/stddef.h>
  78 #include <linux/proc_fs.h>
  79 #include <linux/seq_file.h>
  80
  81 #include <linux/crypto.h>
  82 #include <linux/scatterlist.h>
  83
  84 int sysctl_tcp_tw_reuse __read_mostly;
  85 int sysctl_tcp_low_latency __read_mostly;
  86
  87
  88 #ifdef CONFIG_TCP_MD5SIG
  89 static struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk,
  90                                                    __be32 addr);
  91 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
  92                                __be32 daddr, __be32 saddr, struct tcphdr *th);
  93 #else
  94 static inline
  95 struct tcp_md5sig_key *tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
  96 {
  97         return NULL;
  98 }
  99 #endif
 100
 101 struct inet_hashinfo tcp_hashinfo;
 102
 103 static inline __u32 tcp_v4_init_sequence(struct sk_buff *skb)
 104 {
 105         return secure_tcp_sequence_number(ip_hdr(skb)->daddr,
 106                                           ip_hdr(skb)->saddr,
 107                                           tcp_hdr(skb)->dest,
 108                                           tcp_hdr(skb)->source);
 109 }
 110
 111 int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp)
 112 {
 113         const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw);
 114         struct tcp_sock *tp = tcp_sk(sk);
 115
 116         /* With PAWS, it is safe from the viewpoint
 117            of data integrity. Even without PAWS it is safe provided sequence
 118            spaces do not overlap i.e. at data rates <= 80Mbit/sec.
 119
 120            Actually, the idea is close to VJ's one, only timestamp cache is
 121            held not per host, but per port pair and TW bucket is used as state
 122            holder.
 123
 124            If TW bucket has been already destroyed we fall back to VJ's scheme
 125            and use initial timestamp retrieved from peer table.
 126          */
 127         if (tcptw->tw_ts_recent_stamp &&
 128             (twp == NULL || (sysctl_tcp_tw_reuse &&
 129                              get_seconds() - tcptw->tw_ts_recent_stamp > 1))) {
 130                 tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2;
 131                 if (tp->write_seq == 0)
 132                         tp->write_seq = 1;
 133                 tp->rx_opt.ts_recent       = tcptw->tw_ts_recent;
 134                 tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
 135                 sock_hold(sktw);
 136                 return 1;
 137         }
 138
 139         return 0;
 140 }
 141
 142 EXPORT_SYMBOL_GPL(tcp_twsk_unique);
 143
 144 /* This will initiate an outgoing connection. */
 145 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 146 {
 147         struct inet_sock *inet = inet_sk(sk);
 148         struct tcp_sock *tp = tcp_sk(sk);
 149         struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
 150         struct rtable *rt;
 151         __be32 daddr, nexthop;
 152         int tmp;
 153         int err;
 154
 155         if (addr_len < sizeof(struct sockaddr_in))
 156                 return -EINVAL;
 157
 158         if (usin->sin_family != AF_INET)
 159                 return -EAFNOSUPPORT;
 160
 161         nexthop = daddr = usin->sin_addr.s_addr;
 162         if (inet->opt && inet->opt->srr) {
 163                 if (!daddr)
 164                         return -EINVAL;
 165                 nexthop = inet->opt->faddr;
 166         }
 167
 168         tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
 169                                RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
 170                                IPPROTO_TCP,
 171                                inet->inet_sport, usin->sin_port, sk, 1);
 172         if (tmp < 0) {
 173                 if (tmp == -ENETUNREACH)
 174                         IP_INC_STATS_BH(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
 175                 return tmp;
 176         }
 177
 178         if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
 179                 ip_rt_put(rt);
 180                 return -ENETUNREACH;
 181         }
 182
 183         if (!inet->opt || !inet->opt->srr)
 184                 daddr = rt->rt_dst;
 185
 186         if (!inet->inet_saddr)
 187                 inet->inet_saddr = rt->rt_src;
 188         inet->inet_rcv_saddr = inet->inet_saddr;
 189
 190         if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
 191                 /* Reset inherited state */
 192                 tp->rx_opt.ts_recent       = 0;
 193                 tp->rx_opt.ts_recent_stamp = 0;
 194                 tp->write_seq              = 0;
 195         }
 196
 197         if (tcp_death_row.sysctl_tw_recycle &&
 198             !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
 199                 struct inet_peer *peer = rt_get_peer(rt);
 200                 /*
 201                  * VJ's idea. We save last timestamp seen from
 202                  * the destination in peer table, when entering state
 203                  * TIME-WAIT * and initialize rx_opt.ts_recent from it,
 204                  * when trying new connection.
 205                  */
 206                 if (peer != NULL &&
 207                     (u32)get_seconds() - peer->tcp_ts_stamp <= TCP_PAWS_MSL) {
 208                         tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
 209                         tp->rx_opt.ts_recent = peer->tcp_ts;
 210                 }
 211         }
 212
 213         inet->inet_dport = usin->sin_port;
 214         inet->inet_daddr = daddr;
 215
 216         inet_csk(sk)->icsk_ext_hdr_len = 0;
 217         if (inet->opt)
 218                 inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
 219
 220         tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;
 221
 222         /* Socket identity is still unknown (sport may be zero).
 223          * However we set state to SYN-SENT and not releasing socket
 224          * lock select source port, enter ourselves into the hash tables and
 225          * complete initialization after this.
 226          */
 227         tcp_set_state(sk, TCP_SYN_SENT);
 228         err = inet_hash_connect(&tcp_death_row, sk);
 229         if (err)
 230                 goto failure;
 231
 232         err = ip_route_newports(&rt, IPPROTO_TCP,
 233                                 inet->inet_sport, inet->inet_dport, sk);
 234         if (err)
 235                 goto failure;
 236
 237         /* OK, now commit destination to socket.  */
 238         sk->sk_gso_type = SKB_GSO_TCPV4;
 239         sk_setup_caps(sk, &rt->u.dst);
 240
 241         if (!tp->write_seq)
 242                 tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
 243                                                            inet->inet_daddr,
 244                                                            inet->inet_sport,
 245                                                            usin->sin_port);
 246
 247         inet->inet_id = tp->write_seq ^ jiffies;
 248
 249         err = tcp_connect(sk);
 250         rt = NULL;
 251         if (err)
 252                 goto failure;
 253
 254         return 0;
 255
 256 failure:
 257         /*
 258          * This unhashes the socket and releases the local port,
 259          * if necessary.
 260          */
 261         tcp_set_state(sk, TCP_CLOSE);
 262         ip_rt_put(rt);
 263         sk->sk_route_caps = 0;
 264         inet->inet_dport = 0;
 265         return err;
 266 }
 267
 268 /*
 269  * This routine does path mtu discovery as defined in RFC1191.
 270  */
 271 static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu)
 272 {
 273         struct dst_entry *dst;
 274         struct inet_sock *inet = inet_sk(sk);
 275
 276         /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
 277          * send out by Linux are always <576bytes so they should go through
 278          * unfragmented).
 279          */
 280         if (sk->sk_state == TCP_LISTEN)
 281                 return;
 282
 283         /* We don't check in the destentry if pmtu discovery is forbidden
 284          * on this route. We just assume that no packet_to_big packets
 285          * are send back when pmtu discovery is not active.
 286          * There is a small race when the user changes this flag in the
 287          * route, but I think that's acceptable.
 288          */
 289         if ((dst = __sk_dst_check(sk, 0)) == NULL)
 290                 return;
 291
 292         dst->ops->update_pmtu(dst, mtu);
 293
 294         /* Something is about to be wrong... Remember soft error
 295          * for the case, if this connection will not able to recover.
 296          */
 297         if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
 298                 sk->sk_err_soft = EMSGSIZE;
 299
 300         mtu = dst_mtu(dst);
 301
 302         if (inet->pmtudisc != IP_PMTUDISC_DONT &&
 303             inet_csk(sk)->icsk_pmtu_cookie > mtu) {
 304                 tcp_sync_mss(sk, mtu);
 305
 306                 /* Resend the TCP packet because it's
 307                  * clear that the old packet has been
 308                  * dropped. This is the new "fast" path mtu
 309                  * discovery.
 310                  */
 311                 tcp_simple_retransmit(sk);
 312         } /* else let the usual retransmit timer handle it */
 313 }
 314
 315 /*
 316  * This routine is called by the ICMP module when it gets some
 317  * sort of error condition.  If err < 0 then the socket should
 318  * be closed and the error returned to the user.  If err > 0
 319  * it's just the icmp type << 8 | icmp code.  After adjustment
 320  * header points to the first 8 bytes of the tcp header.  We need
 321  * to find the appropriate port.
 322  *
 323  * The locking strategy used here is very "optimistic". When
 324  * someone else accesses the socket the ICMP is just dropped
 325  * and for some paths there is no check at all.
 326  * A more general error queue to queue errors for later handling
 327  * is probably better.
 328  *
 329  */
 330
 331 void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
 332 {
 333         struct iphdr *iph = (struct iphdr *)icmp_skb->data;
 334         struct tcphdr *th = (struct tcphdr *)(icmp_skb->data + (iph->ihl << 2));
 335         struct inet_connection_sock *icsk;
 336         struct tcp_sock *tp;
 337         struct inet_sock *inet;
 338         const int type = icmp_hdr(icmp_skb)->type;
 339         const int code = icmp_hdr(icmp_skb)->code;
 340         struct sock *sk;
 341         struct sk_buff *skb;
 342         __u32 seq;
 343         __u32 remaining;
 344         int err;
 345         struct net *net = dev_net(icmp_skb->dev);
 346
 347         if (icmp_skb->len < (iph->ihl << 2) + 8) {
 348                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 349                 return;
 350         }
 351
 352         sk = inet_lookup(net, &tcp_hashinfo, iph->daddr, th->dest,
 353                         iph->saddr, th->source, inet_iif(icmp_skb));
 354         if (!sk) {
 355                 ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
 356                 return;
 357         }
 358         if (sk->sk_state == TCP_TIME_WAIT) {
 359                 inet_twsk_put(inet_twsk(sk));
 360                 return;
 361         }
 362
 363         bh_lock_sock(sk);
 364         /* If too many ICMPs get dropped on busy
 365          * servers this needs to be solved differently.
 366          */
 367         if (sock_owned_by_user(sk))
 368                 NET_INC_STATS_BH(net, LINUX_MIB_LOCKDROPPEDICMPS);
 369
 370         if (sk->sk_state == TCP_CLOSE)
 371                 goto out;
 372
 373         icsk = inet_csk(sk);
 374         tp = tcp_sk(sk);
 375         seq = ntohl(th->seq);
 376         if (sk->sk_state != TCP_LISTEN &&
 377             !between(seq, tp->snd_una, tp->snd_nxt)) {
 378                 NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 379                 goto out;
 380         }
 381
 382         switch (type) {
 383         case ICMP_SOURCE_QUENCH:
 384                 /* Just silently ignore these. */
 385                 goto out;
 386         case ICMP_PARAMETERPROB:
 387                 err = EPROTO;
 388                 break;
 389         case ICMP_DEST_UNREACH:
 390                 if (code > NR_ICMP_UNREACH)
 391                         goto out;
 392
 393                 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
 394                         if (!sock_owned_by_user(sk))
 395                                 do_pmtu_discovery(sk, iph, info);
 396                         goto out;
 397                 }
 398
 399                 err = icmp_err_convert[code].errno;
 400                 /* check if icmp_skb allows revert of backoff
 401                  * (see draft-zimmermann-tcp-lcd) */
 402                 if (code != ICMP_NET_UNREACH && code != ICMP_HOST_UNREACH)
 403                         break;
 404                 if (seq != tp->snd_una  || !icsk->icsk_retransmits ||
 405                     !icsk->icsk_backoff)
 406                         break;
 407
 408                 icsk->icsk_backoff--;
 409                 inet_csk(sk)->icsk_rto = __tcp_set_rto(tp) <<
 410                                          icsk->icsk_backoff;
 411                 tcp_bound_rto(sk);
 412
 413                 skb = tcp_write_queue_head(sk);
 414                 BUG_ON(!skb);
 415
 416                 remaining = icsk->icsk_rto - min(icsk->icsk_rto,
 417                                 tcp_time_stamp - TCP_SKB_CB(skb)->when);
 418
 419                 if (remaining) {
 420                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 421                                                   remaining, TCP_RTO_MAX);
 422                 } else if (sock_owned_by_user(sk)) {
 423                         /* RTO revert clocked out retransmission,
 424                          * but socket is locked. Will defer. */
 425                         inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
 426                                                   HZ/20, TCP_RTO_MAX);
 427                 } else {
 428                         /* RTO revert clocked out retransmission.
 429                          * Will retransmit now */
 430                         tcp_retransmit_timer(sk);
 431                 }
 432
 433                 break;
 434         case ICMP_TIME_EXCEEDED:
 435                 err = EHOSTUNREACH;
 436                 break;
 437         default:
 438                 goto out;
 439         }
 440
 441         switch (sk->sk_state) {
 442                 struct request_sock *req, **prev;
 443         case TCP_LISTEN:
 444                 if (sock_owned_by_user(sk))
 445                         goto out;
 446
 447                 req = inet_csk_search_req(sk, &prev, th->dest,
 448                                           iph->daddr, iph->saddr);
 449                 if (!req)
 450                         goto out;
 451
 452                 /* ICMPs are not backlogged, hence we cannot get
 453                    an established socket here.
 454                  */
 455                 WARN_ON(req->sk);
 456
 457                 if (seq != tcp_rsk(req)->snt_isn) {
 458                         NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
 459                         goto out;
 460                 }
 461
 462                 /*
 463                  * Still in SYN_RECV, just remove it silently.
 464                  * There is no good way to pass the error to the newly
 465                  * created socket, and POSIX does not want network
 466                  * errors returned from accept().
 467                  */
 468                 inet_csk_reqsk_queue_drop(sk, req, prev);
 469                 goto out;
 470
 471         case TCP_SYN_SENT:
 472         case TCP_SYN_RECV:  /* Cannot happen.
 473                                It can f.e. if SYNs crossed.
 474                              */
 475                 if (!sock_owned_by_user(sk)) {
 476                         sk->sk_err = err;
 477
 478                         sk->sk_error_report(sk);
 479
 480                         tcp_done(sk);
 481                 } else {
 482                         sk->sk_err_soft = err;
 483                 }
 484                 goto out;
 485         }
 486
 487         /* If we've already connected we will keep trying
 488          * until we time out, or the user gives up.
 489          *
 490          * rfc1122 4.2.3.9 allows to consider as hard errors
 491          * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
 492          * but it is obsoleted by pmtu discovery).
 493          *
 494          * Note, that in modern internet, where routing is unreliable
 495          * and in each dark corner broken firewalls sit, sending random
 496          * errors ordered by their masters even this two messages finally lose
 497          * their original sense (even Linux sends invalid PORT_UNREACHs)
 498          *
 499          * Now we are in compliance with RFCs.
 500          *                                                      --ANK (980905)
 501          */
 502
 503         inet = inet_sk(sk);
 504         if (!sock_owned_by_user(sk) && inet->recverr) {
 505                 sk->sk_err = err;
 506                 sk->sk_error_report(sk);
 507         } else  { /* Only an error on timeout */
 508                 sk->sk_err_soft = err;
 509         }
 510
 511 out:
 512         bh_unlock_sock(sk);
 513         sock_put(sk);
 514 }
 515
 516 /* This routine computes an IPv4 TCP checksum. */
 517 void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb)
 518 {
 519         struct inet_sock *inet = inet_sk(sk);
 520         struct tcphdr *th = tcp_hdr(skb);
 521
 522         if (skb->ip_summed == CHECKSUM_PARTIAL) {
 523                 th->check = ~tcp_v4_check(len, inet->inet_saddr,
 524                                           inet->inet_daddr, 0);
 525                 skb->csum_start = skb_transport_header(skb) - skb->head;
 526                 skb->csum_offset = offsetof(struct tcphdr, check);
 527         } else {
 528                 th->check = tcp_v4_check(len, inet->inet_saddr,
 529                                          inet->inet_daddr,
 530                                          csum_partial(th,
 531                                                       th->doff << 2,
 532                                                       skb->csum));
 533         }
 534 }
 535
 536 int tcp_v4_gso_send_check(struct sk_buff *skb)
 537 {
 538         const struct iphdr *iph;
 539         struct tcphdr *th;
 540
 541         if (!pskb_may_pull(skb, sizeof(*th)))
 542                 return -EINVAL;
 543
 544         iph = ip_hdr(skb);
 545         th = tcp_hdr(skb);
 546
 547         th->check = 0;
 548         th->check = ~tcp_v4_check(skb->len, iph->saddr, iph->daddr, 0);
 549         skb->csum_start = skb_transport_header(skb) - skb->head;
 550         skb->csum_offset = offsetof(struct tcphdr, check);
 551         skb->ip_summed = CHECKSUM_PARTIAL;
 552         return 0;
 553 }
 554
 555 /*
 556  *      This routine will send an RST to the other tcp.
 557  *
 558  *      Someone asks: why I NEVER use socket parameters (TOS, TTL etc.)
 559  *                    for reset.
 560  *      Answer: if a packet caused RST, it is not for a socket
 561  *              existing in our system, if it is matched to a socket,
 562  *              it is just duplicate segment or bug in other side's TCP.
 563  *              So that we build reply only basing on parameters
 564  *              arrived with segment.
 565  *      Exception: precedence violation. We do not implement it in any case.
 566  */
 567
 568 static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb)
 569 {
 570         struct tcphdr *th = tcp_hdr(skb);
 571         struct {
 572                 struct tcphdr th;
 573 #ifdef CONFIG_TCP_MD5SIG
 574                 __be32 opt[(TCPOLEN_MD5SIG_ALIGNED >> 2)];
 575 #endif
 576         } rep;
 577         struct ip_reply_arg arg;
 578 #ifdef CONFIG_TCP_MD5SIG
 579         struct tcp_md5sig_key *key;
 580 #endif
 581         struct net *net;
 582
 583         /* Never send a reset in response to a reset. */
 584         if (th->rst)
 585                 return;
 586
 587         if (skb_rtable(skb)->rt_type != RTN_LOCAL)
 588                 return;
 589
 590         /* Swap the send and the receive. */
 591         memset(&rep, 0, sizeof(rep));
 592         rep.th.dest   = th->source;
 593         rep.th.source = th->dest;
 594         rep.th.doff   = sizeof(struct tcphdr) / 4;
 595         rep.th.rst    = 1;
 596
 597         if (th->ack) {
 598                 rep.th.seq = th->ack_seq;
 599         } else {
 600                 rep.th.ack = 1;
 601                 rep.th.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
 602                                        skb->len - (th->doff << 2));
 603         }
 604
 605         memset(&arg, 0, sizeof(arg));
 606         arg.iov[0].iov_base = (unsigned char *)&rep;
 607         arg.iov[0].iov_len  = sizeof(rep.th);
 608
 609 #ifdef CONFIG_TCP_MD5SIG
 610         key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL;
 611         if (key) {
 612                 rep.opt[0] = htonl((TCPOPT_NOP << 24) |
 613                                    (TCPOPT_NOP << 16) |
 614                                    (TCPOPT_MD5SIG << 8) |
 615                                    TCPOLEN_MD5SIG);
 616                 /* Update length and the length the header thinks exists */
 617                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 618                 rep.th.doff = arg.iov[0].iov_len / 4;
 619
 620                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[1],
 621                                      key, ip_hdr(skb)->saddr,
 622                                      ip_hdr(skb)->daddr, &rep.th);
 623         }
 624 #endif
 625         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 626                                       ip_hdr(skb)->saddr, /* XXX */
 627                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 628         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 629         arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
 630
 631         net = dev_net(skb_dst(skb)->dev);
 632         ip_send_reply(net->ipv4.tcp_sock, skb,
 633                       &arg, arg.iov[0].iov_len);
 634
 635         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 636         TCP_INC_STATS_BH(net, TCP_MIB_OUTRSTS);
 637 }
 638
 639 /* The code following below sending ACKs in SYN-RECV and TIME-WAIT states
 640    outside socket context is ugly, certainly. What can I do?
 641  */
 642
 643 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
 644                             u32 win, u32 ts, int oif,
 645                             struct tcp_md5sig_key *key,
 646                             int reply_flags)
 647 {
 648         struct tcphdr *th = tcp_hdr(skb);
 649         struct {
 650                 struct tcphdr th;
 651                 __be32 opt[(TCPOLEN_TSTAMP_ALIGNED >> 2)
 652 #ifdef CONFIG_TCP_MD5SIG
 653                            + (TCPOLEN_MD5SIG_ALIGNED >> 2)
 654 #endif
 655                         ];
 656         } rep;
 657         struct ip_reply_arg arg;
 658         struct net *net = dev_net(skb_dst(skb)->dev);
 659
 660         memset(&rep.th, 0, sizeof(struct tcphdr));
 661         memset(&arg, 0, sizeof(arg));
 662
 663         arg.iov[0].iov_base = (unsigned char *)&rep;
 664         arg.iov[0].iov_len  = sizeof(rep.th);
 665         if (ts) {
 666                 rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
 667                                    (TCPOPT_TIMESTAMP << 8) |
 668                                    TCPOLEN_TIMESTAMP);
 669                 rep.opt[1] = htonl(tcp_time_stamp);
 670                 rep.opt[2] = htonl(ts);
 671                 arg.iov[0].iov_len += TCPOLEN_TSTAMP_ALIGNED;
 672         }
 673
 674         /* Swap the send and the receive. */
 675         rep.th.dest    = th->source;
 676         rep.th.source  = th->dest;
 677         rep.th.doff    = arg.iov[0].iov_len / 4;
 678         rep.th.seq     = htonl(seq);
 679         rep.th.ack_seq = htonl(ack);
 680         rep.th.ack     = 1;
 681         rep.th.window  = htons(win);
 682
 683 #ifdef CONFIG_TCP_MD5SIG
 684         if (key) {
 685                 int offset = (ts) ? 3 : 0;
 686
 687                 rep.opt[offset++] = htonl((TCPOPT_NOP << 24) |
 688                                           (TCPOPT_NOP << 16) |
 689                                           (TCPOPT_MD5SIG << 8) |
 690                                           TCPOLEN_MD5SIG);
 691                 arg.iov[0].iov_len += TCPOLEN_MD5SIG_ALIGNED;
 692                 rep.th.doff = arg.iov[0].iov_len/4;
 693
 694                 tcp_v4_md5_hash_hdr((__u8 *) &rep.opt[offset],
 695                                     key, ip_hdr(skb)->saddr,
 696                                     ip_hdr(skb)->daddr, &rep.th);
 697         }
 698 #endif
 699         arg.flags = reply_flags;
 700         arg.csum = csum_tcpudp_nofold(ip_hdr(skb)->daddr,
 701                                       ip_hdr(skb)->saddr, /* XXX */
 702                                       arg.iov[0].iov_len, IPPROTO_TCP, 0);
 703         arg.csumoffset = offsetof(struct tcphdr, check) / 2;
 704         if (oif)
 705                 arg.bound_dev_if = oif;
 706
 707         ip_send_reply(net->ipv4.tcp_sock, skb,
 708                       &arg, arg.iov[0].iov_len);
 709
 710         TCP_INC_STATS_BH(net, TCP_MIB_OUTSEGS);
 711 }
 712
 713 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 714 {
 715         struct inet_timewait_sock *tw = inet_twsk(sk);
 716         struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 717
 718         tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
 719                         tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
 720                         tcptw->tw_ts_recent,
 721                         tw->tw_bound_dev_if,
 722                         tcp_twsk_md5_key(tcptw),
 723                         tw->tw_transparent ? IP_REPLY_ARG_NOSRCCHECK : 0
 724                         );
 725
 726         inet_twsk_put(tw);
 727 }
 728
 729 static void tcp_v4_reqsk_send_ack(struct sock *sk, struct sk_buff *skb,
 730                                   struct request_sock *req)
 731 {
 732         tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1,
 733                         tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
 734                         req->ts_recent,
 735                         0,
 736                         tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr),
 737                         inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0);
 738 }
 739
 740 /*
 741  *      Send a SYN-ACK after having received a SYN.
 742  *      This still operates on a request_sock only, not on a big
 743  *      socket.
 744  */
 745 static int __tcp_v4_send_synack(struct sock *sk, struct dst_entry *dst,
 746                                 struct request_sock *req,
 747                                 struct request_values *rvp)
 748 {
 749         const struct inet_request_sock *ireq = inet_rsk(req);
 750         int err = -1;
 751         struct sk_buff * skb;
 752
 753         /* First, grab a route. */
 754         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
 755                 return -1;
 756
 757         skb = tcp_make_synack(sk, dst, req, rvp);
 758
 759         if (skb) {
 760                 struct tcphdr *th = tcp_hdr(skb);
 761
 762                 th->check = tcp_v4_check(skb->len,
 763                                          ireq->loc_addr,
 764                                          ireq->rmt_addr,
 765                                          csum_partial(th, skb->len,
 766                                                       skb->csum));
 767
 768                 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
 769                                             ireq->rmt_addr,
 770                                             ireq->opt);
 771                 err = net_xmit_eval(err);
 772         }
 773
 774         dst_release(dst);
 775         return err;
 776 }
 777
 778 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
 779                               struct request_values *rvp)
 780 {
 781         return __tcp_v4_send_synack(sk, NULL, req, rvp);
 782 }
 783
 784 /*
 785  *      IPv4 request_sock destructor.
 786  */
 787 static void tcp_v4_reqsk_destructor(struct request_sock *req)
 788 {
 789         kfree(inet_rsk(req)->opt);
 790 }
 791
 792 #ifdef CONFIG_SYN_COOKIES
 793 static void syn_flood_warning(struct sk_buff *skb)
 794 {
 795         static unsigned long warntime;
 796
 797         if (time_after(jiffies, (warntime + HZ * 60))) {
 798                 warntime = jiffies;
 799                 printk(KERN_INFO
 800                        "possible SYN flooding on port %d. Sending cookies.\n",
 801                        ntohs(tcp_hdr(skb)->dest));
 802         }
 803 }
 804 #endif
 805
 806 /*
 807  * Save and compile IPv4 options into the request_sock if needed.
 808  */
 809 static struct ip_options *tcp_v4_save_options(struct sock *sk,
 810                                               struct sk_buff *skb)
 811 {
 812         struct ip_options *opt = &(IPCB(skb)->opt);
 813         struct ip_options *dopt = NULL;
 814
 815         if (opt && opt->optlen) {
 816                 int opt_size = optlength(opt);
 817                 dopt = kmalloc(opt_size, GFP_ATOMIC);
 818                 if (dopt) {
 819                         if (ip_options_echo(dopt, skb)) {
 820                                 kfree(dopt);
 821                                 dopt = NULL;
 822                         }
 823                 }
 824         }
 825         return dopt;
 826 }
 827
 828 #ifdef CONFIG_TCP_MD5SIG
 829 /*
 830  * RFC2385 MD5 checksumming requires a mapping of
 831  * IP address->MD5 Key.
 832  * We need to maintain these in the sk structure.
 833  */
 834
 835 /* Find the Key structure for an address.  */
 836 static struct tcp_md5sig_key *
 837                         tcp_v4_md5_do_lookup(struct sock *sk, __be32 addr)
 838 {
 839         struct tcp_sock *tp = tcp_sk(sk);
 840         int i;
 841
 842         if (!tp->md5sig_info || !tp->md5sig_info->entries4)
 843                 return NULL;
 844         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 845                 if (tp->md5sig_info->keys4[i].addr == addr)
 846                         return &tp->md5sig_info->keys4[i].base;
 847         }
 848         return NULL;
 849 }
 850
 851 struct tcp_md5sig_key *tcp_v4_md5_lookup(struct sock *sk,
 852                                          struct sock *addr_sk)
 853 {
 854         return tcp_v4_md5_do_lookup(sk, inet_sk(addr_sk)->inet_daddr);
 855 }
 856
 857 EXPORT_SYMBOL(tcp_v4_md5_lookup);
 858
 859 static struct tcp_md5sig_key *tcp_v4_reqsk_md5_lookup(struct sock *sk,
 860                                                       struct request_sock *req)
 861 {
 862         return tcp_v4_md5_do_lookup(sk, inet_rsk(req)->rmt_addr);
 863 }
 864
 865 /* This can be called on a newly created socket, from other files */
 866 int tcp_v4_md5_do_add(struct sock *sk, __be32 addr,
 867                       u8 *newkey, u8 newkeylen)
 868 {
 869         /* Add Key to the list */
 870         struct tcp_md5sig_key *key;
 871         struct tcp_sock *tp = tcp_sk(sk);
 872         struct tcp4_md5sig_key *keys;
 873
 874         key = tcp_v4_md5_do_lookup(sk, addr);
 875         if (key) {
 876                 /* Pre-existing entry - just update that one. */
 877                 kfree(key->key);
 878                 key->key = newkey;
 879                 key->keylen = newkeylen;
 880         } else {
 881                 struct tcp_md5sig_info *md5sig;
 882
 883                 if (!tp->md5sig_info) {
 884                         tp->md5sig_info = kzalloc(sizeof(*tp->md5sig_info),
 885                                                   GFP_ATOMIC);
 886                         if (!tp->md5sig_info) {
 887                                 kfree(newkey);
 888                                 return -ENOMEM;
 889                         }
 890                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
 891                 }
 892                 if (tcp_alloc_md5sig_pool(sk) == NULL) {
 893                         kfree(newkey);
 894                         return -ENOMEM;
 895                 }
 896                 md5sig = tp->md5sig_info;
 897
 898                 if (md5sig->alloced4 == md5sig->entries4) {
 899                         keys = kmalloc((sizeof(*keys) *
 900                                         (md5sig->entries4 + 1)), GFP_ATOMIC);
 901                         if (!keys) {
 902                                 kfree(newkey);
 903                                 tcp_free_md5sig_pool();
 904                                 return -ENOMEM;
 905                         }
 906
 907                         if (md5sig->entries4)
 908                                 memcpy(keys, md5sig->keys4,
 909                                        sizeof(*keys) * md5sig->entries4);
 910
 911                         /* Free old key list, and reference new one */
 912                         kfree(md5sig->keys4);
 913                         md5sig->keys4 = keys;
 914                         md5sig->alloced4++;
 915                 }
 916                 md5sig->entries4++;
 917                 md5sig->keys4[md5sig->entries4 - 1].addr        = addr;
 918                 md5sig->keys4[md5sig->entries4 - 1].base.key    = newkey;
 919                 md5sig->keys4[md5sig->entries4 - 1].base.keylen = newkeylen;
 920         }
 921         return 0;
 922 }
 923
 924 EXPORT_SYMBOL(tcp_v4_md5_do_add);
 925
 926 static int tcp_v4_md5_add_func(struct sock *sk, struct sock *addr_sk,
 927                                u8 *newkey, u8 newkeylen)
 928 {
 929         return tcp_v4_md5_do_add(sk, inet_sk(addr_sk)->inet_daddr,
 930                                  newkey, newkeylen);
 931 }
 932
 933 int tcp_v4_md5_do_del(struct sock *sk, __be32 addr)
 934 {
 935         struct tcp_sock *tp = tcp_sk(sk);
 936         int i;
 937
 938         for (i = 0; i < tp->md5sig_info->entries4; i++) {
 939                 if (tp->md5sig_info->keys4[i].addr == addr) {
 940                         /* Free the key */
 941                         kfree(tp->md5sig_info->keys4[i].base.key);
 942                         tp->md5sig_info->entries4--;
 943
 944                         if (tp->md5sig_info->entries4 == 0) {
 945                                 kfree(tp->md5sig_info->keys4);
 946                                 tp->md5sig_info->keys4 = NULL;
 947                                 tp->md5sig_info->alloced4 = 0;
 948                         } else if (tp->md5sig_info->entries4 != i) {
 949                                 /* Need to do some manipulation */
 950                                 memmove(&tp->md5sig_info->keys4[i],
 951                                         &tp->md5sig_info->keys4[i+1],
 952                                         (tp->md5sig_info->entries4 - i) *
 953                                          sizeof(struct tcp4_md5sig_key));
 954                         }
 955                         tcp_free_md5sig_pool();
 956                         return 0;
 957                 }
 958         }
 959         return -ENOENT;
 960 }
 961
 962 EXPORT_SYMBOL(tcp_v4_md5_do_del);
 963
 964 static void tcp_v4_clear_md5_list(struct sock *sk)
 965 {
 966         struct tcp_sock *tp = tcp_sk(sk);
 967
 968         /* Free each key, then the set of key keys,
 969          * the crypto element, and then decrement our
 970          * hold on the last resort crypto.
 971          */
 972         if (tp->md5sig_info->entries4) {
 973                 int i;
 974                 for (i = 0; i < tp->md5sig_info->entries4; i++)
 975                         kfree(tp->md5sig_info->keys4[i].base.key);
 976                 tp->md5sig_info->entries4 = 0;
 977                 tcp_free_md5sig_pool();
 978         }
 979         if (tp->md5sig_info->keys4) {
 980                 kfree(tp->md5sig_info->keys4);
 981                 tp->md5sig_info->keys4 = NULL;
 982                 tp->md5sig_info->alloced4  = 0;
 983         }
 984 }
 985
 986 static int tcp_v4_parse_md5_keys(struct sock *sk, char __user *optval,
 987                                  int optlen)
 988 {
 989         struct tcp_md5sig cmd;
 990         struct sockaddr_in *sin = (struct sockaddr_in *)&cmd.tcpm_addr;
 991         u8 *newkey;
 992
 993         if (optlen < sizeof(cmd))
 994                 return -EINVAL;
 995
 996         if (copy_from_user(&cmd, optval, sizeof(cmd)))
 997                 return -EFAULT;
 998
 999         if (sin->sin_family != AF_INET)
1000                 return -EINVAL;
1001
1002         if (!cmd.tcpm_key || !cmd.tcpm_keylen) {
1003                 if (!tcp_sk(sk)->md5sig_info)
1004                         return -ENOENT;
1005                 return tcp_v4_md5_do_del(sk, sin->sin_addr.s_addr);
1006         }
1007
1008         if (cmd.tcpm_keylen > TCP_MD5SIG_MAXKEYLEN)
1009                 return -EINVAL;
1010
1011         if (!tcp_sk(sk)->md5sig_info) {
1012                 struct tcp_sock *tp = tcp_sk(sk);
1013                 struct tcp_md5sig_info *p;
1014
1015                 p = kzalloc(sizeof(*p), sk->sk_allocation);
1016                 if (!p)
1017                         return -EINVAL;
1018
1019                 tp->md5sig_info = p;
1020                 sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1021         }
1022
1023         newkey = kmemdup(cmd.tcpm_key, cmd.tcpm_keylen, sk->sk_allocation);
1024         if (!newkey)
1025                 return -ENOMEM;
1026         return tcp_v4_md5_do_add(sk, sin->sin_addr.s_addr,
1027                                  newkey, cmd.tcpm_keylen);
1028 }
1029
1030 static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
1031                                         __be32 daddr, __be32 saddr, int nbytes)
1032 {
1033         struct tcp4_pseudohdr *bp;
1034         struct scatterlist sg;
1035
1036         bp = &hp->md5_blk.ip4;
1037
1038         /*
1039          * 1. the TCP pseudo-header (in the order: source IP address,
1040          * destination IP address, zero-padded protocol number, and
1041          * segment length)
1042          */
1043         bp->saddr = saddr;
1044         bp->daddr = daddr;
1045         bp->pad = 0;
1046         bp->protocol = IPPROTO_TCP;
1047         bp->len = cpu_to_be16(nbytes);
1048
1049         sg_init_one(&sg, bp, sizeof(*bp));
1050         return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
1051 }
1052
1053 static int tcp_v4_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
1054                                __be32 daddr, __be32 saddr, struct tcphdr *th)
1055 {
1056         struct tcp_md5sig_pool *hp;
1057         struct hash_desc *desc;
1058
1059         hp = tcp_get_md5sig_pool();
1060         if (!hp)
1061                 goto clear_hash_noput;
1062         desc = &hp->md5_desc;
1063
1064         if (crypto_hash_init(desc))
1065                 goto clear_hash;
1066         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
1067                 goto clear_hash;
1068         if (tcp_md5_hash_header(hp, th))
1069                 goto clear_hash;
1070         if (tcp_md5_hash_key(hp, key))
1071                 goto clear_hash;
1072         if (crypto_hash_final(desc, md5_hash))
1073                 goto clear_hash;
1074
1075         tcp_put_md5sig_pool();
1076         return 0;
1077
1078 clear_hash:
1079         tcp_put_md5sig_pool();
1080 clear_hash_noput:
1081         memset(md5_hash, 0, 16);
1082         return 1;
1083 }
1084
1085 int tcp_v4_md5_hash_skb(char *md5_hash, struct tcp_md5sig_key *key,
1086                         struct sock *sk, struct request_sock *req,
1087                         struct sk_buff *skb)
1088 {
1089         struct tcp_md5sig_pool *hp;
1090         struct hash_desc *desc;
1091         struct tcphdr *th = tcp_hdr(skb);
1092         __be32 saddr, daddr;
1093
1094         if (sk) {
1095                 saddr = inet_sk(sk)->inet_saddr;
1096                 daddr = inet_sk(sk)->inet_daddr;
1097         } else if (req) {
1098                 saddr = inet_rsk(req)->loc_addr;
1099                 daddr = inet_rsk(req)->rmt_addr;
1100         } else {
1101                 const struct iphdr *iph = ip_hdr(skb);
1102                 saddr = iph->saddr;
1103                 daddr = iph->daddr;
1104         }
1105
1106         hp = tcp_get_md5sig_pool();
1107         if (!hp)
1108                 goto clear_hash_noput;
1109         desc = &hp->md5_desc;
1110
1111         if (crypto_hash_init(desc))
1112                 goto clear_hash;
1113
1114         if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
1115                 goto clear_hash;
1116         if (tcp_md5_hash_header(hp, th))
1117                 goto clear_hash;
1118         if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))
1119                 goto clear_hash;
1120         if (tcp_md5_hash_key(hp, key))
1121                 goto clear_hash;
1122         if (crypto_hash_final(desc, md5_hash))
1123                 goto clear_hash;
1124
1125         tcp_put_md5sig_pool();
1126         return 0;
1127
1128 clear_hash:
1129         tcp_put_md5sig_pool();
1130 clear_hash_noput:
1131         memset(md5_hash, 0, 16);
1132         return 1;
1133 }
1134
1135 EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
1136
1137 static int tcp_v4_inbound_md5_hash(struct sock *sk, struct sk_buff *skb)
1138 {
1139         /*
1140          * This gets called for each TCP segment that arrives
1141          * so we want to be efficient.
1142          * We have 3 drop cases:
1143          * o No MD5 hash and one expected.
1144          * o MD5 hash and we're not expecting one.
1145          * o MD5 hash and its wrong.
1146          */
1147         __u8 *hash_location = NULL;
1148         struct tcp_md5sig_key *hash_expected;
1149         const struct iphdr *iph = ip_hdr(skb);
1150         struct tcphdr *th = tcp_hdr(skb);
1151         int genhash;
1152         unsigned char newhash[16];
1153
1154         hash_expected = tcp_v4_md5_do_lookup(sk, iph->saddr);
1155         hash_location = tcp_parse_md5sig_option(th);
1156
1157         /* We've parsed the options - do we have a hash? */
1158         if (!hash_expected && !hash_location)
1159                 return 0;
1160
1161         if (hash_expected && !hash_location) {
1162                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
1163                 return 1;
1164         }
1165
1166         if (!hash_expected && hash_location) {
1167                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
1168                 return 1;
1169         }
1170
1171         /* Okay, so this is hash_expected and hash_location -
1172          * so we need to calculate the checksum.
1173          */
1174         genhash = tcp_v4_md5_hash_skb(newhash,
1175                                       hash_expected,
1176                                       NULL, NULL, skb);
1177
1178         if (genhash || memcmp(hash_location, newhash, 16) != 0) {
1179                 if (net_ratelimit()) {
1180                         printk(KERN_INFO "MD5 Hash failed for (%pI4, %d)->(%pI4, %d)%s\n",
1181                                &iph->saddr, ntohs(th->source),
1182                                &iph->daddr, ntohs(th->dest),
1183                                genhash ? " tcp_v4_calc_md5_hash failed" : "");
1184                 }
1185                 return 1;
1186         }
1187         return 0;
1188 }
1189
1190 #endif
1191
1192 struct request_sock_ops tcp_request_sock_ops __read_mostly = {
1193         .family         =       PF_INET,
1194         .obj_size       =       sizeof(struct tcp_request_sock),
1195         .rtx_syn_ack    =       tcp_v4_send_synack,
1196         .send_ack       =       tcp_v4_reqsk_send_ack,
1197         .destructor     =       tcp_v4_reqsk_destructor,
1198         .send_reset     =       tcp_v4_send_reset,
1199 };
1200
#ifdef CONFIG_TCP_MD5SIG
/* IPv4 MD5 hooks attached to each request_sock in tcp_v4_conn_request(). */
static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
	.calc_md5_hash	= tcp_v4_md5_hash_skb,
	.md5_lookup	= tcp_v4_reqsk_md5_lookup,
};
#endif
1207
1208 static struct timewait_sock_ops tcp_timewait_sock_ops = {
1209         .twsk_obj_size  = sizeof(struct tcp_timewait_sock),
1210         .twsk_unique    = tcp_twsk_unique,
1211         .twsk_destructor= tcp_twsk_destructor,
1212 };
1213
1214 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1215 {
1216         struct tcp_options_received tmp_opt;
1217         struct request_sock *req;
1218         struct inet_request_sock *ireq;
1219         struct dst_entry *dst = NULL;
1220         __be32 saddr = ip_hdr(skb)->saddr;
1221         __be32 daddr = ip_hdr(skb)->daddr;
1222         __u32 isn = TCP_SKB_CB(skb)->when;
1223 #ifdef CONFIG_SYN_COOKIES
1224         int want_cookie = 0;
1225 #else
1226 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1227 #endif
1228
1229         /* Never answer to SYNs send to broadcast or multicast */
1230         if (skb_rtable(skb)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))
1231                 goto drop;
1232
1233         /* TW buckets are converted to open requests without
1234          * limitations, they conserve resources and peer is
1235          * evidently real one.
1236          */
1237         if (inet_csk_reqsk_queue_is_full(sk) && !isn) {
1238 #ifdef CONFIG_SYN_COOKIES
1239                 if (sysctl_tcp_syncookies) {
1240                         want_cookie = 1;
1241                 } else
1242 #endif
1243                 goto drop;
1244         }
1245
1246         /* Accept backlog is full. If we have already queued enough
1247          * of warm entries in syn queue, drop request. It is better than
1248          * clogging syn queue with openreqs with exponentially increasing
1249          * timeout.
1250          */
1251         if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1)
1252                 goto drop;
1253
1254         req = inet_reqsk_alloc(&tcp_request_sock_ops);
1255         if (!req)
1256                 goto drop;
1257
1258 #ifdef CONFIG_TCP_MD5SIG
1259         tcp_rsk(req)->af_specific = &tcp_request_sock_ipv4_ops;
1260 #endif
1261
1262         ireq = inet_rsk(req);
1263         ireq->loc_addr = daddr;
1264         ireq->rmt_addr = saddr;
1265         ireq->no_srccheck = inet_sk(sk)->transparent;
1266         ireq->opt = tcp_v4_save_options(sk, skb);
1267
1268         dst = inet_csk_route_req(sk, req);
1269         if(!dst)
1270                 goto drop_and_free;
1271
1272         tcp_clear_options(&tmp_opt);
1273         tmp_opt.mss_clamp = TCP_MSS_DEFAULT;
1274         tmp_opt.user_mss  = tcp_sk(sk)->rx_opt.user_mss;
1275
1276         tcp_parse_options(skb, &tmp_opt, 0, dst);
1277
1278         if (want_cookie && !tmp_opt.saw_tstamp)
1279                 tcp_clear_options(&tmp_opt);
1280
1281         tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1282
1283         tcp_openreq_init(req, &tmp_opt, skb);
1284
1285         if (security_inet_conn_request(sk, skb, req))
1286                 goto drop_and_release;
1287
1288         if (!want_cookie)
1289                 TCP_ECN_create_request(req, tcp_hdr(skb));
1290
1291         if (want_cookie) {
1292 #ifdef CONFIG_SYN_COOKIES
1293                 syn_flood_warning(skb);
1294                 req->cookie_ts = tmp_opt.tstamp_ok;
1295 #endif
1296                 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1297         } else if (!isn) {
1298                 struct inet_peer *peer = NULL;
1299
1300                 /* VJ's idea. We save last timestamp seen
1301                  * from the destination in peer table, when entering
1302                  * state TIME-WAIT, and check against it before
1303                  * accepting new connection request.
1304                  *
1305                  * If "isn" is not zero, this request hit alive
1306                  * timewait bucket, so that all the necessary checks
1307                  * are made in the function processing timewait state.
1308                  */
1309                 if (tmp_opt.saw_tstamp &&
1310                     tcp_death_row.sysctl_tw_recycle &&
1311                     (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1312                     peer->v4daddr == saddr) {
1313                         if ((u32)get_seconds() - peer->tcp_ts_stamp < TCP_PAWS_MSL &&
1314                             (s32)(peer->tcp_ts - req->ts_recent) >
1315                                                         TCP_PAWS_WINDOW) {
1316                                 NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
1317                                 goto drop_and_release;
1318                         }
1319                 }
1320                 /* Kill the following clause, if you dislike this way. */
1321                 else if (!sysctl_tcp_syncookies &&
1322                          (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
1323                           (sysctl_max_syn_backlog >> 2)) &&
1324                          (!peer || !peer->tcp_ts_stamp) &&
1325                          (!dst || !dst_metric(dst, RTAX_RTT))) {
1326                         /* Without syncookies last quarter of
1327                          * backlog is filled with destinations,
1328                          * proven to be alive.
1329                          * It means that we continue to communicate
1330                          * to destinations, already remembered
1331                          * to the moment of synflood.
1332                          */
1333                         LIMIT_NETDEBUG(KERN_DEBUG "TCP: drop open request from %pI4/%u\n",
1334                                        &saddr, ntohs(tcp_hdr(skb)->source));
1335                         goto drop_and_release;
1336                 }
1337
1338                 isn = tcp_v4_init_sequence(skb);
1339         }
1340         tcp_rsk(req)->snt_isn = isn;
1341
1342         if (__tcp_v4_send_synack(sk, dst, req, NULL) || want_cookie)
1343                 goto drop_and_free;
1344
1345         inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
1346         return 0;
1347
1348 drop_and_release:
1349         dst_release(dst);
1350 drop_and_free:
1351         reqsk_free(req);
1352 drop:
1353         return 0;
1354 }
1355
1356
1357 /*
1358  * The three way handshake has completed - we got a valid synack -
1359  * now create the new socket.
1360  */
1361 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1362                                   struct request_sock *req,
1363                                   struct dst_entry *dst)
1364 {
1365         struct inet_request_sock *ireq;
1366         struct inet_sock *newinet;
1367         struct tcp_sock *newtp;
1368         struct sock *newsk;
1369 #ifdef CONFIG_TCP_MD5SIG
1370         struct tcp_md5sig_key *key;
1371 #endif
1372
1373         if (sk_acceptq_is_full(sk))
1374                 goto exit_overflow;
1375
1376         if (!dst && (dst = inet_csk_route_req(sk, req)) == NULL)
1377                 goto exit;
1378
1379         newsk = tcp_create_openreq_child(sk, req, skb);
1380         if (!newsk)
1381                 goto exit;
1382
1383         newsk->sk_gso_type = SKB_GSO_TCPV4;
1384         sk_setup_caps(newsk, dst);
1385
1386         newtp                 = tcp_sk(newsk);
1387         newinet               = inet_sk(newsk);
1388         ireq                  = inet_rsk(req);
1389         newinet->inet_daddr   = ireq->rmt_addr;
1390         newinet->inet_rcv_saddr = ireq->loc_addr;
1391         newinet->inet_saddr           = ireq->loc_addr;
1392         newinet->opt          = ireq->opt;
1393         ireq->opt             = NULL;
1394         newinet->mc_index     = inet_iif(skb);
1395         newinet->mc_ttl       = ip_hdr(skb)->ttl;
1396         inet_csk(newsk)->icsk_ext_hdr_len = 0;
1397         if (newinet->opt)
1398                 inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen;
1399         newinet->inet_id = newtp->write_seq ^ jiffies;
1400
1401         tcp_mtup_init(newsk);
1402         tcp_sync_mss(newsk, dst_mtu(dst));
1403         newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1404         if (tcp_sk(sk)->rx_opt.user_mss &&
1405             tcp_sk(sk)->rx_opt.user_mss < newtp->advmss)
1406                 newtp->advmss = tcp_sk(sk)->rx_opt.user_mss;
1407
1408         tcp_initialize_rcv_mss(newsk);
1409
1410 #ifdef CONFIG_TCP_MD5SIG
1411         /* Copy over the MD5 key from the original socket */
1412         key = tcp_v4_md5_do_lookup(sk, newinet->inet_daddr);
1413         if (key != NULL) {
1414                 /*
1415                  * We're using one, so create a matching key
1416                  * on the newsk structure. If we fail to get
1417                  * memory, then we end up not copying the key
1418                  * across. Shucks.
1419                  */
1420                 char *newkey = kmemdup(key->key, key->keylen, GFP_ATOMIC);
1421                 if (newkey != NULL)
1422                         tcp_v4_md5_do_add(newsk, newinet->inet_daddr,
1423                                           newkey, key->keylen);
1424                 newsk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1425         }
1426 #endif
1427
1428         __inet_hash_nolisten(newsk);
1429         __inet_inherit_port(sk, newsk);
1430
1431         return newsk;
1432
1433 exit_overflow:
1434         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
1435 exit:
1436         NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS);
1437         dst_release(dst);
1438         return NULL;
1439 }
1440
1441 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1442 {
1443         struct tcphdr *th = tcp_hdr(skb);
1444         const struct iphdr *iph = ip_hdr(skb);
1445         struct sock *nsk;
1446         struct request_sock **prev;
1447         /* Find possible connection requests. */
1448         struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
1449                                                        iph->saddr, iph->daddr);
1450         if (req)
1451                 return tcp_check_req(sk, skb, req, prev);
1452
1453         nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
1454                         th->source, iph->daddr, th->dest, inet_iif(skb));
1455
1456         if (nsk) {
1457                 if (nsk->sk_state != TCP_TIME_WAIT) {
1458                         bh_lock_sock(nsk);
1459                         return nsk;
1460                 }
1461                 inet_twsk_put(inet_twsk(nsk));
1462                 return NULL;
1463         }
1464
1465 #ifdef CONFIG_SYN_COOKIES
1466         if (!th->rst && !th->syn && th->ack)
1467                 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1468 #endif
1469         return sk;
1470 }
1471
1472 static __sum16 tcp_v4_checksum_init(struct sk_buff *skb)
1473 {
1474         const struct iphdr *iph = ip_hdr(skb);
1475
1476         if (skb->ip_summed == CHECKSUM_COMPLETE) {
1477                 if (!tcp_v4_check(skb->len, iph->saddr,
1478                                   iph->daddr, skb->csum)) {
1479                         skb->ip_summed = CHECKSUM_UNNECESSARY;
1480                         return 0;
1481                 }
1482         }
1483
1484         skb->csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
1485                                        skb->len, IPPROTO_TCP, 0);
1486
1487         if (skb->len <= 76) {
1488                 return __skb_checksum_complete(skb);
1489         }
1490         return 0;
1491 }
1492
1493
1494 /* The socket must have it's spinlock held when we get
1495  * here.
1496  *
1497  * We have a potential double-lock case here, so even when
1498  * doing backlog processing we use the BH locking scheme.
1499  * This is because we cannot sleep with the original spinlock
1500  * held.
1501  */
1502 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1503 {
1504         struct sock *rsk;
1505 #ifdef CONFIG_TCP_MD5SIG
1506         /*
1507          * We really want to reject the packet as early as possible
1508          * if:
1509          *  o We're expecting an MD5'd packet and this is no MD5 tcp option
1510          *  o There is an MD5 option and we're not expecting one
1511          */
1512         if (tcp_v4_inbound_md5_hash(sk, skb))
1513                 goto discard;
1514 #endif
1515
1516         if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1517                 TCP_CHECK_TIMER(sk);
1518                 if (tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len)) {
1519                         rsk = sk;
1520                         goto reset;
1521                 }
1522                 TCP_CHECK_TIMER(sk);
1523                 return 0;
1524         }
1525
1526         if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
1527                 goto csum_err;
1528
1529         if (sk->sk_state == TCP_LISTEN) {
1530                 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1531                 if (!nsk)
1532                         goto discard;
1533
1534                 if (nsk != sk) {
1535                         if (tcp_child_process(sk, nsk, skb)) {
1536                                 rsk = nsk;
1537                                 goto reset;
1538                         }
1539                         return 0;
1540                 }
1541         }
1542
1543         TCP_CHECK_TIMER(sk);
1544         if (tcp_rcv_state_process(sk, skb, tcp_hdr(skb), skb->len)) {
1545                 rsk = sk;
1546                 goto reset;
1547         }
1548         TCP_CHECK_TIMER(sk);
1549         return 0;
1550
1551 reset:
1552         tcp_v4_send_reset(rsk, skb);
1553 discard:
1554         kfree_skb(skb);
1555         /* Be careful here. If this function gets more complicated and
1556          * gcc suffers from register pressure on the x86, sk (in %ebx)
1557          * might be destroyed here. This current version compiles correctly,
1558          * but you have been warned.
1559          */
1560         return 0;
1561
1562 csum_err:
1563         TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
1564         goto discard;
1565 }
1566
1567 /*
1568  *      From tcp_input.c
1569  */
1570
1571 int tcp_v4_rcv(struct sk_buff *skb)
1572 {
1573         const struct iphdr *iph;
1574         struct tcphdr *th;
1575         struct sock *sk;
1576         int ret;
1577         struct net *net = dev_net(skb->dev);
1578
1579         if (skb->pkt_type != PACKET_HOST)
1580                 goto discard_it;
1581
1582         /* Count it even if it's bad */
1583         TCP_INC_STATS_BH(net, TCP_MIB_INSEGS);
1584
1585         if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1586                 goto discard_it;
1587
1588         th = tcp_hdr(skb);
1589
1590         if (th->doff < sizeof(struct tcphdr) / 4)
1591                 goto bad_packet;
1592         if (!pskb_may_pull(skb, th->doff * 4))
1593                 goto discard_it;
1594
1595         /* An explanation is required here, I think.
1596          * Packet length and doff are validated by header prediction,
1597          * provided case of th->doff==0 is eliminated.
1598          * So, we defer the checks. */
1599         if (!skb_csum_unnecessary(skb) && tcp_v4_checksum_init(skb))
1600                 goto bad_packet;
1601
1602         th = tcp_hdr(skb);
1603         iph = ip_hdr(skb);
1604         TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1605         TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1606                                     skb->len - th->doff * 4);
1607         TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1608         TCP_SKB_CB(skb)->when    = 0;
1609         TCP_SKB_CB(skb)->flags   = iph->tos;
1610         TCP_SKB_CB(skb)->sacked  = 0;
1611
1612         sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
1613         if (!sk)
1614                 goto no_tcp_socket;
1615
1616 process:
1617         if (sk->sk_state == TCP_TIME_WAIT)
1618                 goto do_time_wait;
1619
1620         if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1621                 goto discard_and_relse;
1622         nf_reset(skb);
1623
1624         if (sk_filter(sk, skb))
1625                 goto discard_and_relse;
1626
1627         skb->dev = NULL;
1628
1629         bh_lock_sock_nested(sk);
1630         ret = 0;
1631         if (!sock_owned_by_user(sk)) {
1632 #ifdef CONFIG_NET_DMA
1633                 struct tcp_sock *tp = tcp_sk(sk);
1634                 if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
1635                         tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
1636                 if (tp->ucopy.dma_chan)
1637                         ret = tcp_v4_do_rcv(sk, skb);
1638                 else
1639 #endif
1640                 {
1641                         if (!tcp_prequeue(sk, skb))
1642                                 ret = tcp_v4_do_rcv(sk, skb);
1643                 }
1644         } else
1645                 sk_add_backlog(sk, skb);
1646         bh_unlock_sock(sk);
1647
1648         sock_put(sk);
1649
1650         return ret;
1651
1652 no_tcp_socket:
1653         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1654                 goto discard_it;
1655
1656         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1657 bad_packet:
1658                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1659         } else {
1660                 tcp_v4_send_reset(NULL, skb);
1661         }
1662
1663 discard_it:
1664         /* Discard frame. */
1665         kfree_skb(skb);
1666         return 0;
1667
1668 discard_and_relse:
1669         sock_put(sk);
1670         goto discard_it;
1671
1672 do_time_wait:
1673         if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1674                 inet_twsk_put(inet_twsk(sk));
1675                 goto discard_it;
1676         }
1677
1678         if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1679                 TCP_INC_STATS_BH(net, TCP_MIB_INERRS);
1680                 inet_twsk_put(inet_twsk(sk));
1681                 goto discard_it;
1682         }
1683         switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
1684         case TCP_TW_SYN: {
1685                 struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
1686                                                         &tcp_hashinfo,
1687                                                         iph->daddr, th->dest,
1688                                                         inet_iif(skb));
1689                 if (sk2) {
1690                         inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row);
1691                         inet_twsk_put(inet_twsk(sk));
1692                         sk = sk2;
1693                         goto process;
1694                 }
1695                 /* Fall through to ACK */
1696         }
1697         case TCP_TW_ACK:
1698                 tcp_v4_timewait_ack(sk, skb);
1699                 break;
1700         case TCP_TW_RST:
1701                 goto no_tcp_socket;
1702         case TCP_TW_SUCCESS:;
1703         }
1704         goto discard_it;
1705 }
1706
1707 /* VJ's idea. Save last timestamp seen from this destination
1708  * and hold it at least for normal timewait interval to use for duplicate
1709  * segment detection in subsequent connections, before they enter synchronized
1710  * state.
1711  */
1712
1713 int tcp_v4_remember_stamp(struct sock *sk)
1714 {
1715         struct inet_sock *inet = inet_sk(sk);
1716         struct tcp_sock *tp = tcp_sk(sk);
1717         struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1718         struct inet_peer *peer = NULL;
1719         int release_it = 0;
1720
1721         if (!rt || rt->rt_dst != inet->inet_daddr) {
1722                 peer = inet_getpeer(inet->inet_daddr, 1);
1723                 release_it = 1;
1724         } else {
1725                 if (!rt->peer)
1726                         rt_bind_peer(rt, 1);
1727                 peer = rt->peer;
1728         }
1729
1730         if (peer) {
1731                 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1732                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1733                      peer->tcp_ts_stamp <= (u32)tp->rx_opt.ts_recent_stamp)) {
1734                         peer->tcp_ts_stamp = (u32)tp->rx_opt.ts_recent_stamp;
1735                         peer->tcp_ts = tp->rx_opt.ts_recent;
1736                 }
1737                 if (release_it)
1738                         inet_putpeer(peer);
1739                 return 1;
1740         }
1741
1742         return 0;
1743 }
1744
1745 int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw)
1746 {
1747         struct inet_peer *peer = inet_getpeer(tw->tw_daddr, 1);
1748
1749         if (peer) {
1750                 const struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
1751
1752                 if ((s32)(peer->tcp_ts - tcptw->tw_ts_recent) <= 0 ||
1753                     ((u32)get_seconds() - peer->tcp_ts_stamp > TCP_PAWS_MSL &&
1754                      peer->tcp_ts_stamp <= (u32)tcptw->tw_ts_recent_stamp)) {
1755                         peer->tcp_ts_stamp = (u32)tcptw->tw_ts_recent_stamp;
1756                         peer->tcp_ts       = tcptw->tw_ts_recent;
1757                 }
1758                 inet_putpeer(peer);
1759                 return 1;
1760         }
1761
1762         return 0;
1763 }
1764
1765 const struct inet_connection_sock_af_ops ipv4_specific = {
1766         .queue_xmit        = ip_queue_xmit,
1767         .send_check        = tcp_v4_send_check,
1768         .rebuild_header    = inet_sk_rebuild_header,
1769         .conn_request      = tcp_v4_conn_request,
1770         .syn_recv_sock     = tcp_v4_syn_recv_sock,
1771         .remember_stamp    = tcp_v4_remember_stamp,
1772         .net_header_len    = sizeof(struct iphdr),
1773         .setsockopt        = ip_setsockopt,
1774         .getsockopt        = ip_getsockopt,
1775         .addr2sockaddr     = inet_csk_addr2sockaddr,
1776         .sockaddr_len      = sizeof(struct sockaddr_in),
1777         .bind_conflict     = inet_csk_bind_conflict,
1778 #ifdef CONFIG_COMPAT
1779         .compat_setsockopt = compat_ip_setsockopt,
1780         .compat_getsockopt = compat_ip_getsockopt,
1781 #endif
1782 };
1783
1784 #ifdef CONFIG_TCP_MD5SIG
1785 static const struct tcp_sock_af_ops tcp_sock_ipv4_specific = {
1786         .md5_lookup             = tcp_v4_md5_lookup,
1787         .calc_md5_hash          = tcp_v4_md5_hash_skb,
1788         .md5_add                = tcp_v4_md5_add_func,
1789         .md5_parse              = tcp_v4_parse_md5_keys,
1790 };
1791 #endif
1792
1793 /* NOTE: A lot of things set to zero explicitly by call to
1794  *       sk_alloc() so need not be done here.
1795  */
1796 static int tcp_v4_init_sock(struct sock *sk)
1797 {
1798         struct inet_connection_sock *icsk = inet_csk(sk);
1799         struct tcp_sock *tp = tcp_sk(sk);
1800
1801         skb_queue_head_init(&tp->out_of_order_queue);
1802         tcp_init_xmit_timers(sk);
1803         tcp_prequeue_init(tp);
1804
1805         icsk->icsk_rto = TCP_TIMEOUT_INIT;
1806         tp->mdev = TCP_TIMEOUT_INIT;
1807
1808         /* So many TCP implementations out there (incorrectly) count the
1809          * initial SYN frame in their delayed-ACK and congestion control
1810          * algorithms that we must have the following bandaid to talk
1811          * efficiently to them.  -DaveM
1812          */
1813         tp->snd_cwnd = 2;
1814
1815         /* See draft-stevens-tcpca-spec-01 for discussion of the
1816          * initialization of these values.
1817          */
1818         tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
1819         tp->snd_cwnd_clamp = ~0;
1820         tp->mss_cache = TCP_MSS_DEFAULT;
1821
1822         tp->reordering = sysctl_tcp_reordering;
1823         icsk->icsk_ca_ops = &tcp_init_congestion_ops;
1824
1825         sk->sk_state = TCP_CLOSE;
1826
1827         sk->sk_write_space = sk_stream_write_space;
1828         sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1829
1830         icsk->icsk_af_ops = &ipv4_specific;
1831         icsk->icsk_sync_mss = tcp_sync_mss;
1832 #ifdef CONFIG_TCP_MD5SIG
1833         tp->af_specific = &tcp_sock_ipv4_specific;
1834 #endif
1835
1836         sk->sk_sndbuf = sysctl_tcp_wmem[1];
1837         sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1838
1839         local_bh_disable();
1840         percpu_counter_inc(&tcp_sockets_allocated);
1841         local_bh_enable();
1842
1843         return 0;
1844 }
1845
1846 void tcp_v4_destroy_sock(struct sock *sk)
1847 {
1848         struct tcp_sock *tp = tcp_sk(sk);
1849
1850         tcp_clear_xmit_timers(sk);
1851
1852         tcp_cleanup_congestion_control(sk);
1853
1854         /* Cleanup up the write buffer. */
1855         tcp_write_queue_purge(sk);
1856
1857         /* Cleans up our, hopefully empty, out_of_order_queue. */
1858         __skb_queue_purge(&tp->out_of_order_queue);
1859
1860 #ifdef CONFIG_TCP_MD5SIG
1861         /* Clean up the MD5 key list, if any */
1862         if (tp->md5sig_info) {
1863                 tcp_v4_clear_md5_list(sk);
1864                 kfree(tp->md5sig_info);
1865                 tp->md5sig_info = NULL;
1866         }
1867 #endif
1868
1869 #ifdef CONFIG_NET_DMA
1870         /* Cleans up our sk_async_wait_queue */
1871         __skb_queue_purge(&sk->sk_async_wait_queue);
1872 #endif
1873
1874         /* Clean prequeue, it must be empty really */
1875         __skb_queue_purge(&tp->ucopy.prequeue);
1876
1877         /* Clean up a referenced TCP bind bucket. */
1878         if (inet_csk(sk)->icsk_bind_hash)
1879                 inet_put_port(sk);
1880
1881         /*
1882          * If sendmsg cached page exists, toss it.
1883          */
1884         if (sk->sk_sndmsg_page) {
1885                 __free_page(sk->sk_sndmsg_page);
1886                 sk->sk_sndmsg_page = NULL;
1887         }
1888
1889         percpu_counter_dec(&tcp_sockets_allocated);
1890 }
1891
1892 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1893
1894 #ifdef CONFIG_PROC_FS
1895 /* Proc filesystem TCP sock list dumping. */
1896
1897 static inline struct inet_timewait_sock *tw_head(struct hlist_nulls_head *head)
1898 {
1899         return hlist_nulls_empty(head) ? NULL :
1900                 list_entry(head->first, struct inet_timewait_sock, tw_node);
1901 }
1902
1903 static inline struct inet_timewait_sock *tw_next(struct inet_timewait_sock *tw)
1904 {
1905         return !is_a_nulls(tw->tw_node.next) ?
1906                 hlist_nulls_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1907 }
1908
1909 static void *listening_get_next(struct seq_file *seq, void *cur)
1910 {
1911         struct inet_connection_sock *icsk;
1912         struct hlist_nulls_node *node;
1913         struct sock *sk = cur;
1914         struct inet_listen_hashbucket *ilb;
1915         struct tcp_iter_state *st = seq->private;
1916         struct net *net = seq_file_net(seq);
1917
1918         if (!sk) {
1919                 st->bucket = 0;
1920                 ilb = &tcp_hashinfo.listening_hash[0];
1921                 spin_lock_bh(&ilb->lock);
1922                 sk = sk_nulls_head(&ilb->head);
1923                 goto get_sk;
1924         }
1925         ilb = &tcp_hashinfo.listening_hash[st->bucket];
1926         ++st->num;
1927
1928         if (st->state == TCP_SEQ_STATE_OPENREQ) {
1929                 struct request_sock *req = cur;
1930
1931                 icsk = inet_csk(st->syn_wait_sk);
1932                 req = req->dl_next;
1933                 while (1) {
1934                         while (req) {
1935                                 if (req->rsk_ops->family == st->family) {
1936                                         cur = req;
1937                                         goto out;
1938                                 }
1939                                 req = req->dl_next;
1940                         }
1941                         if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
1942                                 break;
1943 get_req:
1944                         req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
1945                 }
1946                 sk        = sk_next(st->syn_wait_sk);
1947                 st->state = TCP_SEQ_STATE_LISTENING;
1948                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1949         } else {
1950                 icsk = inet_csk(sk);
1951                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1952                 if (reqsk_queue_len(&icsk->icsk_accept_queue))
1953                         goto start_req;
1954                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1955                 sk = sk_next(sk);
1956         }
1957 get_sk:
1958         sk_nulls_for_each_from(sk, node) {
1959                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net)) {
1960                         cur = sk;
1961                         goto out;
1962                 }
1963                 icsk = inet_csk(sk);
1964                 read_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1965                 if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
1966 start_req:
1967                         st->uid         = sock_i_uid(sk);
1968                         st->syn_wait_sk = sk;
1969                         st->state       = TCP_SEQ_STATE_OPENREQ;
1970                         st->sbucket     = 0;
1971                         goto get_req;
1972                 }
1973                 read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
1974         }
1975         spin_unlock_bh(&ilb->lock);
1976         if (++st->bucket < INET_LHTABLE_SIZE) {
1977                 ilb = &tcp_hashinfo.listening_hash[st->bucket];
1978                 spin_lock_bh(&ilb->lock);
1979                 sk = sk_nulls_head(&ilb->head);
1980                 goto get_sk;
1981         }
1982         cur = NULL;
1983 out:
1984         return cur;
1985 }
1986
1987 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1988 {
1989         void *rc = listening_get_next(seq, NULL);
1990
1991         while (rc && *pos) {
1992                 rc = listening_get_next(seq, rc);
1993                 --*pos;
1994         }
1995         return rc;
1996 }
1997
1998 static inline int empty_bucket(struct tcp_iter_state *st)
1999 {
2000         return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain) &&
2001                 hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].twchain);
2002 }
2003
2004 static void *established_get_first(struct seq_file *seq)
2005 {
2006         struct tcp_iter_state *st = seq->private;
2007         struct net *net = seq_file_net(seq);
2008         void *rc = NULL;
2009
2010         for (st->bucket = 0; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) {
2011                 struct sock *sk;
2012                 struct hlist_nulls_node *node;
2013                 struct inet_timewait_sock *tw;
2014                 spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket);
2015
2016                 /* Lockless fast path for the common case of empty buckets */
2017                 if (empty_bucket(st))
2018                         continue;
2019
2020                 spin_lock_bh(lock);
2021                 sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
2022                         if (sk->sk_family != st->family ||
2023                             !net_eq(sock_net(sk), net)) {
2024                                 continue;
2025                         }
2026                         rc = sk;
2027                         goto out;
2028                 }
2029                 st->state = TCP_SEQ_STATE_TIME_WAIT;
2030                 inet_twsk_for_each(tw, node,
2031                                    &tcp_hashinfo.ehash[st->bucket].twchain) {
2032                         if (tw->tw_family != st->family ||
2033                             !net_eq(twsk_net(tw), net)) {
2034                                 continue;
2035                         }
2036                         rc = tw;
2037                         goto out;
2038                 }
2039                 spin_unlock_bh(lock);
2040                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2041         }
2042 out:
2043         return rc;
2044 }
2045
2046 static void *established_get_next(struct seq_file *seq, void *cur)
2047 {
2048         struct sock *sk = cur;
2049         struct inet_timewait_sock *tw;
2050         struct hlist_nulls_node *node;
2051         struct tcp_iter_state *st = seq->private;
2052         struct net *net = seq_file_net(seq);
2053
2054         ++st->num;
2055
2056         if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
2057                 tw = cur;
2058                 tw = tw_next(tw);
2059 get_tw:
2060                 while (tw && (tw->tw_family != st->family || !net_eq(twsk_net(tw), net))) {
2061                         tw = tw_next(tw);
2062                 }
2063                 if (tw) {
2064                         cur = tw;
2065                         goto out;
2066                 }
2067                 spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2068                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2069
2070                 /* Look for next non empty bucket */
2071                 while (++st->bucket <= tcp_hashinfo.ehash_mask &&
2072                                 empty_bucket(st))
2073                         ;
2074                 if (st->bucket > tcp_hashinfo.ehash_mask)
2075                         return NULL;
2076
2077                 spin_lock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2078                 sk = sk_nulls_head(&tcp_hashinfo.ehash[st->bucket].chain);
2079         } else
2080                 sk = sk_nulls_next(sk);
2081
2082         sk_nulls_for_each_from(sk, node) {
2083                 if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
2084                         goto found;
2085         }
2086
2087         st->state = TCP_SEQ_STATE_TIME_WAIT;
2088         tw = tw_head(&tcp_hashinfo.ehash[st->bucket].twchain);
2089         goto get_tw;
2090 found:
2091         cur = sk;
2092 out:
2093         return cur;
2094 }
2095
2096 static void *established_get_idx(struct seq_file *seq, loff_t pos)
2097 {
2098         void *rc = established_get_first(seq);
2099
2100         while (rc && pos) {
2101                 rc = established_get_next(seq, rc);
2102                 --pos;
2103         }
2104         return rc;
2105 }
2106
2107 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
2108 {
2109         void *rc;
2110         struct tcp_iter_state *st = seq->private;
2111
2112         st->state = TCP_SEQ_STATE_LISTENING;
2113         rc        = listening_get_idx(seq, &pos);
2114
2115         if (!rc) {
2116                 st->state = TCP_SEQ_STATE_ESTABLISHED;
2117                 rc        = established_get_idx(seq, pos);
2118         }
2119
2120         return rc;
2121 }
2122
2123 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
2124 {
2125         struct tcp_iter_state *st = seq->private;
2126         st->state = TCP_SEQ_STATE_LISTENING;
2127         st->num = 0;
2128         return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
2129 }
2130
2131 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2132 {
2133         void *rc = NULL;
2134         struct tcp_iter_state *st;
2135
2136         if (v == SEQ_START_TOKEN) {
2137                 rc = tcp_get_idx(seq, 0);
2138                 goto out;
2139         }
2140         st = seq->private;
2141
2142         switch (st->state) {
2143         case TCP_SEQ_STATE_OPENREQ:
2144         case TCP_SEQ_STATE_LISTENING:
2145                 rc = listening_get_next(seq, v);
2146                 if (!rc) {
2147                         st->state = TCP_SEQ_STATE_ESTABLISHED;
2148                         rc        = established_get_first(seq);
2149                 }
2150                 break;
2151         case TCP_SEQ_STATE_ESTABLISHED:
2152         case TCP_SEQ_STATE_TIME_WAIT:
2153                 rc = established_get_next(seq, v);
2154                 break;
2155         }
2156 out:
2157         ++*pos;
2158         return rc;
2159 }
2160
2161 static void tcp_seq_stop(struct seq_file *seq, void *v)
2162 {
2163         struct tcp_iter_state *st = seq->private;
2164
2165         switch (st->state) {
2166         case TCP_SEQ_STATE_OPENREQ:
2167                 if (v) {
2168                         struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
2169                         read_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
2170                 }
2171         case TCP_SEQ_STATE_LISTENING:
2172                 if (v != SEQ_START_TOKEN)
2173                         spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
2174                 break;
2175         case TCP_SEQ_STATE_TIME_WAIT:
2176         case TCP_SEQ_STATE_ESTABLISHED:
2177                 if (v)
2178                         spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket));
2179                 break;
2180         }
2181 }
2182
2183 static int tcp_seq_open(struct inode *inode, struct file *file)
2184 {
2185         struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2186         struct tcp_iter_state *s;
2187         int err;
2188
2189         err = seq_open_net(inode, file, &afinfo->seq_ops,
2190                           sizeof(struct tcp_iter_state));
2191         if (err < 0)
2192                 return err;
2193
2194         s = ((struct seq_file *)file->private_data)->private;
2195         s->family               = afinfo->family;
2196         return 0;
2197 }
2198
2199 int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
2200 {
2201         int rc = 0;
2202         struct proc_dir_entry *p;
2203
2204         afinfo->seq_fops.open           = tcp_seq_open;
2205         afinfo->seq_fops.read           = seq_read;
2206         afinfo->seq_fops.llseek         = seq_lseek;
2207         afinfo->seq_fops.release        = seq_release_net;
2208
2209         afinfo->seq_ops.start           = tcp_seq_start;
2210         afinfo->seq_ops.next            = tcp_seq_next;
2211         afinfo->seq_ops.stop            = tcp_seq_stop;
2212
2213         p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
2214                              &afinfo->seq_fops, afinfo);
2215         if (!p)
2216                 rc = -ENOMEM;
2217         return rc;
2218 }
2219
2220 void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
2221 {
2222         proc_net_remove(net, afinfo->name);
2223 }
2224
2225 static void get_openreq4(struct sock *sk, struct request_sock *req,
2226                          struct seq_file *f, int i, int uid, int *len)
2227 {
2228         const struct inet_request_sock *ireq = inet_rsk(req);
2229         int ttd = req->expires - jiffies;
2230
2231         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2232                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p%n",
2233                 i,
2234                 ireq->loc_addr,
2235                 ntohs(inet_sk(sk)->inet_sport),
2236                 ireq->rmt_addr,
2237                 ntohs(ireq->rmt_port),
2238                 TCP_SYN_RECV,
2239                 0, 0, /* could print option size, but that is af dependent. */
2240                 1,    /* timers active (only the expire timer) */
2241                 jiffies_to_clock_t(ttd),
2242                 req->retrans,
2243                 uid,
2244                 0,  /* non standard timer */
2245                 0, /* open_requests have no inode */
2246                 atomic_read(&sk->sk_refcnt),
2247                 req,
2248                 len);
2249 }
2250
2251 static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i, int *len)
2252 {
2253         int timer_active;
2254         unsigned long timer_expires;
2255         struct tcp_sock *tp = tcp_sk(sk);
2256         const struct inet_connection_sock *icsk = inet_csk(sk);
2257         struct inet_sock *inet = inet_sk(sk);
2258         __be32 dest = inet->inet_daddr;
2259         __be32 src = inet->inet_rcv_saddr;
2260         __u16 destp = ntohs(inet->inet_dport);
2261         __u16 srcp = ntohs(inet->inet_sport);
2262
2263         if (icsk->icsk_pending == ICSK_TIME_RETRANS) {
2264                 timer_active    = 1;
2265                 timer_expires   = icsk->icsk_timeout;
2266         } else if (icsk->icsk_pending == ICSK_TIME_PROBE0) {
2267                 timer_active    = 4;
2268                 timer_expires   = icsk->icsk_timeout;
2269         } else if (timer_pending(&sk->sk_timer)) {
2270                 timer_active    = 2;
2271                 timer_expires   = sk->sk_timer.expires;
2272         } else {
2273                 timer_active    = 0;
2274                 timer_expires = jiffies;
2275         }
2276
2277         seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2278                         "%08X %5d %8d %lu %d %p %lu %lu %u %u %d%n",
2279                 i, src, srcp, dest, destp, sk->sk_state,
2280                 tp->write_seq - tp->snd_una,
2281                 sk->sk_state == TCP_LISTEN ? sk->sk_ack_backlog :
2282                                              (tp->rcv_nxt - tp->copied_seq),
2283                 timer_active,
2284                 jiffies_to_clock_t(timer_expires - jiffies),
2285                 icsk->icsk_retransmits,
2286                 sock_i_uid(sk),
2287                 icsk->icsk_probes_out,
2288                 sock_i_ino(sk),
2289                 atomic_read(&sk->sk_refcnt), sk,
2290                 jiffies_to_clock_t(icsk->icsk_rto),
2291                 jiffies_to_clock_t(icsk->icsk_ack.ato),
2292                 (icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
2293                 tp->snd_cwnd,
2294                 tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh,
2295                 len);
2296 }
2297
2298 static void get_timewait4_sock(struct inet_timewait_sock *tw,
2299                                struct seq_file *f, int i, int *len)
2300 {
2301         __be32 dest, src;
2302         __u16 destp, srcp;
2303         int ttd = tw->tw_ttd - jiffies;
2304
2305         if (ttd < 0)
2306                 ttd = 0;
2307
2308         dest  = tw->tw_daddr;
2309         src   = tw->tw_rcv_saddr;
2310         destp = ntohs(tw->tw_dport);
2311         srcp  = ntohs(tw->tw_sport);
2312
2313         seq_printf(f, "%4d: %08X:%04X %08X:%04X"
2314                 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p%n",
2315                 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2316                 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2317                 atomic_read(&tw->tw_refcnt), tw, len);
2318 }
2319
2320 #define TMPSZ 150
2321
2322 static int tcp4_seq_show(struct seq_file *seq, void *v)
2323 {
2324         struct tcp_iter_state *st;
2325         int len;
2326
2327         if (v == SEQ_START_TOKEN) {
2328                 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2329                            "  sl  local_address rem_address   st tx_queue "
2330                            "rx_queue tr tm->when retrnsmt   uid  timeout "
2331                            "inode");
2332                 goto out;
2333         }
2334         st = seq->private;
2335
2336         switch (st->state) {
2337         case TCP_SEQ_STATE_LISTENING:
2338         case TCP_SEQ_STATE_ESTABLISHED:
2339                 get_tcp4_sock(v, seq, st->num, &len);
2340                 break;
2341         case TCP_SEQ_STATE_OPENREQ:
2342                 get_openreq4(st->syn_wait_sk, v, seq, st->num, st->uid, &len);
2343                 break;
2344         case TCP_SEQ_STATE_TIME_WAIT:
2345                 get_timewait4_sock(v, seq, st->num, &len);
2346                 break;
2347         }
2348         seq_printf(seq, "%*s\n", TMPSZ - 1 - len, "");
2349 out:
2350         return 0;
2351 }
2352
2353 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2354         .name           = "tcp",
2355         .family         = AF_INET,
2356         .seq_fops       = {
2357                 .owner          = THIS_MODULE,
2358         },
2359         .seq_ops        = {
2360                 .show           = tcp4_seq_show,
2361         },
2362 };
2363
2364 static int tcp4_proc_init_net(struct net *net)
2365 {
2366         return tcp_proc_register(net, &tcp4_seq_afinfo);
2367 }
2368
2369 static void tcp4_proc_exit_net(struct net *net)
2370 {
2371         tcp_proc_unregister(net, &tcp4_seq_afinfo);
2372 }
2373
2374 static struct pernet_operations tcp4_net_ops = {
2375         .init = tcp4_proc_init_net,
2376         .exit = tcp4_proc_exit_net,
2377 };
2378
2379 int __init tcp4_proc_init(void)
2380 {
2381         return register_pernet_subsys(&tcp4_net_ops);
2382 }
2383
2384 void tcp4_proc_exit(void)
2385 {
2386         unregister_pernet_subsys(&tcp4_net_ops);
2387 }
2388 #endif /* CONFIG_PROC_FS */
2389
2390 struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb)
2391 {
2392         struct iphdr *iph = skb_gro_network_header(skb);
2393
2394         switch (skb->ip_summed) {
2395         case CHECKSUM_COMPLETE:
2396                 if (!tcp_v4_check(skb_gro_len(skb), iph->saddr, iph->daddr,
2397                                   skb->csum)) {
2398                         skb->ip_summed = CHECKSUM_UNNECESSARY;
2399                         break;
2400                 }
2401
2402                 /* fall through */
2403         case CHECKSUM_NONE:
2404                 NAPI_GRO_CB(skb)->flush = 1;
2405                 return NULL;
2406         }
2407
2408         return tcp_gro_receive(head, skb);
2409 }
2410 EXPORT_SYMBOL(tcp4_gro_receive);
2411
2412 int tcp4_gro_complete(struct sk_buff *skb)
2413 {
2414         struct iphdr *iph = ip_hdr(skb);
2415         struct tcphdr *th = tcp_hdr(skb);
2416
2417         th->check = ~tcp_v4_check(skb->len - skb_transport_offset(skb),
2418                                   iph->saddr, iph->daddr, 0);
2419         skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
2420
2421         return tcp_gro_complete(skb);
2422 }
2423 EXPORT_SYMBOL(tcp4_gro_complete);
2424
2425 struct proto tcp_prot = {
2426         .name                   = "TCP",
2427         .owner                  = THIS_MODULE,
2428         .close                  = tcp_close,
2429         .connect                = tcp_v4_connect,
2430         .disconnect             = tcp_disconnect,
2431         .accept                 = inet_csk_accept,
2432         .ioctl                  = tcp_ioctl,
2433         .init                   = tcp_v4_init_sock,
2434         .destroy                = tcp_v4_destroy_sock,
2435         .shutdown               = tcp_shutdown,
2436         .setsockopt             = tcp_setsockopt,
2437         .getsockopt             = tcp_getsockopt,
2438         .recvmsg                = tcp_recvmsg,
2439         .backlog_rcv            = tcp_v4_do_rcv,
2440         .hash                   = inet_hash,
2441         .unhash                 = inet_unhash,
2442         .get_port               = inet_csk_get_port,
2443         .enter_memory_pressure  = tcp_enter_memory_pressure,
2444         .sockets_allocated      = &tcp_sockets_allocated,
2445         .orphan_count           = &tcp_orphan_count,
2446         .memory_allocated       = &tcp_memory_allocated,
2447         .memory_pressure        = &tcp_memory_pressure,
2448         .sysctl_mem             = sysctl_tcp_mem,
2449         .sysctl_wmem            = sysctl_tcp_wmem,
2450         .sysctl_rmem            = sysctl_tcp_rmem,
2451         .max_header             = MAX_TCP_HEADER,
2452         .obj_size               = sizeof(struct tcp_sock),
2453         .slab_flags             = SLAB_DESTROY_BY_RCU,
2454         .twsk_prot              = &tcp_timewait_sock_ops,
2455         .rsk_prot               = &tcp_request_sock_ops,
2456         .h.hashinfo             = &tcp_hashinfo,
2457 #ifdef CONFIG_COMPAT
2458         .compat_setsockopt      = compat_tcp_setsockopt,
2459         .compat_getsockopt      = compat_tcp_getsockopt,
2460 #endif
2461 };
2462
2463
2464 static int __net_init tcp_sk_init(struct net *net)
2465 {
2466         return inet_ctl_sock_create(&net->ipv4.tcp_sock,
2467                                     PF_INET, SOCK_RAW, IPPROTO_TCP, net);
2468 }
2469
2470 static void __net_exit tcp_sk_exit(struct net *net)
2471 {
2472         inet_ctl_sock_destroy(net->ipv4.tcp_sock);
2473         inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET);
2474 }
2475
2476 static struct pernet_operations __net_initdata tcp_sk_ops = {
2477        .init = tcp_sk_init,
2478        .exit = tcp_sk_exit,
2479 };
2480
2481 void __init tcp_v4_init(void)
2482 {
2483         inet_hashinfo_init(&tcp_hashinfo);
2484         if (register_pernet_subsys(&tcp_sk_ops))
2485                 panic("Failed to create the TCP control socket.\n");
2486 }
2487
/* Symbols from this file made available to other kernel code. */
EXPORT_SYMBOL(ipv4_specific);
EXPORT_SYMBOL(tcp_hashinfo);
EXPORT_SYMBOL(tcp_prot);
EXPORT_SYMBOL(tcp_v4_conn_request);
EXPORT_SYMBOL(tcp_v4_connect);
EXPORT_SYMBOL(tcp_v4_do_rcv);
EXPORT_SYMBOL(tcp_v4_remember_stamp);
EXPORT_SYMBOL(tcp_v4_send_check);
EXPORT_SYMBOL(tcp_v4_syn_recv_sock);

/* The /proc registration helpers are exported only with CONFIG_PROC_FS. */
#ifdef CONFIG_PROC_FS
EXPORT_SYMBOL(tcp_proc_register);
EXPORT_SYMBOL(tcp_proc_unregister);
#endif
EXPORT_SYMBOL(sysctl_tcp_low_latency);
2503