/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system. INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		Implementation of the Transmission Control Protocol (TCP).
 *
 * Version:	$Id: tcp_timer.c,v 1.88 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Mark Evans, <evansmp@uhura.aston.ac.uk>
 *		Corey Minyard <wf-rch!minyard@relay.EU.net>
 *		Florian La Roche, <flla@stud.uni-sb.de>
 *		Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *		Linus Torvalds, <torvalds@cs.helsinki.fi>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Matthew Dillon, <dillon@apollo.west.oic.com>
 *		Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *		Jorge Cwik, <jorge@laser.satlink.net>
 */
#include <linux/module.h>
#include <net/tcp.h>
int sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
int sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
int sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
int sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
int sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
int sysctl_tcp_retries1 = TCP_RETR1;
int sysctl_tcp_retries2 = TCP_RETR2;
int sysctl_tcp_orphan_retries;
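/* For reference, the defaults behind these sysctls (values as defined in
 * this era's <net/tcp.h>): TCP_SYN_RETRIES = 5, TCP_SYNACK_RETRIES = 5,
 * TCP_KEEPALIVE_TIME = 2 hours, TCP_KEEPALIVE_PROBES = 9,
 * TCP_KEEPALIVE_INTVL = 75 seconds, TCP_RETR1 = 3, TCP_RETR2 = 15.
 * sysctl_tcp_orphan_retries defaults to 0 and is special-cased in
 * tcp_orphan_retries() below.
 */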
static void tcp_write_timer(unsigned long);
static void tcp_delack_timer(unsigned long);
static void tcp_keepalive_timer(unsigned long data);
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
EXPORT_SYMBOL(inet_csk_timer_bug_msg);
/*
 * Using different timers for retransmit, delayed acks and probes.
 * We may wish to use just one timer maintaining a list of expiry
 * jiffies to use.
 */
void inet_csk_init_xmit_timers(struct sock *sk,
			       void (*retransmit_handler)(unsigned long),
			       void (*delack_handler)(unsigned long),
			       void (*keepalive_handler)(unsigned long))
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	init_timer(&icsk->icsk_retransmit_timer);
	init_timer(&icsk->icsk_delack_timer);
	init_timer(&sk->sk_timer);

	icsk->icsk_retransmit_timer.function = retransmit_handler;
	icsk->icsk_delack_timer.function = delack_handler;
	sk->sk_timer.function = keepalive_handler;

	icsk->icsk_retransmit_timer.data =
		icsk->icsk_delack_timer.data =
			sk->sk_timer.data = (unsigned long)sk;

	icsk->icsk_pending = icsk->icsk_ack.pending = 0;
}
void inet_csk_clear_xmit_timers(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);

	icsk->icsk_pending = icsk->icsk_ack.pending = icsk->icsk_ack.blocked = 0;

	sk_stop_timer(sk, &icsk->icsk_retransmit_timer);
	sk_stop_timer(sk, &icsk->icsk_delack_timer);
	sk_stop_timer(sk, &sk->sk_timer);
}
void tcp_init_xmit_timers(struct sock *sk)
{
	inet_csk_init_xmit_timers(sk, &tcp_write_timer, &tcp_delack_timer,
				  &tcp_keepalive_timer);
}
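/* Note that sk->sk_timer does double duty here: for established sockets
 * tcp_keepalive_timer() acts as the keepalive timer, while for listening
 * sockets the same handler dispatches to tcp_synack_timer() below to
 * retransmit unanswered SYN-ACKs.
 */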
static void tcp_write_err(struct sock *sk)
{
	sk->sk_err = sk->sk_err_soft ? : ETIMEDOUT;
	sk->sk_error_report(sk);

	tcp_done(sk);
	NET_INC_STATS_BH(LINUX_MIB_TCPABORTONTIMEOUT);
}
/* Do not allow orphaned sockets to eat all our resources.
 * This is a direct violation of the TCP specs, but it is required
 * to prevent DoS attacks. It is called when a retransmission timeout
 * or zero probe timeout occurs on an orphaned socket.
 *
 * The criterion is still not confirmed experimentally and may change.
 * We kill the socket if:
 * 1. The number of orphaned sockets exceeds an administratively
 *    configured limit.
 * 2. We are under strong memory pressure.
 */
static int tcp_out_of_resources(struct sock *sk, int do_reset)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int orphans = atomic_read(&tcp_orphan_count);

	/* If the peer does not open its window for a long time, or did not
	 * transmit anything for a long time, penalize it. */
	if ((s32)(tcp_time_stamp - tp->lsndtime) > 2*TCP_RTO_MAX || !do_reset)
		orphans <<= 1;

	/* If some dubious ICMP arrived, penalize even more. */
	if (sk->sk_err_soft)
		orphans <<= 1;

	if (orphans >= sysctl_tcp_max_orphans ||
	    (sk->sk_wmem_queued > SOCK_MIN_SNDBUF &&
	     atomic_read(&tcp_memory_allocated) > sysctl_tcp_mem[2])) {
		if (net_ratelimit())
			printk(KERN_INFO "Out of socket memory\n");

		/* Catch exceptional cases, when connection requires reset.
		 *      1. Last segment was sent recently. */
		if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN ||
		    /*  2. Window is closed. */
		    (!tp->snd_wnd && !tp->packets_out))
			do_reset = 1;
		if (do_reset)
			tcp_send_active_reset(sk, GFP_ATOMIC);
		tcp_done(sk);
		NET_INC_STATS_BH(LINUX_MIB_TCPABORTONMEMORY);
		return 1;
	}
	return 0;
}
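/* Weighting recap for the checks above: the orphan count is doubled once
 * for a peer that has been silent (or kept the window closed) for more than
 * 2*TCP_RTO_MAX, and doubled again on soft ICMP errors, so such sockets hit
 * sysctl_tcp_max_orphans at a quarter of the nominal limit.
 */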
/* Calculate the maximal number of retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, int alive)
{
	int retries = sysctl_tcp_orphan_retries; /* May be zero. */

	/* We know from an ICMP that something is wrong. */
	if (sk->sk_err_soft && !alive)
		retries = 0;

	/* However, if the socket sent something recently, select some safe
	 * number of retries. 8 corresponds to >100 seconds with the minimal
	 * RTO of 200 msec. */
	if (retries == 0 && alive)
		retries = 8;
	return retries;
}
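/* The arithmetic behind the "8" above, for the record: with exponential
 * backoff starting from a 200 msec RTO, giving up after the 8th retry
 * takes about 0.2 * (2^9 - 1) ~= 102 seconds in total, i.e. >100 seconds.
 */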
/* A write timeout has occurred. Process the aftereffects. */
static int tcp_write_timeout(struct sock *sk)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	int retry_until;

	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		if (icsk->icsk_retransmits)
			dst_negative_advice(&sk->sk_dst_cache);
		retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
	} else {
		if (icsk->icsk_retransmits >= sysctl_tcp_retries1) {
			/* NOTE: draft-ietf-tcpimpl-pmtud-01.txt requires PMTU
			   black hole detection. :-(

			   This would be the place to implement it. It is not
			   implemented. I do not want to implement it. It is
			   disgusting. It does not work in any case. Let me
			   cite the very draft that requires us to implement
			   it:
180 "The one security concern raised by this memo is that ICMP black holes
181 are often caused by over-zealous security administrators who block
182 all ICMP messages. It is vitally important that those who design and
183 deploy security systems understand the impact of strict filtering on
184 upper-layer protocols. The safest web site in the world is worthless
185 if most TCP implementations cannot transfer data from it. It would
186 be far nicer to have all of the black holes fixed rather than fixing
187 all of the TCP implementations."
			 */

			dst_negative_advice(&sk->sk_dst_cache);
		}

		retry_until = sysctl_tcp_retries2;
		if (sock_flag(sk, SOCK_DEAD)) {
			const int alive = (icsk->icsk_rto < TCP_RTO_MAX);

			retry_until = tcp_orphan_retries(sk, alive);

			if (tcp_out_of_resources(sk, alive || icsk->icsk_retransmits < retry_until))
				return 1;
		}
	}

	if (icsk->icsk_retransmits >= retry_until) {
		/* Has it gone just too far? */
		tcp_write_err(sk);
		return 1;
	}
	return 0;
}
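/* A rough wall-clock view (a sketch, assuming this era's defaults of
 * TCP_RETR2 = 15 and exponential backoff clamped at TCP_RTO_MAX = 120 s):
 * an established connection that receives no ACKs at all is timed out
 * after roughly 13 to 30 minutes, depending on the RTO in effect when the
 * losses began.
 */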
static void tcp_delack_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		icsk->icsk_ack.blocked = 1;
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKLOCKED);
		sk_reset_timer(sk, &icsk->icsk_delack_timer, jiffies + TCP_DELACK_MIN);
		goto out_unlock;
	}

	sk_stream_mem_reclaim(sk);

	if (sk->sk_state == TCP_CLOSE || !(icsk->icsk_ack.pending & ICSK_ACK_TIMER))
		goto out;

	if (time_after(icsk->icsk_ack.timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_delack_timer, icsk->icsk_ack.timeout);
		goto out;
	}
	icsk->icsk_ack.pending &= ~ICSK_ACK_TIMER;

	if (!skb_queue_empty(&tp->ucopy.prequeue)) {
		struct sk_buff *skb;

		NET_INC_STATS_BH(LINUX_MIB_TCPSCHEDULERFAILED);

		while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
			sk->sk_backlog_rcv(sk, skb);

		tp->ucopy.memory = 0;
	}

	if (inet_csk_ack_scheduled(sk)) {
		if (!icsk->icsk_ack.pingpong) {
			/* Delayed ACK missed: inflate ATO. */
			icsk->icsk_ack.ato = min(icsk->icsk_ack.ato << 1, icsk->icsk_rto);
		} else {
			/* Delayed ACK missed: leave pingpong mode and
			 * deflate ATO.
			 */
			icsk->icsk_ack.pingpong = 0;
			icsk->icsk_ack.ato = TCP_ATO_MIN;
		}
		tcp_send_ack(sk);
		NET_INC_STATS_BH(LINUX_MIB_DELAYEDACKS);
	}

out:
	if (tcp_memory_pressure)
		sk_stream_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}
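/* Note on the ATO handling above: in pingpong (interactive) mode a missed
 * delayed ACK drops the socket back to quick-ACK behaviour with the minimal
 * ATO, while outside pingpong mode the ATO doubles on each miss, clamped
 * at the current RTO.
 */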
static void tcp_probe_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	int max_probes;

	if (tp->packets_out || !sk->sk_send_head) {
		tp->probes_out = 0;
		return;
	}

	/* *WARNING* RFC 1122 forbids this.
	 *
	 * It doesn't AFAIK, because we kill the retransmit timer -AK
	 *
	 * FIXME: We ought not to do it, Solaris 2.5 actually has fixing
	 * this behaviour in Solaris down as a bug fix. [AC]
	 *
	 * Let me explain. probes_out is zeroed by incoming ACKs even if
	 * they advertise a zero window. Hence, the connection is killed
	 * only if we received no ACKs for the normal connection timeout.
	 * It is not killed merely because the window stays zero for some
	 * time; the window may stay zero until armageddon and even later.
	 * We are in full accordance with the RFCs; only the probe timer
	 * combines both retransmission timeout and probe timeout in one
	 * bottle. --ANK
	 */
	max_probes = sysctl_tcp_retries2;

	if (sock_flag(sk, SOCK_DEAD)) {
		const struct inet_connection_sock *icsk = inet_csk(sk);
		const int alive = ((icsk->icsk_rto << icsk->icsk_backoff) < TCP_RTO_MAX);

		max_probes = tcp_orphan_retries(sk, alive);

		if (tcp_out_of_resources(sk, alive || tp->probes_out <= max_probes))
			return;
	}

	if (tp->probes_out > max_probes) {
		tcp_write_err(sk);
	} else {
		/* Only send another probe if we didn't close things up. */
		tcp_send_probe0(sk);
	}
}
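/* Unlike the test in tcp_write_timeout(), the liveness check above must
 * shift by icsk_backoff: zero-window probes back off via icsk_backoff while
 * leaving icsk_rto itself untouched, whereas real retransmits inflate
 * icsk_rto directly in tcp_retransmit_timer() below.
 */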
/*
 *	The TCP retransmit timer.
 */
static void tcp_retransmit_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);

	if (!tp->packets_out)
		goto out;

	BUG_TRAP(!skb_queue_empty(&sk->sk_write_queue));
	if (!tp->snd_wnd && !sock_flag(sk, SOCK_DEAD) &&
	    !((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))) {
		/* The receiver has dastardly shrunk the window. Our
		 * retransmits become zero-window probes, but we should not
		 * time out this connection. If the socket is an orphan,
		 * time it out; we cannot allow such beasts to hang
		 * infinitely.
		 */
		if (net_ratelimit()) {
			struct inet_sock *inet = inet_sk(sk);
			printk(KERN_DEBUG "TCP: Treason uncloaked! Peer %u.%u.%u.%u:%u/%u shrinks window %u:%u. Repaired.\n",
			       NIPQUAD(inet->daddr), ntohs(inet->dport),
			       inet->num, tp->snd_una, tp->snd_nxt);
		}
		if (tcp_time_stamp - tp->rcv_tstamp > TCP_RTO_MAX) {
			tcp_write_err(sk);
			goto out;
		}
		tcp_enter_loss(sk, 0);
		tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue));
		__sk_dst_reset(sk);
		goto out_reset_timer;
	}
	if (tcp_write_timeout(sk))
		goto out;
	if (icsk->icsk_retransmits == 0) {
		if (tp->ca_state == TCP_CA_Disorder || tp->ca_state == TCP_CA_Recovery) {
			if (tp->rx_opt.sack_ok) {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKRECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPSACKFAILURES);
			} else {
				if (tp->ca_state == TCP_CA_Recovery)
					NET_INC_STATS_BH(LINUX_MIB_TCPRENORECOVERYFAIL);
				else
					NET_INC_STATS_BH(LINUX_MIB_TCPRENOFAILURES);
			}
		} else if (tp->ca_state == TCP_CA_Loss) {
			NET_INC_STATS_BH(LINUX_MIB_TCPLOSSFAILURES);
		} else {
			NET_INC_STATS_BH(LINUX_MIB_TCPTIMEOUTS);
		}
	}
	if (tcp_use_frto(sk)) {
		tcp_enter_frto(sk);
	} else {
		tcp_enter_loss(sk, 0);
	}
	if (tcp_retransmit_skb(sk, skb_peek(&sk->sk_write_queue)) > 0) {
		/* Retransmission failed because of local congestion;
		 * do not back off.
		 */
		if (!icsk->icsk_retransmits)
			icsk->icsk_retransmits = 1;
		inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
					  min(icsk->icsk_rto, TCP_RESOURCE_PROBE_INTERVAL));
		goto out;
	}
	/* Increase the timeout each time we retransmit. Note that
	 * we do not increase the rtt estimate. rto is initialized
	 * from rtt, but increases here. Jacobson (SIGCOMM 88) suggests
	 * that doubling rto each time is the least we can get away with.
	 * In KA9Q, Karn uses this for the first few times, and then
	 * goes to quadratic. NetBSD doubles, but only goes up to *64,
	 * and clamps at 1 to 64 sec afterwards. Note that 120 sec is
	 * defined in the protocol as the maximum possible RTT. I guess
	 * we'll have to use something other than TCP to talk to the
	 * University of Mars.
	 *
	 * PAWS allows us longer timeouts and large windows, so once
	 * implemented ftp to mars will work nicely. We will have to fix
	 * the 120 second clamps though!
	 */
	icsk->icsk_backoff++;
	icsk->icsk_retransmits++;

out_reset_timer:
	icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto);
	if (icsk->icsk_retransmits > sysctl_tcp_retries1)
		__sk_dst_reset(sk);

out:;
}
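/* A worked example of the clamped doubling above (a sketch, assuming this
 * era's TCP_TIMEOUT_INIT = 3 s and TCP_RTO_MAX = 120 s): starting from the
 * initial 3 s RTO, successive timeouts fire after 3, 6, 12, 24, 48, 96,
 * 120, 120, ... seconds.
 */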
static void tcp_write_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct inet_connection_sock *icsk = inet_csk(sk);
	int event;

	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, jiffies + (HZ / 20));
		goto out_unlock;
	}

	if (sk->sk_state == TCP_CLOSE || !icsk->icsk_pending)
		goto out;

	if (time_after(icsk->icsk_timeout, jiffies)) {
		sk_reset_timer(sk, &icsk->icsk_retransmit_timer, icsk->icsk_timeout);
		goto out;
	}

	event = icsk->icsk_pending;
	icsk->icsk_pending = 0;

	switch (event) {
	case ICSK_TIME_RETRANS:
		tcp_retransmit_timer(sk);
		break;
	case ICSK_TIME_PROBE0:
		tcp_probe_timer(sk);
		break;
	}

out:
	sk_stream_mem_reclaim(sk);
out_unlock:
	bh_unlock_sock(sk);
	sock_put(sk);
}
/*
 *	Timer for listening sockets
 */
static void tcp_synack_timer(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
	int max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
	int thresh = max_retries;
	unsigned long now = jiffies;
	struct request_sock **reqp, *req;
	int i, budget;

	if (lopt == NULL || lopt->qlen == 0)
		return;
	/* Normally all the openreqs are young and become mature
	 * (i.e. converted to an established socket) within the first
	 * timeout. If the SYN-ACK has not been acknowledged after
	 * 3 seconds, it means one of the following: the SYN-ACK was lost,
	 * the ACK was lost, the RTT is high, or nobody intends to ACK
	 * (i.e. a synflood).
	 *
	 * When the server is somewhat loaded, the queue is populated with
	 * old open requests, reducing the effective size of the queue.
	 * When the server is well loaded, the queue size reduces to zero
	 * after several minutes of work. This is not a synflood; it is
	 * normal operation. The solution is to prune entries that are too
	 * old, overriding the normal timeout, when the situation becomes
	 * dangerous.
	 *
	 * Essentially, we reserve half of the room for young embryos
	 * and abort old ones without pity if they are about to clog
	 * our table.
	 */
	if (lopt->qlen>>(lopt->max_qlen_log-1)) {
		int young = (lopt->qlen_young<<1);

		while (thresh > 2) {
			if (lopt->qlen < young)
				break;
			thresh--;
			young <<= 1;
		}
	}
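/* A sketch of the heuristic above: thresh drops by one for each power of
 * two by which the queue length exceeds twice the number of young
 * (never-retransmitted) entries, bottoming out at 2, so under a synflood
 * stale embryos are retired after very few retransmits.
 */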
	if (tp->defer_accept)
		max_retries = tp->defer_accept;
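/* Budget arithmetic (a sketch, assuming this era's TCP_SYNQ_HSIZE = 512,
 * TCP_TIMEOUT_INIT = 3 s and TCP_SYNQ_INTERVAL = HZ/5): each run walks
 * 2*(512/15) = 68 hash buckets, so with a run every 200 msec the whole
 * table is swept about twice per initial 3 second timeout.
 */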
	budget = 2*(TCP_SYNQ_HSIZE/(TCP_TIMEOUT_INIT/TCP_SYNQ_INTERVAL));
	i = lopt->clock_hand;

	do {
		reqp = &lopt->syn_table[i];
		while ((req = *reqp) != NULL) {
			if (time_after_eq(now, req->expires)) {
				if ((req->retrans < thresh ||
				     (inet_rsk(req)->acked && req->retrans < max_retries))
				    && !req->rsk_ops->rtx_syn_ack(sk, req, NULL)) {
					unsigned long timeo;

					if (req->retrans++ == 0)
						lopt->qlen_young--;
					timeo = min((TCP_TIMEOUT_INIT << req->retrans),
						    TCP_RTO_MAX);
					req->expires = now + timeo;
					reqp = &req->dl_next;
					continue;
				}

				/* Drop this request */
				inet_csk_reqsk_queue_unlink(sk, req, reqp);
				reqsk_queue_removed(&icsk->icsk_accept_queue, req);
				reqsk_free(req);
				continue;
			}
			reqp = &req->dl_next;
		}

		i = (i + 1) & (TCP_SYNQ_HSIZE - 1);

	} while (--budget > 0);
	lopt->clock_hand = i;

	if (lopt->qlen)
		inet_csk_reset_keepalive_timer(sk, TCP_SYNQ_INTERVAL);
}
void inet_csk_delete_keepalive_timer(struct sock *sk)
{
	sk_stop_timer(sk, &sk->sk_timer);
}
void inet_csk_reset_keepalive_timer(struct sock *sk, unsigned long len)
{
	sk_reset_timer(sk, &sk->sk_timer, jiffies + len);
}
void tcp_set_keepalive(struct sock *sk, int val)
{
	if ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))
		return;

	if (val && !sock_flag(sk, SOCK_KEEPOPEN))
		inet_csk_reset_keepalive_timer(sk, keepalive_time_when(tcp_sk(sk)));
	else if (!val)
		inet_csk_delete_keepalive_timer(sk);
}
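/* Presumably reached from sock_setsockopt(SO_KEEPALIVE), which flips
 * SOCK_KEEPOPEN only after calling in here; that ordering is why the
 * !sock_flag(sk, SOCK_KEEPOPEN) test above still sees the old value.
 */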
static void tcp_keepalive_timer(unsigned long data)
{
	struct sock *sk = (struct sock *)data;
	struct tcp_sock *tp = tcp_sk(sk);
	__u32 elapsed;

	/* Only process if socket is not in use. */
	bh_lock_sock(sk);
	if (sock_owned_by_user(sk)) {
		/* Try again later. */
		inet_csk_reset_keepalive_timer(sk, HZ/20);
		goto out;
	}
	if (sk->sk_state == TCP_LISTEN) {
		tcp_synack_timer(sk);
		goto out;
	}
	if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
		if (tp->linger2 >= 0) {
			const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;

			if (tmo > 0) {
				tcp_time_wait(sk, TCP_FIN_WAIT2, tmo);
				goto out;
			}
		}
		tcp_send_active_reset(sk, GFP_ATOMIC);
		goto death;
	}
	if (!sock_flag(sk, SOCK_KEEPOPEN) || sk->sk_state == TCP_CLOSE)
		goto out;

	elapsed = keepalive_time_when(tp);

	/* It is alive without keepalive 8) */
	if (tp->packets_out || sk->sk_send_head)
		goto resched;

	elapsed = tcp_time_stamp - tp->rcv_tstamp;
	if (elapsed >= keepalive_time_when(tp)) {
		if ((!tp->keepalive_probes && tp->probes_out >= sysctl_tcp_keepalive_probes) ||
		    (tp->keepalive_probes && tp->probes_out >= tp->keepalive_probes)) {
			tcp_send_active_reset(sk, GFP_ATOMIC);
			tcp_write_err(sk);
			goto out;
		}
		if (tcp_write_wakeup(sk) <= 0) {
			tp->probes_out++;
			elapsed = keepalive_intvl_when(tp);
		} else {
			/* If the keepalive probe was lost due to local
			 * congestion, try harder.
			 */
			elapsed = TCP_RESOURCE_PROBE_INTERVAL;
		}
	} else {
		/* Timer fires at tp->rcv_tstamp + keepalive_time_when(tp). */
		elapsed = keepalive_time_when(tp) - elapsed;
	}
	sk_stream_mem_reclaim(sk);

resched:
	inet_csk_reset_keepalive_timer(sk, elapsed);
	goto out;

death:
	tcp_done(sk);

out:
	bh_unlock_sock(sk);
	sock_put(sk);
}
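/* End-to-end keepalive timeline (a sketch, assuming the defaults listed
 * near the top of this file: TCP_KEEPALIVE_TIME = 2 h,
 * TCP_KEEPALIVE_INTVL = 75 s, TCP_KEEPALIVE_PROBES = 9): an idle,
 * unreachable peer is declared dead after roughly 7200 + 9 * 75 ~= 7875
 * seconds, i.e. about 2 hours 11 minutes.
 */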
EXPORT_SYMBOL(inet_csk_clear_xmit_timers);
EXPORT_SYMBOL(inet_csk_delete_keepalive_timer);
EXPORT_SYMBOL(tcp_init_xmit_timers);
EXPORT_SYMBOL(inet_csk_reset_keepalive_timer);