net/ipv4/tcp_minisocks.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              Implementation of the Transmission Control Protocol(TCP).
 *
 * Version:     $Id: tcp_minisocks.c,v 1.15 2002/02/01 22:01:04 davem Exp $
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Mark Evans, <evansmp@uhura.aston.ac.uk>
 *              Corey Minyard <wf-rch!minyard@relay.EU.net>
 *              Florian La Roche, <flla@stud.uni-sb.de>
 *              Charles Hedrick, <hedrick@klinzhai.rutgers.edu>
 *              Linus Torvalds, <torvalds@cs.helsinki.fi>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Matthew Dillon, <dillon@apollo.west.oic.com>
 *              Arnt Gulbrandsen, <agulbra@nvg.unit.no>
 *              Jorge Cwik, <jorge@laser.satlink.net>
 */

#include <linux/config.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/sysctl.h>
#include <linux/workqueue.h>
#include <net/tcp.h>
#include <net/inet_common.h>
#include <net/xfrm.h>

#ifdef CONFIG_SYSCTL
#define SYNC_INIT 0 /* let the user enable it */
#else
#define SYNC_INIT 1
#endif

int sysctl_tcp_tw_recycle;
int sysctl_tcp_max_tw_buckets = NR_FILE*2;

int sysctl_tcp_syncookies = SYNC_INIT;
int sysctl_tcp_abort_on_overflow;

static void tcp_tw_schedule(struct inet_timewait_sock *tw, int timeo);

static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win)
{
        if (seq == s_win)
                return 1;
        if (after(end_seq, s_win) && before(seq, e_win))
                return 1;
        return (seq == e_win && seq == end_seq);
}
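
/* Example (values chosen purely for illustration): with s_win = rcv_nxt = 100
 * and e_win = rcv_nxt + rcv_wnd = 110, a bare ACK with seq == end_seq == 100
 * passes the first test; a data segment spanning 95..105 overlaps the window
 * and passes the second test; a segment lying entirely below 100, or starting
 * at or beyond 110 (other than a zero-length segment exactly at 110), is
 * rejected.
 */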

/* New-style handling of TIME_WAIT sockets. */

int tcp_tw_count;

/*
 * * The main purpose of TIME-WAIT state is to close the connection gracefully
 *   when one of the ends sits in LAST-ACK or CLOSING retransmitting FIN
 *   (and, probably, a tail of data) and one or more of our ACKs are lost.
 * * What is the TIME-WAIT timeout? It is associated with maximal packet
 *   lifetime in the internet, which leads to the wrong conclusion that it
 *   is set to catch "old duplicate segments" wandering out of their path.
 *   That is not quite correct. This timeout is calculated so that it exceeds
 *   the maximal retransmission timeout by enough to allow losing one (or more)
 *   segments sent by the peer and our ACKs. This time may be calculated from RTO.
 * * When a TIME-WAIT socket receives RST, it means that the other end
 *   finally closed and we are allowed to kill TIME-WAIT too.
 * * The second purpose of TIME-WAIT is catching old duplicate segments.
 *   Well, certainly it is pure paranoia, but if we load TIME-WAIT
 *   with this semantics, we MUST NOT kill TIME-WAIT state with RSTs.
 * * If we invented some more clever way to catch duplicates
 *   (e.g. based on PAWS), we could truncate TIME-WAIT to several RTOs.
 *
 * The algorithm below is based on FORMAL INTERPRETATION of RFCs.
 * When you compare it to the RFCs, please read section SEGMENT ARRIVES
 * from the very beginning.
 *
 * NOTE. With recycling (and later with fin-wait-2) the TW bucket
 * is _not_ stateless. It means that, strictly speaking, we must
 * spinlock it. I do not want to! Well, the probability of misbehaviour
 * is ridiculously low and, it seems, we could use some mb() tricks
 * to avoid misreading sequence numbers, states etc.  --ANK
 */
enum tcp_tw_status
tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
                           const struct tcphdr *th)
{
        struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
        struct tcp_options_received tmp_opt;
        int paws_reject = 0;

        tmp_opt.saw_tstamp = 0;
        if (th->doff > (sizeof(*th) >> 2) && tcptw->tw_ts_recent_stamp) {
                tcp_parse_options(skb, &tmp_opt, 0);

                if (tmp_opt.saw_tstamp) {
                        tmp_opt.ts_recent       = tcptw->tw_ts_recent;
                        tmp_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp;
                        paws_reject = tcp_paws_check(&tmp_opt, th->rst);
                }
        }

        if (tw->tw_substate == TCP_FIN_WAIT2) {
                /* Just repeat all the checks of tcp_rcv_state_process() */

                /* Out of window, send ACK */
                if (paws_reject ||
                    !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                                   tcptw->tw_rcv_nxt,
                                   tcptw->tw_rcv_nxt + tcptw->tw_rcv_wnd))
                        return TCP_TW_ACK;

                if (th->rst)
                        goto kill;

                if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
                        goto kill_with_rst;

                /* Dup ACK? */
                if (!after(TCP_SKB_CB(skb)->end_seq, tcptw->tw_rcv_nxt) ||
                    TCP_SKB_CB(skb)->end_seq == TCP_SKB_CB(skb)->seq) {
                        inet_twsk_put(tw);
                        return TCP_TW_SUCCESS;
                }

                /* New data or FIN. If new data arrive after half-duplex close,
                 * reset.
                 */
                if (!th->fin ||
                    TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
kill_with_rst:
                        tcp_tw_deschedule(tw);
                        inet_twsk_put(tw);
                        return TCP_TW_RST;
                }

                /* FIN arrived, enter true time-wait state. */
                tw->tw_substate   = TCP_TIME_WAIT;
                tcptw->tw_rcv_nxt = TCP_SKB_CB(skb)->end_seq;
                if (tmp_opt.saw_tstamp) {
                        tcptw->tw_ts_recent_stamp = xtime.tv_sec;
                        tcptw->tw_ts_recent       = tmp_opt.rcv_tsval;
                }

                /* I am ashamed, but I failed to make it more elegant.
                 * Yes, it is a direct reference to IP, which is impossible
                 * to generalize to IPv6. Taking into account that IPv6
                 * does not understand recycling in any case, it is not
                 * a big problem in practice. --ANK */
                if (tw->tw_family == AF_INET &&
                    sysctl_tcp_tw_recycle && tcptw->tw_ts_recent_stamp &&
                    tcp_v4_tw_remember_stamp(tw))
                        tcp_tw_schedule(tw, tw->tw_timeout);
                else
                        tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);
                return TCP_TW_ACK;
        }

        /*
         *      Now real TIME-WAIT state.
         *
         *      RFC 1122:
         *      "When a connection is [...] on TIME-WAIT state [...]
         *      [a TCP] MAY accept a new SYN from the remote TCP to
         *      reopen the connection directly, if it:
         *
         *      (1)  assigns its initial sequence number for the new
         *      connection to be larger than the largest sequence
         *      number it used on the previous connection incarnation,
         *      and
         *
         *      (2)  returns to TIME-WAIT state if the SYN turns out
         *      to be an old duplicate".
         */

        if (!paws_reject &&
            (TCP_SKB_CB(skb)->seq == tcptw->tw_rcv_nxt &&
             (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq || th->rst))) {
                /* In window segment, it may be only reset or bare ack. */

                if (th->rst) {
                        /* This is TIME_WAIT assassination, in two flavors.
                         * Oh well... nobody has a sufficient solution to this
                         * protocol bug yet.
                         */
                        if (sysctl_tcp_rfc1337 == 0) {
kill:
                                tcp_tw_deschedule(tw);
                                inet_twsk_put(tw);
                                return TCP_TW_SUCCESS;
                        }
                }
                tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

                if (tmp_opt.saw_tstamp) {
                        tcptw->tw_ts_recent       = tmp_opt.rcv_tsval;
                        tcptw->tw_ts_recent_stamp = xtime.tv_sec;
                }

                inet_twsk_put(tw);
                return TCP_TW_SUCCESS;
        }

        /* Out of window segment.

           All such segments are ACKed immediately.

           The only exception is a new SYN. We accept it, if it is
           not an old duplicate and we are not in danger of being killed
           by delayed old duplicates. The RFC check, that it carries a
           newer sequence number, works at rates <40Mbit/sec.
           However, if PAWS works, it is reliable AND, even more,
           we may relax the silly seq space cutoff.

           RED-PEN: we violate the main RFC requirement here: if this SYN
           turns out to be an old duplicate (i.e. we receive RST in reply
           to our SYN-ACK), we must return the socket to time-wait state.
           It is not good, but not fatal yet.
         */

        if (th->syn && !th->rst && !th->ack && !paws_reject &&
            (after(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt) ||
             (tmp_opt.saw_tstamp &&
              (s32)(tcptw->tw_ts_recent - tmp_opt.rcv_tsval) < 0))) {
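                /* Descriptive note: the new incarnation's ISN is placed well
                 * beyond anything we sent from the old one (tw_snd_nxt plus a
                 * full 64K window plus 2), satisfying clause (1) of the
                 * RFC 1122 text quoted above.
                 */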
                u32 isn = tcptw->tw_snd_nxt + 65535 + 2;
                if (isn == 0)
                        isn++;
                TCP_SKB_CB(skb)->when = isn;
                return TCP_TW_SYN;
        }

        if (paws_reject)
                NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);

        if (!th->rst) {
                /* In this case we must reset the TIMEWAIT timer.
                 *
                 * If it is an ACKless SYN it may be both an old duplicate
                 * and a new good SYN with random sequence number <rcv_nxt.
                 * Do not reschedule in the last case.
                 */
                if (paws_reject || th->ack)
                        tcp_tw_schedule(tw, TCP_TIMEWAIT_LEN);

                /* Send ACK. Note, we do not put the bucket,
                 * it will be released by caller.
                 */
                return TCP_TW_ACK;
        }
        inet_twsk_put(tw);
        return TCP_TW_SUCCESS;
}

/*
 * Move a socket to time-wait or dead fin-wait-2 state.
 */
void tcp_time_wait(struct sock *sk, int state, int timeo)
{
        struct inet_timewait_sock *tw = NULL;
        const struct tcp_sock *tp = tcp_sk(sk);
        int recycle_ok = 0;

        if (sysctl_tcp_tw_recycle && tp->rx_opt.ts_recent_stamp)
                recycle_ok = tp->af_specific->remember_stamp(sk);

        if (tcp_tw_count < sysctl_tcp_max_tw_buckets)
                tw = inet_twsk_alloc(sk, state);

        if (tw != NULL) {
                struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw);
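                /* (tp->rto << 2) - (tp->rto >> 1) == 4*rto - rto/2 == 3.5*rto;
                 * e.g. an rto of 200 jiffies gives 800 - 100 = 700 jiffies
                 * (worked example, for illustration only).
                 */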
                const int rto = (tp->rto << 2) - (tp->rto >> 1);

                tw->tw_rcv_wscale       = tp->rx_opt.rcv_wscale;
                tcptw->tw_rcv_nxt       = tp->rcv_nxt;
                tcptw->tw_snd_nxt       = tp->snd_nxt;
                tcptw->tw_rcv_wnd       = tcp_receive_window(tp);
                tcptw->tw_ts_recent     = tp->rx_opt.ts_recent;
                tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;

#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
                if (tw->tw_family == PF_INET6) {
                        struct ipv6_pinfo *np = inet6_sk(sk);
                        struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw);

                        ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr);
                        ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr);
                        tw->tw_ipv6only = np->ipv6only;
                }
#endif
                /* Linkage updates. */
                __inet_twsk_hashdance(tw, sk, &tcp_hashinfo);

                /* Get the TIME_WAIT timeout firing. */
                if (timeo < rto)
                        timeo = rto;

                if (recycle_ok) {
                        tw->tw_timeout = rto;
                } else {
                        tw->tw_timeout = TCP_TIMEWAIT_LEN;
                        if (state == TCP_TIME_WAIT)
                                timeo = TCP_TIMEWAIT_LEN;
                }

                tcp_tw_schedule(tw, timeo);
                inet_twsk_put(tw);
        } else {
                /* Sorry, if we're out of memory, just CLOSE this
                 * socket up.  We've got bigger problems than
                 * non-graceful socket closings.
                 */
                if (net_ratelimit())
                        printk(KERN_INFO "TCP: time wait bucket table overflow\n");
        }

        tcp_update_metrics(sk);
        tcp_done(sk);
}

/* Kill off TIME_WAIT sockets once their lifetime has expired. */
static int tcp_tw_death_row_slot;

static void tcp_twkill(unsigned long);

/* TIME_WAIT reaping mechanism. */
#define TCP_TWKILL_SLOTS        8       /* Please keep this a power of 2. */
#define TCP_TWKILL_PERIOD       (TCP_TIMEWAIT_LEN/TCP_TWKILL_SLOTS)

#define TCP_TWKILL_QUOTA        100
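
/* With the usual TCP_TIMEWAIT_LEN of 60*HZ and 8 slots, the reaper timer fires
 * roughly every 7.5 seconds and reclaims at most TCP_TWKILL_QUOTA buckets per
 * slot per run, deferring the remainder to the workqueue below (illustrative
 * arithmetic, assuming the default TCP_TIMEWAIT_LEN).
 */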

static struct hlist_head tcp_tw_death_row[TCP_TWKILL_SLOTS];
static DEFINE_SPINLOCK(tw_death_lock);
static struct timer_list tcp_tw_timer = TIMER_INITIALIZER(tcp_twkill, 0, 0);
static void twkill_work(void *);
static DECLARE_WORK(tcp_twkill_work, twkill_work, NULL);
static u32 twkill_thread_slots;

/* Returns non-zero if quota exceeded.  */
static int tcp_do_twkill_work(int slot, unsigned int quota)
{
        struct inet_timewait_sock *tw;
        struct hlist_node *node;
        unsigned int killed;
        int ret;

        /* NOTE: compare this to the previous version where the lock
         * was released after detaching the chain. It was racy,
         * because tw buckets are scheduled in a non-serialized context
         * in 2.3 (with netfilter), and with softnet it is common, because
         * soft irqs are not sequenced.
         */
        killed = 0;
        ret = 0;
rescan:
        inet_twsk_for_each_inmate(tw, node, &tcp_tw_death_row[slot]) {
                __inet_twsk_del_dead_node(tw);
                spin_unlock(&tw_death_lock);
                __inet_twsk_kill(tw, &tcp_hashinfo);
                inet_twsk_put(tw);
                killed++;
                spin_lock(&tw_death_lock);
                if (killed > quota) {
                        ret = 1;
                        break;
                }

                /* While we dropped tw_death_lock, another cpu may have
                 * killed off the next TW bucket in the list, therefore
                 * do a fresh re-read of the hlist head node with the
                 * lock reacquired.  We still use the hlist traversal
                 * macro in order to get the prefetches.
                 */
                goto rescan;
        }

        tcp_tw_count -= killed;
        NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITED, killed);

        return ret;
}

static void tcp_twkill(unsigned long dummy)
{
        int need_timer, ret;

        spin_lock(&tw_death_lock);

        if (tcp_tw_count == 0)
                goto out;

        need_timer = 0;
        ret = tcp_do_twkill_work(tcp_tw_death_row_slot, TCP_TWKILL_QUOTA);
        if (ret) {
                twkill_thread_slots |= (1 << tcp_tw_death_row_slot);
                mb();
                schedule_work(&tcp_twkill_work);
                need_timer = 1;
        } else {
                /* We purged the entire slot, anything left?  */
                if (tcp_tw_count)
                        need_timer = 1;
        }
        tcp_tw_death_row_slot =
                ((tcp_tw_death_row_slot + 1) & (TCP_TWKILL_SLOTS - 1));
        if (need_timer)
                mod_timer(&tcp_tw_timer, jiffies + TCP_TWKILL_PERIOD);
out:
        spin_unlock(&tw_death_lock);
}

extern void twkill_slots_invalid(void);

static void twkill_work(void *dummy)
{
        int i;

        if ((TCP_TWKILL_SLOTS - 1) > (sizeof(twkill_thread_slots) * 8))
                twkill_slots_invalid();

        while (twkill_thread_slots) {
                spin_lock_bh(&tw_death_lock);
                for (i = 0; i < TCP_TWKILL_SLOTS; i++) {
                        if (!(twkill_thread_slots & (1 << i)))
                                continue;

                        while (tcp_do_twkill_work(i, TCP_TWKILL_QUOTA) != 0) {
                                if (need_resched()) {
                                        spin_unlock_bh(&tw_death_lock);
                                        schedule();
                                        spin_lock_bh(&tw_death_lock);
                                }
                        }

                        twkill_thread_slots &= ~(1 << i);
                }
                spin_unlock_bh(&tw_death_lock);
        }
}

/* These are always called from BH context.  See callers in
 * tcp_input.c to verify this.
 */

/* This is for handling early-kills of TIME_WAIT sockets. */
void tcp_tw_deschedule(struct inet_timewait_sock *tw)
{
        spin_lock(&tw_death_lock);
        if (inet_twsk_del_dead_node(tw)) {
                inet_twsk_put(tw);
                if (--tcp_tw_count == 0)
                        del_timer(&tcp_tw_timer);
        }
        spin_unlock(&tw_death_lock);
        __inet_twsk_kill(tw, &tcp_hashinfo);
}

/* Short-time timewait calendar */

static int tcp_twcal_hand = -1;
static int tcp_twcal_jiffie;
static void tcp_twcal_tick(unsigned long);
static struct timer_list tcp_twcal_timer =
                TIMER_INITIALIZER(tcp_twcal_tick, 0, 0);
static struct hlist_head tcp_twcal_row[TCP_TW_RECYCLE_SLOTS];

static void tcp_tw_schedule(struct inet_timewait_sock *tw, const int timeo)
{
        struct hlist_head *list;
        int slot;

        /* timeout := RTO * 3.5
         *
         * 3.5 = 1+2+0.5 to wait for two retransmits.
         *
         * RATIONALE: if FIN arrived and we entered TIME-WAIT state,
         * our ACK acking that FIN can be lost. If N subsequent retransmitted
         * FINs (or previous segments) are lost (the probability of such an
         * event is p^(N+1), where p is the probability of losing a single
         * packet and the time to detect the loss is about RTO*(2^N - 1) with
         * exponential backoff). The normal timewait length is calculated so
         * that we wait at least for one retransmitted FIN (maximal RTO is
         * 120 sec).
         * [ BTW Linux, following BSD, violates this requirement by waiting
         *   only for 60sec; we should wait at least for 240 secs.
         *   Well, 240 consumes too much of resources 8)
         * ]
         * This interval is not reduced to catch old duplicates and
         * responses to our wandering segments living for two MSLs.
         * However, if we use PAWS to detect
         * old duplicates, we can reduce the interval to bounds required
         * by RTO, rather than MSL. So, if the peer understands PAWS, we
         * kill the tw bucket after 3.5*RTO (it is important that this number
         * is greater than the TS tick!) and detect old duplicates with the
         * help of PAWS.
         */
        slot = (timeo + (1<<TCP_TW_RECYCLE_TICK) - 1) >> TCP_TW_RECYCLE_TICK;
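        /* The computation above rounds timeo up to whole recycle ticks.
         * Illustrative numbers only (assuming, purely for illustration, a
         * recycle tick of 2^9 = 512 jiffies): a 3.5*RTO timeout of 700
         * jiffies gives (700 + 511) >> 9 == 2, i.e. calendar slot 2.
         */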

        spin_lock(&tw_death_lock);

        /* Unlink it, if it was scheduled */
        if (inet_twsk_del_dead_node(tw))
                tcp_tw_count--;
        else
                atomic_inc(&tw->tw_refcnt);

        if (slot >= TCP_TW_RECYCLE_SLOTS) {
                /* Schedule to slow timer */
                if (timeo >= TCP_TIMEWAIT_LEN) {
                        slot = TCP_TWKILL_SLOTS-1;
                } else {
                        slot = (timeo + TCP_TWKILL_PERIOD-1) / TCP_TWKILL_PERIOD;
                        if (slot >= TCP_TWKILL_SLOTS)
                                slot = TCP_TWKILL_SLOTS-1;
                }
                tw->tw_ttd = jiffies + timeo;
                slot = (tcp_tw_death_row_slot + slot) & (TCP_TWKILL_SLOTS - 1);
                list = &tcp_tw_death_row[slot];
        } else {
                tw->tw_ttd = jiffies + (slot << TCP_TW_RECYCLE_TICK);

                if (tcp_twcal_hand < 0) {
                        tcp_twcal_hand = 0;
                        tcp_twcal_jiffie = jiffies;
                        tcp_twcal_timer.expires = tcp_twcal_jiffie + (slot<<TCP_TW_RECYCLE_TICK);
                        add_timer(&tcp_twcal_timer);
                } else {
                        if (time_after(tcp_twcal_timer.expires, jiffies + (slot<<TCP_TW_RECYCLE_TICK)))
                                mod_timer(&tcp_twcal_timer, jiffies + (slot<<TCP_TW_RECYCLE_TICK));
                        slot = (tcp_twcal_hand + slot)&(TCP_TW_RECYCLE_SLOTS-1);
                }
                list = &tcp_twcal_row[slot];
        }

        hlist_add_head(&tw->tw_death_node, list);

        if (tcp_tw_count++ == 0)
                mod_timer(&tcp_tw_timer, jiffies+TCP_TWKILL_PERIOD);
        spin_unlock(&tw_death_lock);
}

void tcp_twcal_tick(unsigned long dummy)
{
        int n, slot;
        unsigned long j;
        unsigned long now = jiffies;
        int killed = 0;
        int adv = 0;

        spin_lock(&tw_death_lock);
        if (tcp_twcal_hand < 0)
                goto out;

        slot = tcp_twcal_hand;
        j = tcp_twcal_jiffie;

        for (n=0; n<TCP_TW_RECYCLE_SLOTS; n++) {
                if (time_before_eq(j, now)) {
                        struct hlist_node *node, *safe;
                        struct inet_timewait_sock *tw;

                        inet_twsk_for_each_inmate_safe(tw, node, safe,
                                                       &tcp_twcal_row[slot]) {
                                __inet_twsk_del_dead_node(tw);
                                __inet_twsk_kill(tw, &tcp_hashinfo);
                                inet_twsk_put(tw);
                                killed++;
                        }
                } else {
                        if (!adv) {
                                adv = 1;
                                tcp_twcal_jiffie = j;
                                tcp_twcal_hand = slot;
                        }

                        if (!hlist_empty(&tcp_twcal_row[slot])) {
                                mod_timer(&tcp_twcal_timer, j);
                                goto out;
                        }
                }
                j += (1<<TCP_TW_RECYCLE_TICK);
                slot = (slot+1)&(TCP_TW_RECYCLE_SLOTS-1);
        }
        tcp_twcal_hand = -1;

out:
        if ((tcp_tw_count -= killed) == 0)
                del_timer(&tcp_tw_timer);
        NET_ADD_STATS_BH(LINUX_MIB_TIMEWAITKILLED, killed);
        spin_unlock(&tw_death_lock);
}

/* This is not only more efficient than what we used to do, it eliminates
 * a lot of code duplication between IPv4/IPv6 SYN recv processing. -DaveM
 *
 * Actually, we could avoid lots of memory writes here. tp of the listening
 * socket contains all the necessary default parameters.
 */
struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req, struct sk_buff *skb)
{
        /* Allocate the newsk from the same slab as the master sock; if not,
         * at sk_free time we'll try to free it from the wrong slabcache
         * (i.e. is it TCPv4 or v6?). This is handled thru sk->sk_prot. -acme */
        struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, sk->sk_prot, 0);

        if (newsk != NULL) {
                struct inet_request_sock *ireq = inet_rsk(req);
                struct tcp_request_sock *treq = tcp_rsk(req);
                struct inet_sock *newinet = inet_sk(newsk);
                struct tcp_sock *newtp;
                struct sk_filter *filter;

                memcpy(newsk, sk, sizeof(struct tcp_sock));
                newsk->sk_state = TCP_SYN_RECV;

                /* SANITY */
                sk_node_init(&newsk->sk_node);
                newinet->bind_hash = NULL;

                /* Clone the TCP header template */
                newinet->dport = ireq->rmt_port;

                sock_lock_init(newsk);
                bh_lock_sock(newsk);

                rwlock_init(&newsk->sk_dst_lock);
                newsk->sk_dst_cache = NULL;
                atomic_set(&newsk->sk_rmem_alloc, 0);
                skb_queue_head_init(&newsk->sk_receive_queue);
                atomic_set(&newsk->sk_wmem_alloc, 0);
                skb_queue_head_init(&newsk->sk_write_queue);
                atomic_set(&newsk->sk_omem_alloc, 0);
                newsk->sk_wmem_queued = 0;
                newsk->sk_forward_alloc = 0;

                sock_reset_flag(newsk, SOCK_DONE);
                newsk->sk_userlocks = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
                newsk->sk_backlog.head = newsk->sk_backlog.tail = NULL;
                newsk->sk_send_head = NULL;
                rwlock_init(&newsk->sk_callback_lock);
                skb_queue_head_init(&newsk->sk_error_queue);
                newsk->sk_write_space = sk_stream_write_space;

                if ((filter = newsk->sk_filter) != NULL)
                        sk_filter_charge(newsk, filter);

                if (unlikely(xfrm_sk_clone_policy(newsk))) {
                        /* It is still a raw copy of the parent, so invalidate
                         * the destructor and do a plain sk_free() */
                        newsk->sk_destruct = NULL;
                        sk_free(newsk);
                        return NULL;
                }

                /* Now setup tcp_sock */
                newtp = tcp_sk(newsk);
                newtp->pred_flags = 0;
                newtp->rcv_nxt = treq->rcv_isn + 1;
                newtp->snd_nxt = treq->snt_isn + 1;
                newtp->snd_una = treq->snt_isn + 1;
                newtp->snd_sml = treq->snt_isn + 1;

                tcp_prequeue_init(newtp);

                tcp_init_wl(newtp, treq->snt_isn, treq->rcv_isn);

                newtp->retransmits = 0;
                newtp->backoff = 0;
                newtp->srtt = 0;
                newtp->mdev = TCP_TIMEOUT_INIT;
                newtp->rto = TCP_TIMEOUT_INIT;

                newtp->packets_out = 0;
                newtp->left_out = 0;
                newtp->retrans_out = 0;
                newtp->sacked_out = 0;
                newtp->fackets_out = 0;
                newtp->snd_ssthresh = 0x7fffffff;

                /* So many TCP implementations out there (incorrectly) count the
                 * initial SYN frame in their delayed-ACK and congestion control
                 * algorithms that we must have the following bandaid to talk
                 * efficiently to them.  -DaveM
                 */
                newtp->snd_cwnd = 2;
                newtp->snd_cwnd_cnt = 0;

                newtp->frto_counter = 0;
                newtp->frto_highmark = 0;

                newtp->ca_ops = &tcp_reno;

                tcp_set_ca_state(newtp, TCP_CA_Open);
                tcp_init_xmit_timers(newsk);
                skb_queue_head_init(&newtp->out_of_order_queue);
                newtp->rcv_wup = treq->rcv_isn + 1;
                newtp->write_seq = treq->snt_isn + 1;
                newtp->pushed_seq = newtp->write_seq;
                newtp->copied_seq = treq->rcv_isn + 1;

                newtp->rx_opt.saw_tstamp = 0;

                newtp->rx_opt.dsack = 0;
                newtp->rx_opt.eff_sacks = 0;

                newtp->probes_out = 0;
                newtp->rx_opt.num_sacks = 0;
                newtp->urg_data = 0;
                /* Deinitialize accept_queue to trap illegal accesses. */
                memset(&newtp->accept_queue, 0, sizeof(newtp->accept_queue));

                /* Back to base struct sock members. */
                newsk->sk_err = 0;
                newsk->sk_priority = 0;
                atomic_set(&newsk->sk_refcnt, 2);

                /*
                 * Increment the counter in the same struct proto as the master
                 * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
                 * is the same as sk->sk_prot->socks, as this field was copied
                 * with memcpy), same rationale as the first comment in this
                 * function.
                 *
                 * This _changes_ the previous behaviour, where
                 * tcp_create_openreq_child always was incrementing the
                 * equivalent of tcp_prot->socks (inet_sock_nr), so this has
                 * to be taken into account in all callers. -acme
                 */
                sk_refcnt_debug_inc(newsk);

                atomic_inc(&tcp_sockets_allocated);

                if (sock_flag(newsk, SOCK_KEEPOPEN))
                        tcp_reset_keepalive_timer(newsk,
                                                  keepalive_time_when(newtp));
                newsk->sk_socket = NULL;
                newsk->sk_sleep = NULL;

                newtp->rx_opt.tstamp_ok = ireq->tstamp_ok;
                if ((newtp->rx_opt.sack_ok = ireq->sack_ok) != 0) {
                        if (sysctl_tcp_fack)
                                newtp->rx_opt.sack_ok |= 2;
                }
                newtp->window_clamp = req->window_clamp;
                newtp->rcv_ssthresh = req->rcv_wnd;
                newtp->rcv_wnd = req->rcv_wnd;
                newtp->rx_opt.wscale_ok = ireq->wscale_ok;
                if (newtp->rx_opt.wscale_ok) {
                        newtp->rx_opt.snd_wscale = ireq->snd_wscale;
                        newtp->rx_opt.rcv_wscale = ireq->rcv_wscale;
                } else {
                        newtp->rx_opt.snd_wscale = newtp->rx_opt.rcv_wscale = 0;
                        newtp->window_clamp = min(newtp->window_clamp, 65535U);
                }
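                /* The peer's raw 16-bit window from the header is shifted by
                 * the negotiated send window scale; e.g. a raw value of 5840
                 * with snd_wscale == 2 gives 5840 << 2 == 23360 bytes
                 * (worked example, for illustration only).
                 */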
                newtp->snd_wnd = ntohs(skb->h.th->window) << newtp->rx_opt.snd_wscale;
                newtp->max_window = newtp->snd_wnd;

                if (newtp->rx_opt.tstamp_ok) {
                        newtp->rx_opt.ts_recent = req->ts_recent;
                        newtp->rx_opt.ts_recent_stamp = xtime.tv_sec;
                        newtp->tcp_header_len = sizeof(struct tcphdr) + TCPOLEN_TSTAMP_ALIGNED;
                } else {
                        newtp->rx_opt.ts_recent_stamp = 0;
                        newtp->tcp_header_len = sizeof(struct tcphdr);
                }
                if (skb->len >= TCP_MIN_RCVMSS+newtp->tcp_header_len)
                        newtp->ack.last_seg_size = skb->len-newtp->tcp_header_len;
                newtp->rx_opt.mss_clamp = req->mss;
                TCP_ECN_openreq_child(newtp, req);
                if (newtp->ecn_flags&TCP_ECN_OK)
                        sock_set_flag(newsk, SOCK_NO_LARGESEND);

                TCP_INC_STATS_BH(TCP_MIB_PASSIVEOPENS);
        }
        return newsk;
}

/*
 *      Process an incoming packet for SYN_RECV sockets represented
 *      as a request_sock.
 */

struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
                           struct request_sock *req,
                           struct request_sock **prev)
{
        struct tcphdr *th = skb->h.th;
        struct tcp_sock *tp = tcp_sk(sk);
        u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
        int paws_reject = 0;
        struct tcp_options_received tmp_opt;
        struct sock *child;

        tmp_opt.saw_tstamp = 0;
        if (th->doff > (sizeof(struct tcphdr)>>2)) {
                tcp_parse_options(skb, &tmp_opt, 0);

                if (tmp_opt.saw_tstamp) {
                        tmp_opt.ts_recent = req->ts_recent;
                        /* We do not store the true stamp, but it is not
                         * required; it can be estimated (approximately)
                         * from other data.
                         */
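                        /* Illustrative backdating, assuming a TCP_TIMEOUT_INIT
                         * of 3*HZ: after two SYNACK retransmits the estimate is
                         * xtime.tv_sec - (3 << 2), i.e. 12 seconds in the past.
                         */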
                        tmp_opt.ts_recent_stamp = xtime.tv_sec - ((TCP_TIMEOUT_INIT/HZ)<<req->retrans);
                        paws_reject = tcp_paws_check(&tmp_opt, th->rst);
                }
        }

        /* Check for pure retransmitted SYN. */
        if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn &&
            flg == TCP_FLAG_SYN &&
            !paws_reject) {
                /*
                 * RFC793 draws (incorrectly! It was fixed in RFC1122)
                 * this case on figure 6 and figure 8, but the formal
                 * protocol description says NOTHING.
                 * To be more exact, it says that we should send ACK,
                 * because this segment (at least, if it has no data)
                 * is out of window.
                 *
                 *  CONCLUSION: RFC793 (even with RFC1122) DOES NOT
                 *  describe SYN-RECV state. All the description
                 *  is wrong, we cannot believe it and should
                 *  rely only on common sense and implementation
                 *  experience.
                 *
                 * Enforce "SYN-ACK" according to figure 8, figure 6
                 * of RFC793, fixed by RFC1122.
                 */
                req->rsk_ops->rtx_syn_ack(sk, req, NULL);
                return NULL;
        }

        /* Further reproduces section "SEGMENT ARRIVES"
           for state SYN-RECEIVED of RFC793.
           It is broken too; however, it fails only
           when SYNs are crossed.

           You would think that SYN crossing is impossible here, since
           we should have a SYN_SENT socket (from connect()) on our end,
           but this is not true if the crossed SYNs were sent to both
           ends by a malicious third party.  We must defend against this,
           and to do that we first verify the ACK (as per RFC793, page
           36) and reset if it is invalid.  Is this a true full defense?
           To convince ourselves, let us consider a way in which the ACK
           test can still pass in this 'malicious crossed SYNs' case.
           Malicious sender sends identical SYNs (and thus identical sequence
           numbers) to both A and B:

                A: gets SYN, seq=7
                B: gets SYN, seq=7

           By our good fortune, both A and B select the same initial
           send sequence number of seven :-)

                A: sends SYN|ACK, seq=7, ack_seq=8
                B: sends SYN|ACK, seq=7, ack_seq=8

           So we are now A eating this SYN|ACK, ACK test passes.  So
           does sequence test, SYN is truncated, and thus we consider
           it a bare ACK.

           If tp->defer_accept, we silently drop this bare ACK.  Otherwise,
           we create an established connection.  Both ends (listening sockets)
           accept the new incoming connection and try to talk to each other. 8-)

           Note: This case is both harmless, and rare.  The possibility is about
           the same as us discovering intelligent life on another planet tomorrow.

           But generally, we should (the RFC lies!) accept an ACK
           of our SYNACK both here and in tcp_rcv_state_process().
           tcp_rcv_state_process() does not, hence we do not either.

           Note that the case is absolutely generic:
           we cannot optimize anything here without
           violating the protocol. All the checks must be made
           before attempting to create the socket.
         */

        /* RFC793 page 36: "If the connection is in any non-synchronized state ...
         *                  and the incoming segment acknowledges something not yet
         *                  sent (the segment carries an unacceptable ACK) ...
         *                  a reset is sent."
         *
         * Invalid ACK: reset will be sent by listening socket
         */
        if ((flg & TCP_FLAG_ACK) &&
            (TCP_SKB_CB(skb)->ack_seq != tcp_rsk(req)->snt_isn + 1))
                return sk;

        /* Also, it would not be a bad idea to check rcv_tsecr, which
         * is essentially an ACK extension; too early or too late values
         * should cause a reset in unsynchronized states.
         */

        /* RFC793: "first check sequence number". */

        if (paws_reject || !tcp_in_window(TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq,
                                          tcp_rsk(req)->rcv_isn + 1, tcp_rsk(req)->rcv_isn + 1 + req->rcv_wnd)) {
                /* Out of window: send ACK and drop. */
                if (!(flg & TCP_FLAG_RST))
                        req->rsk_ops->send_ack(skb, req);
                if (paws_reject)
                        NET_INC_STATS_BH(LINUX_MIB_PAWSESTABREJECTED);
                return NULL;
        }

        /* In sequence, PAWS is OK. */

        if (tmp_opt.saw_tstamp && !after(TCP_SKB_CB(skb)->seq, tcp_rsk(req)->rcv_isn + 1))
                req->ts_recent = tmp_opt.rcv_tsval;

        if (TCP_SKB_CB(skb)->seq == tcp_rsk(req)->rcv_isn) {
                /* Truncate SYN, it is out of window starting
                   at tcp_rsk(req)->rcv_isn + 1. */
                flg &= ~TCP_FLAG_SYN;
        }

        /* RFC793: "second check the RST bit" and
         *         "fourth, check the SYN bit"
         */
        if (flg & (TCP_FLAG_RST|TCP_FLAG_SYN))
                goto embryonic_reset;

        /* ACK sequence verified above, just make sure ACK is
         * set.  If ACK not set, just silently drop the packet.
         */
        if (!(flg & TCP_FLAG_ACK))
                return NULL;

        /* If TCP_DEFER_ACCEPT is set, drop bare ACK. */
        if (tp->defer_accept && TCP_SKB_CB(skb)->end_seq == tcp_rsk(req)->rcv_isn + 1) {
                inet_rsk(req)->acked = 1;
                return NULL;
        }

        /* OK, ACK is valid, create big socket and
         * feed this segment to it. It will repeat all
         * the tests. THIS SEGMENT MUST MOVE SOCKET TO
         * ESTABLISHED STATE. If it is dropped after the
         * socket is created, wait for troubles.
         */
        child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL);
        if (child == NULL)
                goto listen_overflow;

        tcp_synq_unlink(tp, req, prev);
        tcp_synq_removed(sk, req);

        tcp_acceptq_queue(sk, req, child);
        return child;

listen_overflow:
        if (!sysctl_tcp_abort_on_overflow) {
                inet_rsk(req)->acked = 1;
                return NULL;
        }

embryonic_reset:
        NET_INC_STATS_BH(LINUX_MIB_EMBRYONICRSTS);
        if (!(flg & TCP_FLAG_RST))
                req->rsk_ops->send_reset(skb);

        tcp_synq_drop(sk, req, prev);
        return NULL;
}

/*
 * Queue the segment on the new socket if the new socket is active,
 * otherwise we just shortcircuit this and continue with
 * the new socket.
 */

int tcp_child_process(struct sock *parent, struct sock *child,
                      struct sk_buff *skb)
{
        int ret = 0;
        int state = child->sk_state;

        if (!sock_owned_by_user(child)) {
                ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len);

                /* Wakeup parent, send SIGIO */
                if (state == TCP_SYN_RECV && child->sk_state != state)
                        parent->sk_data_ready(parent, 0);
        } else {
                /* Alas, it is possible again, because we do the lookup
                 * in the main socket hash table and the lock on the
                 * listening socket does not protect us any more.
                 */
                sk_add_backlog(child, skb);
        }

        bh_unlock_sock(child);
        sock_put(child);
        return ret;
}

EXPORT_SYMBOL(tcp_check_req);
EXPORT_SYMBOL(tcp_child_process);
EXPORT_SYMBOL(tcp_create_openreq_child);
EXPORT_SYMBOL(tcp_timewait_state_process);
EXPORT_SYMBOL(tcp_tw_deschedule);