2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * Implementation of the Transmission Control Protocol(TCP).
8 * Version: $Id: tcp_ipv4.c,v 1.240 2002/02/01 22:01:04 davem Exp $
10 * IPv4 specific functions
15 * linux/ipv4/tcp_input.c
16 * linux/ipv4/tcp_output.c
18 * See tcp.c for author information
20 * This program is free software; you can redistribute it and/or
21 * modify it under the terms of the GNU General Public License
22 * as published by the Free Software Foundation; either version
23 * 2 of the License, or (at your option) any later version.
28 * David S. Miller : New socket lookup architecture.
29 * This code is dedicated to John Dyson.
30 * David S. Miller : Change semantics of established hash,
31 * half is devoted to TIME_WAIT sockets
32 * and the rest go in the other half.
33 * Andi Kleen : Add support for syncookies and fixed
34 * some bugs: ip options weren't passed to
35 * the TCP layer, missed a check for an
37 * Andi Kleen : Implemented fast path mtu discovery.
38 * Fixed many serious bugs in the
39 * request_sock handling and moved
40 * most of it into the af independent code.
41 * Added tail drop and some other bugfixes.
42 * Added new listen semantics.
43 * Mike McLagan : Routing by source
44 * Juan Jose Ciarlante: ip_dynaddr bits
45 * Andi Kleen: various fixes.
46 * Vitaly E. Lavrov : Transparent proxy revived after year
48 * Andi Kleen : Fix new listen.
49 * Andi Kleen : Fix accept error reporting.
50 * YOSHIFUJI Hideaki @USAGI and: Support IPV6_V6ONLY socket option, which
51 * Alexey Kuznetsov allows both IPv4 and IPv6 sockets to bind
52 * to a single port at the same time.
55 #include <linux/config.h>
57 #include <linux/types.h>
58 #include <linux/fcntl.h>
59 #include <linux/module.h>
60 #include <linux/random.h>
61 #include <linux/cache.h>
62 #include <linux/jhash.h>
63 #include <linux/init.h>
64 #include <linux/times.h>
67 #include <net/inet_hashtables.h>
70 #include <net/inet_common.h>
73 #include <linux/inet.h>
74 #include <linux/ipv6.h>
75 #include <linux/stddef.h>
76 #include <linux/proc_fs.h>
77 #include <linux/seq_file.h>
79 extern int sysctl_ip_dynaddr;
80 int sysctl_tcp_tw_reuse;
81 int sysctl_tcp_low_latency;
83 /* Check TCP sequence numbers in ICMP packets. */
84 #define ICMP_MIN_LENGTH 8
86 /* Socket used for sending RSTs */
87 static struct socket *tcp_socket;
89 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
92 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
93 .lhash_lock = RW_LOCK_UNLOCKED,
94 .lhash_users = ATOMIC_INIT(0),
95 .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
96 .portalloc_lock = SPIN_LOCK_UNLOCKED,
97 .port_rover = 1024 - 1,
101 * This array holds the first and last local port number.
102 * For high-usage systems, use sysctl to change this to
105 int sysctl_local_port_range[2] = { 1024, 4999 };
107 static inline int tcp_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb)
109 const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk);
111 struct hlist_node *node;
112 int reuse = sk->sk_reuse;
114 sk_for_each_bound(sk2, node, &tb->owners) {
116 !tcp_v6_ipv6only(sk2) &&
117 (!sk->sk_bound_dev_if ||
118 !sk2->sk_bound_dev_if ||
119 sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) {
120 if (!reuse || !sk2->sk_reuse ||
121 sk2->sk_state == TCP_LISTEN) {
122 const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2);
123 if (!sk2_rcv_saddr || !sk_rcv_saddr ||
124 sk2_rcv_saddr == sk_rcv_saddr)
132 /* Obtain a reference to a local port for the given sock;
133 * if snum is zero, it means select any available local port.
135 static int tcp_v4_get_port(struct sock *sk, unsigned short snum)
137 struct inet_bind_hashbucket *head;
138 struct hlist_node *node;
139 struct inet_bind_bucket *tb;
144 int low = sysctl_local_port_range[0];
145 int high = sysctl_local_port_range[1];
146 int remaining = (high - low) + 1;
149 spin_lock(&tcp_hashinfo.portalloc_lock);
150 if (tcp_hashinfo.port_rover < low)
153 rover = tcp_hashinfo.port_rover;
158 head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)];
159 spin_lock(&head->lock);
160 inet_bind_bucket_for_each(tb, node, &head->chain)
161 if (tb->port == rover)
165 spin_unlock(&head->lock);
166 } while (--remaining > 0);
167 tcp_hashinfo.port_rover = rover;
168 spin_unlock(&tcp_hashinfo.portalloc_lock);
170 /* Exhausted local port range during search? It is not
171 * possible for us to be holding one of the bind hash
172 * locks if this test triggers, because if 'remaining'
173 * drops to zero, we broke out of the do/while loop at
174 * the top level, not from the 'break;' statement.
177 if (unlikely(remaining <= 0))
180 /* OK, here is the one we will use. HEAD is
181 * non-NULL and we hold its mutex.
185 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
186 spin_lock(&head->lock);
187 inet_bind_bucket_for_each(tb, node, &head->chain)
188 if (tb->port == snum)
194 if (!hlist_empty(&tb->owners)) {
195 if (sk->sk_reuse > 1)
197 if (tb->fastreuse > 0 &&
198 sk->sk_reuse && sk->sk_state != TCP_LISTEN) {
202 if (tcp_bind_conflict(sk, tb))
208 if (!tb && (tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum)) == NULL)
210 if (hlist_empty(&tb->owners)) {
211 if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
215 } else if (tb->fastreuse &&
216 (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
219 if (!inet_sk(sk)->bind_hash)
220 inet_bind_hash(sk, tb, snum);
221 BUG_TRAP(inet_sk(sk)->bind_hash == tb);
225 spin_unlock(&head->lock);
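/*
 * Illustrative sketch, not part of the original file: the autobind path
 * above simply advances a "rover" through sysctl_local_port_range,
 * wrapping from high back to low, and stops at the first port whose
 * bind bucket is usable.  The helper name and the is_port_free()
 * callback are hypothetical; locking and error handling are condensed
 * to a return value.
 */
#if 0
static int example_pick_local_port(int low, int high, int *rover,
				   int (*is_port_free)(int port))
{
	int remaining = (high - low) + 1;

	do {
		if (++(*rover) > high || *rover < low)
			*rover = low;
		if (is_port_free(*rover))
			return *rover;		/* usable port found */
	} while (--remaining > 0);

	return -1;				/* local port range exhausted */
}
#endif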
231 static void tcp_v4_hash(struct sock *sk)
233 inet_hash(&tcp_hashinfo, sk);
236 void tcp_unhash(struct sock *sk)
238 inet_unhash(&tcp_hashinfo, sk);
241 /* Sockets in TCP_CLOSE state are _always_ taken out of the hash, so
242 * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM
244 * Local BH must be disabled here.
247 static inline struct sock *__tcp_v4_lookup_established(const u32 saddr,
253 struct inet_ehash_bucket *head;
254 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
255 __u32 ports = TCP_COMBINED_PORTS(sport, hnum);
257 struct hlist_node *node;
258 /* Optimize here for direct hit; only listening connections can
259 * have wildcards anyway.
261 const int hash = inet_ehashfn(daddr, hnum, saddr, sport, tcp_hashinfo.ehash_size);
262 head = &tcp_hashinfo.ehash[hash];
263 read_lock(&head->lock);
264 sk_for_each(sk, node, &head->chain) {
265 if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif))
266 goto hit; /* You sunk my battleship! */
269 /* Must check for a TIME_WAIT'er before going to listener hash. */
270 sk_for_each(sk, node, &(head + tcp_hashinfo.ehash_size)->chain) {
271 if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif))
276 read_unlock(&head->lock);
283 static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport,
284 u32 daddr, u16 hnum, int dif)
286 struct sock *sk = __tcp_v4_lookup_established(saddr, sport,
289 return sk ? : inet_lookup_listener(&tcp_hashinfo, daddr, hnum, dif);
292 inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr,
298 sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif);
304 EXPORT_SYMBOL_GPL(tcp_v4_lookup);
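/*
 * Illustrative sketch, not part of the original file: the established
 * lookup above is fast because the two 16-bit ports are packed into a
 * single 32-bit word (TCP_COMBINED_PORTS) and the addresses into an
 * "address cookie", so matching a 4-tuple collapses into a couple of
 * word compares.  The kernel's real packing is endian-dependent; the
 * layout below only shows the idea.
 */
#if 0
static inline u32 example_combine_ports(u16 sport, u16 dport)
{
	return ((u32)sport << 16) | (u32)dport;
}

static inline int example_tuple_match(u32 c_saddr, u32 c_daddr, u32 c_ports,
				      u32 saddr, u32 daddr, u32 ports)
{
	return c_saddr == saddr && c_daddr == daddr && c_ports == ports;
}
#endif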
306 static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb)
308 return secure_tcp_sequence_number(skb->nh.iph->daddr,
314 /* called with local bh disabled */
315 static int __tcp_v4_check_established(struct sock *sk, __u16 lport,
316 struct tcp_tw_bucket **twp)
318 struct inet_sock *inet = inet_sk(sk);
319 u32 daddr = inet->rcv_saddr;
320 u32 saddr = inet->daddr;
321 int dif = sk->sk_bound_dev_if;
322 TCP_V4_ADDR_COOKIE(acookie, saddr, daddr)
323 __u32 ports = TCP_COMBINED_PORTS(inet->dport, lport);
324 const int hash = inet_ehashfn(daddr, lport, saddr, inet->dport, tcp_hashinfo.ehash_size);
325 struct inet_ehash_bucket *head = &tcp_hashinfo.ehash[hash];
327 struct hlist_node *node;
328 struct tcp_tw_bucket *tw;
330 write_lock(&head->lock);
332 /* Check TIME-WAIT sockets first. */
333 sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) {
334 tw = (struct tcp_tw_bucket *)sk2;
336 if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) {
337 struct tcp_sock *tp = tcp_sk(sk);
339 /* With PAWS, it is safe from the viewpoint
340 of data integrity. Even without PAWS it
341 is safe provided sequence spaces do not
342 overlap i.e. at data rates <= 80Mbit/sec.
344 Actually, the idea is close to VJ's: only the
345 timestamp cache is held not per host,
346 but per port pair, and the TW bucket is used
349 If the TW bucket has already been destroyed we
350 fall back to VJ's scheme and use the initial
351 timestamp retrieved from the peer table.
353 if (tw->tw_ts_recent_stamp &&
354 (!twp || (sysctl_tcp_tw_reuse &&
356 tw->tw_ts_recent_stamp > 1))) {
358 tw->tw_snd_nxt + 65535 + 2) == 0)
360 tp->rx_opt.ts_recent = tw->tw_ts_recent;
361 tp->rx_opt.ts_recent_stamp = tw->tw_ts_recent_stamp;
370 /* And established part... */
371 sk_for_each(sk2, node, &head->chain) {
372 if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif))
377 /* Must record num and sport now. Otherwise we will see
378 * a socket with a funny identity in the hash table. */
380 inet->sport = htons(lport);
381 sk->sk_hashent = hash;
382 BUG_TRAP(sk_unhashed(sk));
383 __sk_add_node(sk, &head->chain);
384 sock_prot_inc_use(sk->sk_prot);
385 write_unlock(&head->lock);
389 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
391 /* Silly. Should hash-dance instead... */
392 tcp_tw_deschedule(tw);
393 NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED);
401 write_unlock(&head->lock);
402 return -EADDRNOTAVAIL;
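/*
 * Illustrative sketch, not part of the original file: when a TIME-WAIT
 * bucket is recycled above, the new connection's initial write_seq is
 * placed just past the old incarnation's snd_nxt plus a 64K guard, so
 * the two sequence spaces cannot overlap; a result of zero is bumped to
 * one because zero is treated specially.
 */
#if 0
static u32 example_recycled_isn(u32 old_snd_nxt)
{
	u32 isn = old_snd_nxt + 65535 + 2;

	return isn ? isn : 1;
}
#endif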
405 static inline u32 connect_port_offset(const struct sock *sk)
407 const struct inet_sock *inet = inet_sk(sk);
409 return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr,
414 * Bind a port for a connect operation and hash it.
416 static inline int tcp_v4_hash_connect(struct sock *sk)
418 const unsigned short snum = inet_sk(sk)->num;
419 struct inet_bind_hashbucket *head;
420 struct inet_bind_bucket *tb;
424 int low = sysctl_local_port_range[0];
425 int high = sysctl_local_port_range[1];
426 int range = high - low;
430 u32 offset = hint + connect_port_offset(sk);
431 struct hlist_node *node;
432 struct tcp_tw_bucket *tw = NULL;
435 for (i = 1; i <= range; i++) {
436 port = low + (i + offset) % range;
437 head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)];
438 spin_lock(&head->lock);
440 /* Does not bother with rcv_saddr checks,
441 * because the established check is already
444 inet_bind_bucket_for_each(tb, node, &head->chain) {
445 if (tb->port == port) {
446 BUG_TRAP(!hlist_empty(&tb->owners));
447 if (tb->fastreuse >= 0)
449 if (!__tcp_v4_check_established(sk,
457 tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port);
459 spin_unlock(&head->lock);
466 spin_unlock(&head->lock);
470 return -EADDRNOTAVAIL;
475 /* Head lock still held and bh's disabled */
476 inet_bind_hash(sk, tb, port);
477 if (sk_unhashed(sk)) {
478 inet_sk(sk)->sport = htons(port);
479 __inet_hash(&tcp_hashinfo, sk, 0);
481 spin_unlock(&head->lock);
484 tcp_tw_deschedule(tw);
492 head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)];
493 tb = inet_sk(sk)->bind_hash;
494 spin_lock_bh(&head->lock);
495 if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
496 __inet_hash(&tcp_hashinfo, sk, 0);
497 spin_unlock_bh(&head->lock);
500 spin_unlock(&head->lock);
501 /* No definite answer... Walk to established hash table */
502 ret = __tcp_v4_check_established(sk, snum, NULL);
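/*
 * Illustrative sketch, not part of the original file: for an autobound
 * connect(), the walk over the local port range starts at a
 * per-destination offset (a secure hash of the addresses and the
 * destination port) plus a rolling hint, so different destinations
 * consume the range in different orders.  The usable() callback is
 * hypothetical; the real code also re-checks the established hash.
 */
#if 0
static int example_connect_port(int low, int range, u32 offset,
				int (*usable)(int port))
{
	int i;

	for (i = 1; i <= range; i++) {
		int port = low + (i + offset) % range;

		if (usable(port))
			return port;
	}
	return -1;		/* maps to -EADDRNOTAVAIL in the real code */
}
#endif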
509 /* This will initiate an outgoing connection. */
510 int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
512 struct inet_sock *inet = inet_sk(sk);
513 struct tcp_sock *tp = tcp_sk(sk);
514 struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
520 if (addr_len < sizeof(struct sockaddr_in))
523 if (usin->sin_family != AF_INET)
524 return -EAFNOSUPPORT;
526 nexthop = daddr = usin->sin_addr.s_addr;
527 if (inet->opt && inet->opt->srr) {
530 nexthop = inet->opt->faddr;
533 tmp = ip_route_connect(&rt, nexthop, inet->saddr,
534 RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
536 inet->sport, usin->sin_port, sk);
540 if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
545 if (!inet->opt || !inet->opt->srr)
549 inet->saddr = rt->rt_src;
550 inet->rcv_saddr = inet->saddr;
552 if (tp->rx_opt.ts_recent_stamp && inet->daddr != daddr) {
553 /* Reset inherited state */
554 tp->rx_opt.ts_recent = 0;
555 tp->rx_opt.ts_recent_stamp = 0;
559 if (sysctl_tcp_tw_recycle &&
560 !tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
561 struct inet_peer *peer = rt_get_peer(rt);
563 /* VJ's idea. We save the last timestamp seen from
564 * the destination in the peer table when entering TIME-WAIT state,
565 * and initialize rx_opt.ts_recent from it when trying a new connection.
568 if (peer && peer->tcp_ts_stamp + TCP_PAWS_MSL >= xtime.tv_sec) {
569 tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
570 tp->rx_opt.ts_recent = peer->tcp_ts;
574 inet->dport = usin->sin_port;
577 tp->ext_header_len = 0;
579 tp->ext_header_len = inet->opt->optlen;
581 tp->rx_opt.mss_clamp = 536;
583 /* Socket identity is still unknown (sport may be zero).
584 * However, we set the state to SYN-SENT and, without releasing the socket
585 * lock, select a source port, enter ourselves into the hash tables and
586 * complete initialization after this.
588 tcp_set_state(sk, TCP_SYN_SENT);
589 err = tcp_v4_hash_connect(sk);
593 err = ip_route_newports(&rt, inet->sport, inet->dport, sk);
597 /* OK, now commit destination to socket. */
598 sk_setup_caps(sk, &rt->u.dst);
601 tp->write_seq = secure_tcp_sequence_number(inet->saddr,
606 inet->id = tp->write_seq ^ jiffies;
608 err = tcp_connect(sk);
616 /* This unhashes the socket and releases the local port, if necessary. */
617 tcp_set_state(sk, TCP_CLOSE);
619 sk->sk_route_caps = 0;
624 static __inline__ int tcp_v4_iif(struct sk_buff *skb)
626 return ((struct rtable *)skb->dst)->rt_iif;
629 static __inline__ u32 tcp_v4_synq_hash(u32 raddr, u16 rport, u32 rnd)
631 return (jhash_2words(raddr, (u32) rport, rnd) & (TCP_SYNQ_HSIZE - 1));
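/*
 * Illustrative sketch, not part of the original file: the SYN queue is
 * a small hash table of TCP_SYNQ_HSIZE buckets, and a pending request
 * is bucketed by a Jenkins hash of the peer address and port mixed with
 * a per-listener random value, exactly as tcp_v4_synq_hash() above
 * does.  The mask is only valid because the table size is a power of
 * two.
 */
#if 0
static u32 example_synq_bucket(u32 raddr, u16 rport, u32 rnd, u32 nr_buckets)
{
	/* nr_buckets must be a power of two for the mask to be exact */
	return jhash_2words(raddr, (u32)rport, rnd) & (nr_buckets - 1);
}
#endif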
634 static struct request_sock *tcp_v4_search_req(struct tcp_sock *tp,
635 struct request_sock ***prevp,
637 __u32 raddr, __u32 laddr)
639 struct listen_sock *lopt = tp->accept_queue.listen_opt;
640 struct request_sock *req, **prev;
642 for (prev = &lopt->syn_table[tcp_v4_synq_hash(raddr, rport, lopt->hash_rnd)];
643 (req = *prev) != NULL;
644 prev = &req->dl_next) {
645 const struct inet_request_sock *ireq = inet_rsk(req);
647 if (ireq->rmt_port == rport &&
648 ireq->rmt_addr == raddr &&
649 ireq->loc_addr == laddr &&
650 TCP_INET_FAMILY(req->rsk_ops->family)) {
660 static void tcp_v4_synq_add(struct sock *sk, struct request_sock *req)
662 struct tcp_sock *tp = tcp_sk(sk);
663 struct listen_sock *lopt = tp->accept_queue.listen_opt;
664 u32 h = tcp_v4_synq_hash(inet_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd);
666 reqsk_queue_hash_req(&tp->accept_queue, h, req, TCP_TIMEOUT_INIT);
672 * This routine does path mtu discovery as defined in RFC1191.
674 static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph,
677 struct dst_entry *dst;
678 struct inet_sock *inet = inet_sk(sk);
679 struct tcp_sock *tp = tcp_sk(sk);
681 /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs
682 * sent out by Linux are always < 576 bytes, so they should go through
685 if (sk->sk_state == TCP_LISTEN)
688 /* We don't check in the dst entry if pmtu discovery is forbidden
689 * on this route. We just assume that no packet-too-big packets
690 * are sent back when pmtu discovery is not active.
691 * There is a small race when the user changes this flag in the
692 * route, but I think that's acceptable.
694 if ((dst = __sk_dst_check(sk, 0)) == NULL)
697 dst->ops->update_pmtu(dst, mtu);
699 /* Something is about to go wrong... Remember the soft error
700 * in case this connection is not able to recover.
702 if (mtu < dst_mtu(dst) && ip_dont_fragment(sk, dst))
703 sk->sk_err_soft = EMSGSIZE;
707 if (inet->pmtudisc != IP_PMTUDISC_DONT &&
708 tp->pmtu_cookie > mtu) {
709 tcp_sync_mss(sk, mtu);
711 /* Resend the TCP packet because it's
712 * clear that the old packet has been
713 * dropped. This is the new "fast" path mtu
716 tcp_simple_retransmit(sk);
717 } /* else let the usual retransmit timer handle it */
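/*
 * Illustrative sketch, not part of the original file: once a smaller
 * path MTU has been learned, the MSS the connection may use is roughly
 * the new MTU minus the IP and TCP header sizes (plus option overhead).
 * tcp_sync_mss() performs the real, more careful computation; this is
 * simplified arithmetic only.
 */
#if 0
static int example_mss_from_pmtu(int pmtu, int ip_hdr_len, int tcp_hdr_len)
{
	int mss = pmtu - ip_hdr_len - tcp_hdr_len;

	return mss > 0 ? mss : 0;
}
#endif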
721 * This routine is called by the ICMP module when it gets some
722 * sort of error condition. If err < 0 then the socket should
723 * be closed and the error returned to the user. If err > 0
724 * it's just the icmp type << 8 | icmp code. After adjustment
725 * header points to the first 8 bytes of the tcp header. We need
726 * to find the appropriate port.
728 * The locking strategy used here is very "optimistic". When
729 * someone else accesses the socket the ICMP is just dropped
730 * and for some paths there is no check at all.
731 * A more general error queue to queue errors for later handling
732 * is probably better.
736 void tcp_v4_err(struct sk_buff *skb, u32 info)
738 struct iphdr *iph = (struct iphdr *)skb->data;
739 struct tcphdr *th = (struct tcphdr *)(skb->data + (iph->ihl << 2));
741 struct inet_sock *inet;
742 int type = skb->h.icmph->type;
743 int code = skb->h.icmph->code;
748 if (skb->len < (iph->ihl << 2) + 8) {
749 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
753 sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr,
754 th->source, tcp_v4_iif(skb));
756 ICMP_INC_STATS_BH(ICMP_MIB_INERRORS);
759 if (sk->sk_state == TCP_TIME_WAIT) {
760 tcp_tw_put((struct tcp_tw_bucket *)sk);
765 /* If too many ICMPs get dropped on busy
766 * servers this needs to be solved differently.
768 if (sock_owned_by_user(sk))
769 NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS);
771 if (sk->sk_state == TCP_CLOSE)
775 seq = ntohl(th->seq);
776 if (sk->sk_state != TCP_LISTEN &&
777 !between(seq, tp->snd_una, tp->snd_nxt)) {
778 NET_INC_STATS(LINUX_MIB_OUTOFWINDOWICMPS);
783 case ICMP_SOURCE_QUENCH:
784 /* Just silently ignore these. */
786 case ICMP_PARAMETERPROB:
789 case ICMP_DEST_UNREACH:
790 if (code > NR_ICMP_UNREACH)
793 if (code == ICMP_FRAG_NEEDED) { /* PMTU discovery (RFC1191) */
794 if (!sock_owned_by_user(sk))
795 do_pmtu_discovery(sk, iph, info);
799 err = icmp_err_convert[code].errno;
801 case ICMP_TIME_EXCEEDED:
808 switch (sk->sk_state) {
809 struct request_sock *req, **prev;
811 if (sock_owned_by_user(sk))
814 req = tcp_v4_search_req(tp, &prev, th->dest,
815 iph->daddr, iph->saddr);
819 /* ICMPs are not backlogged, hence we cannot get
820 an established socket here.
824 if (seq != tcp_rsk(req)->snt_isn) {
825 NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS);
830 * Still in SYN_RECV, just remove it silently.
831 * There is no good way to pass the error to the newly
832 * created socket, and POSIX does not want network
833 * errors returned from accept().
835 tcp_synq_drop(sk, req, prev);
839 case TCP_SYN_RECV: /* Cannot happen.
840 Actually it can, e.g. if SYNs crossed.
842 if (!sock_owned_by_user(sk)) {
843 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
846 sk->sk_error_report(sk);
850 sk->sk_err_soft = err;
855 /* If we've already connected we will keep trying
856 * until we time out, or the user gives up.
858 * rfc1122 4.2.3.9 allows us to consider as hard errors
859 * only PROTO_UNREACH and PORT_UNREACH (well, FRAG_FAILED too,
860 * but it is obsoleted by pmtu discovery).
862 * Note that in the modern internet, where routing is unreliable
863 * and broken firewalls sit in every dark corner sending random
864 * errors ordered by their masters, even these two messages finally lose
865 * their original sense (even Linux sends invalid PORT_UNREACHs).
867 * Now we are in compliance with RFCs.
872 if (!sock_owned_by_user(sk) && inet->recverr) {
874 sk->sk_error_report(sk);
875 } else { /* Only an error on timeout */
876 sk->sk_err_soft = err;
884 /* This routine computes an IPv4 TCP checksum. */
885 void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len,
888 struct inet_sock *inet = inet_sk(sk);
890 if (skb->ip_summed == CHECKSUM_HW) {
891 th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0);
892 skb->csum = offsetof(struct tcphdr, check);
894 th->check = tcp_v4_check(th, len, inet->saddr, inet->daddr,
895 csum_partial((char *)th,
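/*
 * Illustrative sketch, not part of the original file: the TCP checksum
 * is the 16-bit one's-complement sum over a pseudo-header (addresses,
 * protocol, length) plus the TCP header and payload.  csum_partial()
 * and csum_tcpudp_magic() do this with arch-optimized code; below is
 * only the plain-C folding idea.
 */
#if 0
static u16 example_inet_csum(const u16 *words, int nwords, u32 sum)
{
	while (nwords--)
		sum += *words++;
	while (sum >> 16)			/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (u16)~sum;
}
#endif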
902 * This routine will send an RST to the other tcp.
904 * Someone asks: why do I NEVER use socket parameters (TOS, TTL etc.)?
906 * Answer: if a packet caused an RST, it is not for a socket
907 * existing in our system; if it is matched to a socket,
908 * it is just a duplicate segment or a bug in the other side's TCP.
909 * So we build the reply based only on the parameters
910 * that arrived with the segment.
911 * Exception: precedence violation. We do not implement it in any case.
914 static void tcp_v4_send_reset(struct sk_buff *skb)
916 struct tcphdr *th = skb->h.th;
918 struct ip_reply_arg arg;
920 /* Never send a reset in response to a reset. */
924 if (((struct rtable *)skb->dst)->rt_type != RTN_LOCAL)
927 /* Swap the send and the receive. */
928 memset(&rth, 0, sizeof(struct tcphdr));
929 rth.dest = th->source;
930 rth.source = th->dest;
931 rth.doff = sizeof(struct tcphdr) / 4;
935 rth.seq = th->ack_seq;
938 rth.ack_seq = htonl(ntohl(th->seq) + th->syn + th->fin +
939 skb->len - (th->doff << 2));
942 memset(&arg, 0, sizeof arg);
943 arg.iov[0].iov_base = (unsigned char *)&rth;
944 arg.iov[0].iov_len = sizeof rth;
945 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
946 skb->nh.iph->saddr, /*XXX*/
947 sizeof(struct tcphdr), IPPROTO_TCP, 0);
948 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
950 ip_send_reply(tcp_socket->sk, skb, &arg, sizeof rth);
952 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
953 TCP_INC_STATS_BH(TCP_MIB_OUTRSTS);
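/*
 * Illustrative sketch, not part of the original file: per RFC 793, an
 * RST replying to a segment that carried an ACK echoes that ACK value
 * as its sequence number; an RST replying to a segment without ACK uses
 * sequence zero and acknowledges everything the offending segment
 * consumed (its data length plus the SYN and FIN flags), which is what
 * the rth.seq / rth.ack_seq assignments above implement.
 */
#if 0
static void example_rst_numbers(int had_ack, u32 seg_seq, u32 seg_ack,
				u32 seg_len, int syn, int fin,
				u32 *rst_seq, u32 *rst_ack, int *rst_has_ack)
{
	if (had_ack) {
		*rst_seq = seg_ack;	/* SEQ = SEG.ACK, no ACK bit set */
		*rst_has_ack = 0;
	} else {
		*rst_seq = 0;
		*rst_ack = seg_seq + seg_len + syn + fin;
		*rst_has_ack = 1;	/* SEQ = 0, ACK covers what was consumed */
	}
}
#endif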
956 /* The code below, which sends ACKs in SYN-RECV and TIME-WAIT states
957 outside of socket context, is ugly, certainly. What can I do?
960 static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
963 struct tcphdr *th = skb->h.th;
968 struct ip_reply_arg arg;
970 memset(&rep.th, 0, sizeof(struct tcphdr));
971 memset(&arg, 0, sizeof arg);
973 arg.iov[0].iov_base = (unsigned char *)&rep;
974 arg.iov[0].iov_len = sizeof(rep.th);
976 rep.tsopt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) |
977 (TCPOPT_TIMESTAMP << 8) |
979 rep.tsopt[1] = htonl(tcp_time_stamp);
980 rep.tsopt[2] = htonl(ts);
981 arg.iov[0].iov_len = sizeof(rep);
984 /* Swap the send and the receive. */
985 rep.th.dest = th->source;
986 rep.th.source = th->dest;
987 rep.th.doff = arg.iov[0].iov_len / 4;
988 rep.th.seq = htonl(seq);
989 rep.th.ack_seq = htonl(ack);
991 rep.th.window = htons(win);
993 arg.csum = csum_tcpudp_nofold(skb->nh.iph->daddr,
994 skb->nh.iph->saddr, /*XXX*/
995 arg.iov[0].iov_len, IPPROTO_TCP, 0);
996 arg.csumoffset = offsetof(struct tcphdr, check) / 2;
998 ip_send_reply(tcp_socket->sk, skb, &arg, arg.iov[0].iov_len);
1000 TCP_INC_STATS_BH(TCP_MIB_OUTSEGS);
1003 static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
1005 struct tcp_tw_bucket *tw = (struct tcp_tw_bucket *)sk;
1007 tcp_v4_send_ack(skb, tw->tw_snd_nxt, tw->tw_rcv_nxt,
1008 tw->tw_rcv_wnd >> tw->tw_rcv_wscale, tw->tw_ts_recent);
1013 static void tcp_v4_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req)
1015 tcp_v4_send_ack(skb, tcp_rsk(req)->snt_isn + 1, tcp_rsk(req)->rcv_isn + 1, req->rcv_wnd,
1019 static struct dst_entry* tcp_v4_route_req(struct sock *sk,
1020 struct request_sock *req)
1023 const struct inet_request_sock *ireq = inet_rsk(req);
1024 struct ip_options *opt = inet_rsk(req)->opt;
1025 struct flowi fl = { .oif = sk->sk_bound_dev_if,
1027 { .daddr = ((opt && opt->srr) ?
1030 .saddr = ireq->loc_addr,
1031 .tos = RT_CONN_FLAGS(sk) } },
1032 .proto = IPPROTO_TCP,
1034 { .sport = inet_sk(sk)->sport,
1035 .dport = ireq->rmt_port } } };
1037 if (ip_route_output_flow(&rt, &fl, sk, 0)) {
1038 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1041 if (opt && opt->is_strictroute && rt->rt_dst != rt->rt_gateway) {
1043 IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES);
1050 * Send a SYN-ACK after having received a SYN.
1051 * This still operates on a request_sock only, not on a big
1054 static int tcp_v4_send_synack(struct sock *sk, struct request_sock *req,
1055 struct dst_entry *dst)
1057 const struct inet_request_sock *ireq = inet_rsk(req);
1059 struct sk_buff * skb;
1061 /* First, grab a route. */
1062 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1065 skb = tcp_make_synack(sk, dst, req);
1068 struct tcphdr *th = skb->h.th;
1070 th->check = tcp_v4_check(th, skb->len,
1073 csum_partial((char *)th, skb->len,
1076 err = ip_build_and_send_pkt(skb, sk, ireq->loc_addr,
1079 if (err == NET_XMIT_CN)
1089 * IPv4 request_sock destructor.
1091 static void tcp_v4_reqsk_destructor(struct request_sock *req)
1093 if (inet_rsk(req)->opt)
1094 kfree(inet_rsk(req)->opt);
1097 static inline void syn_flood_warning(struct sk_buff *skb)
1099 static unsigned long warntime;
1101 if (time_after(jiffies, (warntime + HZ * 60))) {
1104 "possible SYN flooding on port %d. Sending cookies.\n",
1105 ntohs(skb->h.th->dest));
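/*
 * Illustrative sketch, not part of the original file: the warning above
 * is rate limited to once a minute by remembering the jiffies value of
 * the last report and comparing with time_after(), which is safe across
 * jiffies wrap-around.
 */
#if 0
static int example_ratelimited(unsigned long *last, unsigned long min_interval)
{
	if (time_after(jiffies, *last + min_interval)) {
		*last = jiffies;
		return 1;		/* ok to print this time */
	}
	return 0;			/* suppressed */
}
#endif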
1110 * Save and compile IPv4 options into the request_sock if needed.
1112 static inline struct ip_options *tcp_v4_save_options(struct sock *sk,
1113 struct sk_buff *skb)
1115 struct ip_options *opt = &(IPCB(skb)->opt);
1116 struct ip_options *dopt = NULL;
1118 if (opt && opt->optlen) {
1119 int opt_size = optlength(opt);
1120 dopt = kmalloc(opt_size, GFP_ATOMIC);
1122 if (ip_options_echo(dopt, skb)) {
1131 struct request_sock_ops tcp_request_sock_ops = {
1133 .obj_size = sizeof(struct tcp_request_sock),
1134 .rtx_syn_ack = tcp_v4_send_synack,
1135 .send_ack = tcp_v4_reqsk_send_ack,
1136 .destructor = tcp_v4_reqsk_destructor,
1137 .send_reset = tcp_v4_send_reset,
1140 int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
1142 struct inet_request_sock *ireq;
1143 struct tcp_options_received tmp_opt;
1144 struct request_sock *req;
1145 __u32 saddr = skb->nh.iph->saddr;
1146 __u32 daddr = skb->nh.iph->daddr;
1147 __u32 isn = TCP_SKB_CB(skb)->when;
1148 struct dst_entry *dst = NULL;
1149 #ifdef CONFIG_SYN_COOKIES
1150 int want_cookie = 0;
1152 #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */
1155 /* Never answer SYNs sent to broadcast or multicast */
1156 if (((struct rtable *)skb->dst)->rt_flags &
1157 (RTCF_BROADCAST | RTCF_MULTICAST))
1160 /* TW buckets are converted to open requests without
1161 * limitation; they conserve resources and the peer is
1162 * evidently a real one.
1164 if (tcp_synq_is_full(sk) && !isn) {
1165 #ifdef CONFIG_SYN_COOKIES
1166 if (sysctl_tcp_syncookies) {
1173 /* Accept backlog is full. If we have already queued enough
1174 * warm entries in the syn queue, drop the request. It is better than
1175 * clogging the syn queue with openreqs with exponentially increasing
1178 if (sk_acceptq_is_full(sk) && tcp_synq_young(sk) > 1)
1181 req = reqsk_alloc(&tcp_request_sock_ops);
1185 tcp_clear_options(&tmp_opt);
1186 tmp_opt.mss_clamp = 536;
1187 tmp_opt.user_mss = tcp_sk(sk)->rx_opt.user_mss;
1189 tcp_parse_options(skb, &tmp_opt, 0);
1192 tcp_clear_options(&tmp_opt);
1193 tmp_opt.saw_tstamp = 0;
1196 if (tmp_opt.saw_tstamp && !tmp_opt.rcv_tsval) {
1197 /* Some OSes (unknown ones, but I see them on web servers, which
1198 * contain information interesting only for Windows
1199 * users) do not send their stamp in the SYN. It is an easy case.
1200 * We simply do not advertise TS support.
1202 tmp_opt.saw_tstamp = 0;
1203 tmp_opt.tstamp_ok = 0;
1205 tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
1207 tcp_openreq_init(req, &tmp_opt, skb);
1209 ireq = inet_rsk(req);
1210 ireq->loc_addr = daddr;
1211 ireq->rmt_addr = saddr;
1212 ireq->opt = tcp_v4_save_options(sk, skb);
1214 TCP_ECN_create_request(req, skb->h.th);
1217 #ifdef CONFIG_SYN_COOKIES
1218 syn_flood_warning(skb);
1220 isn = cookie_v4_init_sequence(sk, skb, &req->mss);
1222 struct inet_peer *peer = NULL;
1224 /* VJ's idea. We save the last timestamp seen
1225 * from the destination in the peer table when entering
1226 * TIME-WAIT state, and check against it before
1227 * accepting a new connection request.
1229 * If "isn" is not zero, this request hit an alive
1230 * timewait bucket, so all the necessary checks
1231 * are made in the function processing the timewait state.
1233 if (tmp_opt.saw_tstamp &&
1234 sysctl_tcp_tw_recycle &&
1235 (dst = tcp_v4_route_req(sk, req)) != NULL &&
1236 (peer = rt_get_peer((struct rtable *)dst)) != NULL &&
1237 peer->v4daddr == saddr) {
1238 if (xtime.tv_sec < peer->tcp_ts_stamp + TCP_PAWS_MSL &&
1239 (s32)(peer->tcp_ts - req->ts_recent) >
1241 NET_INC_STATS_BH(LINUX_MIB_PAWSPASSIVEREJECTED);
1246 /* Kill the following clause, if you dislike this way. */
1247 else if (!sysctl_tcp_syncookies &&
1248 (sysctl_max_syn_backlog - tcp_synq_len(sk) <
1249 (sysctl_max_syn_backlog >> 2)) &&
1250 (!peer || !peer->tcp_ts_stamp) &&
1251 (!dst || !dst_metric(dst, RTAX_RTT))) {
1252 /* Without syncookies, the last quarter of the
1253 * backlog is filled only with destinations
1254 * proven to be alive.
1255 * It means that we continue to communicate
1256 * with destinations already remembered
1257 * at the moment of the synflood.
1259 LIMIT_NETDEBUG(printk(KERN_DEBUG "TCP: drop open "
1260 "request from %u.%u."
1263 ntohs(skb->h.th->source)));
1268 isn = tcp_v4_init_sequence(sk, skb);
1270 tcp_rsk(req)->snt_isn = isn;
1272 if (tcp_v4_send_synack(sk, req, dst))
1278 tcp_v4_synq_add(sk, req);
1285 TCP_INC_STATS_BH(TCP_MIB_ATTEMPTFAILS);
1291 * The three way handshake has completed - we got a valid ACK -
1292 * now create the new socket.
1294 struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
1295 struct request_sock *req,
1296 struct dst_entry *dst)
1298 struct inet_request_sock *ireq;
1299 struct inet_sock *newinet;
1300 struct tcp_sock *newtp;
1303 if (sk_acceptq_is_full(sk))
1306 if (!dst && (dst = tcp_v4_route_req(sk, req)) == NULL)
1309 newsk = tcp_create_openreq_child(sk, req, skb);
1313 sk_setup_caps(newsk, dst);
1315 newtp = tcp_sk(newsk);
1316 newinet = inet_sk(newsk);
1317 ireq = inet_rsk(req);
1318 newinet->daddr = ireq->rmt_addr;
1319 newinet->rcv_saddr = ireq->loc_addr;
1320 newinet->saddr = ireq->loc_addr;
1321 newinet->opt = ireq->opt;
1323 newinet->mc_index = tcp_v4_iif(skb);
1324 newinet->mc_ttl = skb->nh.iph->ttl;
1325 newtp->ext_header_len = 0;
1327 newtp->ext_header_len = newinet->opt->optlen;
1328 newinet->id = newtp->write_seq ^ jiffies;
1330 tcp_sync_mss(newsk, dst_mtu(dst));
1331 newtp->advmss = dst_metric(dst, RTAX_ADVMSS);
1332 tcp_initialize_rcv_mss(newsk);
1334 __inet_hash(&tcp_hashinfo, newsk, 0);
1335 __inet_inherit_port(&tcp_hashinfo, sk, newsk);
1340 NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS);
1342 NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS);
1347 static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
1349 struct tcphdr *th = skb->h.th;
1350 struct iphdr *iph = skb->nh.iph;
1351 struct tcp_sock *tp = tcp_sk(sk);
1353 struct request_sock **prev;
1354 /* Find possible connection requests. */
1355 struct request_sock *req = tcp_v4_search_req(tp, &prev, th->source,
1356 iph->saddr, iph->daddr);
1358 return tcp_check_req(sk, skb, req, prev);
1360 nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr,
1367 if (nsk->sk_state != TCP_TIME_WAIT) {
1371 tcp_tw_put((struct tcp_tw_bucket *)nsk);
1375 #ifdef CONFIG_SYN_COOKIES
1376 if (!th->rst && !th->syn && th->ack)
1377 sk = cookie_v4_check(sk, skb, &(IPCB(skb)->opt));
1382 static int tcp_v4_checksum_init(struct sk_buff *skb)
1384 if (skb->ip_summed == CHECKSUM_HW) {
1385 skb->ip_summed = CHECKSUM_UNNECESSARY;
1386 if (!tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1387 skb->nh.iph->daddr, skb->csum))
1390 LIMIT_NETDEBUG(printk(KERN_DEBUG "hw tcp v4 csum failed\n"));
1391 skb->ip_summed = CHECKSUM_NONE;
1393 if (skb->len <= 76) {
1394 if (tcp_v4_check(skb->h.th, skb->len, skb->nh.iph->saddr,
1396 skb_checksum(skb, 0, skb->len, 0)))
1398 skb->ip_summed = CHECKSUM_UNNECESSARY;
1400 skb->csum = ~tcp_v4_check(skb->h.th, skb->len,
1402 skb->nh.iph->daddr, 0);
1408 /* The socket must have its spinlock held when we get
1411 * We have a potential double-lock case here, so even when
1412 * doing backlog processing we use the BH locking scheme.
1413 * This is because we cannot sleep with the original spinlock
1416 int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
1418 if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
1419 TCP_CHECK_TIMER(sk);
1420 if (tcp_rcv_established(sk, skb, skb->h.th, skb->len))
1422 TCP_CHECK_TIMER(sk);
1426 if (skb->len < (skb->h.th->doff << 2) || tcp_checksum_complete(skb))
1429 if (sk->sk_state == TCP_LISTEN) {
1430 struct sock *nsk = tcp_v4_hnd_req(sk, skb);
1435 if (tcp_child_process(sk, nsk, skb))
1441 TCP_CHECK_TIMER(sk);
1442 if (tcp_rcv_state_process(sk, skb, skb->h.th, skb->len))
1444 TCP_CHECK_TIMER(sk);
1448 tcp_v4_send_reset(skb);
1451 /* Be careful here. If this function gets more complicated and
1452 * gcc suffers from register pressure on the x86, sk (in %ebx)
1453 * might be destroyed here. This current version compiles correctly,
1454 * but you have been warned.
1459 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1467 int tcp_v4_rcv(struct sk_buff *skb)
1473 if (skb->pkt_type != PACKET_HOST)
1476 /* Count it even if it's bad */
1477 TCP_INC_STATS_BH(TCP_MIB_INSEGS);
1479 if (!pskb_may_pull(skb, sizeof(struct tcphdr)))
1484 if (th->doff < sizeof(struct tcphdr) / 4)
1486 if (!pskb_may_pull(skb, th->doff * 4))
1489 /* An explanation is required here, I think.
1490 * Packet length and doff are validated by header prediction,
1491 * provided the case of th->doff==0 is eliminated.
1492 * So, we defer the checks. */
1493 if ((skb->ip_summed != CHECKSUM_UNNECESSARY &&
1494 tcp_v4_checksum_init(skb) < 0))
1498 TCP_SKB_CB(skb)->seq = ntohl(th->seq);
1499 TCP_SKB_CB(skb)->end_seq = (TCP_SKB_CB(skb)->seq + th->syn + th->fin +
1500 skb->len - th->doff * 4);
1501 TCP_SKB_CB(skb)->ack_seq = ntohl(th->ack_seq);
1502 TCP_SKB_CB(skb)->when = 0;
1503 TCP_SKB_CB(skb)->flags = skb->nh.iph->tos;
1504 TCP_SKB_CB(skb)->sacked = 0;
1506 sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source,
1507 skb->nh.iph->daddr, ntohs(th->dest),
1514 if (sk->sk_state == TCP_TIME_WAIT)
1517 if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
1518 goto discard_and_relse;
1520 if (sk_filter(sk, skb, 0))
1521 goto discard_and_relse;
1527 if (!sock_owned_by_user(sk)) {
1528 if (!tcp_prequeue(sk, skb))
1529 ret = tcp_v4_do_rcv(sk, skb);
1531 sk_add_backlog(sk, skb);
1539 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
1542 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1544 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1546 tcp_v4_send_reset(skb);
1550 /* Discard frame. */
1559 if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
1560 tcp_tw_put((struct tcp_tw_bucket *) sk);
1564 if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
1565 TCP_INC_STATS_BH(TCP_MIB_INERRS);
1566 tcp_tw_put((struct tcp_tw_bucket *) sk);
1569 switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
1570 skb, th, skb->len)) {
1572 struct sock *sk2 = inet_lookup_listener(&tcp_hashinfo,
1577 tcp_tw_deschedule((struct tcp_tw_bucket *)sk);
1578 tcp_tw_put((struct tcp_tw_bucket *)sk);
1582 /* Fall through to ACK */
1585 tcp_v4_timewait_ack(sk, skb);
1589 case TCP_TW_SUCCESS:;
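/*
 * Illustrative sketch, not part of the original file: a segment that
 * reaches an established socket in tcp_v4_rcv() above is dispatched one
 * of three ways - processed immediately in softirq context, parked on
 * the prequeue for the user task to process, or appended to the backlog
 * when a user currently holds the socket lock.
 */
#if 0
enum example_rx_path { RX_DIRECT, RX_PREQUEUE, RX_BACKLOG };

static enum example_rx_path example_rx_dispatch(int owned_by_user,
						int prequeue_accepted)
{
	if (owned_by_user)
		return RX_BACKLOG;
	return prequeue_accepted ? RX_PREQUEUE : RX_DIRECT;
}
#endif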
1594 static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr)
1596 struct sockaddr_in *sin = (struct sockaddr_in *) uaddr;
1597 struct inet_sock *inet = inet_sk(sk);
1599 sin->sin_family = AF_INET;
1600 sin->sin_addr.s_addr = inet->daddr;
1601 sin->sin_port = inet->dport;
1604 /* VJ's idea. Save the last timestamp seen from this destination
1605 * and hold it at least for the normal timewait interval, to use for duplicate
1606 * segment detection in subsequent connections before they enter the synchronized
1610 int tcp_v4_remember_stamp(struct sock *sk)
1612 struct inet_sock *inet = inet_sk(sk);
1613 struct tcp_sock *tp = tcp_sk(sk);
1614 struct rtable *rt = (struct rtable *)__sk_dst_get(sk);
1615 struct inet_peer *peer = NULL;
1618 if (!rt || rt->rt_dst != inet->daddr) {
1619 peer = inet_getpeer(inet->daddr, 1);
1623 rt_bind_peer(rt, 1);
1628 if ((s32)(peer->tcp_ts - tp->rx_opt.ts_recent) <= 0 ||
1629 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1630 peer->tcp_ts_stamp <= tp->rx_opt.ts_recent_stamp)) {
1631 peer->tcp_ts_stamp = tp->rx_opt.ts_recent_stamp;
1632 peer->tcp_ts = tp->rx_opt.ts_recent;
1642 int tcp_v4_tw_remember_stamp(struct tcp_tw_bucket *tw)
1644 struct inet_peer *peer = NULL;
1646 peer = inet_getpeer(tw->tw_daddr, 1);
1649 if ((s32)(peer->tcp_ts - tw->tw_ts_recent) <= 0 ||
1650 (peer->tcp_ts_stamp + TCP_PAWS_MSL < xtime.tv_sec &&
1651 peer->tcp_ts_stamp <= tw->tw_ts_recent_stamp)) {
1652 peer->tcp_ts_stamp = tw->tw_ts_recent_stamp;
1653 peer->tcp_ts = tw->tw_ts_recent;
1662 struct tcp_func ipv4_specific = {
1663 .queue_xmit = ip_queue_xmit,
1664 .send_check = tcp_v4_send_check,
1665 .rebuild_header = inet_sk_rebuild_header,
1666 .conn_request = tcp_v4_conn_request,
1667 .syn_recv_sock = tcp_v4_syn_recv_sock,
1668 .remember_stamp = tcp_v4_remember_stamp,
1669 .net_header_len = sizeof(struct iphdr),
1670 .setsockopt = ip_setsockopt,
1671 .getsockopt = ip_getsockopt,
1672 .addr2sockaddr = v4_addr2sockaddr,
1673 .sockaddr_len = sizeof(struct sockaddr_in),
1676 /* NOTE: A lot of things are set to zero explicitly by the call to
1677 * sk_alloc(), so they need not be done here.
1679 static int tcp_v4_init_sock(struct sock *sk)
1681 struct tcp_sock *tp = tcp_sk(sk);
1683 skb_queue_head_init(&tp->out_of_order_queue);
1684 tcp_init_xmit_timers(sk);
1685 tcp_prequeue_init(tp);
1687 tp->rto = TCP_TIMEOUT_INIT;
1688 tp->mdev = TCP_TIMEOUT_INIT;
1690 /* So many TCP implementations out there (incorrectly) count the
1691 * initial SYN frame in their delayed-ACK and congestion control
1692 * algorithms that we must have the following bandaid to talk
1693 * efficiently to them. -DaveM
1697 /* See draft-stevens-tcpca-spec-01 for discussion of the
1698 * initialization of these values.
1700 tp->snd_ssthresh = 0x7fffffff; /* Infinity */
1701 tp->snd_cwnd_clamp = ~0;
1702 tp->mss_cache = 536;
1704 tp->reordering = sysctl_tcp_reordering;
1705 tp->ca_ops = &tcp_init_congestion_ops;
1707 sk->sk_state = TCP_CLOSE;
1709 sk->sk_write_space = sk_stream_write_space;
1710 sock_set_flag(sk, SOCK_USE_WRITE_QUEUE);
1712 tp->af_specific = &ipv4_specific;
1714 sk->sk_sndbuf = sysctl_tcp_wmem[1];
1715 sk->sk_rcvbuf = sysctl_tcp_rmem[1];
1717 atomic_inc(&tcp_sockets_allocated);
1722 int tcp_v4_destroy_sock(struct sock *sk)
1724 struct tcp_sock *tp = tcp_sk(sk);
1726 tcp_clear_xmit_timers(sk);
1728 tcp_cleanup_congestion_control(tp);
1730 /* Clean up the write buffer. */
1731 sk_stream_writequeue_purge(sk);
1733 /* Cleans up our, hopefully empty, out_of_order_queue. */
1734 __skb_queue_purge(&tp->out_of_order_queue);
1737 /* Clean the prequeue; it really must be empty */
1737 __skb_queue_purge(&tp->ucopy.prequeue);
1739 /* Clean up a referenced TCP bind bucket. */
1740 if (inet_sk(sk)->bind_hash)
1741 inet_put_port(&tcp_hashinfo, sk);
1744 * If sendmsg cached page exists, toss it.
1746 if (sk->sk_sndmsg_page) {
1747 __free_page(sk->sk_sndmsg_page);
1748 sk->sk_sndmsg_page = NULL;
1751 atomic_dec(&tcp_sockets_allocated);
1756 EXPORT_SYMBOL(tcp_v4_destroy_sock);
1758 #ifdef CONFIG_PROC_FS
1759 /* Proc filesystem TCP sock list dumping. */
1761 static inline struct tcp_tw_bucket *tw_head(struct hlist_head *head)
1763 return hlist_empty(head) ? NULL :
1764 list_entry(head->first, struct tcp_tw_bucket, tw_node);
1767 static inline struct tcp_tw_bucket *tw_next(struct tcp_tw_bucket *tw)
1769 return tw->tw_node.next ?
1770 hlist_entry(tw->tw_node.next, typeof(*tw), tw_node) : NULL;
1773 static void *listening_get_next(struct seq_file *seq, void *cur)
1775 struct tcp_sock *tp;
1776 struct hlist_node *node;
1777 struct sock *sk = cur;
1778 struct tcp_iter_state* st = seq->private;
1782 sk = sk_head(&tcp_hashinfo.listening_hash[0]);
1788 if (st->state == TCP_SEQ_STATE_OPENREQ) {
1789 struct request_sock *req = cur;
1791 tp = tcp_sk(st->syn_wait_sk);
1795 if (req->rsk_ops->family == st->family) {
1801 if (++st->sbucket >= TCP_SYNQ_HSIZE)
1804 req = tp->accept_queue.listen_opt->syn_table[st->sbucket];
1806 sk = sk_next(st->syn_wait_sk);
1807 st->state = TCP_SEQ_STATE_LISTENING;
1808 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1811 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1812 if (reqsk_queue_len(&tp->accept_queue))
1814 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1818 sk_for_each_from(sk, node) {
1819 if (sk->sk_family == st->family) {
1824 read_lock_bh(&tp->accept_queue.syn_wait_lock);
1825 if (reqsk_queue_len(&tp->accept_queue)) {
1827 st->uid = sock_i_uid(sk);
1828 st->syn_wait_sk = sk;
1829 st->state = TCP_SEQ_STATE_OPENREQ;
1833 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
1835 if (++st->bucket < INET_LHTABLE_SIZE) {
1836 sk = sk_head(&tcp_hashinfo.listening_hash[st->bucket]);
1844 static void *listening_get_idx(struct seq_file *seq, loff_t *pos)
1846 void *rc = listening_get_next(seq, NULL);
1848 while (rc && *pos) {
1849 rc = listening_get_next(seq, rc);
1855 static void *established_get_first(struct seq_file *seq)
1857 struct tcp_iter_state* st = seq->private;
1860 for (st->bucket = 0; st->bucket < tcp_hashinfo.ehash_size; ++st->bucket) {
1862 struct hlist_node *node;
1863 struct tcp_tw_bucket *tw;
1865 /* We can reschedule _before_ having picked the target: */
1866 cond_resched_softirq();
1868 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1869 sk_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) {
1870 if (sk->sk_family != st->family) {
1876 st->state = TCP_SEQ_STATE_TIME_WAIT;
1877 tw_for_each(tw, node,
1878 &tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain) {
1879 if (tw->tw_family != st->family) {
1885 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1886 st->state = TCP_SEQ_STATE_ESTABLISHED;
1892 static void *established_get_next(struct seq_file *seq, void *cur)
1894 struct sock *sk = cur;
1895 struct tcp_tw_bucket *tw;
1896 struct hlist_node *node;
1897 struct tcp_iter_state* st = seq->private;
1901 if (st->state == TCP_SEQ_STATE_TIME_WAIT) {
1905 while (tw && tw->tw_family != st->family) {
1912 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
1913 st->state = TCP_SEQ_STATE_ESTABLISHED;
1915 /* We can reschedule between buckets: */
1916 cond_resched_softirq();
1918 if (++st->bucket < tcp_hashinfo.ehash_size) {
1919 read_lock(&tcp_hashinfo.ehash[st->bucket].lock);
1920 sk = sk_head(&tcp_hashinfo.ehash[st->bucket].chain);
1928 sk_for_each_from(sk, node) {
1929 if (sk->sk_family == st->family)
1933 st->state = TCP_SEQ_STATE_TIME_WAIT;
1934 tw = tw_head(&tcp_hashinfo.ehash[st->bucket + tcp_hashinfo.ehash_size].chain);
1942 static void *established_get_idx(struct seq_file *seq, loff_t pos)
1944 void *rc = established_get_first(seq);
1947 rc = established_get_next(seq, rc);
1953 static void *tcp_get_idx(struct seq_file *seq, loff_t pos)
1956 struct tcp_iter_state* st = seq->private;
1958 inet_listen_lock(&tcp_hashinfo);
1959 st->state = TCP_SEQ_STATE_LISTENING;
1960 rc = listening_get_idx(seq, &pos);
1963 inet_listen_unlock(&tcp_hashinfo);
1965 st->state = TCP_SEQ_STATE_ESTABLISHED;
1966 rc = established_get_idx(seq, pos);
1972 static void *tcp_seq_start(struct seq_file *seq, loff_t *pos)
1974 struct tcp_iter_state* st = seq->private;
1975 st->state = TCP_SEQ_STATE_LISTENING;
1977 return *pos ? tcp_get_idx(seq, *pos - 1) : SEQ_START_TOKEN;
1980 static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
1983 struct tcp_iter_state* st;
1985 if (v == SEQ_START_TOKEN) {
1986 rc = tcp_get_idx(seq, 0);
1991 switch (st->state) {
1992 case TCP_SEQ_STATE_OPENREQ:
1993 case TCP_SEQ_STATE_LISTENING:
1994 rc = listening_get_next(seq, v);
1996 inet_listen_unlock(&tcp_hashinfo);
1998 st->state = TCP_SEQ_STATE_ESTABLISHED;
1999 rc = established_get_first(seq);
2002 case TCP_SEQ_STATE_ESTABLISHED:
2003 case TCP_SEQ_STATE_TIME_WAIT:
2004 rc = established_get_next(seq, v);
2012 static void tcp_seq_stop(struct seq_file *seq, void *v)
2014 struct tcp_iter_state* st = seq->private;
2016 switch (st->state) {
2017 case TCP_SEQ_STATE_OPENREQ:
2019 struct tcp_sock *tp = tcp_sk(st->syn_wait_sk);
2020 read_unlock_bh(&tp->accept_queue.syn_wait_lock);
2022 case TCP_SEQ_STATE_LISTENING:
2023 if (v != SEQ_START_TOKEN)
2024 inet_listen_unlock(&tcp_hashinfo);
2026 case TCP_SEQ_STATE_TIME_WAIT:
2027 case TCP_SEQ_STATE_ESTABLISHED:
2029 read_unlock(&tcp_hashinfo.ehash[st->bucket].lock);
2035 static int tcp_seq_open(struct inode *inode, struct file *file)
2037 struct tcp_seq_afinfo *afinfo = PDE(inode)->data;
2038 struct seq_file *seq;
2039 struct tcp_iter_state *s;
2042 if (unlikely(afinfo == NULL))
2045 s = kmalloc(sizeof(*s), GFP_KERNEL);
2048 memset(s, 0, sizeof(*s));
2049 s->family = afinfo->family;
2050 s->seq_ops.start = tcp_seq_start;
2051 s->seq_ops.next = tcp_seq_next;
2052 s->seq_ops.show = afinfo->seq_show;
2053 s->seq_ops.stop = tcp_seq_stop;
2055 rc = seq_open(file, &s->seq_ops);
2058 seq = file->private_data;
2067 int tcp_proc_register(struct tcp_seq_afinfo *afinfo)
2070 struct proc_dir_entry *p;
2074 afinfo->seq_fops->owner = afinfo->owner;
2075 afinfo->seq_fops->open = tcp_seq_open;
2076 afinfo->seq_fops->read = seq_read;
2077 afinfo->seq_fops->llseek = seq_lseek;
2078 afinfo->seq_fops->release = seq_release_private;
2080 p = proc_net_fops_create(afinfo->name, S_IRUGO, afinfo->seq_fops);
2088 void tcp_proc_unregister(struct tcp_seq_afinfo *afinfo)
2092 proc_net_remove(afinfo->name);
2093 memset(afinfo->seq_fops, 0, sizeof(*afinfo->seq_fops));
2096 static void get_openreq4(struct sock *sk, struct request_sock *req,
2097 char *tmpbuf, int i, int uid)
2099 const struct inet_request_sock *ireq = inet_rsk(req);
2100 int ttd = req->expires - jiffies;
2102 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2103 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %u %d %p",
2106 ntohs(inet_sk(sk)->sport),
2108 ntohs(ireq->rmt_port),
2110 0, 0, /* could print option size, but that is af dependent. */
2111 1, /* timers active (only the expire timer) */
2112 jiffies_to_clock_t(ttd),
2115 0, /* non standard timer */
2116 0, /* open_requests have no inode */
2117 atomic_read(&sk->sk_refcnt),
2121 static void get_tcp4_sock(struct sock *sp, char *tmpbuf, int i)
2124 unsigned long timer_expires;
2125 struct tcp_sock *tp = tcp_sk(sp);
2126 struct inet_sock *inet = inet_sk(sp);
2127 unsigned int dest = inet->daddr;
2128 unsigned int src = inet->rcv_saddr;
2129 __u16 destp = ntohs(inet->dport);
2130 __u16 srcp = ntohs(inet->sport);
2132 if (tp->pending == TCP_TIME_RETRANS) {
2134 timer_expires = tp->timeout;
2135 } else if (tp->pending == TCP_TIME_PROBE0) {
2137 timer_expires = tp->timeout;
2138 } else if (timer_pending(&sp->sk_timer)) {
2140 timer_expires = sp->sk_timer.expires;
2143 timer_expires = jiffies;
2146 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
2147 "%08X %5d %8d %lu %d %p %u %u %u %u %d",
2148 i, src, srcp, dest, destp, sp->sk_state,
2149 tp->write_seq - tp->snd_una, tp->rcv_nxt - tp->copied_seq,
2151 jiffies_to_clock_t(timer_expires - jiffies),
2156 atomic_read(&sp->sk_refcnt), sp,
2157 tp->rto, tp->ack.ato, (tp->ack.quick << 1) | tp->ack.pingpong,
2159 tp->snd_ssthresh >= 0xFFFF ? -1 : tp->snd_ssthresh);
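/*
 * Illustrative sketch, not part of the original file: each line emitted
 * above into /proc/net/tcp begins with "sl: local_address:port
 * rem_address:port st ...", with addresses, ports and state printed in
 * hexadecimal.  A userspace reader could recover the leading fields
 * with a (hypothetical) parser like this one.
 */
#if 0
#include <stdio.h>

static int example_parse_proc_tcp_line(const char *line)
{
	unsigned int sl, local_ip, local_port, rem_ip, rem_port, state;

	if (sscanf(line, "%u: %x:%x %x:%x %x",
		   &sl, &local_ip, &local_port, &rem_ip, &rem_port,
		   &state) != 6)
		return -1;
	printf("slot %u state %#x local port %u\n", sl, state, local_port);
	return 0;
}
#endif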
2162 static void get_timewait4_sock(struct tcp_tw_bucket *tw, char *tmpbuf, int i)
2164 unsigned int dest, src;
2166 int ttd = tw->tw_ttd - jiffies;
2171 dest = tw->tw_daddr;
2172 src = tw->tw_rcv_saddr;
2173 destp = ntohs(tw->tw_dport);
2174 srcp = ntohs(tw->tw_sport);
2176 sprintf(tmpbuf, "%4d: %08X:%04X %08X:%04X"
2177 " %02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p",
2178 i, src, srcp, dest, destp, tw->tw_substate, 0, 0,
2179 3, jiffies_to_clock_t(ttd), 0, 0, 0, 0,
2180 atomic_read(&tw->tw_refcnt), tw);
2185 static int tcp4_seq_show(struct seq_file *seq, void *v)
2187 struct tcp_iter_state* st;
2188 char tmpbuf[TMPSZ + 1];
2190 if (v == SEQ_START_TOKEN) {
2191 seq_printf(seq, "%-*s\n", TMPSZ - 1,
2192 " sl local_address rem_address st tx_queue "
2193 "rx_queue tr tm->when retrnsmt uid timeout "
2199 switch (st->state) {
2200 case TCP_SEQ_STATE_LISTENING:
2201 case TCP_SEQ_STATE_ESTABLISHED:
2202 get_tcp4_sock(v, tmpbuf, st->num);
2204 case TCP_SEQ_STATE_OPENREQ:
2205 get_openreq4(st->syn_wait_sk, v, tmpbuf, st->num, st->uid);
2207 case TCP_SEQ_STATE_TIME_WAIT:
2208 get_timewait4_sock(v, tmpbuf, st->num);
2211 seq_printf(seq, "%-*s\n", TMPSZ - 1, tmpbuf);
2216 static struct file_operations tcp4_seq_fops;
2217 static struct tcp_seq_afinfo tcp4_seq_afinfo = {
2218 .owner = THIS_MODULE,
2221 .seq_show = tcp4_seq_show,
2222 .seq_fops = &tcp4_seq_fops,
2225 int __init tcp4_proc_init(void)
2227 return tcp_proc_register(&tcp4_seq_afinfo);
2230 void tcp4_proc_exit(void)
2232 tcp_proc_unregister(&tcp4_seq_afinfo);
2234 #endif /* CONFIG_PROC_FS */
2236 struct proto tcp_prot = {
2238 .owner = THIS_MODULE,
2240 .connect = tcp_v4_connect,
2241 .disconnect = tcp_disconnect,
2242 .accept = tcp_accept,
2244 .init = tcp_v4_init_sock,
2245 .destroy = tcp_v4_destroy_sock,
2246 .shutdown = tcp_shutdown,
2247 .setsockopt = tcp_setsockopt,
2248 .getsockopt = tcp_getsockopt,
2249 .sendmsg = tcp_sendmsg,
2250 .recvmsg = tcp_recvmsg,
2251 .backlog_rcv = tcp_v4_do_rcv,
2252 .hash = tcp_v4_hash,
2253 .unhash = tcp_unhash,
2254 .get_port = tcp_v4_get_port,
2255 .enter_memory_pressure = tcp_enter_memory_pressure,
2256 .sockets_allocated = &tcp_sockets_allocated,
2257 .memory_allocated = &tcp_memory_allocated,
2258 .memory_pressure = &tcp_memory_pressure,
2259 .sysctl_mem = sysctl_tcp_mem,
2260 .sysctl_wmem = sysctl_tcp_wmem,
2261 .sysctl_rmem = sysctl_tcp_rmem,
2262 .max_header = MAX_TCP_HEADER,
2263 .obj_size = sizeof(struct tcp_sock),
2264 .rsk_prot = &tcp_request_sock_ops,
2269 void __init tcp_v4_init(struct net_proto_family *ops)
2271 int err = sock_create_kern(PF_INET, SOCK_RAW, IPPROTO_TCP, &tcp_socket);
2273 panic("Failed to create the TCP control socket.\n");
2274 tcp_socket->sk->sk_allocation = GFP_ATOMIC;
2275 inet_sk(tcp_socket->sk)->uc_ttl = -1;
2277 /* Unhash it so that IP input processing does not even
2278 * see it, we do not wish this socket to see incoming
2281 tcp_socket->sk->sk_prot->unhash(tcp_socket->sk);
2284 EXPORT_SYMBOL(ipv4_specific);
2285 EXPORT_SYMBOL(inet_bind_bucket_create);
2286 EXPORT_SYMBOL(tcp_hashinfo);
2287 EXPORT_SYMBOL(tcp_prot);
2288 EXPORT_SYMBOL(tcp_unhash);
2289 EXPORT_SYMBOL(tcp_v4_conn_request);
2290 EXPORT_SYMBOL(tcp_v4_connect);
2291 EXPORT_SYMBOL(tcp_v4_do_rcv);
2292 EXPORT_SYMBOL(tcp_v4_remember_stamp);
2293 EXPORT_SYMBOL(tcp_v4_send_check);
2294 EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
2296 #ifdef CONFIG_PROC_FS
2297 EXPORT_SYMBOL(tcp_proc_register);
2298 EXPORT_SYMBOL(tcp_proc_unregister);
2300 EXPORT_SYMBOL(sysctl_local_port_range);
2301 EXPORT_SYMBOL(sysctl_tcp_low_latency);
2302 EXPORT_SYMBOL(sysctl_tcp_tw_reuse);