X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=net%2Fsunrpc%2Fxprtsock.c;h=ec3462f141b489e4e2701d8925c39a5653bb60ea;hb=2818bf81a8c91fb29634df68bdc3cc5e003201d0;hp=70a772d7a7966d9045bc535597f1011cca1d7243;hpb=c7b2cae8a634015b72941ba2fc6c4bc9b8d3a129;p=safe%2Fjmp%2Flinux-2.6 diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 70a772d..ec3462f 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -36,9 +37,63 @@ #include /* - * Maximum port number to use when requesting a reserved port. + * xprtsock tunables */ -#define XS_MAX_RESVPORT (800U) +unsigned int xprt_udp_slot_table_entries = RPC_DEF_SLOT_TABLE; +unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE; + +unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT; +unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT; + +/* + * How many times to try sending a request on a socket before waiting + * for the socket buffer to clear. + */ +#define XS_SENDMSG_RETRY (10U) + +/* + * Time out for an RPC UDP socket connect. UDP socket connects are + * synchronous, but we set a timeout anyway in case of resource + * exhaustion on the local host. + */ +#define XS_UDP_CONN_TO (5U * HZ) + +/* + * Wait duration for an RPC TCP connection to be established. Solaris + * NFS over TCP uses 60 seconds, for example, which is in line with how + * long a server takes to reboot. + */ +#define XS_TCP_CONN_TO (60U * HZ) + +/* + * Wait duration for a reply from the RPC portmapper. + */ +#define XS_BIND_TO (60U * HZ) + +/* + * Delay if a UDP socket connect error occurs. This is most likely some + * kind of resource problem on the local host. + */ +#define XS_UDP_REEST_TO (2U * HZ) + +/* + * The reestablish timeout allows clients to delay for a bit before attempting + * to reconnect to a server that just dropped our connection. + * + * We implement an exponential backoff when trying to reestablish a TCP + * transport connection with the server. Some servers like to drop a TCP + * connection when they are overworked, so we start with a short timeout and + * increase over time if the server is down or not responding. + */ +#define XS_TCP_INIT_REEST_TO (3U * HZ) +#define XS_TCP_MAX_REEST_TO (5U * 60 * HZ) + +/* + * TCP idle timeout; client drops the transport socket if it is idle + * for this long. Note that we also timeout UDP sockets to prevent + * holding port numbers when there is no RPC traffic. + */ +#define XS_IDLE_DISC_TO (5U * 60 * HZ) #ifdef RPC_DEBUG # undef RPC_DEBUG_DATA @@ -70,39 +125,95 @@ static inline void xs_pktdump(char *msg, u32 *packet, unsigned int count) } #endif +static void xs_format_peer_addresses(struct rpc_xprt *xprt) +{ + struct sockaddr_in *addr = (struct sockaddr_in *) &xprt->addr; + char *buf; + + buf = kzalloc(20, GFP_KERNEL); + if (buf) { + snprintf(buf, 20, "%u.%u.%u.%u", + NIPQUAD(addr->sin_addr.s_addr)); + } + xprt->address_strings[RPC_DISPLAY_ADDR] = buf; + + buf = kzalloc(8, GFP_KERNEL); + if (buf) { + snprintf(buf, 8, "%u", + ntohs(addr->sin_port)); + } + xprt->address_strings[RPC_DISPLAY_PORT] = buf; + + if (xprt->prot == IPPROTO_UDP) + xprt->address_strings[RPC_DISPLAY_PROTO] = "udp"; + else + xprt->address_strings[RPC_DISPLAY_PROTO] = "tcp"; + + buf = kzalloc(48, GFP_KERNEL); + if (buf) { + snprintf(buf, 48, "addr=%u.%u.%u.%u port=%u proto=%s", + NIPQUAD(addr->sin_addr.s_addr), + ntohs(addr->sin_port), + xprt->prot == IPPROTO_UDP ? "udp" : "tcp"); + } + xprt->address_strings[RPC_DISPLAY_ALL] = buf; +} + +static void xs_free_peer_addresses(struct rpc_xprt *xprt) +{ + kfree(xprt->address_strings[RPC_DISPLAY_ADDR]); + kfree(xprt->address_strings[RPC_DISPLAY_PORT]); + kfree(xprt->address_strings[RPC_DISPLAY_ALL]); +} + #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) -static inline int xs_send_head(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base, unsigned int len) +static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more) { - struct kvec iov = { - .iov_base = xdr->head[0].iov_base + base, - .iov_len = len - base, - }; struct msghdr msg = { .msg_name = addr, .msg_namelen = addrlen, - .msg_flags = XS_SENDMSG_FLAGS, + .msg_flags = XS_SENDMSG_FLAGS | (more ? MSG_MORE : 0), + }; + struct kvec iov = { + .iov_base = vec->iov_base + base, + .iov_len = vec->iov_len - base, }; - if (xdr->len > len) - msg.msg_flags |= MSG_MORE; - - if (likely(iov.iov_len)) + if (iov.iov_len != 0) return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); return kernel_sendmsg(sock, &msg, NULL, 0, 0); } -static int xs_send_tail(struct socket *sock, struct xdr_buf *xdr, unsigned int base, unsigned int len) +static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned int base, int more) { - struct kvec iov = { - .iov_base = xdr->tail[0].iov_base + base, - .iov_len = len - base, - }; - struct msghdr msg = { - .msg_flags = XS_SENDMSG_FLAGS, - }; + struct page **ppage; + unsigned int remainder; + int err, sent = 0; + + remainder = xdr->page_len - base; + base += xdr->page_base; + ppage = xdr->pages + (base >> PAGE_SHIFT); + base &= ~PAGE_MASK; + for(;;) { + unsigned int len = min_t(unsigned int, PAGE_SIZE - base, remainder); + int flags = XS_SENDMSG_FLAGS; - return kernel_sendmsg(sock, &msg, &iov, 1, iov.iov_len); + remainder -= len; + if (remainder != 0 || more) + flags |= MSG_MORE; + err = sock->ops->sendpage(sock, *ppage, base, len, flags); + if (remainder == 0 || err != len) + break; + sent += err; + ppage++; + base = 0; + } + if (sent == 0) + return err; + if (err > 0) + sent += err; + return sent; } /** @@ -116,218 +227,251 @@ static int xs_send_tail(struct socket *sock, struct xdr_buf *xdr, unsigned int b */ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen, struct xdr_buf *xdr, unsigned int base) { - struct page **ppage = xdr->pages; - unsigned int len, pglen = xdr->page_len; - int err, ret = 0; - ssize_t (*sendpage)(struct socket *, struct page *, int, size_t, int); - - len = xdr->head[0].iov_len; - if (base < len || (addr != NULL && base == 0)) { - err = xs_send_head(sock, addr, addrlen, xdr, base, len); - if (ret == 0) - ret = err; - else if (err > 0) - ret += err; - if (err != (len - base)) + unsigned int remainder = xdr->len - base; + int err, sent = 0; + + if (unlikely(!sock)) + return -ENOTCONN; + + clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); + if (base != 0) { + addr = NULL; + addrlen = 0; + } + + if (base < xdr->head[0].iov_len || addr != NULL) { + unsigned int len = xdr->head[0].iov_len - base; + remainder -= len; + err = xs_send_kvec(sock, addr, addrlen, &xdr->head[0], base, remainder != 0); + if (remainder == 0 || err != len) goto out; + sent += err; base = 0; } else - base -= len; + base -= xdr->head[0].iov_len; - if (unlikely(pglen == 0)) - goto copy_tail; - if (unlikely(base >= pglen)) { - base -= pglen; - goto copy_tail; - } - if (base || xdr->page_base) { - pglen -= base; - base += xdr->page_base; - ppage += base >> PAGE_CACHE_SHIFT; - base &= ~PAGE_CACHE_MASK; - } + if (base < xdr->page_len) { + unsigned int len = xdr->page_len - base; + remainder -= len; + err = xs_send_pagedata(sock, xdr, base, remainder != 0); + if (remainder == 0 || err != len) + goto out; + sent += err; + base = 0; + } else + base -= xdr->page_len; - sendpage = sock->ops->sendpage ? : sock_no_sendpage; - do { - int flags = XS_SENDMSG_FLAGS; + if (base >= xdr->tail[0].iov_len) + return sent; + err = xs_send_kvec(sock, NULL, 0, &xdr->tail[0], base, 0); +out: + if (sent == 0) + return err; + if (err > 0) + sent += err; + return sent; +} - len = PAGE_CACHE_SIZE; - if (base) - len -= base; - if (pglen < len) - len = pglen; +/** + * xs_nospace - place task on wait queue if transmit was incomplete + * @task: task to put to sleep + * + */ +static void xs_nospace(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; - if (pglen != len || xdr->tail[0].iov_len != 0) - flags |= MSG_MORE; + dprintk("RPC: %4d xmit incomplete (%u left of %u)\n", + task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_slen); - /* Hmm... We might be dealing with highmem pages */ - if (PageHighMem(*ppage)) - sendpage = sock_no_sendpage; - err = sendpage(sock, *ppage, base, len, flags); - if (ret == 0) - ret = err; - else if (err > 0) - ret += err; - if (err != len) - goto out; - base = 0; - ppage++; - } while ((pglen -= len) != 0); -copy_tail: - len = xdr->tail[0].iov_len; - if (base < len) { - err = xs_send_tail(sock, xdr, base, len); - if (ret == 0) - ret = err; - else if (err > 0) - ret += err; - } -out: - return ret; + if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { + /* Protect against races with write_space */ + spin_lock_bh(&xprt->transport_lock); + + /* Don't race with disconnect */ + if (!xprt_connected(xprt)) + task->tk_status = -ENOTCONN; + else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) + xprt_wait_for_buffer_space(task); + + spin_unlock_bh(&xprt->transport_lock); + } else + /* Keep holding the socket if it is blocked */ + rpc_delay(task, HZ>>4); } /** - * xs_sendmsg - write an RPC request to a socket - * @xprt: generic transport - * @req: the RPC request to write + * xs_udp_send_request - write an RPC request to a UDP socket + * @task: address of RPC task that manages the state of an RPC request * + * Return values: + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * ENOTCONN: Caller needs to invoke connect logic then call again + * other: Some other error occured, the request was not sent */ -static int xs_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req) +static int xs_udp_send_request(struct rpc_task *task) { - struct socket *sock = xprt->sock; + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; struct xdr_buf *xdr = &req->rq_snd_buf; - struct sockaddr *addr = NULL; - int addrlen = 0; - unsigned int skip; - int result; - - if (!sock) - return -ENOTCONN; + int status; xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); - /* For UDP, we need to provide an address */ - if (!xprt->stream) { - addr = (struct sockaddr *) &xprt->addr; - addrlen = sizeof(xprt->addr); - } - /* Don't repeat bytes */ - skip = req->rq_bytes_sent; + req->rq_xtime = jiffies; + status = xs_sendpages(xprt->sock, (struct sockaddr *) &xprt->addr, + xprt->addrlen, xdr, req->rq_bytes_sent); - clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags); - result = xs_sendpages(sock, addr, addrlen, xdr, skip); + dprintk("RPC: xs_udp_send_request(%u) = %d\n", + xdr->len - req->rq_bytes_sent, status); - dprintk("RPC: xs_sendmsg(%d) = %d\n", xdr->len - skip, result); + if (likely(status >= (int) req->rq_slen)) + return 0; - if (result >= 0) - return result; + /* Still some bytes left; set up for a retry later. */ + if (status > 0) + status = -EAGAIN; - switch (result) { + switch (status) { + case -ENETUNREACH: + case -EPIPE: case -ECONNREFUSED: /* When the server has died, an ICMP port unreachable message * prompts ECONNREFUSED. */ - case -EAGAIN: break; - case -ECONNRESET: - case -ENOTCONN: - case -EPIPE: - /* connection broken */ - if (xprt->stream) - result = -ENOTCONN; + case -EAGAIN: + xs_nospace(task); break; default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); break; } - return result; + + return status; +} + +static inline void xs_encode_tcp_record_marker(struct xdr_buf *buf) +{ + u32 reclen = buf->len - sizeof(rpc_fraghdr); + rpc_fraghdr *base = buf->head[0].iov_base; + *base = htonl(RPC_LAST_STREAM_FRAGMENT | reclen); } /** - * xs_send_request - write an RPC request to a socket + * xs_tcp_send_request - write an RPC request to a TCP socket * @task: address of RPC task that manages the state of an RPC request * * Return values: - * 0: The request has been sent - * EAGAIN: The socket was blocked, please call again later to - * complete the request - * other: Some other error occured, the request was not sent + * 0: The request has been sent + * EAGAIN: The socket was blocked, please call again later to + * complete the request + * ENOTCONN: Caller needs to invoke connect logic then call again + * other: Some other error occured, the request was not sent * * XXX: In the case of soft timeouts, should we eventually give up - * if the socket is not able to make progress? + * if sendmsg is not able to make progress? */ -static int xs_send_request(struct rpc_task *task) +static int xs_tcp_send_request(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; + struct xdr_buf *xdr = &req->rq_snd_buf; int status, retry = 0; - /* set up everything as needed. */ - /* Write the record marker */ - if (xprt->stream) { - u32 *marker = req->rq_svec[0].iov_base; + xs_encode_tcp_record_marker(&req->rq_snd_buf); - *marker = htonl(0x80000000|(req->rq_slen-sizeof(*marker))); - } + xs_pktdump("packet data:", + req->rq_svec->iov_base, + req->rq_svec->iov_len); /* Continue transmitting the packet/record. We must be careful * to cope with writespace callbacks arriving _after_ we have - * called sendmsg(). - */ + * called sendmsg(). */ while (1) { req->rq_xtime = jiffies; - status = xs_sendmsg(xprt, req); + status = xs_sendpages(xprt->sock, NULL, 0, xdr, + req->rq_bytes_sent); - if (status < 0) - break; + dprintk("RPC: xs_tcp_send_request(%u) = %d\n", + xdr->len - req->rq_bytes_sent, status); - if (xprt->stream) { - req->rq_bytes_sent += status; - - /* If we've sent the entire packet, immediately - * reset the count of bytes sent. */ - if (req->rq_bytes_sent >= req->rq_slen) { - req->rq_bytes_sent = 0; - return 0; - } - } else { - if (status >= req->rq_slen) - return 0; - status = -EAGAIN; + if (unlikely(status < 0)) break; - } - dprintk("RPC: %4d xmit incomplete (%d left of %d)\n", - task->tk_pid, req->rq_slen - req->rq_bytes_sent, - req->rq_slen); + /* If we've sent the entire packet, immediately + * reset the count of bytes sent. */ + req->rq_bytes_sent += status; + task->tk_bytes_sent += status; + if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_bytes_sent = 0; + return 0; + } status = -EAGAIN; - if (retry++ > 50) + if (retry++ > XS_SENDMSG_RETRY) break; } - if (status == -EAGAIN) { - if (test_bit(SOCK_ASYNC_NOSPACE, &xprt->sock->flags)) { - /* Protect against races with write_space */ - spin_lock_bh(&xprt->transport_lock); - /* Don't race with disconnect */ - if (!xprt_connected(xprt)) - task->tk_status = -ENOTCONN; - else if (test_bit(SOCK_NOSPACE, &xprt->sock->flags)) - xprt_wait_for_buffer_space(task); - spin_unlock_bh(&xprt->transport_lock); - return status; - } - /* Keep holding the socket if it is blocked */ - rpc_delay(task, HZ>>4); + switch (status) { + case -EAGAIN: + xs_nospace(task); + break; + case -ECONNREFUSED: + case -ECONNRESET: + case -ENOTCONN: + case -EPIPE: + status = -ENOTCONN; + break; + default: + dprintk("RPC: sendmsg returned unrecognized error %d\n", + -status); + xprt_disconnect(xprt); + break; } + return status; } /** + * xs_tcp_release_xprt - clean up after a tcp transmission + * @xprt: transport + * @task: rpc task + * + * This cleans up if an error causes us to abort the transmission of a request. + * In this case, the socket may need to be reset in order to avoid confusing + * the server. + */ +static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) +{ + struct rpc_rqst *req; + + if (task != xprt->snd_task) + return; + if (task == NULL) + goto out_release; + req = task->tk_rqstp; + if (req->rq_bytes_sent == 0) + goto out_release; + if (req->rq_bytes_sent == req->rq_snd_buf.len) + goto out_release; + set_bit(XPRT_CLOSE_WAIT, &task->tk_xprt->state); +out_release: + xprt_release_xprt(xprt, task); +} + +/** * xs_close - close a socket * @xprt: transport * + * This is used when all requests are complete; ie, no DRC state remains + * on the server we want to save. */ static void xs_close(struct rpc_xprt *xprt) { @@ -335,7 +479,7 @@ static void xs_close(struct rpc_xprt *xprt) struct sock *sk = xprt->inet; if (!sk) - return; + goto clear_close_wait; dprintk("RPC: xs_close xprt %p\n", xprt); @@ -352,6 +496,10 @@ static void xs_close(struct rpc_xprt *xprt) sk->sk_no_check = 0; sock_release(sock); +clear_close_wait: + smp_mb__before_clear_bit(); + clear_bit(XPRT_CLOSE_WAIT, &xprt->state); + smp_mb__after_clear_bit(); } /** @@ -368,7 +516,9 @@ static void xs_destroy(struct rpc_xprt *xprt) xprt_disconnect(xprt); xs_close(xprt); + xs_free_peer_addresses(xprt); kfree(xprt->slot); + kfree(xprt); } static inline struct rpc_xprt *xprt_from_sock(struct sock *sk) @@ -389,7 +539,8 @@ static void xs_udp_data_ready(struct sock *sk, int len) struct rpc_rqst *rovr; struct sk_buff *skb; int err, repsize, copied; - u32 _xid, *xp; + u32 _xid; + __be32 *xp; read_lock(&sk->sk_callback_lock); dprintk("RPC: xs_udp_data_ready...\n"); @@ -421,8 +572,6 @@ static void xs_udp_data_ready(struct sock *sk, int len) goto out_unlock; task = rovr->rq_task; - dprintk("RPC: %4d received reply\n", task->tk_pid); - if ((copied = rovr->rq_private_buf.buflen) > repsize) copied = repsize; @@ -433,7 +582,9 @@ static void xs_udp_data_ready(struct sock *sk, int len) /* Something worked... */ dst_confirm(skb->dst); - xprt_complete_rqst(xprt, rovr, copied); + xprt_adjust_cwnd(task, copied); + xprt_update_rtt(task); + xprt_complete_rqst(task, copied); out_unlock: spin_unlock(&xprt->transport_lock); @@ -470,16 +621,19 @@ static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, skb_reader_t *desc xprt->tcp_offset += used; if (used != len) return; + xprt->tcp_reclen = ntohl(xprt->tcp_recm); - if (xprt->tcp_reclen & 0x80000000) + if (xprt->tcp_reclen & RPC_LAST_STREAM_FRAGMENT) xprt->tcp_flags |= XPRT_LAST_FRAG; else xprt->tcp_flags &= ~XPRT_LAST_FRAG; - xprt->tcp_reclen &= 0x7fffffff; + xprt->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK; + xprt->tcp_flags &= ~XPRT_COPY_RECM; xprt->tcp_offset = 0; + /* Sanity check of the record length */ - if (xprt->tcp_reclen < 4) { + if (unlikely(xprt->tcp_reclen < 4)) { dprintk("RPC: invalid TCP record fragment length\n"); xprt_disconnect(xprt); return; @@ -592,11 +746,8 @@ static inline void xs_tcp_read_request(struct rpc_xprt *xprt, skb_reader_t *desc } out: - if (!(xprt->tcp_flags & XPRT_COPY_DATA)) { - dprintk("RPC: %4d received reply complete\n", - req->rq_task->tk_pid); - xprt_complete_rqst(xprt, req, xprt->tcp_copied); - } + if (!(xprt->tcp_flags & XPRT_COPY_DATA)) + xprt_complete_rqst(req->rq_task, xprt->tcp_copied); spin_unlock(&xprt->transport_lock); xs_tcp_check_recm(xprt); } @@ -703,6 +854,7 @@ static void xs_tcp_state_change(struct sock *sk) xprt->tcp_reclen = 0; xprt->tcp_copied = 0; xprt->tcp_flags = XPRT_COPY_RECM | XPRT_COPY_XID; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; xprt_wake_pending_tasks(xprt, 0); } spin_unlock_bh(&xprt->transport_lock); @@ -710,9 +862,13 @@ static void xs_tcp_state_change(struct sock *sk) case TCP_SYN_SENT: case TCP_SYN_RECV: break; + case TCP_CLOSE_WAIT: + /* Try to schedule an autoclose RPC calls */ + set_bit(XPRT_CLOSE_WAIT, &xprt->state); + if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0) + schedule_work(&xprt->task_cleanup); default: xprt_disconnect(xprt); - break; } out: read_unlock(&sk->sk_callback_lock); @@ -784,20 +940,10 @@ static void xs_tcp_write_space(struct sock *sk) read_unlock(&sk->sk_callback_lock); } -/** - * xs_set_buffer_size - set send and receive limits - * @xprt: generic transport - * - * Set socket send and receive limits based on the - * sndsize and rcvsize fields in the generic transport - * structure. This applies only to UDP sockets. - */ -static void xs_set_buffer_size(struct rpc_xprt *xprt) +static void xs_udp_do_set_buffer_size(struct rpc_xprt *xprt) { struct sock *sk = xprt->inet; - if (xprt->stream) - return; if (xprt->rcvsize) { sk->sk_userlocks |= SOCK_RCVBUF_LOCK; sk->sk_rcvbuf = xprt->rcvsize * xprt->max_reqs * 2; @@ -809,18 +955,83 @@ static void xs_set_buffer_size(struct rpc_xprt *xprt) } } +/** + * xs_udp_set_buffer_size - set send and receive limits + * @xprt: generic transport + * @sndsize: requested size of send buffer, in bytes + * @rcvsize: requested size of receive buffer, in bytes + * + * Set socket send and receive buffer size limits. + */ +static void xs_udp_set_buffer_size(struct rpc_xprt *xprt, size_t sndsize, size_t rcvsize) +{ + xprt->sndsize = 0; + if (sndsize) + xprt->sndsize = sndsize + 1024; + xprt->rcvsize = 0; + if (rcvsize) + xprt->rcvsize = rcvsize + 1024; + + xs_udp_do_set_buffer_size(xprt); +} + +/** + * xs_udp_timer - called when a retransmit timeout occurs on a UDP transport + * @task: task that timed out + * + * Adjust the congestion window after a retransmit timeout has occurred. + */ +static void xs_udp_timer(struct rpc_task *task) +{ + xprt_adjust_cwnd(task, -ETIMEDOUT); +} + +static unsigned short xs_get_random_port(void) +{ + unsigned short range = xprt_max_resvport - xprt_min_resvport; + unsigned short rand = (unsigned short) net_random() % range; + return rand + xprt_min_resvport; +} + +/** + * xs_print_peer_address - format an IPv4 address for printing + * @xprt: generic transport + * @format: flags field indicating which parts of the address to render + */ +static char *xs_print_peer_address(struct rpc_xprt *xprt, enum rpc_display_format_t format) +{ + if (xprt->address_strings[format] != NULL) + return xprt->address_strings[format]; + else + return "unprintable"; +} + +/** + * xs_set_port - reset the port number in the remote endpoint address + * @xprt: generic transport + * @port: new port number + * + */ +static void xs_set_port(struct rpc_xprt *xprt, unsigned short port) +{ + struct sockaddr_in *sap = (struct sockaddr_in *) &xprt->addr; + + dprintk("RPC: setting port for xprt %p to %u\n", xprt, port); + + sap->sin_port = htons(port); +} + static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) { struct sockaddr_in myaddr = { .sin_family = AF_INET, }; - int err, port; + int err; + unsigned short port = xprt->port; - /* Were we already bound to a given port? Try to reuse it */ - port = xprt->port; do { myaddr.sin_port = htons(port); - err = sock->ops->bind(sock, (struct sockaddr *) &myaddr, + err = kernel_bind(sock, (struct sockaddr *) &myaddr, sizeof(myaddr)); if (err == 0) { xprt->port = port; @@ -828,112 +1039,167 @@ static int xs_bindresvport(struct rpc_xprt *xprt, struct socket *sock) port); return 0; } - if (--port == 0) - port = XS_MAX_RESVPORT; + if (port <= xprt_min_resvport) + port = xprt_max_resvport; + else + port--; } while (err == -EADDRINUSE && port != xprt->port); dprintk("RPC: can't bind to reserved port (%d).\n", -err); return err; } -static struct socket *xs_create(struct rpc_xprt *xprt, int proto, int resvport) +/** + * xs_udp_connect_worker - set up a UDP socket + * @args: RPC transport to connect + * + * Invoked by a work queue tasklet. + */ +static void xs_udp_connect_worker(void *args) { - struct socket *sock; - int type, err; + struct rpc_xprt *xprt = (struct rpc_xprt *) args; + struct socket *sock = xprt->sock; + int err, status = -EIO; - dprintk("RPC: xs_create(%s %d)\n", - (proto == IPPROTO_UDP)? "udp" : "tcp", proto); + if (xprt->shutdown || !xprt_bound(xprt)) + goto out; - type = (proto == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM; + /* Start by resetting any existing state */ + xs_close(xprt); - if ((err = sock_create_kern(PF_INET, type, proto, &sock)) < 0) { - dprintk("RPC: can't create socket (%d).\n", -err); - return NULL; + if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) { + dprintk("RPC: can't create UDP transport socket (%d).\n", -err); + goto out; } - /* If the caller has the capability, bind to a reserved port */ - if (resvport && xs_bindresvport(xprt, sock) < 0) - goto failed; - - return sock; + if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { + sock_release(sock); + goto out; + } -failed: - sock_release(sock); - return NULL; -} + dprintk("RPC: worker connecting xprt %p to address: %s\n", + xprt, xs_print_peer_address(xprt, RPC_DISPLAY_ALL)); -static void xs_bind(struct rpc_xprt *xprt, struct socket *sock) -{ - struct sock *sk = sock->sk; + if (!xprt->inet) { + struct sock *sk = sock->sk; - if (xprt->inet) - return; + write_lock_bh(&sk->sk_callback_lock); - write_lock_bh(&sk->sk_callback_lock); - sk->sk_user_data = xprt; - xprt->old_data_ready = sk->sk_data_ready; - xprt->old_state_change = sk->sk_state_change; - xprt->old_write_space = sk->sk_write_space; - if (xprt->prot == IPPROTO_UDP) { + sk->sk_user_data = xprt; + xprt->old_data_ready = sk->sk_data_ready; + xprt->old_state_change = sk->sk_state_change; + xprt->old_write_space = sk->sk_write_space; sk->sk_data_ready = xs_udp_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_no_check = UDP_CSUM_NORCV; + sk->sk_allocation = GFP_ATOMIC; + xprt_set_connected(xprt); - } else { - tcp_sk(sk)->nonagle = 1; /* disable Nagle's algorithm */ - sk->sk_data_ready = xs_tcp_data_ready; - sk->sk_state_change = xs_tcp_state_change; - sk->sk_write_space = xs_tcp_write_space; - xprt_clear_connected(xprt); + + /* Reset to new socket */ + xprt->sock = sock; + xprt->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); } + xs_udp_do_set_buffer_size(xprt); + status = 0; +out: + xprt_wake_pending_tasks(xprt, status); + xprt_clear_connecting(xprt); +} - /* Reset to new socket */ - xprt->sock = sock; - xprt->inet = sk; - write_unlock_bh(&sk->sk_callback_lock); +/* + * We need to preserve the port number so the reply cache on the server can + * find our cached RPC replies when we get around to reconnecting. + */ +static void xs_tcp_reuse_connection(struct rpc_xprt *xprt) +{ + int result; + struct socket *sock = xprt->sock; + struct sockaddr any; + + dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt); - return; + /* + * Disconnect the transport socket by doing a connect operation + * with AF_UNSPEC. This should return immediately... + */ + memset(&any, 0, sizeof(any)); + any.sa_family = AF_UNSPEC; + result = kernel_connect(sock, &any, sizeof(any), 0); + if (result) + dprintk("RPC: AF_UNSPEC connect return code %d\n", + result); } /** - * xs_connect_worker - try to connect a socket to a remote endpoint + * xs_tcp_connect_worker - connect a TCP socket to a remote endpoint * @args: RPC transport to connect * * Invoked by a work queue tasklet. */ -static void xs_connect_worker(void *args) +static void xs_tcp_connect_worker(void *args) { struct rpc_xprt *xprt = (struct rpc_xprt *)args; struct socket *sock = xprt->sock; - int status = -EIO; + int err, status = -EIO; - if (xprt->shutdown || xprt->addr.sin_port == 0) + if (xprt->shutdown || !xprt_bound(xprt)) goto out; - dprintk("RPC: xs_connect_worker xprt %p\n", xprt); + if (!xprt->sock) { + /* start from scratch */ + if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) { + dprintk("RPC: can't create TCP transport socket (%d).\n", -err); + goto out; + } - /* - * Start by resetting any existing state - */ - xs_close(xprt); - sock = xs_create(xprt, xprt->prot, xprt->resvport); - if (sock == NULL) { - /* couldn't create socket or bind to reserved port; - * this is likely a permanent error, so cause an abort */ - goto out; - } - xs_bind(xprt, sock); - xs_set_buffer_size(xprt); + if (xprt->resvport && xs_bindresvport(xprt, sock) < 0) { + sock_release(sock); + goto out; + } + } else + /* "close" the socket, preserving the local port */ + xs_tcp_reuse_connection(xprt); - status = 0; - if (!xprt->stream) - goto out; + dprintk("RPC: worker connecting xprt %p to address: %s\n", + xprt, xs_print_peer_address(xprt, RPC_DISPLAY_ALL)); - /* - * Tell the socket layer to start connecting... - */ - status = sock->ops->connect(sock, (struct sockaddr *) &xprt->addr, - sizeof(xprt->addr), O_NONBLOCK); + if (!xprt->inet) { + struct sock *sk = sock->sk; + + write_lock_bh(&sk->sk_callback_lock); + + sk->sk_user_data = xprt; + xprt->old_data_ready = sk->sk_data_ready; + xprt->old_state_change = sk->sk_state_change; + xprt->old_write_space = sk->sk_write_space; + sk->sk_data_ready = xs_tcp_data_ready; + sk->sk_state_change = xs_tcp_state_change; + sk->sk_write_space = xs_tcp_write_space; + sk->sk_allocation = GFP_ATOMIC; + + /* socket options */ + sk->sk_userlocks |= SOCK_BINDPORT_LOCK; + sock_reset_flag(sk, SOCK_LINGER); + tcp_sk(sk)->linger2 = 0; + tcp_sk(sk)->nonagle |= TCP_NAGLE_OFF; + + xprt_clear_connected(xprt); + + /* Reset to new socket */ + xprt->sock = sock; + xprt->inet = sk; + + write_unlock_bh(&sk->sk_callback_lock); + } + + /* Tell the socket layer to start connecting... */ + xprt->stat.connect_count++; + xprt->stat.connect_start = jiffies; + status = kernel_connect(sock, (struct sockaddr *) &xprt->addr, + xprt->addrlen, O_NONBLOCK); dprintk("RPC: %p connect status %d connected %d sock state %d\n", xprt, -status, xprt_connected(xprt), sock->sk->sk_state); if (status < 0) { @@ -941,6 +1207,14 @@ static void xs_connect_worker(void *args) case -EINPROGRESS: case -EALREADY: goto out_clear; + case -ECONNREFUSED: + case -ECONNRESET: + /* retry with existing socket, after a delay */ + break; + default: + /* get rid of existing socket, and retry */ + xs_close(xprt); + break; } } out: @@ -954,112 +1228,229 @@ out_clear: * @task: address of RPC task that manages state of connect request * * TCP: If the remote end dropped the connection, delay reconnecting. + * + * UDP socket connects are synchronous, but we use a work queue anyway + * to guarantee that even unprivileged user processes can set up a + * socket on a privileged port. + * + * If a UDP socket connect fails, the delay behavior here prevents + * retry floods (hard mounts). */ static void xs_connect(struct rpc_task *task) { struct rpc_xprt *xprt = task->tk_xprt; - if (!xprt_test_and_set_connecting(xprt)) { - if (xprt->sock != NULL) { - dprintk("RPC: xs_connect delayed xprt %p\n", xprt); - schedule_delayed_work(&xprt->connect_worker, - RPC_REESTABLISH_TIMEOUT); - } else { - dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); - schedule_work(&xprt->connect_worker); - /* flush_scheduled_work can sleep... */ - if (!RPC_IS_ASYNC(task)) - flush_scheduled_work(); - } + if (xprt_test_and_set_connecting(xprt)) + return; + + if (xprt->sock != NULL) { + dprintk("RPC: xs_connect delayed xprt %p for %lu seconds\n", + xprt, xprt->reestablish_timeout / HZ); + schedule_delayed_work(&xprt->connect_worker, + xprt->reestablish_timeout); + xprt->reestablish_timeout <<= 1; + if (xprt->reestablish_timeout > XS_TCP_MAX_REEST_TO) + xprt->reestablish_timeout = XS_TCP_MAX_REEST_TO; + } else { + dprintk("RPC: xs_connect scheduled xprt %p\n", xprt); + schedule_work(&xprt->connect_worker); + + /* flush_scheduled_work can sleep... */ + if (!RPC_IS_ASYNC(task)) + flush_scheduled_work(); } } -static struct rpc_xprt_ops xs_ops = { - .set_buffer_size = xs_set_buffer_size, +/** + * xs_udp_print_stats - display UDP socket-specifc stats + * @xprt: rpc_xprt struct containing statistics + * @seq: output file + * + */ +static void xs_udp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + seq_printf(seq, "\txprt:\tudp %u %lu %lu %lu %lu %Lu %Lu\n", + xprt->port, + xprt->stat.bind_count, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); +} + +/** + * xs_tcp_print_stats - display TCP socket-specifc stats + * @xprt: rpc_xprt struct containing statistics + * @seq: output file + * + */ +static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +{ + long idle_time = 0; + + if (xprt_connected(xprt)) + idle_time = (long)(jiffies - xprt->last_used) / HZ; + + seq_printf(seq, "\txprt:\ttcp %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu\n", + xprt->port, + xprt->stat.bind_count, + xprt->stat.connect_count, + xprt->stat.connect_time, + idle_time, + xprt->stat.sends, + xprt->stat.recvs, + xprt->stat.bad_xids, + xprt->stat.req_u, + xprt->stat.bklog_u); +} + +static struct rpc_xprt_ops xs_udp_ops = { + .set_buffer_size = xs_udp_set_buffer_size, + .print_addr = xs_print_peer_address, + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, + .rpcbind = rpc_getport, + .set_port = xs_set_port, + .connect = xs_connect, + .buf_alloc = rpc_malloc, + .buf_free = rpc_free, + .send_request = xs_udp_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_rtt, + .timer = xs_udp_timer, + .release_request = xprt_release_rqst_cong, + .close = xs_close, + .destroy = xs_destroy, + .print_stats = xs_udp_print_stats, +}; + +static struct rpc_xprt_ops xs_tcp_ops = { + .print_addr = xs_print_peer_address, + .reserve_xprt = xprt_reserve_xprt, + .release_xprt = xs_tcp_release_xprt, + .rpcbind = rpc_getport, + .set_port = xs_set_port, .connect = xs_connect, - .send_request = xs_send_request, + .buf_alloc = rpc_malloc, + .buf_free = rpc_free, + .send_request = xs_tcp_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_close, .destroy = xs_destroy, + .print_stats = xs_tcp_print_stats, }; -extern unsigned int xprt_udp_slot_table_entries; -extern unsigned int xprt_tcp_slot_table_entries; +static struct rpc_xprt *xs_setup_xprt(struct sockaddr *addr, size_t addrlen, unsigned int slot_table_size) +{ + struct rpc_xprt *xprt; + + if (addrlen > sizeof(xprt->addr)) { + dprintk("RPC: xs_setup_xprt: address too large\n"); + return ERR_PTR(-EBADF); + } + + xprt = kzalloc(sizeof(struct rpc_xprt), GFP_KERNEL); + if (xprt == NULL) { + dprintk("RPC: xs_setup_xprt: couldn't allocate rpc_xprt\n"); + return ERR_PTR(-ENOMEM); + } + + xprt->max_reqs = slot_table_size; + xprt->slot = kcalloc(xprt->max_reqs, sizeof(struct rpc_rqst), GFP_KERNEL); + if (xprt->slot == NULL) { + kfree(xprt); + dprintk("RPC: xs_setup_xprt: couldn't allocate slot table\n"); + return ERR_PTR(-ENOMEM); + } + + memcpy(&xprt->addr, addr, addrlen); + xprt->addrlen = addrlen; + xprt->port = xs_get_random_port(); + + return xprt; +} /** * xs_setup_udp - Set up transport to use a UDP socket - * @xprt: transport to set up + * @addr: address of remote server + * @addrlen: length of address in bytes * @to: timeout parameters * */ -int xs_setup_udp(struct rpc_xprt *xprt, struct rpc_timeout *to) +struct rpc_xprt *xs_setup_udp(struct sockaddr *addr, size_t addrlen, struct rpc_timeout *to) { - size_t slot_table_size; + struct rpc_xprt *xprt; - dprintk("RPC: setting up udp-ipv4 transport...\n"); + xprt = xs_setup_xprt(addr, addrlen, xprt_udp_slot_table_entries); + if (IS_ERR(xprt)) + return xprt; - xprt->max_reqs = xprt_udp_slot_table_entries; - slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]); - xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); - if (xprt->slot == NULL) - return -ENOMEM; - memset(xprt->slot, 0, slot_table_size); + if (ntohs(((struct sockaddr_in *)addr)->sin_port) != 0) + xprt_set_bound(xprt); xprt->prot = IPPROTO_UDP; - xprt->port = XS_MAX_RESVPORT; - xprt->stream = 0; - xprt->nocong = 0; - xprt->cwnd = RPC_INITCWND; - xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; + xprt->tsh_size = 0; /* XXX: header size can vary due to auth type, IPv6, etc. */ xprt->max_payload = (1U << 16) - (MAX_HEADER << 3); - INIT_WORK(&xprt->connect_worker, xs_connect_worker, xprt); + INIT_WORK(&xprt->connect_worker, xs_udp_connect_worker, xprt); + xprt->bind_timeout = XS_BIND_TO; + xprt->connect_timeout = XS_UDP_CONN_TO; + xprt->reestablish_timeout = XS_UDP_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; - xprt->ops = &xs_ops; + xprt->ops = &xs_udp_ops; if (to) xprt->timeout = *to; else xprt_set_timeout(&xprt->timeout, 5, 5 * HZ); - return 0; + xs_format_peer_addresses(xprt); + dprintk("RPC: set up transport to address %s\n", + xs_print_peer_address(xprt, RPC_DISPLAY_ALL)); + + return xprt; } /** * xs_setup_tcp - Set up transport to use a TCP socket - * @xprt: transport to set up + * @addr: address of remote server + * @addrlen: length of address in bytes * @to: timeout parameters * */ -int xs_setup_tcp(struct rpc_xprt *xprt, struct rpc_timeout *to) +struct rpc_xprt *xs_setup_tcp(struct sockaddr *addr, size_t addrlen, struct rpc_timeout *to) { - size_t slot_table_size; + struct rpc_xprt *xprt; - dprintk("RPC: setting up tcp-ipv4 transport...\n"); + xprt = xs_setup_xprt(addr, addrlen, xprt_tcp_slot_table_entries); + if (IS_ERR(xprt)) + return xprt; - xprt->max_reqs = xprt_tcp_slot_table_entries; - slot_table_size = xprt->max_reqs * sizeof(xprt->slot[0]); - xprt->slot = kmalloc(slot_table_size, GFP_KERNEL); - if (xprt->slot == NULL) - return -ENOMEM; - memset(xprt->slot, 0, slot_table_size); + if (ntohs(((struct sockaddr_in *)addr)->sin_port) != 0) + xprt_set_bound(xprt); xprt->prot = IPPROTO_TCP; - xprt->port = XS_MAX_RESVPORT; - xprt->stream = 1; - xprt->nocong = 1; - xprt->cwnd = RPC_MAXCWND(xprt); - xprt->resvport = capable(CAP_NET_BIND_SERVICE) ? 1 : 0; - xprt->max_payload = (1U << 31) - 1; + xprt->tsh_size = sizeof(rpc_fraghdr) / sizeof(u32); + xprt->max_payload = RPC_MAX_FRAGMENT_SIZE; - INIT_WORK(&xprt->connect_worker, xs_connect_worker, xprt); + INIT_WORK(&xprt->connect_worker, xs_tcp_connect_worker, xprt); + xprt->bind_timeout = XS_BIND_TO; + xprt->connect_timeout = XS_TCP_CONN_TO; + xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO; + xprt->idle_timeout = XS_IDLE_DISC_TO; - xprt->ops = &xs_ops; + xprt->ops = &xs_tcp_ops; if (to) xprt->timeout = *to; else xprt_set_timeout(&xprt->timeout, 2, 60 * HZ); - return 0; + xs_format_peer_addresses(xprt); + dprintk("RPC: set up transport to address %s\n", + xs_print_peer_address(xprt, RPC_DISPLAY_ALL)); + + return xprt; }