dccp: Feature negotiation for minimum-checksum-coverage
[safe/jmp/linux-2.6] / net / dccp / proto.c
index ed0bf58..47b137a 100644 (file)
@@ -9,7 +9,6 @@
  *     published by the Free Software Foundation.
  */
 
-#include <linux/config.h>
 #include <linux/dccp.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/random.h>
 #include <net/checksum.h>
 
-#include <net/inet_common.h>
-#include <net/ip.h>
-#include <net/protocol.h>
+#include <net/inet_sock.h>
 #include <net/sock.h>
 #include <net/xfrm.h>
 
-#include <asm/semaphore.h>
+#include <asm/ioctls.h>
 #include <linux/spinlock.h>
 #include <linux/timer.h>
 #include <linux/delay.h>
 #include <linux/poll.h>
-#include <linux/dccp.h>
 
 #include "ccid.h"
 #include "dccp.h"
+#include "feat.h"
+
+DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly;
 
-DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics);
+EXPORT_SYMBOL_GPL(dccp_statistics);
 
 atomic_t dccp_orphan_count = ATOMIC_INIT(0);
 
-static struct net_protocol dccp_protocol = {
-       .handler        = dccp_v4_rcv,
-       .err_handler    = dccp_v4_err,
+EXPORT_SYMBOL_GPL(dccp_orphan_count);
+
+/*
+ * The inet_hashinfo instance exported for DCCP. Only the listening-hash
+ * members are initialised explicitly: the lock starts unlocked, the user
+ * count starts at zero and the wait queue head starts empty.
+ */
+struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
+       .lhash_lock     = RW_LOCK_UNLOCKED,
+       .lhash_users    = ATOMIC_INIT(0),
+       .lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
 };
 
+EXPORT_SYMBOL_GPL(dccp_hashinfo);
+
+/* the maximum queue length for tx in packets. 0 is no limit */
+int sysctl_dccp_tx_qlen __read_mostly = 5;
+
+/*
+ * dccp_set_state  -  Move @sk into @state and keep the MIB counters in step
+ * @sk:    socket whose state changes
+ * @state: new DCCP_xxx state
+ *
+ * Warns if @state equals the current state. Entering DCCP_OPEN increments
+ * CURRESTAB; leaving DCCP_OPEN for any other state decrements it. Entering
+ * DCCP_CLOSED additionally counts an ESTABRESETS event when coming from
+ * OPEN, ACTIVE_CLOSEREQ or CLOSING, unhashes the socket and releases the
+ * bound port unless the user locked it (SOCK_BINDPORT_LOCK).
+ */
+void dccp_set_state(struct sock *sk, const int state)
+{
+       const int oldstate = sk->sk_state;
+
+       dccp_pr_debug("%s(%p)  %s  -->  %s\n", dccp_role(sk), sk,
+                     dccp_state_name(oldstate), dccp_state_name(state));
+       WARN_ON(state == oldstate);
+
+       switch (state) {
+       case DCCP_OPEN:
+               if (oldstate != DCCP_OPEN)
+                       DCCP_INC_STATS(DCCP_MIB_CURRESTAB);
+               break;
+
+       case DCCP_CLOSED:
+               if (oldstate == DCCP_OPEN || oldstate == DCCP_ACTIVE_CLOSEREQ ||
+                   oldstate == DCCP_CLOSING)
+                       DCCP_INC_STATS(DCCP_MIB_ESTABRESETS);
+
+               sk->sk_prot->unhash(sk);
+               if (inet_csk(sk)->icsk_bind_hash != NULL &&
+                   !(sk->sk_userlocks & SOCK_BINDPORT_LOCK))
+                       inet_put_port(sk);
+               /* fall through */
+       default:
+               /* Any transition away from OPEN ends an established connection */
+               if (oldstate == DCCP_OPEN)
+                       DCCP_DEC_STATS(DCCP_MIB_CURRESTAB);
+       }
+
+       /* Change state AFTER socket is unhashed to avoid closed
+        * socket sitting in hash tables.
+        */
+       sk->sk_state = state;
+}
+
+EXPORT_SYMBOL_GPL(dccp_set_state);
+
+/*
+ * Finish a close that the peer started. Only the two passive-close states
+ * are acted upon; in any other state this is a no-op.
+ */
+static void dccp_finish_passive_close(struct sock *sk)
+{
+       const int cur_state = sk->sk_state;
+
+       if (cur_state == DCCP_PASSIVE_CLOSE) {
+               /* A Close packet was received (client or server side). */
+               dccp_send_reset(sk, DCCP_RESET_CODE_CLOSED);
+               dccp_set_state(sk, DCCP_CLOSED);
+               return;
+       }
+
+       if (cur_state == DCCP_PASSIVE_CLOSEREQ) {
+               /*
+                * The client got a CloseReq. Passing 1 as the `active' flag
+                * makes dccp_send_close() retransmit the Close, as required
+                * by RFC 4340, 8.3.
+                */
+               dccp_send_close(sk, 1);
+               dccp_set_state(sk, DCCP_CLOSING);
+       }
+}
+
+/*
+ * Put @sk into DCCP_CLOSED, stop its transmit timers and mark both
+ * directions as shut down. A socket already flagged SOCK_DEAD is destroyed
+ * right away; otherwise the state-change callback is invoked.
+ */
+void dccp_done(struct sock *sk)
+{
+       dccp_set_state(sk, DCCP_CLOSED);
+       dccp_clear_xmit_timers(sk);
+
+       sk->sk_shutdown = SHUTDOWN_MASK;
+
+       if (sock_flag(sk, SOCK_DEAD)) {
+               inet_csk_destroy_sock(sk);
+               return;
+       }
+
+       sk->sk_state_change(sk);
+}
+
+EXPORT_SYMBOL_GPL(dccp_done);
+
 const char *dccp_packet_name(const int type)
 {
        static const char *dccp_packet_names[] = {
@@ -74,14 +151,17 @@ EXPORT_SYMBOL_GPL(dccp_packet_name);
 const char *dccp_state_name(const int state)
 {
        static char *dccp_state_names[] = {
-       [DCCP_OPEN]       = "OPEN",
-       [DCCP_REQUESTING] = "REQUESTING",
-       [DCCP_PARTOPEN]   = "PARTOPEN",
-       [DCCP_LISTEN]     = "LISTEN",
-       [DCCP_RESPOND]    = "RESPOND",
-       [DCCP_CLOSING]    = "CLOSING",
-       [DCCP_TIME_WAIT]  = "TIME_WAIT",
-       [DCCP_CLOSED]     = "CLOSED",
+       [DCCP_OPEN]             = "OPEN",
+       [DCCP_REQUESTING]       = "REQUESTING",
+       [DCCP_PARTOPEN]         = "PARTOPEN",
+       [DCCP_LISTEN]           = "LISTEN",
+       [DCCP_RESPOND]          = "RESPOND",
+       [DCCP_CLOSING]          = "CLOSING",
+       [DCCP_ACTIVE_CLOSEREQ]  = "CLOSEREQ",
+       [DCCP_PASSIVE_CLOSE]    = "PASSIVE_CLOSE",
+       [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ",
+       [DCCP_TIME_WAIT]        = "TIME_WAIT",
+       [DCCP_CLOSED]           = "CLOSED",
        };
 
        if (state >= DCCP_MAX_STATES)
@@ -92,10 +172,122 @@ const char *dccp_state_name(const int state)
 
 EXPORT_SYMBOL_GPL(dccp_state_name);
 
-static inline int dccp_listen_start(struct sock *sk)
+/*
+ * dccp_init_sock  -  Set up the DCCP-specific part of a new socket
+ * @sk:                   socket to initialise
+ * @ctl_sock_initialized: when non-zero, feature negotiation, the optional
+ *                        Ack Vector and both CCID control blocks are set up;
+ *                        when zero only the minisock list heads are
+ *                        initialised (control socket case, see below)
+ *
+ * Returns 0 on success, the error from dccp_feat_init(), or -ENOMEM if the
+ * Ack Vector or one of the CCIDs cannot be allocated.
+ */
+int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized)
+{
+       struct dccp_sock *dp = dccp_sk(sk);
+       struct dccp_minisock *dmsk = dccp_msk(sk);
+       struct inet_connection_sock *icsk = inet_csk(sk);
+
+       dccp_minisock_init(&dp->dccps_minisock);
+
+       icsk->icsk_rto          = DCCP_TIMEOUT_INIT;
+       icsk->icsk_syn_retries  = sysctl_dccp_request_retries;
+       sk->sk_state            = DCCP_CLOSED;
+       sk->sk_write_space      = dccp_write_space;
+       icsk->icsk_sync_mss     = dccp_sync_mss;
+       dp->dccps_mss_cache     = 536;
+       dp->dccps_rate_last     = jiffies;
+       dp->dccps_role          = DCCP_ROLE_UNDEFINED;
+       dp->dccps_service       = DCCP_SERVICE_CODE_IS_ABSENT;
+       dp->dccps_l_ack_ratio   = dp->dccps_r_ack_ratio = 1;
+
+       dccp_init_xmit_timers(sk);
+
+       INIT_LIST_HEAD(&dp->dccps_featneg);
+       /*
+        * FIXME: We're hardcoding the CCID, and doing this at this point makes
+        * the listening (master) sock get CCID control blocks, which is not
+        * necessary, but for now, to not mess with the test userspace apps,
+        * lets leave it here, later the real solution is to do this in a
+        * setsockopt(CCIDs-I-want/accept). -acme
+        */
+       if (likely(ctl_sock_initialized)) {
+               int rc = dccp_feat_init(sk);
+
+               if (rc)
+                       return rc;
+
+               if (dmsk->dccpms_send_ack_vector) {
+                       dp->dccps_hc_rx_ackvec = dccp_ackvec_alloc(GFP_KERNEL);
+                       if (dp->dccps_hc_rx_ackvec == NULL)
+                               return -ENOMEM;
+               }
+               dp->dccps_hc_rx_ccid = ccid_hc_rx_new(dmsk->dccpms_rx_ccid,
+                                                     sk, GFP_KERNEL);
+               dp->dccps_hc_tx_ccid = ccid_hc_tx_new(dmsk->dccpms_tx_ccid,
+                                                     sk, GFP_KERNEL);
+               /* If either CCID is missing, undo everything allocated above */
+               if (unlikely(dp->dccps_hc_rx_ccid == NULL ||
+                            dp->dccps_hc_tx_ccid == NULL)) {
+                       ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+                       ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+                       if (dmsk->dccpms_send_ack_vector) {
+                               dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+                               dp->dccps_hc_rx_ackvec = NULL;
+                       }
+                       dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+                       return -ENOMEM;
+               }
+       } else {
+               /* control socket doesn't need feat nego */
+               INIT_LIST_HEAD(&dmsk->dccpms_pending);
+               INIT_LIST_HEAD(&dmsk->dccpms_conf);
+       }
+
+       return 0;
+}
+
+EXPORT_SYMBOL_GPL(dccp_init_sock);
+
+/*
+ * dccp_destroy_sock  -  Release the DCCP-specific resources held by @sk
+ *
+ * Frees the pending retransmission skb, drops the bind bucket reference,
+ * and frees the service list, the Ack Vector (if one was in use), both CCID
+ * control blocks and the feature-negotiation list. Freed pointers are reset
+ * to NULL.
+ */
+void dccp_destroy_sock(struct sock *sk)
+{
+       struct dccp_sock *dp = dccp_sk(sk);
+       struct dccp_minisock *dmsk = dccp_msk(sk);
+
+       /*
+        * DCCP doesn't use sk_write_queue, just sk_send_head
+        * for retransmissions
+        */
+       if (sk->sk_send_head != NULL) {
+               kfree_skb(sk->sk_send_head);
+               sk->sk_send_head = NULL;
+       }
+
+       /* Clean up a referenced DCCP bind bucket. */
+       if (inet_csk(sk)->icsk_bind_hash != NULL)
+               inet_put_port(sk);
+
+       kfree(dp->dccps_service_list);
+       dp->dccps_service_list = NULL;
+
+       /* The Ack Vector only exists when sending Ack Vectors was enabled */
+       if (dmsk->dccpms_send_ack_vector) {
+               dccp_ackvec_free(dp->dccps_hc_rx_ackvec);
+               dp->dccps_hc_rx_ackvec = NULL;
+       }
+       ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk);
+       ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk);
+       dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL;
+
+       /* clean up feature negotiation state */
+       dccp_feat_list_purge(&dp->dccps_featneg);
+}
+
+EXPORT_SYMBOL_GPL(dccp_destroy_sock);
+
+/*
+ * Mark @sk as a listening DCCP socket and start listening with @backlog.
+ * Listening is refused with -EPROTO when the feature negotiation settings
+ * cannot be finalised.
+ */
+static inline int dccp_listen_start(struct sock *sk, int backlog)
+{
+       struct dccp_sock *dsk = dccp_sk(sk);
+
+       dsk->dccps_role = DCCP_ROLE_LISTEN;
+
+       if (dccp_feat_finalise_settings(dsk) != 0)
+               return -EPROTO;
+
+       return inet_csk_listen_start(sk, backlog);
+}
+
+/*
+ * Returns non-zero when @state is none of DCCP_CLOSED, DCCP_LISTEN or
+ * DCCP_REQUESTING. dccp_disconnect() uses this to decide whether to send a
+ * Reset.
+ */
+static inline int dccp_need_reset(int state)
 {
-       dccp_sk(sk)->dccps_role = DCCP_ROLE_LISTEN;
-       return inet_csk_listen_start(sk, TCP_SYNQ_HSIZE);
+       return state != DCCP_CLOSED && state != DCCP_LISTEN &&
+              state != DCCP_REQUESTING;
 }
 
 int dccp_disconnect(struct sock *sk, int flags)
@@ -108,15 +300,22 @@ int dccp_disconnect(struct sock *sk, int flags)
        if (old_state != DCCP_CLOSED)
                dccp_set_state(sk, DCCP_CLOSED);
 
-       /* ABORT function of RFC793 */
+       /*
+        * This corresponds to the ABORT function of RFC793, sec. 3.8
+        * TCP uses a RST segment, DCCP a Reset packet with Code 2, "Aborted".
+        */
        if (old_state == DCCP_LISTEN) {
                inet_csk_listen_stop(sk);
-       /* FIXME: do the active reset thing */
+       } else if (dccp_need_reset(old_state)) {
+               dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+               sk->sk_err = ECONNRESET;
        } else if (old_state == DCCP_REQUESTING)
                sk->sk_err = ECONNRESET;
 
        dccp_clear_xmit_timers(sk);
+
        __skb_queue_purge(&sk->sk_receive_queue);
+       __skb_queue_purge(&sk->sk_write_queue);
        if (sk->sk_send_head != NULL) {
                __kfree_skb(sk->sk_send_head);
                sk->sk_send_head = NULL;
@@ -134,40 +333,356 @@ int dccp_disconnect(struct sock *sk, int flags)
        inet_csk_delack_init(sk);
        __sk_dst_reset(sk);
 
-       BUG_TRAP(!inet->num || icsk->icsk_bind_hash);
+       WARN_ON(inet->num && !icsk->icsk_bind_hash);
 
        sk->sk_error_report(sk);
        return err;
 }
 
+EXPORT_SYMBOL_GPL(dccp_disconnect);
+
+/*
+ *     Wait for a DCCP event.
+ *
+ *     Note that we don't need to lock the socket, as the upper poll layers
+ *     take care of normal races (between the test and the event) and we don't
+ *     go look at any of the socket buffers directly.
+ *
+ *     Returns the poll mask for @sock. Listening sockets are delegated to
+ *     inet_csk_listen_poll(). For all others the mask reports errors,
+ *     hang-up, readability (receive shutdown or receive memory in use) and
+ *     writability (enough write space and no send shutdown).
+ */
+unsigned int dccp_poll(struct file *file, struct socket *sock,
+                      poll_table *wait)
+{
+       unsigned int mask;
+       struct sock *sk = sock->sk;
+
+       poll_wait(file, sk->sk_sleep, wait);
+       if (sk->sk_state == DCCP_LISTEN)
+               return inet_csk_listen_poll(sk);
+
+       /* Socket is not locked. We are protected from async events
+          by poll logic and correct handling of state changes
+          made by other threads is impossible in any case.
+        */
+
+       mask = 0;
+       if (sk->sk_err)
+               mask = POLLERR;
+
+       if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == DCCP_CLOSED)
+               mask |= POLLHUP;
+       if (sk->sk_shutdown & RCV_SHUTDOWN)
+               mask |= POLLIN | POLLRDNORM | POLLRDHUP;
+
+       /* Connected? */
+       if ((1 << sk->sk_state) & ~(DCCPF_REQUESTING | DCCPF_RESPOND)) {
+               if (atomic_read(&sk->sk_rmem_alloc) > 0)
+                       mask |= POLLIN | POLLRDNORM;
+
+               if (!(sk->sk_shutdown & SEND_SHUTDOWN)) {
+                       if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk)) {
+                               mask |= POLLOUT | POLLWRNORM;
+                       } else {  /* send SIGIO later */
+                               set_bit(SOCK_ASYNC_NOSPACE,
+                                       &sk->sk_socket->flags);
+                               set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+
+                               /* Race breaker. If space is freed after
+                                * wspace test but before the flags are set,
+                                * IO signal will be lost.
+                                */
+                               if (sk_stream_wspace(sk) >= sk_stream_min_wspace(sk))
+                                       mask |= POLLOUT | POLLWRNORM;
+                       }
+               }
+       }
+       return mask;
+}
+
+EXPORT_SYMBOL_GPL(dccp_poll);
+
+/*
+ * dccp_ioctl  -  DCCP-level ioctl handler
+ * @sk:  socket the ioctl is issued on
+ * @cmd: ioctl command; only SIOCINQ is handled here
+ * @arg: user-space pointer to an int receiving the result
+ *
+ * Runs under the socket lock. Returns -ENOTCONN for a listening socket,
+ * -ENOIOCTLCMD for any command other than SIOCINQ, otherwise the result of
+ * put_user().
+ */
 int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg)
 {
-       dccp_pr_debug("entry\n");
-       return -ENOIOCTLCMD;
+       int rc = -ENOTCONN;
+
+       lock_sock(sk);
+
+       if (sk->sk_state == DCCP_LISTEN)
+               goto out;
+
+       switch (cmd) {
+       case SIOCINQ: {
+               struct sk_buff *skb;
+               unsigned long amount = 0;
+
+               /* Peek only: the skb stays on the receive queue */
+               skb = skb_peek(&sk->sk_receive_queue);
+               if (skb != NULL) {
+                       /*
+                        * We will only return the amount of this packet since
+                        * that is all that will be read.
+                        */
+                       amount = skb->len;
+               }
+               rc = put_user(amount, (int __user *)arg);
+       }
+               break;
+       default:
+               rc = -ENOIOCTLCMD;
+               break;
+       }
+out:
+       release_sock(sk);
+       return rc;
+}
+
+EXPORT_SYMBOL_GPL(dccp_ioctl);
+
+/*
+ * dccp_setsockopt_service  -  Set the service code and optional service list
+ * @sk:      socket to configure
+ * @service: primary service code (already read from the start of @optval)
+ * @optval:  user buffer; any words after the first form the service list
+ * @optlen:  total length of @optval in bytes
+ *
+ * Returns 0 on success, -EINVAL for an invalid service code or an over-long
+ * option, -ENOMEM if the list cannot be allocated, and -EFAULT if copying
+ * from user space fails or the list contains DCCP_SERVICE_INVALID_VALUE.
+ * Any previously installed service list is freed and replaced.
+ */
+static int dccp_setsockopt_service(struct sock *sk, const __be32 service,
+                                  char __user *optval, int optlen)
+{
+       struct dccp_sock *dp = dccp_sk(sk);
+       struct dccp_service_list *sl = NULL;
+
+       if (service == DCCP_SERVICE_INVALID_VALUE ||
+           optlen > DCCP_SERVICE_LIST_MAX_LEN * sizeof(u32))
+               return -EINVAL;
+
+       /* More than one word supplied: the remainder is the service list */
+       if (optlen > sizeof(service)) {
+               sl = kmalloc(optlen, GFP_KERNEL);
+               if (sl == NULL)
+                       return -ENOMEM;
+
+               /* The first word is the primary service, so it is not counted */
+               sl->dccpsl_nr = optlen / sizeof(u32) - 1;
+               if (copy_from_user(sl->dccpsl_list,
+                                  optval + sizeof(service),
+                                  optlen - sizeof(service)) ||
+                   dccp_list_has_service(sl, DCCP_SERVICE_INVALID_VALUE)) {
+                       kfree(sl);
+                       return -EFAULT;
+               }
+       }
+
+       lock_sock(sk);
+       dp->dccps_service = service;
+
+       kfree(dp->dccps_service_list);
+
+       dp->dccps_service_list = sl;
+       release_sock(sk);
+       return 0;
+}
+
+/*
+ * dccp_setsockopt_cscov  -  Register minimum checksum coverage for negotiation
+ * @sk:    socket to configure
+ * @cscov: requested minimum checksum coverage, 0..15
+ * @rx:    true for the receiver side (RECV_CSCOV), false for the sender side
+ *
+ * Returns 0 on success (a request of 0 is accepted without registering
+ * anything), -EINVAL if @cscov is out of range, -ENOBUFS if the value list
+ * cannot be allocated, or the error from dccp_feat_register_sp().
+ * On success the requested value is recorded in dccps_pcrlen/dccps_pcslen.
+ */
+static int dccp_setsockopt_cscov(struct sock *sk, int cscov, bool rx)
+{
+       u8 *list, len;
+       int i, rc;
+
+       if (cscov < 0 || cscov > 15)
+               return -EINVAL;
+       /*
+        * Populate a list of permissible values, in the range cscov...15. This
+        * is necessary since feature negotiation of single values only works if
+        * both sides incidentally choose the same value. Since the list starts
+        * lowest-value first, negotiation will pick the smallest shared value.
+        */
+       if (cscov == 0)
+               return 0;
+       len = 16 - cscov;
+
+       list = kmalloc(len, GFP_KERNEL);
+       if (list == NULL)
+               return -ENOBUFS;
+
+       /*
+        * Fill by index and leave cscov untouched: it is stored in the socket
+        * below. Incrementing it here would always leave it at 16, which is
+        * outside the 0..15 range validated above.
+        */
+       for (i = 0; i < len; i++)
+               list[i] = cscov + i;
+
+       rc = dccp_feat_register_sp(sk, DCCPF_MIN_CSUM_COVER, rx, list, len);
+
+       if (rc == 0) {
+               if (rx)
+                       dccp_sk(sk)->dccps_pcrlen = cscov;
+               else
+                       dccp_sk(sk)->dccps_pcslen = cscov;
+       }
+       kfree(list);
+       return rc;
+}
+
+/*
+ * do_dccp_setsockopt  -  Handle setsockopt() at level SOL_DCCP
+ * @sk:      socket to configure
+ * @level:   socket level (not examined here; callers filter on SOL_DCCP)
+ * @optname: DCCP_SOCKOPT_xxx option
+ * @optval:  user buffer, must hold at least one int
+ * @optlen:  length of @optval in bytes
+ *
+ * Returns 0 on success, -EINVAL if @optlen is negative or shorter than an
+ * int, -EFAULT if the value cannot be read, -EOPNOTSUPP for SERVER_TIMEWAIT
+ * on a non-server socket, -ENOPROTOOPT for unknown options, or the error of
+ * the per-option helper.
+ */
+static int do_dccp_setsockopt(struct sock *sk, int level, int optname,
+               char __user *optval, int optlen)
+{
+       struct dccp_sock *dp = dccp_sk(sk);
+       int val, err = 0;
+
+       /*
+        * Compare as signed: without the cast a negative optlen is converted
+        * to a huge unsigned value and slips past this check.
+        */
+       if (optlen < (int)sizeof(int))
+               return -EINVAL;
+
+       if (get_user(val, (int __user *)optval))
+               return -EFAULT;
+
+       /* The service option takes the socket lock itself */
+       if (optname == DCCP_SOCKOPT_SERVICE)
+               return dccp_setsockopt_service(sk, val, optval, optlen);
+
+       lock_sock(sk);
+       switch (optname) {
+       case DCCP_SOCKOPT_PACKET_SIZE:
+               DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
+               err = 0;
+               break;
+       case DCCP_SOCKOPT_CHANGE_L:
+       case DCCP_SOCKOPT_CHANGE_R:
+               DCCP_WARN("sockopt(CHANGE_L/R) is deprecated: fix your app\n");
+               err = 0;
+               break;
+       case DCCP_SOCKOPT_SERVER_TIMEWAIT:
+               if (dp->dccps_role != DCCP_ROLE_SERVER)
+                       err = -EOPNOTSUPP;
+               else
+                       dp->dccps_server_timewait = (val != 0);
+               break;
+       case DCCP_SOCKOPT_SEND_CSCOV:
+               err = dccp_setsockopt_cscov(sk, val, false);
+               break;
+       case DCCP_SOCKOPT_RECV_CSCOV:
+               err = dccp_setsockopt_cscov(sk, val, true);
+               break;
+       default:
+               err = -ENOPROTOOPT;
+               break;
+       }
+
+       release_sock(sk);
+       return err;
 }
 
+/*
+ * setsockopt() entry point. Options at level SOL_DCCP go to
+ * do_dccp_setsockopt(); every other level is passed on to the
+ * address-family specific handler in icsk_af_ops.
+ */
 int dccp_setsockopt(struct sock *sk, int level, int optname,
-                   char *optval, int optlen)
+                   char __user *optval, int optlen)
 {
-       dccp_pr_debug("entry\n");
+       if (level != SOL_DCCP)
+               return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level,
+                                                            optname, optval,
+                                                            optlen);
+       return do_dccp_setsockopt(sk, level, optname, optval, optlen);
+}
 
+EXPORT_SYMBOL_GPL(dccp_setsockopt);
+
+#ifdef CONFIG_COMPAT
+/*
+ * CONFIG_COMPAT variant of dccp_setsockopt(): levels other than SOL_DCCP go
+ * through inet_csk_compat_setsockopt(); SOL_DCCP options share
+ * do_dccp_setsockopt() with the native path.
+ */
+int compat_dccp_setsockopt(struct sock *sk, int level, int optname,
+                          char __user *optval, int optlen)
+{
         if (level != SOL_DCCP)
-               return ip_setsockopt(sk, level, optname, optval, optlen);
+               return inet_csk_compat_setsockopt(sk, level, optname,
+                                                 optval, optlen);
+       return do_dccp_setsockopt(sk, level, optname, optval, optlen);
+}
+
+EXPORT_SYMBOL_GPL(compat_dccp_setsockopt);
+#endif
 
-       return -EOPNOTSUPP;
+/*
+ * dccp_getsockopt_service  -  Copy service code and service list to user space
+ * @sk:     socket to query
+ * @len:    size of the user buffer in bytes
+ * @optval: user buffer; receives the service code followed by the list
+ * @optlen: user pointer; receives the number of bytes written
+ *
+ * Runs under the socket lock. Returns 0 on success, -EINVAL if the buffer
+ * is too small for the code plus list, -EFAULT if a user copy fails.
+ */
+static int dccp_getsockopt_service(struct sock *sk, int len,
+                                  __be32 __user *optval,
+                                  int __user *optlen)
+{
+       const struct dccp_sock *dp = dccp_sk(sk);
+       const struct dccp_service_list *sl;
+       int err = -ENOENT, slen = 0, total_len = sizeof(u32);
+
+       lock_sock(sk);
+       if ((sl = dp->dccps_service_list) != NULL) {
+               slen = sl->dccpsl_nr * sizeof(u32);
+               total_len += slen;
+       }
+
+       err = -EINVAL;
+       if (total_len > len)
+               goto out;
+
+       err = 0;
+       /* optval + 1 skips the first word, which holds the service code */
+       if (put_user(total_len, optlen) ||
+           put_user(dp->dccps_service, optval) ||
+           (sl != NULL && copy_to_user(optval + 1, sl->dccpsl_list, slen)))
+               err = -EFAULT;
+out:
+       release_sock(sk);
+       return err;
 }
 
-int dccp_getsockopt(struct sock *sk, int level, int optname,
-                   char *optval, int *optlen)
+/*
+ * do_dccp_getsockopt  -  Handle getsockopt() at level SOL_DCCP
+ * @sk:      socket to query
+ * @level:   socket level (not examined here; callers filter on SOL_DCCP)
+ * @optname: DCCP_SOCKOPT_xxx option, or a CCID-specific option number
+ * @optval:  user buffer for the result
+ * @optlen:  user pointer holding the buffer size, updated with result size
+ *
+ * Options 128..191 are forwarded to the RX CCID and 192..255 to the TX CCID.
+ * Returns 0 on success, -EFAULT on a failed user access, -EINVAL if the
+ * buffer is shorter than an int, -ENOPROTOOPT for unknown options, or the
+ * result of the delegated helper.
+ */
+static int do_dccp_getsockopt(struct sock *sk, int level, int optname,
+                   char __user *optval, int __user *optlen)
 {
-       dccp_pr_debug("entry\n");
+       struct dccp_sock *dp;
+       int val, len;
+
+       if (get_user(len, optlen))
+               return -EFAULT;
+
+       if (len < (int)sizeof(int))
+               return -EINVAL;
+
+       dp = dccp_sk(sk);
+
+       /* Cases that `break' yield a single int in val, copied out below */
+       switch (optname) {
+       case DCCP_SOCKOPT_PACKET_SIZE:
+               DCCP_WARN("sockopt(PACKET_SIZE) is deprecated: fix your app\n");
+               return 0;
+       case DCCP_SOCKOPT_SERVICE:
+               return dccp_getsockopt_service(sk, len,
+                                              (__be32 __user *)optval, optlen);
+       case DCCP_SOCKOPT_GET_CUR_MPS:
+               val = dp->dccps_mss_cache;
+               break;
+       case DCCP_SOCKOPT_AVAILABLE_CCIDS:
+               return ccid_getsockopt_builtin_ccids(sk, len, optval, optlen);
+       case DCCP_SOCKOPT_SERVER_TIMEWAIT:
+               val = dp->dccps_server_timewait;
+               break;
+       case DCCP_SOCKOPT_SEND_CSCOV:
+               val = dp->dccps_pcslen;
+               break;
+       case DCCP_SOCKOPT_RECV_CSCOV:
+               val = dp->dccps_pcrlen;
+               break;
+       case 128 ... 191:
+               return ccid_hc_rx_getsockopt(dp->dccps_hc_rx_ccid, sk, optname,
+                                            len, (u32 __user *)optval, optlen);
+       case 192 ... 255:
+               return ccid_hc_tx_getsockopt(dp->dccps_hc_tx_ccid, sk, optname,
+                                            len, (u32 __user *)optval, optlen);
+       default:
+               return -ENOPROTOOPT;
+       }
+
+       len = sizeof(val);
+       if (put_user(len, optlen) || copy_to_user(optval, &val, len))
+               return -EFAULT;
 
+       return 0;
+}
+
+/*
+ * getsockopt() entry point. Options at level SOL_DCCP go to
+ * do_dccp_getsockopt(); every other level is passed on to the
+ * address-family specific handler in icsk_af_ops.
+ */
+int dccp_getsockopt(struct sock *sk, int level, int optname,
+                   char __user *optval, int __user *optlen)
+{
         if (level != SOL_DCCP)
-               return ip_getsockopt(sk, level, optname, optval, optlen);
+               return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level,
+                                                            optname, optval,
+                                                            optlen);
+       return do_dccp_getsockopt(sk, level, optname, optval, optlen);
+}
+
+EXPORT_SYMBOL_GPL(dccp_getsockopt);
 
-       return -EOPNOTSUPP;
+#ifdef CONFIG_COMPAT
+/*
+ * CONFIG_COMPAT variant of dccp_getsockopt(): levels other than SOL_DCCP go
+ * through inet_csk_compat_getsockopt(); SOL_DCCP options share
+ * do_dccp_getsockopt() with the native path.
+ */
+int compat_dccp_getsockopt(struct sock *sk, int level, int optname,
+                          char __user *optval, int __user *optlen)
+{
+       if (level != SOL_DCCP)
+               return inet_csk_compat_getsockopt(sk, level, optname,
+                                                 optval, optlen);
+       return do_dccp_getsockopt(sk, level, optname, optval, optlen);
 }
 
+EXPORT_SYMBOL_GPL(compat_dccp_getsockopt);
+#endif
+
 int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                 size_t len)
 {
@@ -182,6 +697,13 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                return -EMSGSIZE;
 
        lock_sock(sk);
+
+       if (sysctl_dccp_tx_qlen &&
+           (sk->sk_write_queue.qlen >= sysctl_dccp_tx_qlen)) {
+               rc = -EAGAIN;
+               goto out_release;
+       }
+
        timeo = sock_sndtimeo(sk, noblock);
 
        /*
@@ -189,7 +711,7 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
         * so that the trick in dccp_rcv_request_sent_state_process.
         */
        /* Wait for a connection to finish. */
-       if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN | DCCPF_CLOSING))
+       if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN))
                if ((rc = sk_stream_wait_connect(sk, &timeo)) != 0)
                        goto out_release;
 
@@ -205,7 +727,8 @@ int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
        if (rc != 0)
                goto out_discard;
 
-       rc = dccp_write_xmit(sk, skb, len);
+       skb_queue_tail(&sk->sk_write_queue, skb);
+       dccp_write_xmit(sk,0);
 out_release:
        release_sock(sk);
        return rc ? : len;
@@ -214,6 +737,8 @@ out_discard:
        goto out_release;
 }
 
+EXPORT_SYMBOL_GPL(dccp_sendmsg);
+
 int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
                 size_t len, int nonblock, int flags, int *addr_len)
 {
@@ -237,19 +762,26 @@ int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
                dh = dccp_hdr(skb);
 
-               if (dh->dccph_type == DCCP_PKT_DATA ||
-                   dh->dccph_type == DCCP_PKT_DATAACK)
+               switch (dh->dccph_type) {
+               case DCCP_PKT_DATA:
+               case DCCP_PKT_DATAACK:
                        goto found_ok_skb;
 
-               if (dh->dccph_type == DCCP_PKT_RESET ||
-                   dh->dccph_type == DCCP_PKT_CLOSE) {
-                       dccp_pr_debug("found fin ok!\n");
+               case DCCP_PKT_CLOSE:
+               case DCCP_PKT_CLOSEREQ:
+                       if (!(flags & MSG_PEEK))
+                               dccp_finish_passive_close(sk);
+                       /* fall through */
+               case DCCP_PKT_RESET:
+                       dccp_pr_debug("found fin (%s) ok!\n",
+                                     dccp_packet_name(dh->dccph_type));
                        len = 0;
                        goto found_fin_ok;
+               default:
+                       dccp_pr_debug("packet_type=%s\n",
+                                     dccp_packet_name(dh->dccph_type));
+                       sk_eat_skb(sk, skb, 0);
                }
-               dccp_pr_debug("packet_type=%s\n",
-                             dccp_packet_name(dh->dccph_type));
-               sk_eat_skb(sk, skb);
 verify_sock_status:
                if (sock_flag(sk, SOCK_DONE)) {
                        len = 0;
@@ -303,7 +835,7 @@ verify_sock_status:
                }
        found_fin_ok:
                if (!(flags & MSG_PEEK))
-                       sk_eat_skb(sk, skb);
+                       sk_eat_skb(sk, skb, 0);
                break;
        } while (1);
 out:
@@ -311,7 +843,9 @@ out:
        return len;
 }
 
-static int inet_dccp_listen(struct socket *sock, int backlog)
+EXPORT_SYMBOL_GPL(dccp_recvmsg);
+
+int inet_dccp_listen(struct socket *sock, int backlog)
 {
        struct sock *sk = sock->sk;
        unsigned char old_state;
@@ -335,7 +869,7 @@ static int inet_dccp_listen(struct socket *sock, int backlog)
                 * FIXME: here it probably should be sk->sk_prot->listen_start
                 * see tcp_listen_start
                 */
-               err = dccp_listen_start(sk);
+               err = dccp_listen_start(sk, backlog);
                if (err)
                        goto out;
        }
@@ -347,33 +881,41 @@ out:
        return err;
 }
 
-static const unsigned char dccp_new_state[] = {
-       /* current state:   new state:      action:     */
-       [0]               = DCCP_CLOSED,
-       [DCCP_OPEN]       = DCCP_CLOSING | DCCP_ACTION_FIN,
-       [DCCP_REQUESTING] = DCCP_CLOSED,
-       [DCCP_PARTOPEN]   = DCCP_CLOSING | DCCP_ACTION_FIN,
-       [DCCP_LISTEN]     = DCCP_CLOSED,
-       [DCCP_RESPOND]    = DCCP_CLOSED,
-       [DCCP_CLOSING]    = DCCP_CLOSED,
-       [DCCP_TIME_WAIT]  = DCCP_CLOSED,
-       [DCCP_CLOSED]     = DCCP_CLOSED,
-};
+EXPORT_SYMBOL_GPL(inet_dccp_listen);
 
-static int dccp_close_state(struct sock *sk)
+/*
+ * dccp_terminate_connection  -  Start closing @sk according to its state
+ *
+ * Passive-close states are finished via dccp_finish_passive_close().
+ * From PARTOPEN (after clearing the DACK timer) or OPEN a Close is sent and
+ * the socket moves to ACTIVE_CLOSEREQ (server without server_timewait) or
+ * CLOSING. Every remaining state goes directly to DCCP_CLOSED.
+ */
+static void dccp_terminate_connection(struct sock *sk)
 {
-       const int next = dccp_new_state[sk->sk_state];
-       const int ns = next & DCCP_STATE_MASK;
-
-       if (ns != sk->sk_state)
-               dccp_set_state(sk, ns);
+       u8 next_state = DCCP_CLOSED;
 
-       return next & DCCP_ACTION_FIN;
+       switch (sk->sk_state) {
+       case DCCP_PASSIVE_CLOSE:
+       case DCCP_PASSIVE_CLOSEREQ:
+               dccp_finish_passive_close(sk);
+               break;
+       case DCCP_PARTOPEN:
+               dccp_pr_debug("Stop PARTOPEN timer (%p)\n", sk);
+               inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+               /* fall through */
+       case DCCP_OPEN:
+               dccp_send_close(sk, 1);
+
+               if (dccp_sk(sk)->dccps_role == DCCP_ROLE_SERVER &&
+                   !dccp_sk(sk)->dccps_server_timewait)
+                       next_state = DCCP_ACTIVE_CLOSEREQ;
+               else
+                       next_state = DCCP_CLOSING;
+               /* fall through */
+       default:
+               dccp_set_state(sk, next_state);
+       }
 }
 
 void dccp_close(struct sock *sk, long timeout)
 {
+       struct dccp_sock *dp = dccp_sk(sk);
        struct sk_buff *skb;
+       u32 data_was_unread = 0;
+       int state;
 
        lock_sock(sk);
 
@@ -388,26 +930,41 @@ void dccp_close(struct sock *sk, long timeout)
                goto adjudge_to_death;
        }
 
+       sk_stop_timer(sk, &dp->dccps_xmit_timer);
+
        /*
         * We need to flush the recv. buffs.  We do this only on the
         * descriptor close, not protocol-sourced closes, because the
          *reader process may not have drained the data yet!
         */
-       /* FIXME: check for unread data */
        while ((skb = __skb_dequeue(&sk->sk_receive_queue)) != NULL) {
+               data_was_unread += skb->len;
                __kfree_skb(skb);
        }
 
-       if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
+       if (data_was_unread) {
+               /* Unread data was tossed, send an appropriate Reset Code */
+               DCCP_WARN("DCCP: ABORT -- %u bytes unread\n", data_was_unread);
+               dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED);
+               dccp_set_state(sk, DCCP_CLOSED);
+       } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) {
                /* Check zero linger _after_ checking for unread data. */
                sk->sk_prot->disconnect(sk, 0);
-       } else if (dccp_close_state(sk)) {
-               dccp_send_close(sk);
+       } else if (sk->sk_state != DCCP_CLOSED) {
+               dccp_terminate_connection(sk);
        }
 
        sk_stream_wait_close(sk, timeout);
 
 adjudge_to_death:
+       state = sk->sk_state;
+       sock_hold(sk);
+       sock_orphan(sk);
+       atomic_inc(sk->sk_prot->orphan_count);
+
+       /*
+        * It is the last release_sock in its life. It will remove backlog.
+        */
        release_sock(sk);
        /*
         * Now socket is owned by kernel and we acquire BH lock
@@ -415,145 +972,69 @@ adjudge_to_death:
         */
        local_bh_disable();
        bh_lock_sock(sk);
-       BUG_TRAP(!sock_owned_by_user(sk));
+       WARN_ON(sock_owned_by_user(sk));
 
-       sock_hold(sk);
-       sock_orphan(sk);
-                                               
-       if (sk->sk_state != DCCP_CLOSED)
-               dccp_set_state(sk, DCCP_CLOSED);
+       /* Have we already been destroyed by a softirq or backlog? */
+       if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
+               goto out;
 
-       atomic_inc(&dccp_orphan_count);
        if (sk->sk_state == DCCP_CLOSED)
                inet_csk_destroy_sock(sk);
 
        /* Otherwise, socket is reprieved until protocol close. */
 
+out:
        bh_unlock_sock(sk);
        local_bh_enable();
        sock_put(sk);
 }
 
+EXPORT_SYMBOL_GPL(dccp_close);
+
+/*
+ * dccp_shutdown  -  entry point for shutting down a DCCP socket
+ * @sk:  socket the request refers to (not touched here)
+ * @how: requested shutdown mode, printed in hex in the debug message
+ *
+ * The body only logs the request via dccp_pr_debug(); nothing else is
+ * done with @sk or @how in this function.
+ */
 void dccp_shutdown(struct sock *sk, int how)
 {
-       dccp_pr_debug("entry\n");
+       dccp_pr_debug("called shutdown(%x)\n", how);
 }
 
-struct proto_ops inet_dccp_ops = {
-       .family         = PF_INET,
-       .owner          = THIS_MODULE,
-       .release        = inet_release,
-       .bind           = inet_bind,
-       .connect        = inet_stream_connect,
-       .socketpair     = sock_no_socketpair,
-       .accept         = inet_accept,
-       .getname        = inet_getname,
-       .poll           = sock_no_poll,
-       .ioctl          = inet_ioctl,
-       /* FIXME: work on inet_listen to rename it to sock_common_listen */
-       .listen         = inet_dccp_listen,
-       .shutdown       = inet_shutdown,
-       .setsockopt     = sock_common_setsockopt,
-       .getsockopt     = sock_common_getsockopt,
-       .sendmsg        = inet_sendmsg,
-       .recvmsg        = sock_common_recvmsg,
-       .mmap           = sock_no_mmap,
-       .sendpage       = sock_no_sendpage,
-};
-
-extern struct net_proto_family inet_family_ops;
+EXPORT_SYMBOL_GPL(dccp_shutdown);
 
-static struct inet_protosw dccp_v4_protosw = {
-       .type           = SOCK_DCCP,
-       .protocol       = IPPROTO_DCCP,
-       .prot           = &dccp_v4_prot,
-       .ops            = &inet_dccp_ops,
-       .capability     = -1,
-       .no_check       = 0,
-       .flags          = 0,
-};
-
-/*
- * This is the global socket data structure used for responding to
- * the Out-of-the-blue (OOTB) packets. A control sock will be created
- * for this socket at the initialization time.
- */
-struct socket *dccp_ctl_socket;
-
-static char dccp_ctl_socket_err_msg[] __initdata =
-       KERN_ERR "DCCP: Failed to create the control socket.\n";
-
-static int __init dccp_ctl_sock_init(void)
+/*
+ * Set up the dccp_statistics counters through snmp_mib_init(), sized for
+ * struct dccp_mib.  The result of snmp_mib_init() is returned unchanged;
+ * dccp_init() treats any non-zero value as failure.
+ */
+static inline int dccp_mib_init(void)
 {
-       int rc = sock_create_kern(PF_INET, SOCK_DCCP, IPPROTO_DCCP,
-                                 &dccp_ctl_socket);
-       if (rc < 0)
-               printk(dccp_ctl_socket_err_msg);
-       else {
-               dccp_ctl_socket->sk->sk_allocation = GFP_ATOMIC;
-               inet_sk(dccp_ctl_socket->sk)->uc_ttl = -1;
-
-               /* Unhash it so that IP input processing does not even
-                * see it, we do not wish this socket to see incoming
-                * packets.
-                */
-               dccp_ctl_socket->sk->sk_prot->unhash(dccp_ctl_socket->sk);
-       }
-
-       return rc;
+       return snmp_mib_init((void**)dccp_statistics, sizeof(struct dccp_mib));
 }
 
-static void __exit dccp_ctl_sock_exit(void)
+/*
+ * Release the dccp_statistics counters via snmp_mib_free(); counterpart of
+ * dccp_mib_init().  Called from the error unwinding in dccp_init() and from
+ * dccp_fini().
+ */
+static inline void dccp_mib_exit(void)
 {
-       if (dccp_ctl_socket != NULL)
-               sock_release(dccp_ctl_socket);
-}
-
-static int __init init_dccp_v4_mibs(void)
-{
-       int rc = -ENOMEM;
-
-       dccp_statistics[0] = alloc_percpu(struct dccp_mib);
-       if (dccp_statistics[0] == NULL)
-               goto out;
-
-       dccp_statistics[1] = alloc_percpu(struct dccp_mib);
-       if (dccp_statistics[1] == NULL)
-               goto out_free_one;
-
-       rc = 0;
-out:
-       return rc;
-out_free_one:
-       free_percpu(dccp_statistics[0]);
-       dccp_statistics[0] = NULL;
-       goto out;
-
+       snmp_mib_free((void**)dccp_statistics);
 }
 
 static int thash_entries;
 module_param(thash_entries, int, 0444);
 MODULE_PARM_DESC(thash_entries, "Number of ehash buckets");
 
+#ifdef CONFIG_IP_DCCP_DEBUG
+/*
+ * Debug-message switch.  Only built when CONFIG_IP_DCCP_DEBUG is set, and
+ * exported (GPL-only) so that code outside this file can reference it.
+ * NOTE(review): the variable is declared int but registered with the
+ * "bool" parameter type - confirm module_param() in this tree accepts an
+ * int backing variable for bool.
+ */
 int dccp_debug;
-module_param(dccp_debug, int, 0444);
+module_param(dccp_debug, bool, 0644);
 MODULE_PARM_DESC(dccp_debug, "Enable debug messages");
 
+EXPORT_SYMBOL_GPL(dccp_debug);
+#endif
+
 static int __init dccp_init(void)
 {
        unsigned long goal;
        int ehash_order, bhash_order, i;
-       int rc = proto_register(&dccp_v4_prot, 1);
+       int rc = -ENOBUFS;
 
-       if (rc)
-               goto out;
+       BUILD_BUG_ON(sizeof(struct dccp_skb_cb) >
+                    FIELD_SIZEOF(struct sk_buff, cb));
 
        dccp_hashinfo.bind_bucket_cachep =
                kmem_cache_create("dccp_bind_bucket",
                                  sizeof(struct inet_bind_bucket), 0,
-                                 SLAB_HWCACHE_ALIGN, NULL, NULL);
+                                 SLAB_HWCACHE_ALIGN, NULL);
        if (!dccp_hashinfo.bind_bucket_cachep)
-               goto out_proto_unregister;
+               goto out;
 
        /*
         * Size and allocate the main established and bind bucket
@@ -574,7 +1055,6 @@ static int __init dccp_init(void)
        do {
                dccp_hashinfo.ehash_size = (1UL << ehash_order) * PAGE_SIZE /
                                        sizeof(struct inet_ehash_bucket);
-               dccp_hashinfo.ehash_size >>= 1;
                while (dccp_hashinfo.ehash_size &
                       (dccp_hashinfo.ehash_size - 1))
                        dccp_hashinfo.ehash_size--;
@@ -583,16 +1063,18 @@ static int __init dccp_init(void)
        } while (!dccp_hashinfo.ehash && --ehash_order > 0);
 
        if (!dccp_hashinfo.ehash) {
-               printk(KERN_CRIT "Failed to allocate DCCP "
-                                "established hash table\n");
+               DCCP_CRIT("Failed to allocate DCCP established hash table");
                goto out_free_bind_bucket_cachep;
        }
 
-       for (i = 0; i < (dccp_hashinfo.ehash_size << 1); i++) {
-               rwlock_init(&dccp_hashinfo.ehash[i].lock);
+       for (i = 0; i < dccp_hashinfo.ehash_size; i++) {
                INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].chain);
+               INIT_HLIST_HEAD(&dccp_hashinfo.ehash[i].twchain);
        }
 
+       if (inet_ehash_locks_alloc(&dccp_hashinfo))
+                       goto out_free_dccp_ehash;
+
        bhash_order = ehash_order;
 
        do {
@@ -606,8 +1088,8 @@ static int __init dccp_init(void)
        } while (!dccp_hashinfo.bhash && --bhash_order >= 0);
 
        if (!dccp_hashinfo.bhash) {
-               printk(KERN_CRIT "Failed to allocate DCCP bind hash table\n");
-               goto out_free_dccp_ehash;
+               DCCP_CRIT("Failed to allocate DCCP bind hash table");
+               goto out_free_dccp_locks;
        }
 
        for (i = 0; i < dccp_hashinfo.bhash_size; i++) {
@@ -615,71 +1097,57 @@ static int __init dccp_init(void)
                INIT_HLIST_HEAD(&dccp_hashinfo.bhash[i].chain);
        }
 
-       if (init_dccp_v4_mibs())
+       rc = dccp_mib_init();
+       if (rc)
                goto out_free_dccp_bhash;
 
-       rc = -EAGAIN;
-       if (inet_add_protocol(&dccp_protocol, IPPROTO_DCCP))
-               goto out_free_dccp_v4_mibs;
-
-       inet_register_protosw(&dccp_v4_protosw);
+       rc = dccp_ackvec_init();
+       if (rc)
+               goto out_free_dccp_mib;
 
-       rc = dccp_ctl_sock_init();
+       rc = dccp_sysctl_init();
        if (rc)
-               goto out_unregister_protosw;
+               goto out_ackvec_exit;
+
+       dccp_timestamping_init();
 out:
        return rc;
-out_unregister_protosw:
-       inet_unregister_protosw(&dccp_v4_protosw);
-       inet_del_protocol(&dccp_protocol, IPPROTO_DCCP);
-out_free_dccp_v4_mibs:
-       free_percpu(dccp_statistics[0]);
-       free_percpu(dccp_statistics[1]);
-       dccp_statistics[0] = dccp_statistics[1] = NULL;
+out_ackvec_exit:
+       dccp_ackvec_exit();
+out_free_dccp_mib:
+       dccp_mib_exit();
 out_free_dccp_bhash:
        free_pages((unsigned long)dccp_hashinfo.bhash, bhash_order);
        dccp_hashinfo.bhash = NULL;
+out_free_dccp_locks:
+       inet_ehash_locks_free(&dccp_hashinfo);
 out_free_dccp_ehash:
        free_pages((unsigned long)dccp_hashinfo.ehash, ehash_order);
        dccp_hashinfo.ehash = NULL;
 out_free_bind_bucket_cachep:
        kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
        dccp_hashinfo.bind_bucket_cachep = NULL;
-out_proto_unregister:
-       proto_unregister(&dccp_v4_prot);
        goto out;
 }
 
-static const char dccp_del_proto_err_msg[] __exitdata =
-       KERN_ERR "can't remove dccp net_protocol\n";
-
+/*
+ * Module exit: release the MIB counters, the bind and established hash
+ * tables, the ehash locks and the bind-bucket slab cache, then call
+ * dccp_ackvec_exit() and dccp_sysctl_exit().
+ *
+ * The page order passed to free_pages() is recomputed with get_order() from
+ * the stored table sizes, because the orders used at allocation time are
+ * local variables of dccp_init().
+ *
+ * NOTE(review): this is not the exact reverse of the order in which
+ * dccp_init() sets things up (see its error unwinding) - confirm that no
+ * step depends on something freed before it.
+ */
 static void __exit dccp_fini(void)
 {
-       dccp_ctl_sock_exit();
-
-       inet_unregister_protosw(&dccp_v4_protosw);
-
-       if (inet_del_protocol(&dccp_protocol, IPPROTO_DCCP) < 0)
-               printk(dccp_del_proto_err_msg);
-
-       /* Free the control endpoint.  */
-       sock_release(dccp_ctl_socket);
-
-       proto_unregister(&dccp_v4_prot);
-
+       dccp_mib_exit();
+       free_pages((unsigned long)dccp_hashinfo.bhash,
+                  get_order(dccp_hashinfo.bhash_size *
+                            sizeof(struct inet_bind_hashbucket)));
+       free_pages((unsigned long)dccp_hashinfo.ehash,
+                  get_order(dccp_hashinfo.ehash_size *
+                            sizeof(struct inet_ehash_bucket)));
+       inet_ehash_locks_free(&dccp_hashinfo);
        kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep);
+       dccp_ackvec_exit();
+       dccp_sysctl_exit();
 }
 
 module_init(dccp_init);
 module_exit(dccp_fini);
 
-/*
- * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33)
- * values directly, Also cover the case where the protocol is not specified,
- * i.e. net-pf-PF_INET-proto-0-type-SOCK_DCCP
- */
-MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-33-type-6");
-MODULE_ALIAS("net-pf-" __stringify(PF_INET) "-proto-0-type-6");
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Arnaldo Carvalho de Melo <acme@conectiva.com.br>");
 MODULE_DESCRIPTION("DCCP - Datagram Congestion Controlled Protocol");