X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=include%2Fnet%2Fsock.h;h=eefeeaf7fc467123b6766da9ff8543b215a87f37;hb=b1e93a68ca41e7e73766f95ba32ca05cf9052e15;hp=849c7df23181fc4ba57dafffdf64941598a580eb;hpb=8488df894d05d6fa41c2bd298c335f944bb0e401;p=safe%2Fjmp%2Flinux-2.6

diff --git a/include/net/sock.h b/include/net/sock.h
index 849c7df..eefeeaf 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -40,7 +40,9 @@
 #ifndef _SOCK_H
 #define _SOCK_H
 
+#include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/list_nulls.h>
 #include <linux/timer.h>
 #include <linux/cache.h>
 #include <linux/module.h>
@@ -51,6 +53,7 @@
 #include <linux/security.h>
 
 #include <linux/filter.h>
+#include <linux/rculist_nulls.h>
 
 #include <asm/atomic.h>
 #include <net/dst.h>
@@ -68,17 +71,20 @@
 #define SOCK_DEBUG(sk, msg...) do { if ((sk) && sock_flag((sk), SOCK_DBG)) \
 					printk(KERN_DEBUG msg); } while (0)
 #else
-#define SOCK_DEBUG(sk, msg...) do { } while (0)
+/* Validate arguments and do nothing */
+static inline void __attribute__ ((format (printf, 2, 3)))
+SOCK_DEBUG(struct sock *sk, const char *msg, ...)
+{
+}
 #endif
 
 /* This is the per-socket lock.  The spinlock provides a synchronization
  * between user contexts and software interrupt processing, whereas the
  * mini-semaphore synchronizes multiple users amongst themselves.
  */
-struct sock_iocb;
 typedef struct {
 	spinlock_t		slock;
-	struct sock_iocb	*owner;
+	int			owned;
 	wait_queue_head_t	wq;
 	/*
 	 * We express the mutex-alike socket_lock semantics
@@ -93,6 +99,7 @@ typedef struct {
 
 struct sock;
 struct proto;
+struct net;
 
 /**
  *	struct sock_common - minimal network layer representation of sockets
@@ -101,10 +108,12 @@ struct proto;
  *	@skc_reuse: %SO_REUSEADDR setting
  *	@skc_bound_dev_if: bound device index if != 0
  *	@skc_node: main hash linkage for various protocol lookup tables
+ *	@skc_nulls_node: main hash linkage for UDP/UDP-Lite protocol
  *	@skc_bind_node: bind hash linkage for various protocol lookup tables
  *	@skc_refcnt: reference count
  *	@skc_hash: hash value used with various protocol lookup tables
  *	@skc_prot: protocol handlers inside a network family
+ *	@skc_net: reference to the network namespace of this socket
  *
  *	This is the minimal network layer representation of sockets, the header
  *	for struct sock and struct inet_timewait_sock.
@@ -114,11 +123,17 @@ struct sock_common {
 	volatile unsigned char	skc_state;
 	unsigned char		skc_reuse;
 	int			skc_bound_dev_if;
-	struct hlist_node	skc_node;
+	union {
+		struct hlist_node	skc_node;
+		struct hlist_nulls_node skc_nulls_node;
+	};
 	struct hlist_node	skc_bind_node;
 	atomic_t		skc_refcnt;
 	unsigned int		skc_hash;
 	struct proto		*skc_prot;
+#ifdef CONFIG_NET_NS
+	struct net		*skc_net;
+#endif
 };
 
 /**
@@ -142,17 +157,22 @@ struct sock_common {
  *	@sk_forward_alloc: space allocated forward
  *	@sk_allocation: allocation mode
  *	@sk_sndbuf: size of send buffer in bytes
- *	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE, %SO_OOBINLINE settings
+ *	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
+ *		   %SO_OOBINLINE settings
  *	@sk_no_check: %SO_NO_CHECK setting, whether or not to checksum packets
  *	@sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
  *	@sk_gso_type: GSO type (e.g. %SKB_GSO_TCPV4)
+ *	@sk_gso_max_size: Maximum GSO segment size to build
  *	@sk_lingertime: %SO_LINGER l_linger setting
  *	@sk_backlog: always used with the per-socket spinlock held
  *	@sk_callback_lock: used with the callbacks in the end of this struct
  *	@sk_error_queue: rarely used
- *	@sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt, IPV6_ADDRFORM for instance)
+ *	@sk_prot_creator: sk_prot of original sock creator (see ipv6_setsockopt,
+ *			  IPV6_ADDRFORM for instance)
  *	@sk_err: last error
- *	@sk_err_soft: errors that don't cause failure but are the cause of a persistent failure not just 'timed out'
+ *	@sk_err_soft: errors that don't cause failure but are the cause of a
+ *		      persistent failure not just 'timed out'
+ *	@sk_drops: raw/udp drops counter
  *	@sk_ack_backlog: current listen backlog
  *	@sk_max_ack_backlog: listen backlog set in listen()
  *	@sk_priority: %SO_PRIORITY setting
@@ -172,6 +192,7 @@ struct sock_common {
  *	@sk_sndmsg_off: cached offset for sendmsg
  *	@sk_send_head: front of stuff to transmit
  *	@sk_security: used by security modules
+ *	@sk_mark: generic packet mark
  *	@sk_write_pending: a write to stream socket waits to start
  *	@sk_state_change: callback to indicate change in the state of the sock
  *	@sk_data_ready: callback to indicate there is data to be processed
@@ -191,10 +212,12 @@ struct sock {
 #define sk_reuse		__sk_common.skc_reuse
 #define sk_bound_dev_if		__sk_common.skc_bound_dev_if
 #define sk_node			__sk_common.skc_node
+#define sk_nulls_node		__sk_common.skc_nulls_node
 #define sk_bind_node		__sk_common.skc_bind_node
 #define sk_refcnt		__sk_common.skc_refcnt
 #define sk_hash			__sk_common.skc_hash
 #define sk_prot			__sk_common.skc_prot
+#define sk_net			__sk_common.skc_net
 	unsigned char		sk_shutdown : 2,
 				sk_no_check : 2,
 				sk_userlocks : 4;
@@ -202,39 +225,45 @@ struct sock {
 	unsigned short		sk_type;
 	int			sk_rcvbuf;
 	socket_lock_t		sk_lock;
+	/*
+	 * The backlog queue is special, it is always used with
+	 * the per-socket spinlock held and requires low latency
+	 * access. Therefore we special case its implementation.
+	 */
+	struct {
+		struct sk_buff *head;
+		struct sk_buff *tail;
+	} sk_backlog;
 	wait_queue_head_t	*sk_sleep;
 	struct dst_entry	*sk_dst_cache;
+#ifdef CONFIG_XFRM
 	struct xfrm_policy	*sk_policy[2];
+#endif
 	rwlock_t		sk_dst_lock;
 	atomic_t		sk_rmem_alloc;
 	atomic_t		sk_wmem_alloc;
 	atomic_t		sk_omem_alloc;
+	int			sk_sndbuf;
 	struct sk_buff_head	sk_receive_queue;
 	struct sk_buff_head	sk_write_queue;
+#ifdef CONFIG_NET_DMA
 	struct sk_buff_head	sk_async_wait_queue;
+#endif
 	int			sk_wmem_queued;
 	int			sk_forward_alloc;
 	gfp_t			sk_allocation;
-	int			sk_sndbuf;
 	int			sk_route_caps;
 	int			sk_gso_type;
+	unsigned int		sk_gso_max_size;
 	int			sk_rcvlowat;
 	unsigned long 		sk_flags;
 	unsigned long	        sk_lingertime;
-	/*
-	 * The backlog queue is special, it is always used with
-	 * the per-socket spinlock held and requires low latency
-	 * access. Therefore we special case it's implementation.
- */ - struct { - struct sk_buff *head; - struct sk_buff *tail; - } sk_backlog; struct sk_buff_head sk_error_queue; struct proto *sk_prot_creator; rwlock_t sk_callback_lock; int sk_err, sk_err_soft; + atomic_t sk_drops; unsigned short sk_ack_backlog; unsigned short sk_max_ack_backlog; __u32 sk_priority; @@ -244,14 +273,18 @@ struct sock { struct sk_filter *sk_filter; void *sk_protinfo; struct timer_list sk_timer; - struct timeval sk_stamp; + ktime_t sk_stamp; struct socket *sk_socket; void *sk_user_data; struct page *sk_sndmsg_page; struct sk_buff *sk_send_head; __u32 sk_sndmsg_off; int sk_write_pending; +#ifdef CONFIG_SECURITY void *sk_security; +#endif + __u32 sk_mark; + /* XXX 4 bytes hole on 64 bit */ void (*sk_state_change)(struct sock *sk); void (*sk_data_ready)(struct sock *sk, int bytes); void (*sk_write_space)(struct sock *sk); @@ -274,12 +307,30 @@ static inline struct sock *sk_head(const struct hlist_head *head) return hlist_empty(head) ? NULL : __sk_head(head); } +static inline struct sock *__sk_nulls_head(const struct hlist_nulls_head *head) +{ + return hlist_nulls_entry(head->first, struct sock, sk_nulls_node); +} + +static inline struct sock *sk_nulls_head(const struct hlist_nulls_head *head) +{ + return hlist_nulls_empty(head) ? NULL : __sk_nulls_head(head); +} + static inline struct sock *sk_next(const struct sock *sk) { return sk->sk_node.next ? hlist_entry(sk->sk_node.next, struct sock, sk_node) : NULL; } +static inline struct sock *sk_nulls_next(const struct sock *sk) +{ + return (!is_a_nulls(sk->sk_nulls_node.next)) ? + hlist_nulls_entry(sk->sk_nulls_node.next, + struct sock, sk_nulls_node) : + NULL; +} + static inline int sk_unhashed(const struct sock *sk) { return hlist_unhashed(&sk->sk_node); @@ -295,6 +346,11 @@ static __inline__ void sk_node_init(struct hlist_node *node) node->pprev = NULL; } +static __inline__ void sk_nulls_node_init(struct hlist_nulls_node *node) +{ + node->pprev = NULL; +} + static __inline__ void __sk_del_node(struct sock *sk) { __hlist_del(&sk->sk_node); @@ -341,6 +397,27 @@ static __inline__ int sk_del_node_init(struct sock *sk) return rc; } +static __inline__ int __sk_nulls_del_node_init_rcu(struct sock *sk) +{ + if (sk_hashed(sk)) { + hlist_nulls_del_init_rcu(&sk->sk_nulls_node); + return 1; + } + return 0; +} + +static __inline__ int sk_nulls_del_node_init_rcu(struct sock *sk) +{ + int rc = __sk_nulls_del_node_init_rcu(sk); + + if (rc) { + /* paranoid for a while -acme */ + WARN_ON(atomic_read(&sk->sk_refcnt) == 1); + __sock_put(sk); + } + return rc; +} + static __inline__ void __sk_add_node(struct sock *sk, struct hlist_head *list) { hlist_add_head(&sk->sk_node, list); @@ -352,6 +429,17 @@ static __inline__ void sk_add_node(struct sock *sk, struct hlist_head *list) __sk_add_node(sk, list); } +static __inline__ void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) +{ + hlist_nulls_add_head_rcu(&sk->sk_nulls_node, list); +} + +static __inline__ void sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list) +{ + sock_hold(sk); + __sk_nulls_add_node_rcu(sk, list); +} + static __inline__ void __sk_del_bind_node(struct sock *sk) { __hlist_del(&sk->sk_bind_node); @@ -365,9 +453,16 @@ static __inline__ void sk_add_bind_node(struct sock *sk, #define sk_for_each(__sk, node, list) \ hlist_for_each_entry(__sk, node, list, sk_node) +#define sk_nulls_for_each(__sk, node, list) \ + hlist_nulls_for_each_entry(__sk, node, list, sk_nulls_node) +#define sk_nulls_for_each_rcu(__sk, node, list) \ + 
hlist_nulls_for_each_entry_rcu(__sk, node, list, sk_nulls_node) #define sk_for_each_from(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ hlist_for_each_entry_from(__sk, node, sk_node) +#define sk_nulls_for_each_from(__sk, node) \ + if (__sk && ({ node = &(__sk)->sk_nulls_node; 1; })) \ + hlist_nulls_for_each_entry_from(__sk, node, sk_nulls_node) #define sk_for_each_continue(__sk, node) \ if (__sk && ({ node = &(__sk)->sk_node; 1; })) \ hlist_for_each_entry_continue(__sk, node, sk_node) @@ -390,6 +485,7 @@ enum sock_flags { SOCK_USE_WRITE_QUEUE, /* whether to call sk->sk_write_space in sock_wfree */ SOCK_DBG, /* %SO_DEBUG setting */ SOCK_RCVTSTAMP, /* %SO_TIMESTAMP setting */ + SOCK_RCVTSTAMPNS, /* %SO_TIMESTAMPNS setting */ SOCK_LOCALROUTE, /* route locally only, %SO_DONTROUTE setting */ SOCK_QUEUE_SHRUNK, /* write queue has been shrunk recently */ }; @@ -426,7 +522,7 @@ static inline void sk_acceptq_added(struct sock *sk) static inline int sk_acceptq_is_full(struct sock *sk) { - return sk->sk_ack_backlog >= sk->sk_max_ack_backlog; + return sk->sk_ack_backlog > sk->sk_max_ack_backlog; } /* @@ -434,7 +530,7 @@ static inline int sk_acceptq_is_full(struct sock *sk) */ static inline int sk_stream_min_wspace(struct sock *sk) { - return sk->sk_wmem_queued / 2; + return sk->sk_wmem_queued >> 1; } static inline int sk_stream_wspace(struct sock *sk) @@ -449,25 +545,6 @@ static inline int sk_stream_memory_free(struct sock *sk) return sk->sk_wmem_queued < sk->sk_sndbuf; } -extern void sk_stream_rfree(struct sk_buff *skb); - -static inline void sk_stream_set_owner_r(struct sk_buff *skb, struct sock *sk) -{ - skb->sk = sk; - skb->destructor = sk_stream_rfree; - atomic_add(skb->truesize, &sk->sk_rmem_alloc); - sk->sk_forward_alloc -= skb->truesize; -} - -static inline void sk_stream_free_skb(struct sock *sk, struct sk_buff *skb) -{ - skb_truesize_check(skb); - sock_set_flag(sk, SOCK_QUEUE_SHRUNK); - sk->sk_wmem_queued -= skb->truesize; - sk->sk_forward_alloc += skb->truesize; - __kfree_skb(skb); -} - /* The per-socket spinlock must be held here. */ static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb) { @@ -480,17 +557,22 @@ static inline void sk_add_backlog(struct sock *sk, struct sk_buff *skb) skb->next = NULL; } -#define sk_wait_event(__sk, __timeo, __condition) \ -({ int rc; \ - release_sock(__sk); \ - rc = __condition; \ - if (!rc) { \ - *(__timeo) = schedule_timeout(*(__timeo)); \ - } \ - lock_sock(__sk); \ - rc = __condition; \ - rc; \ -}) +static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb) +{ + return sk->sk_backlog_rcv(sk, skb); +} + +#define sk_wait_event(__sk, __timeo, __condition) \ + ({ int __rc; \ + release_sock(__sk); \ + __rc = __condition; \ + if (!__rc) { \ + *(__timeo) = schedule_timeout(*(__timeo)); \ + } \ + lock_sock(__sk); \ + __rc = __condition; \ + __rc; \ + }) extern int sk_stream_wait_connect(struct sock *sk, long *timeo_p); extern int sk_stream_wait_memory(struct sock *sk, long *timeo_p); @@ -502,6 +584,8 @@ extern int sk_wait_data(struct sock *sk, long *timeo); struct request_sock_ops; struct timewait_sock_ops; +struct inet_hashinfo; +struct raw_hashinfo; /* Networking protocol blocks we attach to sockets. 
 *	socket layer -> transport layer interface
@@ -520,7 +604,7 @@ struct proto {
 	int			(*ioctl)(struct sock *sk, int cmd,
 					 unsigned long arg);
 	int			(*init)(struct sock *sk);
-	int			(*destroy)(struct sock *sk);
+	void			(*destroy)(struct sock *sk);
 	void			(*shutdown)(struct sock *sk, int how);
 	int			(*setsockopt)(struct sock *sk, int level,
 					int optname, char __user *optval,
@@ -528,6 +612,7 @@ struct proto {
 	int			(*getsockopt)(struct sock *sk, int level,
 					int optname, char __user *optval,
 					int __user *option);
+#ifdef CONFIG_COMPAT
 	int			(*compat_setsockopt)(struct sock *sk,
 					int level,
 					int optname, char __user *optval,
@@ -536,6 +621,7 @@ struct proto {
 					int level,
 					int optname, char __user *optval,
 					int __user *option);
+#endif
 	int			(*sendmsg)(struct kiocb *iocb, struct sock *sk,
 					   struct msghdr *msg, size_t len);
 	int			(*recvmsg)(struct kiocb *iocb, struct sock *sk,
@@ -555,14 +641,19 @@ struct proto {
 	void			(*unhash)(struct sock *sk);
 	int			(*get_port)(struct sock *sk, unsigned short snum);
 
+	/* Keeping track of sockets in use */
+#ifdef CONFIG_PROC_FS
+	unsigned int		inuse_idx;
+#endif
+
 	/* Memory pressure */
-	void			(*enter_memory_pressure)(void);
+	void			(*enter_memory_pressure)(struct sock *sk);
 	atomic_t		*memory_allocated;	/* Current allocated memory. */
-	atomic_t		*sockets_allocated;	/* Current number of sockets. */
+	struct percpu_counter	*sockets_allocated;	/* Current number of sockets. */
 	/*
 	 * Pressure flag: try to collapse.
 	 * Technical note: it is used by multiple contexts non atomically.
-	 * All the sk_stream_mem_schedule() is of this nature: accounting
+	 * All the __sk_mem_schedule() is of this nature: accounting
 	 * is strict, actions are advisory and have some latency.
 	 */
 	int			*memory_pressure;
@@ -571,14 +662,21 @@ struct proto {
 	int			*sysctl_rmem;
 	int			max_header;
 
-	struct kmem_cache		*slab;
+	struct kmem_cache	*slab;
 	unsigned int		obj_size;
+	int			slab_flags;
 
-	atomic_t		*orphan_count;
+	struct percpu_counter	*orphan_count;
 
 	struct request_sock_ops	*rsk_prot;
 	struct timewait_sock_ops *twsk_prot;
 
+	union {
+		struct inet_hashinfo	*hashinfo;
+		struct udp_table	*udp_table;
+		struct raw_hashinfo	*raw_hash;
+	} h;
+
 	struct module		*owner;
 
 	char			name[32];
@@ -587,10 +685,6 @@ struct proto {
 #ifdef SOCK_REFCNT_DEBUG
 	atomic_t		socks;
 #endif
-	struct {
-		int inuse;
-		u8  __pad[SMP_CACHE_BYTES - sizeof(int)];
-	} stats[NR_CPUS];
 };
 
 extern int proto_register(struct proto *prot, int alloc_slab);
@@ -621,16 +715,18 @@ static inline void sk_refcnt_debug_release(const struct sock *sk)
 #define sk_refcnt_debug_release(sk) do { } while (0)
 #endif /* SOCK_REFCNT_DEBUG */
 
+
+#ifdef CONFIG_PROC_FS
 /* Called with local bh disabled */
-static __inline__ void sock_prot_inc_use(struct proto *prot)
+extern void sock_prot_inuse_add(struct net *net, struct proto *prot, int inc);
+extern int sock_prot_inuse_get(struct net *net, struct proto *proto);
+#else
+static inline void sock_prot_inuse_add(struct net *net, struct proto *prot,
+		int inc)
 {
-	prot->stats[smp_processor_id()].inuse++;
 }
+#endif
 
-static __inline__ void sock_prot_dec_use(struct proto *prot)
-{
-	prot->stats[smp_processor_id()].inuse--;
-}
 
 /* With per-bucket locks this operation is not-atomic, so that
  * this version is not worse.
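The nulls machinery threaded through this patch (the skc_nulls_node union member, the sk_nulls_for_each*() iterators, and the new slab_flags field, which protocols use to request SLAB_DESTROY_BY_RCU semantics for their socket slab) exists so that the hash chains reachable through the h union above can be walked without taking the bucket lock. A minimal sketch of such a lockless lookup follows; lookup_example(), its slot/hash parameters, and the single-key check are illustrative stand-ins for the real UDP/TCP lookup routines, not code from this patch:

	static struct sock *lookup_example(struct hlist_nulls_head *chain,
					   unsigned int slot, unsigned int hash)
	{
		struct sock *sk;
		const struct hlist_nulls_node *node;

		rcu_read_lock();
	begin:
		sk_nulls_for_each_rcu(sk, node, chain) {
			if (sk->sk_hash != hash)
				continue;
			/* With SLAB_DESTROY_BY_RCU the slot can be recycled
			 * under us: take a reference, then re-check the key. */
			if (unlikely(!atomic_inc_not_zero(&sk->sk_refcnt)))
				goto begin;
			if (unlikely(sk->sk_hash != hash)) {
				sock_put(sk);
				goto begin;
			}
			rcu_read_unlock();
			return sk;
		}
		/*
		 * The nulls value terminating a chain encodes the bucket it
		 * belongs to; ending up with a different bucket's marker means
		 * the socket we followed was moved while we walked, so restart.
		 */
		if (get_nulls_value(node) != slot)
			goto begin;
		rcu_read_unlock();
		return NULL;
	}

Without the nulls marker, a walk that followed a recycled socket into another chain would simply terminate there and report a false miss; the get_nulls_value() check is what makes the lockless fast path safe.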
@@ -694,41 +790,80 @@ static inline struct inode *SOCK_INODE(struct socket *socket) return &container_of(socket, struct socket_alloc, socket)->vfs_inode; } -extern void __sk_stream_mem_reclaim(struct sock *sk); -extern int sk_stream_mem_schedule(struct sock *sk, int size, int kind); +/* + * Functions for memory accounting + */ +extern int __sk_mem_schedule(struct sock *sk, int size, int kind); +extern void __sk_mem_reclaim(struct sock *sk); + +#define SK_MEM_QUANTUM ((int)PAGE_SIZE) +#define SK_MEM_QUANTUM_SHIFT ilog2(SK_MEM_QUANTUM) +#define SK_MEM_SEND 0 +#define SK_MEM_RECV 1 -#define SK_STREAM_MEM_QUANTUM ((int)PAGE_SIZE) +static inline int sk_mem_pages(int amt) +{ + return (amt + SK_MEM_QUANTUM - 1) >> SK_MEM_QUANTUM_SHIFT; +} -static inline int sk_stream_pages(int amt) +static inline int sk_has_account(struct sock *sk) { - return (amt + SK_STREAM_MEM_QUANTUM - 1) / SK_STREAM_MEM_QUANTUM; + /* return true if protocol supports memory accounting */ + return !!sk->sk_prot->memory_allocated; } -static inline void sk_stream_mem_reclaim(struct sock *sk) +static inline int sk_wmem_schedule(struct sock *sk, int size) { - if (sk->sk_forward_alloc >= SK_STREAM_MEM_QUANTUM) - __sk_stream_mem_reclaim(sk); + if (!sk_has_account(sk)) + return 1; + return size <= sk->sk_forward_alloc || + __sk_mem_schedule(sk, size, SK_MEM_SEND); } -static inline void sk_stream_writequeue_purge(struct sock *sk) +static inline int sk_rmem_schedule(struct sock *sk, int size) { - struct sk_buff *skb; + if (!sk_has_account(sk)) + return 1; + return size <= sk->sk_forward_alloc || + __sk_mem_schedule(sk, size, SK_MEM_RECV); +} - while ((skb = __skb_dequeue(&sk->sk_write_queue)) != NULL) - sk_stream_free_skb(sk, skb); - sk_stream_mem_reclaim(sk); +static inline void sk_mem_reclaim(struct sock *sk) +{ + if (!sk_has_account(sk)) + return; + if (sk->sk_forward_alloc >= SK_MEM_QUANTUM) + __sk_mem_reclaim(sk); } -static inline int sk_stream_rmem_schedule(struct sock *sk, struct sk_buff *skb) +static inline void sk_mem_reclaim_partial(struct sock *sk) { - return (int)skb->truesize <= sk->sk_forward_alloc || - sk_stream_mem_schedule(sk, skb->truesize, 1); + if (!sk_has_account(sk)) + return; + if (sk->sk_forward_alloc > SK_MEM_QUANTUM) + __sk_mem_reclaim(sk); } -static inline int sk_stream_wmem_schedule(struct sock *sk, int size) +static inline void sk_mem_charge(struct sock *sk, int size) { - return size <= sk->sk_forward_alloc || - sk_stream_mem_schedule(sk, size, 0); + if (!sk_has_account(sk)) + return; + sk->sk_forward_alloc -= size; +} + +static inline void sk_mem_uncharge(struct sock *sk, int size) +{ + if (!sk_has_account(sk)) + return; + sk->sk_forward_alloc += size; +} + +static inline void sk_wmem_free_skb(struct sock *sk, struct sk_buff *skb) +{ + sock_set_flag(sk, SOCK_QUEUE_SHRUNK); + sk->sk_wmem_queued -= skb->truesize; + sk_mem_uncharge(sk, skb->truesize); + __kfree_skb(skb); } /* Used by processes to "lock" a socket state, so that @@ -744,7 +879,7 @@ static inline int sk_stream_wmem_schedule(struct sock *sk, int size) * Since ~2.3.5 it is also exclusive sleep lock serializing * accesses from user process context. 
*/ -#define sock_owned_by_user(sk) ((sk)->sk_lock.owner) +#define sock_owned_by_user(sk) ((sk)->sk_lock.owned) /* * Macro so as to not evaluate some arguments when @@ -755,7 +890,7 @@ static inline int sk_stream_wmem_schedule(struct sock *sk, int size) */ #define sock_lock_init_class_and_name(sk, sname, skey, name, key) \ do { \ - sk->sk_lock.owner = NULL; \ + sk->sk_lock.owned = 0; \ init_waitqueue_head(&sk->sk_lock.wq); \ spin_lock_init(&(sk)->sk_lock.slock); \ debug_check_no_locks_freed((void *)&(sk)->sk_lock, \ @@ -765,14 +900,14 @@ do { \ lockdep_init_map(&(sk)->sk_lock.dep_map, (name), (key), 0); \ } while (0) -extern void FASTCALL(lock_sock_nested(struct sock *sk, int subclass)); +extern void lock_sock_nested(struct sock *sk, int subclass); static inline void lock_sock(struct sock *sk) { lock_sock_nested(sk, 0); } -extern void FASTCALL(release_sock(struct sock *sk)); +extern void release_sock(struct sock *sk); /* BH context may only use the following locking interface. */ #define bh_lock_sock(__sk) spin_lock(&((__sk)->sk_lock.slock)) @@ -781,10 +916,11 @@ extern void FASTCALL(release_sock(struct sock *sk)); SINGLE_DEPTH_NESTING) #define bh_unlock_sock(__sk) spin_unlock(&((__sk)->sk_lock.slock)) -extern struct sock *sk_alloc(int family, +extern struct sock *sk_alloc(struct net *net, int family, gfp_t priority, - struct proto *prot, int zero_it); + struct proto *prot); extern void sk_free(struct sock *sk); +extern void sk_release_kernel(struct sock *sk); extern struct sock *sk_clone(const struct sock *sk, const gfp_t priority); @@ -874,66 +1010,24 @@ extern void sk_common_release(struct sock *sk); extern void sock_init_data(struct socket *sock, struct sock *sk); /** - * sk_filter - run a packet through a socket filter - * @sk: sock associated with &sk_buff - * @skb: buffer to filter - * @needlock: set to 1 if the sock is not locked by caller. - * - * Run the filter code and then cut skb->data to correct size returned by - * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller - * than pkt_len we keep whole skb->data. This is the socket level - * wrapper to sk_run_filter. It returns 0 if the packet should - * be accepted or -EPERM if the packet should be tossed. - * - */ - -static inline int sk_filter(struct sock *sk, struct sk_buff *skb) -{ - int err; - struct sk_filter *filter; - - err = security_sock_rcv_skb(sk, skb); - if (err) - return err; - - rcu_read_lock_bh(); - filter = sk->sk_filter; - if (filter) { - unsigned int pkt_len = sk_run_filter(skb, filter->insns, - filter->len); - err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM; - } - rcu_read_unlock_bh(); - - return err; -} - -/** - * sk_filter_rcu_free: Free a socket filter - * @rcu: rcu_head that contains the sk_filter to free - */ -static inline void sk_filter_rcu_free(struct rcu_head *rcu) -{ - struct sk_filter *fp = container_of(rcu, struct sk_filter, rcu); - kfree(fp); -} - -/** * sk_filter_release: Release a socket filter - * @sk: socket * @fp: filter to remove * * Remove a filter from a socket and release its resources. 
*/ -static inline void sk_filter_release(struct sock *sk, struct sk_filter *fp) +static inline void sk_filter_release(struct sk_filter *fp) +{ + if (atomic_dec_and_test(&fp->refcnt)) + kfree(fp); +} + +static inline void sk_filter_uncharge(struct sock *sk, struct sk_filter *fp) { unsigned int size = sk_filter_len(fp); atomic_sub(size, &sk->sk_omem_alloc); - - if (atomic_dec_and_test(&fp->refcnt)) - call_rcu_bh(&fp->rcu, sk_filter_rcu_free); + sk_filter_release(fp); } static inline void sk_filter_charge(struct sock *sk, struct sk_filter *fp) @@ -977,6 +1071,11 @@ static inline void sock_put(struct sock *sk) extern int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested); +static inline void sk_set_socket(struct sock *sk, struct socket *sock) +{ + sk->sk_socket = sock; +} + /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. * Note that parent inode held reference count on this struct sock, @@ -988,7 +1087,7 @@ static inline void sock_orphan(struct sock *sk) { write_lock_bh(&sk->sk_callback_lock); sock_set_flag(sk, SOCK_DEAD); - sk->sk_socket = NULL; + sk_set_socket(sk, NULL); sk->sk_sleep = NULL; write_unlock_bh(&sk->sk_callback_lock); } @@ -998,24 +1097,11 @@ static inline void sock_graft(struct sock *sk, struct socket *parent) write_lock_bh(&sk->sk_callback_lock); sk->sk_sleep = &parent->wait; parent->sk = sk; - sk->sk_socket = parent; + sk_set_socket(sk, parent); security_sock_graft(sk, parent); write_unlock_bh(&sk->sk_callback_lock); } -static inline void sock_copy(struct sock *nsk, const struct sock *osk) -{ -#ifdef CONFIG_SECURITY_NETWORK - void *sptr = nsk->sk_security; -#endif - - memcpy(nsk, osk, osk->sk_prot->obj_size); -#ifdef CONFIG_SECURITY_NETWORK - nsk->sk_security = sptr; - security_sk_clone(osk, nsk); -#endif -} - extern int sock_i_uid(struct sock *sk); extern unsigned long sock_i_ino(struct sock *sk); @@ -1083,25 +1169,7 @@ static inline int sk_can_gso(const struct sock *sk) return net_gso_ok(sk->sk_route_caps, sk->sk_gso_type); } -static inline void sk_setup_caps(struct sock *sk, struct dst_entry *dst) -{ - __sk_dst_set(sk, dst); - sk->sk_route_caps = dst->dev->features; - if (sk->sk_route_caps & NETIF_F_GSO) - sk->sk_route_caps |= NETIF_F_GSO_MASK; - if (sk_can_gso(sk)) { - if (dst->header_len) - sk->sk_route_caps &= ~NETIF_F_GSO_MASK; - else - sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM; - } -} - -static inline void sk_charge_skb(struct sock *sk, struct sk_buff *skb) -{ - sk->sk_wmem_queued += skb->truesize; - sk->sk_forward_alloc -= skb->truesize; -} +extern void sk_setup_caps(struct sock *sk, struct dst_entry *dst); static inline int skb_copy_to_page(struct sock *sk, char __user *from, struct sk_buff *skb, struct page *page, @@ -1122,7 +1190,7 @@ static inline int skb_copy_to_page(struct sock *sk, char __user *from, skb->data_len += copy; skb->truesize += copy; sk->sk_wmem_queued += copy; - sk->sk_forward_alloc -= copy; + sk_mem_charge(sk, copy); return 0; } @@ -1148,6 +1216,7 @@ static inline void skb_set_owner_r(struct sk_buff *skb, struct sock *sk) skb->sk = sk; skb->destructor = sock_rfree; atomic_add(skb->truesize, &sk->sk_rmem_alloc); + sk_mem_charge(sk, skb->truesize); } extern void sk_reset_timer(struct sock *sk, struct timer_list* timer, @@ -1209,40 +1278,12 @@ static inline void sk_wake_async(struct sock *sk, int how, int band) static inline void sk_stream_moderate_sndbuf(struct sock *sk) { if (!(sk->sk_userlocks & SOCK_SNDBUF_LOCK)) { - sk->sk_sndbuf = min(sk->sk_sndbuf, 
sk->sk_wmem_queued / 2); + sk->sk_sndbuf = min(sk->sk_sndbuf, sk->sk_wmem_queued >> 1); sk->sk_sndbuf = max(sk->sk_sndbuf, SOCK_MIN_SNDBUF); } } -static inline struct sk_buff *sk_stream_alloc_pskb(struct sock *sk, - int size, int mem, - gfp_t gfp) -{ - struct sk_buff *skb; - int hdr_len; - - hdr_len = SKB_DATA_ALIGN(sk->sk_prot->max_header); - skb = alloc_skb_fclone(size + hdr_len, gfp); - if (skb) { - skb->truesize += mem; - if (sk_stream_wmem_schedule(sk, skb->truesize)) { - skb_reserve(skb, hdr_len); - return skb; - } - __kfree_skb(skb); - } else { - sk->sk_prot->enter_memory_pressure(); - sk_stream_moderate_sndbuf(sk); - } - return NULL; -} - -static inline struct sk_buff *sk_stream_alloc_skb(struct sock *sk, - int size, - gfp_t gfp) -{ - return sk_stream_alloc_pskb(sk, size, 0, gfp); -} +struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp); static inline struct page *sk_stream_alloc_page(struct sock *sk) { @@ -1250,35 +1291,23 @@ static inline struct page *sk_stream_alloc_page(struct sock *sk) page = alloc_pages(sk->sk_allocation, 0); if (!page) { - sk->sk_prot->enter_memory_pressure(); + sk->sk_prot->enter_memory_pressure(sk); sk_stream_moderate_sndbuf(sk); } return page; } -#define sk_stream_for_retrans_queue(skb, sk) \ - for (skb = (sk)->sk_write_queue.next; \ - (skb != (sk)->sk_send_head) && \ - (skb != (struct sk_buff *)&(sk)->sk_write_queue); \ - skb = skb->next) - -/*from STCP for fast SACK Process*/ -#define sk_stream_for_retrans_queue_from(skb, sk) \ - for (; (skb != (sk)->sk_send_head) && \ - (skb != (struct sk_buff *)&(sk)->sk_write_queue); \ - skb = skb->next) - /* * Default write policy as shown to user space via poll/select/SIGIO */ static inline int sock_writeable(const struct sock *sk) { - return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf / 2); + return atomic_read(&sk->sk_wmem_alloc) < (sk->sk_sndbuf >> 1); } static inline gfp_t gfp_any(void) { - return in_atomic() ? GFP_ATOMIC : GFP_KERNEL; + return in_softirq() ? GFP_ATOMIC : GFP_KERNEL; } static inline long sock_rcvtimeo(const struct sock *sk, int noblock) @@ -1304,22 +1333,18 @@ static inline int sock_intr_errno(long timeo) return timeo == MAX_SCHEDULE_TIMEOUT ? -ERESTARTSYS : -EINTR; } +extern void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb); + static __inline__ void sock_recv_timestamp(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) { - struct timeval stamp; + ktime_t kt = skb->tstamp; - skb_get_timestamp(skb, &stamp); - if (sock_flag(sk, SOCK_RCVTSTAMP)) { - /* Race occurred between timestamp enabling and packet - receiving. Fill in the current time for now. */ - if (stamp.tv_sec == 0) - do_gettimeofday(&stamp); - skb_set_timestamp(skb, &stamp); - put_cmsg(msg, SOL_SOCKET, SO_TIMESTAMP, sizeof(struct timeval), - &stamp); - } else - sk->sk_stamp = stamp; + if (sock_flag(sk, SOCK_RCVTSTAMP)) + __sock_recv_timestamp(msg, sk, skb); + else + sk->sk_stamp = kt; } /** @@ -1348,71 +1373,67 @@ static inline void sk_eat_skb(struct sock *sk, struct sk_buff *skb, int copied_e } #endif -extern void sock_enable_timestamp(struct sock *sk); -extern int sock_get_timestamp(struct sock *, struct timeval __user *); - -/* - * Enable debug/info messages - */ - -#ifdef CONFIG_NETDEBUG -#define NETDEBUG(fmt, args...) printk(fmt,##args) -#define LIMIT_NETDEBUG(fmt, args...) 
-	do { if (net_ratelimit()) printk(fmt,##args); } while(0)
+static inline
+struct net *sock_net(const struct sock *sk)
+{
+#ifdef CONFIG_NET_NS
+	return sk->sk_net;
 #else
-#define NETDEBUG(fmt, args...)	do { } while (0)
-#define LIMIT_NETDEBUG(fmt, args...) do { } while(0)
+	return &init_net;
 #endif
+}
+
+static inline
+void sock_net_set(struct sock *sk, struct net *net)
+{
+#ifdef CONFIG_NET_NS
+	sk->sk_net = net;
+#endif
+}
 
 /*
- * Macros for sleeping on a socket. Use them like this:
- *
- *      SOCK_SLEEP_PRE(sk)
- *      if (condition)
- *              schedule();
- *      SOCK_SLEEP_POST(sk)
- *
- * N.B. These are now obsolete and were, afaik, only ever used in DECnet
- * and when the last use of them in DECnet has gone, I'm intending to
- * remove them.
+ * Kernel sockets, e.g. rtnl or icmp_socket, are a part of a namespace.
+ * They should not hold a reference to a namespace in order to allow
+ * the namespace to be stopped.
+ * Sockets after sk_change_net() should be released using sk_release_kernel().
  */
+static inline void sk_change_net(struct sock *sk, struct net *net)
+{
+	put_net(sock_net(sk));
+	sock_net_set(sk, hold_net(net));
+}
 
-#define SOCK_SLEEP_PRE(sk) 	{ struct task_struct *tsk = current; \
-				DECLARE_WAITQUEUE(wait, tsk); \
-				tsk->state = TASK_INTERRUPTIBLE; \
-				add_wait_queue((sk)->sk_sleep, &wait); \
-				release_sock(sk);
-
-#define SOCK_SLEEP_POST(sk)	tsk->state = TASK_RUNNING; \
-				remove_wait_queue((sk)->sk_sleep, &wait); \
-				lock_sock(sk); \
-				}
-
-static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
+static inline struct sock *skb_steal_sock(struct sk_buff *skb)
 {
-	if (valbool)
-		sock_set_flag(sk, bit);
-	else
-		sock_reset_flag(sk, bit);
+	if (unlikely(skb->sk)) {
+		struct sock *sk = skb->sk;
+
+		skb->destructor = NULL;
+		skb->sk = NULL;
+		return sk;
+	}
+	return NULL;
 }
 
+extern void sock_enable_timestamp(struct sock *sk);
+extern int sock_get_timestamp(struct sock *, struct timeval __user *);
+extern int sock_get_timestampns(struct sock *, struct timespec __user *);
+
+/*
+ *	Enable debug/info messages
+ */
+extern int net_msg_warn;
+#define NETDEBUG(fmt, args...) \
+	do { if (net_msg_warn) printk(fmt,##args); } while (0)
+
+#define LIMIT_NETDEBUG(fmt, args...) \
+	do { if (net_msg_warn && net_ratelimit()) printk(fmt,##args); } while(0)
+
 extern __u32 sysctl_wmem_max;
 extern __u32 sysctl_rmem_max;
 
-#ifdef CONFIG_NET
-int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg);
-#else
-static inline int siocdevprivate_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg)
-{
-	return -ENODEV;
-}
-#endif
-
 extern void sk_init(void);
 
-#ifdef CONFIG_SYSCTL
-extern struct ctl_table core_table[];
-#endif
-
 extern int sysctl_optmem_max;
 
 extern __u32 sysctl_wmem_default
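The namespace helpers added above are easy to misuse, so a short sketch of the intended lifecycle may help. The names example_create_ns_sock() and example_destroy_ns_sock() are hypothetical; the pattern loosely follows what inet_ctl_sock_create() does for per-namespace control sockets:

	static int example_create_ns_sock(struct net *net, struct sock **skp)
	{
		struct socket *sock;
		int err;

		/* A kernel socket starts out pinning init_net via get_net(). */
		err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
		if (err)
			return err;

		/* Re-home the socket: sk_change_net() puts the pinned
		 * reference and replaces it with hold_net(), which only
		 * marks the namespace as in use and does not prevent it
		 * from being stopped. */
		sk_change_net(sock->sk, net);
		*skp = sock->sk;
		return 0;
	}

	static void example_destroy_ns_sock(struct sock *sk)
	{
		/* Sockets that went through sk_change_net() must be freed
		 * with sk_release_kernel(), not a plain sock_put(): it
		 * detaches the socket from the (possibly dying) namespace
		 * before the final release. */
		sk_release_kernel(sk);
	}

This is the point of the comment above sk_change_net(): the namespace's own exit code is expected to tear these sockets down, which it could never do if each of them held a full get_net() reference keeping the namespace alive.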