pkt_sched: ingress socket filter by mark
net/core/sock.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              Generic socket support routines. Memory allocators, socket lock/release
7  *              handler for protocols to use and generic option handler.
8  *
9  *
10  * Authors:     Ross Biro
11  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12  *              Florian La Roche, <flla@stud.uni-sb.de>
13  *              Alan Cox, <A.Cox@swansea.ac.uk>
14  *
15  * Fixes:
16  *              Alan Cox        :       Numerous verify_area() problems
17  *              Alan Cox        :       Connecting on a connecting socket
18  *                                      now returns an error for tcp.
19  *              Alan Cox        :       sock->protocol is set correctly.
20  *                                      and is not sometimes left as 0.
21  *              Alan Cox        :       connect handles icmp errors on a
22  *                                      connect properly. Unfortunately there
23  *                                      is a restart syscall nasty there. I
24  *                                      can't match BSD without hacking the C
25  *                                      library. Ideas urgently sought!
26  *              Alan Cox        :       Disallow bind() to addresses that are
27  *                                      not ours - especially broadcast ones!!
28  *              Alan Cox        :       Socket 1024 _IS_ ok for users. (fencepost)
29  *              Alan Cox        :       sock_wfree/sock_rfree don't destroy sockets,
30  *                                      instead they leave that for the DESTROY timer.
31  *              Alan Cox        :       Clean up error flag in accept
32  *              Alan Cox        :       TCP ack handling is buggy, the DESTROY timer
33  *                                      was buggy. Put a remove_sock() in the handler
34  *                                      for memory when we hit 0. Also altered the timer
35  *                                      code. The ACK stuff can wait and needs major
36  *                                      TCP layer surgery.
37  *              Alan Cox        :       Fixed TCP ack bug, removed remove sock
38  *                                      and fixed timer/inet_bh race.
39  *              Alan Cox        :       Added zapped flag for TCP
40  *              Alan Cox        :       Move kfree_skb into skbuff.c and tidied up surplus code
41  *              Alan Cox        :       for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42  *              Alan Cox        :       kfree_s calls now are kfree_skbmem so we can track skb resources
43  *              Alan Cox        :       Supports socket option broadcast now as does udp. Packet and raw need fixing.
44  *              Alan Cox        :       Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45  *              Rick Sladkey    :       Relaxed UDP rules for matching packets.
46  *              C.E.Hawkins     :       IFF_PROMISC/SIOCGHWADDR support
47  *      Pauline Middelink       :       identd support
48  *              Alan Cox        :       Fixed connect() taking signals I think.
49  *              Alan Cox        :       SO_LINGER supported
50  *              Alan Cox        :       Error reporting fixes
51  *              Anonymous       :       inet_create tidied up (sk->reuse setting)
52  *              Alan Cox        :       inet sockets don't set sk->type!
53  *              Alan Cox        :       Split socket option code
54  *              Alan Cox        :       Callbacks
55  *              Alan Cox        :       Nagle flag for Charles & Johannes stuff
56  *              Alex            :       Removed restriction on inet fioctl
57  *              Alan Cox        :       Splitting INET from NET core
58  *              Alan Cox        :       Fixed bogus SO_TYPE handling in getsockopt()
59  *              Adam Caldwell   :       Missing return in SO_DONTROUTE/SO_DEBUG code
60  *              Alan Cox        :       Split IP from generic code
61  *              Alan Cox        :       New kfree_skbmem()
62  *              Alan Cox        :       Make SO_DEBUG superuser only.
63  *              Alan Cox        :       Allow anyone to clear SO_DEBUG
64  *                                      (compatibility fix)
65  *              Alan Cox        :       Added optimistic memory grabbing for AF_UNIX throughput.
66  *              Alan Cox        :       Allocator for a socket is settable.
67  *              Alan Cox        :       SO_ERROR includes soft errors.
68  *              Alan Cox        :       Allow NULL arguments on some SO_ opts
69  *              Alan Cox        :       Generic socket allocation to make hooks
70  *                                      easier (suggested by Craig Metz).
71  *              Michael Pall    :       SO_ERROR returns positive errno again
72  *              Steve Whitehouse:       Added default destructor to free
73  *                                      protocol private data.
74  *              Steve Whitehouse:       Added various other default routines
75  *                                      common to several socket families.
76  *              Chris Evans     :       Call suser() check last on F_SETOWN
77  *              Jay Schulist    :       Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78  *              Andi Kleen      :       Add sock_kmalloc()/sock_kfree_s()
79  *              Andi Kleen      :       Fix write_space callback
80  *              Chris Evans     :       Security fixes - signedness again
81  *              Arnaldo C. Melo :       cleanups, use skb_queue_purge
82  *
83  * To Fix:
84  *
85  *
86  *              This program is free software; you can redistribute it and/or
87  *              modify it under the terms of the GNU General Public License
88  *              as published by the Free Software Foundation; either version
89  *              2 of the License, or (at your option) any later version.
90  */
91
92 #include <linux/capability.h>
93 #include <linux/errno.h>
94 #include <linux/types.h>
95 #include <linux/socket.h>
96 #include <linux/in.h>
97 #include <linux/kernel.h>
98 #include <linux/module.h>
99 #include <linux/proc_fs.h>
100 #include <linux/seq_file.h>
101 #include <linux/sched.h>
102 #include <linux/timer.h>
103 #include <linux/string.h>
104 #include <linux/sockios.h>
105 #include <linux/net.h>
106 #include <linux/mm.h>
107 #include <linux/slab.h>
108 #include <linux/interrupt.h>
109 #include <linux/poll.h>
110 #include <linux/tcp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113
114 #include <asm/uaccess.h>
115 #include <asm/system.h>
116
117 #include <linux/netdevice.h>
118 #include <net/protocol.h>
119 #include <linux/skbuff.h>
120 #include <net/net_namespace.h>
121 #include <net/request_sock.h>
122 #include <net/sock.h>
123 #include <linux/net_tstamp.h>
124 #include <net/xfrm.h>
125 #include <linux/ipsec.h>
126
127 #include <linux/filter.h>
128
129 #ifdef CONFIG_INET
130 #include <net/tcp.h>
131 #endif
132
133 /*
134  * Each address family might have different locking rules, so we have
135  * one slock key per address family:
136  */
137 static struct lock_class_key af_family_keys[AF_MAX];
138 static struct lock_class_key af_family_slock_keys[AF_MAX];
139
140 /*
141  * Make lock validator output more readable. (we pre-construct these
142  * strings at build time, so that runtime initialization of socket
143  * locks is fast):
144  */
145 static const char *const af_family_key_strings[AF_MAX+1] = {
146   "sk_lock-AF_UNSPEC", "sk_lock-AF_UNIX"     , "sk_lock-AF_INET"     ,
147   "sk_lock-AF_AX25"  , "sk_lock-AF_IPX"      , "sk_lock-AF_APPLETALK",
148   "sk_lock-AF_NETROM", "sk_lock-AF_BRIDGE"   , "sk_lock-AF_ATMPVC"   ,
149   "sk_lock-AF_X25"   , "sk_lock-AF_INET6"    , "sk_lock-AF_ROSE"     ,
150   "sk_lock-AF_DECnet", "sk_lock-AF_NETBEUI"  , "sk_lock-AF_SECURITY" ,
151   "sk_lock-AF_KEY"   , "sk_lock-AF_NETLINK"  , "sk_lock-AF_PACKET"   ,
152   "sk_lock-AF_ASH"   , "sk_lock-AF_ECONET"   , "sk_lock-AF_ATMSVC"   ,
153   "sk_lock-AF_RDS"   , "sk_lock-AF_SNA"      , "sk_lock-AF_IRDA"     ,
154   "sk_lock-AF_PPPOX" , "sk_lock-AF_WANPIPE"  , "sk_lock-AF_LLC"      ,
155   "sk_lock-27"       , "sk_lock-28"          , "sk_lock-AF_CAN"      ,
156   "sk_lock-AF_TIPC"  , "sk_lock-AF_BLUETOOTH", "sk_lock-AF_IUCV"     ,
157   "sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN"     , "sk_lock-AF_PHONET"   ,
158   "sk_lock-AF_IEEE802154",
159   "sk_lock-AF_MAX"
160 };
161 static const char *const af_family_slock_key_strings[AF_MAX+1] = {
162   "slock-AF_UNSPEC", "slock-AF_UNIX"     , "slock-AF_INET"     ,
163   "slock-AF_AX25"  , "slock-AF_IPX"      , "slock-AF_APPLETALK",
164   "slock-AF_NETROM", "slock-AF_BRIDGE"   , "slock-AF_ATMPVC"   ,
165   "slock-AF_X25"   , "slock-AF_INET6"    , "slock-AF_ROSE"     ,
166   "slock-AF_DECnet", "slock-AF_NETBEUI"  , "slock-AF_SECURITY" ,
167   "slock-AF_KEY"   , "slock-AF_NETLINK"  , "slock-AF_PACKET"   ,
168   "slock-AF_ASH"   , "slock-AF_ECONET"   , "slock-AF_ATMSVC"   ,
169   "slock-AF_RDS"   , "slock-AF_SNA"      , "slock-AF_IRDA"     ,
170   "slock-AF_PPPOX" , "slock-AF_WANPIPE"  , "slock-AF_LLC"      ,
171   "slock-27"       , "slock-28"          , "slock-AF_CAN"      ,
172   "slock-AF_TIPC"  , "slock-AF_BLUETOOTH", "slock-AF_IUCV"     ,
173   "slock-AF_RXRPC" , "slock-AF_ISDN"     , "slock-AF_PHONET"   ,
174   "slock-AF_IEEE802154",
175   "slock-AF_MAX"
176 };
177 static const char *const af_family_clock_key_strings[AF_MAX+1] = {
178   "clock-AF_UNSPEC", "clock-AF_UNIX"     , "clock-AF_INET"     ,
179   "clock-AF_AX25"  , "clock-AF_IPX"      , "clock-AF_APPLETALK",
180   "clock-AF_NETROM", "clock-AF_BRIDGE"   , "clock-AF_ATMPVC"   ,
181   "clock-AF_X25"   , "clock-AF_INET6"    , "clock-AF_ROSE"     ,
182   "clock-AF_DECnet", "clock-AF_NETBEUI"  , "clock-AF_SECURITY" ,
183   "clock-AF_KEY"   , "clock-AF_NETLINK"  , "clock-AF_PACKET"   ,
184   "clock-AF_ASH"   , "clock-AF_ECONET"   , "clock-AF_ATMSVC"   ,
185   "clock-AF_RDS"   , "clock-AF_SNA"      , "clock-AF_IRDA"     ,
186   "clock-AF_PPPOX" , "clock-AF_WANPIPE"  , "clock-AF_LLC"      ,
187   "clock-27"       , "clock-28"          , "clock-AF_CAN"      ,
188   "clock-AF_TIPC"  , "clock-AF_BLUETOOTH", "clock-AF_IUCV"     ,
189   "clock-AF_RXRPC" , "clock-AF_ISDN"     , "clock-AF_PHONET"   ,
190   "clock-AF_IEEE802154",
191   "clock-AF_MAX"
192 };
193
194 /*
195  * sk_callback_lock locking rules are per-address-family,
196  * so split the lock classes by using a per-AF key:
197  */
198 static struct lock_class_key af_callback_keys[AF_MAX];
199
200 /* Take into consideration the size of the struct sk_buff overhead in the
201  * determination of these values, since that is non-constant across
202  * platforms.  This makes socket queueing behavior and performance
203  * not depend upon such differences.
204  */
205 #define _SK_MEM_PACKETS         256
206 #define _SK_MEM_OVERHEAD        (sizeof(struct sk_buff) + 256)
207 #define SK_WMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
208 #define SK_RMEM_MAX             (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
209
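/*
 * Worked example (illustrative addition, not part of the original file):
 * sizeof(struct sk_buff) varies by architecture and configuration; assuming
 * it is roughly 232 bytes,
 *
 *      _SK_MEM_OVERHEAD = 232 + 256             = 488 bytes
 *      SK_WMEM_MAX      = 488 * _SK_MEM_PACKETS = 488 * 256 = 124928 bytes
 *
 * i.e. the default buffer limits below are sized to hold about
 * _SK_MEM_PACKETS (256) queued packets regardless of the per-platform
 * sk_buff size.
 */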
210 /* Run time adjustable parameters. */
211 __u32 sysctl_wmem_max __read_mostly = SK_WMEM_MAX;
212 __u32 sysctl_rmem_max __read_mostly = SK_RMEM_MAX;
213 __u32 sysctl_wmem_default __read_mostly = SK_WMEM_MAX;
214 __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
215
216 /* Maximal space eaten by iovec or ancillary data plus some space */
217 int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
218 EXPORT_SYMBOL(sysctl_optmem_max);
219
220 static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
221 {
222         struct timeval tv;
223
224         if (optlen < sizeof(tv))
225                 return -EINVAL;
226         if (copy_from_user(&tv, optval, sizeof(tv)))
227                 return -EFAULT;
228         if (tv.tv_usec < 0 || tv.tv_usec >= USEC_PER_SEC)
229                 return -EDOM;
230
231         if (tv.tv_sec < 0) {
232                 static int warned __read_mostly;
233
234                 *timeo_p = 0;
235                 if (warned < 10 && net_ratelimit()) {
236                         warned++;
237                         printk(KERN_INFO "sock_set_timeout: `%s' (pid %d) "
238                                "tries to set negative timeout\n",
239                                 current->comm, task_pid_nr(current));
240                 }
241                 return 0;
242         }
243         *timeo_p = MAX_SCHEDULE_TIMEOUT;
244         if (tv.tv_sec == 0 && tv.tv_usec == 0)
245                 return 0;
246         if (tv.tv_sec < (MAX_SCHEDULE_TIMEOUT/HZ - 1))
247                 *timeo_p = tv.tv_sec*HZ + (tv.tv_usec+(1000000/HZ-1))/(1000000/HZ);
248         return 0;
249 }
250
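/*
 * Illustrative userspace sketch (an added example, not part of the original
 * file; "fd" is assumed to be an open socket). It shows the struct timeval
 * that sock_set_timeout() above parses for SO_RCVTIMEO/SO_SNDTIMEO:
 * tv_usec outside [0, USEC_PER_SEC) yields -EDOM, and {0, 0} means no
 * timeout at all (MAX_SCHEDULE_TIMEOUT).
 */
#if 0
#include <sys/socket.h>
#include <sys/time.h>

static int example_set_rcvtimeo(int fd)
{
        struct timeval tv = { .tv_sec = 2, .tv_usec = 500000 };  /* 2.5s */

        return setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, &tv, sizeof(tv));
}
#endif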
251 static void sock_warn_obsolete_bsdism(const char *name)
252 {
253         static int warned;
254         static char warncomm[TASK_COMM_LEN];
255         if (strcmp(warncomm, current->comm) && warned < 5) {
256                 strcpy(warncomm,  current->comm);
257                 printk(KERN_WARNING "process `%s' is using obsolete "
258                        "%s SO_BSDCOMPAT\n", warncomm, name);
259                 warned++;
260         }
261 }
262
263 static void sock_disable_timestamp(struct sock *sk, int flag)
264 {
265         if (sock_flag(sk, flag)) {
266                 sock_reset_flag(sk, flag);
267                 if (!sock_flag(sk, SOCK_TIMESTAMP) &&
268                     !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
269                         net_disable_timestamp();
270                 }
271         }
272 }
273
274
275 int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
276 {
277         int err;
278         int skb_len;
279         unsigned long flags;
280         struct sk_buff_head *list = &sk->sk_receive_queue;
281
282         /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
283            number of warnings when compiling with -W --ANK
284          */
285         if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
286             (unsigned)sk->sk_rcvbuf) {
287                 atomic_inc(&sk->sk_drops);
288                 return -ENOMEM;
289         }
290
291         err = sk_filter(sk, skb);
292         if (err)
293                 return err;
294
295         if (!sk_rmem_schedule(sk, skb->truesize)) {
296                 atomic_inc(&sk->sk_drops);
297                 return -ENOBUFS;
298         }
299
300         skb->dev = NULL;
301         skb_set_owner_r(skb, sk);
302
303         /* Cache the SKB length before we tack it onto the receive
304          * queue.  Once it is added it no longer belongs to us and
305          * may be freed by other threads of control pulling packets
306          * from the queue.
307          */
308         skb_len = skb->len;
309
310         spin_lock_irqsave(&list->lock, flags);
311         skb->dropcount = atomic_read(&sk->sk_drops);
312         __skb_queue_tail(list, skb);
313         spin_unlock_irqrestore(&list->lock, flags);
314
315         if (!sock_flag(sk, SOCK_DEAD))
316                 sk->sk_data_ready(sk, skb_len);
317         return 0;
318 }
319 EXPORT_SYMBOL(sock_queue_rcv_skb);
320
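/*
 * Minimal sketch of how a protocol receive path typically calls
 * sock_queue_rcv_skb() (hypothetical helper, an added example): on a
 * non-zero return the skb was not queued and the caller still owns it.
 */
#if 0
static int example_proto_rcv(struct sock *sk, struct sk_buff *skb)
{
        int err = sock_queue_rcv_skb(sk, skb);

        if (err < 0) {
                /* rcvbuf full, no memory, or dropped by the socket filter */
                kfree_skb(skb);
                return err;
        }
        return 0;
}
#endif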
321 int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested)
322 {
323         int rc = NET_RX_SUCCESS;
324
325         if (sk_filter(sk, skb))
326                 goto discard_and_relse;
327
328         skb->dev = NULL;
329
330         if (nested)
331                 bh_lock_sock_nested(sk);
332         else
333                 bh_lock_sock(sk);
334         if (!sock_owned_by_user(sk)) {
335                 /*
336                  * trylock + unlock semantics:
337                  */
338                 mutex_acquire(&sk->sk_lock.dep_map, 0, 1, _RET_IP_);
339
340                 rc = sk_backlog_rcv(sk, skb);
341
342                 mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
343         } else
344                 sk_add_backlog(sk, skb);
345         bh_unlock_sock(sk);
346 out:
347         sock_put(sk);
348         return rc;
349 discard_and_relse:
350         kfree_skb(skb);
351         goto out;
352 }
353 EXPORT_SYMBOL(sk_receive_skb);
354
355 struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
356 {
357         struct dst_entry *dst = sk->sk_dst_cache;
358
359         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
360                 sk->sk_dst_cache = NULL;
361                 dst_release(dst);
362                 return NULL;
363         }
364
365         return dst;
366 }
367 EXPORT_SYMBOL(__sk_dst_check);
368
369 struct dst_entry *sk_dst_check(struct sock *sk, u32 cookie)
370 {
371         struct dst_entry *dst = sk_dst_get(sk);
372
373         if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
374                 sk_dst_reset(sk);
375                 dst_release(dst);
376                 return NULL;
377         }
378
379         return dst;
380 }
381 EXPORT_SYMBOL(sk_dst_check);
382
383 static int sock_bindtodevice(struct sock *sk, char __user *optval, int optlen)
384 {
385         int ret = -ENOPROTOOPT;
386 #ifdef CONFIG_NETDEVICES
387         struct net *net = sock_net(sk);
388         char devname[IFNAMSIZ];
389         int index;
390
391         /* Sorry... */
392         ret = -EPERM;
393         if (!capable(CAP_NET_RAW))
394                 goto out;
395
396         ret = -EINVAL;
397         if (optlen < 0)
398                 goto out;
399
400         /* Bind this socket to a particular device like "eth0",
401          * as specified in the passed interface name. If the
402          * name is "" or the option length is zero the socket
403          * is not bound.
404          */
405         if (optlen > IFNAMSIZ - 1)
406                 optlen = IFNAMSIZ - 1;
407         memset(devname, 0, sizeof(devname));
408
409         ret = -EFAULT;
410         if (copy_from_user(devname, optval, optlen))
411                 goto out;
412
413         if (devname[0] == '\0') {
414                 index = 0;
415         } else {
416                 struct net_device *dev = dev_get_by_name(net, devname);
417
418                 ret = -ENODEV;
419                 if (!dev)
420                         goto out;
421
422                 index = dev->ifindex;
423                 dev_put(dev);
424         }
425
426         lock_sock(sk);
427         sk->sk_bound_dev_if = index;
428         sk_dst_reset(sk);
429         release_sock(sk);
430
431         ret = 0;
432
433 out:
434 #endif
435
436         return ret;
437 }
438
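/*
 * Illustrative userspace counterpart of sock_bindtodevice() above (an added
 * example, not part of the original file; "fd" is an assumed open socket
 * and CAP_NET_RAW is required). Passing an empty name removes the binding.
 */
#if 0
#include <string.h>
#include <sys/socket.h>

static int example_bind_to_eth0(int fd)
{
        const char ifname[] = "eth0";

        return setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE,
                          ifname, strlen(ifname) + 1);
}
#endif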
439 static inline void sock_valbool_flag(struct sock *sk, int bit, int valbool)
440 {
441         if (valbool)
442                 sock_set_flag(sk, bit);
443         else
444                 sock_reset_flag(sk, bit);
445 }
446
447 /*
448  *      This is meant for all protocols to use and covers goings on
449  *      at the socket level. Everything here is generic.
450  */
451
452 int sock_setsockopt(struct socket *sock, int level, int optname,
453                     char __user *optval, unsigned int optlen)
454 {
455         struct sock *sk = sock->sk;
456         int val;
457         int valbool;
458         struct linger ling;
459         int ret = 0;
460
461         /*
462          *      Options without arguments
463          */
464
465         if (optname == SO_BINDTODEVICE)
466                 return sock_bindtodevice(sk, optval, optlen);
467
468         if (optlen < sizeof(int))
469                 return -EINVAL;
470
471         if (get_user(val, (int __user *)optval))
472                 return -EFAULT;
473
474         valbool = val ? 1 : 0;
475
476         lock_sock(sk);
477
478         switch (optname) {
479         case SO_DEBUG:
480                 if (val && !capable(CAP_NET_ADMIN))
481                         ret = -EACCES;
482                 else
483                         sock_valbool_flag(sk, SOCK_DBG, valbool);
484                 break;
485         case SO_REUSEADDR:
486                 sk->sk_reuse = valbool;
487                 break;
488         case SO_TYPE:
489         case SO_PROTOCOL:
490         case SO_DOMAIN:
491         case SO_ERROR:
492                 ret = -ENOPROTOOPT;
493                 break;
494         case SO_DONTROUTE:
495                 sock_valbool_flag(sk, SOCK_LOCALROUTE, valbool);
496                 break;
497         case SO_BROADCAST:
498                 sock_valbool_flag(sk, SOCK_BROADCAST, valbool);
499                 break;
500         case SO_SNDBUF:
501                 /* Don't error on this; BSD doesn't and, if you think
502                    about it, this is right. Otherwise apps have to
503                    play 'guess the biggest size' games. RCVBUF/SNDBUF
504                    are treated in BSD as hints */
505
506                 if (val > sysctl_wmem_max)
507                         val = sysctl_wmem_max;
508 set_sndbuf:
509                 sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
510                 if ((val * 2) < SOCK_MIN_SNDBUF)
511                         sk->sk_sndbuf = SOCK_MIN_SNDBUF;
512                 else
513                         sk->sk_sndbuf = val * 2;
514
515                 /*
516                  *      Wake up sending tasks if we
517                  *      upped the value.
518                  */
519                 sk->sk_write_space(sk);
520                 break;
521
522         case SO_SNDBUFFORCE:
523                 if (!capable(CAP_NET_ADMIN)) {
524                         ret = -EPERM;
525                         break;
526                 }
527                 goto set_sndbuf;
528
529         case SO_RCVBUF:
530                 /* Don't error on this; BSD doesn't and, if you think
531                    about it, this is right. Otherwise apps have to
532                    play 'guess the biggest size' games. RCVBUF/SNDBUF
533                    are treated in BSD as hints */
534
535                 if (val > sysctl_rmem_max)
536                         val = sysctl_rmem_max;
537 set_rcvbuf:
538                 sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
539                 /*
540                  * We double it on the way in to account for
541                  * "struct sk_buff" etc. overhead.   Applications
542                  * assume that the SO_RCVBUF setting they make will
543                  * allow that much actual data to be received on that
544                  * socket.
545                  *
546                  * Applications are unaware that "struct sk_buff" and
547                  * other overheads allocate from the receive buffer
548                  * during socket buffer allocation.
549                  *
550                  * And after considering the possible alternatives,
551                  * returning the value we actually used in getsockopt
552                  * is the most desirable behavior.
553                  */
554                 if ((val * 2) < SOCK_MIN_RCVBUF)
555                         sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
556                 else
557                         sk->sk_rcvbuf = val * 2;
558                 break;
559
560         case SO_RCVBUFFORCE:
561                 if (!capable(CAP_NET_ADMIN)) {
562                         ret = -EPERM;
563                         break;
564                 }
565                 goto set_rcvbuf;
566
567         case SO_KEEPALIVE:
568 #ifdef CONFIG_INET
569                 if (sk->sk_protocol == IPPROTO_TCP)
570                         tcp_set_keepalive(sk, valbool);
571 #endif
572                 sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool);
573                 break;
574
575         case SO_OOBINLINE:
576                 sock_valbool_flag(sk, SOCK_URGINLINE, valbool);
577                 break;
578
579         case SO_NO_CHECK:
580                 sk->sk_no_check = valbool;
581                 break;
582
583         case SO_PRIORITY:
584                 if ((val >= 0 && val <= 6) || capable(CAP_NET_ADMIN))
585                         sk->sk_priority = val;
586                 else
587                         ret = -EPERM;
588                 break;
589
590         case SO_LINGER:
591                 if (optlen < sizeof(ling)) {
592                         ret = -EINVAL;  /* 1003.1g */
593                         break;
594                 }
595                 if (copy_from_user(&ling, optval, sizeof(ling))) {
596                         ret = -EFAULT;
597                         break;
598                 }
599                 if (!ling.l_onoff)
600                         sock_reset_flag(sk, SOCK_LINGER);
601                 else {
602 #if (BITS_PER_LONG == 32)
603                         if ((unsigned int)ling.l_linger >= MAX_SCHEDULE_TIMEOUT/HZ)
604                                 sk->sk_lingertime = MAX_SCHEDULE_TIMEOUT;
605                         else
606 #endif
607                                 sk->sk_lingertime = (unsigned int)ling.l_linger * HZ;
608                         sock_set_flag(sk, SOCK_LINGER);
609                 }
610                 break;
611
612         case SO_BSDCOMPAT:
613                 sock_warn_obsolete_bsdism("setsockopt");
614                 break;
615
616         case SO_PASSCRED:
617                 if (valbool)
618                         set_bit(SOCK_PASSCRED, &sock->flags);
619                 else
620                         clear_bit(SOCK_PASSCRED, &sock->flags);
621                 break;
622
623         case SO_TIMESTAMP:
624         case SO_TIMESTAMPNS:
625                 if (valbool)  {
626                         if (optname == SO_TIMESTAMP)
627                                 sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
628                         else
629                                 sock_set_flag(sk, SOCK_RCVTSTAMPNS);
630                         sock_set_flag(sk, SOCK_RCVTSTAMP);
631                         sock_enable_timestamp(sk, SOCK_TIMESTAMP);
632                 } else {
633                         sock_reset_flag(sk, SOCK_RCVTSTAMP);
634                         sock_reset_flag(sk, SOCK_RCVTSTAMPNS);
635                 }
636                 break;
637
638         case SO_TIMESTAMPING:
639                 if (val & ~SOF_TIMESTAMPING_MASK) {
640                         ret = -EINVAL;
641                         break;
642                 }
643                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE,
644                                   val & SOF_TIMESTAMPING_TX_HARDWARE);
645                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE,
646                                   val & SOF_TIMESTAMPING_TX_SOFTWARE);
647                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE,
648                                   val & SOF_TIMESTAMPING_RX_HARDWARE);
649                 if (val & SOF_TIMESTAMPING_RX_SOFTWARE)
650                         sock_enable_timestamp(sk,
651                                               SOCK_TIMESTAMPING_RX_SOFTWARE);
652                 else
653                         sock_disable_timestamp(sk,
654                                                SOCK_TIMESTAMPING_RX_SOFTWARE);
655                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
656                                   val & SOF_TIMESTAMPING_SOFTWARE);
657                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
658                                   val & SOF_TIMESTAMPING_SYS_HARDWARE);
659                 sock_valbool_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE,
660                                   val & SOF_TIMESTAMPING_RAW_HARDWARE);
661                 break;
662
663         case SO_RCVLOWAT:
664                 if (val < 0)
665                         val = INT_MAX;
666                 sk->sk_rcvlowat = val ? : 1;
667                 break;
668
669         case SO_RCVTIMEO:
670                 ret = sock_set_timeout(&sk->sk_rcvtimeo, optval, optlen);
671                 break;
672
673         case SO_SNDTIMEO:
674                 ret = sock_set_timeout(&sk->sk_sndtimeo, optval, optlen);
675                 break;
676
677         case SO_ATTACH_FILTER:
678                 ret = -EINVAL;
679                 if (optlen == sizeof(struct sock_fprog)) {
680                         struct sock_fprog fprog;
681
682                         ret = -EFAULT;
683                         if (copy_from_user(&fprog, optval, sizeof(fprog)))
684                                 break;
685
686                         ret = sk_attach_filter(&fprog, sk);
687                 }
688                 break;
689
690         case SO_DETACH_FILTER:
691                 ret = sk_detach_filter(sk);
692                 break;
693
694         case SO_PASSSEC:
695                 if (valbool)
696                         set_bit(SOCK_PASSSEC, &sock->flags);
697                 else
698                         clear_bit(SOCK_PASSSEC, &sock->flags);
699                 break;
700         case SO_MARK:
701                 if (!capable(CAP_NET_ADMIN))
702                         ret = -EPERM;
703                 else
704                         sk->sk_mark = val;
705                 break;
706
707                 /* We implement the SO_SNDLOWAT etc to
708                    not be settable (1003.1g 5.3) */
709         case SO_RXQ_OVFL:
710                 if (valbool)
711                         sock_set_flag(sk, SOCK_RXQ_OVFL);
712                 else
713                         sock_reset_flag(sk, SOCK_RXQ_OVFL);
714                 break;
715         default:
716                 ret = -ENOPROTOOPT;
717                 break;
718         }
719         release_sock(sk);
720         return ret;
721 }
722 EXPORT_SYMBOL(sock_setsockopt);
723
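/*
 * Illustrative userspace sketch for SO_MARK (an added example, not part of
 * the original file; "fd" is an assumed open socket). sock_setsockopt()
 * above restricts it to CAP_NET_ADMIN; the mark set here can then be
 * matched by routing rules and packet classifiers.
 */
#if 0
#include <sys/socket.h>

#ifndef SO_MARK
#define SO_MARK 36      /* generic value; see asm/socket.h for your arch */
#endif

static int example_set_mark(int fd, unsigned int mark)
{
        return setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark));
}
#endif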
724
725 int sock_getsockopt(struct socket *sock, int level, int optname,
726                     char __user *optval, int __user *optlen)
727 {
728         struct sock *sk = sock->sk;
729
730         union {
731                 int val;
732                 struct linger ling;
733                 struct timeval tm;
734         } v;
735
736         unsigned int lv = sizeof(int);
737         int len;
738
739         if (get_user(len, optlen))
740                 return -EFAULT;
741         if (len < 0)
742                 return -EINVAL;
743
744         memset(&v, 0, sizeof(v));
745
746         switch (optname) {
747         case SO_DEBUG:
748                 v.val = sock_flag(sk, SOCK_DBG);
749                 break;
750
751         case SO_DONTROUTE:
752                 v.val = sock_flag(sk, SOCK_LOCALROUTE);
753                 break;
754
755         case SO_BROADCAST:
756                 v.val = !!sock_flag(sk, SOCK_BROADCAST);
757                 break;
758
759         case SO_SNDBUF:
760                 v.val = sk->sk_sndbuf;
761                 break;
762
763         case SO_RCVBUF:
764                 v.val = sk->sk_rcvbuf;
765                 break;
766
767         case SO_REUSEADDR:
768                 v.val = sk->sk_reuse;
769                 break;
770
771         case SO_KEEPALIVE:
772                 v.val = !!sock_flag(sk, SOCK_KEEPOPEN);
773                 break;
774
775         case SO_TYPE:
776                 v.val = sk->sk_type;
777                 break;
778
779         case SO_PROTOCOL:
780                 v.val = sk->sk_protocol;
781                 break;
782
783         case SO_DOMAIN:
784                 v.val = sk->sk_family;
785                 break;
786
787         case SO_ERROR:
788                 v.val = -sock_error(sk);
789                 if (v.val == 0)
790                         v.val = xchg(&sk->sk_err_soft, 0);
791                 break;
792
793         case SO_OOBINLINE:
794                 v.val = !!sock_flag(sk, SOCK_URGINLINE);
795                 break;
796
797         case SO_NO_CHECK:
798                 v.val = sk->sk_no_check;
799                 break;
800
801         case SO_PRIORITY:
802                 v.val = sk->sk_priority;
803                 break;
804
805         case SO_LINGER:
806                 lv              = sizeof(v.ling);
807                 v.ling.l_onoff  = !!sock_flag(sk, SOCK_LINGER);
808                 v.ling.l_linger = sk->sk_lingertime / HZ;
809                 break;
810
811         case SO_BSDCOMPAT:
812                 sock_warn_obsolete_bsdism("getsockopt");
813                 break;
814
815         case SO_TIMESTAMP:
816                 v.val = sock_flag(sk, SOCK_RCVTSTAMP) &&
817                                 !sock_flag(sk, SOCK_RCVTSTAMPNS);
818                 break;
819
820         case SO_TIMESTAMPNS:
821                 v.val = sock_flag(sk, SOCK_RCVTSTAMPNS);
822                 break;
823
824         case SO_TIMESTAMPING:
825                 v.val = 0;
826                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE))
827                         v.val |= SOF_TIMESTAMPING_TX_HARDWARE;
828                 if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE))
829                         v.val |= SOF_TIMESTAMPING_TX_SOFTWARE;
830                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_HARDWARE))
831                         v.val |= SOF_TIMESTAMPING_RX_HARDWARE;
832                 if (sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE))
833                         v.val |= SOF_TIMESTAMPING_RX_SOFTWARE;
834                 if (sock_flag(sk, SOCK_TIMESTAMPING_SOFTWARE))
835                         v.val |= SOF_TIMESTAMPING_SOFTWARE;
836                 if (sock_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE))
837                         v.val |= SOF_TIMESTAMPING_SYS_HARDWARE;
838                 if (sock_flag(sk, SOCK_TIMESTAMPING_RAW_HARDWARE))
839                         v.val |= SOF_TIMESTAMPING_RAW_HARDWARE;
840                 break;
841
842         case SO_RCVTIMEO:
843                 lv = sizeof(struct timeval);
844                 if (sk->sk_rcvtimeo == MAX_SCHEDULE_TIMEOUT) {
845                         v.tm.tv_sec = 0;
846                         v.tm.tv_usec = 0;
847                 } else {
848                         v.tm.tv_sec = sk->sk_rcvtimeo / HZ;
849                         v.tm.tv_usec = ((sk->sk_rcvtimeo % HZ) * 1000000) / HZ;
850                 }
851                 break;
852
853         case SO_SNDTIMEO:
854                 lv = sizeof(struct timeval);
855                 if (sk->sk_sndtimeo == MAX_SCHEDULE_TIMEOUT) {
856                         v.tm.tv_sec = 0;
857                         v.tm.tv_usec = 0;
858                 } else {
859                         v.tm.tv_sec = sk->sk_sndtimeo / HZ;
860                         v.tm.tv_usec = ((sk->sk_sndtimeo % HZ) * 1000000) / HZ;
861                 }
862                 break;
863
864         case SO_RCVLOWAT:
865                 v.val = sk->sk_rcvlowat;
866                 break;
867
868         case SO_SNDLOWAT:
869                 v.val = 1;
870                 break;
871
872         case SO_PASSCRED:
873                 v.val = test_bit(SOCK_PASSCRED, &sock->flags) ? 1 : 0;
874                 break;
875
876         case SO_PEERCRED:
877                 if (len > sizeof(sk->sk_peercred))
878                         len = sizeof(sk->sk_peercred);
879                 if (copy_to_user(optval, &sk->sk_peercred, len))
880                         return -EFAULT;
881                 goto lenout;
882
883         case SO_PEERNAME:
884         {
885                 char address[128];
886
887                 if (sock->ops->getname(sock, (struct sockaddr *)address, &lv, 2))
888                         return -ENOTCONN;
889                 if (lv < len)
890                         return -EINVAL;
891                 if (copy_to_user(optval, address, len))
892                         return -EFAULT;
893                 goto lenout;
894         }
895
896         /* Dubious BSD thing... Probably nobody even uses it, but
897          * the UNIX standard wants it for whatever reason... -DaveM
898          */
899         case SO_ACCEPTCONN:
900                 v.val = sk->sk_state == TCP_LISTEN;
901                 break;
902
903         case SO_PASSSEC:
904                 v.val = test_bit(SOCK_PASSSEC, &sock->flags) ? 1 : 0;
905                 break;
906
907         case SO_PEERSEC:
908                 return security_socket_getpeersec_stream(sock, optval, optlen, len);
909
910         case SO_MARK:
911                 v.val = sk->sk_mark;
912                 break;
913
914         case SO_RXQ_OVFL:
915                 v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
916                 break;
917
918         default:
919                 return -ENOPROTOOPT;
920         }
921
922         if (len > lv)
923                 len = lv;
924         if (copy_to_user(optval, &v, len))
925                 return -EFAULT;
926 lenout:
927         if (put_user(len, optlen))
928                 return -EFAULT;
929         return 0;
930 }
931
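/*
 * Illustrative userspace sketch (an added example, not part of the original
 * file; "fd" is an assumed open socket). It demonstrates the SO_RCVBUF
 * doubling documented in sock_setsockopt() above: the kernel doubles the
 * requested value to cover sk_buff overhead, and getsockopt() reports the
 * doubled value.
 */
#if 0
#include <stdio.h>
#include <sys/socket.h>

static void example_show_rcvbuf_doubling(int fd)
{
        int req = 65536, got = 0;
        socklen_t len = sizeof(got);

        setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &req, sizeof(req));
        getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &got, &len);
        /* got is typically 131072 here, assuming req <= rmem_max */
        printf("requested %d, kernel reports %d\n", req, got);
}
#endif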
932 /*
933  * Initialize an sk_lock.
934  *
935  * (We also register the sk_lock with the lock validator.)
936  */
937 static inline void sock_lock_init(struct sock *sk)
938 {
939         sock_lock_init_class_and_name(sk,
940                         af_family_slock_key_strings[sk->sk_family],
941                         af_family_slock_keys + sk->sk_family,
942                         af_family_key_strings[sk->sk_family],
943                         af_family_keys + sk->sk_family);
944 }
945
946 /*
947  * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
948  * even temporarily, because of RCU lookups. sk_node should also be left as is.
949  */
950 static void sock_copy(struct sock *nsk, const struct sock *osk)
951 {
952 #ifdef CONFIG_SECURITY_NETWORK
953         void *sptr = nsk->sk_security;
954 #endif
955         BUILD_BUG_ON(offsetof(struct sock, sk_copy_start) !=
956                      sizeof(osk->sk_node) + sizeof(osk->sk_refcnt));
957         memcpy(&nsk->sk_copy_start, &osk->sk_copy_start,
958                osk->sk_prot->obj_size - offsetof(struct sock, sk_copy_start));
959 #ifdef CONFIG_SECURITY_NETWORK
960         nsk->sk_security = sptr;
961         security_sk_clone(osk, nsk);
962 #endif
963 }
964
965 static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
966                 int family)
967 {
968         struct sock *sk;
969         struct kmem_cache *slab;
970
971         slab = prot->slab;
972         if (slab != NULL) {
973                 sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
974                 if (!sk)
975                         return sk;
976                 if (priority & __GFP_ZERO) {
977                         /*
978                          * caches using SLAB_DESTROY_BY_RCU should leave
979                          * sk_node.next unmodified. Special care is taken
980                          * when initializing object to zero.
981                          */
982                         if (offsetof(struct sock, sk_node.next) != 0)
983                                 memset(sk, 0, offsetof(struct sock, sk_node.next));
984                         memset(&sk->sk_node.pprev, 0,
985                                prot->obj_size - offsetof(struct sock,
986                                                          sk_node.pprev));
987                 }
988         }
989         else
990                 sk = kmalloc(prot->obj_size, priority);
991
992         if (sk != NULL) {
993                 kmemcheck_annotate_bitfield(sk, flags);
994
995                 if (security_sk_alloc(sk, family, priority))
996                         goto out_free;
997
998                 if (!try_module_get(prot->owner))
999                         goto out_free_sec;
1000         }
1001
1002         return sk;
1003
1004 out_free_sec:
1005         security_sk_free(sk);
1006 out_free:
1007         if (slab != NULL)
1008                 kmem_cache_free(slab, sk);
1009         else
1010                 kfree(sk);
1011         return NULL;
1012 }
1013
1014 static void sk_prot_free(struct proto *prot, struct sock *sk)
1015 {
1016         struct kmem_cache *slab;
1017         struct module *owner;
1018
1019         owner = prot->owner;
1020         slab = prot->slab;
1021
1022         security_sk_free(sk);
1023         if (slab != NULL)
1024                 kmem_cache_free(slab, sk);
1025         else
1026                 kfree(sk);
1027         module_put(owner);
1028 }
1029
1030 /**
1031  *      sk_alloc - All socket objects are allocated here
1032  *      @net: the applicable net namespace
1033  *      @family: protocol family
1034  *      @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
1035  *      @prot: struct proto associated with this new sock instance
1036  */
1037 struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
1038                       struct proto *prot)
1039 {
1040         struct sock *sk;
1041
1042         sk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);
1043         if (sk) {
1044                 sk->sk_family = family;
1045                 /*
1046                  * See comment in struct sock definition to understand
1047                  * why we need sk_prot_creator -acme
1048                  */
1049                 sk->sk_prot = sk->sk_prot_creator = prot;
1050                 sock_lock_init(sk);
1051                 sock_net_set(sk, get_net(net));
1052                 atomic_set(&sk->sk_wmem_alloc, 1);
1053         }
1054
1055         return sk;
1056 }
1057 EXPORT_SYMBOL(sk_alloc);
1058
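/*
 * Minimal sketch of how a protocol's ->create() typically uses sk_alloc()
 * (an added example with hypothetical names example_create/example_proto;
 * sock_init_data() is defined further down in this file).
 */
#if 0
static struct proto example_proto;      /* assumed registered elsewhere */

static int example_create(struct net *net, struct socket *sock, int protocol)
{
        struct sock *sk;

        sk = sk_alloc(net, PF_INET, GFP_KERNEL, &example_proto);
        if (!sk)
                return -ENOBUFS;

        sock_init_data(sock, sk);       /* ties the socket and sock together */
        sk->sk_protocol = protocol;
        return 0;
}
#endif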
1059 static void __sk_free(struct sock *sk)
1060 {
1061         struct sk_filter *filter;
1062
1063         if (sk->sk_destruct)
1064                 sk->sk_destruct(sk);
1065
1066         filter = rcu_dereference(sk->sk_filter);
1067         if (filter) {
1068                 sk_filter_uncharge(sk, filter);
1069                 rcu_assign_pointer(sk->sk_filter, NULL);
1070         }
1071
1072         sock_disable_timestamp(sk, SOCK_TIMESTAMP);
1073         sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
1074
1075         if (atomic_read(&sk->sk_omem_alloc))
1076                 printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
1077                        __func__, atomic_read(&sk->sk_omem_alloc));
1078
1079         put_net(sock_net(sk));
1080         sk_prot_free(sk->sk_prot_creator, sk);
1081 }
1082
1083 void sk_free(struct sock *sk)
1084 {
1085         /*
1086          * We subtract one from sk_wmem_alloc and can tell whether
1087          * some packets are still in some tx queue.
1088          * If the count is not zero, sock_wfree() will call __sk_free(sk) later
1089          */
1090         if (atomic_dec_and_test(&sk->sk_wmem_alloc))
1091                 __sk_free(sk);
1092 }
1093 EXPORT_SYMBOL(sk_free);
1094
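/*
 * Lifetime note (added clarification, not in the original file): sk_alloc()
 * and sk_clone() start sk_wmem_alloc at 1. Each transmitted skb adds its
 * truesize and sock_wfree() removes it, so this extra reference keeps the
 * sock alive while packets are in flight; whichever of sk_free() or the
 * last sock_wfree() brings the count to zero ends up calling __sk_free().
 */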
1095 /*
1096  * Last sock_put should drop the reference to sk->sk_net. It has already
1097  * been dropped in sk_change_net. Taking a reference to the stopping
1098  * namespace is not an option.
1099  * Take a reference to the socket to remove it from the hash while still
1100  * _alive_, and after that destroy it in the context of init_net.
1101  */
1102 void sk_release_kernel(struct sock *sk)
1103 {
1104         if (sk == NULL || sk->sk_socket == NULL)
1105                 return;
1106
1107         sock_hold(sk);
1108         sock_release(sk->sk_socket);
1109         release_net(sock_net(sk));
1110         sock_net_set(sk, get_net(&init_net));
1111         sock_put(sk);
1112 }
1113 EXPORT_SYMBOL(sk_release_kernel);
1114
1115 struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
1116 {
1117         struct sock *newsk;
1118
1119         newsk = sk_prot_alloc(sk->sk_prot, priority, sk->sk_family);
1120         if (newsk != NULL) {
1121                 struct sk_filter *filter;
1122
1123                 sock_copy(newsk, sk);
1124
1125                 /* SANITY */
1126                 get_net(sock_net(newsk));
1127                 sk_node_init(&newsk->sk_node);
1128                 sock_lock_init(newsk);
1129                 bh_lock_sock(newsk);
1130                 newsk->sk_backlog.head  = newsk->sk_backlog.tail = NULL;
1131
1132                 atomic_set(&newsk->sk_rmem_alloc, 0);
1133                 /*
1134                  * sk_wmem_alloc set to one (see sk_free() and sock_wfree())
1135                  */
1136                 atomic_set(&newsk->sk_wmem_alloc, 1);
1137                 atomic_set(&newsk->sk_omem_alloc, 0);
1138                 skb_queue_head_init(&newsk->sk_receive_queue);
1139                 skb_queue_head_init(&newsk->sk_write_queue);
1140 #ifdef CONFIG_NET_DMA
1141                 skb_queue_head_init(&newsk->sk_async_wait_queue);
1142 #endif
1143
1144                 rwlock_init(&newsk->sk_dst_lock);
1145                 rwlock_init(&newsk->sk_callback_lock);
1146                 lockdep_set_class_and_name(&newsk->sk_callback_lock,
1147                                 af_callback_keys + newsk->sk_family,
1148                                 af_family_clock_key_strings[newsk->sk_family]);
1149
1150                 newsk->sk_dst_cache     = NULL;
1151                 newsk->sk_wmem_queued   = 0;
1152                 newsk->sk_forward_alloc = 0;
1153                 newsk->sk_send_head     = NULL;
1154                 newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
1155
1156                 sock_reset_flag(newsk, SOCK_DONE);
1157                 skb_queue_head_init(&newsk->sk_error_queue);
1158
1159                 filter = newsk->sk_filter;
1160                 if (filter != NULL)
1161                         sk_filter_charge(newsk, filter);
1162
1163                 if (unlikely(xfrm_sk_clone_policy(newsk))) {
1164                         /* It is still a raw copy of the parent, so invalidate
1165                          * the destructor and do a plain sk_free() */
1166                         newsk->sk_destruct = NULL;
1167                         sk_free(newsk);
1168                         newsk = NULL;
1169                         goto out;
1170                 }
1171
1172                 newsk->sk_err      = 0;
1173                 newsk->sk_priority = 0;
1174                 /*
1175                  * Before updating sk_refcnt, we must commit prior changes to memory
1176                  * (Documentation/RCU/rculist_nulls.txt for details)
1177                  */
1178                 smp_wmb();
1179                 atomic_set(&newsk->sk_refcnt, 2);
1180
1181                 /*
1182                  * Increment the counter in the same struct proto as the master
1183                  * sock (sk_refcnt_debug_inc uses newsk->sk_prot->socks, that
1184                  * is the same as sk->sk_prot->socks, as this field was copied
1185                  * with memcpy).
1186                  *
1187                  * This _changes_ the previous behaviour, where
1188                  * tcp_create_openreq_child always was incrementing the
1189  * equivalent to tcp_prot->socks (inet_sock_nr), so this has
1190                  * to be taken into account in all callers. -acme
1191                  */
1192                 sk_refcnt_debug_inc(newsk);
1193                 sk_set_socket(newsk, NULL);
1194                 newsk->sk_sleep  = NULL;
1195
1196                 if (newsk->sk_prot->sockets_allocated)
1197                         percpu_counter_inc(newsk->sk_prot->sockets_allocated);
1198         }
1199 out:
1200         return newsk;
1201 }
1202 EXPORT_SYMBOL_GPL(sk_clone);
1203
1204 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
1205 {
1206         __sk_dst_set(sk, dst);
1207         sk->sk_route_caps = dst->dev->features;
1208         if (sk->sk_route_caps & NETIF_F_GSO)
1209                 sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
1210         if (sk_can_gso(sk)) {
1211                 if (dst->header_len) {
1212                         sk->sk_route_caps &= ~NETIF_F_GSO_MASK;
1213                 } else {
1214                         sk->sk_route_caps |= NETIF_F_SG | NETIF_F_HW_CSUM;
1215                         sk->sk_gso_max_size = dst->dev->gso_max_size;
1216                 }
1217         }
1218 }
1219 EXPORT_SYMBOL_GPL(sk_setup_caps);
1220
1221 void __init sk_init(void)
1222 {
1223         if (totalram_pages <= 4096) {
1224                 sysctl_wmem_max = 32767;
1225                 sysctl_rmem_max = 32767;
1226                 sysctl_wmem_default = 32767;
1227                 sysctl_rmem_default = 32767;
1228         } else if (totalram_pages >= 131072) {
1229                 sysctl_wmem_max = 131071;
1230                 sysctl_rmem_max = 131071;
1231         }
1232 }
1233
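/*
 * Worked numbers for the thresholds above (illustrative, assuming 4 KiB
 * pages): totalram_pages <= 4096 is roughly <= 16 MB of RAM, where the
 * defaults shrink to 32767 bytes; totalram_pages >= 131072 is roughly
 * >= 512 MB, where the maxima grow to 131071 bytes.
 */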
1234 /*
1235  *      Simple resource managers for sockets.
1236  */
1237
1238
1239 /*
1240  * Write buffer destructor automatically called from kfree_skb.
1241  */
1242 void sock_wfree(struct sk_buff *skb)
1243 {
1244         struct sock *sk = skb->sk;
1245         unsigned int len = skb->truesize;
1246
1247         if (!sock_flag(sk, SOCK_USE_WRITE_QUEUE)) {
1248                 /*
1249                  * Keep a reference on sk_wmem_alloc, this will be released
1250                  * after sk_write_space() call
1251                  */
1252                 atomic_sub(len - 1, &sk->sk_wmem_alloc);
1253                 sk->sk_write_space(sk);
1254                 len = 1;
1255         }
1256         /*
1257          * if sk_wmem_alloc reaches 0, we must finish what sk_free()
1258          * could not do because of in-flight packets
1259          */
1260         if (atomic_sub_and_test(len, &sk->sk_wmem_alloc))
1261                 __sk_free(sk);
1262 }
1263 EXPORT_SYMBOL(sock_wfree);
1264
1265 /*
1266  * Read buffer destructor automatically called from kfree_skb.
1267  */
1268 void sock_rfree(struct sk_buff *skb)
1269 {
1270         struct sock *sk = skb->sk;
1271
1272         atomic_sub(skb->truesize, &sk->sk_rmem_alloc);
1273         sk_mem_uncharge(skb->sk, skb->truesize);
1274 }
1275 EXPORT_SYMBOL(sock_rfree);
1276
1277
1278 int sock_i_uid(struct sock *sk)
1279 {
1280         int uid;
1281
1282         read_lock(&sk->sk_callback_lock);
1283         uid = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_uid : 0;
1284         read_unlock(&sk->sk_callback_lock);
1285         return uid;
1286 }
1287 EXPORT_SYMBOL(sock_i_uid);
1288
1289 unsigned long sock_i_ino(struct sock *sk)
1290 {
1291         unsigned long ino;
1292
1293         read_lock(&sk->sk_callback_lock);
1294         ino = sk->sk_socket ? SOCK_INODE(sk->sk_socket)->i_ino : 0;
1295         read_unlock(&sk->sk_callback_lock);
1296         return ino;
1297 }
1298 EXPORT_SYMBOL(sock_i_ino);
1299
1300 /*
1301  * Allocate a skb from the socket's send buffer.
1302  */
1303 struct sk_buff *sock_wmalloc(struct sock *sk, unsigned long size, int force,
1304                              gfp_t priority)
1305 {
1306         if (force || atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1307                 struct sk_buff *skb = alloc_skb(size, priority);
1308                 if (skb) {
1309                         skb_set_owner_w(skb, sk);
1310                         return skb;
1311                 }
1312         }
1313         return NULL;
1314 }
1315 EXPORT_SYMBOL(sock_wmalloc);
1316
1317 /*
1318  * Allocate a skb from the socket's receive buffer.
1319  */
1320 struct sk_buff *sock_rmalloc(struct sock *sk, unsigned long size, int force,
1321                              gfp_t priority)
1322 {
1323         if (force || atomic_read(&sk->sk_rmem_alloc) < sk->sk_rcvbuf) {
1324                 struct sk_buff *skb = alloc_skb(size, priority);
1325                 if (skb) {
1326                         skb_set_owner_r(skb, sk);
1327                         return skb;
1328                 }
1329         }
1330         return NULL;
1331 }
1332
1333 /*
1334  * Allocate a memory block from the socket's option memory buffer.
1335  */
1336 void *sock_kmalloc(struct sock *sk, int size, gfp_t priority)
1337 {
1338         if ((unsigned)size <= sysctl_optmem_max &&
1339             atomic_read(&sk->sk_omem_alloc) + size < sysctl_optmem_max) {
1340                 void *mem;
1341                 /* First do the add, to avoid the race if kmalloc
1342                  * might sleep.
1343                  */
1344                 atomic_add(size, &sk->sk_omem_alloc);
1345                 mem = kmalloc(size, priority);
1346                 if (mem)
1347                         return mem;
1348                 atomic_sub(size, &sk->sk_omem_alloc);
1349         }
1350         return NULL;
1351 }
1352 EXPORT_SYMBOL(sock_kmalloc);
1353
1354 /*
1355  * Free an option memory block.
1356  */
1357 void sock_kfree_s(struct sock *sk, void *mem, int size)
1358 {
1359         kfree(mem);
1360         atomic_sub(size, &sk->sk_omem_alloc);
1361 }
1362 EXPORT_SYMBOL(sock_kfree_s);
1363
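/*
 * Minimal sketch of the sock_kmalloc()/sock_kfree_s() pairing (hypothetical
 * helper, an added example): the same size must be passed to both calls so
 * sk_omem_alloc stays balanced.
 */
#if 0
static int example_store_opt(struct sock *sk, const void *data, int len)
{
        void *buf = sock_kmalloc(sk, len, GFP_KERNEL);

        if (!buf)
                return -ENOMEM;
        memcpy(buf, data, len);
        /* ... use buf, e.g. hang it off protocol-private state ... */
        sock_kfree_s(sk, buf, len);
        return 0;
}
#endif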
1364 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
1365    I think these locks should be removed for datagram sockets.
1366  */
1367 static long sock_wait_for_wmem(struct sock *sk, long timeo)
1368 {
1369         DEFINE_WAIT(wait);
1370
1371         clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1372         for (;;) {
1373                 if (!timeo)
1374                         break;
1375                 if (signal_pending(current))
1376                         break;
1377                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1378                 prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1379                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf)
1380                         break;
1381                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1382                         break;
1383                 if (sk->sk_err)
1384                         break;
1385                 timeo = schedule_timeout(timeo);
1386         }
1387         finish_wait(sk->sk_sleep, &wait);
1388         return timeo;
1389 }
1390
1391
1392 /*
1393  *      Generic send/receive buffer handlers
1394  */
1395
1396 struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
1397                                      unsigned long data_len, int noblock,
1398                                      int *errcode)
1399 {
1400         struct sk_buff *skb;
1401         gfp_t gfp_mask;
1402         long timeo;
1403         int err;
1404
1405         gfp_mask = sk->sk_allocation;
1406         if (gfp_mask & __GFP_WAIT)
1407                 gfp_mask |= __GFP_REPEAT;
1408
1409         timeo = sock_sndtimeo(sk, noblock);
1410         while (1) {
1411                 err = sock_error(sk);
1412                 if (err != 0)
1413                         goto failure;
1414
1415                 err = -EPIPE;
1416                 if (sk->sk_shutdown & SEND_SHUTDOWN)
1417                         goto failure;
1418
1419                 if (atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf) {
1420                         skb = alloc_skb(header_len, gfp_mask);
1421                         if (skb) {
1422                                 int npages;
1423                                 int i;
1424
1425                                 /* No pages, we're done... */
1426                                 if (!data_len)
1427                                         break;
1428
1429                                 npages = (data_len + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
1430                                 skb->truesize += data_len;
1431                                 skb_shinfo(skb)->nr_frags = npages;
1432                                 for (i = 0; i < npages; i++) {
1433                                         struct page *page;
1434                                         skb_frag_t *frag;
1435
1436                                         page = alloc_pages(sk->sk_allocation, 0);
1437                                         if (!page) {
1438                                                 err = -ENOBUFS;
1439                                                 skb_shinfo(skb)->nr_frags = i;
1440                                                 kfree_skb(skb);
1441                                                 goto failure;
1442                                         }
1443
1444                                         frag = &skb_shinfo(skb)->frags[i];
1445                                         frag->page = page;
1446                                         frag->page_offset = 0;
1447                                         frag->size = (data_len >= PAGE_SIZE ?
1448                                                       PAGE_SIZE :
1449                                                       data_len);
1450                                         data_len -= PAGE_SIZE;
1451                                 }
1452
1453                                 /* Full success... */
1454                                 break;
1455                         }
1456                         err = -ENOBUFS;
1457                         goto failure;
1458                 }
1459                 set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
1460                 set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
1461                 err = -EAGAIN;
1462                 if (!timeo)
1463                         goto failure;
1464                 if (signal_pending(current))
1465                         goto interrupted;
1466                 timeo = sock_wait_for_wmem(sk, timeo);
1467         }
1468
1469         skb_set_owner_w(skb, sk);
1470         return skb;
1471
1472 interrupted:
1473         err = sock_intr_errno(timeo);
1474 failure:
1475         *errcode = err;
1476         return NULL;
1477 }
1478 EXPORT_SYMBOL(sock_alloc_send_pskb);
1479
1480 struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size,
1481                                     int noblock, int *errcode)
1482 {
1483         return sock_alloc_send_pskb(sk, size, 0, noblock, errcode);
1484 }
1485 EXPORT_SYMBOL(sock_alloc_send_skb);
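/*
 * Editorial sketch (not part of sock.c): a typical datagram sendmsg path
 * allocates its transmit buffer through sock_alloc_send_skb(), reserving
 * headroom for lower-layer headers and letting the helper handle sndbuf
 * back-pressure and error reporting.  The names hdr_room and payload_len
 * below are illustrative assumptions, not taken from any specific protocol:
 *
 *	struct sk_buff *skb;
 *	int err;
 *
 *	skb = sock_alloc_send_skb(sk, hdr_room + payload_len,
 *				  msg->msg_flags & MSG_DONTWAIT, &err);
 *	if (!skb)
 *		return err;		(negative errno: -EAGAIN, -EPIPE, ...)
 *	skb_reserve(skb, hdr_room);
 *	...copy the payload and hand the skb to the output path...
 */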
1486
1487 static void __lock_sock(struct sock *sk)
1488 {
1489         DEFINE_WAIT(wait);
1490
1491         for (;;) {
1492                 prepare_to_wait_exclusive(&sk->sk_lock.wq, &wait,
1493                                         TASK_UNINTERRUPTIBLE);
1494                 spin_unlock_bh(&sk->sk_lock.slock);
1495                 schedule();
1496                 spin_lock_bh(&sk->sk_lock.slock);
1497                 if (!sock_owned_by_user(sk))
1498                         break;
1499         }
1500         finish_wait(&sk->sk_lock.wq, &wait);
1501 }
1502
1503 static void __release_sock(struct sock *sk)
1504 {
1505         struct sk_buff *skb = sk->sk_backlog.head;
1506
1507         do {
1508                 sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
1509                 bh_unlock_sock(sk);
1510
1511                 do {
1512                         struct sk_buff *next = skb->next;
1513
1514                         skb->next = NULL;
1515                         sk_backlog_rcv(sk, skb);
1516
1517                         /*
1518                          * We are in process context here with softirqs
1519                          * disabled, use cond_resched_softirq() to preempt.
1520                          * This is safe to do because we've taken the backlog
1521                          * queue private:
1522                          */
1523                         cond_resched_softirq();
1524
1525                         skb = next;
1526                 } while (skb != NULL);
1527
1528                 bh_lock_sock(sk);
1529         } while ((skb = sk->sk_backlog.head) != NULL);
1530 }
1531
1532 /**
1533  * sk_wait_data - wait for data to arrive at sk_receive_queue
1534  * @sk:    sock to wait on
1535  * @timeo: for how long
1536  *
1537  * Now socket state including sk->sk_err is changed only under the lock,
1538  * hence we may omit checks after joining the wait queue.
1539  * We check the receive queue before schedule() only as an optimization;
1540  * it is very likely that release_sock() added new data.
1541  */
1542 int sk_wait_data(struct sock *sk, long *timeo)
1543 {
1544         int rc;
1545         DEFINE_WAIT(wait);
1546
1547         prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE);
1548         set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1549         rc = sk_wait_event(sk, timeo, !skb_queue_empty(&sk->sk_receive_queue));
1550         clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
1551         finish_wait(sk->sk_sleep, &wait);
1552         return rc;
1553 }
1554 EXPORT_SYMBOL(sk_wait_data);
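/*
 * Editorial sketch (not part of sock.c): sk_wait_data() is meant to be
 * called with the socket lock held from a blocking receive path; it drops
 * and retakes the lock around schedule_timeout().  A protocol's recvmsg
 * might loop roughly like this (illustrative only):
 *
 *	lock_sock(sk);
 *	while (skb_queue_empty(&sk->sk_receive_queue)) {
 *		if (sk->sk_err || !timeo)
 *			break;			(report the error / -EAGAIN)
 *		sk_wait_data(sk, &timeo);
 *	}
 *	...dequeue and copy an skb...
 *	release_sock(sk);
 */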
1555
1556 /**
1557  *      __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
1558  *      @sk: socket
1559  *      @size: memory size to allocate
1560  *      @kind: allocation type
1561  *
1562  *      If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
1563  *      rmem allocation. This function assumes that protocols which have
1564  *      memory_pressure use sk_wmem_queued for write buffer accounting.
1565  */
1566 int __sk_mem_schedule(struct sock *sk, int size, int kind)
1567 {
1568         struct proto *prot = sk->sk_prot;
1569         int amt = sk_mem_pages(size);
1570         int allocated;
1571
1572         sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
1573         allocated = atomic_add_return(amt, prot->memory_allocated);
1574
1575         /* Under limit. */
1576         if (allocated <= prot->sysctl_mem[0]) {
1577                 if (prot->memory_pressure && *prot->memory_pressure)
1578                         *prot->memory_pressure = 0;
1579                 return 1;
1580         }
1581
1582         /* Under pressure. */
1583         if (allocated > prot->sysctl_mem[1])
1584                 if (prot->enter_memory_pressure)
1585                         prot->enter_memory_pressure(sk);
1586
1587         /* Over hard limit. */
1588         if (allocated > prot->sysctl_mem[2])
1589                 goto suppress_allocation;
1590
1591         /* guarantee minimum buffer size under pressure */
1592         if (kind == SK_MEM_RECV) {
1593                 if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
1594                         return 1;
1595         } else { /* SK_MEM_SEND */
1596                 if (sk->sk_type == SOCK_STREAM) {
1597                         if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
1598                                 return 1;
1599                 } else if (atomic_read(&sk->sk_wmem_alloc) <
1600                            prot->sysctl_wmem[0])
1601                                 return 1;
1602         }
1603
1604         if (prot->memory_pressure) {
1605                 int alloc;
1606
1607                 if (!*prot->memory_pressure)
1608                         return 1;
1609                 alloc = percpu_counter_read_positive(prot->sockets_allocated);
1610                 if (prot->sysctl_mem[2] > alloc *
1611                     sk_mem_pages(sk->sk_wmem_queued +
1612                                  atomic_read(&sk->sk_rmem_alloc) +
1613                                  sk->sk_forward_alloc))
1614                         return 1;
1615         }
1616
1617 suppress_allocation:
1618
1619         if (kind == SK_MEM_SEND && sk->sk_type == SOCK_STREAM) {
1620                 sk_stream_moderate_sndbuf(sk);
1621
1622                 /* Fail only if the socket is _under_ its sndbuf.
1623                  * In this case we cannot block, so we have to fail.
1624                  */
1625                 if (sk->sk_wmem_queued + size >= sk->sk_sndbuf)
1626                         return 1;
1627         }
1628
1629         /* Alas. Undo changes. */
1630         sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
1631         atomic_sub(amt, prot->memory_allocated);
1632         return 0;
1633 }
1634 EXPORT_SYMBOL(__sk_mem_schedule);
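/*
 * Editorial sketch (not part of sock.c): protocols normally reach this
 * through the inline wrappers in include/net/sock.h rather than calling it
 * directly.  Conceptually, before queueing an skb for receive one would do
 * something like (illustrative only):
 *
 *	if (!sk_rmem_schedule(sk, skb->truesize))
 *		return -ENOBUFS;	(accounting refused the charge)
 *	skb_set_owner_r(skb, sk);
 *	__skb_queue_tail(&sk->sk_receive_queue, skb);
 *
 * sk_rmem_schedule() charges sk_forward_alloc in SK_MEM_QUANTUM (one page)
 * units and falls back to __sk_mem_schedule(sk, size, SK_MEM_RECV) only when
 * the per-socket reserve is exhausted.
 */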
1635
1636 /**
1637  *      __sk_mem_reclaim - reclaim memory_allocated
1638  *      @sk: socket
1639  */
1640 void __sk_mem_reclaim(struct sock *sk)
1641 {
1642         struct proto *prot = sk->sk_prot;
1643
1644         atomic_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
1645                    prot->memory_allocated);
1646         sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
1647
1648         if (prot->memory_pressure && *prot->memory_pressure &&
1649             (atomic_read(prot->memory_allocated) < prot->sysctl_mem[0]))
1650                 *prot->memory_pressure = 0;
1651 }
1652 EXPORT_SYMBOL(__sk_mem_reclaim);
1653
1654
1655 /*
1656  * Set of default routines for initialising struct proto_ops when
1657  * the protocol does not support a particular function. In certain
1658  * cases where it makes no sense for a protocol to have a "do nothing"
1659  * function, some default processing is provided.
1660  */
1661
1662 int sock_no_bind(struct socket *sock, struct sockaddr *saddr, int len)
1663 {
1664         return -EOPNOTSUPP;
1665 }
1666 EXPORT_SYMBOL(sock_no_bind);
1667
1668 int sock_no_connect(struct socket *sock, struct sockaddr *saddr,
1669                     int len, int flags)
1670 {
1671         return -EOPNOTSUPP;
1672 }
1673 EXPORT_SYMBOL(sock_no_connect);
1674
1675 int sock_no_socketpair(struct socket *sock1, struct socket *sock2)
1676 {
1677         return -EOPNOTSUPP;
1678 }
1679 EXPORT_SYMBOL(sock_no_socketpair);
1680
1681 int sock_no_accept(struct socket *sock, struct socket *newsock, int flags)
1682 {
1683         return -EOPNOTSUPP;
1684 }
1685 EXPORT_SYMBOL(sock_no_accept);
1686
1687 int sock_no_getname(struct socket *sock, struct sockaddr *saddr,
1688                     int *len, int peer)
1689 {
1690         return -EOPNOTSUPP;
1691 }
1692 EXPORT_SYMBOL(sock_no_getname);
1693
1694 unsigned int sock_no_poll(struct file *file, struct socket *sock, poll_table *pt)
1695 {
1696         return 0;
1697 }
1698 EXPORT_SYMBOL(sock_no_poll);
1699
1700 int sock_no_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
1701 {
1702         return -EOPNOTSUPP;
1703 }
1704 EXPORT_SYMBOL(sock_no_ioctl);
1705
1706 int sock_no_listen(struct socket *sock, int backlog)
1707 {
1708         return -EOPNOTSUPP;
1709 }
1710 EXPORT_SYMBOL(sock_no_listen);
1711
1712 int sock_no_shutdown(struct socket *sock, int how)
1713 {
1714         return -EOPNOTSUPP;
1715 }
1716 EXPORT_SYMBOL(sock_no_shutdown);
1717
1718 int sock_no_setsockopt(struct socket *sock, int level, int optname,
1719                     char __user *optval, unsigned int optlen)
1720 {
1721         return -EOPNOTSUPP;
1722 }
1723 EXPORT_SYMBOL(sock_no_setsockopt);
1724
1725 int sock_no_getsockopt(struct socket *sock, int level, int optname,
1726                     char __user *optval, int __user *optlen)
1727 {
1728         return -EOPNOTSUPP;
1729 }
1730 EXPORT_SYMBOL(sock_no_getsockopt);
1731
1732 int sock_no_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1733                     size_t len)
1734 {
1735         return -EOPNOTSUPP;
1736 }
1737 EXPORT_SYMBOL(sock_no_sendmsg);
1738
1739 int sock_no_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *m,
1740                     size_t len, int flags)
1741 {
1742         return -EOPNOTSUPP;
1743 }
1744 EXPORT_SYMBOL(sock_no_recvmsg);
1745
1746 int sock_no_mmap(struct file *file, struct socket *sock, struct vm_area_struct *vma)
1747 {
1748         /* Mirror missing mmap method error code */
1749         return -ENODEV;
1750 }
1751 EXPORT_SYMBOL(sock_no_mmap);
1752
1753 ssize_t sock_no_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags)
1754 {
1755         ssize_t res;
1756         struct msghdr msg = {.msg_flags = flags};
1757         struct kvec iov;
1758         char *kaddr = kmap(page);
1759         iov.iov_base = kaddr + offset;
1760         iov.iov_len = size;
1761         res = kernel_sendmsg(sock, &msg, &iov, 1, size);
1762         kunmap(page);
1763         return res;
1764 }
1765 EXPORT_SYMBOL(sock_no_sendpage);
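/*
 * Editorial sketch (not part of sock.c): a protocol family that does not
 * implement a given operation simply points the corresponding proto_ops
 * member at one of the stubs above.  Hypothetical example (field subset
 * only, all "example" names are illustrative):
 *
 *	static const struct proto_ops example_dgram_ops = {
 *		.family		= PF_EXAMPLE,
 *		.owner		= THIS_MODULE,
 *		.bind		= example_bind,
 *		.connect	= sock_no_connect,
 *		.accept		= sock_no_accept,
 *		.listen		= sock_no_listen,
 *		.mmap		= sock_no_mmap,
 *		.sendpage	= sock_no_sendpage,
 *		...
 *	};
 */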
1766
1767 /*
1768  *      Default Socket Callbacks
1769  */
1770
1771 static void sock_def_wakeup(struct sock *sk)
1772 {
1773         read_lock(&sk->sk_callback_lock);
1774         if (sk_has_sleeper(sk))
1775                 wake_up_interruptible_all(sk->sk_sleep);
1776         read_unlock(&sk->sk_callback_lock);
1777 }
1778
1779 static void sock_def_error_report(struct sock *sk)
1780 {
1781         read_lock(&sk->sk_callback_lock);
1782         if (sk_has_sleeper(sk))
1783                 wake_up_interruptible_poll(sk->sk_sleep, POLLERR);
1784         sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
1785         read_unlock(&sk->sk_callback_lock);
1786 }
1787
1788 static void sock_def_readable(struct sock *sk, int len)
1789 {
1790         read_lock(&sk->sk_callback_lock);
1791         if (sk_has_sleeper(sk))
1792                 wake_up_interruptible_sync_poll(sk->sk_sleep, POLLIN |
1793                                                 POLLRDNORM | POLLRDBAND);
1794         sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
1795         read_unlock(&sk->sk_callback_lock);
1796 }
1797
1798 static void sock_def_write_space(struct sock *sk)
1799 {
1800         read_lock(&sk->sk_callback_lock);
1801
1802         /* Do not wake up a writer until he can make "significant"
1803          * progress.  --DaveM
1804          */
1805         if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
1806                 if (sk_has_sleeper(sk))
1807                         wake_up_interruptible_sync_poll(sk->sk_sleep, POLLOUT |
1808                                                 POLLWRNORM | POLLWRBAND);
1809
1810                 /* Should agree with poll, otherwise some programs break */
1811                 if (sock_writeable(sk))
1812                         sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
1813         }
1814
1815         read_unlock(&sk->sk_callback_lock);
1816 }
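/*
 * Editorial note: the test above wakes writers only once at least half of
 * sk_sndbuf has drained, which agrees with sock_writeable() (wmem_alloc
 * below sk_sndbuf / 2).  For example, with sk_sndbuf = 64 KB a writer
 * blocked in sendmsg is not woken until sk_wmem_alloc drops to roughly
 * 32 KB, so a single wakeup gives it room for a "significant" amount of
 * new data rather than a thundering herd of tiny writes.
 */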
1817
1818 static void sock_def_destruct(struct sock *sk)
1819 {
1820         kfree(sk->sk_protinfo);
1821 }
1822
1823 void sk_send_sigurg(struct sock *sk)
1824 {
1825         if (sk->sk_socket && sk->sk_socket->file)
1826                 if (send_sigurg(&sk->sk_socket->file->f_owner))
1827                         sk_wake_async(sk, SOCK_WAKE_URG, POLL_PRI);
1828 }
1829 EXPORT_SYMBOL(sk_send_sigurg);
1830
1831 void sk_reset_timer(struct sock *sk, struct timer_list* timer,
1832                     unsigned long expires)
1833 {
1834         if (!mod_timer(timer, expires))
1835                 sock_hold(sk);
1836 }
1837 EXPORT_SYMBOL(sk_reset_timer);
1838
1839 void sk_stop_timer(struct sock *sk, struct timer_list* timer)
1840 {
1841         if (timer_pending(timer) && del_timer(timer))
1842                 __sock_put(sk);
1843 }
1844 EXPORT_SYMBOL(sk_stop_timer);
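/*
 * Editorial sketch (not part of sock.c): the two helpers above keep the
 * socket refcount balanced with the timer.  sk_reset_timer() takes a
 * reference only when the timer was not already pending (mod_timer()
 * returned 0), and the timer callback is expected to drop it.  Illustrative
 * use, with example_timer() as a hypothetical callback:
 *
 *	sk_reset_timer(sk, &sk->sk_timer, jiffies + HZ);	(holds sk)
 *
 *	static void example_timer(unsigned long data)
 *	{
 *		struct sock *sk = (struct sock *)data;
 *		...do the work...
 *		sock_put(sk);		(release the reference taken above)
 *	}
 *
 * sk_stop_timer() drops the reference only if it actually deactivated a
 * pending timer, so a callback that has already run is not double-put.
 */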
1845
1846 void sock_init_data(struct socket *sock, struct sock *sk)
1847 {
1848         skb_queue_head_init(&sk->sk_receive_queue);
1849         skb_queue_head_init(&sk->sk_write_queue);
1850         skb_queue_head_init(&sk->sk_error_queue);
1851 #ifdef CONFIG_NET_DMA
1852         skb_queue_head_init(&sk->sk_async_wait_queue);
1853 #endif
1854
1855         sk->sk_send_head        =       NULL;
1856
1857         init_timer(&sk->sk_timer);
1858
1859         sk->sk_allocation       =       GFP_KERNEL;
1860         sk->sk_rcvbuf           =       sysctl_rmem_default;
1861         sk->sk_sndbuf           =       sysctl_wmem_default;
1862         sk->sk_state            =       TCP_CLOSE;
1863         sk_set_socket(sk, sock);
1864
1865         sock_set_flag(sk, SOCK_ZAPPED);
1866
1867         if (sock) {
1868                 sk->sk_type     =       sock->type;
1869                 sk->sk_sleep    =       &sock->wait;
1870                 sock->sk        =       sk;
1871         } else
1872                 sk->sk_sleep    =       NULL;
1873
1874         rwlock_init(&sk->sk_dst_lock);
1875         rwlock_init(&sk->sk_callback_lock);
1876         lockdep_set_class_and_name(&sk->sk_callback_lock,
1877                         af_callback_keys + sk->sk_family,
1878                         af_family_clock_key_strings[sk->sk_family]);
1879
1880         sk->sk_state_change     =       sock_def_wakeup;
1881         sk->sk_data_ready       =       sock_def_readable;
1882         sk->sk_write_space      =       sock_def_write_space;
1883         sk->sk_error_report     =       sock_def_error_report;
1884         sk->sk_destruct         =       sock_def_destruct;
1885
1886         sk->sk_sndmsg_page      =       NULL;
1887         sk->sk_sndmsg_off       =       0;
1888
1889         sk->sk_peercred.pid     =       0;
1890         sk->sk_peercred.uid     =       -1;
1891         sk->sk_peercred.gid     =       -1;
1892         sk->sk_write_pending    =       0;
1893         sk->sk_rcvlowat         =       1;
1894         sk->sk_rcvtimeo         =       MAX_SCHEDULE_TIMEOUT;
1895         sk->sk_sndtimeo         =       MAX_SCHEDULE_TIMEOUT;
1896
1897         sk->sk_stamp = ktime_set(-1L, 0);
1898
1899         /*
1900          * Before updating sk_refcnt, we must commit prior changes to memory
1901          * (Documentation/RCU/rculist_nulls.txt for details)
1902          */
1903         smp_wmb();
1904         atomic_set(&sk->sk_refcnt, 1);
1905         atomic_set(&sk->sk_drops, 0);
1906 }
1907 EXPORT_SYMBOL(sock_init_data);
1908
1909 void lock_sock_nested(struct sock *sk, int subclass)
1910 {
1911         might_sleep();
1912         spin_lock_bh(&sk->sk_lock.slock);
1913         if (sk->sk_lock.owned)
1914                 __lock_sock(sk);
1915         sk->sk_lock.owned = 1;
1916         spin_unlock(&sk->sk_lock.slock);
1917         /*
1918          * The sk_lock has mutex_lock() semantics here:
1919          */
1920         mutex_acquire(&sk->sk_lock.dep_map, subclass, 0, _RET_IP_);
1921         local_bh_enable();
1922 }
1923 EXPORT_SYMBOL(lock_sock_nested);
1924
1925 void release_sock(struct sock *sk)
1926 {
1927         /*
1928          * The sk_lock has mutex_unlock() semantics:
1929          */
1930         mutex_release(&sk->sk_lock.dep_map, 1, _RET_IP_);
1931
1932         spin_lock_bh(&sk->sk_lock.slock);
1933         if (sk->sk_backlog.tail)
1934                 __release_sock(sk);
1935         sk->sk_lock.owned = 0;
1936         if (waitqueue_active(&sk->sk_lock.wq))
1937                 wake_up(&sk->sk_lock.wq);
1938         spin_unlock_bh(&sk->sk_lock.slock);
1939 }
1940 EXPORT_SYMBOL(release_sock);
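/*
 * Editorial sketch (not part of sock.c): lock_sock()/release_sock() give
 * process context exclusive "ownership" of the socket, while softirq
 * receive paths take bh_lock_sock() and, if the socket is owned, queue
 * packets on sk_backlog instead of processing them.  release_sock() then
 * replays that backlog via __release_sock().  Typical caller (illustrative):
 *
 *	lock_sock(sk);			(may sleep; marks sk as owned)
 *	...modify socket state, send or receive...
 *	release_sock(sk);		(runs sk_backlog_rcv() on queued skbs)
 */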
1941
1942 int sock_get_timestamp(struct sock *sk, struct timeval __user *userstamp)
1943 {
1944         struct timeval tv;
1945         if (!sock_flag(sk, SOCK_TIMESTAMP))
1946                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1947         tv = ktime_to_timeval(sk->sk_stamp);
1948         if (tv.tv_sec == -1)
1949                 return -ENOENT;
1950         if (tv.tv_sec == 0) {
1951                 sk->sk_stamp = ktime_get_real();
1952                 tv = ktime_to_timeval(sk->sk_stamp);
1953         }
1954         return copy_to_user(userstamp, &tv, sizeof(tv)) ? -EFAULT : 0;
1955 }
1956 EXPORT_SYMBOL(sock_get_timestamp);
1957
1958 int sock_get_timestampns(struct sock *sk, struct timespec __user *userstamp)
1959 {
1960         struct timespec ts;
1961         if (!sock_flag(sk, SOCK_TIMESTAMP))
1962                 sock_enable_timestamp(sk, SOCK_TIMESTAMP);
1963         ts = ktime_to_timespec(sk->sk_stamp);
1964         if (ts.tv_sec == -1)
1965                 return -ENOENT;
1966         if (ts.tv_sec == 0) {
1967                 sk->sk_stamp = ktime_get_real();
1968                 ts = ktime_to_timespec(sk->sk_stamp);
1969         }
1970         return copy_to_user(userstamp, &ts, sizeof(ts)) ? -EFAULT : 0;
1971 }
1972 EXPORT_SYMBOL(sock_get_timestampns);
1973
1974 void sock_enable_timestamp(struct sock *sk, int flag)
1975 {
1976         if (!sock_flag(sk, flag)) {
1977                 sock_set_flag(sk, flag);
1978                 /*
1979                  * we just set one of the two flags which require net
1980                  * time stamping, but time stamping might have been on
1981                  * already because of the other one
1982                  */
1983                 if (!sock_flag(sk,
1984                                 flag == SOCK_TIMESTAMP ?
1985                                 SOCK_TIMESTAMPING_RX_SOFTWARE :
1986                                 SOCK_TIMESTAMP))
1987                         net_enable_timestamp();
1988         }
1989 }
1990
1991 /*
1992  *      Get a socket option on a socket.
1993  *
1994  *      FIX: POSIX 1003.1g is very ambiguous here. It states that
1995  *      asynchronous errors should be reported by getsockopt. We assume
1996  *      this means if you specify SO_ERROR (otherwise what's the point of it).
1997  */
1998 int sock_common_getsockopt(struct socket *sock, int level, int optname,
1999                            char __user *optval, int __user *optlen)
2000 {
2001         struct sock *sk = sock->sk;
2002
2003         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2004 }
2005 EXPORT_SYMBOL(sock_common_getsockopt);
2006
2007 #ifdef CONFIG_COMPAT
2008 int compat_sock_common_getsockopt(struct socket *sock, int level, int optname,
2009                                   char __user *optval, int __user *optlen)
2010 {
2011         struct sock *sk = sock->sk;
2012
2013         if (sk->sk_prot->compat_getsockopt != NULL)
2014                 return sk->sk_prot->compat_getsockopt(sk, level, optname,
2015                                                       optval, optlen);
2016         return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen);
2017 }
2018 EXPORT_SYMBOL(compat_sock_common_getsockopt);
2019 #endif
2020
2021 int sock_common_recvmsg(struct kiocb *iocb, struct socket *sock,
2022                         struct msghdr *msg, size_t size, int flags)
2023 {
2024         struct sock *sk = sock->sk;
2025         int addr_len = 0;
2026         int err;
2027
2028         err = sk->sk_prot->recvmsg(iocb, sk, msg, size, flags & MSG_DONTWAIT,
2029                                    flags & ~MSG_DONTWAIT, &addr_len);
2030         if (err >= 0)
2031                 msg->msg_namelen = addr_len;
2032         return err;
2033 }
2034 EXPORT_SYMBOL(sock_common_recvmsg);
2035
2036 /*
2037  *      Set socket options on an inet socket.
2038  */
2039 int sock_common_setsockopt(struct socket *sock, int level, int optname,
2040                            char __user *optval, unsigned int optlen)
2041 {
2042         struct sock *sk = sock->sk;
2043
2044         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2045 }
2046 EXPORT_SYMBOL(sock_common_setsockopt);
2047
2048 #ifdef CONFIG_COMPAT
2049 int compat_sock_common_setsockopt(struct socket *sock, int level, int optname,
2050                                   char __user *optval, unsigned int optlen)
2051 {
2052         struct sock *sk = sock->sk;
2053
2054         if (sk->sk_prot->compat_setsockopt != NULL)
2055                 return sk->sk_prot->compat_setsockopt(sk, level, optname,
2056                                                       optval, optlen);
2057         return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen);
2058 }
2059 EXPORT_SYMBOL(compat_sock_common_setsockopt);
2060 #endif
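/*
 * Editorial sketch (not part of sock.c): address families that keep their
 * option and receive handling in struct proto can point the socket-level
 * ops at the common wrappers above, e.g. (illustrative field subset):
 *
 *	static const struct proto_ops example_stream_ops = {
 *		...
 *		.setsockopt	= sock_common_setsockopt,
 *		.getsockopt	= sock_common_getsockopt,
 *		.recvmsg	= sock_common_recvmsg,
 *		...
 *	};
 *
 * The wrappers simply forward to sk->sk_prot, so struct proto remains the
 * single place that implements the per-protocol behaviour.
 */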
2061
2062 void sk_common_release(struct sock *sk)
2063 {
2064         if (sk->sk_prot->destroy)
2065                 sk->sk_prot->destroy(sk);
2066
2067         /*
2068          * Observation: when sk_common_release is called, processes have
2069          * no access to socket. But net still has.
2070          * Step one, detach it from networking:
2071          *
2072          * A. Remove from hash tables.
2073          */
2074
2075         sk->sk_prot->unhash(sk);
2076
2077         /*
2078          * At this point the socket cannot receive new packets, but it is possible
2079          * that some packets are in flight because some CPU runs the receiver and
2080          * did the hash table lookup before we unhashed the socket. They will reach
2081          * the receive queue and be purged by the socket destructor.
2082          *
2083          * Also, we still have packets pending on the receive queue and probably
2084          * our own packets waiting in device queues. sock_destroy will drain the
2085          * receive queue, but transmitted packets will delay socket destruction
2086          * until the last reference is released.
2087          */
2088
2089         sock_orphan(sk);
2090
2091         xfrm_sk_free_policy(sk);
2092
2093         sk_refcnt_debug_release(sk);
2094         sock_put(sk);
2095 }
2096 EXPORT_SYMBOL(sk_common_release);
2097
2098 static DEFINE_RWLOCK(proto_list_lock);
2099 static LIST_HEAD(proto_list);
2100
2101 #ifdef CONFIG_PROC_FS
2102 #define PROTO_INUSE_NR  64      /* should be enough for the first time */
2103 struct prot_inuse {
2104         int val[PROTO_INUSE_NR];
2105 };
2106
2107 static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR);
2108
2109 #ifdef CONFIG_NET_NS
2110 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2111 {
2112         int cpu = smp_processor_id();
2113         per_cpu_ptr(net->core.inuse, cpu)->val[prot->inuse_idx] += val;
2114 }
2115 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2116
2117 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2118 {
2119         int cpu, idx = prot->inuse_idx;
2120         int res = 0;
2121
2122         for_each_possible_cpu(cpu)
2123                 res += per_cpu_ptr(net->core.inuse, cpu)->val[idx];
2124
2125         return res >= 0 ? res : 0;
2126 }
2127 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2128
2129 static int sock_inuse_init_net(struct net *net)
2130 {
2131         net->core.inuse = alloc_percpu(struct prot_inuse);
2132         return net->core.inuse ? 0 : -ENOMEM;
2133 }
2134
2135 static void sock_inuse_exit_net(struct net *net)
2136 {
2137         free_percpu(net->core.inuse);
2138 }
2139
2140 static struct pernet_operations net_inuse_ops = {
2141         .init = sock_inuse_init_net,
2142         .exit = sock_inuse_exit_net,
2143 };
2144
2145 static __init int net_inuse_init(void)
2146 {
2147         if (register_pernet_subsys(&net_inuse_ops))
2148                 panic("Cannot initialize net inuse counters");
2149
2150         return 0;
2151 }
2152
2153 core_initcall(net_inuse_init);
2154 #else
2155 static DEFINE_PER_CPU(struct prot_inuse, prot_inuse);
2156
2157 void sock_prot_inuse_add(struct net *net, struct proto *prot, int val)
2158 {
2159         __get_cpu_var(prot_inuse).val[prot->inuse_idx] += val;
2160 }
2161 EXPORT_SYMBOL_GPL(sock_prot_inuse_add);
2162
2163 int sock_prot_inuse_get(struct net *net, struct proto *prot)
2164 {
2165         int cpu, idx = prot->inuse_idx;
2166         int res = 0;
2167
2168         for_each_possible_cpu(cpu)
2169                 res += per_cpu(prot_inuse, cpu).val[idx];
2170
2171         return res >= 0 ? res : 0;
2172 }
2173 EXPORT_SYMBOL_GPL(sock_prot_inuse_get);
2174 #endif
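/*
 * Editorial sketch (not part of sock.c): the inuse counters are per-cpu and
 * may go transiently negative on a given cpu, which is why the readers above
 * clamp the summed result at zero.  Protocol hash/unhash paths are expected
 * to keep them balanced, roughly (illustrative):
 *
 *	hash:	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
 *	unhash:	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
 *
 * The totals show up in the "sockets" column of /proc/net/protocols.
 */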
2175
2176 static void assign_proto_idx(struct proto *prot)
2177 {
2178         prot->inuse_idx = find_first_zero_bit(proto_inuse_idx, PROTO_INUSE_NR);
2179
2180         if (unlikely(prot->inuse_idx == PROTO_INUSE_NR - 1)) {
2181                 printk(KERN_ERR "PROTO_INUSE_NR exhausted\n");
2182                 return;
2183         }
2184
2185         set_bit(prot->inuse_idx, proto_inuse_idx);
2186 }
2187
2188 static void release_proto_idx(struct proto *prot)
2189 {
2190         if (prot->inuse_idx != PROTO_INUSE_NR - 1)
2191                 clear_bit(prot->inuse_idx, proto_inuse_idx);
2192 }
2193 #else
2194 static inline void assign_proto_idx(struct proto *prot)
2195 {
2196 }
2197
2198 static inline void release_proto_idx(struct proto *prot)
2199 {
2200 }
2201 #endif
2202
2203 int proto_register(struct proto *prot, int alloc_slab)
2204 {
2205         if (alloc_slab) {
2206                 prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
2207                                         SLAB_HWCACHE_ALIGN | prot->slab_flags,
2208                                         NULL);
2209
2210                 if (prot->slab == NULL) {
2211                         printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
2212                                prot->name);
2213                         goto out;
2214                 }
2215
2216                 if (prot->rsk_prot != NULL) {
2217                         static const char mask[] = "request_sock_%s";
2218
2219                         prot->rsk_prot->slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2220                         if (prot->rsk_prot->slab_name == NULL)
2221                                 goto out_free_sock_slab;
2222
2223                         sprintf(prot->rsk_prot->slab_name, mask, prot->name);
2224                         prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
2225                                                                  prot->rsk_prot->obj_size, 0,
2226                                                                  SLAB_HWCACHE_ALIGN, NULL);
2227
2228                         if (prot->rsk_prot->slab == NULL) {
2229                                 printk(KERN_CRIT "%s: Can't create request sock SLAB cache!\n",
2230                                        prot->name);
2231                                 goto out_free_request_sock_slab_name;
2232                         }
2233                 }
2234
2235                 if (prot->twsk_prot != NULL) {
2236                         static const char mask[] = "tw_sock_%s";
2237
2238                         prot->twsk_prot->twsk_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL);
2239
2240                         if (prot->twsk_prot->twsk_slab_name == NULL)
2241                                 goto out_free_request_sock_slab;
2242
2243                         sprintf(prot->twsk_prot->twsk_slab_name, mask, prot->name);
2244                         prot->twsk_prot->twsk_slab =
2245                                 kmem_cache_create(prot->twsk_prot->twsk_slab_name,
2246                                                   prot->twsk_prot->twsk_obj_size,
2247                                                   0,
2248                                                   SLAB_HWCACHE_ALIGN |
2249                                                         prot->slab_flags,
2250                                                   NULL);
2251                         if (prot->twsk_prot->twsk_slab == NULL)
2252                                 goto out_free_timewait_sock_slab_name;
2253                 }
2254         }
2255
2256         write_lock(&proto_list_lock);
2257         list_add(&prot->node, &proto_list);
2258         assign_proto_idx(prot);
2259         write_unlock(&proto_list_lock);
2260         return 0;
2261
2262 out_free_timewait_sock_slab_name:
2263         kfree(prot->twsk_prot->twsk_slab_name);
2264 out_free_request_sock_slab:
2265         if (prot->rsk_prot && prot->rsk_prot->slab) {
2266                 kmem_cache_destroy(prot->rsk_prot->slab);
2267                 prot->rsk_prot->slab = NULL;
2268         }
2269 out_free_request_sock_slab_name:
2270         kfree(prot->rsk_prot->slab_name);
2271 out_free_sock_slab:
2272         kmem_cache_destroy(prot->slab);
2273         prot->slab = NULL;
2274 out:
2275         return -ENOBUFS;
2276 }
2277 EXPORT_SYMBOL(proto_register);
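/*
 * Editorial sketch (not part of sock.c): a protocol registers its struct
 * proto once at module init, usually with alloc_slab set so sockets come
 * from a dedicated cache.  Hypothetical example (field subset only, all
 * "example" names are illustrative):
 *
 *	static struct proto example_proto = {
 *		.name		= "EXAMPLE",
 *		.owner		= THIS_MODULE,
 *		.obj_size	= sizeof(struct example_sock),
 *	};
 *
 *	static int __init example_init(void)
 *	{
 *		int err = proto_register(&example_proto, 1);
 *		if (err)
 *			return err;
 *		...register proto_ops / packet handlers...
 *		return 0;
 *	}
 *
 * proto_unregister() is the matching call on module unload.
 */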
2278
2279 void proto_unregister(struct proto *prot)
2280 {
2281         write_lock(&proto_list_lock);
2282         release_proto_idx(prot);
2283         list_del(&prot->node);
2284         write_unlock(&proto_list_lock);
2285
2286         if (prot->slab != NULL) {
2287                 kmem_cache_destroy(prot->slab);
2288                 prot->slab = NULL;
2289         }
2290
2291         if (prot->rsk_prot != NULL && prot->rsk_prot->slab != NULL) {
2292                 kmem_cache_destroy(prot->rsk_prot->slab);
2293                 kfree(prot->rsk_prot->slab_name);
2294                 prot->rsk_prot->slab = NULL;
2295         }
2296
2297         if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) {
2298                 kmem_cache_destroy(prot->twsk_prot->twsk_slab);
2299                 kfree(prot->twsk_prot->twsk_slab_name);
2300                 prot->twsk_prot->twsk_slab = NULL;
2301         }
2302 }
2303 EXPORT_SYMBOL(proto_unregister);
2304
2305 #ifdef CONFIG_PROC_FS
2306 static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
2307         __acquires(proto_list_lock)
2308 {
2309         read_lock(&proto_list_lock);
2310         return seq_list_start_head(&proto_list, *pos);
2311 }
2312
2313 static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2314 {
2315         return seq_list_next(v, &proto_list, pos);
2316 }
2317
2318 static void proto_seq_stop(struct seq_file *seq, void *v)
2319         __releases(proto_list_lock)
2320 {
2321         read_unlock(&proto_list_lock);
2322 }
2323
2324 static char proto_method_implemented(const void *method)
2325 {
2326         return method == NULL ? 'n' : 'y';
2327 }
2328
2329 static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
2330 {
2331         seq_printf(seq, "%-9s %4u %6d  %6d   %-3s %6u   %-3s  %-10s "
2332                         "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
2333                    proto->name,
2334                    proto->obj_size,
2335                    sock_prot_inuse_get(seq_file_net(seq), proto),
2336                    proto->memory_allocated != NULL ? atomic_read(proto->memory_allocated) : -1,
2337                    proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
2338                    proto->max_header,
2339                    proto->slab == NULL ? "no" : "yes",
2340                    module_name(proto->owner),
2341                    proto_method_implemented(proto->close),
2342                    proto_method_implemented(proto->connect),
2343                    proto_method_implemented(proto->disconnect),
2344                    proto_method_implemented(proto->accept),
2345                    proto_method_implemented(proto->ioctl),
2346                    proto_method_implemented(proto->init),
2347                    proto_method_implemented(proto->destroy),
2348                    proto_method_implemented(proto->shutdown),
2349                    proto_method_implemented(proto->setsockopt),
2350                    proto_method_implemented(proto->getsockopt),
2351                    proto_method_implemented(proto->sendmsg),
2352                    proto_method_implemented(proto->recvmsg),
2353                    proto_method_implemented(proto->sendpage),
2354                    proto_method_implemented(proto->bind),
2355                    proto_method_implemented(proto->backlog_rcv),
2356                    proto_method_implemented(proto->hash),
2357                    proto_method_implemented(proto->unhash),
2358                    proto_method_implemented(proto->get_port),
2359                    proto_method_implemented(proto->enter_memory_pressure));
2360 }
2361
2362 static int proto_seq_show(struct seq_file *seq, void *v)
2363 {
2364         if (v == &proto_list)
2365                 seq_printf(seq, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
2366                            "protocol",
2367                            "size",
2368                            "sockets",
2369                            "memory",
2370                            "press",
2371                            "maxhdr",
2372                            "slab",
2373                            "module",
2374                            "cl co di ac io in de sh ss gs se re sp bi br ha uh gp em\n");
2375         else
2376                 proto_seq_printf(seq, list_entry(v, struct proto, node));
2377         return 0;
2378 }
2379
2380 static const struct seq_operations proto_seq_ops = {
2381         .start  = proto_seq_start,
2382         .next   = proto_seq_next,
2383         .stop   = proto_seq_stop,
2384         .show   = proto_seq_show,
2385 };
2386
2387 static int proto_seq_open(struct inode *inode, struct file *file)
2388 {
2389         return seq_open_net(inode, file, &proto_seq_ops,
2390                             sizeof(struct seq_net_private));
2391 }
2392
2393 static const struct file_operations proto_seq_fops = {
2394         .owner          = THIS_MODULE,
2395         .open           = proto_seq_open,
2396         .read           = seq_read,
2397         .llseek         = seq_lseek,
2398         .release        = seq_release_net,
2399 };
2400
2401 static __net_init int proto_init_net(struct net *net)
2402 {
2403         if (!proc_net_fops_create(net, "protocols", S_IRUGO, &proto_seq_fops))
2404                 return -ENOMEM;
2405
2406         return 0;
2407 }
2408
2409 static __net_exit void proto_exit_net(struct net *net)
2410 {
2411         proc_net_remove(net, "protocols");
2412 }
2413
2414
2415 static __net_initdata struct pernet_operations proto_net_ops = {
2416         .init = proto_init_net,
2417         .exit = proto_exit_net,
2418 };
2419
2420 static int __init proto_init(void)
2421 {
2422         return register_pernet_subsys(&proto_net_ops);
2423 }
2424
2425 subsys_initcall(proto_init);
2426
2427 #endif /* PROC_FS */