dquot: cleanup dquot initialize routine

[safe/jmp/linux-2.6] / Documentation / filesystems / Locking
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking

index 247d7f6..06bbbed 100644 (file)
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -15,6 +15,7 @@ prototypes:
         int (*d_delete)(struct dentry *);
         void (*d_release)(struct dentry *);
         void (*d_iput)(struct dentry *, struct inode *);
+       char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen);
  
  locking rules:
         none have BKL
@@ -25,6 +26,7 @@ d_compare:    no              yes             no              no
  d_delete:      yes             no              yes             no
  d_release:     no              no              no              yes
  d_iput:                no              no              no              yes
+d_dname:       no              no              no              no
  
  --------------------------- inode_operations --------------------------- 
  prototypes:
@@ -52,7 +54,7 @@ ata *);
  
  locking rules:
         all may block, none have BKL
-               i_sem(inode)
+               i_mutex(inode)
  lookup:                yes
  create:                yes
  link:          yes (both)
@@ -72,7 +74,7 @@ setxattr:     yes
  getxattr:      no
  listxattr:     no
  removexattr:   yes
-       Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_sem on
+       Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
  victim.
         cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem.
         ->truncate() is never called directly - it's a callback, not a
@@ -88,17 +90,15 @@ of the locking scheme for directory operations.
  prototypes:
         struct inode *(*alloc_inode)(struct super_block *sb);
         void (*destroy_inode)(struct inode *);
-       void (*read_inode) (struct inode *);
         void (*dirty_inode) (struct inode *);
         int (*write_inode) (struct inode *, int);
-       void (*put_inode) (struct inode *);
         void (*drop_inode) (struct inode *);
         void (*delete_inode) (struct inode *);
         void (*put_super) (struct super_block *);
         void (*write_super) (struct super_block *);
         int (*sync_fs)(struct super_block *sb, int wait);
-       void (*write_super_lockfs) (struct super_block *);
-       void (*unlockfs) (struct super_block *);
+       int (*freeze_fs) (struct super_block *);
+       int (*unfreeze_fs) (struct super_block *);
         int (*statfs) (struct dentry *, struct kstatfs *);
         int (*remount_fs) (struct super_block *, int *, char *);
         void (*clear_inode) (struct inode *);
@@ -109,30 +109,28 @@ prototypes:
  
  locking rules:
         All may block.
-                       BKL     s_lock  s_umount
-alloc_inode:           no      no      no
-destroy_inode:         no
-read_inode:            no                              (see below)
-dirty_inode:           no                              (must not sleep)
-write_inode:           no
-put_inode:             no
-drop_inode:            no                              !!!inode_lock!!!
-delete_inode:          no
-put_super:             yes     yes     no
-write_super:           no      yes     read
-sync_fs:               no      no      read
-write_super_lockfs:    ?
-unlockfs:              ?
-statfs:                        no      no      no
-remount_fs:            no      yes     maybe           (see below)
-clear_inode:           no
-umount_begin:          yes     no      no
-show_options:          no                              (vfsmount->sem)
-quota_read:            no      no      no              (see below)
-quota_write:           no      no      no              (see below)
-
-->read_inode() is not a method - it's a callback used in iget().
-->remount_fs() will have the s_umount lock if it's already mounted.
+       None have BKL
+                       s_umount
+alloc_inode:
+destroy_inode:
+dirty_inode:                           (must not sleep)
+write_inode:
+drop_inode:                            !!!inode_lock!!!
+delete_inode:
+put_super:             write
+write_super:           read
+sync_fs:               read
+freeze_fs:             read
+unfreeze_fs:           read
+statfs:                        no
+remount_fs:            maybe           (see below)
+clear_inode:
+umount_begin:          no
+show_options:          no              (namespace_sem)
+quota_read:            no              (see below)
+quota_write:           no              (see below)
+
+->remount_fs() will have the s_umount exclusive lock if it's already mounted.
  When called from get_sb_single, it does NOT have the s_umount lock.
  ->quota_read() and ->quota_write() functions are both guaranteed to
  be the only ones operating on the quota file by the quota code (via
@@ -147,8 +145,8 @@ prototypes:
         void (*kill_sb) (struct super_block *);
  locking rules:
                 may block       BKL
-get_sb         yes             yes
-kill_sb                yes             yes
+get_sb         yes             no
+kill_sb                yes             no
  
  ->get_sb() returns error or 0 with locked superblock attached to the vfsmount
  (exclusive on ->s_umount).
@@ -164,32 +162,39 @@ prototypes:
         int (*set_page_dirty)(struct page *page);
         int (*readpages)(struct file *filp, struct address_space *mapping,
                         struct list_head *pages, unsigned nr_pages);
-       int (*prepare_write)(struct file *, struct page *, unsigned, unsigned);
-       int (*commit_write)(struct file *, struct page *, unsigned, unsigned);
+       int (*write_begin)(struct file *, struct address_space *mapping,
+                               loff_t pos, unsigned len, unsigned flags,
+                               struct page **pagep, void **fsdata);
+       int (*write_end)(struct file *, struct address_space *mapping,
+                               loff_t pos, unsigned len, unsigned copied,
+                               struct page *page, void *fsdata);
         sector_t (*bmap)(struct address_space *, sector_t);
         int (*invalidatepage) (struct page *, unsigned long);
         int (*releasepage) (struct page *, int);
         int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
                         loff_t offset, unsigned long nr_segs);
+       int (*launder_page) (struct page *);
  
  locking rules:
         All except set_page_dirty may block
  
-                       BKL     PageLocked(page)
+                       BKL     PageLocked(page)        i_mutex
  writepage:             no      yes, unlocks (see below)
  readpage:              no      yes, unlocks
  sync_page:             no      maybe
  writepages:            no
  set_page_dirty         no      no
  readpages:             no
-prepare_write:         no      yes
-commit_write:          no      yes
-bmap:                  yes
+write_begin:           no      locks the page          yes
+write_end:             no      yes, unlocks            yes
+perform_write:         no      n/a                     yes
+bmap:                  no
  invalidatepage:                no      yes
  releasepage:           no      yes
  direct_IO:             no
+launder_page:          no      yes
  
-       ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage()
+       ->write_begin(), ->write_end(), ->sync_page() and ->readpage()
  may be called from the request handler (/dev/loop).
  
         ->readpage() unlocks the page, either synchronously or via I/O
@@ -217,7 +222,7 @@ against the page the filesystem should redirty the page with
  redirty_page_for_writepage(), then unlock the page and return zero.
  This may also be done to avoid internal deadlocks, but rarely.
  
-If the filesytem is called for sync then it must wait on any
+If the filesystem is called for sync then it must wait on any
  in-progress I/O and then start new I/O.
  
  The filesystem should unlock the page synchronously, before returning to the
@@ -281,6 +286,12 @@ buffers from the page in preparation for freeing it.  It returns zero to
  indicate that the buffers are (or may be) freeable.  If ->releasepage is zero,
  the kernel assumes that the fs has no private interest in the buffers.
  
+       ->launder_page() may be called prior to releasing a page if
+it is still found to be dirty. It returns zero if the page was successfully
+cleaned, or an error value if not. Note that in order to prevent the page
+getting mapped back in and redirtied, it needs to be kept locked
+across the entire operation.
+
         Note: currently almost all instances of address_space methods are
  using BKL for internal serialization and that's one of the worst sources
  of contention. Normally they are calling library functions (in fs/buffer.c)
@@ -356,10 +367,9 @@ The last two are called only from check_disk_change().
  prototypes:
         loff_t (*llseek) (struct file *, loff_t, int);
         ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
-       ssize_t (*aio_read) (struct kiocb *, char __user *, size_t, loff_t);
         ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
-       ssize_t (*aio_write) (struct kiocb *, const char __user *, size_t,
-                       loff_t);
+       ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+       ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
         int (*readdir) (struct file *, void *, filldir_t);
         unsigned int (*poll) (struct file *, struct poll_table_struct *);
         int (*ioctl) (struct inode *, struct file *, unsigned int,
@@ -385,11 +395,10 @@ prototypes:
         unsigned long (*get_unmapped_area)(struct file *, unsigned long,
                         unsigned long, unsigned long, unsigned long);
         int (*check_flags)(int);
-       int (*dir_notify)(struct file *, unsigned long);
  };
  
  locking rules:
-       All except ->poll() may block.
+       All may block.
                         BKL
  llseek:                        no      (see below)
  read:                  no
@@ -402,12 +411,12 @@ ioctl:                    yes     (see below)
  unlocked_ioctl:                no      (see below)
  compat_ioctl:          no
  mmap:                  no
-open:                  maybe   (see below)
+open:                  no
  flush:                 no
  release:               no
  fsync:                 no      (see below)
  aio_fsync:             no
-fasync:                        yes     (see below)
+fasync:                        no
  lock:                  yes
  readv:                 no
  writev:                        no
@@ -415,7 +424,6 @@ sendfile:           no
  sendpage:              no
  get_unmapped_area:     no
  check_flags:           no
-dir_notify:            no
  
  ->llseek() locking has moved from llseek to the individual llseek
  implementations.  If your fs is not using generic_file_llseek, you
@@ -424,21 +432,17 @@ For many filesystems, it is probably safe to acquire the inode
  semaphore.  Note some filesystems (i.e. remote ones) provide no
  protection for i_size so you will need to use the BKL.
  
-->open() locking is in-transit: big lock partially moved into the methods.
-The only exception is ->open() in the instances of file_operations that never
-end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices
-(chrdev_open() takes lock before replacing ->f_op and calling the secondary
-method. As soon as we fix the handling of module reference counters all
-instances of ->open() will be called without the BKL.
-
  Note: ext2_release() was *the* source of contention on fs-intensive
  loads and dropping BKL on ->release() helps to get rid of that (we still
  grab BKL for cases when we close a file that had been opened r/w, but that
  can and should be done using the internal locking with smaller critical areas).
  Current worst offender is ext2_get_block()...
  
-->fasync() is a mess. This area needs a big cleanup and that will probably
-affect locking.
+->fasync() is called without BKL protection, and is responsible for
+maintaining the FASYNC bit in filp->f_flags.  Most instances call
+fasync_helper(), which does that maintenance, so it's not normally
+something one needs to worry about.  Return values > 0 will be mapped to
+zero in the VFS layer.
  
  ->readdir() and ->ioctl() on directories must be changed. Ideally we would
  move ->readdir() to inode_operations and use a separate method for directory
@@ -452,17 +456,10 @@ doesn't take the BKL.
  ->read on directories probably must go away - we should just enforce -EISDIR
  in sys_read() and friends.
  
-->fsync() has i_sem on inode.
+->fsync() has i_mutex on inode.
  
  --------------------------- dquot_operations -------------------------------
  prototypes:
-       int (*initialize) (struct inode *, int);
-       int (*drop) (struct inode *);
-       int (*alloc_space) (struct inode *, qsize_t, int);
-       int (*alloc_inode) (const struct inode *, unsigned long);
-       int (*free_space) (struct inode *, qsize_t);
-       int (*free_inode) (const struct inode *, unsigned long);
-       int (*transfer) (struct inode *, struct iattr *);
         int (*write_dquot) (struct dquot *);
         int (*acquire_dquot) (struct dquot *);
         int (*release_dquot) (struct dquot *);
@@ -475,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations.
  What filesystem should expect from the generic quota functions:
  
                 FS recursion    Held locks when called
-initialize:    yes             maybe dqonoff_sem
-drop:          yes             -
-alloc_space:   ->mark_dirty()  -
-alloc_inode:   ->mark_dirty()  -
-free_space:    ->mark_dirty()  -
-free_inode:    ->mark_dirty()  -
-transfer:      yes             -
  write_dquot:   yes             dqonoff_sem or dqptr_sem
  acquire_dquot: yes             dqonoff_sem or dqptr_sem
  release_dquot: yes             dqonoff_sem or dqptr_sem
@@ -491,23 +481,43 @@ write_info:       yes             dqonoff_sem
  FS recursion means calling ->quota_read() and ->quota_write() from superblock
  operations.
  
-->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
-only directly by the filesystem and do not call any fs functions only
-the ->mark_dirty() operation.
-
  More details about quota locking can be found in fs/dquot.c.
  
  --------------------------- vm_operations_struct -----------------------------
  prototypes:
         void (*open)(struct vm_area_struct*);
         void (*close)(struct vm_area_struct*);
-       struct page *(*nopage)(struct vm_area_struct*, unsigned long, int *);
+       int (*fault)(struct vm_area_struct*, struct vm_fault *);
+       int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
+       int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
  
  locking rules:
-               BKL     mmap_sem
+               BKL     mmap_sem        PageLocked(page)
  open:          no      yes
  close:         no      yes
-nopage:                no      yes
+fault:         no      yes             can return with page locked
+page_mkwrite:  no      yes             can return with page locked
+access:                no      yes
+
+       ->fault() is called when a previously not present pte is about
+to be faulted in. The filesystem must find and return the page associated
+with the passed in "pgoff" in the vm_fault structure. If it is possible that
+the page may be truncated and/or invalidated, then the filesystem must lock
+the page, then ensure it is not already truncated (the page lock will block
+subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
+locked. The VM will unlock the page.
+
+       ->page_mkwrite() is called when a previously read-only pte is
+about to become writeable. The filesystem again must ensure that there are
+no truncate/invalidate races, and then return with the page locked. If
+the page has been truncated, the filesystem should not look up a new page
+like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
+will cause the VM to retry the fault.
+
+       ->access() is called when get_user_pages() fails in
+access_process_vm(), typically used to debug a process through
+/proc/pid/mem or ptrace.  This function is needed only for
+VM_IO | VM_PFNMAP VMAs.
  
  ================================================================================
                         Dubious stuff
@@ -517,4 +527,3 @@ nopage:             no      yes
  
  ipc/shm.c::shm_delete() - may need BKL.
  ->read() and ->write() in many drivers are (probably) missing BKL.
-drivers/sgi/char/graphics.c::sgi_graphics_nopage() - may need BKL.