X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=Documentation%2Ffilesystems%2FLocking;h=18b9d0ca0630e281bb20dc5c5991e4b6f06b4643;hb=955234755ce4a2c33cfc558912aa8f2148cc1fc6;hp=91ec4b40ebfe2b722a02ace1798ff17f7447df83;hpb=ed2f2f9b3ff8debdf512f7687b232c3c1d7d60d7;p=safe%2Fjmp%2Flinux-2.6 diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 91ec4b4..18b9d0c 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -90,17 +90,15 @@ of the locking scheme for directory operations. prototypes: struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); - void (*read_inode) (struct inode *); void (*dirty_inode) (struct inode *); int (*write_inode) (struct inode *, int); - void (*put_inode) (struct inode *); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); void (*write_super) (struct super_block *); int (*sync_fs)(struct super_block *sb, int wait); - void (*write_super_lockfs) (struct super_block *); - void (*unlockfs) (struct super_block *); + int (*freeze_fs) (struct super_block *); + int (*unfreeze_fs) (struct super_block *); int (*statfs) (struct dentry *, struct kstatfs *); int (*remount_fs) (struct super_block *, int *, char *); void (*clear_inode) (struct inode *); @@ -111,30 +109,28 @@ prototypes: locking rules: All may block. - BKL s_lock s_umount -alloc_inode: no no no -destroy_inode: no -read_inode: no (see below) -dirty_inode: no (must not sleep) -write_inode: no -put_inode: no -drop_inode: no !!!inode_lock!!! -delete_inode: no -put_super: yes yes no -write_super: no yes read -sync_fs: no no read -write_super_lockfs: ? -unlockfs: ? -statfs: no no no -remount_fs: yes yes maybe (see below) -clear_inode: no -umount_begin: yes no no -show_options: no (vfsmount->sem) -quota_read: no no no (see below) -quota_write: no no no (see below) - -->read_inode() is not a method - it's a callback used in iget(). -->remount_fs() will have the s_umount lock if it's already mounted. + None have BKL + s_umount +alloc_inode: +destroy_inode: +dirty_inode: (must not sleep) +write_inode: +drop_inode: !!!inode_lock!!! +delete_inode: +put_super: write +write_super: read +sync_fs: read +freeze_fs: read +unfreeze_fs: read +statfs: no +remount_fs: maybe (see below) +clear_inode: +umount_begin: no +show_options: no (namespace_sem) +quota_read: no (see below) +quota_write: no (see below) + +->remount_fs() will have the s_umount exclusive lock if it's already mounted. When called from get_sb_single, it does NOT have the s_umount lock. ->quota_read() and ->quota_write() functions are both guaranteed to be the only ones operating on the quota file by the quota code (via @@ -149,8 +145,8 @@ prototypes: void (*kill_sb) (struct super_block *); locking rules: may block BKL -get_sb yes yes -kill_sb yes yes +get_sb yes no +kill_sb yes no ->get_sb() returns error or 0 with locked superblock attached to the vfsmount (exclusive on ->s_umount). @@ -166,8 +162,12 @@ prototypes: int (*set_page_dirty)(struct page *page); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); - int (*prepare_write)(struct file *, struct page *, unsigned, unsigned); - int (*commit_write)(struct file *, struct page *, unsigned, unsigned); + int (*write_begin)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); + int (*write_end)(struct file *, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); sector_t (*bmap)(struct address_space *, sector_t); int (*invalidatepage) (struct page *, unsigned long); int (*releasepage) (struct page *, int); @@ -178,22 +178,23 @@ prototypes: locking rules: All except set_page_dirty may block - BKL PageLocked(page) + BKL PageLocked(page) i_sem writepage: no yes, unlocks (see below) readpage: no yes, unlocks sync_page: no maybe writepages: no set_page_dirty no no readpages: no -prepare_write: no yes -commit_write: no yes -bmap: yes +write_begin: no locks the page yes +write_end: no yes, unlocks yes +perform_write: no n/a yes +bmap: no invalidatepage: no yes releasepage: no yes direct_IO: no launder_page: no yes - ->prepare_write(), ->commit_write(), ->sync_page() and ->readpage() + ->write_begin(), ->write_end(), ->sync_page() and ->readpage() may be called from the request handler (/dev/loop). ->readpage() unlocks the page, either synchronously or via I/O @@ -221,7 +222,7 @@ against the page the filesystem should redirty the page with redirty_page_for_writepage(), then unlock the page and return zero. This may also be done to avoid internal deadlocks, but rarely. -If the filesytem is called for sync then it must wait on any +If the filesystem is called for sync then it must wait on any in-progress I/O and then start new I/O. The filesystem should unlock the page synchronously, before returning to the @@ -394,11 +395,10 @@ prototypes: unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long); int (*check_flags)(int); - int (*dir_notify)(struct file *, unsigned long); }; locking rules: - All except ->poll() may block. + All may block. BKL llseek: no (see below) read: no @@ -411,12 +411,12 @@ ioctl: yes (see below) unlocked_ioctl: no (see below) compat_ioctl: no mmap: no -open: maybe (see below) +open: no flush: no release: no fsync: no (see below) aio_fsync: no -fasync: yes (see below) +fasync: no lock: yes readv: no writev: no @@ -424,7 +424,6 @@ sendfile: no sendpage: no get_unmapped_area: no check_flags: no -dir_notify: no ->llseek() locking has moved from llseek to the individual llseek implementations. If your fs is not using generic_file_llseek, you @@ -433,21 +432,17 @@ For many filesystems, it is probably safe to acquire the inode semaphore. Note some filesystems (i.e. remote ones) provide no protection for i_size so you will need to use the BKL. -->open() locking is in-transit: big lock partially moved into the methods. -The only exception is ->open() in the instances of file_operations that never -end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices -(chrdev_open() takes lock before replacing ->f_op and calling the secondary -method. As soon as we fix the handling of module reference counters all -instances of ->open() will be called without the BKL. - Note: ext2_release() was *the* source of contention on fs-intensive loads and dropping BKL on ->release() helps to get rid of that (we still grab BKL for cases when we close a file that had been opened r/w, but that can and should be done using the internal locking with smaller critical areas). Current worst offender is ext2_get_block()... -->fasync() is a mess. This area needs a big cleanup and that will probably -affect locking. +->fasync() is called without BKL protection, and is responsible for +maintaining the FASYNC bit in filp->f_flags. Most instances call +fasync_helper(), which does that maintenance, so it's not normally +something one needs to worry about. Return values > 0 will be mapped to +zero in the VFS layer. ->readdir() and ->ioctl() on directories must be changed. Ideally we would move ->readdir() to inode_operations and use a separate method for directory @@ -510,24 +505,37 @@ More details about quota locking can be found in fs/dquot.c. prototypes: void (*open)(struct vm_area_struct*); void (*close)(struct vm_area_struct*); - struct page *(*fault)(struct vm_area_struct*, struct fault_data *); - struct page *(*nopage)(struct vm_area_struct*, unsigned long, int *); - int (*page_mkwrite)(struct vm_area_struct *, struct page *); + int (*fault)(struct vm_area_struct*, struct vm_fault *); + int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *); + int (*access)(struct vm_area_struct *, unsigned long, void*, int, int); locking rules: BKL mmap_sem PageLocked(page) open: no yes close: no yes -fault: no yes -nopage: no yes -page_mkwrite: no yes no - - ->page_mkwrite() is called when a previously read-only page is -about to become writeable. The file system is responsible for -protecting against truncate races. Once appropriate action has been -taking to lock out truncate, the page range should be verified to be -within i_size. The page mapping should also be checked that it is not -NULL. +fault: no yes can return with page locked +page_mkwrite: no yes can return with page locked +access: no yes + + ->fault() is called when a previously not present pte is about +to be faulted in. The filesystem must find and return the page associated +with the passed in "pgoff" in the vm_fault structure. If it is possible that +the page may be truncated and/or invalidated, then the filesystem must lock +the page, then ensure it is not already truncated (the page lock will block +subsequent truncate), and then return with VM_FAULT_LOCKED, and the page +locked. The VM will unlock the page. + + ->page_mkwrite() is called when a previously read-only pte is +about to become writeable. The filesystem again must ensure that there are +no truncate/invalidate races, and then return with the page locked. If +the page has been truncated, the filesystem should not look up a new page +like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which +will cause the VM to retry the fault. + + ->access() is called when get_user_pages() fails in +acces_process_vm(), typically used to debug a process through +/proc/pid/mem or ptrace. This function is needed only for +VM_IO | VM_PFNMAP VMAs. ================================================================================ Dubious stuff @@ -537,4 +545,3 @@ NULL. ipc/shm.c::shm_delete() - may need BKL. ->read() and ->write() in many drivers are (probably) missing BKL. -drivers/sgi/char/graphics.c::sgi_graphics_nopage() - may need BKL.