[PATCH] msync(MS_SYNC): don't hold mmap_sem while syncing
mm/msync.c
/*
 *      linux/mm/msync.c
 *
 * Copyright (C) 1994-1999  Linus Torvalds
 */

/*
 * The msync() system call.
 */
#include <linux/slab.h>
#include <linux/pagemap.h>
#include <linux/mm.h>
#include <linux/mman.h>
#include <linux/hugetlb.h>
#include <linux/writeback.h>
#include <linux/file.h>
#include <linux/syscalls.h>

#include <asm/pgtable.h>
#include <asm/tlbflush.h>

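/*
 * msync_page_range() and its helpers below walk the page tables of one
 * VMA level by level (pgd -> pud -> pmd -> pte), transferring hardware
 * dirty bits into the struct page so that writeback can find the pages.
 * Each level returns the number of pages it newly marked dirty.
 */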
static unsigned long msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                unsigned long addr, unsigned long end)
{
        pte_t *pte;
        spinlock_t *ptl;
        int progress = 0;
        unsigned long ret = 0;

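/*
 * The pte lock is taken in batches: after every 64 or so entries the walk
 * breaks out if a reschedule is due or the lock is contended, and then
 * restarts from the current address via the "again" label.
 */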
again:
        pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
        do {
                struct page *page;

                if (progress >= 64) {
                        progress = 0;
                        if (need_resched() || need_lockbreak(ptl))
                                break;
                }
                progress++;
                if (!pte_present(*pte))
                        continue;
                if (!pte_maybe_dirty(*pte))
                        continue;
                page = vm_normal_page(vma, addr, *pte);
                if (!page)
                        continue;
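                /*
                 * Move the dirty state from the pte (and, on architectures
                 * such as s390 that keep it per page, from the storage key)
                 * over to the struct page so the page can be written out.
                 */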
                if (ptep_clear_flush_dirty(vma, addr, pte) ||
                                page_test_and_clear_dirty(page))
                        ret += set_page_dirty(page);
                progress += 3;
        } while (pte++, addr += PAGE_SIZE, addr != end);
        pte_unmap_unlock(pte - 1, ptl);
        cond_resched();
        if (addr != end)
                goto again;
        return ret;
}

static inline unsigned long msync_pmd_range(struct vm_area_struct *vma,
                        pud_t *pud, unsigned long addr, unsigned long end)
{
        pmd_t *pmd;
        unsigned long next;
        unsigned long ret = 0;

        pmd = pmd_offset(pud, addr);
        do {
                next = pmd_addr_end(addr, end);
                if (pmd_none_or_clear_bad(pmd))
                        continue;
                ret += msync_pte_range(vma, pmd, addr, next);
        } while (pmd++, addr = next, addr != end);
        return ret;
}

static inline unsigned long msync_pud_range(struct vm_area_struct *vma,
                        pgd_t *pgd, unsigned long addr, unsigned long end)
{
        pud_t *pud;
        unsigned long next;
        unsigned long ret = 0;

        pud = pud_offset(pgd, addr);
        do {
                next = pud_addr_end(addr, end);
                if (pud_none_or_clear_bad(pud))
                        continue;
                ret += msync_pmd_range(vma, pud, addr, next);
        } while (pud++, addr = next, addr != end);
        return ret;
}

static unsigned long msync_page_range(struct vm_area_struct *vma,
                                unsigned long addr, unsigned long end)
{
        pgd_t *pgd;
        unsigned long next;
        unsigned long ret = 0;

        /* For hugepages we can't go walking the page table normally,
         * but that's ok, hugetlbfs is memory based, so we don't need
         * to do anything more on an msync().
         */
        if (vma->vm_flags & VM_HUGETLB)
                return 0;

        BUG_ON(addr >= end);
        pgd = pgd_offset(vma->vm_mm, addr);
        flush_cache_range(vma, addr, end);
        do {
                next = pgd_addr_end(addr, end);
                if (pgd_none_or_clear_bad(pgd))
                        continue;
                ret += msync_pud_range(vma, pgd, addr, next);
        } while (pgd++, addr = next, addr != end);
        return ret;
}

/*
 * MS_SYNC syncs the entire file - including mappings.
 *
 * MS_ASYNC does not start I/O (it used to, up to 2.5.67).  Instead, it just
 * marks the relevant pages dirty.  The application may now run fsync() to
 * write out the dirty pages and wait on the writeout and check the result.
 * Or the application may run fadvise(FADV_DONTNEED) against the fd to start
 * async writeout immediately.
 * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to
 * applications.
 */
static int msync_interval(struct vm_area_struct *vma, unsigned long addr,
                        unsigned long end, int flags,
                        unsigned long *nr_pages_dirtied)
{
        struct file *file = vma->vm_file;

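        /* POSIX: MS_INVALIDATE may not be applied to a locked mapping. */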
        if ((flags & MS_INVALIDATE) && (vma->vm_flags & VM_LOCKED))
                return -EBUSY;

        if (file && (vma->vm_flags & VM_SHARED))
                *nr_pages_dirtied = msync_page_range(vma, addr, end);
        return 0;
}

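/*
 * mmap_sem is held for reading across the VMA walk, but is dropped and
 * re-taken around the paths that can block for a long time: dirty-page
 * throttling for MS_ASYNC and the actual write-and-wait for MS_SYNC.
 * After re-acquiring it the VMA has to be looked up again, since the
 * mapping may have changed while the semaphore was not held.
 */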
asmlinkage long sys_msync(unsigned long start, size_t len, int flags)
{
        unsigned long end;
        struct vm_area_struct *vma;
        int unmapped_error, error = -EINVAL;
        int done = 0;

        if (flags & ~(MS_ASYNC | MS_INVALIDATE | MS_SYNC))
                goto out;
        if (start & ~PAGE_MASK)
                goto out;
        if ((flags & MS_ASYNC) && (flags & MS_SYNC))
                goto out;
        error = -ENOMEM;
        len = (len + ~PAGE_MASK) & PAGE_MASK;
        end = start + len;
        if (end < start)
                goto out;
        error = 0;
        if (end == start)
                goto out;
        /*
         * If the interval [start,end) covers some unmapped address ranges,
         * just ignore them, but return -ENOMEM at the end.
         */
        down_read(&current->mm->mmap_sem);
        if (flags & MS_SYNC)
                current->flags |= PF_SYNCWRITE;
        vma = find_vma(current->mm, start);
        unmapped_error = 0;
        do {
                unsigned long nr_pages_dirtied = 0;
                struct file *file;

                /* Still start < end. */
                error = -ENOMEM;
                if (!vma)
                        goto out_unlock;
                /* Here start < vma->vm_end. */
                if (start < vma->vm_start) {
                        unmapped_error = -ENOMEM;
                        start = vma->vm_start;
                }
                /* Here vma->vm_start <= start < vma->vm_end. */
                if (end <= vma->vm_end) {
                        if (start < end) {
                                error = msync_interval(vma, start, end, flags,
                                                        &nr_pages_dirtied);
                                if (error)
                                        goto out_unlock;
                        }
                        error = unmapped_error;
                        done = 1;
                } else {
                        /* Here vma->vm_start <= start < vma->vm_end < end. */
                        error = msync_interval(vma, start, vma->vm_end, flags,
                                                &nr_pages_dirtied);
                        if (error)
                                goto out_unlock;
                }
                file = vma->vm_file;
                start = vma->vm_end;
                if ((flags & MS_ASYNC) && file && nr_pages_dirtied) {
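                        /*
                         * Drop mmap_sem while throttling the caller for the
                         * pages it just dirtied; the VMA is looked up again
                         * once the semaphore has been re-taken below.
                         */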
                        get_file(file);
                        up_read(&current->mm->mmap_sem);
                        balance_dirty_pages_ratelimited_nr(file->f_mapping,
                                                        nr_pages_dirtied);
                        fput(file);
                        down_read(&current->mm->mmap_sem);
                        vma = find_vma(current->mm, start);
                } else if ((flags & MS_SYNC) && file &&
                                (vma->vm_flags & VM_SHARED)) {
                        struct address_space *mapping;
                        int err;

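                        /*
                         * The MS_SYNC write-and-wait: start writeback, call
                         * ->fsync under i_mutex, then wait for the pages,
                         * all with mmap_sem dropped so the sync does not
                         * block other users of this mm.
                         */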
                        get_file(file);
                        up_read(&current->mm->mmap_sem);
                        mapping = file->f_mapping;
                        error = filemap_fdatawrite(mapping);
                        if (file->f_op && file->f_op->fsync) {
                                mutex_lock(&mapping->host->i_mutex);
                                err = file->f_op->fsync(file,file->f_dentry,1);
                                mutex_unlock(&mapping->host->i_mutex);
                                if (err && !error)
                                        error = err;
                        }
                        err = filemap_fdatawait(mapping);
                        if (err && !error)
                                error = err;
                        fput(file);
                        down_read(&current->mm->mmap_sem);
                        if (error)
                                goto out_unlock;
                        vma = find_vma(current->mm, start);
                } else {
                        vma = vma->vm_next;
                }
        } while (!done);
out_unlock:
        current->flags &= ~PF_SYNCWRITE;
        up_read(&current->mm->mmap_sem);
out:
        return error;
}
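The block comment above msync_interval() spells out the userspace pattern that
MS_ASYNC now expects: dirty a shared mapping, call msync(MS_ASYNC) to propagate
the dirty bits, then use fsync() (or fadvise(FADV_DONTNEED)) to actually write
and wait. A minimal userspace sketch of that pattern follows; the file name
"data.bin" and the one-page length are illustrative assumptions, and error
handling is kept to bare returns.

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        /* "data.bin" and the 4096-byte length are illustrative assumptions;
         * the file is expected to already exist and be at least a page long. */
        int fd = open("data.bin", O_RDWR);
        if (fd < 0)
                return 1;

        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (p == MAP_FAILED)
                return 1;

        memcpy(p, "hello", 5);          /* dirty the shared mapping */

        /* MS_ASYNC only marks the pages dirty; it starts no I/O here... */
        if (msync(p, 4096, MS_ASYNC) < 0)
                return 1;

        /* ...so the application writes out and waits itself via fsync(). */
        if (fsync(fd) < 0)
                return 1;

        munmap(p, 4096);
        close(fd);
        return 0;
}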