Btrfs: Add zlib compression support
[safe/jmp/linux-2.6] / fs / btrfs / file.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/fs.h>
20 #include <linux/pagemap.h>
21 #include <linux/highmem.h>
22 #include <linux/time.h>
23 #include <linux/init.h>
24 #include <linux/string.h>
25 #include <linux/smp_lock.h>
26 #include <linux/backing-dev.h>
27 #include <linux/mpage.h>
28 #include <linux/swap.h>
29 #include <linux/writeback.h>
30 #include <linux/statfs.h>
31 #include <linux/compat.h>
32 #include <linux/version.h>
33 #include "ctree.h"
34 #include "disk-io.h"
35 #include "transaction.h"
36 #include "btrfs_inode.h"
37 #include "ioctl.h"
38 #include "print-tree.h"
39 #include "tree-log.h"
40 #include "locking.h"
41 #include "compat.h"
42
43
44 /* simple helper to fault in pages and copy.  This should go away
45  * and be replaced with calls into generic code.
46  */
47 static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
48                                          int write_bytes,
49                                          struct page **prepared_pages,
50                                          const char __user * buf)
51 {
52         long page_fault = 0;
53         int i;
54         int offset = pos & (PAGE_CACHE_SIZE - 1);
55
56         for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
57                 size_t count = min_t(size_t,
58                                      PAGE_CACHE_SIZE - offset, write_bytes);
59                 struct page *page = prepared_pages[i];
60                 fault_in_pages_readable(buf, count);
61
62                 /* Copy data from userspace to the current page */
63                 kmap(page);
64                 page_fault = __copy_from_user(page_address(page) + offset,
65                                               buf, count);
66                 /* Flush processor's dcache for this page */
67                 flush_dcache_page(page);
68                 kunmap(page);
69                 buf += count;
70                 write_bytes -= count;
71
72                 if (page_fault)
73                         break;
74         }
75         return page_fault ? -EFAULT : 0;
76 }
77
78 /*
79  * unlocks pages after btrfs_file_write is done with them
80  */
81 static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
82 {
83         size_t i;
84         for (i = 0; i < num_pages; i++) {
85                 if (!pages[i])
86                         break;
87                 /* page checked is some magic around finding pages that
88                  * have been modified without going through btrfs_set_page_dirty
89                  * clear it here
90                  */
91                 ClearPageChecked(pages[i]);
92                 unlock_page(pages[i]);
93                 mark_page_accessed(pages[i]);
94                 page_cache_release(pages[i]);
95         }
96 }
97
98 /*
99  * after copy_from_user, pages need to be dirtied and we need to make
100  * sure holes are created between the current EOF and the start of
101  * any next extents (if required).
102  *
103  * this also makes the decision about creating an inline extent vs
104  * doing real data extents, marking pages dirty and delalloc as required.
105  */
106 static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
107                                    struct btrfs_root *root,
108                                    struct file *file,
109                                    struct page **pages,
110                                    size_t num_pages,
111                                    loff_t pos,
112                                    size_t write_bytes)
113 {
114         int err = 0;
115         int i;
116         struct inode *inode = fdentry(file)->d_inode;
117         struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
118         u64 hint_byte;
119         u64 num_bytes;
120         u64 start_pos;
121         u64 end_of_last_block;
122         u64 end_pos = pos + write_bytes;
123         loff_t isize = i_size_read(inode);
124
125         start_pos = pos & ~((u64)root->sectorsize - 1);
126         num_bytes = (write_bytes + pos - start_pos +
127                     root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
128
129         end_of_last_block = start_pos + num_bytes - 1;
130
131         lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
132         trans = btrfs_join_transaction(root, 1);
133         if (!trans) {
134                 err = -ENOMEM;
135                 goto out_unlock;
136         }
137         btrfs_set_trans_block_group(trans, inode);
138         hint_byte = 0;
139
140         if ((end_of_last_block & 4095) == 0) {
141                 printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
142         }
143         set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);
144
145         /* FIXME...EIEIO, ENOSPC and more */
146         /* insert any holes we need to create */
147         if (isize < start_pos) {
148                 u64 last_pos_in_file;
149                 u64 hole_size;
150                 u64 mask = root->sectorsize - 1;
151                 last_pos_in_file = (isize + mask) & ~mask;
152                 hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
153                 if (hole_size > 0) {
154                         btrfs_wait_ordered_range(inode, last_pos_in_file,
155                                                  last_pos_in_file + hole_size);
156                         mutex_lock(&BTRFS_I(inode)->extent_mutex);
157                         err = btrfs_drop_extents(trans, root, inode,
158                                                  last_pos_in_file,
159                                                  last_pos_in_file + hole_size,
160                                                  last_pos_in_file,
161                                                  &hint_byte);
162                         if (err)
163                                 goto failed;
164
165                         err = btrfs_insert_file_extent(trans, root,
166                                                        inode->i_ino,
167                                                        last_pos_in_file,
168                                                        0, 0, hole_size, 0,
169                                                        hole_size, 0, 0, 0);
170                         btrfs_drop_extent_cache(inode, last_pos_in_file,
171                                         last_pos_in_file + hole_size - 1, 0);
172                         mutex_unlock(&BTRFS_I(inode)->extent_mutex);
173                         btrfs_check_file(root, inode);
174                 }
175                 if (err)
176                         goto failed;
177         }
178
179         /* check for reserved extents on each page, we don't want
180          * to reset the delalloc bit on things that already have
181          * extents reserved.
182          */
183         btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
184         for (i = 0; i < num_pages; i++) {
185                 struct page *p = pages[i];
186                 SetPageUptodate(p);
187                 ClearPageChecked(p);
188                 set_page_dirty(p);
189         }
190         if (end_pos > isize) {
191                 i_size_write(inode, end_pos);
192                 btrfs_update_inode(trans, root, inode);
193         }
194 failed:
195         err = btrfs_end_transaction(trans, root);
196 out_unlock:
197         unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
198         return err;
199 }
200
201 /*
202  * this drops all the extents in the cache that intersect the range
203  * [start, end].  Existing extents are split as required.
204  */
/*
 * Drop every extent_map in the in-memory cache that intersects
 * [start, end].  A map that only partially overlaps the range is split:
 * the non-overlapping front and/or tail pieces are re-inserted as new
 * maps.  When skip_pinned is set, pinned extents are stepped over
 * instead of being removed.  Always returns 0.
 */
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
                            int skip_pinned)
{
        struct extent_map *em;
        struct extent_map *split = NULL;        /* holds the front piece */
        struct extent_map *split2 = NULL;       /* holds the tail piece */
        struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
        u64 len = end - start + 1;
        int ret;
        int testend = 1;        /* cleared when end means "to EOF" */
        unsigned long flags;
        int compressed = 0;

        WARN_ON(end < start);
        /* end == (u64)-1 means drop everything from start onwards */
        if (end == (u64)-1) {
                len = (u64)-1;
                testend = 0;
        }
        while(1) {
                /* allocate the split maps outside the tree spinlock */
                if (!split)
                        split = alloc_extent_map(GFP_NOFS);
                if (!split2)
                        split2 = alloc_extent_map(GFP_NOFS);

                spin_lock(&em_tree->lock);
                em = lookup_extent_mapping(em_tree, start, len);
                if (!em) {
                        spin_unlock(&em_tree->lock);
                        break;
                }
                flags = em->flags;
                if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
                        spin_unlock(&em_tree->lock);
                        /* pinned map covers the rest of the range: done */
                        if (em->start <= start &&
                            (!testend || em->start + em->len >= start + len)) {
                                free_extent_map(em);
                                break;
                        }
                        /* shrink/advance the range to skip the pinned map */
                        if (start < em->start) {
                                len = em->start - start;
                        } else {
                                len = start + len - (em->start + em->len);
                                start = em->start + em->len;
                        }
                        free_extent_map(em);
                        continue;
                }
                compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
                clear_bit(EXTENT_FLAG_PINNED, &em->flags);
                remove_extent_mapping(em_tree, em);

                /* re-insert the piece of em that sticks out before start */
                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
                    em->start < start) {
                        split->start = em->start;
                        split->len = start - em->start;
                        split->block_start = em->block_start;

                        /*
                         * compressed extents always map the entire on-disk
                         * compressed byte range, so keep block_len intact
                         */
                        if (compressed)
                                split->block_len = em->block_len;
                        else
                                split->block_len = split->len;

                        split->bdev = em->bdev;
                        split->flags = flags;
                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret);
                        free_extent_map(split);
                        split = split2;
                        split2 = NULL;
                }
                /* re-insert the piece of em that sticks out past the end */
                if (em->block_start < EXTENT_MAP_LAST_BYTE &&
                    testend && em->start + em->len > start + len) {
                        u64 diff = start + len - em->start;

                        split->start = start + len;
                        split->len = em->start + em->len - (start + len);
                        split->bdev = em->bdev;
                        split->flags = flags;

                        if (compressed) {
                                split->block_len = em->block_len;
                                split->block_start = em->block_start;
                        } else {
                                split->block_len = split->len;
                                split->block_start = em->block_start + diff;
                        }

                        ret = add_extent_mapping(em_tree, split);
                        BUG_ON(ret);
                        free_extent_map(split);
                        split = NULL;
                }
                spin_unlock(&em_tree->lock);

                /* once for us */
                free_extent_map(em);
                /* once for the tree*/
                free_extent_map(em);
        }
        /* drop any preallocated split maps that went unused */
        if (split)
                free_extent_map(split);
        if (split2)
                free_extent_map(split2);
        return 0;
}
310
/*
 * Sanity checker for a file's extent items; currently a no-op stub that
 * always returns 0.  The disabled body below walked every EXTENT_DATA
 * item of the inode and warned if items were out of order or left gaps.
 * Kept under #if 0 as a debugging aid.
 */
int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
{
        return 0;
#if 0
        struct btrfs_path *path;
        struct btrfs_key found_key;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *extent;
        u64 last_offset = 0;
        int nritems;
        int slot;
        int found_type;
        int ret;
        int err = 0;
        u64 extent_end = 0;

        path = btrfs_alloc_path();
        ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
                                       last_offset, 0);
        while(1) {
                nritems = btrfs_header_nritems(path->nodes[0]);
                if (path->slots[0] >= nritems) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret)
                                goto out;
                        nritems = btrfs_header_nritems(path->nodes[0]);
                }
                slot = path->slots[0];
                leaf = path->nodes[0];
                btrfs_item_key_to_cpu(leaf, &found_key, slot);
                if (found_key.objectid != inode->i_ino)
                        break;
                if (found_key.type != BTRFS_EXTENT_DATA_KEY)
                        goto out;

                /* extent items must appear in strictly increasing offset order */
                if (found_key.offset < last_offset) {
                        WARN_ON(1);
                        btrfs_print_leaf(root, leaf);
                        printk("inode %lu found offset %Lu expected %Lu\n",
                               inode->i_ino, found_key.offset, last_offset);
                        err = 1;
                        goto out;
                }
                extent = btrfs_item_ptr(leaf, slot,
                                        struct btrfs_file_extent_item);
                found_type = btrfs_file_extent_type(leaf, extent);
                if (found_type == BTRFS_FILE_EXTENT_REG) {
                        extent_end = found_key.offset +
                             btrfs_file_extent_num_bytes(leaf, extent);
                } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                        struct btrfs_item *item;
                        item = btrfs_item_nr(leaf, slot);
                        extent_end = found_key.offset +
                             btrfs_file_extent_inline_len(leaf, extent);
                        /* inline extents are rounded up to a full sector */
                        extent_end = (extent_end + root->sectorsize - 1) &
                                ~((u64)root->sectorsize -1 );
                }
                last_offset = extent_end;
                path->slots[0]++;
        }
        /* disabled: EOF-coverage check */
        if (0 && last_offset < inode->i_size) {
                WARN_ON(1);
                btrfs_print_leaf(root, leaf);
                printk("inode %lu found offset %Lu size %Lu\n", inode->i_ino,
                       last_offset, inode->i_size);
                err = 1;

        }
out:
        btrfs_free_path(path);
        return err;
#endif
}
384
385 /*
386  * this is very complex, but the basic idea is to drop all extents
387  * in the range start - end.  hint_block is filled in with a block number
388  * that would be a good hint to the block allocator for this file.
389  *
390  * If an extent intersects the range but is not entirely inside the range
391  * it is either truncated or split.  Anything entirely inside the range
392  * is deleted from the tree.
393  *
394  * inline_limit is used to tell this code which offsets in the file to keep
395  * if they contain inline extents.
396  */
/*
 * Drop all file extent items of 'inode' intersecting [start, end).
 * Extents that only partially overlap are truncated (front kept) and/or
 * split by inserting a "bookend" item covering the tail.  *hint_byte is
 * filled with a disk block number that is a good allocator hint for
 * subsequent writes into this range.
 *
 * inline_limit tells the code which file offsets must keep their inline
 * extents, but NOTE(review): it is forced to 0 immediately below,
 * overriding the caller's value — presumably because the compression
 * changes can't split inline extents; confirm before relying on it.
 *
 * Returns 0 on success or a negative errno from the tree search.
 */
int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
                       struct btrfs_root *root, struct inode *inode,
                       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
{
        u64 extent_end = 0;
        u64 search_start = start;
        u64 leaf_start;
        u64 ram_bytes = 0;
        u8 compression = 0;
        u8 encryption = 0;
        u16 other_encoding = 0;
        u64 root_gen;
        u64 root_owner;
        struct extent_buffer *leaf;
        struct btrfs_file_extent_item *extent;
        struct btrfs_path *path;
        struct btrfs_key key;
        struct btrfs_file_extent_item old;      /* copy of the item pre-split */
        int keep;               /* keep (truncate) the item instead of deleting */
        int slot;
        int bookend;            /* a tail piece must be re-inserted */
        int found_type;
        int found_extent;
        int found_inline;
        int recow;              /* leaf changed under us; redo the search */
        int ret;

        /* see NOTE(review) in the header comment */
        inline_limit = 0;
        /* invalidate the in-memory cache for the whole range first */
        btrfs_drop_extent_cache(inode, start, end - 1, 0);

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;
        while(1) {
                recow = 0;
                btrfs_release_path(root, path);
                ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
                                               search_start, -1);
                if (ret < 0)
                        goto out;
                if (ret > 0) {
                        if (path->slots[0] == 0) {
                                ret = 0;
                                goto out;
                        }
                        /* step back to the item that may cover search_start */
                        path->slots[0]--;
                }
next_slot:
                keep = 0;
                bookend = 0;
                found_extent = 0;
                found_inline = 0;
                leaf_start = 0;
                root_gen = 0;
                root_owner = 0;
                extent = NULL;
                leaf = path->nodes[0];
                slot = path->slots[0];
                ret = 0;
                btrfs_item_key_to_cpu(leaf, &key, slot);
                /* past the end of the range: done */
                if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
                    key.offset >= end) {
                        goto out;
                }
                /* left the inode's extent items entirely: done */
                if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
                    key.objectid != inode->i_ino) {
                        goto out;
                }
                if (recow) {
                        search_start = key.offset;
                        continue;
                }
                if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
                        extent = btrfs_item_ptr(leaf, slot,
                                                struct btrfs_file_extent_item);
                        found_type = btrfs_file_extent_type(leaf, extent);
                        /* remember encoding so a bookend copy can carry it */
                        compression = btrfs_file_extent_compression(leaf,
                                                                    extent);
                        encryption = btrfs_file_extent_encryption(leaf,
                                                                  extent);
                        other_encoding = btrfs_file_extent_other_encoding(leaf,
                                                                  extent);
                        if (found_type == BTRFS_FILE_EXTENT_REG) {
                                extent_end =
                                     btrfs_file_extent_disk_bytenr(leaf,
                                                                   extent);
                                if (extent_end)
                                        *hint_byte = extent_end;

                                extent_end = key.offset +
                                     btrfs_file_extent_num_bytes(leaf, extent);
                                ram_bytes = btrfs_file_extent_ram_bytes(leaf,
                                                                extent);
                                found_extent = 1;
                        } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
                                found_inline = 1;
                                extent_end = key.offset +
                                     btrfs_file_extent_inline_len(leaf, extent);
                        }
                } else {
                        extent_end = search_start;
                }

                /* we found nothing we can drop */
                if ((!found_extent && !found_inline) ||
                    search_start >= extent_end) {
                        int nextret;
                        u32 nritems;
                        nritems = btrfs_header_nritems(leaf);
                        if (slot >= nritems - 1) {
                                nextret = btrfs_next_leaf(root, path);
                                if (nextret)
                                        goto out;
                                /* next_leaf may have COWed; re-check the key */
                                recow = 1;
                        } else {
                                path->slots[0]++;
                        }
                        goto next_slot;
                }

                if (found_inline) {
                        /* inline extents occupy whole sectors logically */
                        u64 mask = root->sectorsize - 1;
                        search_start = (extent_end + mask) & ~mask;
                } else
                        search_start = extent_end;

                if (end <= extent_end && start >= key.offset && found_inline)
                        *hint_byte = EXTENT_MAP_INLINE;

                if (found_extent) {
                        /* snapshot the item before the path is released */
                        read_extent_buffer(leaf, &old, (unsigned long)extent,
                                           sizeof(old));
                        root_gen = btrfs_header_generation(leaf);
                        root_owner = btrfs_header_owner(leaf);
                        leaf_start = leaf->start;
                }

                /* the extent extends past 'end': a tail piece survives */
                if (end < extent_end && end >= key.offset) {
                        bookend = 1;
                        if (found_inline && start <= key.offset)
                                keep = 1;
                }
                /* truncate existing extent */
                if (start > key.offset) {
                        u64 new_num;
                        u64 old_num;
                        keep = 1;
                        WARN_ON(start & (root->sectorsize - 1));
                        if (found_extent) {
                                new_num = start - key.offset;
                                old_num = btrfs_file_extent_num_bytes(leaf,
                                                                      extent);
                                *hint_byte =
                                        btrfs_file_extent_disk_bytenr(leaf,
                                                                      extent);
                                /* holes (bytenr 0) don't count toward i_bytes */
                                if (btrfs_file_extent_disk_bytenr(leaf,
                                                                  extent)) {
                                        inode_sub_bytes(inode, old_num -
                                                        new_num);
                                }
                                btrfs_set_file_extent_num_bytes(leaf, extent,
                                                                new_num);
                                btrfs_mark_buffer_dirty(leaf);
                        } else if (key.offset < inline_limit &&
                                   (end > extent_end) &&
                                   (inline_limit < extent_end)) {
                                /* unreachable while inline_limit is forced to 0 */
                                u32 new_size;
                                new_size = btrfs_file_extent_calc_inline_size(
                                                   inline_limit - key.offset);
                                inode_sub_bytes(inode, extent_end -
                                                inline_limit);
                                btrfs_truncate_item(trans, root, path,
                                                    new_size, 1);
                        }
                }
                /* delete the entire extent */
                if (!keep) {
                        if (found_inline)
                                inode_sub_bytes(inode, extent_end -
                                                key.offset);
                        ret = btrfs_del_item(trans, root, path);
                        /* TODO update progress marker and return */
                        BUG_ON(ret);
                        extent = NULL;
                        btrfs_release_path(root, path);
                        /* the extent will be freed later */
                }
                /* inline tail piece: shrink the item from the front */
                if (bookend && found_inline && start <= key.offset) {
                        u32 new_size;
                        new_size = btrfs_file_extent_calc_inline_size(
                                                   extent_end - end);
                        inode_sub_bytes(inode, end - key.offset);
                        ret = btrfs_truncate_item(trans, root, path,
                                                  new_size, 0);
                        BUG_ON(ret);
                }
                /* create bookend, splitting the extent in two */
                if (bookend && found_extent) {
                        u64 disk_bytenr;
                        struct btrfs_key ins;
                        ins.objectid = inode->i_ino;
                        ins.offset = end;
                        btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
                        btrfs_release_path(root, path);
                        ret = btrfs_insert_empty_item(trans, root, path, &ins,
                                                      sizeof(*extent));
                        BUG_ON(ret);

                        /* start from the saved copy of the original item */
                        leaf = path->nodes[0];
                        extent = btrfs_item_ptr(leaf, path->slots[0],
                                                struct btrfs_file_extent_item);
                        write_extent_buffer(leaf, &old,
                                            (unsigned long)extent, sizeof(old));

                        btrfs_set_file_extent_compression(leaf, extent,
                                                          compression);
                        btrfs_set_file_extent_encryption(leaf, extent,
                                                         encryption);
                        btrfs_set_file_extent_other_encoding(leaf, extent,
                                                             other_encoding);
                        /* tail piece starts 'end - key.offset' into the extent */
                        btrfs_set_file_extent_offset(leaf, extent,
                                    le64_to_cpu(old.offset) + end - key.offset);
                        WARN_ON(le64_to_cpu(old.num_bytes) <
                                (extent_end - end));
                        btrfs_set_file_extent_num_bytes(leaf, extent,
                                                        extent_end - end);

                        /*
                         * set the ram bytes to the size of the full extent
                         * before splitting.  This is a worst case flag,
                         * but its the best we can do because we don't know
                         * how splitting affects compression
                         */
                        btrfs_set_file_extent_ram_bytes(leaf, extent,
                                                        ram_bytes);
                        btrfs_set_file_extent_type(leaf, extent,
                                                   BTRFS_FILE_EXTENT_REG);

                        btrfs_mark_buffer_dirty(path->nodes[0]);

                        /* both halves now reference the same disk extent */
                        disk_bytenr = le64_to_cpu(old.disk_bytenr);
                        if (disk_bytenr != 0) {
                                ret = btrfs_inc_extent_ref(trans, root,
                                                disk_bytenr,
                                                le64_to_cpu(old.disk_num_bytes),
                                                leaf->start,
                                                root->root_key.objectid,
                                                trans->transid, ins.objectid);
                                BUG_ON(ret);
                        }
                        btrfs_release_path(root, path);
                        if (disk_bytenr != 0) {
                                inode_add_bytes(inode, extent_end - end);
                        }
                }

                /* drop the disk reference of a fully-deleted real extent */
                if (found_extent && !keep) {
                        u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);

                        if (disk_bytenr != 0) {
                                inode_sub_bytes(inode,
                                                le64_to_cpu(old.num_bytes));
                                ret = btrfs_free_extent(trans, root,
                                                disk_bytenr,
                                                le64_to_cpu(old.disk_num_bytes),
                                                leaf_start, root_owner,
                                                root_gen, key.objectid, 0);
                                BUG_ON(ret);
                                *hint_byte = disk_bytenr;
                        }
                }

                if (search_start >= end) {
                        ret = 0;
                        goto out;
                }
        }
out:
        btrfs_free_path(path);
        btrfs_check_file(root, inode);
        return ret;
}
679
680 /*
681  * this gets pages into the page cache and locks them down, it also properly
682  * waits for data=ordered extents to finish before allowing the pages to be
683  * modified.
684  */
685 static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
686                          struct page **pages, size_t num_pages,
687                          loff_t pos, unsigned long first_index,
688                          unsigned long last_index, size_t write_bytes)
689 {
690         int i;
691         unsigned long index = pos >> PAGE_CACHE_SHIFT;
692         struct inode *inode = fdentry(file)->d_inode;
693         int err = 0;
694         u64 start_pos;
695         u64 last_pos;
696
697         start_pos = pos & ~((u64)root->sectorsize - 1);
698         last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
699
700         memset(pages, 0, num_pages * sizeof(struct page *));
701 again:
702         for (i = 0; i < num_pages; i++) {
703                 pages[i] = grab_cache_page(inode->i_mapping, index + i);
704                 if (!pages[i]) {
705                         err = -ENOMEM;
706                         BUG_ON(1);
707                 }
708                 wait_on_page_writeback(pages[i]);
709         }
710         if (start_pos < inode->i_size) {
711                 struct btrfs_ordered_extent *ordered;
712                 lock_extent(&BTRFS_I(inode)->io_tree,
713                             start_pos, last_pos - 1, GFP_NOFS);
714                 ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1);
715                 if (ordered &&
716                     ordered->file_offset + ordered->len > start_pos &&
717                     ordered->file_offset < last_pos) {
718                         btrfs_put_ordered_extent(ordered);
719                         unlock_extent(&BTRFS_I(inode)->io_tree,
720                                       start_pos, last_pos - 1, GFP_NOFS);
721                         for (i = 0; i < num_pages; i++) {
722                                 unlock_page(pages[i]);
723                                 page_cache_release(pages[i]);
724                         }
725                         btrfs_wait_ordered_range(inode, start_pos,
726                                                  last_pos - start_pos);
727                         goto again;
728                 }
729                 if (ordered)
730                         btrfs_put_ordered_extent(ordered);
731
732                 clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
733                                   last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
734                                   GFP_NOFS);
735                 unlock_extent(&BTRFS_I(inode)->io_tree,
736                               start_pos, last_pos - 1, GFP_NOFS);
737         }
738         for (i = 0; i < num_pages; i++) {
739                 clear_page_dirty_for_io(pages[i]);
740                 set_page_extent_mapped(pages[i]);
741                 WARN_ON(!PageLocked(pages[i]));
742         }
743         return 0;
744 }
745
746 static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
747                                 size_t count, loff_t *ppos)
748 {
749         loff_t pos;
750         loff_t start_pos;
751         ssize_t num_written = 0;
752         ssize_t err = 0;
753         int ret = 0;
754         struct inode *inode = fdentry(file)->d_inode;
755         struct btrfs_root *root = BTRFS_I(inode)->root;
756         struct page **pages = NULL;
757         int nrptrs;
758         struct page *pinned[2];
759         unsigned long first_index;
760         unsigned long last_index;
761         int will_write;
762
763         will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
764                       (file->f_flags & O_DIRECT));
765
766         nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
767                      PAGE_CACHE_SIZE / (sizeof(struct page *)));
768         pinned[0] = NULL;
769         pinned[1] = NULL;
770
771         pos = *ppos;
772         start_pos = pos;
773
774         vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
775         current->backing_dev_info = inode->i_mapping->backing_dev_info;
776         err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
777         if (err)
778                 goto out_nolock;
779         if (count == 0)
780                 goto out_nolock;
781
782         err = file_remove_suid(file);
783         if (err)
784                 goto out_nolock;
785         file_update_time(file);
786
787         pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
788
789         mutex_lock(&inode->i_mutex);
790         first_index = pos >> PAGE_CACHE_SHIFT;
791         last_index = (pos + count) >> PAGE_CACHE_SHIFT;
792
793         /*
794          * if this is a nodatasum mount, force summing off for the inode
795          * all the time.  That way a later mount with summing on won't
796          * get confused
797          */
798         if (btrfs_test_opt(root, NODATASUM))
799                 btrfs_set_flag(inode, NODATASUM);
800
801         /*
802          * there are lots of better ways to do this, but this code
803          * makes sure the first and last page in the file range are
804          * up to date and ready for cow
805          */
806         if ((pos & (PAGE_CACHE_SIZE - 1))) {
807                 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
808                 if (!PageUptodate(pinned[0])) {
809                         ret = btrfs_readpage(NULL, pinned[0]);
810                         BUG_ON(ret);
811                         wait_on_page_locked(pinned[0]);
812                 } else {
813                         unlock_page(pinned[0]);
814                 }
815         }
816         if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
817                 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
818                 if (!PageUptodate(pinned[1])) {
819                         ret = btrfs_readpage(NULL, pinned[1]);
820                         BUG_ON(ret);
821                         wait_on_page_locked(pinned[1]);
822                 } else {
823                         unlock_page(pinned[1]);
824                 }
825         }
826
827         while(count > 0) {
828                 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
829                 size_t write_bytes = min(count, nrptrs *
830                                         (size_t)PAGE_CACHE_SIZE -
831                                          offset);
832                 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
833                                         PAGE_CACHE_SHIFT;
834
835                 WARN_ON(num_pages > nrptrs);
836                 memset(pages, 0, sizeof(pages));
837
838                 ret = btrfs_check_free_space(root, write_bytes, 0);
839                 if (ret)
840                         goto out;
841
842                 ret = prepare_pages(root, file, pages, num_pages,
843                                     pos, first_index, last_index,
844                                     write_bytes);
845                 if (ret)
846                         goto out;
847
848                 ret = btrfs_copy_from_user(pos, num_pages,
849                                            write_bytes, pages, buf);
850                 if (ret) {
851                         btrfs_drop_pages(pages, num_pages);
852                         goto out;
853                 }
854
855                 ret = dirty_and_release_pages(NULL, root, file, pages,
856                                               num_pages, pos, write_bytes);
857                 btrfs_drop_pages(pages, num_pages);
858                 if (ret)
859                         goto out;
860
861                 if (will_write) {
862                         btrfs_fdatawrite_range(inode->i_mapping, pos,
863                                                pos + write_bytes - 1,
864                                                WB_SYNC_NONE);
865                 } else {
866                         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
867                                                            num_pages);
868                         if (num_pages <
869                             (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
870                                 btrfs_btree_balance_dirty(root, 1);
871                         btrfs_throttle(root);
872                 }
873
874                 buf += write_bytes;
875                 count -= write_bytes;
876                 pos += write_bytes;
877                 num_written += write_bytes;
878
879                 cond_resched();
880         }
881 out:
882         mutex_unlock(&inode->i_mutex);
883
884 out_nolock:
885         kfree(pages);
886         if (pinned[0])
887                 page_cache_release(pinned[0]);
888         if (pinned[1])
889                 page_cache_release(pinned[1]);
890         *ppos = pos;
891
892         if (num_written > 0 && will_write) {
893                 struct btrfs_trans_handle *trans;
894
895                 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
896                 if (err)
897                         num_written = err;
898
899                 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
900                         trans = btrfs_start_transaction(root, 1);
901                         ret = btrfs_log_dentry_safe(trans, root,
902                                                     file->f_dentry);
903                         if (ret == 0) {
904                                 btrfs_sync_log(trans, root);
905                                 btrfs_end_transaction(trans, root);
906                         } else {
907                                 btrfs_commit_transaction(trans, root);
908                         }
909                 }
910                 if (file->f_flags & O_DIRECT) {
911                         invalidate_mapping_pages(inode->i_mapping,
912                               start_pos >> PAGE_CACHE_SHIFT,
913                              (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
914                 }
915         }
916         current->backing_dev_info = NULL;
917         return num_written ? num_written : err;
918 }
919
920 int btrfs_release_file(struct inode * inode, struct file * filp)
921 {
922         if (filp->private_data)
923                 btrfs_ioctl_trans_end(filp);
924         return 0;
925 }
926
927 /*
928  * fsync call for both files and directories.  This logs the inode into
929  * the tree log instead of forcing full commits whenever possible.
930  *
931  * It needs to call filemap_fdatawait so that all ordered extent updates are
932  * in the metadata btree are up to date for copying to the log.
933  *
934  * It drops the inode mutex before doing the tree log commit.  This is an
935  * important optimization for directories because holding the mutex prevents
936  * new operations on the dir while we write to disk.
937  */
938 int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
939 {
940         struct inode *inode = dentry->d_inode;
941         struct btrfs_root *root = BTRFS_I(inode)->root;
942         int ret = 0;
943         struct btrfs_trans_handle *trans;
944
945         /*
946          * check the transaction that last modified this inode
947          * and see if its already been committed
948          */
949         if (!BTRFS_I(inode)->last_trans)
950                 goto out;
951
952         mutex_lock(&root->fs_info->trans_mutex);
953         if (BTRFS_I(inode)->last_trans <=
954             root->fs_info->last_trans_committed) {
955                 BTRFS_I(inode)->last_trans = 0;
956                 mutex_unlock(&root->fs_info->trans_mutex);
957                 goto out;
958         }
959         mutex_unlock(&root->fs_info->trans_mutex);
960
961         root->fs_info->tree_log_batch++;
962         filemap_fdatawait(inode->i_mapping);
963         root->fs_info->tree_log_batch++;
964
965         /*
966          * ok we haven't committed the transaction yet, lets do a commit
967          */
968         if (file->private_data)
969                 btrfs_ioctl_trans_end(file);
970
971         trans = btrfs_start_transaction(root, 1);
972         if (!trans) {
973                 ret = -ENOMEM;
974                 goto out;
975         }
976
977         ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
978         if (ret < 0) {
979                 goto out;
980         }
981
982         /* we've logged all the items and now have a consistent
983          * version of the file in the log.  It is possible that
984          * someone will come in and modify the file, but that's
985          * fine because the log is consistent on disk, and we
986          * have references to all of the file's extents
987          *
988          * It is possible that someone will come in and log the
989          * file again, but that will end up using the synchronization
990          * inside btrfs_sync_log to keep things safe.
991          */
992         mutex_unlock(&file->f_dentry->d_inode->i_mutex);
993
994         if (ret > 0) {
995                 ret = btrfs_commit_transaction(trans, root);
996         } else {
997                 btrfs_sync_log(trans, root);
998                 ret = btrfs_end_transaction(trans, root);
999         }
1000         mutex_lock(&file->f_dentry->d_inode->i_mutex);
1001 out:
1002         return ret > 0 ? EIO : ret;
1003 }
1004
1005 static struct vm_operations_struct btrfs_file_vm_ops = {
1006         .fault          = filemap_fault,
1007         .page_mkwrite   = btrfs_page_mkwrite,
1008 };
1009
1010 static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
1011 {
1012         vma->vm_ops = &btrfs_file_vm_ops;
1013         file_accessed(filp);
1014         return 0;
1015 }
1016
1017 struct file_operations btrfs_file_operations = {
1018         .llseek         = generic_file_llseek,
1019         .read           = do_sync_read,
1020         .aio_read       = generic_file_aio_read,
1021         .splice_read    = generic_file_splice_read,
1022         .write          = btrfs_file_write,
1023         .mmap           = btrfs_file_mmap,
1024         .open           = generic_file_open,
1025         .release        = btrfs_release_file,
1026         .fsync          = btrfs_sync_file,
1027         .unlocked_ioctl = btrfs_ioctl,
1028 #ifdef CONFIG_COMPAT
1029         .compat_ioctl   = btrfs_ioctl,
1030 #endif
1031 };