/*
 * Btrfs: compression corner fixes
 * fs/btrfs/file.c (from the linux-2.6 tree, branch safe/jmp)
 */
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/fs.h>
20 #include <linux/pagemap.h>
21 #include <linux/highmem.h>
22 #include <linux/time.h>
23 #include <linux/init.h>
24 #include <linux/string.h>
25 #include <linux/smp_lock.h>
26 #include <linux/backing-dev.h>
27 #include <linux/mpage.h>
28 #include <linux/swap.h>
29 #include <linux/writeback.h>
30 #include <linux/statfs.h>
31 #include <linux/compat.h>
32 #include <linux/version.h>
33 #include "ctree.h"
34 #include "disk-io.h"
35 #include "transaction.h"
36 #include "btrfs_inode.h"
37 #include "ioctl.h"
38 #include "print-tree.h"
39 #include "tree-log.h"
40 #include "locking.h"
41 #include "compat.h"
42
43
44 /* simple helper to fault in pages and copy.  This should go away
45  * and be replaced with calls into generic code.
46  */
47 static int noinline btrfs_copy_from_user(loff_t pos, int num_pages,
48                                          int write_bytes,
49                                          struct page **prepared_pages,
50                                          const char __user * buf)
51 {
52         long page_fault = 0;
53         int i;
54         int offset = pos & (PAGE_CACHE_SIZE - 1);
55
56         for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
57                 size_t count = min_t(size_t,
58                                      PAGE_CACHE_SIZE - offset, write_bytes);
59                 struct page *page = prepared_pages[i];
60                 fault_in_pages_readable(buf, count);
61
62                 /* Copy data from userspace to the current page */
63                 kmap(page);
64                 page_fault = __copy_from_user(page_address(page) + offset,
65                                               buf, count);
66                 /* Flush processor's dcache for this page */
67                 flush_dcache_page(page);
68                 kunmap(page);
69                 buf += count;
70                 write_bytes -= count;
71
72                 if (page_fault)
73                         break;
74         }
75         return page_fault ? -EFAULT : 0;
76 }
77
78 /*
79  * unlocks pages after btrfs_file_write is done with them
80  */
81 static void noinline btrfs_drop_pages(struct page **pages, size_t num_pages)
82 {
83         size_t i;
84         for (i = 0; i < num_pages; i++) {
85                 if (!pages[i])
86                         break;
87                 /* page checked is some magic around finding pages that
88                  * have been modified without going through btrfs_set_page_dirty
89                  * clear it here
90                  */
91                 ClearPageChecked(pages[i]);
92                 unlock_page(pages[i]);
93                 mark_page_accessed(pages[i]);
94                 page_cache_release(pages[i]);
95         }
96 }
97
98 /*
99  * after copy_from_user, pages need to be dirtied and we need to make
100  * sure holes are created between the current EOF and the start of
101  * any next extents (if required).
102  *
103  * this also makes the decision about creating an inline extent vs
104  * doing real data extents, marking pages dirty and delalloc as required.
105  */
static int noinline dirty_and_release_pages(struct btrfs_trans_handle *trans,
				   struct btrfs_root *root,
				   struct file *file,
				   struct page **pages,
				   size_t num_pages,
				   loff_t pos,
				   size_t write_bytes)
{
	int err = 0;
	int i;
	struct inode *inode = fdentry(file)->d_inode;
	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
	u64 hint_byte;
	u64 num_bytes;
	u64 start_pos;
	u64 end_of_last_block;
	u64 end_pos = pos + write_bytes;
	loff_t isize = i_size_read(inode);

	/* round [pos, pos + write_bytes) out to sector boundaries */
	start_pos = pos & ~((u64)root->sectorsize - 1);
	num_bytes = (write_bytes + pos - start_pos +
		    root->sectorsize - 1) & ~((u64)root->sectorsize - 1);

	end_of_last_block = start_pos + num_bytes - 1;

	/* hold the extent range locked while flipping state bits */
	lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
	/*
	 * NOTE(review): the trans argument passed in is unconditionally
	 * replaced here by a joined transaction -- confirm callers never
	 * rely on the transaction they handed in.
	 */
	trans = btrfs_join_transaction(root, 1);
	if (!trans) {
		err = -ENOMEM;
		goto out_unlock;
	}
	btrfs_set_trans_block_group(trans, inode);
	/* hint_byte is set but never used below */
	hint_byte = 0;

	/*
	 * NOTE(review): looks like a debug leftover.  end_of_last_block is
	 * a sector-aligned length minus one, so with a 4K sectorsize this
	 * condition should never fire; confirm before removing.
	 */
	if ((end_of_last_block & 4095) == 0) {
		printk("strange end of last %Lu %zu %Lu\n", start_pos, write_bytes, end_of_last_block);
	}
	set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS);

	/* check for reserved extents on each page, we don't want
	 * to reset the delalloc bit on things that already have
	 * extents reserved.
	 */
	btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
	for (i = 0; i < num_pages; i++) {
		struct page *p = pages[i];
		SetPageUptodate(p);
		ClearPageChecked(p);
		set_page_dirty(p);
	}
	/* the write extended the file; update i_size and the inode item */
	if (end_pos > isize) {
		i_size_write(inode, end_pos);
		/* NOTE(review): the return value of btrfs_update_inode is
		 * ignored here -- confirm that is acceptable. */
		btrfs_update_inode(trans, root, inode);
	}
	err = btrfs_end_transaction(trans, root);
out_unlock:
	unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS);
	return err;
}
165
166 /*
167  * this drops all the extents in the cache that intersect the range
168  * [start, end].  Existing extents are split as required.
169  */
/*
 * Drop all extent_map entries in the cache that intersect [start, end].
 * An entry that only partially overlaps the range is split: the piece(s)
 * outside the range are re-inserted.  If skip_pinned is set, pinned
 * extents are left in place and the range is narrowed around them.
 * Always returns 0.
 */
int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
			    int skip_pinned)
{
	struct extent_map *em;
	struct extent_map *split = NULL;
	struct extent_map *split2 = NULL;
	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
	u64 len = end - start + 1;
	int ret;
	int testend = 1;
	unsigned long flags;
	int compressed = 0;

	WARN_ON(end < start);
	/* end == -1 means "to the end of file": skip end-overlap tests */
	if (end == (u64)-1) {
		len = (u64)-1;
		testend = 0;
	}
	while(1) {
		/* preallocate split records outside the tree spinlock */
		if (!split)
			split = alloc_extent_map(GFP_NOFS);
		if (!split2)
			split2 = alloc_extent_map(GFP_NOFS);

		spin_lock(&em_tree->lock);
		em = lookup_extent_mapping(em_tree, start, len);
		if (!em) {
			spin_unlock(&em_tree->lock);
			break;
		}
		flags = em->flags;
		if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) {
			spin_unlock(&em_tree->lock);
			/* pinned extent covers all that is left: done */
			if (em->start <= start &&
			    (!testend || em->start + em->len >= start + len)) {
				free_extent_map(em);
				break;
			}
			/* shrink the search range to exclude the pinned em */
			if (start < em->start) {
				len = em->start - start;
			} else {
				len = start + len - (em->start + em->len);
				start = em->start + em->len;
			}
			free_extent_map(em);
			continue;
		}
		compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
		clear_bit(EXTENT_FLAG_PINNED, &em->flags);
		remove_extent_mapping(em_tree, em);

		/* re-insert the piece of em that sticks out before start */
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    em->start < start) {
			split->start = em->start;
			split->len = start - em->start;
			split->block_start = em->block_start;

			/*
			 * a compressed extent is read as a whole, so each
			 * split piece keeps the full on-disk block length
			 */
			if (compressed)
				split->block_len = em->block_len;
			else
				split->block_len = split->len;

			split->bdev = em->bdev;
			split->flags = flags;
			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = split2;
			split2 = NULL;
		}
		/* re-insert the piece of em that sticks out past the end */
		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
		    testend && em->start + em->len > start + len) {
			u64 diff = start + len - em->start;

			split->start = start + len;
			split->len = em->start + em->len - (start + len);
			split->bdev = em->bdev;
			split->flags = flags;

			if (compressed) {
				split->block_len = em->block_len;
				split->block_start = em->block_start;
			} else {
				split->block_len = split->len;
				split->block_start = em->block_start + diff;
			}

			ret = add_extent_mapping(em_tree, split);
			BUG_ON(ret);
			free_extent_map(split);
			split = NULL;
		}
		spin_unlock(&em_tree->lock);

		/* once for us */
		free_extent_map(em);
		/* once for the tree*/
		free_extent_map(em);
	}
	if (split)
		free_extent_map(split);
	if (split2)
		free_extent_map(split2);
	return 0;
}
275
/*
 * Debug-only consistency check for a file's extent items.  The body is
 * compiled out, so this is currently a no-op that always returns 0.
 * The disabled code walks every EXTENT_DATA item of the inode and
 * verifies the offsets are monotonically increasing.
 */
int btrfs_check_file(struct btrfs_root *root, struct inode *inode)
{
	return 0;
#if 0
	struct btrfs_path *path;
	struct btrfs_key found_key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *extent;
	u64 last_offset = 0;
	int nritems;
	int slot;
	int found_type;
	int ret;
	int err = 0;
	u64 extent_end = 0;

	path = btrfs_alloc_path();
	ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino,
				       last_offset, 0);
	while(1) {
		nritems = btrfs_header_nritems(path->nodes[0]);
		if (path->slots[0] >= nritems) {
			ret = btrfs_next_leaf(root, path);
			if (ret)
				goto out;
			nritems = btrfs_header_nritems(path->nodes[0]);
		}
		slot = path->slots[0];
		leaf = path->nodes[0];
		btrfs_item_key_to_cpu(leaf, &found_key, slot);
		if (found_key.objectid != inode->i_ino)
			break;
		if (found_key.type != BTRFS_EXTENT_DATA_KEY)
			goto out;

		/* offsets must never go backwards within one inode */
		if (found_key.offset < last_offset) {
			WARN_ON(1);
			btrfs_print_leaf(root, leaf);
			printk("inode %lu found offset %Lu expected %Lu\n",
			       inode->i_ino, found_key.offset, last_offset);
			err = 1;
			goto out;
		}
		extent = btrfs_item_ptr(leaf, slot,
					struct btrfs_file_extent_item);
		found_type = btrfs_file_extent_type(leaf, extent);
		if (found_type == BTRFS_FILE_EXTENT_REG) {
			extent_end = found_key.offset +
			     btrfs_file_extent_num_bytes(leaf, extent);
		} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
			struct btrfs_item *item;
			item = btrfs_item_nr(leaf, slot);
			/* inline extents are rounded up to a full sector */
			extent_end = found_key.offset +
			     btrfs_file_extent_inline_len(leaf, extent);
			extent_end = (extent_end + root->sectorsize - 1) &
				~((u64)root->sectorsize -1 );
		}
		last_offset = extent_end;
		path->slots[0]++;
	}
	/* this final i_size check is itself disabled (0 &&) */
	if (0 && last_offset < inode->i_size) {
		WARN_ON(1);
		btrfs_print_leaf(root, leaf);
		printk("inode %lu found offset %Lu size %Lu\n", inode->i_ino,
		       last_offset, inode->i_size);
		err = 1;

	}
out:
	btrfs_free_path(path);
	return err;
#endif
}
349
350 /*
351  * this is very complex, but the basic idea is to drop all extents
352  * in the range start - end.  hint_block is filled in with a block number
353  * that would be a good hint to the block allocator for this file.
354  *
355  * If an extent intersects the range but is not entirely inside the range
356  * it is either truncated or split.  Anything entirely inside the range
357  * is deleted from the tree.
358  *
359  * inline_limit is used to tell this code which offsets in the file to keep
360  * if they contain inline extents.
361  */
int noinline btrfs_drop_extents(struct btrfs_trans_handle *trans,
		       struct btrfs_root *root, struct inode *inode,
		       u64 start, u64 end, u64 inline_limit, u64 *hint_byte)
{
	u64 extent_end = 0;
	u64 locked_end = end;
	u64 search_start = start;
	u64 leaf_start;
	u64 ram_bytes = 0;
	u8 compression;
	u8 encryption;
	u16 other_encoding = 0;
	u64 root_gen;
	u64 root_owner;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *extent;
	struct btrfs_path *path;
	struct btrfs_key key;
	struct btrfs_file_extent_item old;
	int keep;		/* keep (truncate) the current item vs delete it */
	int bookend;		/* a tail piece survives past 'end' */
	int slot;
	int found_type = 0;
	int found_extent;
	int found_inline;
	int recow;		/* leaf changed under us; re-search from key */
	int ret;

	/*
	 * NOTE(review): the caller's inline_limit is unconditionally
	 * clobbered to 0 here -- presumably because inline extents can
	 * only live at file offset 0; confirm this is intentional and
	 * not debug leftover.
	 */
	inline_limit = 0;
	btrfs_drop_extent_cache(inode, start, end - 1, 0);

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	while(1) {
		recow = 0;
		btrfs_release_path(root, path);
		ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
					       search_start, -1);
		if (ret < 0)
			goto out;
		if (ret > 0) {
			/* no exact match: step back to the previous item */
			if (path->slots[0] == 0) {
				ret = 0;
				goto out;
			}
			path->slots[0]--;
		}
next_slot:
		/* reset all per-item state */
		keep = 0;
		bookend = 0;
		found_extent = 0;
		found_inline = 0;
		leaf_start = 0;
		root_gen = 0;
		root_owner = 0;
		compression = 0;
		encryption = 0;
		extent = NULL;
		leaf = path->nodes[0];
		slot = path->slots[0];
		ret = 0;
		btrfs_item_key_to_cpu(leaf, &key, slot);
		/* extent item fully past the drop range: done */
		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY &&
		    key.offset >= end) {
			goto out;
		}
		/* walked past this inode's extent items: done */
		if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY ||
		    key.objectid != inode->i_ino) {
			goto out;
		}
		if (recow) {
			search_start = key.offset;
			continue;
		}
		if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) {
			extent = btrfs_item_ptr(leaf, slot,
						struct btrfs_file_extent_item);
			found_type = btrfs_file_extent_type(leaf, extent);
			compression = btrfs_file_extent_compression(leaf,
								    extent);
			encryption = btrfs_file_extent_encryption(leaf,
								  extent);
			other_encoding = btrfs_file_extent_other_encoding(leaf,
								  extent);
			if (found_type == BTRFS_FILE_EXTENT_REG ||
			    found_type == BTRFS_FILE_EXTENT_PREALLOC) {
				/* extent_end briefly holds the disk bytenr
				 * to feed the allocator hint */
				extent_end =
				     btrfs_file_extent_disk_bytenr(leaf,
								   extent);
				if (extent_end)
					*hint_byte = extent_end;

				extent_end = key.offset +
				     btrfs_file_extent_num_bytes(leaf, extent);
				ram_bytes = btrfs_file_extent_ram_bytes(leaf,
								extent);
				found_extent = 1;
			} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
				found_inline = 1;
				extent_end = key.offset +
				     btrfs_file_extent_inline_len(leaf, extent);
			}
		} else {
			extent_end = search_start;
		}

		/* we found nothing we can drop */
		if ((!found_extent && !found_inline) ||
		    search_start >= extent_end) {
			int nextret;
			u32 nritems;
			nritems = btrfs_header_nritems(leaf);
			if (slot >= nritems - 1) {
				nextret = btrfs_next_leaf(root, path);
				if (nextret)
					goto out;
				/* leaf changed; must re-search */
				recow = 1;
			} else {
				path->slots[0]++;
			}
			goto next_slot;
		}

		if (end <= extent_end && start >= key.offset && found_inline)
			*hint_byte = EXTENT_MAP_INLINE;

		if (found_extent) {
			/* stash a raw copy of the item: it is reused to
			 * build the bookend and to free the old extent
			 * after the path is released */
			read_extent_buffer(leaf, &old, (unsigned long)extent,
					   sizeof(old));
			root_gen = btrfs_header_generation(leaf);
			root_owner = btrfs_header_owner(leaf);
			leaf_start = leaf->start;
		}

		/* the extent extends past 'end': a tail piece survives */
		if (end < extent_end && end >= key.offset) {
			bookend = 1;
			if (found_inline && start <= key.offset)
				keep = 1;
		}

		/*
		 * the split tail lies outside the locked range; extend the
		 * lock.  If try_lock fails, drop the path, block on the
		 * lock, and restart the search.
		 */
		if (bookend && found_extent && locked_end < extent_end) {
			ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
					locked_end, extent_end - 1, GFP_NOFS);
			if (!ret) {
				btrfs_release_path(root, path);
				lock_extent(&BTRFS_I(inode)->io_tree,
					locked_end, extent_end - 1, GFP_NOFS);
				locked_end = extent_end;
				continue;
			}
			locked_end = extent_end;
		}

		if (found_inline) {
			u64 mask = root->sectorsize - 1;
			search_start = (extent_end + mask) & ~mask;
		} else
			search_start = extent_end;

		/* truncate existing extent */
		if (start > key.offset) {
			u64 new_num;
			u64 old_num;
			keep = 1;
			WARN_ON(start & (root->sectorsize - 1));
			if (found_extent) {
				new_num = start - key.offset;
				old_num = btrfs_file_extent_num_bytes(leaf,
								      extent);
				*hint_byte =
					btrfs_file_extent_disk_bytenr(leaf,
								      extent);
				/* holes (bytenr 0) don't count as bytes */
				if (btrfs_file_extent_disk_bytenr(leaf,
								  extent)) {
					inode_sub_bytes(inode, old_num -
							new_num);
				}
				btrfs_set_file_extent_num_bytes(leaf, extent,
								new_num);
				btrfs_mark_buffer_dirty(leaf);
			} else if (key.offset < inline_limit &&
				   (end > extent_end) &&
				   (inline_limit < extent_end)) {
				/* dead with inline_limit forced to 0 above:
				 * truncate an inline extent at inline_limit */
				u32 new_size;
				new_size = btrfs_file_extent_calc_inline_size(
						   inline_limit - key.offset);
				inode_sub_bytes(inode, extent_end -
						inline_limit);
				btrfs_set_file_extent_ram_bytes(leaf, extent,
							new_size);
				if (!compression && !encryption) {
					btrfs_truncate_item(trans, root, path,
							    new_size, 1);
				}
			}
		}
		/* delete the entire extent */
		if (!keep) {
			if (found_inline)
				inode_sub_bytes(inode, extent_end -
						key.offset);
			ret = btrfs_del_item(trans, root, path);
			/* TODO update progress marker and return */
			BUG_ON(ret);
			extent = NULL;
			btrfs_release_path(root, path);
			/* the extent will be freed later */
		}
		/* keep the tail of an inline extent by shrinking the item */
		if (bookend && found_inline && start <= key.offset) {
			u32 new_size;
			new_size = btrfs_file_extent_calc_inline_size(
						   extent_end - end);
			inode_sub_bytes(inode, end - key.offset);
			btrfs_set_file_extent_ram_bytes(leaf, extent,
							new_size);
			if (!compression && !encryption)
				ret = btrfs_truncate_item(trans, root, path,
							  new_size, 0);
			BUG_ON(ret);
		}
		/* create bookend, splitting the extent in two */
		if (bookend && found_extent) {
			u64 disk_bytenr;
			struct btrfs_key ins;
			ins.objectid = inode->i_ino;
			ins.offset = end;
			btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
			btrfs_release_path(root, path);
			ret = btrfs_insert_empty_item(trans, root, path, &ins,
						      sizeof(*extent));
			BUG_ON(ret);

			leaf = path->nodes[0];
			extent = btrfs_item_ptr(leaf, path->slots[0],
						struct btrfs_file_extent_item);
			/* start from the saved copy of the original item */
			write_extent_buffer(leaf, &old,
					    (unsigned long)extent, sizeof(old));

			btrfs_set_file_extent_compression(leaf, extent,
							  compression);
			btrfs_set_file_extent_encryption(leaf, extent,
							 encryption);
			btrfs_set_file_extent_other_encoding(leaf, extent,
							     other_encoding);
			btrfs_set_file_extent_offset(leaf, extent,
				    le64_to_cpu(old.offset) + end - key.offset);
			WARN_ON(le64_to_cpu(old.num_bytes) <
				(extent_end - end));
			btrfs_set_file_extent_num_bytes(leaf, extent,
							extent_end - end);

			/*
			 * set the ram bytes to the size of the full extent
			 * before splitting.  This is a worst case flag,
			 * but its the best we can do because we don't know
			 * how splitting affects compression
			 */
			btrfs_set_file_extent_ram_bytes(leaf, extent,
							ram_bytes);
			btrfs_set_file_extent_type(leaf, extent, found_type);

			btrfs_mark_buffer_dirty(path->nodes[0]);

			/* the bookend shares the disk extent: take a ref */
			disk_bytenr = le64_to_cpu(old.disk_bytenr);
			if (disk_bytenr != 0) {
				ret = btrfs_inc_extent_ref(trans, root,
						disk_bytenr,
						le64_to_cpu(old.disk_num_bytes),
						leaf->start,
						root->root_key.objectid,
						trans->transid, ins.objectid);
				BUG_ON(ret);
			}
			btrfs_release_path(root, path);
			if (disk_bytenr != 0) {
				inode_add_bytes(inode, extent_end - end);
			}
		}

		/* the whole item was deleted: drop the disk extent's ref */
		if (found_extent && !keep) {
			u64 disk_bytenr = le64_to_cpu(old.disk_bytenr);

			if (disk_bytenr != 0) {
				inode_sub_bytes(inode,
						le64_to_cpu(old.num_bytes));
				ret = btrfs_free_extent(trans, root,
						disk_bytenr,
						le64_to_cpu(old.disk_num_bytes),
						leaf_start, root_owner,
						root_gen, key.objectid, 0);
				BUG_ON(ret);
				*hint_byte = disk_bytenr;
			}
		}

		if (search_start >= end) {
			ret = 0;
			goto out;
		}
	}
out:
	btrfs_free_path(path);
	/* drop the portion of the lock we extended past 'end' */
	if (locked_end > end) {
		unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
			      GFP_NOFS);
	}
	btrfs_check_file(root, inode);
	return ret;
}
672
673 static int extent_mergeable(struct extent_buffer *leaf, int slot,
674                             u64 objectid, u64 bytenr, u64 *start, u64 *end)
675 {
676         struct btrfs_file_extent_item *fi;
677         struct btrfs_key key;
678         u64 extent_end;
679
680         if (slot < 0 || slot >= btrfs_header_nritems(leaf))
681                 return 0;
682
683         btrfs_item_key_to_cpu(leaf, &key, slot);
684         if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
685                 return 0;
686
687         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
688         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
689             btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
690             btrfs_file_extent_compression(leaf, fi) ||
691             btrfs_file_extent_encryption(leaf, fi) ||
692             btrfs_file_extent_other_encoding(leaf, fi))
693                 return 0;
694
695         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
696         if ((*start && *start != key.offset) || (*end && *end != extent_end))
697                 return 0;
698
699         *start = key.offset;
700         *end = extent_end;
701         return 1;
702 }
703
704 /*
705  * Mark extent in the range start - end as written.
706  *
707  * This changes extent type from 'pre-allocated' to 'regular'. If only
708  * part of extent is marked as written, the extent will be split into
709  * two or three.
710  */
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
                              struct btrfs_root *root,
                              struct inode *inode, u64 start, u64 end)
{
        struct extent_buffer *leaf;
        struct btrfs_path *path;
        struct btrfs_file_extent_item *fi;
        struct btrfs_key key;
        u64 bytenr;
        u64 num_bytes;
        u64 extent_end;
        u64 extent_offset;
        u64 other_start;
        u64 other_end;
        u64 split = start;      /* file offset where the next split happens */
        u64 locked_end = end;   /* end of the io_tree range currently locked */
        int extent_type;
        int split_end = 1;      /* nonzero while a second split pass remains */
        int ret;

        /* drop the cached mapping; it no longer matches what we will write */
        btrfs_drop_extent_cache(inode, start, end - 1, 0);

        path = btrfs_alloc_path();
        BUG_ON(!path);
again:
        /*
         * Find the file extent item covering the current split point.  On
         * the second pass (split != start) search for split - 1 so the item
         * that ends at 'split' is found instead of the one starting there.
         */
        key.objectid = inode->i_ino;
        key.type = BTRFS_EXTENT_DATA_KEY;
        if (split == start)
                key.offset = split;
        else
                key.offset = split - 1;

        ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
        if (ret > 0 && path->slots[0] > 0)
                path->slots[0]--;

        leaf = path->nodes[0];
        btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
        BUG_ON(key.objectid != inode->i_ino ||
               key.type != BTRFS_EXTENT_DATA_KEY);
        fi = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        /* only preallocated extents may be marked written */
        extent_type = btrfs_file_extent_type(leaf, fi);
        BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC);
        extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
        /* the item found must fully cover the range being marked */
        BUG_ON(key.offset > start || extent_end < end);

        bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
        num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
        extent_offset = btrfs_file_extent_offset(leaf, fi);

        if (key.offset == start)
                split = end;

        if (key.offset == start && extent_end == end) {
                /*
                 * Case 1: the written range covers the whole extent.  Flip
                 * its type in place and try to merge it with neighboring
                 * regular extents that point at the same disk bytenr.
                 */
                int del_nr = 0;
                int del_slot = 0;
                u64 leaf_owner = btrfs_header_owner(leaf);
                u64 leaf_gen = btrfs_header_generation(leaf);
                other_start = end;
                other_end = 0;
                if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
                                     bytenr, &other_start, &other_end)) {
                        /* absorb the next item; release its extent ref */
                        extent_end = other_end;
                        del_slot = path->slots[0] + 1;
                        del_nr++;
                        ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                                leaf->start, leaf_owner,
                                                leaf_gen, inode->i_ino, 0);
                        BUG_ON(ret);
                }
                other_start = 0;
                other_end = start;
                if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino,
                                     bytenr, &other_start, &other_end)) {
                        /* absorb the previous item; release its extent ref */
                        key.offset = other_start;
                        del_slot = path->slots[0];
                        del_nr++;
                        ret = btrfs_free_extent(trans, root, bytenr, num_bytes,
                                                leaf->start, leaf_owner,
                                                leaf_gen, inode->i_ino, 0);
                        BUG_ON(ret);
                }
                split_end = 0;
                if (del_nr == 0) {
                        /* no neighbor merged: just change the type */
                        btrfs_set_file_extent_type(leaf, fi,
                                                   BTRFS_FILE_EXTENT_REG);
                        goto done;
                }

                /* grow the surviving item, then delete the merged items */
                fi = btrfs_item_ptr(leaf, del_slot - 1,
                                    struct btrfs_file_extent_item);
                btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG);
                btrfs_set_file_extent_num_bytes(leaf, fi,
                                                extent_end - key.offset);
                btrfs_mark_buffer_dirty(leaf);

                ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
                BUG_ON(ret);
                goto done;
        } else if (split == start) {
                /*
                 * Case 2: split at the front of the extent.  Extend the
                 * io_tree lock to cover the whole extent first; if the
                 * try-lock fails we block for the lock and restart the
                 * search, since the leaf may have changed while sleeping.
                 */
                if (locked_end < extent_end) {
                        ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
                                        locked_end, extent_end - 1, GFP_NOFS);
                        if (!ret) {
                                btrfs_release_path(root, path);
                                lock_extent(&BTRFS_I(inode)->io_tree,
                                        locked_end, extent_end - 1, GFP_NOFS);
                                locked_end = extent_end;
                                goto again;
                        }
                        locked_end = extent_end;
                }
                btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset);
                extent_offset += split - key.offset;
        } else  {
                /*
                 * Case 3: split at the tail.  Shrink the existing item from
                 * the front by advancing its key and data offset to 'split'.
                 */
                BUG_ON(key.offset != start);
                btrfs_set_file_extent_offset(leaf, fi, extent_offset +
                                             split - key.offset);
                btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split);
                key.offset = split;
                btrfs_set_item_key_safe(trans, root, path, &key);
                extent_end = split;
        }

        if (extent_end == end) {
                split_end = 0;
                extent_type = BTRFS_FILE_EXTENT_REG;
        }
        if (extent_end == end && split == start) {
                /* try absorbing the written piece into the next extent */
                other_start = end;
                other_end = 0;
                if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino,
                                     bytenr, &other_start, &other_end)) {
                        path->slots[0]++;
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        key.offset = split;
                        btrfs_set_item_key_safe(trans, root, path, &key);
                        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
                        btrfs_set_file_extent_num_bytes(leaf, fi,
                                                        other_end - split);
                        goto done;
                }
        }
        if (extent_end == end && split == end) {
                /* try absorbing the written piece into the previous extent */
                other_start = 0;
                other_end = start;
                if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino,
                                     bytenr, &other_start, &other_end)) {
                        path->slots[0]--;
                        fi = btrfs_item_ptr(leaf, path->slots[0],
                                            struct btrfs_file_extent_item);
                        btrfs_set_file_extent_num_bytes(leaf, fi, extent_end -
                                                        other_start);
                        goto done;
                }
        }

        /* no merge was possible: insert a new extent item for the piece */
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(root, path);

        key.offset = start;
        ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi));
        BUG_ON(ret);

        leaf = path->nodes[0];
        fi = btrfs_item_ptr(leaf, path->slots[0],
                            struct btrfs_file_extent_item);
        btrfs_set_file_extent_generation(leaf, fi, trans->transid);
        btrfs_set_file_extent_type(leaf, fi, extent_type);
        btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr);
        btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes);
        btrfs_set_file_extent_offset(leaf, fi, extent_offset);
        btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset);
        btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
        btrfs_set_file_extent_compression(leaf, fi, 0);
        btrfs_set_file_extent_encryption(leaf, fi, 0);
        btrfs_set_file_extent_other_encoding(leaf, fi, 0);

        /* the new item shares the disk extent: take an extra reference */
        ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes,
                                   leaf->start, root->root_key.objectid,
                                   trans->transid, inode->i_ino);
        BUG_ON(ret);
done:
        btrfs_mark_buffer_dirty(leaf);
        btrfs_release_path(root, path);
        if (split_end && split == start) {
                /* first pass handled the front; now split off the tail */
                split = end;
                goto again;
        }
        if (locked_end > end) {
                /* drop any io_tree range we locked beyond the caller's end */
                unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1,
                              GFP_NOFS);
        }
        btrfs_free_path(path);
        return 0;
}
909
910 /*
911  * this gets pages into the page cache and locks them down, it also properly
912  * waits for data=ordered extents to finish before allowing the pages to be
913  * modified.
914  */
static int noinline prepare_pages(struct btrfs_root *root, struct file *file,
                         struct page **pages, size_t num_pages,
                         loff_t pos, unsigned long first_index,
                         unsigned long last_index, size_t write_bytes)
{
        int i;
        unsigned long index = pos >> PAGE_CACHE_SHIFT;
        struct inode *inode = fdentry(file)->d_inode;
        int err = 0;
        u64 start_pos;
        u64 last_pos;

        /* round pos down to a sector boundary; last_pos is page aligned */
        start_pos = pos & ~((u64)root->sectorsize - 1);
        last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;

        /* writing past i_size: fill the hole up to start_pos first */
        if (start_pos > inode->i_size) {
                err = btrfs_cont_expand(inode, start_pos);
                if (err)
                        return err;
        }

        memset(pages, 0, num_pages * sizeof(struct page *));
again:
        for (i = 0; i < num_pages; i++) {
                pages[i] = grab_cache_page(inode->i_mapping, index + i);
                if (!pages[i]) {
                        /*
                         * NOTE(review): err is assigned but the BUG_ON
                         * panics before it can be returned, and pages
                         * grabbed so far are not released here.
                         */
                        err = -ENOMEM;
                        BUG_ON(1);
                }
                wait_on_page_writeback(pages[i]);
        }
        if (start_pos < inode->i_size) {
                struct btrfs_ordered_extent *ordered;
                lock_extent(&BTRFS_I(inode)->io_tree,
                            start_pos, last_pos - 1, GFP_NOFS);
                ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1);
                /*
                 * If an ordered extent overlaps our range, drop every lock
                 * and page, wait for that I/O to finish, then start over.
                 */
                if (ordered &&
                    ordered->file_offset + ordered->len > start_pos &&
                    ordered->file_offset < last_pos) {
                        btrfs_put_ordered_extent(ordered);
                        unlock_extent(&BTRFS_I(inode)->io_tree,
                                      start_pos, last_pos - 1, GFP_NOFS);
                        for (i = 0; i < num_pages; i++) {
                                unlock_page(pages[i]);
                                page_cache_release(pages[i]);
                        }
                        btrfs_wait_ordered_range(inode, start_pos,
                                                 last_pos - start_pos);
                        goto again;
                }
                if (ordered)
                        btrfs_put_ordered_extent(ordered);

                /* clear stale delalloc/dirty state before the new write */
                clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
                                  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
                                  GFP_NOFS);
                unlock_extent(&BTRFS_I(inode)->io_tree,
                              start_pos, last_pos - 1, GFP_NOFS);
        }
        for (i = 0; i < num_pages; i++) {
                clear_page_dirty_for_io(pages[i]);
                set_page_extent_mapped(pages[i]);
                WARN_ON(!PageLocked(pages[i]));
        }
        return 0;
}
981
982 static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
983                                 size_t count, loff_t *ppos)
984 {
985         loff_t pos;
986         loff_t start_pos;
987         ssize_t num_written = 0;
988         ssize_t err = 0;
989         int ret = 0;
990         struct inode *inode = fdentry(file)->d_inode;
991         struct btrfs_root *root = BTRFS_I(inode)->root;
992         struct page **pages = NULL;
993         int nrptrs;
994         struct page *pinned[2];
995         unsigned long first_index;
996         unsigned long last_index;
997         int will_write;
998
999         will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) ||
1000                       (file->f_flags & O_DIRECT));
1001
1002         nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE,
1003                      PAGE_CACHE_SIZE / (sizeof(struct page *)));
1004         pinned[0] = NULL;
1005         pinned[1] = NULL;
1006
1007         pos = *ppos;
1008         start_pos = pos;
1009
1010         vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1011         current->backing_dev_info = inode->i_mapping->backing_dev_info;
1012         err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1013         if (err)
1014                 goto out_nolock;
1015         if (count == 0)
1016                 goto out_nolock;
1017
1018         err = file_remove_suid(file);
1019         if (err)
1020                 goto out_nolock;
1021         file_update_time(file);
1022
1023         pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
1024
1025         mutex_lock(&inode->i_mutex);
1026         first_index = pos >> PAGE_CACHE_SHIFT;
1027         last_index = (pos + count) >> PAGE_CACHE_SHIFT;
1028
1029         /*
1030          * if this is a nodatasum mount, force summing off for the inode
1031          * all the time.  That way a later mount with summing on won't
1032          * get confused
1033          */
1034         if (btrfs_test_opt(root, NODATASUM))
1035                 btrfs_set_flag(inode, NODATASUM);
1036
1037         /*
1038          * there are lots of better ways to do this, but this code
1039          * makes sure the first and last page in the file range are
1040          * up to date and ready for cow
1041          */
1042         if ((pos & (PAGE_CACHE_SIZE - 1))) {
1043                 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
1044                 if (!PageUptodate(pinned[0])) {
1045                         ret = btrfs_readpage(NULL, pinned[0]);
1046                         BUG_ON(ret);
1047                         wait_on_page_locked(pinned[0]);
1048                 } else {
1049                         unlock_page(pinned[0]);
1050                 }
1051         }
1052         if ((pos + count) & (PAGE_CACHE_SIZE - 1)) {
1053                 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
1054                 if (!PageUptodate(pinned[1])) {
1055                         ret = btrfs_readpage(NULL, pinned[1]);
1056                         BUG_ON(ret);
1057                         wait_on_page_locked(pinned[1]);
1058                 } else {
1059                         unlock_page(pinned[1]);
1060                 }
1061         }
1062
1063         while(count > 0) {
1064                 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1065                 size_t write_bytes = min(count, nrptrs *
1066                                         (size_t)PAGE_CACHE_SIZE -
1067                                          offset);
1068                 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
1069                                         PAGE_CACHE_SHIFT;
1070
1071                 WARN_ON(num_pages > nrptrs);
1072                 memset(pages, 0, sizeof(pages));
1073
1074                 ret = btrfs_check_free_space(root, write_bytes, 0);
1075                 if (ret)
1076                         goto out;
1077
1078                 ret = prepare_pages(root, file, pages, num_pages,
1079                                     pos, first_index, last_index,
1080                                     write_bytes);
1081                 if (ret)
1082                         goto out;
1083
1084                 ret = btrfs_copy_from_user(pos, num_pages,
1085                                            write_bytes, pages, buf);
1086                 if (ret) {
1087                         btrfs_drop_pages(pages, num_pages);
1088                         goto out;
1089                 }
1090
1091                 ret = dirty_and_release_pages(NULL, root, file, pages,
1092                                               num_pages, pos, write_bytes);
1093                 btrfs_drop_pages(pages, num_pages);
1094                 if (ret)
1095                         goto out;
1096
1097                 if (will_write) {
1098                         btrfs_fdatawrite_range(inode->i_mapping, pos,
1099                                                pos + write_bytes - 1,
1100                                                WB_SYNC_NONE);
1101                 } else {
1102                         balance_dirty_pages_ratelimited_nr(inode->i_mapping,
1103                                                            num_pages);
1104                         if (num_pages <
1105                             (root->leafsize >> PAGE_CACHE_SHIFT) + 1)
1106                                 btrfs_btree_balance_dirty(root, 1);
1107                         btrfs_throttle(root);
1108                 }
1109
1110                 buf += write_bytes;
1111                 count -= write_bytes;
1112                 pos += write_bytes;
1113                 num_written += write_bytes;
1114
1115                 cond_resched();
1116         }
1117 out:
1118         mutex_unlock(&inode->i_mutex);
1119
1120 out_nolock:
1121         kfree(pages);
1122         if (pinned[0])
1123                 page_cache_release(pinned[0]);
1124         if (pinned[1])
1125                 page_cache_release(pinned[1]);
1126         *ppos = pos;
1127
1128         if (num_written > 0 && will_write) {
1129                 struct btrfs_trans_handle *trans;
1130
1131                 err = btrfs_wait_ordered_range(inode, start_pos, num_written);
1132                 if (err)
1133                         num_written = err;
1134
1135                 if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) {
1136                         trans = btrfs_start_transaction(root, 1);
1137                         ret = btrfs_log_dentry_safe(trans, root,
1138                                                     file->f_dentry);
1139                         if (ret == 0) {
1140                                 btrfs_sync_log(trans, root);
1141                                 btrfs_end_transaction(trans, root);
1142                         } else {
1143                                 btrfs_commit_transaction(trans, root);
1144                         }
1145                 }
1146                 if (file->f_flags & O_DIRECT) {
1147                         invalidate_mapping_pages(inode->i_mapping,
1148                               start_pos >> PAGE_CACHE_SHIFT,
1149                              (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
1150                 }
1151         }
1152         current->backing_dev_info = NULL;
1153         return num_written ? num_written : err;
1154 }
1155
1156 int btrfs_release_file(struct inode * inode, struct file * filp)
1157 {
1158         if (filp->private_data)
1159                 btrfs_ioctl_trans_end(filp);
1160         return 0;
1161 }
1162
1163 /*
1164  * fsync call for both files and directories.  This logs the inode into
1165  * the tree log instead of forcing full commits whenever possible.
1166  *
1167  * It needs to call filemap_fdatawait so that all ordered extent updates are
1168  * in the metadata btree are up to date for copying to the log.
1169  *
1170  * It drops the inode mutex before doing the tree log commit.  This is an
1171  * important optimization for directories because holding the mutex prevents
1172  * new operations on the dir while we write to disk.
1173  */
1174 int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
1175 {
1176         struct inode *inode = dentry->d_inode;
1177         struct btrfs_root *root = BTRFS_I(inode)->root;
1178         int ret = 0;
1179         struct btrfs_trans_handle *trans;
1180
1181         /*
1182          * check the transaction that last modified this inode
1183          * and see if its already been committed
1184          */
1185         if (!BTRFS_I(inode)->last_trans)
1186                 goto out;
1187
1188         mutex_lock(&root->fs_info->trans_mutex);
1189         if (BTRFS_I(inode)->last_trans <=
1190             root->fs_info->last_trans_committed) {
1191                 BTRFS_I(inode)->last_trans = 0;
1192                 mutex_unlock(&root->fs_info->trans_mutex);
1193                 goto out;
1194         }
1195         mutex_unlock(&root->fs_info->trans_mutex);
1196
1197         root->fs_info->tree_log_batch++;
1198         filemap_fdatawait(inode->i_mapping);
1199         root->fs_info->tree_log_batch++;
1200
1201         /*
1202          * ok we haven't committed the transaction yet, lets do a commit
1203          */
1204         if (file->private_data)
1205                 btrfs_ioctl_trans_end(file);
1206
1207         trans = btrfs_start_transaction(root, 1);
1208         if (!trans) {
1209                 ret = -ENOMEM;
1210                 goto out;
1211         }
1212
1213         ret = btrfs_log_dentry_safe(trans, root, file->f_dentry);
1214         if (ret < 0) {
1215                 goto out;
1216         }
1217
1218         /* we've logged all the items and now have a consistent
1219          * version of the file in the log.  It is possible that
1220          * someone will come in and modify the file, but that's
1221          * fine because the log is consistent on disk, and we
1222          * have references to all of the file's extents
1223          *
1224          * It is possible that someone will come in and log the
1225          * file again, but that will end up using the synchronization
1226          * inside btrfs_sync_log to keep things safe.
1227          */
1228         mutex_unlock(&file->f_dentry->d_inode->i_mutex);
1229
1230         if (ret > 0) {
1231                 ret = btrfs_commit_transaction(trans, root);
1232         } else {
1233                 btrfs_sync_log(trans, root);
1234                 ret = btrfs_end_transaction(trans, root);
1235         }
1236         mutex_lock(&file->f_dentry->d_inode->i_mutex);
1237 out:
1238         return ret > 0 ? EIO : ret;
1239 }
1240
/*
 * vm operations for mmap'ed btrfs files: read faults use the generic
 * pagecache path, write faults go through btrfs_page_mkwrite.
 */
static struct vm_operations_struct btrfs_file_vm_ops = {
        .fault          = filemap_fault,
        .page_mkwrite   = btrfs_page_mkwrite,
};
1245
1246 static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
1247 {
1248         vma->vm_ops = &btrfs_file_vm_ops;
1249         file_accessed(filp);
1250         return 0;
1251 }
1252
/*
 * file_operations for regular btrfs files.  Reads use the generic
 * pagecache paths; writes go through btrfs_file_write, fsync through
 * the tree-log aware btrfs_sync_file.  The native and compat ioctl
 * entry points both map to btrfs_ioctl.
 */
struct file_operations btrfs_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .aio_read       = generic_file_aio_read,
        .splice_read    = generic_file_splice_read,
        .write          = btrfs_file_write,
        .mmap           = btrfs_file_mmap,
        .open           = generic_file_open,
        .release        = btrfs_release_file,
        .fsync          = btrfs_sync_file,
        .unlocked_ioctl = btrfs_ioctl,
#ifdef CONFIG_COMPAT
        .compat_ioctl   = btrfs_ioctl,
#endif
};