Btrfs: Count space allocated to file in bytes
[safe/jmp/linux-2.6] / fs / btrfs / tree-log.c
1 /*
2  * Copyright (C) 2008 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 021110-1307, USA.
17  */
18
19 #include <linux/sched.h>
20 #include "ctree.h"
21 #include "transaction.h"
22 #include "disk-io.h"
23 #include "locking.h"
24 #include "print-tree.h"
25 #include "compat.h"
26
27 /* magic values for the inode_only field in btrfs_log_inode:
28  *
29  * LOG_INODE_ALL means to log everything
30  * LOG_INODE_EXISTS means to log just enough to recreate the inode
31  * during log replay
32  */
33 #define LOG_INODE_ALL 0
34 #define LOG_INODE_EXISTS 1
35
36 /*
37  * stages for the tree walking.  The first
38  * stage (0) is to only pin down the blocks we find
39  * the second stage (1) is to make sure that all the inodes
40  * we find in the log are created in the subvolume.
41  *
42  * The last stage is to deal with directories and links and extents
43  * and all the other fun semantics
44  */
45 #define LOG_WALK_PIN_ONLY 0
46 #define LOG_WALK_REPLAY_INODES 1
47 #define LOG_WALK_REPLAY_ALL 2
48
49 static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
50                              struct btrfs_root *root, struct inode *inode,
51                              int inode_only);
52
53 /*
54  * tree logging is a special write ahead log used to make sure that
55  * fsyncs and O_SYNCs can happen without doing full tree commits.
56  *
57  * Full tree commits are expensive because they require commonly
58  * modified blocks to be recowed, creating many dirty pages in the
59  * extent tree an 4x-6x higher write load than ext3.
60  *
61  * Instead of doing a tree commit on every fsync, we use the
62  * key ranges and transaction ids to find items for a given file or directory
63  * that have changed in this transaction.  Those items are copied into
64  * a special tree (one per subvolume root), that tree is written to disk
65  * and then the fsync is considered complete.
66  *
67  * After a crash, items are copied out of the log-tree back into the
68  * subvolume tree.  Any file data extents found are recorded in the extent
69  * allocation tree, and the log-tree freed.
70  *
71  * The log tree is read three times, once to pin down all the extents it is
72  * using in ram and once, once to create all the inodes logged in the tree
73  * and once to do all the other items.
74  */
75
76 /*
77  * btrfs_add_log_tree adds a new per-subvolume log tree into the
78  * tree of log tree roots.  This must be called with a tree log transaction
79  * running (see start_log_trans).
80  */
81 int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
82                       struct btrfs_root *root)
83 {
84         struct btrfs_key key;
85         struct btrfs_root_item root_item;
86         struct btrfs_inode_item *inode_item;
87         struct extent_buffer *leaf;
88         struct btrfs_root *new_root = root;
89         int ret;
90         u64 objectid = root->root_key.objectid;
91
92         leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
93                                       BTRFS_TREE_LOG_OBJECTID,
94                                       trans->transid, 0, 0, 0);
95         if (IS_ERR(leaf)) {
96                 ret = PTR_ERR(leaf);
97                 return ret;
98         }
99
100         btrfs_set_header_nritems(leaf, 0);
101         btrfs_set_header_level(leaf, 0);
102         btrfs_set_header_bytenr(leaf, leaf->start);
103         btrfs_set_header_generation(leaf, trans->transid);
104         btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
105
106         write_extent_buffer(leaf, root->fs_info->fsid,
107                             (unsigned long)btrfs_header_fsid(leaf),
108                             BTRFS_FSID_SIZE);
109         btrfs_mark_buffer_dirty(leaf);
110
111         inode_item = &root_item.inode;
112         memset(inode_item, 0, sizeof(*inode_item));
113         inode_item->generation = cpu_to_le64(1);
114         inode_item->size = cpu_to_le64(3);
115         inode_item->nlink = cpu_to_le32(1);
116         inode_item->nbytes = cpu_to_le64(root->leafsize);
117         inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
118
119         btrfs_set_root_bytenr(&root_item, leaf->start);
120         btrfs_set_root_level(&root_item, 0);
121         btrfs_set_root_refs(&root_item, 0);
122         btrfs_set_root_used(&root_item, 0);
123
124         memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
125         root_item.drop_level = 0;
126
127         btrfs_tree_unlock(leaf);
128         free_extent_buffer(leaf);
129         leaf = NULL;
130
131         btrfs_set_root_dirid(&root_item, 0);
132
133         key.objectid = BTRFS_TREE_LOG_OBJECTID;
134         key.offset = objectid;
135         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
136         ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
137                                 &root_item);
138         if (ret)
139                 goto fail;
140
141         new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
142                                                &key);
143         BUG_ON(!new_root);
144
145         WARN_ON(root->log_root);
146         root->log_root = new_root;
147
148         /*
149          * log trees do not get reference counted because they go away
150          * before a real commit is actually done.  They do store pointers
151          * to file data extents, and those reference counts still get
152          * updated (along with back refs to the log tree).
153          */
154         new_root->ref_cows = 0;
155         new_root->last_trans = trans->transid;
156 fail:
157         return ret;
158 }
159
160 /*
161  * start a sub transaction and setup the log tree
162  * this increments the log tree writer count to make the people
163  * syncing the tree wait for us to finish
164  */
165 static int start_log_trans(struct btrfs_trans_handle *trans,
166                            struct btrfs_root *root)
167 {
168         int ret;
169         mutex_lock(&root->fs_info->tree_log_mutex);
170         if (!root->fs_info->log_root_tree) {
171                 ret = btrfs_init_log_root_tree(trans, root->fs_info);
172                 BUG_ON(ret);
173         }
174         if (!root->log_root) {
175                 ret = btrfs_add_log_tree(trans, root);
176                 BUG_ON(ret);
177         }
178         atomic_inc(&root->fs_info->tree_log_writers);
179         root->fs_info->tree_log_batch++;
180         mutex_unlock(&root->fs_info->tree_log_mutex);
181         return 0;
182 }
183
184 /*
185  * returns 0 if there was a log transaction running and we were able
186  * to join, or returns -ENOENT if there were not transactions
187  * in progress
188  */
189 static int join_running_log_trans(struct btrfs_root *root)
190 {
191         int ret = -ENOENT;
192
193         smp_mb();
194         if (!root->log_root)
195                 return -ENOENT;
196
197         mutex_lock(&root->fs_info->tree_log_mutex);
198         if (root->log_root) {
199                 ret = 0;
200                 atomic_inc(&root->fs_info->tree_log_writers);
201                 root->fs_info->tree_log_batch++;
202         }
203         mutex_unlock(&root->fs_info->tree_log_mutex);
204         return ret;
205 }
206
207 /*
208  * indicate we're done making changes to the log tree
209  * and wake up anyone waiting to do a sync
210  */
211 static int end_log_trans(struct btrfs_root *root)
212 {
213         atomic_dec(&root->fs_info->tree_log_writers);
214         smp_mb();
215         if (waitqueue_active(&root->fs_info->tree_log_wait))
216                 wake_up(&root->fs_info->tree_log_wait);
217         return 0;
218 }
219
220
221 /*
222  * the walk control struct is used to pass state down the chain when
223  * processing the log tree.  The stage field tells us which part
224  * of the log tree processing we are currently doing.  The others
225  * are state fields used for that specific part
226  */
227 struct walk_control {
228         /* should we free the extent on disk when done?  This is used
229          * at transaction commit time while freeing a log tree
230          */
231         int free;
232
233         /* should we write out the extent buffer?  This is used
234          * while flushing the log tree to disk during a sync
235          */
236         int write;
237
238         /* should we wait for the extent buffer io to finish?  Also used
239          * while flushing the log tree to disk for a sync
240          */
241         int wait;
242
243         /* pin only walk, we record which extents on disk belong to the
244          * log trees
245          */
246         int pin;
247
248         /* what stage of the replay code we're currently in */
249         int stage;
250
251         /* the root we are currently replaying */
252         struct btrfs_root *replay_dest;
253
254         /* the trans handle for the current replay */
255         struct btrfs_trans_handle *trans;
256
257         /* the function that gets used to process blocks we find in the
258          * tree.  Note the extent_buffer might not be up to date when it is
259          * passed in, and it must be checked or read if you need the data
260          * inside it
261          */
262         int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb,
263                             struct walk_control *wc, u64 gen);
264 };
265
266 /*
267  * process_func used to pin down extents, write them or wait on them
268  */
269 static int process_one_buffer(struct btrfs_root *log,
270                               struct extent_buffer *eb,
271                               struct walk_control *wc, u64 gen)
272 {
273         if (wc->pin) {
274                 mutex_lock(&log->fs_info->alloc_mutex);
275                 btrfs_update_pinned_extents(log->fs_info->extent_root,
276                                             eb->start, eb->len, 1);
277                 mutex_unlock(&log->fs_info->alloc_mutex);
278         }
279
280         if (btrfs_buffer_uptodate(eb, gen)) {
281                 if (wc->write)
282                         btrfs_write_tree_block(eb);
283                 if (wc->wait)
284                         btrfs_wait_tree_block_writeback(eb);
285         }
286         return 0;
287 }
288
289 /*
290  * Item overwrite used by replay and tree logging.  eb, slot and key all refer
291  * to the src data we are copying out.
292  *
293  * root is the tree we are copying into, and path is a scratch
294  * path for use in this function (it should be released on entry and
295  * will be released on exit).
296  *
297  * If the key is already in the destination tree the existing item is
298  * overwritten.  If the existing item isn't big enough, it is extended.
299  * If it is too large, it is truncated.
300  *
301  * If the key isn't in the destination yet, a new item is inserted.
302  */
303 static noinline int overwrite_item(struct btrfs_trans_handle *trans,
304                                    struct btrfs_root *root,
305                                    struct btrfs_path *path,
306                                    struct extent_buffer *eb, int slot,
307                                    struct btrfs_key *key)
308 {
309         int ret;
310         u32 item_size;
311         u64 saved_i_size = 0;
312         int save_old_i_size = 0;
313         unsigned long src_ptr;
314         unsigned long dst_ptr;
315         int overwrite_root = 0;
316
317         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
318                 overwrite_root = 1;
319
320         item_size = btrfs_item_size_nr(eb, slot);
321         src_ptr = btrfs_item_ptr_offset(eb, slot);
322
323         /* look for the key in the destination tree */
324         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
325         if (ret == 0) {
326                 char *src_copy;
327                 char *dst_copy;
328                 u32 dst_size = btrfs_item_size_nr(path->nodes[0],
329                                                   path->slots[0]);
330                 if (dst_size != item_size)
331                         goto insert;
332
333                 if (item_size == 0) {
334                         btrfs_release_path(root, path);
335                         return 0;
336                 }
337                 dst_copy = kmalloc(item_size, GFP_NOFS);
338                 src_copy = kmalloc(item_size, GFP_NOFS);
339
340                 read_extent_buffer(eb, src_copy, src_ptr, item_size);
341
342                 dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
343                 read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
344                                    item_size);
345                 ret = memcmp(dst_copy, src_copy, item_size);
346
347                 kfree(dst_copy);
348                 kfree(src_copy);
349                 /*
350                  * they have the same contents, just return, this saves
351                  * us from cowing blocks in the destination tree and doing
352                  * extra writes that may not have been done by a previous
353                  * sync
354                  */
355                 if (ret == 0) {
356                         btrfs_release_path(root, path);
357                         return 0;
358                 }
359
360         }
361 insert:
362         btrfs_release_path(root, path);
363         /* try to insert the key into the destination tree */
364         ret = btrfs_insert_empty_item(trans, root, path,
365                                       key, item_size);
366
367         /* make sure any existing item is the correct size */
368         if (ret == -EEXIST) {
369                 u32 found_size;
370                 found_size = btrfs_item_size_nr(path->nodes[0],
371                                                 path->slots[0]);
372                 if (found_size > item_size) {
373                         btrfs_truncate_item(trans, root, path, item_size, 1);
374                 } else if (found_size < item_size) {
375                         ret = btrfs_del_item(trans, root,
376                                              path);
377                         BUG_ON(ret);
378
379                         btrfs_release_path(root, path);
380                         ret = btrfs_insert_empty_item(trans,
381                                   root, path, key, item_size);
382                         BUG_ON(ret);
383                 }
384         } else if (ret) {
385                 BUG();
386         }
387         dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
388                                         path->slots[0]);
389
390         /* don't overwrite an existing inode if the generation number
391          * was logged as zero.  This is done when the tree logging code
392          * is just logging an inode to make sure it exists after recovery.
393          *
394          * Also, don't overwrite i_size on directories during replay.
395          * log replay inserts and removes directory items based on the
396          * state of the tree found in the subvolume, and i_size is modified
397          * as it goes
398          */
399         if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
400                 struct btrfs_inode_item *src_item;
401                 struct btrfs_inode_item *dst_item;
402
403                 src_item = (struct btrfs_inode_item *)src_ptr;
404                 dst_item = (struct btrfs_inode_item *)dst_ptr;
405
406                 if (btrfs_inode_generation(eb, src_item) == 0)
407                         goto no_copy;
408
409                 if (overwrite_root &&
410                     S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
411                     S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
412                         save_old_i_size = 1;
413                         saved_i_size = btrfs_inode_size(path->nodes[0],
414                                                         dst_item);
415                 }
416         }
417
418         copy_extent_buffer(path->nodes[0], eb, dst_ptr,
419                            src_ptr, item_size);
420
421         if (save_old_i_size) {
422                 struct btrfs_inode_item *dst_item;
423                 dst_item = (struct btrfs_inode_item *)dst_ptr;
424                 btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
425         }
426
427         /* make sure the generation is filled in */
428         if (key->type == BTRFS_INODE_ITEM_KEY) {
429                 struct btrfs_inode_item *dst_item;
430                 dst_item = (struct btrfs_inode_item *)dst_ptr;
431                 if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
432                         btrfs_set_inode_generation(path->nodes[0], dst_item,
433                                                    trans->transid);
434                 }
435         }
436
437         if (overwrite_root &&
438             key->type == BTRFS_EXTENT_DATA_KEY) {
439                 int extent_type;
440                 struct btrfs_file_extent_item *fi;
441
442                 fi = (struct btrfs_file_extent_item *)dst_ptr;
443                 extent_type = btrfs_file_extent_type(path->nodes[0], fi);
444                 if (extent_type == BTRFS_FILE_EXTENT_REG) {
445                         struct btrfs_key ins;
446                         ins.objectid = btrfs_file_extent_disk_bytenr(
447                                                         path->nodes[0], fi);
448                         ins.offset = btrfs_file_extent_disk_num_bytes(
449                                                         path->nodes[0], fi);
450                         ins.type = BTRFS_EXTENT_ITEM_KEY;
451
452                         /*
453                          * is this extent already allocated in the extent
454                          * allocation tree?  If so, just add a reference
455                          */
456                         ret = btrfs_lookup_extent(root, ins.objectid,
457                                                   ins.offset);
458                         if (ret == 0) {
459                                 ret = btrfs_inc_extent_ref(trans, root,
460                                                 ins.objectid, ins.offset,
461                                                 path->nodes[0]->start,
462                                                 root->root_key.objectid,
463                                                 trans->transid,
464                                                 key->objectid, key->offset);
465                         } else {
466                                 /*
467                                  * insert the extent pointer in the extent
468                                  * allocation tree
469                                  */
470                                 ret = btrfs_alloc_logged_extent(trans, root,
471                                                 path->nodes[0]->start,
472                                                 root->root_key.objectid,
473                                                 trans->transid, key->objectid,
474                                                 key->offset, &ins);
475                                 BUG_ON(ret);
476                         }
477                 }
478         }
479 no_copy:
480         btrfs_mark_buffer_dirty(path->nodes[0]);
481         btrfs_release_path(root, path);
482         return 0;
483 }
484
485 /*
486  * simple helper to read an inode off the disk from a given root
487  * This can only be called for subvolume roots and not for the log
488  */
489 static noinline struct inode *read_one_inode(struct btrfs_root *root,
490                                              u64 objectid)
491 {
492         struct inode *inode;
493         inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
494         if (inode->i_state & I_NEW) {
495                 BTRFS_I(inode)->root = root;
496                 BTRFS_I(inode)->location.objectid = objectid;
497                 BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
498                 BTRFS_I(inode)->location.offset = 0;
499                 btrfs_read_locked_inode(inode);
500                 unlock_new_inode(inode);
501
502         }
503         if (is_bad_inode(inode)) {
504                 iput(inode);
505                 inode = NULL;
506         }
507         return inode;
508 }
509
510 /* replays a single extent in 'eb' at 'slot' with 'key' into the
511  * subvolume 'root'.  path is released on entry and should be released
512  * on exit.
513  *
514  * extents in the log tree have not been allocated out of the extent
515  * tree yet.  So, this completes the allocation, taking a reference
516  * as required if the extent already exists or creating a new extent
517  * if it isn't in the extent allocation tree yet.
518  *
519  * The extent is inserted into the file, dropping any existing extents
520  * from the file that overlap the new one.
521  */
522 static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
523                                       struct btrfs_root *root,
524                                       struct btrfs_path *path,
525                                       struct extent_buffer *eb, int slot,
526                                       struct btrfs_key *key)
527 {
528         int found_type;
529         u64 mask = root->sectorsize - 1;
530         u64 extent_end;
531         u64 alloc_hint;
532         u64 start = key->offset;
533         struct btrfs_file_extent_item *item;
534         struct inode *inode = NULL;
535         unsigned long size;
536         int ret = 0;
537
538         item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
539         found_type = btrfs_file_extent_type(eb, item);
540
541         if (found_type == BTRFS_FILE_EXTENT_REG)
542                 extent_end = start + btrfs_file_extent_num_bytes(eb, item);
543         else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
544                 size = btrfs_file_extent_inline_len(eb,
545                                                     btrfs_item_nr(eb, slot));
546                 extent_end = (start + size + mask) & ~mask;
547         } else {
548                 ret = 0;
549                 goto out;
550         }
551
552         inode = read_one_inode(root, key->objectid);
553         if (!inode) {
554                 ret = -EIO;
555                 goto out;
556         }
557
558         /*
559          * first check to see if we already have this extent in the
560          * file.  This must be done before the btrfs_drop_extents run
561          * so we don't try to drop this extent.
562          */
563         ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
564                                        start, 0);
565
566         if (ret == 0 && found_type == BTRFS_FILE_EXTENT_REG) {
567                 struct btrfs_file_extent_item cmp1;
568                 struct btrfs_file_extent_item cmp2;
569                 struct btrfs_file_extent_item *existing;
570                 struct extent_buffer *leaf;
571
572                 leaf = path->nodes[0];
573                 existing = btrfs_item_ptr(leaf, path->slots[0],
574                                           struct btrfs_file_extent_item);
575
576                 read_extent_buffer(eb, &cmp1, (unsigned long)item,
577                                    sizeof(cmp1));
578                 read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
579                                    sizeof(cmp2));
580
581                 /*
582                  * we already have a pointer to this exact extent,
583                  * we don't have to do anything
584                  */
585                 if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
586                         btrfs_release_path(root, path);
587                         goto out;
588                 }
589         }
590         btrfs_release_path(root, path);
591
592         /* drop any overlapping extents */
593         ret = btrfs_drop_extents(trans, root, inode,
594                          start, extent_end, start, &alloc_hint);
595         BUG_ON(ret);
596
597         /* insert the extent */
598         ret = overwrite_item(trans, root, path, eb, slot, key);
599         BUG_ON(ret);
600
601         /* btrfs_drop_extents changes i_bytes & i_blocks, update it here */
602         inode_add_bytes(inode, extent_end - start);
603         btrfs_update_inode(trans, root, inode);
604 out:
605         if (inode)
606                 iput(inode);
607         return ret;
608 }
609
610 /*
611  * when cleaning up conflicts between the directory names in the
612  * subvolume, directory names in the log and directory names in the
613  * inode back references, we may have to unlink inodes from directories.
614  *
615  * This is a helper function to do the unlink of a specific directory
616  * item
617  */
618 static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
619                                       struct btrfs_root *root,
620                                       struct btrfs_path *path,
621                                       struct inode *dir,
622                                       struct btrfs_dir_item *di)
623 {
624         struct inode *inode;
625         char *name;
626         int name_len;
627         struct extent_buffer *leaf;
628         struct btrfs_key location;
629         int ret;
630
631         leaf = path->nodes[0];
632
633         btrfs_dir_item_key_to_cpu(leaf, di, &location);
634         name_len = btrfs_dir_name_len(leaf, di);
635         name = kmalloc(name_len, GFP_NOFS);
636         read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
637         btrfs_release_path(root, path);
638
639         inode = read_one_inode(root, location.objectid);
640         BUG_ON(!inode);
641
642         btrfs_inc_nlink(inode);
643         ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
644         kfree(name);
645
646         iput(inode);
647         return ret;
648 }
649
650 /*
651  * helper function to see if a given name and sequence number found
652  * in an inode back reference are already in a directory and correctly
653  * point to this inode
654  */
655 static noinline int inode_in_dir(struct btrfs_root *root,
656                                  struct btrfs_path *path,
657                                  u64 dirid, u64 objectid, u64 index,
658                                  const char *name, int name_len)
659 {
660         struct btrfs_dir_item *di;
661         struct btrfs_key location;
662         int match = 0;
663
664         di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
665                                          index, name, name_len, 0);
666         if (di && !IS_ERR(di)) {
667                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
668                 if (location.objectid != objectid)
669                         goto out;
670         } else
671                 goto out;
672         btrfs_release_path(root, path);
673
674         di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
675         if (di && !IS_ERR(di)) {
676                 btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
677                 if (location.objectid != objectid)
678                         goto out;
679         } else
680                 goto out;
681         match = 1;
682 out:
683         btrfs_release_path(root, path);
684         return match;
685 }
686
687 /*
688  * helper function to check a log tree for a named back reference in
689  * an inode.  This is used to decide if a back reference that is
690  * found in the subvolume conflicts with what we find in the log.
691  *
692  * inode backreferences may have multiple refs in a single item,
693  * during replay we process one reference at a time, and we don't
694  * want to delete valid links to a file from the subvolume if that
695  * link is also in the log.
696  */
697 static noinline int backref_in_log(struct btrfs_root *log,
698                                    struct btrfs_key *key,
699                                    char *name, int namelen)
700 {
701         struct btrfs_path *path;
702         struct btrfs_inode_ref *ref;
703         unsigned long ptr;
704         unsigned long ptr_end;
705         unsigned long name_ptr;
706         int found_name_len;
707         int item_size;
708         int ret;
709         int match = 0;
710
711         path = btrfs_alloc_path();
712         ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
713         if (ret != 0)
714                 goto out;
715
716         item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
717         ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
718         ptr_end = ptr + item_size;
719         while (ptr < ptr_end) {
720                 ref = (struct btrfs_inode_ref *)ptr;
721                 found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
722                 if (found_name_len == namelen) {
723                         name_ptr = (unsigned long)(ref + 1);
724                         ret = memcmp_extent_buffer(path->nodes[0], name,
725                                                    name_ptr, namelen);
726                         if (ret == 0) {
727                                 match = 1;
728                                 goto out;
729                         }
730                 }
731                 ptr = (unsigned long)(ref + 1) + found_name_len;
732         }
733 out:
734         btrfs_free_path(path);
735         return match;
736 }
737
738
739 /*
740  * replay one inode back reference item found in the log tree.
741  * eb, slot and key refer to the buffer and key found in the log tree.
742  * root is the destination we are replaying into, and path is for temp
743  * use by this function.  (it should be released on return).
744  */
745 static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
746                                   struct btrfs_root *root,
747                                   struct btrfs_root *log,
748                                   struct btrfs_path *path,
749                                   struct extent_buffer *eb, int slot,
750                                   struct btrfs_key *key)
751 {
752         struct inode *dir;
753         int ret;
754         struct btrfs_key location;
755         struct btrfs_inode_ref *ref;
756         struct btrfs_dir_item *di;
757         struct inode *inode;
758         char *name;
759         int namelen;
760         unsigned long ref_ptr;
761         unsigned long ref_end;
762
763         location.objectid = key->objectid;
764         location.type = BTRFS_INODE_ITEM_KEY;
765         location.offset = 0;
766
767         /*
768          * it is possible that we didn't log all the parent directories
769          * for a given inode.  If we don't find the dir, just don't
770          * copy the back ref in.  The link count fixup code will take
771          * care of the rest
772          */
773         dir = read_one_inode(root, key->offset);
774         if (!dir)
775                 return -ENOENT;
776
777         inode = read_one_inode(root, key->objectid);
778         BUG_ON(!dir);
779
780         ref_ptr = btrfs_item_ptr_offset(eb, slot);
781         ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
782
783 again:
784         ref = (struct btrfs_inode_ref *)ref_ptr;
785
786         namelen = btrfs_inode_ref_name_len(eb, ref);
787         name = kmalloc(namelen, GFP_NOFS);
788         BUG_ON(!name);
789
790         read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
791
792         /* if we already have a perfect match, we're done */
793         if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
794                          btrfs_inode_ref_index(eb, ref),
795                          name, namelen)) {
796                 goto out;
797         }
798
799         /*
800          * look for a conflicting back reference in the metadata.
801          * if we find one we have to unlink that name of the file
802          * before we add our new link.  Later on, we overwrite any
803          * existing back reference, and we don't want to create
804          * dangling pointers in the directory.
805          */
806 conflict_again:
807         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
808         if (ret == 0) {
809                 char *victim_name;
810                 int victim_name_len;
811                 struct btrfs_inode_ref *victim_ref;
812                 unsigned long ptr;
813                 unsigned long ptr_end;
814                 struct extent_buffer *leaf = path->nodes[0];
815
816                 /* are we trying to overwrite a back ref for the root directory
817                  * if so, just jump out, we're done
818                  */
819                 if (key->objectid == key->offset)
820                         goto out_nowrite;
821
822                 /* check all the names in this back reference to see
823                  * if they are in the log.  if so, we allow them to stay
824                  * otherwise they must be unlinked as a conflict
825                  */
826                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
827                 ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
828                 while(ptr < ptr_end) {
829                         victim_ref = (struct btrfs_inode_ref *)ptr;
830                         victim_name_len = btrfs_inode_ref_name_len(leaf,
831                                                                    victim_ref);
832                         victim_name = kmalloc(victim_name_len, GFP_NOFS);
833                         BUG_ON(!victim_name);
834
835                         read_extent_buffer(leaf, victim_name,
836                                            (unsigned long)(victim_ref + 1),
837                                            victim_name_len);
838
839                         if (!backref_in_log(log, key, victim_name,
840                                             victim_name_len)) {
841                                 btrfs_inc_nlink(inode);
842                                 btrfs_release_path(root, path);
843                                 ret = btrfs_unlink_inode(trans, root, dir,
844                                                          inode, victim_name,
845                                                          victim_name_len);
846                                 kfree(victim_name);
847                                 btrfs_release_path(root, path);
848                                 goto conflict_again;
849                         }
850                         kfree(victim_name);
851                         ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
852                 }
853                 BUG_ON(ret);
854         }
855         btrfs_release_path(root, path);
856
857         /* look for a conflicting sequence number */
858         di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
859                                          btrfs_inode_ref_index(eb, ref),
860                                          name, namelen, 0);
861         if (di && !IS_ERR(di)) {
862                 ret = drop_one_dir_item(trans, root, path, dir, di);
863                 BUG_ON(ret);
864         }
865         btrfs_release_path(root, path);
866
867
868         /* look for a conflicting name */
869         di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
870                                    name, namelen, 0);
871         if (di && !IS_ERR(di)) {
872                 ret = drop_one_dir_item(trans, root, path, dir, di);
873                 BUG_ON(ret);
874         }
875         btrfs_release_path(root, path);
876
877         /* insert our name */
878         ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
879                              btrfs_inode_ref_index(eb, ref));
880         BUG_ON(ret);
881
882         btrfs_update_inode(trans, root, inode);
883
884 out:
885         ref_ptr = (unsigned long)(ref + 1) + namelen;
886         kfree(name);
887         if (ref_ptr < ref_end)
888                 goto again;
889
890         /* finally write the back reference in the inode */
891         ret = overwrite_item(trans, root, path, eb, slot, key);
892         BUG_ON(ret);
893
894 out_nowrite:
895         btrfs_release_path(root, path);
896         iput(dir);
897         iput(inode);
898         return 0;
899 }
900
901 /*
902  * replay one csum item from the log tree into the subvolume 'root'
903  * eb, slot and key all refer to the log tree
904  * path is for temp use by this function and should be released on return
905  *
906  * This copies the checksums out of the log tree and inserts them into
907  * the subvolume.  Any existing checksums for this range in the file
908  * are overwritten, and new items are added where required.
909  *
910  * We keep this simple by reusing the btrfs_ordered_sum code from
911  * the data=ordered mode.  This basically means making a copy
912  * of all the checksums in ram, which we have to do anyway for kmap
913  * rules.
914  *
915  * The copy is then sent down to btrfs_csum_file_blocks, which
916  * does all the hard work of finding existing items in the file
917  * or adding new ones.
918  */
919 static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
920                                       struct btrfs_root *root,
921                                       struct btrfs_path *path,
922                                       struct extent_buffer *eb, int slot,
923                                       struct btrfs_key *key)
924 {
925         int ret;
926         u32 item_size = btrfs_item_size_nr(eb, slot);
927         u64 cur_offset;
928         unsigned long file_bytes;
929         struct btrfs_ordered_sum *sums;
930         struct btrfs_sector_sum *sector_sum;
931         struct inode *inode;
932         unsigned long ptr;
933
934         file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize;
935         inode = read_one_inode(root, key->objectid);
936         if (!inode) {
937                 return -EIO;
938         }
939
940         sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
941         if (!sums) {
942                 iput(inode);
943                 return -ENOMEM;
944         }
945
946         INIT_LIST_HEAD(&sums->list);
947         sums->len = file_bytes;
948         sums->file_offset = key->offset;
949
950         /*
951          * copy all the sums into the ordered sum struct
952          */
953         sector_sum = sums->sums;
954         cur_offset = key->offset;
955         ptr = btrfs_item_ptr_offset(eb, slot);
956         while(item_size > 0) {
957                 sector_sum->offset = cur_offset;
958                 read_extent_buffer(eb, &sector_sum->sum, ptr, BTRFS_CRC32_SIZE);
959                 sector_sum++;
960                 item_size -= BTRFS_CRC32_SIZE;
961                 ptr += BTRFS_CRC32_SIZE;
962                 cur_offset += root->sectorsize;
963         }
964
965         /* let btrfs_csum_file_blocks add them into the file */
966         ret = btrfs_csum_file_blocks(trans, root, inode, sums);
967         BUG_ON(ret);
968         kfree(sums);
969         iput(inode);
970
971         return 0;
972 }
973 /*
974  * There are a few corners where the link count of the file can't
975  * be properly maintained during replay.  So, instead of adding
976  * lots of complexity to the log code, we just scan the backrefs
977  * for any file that has been through replay.
978  *
979  * The scan will update the link count on the inode to reflect the
980  * number of back refs found.  If it goes down to zero, the iput
981  * will free the inode.
982  */
983 static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
984                                            struct btrfs_root *root,
985                                            struct inode *inode)
986 {
987         struct btrfs_path *path;
988         int ret;
989         struct btrfs_key key;
990         u64 nlink = 0;
991         unsigned long ptr;
992         unsigned long ptr_end;
993         int name_len;
994
995         key.objectid = inode->i_ino;
996         key.type = BTRFS_INODE_REF_KEY;
997         key.offset = (u64)-1;
998
999         path = btrfs_alloc_path();
1000
1001         while(1) {
1002                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1003                 if (ret < 0)
1004                         break;
1005                 if (ret > 0) {
1006                         if (path->slots[0] == 0)
1007                                 break;
1008                         path->slots[0]--;
1009                 }
1010                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1011                                       path->slots[0]);
1012                 if (key.objectid != inode->i_ino ||
1013                     key.type != BTRFS_INODE_REF_KEY)
1014                         break;
1015                 ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
1016                 ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
1017                                                    path->slots[0]);
1018                 while(ptr < ptr_end) {
1019                         struct btrfs_inode_ref *ref;
1020
1021                         ref = (struct btrfs_inode_ref *)ptr;
1022                         name_len = btrfs_inode_ref_name_len(path->nodes[0],
1023                                                             ref);
1024                         ptr = (unsigned long)(ref + 1) + name_len;
1025                         nlink++;
1026                 }
1027
1028                 if (key.offset == 0)
1029                         break;
1030                 key.offset--;
1031                 btrfs_release_path(root, path);
1032         }
1033         btrfs_free_path(path);
1034         if (nlink != inode->i_nlink) {
1035                 inode->i_nlink = nlink;
1036                 btrfs_update_inode(trans, root, inode);
1037         }
1038         BTRFS_I(inode)->index_cnt = (u64)-1;
1039
1040         return 0;
1041 }
1042
1043 static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
1044                                             struct btrfs_root *root,
1045                                             struct btrfs_path *path)
1046 {
1047         int ret;
1048         struct btrfs_key key;
1049         struct inode *inode;
1050
1051         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1052         key.type = BTRFS_ORPHAN_ITEM_KEY;
1053         key.offset = (u64)-1;
1054         while(1) {
1055                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1056                 if (ret < 0)
1057                         break;
1058
1059                 if (ret == 1) {
1060                         if (path->slots[0] == 0)
1061                                 break;
1062                         path->slots[0]--;
1063                 }
1064
1065                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1066                 if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID ||
1067                     key.type != BTRFS_ORPHAN_ITEM_KEY)
1068                         break;
1069
1070                 ret = btrfs_del_item(trans, root, path);
1071                 BUG_ON(ret);
1072
1073                 btrfs_release_path(root, path);
1074                 inode = read_one_inode(root, key.offset);
1075                 BUG_ON(!inode);
1076
1077                 ret = fixup_inode_link_count(trans, root, inode);
1078                 BUG_ON(ret);
1079
1080                 iput(inode);
1081
1082                 if (key.offset == 0)
1083                         break;
1084                 key.offset--;
1085         }
1086         btrfs_release_path(root, path);
1087         return 0;
1088 }
1089
1090
1091 /*
1092  * record a given inode in the fixup dir so we can check its link
1093  * count when replay is done.  The link count is incremented here
1094  * so the inode won't go away until we check it
1095  */
1096 static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
1097                                       struct btrfs_root *root,
1098                                       struct btrfs_path *path,
1099                                       u64 objectid)
1100 {
1101         struct btrfs_key key;
1102         int ret = 0;
1103         struct inode *inode;
1104
1105         inode = read_one_inode(root, objectid);
1106         BUG_ON(!inode);
1107
1108         key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
1109         btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
1110         key.offset = objectid;
1111
1112         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1113
1114         btrfs_release_path(root, path);
1115         if (ret == 0) {
1116                 btrfs_inc_nlink(inode);
1117                 btrfs_update_inode(trans, root, inode);
1118         } else if (ret == -EEXIST) {
1119                 ret = 0;
1120         } else {
1121                 BUG();
1122         }
1123         iput(inode);
1124
1125         return ret;
1126 }
1127
1128 /*
1129  * when replaying the log for a directory, we only insert names
1130  * for inodes that actually exist.  This means an fsync on a directory
1131  * does not implicitly fsync all the new files in it
1132  */
1133 static noinline int insert_one_name(struct btrfs_trans_handle *trans,
1134                                     struct btrfs_root *root,
1135                                     struct btrfs_path *path,
1136                                     u64 dirid, u64 index,
1137                                     char *name, int name_len, u8 type,
1138                                     struct btrfs_key *location)
1139 {
1140         struct inode *inode;
1141         struct inode *dir;
1142         int ret;
1143
1144         inode = read_one_inode(root, location->objectid);
1145         if (!inode)
1146                 return -ENOENT;
1147
1148         dir = read_one_inode(root, dirid);
1149         if (!dir) {
1150                 iput(inode);
1151                 return -EIO;
1152         }
1153         ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
1154
1155         /* FIXME, put inode into FIXUP list */
1156
1157         iput(inode);
1158         iput(dir);
1159         return ret;
1160 }
1161
1162 /*
1163  * take a single entry in a log directory item and replay it into
1164  * the subvolume.
1165  *
1166  * if a conflicting item exists in the subdirectory already,
1167  * the inode it points to is unlinked and put into the link count
1168  * fix up tree.
1169  *
1170  * If a name from the log points to a file or directory that does
1171  * not exist in the FS, it is skipped.  fsyncs on directories
1172  * do not force down inodes inside that directory, just changes to the
1173  * names or unlinks in a directory.
1174  */
1175 static noinline int replay_one_name(struct btrfs_trans_handle *trans,
1176                                     struct btrfs_root *root,
1177                                     struct btrfs_path *path,
1178                                     struct extent_buffer *eb,
1179                                     struct btrfs_dir_item *di,
1180                                     struct btrfs_key *key)
1181 {
1182         char *name;
1183         int name_len;
1184         struct btrfs_dir_item *dst_di;
1185         struct btrfs_key found_key;
1186         struct btrfs_key log_key;
1187         struct inode *dir;
1188         u8 log_type;
1189         int exists;
1190         int ret;
1191
1192         dir = read_one_inode(root, key->objectid);
1193         BUG_ON(!dir);
1194
1195         name_len = btrfs_dir_name_len(eb, di);
1196         name = kmalloc(name_len, GFP_NOFS);
1197         log_type = btrfs_dir_type(eb, di);
1198         read_extent_buffer(eb, name, (unsigned long)(di + 1),
1199                    name_len);
1200
1201         btrfs_dir_item_key_to_cpu(eb, di, &log_key);
1202         exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
1203         if (exists == 0)
1204                 exists = 1;
1205         else
1206                 exists = 0;
1207         btrfs_release_path(root, path);
1208
1209         if (key->type == BTRFS_DIR_ITEM_KEY) {
1210                 dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
1211                                        name, name_len, 1);
1212         }
1213         else if (key->type == BTRFS_DIR_INDEX_KEY) {
1214                 dst_di = btrfs_lookup_dir_index_item(trans, root, path,
1215                                                      key->objectid,
1216                                                      key->offset, name,
1217                                                      name_len, 1);
1218         } else {
1219                 BUG();
1220         }
1221         if (!dst_di || IS_ERR(dst_di)) {
1222                 /* we need a sequence number to insert, so we only
1223                  * do inserts for the BTRFS_DIR_INDEX_KEY types
1224                  */
1225                 if (key->type != BTRFS_DIR_INDEX_KEY)
1226                         goto out;
1227                 goto insert;
1228         }
1229
1230         btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
1231         /* the existing item matches the logged item */
1232         if (found_key.objectid == log_key.objectid &&
1233             found_key.type == log_key.type &&
1234             found_key.offset == log_key.offset &&
1235             btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
1236                 goto out;
1237         }
1238
1239         /*
1240          * don't drop the conflicting directory entry if the inode
1241          * for the new entry doesn't exist
1242          */
1243         if (!exists)
1244                 goto out;
1245
1246         ret = drop_one_dir_item(trans, root, path, dir, dst_di);
1247         BUG_ON(ret);
1248
1249         if (key->type == BTRFS_DIR_INDEX_KEY)
1250                 goto insert;
1251 out:
1252         btrfs_release_path(root, path);
1253         kfree(name);
1254         iput(dir);
1255         return 0;
1256
1257 insert:
1258         btrfs_release_path(root, path);
1259         ret = insert_one_name(trans, root, path, key->objectid, key->offset,
1260                               name, name_len, log_type, &log_key);
1261
1262         if (ret && ret != -ENOENT)
1263                 BUG();
1264         goto out;
1265 }
1266
1267 /*
1268  * find all the names in a directory item and reconcile them into
1269  * the subvolume.  Only BTRFS_DIR_ITEM_KEY types will have more than
1270  * one name in a directory item, but the same code gets used for
1271  * both directory index types
1272  */
1273 static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
1274                                         struct btrfs_root *root,
1275                                         struct btrfs_path *path,
1276                                         struct extent_buffer *eb, int slot,
1277                                         struct btrfs_key *key)
1278 {
1279         int ret;
1280         u32 item_size = btrfs_item_size_nr(eb, slot);
1281         struct btrfs_dir_item *di;
1282         int name_len;
1283         unsigned long ptr;
1284         unsigned long ptr_end;
1285
1286         ptr = btrfs_item_ptr_offset(eb, slot);
1287         ptr_end = ptr + item_size;
1288         while(ptr < ptr_end) {
1289                 di = (struct btrfs_dir_item *)ptr;
1290                 name_len = btrfs_dir_name_len(eb, di);
1291                 ret = replay_one_name(trans, root, path, eb, di, key);
1292                 BUG_ON(ret);
1293                 ptr = (unsigned long)(di + 1);
1294                 ptr += name_len;
1295         }
1296         return 0;
1297 }
1298
1299 /*
1300  * directory replay has two parts.  There are the standard directory
1301  * items in the log copied from the subvolume, and range items
1302  * created in the log while the subvolume was logged.
1303  *
1304  * The range items tell us which parts of the key space the log
1305  * is authoritative for.  During replay, if a key in the subvolume
1306  * directory is in a logged range item, but not actually in the log
1307  * that means it was deleted from the directory before the fsync
1308  * and should be removed.
1309  */
1310 static noinline int find_dir_range(struct btrfs_root *root,
1311                                    struct btrfs_path *path,
1312                                    u64 dirid, int key_type,
1313                                    u64 *start_ret, u64 *end_ret)
1314 {
1315         struct btrfs_key key;
1316         u64 found_end;
1317         struct btrfs_dir_log_item *item;
1318         int ret;
1319         int nritems;
1320
1321         if (*start_ret == (u64)-1)
1322                 return 1;
1323
1324         key.objectid = dirid;
1325         key.type = key_type;
1326         key.offset = *start_ret;
1327
1328         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1329         if (ret < 0)
1330                 goto out;
1331         if (ret > 0) {
1332                 if (path->slots[0] == 0)
1333                         goto out;
1334                 path->slots[0]--;
1335         }
1336         if (ret != 0)
1337                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1338
1339         if (key.type != key_type || key.objectid != dirid) {
1340                 ret = 1;
1341                 goto next;
1342         }
1343         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1344                               struct btrfs_dir_log_item);
1345         found_end = btrfs_dir_log_end(path->nodes[0], item);
1346
1347         if (*start_ret >= key.offset && *start_ret <= found_end) {
1348                 ret = 0;
1349                 *start_ret = key.offset;
1350                 *end_ret = found_end;
1351                 goto out;
1352         }
1353         ret = 1;
1354 next:
1355         /* check the next slot in the tree to see if it is a valid item */
1356         nritems = btrfs_header_nritems(path->nodes[0]);
1357         if (path->slots[0] >= nritems) {
1358                 ret = btrfs_next_leaf(root, path);
1359                 if (ret)
1360                         goto out;
1361         } else {
1362                 path->slots[0]++;
1363         }
1364
1365         btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
1366
1367         if (key.type != key_type || key.objectid != dirid) {
1368                 ret = 1;
1369                 goto out;
1370         }
1371         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
1372                               struct btrfs_dir_log_item);
1373         found_end = btrfs_dir_log_end(path->nodes[0], item);
1374         *start_ret = key.offset;
1375         *end_ret = found_end;
1376         ret = 0;
1377 out:
1378         btrfs_release_path(root, path);
1379         return ret;
1380 }
1381
1382 /*
1383  * this looks for a given directory item in the log.  If the directory
1384  * item is not in the log, the item is removed and the inode it points
1385  * to is unlinked
1386  */
1387 static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
1388                                       struct btrfs_root *root,
1389                                       struct btrfs_root *log,
1390                                       struct btrfs_path *path,
1391                                       struct btrfs_path *log_path,
1392                                       struct inode *dir,
1393                                       struct btrfs_key *dir_key)
1394 {
1395         int ret;
1396         struct extent_buffer *eb;
1397         int slot;
1398         u32 item_size;
1399         struct btrfs_dir_item *di;
1400         struct btrfs_dir_item *log_di;
1401         int name_len;
1402         unsigned long ptr;
1403         unsigned long ptr_end;
1404         char *name;
1405         struct inode *inode;
1406         struct btrfs_key location;
1407
1408 again:
1409         eb = path->nodes[0];
1410         slot = path->slots[0];
1411         item_size = btrfs_item_size_nr(eb, slot);
1412         ptr = btrfs_item_ptr_offset(eb, slot);
1413         ptr_end = ptr + item_size;
1414         while(ptr < ptr_end) {
1415                 di = (struct btrfs_dir_item *)ptr;
1416                 name_len = btrfs_dir_name_len(eb, di);
1417                 name = kmalloc(name_len, GFP_NOFS);
1418                 if (!name) {
1419                         ret = -ENOMEM;
1420                         goto out;
1421                 }
1422                 read_extent_buffer(eb, name, (unsigned long)(di + 1),
1423                                   name_len);
1424                 log_di = NULL;
1425                 if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
1426                         log_di = btrfs_lookup_dir_item(trans, log, log_path,
1427                                                        dir_key->objectid,
1428                                                        name, name_len, 0);
1429                 } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
1430                         log_di = btrfs_lookup_dir_index_item(trans, log,
1431                                                      log_path,
1432                                                      dir_key->objectid,
1433                                                      dir_key->offset,
1434                                                      name, name_len, 0);
1435                 }
1436                 if (!log_di || IS_ERR(log_di)) {
1437                         btrfs_dir_item_key_to_cpu(eb, di, &location);
1438                         btrfs_release_path(root, path);
1439                         btrfs_release_path(log, log_path);
1440                         inode = read_one_inode(root, location.objectid);
1441                         BUG_ON(!inode);
1442
1443                         ret = link_to_fixup_dir(trans, root,
1444                                                 path, location.objectid);
1445                         BUG_ON(ret);
1446                         btrfs_inc_nlink(inode);
1447                         ret = btrfs_unlink_inode(trans, root, dir, inode,
1448                                                  name, name_len);
1449                         BUG_ON(ret);
1450                         kfree(name);
1451                         iput(inode);
1452
1453                         /* there might still be more names under this key
1454                          * check and repeat if required
1455                          */
1456                         ret = btrfs_search_slot(NULL, root, dir_key, path,
1457                                                 0, 0);
1458                         if (ret == 0)
1459                                 goto again;
1460                         ret = 0;
1461                         goto out;
1462                 }
1463                 btrfs_release_path(log, log_path);
1464                 kfree(name);
1465
1466                 ptr = (unsigned long)(di + 1);
1467                 ptr += name_len;
1468         }
1469         ret = 0;
1470 out:
1471         btrfs_release_path(root, path);
1472         btrfs_release_path(log, log_path);
1473         return ret;
1474 }
1475
1476 /*
1477  * deletion replay happens before we copy any new directory items
1478  * out of the log or out of backreferences from inodes.  It
1479  * scans the log to find ranges of keys that log is authoritative for,
1480  * and then scans the directory to find items in those ranges that are
1481  * not present in the log.
1482  *
1483  * Anything we don't find in the log is unlinked and removed from the
1484  * directory.
1485  */
1486 static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
1487                                        struct btrfs_root *root,
1488                                        struct btrfs_root *log,
1489                                        struct btrfs_path *path,
1490                                        u64 dirid)
1491 {
1492         u64 range_start;
1493         u64 range_end;
1494         int key_type = BTRFS_DIR_LOG_ITEM_KEY;
1495         int ret = 0;
1496         struct btrfs_key dir_key;
1497         struct btrfs_key found_key;
1498         struct btrfs_path *log_path;
1499         struct inode *dir;
1500
1501         dir_key.objectid = dirid;
1502         dir_key.type = BTRFS_DIR_ITEM_KEY;
1503         log_path = btrfs_alloc_path();
1504         if (!log_path)
1505                 return -ENOMEM;
1506
1507         dir = read_one_inode(root, dirid);
1508         /* it isn't an error if the inode isn't there, that can happen
1509          * because we replay the deletes before we copy in the inode item
1510          * from the log
1511          */
1512         if (!dir) {
1513                 btrfs_free_path(log_path);
1514                 return 0;
1515         }
1516 again:
1517         range_start = 0;
1518         range_end = 0;
1519         while(1) {
1520                 ret = find_dir_range(log, path, dirid, key_type,
1521                                      &range_start, &range_end);
1522                 if (ret != 0)
1523                         break;
1524
1525                 dir_key.offset = range_start;
1526                 while(1) {
1527                         int nritems;
1528                         ret = btrfs_search_slot(NULL, root, &dir_key, path,
1529                                                 0, 0);
1530                         if (ret < 0)
1531                                 goto out;
1532
1533                         nritems = btrfs_header_nritems(path->nodes[0]);
1534                         if (path->slots[0] >= nritems) {
1535                                 ret = btrfs_next_leaf(root, path);
1536                                 if (ret)
1537                                         break;
1538                         }
1539                         btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1540                                               path->slots[0]);
1541                         if (found_key.objectid != dirid ||
1542                             found_key.type != dir_key.type)
1543                                 goto next_type;
1544
1545                         if (found_key.offset > range_end)
1546                                 break;
1547
1548                         ret = check_item_in_log(trans, root, log, path,
1549                                                 log_path, dir, &found_key);
1550                         BUG_ON(ret);
1551                         if (found_key.offset == (u64)-1)
1552                                 break;
1553                         dir_key.offset = found_key.offset + 1;
1554                 }
1555                 btrfs_release_path(root, path);
1556                 if (range_end == (u64)-1)
1557                         break;
1558                 range_start = range_end + 1;
1559         }
1560
1561 next_type:
1562         ret = 0;
1563         if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
1564                 key_type = BTRFS_DIR_LOG_INDEX_KEY;
1565                 dir_key.type = BTRFS_DIR_INDEX_KEY;
1566                 btrfs_release_path(root, path);
1567                 goto again;
1568         }
1569 out:
1570         btrfs_release_path(root, path);
1571         btrfs_free_path(log_path);
1572         iput(dir);
1573         return ret;
1574 }
1575
1576 /*
1577  * the process_func used to replay items from the log tree.  This
1578  * gets called in two different stages.  The first stage just looks
1579  * for inodes and makes sure they are all copied into the subvolume.
1580  *
1581  * The second stage copies all the other item types from the log into
1582  * the subvolume.  The two stage approach is slower, but gets rid of
1583  * lots of complexity around inodes referencing other inodes that exist
1584  * only in the log (references come from either directory items or inode
1585  * back refs).
1586  */
1587 static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
1588                              struct walk_control *wc, u64 gen)
1589 {
1590         int nritems;
1591         struct btrfs_path *path;
1592         struct btrfs_root *root = wc->replay_dest;
1593         struct btrfs_key key;
1594         u32 item_size;
1595         int level;
1596         int i;
1597         int ret;
1598
1599         btrfs_read_buffer(eb, gen);
1600
1601         level = btrfs_header_level(eb);
1602
1603         if (level != 0)
1604                 return 0;
1605
1606         path = btrfs_alloc_path();
1607         BUG_ON(!path);
1608
1609         nritems = btrfs_header_nritems(eb);
1610         for (i = 0; i < nritems; i++) {
1611                 btrfs_item_key_to_cpu(eb, &key, i);
1612                 item_size = btrfs_item_size_nr(eb, i);
1613
1614                 /* inode keys are done during the first stage */
1615                 if (key.type == BTRFS_INODE_ITEM_KEY &&
1616                     wc->stage == LOG_WALK_REPLAY_INODES) {
1617                         struct inode *inode;
1618                         struct btrfs_inode_item *inode_item;
1619                         u32 mode;
1620
1621                         inode_item = btrfs_item_ptr(eb, i,
1622                                             struct btrfs_inode_item);
1623                         mode = btrfs_inode_mode(eb, inode_item);
1624                         if (S_ISDIR(mode)) {
1625                                 ret = replay_dir_deletes(wc->trans,
1626                                          root, log, path, key.objectid);
1627                                 BUG_ON(ret);
1628                         }
1629                         ret = overwrite_item(wc->trans, root, path,
1630                                              eb, i, &key);
1631                         BUG_ON(ret);
1632
1633                         /* for regular files, truncate away
1634                          * extents past the new EOF
1635                          */
1636                         if (S_ISREG(mode)) {
1637                                 inode = read_one_inode(root,
1638                                                        key.objectid);
1639                                 BUG_ON(!inode);
1640
1641                                 ret = btrfs_truncate_inode_items(wc->trans,
1642                                         root, inode, inode->i_size,
1643                                         BTRFS_EXTENT_DATA_KEY);
1644                                 BUG_ON(ret);
1645                                 iput(inode);
1646                         }
1647                         ret = link_to_fixup_dir(wc->trans, root,
1648                                                 path, key.objectid);
1649                         BUG_ON(ret);
1650                 }
1651                 if (wc->stage < LOG_WALK_REPLAY_ALL)
1652                         continue;
1653
1654                 /* these keys are simply copied */
1655                 if (key.type == BTRFS_XATTR_ITEM_KEY) {
1656                         ret = overwrite_item(wc->trans, root, path,
1657                                              eb, i, &key);
1658                         BUG_ON(ret);
1659                 } else if (key.type == BTRFS_INODE_REF_KEY) {
1660                         ret = add_inode_ref(wc->trans, root, log, path,
1661                                             eb, i, &key);
1662                         BUG_ON(ret && ret != -ENOENT);
1663                 } else if (key.type == BTRFS_EXTENT_DATA_KEY) {
1664                         ret = replay_one_extent(wc->trans, root, path,
1665                                                 eb, i, &key);
1666                         BUG_ON(ret);
1667                 } else if (key.type == BTRFS_CSUM_ITEM_KEY) {
1668                         ret = replay_one_csum(wc->trans, root, path,
1669                                               eb, i, &key);
1670                         BUG_ON(ret);
1671                 } else if (key.type == BTRFS_DIR_ITEM_KEY ||
1672                            key.type == BTRFS_DIR_INDEX_KEY) {
1673                         ret = replay_one_dir_item(wc->trans, root, path,
1674                                                   eb, i, &key);
1675                         BUG_ON(ret);
1676                 }
1677         }
1678         btrfs_free_path(path);
1679         return 0;
1680 }
1681
1682 static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
1683                                    struct btrfs_root *root,
1684                                    struct btrfs_path *path, int *level,
1685                                    struct walk_control *wc)
1686 {
1687         u64 root_owner;
1688         u64 root_gen;
1689         u64 bytenr;
1690         u64 ptr_gen;
1691         struct extent_buffer *next;
1692         struct extent_buffer *cur;
1693         struct extent_buffer *parent;
1694         u32 blocksize;
1695         int ret = 0;
1696
1697         WARN_ON(*level < 0);
1698         WARN_ON(*level >= BTRFS_MAX_LEVEL);
1699
1700         while(*level > 0) {
1701                 WARN_ON(*level < 0);
1702                 WARN_ON(*level >= BTRFS_MAX_LEVEL);
1703                 cur = path->nodes[*level];
1704
1705                 if (btrfs_header_level(cur) != *level)
1706                         WARN_ON(1);
1707
1708                 if (path->slots[*level] >=
1709                     btrfs_header_nritems(cur))
1710                         break;
1711
1712                 bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
1713                 ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
1714                 blocksize = btrfs_level_size(root, *level - 1);
1715
1716                 parent = path->nodes[*level];
1717                 root_owner = btrfs_header_owner(parent);
1718                 root_gen = btrfs_header_generation(parent);
1719
1720                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
1721
1722                 wc->process_func(root, next, wc, ptr_gen);
1723
1724                 if (*level == 1) {
1725                         path->slots[*level]++;
1726                         if (wc->free) {
1727                                 btrfs_read_buffer(next, ptr_gen);
1728
1729                                 btrfs_tree_lock(next);
1730                                 clean_tree_block(trans, root, next);
1731                                 btrfs_wait_tree_block_writeback(next);
1732                                 btrfs_tree_unlock(next);
1733
1734                                 ret = btrfs_drop_leaf_ref(trans, root, next);
1735                                 BUG_ON(ret);
1736
1737                                 WARN_ON(root_owner !=
1738                                         BTRFS_TREE_LOG_OBJECTID);
1739                                 ret = btrfs_free_reserved_extent(root,
1740                                                          bytenr, blocksize);
1741                                 BUG_ON(ret);
1742                         }
1743                         free_extent_buffer(next);
1744                         continue;
1745                 }
1746                 btrfs_read_buffer(next, ptr_gen);
1747
1748                 WARN_ON(*level <= 0);
1749                 if (path->nodes[*level-1])
1750                         free_extent_buffer(path->nodes[*level-1]);
1751                 path->nodes[*level-1] = next;
1752                 *level = btrfs_header_level(next);
1753                 path->slots[*level] = 0;
1754                 cond_resched();
1755         }
1756         WARN_ON(*level < 0);
1757         WARN_ON(*level >= BTRFS_MAX_LEVEL);
1758
1759         if (path->nodes[*level] == root->node) {
1760                 parent = path->nodes[*level];
1761         } else {
1762                 parent = path->nodes[*level + 1];
1763         }
1764         bytenr = path->nodes[*level]->start;
1765
1766         blocksize = btrfs_level_size(root, *level);
1767         root_owner = btrfs_header_owner(parent);
1768         root_gen = btrfs_header_generation(parent);
1769
1770         wc->process_func(root, path->nodes[*level], wc,
1771                          btrfs_header_generation(path->nodes[*level]));
1772
1773         if (wc->free) {
1774                 next = path->nodes[*level];
1775                 btrfs_tree_lock(next);
1776                 clean_tree_block(trans, root, next);
1777                 btrfs_wait_tree_block_writeback(next);
1778                 btrfs_tree_unlock(next);
1779
1780                 if (*level == 0) {
1781                         ret = btrfs_drop_leaf_ref(trans, root, next);
1782                         BUG_ON(ret);
1783                 }
1784                 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1785                 ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
1786                 BUG_ON(ret);
1787         }
1788         free_extent_buffer(path->nodes[*level]);
1789         path->nodes[*level] = NULL;
1790         *level += 1;
1791
1792         cond_resched();
1793         return 0;
1794 }
1795
1796 static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
1797                                  struct btrfs_root *root,
1798                                  struct btrfs_path *path, int *level,
1799                                  struct walk_control *wc)
1800 {
1801         u64 root_owner;
1802         u64 root_gen;
1803         int i;
1804         int slot;
1805         int ret;
1806
1807         for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
1808                 slot = path->slots[i];
1809                 if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
1810                         struct extent_buffer *node;
1811                         node = path->nodes[i];
1812                         path->slots[i]++;
1813                         *level = i;
1814                         WARN_ON(*level == 0);
1815                         return 0;
1816                 } else {
1817                         struct extent_buffer *parent;
1818                         if (path->nodes[*level] == root->node)
1819                                 parent = path->nodes[*level];
1820                         else
1821                                 parent = path->nodes[*level + 1];
1822
1823                         root_owner = btrfs_header_owner(parent);
1824                         root_gen = btrfs_header_generation(parent);
1825                         wc->process_func(root, path->nodes[*level], wc,
1826                                  btrfs_header_generation(path->nodes[*level]));
1827                         if (wc->free) {
1828                                 struct extent_buffer *next;
1829
1830                                 next = path->nodes[*level];
1831
1832                                 btrfs_tree_lock(next);
1833                                 clean_tree_block(trans, root, next);
1834                                 btrfs_wait_tree_block_writeback(next);
1835                                 btrfs_tree_unlock(next);
1836
1837                                 if (*level == 0) {
1838                                         ret = btrfs_drop_leaf_ref(trans, root,
1839                                                                   next);
1840                                         BUG_ON(ret);
1841                                 }
1842
1843                                 WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
1844                                 ret = btrfs_free_reserved_extent(root,
1845                                                 path->nodes[*level]->start,
1846                                                 path->nodes[*level]->len);
1847                                 BUG_ON(ret);
1848                         }
1849                         free_extent_buffer(path->nodes[*level]);
1850                         path->nodes[*level] = NULL;
1851                         *level = i + 1;
1852                 }
1853         }
1854         return 1;
1855 }
1856
1857 /*
1858  * drop the reference count on the tree rooted at 'snap'.  This traverses
1859  * the tree freeing any blocks that have a ref count of zero after being
1860  * decremented.
1861  */
1862 static int walk_log_tree(struct btrfs_trans_handle *trans,
1863                          struct btrfs_root *log, struct walk_control *wc)
1864 {
1865         int ret = 0;
1866         int wret;
1867         int level;
1868         struct btrfs_path *path;
1869         int i;
1870         int orig_level;
1871
1872         path = btrfs_alloc_path();
1873         BUG_ON(!path);
1874
1875         level = btrfs_header_level(log->node);
1876         orig_level = level;
1877         path->nodes[level] = log->node;
1878         extent_buffer_get(log->node);
1879         path->slots[level] = 0;
1880
1881         while(1) {
1882                 wret = walk_down_log_tree(trans, log, path, &level, wc);
1883                 if (wret > 0)
1884                         break;
1885                 if (wret < 0)
1886                         ret = wret;
1887
1888                 wret = walk_up_log_tree(trans, log, path, &level, wc);
1889                 if (wret > 0)
1890                         break;
1891                 if (wret < 0)
1892                         ret = wret;
1893         }
1894
1895         /* was the root node processed? if not, catch it here */
1896         if (path->nodes[orig_level]) {
1897                 wc->process_func(log, path->nodes[orig_level], wc,
1898                          btrfs_header_generation(path->nodes[orig_level]));
1899                 if (wc->free) {
1900                         struct extent_buffer *next;
1901
1902                         next = path->nodes[orig_level];
1903
1904                         btrfs_tree_lock(next);
1905                         clean_tree_block(trans, log, next);
1906                         btrfs_wait_tree_block_writeback(next);
1907                         btrfs_tree_unlock(next);
1908
1909                         if (orig_level == 0) {
1910                                 ret = btrfs_drop_leaf_ref(trans, log,
1911                                                           next);
1912                                 BUG_ON(ret);
1913                         }
1914                         WARN_ON(log->root_key.objectid !=
1915                                 BTRFS_TREE_LOG_OBJECTID);
1916                         ret = btrfs_free_reserved_extent(log, next->start,
1917                                                          next->len);
1918                         BUG_ON(ret);
1919                 }
1920         }
1921
1922         for (i = 0; i <= orig_level; i++) {
1923                 if (path->nodes[i]) {
1924                         free_extent_buffer(path->nodes[i]);
1925                         path->nodes[i] = NULL;
1926                 }
1927         }
1928         btrfs_free_path(path);
1929         if (wc->free)
1930                 free_extent_buffer(log->node);
1931         return ret;
1932 }
1933
1934 int wait_log_commit(struct btrfs_root *log)
1935 {
1936         DEFINE_WAIT(wait);
1937         u64 transid = log->fs_info->tree_log_transid;
1938
1939         do {
1940                 prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1941                                 TASK_UNINTERRUPTIBLE);
1942                 mutex_unlock(&log->fs_info->tree_log_mutex);
1943                 if (atomic_read(&log->fs_info->tree_log_commit))
1944                         schedule();
1945                 finish_wait(&log->fs_info->tree_log_wait, &wait);
1946                 mutex_lock(&log->fs_info->tree_log_mutex);
1947         } while(transid == log->fs_info->tree_log_transid &&
1948                 atomic_read(&log->fs_info->tree_log_commit));
1949         return 0;
1950 }
1951
1952 /*
1953  * btrfs_sync_log does sends a given tree log down to the disk and
1954  * updates the super blocks to record it.  When this call is done,
1955  * you know that any inodes previously logged are safely on disk
1956  */
1957 int btrfs_sync_log(struct btrfs_trans_handle *trans,
1958                    struct btrfs_root *root)
1959 {
1960         int ret;
1961         unsigned long batch;
1962         struct btrfs_root *log = root->log_root;
1963
1964         mutex_lock(&log->fs_info->tree_log_mutex);
1965         if (atomic_read(&log->fs_info->tree_log_commit)) {
1966                 wait_log_commit(log);
1967                 goto out;
1968         }
1969         atomic_set(&log->fs_info->tree_log_commit, 1);
1970
1971         while(1) {
1972                 batch = log->fs_info->tree_log_batch;
1973                 mutex_unlock(&log->fs_info->tree_log_mutex);
1974                 schedule_timeout_uninterruptible(1);
1975                 mutex_lock(&log->fs_info->tree_log_mutex);
1976
1977                 while(atomic_read(&log->fs_info->tree_log_writers)) {
1978                         DEFINE_WAIT(wait);
1979                         prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
1980                                         TASK_UNINTERRUPTIBLE);
1981                         mutex_unlock(&log->fs_info->tree_log_mutex);
1982                         if (atomic_read(&log->fs_info->tree_log_writers))
1983                                 schedule();
1984                         mutex_lock(&log->fs_info->tree_log_mutex);
1985                         finish_wait(&log->fs_info->tree_log_wait, &wait);
1986                 }
1987                 if (batch == log->fs_info->tree_log_batch)
1988                         break;
1989         }
1990
1991         ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
1992         BUG_ON(ret);
1993         ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
1994                                &root->fs_info->log_root_tree->dirty_log_pages);
1995         BUG_ON(ret);
1996
1997         btrfs_set_super_log_root(&root->fs_info->super_for_commit,
1998                                  log->fs_info->log_root_tree->node->start);
1999         btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
2000                        btrfs_header_level(log->fs_info->log_root_tree->node));
2001
2002         write_ctree_super(trans, log->fs_info->tree_root);
2003         log->fs_info->tree_log_transid++;
2004         log->fs_info->tree_log_batch = 0;
2005         atomic_set(&log->fs_info->tree_log_commit, 0);
2006         smp_mb();
2007         if (waitqueue_active(&log->fs_info->tree_log_wait))
2008                 wake_up(&log->fs_info->tree_log_wait);
2009 out:
2010         mutex_unlock(&log->fs_info->tree_log_mutex);
2011         return 0;
2012
2013 }
2014
2015 /* * free all the extents used by the tree log.  This should be called
2016  * at commit time of the full transaction
2017  */
2018 int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
2019 {
2020         int ret;
2021         struct btrfs_root *log;
2022         struct key;
2023         u64 start;
2024         u64 end;
2025         struct walk_control wc = {
2026                 .free = 1,
2027                 .process_func = process_one_buffer
2028         };
2029
2030         if (!root->log_root)
2031                 return 0;
2032
2033         log = root->log_root;
2034         ret = walk_log_tree(trans, log, &wc);
2035         BUG_ON(ret);
2036
2037         while(1) {
2038                 ret = find_first_extent_bit(&log->dirty_log_pages,
2039                                     0, &start, &end, EXTENT_DIRTY);
2040                 if (ret)
2041                         break;
2042
2043                 clear_extent_dirty(&log->dirty_log_pages,
2044                                    start, end, GFP_NOFS);
2045         }
2046
2047         log = root->log_root;
2048         ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
2049                              &log->root_key);
2050         BUG_ON(ret);
2051         root->log_root = NULL;
2052         kfree(root->log_root);
2053         return 0;
2054 }
2055
2056 /*
2057  * helper function to update the item for a given subvolumes log root
2058  * in the tree of log roots
2059  */
2060 static int update_log_root(struct btrfs_trans_handle *trans,
2061                            struct btrfs_root *log)
2062 {
2063         u64 bytenr = btrfs_root_bytenr(&log->root_item);
2064         int ret;
2065
2066         if (log->node->start == bytenr)
2067                 return 0;
2068
2069         btrfs_set_root_bytenr(&log->root_item, log->node->start);
2070         btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
2071         ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
2072                                 &log->root_key, &log->root_item);
2073         BUG_ON(ret);
2074         return ret;
2075 }
2076
2077 /*
2078  * If both a file and directory are logged, and unlinks or renames are
2079  * mixed in, we have a few interesting corners:
2080  *
2081  * create file X in dir Y
2082  * link file X to X.link in dir Y
2083  * fsync file X
2084  * unlink file X but leave X.link
2085  * fsync dir Y
2086  *
2087  * After a crash we would expect only X.link to exist.  But file X
2088  * didn't get fsync'd again so the log has back refs for X and X.link.
2089  *
2090  * We solve this by removing directory entries and inode backrefs from the
2091  * log when a file that was logged in the current transaction is
2092  * unlinked.  Any later fsync will include the updated log entries, and
2093  * we'll be able to reconstruct the proper directory items from backrefs.
2094  *
2095  * This optimizations allows us to avoid relogging the entire inode
2096  * or the entire directory.
2097  */
2098 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
2099                                  struct btrfs_root *root,
2100                                  const char *name, int name_len,
2101                                  struct inode *dir, u64 index)
2102 {
2103         struct btrfs_root *log;
2104         struct btrfs_dir_item *di;
2105         struct btrfs_path *path;
2106         int ret;
2107         int bytes_del = 0;
2108
2109         if (BTRFS_I(dir)->logged_trans < trans->transid)
2110                 return 0;
2111
2112         ret = join_running_log_trans(root);
2113         if (ret)
2114                 return 0;
2115
2116         mutex_lock(&BTRFS_I(dir)->log_mutex);
2117
2118         log = root->log_root;
2119         path = btrfs_alloc_path();
2120         di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
2121                                    name, name_len, -1);
2122         if (di && !IS_ERR(di)) {
2123                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2124                 bytes_del += name_len;
2125                 BUG_ON(ret);
2126         }
2127         btrfs_release_path(log, path);
2128         di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
2129                                          index, name, name_len, -1);
2130         if (di && !IS_ERR(di)) {
2131                 ret = btrfs_delete_one_dir_name(trans, log, path, di);
2132                 bytes_del += name_len;
2133                 BUG_ON(ret);
2134         }
2135
2136         /* update the directory size in the log to reflect the names
2137          * we have removed
2138          */
2139         if (bytes_del) {
2140                 struct btrfs_key key;
2141
2142                 key.objectid = dir->i_ino;
2143                 key.offset = 0;
2144                 key.type = BTRFS_INODE_ITEM_KEY;
2145                 btrfs_release_path(log, path);
2146
2147                 ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
2148                 if (ret == 0) {
2149                         struct btrfs_inode_item *item;
2150                         u64 i_size;
2151
2152                         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2153                                               struct btrfs_inode_item);
2154                         i_size = btrfs_inode_size(path->nodes[0], item);
2155                         if (i_size > bytes_del)
2156                                 i_size -= bytes_del;
2157                         else
2158                                 i_size = 0;
2159                         btrfs_set_inode_size(path->nodes[0], item, i_size);
2160                         btrfs_mark_buffer_dirty(path->nodes[0]);
2161                 } else
2162                         ret = 0;
2163                 btrfs_release_path(log, path);
2164         }
2165
2166         btrfs_free_path(path);
2167         mutex_unlock(&BTRFS_I(dir)->log_mutex);
2168         end_log_trans(root);
2169
2170         return 0;
2171 }
2172
2173 /* see comments for btrfs_del_dir_entries_in_log */
2174 int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
2175                                struct btrfs_root *root,
2176                                const char *name, int name_len,
2177                                struct inode *inode, u64 dirid)
2178 {
2179         struct btrfs_root *log;
2180         u64 index;
2181         int ret;
2182
2183         if (BTRFS_I(inode)->logged_trans < trans->transid)
2184                 return 0;
2185
2186         ret = join_running_log_trans(root);
2187         if (ret)
2188                 return 0;
2189         log = root->log_root;
2190         mutex_lock(&BTRFS_I(inode)->log_mutex);
2191
2192         ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
2193                                   dirid, &index);
2194         mutex_unlock(&BTRFS_I(inode)->log_mutex);
2195         end_log_trans(root);
2196
2197         return ret;
2198 }
2199
2200 /*
2201  * creates a range item in the log for 'dirid'.  first_offset and
2202  * last_offset tell us which parts of the key space the log should
2203  * be considered authoritative for.
2204  */
2205 static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
2206                                        struct btrfs_root *log,
2207                                        struct btrfs_path *path,
2208                                        int key_type, u64 dirid,
2209                                        u64 first_offset, u64 last_offset)
2210 {
2211         int ret;
2212         struct btrfs_key key;
2213         struct btrfs_dir_log_item *item;
2214
2215         key.objectid = dirid;
2216         key.offset = first_offset;
2217         if (key_type == BTRFS_DIR_ITEM_KEY)
2218                 key.type = BTRFS_DIR_LOG_ITEM_KEY;
2219         else
2220                 key.type = BTRFS_DIR_LOG_INDEX_KEY;
2221         ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
2222         BUG_ON(ret);
2223
2224         item = btrfs_item_ptr(path->nodes[0], path->slots[0],
2225                               struct btrfs_dir_log_item);
2226         btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
2227         btrfs_mark_buffer_dirty(path->nodes[0]);
2228         btrfs_release_path(log, path);
2229         return 0;
2230 }
2231
2232 /*
2233  * log all the items included in the current transaction for a given
2234  * directory.  This also creates the range items in the log tree required
2235  * to replay anything deleted before the fsync
2236  */
2237 static noinline int log_dir_items(struct btrfs_trans_handle *trans,
2238                           struct btrfs_root *root, struct inode *inode,
2239                           struct btrfs_path *path,
2240                           struct btrfs_path *dst_path, int key_type,
2241                           u64 min_offset, u64 *last_offset_ret)
2242 {
2243         struct btrfs_key min_key;
2244         struct btrfs_key max_key;
2245         struct btrfs_root *log = root->log_root;
2246         struct extent_buffer *src;
2247         int ret;
2248         int i;
2249         int nritems;
2250         u64 first_offset = min_offset;
2251         u64 last_offset = (u64)-1;
2252
2253         log = root->log_root;
2254         max_key.objectid = inode->i_ino;
2255         max_key.offset = (u64)-1;
2256         max_key.type = key_type;
2257
2258         min_key.objectid = inode->i_ino;
2259         min_key.type = key_type;
2260         min_key.offset = min_offset;
2261
2262         path->keep_locks = 1;
2263
2264         ret = btrfs_search_forward(root, &min_key, &max_key,
2265                                    path, 0, trans->transid);
2266
2267         /*
2268          * we didn't find anything from this transaction, see if there
2269          * is anything at all
2270          */
2271         if (ret != 0 || min_key.objectid != inode->i_ino ||
2272             min_key.type != key_type) {
2273                 min_key.objectid = inode->i_ino;
2274                 min_key.type = key_type;
2275                 min_key.offset = (u64)-1;
2276                 btrfs_release_path(root, path);
2277                 ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2278                 if (ret < 0) {
2279                         btrfs_release_path(root, path);
2280                         return ret;
2281                 }
2282                 ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2283
2284                 /* if ret == 0 there are items for this type,
2285                  * create a range to tell us the last key of this type.
2286                  * otherwise, there are no items in this directory after
2287                  * *min_offset, and we create a range to indicate that.
2288                  */
2289                 if (ret == 0) {
2290                         struct btrfs_key tmp;
2291                         btrfs_item_key_to_cpu(path->nodes[0], &tmp,
2292                                               path->slots[0]);
2293                         if (key_type == tmp.type) {
2294                                 first_offset = max(min_offset, tmp.offset) + 1;
2295                         }
2296                 }
2297                 goto done;
2298         }
2299
2300         /* go backward to find any previous key */
2301         ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
2302         if (ret == 0) {
2303                 struct btrfs_key tmp;
2304                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2305                 if (key_type == tmp.type) {
2306                         first_offset = tmp.offset;
2307                         ret = overwrite_item(trans, log, dst_path,
2308                                              path->nodes[0], path->slots[0],
2309                                              &tmp);
2310                 }
2311         }
2312         btrfs_release_path(root, path);
2313
2314         /* find the first key from this transaction again */
2315         ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
2316         if (ret != 0) {
2317                 WARN_ON(1);
2318                 goto done;
2319         }
2320
2321         /*
2322          * we have a block from this transaction, log every item in it
2323          * from our directory
2324          */
2325         while(1) {
2326                 struct btrfs_key tmp;
2327                 src = path->nodes[0];
2328                 nritems = btrfs_header_nritems(src);
2329                 for (i = path->slots[0]; i < nritems; i++) {
2330                         btrfs_item_key_to_cpu(src, &min_key, i);
2331
2332                         if (min_key.objectid != inode->i_ino ||
2333                             min_key.type != key_type)
2334                                 goto done;
2335                         ret = overwrite_item(trans, log, dst_path, src, i,
2336                                              &min_key);
2337                         BUG_ON(ret);
2338                 }
2339                 path->slots[0] = nritems;
2340
2341                 /*
2342                  * look ahead to the next item and see if it is also
2343                  * from this directory and from this transaction
2344                  */
2345                 ret = btrfs_next_leaf(root, path);
2346                 if (ret == 1) {
2347                         last_offset = (u64)-1;
2348                         goto done;
2349                 }
2350                 btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
2351                 if (tmp.objectid != inode->i_ino || tmp.type != key_type) {
2352                         last_offset = (u64)-1;
2353                         goto done;
2354                 }
2355                 if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
2356                         ret = overwrite_item(trans, log, dst_path,
2357                                              path->nodes[0], path->slots[0],
2358                                              &tmp);
2359
2360                         BUG_ON(ret);
2361                         last_offset = tmp.offset;
2362                         goto done;
2363                 }
2364         }
2365 done:
2366         *last_offset_ret = last_offset;
2367         btrfs_release_path(root, path);
2368         btrfs_release_path(log, dst_path);
2369
2370         /* insert the log range keys to indicate where the log is valid */
2371         ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
2372                                  first_offset, last_offset);
2373         BUG_ON(ret);
2374         return 0;
2375 }
2376
2377 /*
2378  * logging directories is very similar to logging inodes, We find all the items
2379  * from the current transaction and write them to the log.
2380  *
2381  * The recovery code scans the directory in the subvolume, and if it finds a
2382  * key in the range logged that is not present in the log tree, then it means
2383  * that dir entry was unlinked during the transaction.
2384  *
2385  * In order for that scan to work, we must include one key smaller than
2386  * the smallest logged by this transaction and one key larger than the largest
2387  * key logged by this transaction.
2388  */
2389 static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
2390                           struct btrfs_root *root, struct inode *inode,
2391                           struct btrfs_path *path,
2392                           struct btrfs_path *dst_path)
2393 {
2394         u64 min_key;
2395         u64 max_key;
2396         int ret;
2397         int key_type = BTRFS_DIR_ITEM_KEY;
2398
2399 again:
2400         min_key = 0;
2401         max_key = 0;
2402         while(1) {
2403                 ret = log_dir_items(trans, root, inode, path,
2404                                     dst_path, key_type, min_key,
2405                                     &max_key);
2406                 BUG_ON(ret);
2407                 if (max_key == (u64)-1)
2408                         break;
2409                 min_key = max_key + 1;
2410         }
2411
2412         if (key_type == BTRFS_DIR_ITEM_KEY) {
2413                 key_type = BTRFS_DIR_INDEX_KEY;
2414                 goto again;
2415         }
2416         return 0;
2417 }
2418
2419 /*
2420  * a helper function to drop items from the log before we relog an
2421  * inode.  max_key_type indicates the highest item type to remove.
2422  * This cannot be run for file data extents because it does not
2423  * free the extents they point to.
2424  */
2425 static int drop_objectid_items(struct btrfs_trans_handle *trans,
2426                                   struct btrfs_root *log,
2427                                   struct btrfs_path *path,
2428                                   u64 objectid, int max_key_type)
2429 {
2430         int ret;
2431         struct btrfs_key key;
2432         struct btrfs_key found_key;
2433
2434         key.objectid = objectid;
2435         key.type = max_key_type;
2436         key.offset = (u64)-1;
2437
2438         while(1) {
2439                 ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
2440
2441                 if (ret != 1)
2442                         break;
2443
2444                 if (path->slots[0] == 0)
2445                         break;
2446
2447                 path->slots[0]--;
2448                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2449                                       path->slots[0]);
2450
2451                 if (found_key.objectid != objectid)
2452                         break;
2453
2454                 ret = btrfs_del_item(trans, log, path);
2455                 BUG_ON(ret);
2456                 btrfs_release_path(log, path);
2457         }
2458         btrfs_release_path(log, path);
2459         return 0;
2460 }
2461
2462 static noinline int copy_items(struct btrfs_trans_handle *trans,
2463                                struct btrfs_root *log,
2464                                struct btrfs_path *dst_path,
2465                                struct extent_buffer *src,
2466                                int start_slot, int nr, int inode_only)
2467 {
2468         unsigned long src_offset;
2469         unsigned long dst_offset;
2470         struct btrfs_file_extent_item *extent;
2471         struct btrfs_inode_item *inode_item;
2472         int ret;
2473         struct btrfs_key *ins_keys;
2474         u32 *ins_sizes;
2475         char *ins_data;
2476         int i;
2477
2478         ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
2479                            nr * sizeof(u32), GFP_NOFS);
2480         ins_sizes = (u32 *)ins_data;
2481         ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32));
2482
2483         for (i = 0; i < nr; i++) {
2484                 ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
2485                 btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
2486         }
2487         ret = btrfs_insert_empty_items(trans, log, dst_path,
2488                                        ins_keys, ins_sizes, nr);
2489         BUG_ON(ret);
2490
2491         for (i = 0; i < nr; i++) {
2492                 dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
2493                                                    dst_path->slots[0]);
2494
2495                 src_offset = btrfs_item_ptr_offset(src, start_slot + i);
2496
2497                 copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
2498                                    src_offset, ins_sizes[i]);
2499
2500                 if (inode_only == LOG_INODE_EXISTS &&
2501                     ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
2502                         inode_item = btrfs_item_ptr(dst_path->nodes[0],
2503                                                     dst_path->slots[0],
2504                                                     struct btrfs_inode_item);
2505                         btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
2506
2507                         /* set the generation to zero so the recover code
2508                          * can tell the difference between an logging
2509                          * just to say 'this inode exists' and a logging
2510                          * to say 'update this inode with these values'
2511                          */
2512                         btrfs_set_inode_generation(dst_path->nodes[0],
2513                                                    inode_item, 0);
2514                 }
2515                 /* take a reference on file data extents so that truncates
2516                  * or deletes of this inode don't have to relog the inode
2517                  * again
2518                  */
2519                 if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
2520                         int found_type;
2521                         extent = btrfs_item_ptr(src, start_slot + i,
2522                                                 struct btrfs_file_extent_item);
2523
2524                         found_type = btrfs_file_extent_type(src, extent);
2525                         if (found_type == BTRFS_FILE_EXTENT_REG) {
2526                                 u64 ds = btrfs_file_extent_disk_bytenr(src,
2527                                                                    extent);
2528                                 u64 dl = btrfs_file_extent_disk_num_bytes(src,
2529                                                                       extent);
2530                                 /* ds == 0 is a hole */
2531                                 if (ds != 0) {
2532                                         ret = btrfs_inc_extent_ref(trans, log,
2533                                                    ds, dl,
2534                                                    dst_path->nodes[0]->start,
2535                                                    BTRFS_TREE_LOG_OBJECTID,
2536                                                    trans->transid,
2537                                                    ins_keys[i].objectid,
2538                                                    ins_keys[i].offset);
2539                                         BUG_ON(ret);
2540                                 }
2541                         }
2542                 }
2543                 dst_path->slots[0]++;
2544         }
2545
2546         btrfs_mark_buffer_dirty(dst_path->nodes[0]);
2547         btrfs_release_path(log, dst_path);
2548         kfree(ins_data);
2549         return 0;
2550 }
2551
2552 /* log a single inode in the tree log.
2553  * At least one parent directory for this inode must exist in the tree
2554  * or be logged already.
2555  *
2556  * Any items from this inode changed by the current transaction are copied
2557  * to the log tree.  An extra reference is taken on any extents in this
2558  * file, allowing us to avoid a whole pile of corner cases around logging
2559  * blocks that have been removed from the tree.
2560  *
2561  * See LOG_INODE_ALL and related defines for a description of what inode_only
2562  * does.
2563  *
2564  * This handles both files and directories.
2565  */
2566 static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
2567                              struct btrfs_root *root, struct inode *inode,
2568                              int inode_only)
2569 {
2570         struct btrfs_path *path;
2571         struct btrfs_path *dst_path;
2572         struct btrfs_key min_key;
2573         struct btrfs_key max_key;
2574         struct btrfs_root *log = root->log_root;
2575         struct extent_buffer *src = NULL;
2576         u32 size;
2577         int ret;
2578         int nritems;
2579         int ins_start_slot = 0;
2580         int ins_nr;
2581
2582         log = root->log_root;
2583
2584         path = btrfs_alloc_path();
2585         dst_path = btrfs_alloc_path();
2586
2587         min_key.objectid = inode->i_ino;
2588         min_key.type = BTRFS_INODE_ITEM_KEY;
2589         min_key.offset = 0;
2590
2591         max_key.objectid = inode->i_ino;
2592         if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
2593                 max_key.type = BTRFS_XATTR_ITEM_KEY;
2594         else
2595                 max_key.type = (u8)-1;
2596         max_key.offset = (u64)-1;
2597
2598         /*
2599          * if this inode has already been logged and we're in inode_only
2600          * mode, we don't want to delete the things that have already
2601          * been written to the log.
2602          *
2603          * But, if the inode has been through an inode_only log,
2604          * the logged_trans field is not set.  This allows us to catch
2605          * any new names for this inode in the backrefs by logging it
2606          * again
2607          */
2608         if (inode_only == LOG_INODE_EXISTS &&
2609             BTRFS_I(inode)->logged_trans == trans->transid) {
2610                 btrfs_free_path(path);
2611                 btrfs_free_path(dst_path);
2612                 goto out;
2613         }
2614         mutex_lock(&BTRFS_I(inode)->log_mutex);
2615
2616         /*
2617          * a brute force approach to making sure we get the most uptodate
2618          * copies of everything.
2619          */
2620         if (S_ISDIR(inode->i_mode)) {
2621                 int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
2622
2623                 if (inode_only == LOG_INODE_EXISTS)
2624                         max_key_type = BTRFS_XATTR_ITEM_KEY;
2625                 ret = drop_objectid_items(trans, log, path,
2626                                           inode->i_ino, max_key_type);
2627         } else {
2628                 ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
2629         }
2630         BUG_ON(ret);
2631         path->keep_locks = 1;
2632
2633         while(1) {
2634                 ins_nr = 0;
2635                 ret = btrfs_search_forward(root, &min_key, &max_key,
2636                                            path, 0, trans->transid);
2637                 if (ret != 0)
2638                         break;
2639 again:
2640                 /* note, ins_nr might be > 0 here, cleanup outside the loop */
2641                 if (min_key.objectid != inode->i_ino)
2642                         break;
2643                 if (min_key.type > max_key.type)
2644                         break;
2645
2646                 src = path->nodes[0];
2647                 size = btrfs_item_size_nr(src, path->slots[0]);
2648                 if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
2649                         ins_nr++;
2650                         goto next_slot;
2651                 } else if (!ins_nr) {
2652                         ins_start_slot = path->slots[0];
2653                         ins_nr = 1;
2654                         goto next_slot;
2655                 }
2656
2657                 ret = copy_items(trans, log, dst_path, src, ins_start_slot,
2658                                  ins_nr, inode_only);
2659                 BUG_ON(ret);
2660                 ins_nr = 1;
2661                 ins_start_slot = path->slots[0];
2662 next_slot:
2663
2664                 nritems = btrfs_header_nritems(path->nodes[0]);
2665                 path->slots[0]++;
2666                 if (path->slots[0] < nritems) {
2667                         btrfs_item_key_to_cpu(path->nodes[0], &min_key,
2668                                               path->slots[0]);
2669                         goto again;
2670                 }
2671                 if (ins_nr) {
2672                         ret = copy_items(trans, log, dst_path, src,
2673                                          ins_start_slot,
2674                                          ins_nr, inode_only);
2675                         BUG_ON(ret);
2676                         ins_nr = 0;
2677                 }
2678                 btrfs_release_path(root, path);
2679
2680                 if (min_key.offset < (u64)-1)
2681                         min_key.offset++;
2682                 else if (min_key.type < (u8)-1)
2683                         min_key.type++;
2684                 else if (min_key.objectid < (u64)-1)
2685                         min_key.objectid++;
2686                 else
2687                         break;
2688         }
2689         if (ins_nr) {
2690                 ret = copy_items(trans, log, dst_path, src,
2691                                  ins_start_slot,
2692                                  ins_nr, inode_only);
2693                 BUG_ON(ret);
2694                 ins_nr = 0;
2695         }
2696         WARN_ON(ins_nr);
2697         if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
2698                 btrfs_release_path(root, path);
2699                 btrfs_release_path(log, dst_path);
2700                 BTRFS_I(inode)->log_dirty_trans = 0;
2701                 ret = log_directory_changes(trans, root, inode, path, dst_path);
2702                 BUG_ON(ret);
2703         }
2704         BTRFS_I(inode)->logged_trans = trans->transid;
2705         mutex_unlock(&BTRFS_I(inode)->log_mutex);
2706
2707         btrfs_free_path(path);
2708         btrfs_free_path(dst_path);
2709
2710         mutex_lock(&root->fs_info->tree_log_mutex);
2711         ret = update_log_root(trans, log);
2712         BUG_ON(ret);
2713         mutex_unlock(&root->fs_info->tree_log_mutex);
2714 out:
2715         return 0;
2716 }
2717
2718 int btrfs_log_inode(struct btrfs_trans_handle *trans,
2719                     struct btrfs_root *root, struct inode *inode,
2720                     int inode_only)
2721 {
2722         int ret;
2723
2724         start_log_trans(trans, root);
2725         ret = __btrfs_log_inode(trans, root, inode, inode_only);
2726         end_log_trans(root);
2727         return ret;
2728 }
2729
2730 /*
2731  * helper function around btrfs_log_inode to make sure newly created
2732  * parent directories also end up in the log.  A minimal inode and backref
2733  * only logging is done of any parent directories that are older than
2734  * the last committed transaction
2735  */
2736 int btrfs_log_dentry(struct btrfs_trans_handle *trans,
2737                     struct btrfs_root *root, struct dentry *dentry)
2738 {
2739         int inode_only = LOG_INODE_ALL;
2740         struct super_block *sb;
2741         int ret;
2742
2743         start_log_trans(trans, root);
2744         sb = dentry->d_inode->i_sb;
2745         while(1) {
2746                 ret = __btrfs_log_inode(trans, root, dentry->d_inode,
2747                                         inode_only);
2748                 BUG_ON(ret);
2749                 inode_only = LOG_INODE_EXISTS;
2750
2751                 dentry = dentry->d_parent;
2752                 if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
2753                         break;
2754
2755                 if (BTRFS_I(dentry->d_inode)->generation <=
2756                     root->fs_info->last_trans_committed)
2757                         break;
2758         }
2759         end_log_trans(root);
2760         return 0;
2761 }
2762
2763 /*
2764  * it is not safe to log dentry if the chunk root has added new
2765  * chunks.  This returns 0 if the dentry was logged, and 1 otherwise.
2766  * If this returns 1, you must commit the transaction to safely get your
2767  * data on disk.
2768  */
2769 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
2770                           struct btrfs_root *root, struct dentry *dentry)
2771 {
2772         u64 gen;
2773         gen = root->fs_info->last_trans_new_blockgroup;
2774         if (gen > root->fs_info->last_trans_committed)
2775                 return 1;
2776         else
2777                 return btrfs_log_dentry(trans, root, dentry);
2778 }
2779
2780 /*
2781  * should be called during mount to recover any replay any log trees
2782  * from the FS
2783  */
2784 int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
2785 {
2786         int ret;
2787         struct btrfs_path *path;
2788         struct btrfs_trans_handle *trans;
2789         struct btrfs_key key;
2790         struct btrfs_key found_key;
2791         struct btrfs_key tmp_key;
2792         struct btrfs_root *log;
2793         struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
2794         u64 highest_inode;
2795         struct walk_control wc = {
2796                 .process_func = process_one_buffer,
2797                 .stage = 0,
2798         };
2799
2800         fs_info->log_root_recovering = 1;
2801         path = btrfs_alloc_path();
2802         BUG_ON(!path);
2803
2804         trans = btrfs_start_transaction(fs_info->tree_root, 1);
2805
2806         wc.trans = trans;
2807         wc.pin = 1;
2808
2809         walk_log_tree(trans, log_root_tree, &wc);
2810
2811 again:
2812         key.objectid = BTRFS_TREE_LOG_OBJECTID;
2813         key.offset = (u64)-1;
2814         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2815
2816         while(1) {
2817                 ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
2818                 if (ret < 0)
2819                         break;
2820                 if (ret > 0) {
2821                         if (path->slots[0] == 0)
2822                                 break;
2823                         path->slots[0]--;
2824                 }
2825                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
2826                                       path->slots[0]);
2827                 btrfs_release_path(log_root_tree, path);
2828                 if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
2829                         break;
2830
2831                 log = btrfs_read_fs_root_no_radix(log_root_tree,
2832                                                   &found_key);
2833                 BUG_ON(!log);
2834
2835
2836                 tmp_key.objectid = found_key.offset;
2837                 tmp_key.type = BTRFS_ROOT_ITEM_KEY;
2838                 tmp_key.offset = (u64)-1;
2839
2840                 wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
2841
2842                 BUG_ON(!wc.replay_dest);
2843
2844                 btrfs_record_root_in_trans(wc.replay_dest);
2845                 ret = walk_log_tree(trans, log, &wc);
2846                 BUG_ON(ret);
2847
2848                 if (wc.stage == LOG_WALK_REPLAY_ALL) {
2849                         ret = fixup_inode_link_counts(trans, wc.replay_dest,
2850                                                       path);
2851                         BUG_ON(ret);
2852                 }
2853                 ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
2854                 if (ret == 0) {
2855                         wc.replay_dest->highest_inode = highest_inode;
2856                         wc.replay_dest->last_inode_alloc = highest_inode;
2857                 }
2858
2859                 key.offset = found_key.offset - 1;
2860                 free_extent_buffer(log->node);
2861                 kfree(log);
2862
2863                 if (found_key.offset == 0)
2864                         break;
2865         }
2866         btrfs_release_path(log_root_tree, path);
2867
2868         /* step one is to pin it all, step two is to replay just inodes */
2869         if (wc.pin) {
2870                 wc.pin = 0;
2871                 wc.process_func = replay_one_buffer;
2872                 wc.stage = LOG_WALK_REPLAY_INODES;
2873                 goto again;
2874         }
2875         /* step three is to replay everything */
2876         if (wc.stage < LOG_WALK_REPLAY_ALL) {
2877                 wc.stage++;
2878                 goto again;
2879         }
2880
2881         btrfs_free_path(path);
2882
2883         free_extent_buffer(log_root_tree->node);
2884         log_root_tree->log_root = NULL;
2885         fs_info->log_root_recovering = 0;
2886
2887         /* step 4: commit the transaction, which also unpins the blocks */
2888         btrfs_commit_transaction(trans, fs_info->tree_root);
2889
2890         kfree(log_root_tree);
2891         return 0;
2892 }