Btrfs: allocator improvements, inode block groups
[safe/jmp/linux-2.6] / fs / btrfs / super.c
1 #include <linux/module.h>
2 #include <linux/buffer_head.h>
3 #include <linux/fs.h>
4 #include <linux/pagemap.h>
5 #include <linux/highmem.h>
6 #include <linux/time.h>
7 #include <linux/init.h>
8 #include <linux/string.h>
9 #include <linux/smp_lock.h>
10 #include <linux/backing-dev.h>
11 #include <linux/mpage.h>
12 #include <linux/swap.h>
13 #include <linux/writeback.h>
14 #include <linux/statfs.h>
15 #include "ctree.h"
16 #include "disk-io.h"
17 #include "transaction.h"
18 #include "btrfs_inode.h"
19 #include "ioctl.h"
20
21 void btrfs_fsinfo_release(struct kobject *obj)
22 {
23         struct btrfs_fs_info *fsinfo = container_of(obj,
24                                             struct btrfs_fs_info, kobj);
25         kfree(fsinfo);
26 }
27
28 struct kobj_type btrfs_fsinfo_ktype = {
29         .release = btrfs_fsinfo_release,
30 };
31
32 struct btrfs_iget_args {
33         u64 ino;
34         struct btrfs_root *root;
35 };
36
37 decl_subsys(btrfs, &btrfs_fsinfo_ktype, NULL);
38
39 #define BTRFS_SUPER_MAGIC 0x9123682E
40
41 static struct inode_operations btrfs_dir_inode_operations;
42 static struct inode_operations btrfs_dir_ro_inode_operations;
43 static struct super_operations btrfs_super_ops;
44 static struct file_operations btrfs_dir_file_operations;
45 static struct inode_operations btrfs_file_inode_operations;
46 static struct address_space_operations btrfs_aops;
47 static struct file_operations btrfs_file_operations;
48
49 static void btrfs_read_locked_inode(struct inode *inode)
50 {
51         struct btrfs_path *path;
52         struct btrfs_inode_item *inode_item;
53         struct btrfs_root *root = BTRFS_I(inode)->root;
54         struct btrfs_key location;
55         struct btrfs_block_group_cache *alloc_group;
56         u64 alloc_group_block;
57         int ret;
58
59         path = btrfs_alloc_path();
60         BUG_ON(!path);
61         btrfs_init_path(path);
62         mutex_lock(&root->fs_info->fs_mutex);
63
64         memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
65         ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
66         if (ret) {
67                 btrfs_free_path(path);
68                 goto make_bad;
69         }
70         inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
71                                   path->slots[0],
72                                   struct btrfs_inode_item);
73
74         inode->i_mode = btrfs_inode_mode(inode_item);
75         inode->i_nlink = btrfs_inode_nlink(inode_item);
76         inode->i_uid = btrfs_inode_uid(inode_item);
77         inode->i_gid = btrfs_inode_gid(inode_item);
78         inode->i_size = btrfs_inode_size(inode_item);
79         inode->i_atime.tv_sec = btrfs_timespec_sec(&inode_item->atime);
80         inode->i_atime.tv_nsec = btrfs_timespec_nsec(&inode_item->atime);
81         inode->i_mtime.tv_sec = btrfs_timespec_sec(&inode_item->mtime);
82         inode->i_mtime.tv_nsec = btrfs_timespec_nsec(&inode_item->mtime);
83         inode->i_ctime.tv_sec = btrfs_timespec_sec(&inode_item->ctime);
84         inode->i_ctime.tv_nsec = btrfs_timespec_nsec(&inode_item->ctime);
85         inode->i_blocks = btrfs_inode_nblocks(inode_item);
86         inode->i_generation = btrfs_inode_generation(inode_item);
87         alloc_group_block = btrfs_inode_block_group(inode_item);
88         ret = radix_tree_gang_lookup(&root->fs_info->block_group_radix,
89                                      (void **)&alloc_group,
90                                      alloc_group_block, 1);
91         BUG_ON(!ret);
92         BTRFS_I(inode)->block_group = alloc_group;
93
94         btrfs_free_path(path);
95         inode_item = NULL;
96
97         mutex_unlock(&root->fs_info->fs_mutex);
98
99         switch (inode->i_mode & S_IFMT) {
100 #if 0
101         default:
102                 init_special_inode(inode, inode->i_mode,
103                                    btrfs_inode_rdev(inode_item));
104                 break;
105 #endif
106         case S_IFREG:
107                 inode->i_mapping->a_ops = &btrfs_aops;
108                 inode->i_fop = &btrfs_file_operations;
109                 inode->i_op = &btrfs_file_inode_operations;
110                 break;
111         case S_IFDIR:
112                 inode->i_fop = &btrfs_dir_file_operations;
113                 if (root == root->fs_info->tree_root)
114                         inode->i_op = &btrfs_dir_ro_inode_operations;
115                 else
116                         inode->i_op = &btrfs_dir_inode_operations;
117                 break;
118         case S_IFLNK:
119                 // inode->i_op = &page_symlink_inode_operations;
120                 break;
121         }
122         return;
123
124 make_bad:
125         btrfs_release_path(root, path);
126         btrfs_free_path(path);
127         mutex_unlock(&root->fs_info->fs_mutex);
128         make_bad_inode(inode);
129 }
130
131 static void fill_inode_item(struct btrfs_inode_item *item,
132                             struct inode *inode)
133 {
134         btrfs_set_inode_uid(item, inode->i_uid);
135         btrfs_set_inode_gid(item, inode->i_gid);
136         btrfs_set_inode_size(item, inode->i_size);
137         btrfs_set_inode_mode(item, inode->i_mode);
138         btrfs_set_inode_nlink(item, inode->i_nlink);
139         btrfs_set_timespec_sec(&item->atime, inode->i_atime.tv_sec);
140         btrfs_set_timespec_nsec(&item->atime, inode->i_atime.tv_nsec);
141         btrfs_set_timespec_sec(&item->mtime, inode->i_mtime.tv_sec);
142         btrfs_set_timespec_nsec(&item->mtime, inode->i_mtime.tv_nsec);
143         btrfs_set_timespec_sec(&item->ctime, inode->i_ctime.tv_sec);
144         btrfs_set_timespec_nsec(&item->ctime, inode->i_ctime.tv_nsec);
145         btrfs_set_inode_nblocks(item, inode->i_blocks);
146         btrfs_set_inode_generation(item, inode->i_generation);
147         btrfs_set_inode_block_group(item,
148                                     BTRFS_I(inode)->block_group->key.objectid);
149 }
150
151
152 static int btrfs_update_inode(struct btrfs_trans_handle *trans,
153                               struct btrfs_root *root,
154                               struct inode *inode)
155 {
156         struct btrfs_inode_item *inode_item;
157         struct btrfs_path *path;
158         int ret;
159
160         path = btrfs_alloc_path();
161         BUG_ON(!path);
162         btrfs_init_path(path);
163         ret = btrfs_lookup_inode(trans, root, path,
164                                  &BTRFS_I(inode)->location, 1);
165         if (ret) {
166                 if (ret > 0)
167                         ret = -ENOENT;
168                 goto failed;
169         }
170
171         inode_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
172                                   path->slots[0],
173                                   struct btrfs_inode_item);
174
175         fill_inode_item(inode_item, inode);
176         btrfs_mark_buffer_dirty(path->nodes[0]);
177         ret = 0;
178 failed:
179         btrfs_release_path(root, path);
180         btrfs_free_path(path);
181         return ret;
182 }
183
184
185 static int btrfs_unlink_trans(struct btrfs_trans_handle *trans,
186                               struct btrfs_root *root,
187                               struct inode *dir,
188                               struct dentry *dentry)
189 {
190         struct btrfs_path *path;
191         const char *name = dentry->d_name.name;
192         int name_len = dentry->d_name.len;
193         int ret = 0;
194         u64 objectid;
195         struct btrfs_dir_item *di;
196
197         path = btrfs_alloc_path();
198         BUG_ON(!path);
199         btrfs_init_path(path);
200         di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
201                                     name, name_len, -1);
202         if (IS_ERR(di)) {
203                 ret = PTR_ERR(di);
204                 goto err;
205         }
206         if (!di) {
207                 ret = -ENOENT;
208                 goto err;
209         }
210         objectid = btrfs_disk_key_objectid(&di->location);
211         ret = btrfs_delete_one_dir_name(trans, root, path, di);
212         BUG_ON(ret);
213         btrfs_release_path(root, path);
214
215         di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
216                                          objectid, name, name_len, -1);
217         if (IS_ERR(di)) {
218                 ret = PTR_ERR(di);
219                 goto err;
220         }
221         if (!di) {
222                 ret = -ENOENT;
223                 goto err;
224         }
225         ret = btrfs_delete_one_dir_name(trans, root, path, di);
226         BUG_ON(ret);
227
228         dentry->d_inode->i_ctime = dir->i_ctime;
229 err:
230         btrfs_free_path(path);
231         if (!ret) {
232                 dir->i_size -= name_len * 2;
233                 btrfs_update_inode(trans, root, dir);
234                 drop_nlink(dentry->d_inode);
235                 btrfs_update_inode(trans, root, dentry->d_inode);
236                 dir->i_sb->s_dirt = 1;
237         }
238         return ret;
239 }
240
241 static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
242 {
243         struct btrfs_root *root;
244         struct btrfs_trans_handle *trans;
245         int ret;
246
247         root = BTRFS_I(dir)->root;
248         mutex_lock(&root->fs_info->fs_mutex);
249         trans = btrfs_start_transaction(root, 1);
250         btrfs_set_trans_block_group(trans, dir);
251         ret = btrfs_unlink_trans(trans, root, dir, dentry);
252         btrfs_end_transaction(trans, root);
253         mutex_unlock(&root->fs_info->fs_mutex);
254         return ret;
255 }
256
257 static int btrfs_rmdir(struct inode *dir, struct dentry *dentry)
258 {
259         struct inode *inode = dentry->d_inode;
260         int err;
261         int ret;
262         struct btrfs_root *root = BTRFS_I(dir)->root;
263         struct btrfs_path *path;
264         struct btrfs_key key;
265         struct btrfs_trans_handle *trans;
266         struct btrfs_key found_key;
267         int found_type;
268         struct btrfs_leaf *leaf;
269         char *goodnames = "..";
270
271         path = btrfs_alloc_path();
272         BUG_ON(!path);
273         btrfs_init_path(path);
274         mutex_lock(&root->fs_info->fs_mutex);
275         trans = btrfs_start_transaction(root, 1);
276         btrfs_set_trans_block_group(trans, dir);
277         key.objectid = inode->i_ino;
278         key.offset = (u64)-1;
279         key.flags = (u32)-1;
280         while(1) {
281                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
282                 if (ret < 0) {
283                         err = ret;
284                         goto out;
285                 }
286                 BUG_ON(ret == 0);
287                 if (path->slots[0] == 0) {
288                         err = -ENOENT;
289                         goto out;
290                 }
291                 path->slots[0]--;
292                 leaf = btrfs_buffer_leaf(path->nodes[0]);
293                 btrfs_disk_key_to_cpu(&found_key,
294                                       &leaf->items[path->slots[0]].key);
295                 found_type = btrfs_key_type(&found_key);
296                 if (found_key.objectid != inode->i_ino) {
297                         err = -ENOENT;
298                         goto out;
299                 }
300                 if ((found_type != BTRFS_DIR_ITEM_KEY &&
301                      found_type != BTRFS_DIR_INDEX_KEY) ||
302                     (!btrfs_match_dir_item_name(root, path, goodnames, 2) &&
303                     !btrfs_match_dir_item_name(root, path, goodnames, 1))) {
304                         err = -ENOTEMPTY;
305                         goto out;
306                 }
307                 ret = btrfs_del_item(trans, root, path);
308                 BUG_ON(ret);
309
310                 if (found_type == BTRFS_DIR_ITEM_KEY && found_key.offset == 1)
311                         break;
312                 btrfs_release_path(root, path);
313         }
314         ret = 0;
315         btrfs_release_path(root, path);
316
317         /* now the directory is empty */
318         err = btrfs_unlink_trans(trans, root, dir, dentry);
319         if (!err) {
320                 inode->i_size = 0;
321         }
322 out:
323         btrfs_release_path(root, path);
324         btrfs_free_path(path);
325         mutex_unlock(&root->fs_info->fs_mutex);
326         ret = btrfs_end_transaction(trans, root);
327         if (ret && !err)
328                 err = ret;
329         return err;
330 }
331
332 static int btrfs_free_inode(struct btrfs_trans_handle *trans,
333                             struct btrfs_root *root,
334                             struct inode *inode)
335 {
336         struct btrfs_path *path;
337         int ret;
338
339         clear_inode(inode);
340
341         path = btrfs_alloc_path();
342         BUG_ON(!path);
343         btrfs_init_path(path);
344         ret = btrfs_lookup_inode(trans, root, path,
345                                  &BTRFS_I(inode)->location, -1);
346         BUG_ON(ret);
347         ret = btrfs_del_item(trans, root, path);
348         BUG_ON(ret);
349         btrfs_free_path(path);
350         return ret;
351 }
352
353 static int btrfs_truncate_in_trans(struct btrfs_trans_handle *trans,
354                                    struct btrfs_root *root,
355                                    struct inode *inode)
356 {
357         int ret;
358         struct btrfs_path *path;
359         struct btrfs_key key;
360         struct btrfs_disk_key *found_key;
361         struct btrfs_leaf *leaf;
362         struct btrfs_file_extent_item *fi = NULL;
363         u64 extent_start = 0;
364         u64 extent_num_blocks = 0;
365         int found_extent;
366
367         path = btrfs_alloc_path();
368         BUG_ON(!path);
369         /* FIXME, add redo link to tree so we don't leak on crash */
370         key.objectid = inode->i_ino;
371         key.offset = (u64)-1;
372         key.flags = 0;
373         /*
374          * use BTRFS_CSUM_ITEM_KEY because it is larger than inline keys
375          * or extent data
376          */
377         btrfs_set_key_type(&key, BTRFS_CSUM_ITEM_KEY);
378         while(1) {
379                 btrfs_init_path(path);
380                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
381                 if (ret < 0) {
382                         goto error;
383                 }
384                 if (ret > 0) {
385                         BUG_ON(path->slots[0] == 0);
386                         path->slots[0]--;
387                 }
388                 leaf = btrfs_buffer_leaf(path->nodes[0]);
389                 found_key = &leaf->items[path->slots[0]].key;
390                 if (btrfs_disk_key_objectid(found_key) != inode->i_ino)
391                         break;
392                 if (btrfs_disk_key_type(found_key) != BTRFS_CSUM_ITEM_KEY &&
393                     btrfs_disk_key_type(found_key) != BTRFS_EXTENT_DATA_KEY)
394                         break;
395                 if (btrfs_disk_key_offset(found_key) < inode->i_size)
396                         break;
397                 found_extent = 0;
398                 if (btrfs_disk_key_type(found_key) == BTRFS_EXTENT_DATA_KEY) {
399                         fi = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
400                                             path->slots[0],
401                                             struct btrfs_file_extent_item);
402                         if (btrfs_file_extent_type(fi) !=
403                             BTRFS_FILE_EXTENT_INLINE) {
404                                 extent_start =
405                                         btrfs_file_extent_disk_blocknr(fi);
406                                 extent_num_blocks =
407                                         btrfs_file_extent_disk_num_blocks(fi);
408                                 /* FIXME blocksize != 4096 */
409                                 inode->i_blocks -=
410                                         btrfs_file_extent_num_blocks(fi) << 3;
411                                 found_extent = 1;
412                         }
413                 }
414                 ret = btrfs_del_item(trans, root, path);
415                 BUG_ON(ret);
416                 btrfs_release_path(root, path);
417                 if (found_extent) {
418                         ret = btrfs_free_extent(trans, root, extent_start,
419                                                 extent_num_blocks, 0);
420                         BUG_ON(ret);
421                 }
422         }
423         ret = 0;
424 error:
425         btrfs_release_path(root, path);
426         btrfs_free_path(path);
427         inode->i_sb->s_dirt = 1;
428         return ret;
429 }
430
431 static void btrfs_delete_inode(struct inode *inode)
432 {
433         struct btrfs_trans_handle *trans;
434         struct btrfs_root *root = BTRFS_I(inode)->root;
435         int ret;
436
437         truncate_inode_pages(&inode->i_data, 0);
438         if (is_bad_inode(inode)) {
439                 goto no_delete;
440         }
441         inode->i_size = 0;
442         mutex_lock(&root->fs_info->fs_mutex);
443         trans = btrfs_start_transaction(root, 1);
444         btrfs_set_trans_block_group(trans, inode);
445         if (S_ISREG(inode->i_mode)) {
446                 ret = btrfs_truncate_in_trans(trans, root, inode);
447                 BUG_ON(ret);
448         }
449         btrfs_free_inode(trans, root, inode);
450         btrfs_end_transaction(trans, root);
451         mutex_unlock(&root->fs_info->fs_mutex);
452         return;
453 no_delete:
454         clear_inode(inode);
455 }
456
457 static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry,
458                                struct btrfs_key *location)
459 {
460         const char *name = dentry->d_name.name;
461         int namelen = dentry->d_name.len;
462         struct btrfs_dir_item *di;
463         struct btrfs_path *path;
464         struct btrfs_root *root = BTRFS_I(dir)->root;
465         int ret;
466
467         path = btrfs_alloc_path();
468         BUG_ON(!path);
469         btrfs_init_path(path);
470         di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name,
471                                     namelen, 0);
472         if (!di || IS_ERR(di)) {
473                 location->objectid = 0;
474                 ret = 0;
475                 goto out;
476         }
477         btrfs_disk_key_to_cpu(location, &di->location);
478 out:
479         btrfs_release_path(root, path);
480         btrfs_free_path(path);
481         return ret;
482 }
483
484 int fixup_tree_root_location(struct btrfs_root *root,
485                              struct btrfs_key *location,
486                              struct btrfs_root **sub_root)
487 {
488         struct btrfs_path *path;
489         struct btrfs_root_item *ri;
490
491         if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY)
492                 return 0;
493         if (location->objectid == BTRFS_ROOT_TREE_OBJECTID)
494                 return 0;
495
496         path = btrfs_alloc_path();
497         BUG_ON(!path);
498         mutex_lock(&root->fs_info->fs_mutex);
499
500         *sub_root = btrfs_read_fs_root(root->fs_info, location);
501         if (IS_ERR(*sub_root))
502                 return PTR_ERR(*sub_root);
503
504         ri = &(*sub_root)->root_item;
505         location->objectid = btrfs_root_dirid(ri);
506         location->flags = 0;
507         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
508         location->offset = 0;
509
510         btrfs_free_path(path);
511         mutex_unlock(&root->fs_info->fs_mutex);
512         return 0;
513 }
514
515 int btrfs_init_locked_inode(struct inode *inode, void *p)
516 {
517         struct btrfs_iget_args *args = p;
518         inode->i_ino = args->ino;
519         BTRFS_I(inode)->root = args->root;
520         return 0;
521 }
522
523 int btrfs_find_actor(struct inode *inode, void *opaque)
524 {
525         struct btrfs_iget_args *args = opaque;
526         return (args->ino == inode->i_ino &&
527                 args->root == BTRFS_I(inode)->root);
528 }
529
530 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
531                                 struct btrfs_root *root)
532 {
533         struct inode *inode;
534         struct btrfs_iget_args args;
535         args.ino = objectid;
536         args.root = root;
537
538         inode = iget5_locked(s, objectid, btrfs_find_actor,
539                              btrfs_init_locked_inode,
540                              (void *)&args);
541         return inode;
542 }
543
544 static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry,
545                                    struct nameidata *nd)
546 {
547         struct inode * inode;
548         struct btrfs_inode *bi = BTRFS_I(dir);
549         struct btrfs_root *root = bi->root;
550         struct btrfs_root *sub_root = root;
551         struct btrfs_key location;
552         int ret;
553
554         if (dentry->d_name.len > BTRFS_NAME_LEN)
555                 return ERR_PTR(-ENAMETOOLONG);
556         mutex_lock(&root->fs_info->fs_mutex);
557         ret = btrfs_inode_by_name(dir, dentry, &location);
558         mutex_unlock(&root->fs_info->fs_mutex);
559         if (ret < 0)
560                 return ERR_PTR(ret);
561         inode = NULL;
562         if (location.objectid) {
563                 ret = fixup_tree_root_location(root, &location, &sub_root);
564                 if (ret < 0)
565                         return ERR_PTR(ret);
566                 if (ret > 0)
567                         return ERR_PTR(-ENOENT);
568                 inode = btrfs_iget_locked(dir->i_sb, location.objectid,
569                                           sub_root);
570                 if (!inode)
571                         return ERR_PTR(-EACCES);
572                 if (inode->i_state & I_NEW) {
573                         if (sub_root != root) {
574 printk("adding new root for inode %lu root %p (found %p)\n", inode->i_ino, sub_root, BTRFS_I(inode)->root);
575                                 igrab(inode);
576                                 sub_root->inode = inode;
577                         }
578                         BTRFS_I(inode)->root = sub_root;
579                         memcpy(&BTRFS_I(inode)->location, &location,
580                                sizeof(location));
581                         btrfs_read_locked_inode(inode);
582                         unlock_new_inode(inode);
583                 }
584         }
585         return d_splice_alias(inode, dentry);
586 }
587
588 static int btrfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
589 {
590         struct inode *inode = filp->f_path.dentry->d_inode;
591         struct btrfs_root *root = BTRFS_I(inode)->root;
592         struct btrfs_item *item;
593         struct btrfs_dir_item *di;
594         struct btrfs_key key;
595         struct btrfs_path *path;
596         int ret;
597         u32 nritems;
598         struct btrfs_leaf *leaf;
599         int slot;
600         int advance;
601         unsigned char d_type = DT_UNKNOWN;
602         int over = 0;
603         u32 di_cur;
604         u32 di_total;
605         u32 di_len;
606         int key_type = BTRFS_DIR_INDEX_KEY;
607
608         /* FIXME, use a real flag for deciding about the key type */
609         if (root->fs_info->tree_root == root)
610                 key_type = BTRFS_DIR_ITEM_KEY;
611         mutex_lock(&root->fs_info->fs_mutex);
612         key.objectid = inode->i_ino;
613         key.flags = 0;
614         btrfs_set_key_type(&key, key_type);
615         key.offset = filp->f_pos;
616         path = btrfs_alloc_path();
617         btrfs_init_path(path);
618         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
619         if (ret < 0)
620                 goto err;
621         advance = 0;
622         while(1) {
623                 leaf = btrfs_buffer_leaf(path->nodes[0]);
624                 nritems = btrfs_header_nritems(&leaf->header);
625                 slot = path->slots[0];
626                 if (advance || slot >= nritems) {
627                         if (slot >= nritems -1) {
628                                 ret = btrfs_next_leaf(root, path);
629                                 if (ret)
630                                         break;
631                                 leaf = btrfs_buffer_leaf(path->nodes[0]);
632                                 nritems = btrfs_header_nritems(&leaf->header);
633                                 slot = path->slots[0];
634                         } else {
635                                 slot++;
636                                 path->slots[0]++;
637                         }
638                 }
639                 advance = 1;
640                 item = leaf->items + slot;
641                 if (btrfs_disk_key_objectid(&item->key) != key.objectid)
642                         break;
643                 if (btrfs_disk_key_type(&item->key) != key_type)
644                         break;
645                 if (btrfs_disk_key_offset(&item->key) < filp->f_pos)
646                         continue;
647                 filp->f_pos = btrfs_disk_key_offset(&item->key);
648                 advance = 1;
649                 di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
650                 di_cur = 0;
651                 di_total = btrfs_item_size(leaf->items + slot);
652                 while(di_cur < di_total) {
653                         over = filldir(dirent, (const char *)(di + 1),
654                                        btrfs_dir_name_len(di),
655                                        btrfs_disk_key_offset(&item->key),
656                                        btrfs_disk_key_objectid(&di->location),
657                                        d_type);
658                         if (over)
659                                 goto nopos;
660                         di_len = btrfs_dir_name_len(di) + sizeof(*di);
661                         di_cur += di_len;
662                         di = (struct btrfs_dir_item *)((char *)di + di_len);
663                 }
664         }
665         filp->f_pos++;
666 nopos:
667         ret = 0;
668 err:
669         btrfs_release_path(root, path);
670         btrfs_free_path(path);
671         mutex_unlock(&root->fs_info->fs_mutex);
672         return ret;
673 }
674
675 static void btrfs_put_super (struct super_block * sb)
676 {
677         struct btrfs_root *root = btrfs_sb(sb);
678         int ret;
679
680         ret = close_ctree(root);
681         if (ret) {
682                 printk("close ctree returns %d\n", ret);
683         }
684         sb->s_fs_info = NULL;
685 }
686
687 static int btrfs_fill_super(struct super_block * sb, void * data, int silent)
688 {
689         struct inode * inode;
690         struct dentry * root_dentry;
691         struct btrfs_super_block *disk_super;
692         struct btrfs_root *tree_root;
693         struct btrfs_inode *bi;
694
695         sb->s_maxbytes = MAX_LFS_FILESIZE;
696         sb->s_magic = BTRFS_SUPER_MAGIC;
697         sb->s_op = &btrfs_super_ops;
698         sb->s_time_gran = 1;
699
700         tree_root = open_ctree(sb);
701
702         if (!tree_root) {
703                 printk("btrfs: open_ctree failed\n");
704                 return -EIO;
705         }
706         sb->s_fs_info = tree_root;
707         disk_super = tree_root->fs_info->disk_super;
708         printk("read in super total blocks %Lu root %Lu\n",
709                btrfs_super_total_blocks(disk_super),
710                btrfs_super_root_dir(disk_super));
711
712         inode = btrfs_iget_locked(sb, btrfs_super_root_dir(disk_super),
713                                   tree_root);
714         bi = BTRFS_I(inode);
715         bi->location.objectid = inode->i_ino;
716         bi->location.offset = 0;
717         bi->location.flags = 0;
718         bi->root = tree_root;
719         btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY);
720
721         if (!inode)
722                 return -ENOMEM;
723         if (inode->i_state & I_NEW) {
724                 btrfs_read_locked_inode(inode);
725                 unlock_new_inode(inode);
726         }
727
728         root_dentry = d_alloc_root(inode);
729         if (!root_dentry) {
730                 iput(inode);
731                 return -ENOMEM;
732         }
733         sb->s_root = root_dentry;
734
735         return 0;
736 }
737
738 static int btrfs_write_inode(struct inode *inode, int wait)
739 {
740         struct btrfs_root *root = BTRFS_I(inode)->root;
741         struct btrfs_trans_handle *trans;
742         int ret = 0;
743
744         if (wait) {
745                 mutex_lock(&root->fs_info->fs_mutex);
746                 trans = btrfs_start_transaction(root, 1);
747                 btrfs_set_trans_block_group(trans, inode);
748                 ret = btrfs_commit_transaction(trans, root);
749                 mutex_unlock(&root->fs_info->fs_mutex);
750         }
751         return ret;
752 }
753
754 static void btrfs_dirty_inode(struct inode *inode)
755 {
756         struct btrfs_root *root = BTRFS_I(inode)->root;
757         struct btrfs_trans_handle *trans;
758
759         mutex_lock(&root->fs_info->fs_mutex);
760         trans = btrfs_start_transaction(root, 1);
761         btrfs_set_trans_block_group(trans, inode);
762         btrfs_update_inode(trans, root, inode);
763         btrfs_end_transaction(trans, root);
764         mutex_unlock(&root->fs_info->fs_mutex);
765 }
766
767 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
768                                      struct btrfs_root *root,
769                                      u64 objectid,
770                                      struct btrfs_block_group_cache *group,
771                                      int mode)
772 {
773         struct inode *inode;
774         struct btrfs_inode_item inode_item;
775         struct btrfs_key *location;
776         int ret;
777
778         inode = new_inode(root->fs_info->sb);
779         if (!inode)
780                 return ERR_PTR(-ENOMEM);
781
782         BTRFS_I(inode)->root = root;
783         group = btrfs_find_block_group(root, group, 0);
784         BTRFS_I(inode)->block_group = group;
785
786         inode->i_uid = current->fsuid;
787         inode->i_gid = current->fsgid;
788         inode->i_mode = mode;
789         inode->i_ino = objectid;
790         inode->i_blocks = 0;
791         inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
792         fill_inode_item(&inode_item, inode);
793         location = &BTRFS_I(inode)->location;
794         location->objectid = objectid;
795         location->flags = 0;
796         location->offset = 0;
797         btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
798
799         ret = btrfs_insert_inode(trans, root, objectid, &inode_item);
800         BUG_ON(ret);
801
802         insert_inode_hash(inode);
803         return inode;
804 }
805
806 static int btrfs_add_link(struct btrfs_trans_handle *trans,
807                             struct dentry *dentry, struct inode *inode)
808 {
809         int ret;
810         struct btrfs_key key;
811         struct btrfs_root *root = BTRFS_I(dentry->d_parent->d_inode)->root;
812         key.objectid = inode->i_ino;
813         key.flags = 0;
814         btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
815         key.offset = 0;
816
817         ret = btrfs_insert_dir_item(trans, root,
818                                     dentry->d_name.name, dentry->d_name.len,
819                                     dentry->d_parent->d_inode->i_ino,
820                                     &key, 0);
821         if (ret == 0) {
822                 dentry->d_parent->d_inode->i_size += dentry->d_name.len * 2;
823                 ret = btrfs_update_inode(trans, root,
824                                          dentry->d_parent->d_inode);
825         }
826         return ret;
827 }
828
829 static int btrfs_add_nondir(struct btrfs_trans_handle *trans,
830                             struct dentry *dentry, struct inode *inode)
831 {
832         int err = btrfs_add_link(trans, dentry, inode);
833         if (!err) {
834                 d_instantiate(dentry, inode);
835                 return 0;
836         }
837         if (err > 0)
838                 err = -EEXIST;
839         return err;
840 }
841
842 static int btrfs_create(struct inode *dir, struct dentry *dentry,
843                         int mode, struct nameidata *nd)
844 {
845         struct btrfs_trans_handle *trans;
846         struct btrfs_root *root = BTRFS_I(dir)->root;
847         struct inode *inode;
848         int err;
849         int drop_inode = 0;
850         u64 objectid;
851
852         mutex_lock(&root->fs_info->fs_mutex);
853         trans = btrfs_start_transaction(root, 1);
854         btrfs_set_trans_block_group(trans, dir);
855
856         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
857         if (err) {
858                 err = -ENOSPC;
859                 goto out_unlock;
860         }
861
862         inode = btrfs_new_inode(trans, root, objectid,
863                                 BTRFS_I(dir)->block_group, mode);
864         err = PTR_ERR(inode);
865         if (IS_ERR(inode))
866                 goto out_unlock;
867
868         btrfs_set_trans_block_group(trans, inode);
869         err = btrfs_add_nondir(trans, dentry, inode);
870         if (err)
871                 drop_inode = 1;
872         else {
873                 inode->i_mapping->a_ops = &btrfs_aops;
874                 inode->i_fop = &btrfs_file_operations;
875                 inode->i_op = &btrfs_file_inode_operations;
876         }
877         dir->i_sb->s_dirt = 1;
878         btrfs_update_inode_block_group(trans, inode);
879         btrfs_update_inode_block_group(trans, dir);
880 out_unlock:
881         btrfs_end_transaction(trans, root);
882         mutex_unlock(&root->fs_info->fs_mutex);
883
884         if (drop_inode) {
885                 inode_dec_link_count(inode);
886                 iput(inode);
887         }
888         return err;
889 }
890
891 static int btrfs_make_empty_dir(struct btrfs_trans_handle *trans,
892                                 struct btrfs_root *root,
893                                 u64 objectid, u64 dirid)
894 {
895         int ret;
896         char buf[2];
897         struct btrfs_key key;
898
899         buf[0] = '.';
900         buf[1] = '.';
901
902         key.objectid = objectid;
903         key.offset = 0;
904         key.flags = 0;
905         btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY);
906
907         ret = btrfs_insert_dir_item(trans, root, buf, 1, objectid,
908                                     &key, 1);
909         if (ret)
910                 goto error;
911         key.objectid = dirid;
912         ret = btrfs_insert_dir_item(trans, root, buf, 2, objectid,
913                                     &key, 1);
914         if (ret)
915                 goto error;
916 error:
917         return ret;
918 }
919
920 static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
921 {
922         struct inode *inode;
923         struct btrfs_trans_handle *trans;
924         struct btrfs_root *root = BTRFS_I(dir)->root;
925         int err = 0;
926         int drop_on_err = 0;
927         u64 objectid;
928
929         mutex_lock(&root->fs_info->fs_mutex);
930         trans = btrfs_start_transaction(root, 1);
931         btrfs_set_trans_block_group(trans, dir);
932         if (IS_ERR(trans)) {
933                 err = PTR_ERR(trans);
934                 goto out_unlock;
935         }
936
937         err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
938         if (err) {
939                 err = -ENOSPC;
940                 goto out_unlock;
941         }
942
943         inode = btrfs_new_inode(trans, root, objectid,
944                                 BTRFS_I(dir)->block_group, S_IFDIR | mode);
945         if (IS_ERR(inode)) {
946                 err = PTR_ERR(inode);
947                 goto out_fail;
948         }
949         drop_on_err = 1;
950         inode->i_op = &btrfs_dir_inode_operations;
951         inode->i_fop = &btrfs_dir_file_operations;
952         btrfs_set_trans_block_group(trans, inode);
953
954         err = btrfs_make_empty_dir(trans, root, inode->i_ino, dir->i_ino);
955         if (err)
956                 goto out_fail;
957
958         inode->i_size = 6;
959         err = btrfs_update_inode(trans, root, inode);
960         if (err)
961                 goto out_fail;
962         err = btrfs_add_link(trans, dentry, inode);
963         if (err)
964                 goto out_fail;
965         d_instantiate(dentry, inode);
966         drop_on_err = 0;
967         dir->i_sb->s_dirt = 1;
968         btrfs_update_inode_block_group(trans, inode);
969         btrfs_update_inode_block_group(trans, dir);
970
971 out_fail:
972         btrfs_end_transaction(trans, root);
973 out_unlock:
974         mutex_unlock(&root->fs_info->fs_mutex);
975         if (drop_on_err)
976                 iput(inode);
977         return err;
978 }
979
980 static int btrfs_sync_file(struct file *file,
981                            struct dentry *dentry, int datasync)
982 {
983         struct inode *inode = dentry->d_inode;
984         struct btrfs_root *root = BTRFS_I(inode)->root;
985         int ret;
986         struct btrfs_trans_handle *trans;
987
988         mutex_lock(&root->fs_info->fs_mutex);
989         trans = btrfs_start_transaction(root, 1);
990         if (!trans) {
991                 ret = -ENOMEM;
992                 goto out;
993         }
994         ret = btrfs_commit_transaction(trans, root);
995         mutex_unlock(&root->fs_info->fs_mutex);
996 out:
997         return ret > 0 ? EIO : ret;
998 }
999
1000 static int btrfs_sync_fs(struct super_block *sb, int wait)
1001 {
1002         struct btrfs_trans_handle *trans;
1003         struct btrfs_root *root;
1004         int ret;
1005         root = btrfs_sb(sb);
1006
1007         sb->s_dirt = 0;
1008         if (!wait) {
1009                 filemap_flush(root->fs_info->btree_inode->i_mapping);
1010                 return 0;
1011         }
1012         mutex_lock(&root->fs_info->fs_mutex);
1013         trans = btrfs_start_transaction(root, 1);
1014         ret = btrfs_commit_transaction(trans, root);
1015         sb->s_dirt = 0;
1016         BUG_ON(ret);
1017 printk("btrfs sync_fs\n");
1018         mutex_unlock(&root->fs_info->fs_mutex);
1019         return 0;
1020 }
1021
1022 static int btrfs_get_block_lock(struct inode *inode, sector_t iblock,
1023                            struct buffer_head *result, int create)
1024 {
1025         int ret;
1026         int err = 0;
1027         u64 blocknr;
1028         u64 extent_start = 0;
1029         u64 extent_end = 0;
1030         u64 objectid = inode->i_ino;
1031         u32 found_type;
1032         struct btrfs_path *path;
1033         struct btrfs_root *root = BTRFS_I(inode)->root;
1034         struct btrfs_file_extent_item *item;
1035         struct btrfs_leaf *leaf;
1036         struct btrfs_disk_key *found_key;
1037
1038         path = btrfs_alloc_path();
1039         BUG_ON(!path);
1040         btrfs_init_path(path);
1041         if (create) {
1042                 WARN_ON(1);
1043         }
1044
1045         ret = btrfs_lookup_file_extent(NULL, root, path,
1046                                        inode->i_ino,
1047                                        iblock << inode->i_blkbits, 0);
1048         if (ret < 0) {
1049                 err = ret;
1050                 goto out;
1051         }
1052
1053         if (ret != 0) {
1054                 if (path->slots[0] == 0) {
1055                         btrfs_release_path(root, path);
1056                         goto out;
1057                 }
1058                 path->slots[0]--;
1059         }
1060
1061         item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]), path->slots[0],
1062                               struct btrfs_file_extent_item);
1063         leaf = btrfs_buffer_leaf(path->nodes[0]);
1064         blocknr = btrfs_file_extent_disk_blocknr(item);
1065         blocknr += btrfs_file_extent_offset(item);
1066
1067         /* are we inside the extent that was found? */
1068         found_key = &leaf->items[path->slots[0]].key;
1069         found_type = btrfs_disk_key_type(found_key);
1070         if (btrfs_disk_key_objectid(found_key) != objectid ||
1071             found_type != BTRFS_EXTENT_DATA_KEY) {
1072                 extent_end = 0;
1073                 extent_start = 0;
1074                 btrfs_release_path(root, path);
1075                 goto out;
1076         }
1077         found_type = btrfs_file_extent_type(item);
1078         extent_start = btrfs_disk_key_offset(&leaf->items[path->slots[0]].key);
1079         if (found_type == BTRFS_FILE_EXTENT_REG) {
1080                 extent_start = extent_start >> inode->i_blkbits;
1081                 extent_end = extent_start + btrfs_file_extent_num_blocks(item);
1082                 if (iblock >= extent_start && iblock < extent_end) {
1083                         err = 0;
1084                         btrfs_map_bh_to_logical(root, result, blocknr +
1085                                                 iblock - extent_start);
1086                         goto out;
1087                 }
1088         } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
1089                 char *ptr;
1090                 char *map;
1091                 u32 size;
1092                 size = btrfs_file_extent_inline_len(leaf->items +
1093                                                     path->slots[0]);
1094                 extent_end = (extent_start + size) >> inode->i_blkbits;
1095                 extent_start >>= inode->i_blkbits;
1096                 if (iblock < extent_start || iblock > extent_end) {
1097                         goto out;
1098                 }
1099                 ptr = btrfs_file_extent_inline_start(item);
1100                 map = kmap(result->b_page);
1101                 memcpy(map, ptr, size);
1102                 memset(map + size, 0, PAGE_CACHE_SIZE - size);
1103                 flush_dcache_page(result->b_page);
1104                 kunmap(result->b_page);
1105                 set_buffer_uptodate(result);
1106                 SetPageChecked(result->b_page);
1107                 btrfs_map_bh_to_logical(root, result, 0);
1108         }
1109 out:
1110         btrfs_release_path(root, path);
1111         btrfs_free_path(path);
1112         return err;
1113 }
1114
1115 static int btrfs_get_block(struct inode *inode, sector_t iblock,
1116                            struct buffer_head *result, int create)
1117 {
1118         int err;
1119         struct btrfs_root *root = BTRFS_I(inode)->root;
1120         mutex_lock(&root->fs_info->fs_mutex);
1121         err = btrfs_get_block_lock(inode, iblock, result, create);
1122         mutex_unlock(&root->fs_info->fs_mutex);
1123         return err;
1124 }
1125
1126 static int btrfs_prepare_write(struct file *file, struct page *page,
1127                                unsigned from, unsigned to)
1128 {
1129         return nobh_prepare_write(page, from, to, btrfs_get_block);
1130 }
1131
/*
 * ->write_super: run btrfs_sync_fs() in waiting mode; its return value is
 * discarded.
 */
static void btrfs_write_super(struct super_block *sb)
{
	const int wait = 1;

	btrfs_sync_fs(sb, wait);
}
1136
1137 static int btrfs_readpage(struct file *file, struct page *page)
1138 {
1139         return mpage_readpage(page, btrfs_get_block);
1140 }
1141
1142 /*
1143  * While block_write_full_page is writing back the dirty buffers under
1144  * the page lock, whoever dirtied the buffers may decide to clean them
1145  * again at any time.  We handle that by only looking at the buffer
1146  * state inside lock_buffer().
1147  *
1148  * If block_write_full_page() is called for regular writeback
1149  * (wbc->sync_mode == WB_SYNC_NONE) then it will redirty a page which has a
1150  * locked buffer.   This only can happen if someone has written the buffer
1151  * directly, with submit_bh().  At the address_space level PageWriteback
1152  * prevents this contention from occurring.
1153  */
1154 static int __btrfs_write_full_page(struct inode *inode, struct page *page,
1155                                    struct writeback_control *wbc)
1156 {
1157         int err;
1158         sector_t block;
1159         sector_t last_block;
1160         struct buffer_head *bh, *head;
1161         const unsigned blocksize = 1 << inode->i_blkbits;
1162         int nr_underway = 0;
1163
1164         BUG_ON(!PageLocked(page));
1165
1166         last_block = (i_size_read(inode) - 1) >> inode->i_blkbits;
1167
1168         if (!page_has_buffers(page)) {
1169                 create_empty_buffers(page, blocksize,
1170                                         (1 << BH_Dirty)|(1 << BH_Uptodate));
1171         }
1172
1173         /*
1174          * Be very careful.  We have no exclusion from __set_page_dirty_buffers
1175          * here, and the (potentially unmapped) buffers may become dirty at
1176          * any time.  If a buffer becomes dirty here after we've inspected it
1177          * then we just miss that fact, and the page stays dirty.
1178          *
1179          * Buffers outside i_size may be dirtied by __set_page_dirty_buffers;
1180          * handle that here by just cleaning them.
1181          */
1182
1183         block = (sector_t)page->index << (PAGE_CACHE_SHIFT - inode->i_blkbits);
1184         head = page_buffers(page);
1185         bh = head;
1186
1187         /*
1188          * Get all the dirty buffers mapped to disk addresses and
1189          * handle any aliases from the underlying blockdev's mapping.
1190          */
1191         do {
1192                 if (block > last_block) {
1193                         /*
1194                          * mapped buffers outside i_size will occur, because
1195                          * this page can be outside i_size when there is a
1196                          * truncate in progress.
1197                          */
1198                         /*
1199                          * The buffer was zeroed by block_write_full_page()
1200                          */
1201                         clear_buffer_dirty(bh);
1202                         set_buffer_uptodate(bh);
1203                 } else if (!buffer_mapped(bh) && buffer_dirty(bh)) {
1204                         WARN_ON(bh->b_size != blocksize);
1205                         err = btrfs_get_block(inode, block, bh, 0);
1206                         if (err)
1207                                 goto recover;
1208                         if (buffer_new(bh)) {
1209                                 /* blockdev mappings never come here */
1210                                 clear_buffer_new(bh);
1211                                 unmap_underlying_metadata(bh->b_bdev,
1212                                                         bh->b_blocknr);
1213                         }
1214                 }
1215                 bh = bh->b_this_page;
1216                 block++;
1217         } while (bh != head);
1218
1219         do {
1220                 if (!buffer_mapped(bh))
1221                         continue;
1222                 /*
1223                  * If it's a fully non-blocking write attempt and we cannot
1224                  * lock the buffer then redirty the page.  Note that this can
1225                  * potentially cause a busy-wait loop from pdflush and kswapd
1226                  * activity, but those code paths have their own higher-level
1227                  * throttling.
1228                  */
1229                 if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) {
1230                         lock_buffer(bh);
1231                 } else if (test_set_buffer_locked(bh)) {
1232                         redirty_page_for_writepage(wbc, page);
1233                         continue;
1234                 }
1235                 if (test_clear_buffer_dirty(bh) && bh->b_blocknr != 0) {
1236                         mark_buffer_async_write(bh);
1237                 } else {
1238                         unlock_buffer(bh);
1239                 }
1240         } while ((bh = bh->b_this_page) != head);
1241
1242         /*
1243          * The page and its buffers are protected by PageWriteback(), so we can
1244          * drop the bh refcounts early.
1245          */
1246         BUG_ON(PageWriteback(page));
1247         set_page_writeback(page);
1248
1249         do {
1250                 struct buffer_head *next = bh->b_this_page;
1251                 if (buffer_async_write(bh)) {
1252                         submit_bh(WRITE, bh);
1253                         nr_underway++;
1254                 }
1255                 bh = next;
1256         } while (bh != head);
1257         unlock_page(page);
1258
1259         err = 0;
1260 done:
1261         if (nr_underway == 0) {
1262                 /*
1263                  * The page was marked dirty, but the buffers were
1264                  * clean.  Someone wrote them back by hand with
1265                  * ll_rw_block/submit_bh.  A rare case.
1266                  */
1267                 int uptodate = 1;
1268                 do {
1269                         if (!buffer_uptodate(bh)) {
1270                                 uptodate = 0;
1271                                 break;
1272                         }
1273                         bh = bh->b_this_page;
1274                 } while (bh != head);
1275                 if (uptodate)
1276                         SetPageUptodate(page);
1277                 end_page_writeback(page);
1278                 /*
1279                  * The page and buffer_heads can be released at any time from
1280                  * here on.
1281                  */
1282                 wbc->pages_skipped++;   /* We didn't write this page */
1283         }
1284         return err;
1285
1286 recover:
1287         /*
1288          * ENOSPC, or some other error.  We may already have added some
1289          * blocks to the file, so we need to write these out to avoid
1290          * exposing stale data.
1291          * The page is currently locked and not marked for writeback
1292          */
1293         bh = head;
1294         /* Recovery: lock and submit the mapped buffers */
1295         do {
1296                 if (buffer_mapped(bh) && buffer_dirty(bh)) {
1297                         lock_buffer(bh);
1298                         mark_buffer_async_write(bh);
1299                 } else {
1300                         /*
1301                          * The buffer may have been set dirty during
1302                          * attachment to a dirty page.
1303                          */
1304                         clear_buffer_dirty(bh);
1305                 }
1306         } while ((bh = bh->b_this_page) != head);
1307         SetPageError(page);
1308         BUG_ON(PageWriteback(page));
1309         set_page_writeback(page);
1310         do {
1311                 struct buffer_head *next = bh->b_this_page;
1312                 if (buffer_async_write(bh)) {
1313                         clear_buffer_dirty(bh);
1314                         submit_bh(WRITE, bh);
1315                         nr_underway++;
1316                 }
1317                 bh = next;
1318         } while (bh != head);
1319         unlock_page(page);
1320         goto done;
1321 }
1322
1323 /*
1324  * The generic ->writepage function for buffer-backed address_spaces
1325  */
1326 static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
1327 {
1328         struct inode * const inode = page->mapping->host;
1329         loff_t i_size = i_size_read(inode);
1330         const pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
1331         unsigned offset;
1332         void *kaddr;
1333
1334         /* Is the page fully inside i_size? */
1335         if (page->index < end_index)
1336                 return __btrfs_write_full_page(inode, page, wbc);
1337
1338         /* Is the page fully outside i_size? (truncate in progress) */
1339         offset = i_size & (PAGE_CACHE_SIZE-1);
1340         if (page->index >= end_index+1 || !offset) {
1341                 /*
1342                  * The page may have dirty, unmapped buffers.  For example,
1343                  * they may have been added in ext3_writepage().  Make them
1344                  * freeable here, so the page does not leak.
1345                  */
1346                 block_invalidatepage(page, 0);
1347                 unlock_page(page);
1348                 return 0; /* don't care */
1349         }
1350
1351         /*
1352          * The page straddles i_size.  It must be zeroed out on each and every
1353          * writepage invokation because it may be mmapped.  "A file is mapped
1354          * in multiples of the page size.  For a file that is not a multiple of
1355          * the  page size, the remaining memory is zeroed when mapped, and
1356          * writes to that region are not written out to the file."
1357          */
1358         kaddr = kmap_atomic(page, KM_USER0);
1359         memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
1360         flush_dcache_page(page);
1361         kunmap_atomic(kaddr, KM_USER0);
1362         return __btrfs_write_full_page(inode, page, wbc);
1363 }
1364
1365 static void btrfs_truncate(struct inode *inode)
1366 {
1367         struct btrfs_root *root = BTRFS_I(inode)->root;
1368         int ret;
1369         struct btrfs_trans_handle *trans;
1370
1371         if (!S_ISREG(inode->i_mode))
1372                 return;
1373         if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
1374                 return;
1375
1376         nobh_truncate_page(inode->i_mapping, inode->i_size);
1377
1378         /* FIXME, add redo link to tree so we don't leak on crash */
1379         mutex_lock(&root->fs_info->fs_mutex);
1380         trans = btrfs_start_transaction(root, 1);
1381         btrfs_set_trans_block_group(trans, inode);
1382         ret = btrfs_truncate_in_trans(trans, root, inode);
1383         BUG_ON(ret);
1384         ret = btrfs_end_transaction(trans, root);
1385         BUG_ON(ret);
1386         mutex_unlock(&root->fs_info->fs_mutex);
1387         mark_inode_dirty(inode);
1388 }
1389
1390 /*
1391  * Make sure any changes to nobh_commit_write() are reflected in
1392  * nobh_truncate_page(), since it doesn't call commit_write().
1393  */
1394 static int btrfs_commit_write(struct file *file, struct page *page,
1395                               unsigned from, unsigned to)
1396 {
1397         struct inode *inode = page->mapping->host;
1398         struct buffer_head *bh;
1399         loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
1400
1401         SetPageUptodate(page);
1402         bh = page_buffers(page);
1403         if (buffer_mapped(bh) && bh->b_blocknr != 0) {
1404                 set_page_dirty(page);
1405         }
1406         if (pos > inode->i_size) {
1407                 i_size_write(inode, pos);
1408                 mark_inode_dirty(inode);
1409         }
1410         return 0;
1411 }
1412
1413 static int btrfs_copy_from_user(loff_t pos, int num_pages, int write_bytes,
1414                                 struct page **prepared_pages,
1415                                 const char __user * buf)
1416 {
1417         long page_fault = 0;
1418         int i;
1419         int offset = pos & (PAGE_CACHE_SIZE - 1);
1420
1421         for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) {
1422                 size_t count = min_t(size_t,
1423                                      PAGE_CACHE_SIZE - offset, write_bytes);
1424                 struct page *page = prepared_pages[i];
1425                 fault_in_pages_readable(buf, count);
1426
1427                 /* Copy data from userspace to the current page */
1428                 kmap(page);
1429                 page_fault = __copy_from_user(page_address(page) + offset,
1430                                               buf, count);
1431                 /* Flush processor's dcache for this page */
1432                 flush_dcache_page(page);
1433                 kunmap(page);
1434                 buf += count;
1435                 write_bytes -= count;
1436
1437                 if (page_fault)
1438                         break;
1439         }
1440         return page_fault ? -EFAULT : 0;
1441 }
1442
1443 static void btrfs_drop_pages(struct page **pages, size_t num_pages)
1444 {
1445         size_t i;
1446         for (i = 0; i < num_pages; i++) {
1447                 if (!pages[i])
1448                         break;
1449                 unlock_page(pages[i]);
1450                 mark_page_accessed(pages[i]);
1451                 page_cache_release(pages[i]);
1452         }
1453 }
1454 static int dirty_and_release_pages(struct btrfs_trans_handle *trans,
1455                                    struct btrfs_root *root,
1456                                    struct file *file,
1457                                    struct page **pages,
1458                                    size_t num_pages,
1459                                    loff_t pos,
1460                                    size_t write_bytes)
1461 {
1462         int i;
1463         int offset;
1464         int err = 0;
1465         int ret;
1466         int this_write;
1467         struct inode *inode = file->f_path.dentry->d_inode;
1468         struct buffer_head *bh;
1469         struct btrfs_file_extent_item *ei;
1470
1471         for (i = 0; i < num_pages; i++) {
1472                 offset = pos & (PAGE_CACHE_SIZE -1);
1473                 this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
1474                 /* FIXME, one block at a time */
1475
1476                 mutex_lock(&root->fs_info->fs_mutex);
1477                 trans = btrfs_start_transaction(root, 1);
1478                 btrfs_set_trans_block_group(trans, inode);
1479
1480                 bh = page_buffers(pages[i]);
1481                 if (buffer_mapped(bh) && bh->b_blocknr == 0) {
1482                         struct btrfs_key key;
1483                         struct btrfs_path *path;
1484                         char *ptr;
1485                         u32 datasize;
1486
1487                         path = btrfs_alloc_path();
1488                         BUG_ON(!path);
1489                         key.objectid = inode->i_ino;
1490                         key.offset = pages[i]->index << PAGE_CACHE_SHIFT;
1491                         key.flags = 0;
1492                         btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY);
1493                         BUG_ON(write_bytes >= PAGE_CACHE_SIZE);
1494                         datasize = offset +
1495                                 btrfs_file_extent_calc_inline_size(write_bytes);
1496                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1497                                                       datasize);
1498                         BUG_ON(ret);
1499                         ei = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
1500                                path->slots[0], struct btrfs_file_extent_item);
1501                         btrfs_set_file_extent_generation(ei, trans->transid);
1502                         btrfs_set_file_extent_type(ei,
1503                                                    BTRFS_FILE_EXTENT_INLINE);
1504                         ptr = btrfs_file_extent_inline_start(ei);
1505                         memcpy(ptr, bh->b_data, offset + write_bytes);
1506                         mark_buffer_dirty(path->nodes[0]);
1507                         btrfs_free_path(path);
1508                 } else {
1509                         btrfs_csum_file_block(trans, root, inode->i_ino,
1510                                       pages[i]->index << PAGE_CACHE_SHIFT,
1511                                       kmap(pages[i]), PAGE_CACHE_SIZE);
1512                         kunmap(pages[i]);
1513                 }
1514                 SetPageChecked(pages[i]);
1515                 btrfs_update_inode_block_group(trans, inode);
1516                 ret = btrfs_end_transaction(trans, root);
1517                 BUG_ON(ret);
1518                 mutex_unlock(&root->fs_info->fs_mutex);
1519
1520                 ret = btrfs_commit_write(file, pages[i], offset,
1521                                          offset + this_write);
1522                 pos += this_write;
1523                 if (ret) {
1524                         err = ret;
1525                         goto failed;
1526                 }
1527                 WARN_ON(this_write > write_bytes);
1528                 write_bytes -= this_write;
1529         }
1530 failed:
1531         return err;
1532 }
1533
1534 static int drop_extents(struct btrfs_trans_handle *trans,
1535                           struct btrfs_root *root,
1536                           struct inode *inode,
1537                           u64 start, u64 end)
1538 {
1539         int ret;
1540         struct btrfs_key key;
1541         struct btrfs_leaf *leaf;
1542         int slot;
1543         struct btrfs_file_extent_item *extent;
1544         u64 extent_end = 0;
1545         int keep;
1546         struct btrfs_file_extent_item old;
1547         struct btrfs_path *path;
1548         u64 search_start = start;
1549         int bookend;
1550         int found_type;
1551         int found_extent;
1552         int found_inline;
1553
1554         path = btrfs_alloc_path();
1555         if (!path)
1556                 return -ENOMEM;
1557         while(1) {
1558                 btrfs_release_path(root, path);
1559                 ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
1560                                                search_start, -1);
1561                 if (ret < 0)
1562                         goto out;
1563                 if (ret > 0) {
1564                         if (path->slots[0] == 0) {
1565                                 ret = 0;
1566                                 goto out;
1567                         }
1568                         path->slots[0]--;
1569                 }
1570                 keep = 0;
1571                 bookend = 0;
1572                 found_extent = 0;
1573                 found_inline = 0;
1574                 extent = NULL;
1575                 leaf = btrfs_buffer_leaf(path->nodes[0]);
1576                 slot = path->slots[0];
1577                 btrfs_disk_key_to_cpu(&key, &leaf->items[slot].key);
1578                 if (key.offset >= end || key.objectid != inode->i_ino) {
1579                         ret = 0;
1580                         goto out;
1581                 }
1582                 if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) {
1583                         ret = 0;
1584                         goto out;
1585                 }
1586                 extent = btrfs_item_ptr(leaf, slot,
1587                                         struct btrfs_file_extent_item);
1588                 found_type = btrfs_file_extent_type(extent);
1589                 if (found_type == BTRFS_FILE_EXTENT_REG) {
1590                         extent_end = key.offset +
1591                                 (btrfs_file_extent_num_blocks(extent) <<
1592                                  inode->i_blkbits);
1593                         found_extent = 1;
1594                 } else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
1595                         found_inline = 1;
1596                         extent_end = key.offset +
1597                              btrfs_file_extent_inline_len(leaf->items + slot);
1598                 }
1599
1600                 if (!found_extent && !found_inline) {
1601                         ret = 0;
1602                         goto out;
1603                 }
1604
1605                 if (search_start >= extent_end) {
1606                         ret = 0;
1607                         goto out;
1608                 }
1609
1610                 search_start = extent_end;
1611
1612                 if (end < extent_end && end >= key.offset) {
1613                         if (found_extent) {
1614                                 memcpy(&old, extent, sizeof(old));
1615                                 ret = btrfs_inc_extent_ref(trans, root,
1616                                       btrfs_file_extent_disk_blocknr(&old),
1617                                       btrfs_file_extent_disk_num_blocks(&old));
1618                                 BUG_ON(ret);
1619                         }
1620                         WARN_ON(found_inline);
1621                         bookend = 1;
1622                 }
1623
1624                 if (start > key.offset) {
1625                         u64 new_num;
1626                         u64 old_num;
1627                         /* truncate existing extent */
1628                         keep = 1;
1629                         WARN_ON(start & (root->blocksize - 1));
1630                         if (found_extent) {
1631                                 new_num = (start - key.offset) >>
1632                                         inode->i_blkbits;
1633                                 old_num = btrfs_file_extent_num_blocks(extent);
1634                                 inode->i_blocks -= (old_num - new_num) << 3;
1635                                 btrfs_set_file_extent_num_blocks(extent,
1636                                                                  new_num);
1637                                 mark_buffer_dirty(path->nodes[0]);
1638                         } else {
1639                                 WARN_ON(1);
1640                                 /*
1641                                 ret = btrfs_truncate_item(trans, root, path,
1642                                                           start - key.offset);
1643                                 BUG_ON(ret);
1644                                 */
1645                         }
1646                 }
1647                 if (!keep) {
1648                         u64 disk_blocknr = 0;
1649                         u64 disk_num_blocks = 0;
1650                         u64 extent_num_blocks = 0;
1651                         if (found_extent) {
1652                                 disk_blocknr =
1653                                       btrfs_file_extent_disk_blocknr(extent);
1654                                 disk_num_blocks =
1655                                       btrfs_file_extent_disk_num_blocks(extent);
1656                                 extent_num_blocks =
1657                                       btrfs_file_extent_num_blocks(extent);
1658                         }
1659                         ret = btrfs_del_item(trans, root, path);
1660                         BUG_ON(ret);
1661                         btrfs_release_path(root, path);
1662                         if (found_extent) {
1663                                 inode->i_blocks -=
1664                                 btrfs_file_extent_num_blocks(extent) << 3;
1665                                 ret = btrfs_free_extent(trans, root,
1666                                                         disk_blocknr,
1667                                                         disk_num_blocks, 0);
1668                         }
1669
1670                         BUG_ON(ret);
1671                         if (!bookend && search_start >= end) {
1672                                 ret = 0;
1673                                 goto out;
1674                         }
1675                         if (!bookend)
1676                                 continue;
1677                 }
1678                 if (bookend && found_extent) {
1679                         /* create bookend */
1680                         struct btrfs_key ins;
1681                         ins.objectid = inode->i_ino;
1682                         ins.offset = end;
1683                         ins.flags = 0;
1684                         btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
1685
1686                         btrfs_release_path(root, path);
1687                         ret = btrfs_insert_empty_item(trans, root, path, &ins,
1688                                                       sizeof(*extent));
1689                         BUG_ON(ret);
1690                         extent = btrfs_item_ptr(
1691                                     btrfs_buffer_leaf(path->nodes[0]),
1692                                     path->slots[0],
1693                                     struct btrfs_file_extent_item);
1694                         btrfs_set_file_extent_disk_blocknr(extent,
1695                                     btrfs_file_extent_disk_blocknr(&old));
1696                         btrfs_set_file_extent_disk_num_blocks(extent,
1697                                     btrfs_file_extent_disk_num_blocks(&old));
1698
1699                         btrfs_set_file_extent_offset(extent,
1700                                     btrfs_file_extent_offset(&old) +
1701                                     ((end - key.offset) >> inode->i_blkbits));
1702                         WARN_ON(btrfs_file_extent_num_blocks(&old) <
1703                                 (end - key.offset) >> inode->i_blkbits);
1704                         btrfs_set_file_extent_num_blocks(extent,
1705                                     btrfs_file_extent_num_blocks(&old) -
1706                                     ((end - key.offset) >> inode->i_blkbits));
1707
1708                         btrfs_set_file_extent_type(extent,
1709                                                    BTRFS_FILE_EXTENT_REG);
1710                         btrfs_set_file_extent_generation(extent,
1711                                     btrfs_file_extent_generation(&old));
1712                         btrfs_mark_buffer_dirty(path->nodes[0]);
1713                         inode->i_blocks +=
1714                                 btrfs_file_extent_num_blocks(extent) << 3;
1715                         ret = 0;
1716                         goto out;
1717                 }
1718         }
1719 out:
1720         btrfs_free_path(path);
1721         return ret;
1722 }
1723
1724 static int prepare_pages(struct btrfs_root *root,
1725                          struct file *file,
1726                          struct page **pages,
1727                          size_t num_pages,
1728                          loff_t pos,
1729                          unsigned long first_index,
1730                          unsigned long last_index,
1731                          size_t write_bytes,
1732                          u64 alloc_extent_start)
1733 {
1734         int i;
1735         unsigned long index = pos >> PAGE_CACHE_SHIFT;
1736         struct inode *inode = file->f_path.dentry->d_inode;
1737         int offset;
1738         int err = 0;
1739         int this_write;
1740         struct buffer_head *bh;
1741         struct buffer_head *head;
1742         loff_t isize = i_size_read(inode);
1743
1744         memset(pages, 0, num_pages * sizeof(struct page *));
1745
1746         for (i = 0; i < num_pages; i++) {
1747                 pages[i] = grab_cache_page(inode->i_mapping, index + i);
1748                 if (!pages[i]) {
1749                         err = -ENOMEM;
1750                         goto failed_release;
1751                 }
1752                 offset = pos & (PAGE_CACHE_SIZE -1);
1753                 this_write = min(PAGE_CACHE_SIZE - offset, write_bytes);
1754                 create_empty_buffers(pages[i], root->fs_info->sb->s_blocksize,
1755                                      (1 << BH_Uptodate));
1756                 head = page_buffers(pages[i]);
1757                 bh = head;
1758                 do {
1759                         err = btrfs_map_bh_to_logical(root, bh,
1760                                                       alloc_extent_start);
1761                         BUG_ON(err);
1762                         if (err)
1763                                 goto failed_truncate;
1764                         bh = bh->b_this_page;
1765                         if (alloc_extent_start)
1766                                 alloc_extent_start++;
1767                 } while (bh != head);
1768                 pos += this_write;
1769                 WARN_ON(this_write > write_bytes);
1770                 write_bytes -= this_write;
1771         }
1772         return 0;
1773
1774 failed_release:
1775         btrfs_drop_pages(pages, num_pages);
1776         return err;
1777
1778 failed_truncate:
1779         btrfs_drop_pages(pages, num_pages);
1780         if (pos > isize)
1781                 vmtruncate(inode, isize);
1782         return err;
1783 }
1784
1785 static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
1786                                 size_t count, loff_t *ppos)
1787 {
1788         loff_t pos;
1789         size_t num_written = 0;
1790         int err = 0;
1791         int ret = 0;
1792         struct inode *inode = file->f_path.dentry->d_inode;
1793         struct btrfs_root *root = BTRFS_I(inode)->root;
1794         struct page *pages[8];
1795         struct page *pinned[2] = { NULL, NULL };
1796         unsigned long first_index;
1797         unsigned long last_index;
1798         u64 start_pos;
1799         u64 num_blocks;
1800         u64 alloc_extent_start;
1801         struct btrfs_trans_handle *trans;
1802         struct btrfs_key ins;
1803
1804         if (file->f_flags & O_DIRECT)
1805                 return -EINVAL;
1806         pos = *ppos;
1807         vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
1808         current->backing_dev_info = inode->i_mapping->backing_dev_info;
1809         err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode));
1810         if (err)
1811                 goto out;
1812         if (count == 0)
1813                 goto out;
1814         err = remove_suid(file->f_path.dentry);
1815         if (err)
1816                 goto out;
1817         file_update_time(file);
1818
1819         start_pos = pos & ~((u64)PAGE_CACHE_SIZE - 1);
1820         num_blocks = (count + pos - start_pos + root->blocksize - 1) >>
1821                         inode->i_blkbits;
1822
1823         mutex_lock(&inode->i_mutex);
1824         first_index = pos >> PAGE_CACHE_SHIFT;
1825         last_index = (pos + count) >> PAGE_CACHE_SHIFT;
1826
1827         if ((first_index << PAGE_CACHE_SHIFT) < inode->i_size &&
1828             (pos & (PAGE_CACHE_SIZE - 1))) {
1829                 pinned[0] = grab_cache_page(inode->i_mapping, first_index);
1830                 if (!PageUptodate(pinned[0])) {
1831                         ret = mpage_readpage(pinned[0], btrfs_get_block);
1832                         BUG_ON(ret);
1833                 } else {
1834                         unlock_page(pinned[0]);
1835                 }
1836         }
1837         if (first_index != last_index &&
1838             (last_index << PAGE_CACHE_SHIFT) < inode->i_size &&
1839             (count & (PAGE_CACHE_SIZE - 1))) {
1840                 pinned[1] = grab_cache_page(inode->i_mapping, last_index);
1841                 if (!PageUptodate(pinned[1])) {
1842                         ret = mpage_readpage(pinned[1], btrfs_get_block);
1843                         BUG_ON(ret);
1844                 } else {
1845                         unlock_page(pinned[1]);
1846                 }
1847         }
1848
1849         mutex_lock(&root->fs_info->fs_mutex);
1850         trans = btrfs_start_transaction(root, 1);
1851         if (!trans) {
1852                 err = -ENOMEM;
1853                 mutex_unlock(&root->fs_info->fs_mutex);
1854                 goto out_unlock;
1855         }
1856         btrfs_set_trans_block_group(trans, inode);
1857         /* FIXME blocksize != 4096 */
1858         inode->i_blocks += num_blocks << 3;
1859         if (start_pos < inode->i_size) {
1860                 /* FIXME blocksize != pagesize */
1861                 ret = drop_extents(trans, root, inode,
1862                                    start_pos,
1863                                    (pos + count + root->blocksize -1) &
1864                                    ~((u64)root->blocksize - 1));
1865                 BUG_ON(ret);
1866         }
1867         if (inode->i_size >= PAGE_CACHE_SIZE || pos + count < inode->i_size ||
1868             pos + count - start_pos > BTRFS_MAX_INLINE_DATA_SIZE(root)) {
1869                 ret = btrfs_alloc_extent(trans, root, inode->i_ino,
1870                                          num_blocks, 1, (u64)-1, &ins);
1871                 BUG_ON(ret);
1872                 ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
1873                                        start_pos, ins.objectid, ins.offset);
1874                 BUG_ON(ret);
1875         } else {
1876                 ins.offset = 0;
1877                 ins.objectid = 0;
1878         }
1879         BUG_ON(ret);
1880         alloc_extent_start = ins.objectid;
1881         btrfs_update_inode_block_group(trans, inode);
1882         ret = btrfs_end_transaction(trans, root);
1883         mutex_unlock(&root->fs_info->fs_mutex);
1884
1885         while(count > 0) {
1886                 size_t offset = pos & (PAGE_CACHE_SIZE - 1);
1887                 size_t write_bytes = min(count, PAGE_CACHE_SIZE - offset);
1888                 size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >>
1889                                         PAGE_CACHE_SHIFT;
1890
1891                 memset(pages, 0, sizeof(pages));
1892                 ret = prepare_pages(root, file, pages, num_pages,
1893                                     pos, first_index, last_index,
1894                                     write_bytes, alloc_extent_start);
1895                 BUG_ON(ret);
1896
1897                 /* FIXME blocks != pagesize */
1898                 if (alloc_extent_start)
1899                         alloc_extent_start += num_pages;
1900                 ret = btrfs_copy_from_user(pos, num_pages,
1901                                            write_bytes, pages, buf);
1902                 BUG_ON(ret);
1903
1904                 ret = dirty_and_release_pages(NULL, root, file, pages,
1905                                               num_pages, pos, write_bytes);
1906                 BUG_ON(ret);
1907                 btrfs_drop_pages(pages, num_pages);
1908
1909                 buf += write_bytes;
1910                 count -= write_bytes;
1911                 pos += write_bytes;
1912                 num_written += write_bytes;
1913
1914                 balance_dirty_pages_ratelimited(inode->i_mapping);
1915                 cond_resched();
1916         }
1917 out_unlock:
1918         mutex_unlock(&inode->i_mutex);
1919 out:
1920         if (pinned[0])
1921                 page_cache_release(pinned[0]);
1922         if (pinned[1])
1923                 page_cache_release(pinned[1]);
1924         *ppos = pos;
1925         current->backing_dev_info = NULL;
1926         mark_inode_dirty(inode);
1927         return num_written ? num_written : err;
1928 }
1929
1930 static int btrfs_read_actor(read_descriptor_t *desc, struct page *page,
1931                         unsigned long offset, unsigned long size)
1932 {
1933         char *kaddr;
1934         unsigned long left, count = desc->count;
1935         struct inode *inode = page->mapping->host;
1936
1937         if (size > count)
1938                 size = count;
1939
1940         if (!PageChecked(page)) {
1941                 /* FIXME, do it per block */
1942                 struct btrfs_root *root = BTRFS_I(inode)->root;
1943
1944                 int ret = btrfs_csum_verify_file_block(root,
1945                                   page->mapping->host->i_ino,
1946                                   page->index << PAGE_CACHE_SHIFT,
1947                                   kmap(page), PAGE_CACHE_SIZE);
1948                 if (ret) {
1949                         printk("failed to verify ino %lu page %lu\n",
1950                                page->mapping->host->i_ino,
1951                                page->index);
1952                         memset(page_address(page), 0, PAGE_CACHE_SIZE);
1953                 }
1954                 SetPageChecked(page);
1955                 kunmap(page);
1956         }
1957         /*
1958          * Faults on the destination of a read are common, so do it before
1959          * taking the kmap.
1960          */
1961         if (!fault_in_pages_writeable(desc->arg.buf, size)) {
1962                 kaddr = kmap_atomic(page, KM_USER0);
1963                 left = __copy_to_user_inatomic(desc->arg.buf,
1964                                                 kaddr + offset, size);
1965                 kunmap_atomic(kaddr, KM_USER0);
1966                 if (left == 0)
1967                         goto success;
1968         }
1969
1970         /* Do it the slow way */
1971         kaddr = kmap(page);
1972         left = __copy_to_user(desc->arg.buf, kaddr + offset, size);
1973         kunmap(page);
1974
1975         if (left) {
1976                 size -= left;
1977                 desc->error = -EFAULT;
1978         }
1979 success:
1980         desc->count = count - size;
1981         desc->written += size;
1982         desc->arg.buf += size;
1983         return size;
1984 }
1985
1986 /**
1987  * btrfs_file_aio_read - filesystem read routine
1988  * @iocb:       kernel I/O control block
1989  * @iov:        io vector request
1990  * @nr_segs:    number of segments in the iovec
1991  * @pos:        current file position
1992  */
1993 static ssize_t btrfs_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
1994                                    unsigned long nr_segs, loff_t pos)
1995 {
1996         struct file *filp = iocb->ki_filp;
1997         ssize_t retval;
1998         unsigned long seg;
1999         size_t count;
2000         loff_t *ppos = &iocb->ki_pos;
2001
2002         count = 0;
2003         for (seg = 0; seg < nr_segs; seg++) {
2004                 const struct iovec *iv = &iov[seg];
2005
2006                 /*
2007                  * If any segment has a negative length, or the cumulative
2008                  * length ever wraps negative then return -EINVAL.
2009                  */
2010                 count += iv->iov_len;
2011                 if (unlikely((ssize_t)(count|iv->iov_len) < 0))
2012                         return -EINVAL;
2013                 if (access_ok(VERIFY_WRITE, iv->iov_base, iv->iov_len))
2014                         continue;
2015                 if (seg == 0)
2016                         return -EFAULT;
2017                 nr_segs = seg;
2018                 count -= iv->iov_len;   /* This segment is no good */
2019                 break;
2020         }
2021         retval = 0;
2022         if (count) {
2023                 for (seg = 0; seg < nr_segs; seg++) {
2024                         read_descriptor_t desc;
2025
2026                         desc.written = 0;
2027                         desc.arg.buf = iov[seg].iov_base;
2028                         desc.count = iov[seg].iov_len;
2029                         if (desc.count == 0)
2030                                 continue;
2031                         desc.error = 0;
2032                         do_generic_file_read(filp, ppos, &desc,
2033                                              btrfs_read_actor);
2034                         retval += desc.written;
2035                         if (desc.error) {
2036                                 retval = retval ?: desc.error;
2037                                 break;
2038                         }
2039                 }
2040         }
2041         return retval;
2042 }
2043
2044 static int create_subvol(struct btrfs_root *root, char *name, int namelen)
2045 {
2046         struct btrfs_trans_handle *trans;
2047         struct btrfs_key key;
2048         struct btrfs_root_item root_item;
2049         struct btrfs_inode_item *inode_item;
2050         struct buffer_head *subvol;
2051         struct btrfs_leaf *leaf;
2052         struct btrfs_root *new_root;
2053         struct inode *inode;
2054         struct inode *dir;
2055         int ret;
2056         u64 objectid;
2057         u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID;
2058
2059         mutex_lock(&root->fs_info->fs_mutex);
2060         trans = btrfs_start_transaction(root, 1);
2061         BUG_ON(!trans);
2062
2063         subvol = btrfs_alloc_free_block(trans, root, 0);
2064         if (subvol == NULL)
2065                 return -ENOSPC;
2066         leaf = btrfs_buffer_leaf(subvol);
2067         btrfs_set_header_nritems(&leaf->header, 0);
2068         btrfs_set_header_level(&leaf->header, 0);
2069         btrfs_set_header_blocknr(&leaf->header, bh_blocknr(subvol));
2070         btrfs_set_header_generation(&leaf->header, trans->transid);
2071         btrfs_set_header_owner(&leaf->header, root->root_key.objectid);
2072         memcpy(leaf->header.fsid, root->fs_info->disk_super->fsid,
2073                sizeof(leaf->header.fsid));
2074         mark_buffer_dirty(subvol);
2075
2076         inode_item = &root_item.inode;
2077         memset(inode_item, 0, sizeof(*inode_item));
2078         btrfs_set_inode_generation(inode_item, 1);
2079         btrfs_set_inode_size(inode_item, 3);
2080         btrfs_set_inode_nlink(inode_item, 1);
2081         btrfs_set_inode_nblocks(inode_item, 1);
2082         btrfs_set_inode_mode(inode_item, S_IFDIR | 0755);
2083
2084         btrfs_set_root_blocknr(&root_item, bh_blocknr(subvol));
2085         btrfs_set_root_refs(&root_item, 1);
2086         brelse(subvol);
2087         subvol = NULL;
2088
2089         ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
2090                                        0, &objectid);
2091         BUG_ON(ret);
2092
2093         btrfs_set_root_dirid(&root_item, new_dirid);
2094
2095         key.objectid = objectid;
2096         key.offset = 1;
2097         key.flags = 0;
2098         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2099         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
2100                                 &root_item);
2101         BUG_ON(ret);
2102
2103         /*
2104          * insert the directory item
2105          */
2106         key.offset = (u64)-1;
2107         dir = root->fs_info->sb->s_root->d_inode;
2108         ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
2109                                     name, namelen, dir->i_ino, &key, 0);
2110         BUG_ON(ret);
2111
2112         ret = btrfs_commit_transaction(trans, root);
2113         BUG_ON(ret);
2114
2115         new_root = btrfs_read_fs_root(root->fs_info, &key);
2116         BUG_ON(!new_root);
2117
2118         trans = btrfs_start_transaction(new_root, 1);
2119         BUG_ON(!trans);
2120
2121         inode = btrfs_new_inode(trans, new_root, new_dirid,
2122                                 BTRFS_I(dir)->block_group, S_IFDIR | 0700);
2123         inode->i_op = &btrfs_dir_inode_operations;
2124         inode->i_fop = &btrfs_dir_file_operations;
2125
2126         ret = btrfs_make_empty_dir(trans, new_root, new_dirid, new_dirid);
2127         BUG_ON(ret);
2128
2129         inode->i_nlink = 1;
2130         inode->i_size = 6;
2131         ret = btrfs_update_inode(trans, new_root, inode);
2132         BUG_ON(ret);
2133
2134         ret = btrfs_commit_transaction(trans, new_root);
2135         BUG_ON(ret);
2136
2137         iput(inode);
2138
2139         mutex_unlock(&root->fs_info->fs_mutex);
2140         return 0;
2141 }
2142
2143 static int create_snapshot(struct btrfs_root *root, char *name, int namelen)
2144 {
2145         struct btrfs_trans_handle *trans;
2146         struct btrfs_key key;
2147         struct btrfs_root_item new_root_item;
2148         int ret;
2149         u64 objectid;
2150
2151         if (!root->ref_cows)
2152                 return -EINVAL;
2153
2154         mutex_lock(&root->fs_info->fs_mutex);
2155         trans = btrfs_start_transaction(root, 1);
2156         BUG_ON(!trans);
2157
2158         ret = btrfs_update_inode(trans, root, root->inode);
2159         BUG_ON(ret);
2160
2161         ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
2162                                        0, &objectid);
2163         BUG_ON(ret);
2164
2165         memcpy(&new_root_item, &root->root_item,
2166                sizeof(new_root_item));
2167
2168         key.objectid = objectid;
2169         key.offset = 1;
2170         key.flags = 0;
2171         btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
2172         btrfs_set_root_blocknr(&new_root_item, bh_blocknr(root->node));
2173
2174         ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
2175                                 &new_root_item);
2176         BUG_ON(ret);
2177
2178         /*
2179          * insert the directory item
2180          */
2181         key.offset = (u64)-1;
2182         ret = btrfs_insert_dir_item(trans, root->fs_info->tree_root,
2183                                     name, namelen,
2184                                     root->fs_info->sb->s_root->d_inode->i_ino,
2185                                     &key, 0);
2186
2187         BUG_ON(ret);
2188
2189         ret = btrfs_inc_root_ref(trans, root);
2190         BUG_ON(ret);
2191
2192         ret = btrfs_commit_transaction(trans, root);
2193         BUG_ON(ret);
2194         mutex_unlock(&root->fs_info->fs_mutex);
2195         return 0;
2196 }
2197
2198 static int add_disk(struct btrfs_root *root, char *name, int namelen)
2199 {
2200         struct block_device *bdev;
2201         struct btrfs_path *path;
2202         struct super_block *sb = root->fs_info->sb;
2203         struct btrfs_root *dev_root = root->fs_info->dev_root;
2204         struct btrfs_trans_handle *trans;
2205         struct btrfs_device_item *dev_item;
2206         struct btrfs_key key;
2207         u16 item_size;
2208         u64 num_blocks;
2209         u64 new_blocks;
2210         u64 device_id;
2211         int ret;
2212
2213 printk("adding disk %s\n", name);
2214         path = btrfs_alloc_path();
2215         if (!path)
2216                 return -ENOMEM;
2217         num_blocks = btrfs_super_total_blocks(root->fs_info->disk_super);
2218         bdev = open_bdev_excl(name, O_RDWR, sb);
2219         if (IS_ERR(bdev)) {
2220                 ret = PTR_ERR(bdev);
2221 printk("open bdev excl failed ret %d\n", ret);
2222                 goto out_nolock;
2223         }
2224         set_blocksize(bdev, sb->s_blocksize);
2225         new_blocks = bdev->bd_inode->i_size >> sb->s_blocksize_bits;
2226         key.objectid = num_blocks;
2227         key.offset = new_blocks;
2228         key.flags = 0;
2229         btrfs_set_key_type(&key, BTRFS_DEV_ITEM_KEY);
2230
2231         mutex_lock(&dev_root->fs_info->fs_mutex);
2232         trans = btrfs_start_transaction(dev_root, 1);
2233         item_size = sizeof(*dev_item) + namelen;
2234 printk("insert empty on %Lu %Lu %u size %d\n", num_blocks, new_blocks, key.flags, item_size);
2235         ret = btrfs_insert_empty_item(trans, dev_root, path, &key, item_size);
2236         if (ret) {
2237 printk("insert failed %d\n", ret);
2238                 close_bdev_excl(bdev);
2239                 if (ret > 0)
2240                         ret = -EEXIST;
2241                 goto out;
2242         }
2243         dev_item = btrfs_item_ptr(btrfs_buffer_leaf(path->nodes[0]),
2244                                   path->slots[0], struct btrfs_device_item);
2245         btrfs_set_device_pathlen(dev_item, namelen);
2246         memcpy(dev_item + 1, name, namelen);
2247
2248         device_id = btrfs_super_last_device_id(root->fs_info->disk_super) + 1;
2249         btrfs_set_super_last_device_id(root->fs_info->disk_super, device_id);
2250         btrfs_set_device_id(dev_item, device_id);
2251         mark_buffer_dirty(path->nodes[0]);
2252
2253         ret = btrfs_insert_dev_radix(root, bdev, device_id, num_blocks,
2254                                      new_blocks);
2255
2256         if (!ret) {
2257                 btrfs_set_super_total_blocks(root->fs_info->disk_super,
2258                                              num_blocks + new_blocks);
2259                 i_size_write(root->fs_info->btree_inode,
2260                              (num_blocks + new_blocks) <<
2261                              root->fs_info->btree_inode->i_blkbits);
2262         }
2263
2264 out:
2265         ret = btrfs_commit_transaction(trans, dev_root);
2266         BUG_ON(ret);
2267         mutex_unlock(&root->fs_info->fs_mutex);
2268 out_nolock:
2269         btrfs_free_path(path);
2270
2271         return ret;
2272 }
2273
2274 static int btrfs_ioctl(struct inode *inode, struct file *filp, unsigned int
2275                        cmd, unsigned long arg)
2276 {
2277         struct btrfs_root *root = BTRFS_I(inode)->root;
2278         struct btrfs_ioctl_vol_args vol_args;
2279         int ret = 0;
2280         struct btrfs_dir_item *di;
2281         int namelen;
2282         struct btrfs_path *path;
2283         u64 root_dirid;
2284
2285         switch (cmd) {
2286         case BTRFS_IOC_SNAP_CREATE:
2287                 if (copy_from_user(&vol_args,
2288                                    (struct btrfs_ioctl_vol_args __user *)arg,
2289                                    sizeof(vol_args)))
2290                         return -EFAULT;
2291                 namelen = strlen(vol_args.name);
2292                 if (namelen > BTRFS_VOL_NAME_MAX)
2293                         return -EINVAL;
2294                 path = btrfs_alloc_path();
2295                 if (!path)
2296                         return -ENOMEM;
2297                 root_dirid = root->fs_info->sb->s_root->d_inode->i_ino,
2298                 mutex_lock(&root->fs_info->fs_mutex);
2299                 di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root,
2300                                     path, root_dirid,
2301                                     vol_args.name, namelen, 0);
2302                 mutex_unlock(&root->fs_info->fs_mutex);
2303                 btrfs_free_path(path);
2304                 if (di && !IS_ERR(di))
2305                         return -EEXIST;
2306
2307                 if (root == root->fs_info->tree_root)
2308                         ret = create_subvol(root, vol_args.name, namelen);
2309                 else
2310                         ret = create_snapshot(root, vol_args.name, namelen);
2311                 WARN_ON(ret);
2312                 break;
2313         case BTRFS_IOC_ADD_DISK:
2314                 if (copy_from_user(&vol_args,
2315                                    (struct btrfs_ioctl_vol_args __user *)arg,
2316                                    sizeof(vol_args)))
2317                         return -EFAULT;
2318                 namelen = strlen(vol_args.name);
2319                 if (namelen > BTRFS_VOL_NAME_MAX)
2320                         return -EINVAL;
2321                 vol_args.name[namelen] = '\0';
2322                 ret = add_disk(root, vol_args.name, namelen);
2323                 break;
2324         default:
2325                 return -ENOTTY;
2326         }
2327         return ret;
2328 }
2329
/*
 * Slab caches used by btrfs.
 *
 * btrfs_inode_cachep backs btrfs_alloc_inode()/btrfs_destroy_inode() and
 * is private to this file.  init_inodecache() creates it, along with the
 * trans handle and transaction caches, sized for struct btrfs_inode,
 * struct btrfs_trans_handle and struct btrfs_transaction respectively.
 *
 * NOTE(review): the non-static caches are presumably consumed from other
 * btrfs source files; the creation of the bit radix and path caches is
 * not visible in this part of the file -- confirm in init_inodecache().
 */
2330 static struct kmem_cache *btrfs_inode_cachep;
2331 struct kmem_cache *btrfs_trans_handle_cachep;
2332 struct kmem_cache *btrfs_transaction_cachep;
2333 struct kmem_cache *btrfs_bit_radix_cachep;
2334 struct kmem_cache *btrfs_path_cachep;
2335
2336 /*
2337  * Called inside transaction, so use GFP_NOFS
2338  */
2339 static struct inode *btrfs_alloc_inode(struct super_block *sb)
2340 {
2341         struct btrfs_inode *ei;
2342
2343         ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS);
2344         if (!ei)
2345                 return NULL;
2346         return &ei->vfs_inode;
2347 }
2348
2349 static void btrfs_destroy_inode(struct inode *inode)
2350 {
2351         WARN_ON(!list_empty(&inode->i_dentry));
2352         WARN_ON(inode->i_data.nrpages);
2353
2354         kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
2355 }
2356
2357 static void init_once(void * foo, struct kmem_cache * cachep,
2358                       unsigned long flags)
2359 {
2360         struct btrfs_inode *ei = (struct btrfs_inode *) foo;
2361
2362         if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
2363             SLAB_CTOR_CONSTRUCTOR) {
2364                 inode_init_once(&ei->vfs_inode);
2365         }
2366 }
2367
2368 static int init_inodecache(void)
2369 {
2370         btrfs_inode_cachep = kmem_cache_create("btrfs_inode_cache",
2371                                              sizeof(struct btrfs_inode),
2372                                              0, (SLAB_RECLAIM_ACCOUNT|
2373                                                 SLAB_MEM_SPREAD),
2374                                              init_once, NULL);
2375         btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle_cache",
2376                                              sizeof(struct btrfs_trans_handle),
2377                                              0, (SLAB_RECLAIM_ACCOUNT|
2378                                                 SLAB_MEM_SPREAD),
2379                                              NULL, NULL);
2380         btrfs_transaction_cachep = kmem_cache_create("btrfs_transaction_cache",
2381                                              sizeof(struct btrfs_transaction),
2382                                              0, (SLAB_RECLAIM_ACCOUNT|
2383                                                 SLAB_MEM_SPREAD),
2384                                              NULL, NULL);
2385         btrfs_path_cachep = kmem_cache_create("btrfs_path_cache",
2386                                              sizeof(struct btrfs_transaction),
2387                                              0, (SLAB_RECLAIM_ACCOUNT|
2388                                                 SLAB_MEM_SPREAD),
2389                                              NULL, NULL);
2390         btrfs_bit_radix_cachep = kmem_cache_create("btrfs_radix",
2391                                              256,
2392                                              0, (SLAB_RECLAIM_ACCOUNT|
2393                                                 SLAB_MEM_SPREAD |
2394                                                 SLAB_DESTROY_BY_RCU),
2395                                              NULL, NULL);
2396         if (btrfs_inode_cachep == NULL || btrfs_trans_handle_cachep == NULL ||
2397             btrfs_transaction_cachep == NULL || btrfs_bit_radix_cachep == NULL)
2398                 return -ENOMEM;
2399         return 0;
2400 }
2401
2402 static void destroy_inodecache(void)
2403 {
2404         kmem_cache_destroy(btrfs_inode_cachep);
2405         kmem_cache_destroy(btrfs_trans_handle_cachep);
2406         kmem_cache_destroy(btrfs_transaction_cachep);
2407         kmem_cache_destroy(btrfs_bit_radix_cachep);
2408         kmem_cache_destroy(btrfs_path_cachep);
2409 }
2410
2411 static int btrfs_get_sb(struct file_system_type *fs_type,
2412         int flags, const char *dev_name, void *data, struct vfsmount *mnt)
2413 {
2414         return get_sb_bdev(fs_type, flags, dev_name, data,
2415                            btrfs_fill_super, mnt);
2416 }
2417
2418
2419 static int btrfs_getattr(struct vfsmount *mnt,
2420                          struct dentry *dentry, struct kstat *stat)
2421 {
2422         struct inode *inode = dentry->d_inode;
2423         generic_fillattr(inode, stat);
2424         stat->blksize = 256 * 1024;
2425         return 0;
2426 }
2427
2428 static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf)
2429 {
2430         struct btrfs_root *root = btrfs_sb(dentry->d_sb);
2431         struct btrfs_super_block *disk_super = root->fs_info->disk_super;
2432
2433         buf->f_namelen = BTRFS_NAME_LEN;
2434         buf->f_blocks = btrfs_super_total_blocks(disk_super);
2435         buf->f_bfree = buf->f_blocks - btrfs_super_blocks_used(disk_super);
2436         buf->f_bavail = buf->f_bfree;
2437         buf->f_bsize = dentry->d_sb->s_blocksize;
2438         buf->f_type = BTRFS_SUPER_MAGIC;
2439         return 0;
2440 }
2441
2442 static struct file_system_type btrfs_fs_type = {
2443         .owner          = THIS_MODULE,
2444         .name           = "btrfs",
2445         .get_sb         = btrfs_get_sb,
2446         .kill_sb        = kill_block_super,
2447         .fs_flags       = FS_REQUIRES_DEV,
2448 };
2449
2450 static struct super_operations btrfs_super_ops = {
2451         .delete_inode   = btrfs_delete_inode,
2452         .put_super      = btrfs_put_super,
2453         .read_inode     = btrfs_read_locked_inode,
2454         .write_super    = btrfs_write_super,
2455         .sync_fs        = btrfs_sync_fs,
2456         .write_inode    = btrfs_write_inode,
2457         .dirty_inode    = btrfs_dirty_inode,
2458         .alloc_inode    = btrfs_alloc_inode,
2459         .destroy_inode  = btrfs_destroy_inode,
2460         .statfs         = btrfs_statfs,
2461 };
2462
2463 static struct inode_operations btrfs_dir_inode_operations = {
2464         .lookup         = btrfs_lookup,
2465         .create         = btrfs_create,
2466         .unlink         = btrfs_unlink,
2467         .mkdir          = btrfs_mkdir,
2468         .rmdir          = btrfs_rmdir,
2469 };
2470
2471 static struct inode_operations btrfs_dir_ro_inode_operations = {
2472         .lookup         = btrfs_lookup,
2473 };
2474
2475 static struct file_operations btrfs_dir_file_operations = {
2476         .llseek         = generic_file_llseek,
2477         .read           = generic_read_dir,
2478         .readdir        = btrfs_readdir,
2479         .ioctl          = btrfs_ioctl,
2480 };
2481
2482 static struct address_space_operations btrfs_aops = {
2483         .readpage       = btrfs_readpage,
2484         .writepage      = btrfs_writepage,
2485         .sync_page      = block_sync_page,
2486         .prepare_write  = btrfs_prepare_write,
2487         .commit_write   = btrfs_commit_write,
2488 };
2489
2490 static struct inode_operations btrfs_file_inode_operations = {
2491         .truncate       = btrfs_truncate,
2492         .getattr        = btrfs_getattr,
2493 };
2494
2495 static struct file_operations btrfs_file_operations = {
2496         .llseek         = generic_file_llseek,
2497         .read           = do_sync_read,
2498         .aio_read       = btrfs_file_aio_read,
2499         .write          = btrfs_file_write,
2500         .mmap           = generic_file_mmap,
2501         .open           = generic_file_open,
2502         .ioctl          = btrfs_ioctl,
2503         .fsync          = btrfs_sync_file,
2504 };
2505
2506 static int __init init_btrfs_fs(void)
2507 {
2508         int err;
2509         printk("btrfs loaded!\n");
2510         err = init_inodecache();
2511         if (err)
2512                 return err;
2513         kset_set_kset_s(&btrfs_subsys, fs_subsys);
2514         err = subsystem_register(&btrfs_subsys);
2515         if (err)
2516                 goto out;
2517         return register_filesystem(&btrfs_fs_type);
2518 out:
2519         destroy_inodecache();
2520         return err;
2521 }
2522
2523 static void __exit exit_btrfs_fs(void)
2524 {
2525         destroy_inodecache();
2526         unregister_filesystem(&btrfs_fs_type);
2527         subsystem_unregister(&btrfs_subsys);
2528         printk("btrfs unloaded\n");
2529 }
2530
2531 module_init(init_btrfs_fs)
2532 module_exit(exit_btrfs_fs)
2533
2534 MODULE_LICENSE("GPL");