diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 3d698e2..6d34dc7 100644
@@ -72,8 +72,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block,
                return err;
        }
 
-       ubifs_assert(dn->ch.sqnum > ubifs_inode(inode)->creat_sqnum);
-
+       ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
+                    ubifs_inode(inode)->creat_sqnum);
        len = le32_to_cpu(dn->size);
        if (len <= 0 || len > UBIFS_BLOCK_SIZE)
                goto dump;
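
The assertion change above is an endianness correction: @sqnum in the common node header (struct ubifs_ch) is stored little-endian on the media, so it has to go through le64_to_cpu() before being compared with the host-order @creat_sqnum. A minimal sketch of the rule, with a hypothetical helper name and assuming the UBIFS headers for the types:

        /*
         * Hypothetical helper illustrating the byte-order rule: on-media
         * fields such as ch.sqnum are __le64/__le32 and must be converted
         * before any host-order comparison.
         */
        static int data_node_is_stale(const struct ubifs_data_node *dn,
                                      const struct ubifs_inode *ui)
        {
                return le64_to_cpu(dn->ch.sqnum) <= ui->creat_sqnum;
        }
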
@@ -147,6 +147,12 @@ static int do_readpage(struct page *page)
                                err = ret;
                                if (err != -ENOENT)
                                        break;
+                       } else if (block + 1 == beyond) {
+                               int dlen = le32_to_cpu(dn->size);
+                               int ilen = i_size & (UBIFS_BLOCK_SIZE - 1);
+
+                               if (ilen && ilen < dlen)
+                                       memset(addr + ilen, 0, dlen - ilen);
                        }
                }
                if (++i >= UBIFS_BLOCKS_PER_PAGE)
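
The new branch in do_readpage() handles the last block of a file: when i_size ends inside that block but the data node carries more bytes than i_size allows, the extra bytes are zeroed so stale data never becomes visible through the page cache. A worked example of the masking arithmetic, assuming UBIFS_BLOCK_SIZE is 4096 and an example i_size of 10000 (addr is the block's mapped address from the surrounding function):

        loff_t i_size = 10000;                          /* example file size */
        int dlen = 4096;                                /* bytes held by the data node */
        int ilen = i_size & (UBIFS_BLOCK_SIZE - 1);     /* 10000 & 4095 == 1808 */

        if (ilen && ilen < dlen)
                memset(addr + ilen, 0, dlen - ilen);    /* zero stale bytes 1808..4095 */
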
@@ -213,7 +219,8 @@ static void release_existing_page_budget(struct ubifs_info *c)
 }
 
 static int write_begin_slow(struct address_space *mapping,
-                           loff_t pos, unsigned len, struct page **pagep)
+                           loff_t pos, unsigned len, struct page **pagep,
+                           unsigned flags)
 {
        struct inode *inode = mapping->host;
        struct ubifs_info *c = inode->i_sb->s_fs_info;
@@ -241,14 +248,14 @@ static int write_begin_slow(struct address_space *mapping,
        if (unlikely(err))
                return err;
 
-       page = __grab_cache_page(mapping, index);
+       page = grab_cache_page_write_begin(mapping, index, flags);
        if (unlikely(!page)) {
                ubifs_release_budget(c, &req);
                return -ENOMEM;
        }
 
        if (!PageUptodate(page)) {
-               if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+               if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
                        SetPageChecked(page);
                else {
                        err = do_readpage(page);
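
The second change in this hunk (repeated below in ubifs_write_begin()) fixes an inverted mask: PAGE_CACHE_MASK keeps the page-aligned part of @pos, so the old test !(pos & PAGE_CACHE_MASK) was only true inside the very first page of the file. The offset within the page is pos & ~PAGE_CACHE_MASK, which is what the "whole page is overwritten, no need to read it" test needs. A small sketch with example values, assuming 4096-byte pages:

        loff_t pos = 8292;                       /* example: page 2, offset 100 */

        loff_t aligned = pos & PAGE_CACHE_MASK;  /* 8192 -- start of the page      */
        loff_t in_page = pos & ~PAGE_CACHE_MASK; /* 100  -- offset inside the page */

        /* The page is fully overwritten only if in_page == 0 and len == PAGE_CACHE_SIZE. */
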
@@ -423,22 +430,22 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
        struct ubifs_inode *ui = ubifs_inode(inode);
        pgoff_t index = pos >> PAGE_CACHE_SHIFT;
        int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
+       int skipped_read = 0;
        struct page *page;
 
-
        ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
 
        if (unlikely(c->ro_media))
                return -EROFS;
 
        /* Try out the fast-path part first */
-       page = __grab_cache_page(mapping, index);
+       page = grab_cache_page_write_begin(mapping, index, flags);
        if (unlikely(!page))
                return -ENOMEM;
 
        if (!PageUptodate(page)) {
                /* The page is not loaded from the flash */
-               if (!(pos & PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE)
+               if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
                        /*
                         * We change whole page so no need to load it. But we
                         * have to set the @PG_checked flag to make the further
@@ -447,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
                         * the media.
                         */
                        SetPageChecked(page);
-               else {
+                       skipped_read = 1;
+               } else {
                        err = do_readpage(page);
                        if (err) {
                                unlock_page(page);
@@ -464,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
        if (unlikely(err)) {
                ubifs_assert(err == -ENOSPC);
                /*
+                * If we skipped reading the page because we were going to
+                * write all of it, then it is not up to date.
+                */
+               if (skipped_read) {
+                       ClearPageChecked(page);
+                       ClearPageUptodate(page);
+               }
+               /*
                 * Budgeting failed which means it would have to force
                 * write-back but didn't, because we set the @fast flag in the
                 * request. Write-back cannot be done now, while we have the
@@ -477,7 +493,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
                unlock_page(page);
                page_cache_release(page);
 
-               return write_begin_slow(mapping, pos, len, pagep);
+               return write_begin_slow(mapping, pos, len, pagep, flags);
        }
 
        /*
@@ -577,8 +593,297 @@ out:
        return copied;
 }
 
+/**
+ * populate_page - copy data nodes into a page for bulk-read.
+ * @c: UBIFS file-system description object
+ * @page: page
+ * @bu: bulk-read information
+ * @n: next zbranch slot
+ *
+ * This function returns %0 on success and a negative error code on failure.
+ */
+static int populate_page(struct ubifs_info *c, struct page *page,
+                        struct bu_info *bu, int *n)
+{
+       int i = 0, nn = *n, offs = bu->zbranch[0].offs, hole = 0, read = 0;
+       struct inode *inode = page->mapping->host;
+       loff_t i_size = i_size_read(inode);
+       unsigned int page_block;
+       void *addr, *zaddr;
+       pgoff_t end_index;
+
+       dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx",
+               inode->i_ino, page->index, i_size, page->flags);
+
+       addr = zaddr = kmap(page);
+
+       end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+       if (!i_size || page->index > end_index) {
+               hole = 1;
+               memset(addr, 0, PAGE_CACHE_SIZE);
+               goto out_hole;
+       }
+
+       page_block = page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT;
+       while (1) {
+               int err, len, out_len, dlen;
+
+               if (nn >= bu->cnt) {
+                       hole = 1;
+                       memset(addr, 0, UBIFS_BLOCK_SIZE);
+               } else if (key_block(c, &bu->zbranch[nn].key) == page_block) {
+                       struct ubifs_data_node *dn;
+
+                       dn = bu->buf + (bu->zbranch[nn].offs - offs);
+
+                       ubifs_assert(le64_to_cpu(dn->ch.sqnum) >
+                                    ubifs_inode(inode)->creat_sqnum);
+
+                       len = le32_to_cpu(dn->size);
+                       if (len <= 0 || len > UBIFS_BLOCK_SIZE)
+                               goto out_err;
+
+                       dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ;
+                       out_len = UBIFS_BLOCK_SIZE;
+                       err = ubifs_decompress(&dn->data, dlen, addr, &out_len,
+                                              le16_to_cpu(dn->compr_type));
+                       if (err || len != out_len)
+                               goto out_err;
+
+                       if (len < UBIFS_BLOCK_SIZE)
+                               memset(addr + len, 0, UBIFS_BLOCK_SIZE - len);
+
+                       nn += 1;
+                       read = (i << UBIFS_BLOCK_SHIFT) + len;
+               } else if (key_block(c, &bu->zbranch[nn].key) < page_block) {
+                       nn += 1;
+                       continue;
+               } else {
+                       hole = 1;
+                       memset(addr, 0, UBIFS_BLOCK_SIZE);
+               }
+               if (++i >= UBIFS_BLOCKS_PER_PAGE)
+                       break;
+               addr += UBIFS_BLOCK_SIZE;
+               page_block += 1;
+       }
+
+       if (end_index == page->index) {
+               int len = i_size & (PAGE_CACHE_SIZE - 1);
+
+               if (len && len < read)
+                       memset(zaddr + len, 0, read - len);
+       }
+
+out_hole:
+       if (hole) {
+               SetPageChecked(page);
+               dbg_gen("hole");
+       }
+
+       SetPageUptodate(page);
+       ClearPageError(page);
+       flush_dcache_page(page);
+       kunmap(page);
+       *n = nn;
+       return 0;
+
+out_err:
+       ClearPageUptodate(page);
+       SetPageError(page);
+       flush_dcache_page(page);
+       kunmap(page);
+       ubifs_err("bad data node (block %u, inode %lu)",
+                 page_block, inode->i_ino);
+       return -EINVAL;
+}
+
+/**
+ * ubifs_do_bulk_read - do bulk-read.
+ * @c: UBIFS file-system description object
+ * @bu: bulk-read information
+ * @page1: first page to read
+ *
+ * This function returns %1 if the bulk-read is done, otherwise %0 is returned.
+ */
+static int ubifs_do_bulk_read(struct ubifs_info *c, struct bu_info *bu,
+                             struct page *page1)
+{
+       pgoff_t offset = page1->index, end_index;
+       struct address_space *mapping = page1->mapping;
+       struct inode *inode = mapping->host;
+       struct ubifs_inode *ui = ubifs_inode(inode);
+       int err, page_idx, page_cnt, ret = 0, n = 0;
+       int allocate = bu->buf ? 0 : 1;
+       loff_t isize;
+
+       err = ubifs_tnc_get_bu_keys(c, bu);
+       if (err)
+               goto out_warn;
+
+       if (bu->eof) {
+               /* Turn off bulk-read at the end of the file */
+               ui->read_in_a_row = 1;
+               ui->bulk_read = 0;
+       }
+
+       page_cnt = bu->blk_cnt >> UBIFS_BLOCKS_PER_PAGE_SHIFT;
+       if (!page_cnt) {
+               /*
+                * This happens when there are multiple blocks per page and the
+                * blocks for the first page we are looking for, are not
+                * together. If all the pages were like this, bulk-read would
+                * reduce performance, so we turn it off for a while.
+                */
+               goto out_bu_off;
+       }
+
+       if (bu->cnt) {
+               if (allocate) {
+                       /*
+                        * Allocate bulk-read buffer depending on how many data
+                        * nodes we are going to read.
+                        */
+                       bu->buf_len = bu->zbranch[bu->cnt - 1].offs +
+                                     bu->zbranch[bu->cnt - 1].len -
+                                     bu->zbranch[0].offs;
+                       ubifs_assert(bu->buf_len > 0);
+                       ubifs_assert(bu->buf_len <= c->leb_size);
+                       bu->buf = kmalloc(bu->buf_len, GFP_NOFS | __GFP_NOWARN);
+                       if (!bu->buf)
+                               goto out_bu_off;
+               }
+
+               err = ubifs_tnc_bulk_read(c, bu);
+               if (err)
+                       goto out_warn;
+       }
+
+       err = populate_page(c, page1, bu, &n);
+       if (err)
+               goto out_warn;
+
+       unlock_page(page1);
+       ret = 1;
+
+       isize = i_size_read(inode);
+       if (isize == 0)
+               goto out_free;
+       end_index = ((isize - 1) >> PAGE_CACHE_SHIFT);
+
+       for (page_idx = 1; page_idx < page_cnt; page_idx++) {
+               pgoff_t page_offset = offset + page_idx;
+               struct page *page;
+
+               if (page_offset > end_index)
+                       break;
+               page = find_or_create_page(mapping, page_offset,
+                                          GFP_NOFS | __GFP_COLD);
+               if (!page)
+                       break;
+               if (!PageUptodate(page))
+                       err = populate_page(c, page, bu, &n);
+               unlock_page(page);
+               page_cache_release(page);
+               if (err)
+                       break;
+       }
+
+       ui->last_page_read = offset + page_idx - 1;
+
+out_free:
+       if (allocate)
+               kfree(bu->buf);
+       return ret;
+
+out_warn:
+       ubifs_warn("ignoring error %d and skipping bulk-read", err);
+       goto out_free;
+
+out_bu_off:
+       ui->read_in_a_row = ui->bulk_read = 0;
+       goto out_free;
+}
+
+/**
+ * ubifs_bulk_read - determine whether to bulk-read and, if so, do it.
+ * @page: page from which to start bulk-read.
+ *
+ * Some flash media are capable of reading sequentially at faster rates. UBIFS
+ * bulk-read facility is designed to take advantage of that, by reading in one
+ * go consecutive data nodes that are also located consecutively in the same
+ * LEB. This function returns %1 if a bulk-read is done and %0 otherwise.
+ */
+static int ubifs_bulk_read(struct page *page)
+{
+       struct inode *inode = page->mapping->host;
+       struct ubifs_info *c = inode->i_sb->s_fs_info;
+       struct ubifs_inode *ui = ubifs_inode(inode);
+       pgoff_t index = page->index, last_page_read = ui->last_page_read;
+       struct bu_info *bu;
+       int err = 0, allocated = 0;
+
+       ui->last_page_read = index;
+       if (!c->bulk_read)
+               return 0;
+
+       /*
+        * Bulk-read is protected by @ui->ui_mutex, but it is an optimization,
+        * so don't bother if we cannot lock the mutex.
+        */
+       if (!mutex_trylock(&ui->ui_mutex))
+               return 0;
+
+       if (index != last_page_read + 1) {
+               /* Turn off bulk-read if we stop reading sequentially */
+               ui->read_in_a_row = 1;
+               if (ui->bulk_read)
+                       ui->bulk_read = 0;
+               goto out_unlock;
+       }
+
+       if (!ui->bulk_read) {
+               ui->read_in_a_row += 1;
+               if (ui->read_in_a_row < 3)
+                       goto out_unlock;
+               /* Three reads in a row, so switch on bulk-read */
+               ui->bulk_read = 1;
+       }
+
+       /*
+        * If possible, try to use pre-allocated bulk-read information, which
+        * is protected by @c->bu_mutex.
+        */
+       if (mutex_trylock(&c->bu_mutex))
+               bu = &c->bu;
+       else {
+               bu = kmalloc(sizeof(struct bu_info), GFP_NOFS | __GFP_NOWARN);
+               if (!bu)
+                       goto out_unlock;
+
+               bu->buf = NULL;
+               allocated = 1;
+       }
+
+       bu->buf_len = c->max_bu_buf_len;
+       data_key_init(c, &bu->key, inode->i_ino,
+                     page->index << UBIFS_BLOCKS_PER_PAGE_SHIFT);
+       err = ubifs_do_bulk_read(c, bu, page);
+
+       if (!allocated)
+               mutex_unlock(&c->bu_mutex);
+       else
+               kfree(bu);
+
+out_unlock:
+       mutex_unlock(&ui->ui_mutex);
+       return err;
+}
+
 static int ubifs_readpage(struct file *file, struct page *page)
 {
+       if (ubifs_bulk_read(page))
+               return 0;
        do_readpage(page);
        unlock_page(page);
        return 0;
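
The bulk-read trigger in ubifs_bulk_read() is a small sequential-read detector: the page index must follow the previously read one, three such reads in a row switch bulk-read on, and any seek resets the state. A simplified restatement of that logic, with a hypothetical helper name and the mutexes, the c->bulk_read check and the buffer handling stripped out:

        static int want_bulk_read(struct ubifs_inode *ui, pgoff_t index)
        {
                pgoff_t last = ui->last_page_read;

                ui->last_page_read = index;
                if (index != last + 1) {
                        /* Not sequential: reset the counter and back off */
                        ui->read_in_a_row = 1;
                        ui->bulk_read = 0;
                        return 0;
                }
                if (!ui->bulk_read) {
                        if (++ui->read_in_a_row < 3)
                                return 0;       /* need three reads in a row */
                        ui->bulk_read = 1;      /* switch bulk-read on */
                }
                return 1;
        }
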
@@ -654,7 +959,7 @@ static int do_writepage(struct page *page, int len)
  * whole index and correct all inode sizes, which is long and unacceptable.
  *
  * To prevent situations like this, UBIFS writes pages back only if they are
- * within last synchronized inode size, i.e. the the size which has been
+ * within the last synchronized inode size, i.e. the size which has been
  * written to the flash media last time. Otherwise, UBIFS forces inode
  * write-back, thus making sure the on-flash inode contains current inode size,
  * and then keeps writing pages back.
@@ -1139,8 +1444,9 @@ static int ubifs_releasepage(struct page *page, gfp_t unused_gfp_flags)
  * mmap()d file has taken write protection fault and is being made
  * writable. UBIFS must ensure page is budgeted for.
  */
-static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
+static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
+       struct page *page = vmf->page;
        struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
        struct ubifs_info *c = inode->i_sb->s_fs_info;
        struct timespec now = ubifs_current_time(inode);
@@ -1152,7 +1458,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
        ubifs_assert(!(inode->i_sb->s_flags & MS_RDONLY));
 
        if (unlikely(c->ro_media))
-               return -EROFS;
+               return VM_FAULT_SIGBUS; /* -EROFS */
 
        /*
         * We have not locked @page so far so we may budget for changing the
@@ -1185,7 +1491,7 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
                if (err == -ENOSPC)
                        ubifs_warn("out of space for mmapped file "
                                   "(inode number %lu)", inode->i_ino);
-               return err;
+               return VM_FAULT_SIGBUS;
        }
 
        lock_page(page);
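
These hunks adapt ubifs_vm_page_mkwrite() to the ->page_mkwrite prototype that receives a struct vm_fault and must return VM_FAULT_* codes instead of negative errnos, which is why -EROFS and the budgeting failures are mapped to VM_FAULT_SIGBUS here and in the out_unlock path below. A minimal sketch of the convention only; the handler name and its failure check are hypothetical:

        static int example_page_mkwrite(struct vm_area_struct *vma,
                                        struct vm_fault *vmf)
        {
                struct page *page = vmf->page;  /* the page now comes from vm_fault */

                if (!example_budget_ok(page))   /* hypothetical failure condition */
                        return VM_FAULT_SIGBUS; /* not -ENOSPC or -EROFS */
                return 0;                       /* page may be made writable */
        }
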
@@ -1225,6 +1531,8 @@ static int ubifs_vm_page_mkwrite(struct vm_area_struct *vma, struct page *page)
 out_unlock:
        unlock_page(page);
        ubifs_release_budget(c, &req);
+       if (err)
+               err = VM_FAULT_SIGBUS;
        return err;
 }
 
@@ -1245,7 +1553,7 @@ static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
        return 0;
 }
 
-struct address_space_operations ubifs_file_address_operations = {
+const struct address_space_operations ubifs_file_address_operations = {
        .readpage       = ubifs_readpage,
        .writepage      = ubifs_writepage,
        .write_begin    = ubifs_write_begin,
@@ -1255,7 +1563,7 @@ struct address_space_operations ubifs_file_address_operations = {
        .releasepage    = ubifs_releasepage,
 };
 
-struct inode_operations ubifs_file_inode_operations = {
+const struct inode_operations ubifs_file_inode_operations = {
        .setattr     = ubifs_setattr,
        .getattr     = ubifs_getattr,
 #ifdef CONFIG_UBIFS_FS_XATTR
@@ -1266,14 +1574,14 @@ struct inode_operations ubifs_file_inode_operations = {
 #endif
 };
 
-struct inode_operations ubifs_symlink_inode_operations = {
+const struct inode_operations ubifs_symlink_inode_operations = {
        .readlink    = generic_readlink,
        .follow_link = ubifs_follow_link,
        .setattr     = ubifs_setattr,
        .getattr     = ubifs_getattr,
 };
 
-struct file_operations ubifs_file_operations = {
+const struct file_operations ubifs_file_operations = {
        .llseek         = generic_file_llseek,
        .read           = do_sync_read,
        .write          = do_sync_write,