ceph: fix xattr dangling pointer / double free
[safe/jmp/linux-2.6] / fs / ceph / addr.c
index 25360d5..d9c60b8 100644 (file)
@@ -5,6 +5,7 @@
 #include <linux/mm.h>
 #include <linux/pagemap.h>
 #include <linux/writeback.h>   /* generic_writepages */
+#include <linux/slab.h>
 #include <linux/pagevec.h>
 #include <linux/task_io_accounting_ops.h>
 
@@ -144,7 +145,7 @@ static int ceph_set_page_dirty(struct page *page)
  */
 static void ceph_invalidatepage(struct page *page, unsigned long offset)
 {
-       struct inode *inode = page->mapping->host;
+       struct inode *inode;
        struct ceph_inode_info *ci;
        struct ceph_snap_context *snapc = (void *)page->private;
 
@@ -153,6 +154,8 @@ static void ceph_invalidatepage(struct page *page, unsigned long offset)
        BUG_ON(!PagePrivate(page));
        BUG_ON(!page->mapping);
 
+       inode = page->mapping->host;
+
        /*
         * We can get non-dirty pages here due to races between
         * set_page_dirty and truncate_complete_page; just spit out a
@@ -271,7 +274,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
        struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
        int rc = 0;
        struct page **pages;
-       struct pagevec pvec;
        loff_t offset;
        u64 len;
 
@@ -294,8 +296,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
        if (rc < 0)
                goto out;
 
-       /* set uptodate and add to lru in pagevec-sized chunks */
-       pagevec_init(&pvec, 0);
        for (; !list_empty(page_list) && len > 0;
             rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
                struct page *page =
@@ -309,7 +309,7 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
                        zero_user_segment(page, s, PAGE_CACHE_SIZE);
                }
 
-               if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
+               if (add_to_page_cache_lru(page, mapping, page->index, GFP_NOFS)) {
                        page_cache_release(page);
                        dout("readpages %p add_to_page_cache failed %p\n",
                             inode, page);
@@ -320,10 +320,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
                flush_dcache_page(page);
                SetPageUptodate(page);
                unlock_page(page);
-               if (pagevec_add(&pvec, page) == 0)
-                       pagevec_lru_add_file(&pvec);   /* add to lru */
+               page_cache_release(page);
        }
-       pagevec_lru_add_file(&pvec);
        rc = 0;
 
 out:
@@ -334,16 +332,15 @@ out:
 /*
  * Get ref for the oldest snapc for an inode with dirty data... that is, the
  * only snap context we are allowed to write back.
- *
- * Caller holds i_lock.
  */
-static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
-                                                     u64 *snap_size)
+static struct ceph_snap_context *get_oldest_context(struct inode *inode,
+                                                   u64 *snap_size)
 {
        struct ceph_inode_info *ci = ceph_inode(inode);
        struct ceph_snap_context *snapc = NULL;
        struct ceph_cap_snap *capsnap = NULL;
 
+       spin_lock(&inode->i_lock);
        list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
                dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
                     capsnap->context, capsnap->dirty_pages);
@@ -354,21 +351,11 @@ static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
                        break;
                }
        }
-       if (!snapc && ci->i_snap_realm) {
-               snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+       if (!snapc && ci->i_head_snapc) {
+               snapc = ceph_get_snap_context(ci->i_head_snapc);
                dout(" head snapc %p has %d dirty pages\n",
                     snapc, ci->i_wrbuffer_ref_head);
        }
-       return snapc;
-}
-
-static struct ceph_snap_context *get_oldest_context(struct inode *inode,
-                                                   u64 *snap_size)
-{
-       struct ceph_snap_context *snapc = NULL;
-
-       spin_lock(&inode->i_lock);
-       snapc = __get_oldest_context(inode, snap_size);
        spin_unlock(&inode->i_lock);
        return snapc;
 }
@@ -389,7 +376,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        int len = PAGE_CACHE_SIZE;
        loff_t i_size;
        int err = 0;
-       struct ceph_snap_context *snapc;
+       struct ceph_snap_context *snapc, *oldest;
        u64 snap_size = 0;
        long writeback_stat;
 
@@ -410,13 +397,16 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
                dout("writepage %p page %p not dirty?\n", inode, page);
                goto out;
        }
-       if (snapc != get_oldest_context(inode, &snap_size)) {
+       oldest = get_oldest_context(inode, &snap_size);
+       if (snapc->seq > oldest->seq) {
                dout("writepage %p page %p snapc %p not writeable - noop\n",
                     inode, page, (void *)page->private);
                /* we should only noop if called by kswapd */
                WARN_ON((current->flags & PF_MEMALLOC) == 0);
+               ceph_put_snap_context(oldest);
                goto out;
        }
+       ceph_put_snap_context(oldest);
 
        /* is this a partial page at end of file? */
        if (snap_size)
@@ -455,7 +445,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
        ClearPagePrivate(page);
        end_page_writeback(page);
        ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
-       ceph_put_snap_context(snapc);
+       ceph_put_snap_context(snapc);  /* page's reference */
 out:
        return err;
 }
@@ -509,12 +499,11 @@ static void writepages_finish(struct ceph_osd_request *req,
        int i;
        struct ceph_snap_context *snapc = req->r_snapc;
        struct address_space *mapping = inode->i_mapping;
-       struct writeback_control *wbc = req->r_wbc;
        __s32 rc = -EIO;
        u64 bytes = 0;
        struct ceph_client *client = ceph_inode_to_client(inode);
        long writeback_stat;
-       unsigned issued = __ceph_caps_issued(ci, NULL);
+       unsigned issued = ceph_caps_issued(ci);
 
        /* parse reply */
        replyhead = msg->front.iov_base;
@@ -551,13 +540,9 @@ static void writepages_finish(struct ceph_osd_request *req,
                        clear_bdi_congested(&client->backing_dev_info,
                                            BLK_RW_ASYNC);
 
-               if (i >= wrote) {
-                       dout("inode %p skipping page %p\n", inode, page);
-                       wbc->pages_skipped++;
-               }
+               ceph_put_snap_context((void *)page->private);
                page->private = 0;
                ClearPagePrivate(page);
-               ceph_put_snap_context(snapc);
                dout("unlocking %d %p\n", i, page);
                end_page_writeback(page);
 
@@ -578,7 +563,7 @@ static void writepages_finish(struct ceph_osd_request *req,
        ceph_release_pages(req->r_pages, req->r_num_pages);
        if (req->r_pages_from_pool)
                mempool_free(req->r_pages,
-                            ceph_client(inode->i_sb)->wb_pagevec_pool);
+                            ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
        else
                kfree(req->r_pages);
        ceph_osdc_put_request(req);
@@ -615,7 +600,7 @@ static int ceph_writepages_start(struct address_space *mapping,
        int range_whole = 0;
        int should_loop = 1;
        pgoff_t max_pages = 0, max_pages_ever = 0;
-       struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
+       struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc;
        struct pagevec pvec;
        int done = 0;
        int rc = 0;
@@ -767,9 +752,10 @@ get_more_pages:
                        }
 
                        /* only if matching snap context */
-                       if (snapc != (void *)page->private) {
-                               dout("page snapc %p != oldest %p\n",
-                                    (void *)page->private, snapc);
+                       pgsnapc = (void *)page->private;
+                       if (pgsnapc->seq > snapc->seq) {
+                               dout("page snapc %p %lld > oldest %p %lld\n",
+                                    pgsnapc, pgsnapc->seq, snapc, snapc->seq);
                                unlock_page(page);
                                if (!locked_pages)
                                        continue; /* keep looking for snap */
@@ -803,7 +789,6 @@ get_more_pages:
                                alloc_page_vec(client, req);
                                req->r_callback = writepages_finish;
                                req->r_inode = inode;
-                               req->r_wbc = wbc;
                        }
 
                        /* note position of first page in pvec */
@@ -911,12 +896,19 @@ static int context_is_writeable_or_written(struct inode *inode,
                                           struct ceph_snap_context *snapc)
 {
        struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
-       return !oldest || snapc->seq <= oldest->seq;
+       int ret = !oldest || snapc->seq <= oldest->seq;
+
+       ceph_put_snap_context(oldest);
+       return ret;
 }
 
 /*
  * We are only allowed to write into/dirty the page if the page is
  * clean, or already dirty within the same snap context.
+ *
+ * Called with the page locked.
+ * Returns success with the page still locked,
+ * or any failure (incl. -EAGAIN) with the page unlocked.
  */
 static int ceph_update_writeable_page(struct file *file,
                            loff_t pos, unsigned len,
@@ -929,8 +921,8 @@ static int ceph_update_writeable_page(struct file *file,
        int pos_in_page = pos & ~PAGE_CACHE_MASK;
        int end_in_page = pos_in_page + len;
        loff_t i_size;
-       struct ceph_snap_context *snapc;
        int r;
+       struct ceph_snap_context *snapc, *oldest;
 
 retry_locked:
        /* writepages currently holds page lock, but if we change that later, */
@@ -940,30 +932,34 @@ retry_locked:
        BUG_ON(!ci->i_snap_realm);
        down_read(&mdsc->snap_rwsem);
        BUG_ON(!ci->i_snap_realm->cached_context);
-       if (page->private &&
-           (void *)page->private != ci->i_snap_realm->cached_context) {
+       snapc = (void *)page->private;
+       if (snapc && snapc != ci->i_head_snapc) {
                /*
                 * this page is already dirty in another (older) snap
                 * context!  is it writeable now?
                 */
-               snapc = get_oldest_context(inode, NULL);
+               oldest = get_oldest_context(inode, NULL);
                up_read(&mdsc->snap_rwsem);
 
-               if (snapc != (void *)page->private) {
+               if (snapc->seq > oldest->seq) {
+                       ceph_put_snap_context(oldest);
                        dout(" page %p snapc %p not current or oldest\n",
-                            page, (void *)page->private);
+                            page, snapc);
                        /*
                         * queue for writeback, and wait for snapc to
                         * be writeable or written
                         */
-                       snapc = ceph_get_snap_context((void *)page->private);
+                       snapc = ceph_get_snap_context(snapc);
                        unlock_page(page);
                        ceph_queue_writeback(inode);
-                       wait_event_interruptible(ci->i_cap_wq,
+                       r = wait_event_interruptible(ci->i_cap_wq,
                               context_is_writeable_or_written(inode, snapc));
                        ceph_put_snap_context(snapc);
+                       if (r == -ERESTARTSYS)
+                               return r;
                        return -EAGAIN;
                }
+               ceph_put_snap_context(oldest);
 
                /* yay, writeable, do it now (without dropping page lock) */
                dout(" page %p snapc %p not current, but oldest\n",
@@ -1033,7 +1029,7 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
        int r;
 
        do {
-               /* get a page*/
+               /* get a page */
                page = grab_cache_page_write_begin(mapping, index, 0);
                if (!page)
                        return -ENOMEM;