memcgroup: revert swap_state mods
[safe/jmp/linux-2.6] / mm / shmem.c
index ce64b66..0f246c4 100644 (file)
@@ -80,6 +80,7 @@
 enum sgp_type {
        SGP_READ,       /* don't exceed i_size, don't allocate page */
        SGP_CACHE,      /* don't exceed i_size, may allocate page */
+       SGP_DIRTY,      /* like SGP_CACHE, but set new page dirty */
        SGP_WRITE,      /* may exceed i_size, may allocate page */
 };
 
@@ -192,7 +193,7 @@ static struct backing_dev_info shmem_backing_dev_info  __read_mostly = {
 };
 
 static LIST_HEAD(shmem_swaplist);
-static DEFINE_SPINLOCK(shmem_swaplist_lock);
+static DEFINE_MUTEX(shmem_swaplist_mutex);
 
 static void shmem_free_blocks(struct inode *inode, long pages)
 {
@@ -795,9 +796,9 @@ static void shmem_delete_inode(struct inode *inode)
                inode->i_size = 0;
                shmem_truncate(inode);
                if (!list_empty(&info->swaplist)) {
-                       spin_lock(&shmem_swaplist_lock);
+                       mutex_lock(&shmem_swaplist_mutex);
                        list_del_init(&info->swaplist);
-                       spin_unlock(&shmem_swaplist_lock);
+                       mutex_unlock(&shmem_swaplist_mutex);
                }
        }
        BUG_ON(inode->i_blocks);
@@ -827,19 +828,22 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
        struct page *subdir;
        swp_entry_t *ptr;
        int offset;
+       int error;
 
        idx = 0;
        ptr = info->i_direct;
        spin_lock(&info->lock);
+       if (!info->swapped) {
+               list_del_init(&info->swaplist);
+               goto lost2;
+       }
        limit = info->next_index;
        size = limit;
        if (size > SHMEM_NR_DIRECT)
                size = SHMEM_NR_DIRECT;
        offset = shmem_find_swp(entry, ptr, ptr+size);
-       if (offset >= 0) {
-               shmem_swp_balance_unmap();
+       if (offset >= 0)
                goto found;
-       }
        if (!info->i_indirect)
                goto lost2;
 
@@ -849,6 +853,14 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
        for (idx = SHMEM_NR_DIRECT; idx < limit; idx += ENTRIES_PER_PAGE, dir++) {
                if (unlikely(idx == stage)) {
                        shmem_dir_unmap(dir-1);
+                       if (cond_resched_lock(&info->lock)) {
+                               /* check it has not been truncated */
+                               if (limit > info->next_index) {
+                                       limit = info->next_index;
+                                       if (idx >= limit)
+                                               goto lost2;
+                               }
+                       }
                        dir = shmem_dir_map(info->i_indirect) +
                            ENTRIES_PER_PAGE/2 + idx/ENTRIES_PER_PAGEPAGE;
                        while (!*dir) {
@@ -869,11 +881,11 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
                        if (size > ENTRIES_PER_PAGE)
                                size = ENTRIES_PER_PAGE;
                        offset = shmem_find_swp(entry, ptr, ptr+size);
+                       shmem_swp_unmap(ptr);
                        if (offset >= 0) {
                                shmem_dir_unmap(dir);
                                goto found;
                        }
-                       shmem_swp_unmap(ptr);
                }
        }
 lost1:
@@ -883,19 +895,63 @@ lost2:
        return 0;
 found:
        idx += offset;
-       inode = &info->vfs_inode;
-       if (move_from_swap_cache(page, idx, inode->i_mapping) == 0) {
-               info->flags |= SHMEM_PAGEIN;
-               shmem_swp_set(info, ptr + offset, 0);
-       }
-       shmem_swp_unmap(ptr);
+       inode = igrab(&info->vfs_inode);
        spin_unlock(&info->lock);
+
        /*
-        * Decrement swap count even when the entry is left behind:
-        * try_to_unuse will skip over mms, then reincrement count.
+        * Move _head_ to start search for next from here.
+        * But be careful: shmem_delete_inode checks list_empty without taking
+        * mutex, and there's an instant in list_move_tail when info->swaplist
+        * would appear empty, if it were the only one on shmem_swaplist.  We
+        * could avoid doing it if inode NULL; or use this minor optimization.
         */
-       swap_free(entry);
-       return 1;
+       if (shmem_swaplist.next != &info->swaplist)
+               list_move_tail(&shmem_swaplist, &info->swaplist);
+       mutex_unlock(&shmem_swaplist_mutex);
+
+       error = 1;
+       if (!inode)
+               goto out;
+       error = radix_tree_preload(GFP_KERNEL);
+       if (error)
+               goto out;
+       error = 1;
+
+       spin_lock(&info->lock);
+       ptr = shmem_swp_entry(info, idx, NULL);
+       if (ptr && ptr->val == entry.val)
+               error = add_to_page_cache(page, inode->i_mapping,
+                                               idx, GFP_NOWAIT);
+       if (error == -EEXIST) {
+               struct page *filepage = find_get_page(inode->i_mapping, idx);
+               error = 1;
+               if (filepage) {
+                       /*
+                        * There might be a more uptodate page coming down
+                        * from a stacked writepage: forget our swappage if so.
+                        */
+                       if (PageUptodate(filepage))
+                               error = 0;
+                       page_cache_release(filepage);
+               }
+       }
+       if (!error) {
+               delete_from_swap_cache(page);
+               set_page_dirty(page);
+               info->flags |= SHMEM_PAGEIN;
+               shmem_swp_set(info, ptr, 0);
+               swap_free(entry);
+               error = 1;      /* not an error, but entry was found */
+       }
+       if (ptr)
+               shmem_swp_unmap(ptr);
+       spin_unlock(&info->lock);
+       radix_tree_preload_end();
+out:
+       unlock_page(page);
+       page_cache_release(page);
+       iput(inode);            /* allows for NULL */
+       return error;
 }
 
 /*
@@ -907,20 +963,16 @@ int shmem_unuse(swp_entry_t entry, struct page *page)
        struct shmem_inode_info *info;
        int found = 0;
 
-       spin_lock(&shmem_swaplist_lock);
+       mutex_lock(&shmem_swaplist_mutex);
        list_for_each_safe(p, next, &shmem_swaplist) {
                info = list_entry(p, struct shmem_inode_info, swaplist);
-               if (!info->swapped)
-                       list_del_init(&info->swaplist);
-               else if (shmem_unuse_inode(info, entry, page)) {
-                       /* move head to start search for next from here */
-                       list_move_tail(&shmem_swaplist, &info->swaplist);
-                       found = 1;
-                       break;
-               }
+               found = shmem_unuse_inode(info, entry, page);
+               cond_resched();
+               if (found)
+                       goto out;
        }
-       spin_unlock(&shmem_swaplist_lock);
-       return found;
+       mutex_unlock(&shmem_swaplist_mutex);
+out:   return found;   /* 0 or 1 or -ENOMEM */
 }
 
 /*
@@ -935,54 +987,65 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
        struct inode *inode;
 
        BUG_ON(!PageLocked(page));
-       /*
-        * shmem_backing_dev_info's capabilities prevent regular writeback or
-        * sync from ever calling shmem_writepage; but a stacking filesystem
-        * may use the ->writepage of its underlying filesystem, in which case
-        * we want to do nothing when that underlying filesystem is tmpfs
-        * (writing out to swap is useful as a response to memory pressure, but
-        * of no use to stabilize the data) - just redirty the page, unlock it
-        * and claim success in this case.  AOP_WRITEPAGE_ACTIVATE, and the
-        * page_mapped check below, must be avoided unless we're in reclaim.
-        */
-       if (!wbc->for_reclaim) {
-               set_page_dirty(page);
-               unlock_page(page);
-               return 0;
-       }
-       BUG_ON(page_mapped(page));
-
        mapping = page->mapping;
        index = page->index;
        inode = mapping->host;
        info = SHMEM_I(inode);
        if (info->flags & VM_LOCKED)
                goto redirty;
-       swap = get_swap_page();
-       if (!swap.val)
+       if (!total_swap_pages)
                goto redirty;
 
+       /*
+        * shmem_backing_dev_info's capabilities prevent regular writeback or
+        * sync from ever calling shmem_writepage; but a stacking filesystem
+        * may use the ->writepage of its underlying filesystem, in which case
+        * tmpfs should write out to swap only in response to memory pressure,
+        * and not for pdflush or sync.  However, in those cases, we do still
+        * want to check if there's a redundant swappage to be discarded.
+        */
+       if (wbc->for_reclaim)
+               swap = get_swap_page();
+       else
+               swap.val = 0;
+
        spin_lock(&info->lock);
-       shmem_recalc_inode(inode);
        if (index >= info->next_index) {
                BUG_ON(!(info->flags & SHMEM_TRUNCATE));
                goto unlock;
        }
        entry = shmem_swp_entry(info, index, NULL);
-       BUG_ON(!entry);
-       BUG_ON(entry->val);
+       if (entry->val) {
+               /*
+                * The more uptodate page coming down from a stacked
+                * writepage should replace our old swappage.
+                */
+               free_swap_and_cache(*entry);
+               shmem_swp_set(info, entry, 0);
+       }
+       shmem_recalc_inode(inode);
 
-       if (move_to_swap_cache(page, swap) == 0) {
+       if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+               remove_from_page_cache(page);
                shmem_swp_set(info, entry, swap.val);
                shmem_swp_unmap(entry);
+               if (list_empty(&info->swaplist))
+                       inode = igrab(inode);
+               else
+                       inode = NULL;
                spin_unlock(&info->lock);
-               if (list_empty(&info->swaplist)) {
-                       spin_lock(&shmem_swaplist_lock);
+               swap_duplicate(swap);
+               BUG_ON(page_mapped(page));
+               page_cache_release(page);       /* pagecache ref */
+               set_page_dirty(page);
+               unlock_page(page);
+               if (inode) {
+                       mutex_lock(&shmem_swaplist_mutex);
                        /* move instead of add in case we're racing */
                        list_move_tail(&info->swaplist, &shmem_swaplist);
-                       spin_unlock(&shmem_swaplist_lock);
+                       mutex_unlock(&shmem_swaplist_mutex);
+                       iput(inode);
                }
-               unlock_page(page);
                return 0;
        }
 
@@ -992,7 +1055,10 @@ unlock:
        swap_free(swap);
 redirty:
        set_page_dirty(page);
-       return AOP_WRITEPAGE_ACTIVATE;  /* Return with the page locked */
+       if (wbc->for_reclaim)
+               return AOP_WRITEPAGE_ACTIVATE;  /* Return with page locked */
+       unlock_page(page);
+       return 0;
 }
 
 #ifdef CONFIG_NUMA
@@ -1137,6 +1203,16 @@ repeat:
                goto done;
        error = 0;
        gfp = mapping_gfp_mask(mapping);
+       if (!filepage) {
+               /*
+                * Try to preload while we can wait, to not make a habit of
+                * draining atomic reserves; but don't latch on to this cpu.
+                */
+               error = radix_tree_preload(gfp & ~__GFP_HIGHMEM);
+               if (error)
+                       goto failed;
+               radix_tree_preload_end();
+       }
 
        spin_lock(&info->lock);
        shmem_recalc_inode(inode);
@@ -1217,23 +1293,21 @@ repeat:
                        SetPageUptodate(filepage);
                        set_page_dirty(filepage);
                        swap_free(swap);
-               } else if (!(error = move_from_swap_cache(
-                               swappage, idx, mapping))) {
+               } else if (!(error = add_to_page_cache(
+                               swappage, mapping, idx, GFP_NOWAIT))) {
                        info->flags |= SHMEM_PAGEIN;
                        shmem_swp_set(info, entry, 0);
                        shmem_swp_unmap(entry);
+                       delete_from_swap_cache(swappage);
                        spin_unlock(&info->lock);
                        filepage = swappage;
+                       set_page_dirty(filepage);
                        swap_free(swap);
                } else {
                        shmem_swp_unmap(entry);
                        spin_unlock(&info->lock);
                        unlock_page(swappage);
                        page_cache_release(swappage);
-                       if (error == -ENOMEM) {
-                               /* let kswapd refresh zone for GFP_ATOMICs */
-                               congestion_wait(WRITE, HZ/50);
-                       }
                        goto repeat;
                }
        } else if (sgp == SGP_READ && !filepage) {
@@ -1288,7 +1362,7 @@ repeat:
                                shmem_swp_unmap(entry);
                        }
                        if (error || swap.val || 0 != add_to_page_cache_lru(
-                                       filepage, mapping, idx, GFP_ATOMIC)) {
+                                       filepage, mapping, idx, GFP_NOWAIT)) {
                                spin_unlock(&info->lock);
                                page_cache_release(filepage);
                                shmem_unacct_blocks(info->flags, 1);
@@ -1306,6 +1380,8 @@ repeat:
                clear_highpage(filepage);
                flush_dcache_page(filepage);
                SetPageUptodate(filepage);
+               if (sgp == SGP_DIRTY)
+                       set_page_dirty(filepage);
        }
 done:
        *pagep = filepage;
@@ -1491,6 +1567,15 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
        struct inode *inode = filp->f_path.dentry->d_inode;
        struct address_space *mapping = inode->i_mapping;
        unsigned long index, offset;
+       enum sgp_type sgp = SGP_READ;
+
+       /*
+        * Might this read be for a stacking filesystem?  Then when reading
+        * holes of a sparse file, we actually need to allocate those pages,
+        * and even mark them dirty, so it cannot exceed the max_blocks limit.
+        */
+       if (segment_eq(get_fs(), KERNEL_DS))
+               sgp = SGP_DIRTY;
 
        index = *ppos >> PAGE_CACHE_SHIFT;
        offset = *ppos & ~PAGE_CACHE_MASK;
@@ -1509,7 +1594,7 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
                                break;
                }
 
-               desc->error = shmem_getpage(inode, index, &page, SGP_READ, NULL);
+               desc->error = shmem_getpage(inode, index, &page, sgp, NULL);
                if (desc->error) {
                        if (desc->error == -EINVAL)
                                desc->error = 0;
@@ -1870,8 +1955,7 @@ static int shmem_xattr_security_get(struct inode *inode, const char *name,
 {
        if (strcmp(name, "") == 0)
                return -EINVAL;
-       return security_inode_getsecurity(inode, name, buffer, size,
-                                         -EOPNOTSUPP);
+       return xattr_getsecurity(inode, name, buffer, size);
 }
 
 static int shmem_xattr_security_set(struct inode *inode, const char *name,
@@ -2012,7 +2096,7 @@ static int shmem_parse_options(char *options, int *mode, uid_t *uid,
                        }
                        if (*rest)
                                goto bad_val;
-                       *blocks = size >> PAGE_CACHE_SHIFT;
+                       *blocks = DIV_ROUND_UP(size, PAGE_CACHE_SIZE);
                } else if (!strcmp(this_char,"nr_blocks")) {
                        *blocks = memparse(value,&rest);
                        if (*rest)