X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fmigrate.c;h=efddbf0926b283ae5ef292e076ff9900842c4dd5;hb=fa1f136e073ddc4e60497c51bc8918569314d38a;hp=20b95db63da5285a728911fd9375a5459b6c24e9;hpb=e23ca00bf1b1c6c0f04702cb4d29e275ab8dc330;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/migrate.c b/mm/migrate.c index 20b95db..efddbf0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -9,71 +9,40 @@ * IWAMOTO Toshihiro * Hirokazu Takahashi * Dave Hansen - * Christoph Lameter + * Christoph Lameter */ #include #include #include +#include #include #include #include +#include #include +#include #include #include #include #include -#include +#include +#include +#include +#include +#include +#include #include "internal.h" -/* The maximum number of pages to take off the LRU for migration */ -#define MIGRATE_CHUNK_SIZE 256 - #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) /* - * Isolate one page from the LRU lists. If successful put it onto - * the indicated list with elevated page count. - * - * Result: - * -EBUSY: page not on LRU list - * 0: page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page, struct list_head *pagelist) -{ - int ret = -EBUSY; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page)) { - ret = 0; - get_page(page); - ClearPageLRU(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - list_add_tail(&page->lru, pagelist); - } - spin_unlock_irq(&zone->lru_lock); - } - return ret; -} - -/* - * migrate_prep() needs to be called after we have compiled the list of pages - * to be migrated using isolate_lru_page() but before we begin a series of calls - * to migrate_pages(). + * migrate_prep() needs to be called before we start compiling a list of pages + * to be migrated using isolate_lru_page(). */ int migrate_prep(void) { - /* Must have swap device for migration */ - if (nr_swap_pages <= 0) - return -ENODEV; - /* * Clear the LRU lists so pages can be isolated. * Note that pages may be moved off the LRU after we have @@ -85,24 +54,9 @@ int migrate_prep(void) return 0; } -static inline void move_to_lru(struct page *page) -{ - list_del(&page->lru); - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - /* - * Add isolated pages on the list back to the LRU. + * Add isolated pages on the list back to the LRU under page lock + * to avoid leaking evictable pages back onto unevictable list. * * returns the number of pages put back. */ @@ -113,156 +67,216 @@ int putback_lru_pages(struct list_head *l) int count = 0; list_for_each_entry_safe(page, page2, l, lru) { - move_to_lru(page); + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); count++; } return count; } /* - * Non migratable page + * Restore a potential migration pte to a working pte entry */ -int fail_migrate_page(struct page *newpage, struct page *page) +static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, + unsigned long addr, void *old) { - return -EIO; + struct mm_struct *mm = vma->vm_mm; + swp_entry_t entry; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *ptep, pte; + spinlock_t *ptl; + + pgd = pgd_offset(mm, addr); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, addr); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, addr); + if (!pmd_present(*pmd)) + goto out; + + ptep = pte_offset_map(pmd, addr); + + if (!is_swap_pte(*ptep)) { + pte_unmap(ptep); + goto out; + } + + ptl = pte_lockptr(mm, pmd); + spin_lock(ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto unlock; + + entry = pte_to_swp_entry(pte); + + if (!is_migration_entry(entry) || + migration_entry_to_page(entry) != old) + goto unlock; + + get_page(new); + pte = pte_mkold(mk_pte(new, vma->vm_page_prot)); + if (is_write_migration_entry(entry)) + pte = pte_mkwrite(pte); + flush_cache_page(vma, addr, pte_pfn(pte)); + set_pte_at(mm, addr, ptep, pte); + + if (PageAnon(new)) + page_add_anon_rmap(new, vma, addr); + else + page_add_file_rmap(new); + + /* No need to invalidate - it was non-present before */ + update_mmu_cache(vma, addr, pte); +unlock: + pte_unmap_unlock(ptep, ptl); +out: + return SWAP_AGAIN; } -EXPORT_SYMBOL(fail_migrate_page); /* - * swapout a single page - * page is locked upon entry, unlocked on exit + * Get rid of all migration entries and replace them by + * references to the indicated page. */ -static int swap_page(struct page *page) +static void remove_migration_ptes(struct page *old, struct page *new) { - struct address_space *mapping = page_mapping(page); - - if (page_mapped(page) && mapping) - if (try_to_unmap(page, 1) != SWAP_SUCCESS) - goto unlock_retry; - - if (PageDirty(page)) { - /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_retry; - - case PAGE_SUCCESS: - goto retry; + rmap_walk(new, remove_migration_pte, old); +} - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } +/* + * Something used the pte of a page under migration. We need to + * get to the page and wait until migration is finished. + * When we return from this function the fault will be retried. + * + * This function is called from do_swap_page(). + */ +void migration_entry_wait(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) +{ + pte_t *ptep, pte; + spinlock_t *ptl; + swp_entry_t entry; + struct page *page; - if (PagePrivate(page)) { - if (!try_to_release_page(page, GFP_KERNEL) || - (!mapping && page_count(page) == 1)) - goto unlock_retry; - } + ptep = pte_offset_map_lock(mm, pmd, address, &ptl); + pte = *ptep; + if (!is_swap_pte(pte)) + goto out; - if (remove_mapping(mapping, page)) { - /* Success */ - unlock_page(page); - return 0; - } + entry = pte_to_swp_entry(pte); + if (!is_migration_entry(entry)) + goto out; -unlock_retry: - unlock_page(page); + page = migration_entry_to_page(entry); -retry: - return -EAGAIN; + /* + * Once radix-tree replacement of page migration started, page_count + * *must* be zero. And, we don't want to call wait_on_page_locked() + * against a page without get_page(). + * So, we use get_page_unless_zero(), here. Even failed, page fault + * will occur again. + */ + if (!get_page_unless_zero(page)) + goto out; + pte_unmap_unlock(ptep, ptl); + wait_on_page_locked(page); + put_page(page); + return; +out: + pte_unmap_unlock(ptep, ptl); } -EXPORT_SYMBOL(swap_page); /* - * Remove references for a page and establish the new page with the correct - * basic settings to be able to stop accesses to the page. + * Replace the page in the mapping. + * + * The number of remaining references must be: + * 1 for anonymous pages without a mapping + * 2 for pages with a mapping + * 3 for pages with a mapping and PagePrivate/PagePrivate2 set. */ -int migrate_page_remove_references(struct page *newpage, - struct page *page, int nr_refs) +static int migrate_page_move_mapping(struct address_space *mapping, + struct page *newpage, struct page *page) { - struct address_space *mapping = page_mapping(page); - struct page **radix_pointer; + int expected_count; + void **pslot; - /* - * Avoid doing any of the following work if the page count - * indicates that the page is in use or truncate has removed - * the page. - */ - if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) - return -EAGAIN; - - /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. - * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. - */ - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> permanent failure */ - return -EPERM; + if (!mapping) { + /* Anonymous page without mapping */ + if (page_count(page) != 1) + return -EAGAIN; + return 0; + } - /* - * Give up if we were unable to remove all mappings. - */ - if (page_mapcount(page)) - return -EAGAIN; + spin_lock_irq(&mapping->tree_lock); - write_lock_irq(&mapping->tree_lock); + pslot = radix_tree_lookup_slot(&mapping->page_tree, + page_index(page)); - radix_pointer = (struct page **)radix_tree_lookup_slot( - &mapping->page_tree, - page_index(page)); + expected_count = 2 + page_has_private(page); + if (page_count(page) != expected_count || + (struct page *)radix_tree_deref_slot(pslot) != page) { + spin_unlock_irq(&mapping->tree_lock); + return -EAGAIN; + } - if (!page_mapping(page) || page_count(page) != nr_refs || - *radix_pointer != page) { - write_unlock_irq(&mapping->tree_lock); + if (!page_freeze_refs(page, expected_count)) { + spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } /* * Now we know that no one else is looking at the page. - * - * Certain minimal information about a page must be available - * in order for other subsystems to properly handle the page if they - * find it through the radix tree update before we are finished - * copying the page. */ - get_page(newpage); - newpage->index = page->index; - newpage->mapping = page->mapping; + get_page(newpage); /* add cache reference */ if (PageSwapCache(page)) { SetPageSwapCache(newpage); set_page_private(newpage, page_private(page)); } - *radix_pointer = newpage; + radix_tree_replace_slot(pslot, newpage); + + page_unfreeze_refs(page, expected_count); + /* + * Drop cache reference from old page. + * We know this isn't the last reference. + */ __put_page(page); - write_unlock_irq(&mapping->tree_lock); + + /* + * If moved to a different zone then also account + * the page for that zone. Other VM counters will be + * taken care of when we establish references to the + * new page and drop references to the old page. + * + * Note that anonymous pages are accounted for + * via NR_FILE_PAGES and NR_ANON_PAGES if they + * are mapped to swap space. + */ + __dec_zone_page_state(page, NR_FILE_PAGES); + __inc_zone_page_state(newpage, NR_FILE_PAGES); + if (PageSwapBacked(page)) { + __dec_zone_page_state(page, NR_SHMEM); + __inc_zone_page_state(newpage, NR_SHMEM); + } + spin_unlock_irq(&mapping->tree_lock); return 0; } -EXPORT_SYMBOL(migrate_page_remove_references); /* * Copy the page to its new location */ -void migrate_page_copy(struct page *newpage, struct page *page) +static void migrate_page_copy(struct page *newpage, struct page *page) { + int anon; + copy_highpage(newpage, page); if (PageError(page)) @@ -271,8 +285,11 @@ void migrate_page_copy(struct page *newpage, struct page *page) SetPageReferenced(newpage); if (PageUptodate(page)) SetPageUptodate(newpage); - if (PageActive(page)) + if (TestClearPageActive(page)) { + VM_BUG_ON(PageUnevictable(page)); SetPageActive(newpage); + } else if (TestClearPageUnevictable(page)) + SetPageUnevictable(newpage); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -280,13 +297,24 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageDirty(page)) { clear_page_dirty_for_io(page); - set_page_dirty(newpage); + /* + * Want to mark the page and the radix tree as dirty, and + * redo the accounting that clear_page_dirty_for_io undid, + * but we can't use set_page_dirty because that function + * is actually a signal that all of the page has become dirty. + * Wheras only part of our page may be dirty. + */ + __set_page_dirty_nobuffers(newpage); } + mlock_migrate_page(newpage, page); + ksm_migrate_page(newpage, page); + ClearPageSwapCache(page); - ClearPageActive(page); ClearPagePrivate(page); set_page_private(page, 0); + /* page->mapping contains a flag for PageAnon() */ + anon = PageAnon(page); page->mapping = NULL; /* @@ -296,59 +324,384 @@ void migrate_page_copy(struct page *newpage, struct page *page) if (PageWriteback(newpage)) end_page_writeback(newpage); } -EXPORT_SYMBOL(migrate_page_copy); + +/************************************************************ + * Migration functions + ***********************************************************/ + +/* Always fail migration. Used for mappings that are not movable */ +int fail_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + return -EIO; +} +EXPORT_SYMBOL(fail_migrate_page); /* * Common logic to directly migrate a single page suitable for - * pages that do not use PagePrivate. + * pages that do not use PagePrivate/PagePrivate2. * * Pages are locked upon entry and exit. */ -int migrate_page(struct page *newpage, struct page *page) +int migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) { int rc; BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - rc = migrate_page_remove_references(newpage, page, 2); + rc = migrate_page_move_mapping(mapping, newpage, page); if (rc) return rc; migrate_page_copy(newpage, page); + return 0; +} +EXPORT_SYMBOL(migrate_page); + +#ifdef CONFIG_BLOCK +/* + * Migration function for pages with buffers. This function can only be used + * if the underlying filesystem guarantees that no other references to "page" + * exist. + */ +int buffer_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + struct buffer_head *bh, *head; + int rc; + + if (!page_has_buffers(page)) + return migrate_page(mapping, newpage, page); + + head = page_buffers(page); + + rc = migrate_page_move_mapping(mapping, newpage, page); + + if (rc) + return rc; + + bh = head; + do { + get_bh(bh); + lock_buffer(bh); + bh = bh->b_this_page; + + } while (bh != head); + + ClearPagePrivate(page); + set_page_private(newpage, page_private(page)); + set_page_private(page, 0); + put_page(page); + get_page(newpage); + + bh = head; + do { + set_bh_page(bh, newpage, bh_offset(bh)); + bh = bh->b_this_page; + + } while (bh != head); + + SetPagePrivate(newpage); + + migrate_page_copy(newpage, page); + + bh = head; + do { + unlock_buffer(bh); + put_bh(bh); + bh = bh->b_this_page; + + } while (bh != head); + + return 0; +} +EXPORT_SYMBOL(buffer_migrate_page); +#endif + +/* + * Writeback a page to clean the dirty state + */ +static int writeout(struct address_space *mapping, struct page *page) +{ + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + .nr_to_write = 1, + .range_start = 0, + .range_end = LLONG_MAX, + .nonblocking = 1, + .for_reclaim = 1 + }; + int rc; + + if (!mapping->a_ops->writepage) + /* No write method for the address space */ + return -EINVAL; + + if (!clear_page_dirty_for_io(page)) + /* Someone else already triggered a write */ + return -EAGAIN; /* - * Remove auxiliary swap entries and replace - * them with real ptes. - * - * Note that a real pte entry will allow processes that are not - * waiting on the page lock to use the new page via the page tables - * before the new page is unlocked. + * A dirty page may imply that the underlying filesystem has + * the page on some queue. So the page must be clean for + * migration. Writeout may mean we loose the lock and the + * page state is no longer what we checked for earlier. + * At this point we know that the migration attempt cannot + * be successful. */ - remove_from_swap(newpage); - return 0; + remove_migration_ptes(page, page); + + rc = mapping->a_ops->writepage(page, &wbc); + + if (rc != AOP_WRITEPAGE_ACTIVATE) + /* unlocked. Relock */ + lock_page(page); + + return (rc < 0) ? -EIO : -EAGAIN; +} + +/* + * Default handling if a filesystem does not provide a migration function. + */ +static int fallback_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page) +{ + if (PageDirty(page)) + return writeout(mapping, page); + + /* + * Buffers may be managed in a filesystem specific way. + * We must have no buffers or drop them. + */ + if (page_has_private(page) && + !try_to_release_page(page, GFP_KERNEL)) + return -EAGAIN; + + return migrate_page(mapping, newpage, page); +} + +/* + * Move a page to a newly allocated page + * The page is locked and all ptes have been successfully removed. + * + * The new page will have replaced the old page if this function + * is successful. + * + * Return value: + * < 0 - error code + * == 0 - success + */ +static int move_to_new_page(struct page *newpage, struct page *page) +{ + struct address_space *mapping; + int rc; + + /* + * Block others from accessing the page when we get around to + * establishing additional references. We are the only one + * holding a reference to the new page at this point. + */ + if (!trylock_page(newpage)) + BUG(); + + /* Prepare mapping for the new page.*/ + newpage->index = page->index; + newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); + + mapping = page_mapping(page); + if (!mapping) + rc = migrate_page(mapping, newpage, page); + else if (mapping->a_ops->migratepage) + /* + * Most pages have a mapping and most filesystems + * should provide a migration function. Anonymous + * pages are part of swap space which also has its + * own migration function. This is the most common + * path for page migration. + */ + rc = mapping->a_ops->migratepage(mapping, + newpage, page); + else + rc = fallback_migrate_page(mapping, newpage, page); + + if (!rc) + remove_migration_ptes(page, newpage); + else + newpage->mapping = NULL; + + unlock_page(newpage); + + return rc; +} + +/* + * Obtain the lock on page, remove all ptes and migrate the page + * to the newly allocated page in newpage. + */ +static int unmap_and_move(new_page_t get_new_page, unsigned long private, + struct page *page, int force, int offlining) +{ + int rc = 0; + int *result = NULL; + struct page *newpage = get_new_page(page, private, &result); + int rcu_locked = 0; + int charge = 0; + struct mem_cgroup *mem = NULL; + + if (!newpage) + return -ENOMEM; + + if (page_count(page) == 1) { + /* page was freed from under us. So we are done. */ + goto move_newpage; + } + + /* prepare cgroup just returns 0 or -ENOMEM */ + rc = -EAGAIN; + + if (!trylock_page(page)) { + if (!force) + goto move_newpage; + lock_page(page); + } + + /* + * Only memory hotplug's offline_pages() caller has locked out KSM, + * and can safely migrate a KSM page. The other cases have skipped + * PageKsm along with PageReserved - but it is only now when we have + * the page lock that we can be certain it will not go KSM beneath us + * (KSM will not upgrade a page from PageAnon to PageKsm when it sees + * its pagecount raised, but only here do we take the page lock which + * serializes that). + */ + if (PageKsm(page) && !offlining) { + rc = -EBUSY; + goto unlock; + } + + /* charge against new page */ + charge = mem_cgroup_prepare_migration(page, &mem); + if (charge == -ENOMEM) { + rc = -ENOMEM; + goto unlock; + } + BUG_ON(charge); + + if (PageWriteback(page)) { + if (!force) + goto uncharge; + wait_on_page_writeback(page); + } + /* + * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, + * we cannot notice that anon_vma is freed while we migrates a page. + * This rcu_read_lock() delays freeing anon_vma pointer until the end + * of migration. File cache pages are no problem because of page_lock() + * File Caches may use write_page() or lock_page() in migration, then, + * just care Anon page here. + */ + if (PageAnon(page)) { + rcu_read_lock(); + rcu_locked = 1; + } + + /* + * Corner case handling: + * 1. When a new swap-cache page is read into, it is added to the LRU + * and treated as swapcache but it has no rmap yet. + * Calling try_to_unmap() against a page->mapping==NULL page will + * trigger a BUG. So handle it here. + * 2. An orphaned page (see truncate_complete_page) might have + * fs-private metadata. The page can be picked up due to memory + * offlining. Everywhere else except page reclaim, the page is + * invisible to the vm, so the page can not be migrated. So try to + * free the metadata, so the page can be freed. + */ + if (!page->mapping) { + if (!PageAnon(page) && page_has_private(page)) { + /* + * Go direct to try_to_free_buffers() here because + * a) that's what try_to_release_page() would do anyway + * b) we may be under rcu_read_lock() here, so we can't + * use GFP_KERNEL which is what try_to_release_page() + * needs to be effective. + */ + try_to_free_buffers(page); + goto rcu_unlock; + } + goto skip_unmap; + } + + /* Establish migration ptes or remove ptes */ + try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS); + +skip_unmap: + if (!page_mapped(page)) + rc = move_to_new_page(newpage, page); + + if (rc) + remove_migration_ptes(page, page); +rcu_unlock: + if (rcu_locked) + rcu_read_unlock(); +uncharge: + if (!charge) + mem_cgroup_end_migration(mem, page, newpage); +unlock: + unlock_page(page); + + if (rc != -EAGAIN) { + /* + * A page that has been migrated has all references + * removed and will be freed. A page that has not been + * migrated will have kepts its references and be + * restored. + */ + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + putback_lru_page(page); + } + +move_newpage: + + /* + * Move the new page to the LRU. If migration was not successful + * then this will free the page. + */ + putback_lru_page(newpage); + + if (result) { + if (rc) + *result = rc; + else + *result = page_to_nid(newpage); + } + return rc; } -EXPORT_SYMBOL(migrate_page); /* * migrate_pages * - * Two lists are passed to this function. The first list - * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the pages isolated - * can be moved to. If the second list is NULL then all - * pages are swapped out. + * The function takes one list of pages to migrate and a function + * that determines from the page to be migrated and the private data + * the target of the move and allocates the page. * * The function returns after 10 attempts or if no pages * are movable anymore because to has become empty - * or no retryable pages exist anymore. + * or no retryable pages exist anymore. All pages will be + * returned to the LRU or freed. * - * Return: Number of pages not migrated when "to" ran empty. + * Return: Number of pages not migrated or error code. */ -int migrate_pages(struct list_head *from, struct list_head *to, - struct list_head *moved, struct list_head *failed) +int migrate_pages(struct list_head *from, + new_page_t get_new_page, unsigned long private, int offlining) { - int retry; + int retry = 1; int nr_failed = 0; int pass = 0; struct page *page; @@ -359,294 +712,407 @@ int migrate_pages(struct list_head *from, struct list_head *to, if (!swapwrite) current->flags |= PF_SWAPWRITE; -redo: - retry = 0; + for(pass = 0; pass < 10 && retry; pass++) { + retry = 0; + + list_for_each_entry_safe(page, page2, from, lru) { + cond_resched(); + + rc = unmap_and_move(get_new_page, private, + page, pass > 2, offlining); + + switch(rc) { + case -ENOMEM: + goto out; + case -EAGAIN: + retry++; + break; + case 0: + break; + default: + /* Permanent failure */ + nr_failed++; + break; + } + } + } + rc = 0; +out: + if (!swapwrite) + current->flags &= ~PF_SWAPWRITE; - list_for_each_entry_safe(page, page2, from, lru) { - struct page *newpage = NULL; - struct address_space *mapping; + putback_lru_pages(from); - cond_resched(); + if (rc) + return rc; - rc = 0; - if (page_count(page) == 1) - /* page was freed from under us. So we are done. */ - goto next; + return nr_failed + retry; +} - if (to && list_empty(to)) - break; +#ifdef CONFIG_NUMA +/* + * Move a list of individual pages + */ +struct page_to_node { + unsigned long addr; + struct page *page; + int node; + int status; +}; - /* - * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we - * use lock_page() to have a higher chance of acquiring the - * lock. - */ - rc = -EAGAIN; - if (pass > 2) - lock_page(page); - else - if (TestSetPageLocked(page)) - goto next; +static struct page *new_page_node(struct page *p, unsigned long private, + int **result) +{ + struct page_to_node *pm = (struct page_to_node *)private; - /* - * Only wait on writeback if we have already done a pass where - * we we may have triggered writeouts for lots of pages. - */ - if (pass > 0) { - wait_on_page_writeback(page); - } else { - if (PageWriteback(page)) - goto unlock_page; - } + while (pm->node != MAX_NUMNODES && pm->page != p) + pm++; - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. - */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } + if (pm->node == MAX_NUMNODES) + return NULL; - if (!to) { - rc = swap_page(page); - goto next; - } + *result = &pm->status; - newpage = lru_to_page(to); - lock_page(newpage); + return alloc_pages_exact_node(pm->node, + GFP_HIGHUSER_MOVABLE | GFP_THISNODE, 0); +} - /* - * Pages are properly locked and writeback is complete. - * Try to migrate the page. - */ - mapping = page_mapping(page); - if (!mapping) - goto unlock_both; +/* + * Move a set of pages as indicated in the pm array. The addr + * field must be set to the virtual address of the page to be moved + * and the node number must contain a valid target node. + * The pm array ends with node = MAX_NUMNODES. + */ +static int do_move_page_to_node_array(struct mm_struct *mm, + struct page_to_node *pm, + int migrate_all) +{ + int err; + struct page_to_node *pp; + LIST_HEAD(pagelist); - if (mapping->a_ops->migratepage) { - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(newpage, page); - goto unlock_both; - } + down_read(&mm->mmap_sem); - /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. - */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_both; - - case PAGE_SUCCESS: - unlock_page(newpage); - goto next; - - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } + /* + * Build a list of pages to migrate + */ + for (pp = pm; pp->node != MAX_NUMNODES; pp++) { + struct vm_area_struct *vma; + struct page *page; - /* - * Buffers are managed in a filesystem specific way. - * We must have no buffers or drop them. - */ - if (!page_has_buffers(page) || - try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(newpage, page); - goto unlock_both; - } + err = -EFAULT; + vma = find_vma(mm, pp->addr); + if (!vma || !vma_migratable(vma)) + goto set_status; - /* - * On early passes with mapped pages simply - * retry. There may be a lock held for some - * buffers that may go away. Later - * swap them out. - */ - if (pass > 4) { + page = follow_page(vma, pp->addr, FOLL_GET); + + err = PTR_ERR(page); + if (IS_ERR(page)) + goto set_status; + + err = -ENOENT; + if (!page) + goto set_status; + + /* Use PageReserved to check for zero page */ + if (PageReserved(page) || PageKsm(page)) + goto put_and_set; + + pp->page = page; + err = page_to_nid(page); + + if (err == pp->node) /* - * Persistently unable to drop buffers..... As a - * measure of last resort we fall back to - * swap_page(). + * Node already in the right place */ - unlock_page(newpage); - newpage = NULL; - rc = swap_page(page); - goto next; - } - -unlock_both: - unlock_page(newpage); - -unlock_page: - unlock_page(page); - -next: - if (rc == -EAGAIN) { - retry++; - } else if (rc) { - /* Permanent failure */ - list_move(&page->lru, failed); - nr_failed++; - } else { - if (newpage) { - /* Successful migration. Return page to LRU */ - move_to_lru(newpage); - } - list_move(&page->lru, moved); + goto put_and_set; + + err = -EACCES; + if (page_mapcount(page) > 1 && + !migrate_all) + goto put_and_set; + + err = isolate_lru_page(page); + if (!err) { + list_add_tail(&page->lru, &pagelist); + inc_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); } +put_and_set: + /* + * Either remove the duplicate refcount from + * isolate_lru_page() or drop the page ref if it was + * not isolated. + */ + put_page(page); +set_status: + pp->status = err; } - if (retry && pass++ < 10) - goto redo; - if (!swapwrite) - current->flags &= ~PF_SWAPWRITE; + err = 0; + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_page_node, + (unsigned long)pm, 0); - return nr_failed + retry; + up_read(&mm->mmap_sem); + return err; } /* - * Migration function for pages with buffers. This function can only be used - * if the underlying filesystem guarantees that no other references to "page" - * exist. + * Migrate an array of page address onto an array of nodes and fill + * the corresponding array of status. */ -int buffer_migrate_page(struct page *newpage, struct page *page) +static int do_pages_move(struct mm_struct *mm, struct task_struct *task, + unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) { - struct address_space *mapping = page->mapping; - struct buffer_head *bh, *head; - int rc; + struct page_to_node *pm; + nodemask_t task_nodes; + unsigned long chunk_nr_pages; + unsigned long chunk_start; + int err; - if (!mapping) - return -EAGAIN; + task_nodes = cpuset_mems_allowed(task); - if (!page_has_buffers(page)) - return migrate_page(newpage, page); + err = -ENOMEM; + pm = (struct page_to_node *)__get_free_page(GFP_KERNEL); + if (!pm) + goto out; - head = page_buffers(page); + migrate_prep(); - rc = migrate_page_remove_references(newpage, page, 3); + /* + * Store a chunk of page_to_node array in a page, + * but keep the last one as a marker + */ + chunk_nr_pages = (PAGE_SIZE / sizeof(struct page_to_node)) - 1; - if (rc) - return rc; + for (chunk_start = 0; + chunk_start < nr_pages; + chunk_start += chunk_nr_pages) { + int j; - bh = head; - do { - get_bh(bh); - lock_buffer(bh); - bh = bh->b_this_page; + if (chunk_start + chunk_nr_pages > nr_pages) + chunk_nr_pages = nr_pages - chunk_start; - } while (bh != head); + /* fill the chunk pm with addrs and nodes from user-space */ + for (j = 0; j < chunk_nr_pages; j++) { + const void __user *p; + int node; - ClearPagePrivate(page); - set_page_private(newpage, page_private(page)); - set_page_private(page, 0); - put_page(page); - get_page(newpage); + err = -EFAULT; + if (get_user(p, pages + j + chunk_start)) + goto out_pm; + pm[j].addr = (unsigned long) p; - bh = head; - do { - set_bh_page(bh, newpage, bh_offset(bh)); - bh = bh->b_this_page; + if (get_user(node, nodes + j + chunk_start)) + goto out_pm; - } while (bh != head); + err = -ENODEV; + if (!node_state(node, N_HIGH_MEMORY)) + goto out_pm; - SetPagePrivate(newpage); + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out_pm; - migrate_page_copy(newpage, page); + pm[j].node = node; + } - bh = head; - do { - unlock_buffer(bh); - put_bh(bh); - bh = bh->b_this_page; + /* End marker for this chunk */ + pm[chunk_nr_pages].node = MAX_NUMNODES; - } while (bh != head); + /* Migrate this chunk */ + err = do_move_page_to_node_array(mm, pm, + flags & MPOL_MF_MOVE_ALL); + if (err < 0) + goto out_pm; - return 0; + /* Return status information */ + for (j = 0; j < chunk_nr_pages; j++) + if (put_user(pm[j].status, status + j + chunk_start)) { + err = -EFAULT; + goto out_pm; + } + } + err = 0; + +out_pm: + free_page((unsigned long)pm); +out: + return err; } -EXPORT_SYMBOL(buffer_migrate_page); /* - * Migrate the list 'pagelist' of pages to a certain destination. - * - * Specify destination with either non-NULL vma or dest_node >= 0 - * Return the number of pages not migrated or error code + * Determine the nodes of an array of pages and store it in an array of status. */ -int migrate_pages_to(struct list_head *pagelist, - struct vm_area_struct *vma, int dest) +static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, + const void __user **pages, int *status) { - LIST_HEAD(newlist); - LIST_HEAD(moved); - LIST_HEAD(failed); - int err = 0; - unsigned long offset = 0; - int nr_pages; - struct page *page; - struct list_head *p; + unsigned long i; -redo: - nr_pages = 0; - list_for_each(p, pagelist) { - if (vma) { - /* - * The address passed to alloc_page_vma is used to - * generate the proper interleave behavior. We fake - * the address here by an increasing offset in order - * to get the proper distribution of pages. - * - * No decision has been made as to which page - * a certain old page is moved to so we cannot - * specify the correct address. - */ - page = alloc_page_vma(GFP_HIGHUSER, vma, - offset + vma->vm_start); - offset += PAGE_SIZE; + down_read(&mm->mmap_sem); + + for (i = 0; i < nr_pages; i++) { + unsigned long addr = (unsigned long)(*pages); + struct vm_area_struct *vma; + struct page *page; + int err = -EFAULT; + + vma = find_vma(mm, addr); + if (!vma) + goto set_status; + + page = follow_page(vma, addr, 0); + + err = PTR_ERR(page); + if (IS_ERR(page)) + goto set_status; + + err = -ENOENT; + /* Use PageReserved to check for zero page */ + if (!page || PageReserved(page) || PageKsm(page)) + goto set_status; + + err = page_to_nid(page); +set_status: + *status = err; + + pages++; + status++; + } + + up_read(&mm->mmap_sem); +} + +/* + * Determine the nodes of a user array of pages and store it in + * a user array of status. + */ +static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, + const void __user * __user *pages, + int __user *status) +{ +#define DO_PAGES_STAT_CHUNK_NR 16 + const void __user *chunk_pages[DO_PAGES_STAT_CHUNK_NR]; + int chunk_status[DO_PAGES_STAT_CHUNK_NR]; + unsigned long i, chunk_nr = DO_PAGES_STAT_CHUNK_NR; + int err; + + for (i = 0; i < nr_pages; i += chunk_nr) { + if (chunk_nr > nr_pages - i) + chunk_nr = nr_pages - i; + + err = copy_from_user(chunk_pages, &pages[i], + chunk_nr * sizeof(*chunk_pages)); + if (err) { + err = -EFAULT; + goto out; } - else - page = alloc_pages_node(dest, GFP_HIGHUSER, 0); - if (!page) { - err = -ENOMEM; + do_pages_stat_array(mm, chunk_nr, chunk_pages, chunk_status); + + err = copy_to_user(&status[i], chunk_status, + chunk_nr * sizeof(*chunk_status)); + if (err) { + err = -EFAULT; goto out; } - list_add_tail(&page->lru, &newlist); - nr_pages++; - if (nr_pages > MIGRATE_CHUNK_SIZE) - break; } - err = migrate_pages(pagelist, &newlist, &moved, &failed); - - putback_lru_pages(&moved); /* Call release pages instead ?? */ + err = 0; - if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist)) - goto redo; out: - /* Return leftover allocated pages */ - while (!list_empty(&newlist)) { - page = list_entry(newlist.next, struct page, lru); - list_del(&page->lru); - __free_page(page); + return err; +} + +/* + * Move a list of pages in the address space of the currently executing + * process. + */ +SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, + const void __user * __user *, pages, + const int __user *, nodes, + int __user *, status, int, flags) +{ + const struct cred *cred = current_cred(), *tcred; + struct task_struct *task; + struct mm_struct *mm; + int err; + + /* Check flags */ + if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) + return -EINVAL; + + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_vpid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; } - list_splice(&failed, pagelist); - if (err < 0) - return err; - - /* Calculate number of leftover pages */ - nr_pages = 0; - list_for_each(p, pagelist) - nr_pages++; - return nr_pages; + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser privileges or the same + * userid as the target process. + */ + rcu_read_lock(); + tcred = __task_cred(task); + if (cred->euid != tcred->suid && cred->euid != tcred->uid && + cred->uid != tcred->suid && cred->uid != tcred->uid && + !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + err = -EPERM; + goto out; + } + rcu_read_unlock(); + + err = security_task_movememory(task); + if (err) + goto out; + + if (nodes) { + err = do_pages_move(mm, task, nr_pages, pages, nodes, status, + flags); + } else { + err = do_pages_stat(mm, nr_pages, pages, status); + } + +out: + mmput(mm); + return err; +} + +/* + * Call migration functions in the vma_ops that may prepare + * memory in a vm for migration. migration functions may perform + * the migration for vmas that do not have an underlying page struct. + */ +int migrate_vmas(struct mm_struct *mm, const nodemask_t *to, + const nodemask_t *from, unsigned long flags) +{ + struct vm_area_struct *vma; + int err = 0; + + for (vma = mm->mmap; vma && !err; vma = vma->vm_next) { + if (vma->vm_ops && vma->vm_ops->migrate) { + err = vma->vm_ops->migrate(vma, to, from, flags); + if (err) + break; + } + } + return err; } +#endif