+
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ */
+int zone_reclaim_mode __read_mostly;
+
+#define RECLAIM_OFF 0
+#define RECLAIM_ZONE (1<<0) /* Run shrink_inactive_list on the zone */
+#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
+#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
+
+/*
+ * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * of a node considered for each zone_reclaim. 4 scans 1/16th of
+ * a zone.
+ */
+#define ZONE_RECLAIM_PRIORITY 4
+
+/*
+ * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * occur.
+ */
+int sysctl_min_unmapped_ratio = 1;
+
+/*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
+
+static inline unsigned long zone_unmapped_file_pages(struct zone *zone)
+{
+ unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED);
+ unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) +
+ zone_page_state(zone, NR_ACTIVE_FILE);
+
+ /*
+ * It's possible for there to be more file mapped pages than
+ * accounted for by the pages on the file LRU lists because
+ * tmpfs pages accounted for as ANON can also be FILE_MAPPED
+ */
+ return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0;
+}
+
+/* Work out how many page cache pages we can reclaim in this reclaim_mode */
+static long zone_pagecache_reclaimable(struct zone *zone)
+{
+ long nr_pagecache_reclaimable;
+ long delta = 0;
+
+ /*
+ * If RECLAIM_SWAP is set, then all file pages are considered
+ * potentially reclaimable. Otherwise, we have to worry about
+ * pages like swapcache and zone_unmapped_file_pages() provides
+ * a better estimate
+ */
+ if (zone_reclaim_mode & RECLAIM_SWAP)
+ nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES);
+ else
+ nr_pagecache_reclaimable = zone_unmapped_file_pages(zone);
+
+ /* If we can't clean pages, remove dirty pages from consideration */
+ if (!(zone_reclaim_mode & RECLAIM_WRITE))
+ delta += zone_page_state(zone, NR_FILE_DIRTY);
+
+ /* Watch for any possible underflows due to delta */
+ if (unlikely(delta > nr_pagecache_reclaimable))
+ delta = nr_pagecache_reclaimable;
+
+ return nr_pagecache_reclaimable - delta;
+}
+
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+ /* Minimum pages needed in order to stay on node */
+ const unsigned long nr_pages = 1 << order;
+ struct task_struct *p = current;
+ struct reclaim_state reclaim_state;
+ int priority;
+ struct scan_control sc = {
+ .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+ .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+ .may_swap = 1,
+ .nr_to_reclaim = max_t(unsigned long, nr_pages,
+ SWAP_CLUSTER_MAX),
+ .gfp_mask = gfp_mask,
+ .swappiness = vm_swappiness,
+ .order = order,
+ .isolate_pages = isolate_pages_global,
+ };
+ unsigned long slab_reclaimable;
+
+ disable_swap_token();
+ cond_resched();
+ /*
+ * We need to be able to allocate from the reserves for RECLAIM_SWAP
+ * and we also need to be able to write out pages for RECLAIM_WRITE
+ * and RECLAIM_SWAP.
+ */
+ p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+ lockdep_set_current_reclaim_state(gfp_mask);
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
+
+ if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) {
+ /*
+ * Free memory by calling shrink zone with increasing
+ * priorities until we have enough memory freed.
+ */
+ priority = ZONE_RECLAIM_PRIORITY;
+ do {
+ note_zone_scanning_priority(zone, priority);
+ shrink_zone(priority, zone, &sc);
+ priority--;
+ } while (priority >= 0 && sc.nr_reclaimed < nr_pages);
+ }
+
+ slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (slab_reclaimable > zone->min_slab_pages) {
+ /*
+ * shrink_slab() does not currently allow us to determine how
+ * many pages were freed in this zone. So we take the current
+ * number of slab pages and shake the slab until it is reduced
+ * by the same nr_pages that we used for reclaiming unmapped
+ * pages.
+ *
+ * Note that shrink_slab will free memory on all zones and may
+ * take a long time.
+ */
+ while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
+ slab_reclaimable - nr_pages)
+ ;
+
+ /*
+ * Update nr_reclaimed by the number of slab pages we
+ * reclaimed from this zone.
+ */
+ sc.nr_reclaimed += slab_reclaimable -
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ }
+
+ p->reclaim_state = NULL;
+ current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+ lockdep_clear_current_reclaim_state();
+ return sc.nr_reclaimed >= nr_pages;
+}
+
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+{
+ int node_id;
+ int ret;
+
+ /*
+ * Zone reclaim reclaims unmapped file backed pages and
+ * slab pages if we are over the defined limits.
+ *
+ * A small portion of unmapped file backed pages is needed for
+ * file I/O otherwise pages read by file I/O will be immediately
+ * thrown out if the zone is overallocated. So we do not reclaim
+ * if less than a specified percentage of the zone is used by
+ * unmapped file backed pages.
+ */
+ if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages &&
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
+ return ZONE_RECLAIM_FULL;
+
+ if (zone->all_unreclaimable)
+ return ZONE_RECLAIM_FULL;
+
+ /*
+ * Do not scan if the allocation should not be delayed.
+ */
+ if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
+ return ZONE_RECLAIM_NOSCAN;
+
+ /*
+ * Only run zone reclaim on the local zone or on zones that do not
+ * have associated processors. This will favor the local processor
+ * over remote processors and spread off node memory allocations
+ * as wide as possible.
+ */
+ node_id = zone_to_nid(zone);
+ if (node_state(node_id, N_CPU) && node_id != numa_node_id())
+ return ZONE_RECLAIM_NOSCAN;
+
+ if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
+ return ZONE_RECLAIM_NOSCAN;
+
+ ret = __zone_reclaim(zone, gfp_mask, order);
+ zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
+
+ if (!ret)
+ count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED);
+
+ return ret;
+}
+#endif
+
+/*
+ * page_evictable - test whether a page is evictable
+ * @page: the page to test
+ * @vma: the VMA in which the page is or will be mapped, may be NULL
+ *
+ * Test whether page is evictable--i.e., should be placed on active/inactive
+ * lists vs unevictable list. The vma argument is !NULL when called from the
+ * fault path to determine how to instantate a new page.
+ *
+ * Reasons page might not be evictable:
+ * (1) page's mapping marked unevictable
+ * (2) page is part of an mlocked VMA
+ *
+ */
+int page_evictable(struct page *page, struct vm_area_struct *vma)
+{
+
+ if (mapping_unevictable(page_mapping(page)))
+ return 0;
+
+ if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page)))
+ return 0;
+
+ return 1;
+}
+
+/**
+ * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list
+ * @page: page to check evictability and move to appropriate lru list
+ * @zone: zone page is in
+ *
+ * Checks a page for evictability and moves the page to the appropriate
+ * zone lru list.
+ *
+ * Restrictions: zone->lru_lock must be held, page must be on LRU and must
+ * have PageUnevictable set.
+ */
+static void check_move_unevictable_page(struct page *page, struct zone *zone)
+{
+ VM_BUG_ON(PageActive(page));
+
+retry:
+ ClearPageUnevictable(page);
+ if (page_evictable(page, NULL)) {
+ enum lru_list l = page_lru_base_type(page);
+
+ __dec_zone_state(zone, NR_UNEVICTABLE);
+ list_move(&page->lru, &zone->lru[l].list);
+ mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l);
+ __inc_zone_state(zone, NR_INACTIVE_ANON + l);
+ __count_vm_event(UNEVICTABLE_PGRESCUED);
+ } else {
+ /*
+ * rotate unevictable list
+ */
+ SetPageUnevictable(page);
+ list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list);
+ mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE);
+ if (page_evictable(page, NULL))
+ goto retry;
+ }
+}
+
+/**
+ * scan_mapping_unevictable_pages - scan an address space for evictable pages
+ * @mapping: struct address_space to scan for evictable pages
+ *
+ * Scan all pages in mapping. Check unevictable pages for
+ * evictability and move them to the appropriate zone lru list.
+ */
+void scan_mapping_unevictable_pages(struct address_space *mapping)
+{
+ pgoff_t next = 0;
+ pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >>
+ PAGE_CACHE_SHIFT;
+ struct zone *zone;
+ struct pagevec pvec;
+
+ if (mapping->nrpages == 0)
+ return;
+
+ pagevec_init(&pvec, 0);
+ while (next < end &&
+ pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
+ int i;
+ int pg_scanned = 0;
+
+ zone = NULL;
+
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ struct page *page = pvec.pages[i];
+ pgoff_t page_index = page->index;
+ struct zone *pagezone = page_zone(page);
+
+ pg_scanned++;
+ if (page_index > next)
+ next = page_index;
+ next++;
+
+ if (pagezone != zone) {
+ if (zone)
+ spin_unlock_irq(&zone->lru_lock);
+ zone = pagezone;
+ spin_lock_irq(&zone->lru_lock);
+ }
+
+ if (PageLRU(page) && PageUnevictable(page))
+ check_move_unevictable_page(page, zone);
+ }
+ if (zone)
+ spin_unlock_irq(&zone->lru_lock);
+ pagevec_release(&pvec);
+
+ count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned);
+ }
+
+}
+
+/**
+ * scan_zone_unevictable_pages - check unevictable list for evictable pages
+ * @zone - zone of which to scan the unevictable list
+ *
+ * Scan @zone's unevictable LRU lists to check for pages that have become
+ * evictable. Move those that have to @zone's inactive list where they
+ * become candidates for reclaim, unless shrink_inactive_zone() decides
+ * to reactivate them. Pages that are still unevictable are rotated
+ * back onto @zone's unevictable list.
+ */
+#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */
+static void scan_zone_unevictable_pages(struct zone *zone)
+{
+ struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list;
+ unsigned long scan;
+ unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE);
+
+ while (nr_to_scan > 0) {
+ unsigned long batch_size = min(nr_to_scan,
+ SCAN_UNEVICTABLE_BATCH_SIZE);
+
+ spin_lock_irq(&zone->lru_lock);
+ for (scan = 0; scan < batch_size; scan++) {
+ struct page *page = lru_to_page(l_unevictable);
+
+ if (!trylock_page(page))
+ continue;
+
+ prefetchw_prev_lru_page(page, l_unevictable, flags);
+
+ if (likely(PageLRU(page) && PageUnevictable(page)))
+ check_move_unevictable_page(page, zone);
+
+ unlock_page(page);
+ }
+ spin_unlock_irq(&zone->lru_lock);
+
+ nr_to_scan -= batch_size;
+ }
+}
+
+
+/**
+ * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages
+ *
+ * A really big hammer: scan all zones' unevictable LRU lists to check for
+ * pages that have become evictable. Move those back to the zones'
+ * inactive list where they become candidates for reclaim.
+ * This occurs when, e.g., we have unswappable pages on the unevictable lists,
+ * and we add swap to the system. As such, it runs in the context of a task
+ * that has possibly/probably made some previously unevictable pages
+ * evictable.
+ */
+static void scan_all_zones_unevictable_pages(void)
+{
+ struct zone *zone;
+
+ for_each_zone(zone) {
+ scan_zone_unevictable_pages(zone);
+ }
+}
+
+/*
+ * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of
+ * all nodes' unevictable lists for evictable pages
+ */
+unsigned long scan_unevictable_pages;
+
+int scan_unevictable_handler(struct ctl_table *table, int write,
+ void __user *buffer,
+ size_t *length, loff_t *ppos)
+{
+ proc_doulongvec_minmax(table, write, buffer, length, ppos);
+
+ if (write && *(unsigned long *)table->data)
+ scan_all_zones_unevictable_pages();
+
+ scan_unevictable_pages = 0;
+ return 0;
+}
+
+/*
+ * per node 'scan_unevictable_pages' attribute. On demand re-scan of
+ * a specified node's per zone unevictable lists for evictable pages.
+ */
+
+static ssize_t read_scan_unevictable_node(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "0\n"); /* always zero; should fit... */
+}
+
+static ssize_t write_scan_unevictable_node(struct sys_device *dev,
+ struct sysdev_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct zone *node_zones = NODE_DATA(dev->id)->node_zones;
+ struct zone *zone;
+ unsigned long res;
+ unsigned long req = strict_strtoul(buf, 10, &res);
+
+ if (!req)
+ return 1; /* zero is no-op */
+
+ for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
+ if (!populated_zone(zone))
+ continue;
+ scan_zone_unevictable_pages(zone);
+ }
+ return 1;
+}
+
+
+static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR,
+ read_scan_unevictable_node,
+ write_scan_unevictable_node);
+
+int scan_unevictable_register_node(struct node *node)
+{
+ return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages);
+}
+
+void scan_unevictable_unregister_node(struct node *node)
+{
+ sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages);
+}
+