[PATCH] core remove PageReserved

author Nick Piggin <nickpiggin@yahoo.com.au>

Sun, 30 Oct 2005 01:16:12 +0000 (18:16 -0700)

committer Linus Torvalds <torvalds@g5.osdl.org>

Sun, 30 Oct 2005 04:40:39 +0000 (21:40 -0700)
author Nick Piggin <nickpiggin@yahoo.com.au>
Sun, 30 Oct 2005 01:16:12 +0000 (18:16 -0700)
committer Linus Torvalds <torvalds@g5.osdl.org>
Sun, 30 Oct 2005 04:40:39 +0000 (21:40 -0700)
diff --git a/arch/ppc64/kernel/vdso.c b/arch/ppc64/kernel/vdso.c

index efa985f..4aacf52 100644 (file)
--- a/arch/ppc64/kernel/vdso.c
+++ b/arch/ppc64/kernel/vdso.c
@@ -176,13 +176,13 @@ static struct page * vdso_vma_nopage(struct vm_area_struct * vma,
                 return NOPAGE_SIGBUS;
  
         /*
-        * Last page is systemcfg, special handling here, no get_page() a
-        * this is a reserved page
+        * Last page is systemcfg.
          */
         if ((vma->vm_end - address) <= PAGE_SIZE)
-               return virt_to_page(systemcfg);
+               pg = virt_to_page(systemcfg);
+       else
+               pg = virt_to_page(vbase + offset);
  
-       pg = virt_to_page(vbase + offset);
         get_page(pg);
         DBG(" ->page count: %d\n", page_count(pg));
  
@@ -259,7 +259,7 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int executable_stack)
          * gettimeofday will be totally dead. It's fine to use that for setting
          * breakpoints in the vDSO code pages though
          */
-       vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
+       vma->vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC | VM_RESERVED;
         vma->vm_flags |= mm->def_flags;
         vma->vm_page_prot = protection_map[vma->vm_flags & 0x7];
         vma->vm_ops = &vdso_vmops;
@@ -603,6 +603,8 @@ void __init vdso_init(void)
                 ClearPageReserved(pg);
                 get_page(pg);
         }
+
+       get_page(virt_to_page(systemcfg));
  }
  
  int in_gate_area_no_task(unsigned long addr)
diff --git a/arch/sparc/mm/generic.c b/arch/sparc/mm/generic.c

index 20ccb95..659c9a7 100644 (file)
--- a/arch/sparc/mm/generic.c
+++ b/arch/sparc/mm/generic.c
@@ -73,6 +73,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
         int space = GET_IOSPACE(pfn);
         unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
  
+       /* See comment in mm/memory.c remap_pfn_range */
+       vma->vm_flags |= VM_IO | VM_RESERVED;
+
         prot = __pgprot(pg_iobits);
         offset -= from;
         dir = pgd_offset(mm, from);
diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c

index c954d91..afc01ce 100644 (file)
--- a/arch/sparc64/mm/generic.c
+++ b/arch/sparc64/mm/generic.c
@@ -127,6 +127,9 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from,
         int space = GET_IOSPACE(pfn);
         unsigned long offset = GET_PFN(pfn) << PAGE_SHIFT;
  
+       /* See comment in mm/memory.c remap_pfn_range */
+       vma->vm_flags |= VM_IO | VM_RESERVED;
+
         prot = __pgprot(pg_iobits);
         offset -= from;
         dir = pgd_offset(mm, from);
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c

index 861e513..2d30b46 100644 (file)
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1886,13 +1886,17 @@ st_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_pages,
         int i;
  
         for (i=0; i < nr_pages; i++) {
-               if (dirtied && !PageReserved(sgl[i].page))
-                       SetPageDirty(sgl[i].page);
-               /* unlock_page(sgl[i].page); */
+               struct page *page = sgl[i].page;
+
+               /* XXX: just for debug. Remove when PageReserved is removed */
+               BUG_ON(PageReserved(page));
+               if (dirtied)
+                       SetPageDirty(page);
+               /* unlock_page(page); */
                 /* FIXME: cache flush missing for rw==READ
                  * FIXME: call the correct reference counting function
                  */
-               page_cache_release(sgl[i].page);
+               page_cache_release(page);
         }
  
         return 0;
diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c

index 5eb54d8..da97662 100644 (file)
--- a/drivers/scsi/st.c
+++ b/drivers/scsi/st.c
@@ -4526,12 +4526,16 @@ static int sgl_unmap_user_pages(struct scatterlist *sgl, const unsigned int nr_p
         int i;
  
         for (i=0; i < nr_pages; i++) {
-               if (dirtied && !PageReserved(sgl[i].page))
-                       SetPageDirty(sgl[i].page);
+               struct page *page = sgl[i].page;
+
+               /* XXX: just for debug. Remove when PageReserved is removed */
+               BUG_ON(PageReserved(page));
+               if (dirtied)
+                       SetPageDirty(page);
                 /* FIXME: cache flush missing for rw==READ
                  * FIXME: call the correct reference counting function
                  */
-               page_cache_release(sgl[i].page);
+               page_cache_release(page);
         }
  
         return 0;
diff --git a/fs/direct-io.c b/fs/direct-io.c

index 0d06097..3931e7f 100644 (file)
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -162,6 +162,7 @@ static int dio_refill_pages(struct dio *dio)
         up_read(&current->mm->mmap_sem);
  
         if (ret < 0 && dio->blocks_available && (dio->rw == WRITE)) {
+               struct page *page = ZERO_PAGE(dio->curr_user_address);
                 /*
                  * A memory fault, but the filesystem has some outstanding
                  * mapped blocks.  We need to use those blocks up to avoid
@@ -169,7 +170,8 @@ static int dio_refill_pages(struct dio *dio)
                  */
                 if (dio->page_errors == 0)
                         dio->page_errors = ret;
-               dio->pages[0] = ZERO_PAGE(dio->curr_user_address);
+               page_cache_get(page);
+               dio->pages[0] = page;
                 dio->head = 0;
                 dio->tail = 1;
                 ret = 0;
diff --git a/include/linux/mm.h b/include/linux/mm.h

index 0c64484..da42093 100644 (file)
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -157,7 +157,7 @@ extern unsigned int kobjsize(const void *objp);
  
  #define VM_DONTCOPY    0x00020000      /* Do not copy this vma on fork */
  #define VM_DONTEXPAND  0x00040000      /* Cannot expand with mremap() */
-#define VM_RESERVED    0x00080000      /* Don't unmap it from swap_out */
+#define VM_RESERVED    0x00080000      /* Pages managed in a special way */
  #define VM_ACCOUNT     0x00100000      /* Is a VM accounted object */
  #define VM_HUGETLB     0x00400000      /* Huge TLB Page VM */
  #define VM_NONLINEAR   0x00800000      /* Is non-linear (remap_file_pages) */
@@ -338,7 +338,7 @@ static inline void get_page(struct page *page)
  
  static inline void put_page(struct page *page)
  {
-       if (!PageReserved(page) && put_page_testzero(page))
+       if (put_page_testzero(page))
                 __page_cache_release(page);
  }
  
@@ -723,6 +723,7 @@ void install_arg_page(struct vm_area_struct *, struct page *, unsigned long);
  
  int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start,
                 int len, int write, int force, struct page **pages, struct vm_area_struct **vmas);
+void print_bad_pte(struct vm_area_struct *, pte_t, unsigned long);
  
  int __set_page_dirty_buffers(struct page *page);
  int __set_page_dirty_nobuffers(struct page *page);
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c

index 10bc5ec..016504c 100644 (file)
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -578,15 +578,23 @@ static int save_highmem_zone(struct zone *zone)
                         continue;
                 page = pfn_to_page(pfn);
                 /*
-                * This condition results from rvmalloc() sans vmalloc_32()
-                * and architectural memory reservations. This should be
-                * corrected eventually when the cases giving rise to this
-                * are better understood.
+                * PageReserved results from rvmalloc() sans vmalloc_32()
+                * and architectural memory reservations.
+                *
+                * rvmalloc should not cause this, because all implementations
+                * appear to always be using vmalloc_32 on architectures with
+                * highmem. This is a good thing, because we would like to save
+                * rvmalloc pages.
+                *
+                * It appears to be triggered by pages which do not point to
+                * valid memory (see arch/i386/mm/init.c:one_highpage_init(),
+                * which sets PageReserved if the page does not point to valid
+                * RAM).
+                *
+                * XXX: must remove usage of PageReserved!
                  */
-               if (PageReserved(page)) {
-                       printk("highmem reserved page?!\n");
+               if (PageReserved(page))
                         continue;
-               }
                 BUG_ON(PageNosave(page));
                 if (PageNosaveFree(page))
                         continue;
@@ -672,10 +680,9 @@ static int saveable(struct zone * zone, unsigned long * zone_pfn)
                 return 0;
  
         page = pfn_to_page(pfn);
-       BUG_ON(PageReserved(page) && PageNosave(page));
         if (PageNosave(page))
                 return 0;
-       if (PageReserved(page) && pfn_is_nosave(pfn)) {
+       if (pfn_is_nosave(pfn)) {
                 pr_debug("[nosave pfn 0x%lx]", pfn);
                 return 0;
         }
diff --git a/mm/bootmem.c b/mm/bootmem.c

index a58699b..e8c5671 100644 (file)
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -305,6 +305,7 @@ static unsigned long __init free_all_bootmem_core(pg_data_t *pgdat)
                                 if (j + 16 < BITS_PER_LONG)
                                         prefetchw(page + j + 16);
                                 __ClearPageReserved(page + j);
+                               set_page_count(page + j, 0);
                         }
                         __free_pages(page, order);
                         i += BITS_PER_LONG;
diff --git a/mm/filemap_xip.c b/mm/filemap_xip.c

index 8c199f5..9354ee2 100644 (file)
--- a/mm/filemap_xip.c
+++ b/mm/filemap_xip.c
@@ -174,6 +174,7 @@ __xip_unmap (struct address_space * mapping,
         unsigned long address;
         pte_t *pte;
         pte_t pteval;
+       struct page *page = ZERO_PAGE(address);
  
         spin_lock(&mapping->i_mmap_lock);
         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
@@ -185,15 +186,17 @@ __xip_unmap (struct address_space * mapping,
                  * We need the page_table_lock to protect us from page faults,
                  * munmap, fork, etc...
                  */
-               pte = page_check_address(ZERO_PAGE(address), mm,
-                                        address);
+               pte = page_check_address(page, mm, address);
                 if (!IS_ERR(pte)) {
                         /* Nuke the page table entry. */
                         flush_cache_page(vma, address, pte_pfn(*pte));
                         pteval = ptep_clear_flush(vma, address, pte);
+                       page_remove_rmap(page);
+                       dec_mm_counter(mm, file_rss);
                         BUG_ON(pte_dirty(pteval));
                         pte_unmap(pte);
                         spin_unlock(&mm->page_table_lock);
+                       page_cache_release(page);
                 }
         }
         spin_unlock(&mapping->i_mmap_lock);
@@ -228,7 +231,7 @@ xip_file_nopage(struct vm_area_struct * area,
  
         page = mapping->a_ops->get_xip_page(mapping, pgoff*(PAGE_SIZE/512), 0);
         if (!IS_ERR(page)) {
-               return page;
+               goto out;
         }
         if (PTR_ERR(page) != -ENODATA)
                 return NULL;
@@ -249,6 +252,8 @@ xip_file_nopage(struct vm_area_struct * area,
                 page = ZERO_PAGE(address);
         }
  
+out:
+       page_cache_get(page);
         return page;
  }
  
diff --git a/mm/fremap.c b/mm/fremap.c

index fd7f2a1..224cc15 100644 (file)
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -29,19 +29,20 @@ static inline void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
                 return;
         if (pte_present(pte)) {
                 unsigned long pfn = pte_pfn(pte);
+               struct page *page;
  
                 flush_cache_page(vma, addr, pfn);
                 pte = ptep_clear_flush(vma, addr, ptep);
-               if (pfn_valid(pfn)) {
-                       struct page *page = pfn_to_page(pfn);
-                       if (!PageReserved(page)) {
-                               if (pte_dirty(pte))
-                                       set_page_dirty(page);
-                               page_remove_rmap(page);
-                               page_cache_release(page);
-                               dec_mm_counter(mm, file_rss);
-                       }
+               if (unlikely(!pfn_valid(pfn))) {
+                       print_bad_pte(vma, pte, addr);
+                       return;
                 }
+               page = pfn_to_page(pfn);
+               if (pte_dirty(pte))
+                       set_page_dirty(page);
+               page_remove_rmap(page);
+               page_cache_release(page);
+               dec_mm_counter(mm, file_rss);
         } else {
                 if (!pte_file(pte))
                         free_swap_and_cache(pte_to_swp_entry(pte));
@@ -65,6 +66,8 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma,
         pgd_t *pgd;
         pte_t pte_val;
  
+       BUG_ON(vma->vm_flags & VM_RESERVED);
+
         pgd = pgd_offset(mm, addr);
         spin_lock(&mm->page_table_lock);
         
@@ -125,6 +128,8 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
         pgd_t *pgd;
         pte_t pte_val;
  
+       BUG_ON(vma->vm_flags & VM_RESERVED);
+
         pgd = pgd_offset(mm, addr);
         spin_lock(&mm->page_table_lock);
         
diff --git a/mm/madvise.c b/mm/madvise.c

index 20e075d..17aaf3e 100644 (file)
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
                              unsigned long start, unsigned long end)
  {
         *prev = vma;
-       if ((vma->vm_flags & VM_LOCKED) || is_vm_hugetlb_page(vma))
+       if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED))
                 return -EINVAL;
  
         if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
diff --git a/mm/memory.c b/mm/memory.c

index da642b5..e83f944 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -343,6 +343,23 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
  #define NO_RSS 2       /* Increment neither file_rss nor anon_rss */
  
  /*
+ * This function is called to print an error when a pte in a
+ * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * is an error).
+ *
+ * The calling function must still handle the error.
+ */
+void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
+{
+       printk(KERN_ERR "Bad pte = %08llx, process = %s, "
+                       "vm_flags = %lx, vaddr = %lx\n",
+               (long long)pte_val(pte),
+               (vma->vm_mm == current->mm ? current->comm : "???"),
+               vma->vm_flags, vaddr);
+       dump_stack();
+}
+
+/*
   * copy one vm_area from one task to the other. Assumes the page tables
   * already present in the new task to be cleared in the whole range
   * covered by this vma.
@@ -353,9 +370,10 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
  
  static inline int
  copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
-               pte_t *dst_pte, pte_t *src_pte, unsigned long vm_flags,
+               pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
                 unsigned long addr)
  {
+       unsigned long vm_flags = vma->vm_flags;
         pte_t pte = *src_pte;
         struct page *page;
         unsigned long pfn;
@@ -375,18 +393,22 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 goto out_set_pte;
         }
  
+       /* If the region is VM_RESERVED, the mapping is not
+        * mapped via rmap - duplicate the pte as is.
+        */
+       if (vm_flags & VM_RESERVED)
+               goto out_set_pte;
+
         pfn = pte_pfn(pte);
-       /* the pte points outside of valid memory, the
-        * mapping is assumed to be good, meaningful
-        * and not mapped via rmap - duplicate the
-        * mapping as is.
+       /* If the pte points outside of valid memory but
+        * the region is not VM_RESERVED, we have a problem.
          */
-       page = NULL;
-       if (pfn_valid(pfn))
-               page = pfn_to_page(pfn);
+       if (unlikely(!pfn_valid(pfn))) {
+               print_bad_pte(vma, pte, addr);
+               goto out_set_pte; /* try to do something sane */
+       }
  
-       if (!page || PageReserved(page))
-               goto out_set_pte;
+       page = pfn_to_page(pfn);
  
         /*
          * If it's a COW mapping, write protect it both
@@ -418,7 +440,6 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
                 unsigned long addr, unsigned long end)
  {
         pte_t *src_pte, *dst_pte;
-       unsigned long vm_flags = vma->vm_flags;
         int progress = 0;
         int rss[NO_RSS+1], anon;
  
@@ -446,8 +467,7 @@ again:
                         progress++;
                         continue;
                 }
-               anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
-                                                       vm_flags, addr);
+               anon = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte, vma,addr);
                 rss[anon]++;
                 progress += 8;
         } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -541,10 +561,12 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         return 0;
  }
  
-static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
+static void zap_pte_range(struct mmu_gather *tlb,
+                               struct vm_area_struct *vma, pmd_t *pmd,
                                 unsigned long addr, unsigned long end,
                                 struct zap_details *details)
  {
+       struct mm_struct *mm = tlb->mm;
         pte_t *pte;
         int file_rss = 0;
         int anon_rss = 0;
@@ -556,11 +578,12 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                         continue;
                 if (pte_present(ptent)) {
                         struct page *page = NULL;
-                       unsigned long pfn = pte_pfn(ptent);
-                       if (pfn_valid(pfn)) {
-                               page = pfn_to_page(pfn);
-                               if (PageReserved(page))
-                                       page = NULL;
+                       if (!(vma->vm_flags & VM_RESERVED)) {
+                               unsigned long pfn = pte_pfn(ptent);
+                               if (unlikely(!pfn_valid(pfn)))
+                                       print_bad_pte(vma, ptent, addr);
+                               else
+                                       page = pfn_to_page(pfn);
                         }
                         if (unlikely(details) && page) {
                                 /*
@@ -580,7 +603,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                                      page->index > details->last_index))
                                         continue;
                         }
-                       ptent = ptep_get_and_clear_full(tlb->mm, addr, pte,
+                       ptent = ptep_get_and_clear_full(mm, addr, pte,
                                                         tlb->fullmm);
                         tlb_remove_tlb_entry(tlb, pte, addr);
                         if (unlikely(!page))
@@ -588,7 +611,7 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                         if (unlikely(details) && details->nonlinear_vma
                             && linear_page_index(details->nonlinear_vma,
                                                 addr) != page->index)
-                               set_pte_at(tlb->mm, addr, pte,
+                               set_pte_at(mm, addr, pte,
                                            pgoff_to_pte(page->index));
                         if (PageAnon(page))
                                 anon_rss++;
@@ -611,14 +634,15 @@ static void zap_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
                         continue;
                 if (!pte_file(ptent))
                         free_swap_and_cache(pte_to_swp_entry(ptent));
-               pte_clear_full(tlb->mm, addr, pte, tlb->fullmm);
+               pte_clear_full(mm, addr, pte, tlb->fullmm);
         } while (pte++, addr += PAGE_SIZE, addr != end);
  
-       add_mm_rss(tlb->mm, -file_rss, -anon_rss);
+       add_mm_rss(mm, -file_rss, -anon_rss);
         pte_unmap(pte - 1);
  }
  
-static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+static inline void zap_pmd_range(struct mmu_gather *tlb,
+                               struct vm_area_struct *vma, pud_t *pud,
                                 unsigned long addr, unsigned long end,
                                 struct zap_details *details)
  {
@@ -630,11 +654,12 @@ static inline void zap_pmd_range(struct mmu_gather *tlb, pud_t *pud,
                 next = pmd_addr_end(addr, end);
                 if (pmd_none_or_clear_bad(pmd))
                         continue;
-               zap_pte_range(tlb, pmd, addr, next, details);
+               zap_pte_range(tlb, vma, pmd, addr, next, details);
         } while (pmd++, addr = next, addr != end);
  }
  
-static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+static inline void zap_pud_range(struct mmu_gather *tlb,
+                               struct vm_area_struct *vma, pgd_t *pgd,
                                 unsigned long addr, unsigned long end,
                                 struct zap_details *details)
  {
@@ -646,7 +671,7 @@ static inline void zap_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
                 next = pud_addr_end(addr, end);
                 if (pud_none_or_clear_bad(pud))
                         continue;
-               zap_pmd_range(tlb, pud, addr, next, details);
+               zap_pmd_range(tlb, vma, pud, addr, next, details);
         } while (pud++, addr = next, addr != end);
  }
  
@@ -667,7 +692,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
                 next = pgd_addr_end(addr, end);
                 if (pgd_none_or_clear_bad(pgd))
                         continue;
-               zap_pud_range(tlb, pgd, addr, next, details);
+               zap_pud_range(tlb, vma, pgd, addr, next, details);
         } while (pgd++, addr = next, addr != end);
         tlb_end_vma(tlb, vma);
  }
@@ -967,7 +992,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         continue;
                 }
  
-               if (!vma || (vma->vm_flags & VM_IO)
+               if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
                                 || !(flags & vma->vm_flags))
                         return i ? : -EFAULT;
  
@@ -1027,8 +1052,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         if (pages) {
                                 pages[i] = page;
                                 flush_dcache_page(page);
-                               if (!PageReserved(page))
-                                       page_cache_get(page);
+                               page_cache_get(page);
                         }
                         if (vmas)
                                 vmas[i] = vma;
@@ -1051,7 +1075,11 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
         if (!pte)
                 return -ENOMEM;
         do {
-               pte_t zero_pte = pte_wrprotect(mk_pte(ZERO_PAGE(addr), prot));
+               struct page *page = ZERO_PAGE(addr);
+               pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
+               page_cache_get(page);
+               page_add_file_rmap(page);
+               inc_mm_counter(mm, file_rss);
                 BUG_ON(!pte_none(*pte));
                 set_pte_at(mm, addr, pte, zero_pte);
         } while (pte++, addr += PAGE_SIZE, addr != end);
@@ -1132,8 +1160,7 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                 return -ENOMEM;
         do {
                 BUG_ON(!pte_none(*pte));
-               if (!pfn_valid(pfn) || PageReserved(pfn_to_page(pfn)))
-                       set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
+               set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
                 pfn++;
         } while (pte++, addr += PAGE_SIZE, addr != end);
         pte_unmap(pte - 1);
@@ -1195,8 +1222,8 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
          * rest of the world about it:
          *   VM_IO tells people not to look at these pages
          *      (accesses can have side effects).
-        *   VM_RESERVED tells swapout not to try to touch
-        *      this region.
+        *   VM_RESERVED tells the core MM not to "manage" these pages
+         *     (e.g. refcount, mapcount, try to swap them out).
          */
         vma->vm_flags |= VM_IO | VM_RESERVED;
  
@@ -1256,11 +1283,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         pte_t entry;
         int ret = VM_FAULT_MINOR;
  
+       BUG_ON(vma->vm_flags & VM_RESERVED);
+
         if (unlikely(!pfn_valid(pfn))) {
                 /*
                  * Page table corrupted: show pte and kill process.
                  */
-               pte_ERROR(orig_pte);
+               print_bad_pte(vma, orig_pte, address);
                 ret = VM_FAULT_OOM;
                 goto unlock;
         }
@@ -1284,8 +1313,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         /*
          * Ok, we need to copy. Oh, well..
          */
-       if (!PageReserved(old_page))
-               page_cache_get(old_page);
+       page_cache_get(old_page);
         pte_unmap(page_table);
         spin_unlock(&mm->page_table_lock);
  
@@ -1308,14 +1336,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
         spin_lock(&mm->page_table_lock);
         page_table = pte_offset_map(pmd, address);
         if (likely(pte_same(*page_table, orig_pte))) {
-               if (PageReserved(old_page))
+               page_remove_rmap(old_page);
+               if (!PageAnon(old_page)) {
                         inc_mm_counter(mm, anon_rss);
-               else {
-                       page_remove_rmap(old_page);
-                       if (!PageAnon(old_page)) {
-                               inc_mm_counter(mm, anon_rss);
-                               dec_mm_counter(mm, file_rss);
-                       }
+                       dec_mm_counter(mm, file_rss);
                 }
                 flush_cache_page(vma, address, pfn);
                 entry = mk_pte(new_page, vma->vm_page_prot);
@@ -1769,14 +1793,13 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 unsigned long address, pte_t *page_table, pmd_t *pmd,
                 int write_access)
  {
+       struct page *page = ZERO_PAGE(addr);
         pte_t entry;
  
         /* Mapping of ZERO_PAGE - vm_page_prot is readonly */
-       entry = mk_pte(ZERO_PAGE(addr), vma->vm_page_prot);
+       entry = mk_pte(page, vma->vm_page_prot);
  
         if (write_access) {
-               struct page *page;
-
                 /* Allocate our own private page. */
                 pte_unmap(page_table);
                 spin_unlock(&mm->page_table_lock);
@@ -1800,6 +1823,10 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 lru_cache_add_active(page);
                 SetPageReferenced(page);
                 page_add_anon_rmap(page, vma, address);
+       } else {
+               inc_mm_counter(mm, file_rss);
+               page_add_file_rmap(page);
+               page_cache_get(page);
         }
  
         set_pte_at(mm, address, page_table, entry);
@@ -1916,7 +1943,7 @@ retry:
                         inc_mm_counter(mm, anon_rss);
                         lru_cache_add_active(new_page);
                         page_add_anon_rmap(new_page, vma, address);
-               } else if (!PageReserved(new_page)) {
+               } else if (!(vma->vm_flags & VM_RESERVED)) {
                         inc_mm_counter(mm, file_rss);
                         page_add_file_rmap(new_page);
                 }
@@ -1957,7 +1984,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 /*
                  * Page table corrupted: show pte and kill process.
                  */
-               pte_ERROR(orig_pte);
+               print_bad_pte(vma, orig_pte, address);
                 return VM_FAULT_OOM;
         }
         /* We can then assume vm->vm_ops && vma->vm_ops->populate */
@@ -2232,7 +2259,7 @@ static int __init gate_vma_init(void)
         gate_vma.vm_start = FIXADDR_USER_START;
         gate_vma.vm_end = FIXADDR_USER_END;
         gate_vma.vm_page_prot = PAGE_READONLY;
-       gate_vma.vm_flags = 0;
+       gate_vma.vm_flags = VM_RESERVED;
         return 0;
  }
  __initcall(gate_vma_init);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index 43b1199..11d824f 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -223,13 +223,13 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
  }
  
  /* Ensure all existing pages follow the policy. */
-static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
+static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                 unsigned long addr, unsigned long end, nodemask_t *nodes)
  {
         pte_t *orig_pte;
         pte_t *pte;
  
-       spin_lock(&mm->page_table_lock);
+       spin_lock(&vma->vm_mm->page_table_lock);
         orig_pte = pte = pte_offset_map(pmd, addr);
         do {
                 unsigned long pfn;
@@ -238,18 +238,20 @@ static int check_pte_range(struct mm_struct *mm, pmd_t *pmd,
                 if (!pte_present(*pte))
                         continue;
                 pfn = pte_pfn(*pte);
-               if (!pfn_valid(pfn))
+               if (!pfn_valid(pfn)) {
+                       print_bad_pte(vma, *pte, addr);
                         continue;
+               }
                 nid = pfn_to_nid(pfn);
                 if (!node_isset(nid, *nodes))
                         break;
         } while (pte++, addr += PAGE_SIZE, addr != end);
         pte_unmap(orig_pte);
-       spin_unlock(&mm->page_table_lock);
+       spin_unlock(&vma->vm_mm->page_table_lock);
         return addr != end;
  }
  
-static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
+static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
                 unsigned long addr, unsigned long end, nodemask_t *nodes)
  {
         pmd_t *pmd;
@@ -260,13 +262,13 @@ static inline int check_pmd_range(struct mm_struct *mm, pud_t *pud,
                 next = pmd_addr_end(addr, end);
                 if (pmd_none_or_clear_bad(pmd))
                         continue;
-               if (check_pte_range(mm, pmd, addr, next, nodes))
+               if (check_pte_range(vma, pmd, addr, next, nodes))
                         return -EIO;
         } while (pmd++, addr = next, addr != end);
         return 0;
  }
  
-static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
+static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
                 unsigned long addr, unsigned long end, nodemask_t *nodes)
  {
         pud_t *pud;
@@ -277,24 +279,24 @@ static inline int check_pud_range(struct mm_struct *mm, pgd_t *pgd,
                 next = pud_addr_end(addr, end);
                 if (pud_none_or_clear_bad(pud))
                         continue;
-               if (check_pmd_range(mm, pud, addr, next, nodes))
+               if (check_pmd_range(vma, pud, addr, next, nodes))
                         return -EIO;
         } while (pud++, addr = next, addr != end);
         return 0;
  }
  
-static inline int check_pgd_range(struct mm_struct *mm,
+static inline int check_pgd_range(struct vm_area_struct *vma,
                 unsigned long addr, unsigned long end, nodemask_t *nodes)
  {
         pgd_t *pgd;
         unsigned long next;
  
-       pgd = pgd_offset(mm, addr);
+       pgd = pgd_offset(vma->vm_mm, addr);
         do {
                 next = pgd_addr_end(addr, end);
                 if (pgd_none_or_clear_bad(pgd))
                         continue;
-               if (check_pud_range(mm, pgd, addr, next, nodes))
+               if (check_pud_range(vma, pgd, addr, next, nodes))
                         return -EIO;
         } while (pgd++, addr = next, addr != end);
         return 0;
@@ -311,6 +313,8 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
         first = find_vma(mm, start);
         if (!first)
                 return ERR_PTR(-EFAULT);
+       if (first->vm_flags & VM_RESERVED)
+               return ERR_PTR(-EACCES);
         prev = NULL;
         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
                 if (!vma->vm_next && vma->vm_end < end)
@@ -323,8 +327,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
                                 endvma = end;
                         if (vma->vm_start > start)
                                 start = vma->vm_start;
-                       err = check_pgd_range(vma->vm_mm,
-                                          start, endvma, nodes);
+                       err = check_pgd_range(vma, start, endvma, nodes);
                         if (err) {
                                 first = ERR_PTR(err);
                                 break;
diff --git a/mm/mmap.c b/mm/mmap.c

index 459b9f0..8a11179 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1088,6 +1088,17 @@ munmap_back:
                 error = file->f_op->mmap(file, vma);
                 if (error)
                         goto unmap_and_free_vma;
+               if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED))
+                                               == (VM_WRITE | VM_RESERVED)) {
+                       printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+                               "PROT_WRITE mmap of VM_RESERVED memory, which "
+                               "is deprecated. Please report this to "
+                               "linux-kernel@vger.kernel.org\n",current->comm);
+                       if (vma->vm_ops && vma->vm_ops->close)
+                               vma->vm_ops->close(vma);
+                       error = -EACCES;
+                       goto unmap_and_free_vma;
+               }
         } else if (vm_flags & VM_SHARED) {
                 error = shmem_zero_setup(vma);
                 if (error)
diff --git a/mm/mprotect.c b/mm/mprotect.c

index b426f01..672a76f 100644 (file)
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -125,6 +125,14 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
          * a MAP_NORESERVE private mapping to writable will now reserve.
          */
         if (newflags & VM_WRITE) {
+               if (oldflags & VM_RESERVED) {
+                       BUG_ON(oldflags & VM_WRITE);
+                       printk(KERN_WARNING "program %s is using MAP_PRIVATE, "
+                               "PROT_WRITE mprotect of VM_RESERVED memory, "
+                               "which is deprecated. Please report this to "
+                               "linux-kernel@vger.kernel.org\n",current->comm);
+                       return -EACCES;
+               }
                 if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) {
                         charged = nrpages;
                         if (security_vm_enough_memory(charged))
diff --git a/mm/msync.c b/mm/msync.c

index 3b5f1c5..8603954 100644 (file)
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -25,6 +25,7 @@
  static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                                 unsigned long addr, unsigned long end)
  {
+       struct mm_struct *mm = vma->vm_mm;
         pte_t *pte;
         int progress = 0;
  
@@ -37,7 +38,7 @@ again:
                 if (progress >= 64) {
                         progress = 0;
                         if (need_resched() ||
-                           need_lockbreak(&vma->vm_mm->page_table_lock))
+                           need_lockbreak(&mm->page_table_lock))
                                 break;
                 }
                 progress++;
@@ -46,11 +47,11 @@ again:
                 if (!pte_maybe_dirty(*pte))
                         continue;
                 pfn = pte_pfn(*pte);
-               if (!pfn_valid(pfn))
+               if (unlikely(!pfn_valid(pfn))) {
+                       print_bad_pte(vma, *pte, addr);
                         continue;
+               }
                 page = pfn_to_page(pfn);
-               if (PageReserved(page))
-                       continue;
  
                 if (ptep_clear_flush_dirty(vma, addr, pte) ||
                     page_test_and_clear_dirty(page))
@@ -58,7 +59,7 @@ again:
                 progress += 3;
         } while (pte++, addr += PAGE_SIZE, addr != end);
         pte_unmap(pte - 1);
-       cond_resched_lock(&vma->vm_mm->page_table_lock);
+       cond_resched_lock(&mm->page_table_lock);
         if (addr != end)
                 goto again;
  }
@@ -102,8 +103,10 @@ static void msync_page_range(struct vm_area_struct *vma,
  
         /* For hugepages we can't go walking the page table normally,
          * but that's ok, hugetlbfs is memory based, so we don't need
-        * to do anything more on an msync() */
-       if (is_vm_hugetlb_page(vma))
+        * to do anything more on an msync().
+        * Can't do anything with VM_RESERVED regions either.
+        */
+       if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED))
                 return;
  
         BUG_ON(addr >= end);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 6066323..0541288 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -114,7 +114,8 @@ static void bad_page(const char *function, struct page *page)
                         1 << PG_reclaim |
                         1 << PG_slab    |
                         1 << PG_swapcache |
-                       1 << PG_writeback);
+                       1 << PG_writeback |
+                       1 << PG_reserved );
         set_page_count(page, 0);
         reset_page_mapcount(page);
         page->mapping = NULL;
@@ -244,7 +245,6 @@ static inline int page_is_buddy(struct page *page, int order)
  {
         if (PagePrivate(page)           &&
             (page_order(page) == order) &&
-           !PageReserved(page)         &&
              page_count(page) == 0)
                 return 1;
         return 0;
@@ -327,7 +327,8 @@ static inline void free_pages_check(const char *function, struct page *page)
                         1 << PG_reclaim |
                         1 << PG_slab    |
                         1 << PG_swapcache |
-                       1 << PG_writeback )))
+                       1 << PG_writeback |
+                       1 << PG_reserved )))
                 bad_page(function, page);
         if (PageDirty(page))
                 __ClearPageDirty(page);
@@ -455,7 +456,8 @@ static void prep_new_page(struct page *page, int order)
                         1 << PG_reclaim |
                         1 << PG_slab    |
                         1 << PG_swapcache |
-                       1 << PG_writeback )))
+                       1 << PG_writeback |
+                       1 << PG_reserved )))
                 bad_page(__FUNCTION__, page);
  
         page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
@@ -1016,7 +1018,7 @@ void __pagevec_free(struct pagevec *pvec)
  
  fastcall void __free_pages(struct page *page, unsigned int order)
  {
-       if (!PageReserved(page) && put_page_testzero(page)) {
+       if (put_page_testzero(page)) {
                 if (order == 0)
                         free_hot_page(page);
                 else
@@ -1674,7 +1676,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
                         continue;
                 page = pfn_to_page(pfn);
                 set_page_links(page, zone, nid, pfn);
-               set_page_count(page, 0);
+               set_page_count(page, 1);
                 reset_page_mapcount(page);
                 SetPageReserved(page);
                 INIT_LIST_HEAD(&page->lru);
diff --git a/mm/rmap.c b/mm/rmap.c

index 5047576..f69d534 100644 (file)
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -443,8 +443,6 @@ int page_referenced(struct page *page, int is_locked, int ignore_token)
  void page_add_anon_rmap(struct page *page,
         struct vm_area_struct *vma, unsigned long address)
  {
-       BUG_ON(PageReserved(page));
-
         if (atomic_inc_and_test(&page->_mapcount)) {
                 struct anon_vma *anon_vma = vma->anon_vma;
  
@@ -468,8 +466,7 @@ void page_add_anon_rmap(struct page *page,
  void page_add_file_rmap(struct page *page)
  {
         BUG_ON(PageAnon(page));
-       if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
-               return;
+       BUG_ON(!pfn_valid(page_to_pfn(page)));
  
         if (atomic_inc_and_test(&page->_mapcount))
                 inc_page_state(nr_mapped);
@@ -483,8 +480,6 @@ void page_add_file_rmap(struct page *page)
   */
  void page_remove_rmap(struct page *page)
  {
-       BUG_ON(PageReserved(page));
-
         if (atomic_add_negative(-1, &page->_mapcount)) {
                 BUG_ON(page_mapcount(page) < 0);
                 /*
@@ -640,13 +635,13 @@ static void try_to_unmap_cluster(unsigned long cursor,
                         continue;
  
                 pfn = pte_pfn(*pte);
-               if (!pfn_valid(pfn))
+               if (unlikely(!pfn_valid(pfn))) {
+                       print_bad_pte(vma, *pte, address);
                         continue;
+               }
  
                 page = pfn_to_page(pfn);
                 BUG_ON(PageAnon(page));
-               if (PageReserved(page))
-                       continue;
  
                 if (ptep_clear_flush_young(vma, address, pte))
                         continue;
@@ -808,7 +803,6 @@ int try_to_unmap(struct page *page)
  {
         int ret;
  
-       BUG_ON(PageReserved(page));
         BUG_ON(!PageLocked(page));
  
         if (PageAnon(page))
diff --git a/mm/shmem.c b/mm/shmem.c

index 6796311..37777f4 100644 (file)
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -1506,8 +1506,10 @@ static void do_shmem_file_read(struct file *filp, loff_t *ppos, read_descriptor_
                          */
                         if (!offset)
                                 mark_page_accessed(page);
-               } else
+               } else {
                         page = ZERO_PAGE(0);
+                       page_cache_get(page);
+               }
  
                 /*
                  * Ok, we have the page, and it's up-to-date, so
diff --git a/mm/swap.c b/mm/swap.c

index 7771d28..21d15f9 100644 (file)
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -48,7 +48,7 @@ void put_page(struct page *page)
                 }
                 return;
         }
-       if (!PageReserved(page) && put_page_testzero(page))
+       if (put_page_testzero(page))
                 __page_cache_release(page);
  }
  EXPORT_SYMBOL(put_page);
@@ -215,7 +215,7 @@ void release_pages(struct page **pages, int nr, int cold)
                 struct page *page = pages[i];
                 struct zone *pagezone;
  
-               if (PageReserved(page) || !put_page_testzero(page))
+               if (!put_page_testzero(page))
                         continue;
  
                 pagezone = page_zone(page);
diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c

index 67abeba..e97b2d1 100644 (file)
--- a/sound/core/pcm_native.c
+++ b/sound/core/pcm_native.c
@@ -2949,8 +2949,7 @@ static struct page * snd_pcm_mmap_status_nopage(struct vm_area_struct *area, uns
                 return NOPAGE_OOM;
         runtime = substream->runtime;
         page = virt_to_page(runtime->status);
-       if (!PageReserved(page))
-               get_page(page);
+       get_page(page);
         if (type)
                 *type = VM_FAULT_MINOR;
         return page;
@@ -2992,8 +2991,7 @@ static struct page * snd_pcm_mmap_control_nopage(struct vm_area_struct *area, un
                 return NOPAGE_OOM;
         runtime = substream->runtime;
         page = virt_to_page(runtime->control);
-       if (!PageReserved(page))
-               get_page(page);
+       get_page(page);
         if (type)
                 *type = VM_FAULT_MINOR;
         return page;
@@ -3066,8 +3064,7 @@ static struct page *snd_pcm_mmap_data_nopage(struct vm_area_struct *area, unsign
                 vaddr = runtime->dma_area + offset;
                 page = virt_to_page(vaddr);
         }
-       if (!PageReserved(page))
-               get_page(page);
+       get_page(page);
         if (type)
                 *type = VM_FAULT_MINOR;
         return page;
author	Nick Piggin <nickpiggin@yahoo.com.au>
	Sun, 30 Oct 2005 01:16:12 +0000 (18:16 -0700)
committer	Linus Torvalds <torvalds@g5.osdl.org>
	Sun, 30 Oct 2005 04:40:39 +0000 (21:40 -0700)
arch/ppc64/kernel/vdso.c		patch \| blob \| history
arch/sparc/mm/generic.c		patch \| blob \| history
arch/sparc64/mm/generic.c		patch \| blob \| history
drivers/scsi/sg.c		patch \| blob \| history
drivers/scsi/st.c		patch \| blob \| history
fs/direct-io.c		patch \| blob \| history
include/linux/mm.h		patch \| blob \| history
kernel/power/swsusp.c		patch \| blob \| history
mm/bootmem.c		patch \| blob \| history
mm/filemap_xip.c		patch \| blob \| history
mm/fremap.c		patch \| blob \| history
mm/madvise.c		patch \| blob \| history
mm/memory.c		patch \| blob \| history
mm/mempolicy.c		patch \| blob \| history
mm/mmap.c		patch \| blob \| history
mm/mprotect.c		patch \| blob \| history
mm/msync.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/rmap.c		patch \| blob \| history
mm/shmem.c		patch \| blob \| history
mm/swap.c		patch \| blob \| history
sound/core/pcm_native.c		patch \| blob \| history