[PATCH] shared page table for hugetlb page
author	Chen, Kenneth W <kenneth.w.chen@intel.com>
Thu, 7 Dec 2006 04:32:03 +0000 (20:32 -0800)
committer	Linus Torvalds <torvalds@woody.osdl.org>
Thu, 7 Dec 2006 16:39:21 +0000 (08:39 -0800)
Following up on the shared page table work done by Dave McCracken, this
patch set targets shared page tables for hugetlb memory only.

Shared page tables are particularly useful when a large number of
independent processes share large shared memory segments.  In the normal
page case, the amount of memory saved in the processes' page tables is
quite significant.  For hugetlb, saving page table memory is not the
primary objective (hugetlb itself already cuts page table overhead down
significantly); instead, the purpose of using shared page tables for
hugetlb is to allow faster TLB refill and less cache pollution on a TLB
miss.

With PT sharing, pte entries are shared among hundreds of processes, so
the cache footprint of all the page tables is smaller and, in return, the
application gets a much higher cache hit ratio.  Another effect is that
the hardware page walker is more likely to find the pte already in the
cache, which helps reduce TLB miss latency.  These two effects contribute
to higher application performance.
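
As a rough illustration of the scale involved (this calculation is not
part of the patch; it assumes x86-style parameters of 2 MB huge pages and
4 KB page table pages holding 512 PMD entries, plus an arbitrary 8 GB
segment shared by 1000 processes):

  /*
   * Illustrative only: rough PMD page footprint for N processes mapping
   * the same hugetlb shared memory segment, with and without sharing.
   * One 4 KB PMD page holds 512 entries of 2 MB each, i.e. maps 1 GB.
   */
  #include <stdio.h>

  int main(void)
  {
          const unsigned long long huge_page_sz = 2ULL << 20;  /* 2 MB huge page */
          const unsigned long long pmd_page_sz  = 4096;        /* page table page */
          const unsigned long long entries      = 512;         /* PMD entries/page */
          const unsigned long long seg_sz       = 8ULL << 30;  /* 8 GB segment */
          const unsigned long long nproc        = 1000;        /* processes */

          /* PMD pages needed to map the segment once */
          unsigned long long pmd_pages = seg_sz / (huge_page_sz * entries);

          printf("unshared: %llu KB of PMD pages\n",
                 nproc * pmd_pages * pmd_page_sz / 1024);
          printf("shared:   %llu KB of PMD pages\n",
                 pmd_pages * pmd_page_sz / 1024);
          return 0;
  }

Even for 1000 processes the unshared case comes to only a few tens of
megabytes of PMD pages, which is why the emphasis above is on cache and
TLB-refill behaviour rather than on the memory itself: with sharing, every
process's hardware page walks hit the same small set of pte cache lines.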

Signed-off-by: Ken Chen <kenneth.w.chen@intel.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Cc: Dave McCracken <dmccr@us.ibm.com>
Cc: William Lee Irwin III <wli@holomorphy.com>
Cc: "Luck, Tony" <tony.luck@intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Gibson <david@gibson.dropbear.id.au>
Cc: Adam Litke <agl@us.ibm.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: "David S. Miller" <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
arch/i386/mm/hugetlbpage.c
arch/ia64/mm/hugetlbpage.c
arch/powerpc/mm/hugetlbpage.c
arch/sh/mm/hugetlbpage.c
arch/sh64/mm/hugetlbpage.c
arch/sparc64/mm/hugetlbpage.c
include/linux/hugetlb.h
mm/hugetlb.c

diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c
index 1719a81..34728e4 100644
 #include <asm/tlb.h>
 #include <asm/tlbflush.h>
 
+static unsigned long page_table_shareable(struct vm_area_struct *svma,
+                               struct vm_area_struct *vma,
+                               unsigned long addr, pgoff_t idx)
+{
+       unsigned long saddr = ((idx - svma->vm_pgoff) << PAGE_SHIFT) +
+                               svma->vm_start;
+       unsigned long sbase = saddr & PUD_MASK;
+       unsigned long s_end = sbase + PUD_SIZE;
+
+       /*
+        * match the virtual addresses, permission and the alignment of the
+        * page table page.
+        */
+       if (pmd_index(addr) != pmd_index(saddr) ||
+           vma->vm_flags != svma->vm_flags ||
+           sbase < svma->vm_start || svma->vm_end < s_end)
+               return 0;
+
+       return saddr;
+}
+
+static int vma_shareable(struct vm_area_struct *vma, unsigned long addr)
+{
+       unsigned long base = addr & PUD_MASK;
+       unsigned long end = base + PUD_SIZE;
+
+       /*
+        * check on proper vm_flags and page table alignment
+        */
+       if (vma->vm_flags & VM_MAYSHARE &&
+           vma->vm_start <= base && end <= vma->vm_end)
+               return 1;
+       return 0;
+}
+
+/*
+ * search for a shareable pmd page for hugetlb.
+ */
+static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
+{
+       struct vm_area_struct *vma = find_vma(mm, addr);
+       struct address_space *mapping = vma->vm_file->f_mapping;
+       pgoff_t idx = ((addr - vma->vm_start) >> PAGE_SHIFT) +
+                       vma->vm_pgoff;
+       struct prio_tree_iter iter;
+       struct vm_area_struct *svma;
+       unsigned long saddr;
+       pte_t *spte = NULL;
+
+       if (!vma_shareable(vma, addr))
+               return;
+
+       spin_lock(&mapping->i_mmap_lock);
+       vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
+               if (svma == vma)
+                       continue;
+
+               saddr = page_table_shareable(svma, vma, addr, idx);
+               if (saddr) {
+                       spte = huge_pte_offset(svma->vm_mm, saddr);
+                       if (spte) {
+                               get_page(virt_to_page(spte));
+                               break;
+                       }
+               }
+       }
+
+       if (!spte)
+               goto out;
+
+       spin_lock(&mm->page_table_lock);
+       if (pud_none(*pud))
+               pud_populate(mm, pud, (pmd_t *)((unsigned long) spte & PAGE_MASK));
+       else
+               put_page(virt_to_page(spte));
+       spin_unlock(&mm->page_table_lock);
+out:
+       spin_unlock(&mapping->i_mmap_lock);
+}
+
+/*
+ * unmap huge page backed by shared pte.
+ *
+ * Hugetlb pte page is ref counted at the time of mapping.  If pte is shared
+ * indicated by page_count > 1, unmap is achieved by clearing pud and
+ * decrementing the ref count. If count == 1, the pte page is not shared.
+ *
+ * called with vma->vm_mm->page_table_lock held.
+ *
+ * returns: 1 successfully unmapped a shared pte page
+ *         0 the underlying pte page is not shared, or it is the last user
+ */
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       pgd_t *pgd = pgd_offset(mm, *addr);
+       pud_t *pud = pud_offset(pgd, *addr);
+
+       BUG_ON(page_count(virt_to_page(ptep)) == 0);
+       if (page_count(virt_to_page(ptep)) == 1)
+               return 0;
+
+       pud_clear(pud);
+       put_page(virt_to_page(ptep));
+       *addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
+       return 1;
+}
+
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 {
        pgd_t *pgd;
@@ -25,8 +132,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
 
        pgd = pgd_offset(mm, addr);
        pud = pud_alloc(mm, pgd, addr);
-       if (pud)
+       if (pud) {
+               if (pud_none(*pud))
+                       huge_pmd_share(mm, addr, pud);
                pte = (pte_t *) pmd_alloc(mm, pud, addr);
+       }
        BUG_ON(pte && !pte_none(*pte) && !pte_huge(*pte));
 
        return pte;
diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c
index f3a9585..0c7e94e 100644
@@ -64,6 +64,11 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
+
 #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; }
 
 /*
diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c
index 506d897..424a8f5 100644
@@ -146,6 +146,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr)
        return hugepte_offset(hpdp, addr);
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
+
 static void free_hugepte_range(struct mmu_gather *tlb, hugepd_t *hpdp)
 {
        pte_t *hugepte = hugepd_page(*hpdp);
diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c
index 329059d..cf2c2ee 100644
@@ -63,6 +63,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
+
 struct page *follow_huge_addr(struct mm_struct *mm,
                              unsigned long address, int write)
 {
diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c
index 187cf01..4b455f6 100644
@@ -53,6 +53,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
+
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t entry)
 {
diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c
index 53b9b1f..33fd0b2 100644
@@ -235,6 +235,11 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr)
        return pte;
 }
 
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+{
+       return 0;
+}
+
 void set_huge_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t entry)
 {
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index ace64e5..a60995a 100644
@@ -35,6 +35,7 @@ extern int sysctl_hugetlb_shm_group;
 
 pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr);
 pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr);
+int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep);
 struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address,
                              int write);
 struct page *follow_huge_pmd(struct mm_struct *mm, unsigned long address,
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index f7355bf..9244971 100644
@@ -386,6 +386,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
                if (!ptep)
                        continue;
 
+               if (huge_pmd_unshare(mm, &address, ptep))
+                       continue;
+
                pte = huge_ptep_get_and_clear(mm, address, ptep);
                if (pte_none(pte))
                        continue;
@@ -658,11 +661,14 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
        BUG_ON(address >= end);
        flush_cache_range(vma, address, end);
 
+       spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
        spin_lock(&mm->page_table_lock);
        for (; address < end; address += HPAGE_SIZE) {
                ptep = huge_pte_offset(mm, address);
                if (!ptep)
                        continue;
+               if (huge_pmd_unshare(mm, &address, ptep))
+                       continue;
                if (!pte_none(*ptep)) {
                        pte = huge_ptep_get_and_clear(mm, address, ptep);
                        pte = pte_mkhuge(pte_modify(pte, newprot));
@@ -671,6 +677,7 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
                }
        }
        spin_unlock(&mm->page_table_lock);
+       spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
 
        flush_tlb_range(vma, start, end);
 }