diff --git a/mm/memory.c b/mm/memory.c
index 6c1eac9..7a11ddd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -349,6 +349,11 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
        dump_stack();
 }
 
+static inline int is_cow_mapping(unsigned int flags)
+{
+       return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
+}
+
 /*
  * This function gets the "struct page" associated with a pte.
  *
@@ -377,6 +382,8 @@ struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_
                unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
                if (pfn == vma->vm_pgoff + off)
                        return NULL;
+               if (!is_cow_mapping(vma->vm_flags))
+                       return NULL;
        }
 
        /*
@@ -437,7 +444,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         * If it's a COW mapping, write protect it both
         * in the parent and the child
         */
-       if ((vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE) {
+       if (is_cow_mapping(vm_flags)) {
                ptep_set_wrprotect(src_mm, addr, src_pte);
                pte = *src_pte;
        }
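
Reader's aside, not part of the patch: is_cow_mapping() tests exactly the private-but-potentially-writable case. A stand-alone sketch of the truth table (flag values as in mm.h, repeated here only so the sketch compiles on its own):

	/* Reader's sketch of the is_cow_mapping() truth table. */
	#include <assert.h>

	#define VM_SHARED	0x00000008
	#define VM_MAYWRITE	0x00000020

	static int is_cow_mapping(unsigned int flags)
	{
		return (flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
	}

	int main(void)
	{
		assert(!is_cow_mapping(0));				/* private, read-only */
		assert( is_cow_mapping(VM_MAYWRITE));			/* private, writable: COW */
		assert(!is_cow_mapping(VM_SHARED));			/* shared, read-only */
		assert(!is_cow_mapping(VM_SHARED | VM_MAYWRITE));	/* shared, writable */
		return 0;
	}
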
@@ -567,7 +574,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
         * readonly mappings. The tradeoff is that copy_page_range is more
         * efficient than faulting.
         */
-       if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) {
+       if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP|VM_INSERTPAGE))) {
                if (!vma->anon_vma)
                        return 0;
        }
@@ -1002,7 +1009,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                        continue;
                }
 
-               if (!vma || (vma->vm_flags & VM_IO)
+               if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP))
                                || !(vm_flags & vma->vm_flags))
                        return i ? : -EFAULT;
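
Reader's note: with VM_PFNMAP there may be no struct page behind the mapping, so get_user_pages() must refuse rather than hand out bogus pages. A hypothetical caller sketch (pin_one_page is invented for illustration):

	/* Hypothetical helper: pin a single user page. On an address inside
	 * a pure pfn mapping this now fails instead of returning a struct
	 * page that may not exist. */
	static struct page *pin_one_page(struct mm_struct *mm, unsigned long addr)
	{
		struct page *page;
		int ret;

		down_read(&mm->mmap_sem);
		ret = get_user_pages(current, mm, addr, 1, 0 /* write */,
				     0 /* force */, &page, NULL);
		up_read(&mm->mmap_sem);

		return ret == 1 ? page : NULL;
	}
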
 
@@ -1146,6 +1153,86 @@ int zeromap_page_range(struct vm_area_struct *vma,
        return err;
 }
 
+pte_t * fastcall get_locked_pte(struct mm_struct *mm, unsigned long addr, spinlock_t **ptl)
+{
+       pgd_t * pgd = pgd_offset(mm, addr);
+       pud_t * pud = pud_alloc(mm, pgd, addr);
+       if (pud) {
+               pmd_t * pmd = pmd_alloc(mm, pud, addr);
+               if (pmd)
+                       return pte_alloc_map_lock(mm, pmd, addr, ptl);
+       }
+       return NULL;
+}
+
+/*
+ * This is the old fallback for page remapping.
+ *
+ * For historical reasons, it only allows reserved pages. Only
+ * old drivers should use this, and they needed to mark their
+ * pages reserved for the old functions anyway.
+ */
+static int insert_page(struct mm_struct *mm, unsigned long addr, struct page *page, pgprot_t prot)
+{
+       int retval;
+       pte_t *pte;
+       spinlock_t *ptl;
+
+       retval = -EINVAL;
+       if (PageAnon(page))
+               goto out;
+       retval = -ENOMEM;
+       flush_dcache_page(page);
+       pte = get_locked_pte(mm, addr, &ptl);
+       if (!pte)
+               goto out;
+       retval = -EBUSY;
+       if (!pte_none(*pte))
+               goto out_unlock;
+
+       /* Ok, finally just insert the thing.. */
+       get_page(page);
+       inc_mm_counter(mm, file_rss);
+       page_add_file_rmap(page);
+       set_pte_at(mm, addr, pte, mk_pte(page, prot));
+
+       retval = 0;
+out_unlock:
+       pte_unmap_unlock(pte, ptl);
+out:
+       return retval;
+}
+
+/*
+ * This allows drivers to insert individual pages they've allocated
+ * into a user vma.
+ *
+ * The page has to be a nice clean _individual_ kernel allocation.
+ * If you allocate a compound page, you need to have marked it as
+ * such (__GFP_COMP), or manually just split the page up yourself
+ * (which is mainly an issue of doing "set_page_count(page, 1)" for
+ * each sub-page, and then freeing them one by one when you free
+ * them rather than freeing it as a compound page).
+ *
+ * NOTE! Traditionally this was done with "remap_pfn_range()" which
+ * took an arbitrary page protection parameter. This doesn't allow
+ * that. Your vma protection will have to be set up correctly, which
+ * means that if you want a shared writable mapping, you'd better
+ * ask for a shared writable mapping!
+ *
+ * The page does not need to be reserved.
+ */
+int vm_insert_page(struct vm_area_struct *vma, unsigned long addr, struct page *page)
+{
+       if (addr < vma->vm_start || addr >= vma->vm_end)
+               return -EFAULT;
+       if (!page_count(page))
+               return -EINVAL;
+       vma->vm_flags |= VM_INSERTPAGE;
+       return insert_page(vma->vm_mm, addr, page, vma->vm_page_prot);
+}
+EXPORT_SYMBOL(vm_insert_page);
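
For context, the intended caller is a driver's ->mmap handler inserting its own pages one at a time. A hypothetical sketch (mydrv_buf and its pages[] array are invented for illustration):

	/* Hypothetical driver ->mmap: back a user mapping with pages the
	 * driver already allocated, one vm_insert_page() call per page. */
	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
	{
		struct mydrv_buf *buf = file->private_data;
		unsigned long uaddr;
		int i = 0, err;

		for (uaddr = vma->vm_start; uaddr < vma->vm_end; uaddr += PAGE_SIZE) {
			err = vm_insert_page(vma, uaddr, buf->pages[i++]);
			if (err)
				return err;
		}
		return 0;	/* vm_insert_page() has set VM_INSERTPAGE for us */
	}
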
+
 /*
  * maps a range of physical memory into the requested pages. the old
  * mappings are removed. any references to nonexistent pages results
@@ -1233,9 +1320,18 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
         *   VM_PFNMAP tells the core MM that the base pages are just
         *      raw PFN mappings, and do not have a "struct page" associated
         *      with them.
+        *
+        * There's a horrible special case to handle copy-on-write
+        * behaviour that some programs depend on. We mark the "original"
+        * un-COW'ed pages by matching them up with "vma->vm_pgoff".
         */
+       if (is_cow_mapping(vma->vm_flags)) {
+               if (addr != vma->vm_start || end != vma->vm_end)
+                       return -EINVAL;
+               vma->vm_pgoff = pfn;
+       }
+
        vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
-       vma->vm_pgoff = pfn;
 
        BUG_ON(addr >= end);
        pfn -= addr >> PAGE_SHIFT;
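
Reader's note: the new rule means a driver that permits COW-able (private, writable) mappings must remap the entire vma in one call, since vm_pgoff is repurposed to record the first pfn. A hypothetical sketch (MYDEV_PHYS is an invented physical base address):

	/* Hypothetical driver ->mmap under the new restriction: cover the
	 * whole vma in a single remap_pfn_range() call. */
	static int mydev_mmap(struct file *file, struct vm_area_struct *vma)
	{
		return remap_pfn_range(vma, vma->vm_start,
				       MYDEV_PHYS >> PAGE_SHIFT,
				       vma->vm_end - vma->vm_start,
				       vma->vm_page_prot);
	}
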
@@ -1300,8 +1396,15 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
         */
        if (unlikely(!src)) {
                void *kaddr = kmap_atomic(dst, KM_USER0);
-               unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE);
-               if (left)
+               void __user *uaddr = (void __user *)(va & PAGE_MASK);
+
+               /*
+                * This really shouldn't fail, because the page is there
+                * in the page tables. But it might just be unreadable,
+                * in which case we just give up and fill the result with
+                * zeroes.
+                */
+               if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
                        memset(kaddr, 0, PAGE_SIZE);
                kunmap_atomic(kaddr, KM_USER0);
                return;
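
Reader's note on the masking: "va" is the faulting address and need not be page-aligned, so a PAGE_SIZE copy starting at the raw address could run past the page that is actually present. Illustration only:

	/* Illustration: suppose va == 0xb7f2c123. Copying PAGE_SIZE bytes
	 * from va itself would read [0xb7f2c123, 0xb7f2d123) and cross into
	 * the next, possibly unmapped, page; masking reads exactly the
	 * faulting page, [0xb7f2c000, 0xb7f2d000). */
	void __user *uaddr = (void __user *)(va & PAGE_MASK);
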
@@ -1332,12 +1435,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
                spinlock_t *ptl, pte_t orig_pte)
 {
-       struct page *old_page, *src_page, *new_page;
+       struct page *old_page, *new_page;
        pte_t entry;
        int ret = VM_FAULT_MINOR;
 
        old_page = vm_normal_page(vma, address, orig_pte);
-       src_page = old_page;
        if (!old_page)
                goto gotten;
 
@@ -1345,7 +1447,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                int reuse = can_share_swap_page(old_page);
                unlock_page(old_page);
                if (reuse) {
-                       flush_cache_page(vma, address, pfn);
+                       flush_cache_page(vma, address, pte_pfn(orig_pte));
                        entry = pte_mkyoung(orig_pte);
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                        ptep_set_access_flags(vma, address, page_table, entry, 1);
@@ -1365,7 +1467,7 @@ gotten:
 
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;
-       if (src_page == ZERO_PAGE(address)) {
+       if (old_page == ZERO_PAGE(address)) {
                new_page = alloc_zeroed_user_highpage(vma, address);
                if (!new_page)
                        goto oom;
@@ -1373,7 +1475,7 @@ gotten:
                new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
                if (!new_page)
                        goto oom;
-               cow_user_page(new_page, src_page, address);
+               cow_user_page(new_page, old_page, address);
        }
 
        /*
@@ -1389,14 +1491,14 @@ gotten:
                        }
                } else
                        inc_mm_counter(mm, anon_rss);
-               flush_cache_page(vma, address, pfn);
+               flush_cache_page(vma, address, pte_pfn(orig_pte));
                entry = mk_pte(new_page, vma->vm_page_prot);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                ptep_establish(vma, address, page_table, entry);
                update_mmu_cache(vma, address, entry);
                lazy_mmu_prot_update(entry);
                lru_cache_add_active(new_page);
-               page_add_anon_rmap(new_page, vma, address);
+               page_add_new_anon_rmap(new_page, vma, address);
 
                /* Free the old page.. */
                new_page = old_page;
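
Reader's note on the rmap switch: page_add_new_anon_rmap() is only valid because new_page is freshly allocated and cannot be mapped anywhere else yet. A paraphrased sketch of the distinction (not the actual mm/rmap.c bodies):

	/*
	 * Sketch of the distinction (paraphrased):
	 *
	 *   page_add_anon_rmap(page, vma, addr)
	 *       page may already be mapped elsewhere; atomically
	 *       increments page->_mapcount and wires up the anon_vma
	 *       only on the first mapping.
	 *
	 *   page_add_new_anon_rmap(page, vma, addr)
	 *       page is brand new and visible to nobody else, so
	 *       _mapcount can be set directly (no atomic inc-and-test)
	 *       and the anon_vma wired up unconditionally.
	 */
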
@@ -1668,9 +1770,32 @@ out_big:
 out_busy:
        return -ETXTBSY;
 }
-
 EXPORT_SYMBOL(vmtruncate);
 
+int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
+{
+       struct address_space *mapping = inode->i_mapping;
+
+       /*
+        * If the underlying filesystem is not going to provide
+        * a way to truncate a range of blocks (punch a hole) -
+        * we should return failure right now.
+        */
+       if (!inode->i_op || !inode->i_op->truncate_range)
+               return -ENOSYS;
+
+       mutex_lock(&inode->i_mutex);
+       down_write(&inode->i_alloc_sem);
+       unmap_mapping_range(mapping, offset, (end - offset), 1);
+       truncate_inode_pages_range(mapping, offset, end);
+       inode->i_op->truncate_range(inode, offset, end);
+       up_write(&inode->i_alloc_sem);
+       mutex_unlock(&inode->i_mutex);
+
+       return 0;
+}
+EXPORT_SYMBOL(vmtruncate_range);
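
Usage sketch: the expected caller is hole-punching code in the madvise(MADV_REMOVE) style, translating a user address range into inclusive file offsets. Hypothetical, using 2.6-era field names (punch_hole is invented):

	/* Hypothetical caller: punch a hole in the file backing a range of
	 * a mapping. The end offset is inclusive, matching
	 * truncate_inode_pages_range(). */
	static int punch_hole(struct vm_area_struct *vma,
			      unsigned long start, unsigned long end)
	{
		struct inode *inode = vma->vm_file->f_dentry->d_inode;
		loff_t off = (loff_t)(start - vma->vm_start)
				+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
		loff_t endoff = off + (end - start) - 1;

		return vmtruncate_range(inode, off, endoff);
	}
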
+
 /* 
  * Primitive swap readahead code. We simply read an aligned block of
  * (1 << page_cluster) entries in the swap area. This method is chosen
@@ -1852,8 +1977,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        goto release;
                inc_mm_counter(mm, anon_rss);
                lru_cache_add_active(page);
-               SetPageReferenced(page);
-               page_add_anon_rmap(page, vma, address);
+               page_add_new_anon_rmap(page, vma, address);
        } else {
                /* Map the ZERO_PAGE - vm_page_prot is readonly */
                page = ZERO_PAGE(address);
@@ -1909,6 +2033,8 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
        int anon = 0;
 
        pte_unmap(page_table);
+       BUG_ON(vma->vm_flags & VM_PFNMAP);
+
        if (vma->vm_file) {
                mapping = vma->vm_file->f_mapping;
                sequence = mapping->truncate_count;
@@ -1941,7 +2067,7 @@ retry:
                page = alloc_page_vma(GFP_HIGHUSER, vma, address);
                if (!page)
                        goto oom;
-               cow_user_page(page, new_page, address);
+               copy_user_highpage(page, new_page, address);
                page_cache_release(new_page);
                new_page = page;
                anon = 1;
@@ -1982,7 +2108,7 @@ retry:
                if (anon) {
                        inc_mm_counter(mm, anon_rss);
                        lru_cache_add_active(new_page);
-                       page_add_anon_rmap(new_page, vma, address);
+                       page_add_new_anon_rmap(new_page, vma, address);
                } else {
                        inc_mm_counter(mm, file_rss);
                        page_add_file_rmap(new_page);
@@ -2141,6 +2267,8 @@ int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        return handle_pte_fault(mm, vma, address, pte, pmd, write_access);
 }
 
+EXPORT_SYMBOL_GPL(__handle_mm_fault);
+
 #ifndef __PAGETABLE_PUD_FOLDED
 /*
  * Allocate page upper directory.