nfsd: nfsd should drop CAP_MKNOD for non-root

[safe/jmp/linux-2.6] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index b12888c..baa999e 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -397,8 +397,8 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
                         return;
                 }
                 if (nr_unshown) {
-                       printk(KERN_EMERG
-                               "Bad page map: %lu messages suppressed\n",
+                       printk(KERN_ALERT
+                               "BUG: Bad page map: %lu messages suppressed\n",
                                 nr_unshown);
                         nr_unshown = 0;
                 }
@@ -410,26 +410,27 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr,
         mapping = vma->vm_file ? vma->vm_file->f_mapping : NULL;
         index = linear_page_index(vma, addr);
  
-       printk(KERN_EMERG "Bad page map in process %s  pte:%08llx pmd:%08llx\n",
+       printk(KERN_ALERT
+               "BUG: Bad page map in process %s  pte:%08llx pmd:%08llx\n",
                 current->comm,
                 (long long)pte_val(pte), (long long)pmd_val(*pmd));
         if (page) {
-               printk(KERN_EMERG
+               printk(KERN_ALERT
                 "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n",
                 page, (void *)page->flags, page_count(page),
                 page_mapcount(page), page->mapping, page->index);
         }
-       printk(KERN_EMERG
+       printk(KERN_ALERT
                 "addr:%p vm_flags:%08lx anon_vma:%p mapping:%p index:%lx\n",
                 (void *)addr, vma->vm_flags, vma->anon_vma, mapping, index);
         /*
          * Choose text because data symbols depend on CONFIG_KALLSYMS_ALL=y
          */
         if (vma->vm_ops)
-               print_symbol(KERN_EMERG "vma->vm_ops->fault: %s\n",
+               print_symbol(KERN_ALERT "vma->vm_ops->fault: %s\n",
                                 (unsigned long)vma->vm_ops->fault);
         if (vma->vm_file && vma->vm_file->f_op)
-               print_symbol(KERN_EMERG "vma->vm_file->f_op->mmap: %s\n",
+               print_symbol(KERN_ALERT "vma->vm_file->f_op->mmap: %s\n",
                                 (unsigned long)vma->vm_file->f_op->mmap);
         dump_stack();
         add_taint(TAINT_BAD_PAGE);
@@ -1209,6 +1210,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
         int write = !!(flags & GUP_FLAGS_WRITE);
         int force = !!(flags & GUP_FLAGS_FORCE);
         int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS);
+       int ignore_sigkill = !!(flags & GUP_FLAGS_IGNORE_SIGKILL);
  
         if (len <= 0)
                 return 0;
@@ -1287,12 +1289,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                         struct page *page;
  
                         /*
-                        * If tsk is ooming, cut off its access to large memory
-                        * allocations. It has a pending SIGKILL, but it can't
-                        * be processed until returning to user space.
+                        * If we have a pending SIGKILL, don't keep faulting
+                        * pages and potentially allocating memory, unless
+                        * current is handling munlock--e.g., on exit. In
+                        * that case, we are not allocating memory.  Rather,
+                        * we're only unlocking already resident/mapped pages.
                          */
-                       if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
-                               return i ? i : -ENOMEM;
+                       if (unlikely(!ignore_sigkill &&
+                                       fatal_signal_pending(current)))
+                               return i ? i : -ERESTARTSYS;
  
                         if (write)
                                 foll_flags |= FOLL_WRITE;
@@ -1506,6 +1511,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
                         unsigned long pfn)
  {
         int ret;
+       pgprot_t pgprot = vma->vm_page_prot;
         /*
          * Technically, architectures with pte_special can avoid all these
          * restrictions (same for remap_pfn_range).  However we would like
@@ -1520,10 +1526,10 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr,
  
         if (addr < vma->vm_start || addr >= vma->vm_end)
                 return -EFAULT;
-       if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE))
+       if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE))
                 return -EINVAL;
  
-       ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot);
+       ret = insert_pfn(vma, addr, pfn, pgprot);
  
         if (ret)
                 untrack_pfn_vma(vma, pfn, PAGE_SIZE);
@@ -1666,9 +1672,15 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
  
         vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
  
-       err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size));
-       if (err)
+       err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size));
+       if (err) {
+               /*
+                * Clear the flags so the higher-level routine calling
+                * unmap_vmas knows no track_pfn cleanup is needed.
+                */
+               vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP);
                 return -EINVAL;
+       }
  
         BUG_ON(addr >= end);
         pfn -= addr >> PAGE_SHIFT;
@@ -1987,7 +1999,7 @@ gotten:
          * Don't let another task, with possibly unlocked vma,
          * keep the mlocked page.
          */
-       if (vma->vm_flags & VM_LOCKED) {
+       if ((vma->vm_flags & VM_LOCKED) && old_page) {
                 lock_page(old_page);    /* for LRU manipulation */
                 clear_page_mlock(old_page);
                 unlock_page(old_page);
@@ -1995,7 +2007,7 @@ gotten:
         cow_user_page(new_page, old_page, address, vma);
         __SetPageUptodate(new_page);
  
-       if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
+       if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
                 goto oom_free_new;
  
         /*
@@ -2387,6 +2399,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         struct page *page;
         swp_entry_t entry;
         pte_t pte;
+       struct mem_cgroup *ptr = NULL;
         int ret = 0;
  
         if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
@@ -2425,7 +2438,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
         lock_page(page);
         delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
  
-       if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+       if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
                 ret = VM_FAULT_OOM;
                 unlock_page(page);
                 goto out;
@@ -2443,7 +2456,19 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 goto out_nomap;
         }
  
-       /* The page isn't present yet, go ahead with the fault. */
+       /*
+        * The page isn't present yet, go ahead with the fault.
+        *
+        * Be careful about the sequence of operations here.
+        * To get its accounting right, reuse_swap_page() must be called
+        * while the page is counted on swap but not yet in mapcount i.e.
+        * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
+        * must be called after the swap_free(), or it will never succeed.
+        * Because delete_from_swap_cache() may be called by reuse_swap_page(),
+        * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
+        * in page->private. In this case, a record in swap_cgroup is silently
+        * discarded at swap_free().
+        */
  
         inc_mm_counter(mm, anon_rss);
         pte = mk_pte(page, vma->vm_page_prot);
@@ -2451,10 +2476,11 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 pte = maybe_mkwrite(pte_mkdirty(pte), vma);
                 write_access = 0;
         }
-
         flush_icache_page(vma, page);
         set_pte_at(mm, address, page_table, pte);
         page_add_anon_rmap(page, vma, address);
+       /* It's better to call commit-charge after rmap is established */
+       mem_cgroup_commit_charge_swapin(page, ptr);
  
         swap_free(entry);
         if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
@@ -2475,7 +2501,7 @@ unlock:
  out:
         return ret;
  out_nomap:
-       mem_cgroup_uncharge_page(page);
+       mem_cgroup_cancel_charge_swapin(ptr);
         pte_unmap_unlock(page_table, ptl);
         unlock_page(page);
         page_cache_release(page);
@@ -2505,7 +2531,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
                 goto oom;
         __SetPageUptodate(page);
  
-       if (mem_cgroup_charge(page, mm, GFP_KERNEL))
+       if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
                 goto oom_free_page;
  
         entry = mk_pte(page, vma->vm_page_prot);
@@ -2596,7 +2622,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                                 ret = VM_FAULT_OOM;
                                 goto out;
                         }
-                       if (mem_cgroup_charge(page, mm, GFP_KERNEL)) {
+                       if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
                                 ret = VM_FAULT_OOM;
                                 page_cache_release(page);
                                 goto out;
@@ -3146,6 +3172,15 @@ void print_vma_addr(char *prefix, unsigned long ip)
  #ifdef CONFIG_PROVE_LOCKING
  void might_fault(void)
  {
+       /*
+        * Some code (nfs/sunrpc) uses socket ops on kernel memory while
+        * holding the mmap_sem. This is safe because kernel memory doesn't
+        * get paged out; therefore we'll never actually fault, and the
+        * below annotations will generate false positives.
+        */
+       if (segment_eq(get_fs(), KERNEL_DS))
+               return;
+
         might_sleep();
         /*
          * it would be nicer only to annotate paths which are not under