powerpc/8xx: Invalidate non present TLBs
diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c
index e8fa506..26fb6b9 100644
--- a/arch/powerpc/mm/fault.c
+++ b/arch/powerpc/mm/fault.c
 #include <linux/highmem.h>
 #include <linux/module.h>
 #include <linux/kprobes.h>
+#include <linux/kdebug.h>
+#include <linux/perf_event.h>
 
+#include <asm/firmware.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mmu.h>
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/tlbflush.h>
-#include <asm/kdebug.h>
 #include <asm/siginfo.h>
+#include <mm/mmu_decl.h>
 
 #ifdef CONFIG_KPROBES
-ATOMIC_NOTIFIER_HEAD(notify_page_fault_chain);
-
-/* Hook to register for page fault notifications */
-int register_page_fault_notifier(struct notifier_block *nb)
-{
-       return atomic_notifier_chain_register(&notify_page_fault_chain, nb);
-}
-
-int unregister_page_fault_notifier(struct notifier_block *nb)
+static inline int notify_page_fault(struct pt_regs *regs)
 {
-       return atomic_notifier_chain_unregister(&notify_page_fault_chain, nb);
-}
+       int ret = 0;
+
+       /* kprobe_running() needs smp_processor_id() */
+       if (!user_mode(regs)) {
+               preempt_disable();
+               if (kprobe_running() && kprobe_fault_handler(regs, 11))
+                       ret = 1;
+               preempt_enable();
+       }
 
-static inline int notify_page_fault(enum die_val val, const char *str,
-                       struct pt_regs *regs, long err, int trap, int sig)
-{
-       struct die_args args = {
-               .regs = regs,
-               .str = str,
-               .err = err,
-               .trapnr = trap,
-               .signr = sig
-       };
-       return atomic_notifier_call_chain(&notify_page_fault_chain, val, &args);
+       return ret;
 }
 #else
-static inline int notify_page_fault(enum die_val val, const char *str,
-                       struct pt_regs *regs, long err, int trap, int sig)
+static inline int notify_page_fault(struct pt_regs *regs)
 {
-       return NOTIFY_DONE;
+       return 0;
 }
 #endif
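
With the page-fault notifier chain removed, kprobes is hooked by a direct call
at the top of the fault handler; the helper returns nonzero only for
kernel-mode faults that a running kprobe's fault handler claims. Restated as a
minimal call-site sketch (the real call appears further down in this diff):

	/* in do_page_fault(), before any VM processing */
	if (notify_page_fault(regs))
		return 0;	/* fault consumed by kprobes */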
 
@@ -111,31 +102,6 @@ static int store_updates_sp(struct pt_regs *regs)
        return 0;
 }
 
-#if !(defined(CONFIG_4xx) || defined(CONFIG_BOOKE))
-static void do_dabr(struct pt_regs *regs, unsigned long address,
-                   unsigned long error_code)
-{
-       siginfo_t info;
-
-       if (notify_die(DIE_DABR_MATCH, "dabr_match", regs, error_code,
-                       11, SIGSEGV) == NOTIFY_STOP)
-               return;
-
-       if (debugger_dabr_match(regs))
-               return;
-
-       /* Clear the DABR */
-       set_dabr(0);
-
-       /* Deliver the signal to userspace */
-       info.si_signo = SIGTRAP;
-       info.si_errno = 0;
-       info.si_code = TRAP_HWBKPT;
-       info.si_addr = (void __user *)address;
-       force_sig_info(SIGTRAP, &info, current);
-}
-#endif /* !(CONFIG_4xx || CONFIG_BOOKE)*/
-
 /*
  * For 600- and 800-family processors, the error_code parameter is DSISR
  * for a data fault, SRR1 for an instruction fault. For 400-family processors
@@ -156,7 +122,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
        struct mm_struct *mm = current->mm;
        siginfo_t info;
        int code = SEGV_MAPERR;
-       int is_write = 0;
+       int is_write = 0, ret;
        int trap = TRAP(regs);
        int is_exec = trap == 0x400;
 
@@ -175,14 +141,11 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
        is_write = error_code & ESR_DST;
 #endif /* CONFIG_4xx || CONFIG_BOOKE */
 
-       if (notify_page_fault(DIE_PAGE_FAULT, "page_fault", regs, error_code,
-                               11, SIGSEGV) == NOTIFY_STOP)
+       if (notify_page_fault(regs))
                return 0;
 
-       if (trap == 0x300) {
-               if (debugger_fault_handler(regs))
-                       return 0;
-       }
+       if (unlikely(debugger_fault_handler(regs)))
+               return 0;
 
        /* On a kernel SLB miss we can only check for a valid exception entry */
        if (!user_mode(regs) && (address >= TASK_SIZE))
@@ -201,13 +164,15 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address,
                        return SIGSEGV;
                /* in_atomic() in user mode is really bad,
                   as is current->mm == NULL. */
-               printk(KERN_EMERG "Page fault in user mode with"
+               printk(KERN_EMERG "Page fault in user mode with "
                       "in_atomic() = %d mm = %p\n", in_atomic(), mm);
                printk(KERN_EMERG "NIP = %lx  MSR = %lx\n",
                       regs->nip, regs->msr);
                die("Weird page fault", regs, SIGSEGV);
        }
 
+       perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, 0, regs, address);
+
        /* When running in the kernel we expect faults to occur only to
         * addresses in user space.  All other faults represent errors in the
         * kernel and should generate an OOPS.  Unfortunately, in the case of an
@@ -281,6 +246,12 @@ good_area:
                goto bad_area;
 #endif /* CONFIG_6xx */
 #if defined(CONFIG_8xx)
+       /* The MPC8xx sometimes needs to load invalid/non-present TLB entries.
+        * These must be invalidated separately, as the Linux mm code doesn't do it.
+        */
+       if (error_code & 0x40000000) /* no translation? */
+               _tlbil_va(address, 0, 0, 0);
+
         /* The MPC8xx seems to always set 0x80000000, which is
          * "undefined".  Of those that can be set, this is the only
          * one which seems bad.
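
Aside (not part of the patch): the 0x40000000 bit tested by the new 8xx check
above is the classic "translation not found" DSISR indication, which asm/reg.h
names DSISR_NOHPTE. A purely illustrative restatement of the same check,
assuming that define is visible in this file:

	/* illustrative only -- equivalent to the 0x40000000 test added above */
	if (error_code & DSISR_NOHPTE)		/* no translation found */
		_tlbil_va(address, 0, 0, 0);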
@@ -291,39 +262,33 @@ good_area:
 #endif /* CONFIG_8xx */
 
        if (is_exec) {
-#ifdef CONFIG_PPC64
-               /* protection fault */
+#ifdef CONFIG_PPC_STD_MMU
+               /* A protection fault on exec goes straight to failure on
+                * hash-based MMUs, as they either don't support per-page
+                * execute permission or, if they do, it's already handled
+                * at the hash level. This test would probably have to
+                * be removed if we change the way this works to make hash
+                * processors use the same I/D cache coherency mechanism
+                * as embedded.
+                */
                if (error_code & DSISR_PROTFAULT)
                        goto bad_area;
-               if (!(vma->vm_flags & VM_EXEC))
+#endif /* CONFIG_PPC_STD_MMU */
+
+               /*
+                * Allow execution from readable areas if the MMU does not
+                * provide separate controls over reading and executing.
+                *
+                * Note: That code used to not be enabled for 4xx/BookE.
+                * It is now as I/D cache coherency for these is done at
+                * set_pte_at() time and I see no reason why the test
+                * below wouldn't be valid on those processors. This -may-
+                * break programs compiled with a really old ABI though.
+                */
+               if (!(vma->vm_flags & VM_EXEC) &&
+                   (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
+                    !(vma->vm_flags & (VM_READ | VM_WRITE))))
                        goto bad_area;
-#endif
-#if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
-               pte_t *ptep;
-               pmd_t *pmdp;
-
-               /* Since 4xx/Book-E supports per-page execute permission,
-                * we lazily flush dcache to icache. */
-               ptep = NULL;
-               if (get_pteptr(mm, address, &ptep, &pmdp)) {
-                       spinlock_t *ptl = pte_lockptr(mm, pmdp);
-                       spin_lock(ptl);
-                       if (pte_present(*ptep)) {
-                               struct page *page = pte_page(*ptep);
-
-                               if (!test_bit(PG_arch_1, &page->flags)) {
-                                       flush_dcache_icache_page(page);
-                                       set_bit(PG_arch_1, &page->flags);
-                               }
-                               pte_update(ptep, 0, _PAGE_HWEXEC);
-                               _tlbie(address);
-                               pte_unmap_unlock(ptep, ptl);
-                               up_read(&mm->mmap_sem);
-                               return 0;
-                       }
-                       pte_unmap_unlock(ptep, ptl);
-               }
-#endif
        /* a write */
        } else if (is_write) {
                if (!(vma->vm_flags & VM_WRITE))
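
The unified exec check above replaces the old PPC64/4xx/BookE #ifdef paths with
one predicate: the fault is rejected only when the VMA lacks VM_EXEC and the
"readable implies executable" fallback is unavailable. A condensed, purely
illustrative restatement (the bad_exec local is not something the patch adds):

	/* Reject the exec fault iff the mapping is not VM_EXEC and we may not
	 * fall back on "readable implies executable", i.e. the CPU enforces
	 * per-page no-execute (CPU_FTR_NOEXECUTE) or the area is neither
	 * readable nor writable. */
	int bad_exec = !(vma->vm_flags & VM_EXEC) &&
		       (cpu_has_feature(CPU_FTR_NOEXECUTE) ||
			!(vma->vm_flags & (VM_READ | VM_WRITE)));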
@@ -343,22 +308,30 @@ good_area:
         * the fault.
         */
  survive:
-       switch (handle_mm_fault(mm, vma, address, is_write)) {
-
-       case VM_FAULT_MINOR:
-               current->min_flt++;
-               break;
-       case VM_FAULT_MAJOR:
-               current->maj_flt++;
-               break;
-       case VM_FAULT_SIGBUS:
-               goto do_sigbus;
-       case VM_FAULT_OOM:
-               goto out_of_memory;
-       default:
+       ret = handle_mm_fault(mm, vma, address, is_write ? FAULT_FLAG_WRITE : 0);
+       if (unlikely(ret & VM_FAULT_ERROR)) {
+               if (ret & VM_FAULT_OOM)
+                       goto out_of_memory;
+               else if (ret & VM_FAULT_SIGBUS)
+                       goto do_sigbus;
                BUG();
        }
-
+       if (ret & VM_FAULT_MAJOR) {
+               current->maj_flt++;
+               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
+                                    regs, address);
+#ifdef CONFIG_PPC_SMLPAR
+               if (firmware_has_feature(FW_FEATURE_CMO)) {
+                       preempt_disable();
+                       get_lppaca()->page_ins += (1 << PAGE_FACTOR);
+                       preempt_enable();
+               }
+#endif
+       } else {
+               current->min_flt++;
+               perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
+                                    regs, address);
+       }
        up_read(&mm->mmap_sem);
        return 0;
 
@@ -376,7 +349,7 @@ bad_area_nosemaphore:
            && printk_ratelimit())
                printk(KERN_CRIT "kernel tried to execute NX-protected"
                       " page (%lx) - exploit attempt? (uid: %d)\n",
-                      address, current->uid);
+                      address, current_uid());
 
        return SIGSEGV;
 
@@ -386,14 +359,14 @@ bad_area_nosemaphore:
  */
 out_of_memory:
        up_read(&mm->mmap_sem);
-       if (is_init(current)) {
+       if (is_global_init(current)) {
                yield();
                down_read(&mm->mmap_sem);
                goto survive;
        }
        printk("VM: killing process %s\n", current->comm);
        if (user_mode(regs))
-               do_exit(SIGKILL);
+               do_group_exit(SIGKILL);
        return SIGKILL;
 
 do_sigbus:
@@ -426,18 +399,21 @@ void bad_page_fault(struct pt_regs *regs, unsigned long address, int sig)
 
        /* kernel has accessed a bad area */
 
-       printk(KERN_ALERT "Unable to handle kernel paging request for ");
        switch (regs->trap) {
-               case 0x300:
-               case 0x380:
-                       printk("data at address 0x%08lx\n", regs->dar);
-                       break;
-               case 0x400:
-               case 0x480:
-                       printk("instruction fetch\n");
-                       break;
-               default:
-                       printk("unknown fault\n");
+       case 0x300:
+       case 0x380:
+               printk(KERN_ALERT "Unable to handle kernel paging request for "
+                       "data at address 0x%08lx\n", regs->dar);
+               break;
+       case 0x400:
+       case 0x480:
+               printk(KERN_ALERT "Unable to handle kernel paging request for "
+                       "instruction fetch\n");
+               break;
+       default:
+               printk(KERN_ALERT "Unable to handle kernel paging request for "
+                       "unknown fault\n");
+               break;
        }
        printk(KERN_ALERT "Faulting instruction address: 0x%08lx\n",
                regs->nip);