Merge branch 'linus' into perfcounters/core
author Ingo Molnar <mingo@elte.hu>
Tue, 7 Apr 2009 10:05:21 +0000 (12:05 +0200)
committer Ingo Molnar <mingo@elte.hu>
Tue, 7 Apr 2009 10:05:25 +0000 (12:05 +0200)
Merge reason: need the upstream facility added by:

  7f1e2ca: hrtimer: fix rq->lock inversion (again)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
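
Note on the merge reason: 7f1e2ca adds __hrtimer_start_range_ns() with an
extra "wakeup" argument, so a timer can be armed while rq->lock is held
without waking the hrtimer softirq (whose wakeup path can end up taking
rq->lock again). The kernel/sched.c hunks below depend on this. A minimal
sketch of the call, with the prototype inferred from those hunks rather
than quoted from the hrtimer code:

  /*
   * Sketch only: the last argument selects whether the hrtimer softirq
   * may be woken; 0 means "no wakeup", which is what rq->lock holders need.
   */
  int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
                               unsigned long delta_ns,
                               const enum hrtimer_mode mode, int wakeup);

  /* e.g. from hrtick_start(), called with rq->lock held: */
  __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
                           HRTIMER_MODE_REL, 0);
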
arch/x86/Kconfig
arch/x86/kernel/apic/apic.c
init/Kconfig
kernel/exit.c
kernel/sched.c

diff --combined arch/x86/Kconfig
@@@ -253,6 -253,7 +253,7 @@@ config SM
  config X86_X2APIC
        bool "Support x2apic"
        depends on X86_LOCAL_APIC && X86_64
+       select INTR_REMAP
        ---help---
          This enables x2apic support on CPUs that have this feature.
  
@@@ -727,7 -728,6 +728,7 @@@ config X86_UP_IOAPI
  config X86_LOCAL_APIC
        def_bool y
        depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
 +      select HAVE_PERF_COUNTERS if (!M386 && !M486)
  
  config X86_IO_APIC
        def_bool y
@@@ -1882,7 -1882,6 +1883,6 @@@ config DMAR_FLOPPY_W
  config INTR_REMAP
        bool "Support for Interrupt Remapping (EXPERIMENTAL)"
        depends on X86_64 && X86_IO_APIC && PCI_MSI && ACPI && EXPERIMENTAL
-       select X86_X2APIC
        ---help---
          Supports Interrupt remapping for IO-APIC and MSI devices.
          To use x2apic mode in the CPUs which support x2APIC enhancements or
@@@ -34,7 -34,6 +34,7 @@@
  #include <linux/smp.h>
  #include <linux/mm.h>
  
 +#include <asm/perf_counter.h>
  #include <asm/pgalloc.h>
  #include <asm/atomic.h>
  #include <asm/mpspec.h>
@@@ -756,8 -755,6 +756,8 @@@ static void local_apic_timer_interrupt(
        inc_irq_stat(apic_timer_irqs);
  
        evt->event_handler(evt);
 +
 +      perf_counter_unthrottle();
  }
  
  /*
@@@ -1130,7 -1127,6 +1130,7 @@@ void __cpuinit setup_local_APIC(void
                apic_write(APIC_ESR, 0);
        }
  #endif
 +      perf_counters_lapic_init(0);
  
        preempt_disable();
  
@@@ -1308,6 -1304,7 +1308,7 @@@ void __init enable_IR_x2apic(void
  #ifdef CONFIG_INTR_REMAP
        int ret;
        unsigned long flags;
+       struct IO_APIC_route_entry **ioapic_entries = NULL;
  
        if (!cpu_has_x2apic)
                return;
                return;
        }
  
-       ret = save_IO_APIC_setup();
+       ioapic_entries = alloc_ioapic_entries();
+       if (!ioapic_entries) {
+               pr_info("Allocate ioapic_entries failed\n");
+               goto end;
+       }
+       ret = save_IO_APIC_setup(ioapic_entries);
        if (ret) {
                pr_info("Saving IO-APIC state failed: %d\n", ret);
                goto end;
        }
  
        local_irq_save(flags);
-       mask_IO_APIC_setup();
+       mask_IO_APIC_setup(ioapic_entries);
        mask_8259A();
  
-       ret = enable_intr_remapping(1);
+       ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
  
        if (ret && x2apic_preenabled) {
                local_irq_restore(flags);
@@@ -1368,9 -1371,9 +1375,9 @@@ end_restore
                /*
                 * IR enabling failed
                 */
-               restore_IO_APIC_setup();
+               restore_IO_APIC_setup(ioapic_entries);
        else
-               reinit_intr_remapped_IO_APIC(x2apic_preenabled);
+               reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
  
        unmask_8259A();
        local_irq_restore(flags);
@@@ -1383,6 -1386,8 +1390,8 @@@ end
                        pr_info("Enabled Interrupt-remapping\n");
        } else
                pr_err("Failed to enable Interrupt-remapping and x2apic\n");
+       if (ioapic_entries)
+               free_ioapic_entries(ioapic_entries);
  #else
        if (!cpu_has_x2apic)
                return;
@@@ -1958,6 -1963,10 +1967,10 @@@ static int lapic_suspend(struct sys_dev
  
        local_irq_save(flags);
        disable_local_APIC();
+ #ifdef CONFIG_INTR_REMAP
+       if (intr_remapping_enabled)
+               disable_intr_remapping();
+ #endif
        local_irq_restore(flags);
        return 0;
  }
@@@ -1968,15 -1977,41 +1981,41 @@@ static int lapic_resume(struct sys_devi
        unsigned long flags;
        int maxlvt;
  
+ #ifdef CONFIG_INTR_REMAP
+       int ret;
+       struct IO_APIC_route_entry **ioapic_entries = NULL;
        if (!apic_pm_state.active)
                return 0;
  
-       maxlvt = lapic_get_maxlvt();
        local_irq_save(flags);
+       if (x2apic) {
+               ioapic_entries = alloc_ioapic_entries();
+               if (!ioapic_entries) {
+                       WARN(1, "Alloc ioapic_entries in lapic resume failed.");
+                       return -ENOMEM;
+               }
+               ret = save_IO_APIC_setup(ioapic_entries);
+               if (ret) {
+                       WARN(1, "Saving IO-APIC state failed: %d\n", ret);
+                       free_ioapic_entries(ioapic_entries);
+                       return ret;
+               }
+               mask_IO_APIC_setup(ioapic_entries);
+               mask_8259A();
+               enable_x2apic();
+       }
+ #else
+       if (!apic_pm_state.active)
+               return 0;
  
+       local_irq_save(flags);
        if (x2apic)
                enable_x2apic();
+ #endif
        else {
                /*
                 * Make sure the APICBASE points to the right address
                wrmsr(MSR_IA32_APICBASE, l, h);
        }
  
+       maxlvt = lapic_get_maxlvt();
        apic_write(APIC_LVTERR, ERROR_APIC_VECTOR | APIC_LVT_MASKED);
        apic_write(APIC_ID, apic_pm_state.apic_id);
        apic_write(APIC_DFR, apic_pm_state.apic_dfr);
        apic_write(APIC_ESR, 0);
        apic_read(APIC_ESR);
  
+ #ifdef CONFIG_INTR_REMAP
+       if (intr_remapping_enabled)
+               reenable_intr_remapping(EIM_32BIT_APIC_ID);
+       if (x2apic) {
+               unmask_8259A();
+               restore_IO_APIC_setup(ioapic_entries);
+               free_ioapic_entries(ioapic_entries);
+       }
+ #endif
        local_irq_restore(flags);
  
        return 0;
  }
  
@@@ -2052,7 -2100,9 +2104,9 @@@ static int __init init_lapic_sysfs(void
                error = sysdev_register(&device_lapic);
        return error;
  }
- device_initcall(init_lapic_sysfs);
+ /* local apic needs to resume before other devices access its registers. */
+ core_initcall(init_lapic_sysfs);
  
  #else /* CONFIG_PM */
  
diff --combined init/Kconfig
@@@ -919,41 -919,6 +919,41 @@@ config AI
            by some high performance threaded applications. Disabling
            this option saves about 7k.
  
 +config HAVE_PERF_COUNTERS
 +      bool
 +
 +menu "Performance Counters"
 +
 +config PERF_COUNTERS
 +      bool "Kernel Performance Counters"
 +      depends on HAVE_PERF_COUNTERS
 +      default y
 +      select ANON_INODES
 +      help
 +        Enable kernel support for performance counter hardware.
 +
 +        Performance counters are special hardware registers available
 +        on most modern CPUs. These registers count the number of certain
 +        types of hardware events - such as instructions executed, cache
 +        misses suffered, or branches mis-predicted - without slowing down
 +        the kernel or applications. These registers can also trigger
 +        interrupts when a threshold number of events has passed - and can
 +        thus be used to profile the code that runs on that CPU.
 +
 +        The Linux Performance Counter subsystem provides an abstraction of
 +        these hardware capabilities, available via a system call. It
 +        provides per task and per CPU counters, and it provides event
 +        capabilities on top of those.
 +
 +        Say Y if unsure.
 +
 +config EVENT_PROFILE
 +      bool "Tracepoint profile sources"
 +      depends on PERF_COUNTERS && EVENT_TRACER
 +      default y
 +
 +endmenu
 +
  config VM_EVENT_COUNTERS
        default y
        bool "Enable VM event counters for /proc/vmstat" if EMBEDDED
@@@ -1051,7 -1016,7 +1051,7 @@@ source "arch/Kconfig
  
  config SLOW_WORK
        default n
-       bool "Enable slow work thread pool"
+       bool
        help
          The slow work thread pool provides a number of dynamically allocated
          threads that can be used by the kernel to perform operations that
          by a series of mkdirs and a create call, all of which have to touch
          disk.
  
+         See Documentation/slow-work.txt.
  endmenu               # General setup
  
  config HAVE_GENERIC_DMA_COHERENT
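
The PERF_COUNTERS help text above says the counters are reachable via a
system call. The syscall added on this branch (sys_perf_counter_open) and
its attribute layout are not part of this diff, so the user-space sketch
below uses the later, renamed perf_event_open() interface that this
subsystem eventually became - for illustration only, not the ABI as it
stood at the time of this merge:

  /* count user-space instructions executed by a region of code */
  #define _GNU_SOURCE
  #include <linux/perf_event.h>
  #include <sys/syscall.h>
  #include <sys/ioctl.h>
  #include <sys/types.h>
  #include <unistd.h>
  #include <string.h>
  #include <stdio.h>

  static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                              int cpu, int group_fd, unsigned long flags)
  {
          return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
  }

  int main(void)
  {
          struct perf_event_attr attr;
          long long count;
          int fd;

          memset(&attr, 0, sizeof(attr));
          attr.size           = sizeof(attr);
          attr.type           = PERF_TYPE_HARDWARE;
          attr.config         = PERF_COUNT_HW_INSTRUCTIONS;
          attr.disabled       = 1;
          attr.exclude_kernel = 1;

          /* pid 0 = this task, cpu -1 = any CPU, no group, no flags */
          fd = perf_event_open(&attr, 0, -1, -1, 0);
          if (fd < 0) {
                  perror("perf_event_open");
                  return 1;
          }

          ioctl(fd, PERF_EVENT_IOC_RESET, 0);
          ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
          /* ... workload being measured ... */
          ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

          if (read(fd, &count, sizeof(count)) == sizeof(count))
                  printf("instructions: %lld\n", count);
          close(fd);
          return 0;
  }
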
diff --combined kernel/exit.c
@@@ -158,9 -158,6 +158,9 @@@ static void delayed_put_task_struct(str
  {
        struct task_struct *tsk = container_of(rhp, struct task_struct, rcu);
  
 +#ifdef CONFIG_PERF_COUNTERS
 +      WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
 +#endif
        trace_sched_process_free(tsk);
        put_task_struct(tsk);
  }
@@@ -840,8 -837,7 +840,7 @@@ static void exit_notify(struct task_str
         */
        if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
            (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
-            tsk->self_exec_id != tsk->parent_exec_id) &&
-           !capable(CAP_KILL))
+            tsk->self_exec_id != tsk->parent_exec_id))
                tsk->exit_signal = SIGCHLD;
  
        signal = tracehook_notify_death(tsk, &cookie, group_dead);
@@@ -983,6 -979,10 +982,6 @@@ NORET_TYPE void do_exit(long code
        tsk->mempolicy = NULL;
  #endif
  #ifdef CONFIG_FUTEX
 -      /*
 -       * This must happen late, after the PID is not
 -       * hashed anymore:
 -       */
        if (unlikely(!list_empty(&tsk->pi_state_list)))
                exit_pi_state_list(tsk);
        if (unlikely(current->pi_state_cache))
@@@ -1249,12 -1249,6 +1248,12 @@@ static int wait_task_zombie(struct task
         */
        read_unlock(&tasklist_lock);
  
 +      /*
 +       * Flush inherited counters to the parent - before the parent
 +       * gets woken up by child-exit notifications.
 +       */
 +      perf_counter_exit_task(p);
 +
        retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0;
        status = (p->signal->flags & SIGNAL_GROUP_EXIT)
                ? p->signal->group_exit_code : p->exit_code;
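
The two kernel/exit.c changes above encode one lifetime rule: inherited
counters are flushed to the parent in wait_task_zombie(), before the
parent is woken by the child-exit notification, and by the time the
RCU-delayed final put runs the per-task counter list must already be
empty. A comment-only sketch of that ordering (the release_task() /
call_rcu() step is recalled from the surrounding kernel code, not shown
in this diff):

  /*
   *  wait_task_zombie(p)
   *      perf_counter_exit_task(p);      - flush inherited counters to parent
   *      release_task(p)
   *          call_rcu(&p->rcu, delayed_put_task_struct);
   *
   *  delayed_put_task_struct()           - after an RCU grace period
   *      WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list));
   *      put_task_struct(tsk);
   */
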
diff --combined kernel/sched.c
@@@ -231,13 -231,20 +231,20 @@@ static void start_rt_bandwidth(struct r
  
        spin_lock(&rt_b->rt_runtime_lock);
        for (;;) {
+               unsigned long delta;
+               ktime_t soft, hard;
                if (hrtimer_active(&rt_b->rt_period_timer))
                        break;
  
                now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
                hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-               hrtimer_start_expires(&rt_b->rt_period_timer,
-                               HRTIMER_MODE_ABS);
+               soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
+               hard = hrtimer_get_expires(&rt_b->rt_period_timer);
+               delta = ktime_to_ns(ktime_sub(hard, soft));
+               __hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
+                               HRTIMER_MODE_ABS, 0);
        }
        spin_unlock(&rt_b->rt_runtime_lock);
  }
@@@ -577,7 -584,6 +584,7 @@@ struct rq 
        struct load_weight load;
        unsigned long nr_load_updates;
        u64 nr_switches;
 +      u64 nr_migrations_in;
  
        struct cfs_rq cfs;
        struct rt_rq rt;
@@@ -686,7 -692,7 +693,7 @@@ static inline int cpu_of(struct rq *rq
  #define task_rq(p)            cpu_rq(task_cpu(p))
  #define cpu_curr(cpu)         (cpu_rq(cpu)->curr)
  
 -static inline void update_rq_clock(struct rq *rq)
 +inline void update_rq_clock(struct rq *rq)
  {
        rq->clock = sched_clock_cpu(cpu_of(rq));
  }
@@@ -1147,7 -1153,8 +1154,8 @@@ static __init void init_hrtick(void
   */
  static void hrtick_start(struct rq *rq, u64 delay)
  {
-       hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+       __hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
+                       HRTIMER_MODE_REL, 0);
  }
  
  static inline void init_hrtick(void)
@@@ -1948,15 -1955,12 +1956,15 @@@ void set_task_cpu(struct task_struct *p
                p->se.sleep_start -= clock_offset;
        if (p->se.block_start)
                p->se.block_start -= clock_offset;
 +#endif
        if (old_cpu != new_cpu) {
 -              schedstat_inc(p, se.nr_migrations);
 +              p->se.nr_migrations++;
 +              new_rq->nr_migrations_in++;
 +#ifdef CONFIG_SCHEDSTATS
                if (task_hot(p, old_rq->clock, NULL))
                        schedstat_inc(p, se.nr_forced2_migrations);
 -      }
  #endif
 +      }
        p->se.vruntime -= old_cfsrq->min_vruntime -
                                         new_cfsrq->min_vruntime;
  
@@@ -2308,27 -2312,6 +2316,27 @@@ static int sched_balance_self(int cpu, 
  
  #endif /* CONFIG_SMP */
  
 +/**
 + * task_oncpu_function_call - call a function on the cpu on which a task runs
 + * @p:                the task to evaluate
 + * @func:     the function to be called
 + * @info:     the function call argument
 + *
 + * Calls the function @func when the task is currently running. This might
 + * be on the current CPU, in which case the function is called directly.
 + */
 +void task_oncpu_function_call(struct task_struct *p,
 +                            void (*func) (void *info), void *info)
 +{
 +      int cpu;
 +
 +      preempt_disable();
 +      cpu = task_cpu(p);
 +      if (task_curr(p))
 +              smp_call_function_single(cpu, func, info, 1);
 +      preempt_enable();
 +}
 +
  /***
   * try_to_wake_up - wake up a thread
   * @p: the to-be-woken-up thread
@@@ -2485,7 -2468,6 +2493,7 @@@ static void __sched_fork(struct task_st
        p->se.exec_start                = 0;
        p->se.sum_exec_runtime          = 0;
        p->se.prev_sum_exec_runtime     = 0;
 +      p->se.nr_migrations             = 0;
        p->se.last_wakeup               = 0;
        p->se.avg_overlap               = 0;
        p->se.start_runtime             = 0;
@@@ -2716,7 -2698,6 +2724,7 @@@ static void finish_task_switch(struct r
         */
        prev_state = prev->state;
        finish_arch_switch(prev);
 +      perf_counter_task_sched_in(current, cpu_of(rq));
        finish_lock_switch(rq, prev);
  #ifdef CONFIG_SMP
        if (post_schedule)
@@@ -2879,15 -2860,6 +2887,15 @@@ unsigned long nr_active(void
  }
  
  /*
 + * Externally visible per-cpu scheduler statistics:
 + * cpu_nr_migrations(cpu) - number of migrations into that cpu
 + */
 +u64 cpu_nr_migrations(int cpu)
 +{
 +      return cpu_rq(cpu)->nr_migrations_in;
 +}
 +
 +/*
   * Update rq->cpu_load[] statistics. This function is usually called every
   * scheduler tick (TICK_NSEC).
   */
@@@ -4542,29 -4514,6 +4550,29 @@@ EXPORT_PER_CPU_SYMBOL(kstat)
   * Return any ns on the sched_clock that have not yet been banked in
   * @p in case that task is currently running.
   */
 +unsigned long long __task_delta_exec(struct task_struct *p, int update)
 +{
 +      s64 delta_exec;
 +      struct rq *rq;
 +
 +      rq = task_rq(p);
 +      WARN_ON_ONCE(!runqueue_is_locked());
 +      WARN_ON_ONCE(!task_current(rq, p));
 +
 +      if (update)
 +              update_rq_clock(rq);
 +
 +      delta_exec = rq->clock - p->se.exec_start;
 +
 +      WARN_ON_ONCE(delta_exec < 0);
 +
 +      return delta_exec;
 +}
 +
 +/*
 + * Return any ns on the sched_clock that have not yet been banked in
 + * @p in case that task is currently running.
 + */
  unsigned long long task_delta_exec(struct task_struct *p)
  {
        unsigned long flags;
@@@ -4824,7 -4773,6 +4832,7 @@@ void scheduler_tick(void
        update_rq_clock(rq);
        update_cpu_load(rq);
        curr->sched_class->task_tick(rq, curr, 0);
 +      perf_counter_task_tick(curr, cpu);
        spin_unlock(&rq->lock);
  
  #ifdef CONFIG_SMP
@@@ -5040,7 -4988,6 +5048,7 @@@ need_resched_nonpreemptible
  
        if (likely(prev != next)) {
                sched_info_switch(prev, next);
 +              perf_counter_task_sched_out(prev, cpu);
  
                rq->nr_switches++;
                rq->curr = next;