Merge commit 'v2.6.29-rc1' into perfcounters/core
[safe/jmp/linux-2.6] / drivers / acpi / processor_idle.c
1 /*
2  * processor_idle - idle state submodule to the ACPI processor driver
3  *
4  *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5  *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6  *  Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
7  *  Copyright (C) 2004  Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
8  *                      - Added processor hotplug support
9  *  Copyright (C) 2005  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
10  *                      - Added support for C3 on SMP
11  *
12  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13  *
14  *  This program is free software; you can redistribute it and/or modify
15  *  it under the terms of the GNU General Public License as published by
16  *  the Free Software Foundation; either version 2 of the License, or (at
17  *  your option) any later version.
18  *
19  *  This program is distributed in the hope that it will be useful, but
20  *  WITHOUT ANY WARRANTY; without even the implied warranty of
21  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
22  *  General Public License for more details.
23  *
24  *  You should have received a copy of the GNU General Public License along
25  *  with this program; if not, write to the Free Software Foundation, Inc.,
26  *  59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
27  *
28  * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
29  */
30
31 #include <linux/kernel.h>
32 #include <linux/module.h>
33 #include <linux/init.h>
34 #include <linux/cpufreq.h>
35 #include <linux/proc_fs.h>
36 #include <linux/seq_file.h>
37 #include <linux/acpi.h>
38 #include <linux/dmi.h>
39 #include <linux/moduleparam.h>
40 #include <linux/sched.h>        /* need_resched() */
41 #include <linux/pm_qos_params.h>
42 #include <linux/clockchips.h>
43 #include <linux/cpuidle.h>
44 #include <linux/irqflags.h>
45
46 /*
47  * Include the apic definitions for x86 so that the APIC timer related defines
48  * are also available on UP (on SMP asm/apic.h gets magically included via linux/smp.h).
49  * asm/acpi.h is not an option, as it would require more include magic. Also
50  * creating an empty asm-ia64/apic.h would just trade pest vs. cholera.
51  */
52 #ifdef CONFIG_X86
53 #include <asm/apic.h>
54 #endif
55
56 #include <asm/io.h>
57 #include <asm/uaccess.h>
58
59 #include <acpi/acpi_bus.h>
60 #include <acpi/processor.h>
61 #include <asm/processor.h>
62
63 #define ACPI_PROCESSOR_CLASS            "processor"
64 #define _COMPONENT              ACPI_PROCESSOR_COMPONENT
65 ACPI_MODULE_NAME("processor_idle");
66 #define ACPI_PROCESSOR_FILE_POWER       "power"
67 #define US_TO_PM_TIMER_TICKS(t)         ((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
68 #define PM_TIMER_TICK_NS                (1000000000ULL/PM_TIMER_FREQUENCY)
69 #ifndef CONFIG_CPU_IDLE
70 #define C2_OVERHEAD                     4       /* 1us (3.579 ticks per us) */
71 #define C3_OVERHEAD                     4       /* 1us (3.579 ticks per us) */
72 static void (*pm_idle_save) (void) __read_mostly;
73 #else
74 #define C2_OVERHEAD                     1       /* 1us */
75 #define C3_OVERHEAD                     1       /* 1us */
76 #endif
77 #define PM_TIMER_TICKS_TO_US(p)         (((p) * 1000)/(PM_TIMER_FREQUENCY/1000))
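/*
 * The ACPI PM timer runs at PM_TIMER_FREQUENCY = 3579545 Hz (3.579545 MHz),
 * so one tick is roughly 279 ns.  For example, US_TO_PM_TIMER_TICKS(100)
 * evaluates to (100 * 3579) / 1000 = 357 ticks for 100 microseconds, and
 * PM_TIMER_TICKS_TO_US() performs the inverse conversion with the same
 * integer truncation.
 */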
78
79 static unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER;
80 #ifdef CONFIG_CPU_IDLE
81 module_param(max_cstate, uint, 0000);
82 #else
83 module_param(max_cstate, uint, 0644);
84 #endif
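/*
 * max_cstate can also be capped from the kernel command line, e.g. booting
 * with "processor.max_cstate=1" restricts the driver to C1; the DMI
 * blacklist below prints the same override hint when it triggers.
 */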
85 static unsigned int nocst __read_mostly;
86 module_param(nocst, uint, 0000);
87
88 #ifndef CONFIG_CPU_IDLE
89 /*
90  * bm_history -- bit-mask with a bit per jiffy of bus-master activity
91  * 1000 HZ: 0xFFFFFFFF: 32 jiffies = 32ms
92  * 800 HZ: 0xFFFFFFFF: 32 jiffies = 40ms
93  * 100 HZ: 0x0000000F: 4 jiffies = 40ms
94  * reduce history for more aggressive entry into C3
95  */
96 static unsigned int bm_history __read_mostly =
97     (HZ >= 800 ? 0xFFFFFFFF : ((1U << (HZ / 25)) - 1));
98 module_param(bm_history, uint, 0644);
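/*
 * Worked example of the initializer above: with HZ=250 it evaluates to
 * (1U << (250 / 25)) - 1 = 0x3FF, i.e. a 10-jiffy (40 ms) history window,
 * while HZ >= 800 saturates to the full 32-bit mask.
 */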
99
100 static int acpi_processor_set_power_policy(struct acpi_processor *pr);
101
102 #else   /* CONFIG_CPU_IDLE */
103 static unsigned int latency_factor __read_mostly = 2;
104 module_param(latency_factor, uint, 0644);
105 #endif
106
107 /*
108  * IBM ThinkPad R40e crashes mysteriously when going into C2 or C3.
109  * For now disable this. Probably a bug somewhere else.
110  *
111  * To skip this limit, boot/load with a large max_cstate limit.
112  */
113 static int set_max_cstate(const struct dmi_system_id *id)
114 {
115         if (max_cstate > ACPI_PROCESSOR_MAX_POWER)
116                 return 0;
117
118         printk(KERN_NOTICE PREFIX "%s detected - limiting to C%ld max_cstate."
119                " Override with \"processor.max_cstate=%d\"\n", id->ident,
120                (long)id->driver_data, ACPI_PROCESSOR_MAX_POWER + 1);
121
122         max_cstate = (long)id->driver_data;
123
124         return 0;
125 }
126
127 /* Actually this shouldn't be __cpuinitdata, would be better to fix the
128    callers to only run once -AK */
129 static struct dmi_system_id __cpuinitdata processor_power_dmi_table[] = {
130         { set_max_cstate, "IBM ThinkPad R40e", {
131           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
132           DMI_MATCH(DMI_BIOS_VERSION,"1SET70WW")}, (void *)1},
133         { set_max_cstate, "IBM ThinkPad R40e", {
134           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
135           DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW")}, (void *)1},
136         { set_max_cstate, "IBM ThinkPad R40e", {
137           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
138           DMI_MATCH(DMI_BIOS_VERSION,"1SET43WW") }, (void*)1},
139         { set_max_cstate, "IBM ThinkPad R40e", {
140           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
141           DMI_MATCH(DMI_BIOS_VERSION,"1SET45WW") }, (void*)1},
142         { set_max_cstate, "IBM ThinkPad R40e", {
143           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
144           DMI_MATCH(DMI_BIOS_VERSION,"1SET47WW") }, (void*)1},
145         { set_max_cstate, "IBM ThinkPad R40e", {
146           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
147           DMI_MATCH(DMI_BIOS_VERSION,"1SET50WW") }, (void*)1},
148         { set_max_cstate, "IBM ThinkPad R40e", {
149           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
150           DMI_MATCH(DMI_BIOS_VERSION,"1SET52WW") }, (void*)1},
151         { set_max_cstate, "IBM ThinkPad R40e", {
152           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
153           DMI_MATCH(DMI_BIOS_VERSION,"1SET55WW") }, (void*)1},
154         { set_max_cstate, "IBM ThinkPad R40e", {
155           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
156           DMI_MATCH(DMI_BIOS_VERSION,"1SET56WW") }, (void*)1},
157         { set_max_cstate, "IBM ThinkPad R40e", {
158           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
159           DMI_MATCH(DMI_BIOS_VERSION,"1SET59WW") }, (void*)1},
160         { set_max_cstate, "IBM ThinkPad R40e", {
161           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
162           DMI_MATCH(DMI_BIOS_VERSION,"1SET60WW") }, (void*)1},
163         { set_max_cstate, "IBM ThinkPad R40e", {
164           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
165           DMI_MATCH(DMI_BIOS_VERSION,"1SET61WW") }, (void*)1},
166         { set_max_cstate, "IBM ThinkPad R40e", {
167           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
168           DMI_MATCH(DMI_BIOS_VERSION,"1SET62WW") }, (void*)1},
169         { set_max_cstate, "IBM ThinkPad R40e", {
170           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
171           DMI_MATCH(DMI_BIOS_VERSION,"1SET64WW") }, (void*)1},
172         { set_max_cstate, "IBM ThinkPad R40e", {
173           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
174           DMI_MATCH(DMI_BIOS_VERSION,"1SET65WW") }, (void*)1},
175         { set_max_cstate, "IBM ThinkPad R40e", {
176           DMI_MATCH(DMI_BIOS_VENDOR,"IBM"),
177           DMI_MATCH(DMI_BIOS_VERSION,"1SET68WW") }, (void*)1},
178         { set_max_cstate, "Medion 41700", {
179           DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
180           DMI_MATCH(DMI_BIOS_VERSION,"R01-A1J")}, (void *)1},
181         { set_max_cstate, "Clevo 5600D", {
182           DMI_MATCH(DMI_BIOS_VENDOR,"Phoenix Technologies LTD"),
183           DMI_MATCH(DMI_BIOS_VERSION,"SHE845M0.86C.0013.D.0302131307")},
184          (void *)2},
185         {},
186 };
187
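/*
 * The ACPI PM timer is either 24 or 32 bits wide, as advertised by the
 * ACPI_FADT_32BIT_TIMER flag.  The helpers below return t2 - t1 while
 * compensating for a single wraparound in whichever width is in use.
 */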
188 static inline u32 ticks_elapsed(u32 t1, u32 t2)
189 {
190         if (t2 >= t1)
191                 return (t2 - t1);
192         else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
193                 return (((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
194         else
195                 return ((0xFFFFFFFF - t1) + t2);
196 }
197
198 static inline u32 ticks_elapsed_in_us(u32 t1, u32 t2)
199 {
200         if (t2 >= t1)
201                 return PM_TIMER_TICKS_TO_US(t2 - t1);
202         else if (!(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER))
203                 return PM_TIMER_TICKS_TO_US(((0x00FFFFFF - t1) + t2) & 0x00FFFFFF);
204         else
205                 return PM_TIMER_TICKS_TO_US((0xFFFFFFFF - t1) + t2);
206 }
207
208 /*
209  * Callers should disable interrupts before the call and enable
210  * interrupts after return.
211  */
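/*
 * Clearing TS_POLLING tells remote CPUs that this CPU can no longer be
 * woken simply by setting TIF_NEED_RESCHED and must be sent a reschedule
 * IPI instead; the smp_mb() plus need_resched() recheck below closes the
 * window in which a wakeup posted just before halting could be missed.
 */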
212 static void acpi_safe_halt(void)
213 {
214         current_thread_info()->status &= ~TS_POLLING;
215         /*
216          * TS_POLLING-cleared state must be visible before we
217          * test NEED_RESCHED:
218          */
219         smp_mb();
220         if (!need_resched()) {
221                 safe_halt();
222                 local_irq_disable();
223         }
224         current_thread_info()->status |= TS_POLLING;
225 }
226
227 #ifndef CONFIG_CPU_IDLE
228
229 static void
230 acpi_processor_power_activate(struct acpi_processor *pr,
231                               struct acpi_processor_cx *new)
232 {
233         struct acpi_processor_cx *old;
234
235         if (!pr || !new)
236                 return;
237
238         old = pr->power.state;
239
240         if (old)
241                 old->promotion.count = 0;
242         new->demotion.count = 0;
243
244         /* Cleanup from old state. */
245         if (old) {
246                 switch (old->type) {
247                 case ACPI_STATE_C3:
248                         /* Disable bus master reload */
249                         if (new->type != ACPI_STATE_C3 && pr->flags.bm_check)
250                                 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
251                         break;
252                 }
253         }
254
255         /* Prepare to use new state. */
256         switch (new->type) {
257         case ACPI_STATE_C3:
258                 /* Enable bus master reload */
259                 if ((!old || old->type != ACPI_STATE_C3) && pr->flags.bm_check)
260                         acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
261                 break;
262         }
263
264         pr->power.state = new;
265
266         return;
267 }
268
269 static atomic_t c3_cpu_count;
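/*
 * c3_cpu_count tracks how many CPUs are currently trying to enter C3;
 * bus master arbitration (ARB_DIS) is only disabled once every online
 * CPU has reached C3, as the C3 path in acpi_processor_idle() checks.
 */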
270
271 /* Common C-state entry for C2, C3, .. */
272 static void acpi_cstate_enter(struct acpi_processor_cx *cstate)
273 {
274         u64 perf_flags;
275
276         /* Don't trace irqs off for idle */
277         stop_critical_timings();
278         perf_flags = hw_perf_save_disable();
279         if (cstate->entry_method == ACPI_CSTATE_FFH) {
280                 /* Call into architectural FFH based C-state */
281                 acpi_processor_ffh_cstate_enter(cstate);
282         } else {
283                 int unused;
284                 /* IO port based C-state */
285                 inb(cstate->address);
286                 /* Dummy wait op - must do something useless after P_LVL2 read
287                    because chipsets cannot guarantee that STPCLK# signal
288                    gets asserted in time to freeze execution properly. */
289                 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
290         }
291         hw_perf_restore(perf_flags);
292         start_critical_timings();
293 }
294 #endif /* !CONFIG_CPU_IDLE */
295
296 #ifdef ARCH_APICTIMER_STOPS_ON_C3
297
298 /*
299  * Some BIOS implementations switch to C3 in the published C2 state.
300  * This seems to be a common problem on AMD boxen, but other vendors
301  * are affected too. We pick the most conservative approach: we assume
302  * that the local APIC stops in both C2 and C3.
303  */
304 static void acpi_timer_check_state(int state, struct acpi_processor *pr,
305                                    struct acpi_processor_cx *cx)
306 {
307         struct acpi_processor_power *pwr = &pr->power;
308         u8 type = local_apic_timer_c2_ok ? ACPI_STATE_C3 : ACPI_STATE_C2;
309
310         /*
311          * Check whether one of the previous states already marked the
312          * lapic timer as unstable.
313          */
314         if (pwr->timer_broadcast_on_state < state)
315                 return;
316
317         if (cx->type >= type)
318                 pr->power.timer_broadcast_on_state = state;
319 }
320
321 static void acpi_propagate_timer_broadcast(struct acpi_processor *pr)
322 {
323         unsigned long reason;
324
325         reason = pr->power.timer_broadcast_on_state < INT_MAX ?
326                 CLOCK_EVT_NOTIFY_BROADCAST_ON : CLOCK_EVT_NOTIFY_BROADCAST_OFF;
327
328         clockevents_notify(reason, &pr->id);
329 }
330
331 /* Power(C) State timer broadcast control */
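/*
 * On entry to a state deep enough to stop the local APIC timer, the CPU
 * is switched to the broadcast clockevent device via
 * CLOCK_EVT_NOTIFY_BROADCAST_ENTER, and switched back with _EXIT on the
 * way out, so timer interrupts are not lost while the lapic is stopped.
 */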
332 static void acpi_state_timer_broadcast(struct acpi_processor *pr,
333                                        struct acpi_processor_cx *cx,
334                                        int broadcast)
335 {
336         int state = cx - pr->power.states;
337
338         if (state >= pr->power.timer_broadcast_on_state) {
339                 unsigned long reason;
340
341                 reason = broadcast ?  CLOCK_EVT_NOTIFY_BROADCAST_ENTER :
342                         CLOCK_EVT_NOTIFY_BROADCAST_EXIT;
343                 clockevents_notify(reason, &pr->id);
344         }
345 }
346
347 #else
348
349 static void acpi_timer_check_state(int state, struct acpi_processor *pr,
350                                    struct acpi_processor_cx *cstate) { }
351 static void acpi_propagate_timer_broadcast(struct acpi_processor *pr) { }
352 static void acpi_state_timer_broadcast(struct acpi_processor *pr,
353                                        struct acpi_processor_cx *cx,
354                                        int broadcast)
355 {
356 }
357
358 #endif
359
360 /*
361  * Suspend / resume control
362  */
363 static int acpi_idle_suspend;
364
365 int acpi_processor_suspend(struct acpi_device * device, pm_message_t state)
366 {
367         acpi_idle_suspend = 1;
368         return 0;
369 }
370
371 int acpi_processor_resume(struct acpi_device * device)
372 {
373         acpi_idle_suspend = 0;
374         return 0;
375 }
376
377 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
378 static int tsc_halts_in_c(int state)
379 {
380         switch (boot_cpu_data.x86_vendor) {
381         case X86_VENDOR_AMD:
382         case X86_VENDOR_INTEL:
383                 /*
384                  * AMD Fam10h TSC will tick in all
385                  * C/P/S0/S1 states when this bit is set.
386                  */
387                 if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
388                         return 0;
389
390                 /*FALL THROUGH*/
391         default:
392                 return state > ACPI_STATE_C1;
393         }
394 }
395 #endif
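/*
 * tsc_halts_in_c() is used by the C2/C3 entry paths below to call
 * mark_tsc_unstable() when the TSC may have stopped while idle; CPUs
 * advertising X86_FEATURE_NONSTOP_TSC keep the TSC ticking in all
 * C-states, so no notification is needed for them.
 */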
396
397 #ifndef CONFIG_CPU_IDLE
398 static void acpi_processor_idle(void)
399 {
400         struct acpi_processor *pr = NULL;
401         struct acpi_processor_cx *cx = NULL;
402         struct acpi_processor_cx *next_state = NULL;
403         int sleep_ticks = 0;
404         u32 t1, t2 = 0;
405
406         /*
407          * Interrupts must be disabled during bus mastering calculations and
408          * for C2/C3 transitions.
409          */
410         local_irq_disable();
411
412         pr = __get_cpu_var(processors);
413         if (!pr) {
414                 local_irq_enable();
415                 return;
416         }
417
418         /*
419          * Check whether we truly need to go idle, or should
420          * reschedule:
421          */
422         if (unlikely(need_resched())) {
423                 local_irq_enable();
424                 return;
425         }
426
427         cx = pr->power.state;
428         if (!cx || acpi_idle_suspend) {
429                 if (pm_idle_save) {
430                         pm_idle_save(); /* enables IRQs */
431                 } else {
432                         acpi_safe_halt();
433                         local_irq_enable();
434                 }
435
436                 return;
437         }
438
439         /*
440          * Check BM Activity
441          * -----------------
442          * Check for bus mastering activity (if required), record, and check
443          * for demotion.
444          */
445         if (pr->flags.bm_check) {
446                 u32 bm_status = 0;
447                 unsigned long diff = jiffies - pr->power.bm_check_timestamp;
448
449                 if (diff > 31)
450                         diff = 31;
451
452                 pr->power.bm_activity <<= diff;
453
454                 acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
455                 if (bm_status) {
456                         pr->power.bm_activity |= 0x1;
457                         acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
458                 }
459                 /*
460                  * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
461                  * the true state of bus mastering activity, forcing us to
462                  * manually check the BMIDEA bit of each IDE channel.
463                  */
464                 else if (errata.piix4.bmisx) {
465                         if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01)
466                             || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01))
467                                 pr->power.bm_activity |= 0x1;
468                 }
469
470                 pr->power.bm_check_timestamp = jiffies;
471
472                 /*
473                  * If bus mastering is or was active this jiffy, demote
474                  * to avoid a faulty transition.  Note that the processor
475                  * won't enter a low-power state during this call (to this
476                  * function) but should upon the next.
477                  *
478                  * TBD: A better policy might be to fall back to the demotion
479                  *      state (use it for this quantum only) instead of
480                  *      demoting -- and rely on duration as our sole demotion
481                  *      qualification.  This may, however, introduce DMA
482                  *      issues (e.g. floppy DMA transfer overrun/underrun).
483                  */
484                 if ((pr->power.bm_activity & 0x1) &&
485                     cx->demotion.threshold.bm) {
486                         local_irq_enable();
487                         next_state = cx->demotion.state;
488                         goto end;
489                 }
490         }
491
492 #ifdef CONFIG_HOTPLUG_CPU
493         /*
494          * Check for P_LVL2_UP flag before entering C2 and above on
495          * an SMP system. We do it here instead of doing it at _CST/P_LVL
496          * detection phase, to work cleanly with logical CPU hotplug.
497          */
498         if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) &&
499             !pr->flags.has_cst && !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
500                 cx = &pr->power.states[ACPI_STATE_C1];
501 #endif
502
503         /*
504          * Sleep:
505          * ------
506          * Invoke the current Cx state to put the processor to sleep.
507          */
508         if (cx->type == ACPI_STATE_C2 || cx->type == ACPI_STATE_C3) {
509                 current_thread_info()->status &= ~TS_POLLING;
510                 /*
511                  * TS_POLLING-cleared state must be visible before we
512                  * test NEED_RESCHED:
513                  */
514                 smp_mb();
515                 if (need_resched()) {
516                         current_thread_info()->status |= TS_POLLING;
517                         local_irq_enable();
518                         return;
519                 }
520         }
521
522         switch (cx->type) {
523
524         case ACPI_STATE_C1:
525                 /*
526                  * Invoke C1.
527                  * Use the appropriate idle routine, the one that would
528                  * be used without acpi C-states.
529                  */
530                 if (pm_idle_save) {
531                         pm_idle_save(); /* enables IRQs */
532                 } else {
533                         acpi_safe_halt();
534                         local_irq_enable();
535                 }
536
537                 /*
538                  * TBD: Can't get time duration while in C1, as resumes
539                  *      go to an ISR rather than here.  Need to instrument
540                  *      base interrupt handler.
541                  *
542                  * Note: the TSC better not stop in C1, sched_clock() will
543                  *       skew otherwise.
544                  */
545                 sleep_ticks = 0xFFFFFFFF;
546
547                 break;
548
549         case ACPI_STATE_C2:
550                 /* Get start time (ticks) */
551                 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
552                 /* Tell the scheduler that we are going deep-idle: */
553                 sched_clock_idle_sleep_event();
554                 /* Invoke C2 */
555                 acpi_state_timer_broadcast(pr, cx, 1);
556                 acpi_cstate_enter(cx);
557                 /* Get end time (ticks) */
558                 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
559
560 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
561                 /* TSC halts in C2, so notify users */
562                 if (tsc_halts_in_c(ACPI_STATE_C2))
563                         mark_tsc_unstable("possible TSC halt in C2");
564 #endif
565                 /* Compute time (ticks) that we were actually asleep */
566                 sleep_ticks = ticks_elapsed(t1, t2);
567
568                 /* Tell the scheduler how much we idled: */
569                 sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
570
571                 /* Re-enable interrupts */
572                 local_irq_enable();
573                 /* Do not account our idle-switching overhead: */
574                 sleep_ticks -= cx->latency_ticks + C2_OVERHEAD;
575
576                 current_thread_info()->status |= TS_POLLING;
577                 acpi_state_timer_broadcast(pr, cx, 0);
578                 break;
579
580         case ACPI_STATE_C3:
581                 acpi_unlazy_tlb(smp_processor_id());
582                 /*
583                  * Must be done before busmaster disable as we might
584                  * need to access HPET !
585                  */
586                 acpi_state_timer_broadcast(pr, cx, 1);
587                 /*
588                  * disable bus master
589                  * bm_check implies we need ARB_DIS
590                  * !bm_check implies we need cache flush
591                  * bm_control implies whether we can do ARB_DIS
592                  *
593                  * That leaves a case where bm_check is set and bm_control is
594                  * not set. In that case we cannot do much; we enter C3
595                  * without doing anything.
596                  */
597                 if (pr->flags.bm_check && pr->flags.bm_control) {
598                         if (atomic_inc_return(&c3_cpu_count) ==
599                             num_online_cpus()) {
600                                 /*
601                                  * All CPUs are trying to go to C3
602                                  * Disable bus master arbitration
603                                  */
604                                 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
605                         }
606                 } else if (!pr->flags.bm_check) {
607                         /* SMP with no shared cache... Invalidate cache  */
608                         ACPI_FLUSH_CPU_CACHE();
609                 }
610
611                 /* Get start time (ticks) */
612                 t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
613                 /* Invoke C3 */
614                 /* Tell the scheduler that we are going deep-idle: */
615                 sched_clock_idle_sleep_event();
616                 acpi_cstate_enter(cx);
617                 /* Get end time (ticks) */
618                 t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
619                 if (pr->flags.bm_check && pr->flags.bm_control) {
620                         /* Enable bus master arbitration */
621                         atomic_dec(&c3_cpu_count);
622                         acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
623                 }
624
625 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
626                 /* TSC halts in C3, so notify users */
627                 if (tsc_halts_in_c(ACPI_STATE_C3))
628                         mark_tsc_unstable("TSC halts in C3");
629 #endif
630                 /* Compute time (ticks) that we were actually asleep */
631                 sleep_ticks = ticks_elapsed(t1, t2);
632                 /* Tell the scheduler how much we idled: */
633                 sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
634
635                 /* Re-enable interrupts */
636                 local_irq_enable();
637                 /* Do not account our idle-switching overhead: */
638                 sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
639
640                 current_thread_info()->status |= TS_POLLING;
641                 acpi_state_timer_broadcast(pr, cx, 0);
642                 break;
643
644         default:
645                 local_irq_enable();
646                 return;
647         }
648         cx->usage++;
649         if ((cx->type != ACPI_STATE_C1) && (sleep_ticks > 0))
650                 cx->time += sleep_ticks;
651
652         next_state = pr->power.state;
653
654 #ifdef CONFIG_HOTPLUG_CPU
655         /* Don't do promotion/demotion */
656         if ((cx->type == ACPI_STATE_C1) && (num_online_cpus() > 1) &&
657             !pr->flags.has_cst && !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED)) {
658                 next_state = cx;
659                 goto end;
660         }
661 #endif
662
663         /*
664          * Promotion?
665          * ----------
666          * Track the number of long sleeps (time asleep is greater than the threshold)
667          * and promote when the count threshold is reached.  Note that bus
668          * mastering activity may prevent promotions.
669          * Do not promote above max_cstate.
670          */
671         if (cx->promotion.state &&
672             ((cx->promotion.state - pr->power.states) <= max_cstate)) {
673                 if (sleep_ticks > cx->promotion.threshold.ticks &&
674                   cx->promotion.state->latency <=
675                                 pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)) {
676                         cx->promotion.count++;
677                         cx->demotion.count = 0;
678                         if (cx->promotion.count >=
679                             cx->promotion.threshold.count) {
680                                 if (pr->flags.bm_check) {
681                                         if (!
682                                             (pr->power.bm_activity & cx->
683                                              promotion.threshold.bm)) {
684                                                 next_state =
685                                                     cx->promotion.state;
686                                                 goto end;
687                                         }
688                                 } else {
689                                         next_state = cx->promotion.state;
690                                         goto end;
691                                 }
692                         }
693                 }
694         }
695
696         /*
697          * Demotion?
698          * ---------
699          * Track the number of short sleeps (time asleep is less than the threshold)
700          * and demote when the usage threshold is reached.
701          */
702         if (cx->demotion.state) {
703                 if (sleep_ticks < cx->demotion.threshold.ticks) {
704                         cx->demotion.count++;
705                         cx->promotion.count = 0;
706                         if (cx->demotion.count >= cx->demotion.threshold.count) {
707                                 next_state = cx->demotion.state;
708                                 goto end;
709                         }
710                 }
711         }
712
713       end:
714         /*
715          * Demote if current state exceeds max_cstate
716          * or if the latency of the current state is unacceptable
717          */
718         if ((pr->power.state - pr->power.states) > max_cstate ||
719                 pr->power.state->latency >
720                                 pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY)) {
721                 if (cx->demotion.state)
722                         next_state = cx->demotion.state;
723         }
724
725         /*
726          * New Cx State?
727          * -------------
728          * If we're going to start using a new Cx state we must clean up
729          * from the previous and prepare to use the new.
730          */
731         if (next_state != pr->power.state)
732                 acpi_processor_power_activate(pr, next_state);
733 }
734
735 static int acpi_processor_set_power_policy(struct acpi_processor *pr)
736 {
737         unsigned int i;
738         unsigned int state_is_set = 0;
739         struct acpi_processor_cx *lower = NULL;
740         struct acpi_processor_cx *higher = NULL;
741         struct acpi_processor_cx *cx;
742
743
744         if (!pr)
745                 return -EINVAL;
746
747         /*
748          * This function sets the default Cx state policy (OS idle handler).
749          * Our scheme is to promote quickly to C2 but more conservatively
750          * to C3.  We're favoring C2  for its characteristics of low latency
751          * (quick response), good power savings, and ability to allow bus
752          * mastering activity.  Note that the Cx state policy is completely
753          * customizable and can be altered dynamically.
754          */
755
756         /* startup state */
757         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
758                 cx = &pr->power.states[i];
759                 if (!cx->valid)
760                         continue;
761
762                 if (!state_is_set)
763                         pr->power.state = cx;
764                 state_is_set++;
765                 break;
766         }
767
768         if (!state_is_set)
769                 return -ENODEV;
770
771         /* demotion */
772         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
773                 cx = &pr->power.states[i];
774                 if (!cx->valid)
775                         continue;
776
777                 if (lower) {
778                         cx->demotion.state = lower;
779                         cx->demotion.threshold.ticks = cx->latency_ticks;
780                         cx->demotion.threshold.count = 1;
781                         if (cx->type == ACPI_STATE_C3)
782                                 cx->demotion.threshold.bm = bm_history;
783                 }
784
785                 lower = cx;
786         }
787
788         /* promotion */
789         for (i = (ACPI_PROCESSOR_MAX_POWER - 1); i > 0; i--) {
790                 cx = &pr->power.states[i];
791                 if (!cx->valid)
792                         continue;
793
794                 if (higher) {
795                         cx->promotion.state = higher;
796                         cx->promotion.threshold.ticks = cx->latency_ticks;
797                         if (cx->type >= ACPI_STATE_C2)
798                                 cx->promotion.threshold.count = 4;
799                         else
800                                 cx->promotion.threshold.count = 10;
801                         if (higher->type == ACPI_STATE_C3)
802                                 cx->promotion.threshold.bm = bm_history;
803                 }
804
805                 higher = cx;
806         }
807
808         return 0;
809 }
810 #endif /* !CONFIG_CPU_IDLE */
811
812 static int acpi_processor_get_power_info_fadt(struct acpi_processor *pr)
813 {
814
815         if (!pr)
816                 return -EINVAL;
817
818         if (!pr->pblk)
819                 return -ENODEV;
820
821         /* if info is obtained from pblk/fadt, type equals state */
822         pr->power.states[ACPI_STATE_C2].type = ACPI_STATE_C2;
823         pr->power.states[ACPI_STATE_C3].type = ACPI_STATE_C3;
824
825 #ifndef CONFIG_HOTPLUG_CPU
826         /*
827          * Check for P_LVL2_UP flag before entering C2 and above on
828          * an SMP system.
829          */
830         if ((num_online_cpus() > 1) &&
831             !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
832                 return -ENODEV;
833 #endif
834
835         /* determine C2 and C3 address from pblk */
836         pr->power.states[ACPI_STATE_C2].address = pr->pblk + 4;
837         pr->power.states[ACPI_STATE_C3].address = pr->pblk + 5;
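        /*
         * Per the ACPI specification the processor P_BLK is six bytes long:
         * P_CNT occupies offsets 0-3, P_LVL2 is at offset 4 and P_LVL3 at
         * offset 5, which is where the +4/+5 offsets above come from.
         */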
838
839         /* determine latencies from FADT */
840         pr->power.states[ACPI_STATE_C2].latency = acpi_gbl_FADT.C2latency;
841         pr->power.states[ACPI_STATE_C3].latency = acpi_gbl_FADT.C3latency;
842
843         ACPI_DEBUG_PRINT((ACPI_DB_INFO,
844                           "lvl2[0x%08x] lvl3[0x%08x]\n",
845                           pr->power.states[ACPI_STATE_C2].address,
846                           pr->power.states[ACPI_STATE_C3].address));
847
848         return 0;
849 }
850
851 static int acpi_processor_get_power_info_default(struct acpi_processor *pr)
852 {
853         if (!pr->power.states[ACPI_STATE_C1].valid) {
854                 /* set the first C-State to C1 */
855                 /* all processors need to support C1 */
856                 pr->power.states[ACPI_STATE_C1].type = ACPI_STATE_C1;
857                 pr->power.states[ACPI_STATE_C1].valid = 1;
858                 pr->power.states[ACPI_STATE_C1].entry_method = ACPI_CSTATE_HALT;
859         }
860         /* the C0 state only exists as a filler in our array */
861         pr->power.states[ACPI_STATE_C0].valid = 1;
862         return 0;
863 }
864
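/*
 * _CST returns a package of the form
 *   { count, Package() { Register, Type, Latency, Power }, ... }
 * so the parser below reads element 0 as the state count and, for each
 * state, sub-package elements 0..3 as the address register, C-state type,
 * worst-case latency (us) and average power (mW).
 */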
865 static int acpi_processor_get_power_info_cst(struct acpi_processor *pr)
866 {
867         acpi_status status = 0;
868         acpi_integer count;
869         int current_count;
870         int i;
871         struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
872         union acpi_object *cst;
873
874
875         if (nocst)
876                 return -ENODEV;
877
878         current_count = 0;
879
880         status = acpi_evaluate_object(pr->handle, "_CST", NULL, &buffer);
881         if (ACPI_FAILURE(status)) {
882                 ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No _CST, giving up\n"));
883                 return -ENODEV;
884         }
885
886         cst = buffer.pointer;
887
888         /* There must be at least 2 elements */
889         if (!cst || (cst->type != ACPI_TYPE_PACKAGE) || cst->package.count < 2) {
890                 printk(KERN_ERR PREFIX "not enough elements in _CST\n");
891                 status = -EFAULT;
892                 goto end;
893         }
894
895         count = cst->package.elements[0].integer.value;
896
897         /* Validate number of power states. */
898         if (count < 1 || count != cst->package.count - 1) {
899                 printk(KERN_ERR PREFIX "count given by _CST is not valid\n");
900                 status = -EFAULT;
901                 goto end;
902         }
903
904         /* Tell driver that at least _CST is supported. */
905         pr->flags.has_cst = 1;
906
907         for (i = 1; i <= count; i++) {
908                 union acpi_object *element;
909                 union acpi_object *obj;
910                 struct acpi_power_register *reg;
911                 struct acpi_processor_cx cx;
912
913                 memset(&cx, 0, sizeof(cx));
914
915                 element = &(cst->package.elements[i]);
916                 if (element->type != ACPI_TYPE_PACKAGE)
917                         continue;
918
919                 if (element->package.count != 4)
920                         continue;
921
922                 obj = &(element->package.elements[0]);
923
924                 if (obj->type != ACPI_TYPE_BUFFER)
925                         continue;
926
927                 reg = (struct acpi_power_register *)obj->buffer.pointer;
928
929                 if (reg->space_id != ACPI_ADR_SPACE_SYSTEM_IO &&
930                     (reg->space_id != ACPI_ADR_SPACE_FIXED_HARDWARE))
931                         continue;
932
933                 /* There should be an easy way to extract an integer... */
934                 obj = &(element->package.elements[1]);
935                 if (obj->type != ACPI_TYPE_INTEGER)
936                         continue;
937
938                 cx.type = obj->integer.value;
939                 /*
940                  * Some buggy BIOSes won't list C1 in _CST -
941                  * Let acpi_processor_get_power_info_default() handle them later
942                  */
943                 if (i == 1 && cx.type != ACPI_STATE_C1)
944                         current_count++;
945
946                 cx.address = reg->address;
947                 cx.index = current_count + 1;
948
949                 cx.entry_method = ACPI_CSTATE_SYSTEMIO;
950                 if (reg->space_id == ACPI_ADR_SPACE_FIXED_HARDWARE) {
951                         if (acpi_processor_ffh_cstate_probe
952                                         (pr->id, &cx, reg) == 0) {
953                                 cx.entry_method = ACPI_CSTATE_FFH;
954                         } else if (cx.type == ACPI_STATE_C1) {
955                                 /*
956                                  * C1 is a special case where FIXED_HARDWARE
957                                  * can be handled in non-MWAIT way as well.
958                                  * In that case, save this _CST entry info.
959                                  * Otherwise, ignore this info and continue.
960                                  */
961                                 cx.entry_method = ACPI_CSTATE_HALT;
962                                 snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT");
963                         } else {
964                                 continue;
965                         }
966                         if (cx.type == ACPI_STATE_C1 &&
967                                         (idle_halt || idle_nomwait)) {
968                                 /*
969                                  * In most cases the C1 space_id obtained from
970                                  * _CST object is FIXED_HARDWARE access mode.
971                                  * But when the option of idle=halt is added,
972                                  * the entry_method type should be changed from
973                                  * CSTATE_FFH to CSTATE_HALT.
974                                  * When the option of idle=nomwait is added,
975                                  * the C1 entry_method type should be
976                                  * CSTATE_HALT.
977                                  */
978                                 cx.entry_method = ACPI_CSTATE_HALT;
979                                 snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI HLT");
980                         }
981                 } else {
982                         snprintf(cx.desc, ACPI_CX_DESC_LEN, "ACPI IOPORT 0x%x",
983                                  cx.address);
984                 }
985
986                 if (cx.type == ACPI_STATE_C1) {
987                         cx.valid = 1;
988                 }
989
990                 obj = &(element->package.elements[2]);
991                 if (obj->type != ACPI_TYPE_INTEGER)
992                         continue;
993
994                 cx.latency = obj->integer.value;
995
996                 obj = &(element->package.elements[3]);
997                 if (obj->type != ACPI_TYPE_INTEGER)
998                         continue;
999
1000                 cx.power = obj->integer.value;
1001
1002                 current_count++;
1003                 memcpy(&(pr->power.states[current_count]), &cx, sizeof(cx));
1004
1005                 /*
1006                  * We support a total of ACPI_PROCESSOR_MAX_POWER - 1 states
1007                  * (indices 1 through ACPI_PROCESSOR_MAX_POWER - 1).
1008                  */
1009                 if (current_count >= (ACPI_PROCESSOR_MAX_POWER - 1)) {
1010                         printk(KERN_WARNING
1011                                "Limiting number of power states to max (%d)\n",
1012                                ACPI_PROCESSOR_MAX_POWER);
1013                         printk(KERN_WARNING
1014                                "Please increase ACPI_PROCESSOR_MAX_POWER if needed.\n");
1015                         break;
1016                 }
1017         }
1018
1019         ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %d power states\n",
1020                           current_count));
1021
1022         /* Validate number of power states discovered */
1023         if (current_count < 2)
1024                 status = -EFAULT;
1025
1026       end:
1027         kfree(buffer.pointer);
1028
1029         return status;
1030 }
1031
1032 static void acpi_processor_power_verify_c2(struct acpi_processor_cx *cx)
1033 {
1034
1035         if (!cx->address)
1036                 return;
1037
1038         /*
1039          * C2 latency must be less than or equal to 100
1040          * microseconds.
1041          */
1042         else if (cx->latency > ACPI_PROCESSOR_MAX_C2_LATENCY) {
1043                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1044                                   "latency too large [%d]\n", cx->latency));
1045                 return;
1046         }
1047
1048         /*
1049          * Otherwise we've met all of our C2 requirements.
1050          * Normalize the C2 latency to expedite policy.
1051          */
1052         cx->valid = 1;
1053
1054 #ifndef CONFIG_CPU_IDLE
1055         cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
1056 #else
1057         cx->latency_ticks = cx->latency;
1058 #endif
1059
1060         return;
1061 }
1062
1063 static void acpi_processor_power_verify_c3(struct acpi_processor *pr,
1064                                            struct acpi_processor_cx *cx)
1065 {
1066         static int bm_check_flag;
1067
1068
1069         if (!cx->address)
1070                 return;
1071
1072         /*
1073          * C3 latency must be less than or equal to 1000
1074          * microseconds.
1075          */
1076         else if (cx->latency > ACPI_PROCESSOR_MAX_C3_LATENCY) {
1077                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1078                                   "latency too large [%d]\n", cx->latency));
1079                 return;
1080         }
1081
1082         /*
1083          * PIIX4 Erratum #18: To avoid livelock, we don't support C3 when
1084          * Type-F (fast) DMA transfers are used by any ISA device.
1085          * Note that we could disable Type-F DMA (as recommended by
1086          * the erratum), but this is known to disrupt certain ISA
1087          * devices thus we take the conservative approach.
1088          */
1089         else if (errata.piix4.fdma) {
1090                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1091                                   "C3 not supported on PIIX4 with Type-F DMA\n"));
1092                 return;
1093         }
1094
1095         /* All the logic here assumes flags.bm_check is the same across all CPUs */
1096         if (!bm_check_flag) {
1097                 /* Determine whether bm_check is needed based on CPU  */
1098                 acpi_processor_power_init_bm_check(&(pr->flags), pr->id);
1099                 bm_check_flag = pr->flags.bm_check;
1100         } else {
1101                 pr->flags.bm_check = bm_check_flag;
1102         }
1103
1104         if (pr->flags.bm_check) {
1105                 if (!pr->flags.bm_control) {
1106                         if (pr->flags.has_cst != 1) {
1107                                 /* bus mastering control is necessary */
1108                                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1109                                         "C3 support requires BM control\n"));
1110                                 return;
1111                         } else {
1112                                 /* Here we enter C3 without bus mastering */
1113                                 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1114                                         "C3 support without BM control\n"));
1115                         }
1116                 }
1117         } else {
1118                 /*
1119                  * WBINVD should be set in the FADT for the C3 state to be
1120                  * supported when bm_check is not required.
1121                  */
1122                 if (!(acpi_gbl_FADT.flags & ACPI_FADT_WBINVD)) {
1123                         ACPI_DEBUG_PRINT((ACPI_DB_INFO,
1124                                           "Cache invalidation should work properly"
1125                                           " for C3 to be enabled on SMP systems\n"));
1126                         return;
1127                 }
1128                 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
1129         }
1130
1131         /*
1132          * Otherwise we've met all of our C3 requirements.
1133          * Normalize the C3 latency to expedite policy.  Enable
1134          * checking of bus mastering status (bm_check) so we can
1135          * use this in our C3 policy
1136          */
1137         cx->valid = 1;
1138
1139 #ifndef CONFIG_CPU_IDLE
1140         cx->latency_ticks = US_TO_PM_TIMER_TICKS(cx->latency);
1141 #else
1142         cx->latency_ticks = cx->latency;
1143 #endif
1144
1145         return;
1146 }
1147
1148 static int acpi_processor_power_verify(struct acpi_processor *pr)
1149 {
1150         unsigned int i;
1151         unsigned int working = 0;
1152
1153         pr->power.timer_broadcast_on_state = INT_MAX;
1154
1155         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
1156                 struct acpi_processor_cx *cx = &pr->power.states[i];
1157
1158                 switch (cx->type) {
1159                 case ACPI_STATE_C1:
1160                         cx->valid = 1;
1161                         break;
1162
1163                 case ACPI_STATE_C2:
1164                         acpi_processor_power_verify_c2(cx);
1165                         if (cx->valid)
1166                                 acpi_timer_check_state(i, pr, cx);
1167                         break;
1168
1169                 case ACPI_STATE_C3:
1170                         acpi_processor_power_verify_c3(pr, cx);
1171                         if (cx->valid)
1172                                 acpi_timer_check_state(i, pr, cx);
1173                         break;
1174                 }
1175
1176                 if (cx->valid)
1177                         working++;
1178         }
1179
1180         acpi_propagate_timer_broadcast(pr);
1181
1182         return (working);
1183 }
1184
1185 static int acpi_processor_get_power_info(struct acpi_processor *pr)
1186 {
1187         unsigned int i;
1188         int result;
1189
1190
1191         /* NOTE: the idle thread may not be running while calling
1192          * this function */
1193
1194         /* Zero initialize all the C-states info. */
1195         memset(pr->power.states, 0, sizeof(pr->power.states));
1196
1197         result = acpi_processor_get_power_info_cst(pr);
1198         if (result == -ENODEV)
1199                 result = acpi_processor_get_power_info_fadt(pr);
1200
1201         if (result)
1202                 return result;
1203
1204         acpi_processor_get_power_info_default(pr);
1205
1206         pr->power.count = acpi_processor_power_verify(pr);
1207
1208 #ifndef CONFIG_CPU_IDLE
1209         /*
1210          * Set Default Policy
1211          * ------------------
1212          * Now that we know which states are supported, set the default
1213          * policy.  Note that this policy can be changed dynamically
1214          * (e.g. encourage deeper sleeps to conserve battery life when
1215          * not on AC).
1216          */
1217         result = acpi_processor_set_power_policy(pr);
1218         if (result)
1219                 return result;
1220 #endif
1221
1222         /*
1223          * if one state of type C2 or C3 is available, mark this
1224          * CPU as being "idle manageable"
1225          */
1226         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER; i++) {
1227                 if (pr->power.states[i].valid) {
1228                         pr->power.count = i;
1229                         if (pr->power.states[i].type >= ACPI_STATE_C2)
1230                                 pr->flags.power = 1;
1231                 }
1232         }
1233
1234         return 0;
1235 }
1236
1237 static int acpi_processor_power_seq_show(struct seq_file *seq, void *offset)
1238 {
1239         struct acpi_processor *pr = seq->private;
1240         unsigned int i;
1241
1242
1243         if (!pr)
1244                 goto end;
1245
1246         seq_printf(seq, "active state:            C%zd\n"
1247                    "max_cstate:              C%d\n"
1248                    "bus master activity:     %08x\n"
1249                    "maximum allowed latency: %d usec\n",
1250                    pr->power.state ? pr->power.state - pr->power.states : 0,
1251                    max_cstate, (unsigned)pr->power.bm_activity,
1252                    pm_qos_requirement(PM_QOS_CPU_DMA_LATENCY));
1253
1254         seq_puts(seq, "states:\n");
1255
1256         for (i = 1; i <= pr->power.count; i++) {
1257                 seq_printf(seq, "   %cC%d:                  ",
1258                            (&pr->power.states[i] ==
1259                             pr->power.state ? '*' : ' '), i);
1260
1261                 if (!pr->power.states[i].valid) {
1262                         seq_puts(seq, "<not supported>\n");
1263                         continue;
1264                 }
1265
1266                 switch (pr->power.states[i].type) {
1267                 case ACPI_STATE_C1:
1268                         seq_printf(seq, "type[C1] ");
1269                         break;
1270                 case ACPI_STATE_C2:
1271                         seq_printf(seq, "type[C2] ");
1272                         break;
1273                 case ACPI_STATE_C3:
1274                         seq_printf(seq, "type[C3] ");
1275                         break;
1276                 default:
1277                         seq_printf(seq, "type[--] ");
1278                         break;
1279                 }
1280
1281                 if (pr->power.states[i].promotion.state)
1282                         seq_printf(seq, "promotion[C%zd] ",
1283                                    (pr->power.states[i].promotion.state -
1284                                     pr->power.states));
1285                 else
1286                         seq_puts(seq, "promotion[--] ");
1287
1288                 if (pr->power.states[i].demotion.state)
1289                         seq_printf(seq, "demotion[C%zd] ",
1290                                    (pr->power.states[i].demotion.state -
1291                                     pr->power.states));
1292                 else
1293                         seq_puts(seq, "demotion[--] ");
1294
1295                 seq_printf(seq, "latency[%03d] usage[%08d] duration[%020llu]\n",
1296                            pr->power.states[i].latency,
1297                            pr->power.states[i].usage,
1298                            (unsigned long long)pr->power.states[i].time);
1299         }
1300
1301       end:
1302         return 0;
1303 }
1304
1305 static int acpi_processor_power_open_fs(struct inode *inode, struct file *file)
1306 {
1307         return single_open(file, acpi_processor_power_seq_show,
1308                            PDE(inode)->data);
1309 }
1310
1311 static const struct file_operations acpi_processor_power_fops = {
1312         .owner = THIS_MODULE,
1313         .open = acpi_processor_power_open_fs,
1314         .read = seq_read,
1315         .llseek = seq_lseek,
1316         .release = single_release,
1317 };
1318
1319 #ifndef CONFIG_CPU_IDLE
1320
1321 int acpi_processor_cst_has_changed(struct acpi_processor *pr)
1322 {
1323         int result = 0;
1324
1325         if (boot_option_idle_override)
1326                 return 0;
1327
1328         if (!pr)
1329                 return -EINVAL;
1330
1331         if (nocst) {
1332                 return -ENODEV;
1333         }
1334
1335         if (!pr->flags.power_setup_done)
1336                 return -ENODEV;
1337
1338         /*
1339          * Fall back to the default idle loop if pm_idle_save has
1340          * been initialized.
1341          */
1342         if (pm_idle_save) {
1343                 pm_idle = pm_idle_save;
1344                 /* Relies on interrupts forcing exit from idle. */
1345                 synchronize_sched();
1346         }
1347
1348         pr->flags.power = 0;
1349         result = acpi_processor_get_power_info(pr);
1350         if ((pr->flags.power == 1) && (pr->flags.power_setup_done))
1351                 pm_idle = acpi_processor_idle;
1352
1353         return result;
1354 }
1355
1356 #ifdef CONFIG_SMP
1357 static void smp_callback(void *v)
1358 {
1359         /* we already woke the CPU up, nothing more to do */
1360 }
1361
1362 /*
1363  * This function gets called when a part of the kernel has a new latency
1364  * requirement.  This means we need to get all processors out of their C-state,
1365  * and then recalculate a new suitable C-state. Just do a cross-cpu IPI; that
1366  * wakes them all right up.
1367  */
1368 static int acpi_processor_latency_notify(struct notifier_block *b,
1369                 unsigned long l, void *v)
1370 {
1371         smp_call_function(smp_callback, NULL, 1);
1372         return NOTIFY_OK;
1373 }
1374
1375 static struct notifier_block acpi_processor_latency_notifier = {
1376         .notifier_call = acpi_processor_latency_notify,
1377 };
1378
1379 #endif
1380
1381 #else /* CONFIG_CPU_IDLE */
1382
1383 /**
1384  * acpi_idle_bm_check - checks if bus master activity was detected
1385  */
1386 static int acpi_idle_bm_check(void)
1387 {
1388         u32 bm_status = 0;
1389
1390         acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
1391         if (bm_status)
1392                 acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
1393         /*
1394          * PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
1395          * the true state of bus mastering activity, forcing us to
1396          * manually check the BMIDEA bit of each IDE channel.
1397          */
1398         else if (errata.piix4.bmisx) {
1399                 if ((inb_p(errata.piix4.bmisx + 0x02) & 0x01)
1400                     || (inb_p(errata.piix4.bmisx + 0x0A) & 0x01))
1401                         bm_status = 1;
1402         }
1403         return bm_status;
1404 }
1405
1406 /**
1407  * acpi_idle_update_bm_rld - updates the BM_RLD bit depending on target state
1408  * @pr: the processor
1409  * @target: the new target state
1410  */
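/*
 * Per the ACPI specification, setting BM_RLD makes bus master requests
 * bring the processor out of C3 back to C0, so the bit is only wanted
 * while a C3-type state is the target and is cleared otherwise.
 */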
1411 static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr,
1412                                            struct acpi_processor_cx *target)
1413 {
1414         if (pr->flags.bm_rld_set && target->type != ACPI_STATE_C3) {
1415                 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
1416                 pr->flags.bm_rld_set = 0;
1417         }
1418
1419         if (!pr->flags.bm_rld_set && target->type == ACPI_STATE_C3) {
1420                 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, 1);
1421                 pr->flags.bm_rld_set = 1;
1422         }
1423 }
1424
1425 /**
1426  * acpi_idle_do_entry - a helper function that does C2 and C3 type entry
1427  * @cx: cstate data
1428  *
1429  * Caller disables interrupt before call and enables interrupt after return.
1430  */
1431 static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx)
1432 {
1433         u64 pctrl;
1434
1435         /* Don't trace irqs off for idle */
1436         stop_critical_timings();
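        /*
         * Save and globally disable the hardware performance counters across
         * the actual C-state entry; they are restored by hw_perf_restore()
         * below.  Presumably this keeps counter state consistent while the
         * CPU is halted.
         */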
1437         pctrl = hw_perf_save_disable();
1438         if (cx->entry_method == ACPI_CSTATE_FFH) {
1439                 /* Call into architectural FFH based C-state */
1440                 acpi_processor_ffh_cstate_enter(cx);
1441         } else if (cx->entry_method == ACPI_CSTATE_HALT) {
1442                 acpi_safe_halt();
1443         } else {
1444                 int unused;
1445                 /* IO port based C-state */
1446                 inb(cx->address);
1447                 /* Dummy wait op - must do something useless after P_LVL2 read
1448                    because chipsets cannot guarantee that STPCLK# signal
1449                    gets asserted in time to freeze execution properly. */
1450                 unused = inl(acpi_gbl_FADT.xpm_timer_block.address);
1451         }
1452         hw_perf_restore(pctrl);
1453         start_critical_timings();
1454 }
1455
1456 /**
1457  * acpi_idle_enter_c1 - enters an ACPI C1 state-type
1458  * @dev: the target CPU
1459  * @state: the state data
1460  *
1461  * This is equivalent to the HALT instruction.
1462  */
1463 static int acpi_idle_enter_c1(struct cpuidle_device *dev,
1464                               struct cpuidle_state *state)
1465 {
1466         u32 t1, t2;
1467         struct acpi_processor *pr;
1468         struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
1469
1470         pr = __get_cpu_var(processors);
1471
1472         if (unlikely(!pr))
1473                 return 0;
1474
1475         local_irq_disable();
1476
1477         /* Do not access any ACPI IO ports in suspend path */
1478         if (acpi_idle_suspend) {
1479                 acpi_safe_halt();
1480                 local_irq_enable();
1481                 return 0;
1482         }
1483
1484         if (pr->flags.bm_check)
1485                 acpi_idle_update_bm_rld(pr, cx);
1486
1487         t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1488         acpi_idle_do_entry(cx);
1489         t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1490
1491         local_irq_enable();
1492         cx->usage++;
1493
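        /*
         * t1 and t2 are raw ACPI PM timer reads (~3.58 MHz);
         * ticks_elapsed_in_us(), defined earlier in this file, converts the
         * delta into the microsecond residency value cpuidle expects,
         * presumably accounting for PM timer wraparound.
         */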
1494         return ticks_elapsed_in_us(t1, t2);
1495 }
1496
1497 /**
1498  * acpi_idle_enter_simple - enters an ACPI state without BM handling
1499  * @dev: the target CPU
1500  * @state: the state data
1501  */
1502 static int acpi_idle_enter_simple(struct cpuidle_device *dev,
1503                                   struct cpuidle_state *state)
1504 {
1505         struct acpi_processor *pr;
1506         struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
1507         u32 t1, t2;
1508         int sleep_ticks = 0;
1509
1510         pr = __get_cpu_var(processors);
1511
1512         if (unlikely(!pr))
1513                 return 0;
1514
1515         if (acpi_idle_suspend)
1516                 return acpi_idle_enter_c1(dev, state);
1517
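        /*
         * Clearing TS_POLLING tells the scheduler that this CPU is no longer
         * polling need_resched(), so a waker has to send a reschedule IPI
         * rather than just setting the flag; the need_resched() recheck
         * below closes the window where work arrived before the cleared
         * flag was visible.
         */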
1518         local_irq_disable();
1519         current_thread_info()->status &= ~TS_POLLING;
1520         /*
1521          * TS_POLLING-cleared state must be visible before we test
1522          * NEED_RESCHED:
1523          */
1524         smp_mb();
1525
1526         if (unlikely(need_resched())) {
1527                 current_thread_info()->status |= TS_POLLING;
1528                 local_irq_enable();
1529                 return 0;
1530         }
1531
1532         /*
1533          * Must be done before busmaster disable as we might need to
1534          * access HPET!
1535          */
1536         acpi_state_timer_broadcast(pr, cx, 1);
1537
1538         if (pr->flags.bm_check)
1539                 acpi_idle_update_bm_rld(pr, cx);
1540
1541         if (cx->type == ACPI_STATE_C3)
1542                 ACPI_FLUSH_CPU_CACHE();
1543
1544         t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1545         /* Tell the scheduler that we are going deep-idle: */
1546         sched_clock_idle_sleep_event();
1547         acpi_idle_do_entry(cx);
1548         t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1549
1550 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
1551         /* TSC could halt in idle, so notify users */
1552         if (tsc_halts_in_c(cx->type))
1553                 mark_tsc_unstable("TSC halts in idle");
1554 #endif
1555         sleep_ticks = ticks_elapsed(t1, t2);
1556
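        /*
         * sleep_ticks is in PM timer ticks; PM_TIMER_TICK_NS (roughly 279 ns
         * per tick at the 3.579545 MHz ACPI PM timer) converts it to the
         * nanosecond value sched_clock_idle_wakeup_event() expects.
         */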
1557         /* Tell the scheduler how much we idled: */
1558         sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
1559
1560         local_irq_enable();
1561         current_thread_info()->status |= TS_POLLING;
1562
1563         cx->usage++;
1564
1565         acpi_state_timer_broadcast(pr, cx, 0);
1566         cx->time += sleep_ticks;
1567         return ticks_elapsed_in_us(t1, t2);
1568 }
1569
1570 static int c3_cpu_count;
1571 static DEFINE_SPINLOCK(c3_lock);
1572
1573 /**
1574  * acpi_idle_enter_bm - enters C3 with proper BM handling
1575  * @dev: the target CPU
1576  * @state: the state data
1577  *
1578  * If BM is detected, the deepest non-C3 idle state is entered instead.
1579  */
1580 static int acpi_idle_enter_bm(struct cpuidle_device *dev,
1581                               struct cpuidle_state *state)
1582 {
1583         struct acpi_processor *pr;
1584         struct acpi_processor_cx *cx = cpuidle_get_statedata(state);
1585         u32 t1, t2;
1586         int sleep_ticks = 0;
1587
1588         pr = __get_cpu_var(processors);
1589
1590         if (unlikely(!pr))
1591                 return 0;
1592
1593         if (acpi_idle_suspend)
1594                 return acpi_idle_enter_c1(dev, state);
1595
1596         if (acpi_idle_bm_check()) {
1597                 if (dev->safe_state) {
1598                         dev->last_state = dev->safe_state;
1599                         return dev->safe_state->enter(dev, dev->safe_state);
1600                 } else {
1601                         local_irq_disable();
1602                         acpi_safe_halt();
1603                         local_irq_enable();
1604                         return 0;
1605                 }
1606         }
1607
1608         local_irq_disable();
1609         current_thread_info()->status &= ~TS_POLLING;
1610         /*
1611          * TS_POLLING-cleared state must be visible before we test
1612          * NEED_RESCHED:
1613          */
1614         smp_mb();
1615
1616         if (unlikely(need_resched())) {
1617                 current_thread_info()->status |= TS_POLLING;
1618                 local_irq_enable();
1619                 return 0;
1620         }
1621
1622         acpi_unlazy_tlb(smp_processor_id());
1623
1624         /* Tell the scheduler that we are going deep-idle: */
1625         sched_clock_idle_sleep_event();
1626         /*
1627          * Must be done before busmaster disable as we might need to
1628          * access HPET!
1629          */
1630         acpi_state_timer_broadcast(pr, cx, 1);
1631
1632         acpi_idle_update_bm_rld(pr, cx);
1633
1634         /*
1635          * disable bus master
1636          * bm_check implies we need ARB_DIS
1637          * !bm_check implies we need cache flush
1638          * bm_control implies whether we can do ARB_DIS
1639          *
1640          * That leaves a case where bm_check is set and bm_control is
1641          * not set. In that case we cannot do much, we enter C3
1642          * without doing anything.
1643          */
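        /*
         * ARB_DIS blocks bus master (DMA) activity, which is presumably only
         * appropriate once the last online CPU has reached C3; c3_cpu_count,
         * protected by c3_lock, tracks how many CPUs are currently in this
         * path.
         */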
1644         if (pr->flags.bm_check && pr->flags.bm_control) {
1645                 spin_lock(&c3_lock);
1646                 c3_cpu_count++;
1647                 /* Disable bus master arbitration when all CPUs are in C3 */
1648                 if (c3_cpu_count == num_online_cpus())
1649                         acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
1650                 spin_unlock(&c3_lock);
1651         } else if (!pr->flags.bm_check) {
1652                 ACPI_FLUSH_CPU_CACHE();
1653         }
1654
1655         t1 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1656         acpi_idle_do_entry(cx);
1657         t2 = inl(acpi_gbl_FADT.xpm_timer_block.address);
1658
1659         /* Re-enable bus master arbitration */
1660         if (pr->flags.bm_check && pr->flags.bm_control) {
1661                 spin_lock(&c3_lock);
1662                 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
1663                 c3_cpu_count--;
1664                 spin_unlock(&c3_lock);
1665         }
1666
1667 #if defined (CONFIG_GENERIC_TIME) && defined (CONFIG_X86)
1668         /* TSC could halt in idle, so notify users */
1669         if (tsc_halts_in_c(ACPI_STATE_C3))
1670                 mark_tsc_unstable("TSC halts in idle");
1671 #endif
1672         sleep_ticks = ticks_elapsed(t1, t2);
1673         /* Tell the scheduler how much we idled: */
1674         sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
1675
1676         local_irq_enable();
1677         current_thread_info()->status |= TS_POLLING;
1678
1679         cx->usage++;
1680
1681         acpi_state_timer_broadcast(pr, cx, 0);
1682         cx->time += sleep_ticks;
1683         return ticks_elapsed_in_us(t1, t2);
1684 }
1685
1686 struct cpuidle_driver acpi_idle_driver = {
1687         .name =         "acpi_idle",
1688         .owner =        THIS_MODULE,
1689 };
1690
1691 /**
1692  * acpi_processor_setup_cpuidle - prepares and configures CPUIDLE
1693  * @pr: the ACPI processor
1694  */
1695 static int acpi_processor_setup_cpuidle(struct acpi_processor *pr)
1696 {
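        /*
         * cpuidle state slots below CPUIDLE_DRIVER_STATE_START are reserved
         * by the cpuidle core (for the polling idle state on architectures
         * that provide one), so the ACPI C-states are filled in starting at
         * that index.
         */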
1697         int i, count = CPUIDLE_DRIVER_STATE_START;
1698         struct acpi_processor_cx *cx;
1699         struct cpuidle_state *state;
1700         struct cpuidle_device *dev = &pr->power.dev;
1701
1702         if (!pr->flags.power_setup_done)
1703                 return -EINVAL;
1704
1705         if (pr->flags.power == 0) {
1706                 return -EINVAL;
1707         }
1708
1709         dev->cpu = pr->id;
1710         for (i = 0; i < CPUIDLE_STATE_MAX; i++) {
1711                 dev->states[i].name[0] = '\0';
1712                 dev->states[i].desc[0] = '\0';
1713         }
1714
1715         for (i = 1; i < ACPI_PROCESSOR_MAX_POWER && i <= max_cstate; i++) {
1716                 cx = &pr->power.states[i];
1717                 state = &dev->states[count];
1718
1719                 if (!cx->valid)
1720                         continue;
1721
1722 #ifdef CONFIG_HOTPLUG_CPU
1723                 if ((cx->type != ACPI_STATE_C1) && (num_online_cpus() > 1) &&
1724                     !pr->flags.has_cst &&
1725                     !(acpi_gbl_FADT.flags & ACPI_FADT_C2_MP_SUPPORTED))
1726                         continue;
1727 #endif
1728                 cpuidle_set_statedata(state, cx);
1729
1730                 snprintf(state->name, CPUIDLE_NAME_LEN, "C%d", i);
1731                 strncpy(state->desc, cx->desc, CPUIDLE_DESC_LEN);
1732                 state->exit_latency = cx->latency;
1733                 state->target_residency = cx->latency * latency_factor;
1734                 state->power_usage = cx->power;
1735
1736                 state->flags = 0;
1737                 switch (cx->type) {
1738                 case ACPI_STATE_C1:
1739                         state->flags |= CPUIDLE_FLAG_SHALLOW;
1740                         if (cx->entry_method == ACPI_CSTATE_FFH)
1741                                 state->flags |= CPUIDLE_FLAG_TIME_VALID;
1742
1743                         state->enter = acpi_idle_enter_c1;
1744                         dev->safe_state = state;
1745                         break;
1746
1747                 case ACPI_STATE_C2:
1748                         state->flags |= CPUIDLE_FLAG_BALANCED;
1749                         state->flags |= CPUIDLE_FLAG_TIME_VALID;
1750                         state->enter = acpi_idle_enter_simple;
1751                         dev->safe_state = state;
1752                         break;
1753
1754                 case ACPI_STATE_C3:
1755                         state->flags |= CPUIDLE_FLAG_DEEP;
1756                         state->flags |= CPUIDLE_FLAG_TIME_VALID;
1757                         state->flags |= CPUIDLE_FLAG_CHECK_BM;
1758                         state->enter = pr->flags.bm_check ?
1759                                         acpi_idle_enter_bm :
1760                                         acpi_idle_enter_simple;
1761                         break;
1762                 }
1763
1764                 count++;
1765                 if (count == CPUIDLE_STATE_MAX)
1766                         break;
1767         }
1768
1769         dev->state_count = count;
1770
1771         if (!count)
1772                 return -EINVAL;
1773
1774         return 0;
1775 }
1776
1777 int acpi_processor_cst_has_changed(struct acpi_processor *pr)
1778 {
1779         int ret = 0;
1780
1781         if (boot_option_idle_override)
1782                 return 0;
1783
1784         if (!pr)
1785                 return -EINVAL;
1786
1787         if (nocst) {
1788                 return -ENODEV;
1789         }
1790
1791         if (!pr->flags.power_setup_done)
1792                 return -ENODEV;
1793
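        /*
         * A _CST change means the C-state table is about to be rebuilt:
         * pause cpuidle so no CPU is inside an old state entry method,
         * disable this device, regenerate the states from the new _CST
         * data, then re-enable and resume.
         */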
1794         cpuidle_pause_and_lock();
1795         cpuidle_disable_device(&pr->power.dev);
1796         acpi_processor_get_power_info(pr);
1797         if (pr->flags.power) {
1798                 acpi_processor_setup_cpuidle(pr);
1799                 ret = cpuidle_enable_device(&pr->power.dev);
1800         }
1801         cpuidle_resume_and_unlock();
1802
1803         return ret;
1804 }
1805
1806 #endif /* CONFIG_CPU_IDLE */
1807
1808 int __cpuinit acpi_processor_power_init(struct acpi_processor *pr,
1809                               struct acpi_device *device)
1810 {
1811         acpi_status status = 0;
1812         static int first_run;
1813         struct proc_dir_entry *entry = NULL;
1814         unsigned int i;
1815
1816         if (boot_option_idle_override)
1817                 return 0;
1818
1819         if (!first_run) {
1820                 if (idle_halt) {
1821                         /*
1822                          * When the "idle=halt" boot option is given, halt
1823                          * is used for CPU idle.
1824                          * In that case C2/C3 is meaningless, so max_cstate
1825                          * is set to one.
1826                          */
1827                         max_cstate = 1;
1828                 }
1829                 dmi_check_system(processor_power_dmi_table);
1830                 max_cstate = acpi_processor_cstate_check(max_cstate);
1831                 if (max_cstate < ACPI_C_STATES_MAX)
1832                         printk(KERN_NOTICE
1833                                "ACPI: processor limited to max C-state %d\n",
1834                                max_cstate);
1835                 first_run++;
1836 #if !defined(CONFIG_CPU_IDLE) && defined(CONFIG_SMP)
1837                 pm_qos_add_notifier(PM_QOS_CPU_DMA_LATENCY,
1838                                 &acpi_processor_latency_notifier);
1839 #endif
1840         }
1841
1842         if (!pr)
1843                 return -EINVAL;
1844
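        /*
         * If the FADT provides a CST_CNT value, writing it to the SMI
         * command port tells the firmware that the OS supports _CST, so it
         * may expose the full set of C-states (skipped when "nocst" is set).
         */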
1845         if (acpi_gbl_FADT.cst_control && !nocst) {
1846                 status =
1847                     acpi_os_write_port(acpi_gbl_FADT.smi_command, acpi_gbl_FADT.cst_control, 8);
1848                 if (ACPI_FAILURE(status)) {
1849                         ACPI_EXCEPTION((AE_INFO, status,
1850                                         "Notifying BIOS of _CST ability failed"));
1851                 }
1852         }
1853
1854         acpi_processor_get_power_info(pr);
1855         pr->flags.power_setup_done = 1;
1856
1857         /*
1858          * Install the idle handler if processor power management is supported.
1859          * Note that the previously set idle handler will be used on
1860          * platforms that only support C1.
1861          */
1862         if (pr->flags.power) {
1863 #ifdef CONFIG_CPU_IDLE
1864                 acpi_processor_setup_cpuidle(pr);
1865                 if (cpuidle_register_device(&pr->power.dev))
1866                         return -EIO;
1867 #endif
1868
1869                 printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id);
1870                 for (i = 1; i <= pr->power.count; i++)
1871                         if (pr->power.states[i].valid)
1872                                 printk(" C%d[C%d]", i,
1873                                        pr->power.states[i].type);
1874                 printk(")\n");
1875
1876 #ifndef CONFIG_CPU_IDLE
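                /*
                 * Without CONFIG_CPU_IDLE there is a single global pm_idle
                 * hook; save the existing handler and install
                 * acpi_processor_idle once, when CPU0 is set up.
                 */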
1877                 if (pr->id == 0) {
1878                         pm_idle_save = pm_idle;
1879                         pm_idle = acpi_processor_idle;
1880                 }
1881 #endif
1882         }
1883
1884         /* 'power' [R] */
1885         entry = proc_create_data(ACPI_PROCESSOR_FILE_POWER,
1886                                  S_IRUGO, acpi_device_dir(device),
1887                                  &acpi_processor_power_fops,
1888                                  acpi_driver_data(device));
1889         if (!entry)
1890                 return -EIO;
1891         return 0;
1892 }
1893
1894 int acpi_processor_power_exit(struct acpi_processor *pr,
1895                               struct acpi_device *device)
1896 {
1897         if (boot_option_idle_override)
1898                 return 0;
1899
1900 #ifdef CONFIG_CPU_IDLE
1901         cpuidle_unregister_device(&pr->power.dev);
1902 #endif
1903         pr->flags.power_setup_done = 0;
1904
1905         if (acpi_device_dir(device))
1906                 remove_proc_entry(ACPI_PROCESSOR_FILE_POWER,
1907                                   acpi_device_dir(device));
1908
1909 #ifndef CONFIG_CPU_IDLE
1910
1911         /* Unregister the idle handler when processor #0 is removed. */
1912         if (pr->id == 0) {
1913                 if (pm_idle_save)
1914                         pm_idle = pm_idle_save;
1915
1916                 /*
1917                  * We are about to unload the current idle thread pm callback
1918                  * (pm_idle). Wait for all processors to update cached/local
1919                  * copies of pm_idle before proceeding.
1920                  */
1921                 cpu_idle_wait();
1922 #ifdef CONFIG_SMP
1923                 pm_qos_remove_notifier(PM_QOS_CPU_DMA_LATENCY,
1924                                 &acpi_processor_latency_notifier);
1925 #endif
1926         }
1927 #endif
1928
1929         return 0;
1930 }