/*
 *  drivers/cpufreq/cpufreq_ondemand.c
 *
 *  Copyright (C)  2001 Russell King
 *            (C)  2003 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>.
 *                      Jun Nakajima <jun.nakajima@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/cpufreq.h>
#include <linux/cpu.h>
#include <linux/jiffies.h>
#include <linux/kernel_stat.h>
#include <linux/mutex.h>

/*
 * dbs is used in this file as a shorthand for demand-based switching.
 * It helps to keep variable names smaller and simpler.
 */

#define DEF_FREQUENCY_DOWN_DIFFERENTIAL		(10)
#define DEF_FREQUENCY_UP_THRESHOLD		(80)
#define MIN_FREQUENCY_UP_THRESHOLD		(11)
#define MAX_FREQUENCY_UP_THRESHOLD		(100)

/*
 * The polling frequency of this governor depends on the capability of
 * the processor. The default polling frequency is 1000 times the transition
 * latency of the processor. The governor will work on any processor with
 * transition latency <= 10 ms, using an appropriate sampling rate.
 * For CPUs with transition latency > 10 ms (mostly drivers with
 * CPUFREQ_ETERNAL), this governor will not work.
 * All times here are in uS.
 */
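
/*
 * Illustrative example (values assumed, not taken from a real driver):
 * a CPU whose driver reports a 10 uS transition latency gets a default
 * sampling rate of 10 * 1000 = 10000 uS, i.e. the governor re-evaluates
 * the load every 10 ms and spends at most ~0.1% of its time switching.
 */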
static unsigned int def_sampling_rate;
#define MIN_SAMPLING_RATE_RATIO			(2)
/* for correct statistics, we need at least 10 ticks between each measure */
#define MIN_STAT_SAMPLING_RATE			\
			(MIN_SAMPLING_RATE_RATIO * jiffies_to_usecs(10))
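
/*
 * Worked example, assuming HZ=250 (a config choice, not dictated here):
 * one tick is 4000 uS, so jiffies_to_usecs(10) = 40000 uS and
 * MIN_STAT_SAMPLING_RATE = 2 * 40000 = 80000 uS, i.e. at least 20 ticks
 * between measurements, double the 10-tick statistical floor for safety.
 */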
#define MIN_SAMPLING_RATE			\
			(def_sampling_rate / MIN_SAMPLING_RATE_RATIO)
#define MAX_SAMPLING_RATE			(500 * def_sampling_rate)
#define DEF_SAMPLING_RATE_LATENCY_MULTIPLIER	(1000)
#define TRANSITION_LATENCY_LIMIT		(10 * 1000 * 1000)

static void do_dbs_timer(struct work_struct *work);

/* Sampling types */
enum {DBS_NORMAL_SAMPLE, DBS_SUB_SAMPLE};

struct cpu_dbs_info_s {
	cputime64_t prev_cpu_idle;
	cputime64_t prev_cpu_wall;
	struct cpufreq_policy *cur_policy;
	struct delayed_work work;
	struct cpufreq_frequency_table *freq_table;
	unsigned int freq_lo;
	unsigned int freq_lo_jiffies;
	unsigned int freq_hi_jiffies;
	int cpu;
	unsigned int enable:1,
		     sample_type:1;
};
static DEFINE_PER_CPU(struct cpu_dbs_info_s, cpu_dbs_info);

static unsigned int dbs_enable;	/* number of CPUs using this policy */

/*
 * DEADLOCK ALERT! There is an ordering requirement between the cpu_hotplug
 * lock and dbs_mutex. The cpu_hotplug lock should always be held before
 * dbs_mutex. If any function that can potentially take the cpu_hotplug lock
 * (like __cpufreq_driver_target()) is being called with dbs_mutex taken, then
 * the cpu_hotplug lock should be taken before that. Note that the cpu_hotplug
 * lock is recursive for the same process. -Venki
 */
static DEFINE_MUTEX(dbs_mutex);
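
/*
 * Example of the ordering rule above (hypothetical call chain): a path
 * holding dbs_mutex that then calls __cpufreq_driver_target() could block
 * on the cpu_hotplug lock, while a hotplug notifier already holding the
 * cpu_hotplug lock blocks on dbs_mutex - a classic AB-BA deadlock. Taking
 * the cpu_hotplug lock first avoids it.
 */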

static struct workqueue_struct	*kondemand_wq;

static struct dbs_tuners {
	unsigned int sampling_rate;
	unsigned int up_threshold;
	unsigned int down_differential;
	unsigned int ignore_nice;
	unsigned int powersave_bias;
} dbs_tuners_ins = {
	.up_threshold = DEF_FREQUENCY_UP_THRESHOLD,
	.down_differential = DEF_FREQUENCY_DOWN_DIFFERENTIAL,
	.ignore_nice = 0,
	.powersave_bias = 0,
};

static inline cputime64_t get_cpu_idle_time(unsigned int cpu, cputime64_t *wall)
{
	cputime64_t idle_time;
	cputime64_t cur_wall_time;
	cputime64_t busy_time;

	cur_wall_time = jiffies64_to_cputime64(get_jiffies_64());
	busy_time = cputime64_add(kstat_cpu(cpu).cpustat.user,
			kstat_cpu(cpu).cpustat.system);

	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.irq);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.softirq);
	busy_time = cputime64_add(busy_time, kstat_cpu(cpu).cpustat.steal);

	if (!dbs_tuners_ins.ignore_nice) {
		busy_time = cputime64_add(busy_time,
				kstat_cpu(cpu).cpustat.nice);
	}

	idle_time = cputime64_sub(cur_wall_time, busy_time);
	if (wall)
		*wall = cur_wall_time;

	return idle_time;
}
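
/*
 * Note on the accounting above: idle time is derived as wall time minus
 * busy time rather than read from cpustat.idle, so iowait (and nice time,
 * when ignore_nice is set) is effectively counted as idle by this governor.
 */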

/*
 * Find the right freq to be set now with powersave_bias on.
 * Returns the freq_hi to be used right now and will set freq_hi_jiffies,
 * freq_lo, and freq_lo_jiffies in the percpu area for averaging freqs.
 */
static unsigned int powersave_bias_target(struct cpufreq_policy *policy,
					  unsigned int freq_next,
					  unsigned int relation)
{
	unsigned int freq_req, freq_reduc, freq_avg;
	unsigned int freq_hi, freq_lo;
	unsigned int index = 0;
	unsigned int jiffies_total, jiffies_hi, jiffies_lo;
	struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, policy->cpu);

	if (!dbs_info->freq_table) {
		dbs_info->freq_lo = 0;
		dbs_info->freq_lo_jiffies = 0;
		return freq_next;
	}

	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_next,
			relation, &index);
	freq_req = dbs_info->freq_table[index].frequency;
	freq_reduc = freq_req * dbs_tuners_ins.powersave_bias / 1000;
	freq_avg = freq_req - freq_reduc;

	/* Find freq bounds for freq_avg in freq_table */
	index = 0;
	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
			CPUFREQ_RELATION_H, &index);
	freq_lo = dbs_info->freq_table[index].frequency;
	index = 0;
	cpufreq_frequency_table_target(policy, dbs_info->freq_table, freq_avg,
			CPUFREQ_RELATION_L, &index);
	freq_hi = dbs_info->freq_table[index].frequency;

	/* Find out how long we have to be in hi and lo freqs */
	if (freq_hi == freq_lo) {
		dbs_info->freq_lo = 0;
		dbs_info->freq_lo_jiffies = 0;
		return freq_lo;
	}
	jiffies_total = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
	jiffies_hi = (freq_avg - freq_lo) * jiffies_total;
	jiffies_hi += ((freq_hi - freq_lo) / 2);
	jiffies_hi /= (freq_hi - freq_lo);
	jiffies_lo = jiffies_total - jiffies_hi;
	dbs_info->freq_lo = freq_lo;
	dbs_info->freq_lo_jiffies = jiffies_lo;
	dbs_info->freq_hi_jiffies = jiffies_hi;
	return freq_hi;
}
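
/*
 * Worked example with assumed values (not from a real freq_table):
 * powersave_bias = 100 (10%), freq_next resolves to freq_req = 2000000 kHz,
 * so freq_reduc = 200000 and freq_avg = 1800000 kHz. With table neighbours
 * freq_lo = 1600000 and freq_hi = 2000000 and jiffies_total = 10,
 * jiffies_hi = (200000 * 10 + 200000) / 400000 = 5, so the CPU spends half
 * the window at each frequency and averages the requested 1800000 kHz.
 */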

static void ondemand_powersave_bias_init(void)
{
	int i;
	for_each_online_cpu(i) {
		struct cpu_dbs_info_s *dbs_info = &per_cpu(cpu_dbs_info, i);
		dbs_info->freq_table = cpufreq_frequency_get_table(i);
		dbs_info->freq_lo = 0;
	}
}

/************************** sysfs interface ************************/
static ssize_t show_sampling_rate_max(struct cpufreq_policy *policy, char *buf)
{
	return sprintf(buf, "%u\n", MAX_SAMPLING_RATE);
}

static ssize_t show_sampling_rate_min(struct cpufreq_policy *policy, char *buf)
{
	return sprintf(buf, "%u\n", MIN_SAMPLING_RATE);
}

#define define_one_ro(_name)		\
static struct freq_attr _name =		\
__ATTR(_name, 0444, show_##_name, NULL)

define_one_ro(sampling_rate_max);
define_one_ro(sampling_rate_min);

/* cpufreq_ondemand Governor Tunables */
#define show_one(file_name, object)					\
static ssize_t show_##file_name						\
(struct cpufreq_policy *unused, char *buf)				\
{									\
	return sprintf(buf, "%u\n", dbs_tuners_ins.object);		\
}
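
/*
 * For example, show_one(sampling_rate, sampling_rate) expands to a
 * show_sampling_rate() helper that prints dbs_tuners_ins.sampling_rate;
 * define_one_rw() below wires it up as the sysfs read handler.
 */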
show_one(sampling_rate, sampling_rate);
show_one(up_threshold, up_threshold);
show_one(ignore_nice_load, ignore_nice);
show_one(powersave_bias, powersave_bias);

static ssize_t store_sampling_rate(struct cpufreq_policy *unused,
		const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	mutex_lock(&dbs_mutex);
	if (ret != 1 || input > MAX_SAMPLING_RATE
		     || input < MIN_SAMPLING_RATE) {
		mutex_unlock(&dbs_mutex);
		return -EINVAL;
	}

	dbs_tuners_ins.sampling_rate = input;
	mutex_unlock(&dbs_mutex);

	return count;
}

static ssize_t store_up_threshold(struct cpufreq_policy *unused,
		const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	mutex_lock(&dbs_mutex);
	if (ret != 1 || input > MAX_FREQUENCY_UP_THRESHOLD ||
			input < MIN_FREQUENCY_UP_THRESHOLD) {
		mutex_unlock(&dbs_mutex);
		return -EINVAL;
	}

	dbs_tuners_ins.up_threshold = input;
	mutex_unlock(&dbs_mutex);

	return count;
}

static ssize_t store_ignore_nice_load(struct cpufreq_policy *policy,
		const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	unsigned int j;

	ret = sscanf(buf, "%u", &input);
	if (ret != 1)
		return -EINVAL;

	if (input > 1)
		input = 1;

	mutex_lock(&dbs_mutex);
	if (input == dbs_tuners_ins.ignore_nice) { /* nothing to do */
		mutex_unlock(&dbs_mutex);
		return count;
	}
	dbs_tuners_ins.ignore_nice = input;

	/* we need to re-evaluate prev_cpu_idle */
	for_each_online_cpu(j) {
		struct cpu_dbs_info_s *dbs_info;
		dbs_info = &per_cpu(cpu_dbs_info, j);
		dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
						&dbs_info->prev_cpu_wall);
	}
	mutex_unlock(&dbs_mutex);

	return count;
}

static ssize_t store_powersave_bias(struct cpufreq_policy *unused,
		const char *buf, size_t count)
{
	unsigned int input;
	int ret;
	ret = sscanf(buf, "%u", &input);

	if (ret != 1)
		return -EINVAL;

	if (input > 1000)
		input = 1000;

	mutex_lock(&dbs_mutex);
	dbs_tuners_ins.powersave_bias = input;
	ondemand_powersave_bias_init();
	mutex_unlock(&dbs_mutex);

	return count;
}

#define define_one_rw(_name)		\
static struct freq_attr _name =		\
__ATTR(_name, 0644, show_##_name, store_##_name)

define_one_rw(sampling_rate);
define_one_rw(up_threshold);
define_one_rw(ignore_nice_load);
define_one_rw(powersave_bias);

static struct attribute *dbs_attributes[] = {
	&sampling_rate_max.attr,
	&sampling_rate_min.attr,
	&sampling_rate.attr,
	&up_threshold.attr,
	&ignore_nice_load.attr,
	&powersave_bias.attr,
	NULL
};

static struct attribute_group dbs_attr_group = {
	.attrs = dbs_attributes,
	.name = "ondemand",
};
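
/*
 * Once the group is registered against the policy kobject, the tunables
 * appear under sysfs, e.g. (path assumed for a typical single-socket box):
 *
 *   echo 90 > /sys/devices/system/cpu/cpu0/cpufreq/ondemand/up_threshold
 *
 * which routes through store_up_threshold() above.
 */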

/************************** sysfs end ************************/

static void dbs_check_cpu(struct cpu_dbs_info_s *this_dbs_info)
{
	unsigned int max_load_freq;

	struct cpufreq_policy *policy;
	unsigned int j;

	if (!this_dbs_info->enable)
		return;

	this_dbs_info->freq_lo = 0;
	policy = this_dbs_info->cur_policy;

	/*
	 * Every sampling_rate, we check if the current idle time is less
	 * than 20% (default). If it is, we try to increase the frequency.
	 * Every sampling_rate, we also look for the lowest frequency which
	 * can sustain the load while keeping idle time over 30%. If such a
	 * frequency exists, we try to decrease to this frequency.
	 *
	 * Any frequency increase takes it to the maximum frequency.
	 * Frequency reduction happens at minimum steps of 5% (default)
	 * of the current frequency.
	 */

	/* Get Absolute Load - in terms of freq */
	max_load_freq = 0;

	for_each_cpu_mask_nr(j, policy->cpus) {
		struct cpu_dbs_info_s *j_dbs_info;
		cputime64_t cur_wall_time, cur_idle_time;
		unsigned int idle_time, wall_time;
		unsigned int load, load_freq;
		int freq_avg;

		j_dbs_info = &per_cpu(cpu_dbs_info, j);

		cur_idle_time = get_cpu_idle_time(j, &cur_wall_time);

		wall_time = (unsigned int) cputime64_sub(cur_wall_time,
				j_dbs_info->prev_cpu_wall);
		j_dbs_info->prev_cpu_wall = cur_wall_time;

		idle_time = (unsigned int) cputime64_sub(cur_idle_time,
				j_dbs_info->prev_cpu_idle);
		j_dbs_info->prev_cpu_idle = cur_idle_time;

		if (unlikely(!wall_time || wall_time < idle_time))
			continue;

		load = 100 * (wall_time - idle_time) / wall_time;

		freq_avg = __cpufreq_driver_getavg(policy, j);
		if (freq_avg <= 0)
			freq_avg = policy->cur;

		load_freq = load * freq_avg;
		if (load_freq > max_load_freq)
			max_load_freq = load_freq;
	}
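
	/*
	 * Worked example with assumed numbers: if a CPU saw wall_time = 100
	 * and idle_time = 40 units since the last sample, load = 60; with
	 * freq_avg = 2000000 kHz, load_freq = 120000000. The busiest CPU in
	 * the policy determines max_load_freq used below.
	 */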

	/* Check for frequency increase */
	if (max_load_freq > dbs_tuners_ins.up_threshold * policy->cur) {
		/* if we are already at full speed then break out early */
		if (!dbs_tuners_ins.powersave_bias) {
			if (policy->cur == policy->max)
				return;

			__cpufreq_driver_target(policy, policy->max,
				CPUFREQ_RELATION_H);
		} else {
			int freq = powersave_bias_target(policy, policy->max,
							 CPUFREQ_RELATION_H);
			__cpufreq_driver_target(policy, freq,
				CPUFREQ_RELATION_L);
		}
		return;
	}

	/* Check for frequency decrease */
	/* if we cannot reduce the frequency anymore, break out early */
	if (policy->cur == policy->min)
		return;

	/*
	 * The optimal frequency is the lowest frequency that can support
	 * the current CPU usage without triggering the up policy. To be
	 * safe, we stay 10 points under the threshold.
	 */
	if (max_load_freq <
	    (dbs_tuners_ins.up_threshold - dbs_tuners_ins.down_differential) *
	     policy->cur) {
		unsigned int freq_next;
		freq_next = max_load_freq /
				(dbs_tuners_ins.up_threshold -
				 dbs_tuners_ins.down_differential);
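
		/*
		 * Example with default tunables (numbers assumed): at
		 * policy->cur = 2000000 kHz and 50% load, max_load_freq =
		 * 100000000 < 70 * 2000000, so we get here, and freq_next =
		 * 100000000 / (80 - 10) ~= 1428571 kHz; CPUFREQ_RELATION_L
		 * then picks the lowest table frequency at or above it.
		 */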

		if (!dbs_tuners_ins.powersave_bias) {
			__cpufreq_driver_target(policy, freq_next,
					CPUFREQ_RELATION_L);
		} else {
			int freq = powersave_bias_target(policy, freq_next,
							 CPUFREQ_RELATION_L);
			__cpufreq_driver_target(policy, freq,
					CPUFREQ_RELATION_L);
		}
	}
}

static void do_dbs_timer(struct work_struct *work)
{
	struct cpu_dbs_info_s *dbs_info =
		container_of(work, struct cpu_dbs_info_s, work.work);
	unsigned int cpu = dbs_info->cpu;
	int sample_type = dbs_info->sample_type;

	/* We want all CPUs to do sampling nearly on the same jiffy */
	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);

	delay -= jiffies % delay;
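
	/*
	 * Alignment example (assumed numbers): with a 10-jiffy sampling
	 * period and jiffies % 10 == 3, delay becomes 7, so this CPU's next
	 * sample lands on a jiffy that is a multiple of 10 - the same
	 * boundary every other CPU aims for.
	 */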

	if (lock_policy_rwsem_write(cpu) < 0)
		return;

	if (!dbs_info->enable) {
		unlock_policy_rwsem_write(cpu);
		return;
	}

	/* Common NORMAL_SAMPLE setup */
	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
	if (!dbs_tuners_ins.powersave_bias ||
	    sample_type == DBS_NORMAL_SAMPLE) {
		dbs_check_cpu(dbs_info);
		if (dbs_info->freq_lo) {
			/* Setup timer for SUB_SAMPLE */
			dbs_info->sample_type = DBS_SUB_SAMPLE;
			delay = dbs_info->freq_hi_jiffies;
		}
	} else {
		__cpufreq_driver_target(dbs_info->cur_policy,
					dbs_info->freq_lo,
					CPUFREQ_RELATION_H);
	}
	queue_delayed_work_on(cpu, kondemand_wq, &dbs_info->work, delay);
	unlock_policy_rwsem_write(cpu);
}

static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info)
{
	/* We want all CPUs to do sampling nearly on the same jiffy */
	int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate);
	delay -= jiffies % delay;

	dbs_info->enable = 1;
	ondemand_powersave_bias_init();
	dbs_info->sample_type = DBS_NORMAL_SAMPLE;
	INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer);
	queue_delayed_work_on(dbs_info->cpu, kondemand_wq, &dbs_info->work,
			      delay);
}
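
/*
 * Note: INIT_DELAYED_WORK_DEFERRABLE uses a deferrable timer, so an idle
 * CPU is not woken up merely to run a sampling cycle; the pending work
 * runs when the CPU wakes for some other reason.
 */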

static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info)
{
	dbs_info->enable = 0;
	cancel_delayed_work(&dbs_info->work);
}

static int cpufreq_governor_dbs(struct cpufreq_policy *policy,
				unsigned int event)
{
	unsigned int cpu = policy->cpu;
	struct cpu_dbs_info_s *this_dbs_info;
	unsigned int j;
	int rc;

	this_dbs_info = &per_cpu(cpu_dbs_info, cpu);

	switch (event) {
	case CPUFREQ_GOV_START:
		if ((!cpu_online(cpu)) || (!policy->cur))
			return -EINVAL;

		if (this_dbs_info->enable) /* Already enabled */
			break;

		mutex_lock(&dbs_mutex);
		dbs_enable++;

		rc = sysfs_create_group(&policy->kobj, &dbs_attr_group);
		if (rc) {
			dbs_enable--;
			mutex_unlock(&dbs_mutex);
			return rc;
		}

		for_each_cpu_mask_nr(j, policy->cpus) {
			struct cpu_dbs_info_s *j_dbs_info;
			j_dbs_info = &per_cpu(cpu_dbs_info, j);
			j_dbs_info->cur_policy = policy;

			j_dbs_info->prev_cpu_idle = get_cpu_idle_time(j,
						&j_dbs_info->prev_cpu_wall);
		}
		this_dbs_info->cpu = cpu;
		/*
		 * Start the timerschedule work when this governor
		 * is used for the first time.
		 */
		if (dbs_enable == 1) {
			unsigned int latency;
			/* policy latency is in nS. Convert it to uS first */
			latency = policy->cpuinfo.transition_latency / 1000;
			if (latency == 0)
				latency = 1;

			def_sampling_rate = latency *
					DEF_SAMPLING_RATE_LATENCY_MULTIPLIER;

			if (def_sampling_rate < MIN_STAT_SAMPLING_RATE)
				def_sampling_rate = MIN_STAT_SAMPLING_RATE;

			dbs_tuners_ins.sampling_rate = def_sampling_rate;
		}
		dbs_timer_init(this_dbs_info);

		mutex_unlock(&dbs_mutex);
		break;

	case CPUFREQ_GOV_STOP:
		mutex_lock(&dbs_mutex);
		dbs_timer_exit(this_dbs_info);
		sysfs_remove_group(&policy->kobj, &dbs_attr_group);
		dbs_enable--;
		mutex_unlock(&dbs_mutex);

		break;

	case CPUFREQ_GOV_LIMITS:
		mutex_lock(&dbs_mutex);
		if (policy->max < this_dbs_info->cur_policy->cur)
			__cpufreq_driver_target(this_dbs_info->cur_policy,
						policy->max,
						CPUFREQ_RELATION_H);
		else if (policy->min > this_dbs_info->cur_policy->cur)
			__cpufreq_driver_target(this_dbs_info->cur_policy,
						policy->min,
						CPUFREQ_RELATION_L);
		mutex_unlock(&dbs_mutex);
		break;
	}
	return 0;
}

struct cpufreq_governor cpufreq_gov_ondemand = {
	.name			= "ondemand",
	.governor		= cpufreq_governor_dbs,
	.max_transition_latency	= TRANSITION_LATENCY_LIMIT,
	.owner			= THIS_MODULE,
};
EXPORT_SYMBOL(cpufreq_gov_ondemand);

static int __init cpufreq_gov_dbs_init(void)
{
	int err;

	kondemand_wq = create_workqueue("kondemand");
	if (!kondemand_wq) {
		printk(KERN_ERR "Creation of kondemand failed\n");
		return -EFAULT;
	}
	err = cpufreq_register_governor(&cpufreq_gov_ondemand);
	if (err)
		destroy_workqueue(kondemand_wq);

	return err;
}

static void __exit cpufreq_gov_dbs_exit(void)
{
	cpufreq_unregister_governor(&cpufreq_gov_ondemand);
	destroy_workqueue(kondemand_wq);
}

MODULE_AUTHOR("Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>");
MODULE_AUTHOR("Alexey Starikovskiy <alexey.y.starikovskiy@intel.com>");
MODULE_DESCRIPTION("'cpufreq_ondemand' - A dynamic cpufreq governor for "
		   "Low Latency Frequency Transition capable processors");
MODULE_LICENSE("GPL");

#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND
fs_initcall(cpufreq_gov_dbs_init);
#else
module_init(cpufreq_gov_dbs_init);
#endif
module_exit(cpufreq_gov_dbs_exit);