From: Peter Zijlstra <pzijlstr@redhat.com>
Date: Thu, 7 May 2009 15:59:42 +0200
Subject: [sched] rq clock
Message-id: 20090507140138.332783000@chello.nl
O-Subject: [PATCH 4/5] RHEL-5: sched: rq clock
Bugzilla: 297731
RH-Acked-by: Rik van Riel <riel@redhat.com>
CVE: CVE-2007-3719

Backport of the upstream sched_clock machinery: give each runqueue a
per-CPU, GTOD-bounded clock (rq->clock) and use it in place of raw
sched_clock() reads combined with timestamp_last_tick drift
compensation.

Signed-off-by: Peter Zijlstra <pzijlstr@redhat.com>
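For review, a minimal sketch of the filtering idea implemented by
__update_sched_clock() in the hunks below. The function name
clock_window() is hypothetical; the real code additionally widens the
upper bound to the previous clock value so the window can never be
empty, and compares via the wrap-safe wrap_min()/wrap_max() helpers
rather than plain operators:

/*
 * Illustration only, not part of the patch: the per-cpu clock is the
 * raw sched_clock() delta since the last tick, layered on that tick's
 * GTOD timestamp and clamped so it neither moves backwards nor runs
 * more than one tick ahead of GTOD.
 */
static u64 clock_window(u64 tick_gtod, u64 prev_clock, u64 delta)
{
	u64 clock = tick_gtod + delta;	/* raw, possibly bogus TSC guess */
	u64 lo = prev_clock > tick_gtod ? prev_clock : tick_gtod;
	u64 hi = tick_gtod + TICK_NSEC;

	if (clock < lo)		/* filter backward motion */
		clock = lo;
	if (clock > hi)		/* filter crazy TSC values */
		clock = hi;

	return clock;
}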
diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c
index afe6d2b..bf74446 100644
--- a/drivers/acpi/processor_idle.c
+++ b/drivers/acpi/processor_idle.c
@@ -52,6 +52,7 @@ ACPI_MODULE_NAME("acpi_processor")
 #define ACPI_PROCESSOR_FILE_POWER	"power"
 #define US_TO_PM_TIMER_TICKS(t)	((t * (PM_TIMER_FREQUENCY/1000)) / 1000)
+#define PM_TIMER_TICK_NS	(1000000000ULL/PM_TIMER_FREQUENCY)
 #define C2_OVERHEAD	4	/* 1us (3.579 ticks per us) */
 #define C3_OVERHEAD	4	/* 1us (3.579 ticks per us) */
 static void (*pm_idle_save) (void) __read_mostly;
@@ -351,6 +352,8 @@ static void acpi_processor_idle(void)
 	case ACPI_STATE_C2:
 		/* Get start time (ticks) */
 		t1 = inl(acpi_fadt.xpm_tmr_blk.address);
+		/* Tell the scheduler that we are going deep-idle */
+		sched_clock_idle_sleep_event();
 		/* Invoke C2 */
 		inb(cx->address);
 		/* Dummy wait op - must do something useless after P_LVL2 read
@@ -365,12 +368,15 @@ static void acpi_processor_idle(void)
 		if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
 			mark_tsc_unstable();
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+		/* Tell the scheduler how much we idled */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 		/* Re-enable interrupts */
 		local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
+		/* Do not account our idle-switching overhead */
+		sleep_ticks -= cx->latency_ticks + C2_OVERHEAD;
 		break;

 	case ACPI_STATE_C3:
@@ -402,6 +408,8 @@ static void acpi_processor_idle(void)
 		/* Get start time (ticks) */
 		t1 = inl(acpi_fadt.xpm_tmr_blk.address);
+		/* Tell the scheduler that we are going deep-idle */
+		sched_clock_idle_sleep_event();
 		/* Invoke C3 */
 		inb(cx->address);
 		/* Dummy wait op (see above) */
@@ -420,12 +428,15 @@ static void acpi_processor_idle(void)
 		if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
 			mark_tsc_unstable();
 #endif
+		/* Compute time (ticks) that we were actually asleep */
+		sleep_ticks = ticks_elapsed(t1, t2);
+		/* Tell the scheduler how much we idled */
+		sched_clock_idle_wakeup_event(sleep_ticks*PM_TIMER_TICK_NS);
 		/* Re-enable interrupts */
 		local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
-		/* Compute time (ticks) that we were actually asleep */
-		sleep_ticks =
-		    ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
+		/* Do not account our idle-switching overhead */
+		sleep_ticks -= cx->latency_ticks + C3_OVERHEAD;
 		break;

 	default:
diff --git a/include/linux/sched.h b/include/linux/sched.h
index eaabf86..1faef4a 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1213,6 +1213,8 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 extern unsigned long long sched_clock(void);
 extern unsigned long long
 current_sched_time(const struct task_struct *current_task);
+extern void sched_clock_idle_sleep_event(void);
+extern void sched_clock_idle_wakeup_event(u64 delta_ns);

 /* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
diff --git a/init/main.c b/init/main.c
index 06349fe..e9f6fa5 100644
--- a/init/main.c
+++ b/init/main.c
@@ -118,6 +118,8 @@ extern void time_init(void);
 void (*late_time_init)(void);
 extern void softirq_init(void);

+extern void sched_clock_init(void);
+
 /* Untouched command line (eg. for /proc) saved by arch-specific code. */
 char saved_command_line[COMMAND_LINE_SIZE];
@@ -543,6 +545,7 @@ asmlinkage void __init start_kernel(void)
 	softirq_init();
 	timekeeping_init();
 	time_init();
+	sched_clock_init();
 	profile_init();
 	if (!irqs_disabled())
 		printk("start_kernel(): bug: interrupts were enabled early\n");
diff --git a/kernel/sched.c b/kernel/sched.c
index 7a62c3e..6b923f1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -41,6 +41,7 @@
 #include <linux/smp.h>
 #include <linux/threads.h>
 #include <linux/timer.h>
+#include <linux/time.h>
 #include <linux/rcupdate.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
@@ -228,6 +229,7 @@ struct rq {
 	unsigned long expired_timestamp;
 	unsigned long long timestamp_last_tick;
+	unsigned long long clock;
 	struct task_struct *curr, *idle;
 	struct mm_struct *prev_mm;
 	struct prio_array *active, *expired, arrays[2];
@@ -371,6 +373,237 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */

+static int sched_clock_running __read_mostly;
+
+/*
+ * can be set by an arch if their native sched_clock() is stable and
+ * synchronized between cpus
+ */
+int sched_clock_stable __read_mostly;
+
+struct sched_clock_data {
+	raw_spinlock_t		lock;
+
+	u64			tick_raw;
+	u64			tick_gtod;
+	u64			clock;
+};
+
+static DEFINE_PER_CPU(struct sched_clock_data, sched_clock_data);
+
+static inline struct sched_clock_data *this_scd(void)
+{
+	return &__get_cpu_var(sched_clock_data);
+}
+
+static inline struct sched_clock_data *cpu_sdc(int cpu)
+{
+	return &per_cpu(sched_clock_data, cpu);
+}
+
+static u64 get_monotonic_time(void)
+{
+	u64 time;
+	struct timespec tp;
+
+	ktime_get_ts(&tp);
+	time = tp.tv_sec * NSEC_PER_SEC + tp.tv_nsec;
+
+	return time;
+}
+
+void sched_clock_init(void)
+{
+	u64 ktime_now = get_monotonic_time();
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct sched_clock_data *scd = cpu_sdc(cpu);
+
+		scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+		scd->tick_raw = 0;
+		scd->tick_gtod = ktime_now;
+		scd->clock = ktime_now;
+	}
+
+	sched_clock_running = 1;
+}
+
+/*
+ * min, max except they take wrapping into account
+ */
+
+static inline u64 wrap_min(u64 x, u64 y)
+{
+	return (s64)(x - y) < 0 ? x : y;
+}
+
+static inline u64 wrap_max(u64 x, u64 y)
+{
+	return (s64)(x - y) > 0 ? x : y;
+}
+
+/*
+ * update the percpu scd from the raw @now value
+ *
+ *  - filter out backward motion
+ *  - use the GTOD tick value to create a window to filter crazy TSC values
+ */
+static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
+{
+	s64 delta = now - scd->tick_raw;
+	u64 clock, min_clock, max_clock;
+
+	if (unlikely(delta < 0))
+		delta = 0;
+
+	/*
+	 * scd->clock = clamp(scd->tick_gtod + delta,
+	 *		      max(scd->tick_gtod, scd->clock),
+	 *		      scd->tick_gtod + TICK_NSEC);
+	 */
+
+	clock = scd->tick_gtod + delta;
+	min_clock = wrap_max(scd->tick_gtod, scd->clock);
+	max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
+
+	clock = wrap_max(clock, min_clock);
+	clock = wrap_min(clock, max_clock);
+
+	scd->clock = clock;
+
+	return scd->clock;
+}
+
+static void lock_double_clock(struct sched_clock_data *data1,
+			      struct sched_clock_data *data2)
+{
+	if (data1 < data2) {
+		__raw_spin_lock(&data1->lock);
+		__raw_spin_lock(&data2->lock);
+	} else {
+		__raw_spin_lock(&data2->lock);
+		__raw_spin_lock(&data1->lock);
+	}
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+	u64 now, clock, this_clock, remote_clock;
+	struct sched_clock_data *scd;
+
+	if (sched_clock_stable)
+		return sched_clock();
+
+	scd = cpu_sdc(cpu);
+
+	if (unlikely(!sched_clock_running))
+		return 0ull;
+
+	WARN_ON_ONCE(!irqs_disabled());
+	now = sched_clock();
+
+	if (cpu != smp_processor_id()) {
+		struct sched_clock_data *my_scd = this_scd();
+
+		lock_double_clock(scd, my_scd);
+
+		this_clock = __update_sched_clock(my_scd, now);
+		remote_clock = scd->clock;
+
+		/*
+		 * Use the opportunity that we have both locks
+		 * taken to couple the two clocks: we take the
+		 * larger time as the latest time for both
+		 * runqueues. (this creates monotonic movement)
+		 */
+		if (likely((s64)(remote_clock - this_clock) < 0)) {
+			clock = this_clock;
+			scd->clock = clock;
+		} else {
+			/*
+			 * Should be rare, but possible:
+			 */
+			clock = remote_clock;
+			my_scd->clock = remote_clock;
+		}
+
+		__raw_spin_unlock(&my_scd->lock);
+	} else {
+		__raw_spin_lock(&scd->lock);
+		clock = __update_sched_clock(scd, now);
+	}
+
+	__raw_spin_unlock(&scd->lock);
+
+	return clock;
+}
+
+static inline u64 sched_clock_local(void)
+{
+	return sched_clock_cpu(smp_processor_id());
+}
+
+static void sched_clock_tick(void)
+{
+	struct sched_clock_data *scd;
+	u64 now, now_gtod;
+
+	if (sched_clock_stable)
+		return;
+
+	if (unlikely(!sched_clock_running))
+		return;
+
+	WARN_ON_ONCE(!irqs_disabled());
+
+	scd = this_scd();
+	now_gtod = get_monotonic_time();
+	now = sched_clock();
+
+	__raw_spin_lock(&scd->lock);
+	scd->tick_raw = now;
+	scd->tick_gtod = now_gtod;
+	__update_sched_clock(scd, now);
+	__raw_spin_unlock(&scd->lock);
+}
+
+/*
+ * We are going deep-idle (irqs are disabled):
+ */
+void sched_clock_idle_sleep_event(void)
+{
+	sched_clock_local();
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);
+
+/*
+ * We just idled delta nanoseconds (called with irqs disabled):
+ */
+void sched_clock_idle_wakeup_event(u64 delta_ns)
+{
+	sched_clock_tick();
+}
+EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
+
+unsigned long long cpu_clock(int cpu)
+{
+	unsigned long long clock;
+	unsigned long flags;

+	local_irq_save(flags);
+	clock = sched_clock_cpu(cpu);
+	local_irq_restore(flags);
+
+	return clock;
+}
+EXPORT_SYMBOL_GPL(cpu_clock);
+
+static void update_rq_clock(struct rq *rq)
+{
+	rq->clock = sched_clock_cpu(cpu_of(rq));
+}
+
 /*
  * __task_rq_lock - lock the runqueue a given task resides on.
  * Must be called interrupts disabled.
@@ -387,6 +620,7 @@ repeat_lock_task:
 		spin_unlock(&rq->lock);
 		goto repeat_lock_task;
 	}
+	update_rq_clock(rq);
 	return rq;
 }

@@ -408,6 +642,7 @@ repeat_lock_task:
 		spin_unlock_irqrestore(&rq->lock, *flags);
 		goto repeat_lock_task;
 	}
+	update_rq_clock(rq);
 	return rq;
 }

@@ -559,6 +794,7 @@ static inline struct rq *this_rq_lock(void)
 	local_irq_disable();
 	rq = this_rq();
 	spin_lock(&rq->lock);
+	update_rq_clock(rq);

 	return rq;
 }

@@ -941,15 +1177,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local)
 {
 	unsigned long long now;

-	now = sched_clock();
-#ifdef CONFIG_SMP
-	if (!local) {
-		/* Compensate for drifting sched_clock */
-		struct rq *this_rq = this_rq();
-		now = (now - this_rq->timestamp_last_tick)
-			+ rq->timestamp_last_tick;
-	}
-#endif
+	now = rq->clock;

 	if (!rt_task(p))
 		p->prio = recalc_task_prio(p, now);

@@ -1339,6 +1567,7 @@ static int wake_idle(int cpu, struct task_struct *p)
 	struct sched_domain *sd;
 	int i;
 	unsigned long long now;
+	struct rq *rq = cpu_rq(cpu);

 	/*
 	 * If it is idle, then it is the best cpu to run this task.
@@ -1349,10 +1578,10 @@ static int wake_idle(int cpu, struct task_struct *p)
 	 * sibling runqueue info. This will avoid the checks and cache miss
 	 * penalities associated with that.
 	 */
-	if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1)
+	if (idle_cpu(cpu) || rq->nr_running > 1)
 		return cpu;

-	now = sched_clock();
+	now = rq->clock;
 	for_each_domain(cpu, sd) {
 		if ((sd->flags & SD_WAKE_IDLE)
 		    || ((sd->flags & SD_WAKE_IDLE_FAR)

@@ -1654,7 +1883,8 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
 	 */
 	p->first_time_slice = current->pid;
 	current->time_slice >>= 1;
-	p->timestamp = sched_clock();
+
+	p->timestamp = sched_clock_local();
 	if (unlikely(!current->time_slice)) {
 		/*
 		 * This case is rare, it happens when the parent has only
@@ -1729,12 +1959,6 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	} else {
 		this_rq = cpu_rq(this_cpu);

-		/*
-		 * Not the local CPU - must adjust timestamp. This should
-		 * get optimised away in the !CONFIG_SMP case.
-		 */
-		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
-					+ rq->timestamp_last_tick;
 		__activate_task(p, rq);
 		if (TASK_PREEMPTS_CURR(p, rq))
 			resched_task(rq->curr);

@@ -2161,6 +2385,8 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2)
 			spin_lock(&rq1->lock);
 		}
 	}
+	update_rq_clock(rq1);
+	update_rq_clock(rq2);
 }

 /*
@@ -2196,6 +2422,8 @@ static void double_lock_balance(struct rq *this_rq, struct rq *busiest)
 		} else
 			spin_lock(&busiest->lock);
 	}
+	/* update_rq_clock(this_rq); */
+	update_rq_clock(busiest);
 }

 /*
@@ -2258,8 +2486,6 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array,
 	set_task_cpu(p, this_cpu);
 	inc_nr_running(p, this_rq);
 	enqueue_task(p, this_array);
-	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
-				+ this_rq->timestamp_last_tick;
 	/*
 	 * Note that idle threads have a prio of MAX_PRIO, for this test
 	 * to be always true for them.
@@ -2298,7 +2524,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	if (sd->nr_balance_failed > sd->cache_nice_tries)
 		return 1;

-	if (task_hot(p, rq->timestamp_last_tick, sd))
+	if (task_hot(p, rq->clock, sd))
 		return 0;
 	return 1;
 }

@@ -2397,7 +2623,7 @@ skip_queue:
 	}

 #ifdef CONFIG_SCHEDSTATS
-	if (task_hot(tmp, busiest->timestamp_last_tick, sd))
+	if (task_hot(tmp, busiest->clock, sd))
 		schedstat_inc(sd, lb_hot_gained[idle]);
 #endif

@@ -3131,7 +3357,7 @@ unsigned long long current_sched_time(const struct task_struct *p)
 	local_irq_save(flags);
 	ns = max(p->timestamp, task_rq(p)->timestamp_last_tick);
-	ns = p->sched_time + sched_clock() - ns;
+	ns = p->sched_time + sched_clock_local() - ns;
 	local_irq_restore(flags);

 	return ns;

@@ -3242,16 +3468,22 @@ void account_steal_time(struct task_struct *p, cputime_t steal)
  */
 void scheduler_tick(void)
 {
-	unsigned long long now = sched_clock();
+	unsigned long long now;
 	struct task_struct *p = current;
 	int cpu = smp_processor_id();
 	struct rq *rq = cpu_rq(cpu);

+	sched_clock_tick();
+
+	spin_lock(&rq->lock);
+	update_rq_clock(rq);
+	now = rq->clock;
 	update_cpu_clock(p, rq, now);

 	rq->timestamp_last_tick = now;

 	if (p == rq->idle) {
+		spin_unlock(&rq->lock);
 		if (wake_priority_sleeper(rq))
 			goto out;
 		rebalance_tick(cpu, rq, SCHED_IDLE);
@@ -3261,9 +3493,8 @@ void scheduler_tick(void)
 	/* Task might have expired already, but not scheduled off yet */
 	if (p->array != rq->active) {
 		set_tsk_need_resched(p);
-		goto out;
+		goto out_unlock;
 	}
-	spin_lock(&rq->lock);
 	/*
 	 * The task was running during this tick - update the
 	 * time slice counter. Note: we do not update a thread's

@@ -3551,7 +3782,8 @@ need_resched_nonpreemptible:
 	schedstat_inc(rq, sched_cnt);
 	spin_lock_irq(&rq->lock);

-	now = sched_clock();
+	update_rq_clock(rq);
+	now = rq->clock;
 	if (likely((long long)(now - prev->timestamp) < NS_MAX_SLEEP_AVG)) {
 		run_time = now - prev->timestamp;
 		if (unlikely((long long)(now - prev->timestamp) < 0))

@@ -5096,7 +5328,9 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long flags;

-	idle->timestamp = sched_clock();
+	local_irq_save(flags);
+
+	idle->timestamp = sched_clock_local();
 	idle->sleep_avg = 0;
 	idle->array = NULL;
 	idle->prio = idle->normal_prio = MAX_PRIO;
@@ -5104,7 +5338,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);

-	spin_lock_irqsave(&rq->lock, flags);
+	spin_lock(&rq->lock);
+
 	rq->curr = rq->idle = idle;
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	idle->oncpu = 1;

@@ -5219,14 +5454,6 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 	set_task_cpu(p, dest_cpu);
 	if (p->array) {
-		/*
-		 * Sync timestamp with rq_dest's before activating.
-		 * The same thing could be achieved by doing this step
-		 * afterwards, and pretending it was a local activate.
-		 * This way is cleaner and logically correct.
-		 */
-		p->timestamp = p->timestamp - rq_src->timestamp_last_tick
-				+ rq_dest->timestamp_last_tick;
 		deactivate_task(p, rq_src);
 		__activate_task(p, rq_dest);
 		if (TASK_PREEMPTS_CURR(p, rq_dest))

@@ -5265,6 +5492,8 @@ static int migration_thread(void *data)
 			goto wait_to_die;
 		}

+		update_rq_clock(rq);
+
 		if (rq->active_balance) {
 			active_load_balance(rq, cpu);
 			rq->active_balance = 0;
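For completeness, a sketch of how the newly exported interface is meant
to be consumed; the caller below is hypothetical and not part of this
patch:

/*
 * Hypothetical consumer: cpu_clock() does its own local_irq_save()/
 * local_irq_restore() around sched_clock_cpu(), so callers need no
 * extra locking and the returned nanoseconds are comparable across
 * CPUs.
 */
static void time_some_work(void)
{
	int cpu = get_cpu();
	unsigned long long t0 = cpu_clock(cpu);

	do_some_work();		/* placeholder for the code being timed */

	printk(KERN_DEBUG "work took %llu ns\n", cpu_clock(cpu) - t0);
	put_cpu();
}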