From: Jeremy Fitzhardinge Date: Wed, 18 Jul 2007 01:37:05 +0000 (-0700) Subject: xen: Account for stolen time X-Git-Tag: v2.6.23-rc1~498^2~17 X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=f91a8b447b9af64f589f6e13fec7f09b5927563d;p=linux-2.6 xen: Account for stolen time This patch accounts for the time stolen from our VCPUs. Stolen time is time where a vcpu is runnable and could be running, but all available physical CPUs are being used for something else. This accounting gets run on each timer interrupt, just as a way to get it run relatively often, and when interesting things are going on. Stolen time is not really used by much in the kernel; it is reported in /proc/stat, and that's about it. Signed-off-by: Jeremy Fitzhardinge Acked-by: Chris Wright Cc: john stultz Cc: Rik van Riel --- diff --git a/arch/i386/xen/time.c b/arch/i386/xen/time.c index b457980ff3..acbfd99694 100644 --- a/arch/i386/xen/time.c +++ b/arch/i386/xen/time.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -25,6 +26,7 @@ /* Xen may fire a timer up to this many ns early */ #define TIMER_SLOP 100000 +#define NS_PER_TICK (1000000000LL / HZ) /* These are perodically updated in shared_info, and then copied here. 
*/ struct shadow_time_info { @@ -37,6 +39,139 @@ struct shadow_time_info { static DEFINE_PER_CPU(struct shadow_time_info, shadow_time); +/* runstate info updated by Xen */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); + +/* snapshots of runstate info */ +static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); + +/* unused ns of stolen and blocked time */ +static DEFINE_PER_CPU(u64, residual_stolen); +static DEFINE_PER_CPU(u64, residual_blocked); + +/* return a consistent snapshot of 64-bit time/counter value */ +static u64 get64(const u64 *p) +{ + u64 ret; + + if (BITS_PER_LONG < 64) { + u32 *p32 = (u32 *)p; + u32 h, l; + + /* + * Read high then low, and then make sure high is + * still the same; this will only loop if low wraps + * and carries into high. + * XXX some clean way to make this endian-proof? + */ + do { + h = p32[1]; + barrier(); + l = p32[0]; + barrier(); + } while (p32[1] != h); + + ret = (((u64)h) << 32) | l; + } else + ret = *p; + + return ret; +} + +/* + * Runstate accounting + */ +static void get_runstate_snapshot(struct vcpu_runstate_info *res) +{ + u64 state_time; + struct vcpu_runstate_info *state; + + preempt_disable(); + + state = &__get_cpu_var(runstate); + + /* + * The runstate info is always updated by the hypervisor on + * the current CPU, so there's no need to use anything + * stronger than a compiler barrier when fetching it. 
+ */ + do { + state_time = get64(&state->state_entry_time); + barrier(); + *res = *state; + barrier(); + } while (get64(&state->state_entry_time) != state_time); + + preempt_enable(); +} + +static void setup_runstate_info(int cpu) +{ + struct vcpu_register_runstate_memory_area area; + + area.addr.v = &per_cpu(runstate, cpu); + + if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, + cpu, &area)) + BUG(); +} + +static void do_stolen_accounting(void) +{ + struct vcpu_runstate_info state; + struct vcpu_runstate_info *snap; + s64 blocked, runnable, offline, stolen; + cputime_t ticks; + + get_runstate_snapshot(&state); + + WARN_ON(state.state != RUNSTATE_running); + + snap = &__get_cpu_var(runstate_snapshot); + + /* work out how much time the VCPU has not been runn*ing* */ + blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; + runnable = state.time[RUNSTATE_runnable] - snap->time[RUNSTATE_runnable]; + offline = state.time[RUNSTATE_offline] - snap->time[RUNSTATE_offline]; + + *snap = state; + + /* Add the appropriate number of ticks of stolen time, + including any left-overs from last time. Passing NULL to + account_steal_time accounts the time as stolen. */ + stolen = runnable + offline + __get_cpu_var(residual_stolen); + + if (stolen < 0) + stolen = 0; + + ticks = 0; + while (stolen >= NS_PER_TICK) { + ticks++; + stolen -= NS_PER_TICK; + } + __get_cpu_var(residual_stolen) = stolen; + account_steal_time(NULL, ticks); + + /* Add the appropriate number of ticks of blocked time, + including any left-overs from last time. Passing idle to + account_steal_time accounts the time as idle/wait. 
*/ + blocked += __get_cpu_var(residual_blocked); + + if (blocked < 0) + blocked = 0; + + ticks = 0; + while (blocked >= NS_PER_TICK) { + ticks++; + blocked -= NS_PER_TICK; + } + __get_cpu_var(residual_blocked) = blocked; + account_steal_time(idle_task(smp_processor_id()), ticks); +} + + + +/* Get the CPU speed from Xen */ unsigned long xen_cpu_khz(void) { u64 cpu_khz = 1000000ULL << 32; @@ -56,13 +191,11 @@ unsigned long xen_cpu_khz(void) * Reads a consistent set of time-base values from Xen, into a shadow data * area. */ -static void get_time_values_from_xen(void) +static unsigned get_time_values_from_xen(void) { struct vcpu_time_info *src; struct shadow_time_info *dst; - preempt_disable(); - /* src is shared memory with the hypervisor, so we need to make sure we get a consistent snapshot, even in the face of being preempted. */ @@ -79,7 +212,7 @@ static void get_time_values_from_xen(void) rmb(); /* test version after fetching data */ } while ((src->version & 1) | (dst->version ^ src->version)); - preempt_enable(); + return dst->version; } /* @@ -123,7 +256,7 @@ static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift) static u64 get_nsec_offset(struct shadow_time_info *shadow) { u64 now, delta; - rdtscll(now); + now = native_read_tsc(); delta = now - shadow->tsc_timestamp; return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); } @@ -132,10 +265,14 @@ cycle_t xen_clocksource_read(void) { struct shadow_time_info *shadow = &get_cpu_var(shadow_time); cycle_t ret; + unsigned version; - get_time_values_from_xen(); - - ret = shadow->system_timestamp + get_nsec_offset(shadow); + do { + version = get_time_values_from_xen(); + barrier(); + ret = shadow->system_timestamp + get_nsec_offset(shadow); + barrier(); + } while (version != __get_cpu_var(xen_vcpu)->time.version); put_cpu_var(shadow_time); @@ -352,6 +489,8 @@ static irqreturn_t xen_timer_interrupt(int irq, void *dev_id) ret = IRQ_HANDLED; } + do_stolen_accounting(); + return ret; } @@ -378,6 
+517,8 @@ static void xen_setup_timer(int cpu) evt->irq = irq; clockevents_register_device(evt); + setup_runstate_info(cpu); + put_cpu_var(xen_clock_events); } @@ -390,7 +531,7 @@ __init void xen_time_init(void) clocksource_register(&xen_clocksource); if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { - /* Successfully turned off 100hz tick, so we have the + /* Successfully turned off 100Hz tick, so we have the vcpuop-based timer interface */ printk(KERN_DEBUG "Xen: using vcpuop timer interface\n"); xen_clockevent = &xen_vcpuop_clockevent;