/* pvrdtscp algorithm
 *
 * This sample code demonstrates the use of the paravirtualized rdtscp
 * algorithm.  Using this algorithm, an application may communicate with
 * the Xen hypervisor (version 4.0+) to obtain timestamp information which
 * is both monotonically increasing and has a fixed 1 GHz rate, even across
 * migrations between machines with different TSC rates and offsets.
 * Further, the algorithm provides performance near the performance of a
 * native rdtsc/rdtscp instruction -- much faster than emulation PROVIDED
 * the application is running on a machine on which the rdtscp instruction
 * is supported and TSC is "safe".  The application must also be running in a
 * PV domain.  (HVM domains may be supported at a later time.)  On machines
 * where TSC is unsafe or the rdtscp instruction is not supported, Xen
 * (v4.0+) provides emulation which is slower but consistent with the pvrdtscp
 * algorithm, thus providing support for the algorithm for live migration
 * across all machines.
 *
 * More information can be found within the Xen (4.0+) source tree at
 * docs/misc/tscmode.txt
 *
 * Copyright (c) 2009 Oracle Corporation and/or its affiliates.
 * All rights reserved
 * Written by: Dan Magenheimer
 *
 * This code is derived from code licensed under the GNU
 * General Public License ("GPL") version 2 and is therefore itself
 * also licensed under the GPL version 2.
 *
 * This code is known to compile and run on Oracle Enterprise Linux 5 Update 2
 * using gcc version 4.1.2, but its purpose is to describe the pvrdtscp
 * algorithm and its ABI to Xen version 4.0+
 *
 * Provenance: added to the Xen tree as docs/misc/pvrdtscp.c by commit
 * 04656384a1b9714e43db850c51431008e23450d8 ("docs: Example usage of
 * pvrdtscp algorithm", committed by Keir Fraser on 2009-12-11,
 * Signed-off-by: Dan Magenheimer).
 *
 * Build note: define PVRDTSCP_NO_MAIN to compile only the helper functions
 * (e.g. to embed them in another program or a unit test).
 */

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

/* Fixed-width aliases; the register-sized CPUID/RDTSCP operands below rely
 * on u32 being exactly 32 bits wide. */
typedef uint16_t u16;
typedef uint32_t u32;
typedef uint64_t u64;
typedef int32_t  i32;
typedef int64_t  i64;

#define NSEC_PER_SEC 1000000000ULL

/*
 * Execute a plain CPUID instruction.
 *
 * idx/sub are loaded into EAX/ECX before the instruction; the four result
 * registers are stored through eax/ebx/ecx/edx.  The inputs are fed to the
 * asm directly (not via *eax / *ecx) so the call stays correct even if the
 * caller passes the same variable for several outputs.
 */
static inline void hvm_cpuid(u32 idx, u32 sub,
                             u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
    __asm__ __volatile__ ( "cpuid"
        : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
        : "0" (idx), "2" (sub) );
}

/*
 * Execute CPUID preceded by the byte sequence `ud2a ; "xen"`.
 *
 * Same register contract as hvm_cpuid().  The ud2a prefix is an invalid
 * opcode, so this only returns normally when something underneath intercepts
 * the sequence (main() relies on this by probing it in a child process);
 * see docs/misc/tscmode.txt for the Xen side of this ABI.
 */
static inline void pv_cpuid(u32 idx, u32 sub,
                            u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
    __asm__ __volatile__ ( "ud2a ; .ascii \"xen\"; cpuid"
        : "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx)
        : "0" (idx), "2" (sub) );
}

/*
 * Execute rdtscp (emitted as raw opcode bytes 0f 01 f9).
 *
 * Returns the 64-bit value assembled from EDX:EAX and stores ECX (the
 * "tsc aux" value) through aux.
 */
static inline u64 do_rdtscp(u32 *aux)
{
    u32 lo32, hi32;

    __asm__ __volatile__ ( ".byte 0x0f,0x01,0xf9"
        : "=a" (lo32), "=d" (hi32), "=c" (*aux) );
    return lo32 | ((u64)hi32 << 32);
}

/* Returns EBX of pv_cpuid leaf 0x40000003, sub-leaf 0 (the tsc_mode). */
static inline int get_xen_tsc_mode(void)
{
    u32 val, dummy1, dummy2, dummy3;

    pv_cpuid(0x40000003, 0, &dummy1, &val, &dummy2, &dummy3);
    return val;
}

/* Returns bit 0 of EAX of pv_cpuid leaf 0x40000003, sub-leaf 0. */
static inline int get_xen_vtsc(void)
{
    u32 val, dummy1, dummy2, dummy3;

    pv_cpuid(0x40000003, 0, &val, &dummy1, &dummy2, &dummy3);
    return val & 1;
}

/* Returns ECX of pv_cpuid leaf 0x40000003, sub-leaf 0. */
static inline int get_xen_vtsc_khz(void)
{
    u32 val, dummy1, dummy2, dummy3;

    pv_cpuid(0x40000003, 0, &dummy1, &dummy2, &val, &dummy3);
    return val;
}

/* Returns EAX of pv_cpuid leaf 0x40000003, sub-leaf 2. */
static inline u32 get_xen_cpu_khz(void)
{
    u32 cpu_khz, dummy1, dummy2, dummy3;

    pv_cpuid(0x40000003, 2, &cpu_khz, &dummy1, &dummy2, &dummy3);
    return cpu_khz;
}

/* Returns EDX of pv_cpuid leaf 0x40000003, sub-leaf 0. */
static inline u32 get_xen_incarnation(void)
{
    u32 incarn, dummy1, dummy2, dummy3;

    pv_cpuid(0x40000003, 0, &dummy1, &dummy2, &dummy3, &incarn);
    return incarn;
}

/*
 * Fetch the TSC-to-nanosecond conversion parameters from pv_cpuid leaf
 * 0x40000003, sub-leaf 1: EAX/EBX form the low/high halves of *offset,
 * ECX is stored in *mul_frac and EDX in *shift.
 */
static inline void get_xen_time_values(u64 *offset, u32 *mul_frac, u32 *shift)
{
    u32 off_lo, off_hi;

    pv_cpuid(0x40000003, 1, &off_lo, &off_hi, mul_frac, shift);
    *offset = off_lo | ((u64)off_hi << 32);
}

/*
 * Scale a TSC delta: shift delta by tsc_shift (right if negative, left
 * otherwise), then multiply by the 32.32 fixed-point fraction
 * tsc_mul_frac / 2^32, i.e. return (shifted_delta * tsc_mul_frac) >> 32
 * truncated to 64 bits.
 *
 * Implemented in portable C.  Splitting delta into 32-bit halves gives
 *     (hi * 2^32 + lo) * mul >> 32  ==  hi * mul + ((lo * mul) >> 32)
 * which needs no 128-bit intermediate and cannot overflow 64 bits.
 */
static inline u64 scale_delta(u64 delta, u32 tsc_mul_frac, i32 tsc_shift)
{
    u64 lo_product, hi_product;

    if ( tsc_shift < 0 )
        delta >>= -tsc_shift;
    else
        delta <<= tsc_shift;

    lo_product = (delta & 0xffffffffULL) * tsc_mul_frac;
    hi_product = (delta >> 32) * tsc_mul_frac;

    return hi_product + (lo_product >> 32);
}

/*
 * Return the current pvrdtscp timestamp in nanoseconds.
 *
 * *discontinuity is set to 1 when the tsc aux value returned by rdtscp
 * differs from the one seen previously (in which case the conversion
 * parameters are re-read from Xen and the TSC is sampled again), and to 0
 * otherwise.
 *
 * All state lives in function-local statics, so the function is neither
 * reentrant nor safe to call from several threads at once.
 */
static inline u64 get_pvrdtscp_timestamp(int *discontinuity)
{
    static int firsttime = 1;
    static u64 last_pvrdtscp_timestamp = 0;
    static u32 last_tsc_aux;
    static u64 xen_ns_offset;
    static u32 xen_tsc_to_ns_mul_frac, xen_tsc_to_ns_shift;
    u32 this_tsc_aux;
    u64 timestamp, cur_tsc, cur_ns;

    if ( firsttime ) {
        /* prime the cached aux value, conversion parameters and the
         * monotonicity baseline */
        cur_tsc = do_rdtscp(&last_tsc_aux);
        get_xen_time_values(&xen_ns_offset, &xen_tsc_to_ns_mul_frac,
                            &xen_tsc_to_ns_shift);
        cur_ns = scale_delta(cur_tsc, xen_tsc_to_ns_mul_frac,
                             (i32)xen_tsc_to_ns_shift);
        timestamp = cur_ns - xen_ns_offset;
        last_pvrdtscp_timestamp = timestamp;
        firsttime = 0;
    }

    cur_tsc = do_rdtscp(&this_tsc_aux);
    *discontinuity = 0;
    while ( this_tsc_aux != last_tsc_aux ) {
        /* if tsc_aux changed, refresh the parameters and try again until
         * a TSC sample is taken under the same aux value */
        last_tsc_aux = this_tsc_aux;
        get_xen_time_values(&xen_ns_offset, &xen_tsc_to_ns_mul_frac,
                            &xen_tsc_to_ns_shift);
        cur_tsc = do_rdtscp(&this_tsc_aux);
        *discontinuity = 1;
    }

    /* compute nsec from TSC and Xen time values */
    cur_ns = scale_delta(cur_tsc, xen_tsc_to_ns_mul_frac,
                         (i32)xen_tsc_to_ns_shift);
    timestamp = cur_ns - xen_ns_offset;

    /* enforce monotonicity just in case */
    if ( (i64)(timestamp - last_pvrdtscp_timestamp) > 0 )
        last_pvrdtscp_timestamp = timestamp;
    else {
        /* this should never happen but we'll check it anyway in case
         * some strange combination of scaling errors occurs across a
         * very fast migration */
        printf("Time went backwards by %lluns\n",
               (unsigned long long)(last_pvrdtscp_timestamp - timestamp));
        timestamp = ++last_pvrdtscp_timestamp;
    }
    return timestamp;
}

#define HVM 1
#define PVM 0

/*
 * Probe for the "XenVMMXenVMM" CPUID signature.
 *
 * Scans candidate base leaves 0x40000000..0x4000ff00 in steps of 0x100,
 * using plain CPUID when hvm == HVM and the pv_cpuid sequence otherwise.
 * A base matches when EBX:ECX:EDX spell the signature and EAX (the highest
 * leaf reported) is at least base+2.  On a match the version words from
 * EAX of leaf base+1 are stored in *version_major / *version_minor and 1 is
 * returned; otherwise 0 is returned and the outputs are left untouched.
 *
 * NOTE: the get_xen_*() helpers above hard-code leaf 0x40000003, i.e. they
 * assume the base found here is 0x40000000.
 */
static inline int running_on_xen(int hvm, u16 *version_major,
                                 u16 *version_minor)
{
    u32 eax, ebx, ecx, edx, base;
    union { char csig[16]; u32 u[4]; } sig;

    for ( base = 0x40000000; base < 0x40010000; base += 0x100 ) {
        if ( hvm == HVM )
            hvm_cpuid(base, 0, &eax, &ebx, &ecx, &edx);
        else
            pv_cpuid(base, 0, &eax, &ebx, &ecx, &edx);
        sig.u[0] = ebx; sig.u[1] = ecx; sig.u[2] = edx;
        sig.csig[12] = '\0';
        if ( !strcmp("XenVMMXenVMM", &sig.csig[0]) && (eax >= (base + 2)) ) {
            if ( hvm == HVM )
                hvm_cpuid(base + 1, 0, &eax, &ebx, &ecx, &edx);
            else
                pv_cpuid(base + 1, 0, &eax, &ebx, &ecx, &edx);
            *version_major = (eax >> 16) & 0xffff;
            *version_minor = eax & 0xffff;
            return 1;
        }
    }
    return 0;
}

#ifndef PVRDTSCP_NO_MAIN
/*
 * Demo driver: verify we are in a Xen PV domain with tsc_mode 3, then loop
 * forever sampling pvrdtscp timestamps and reporting discontinuities,
 * backward steps and forward jumps.
 */
int main(void)
{
    u32 eax, ebx, ecx, edx;
    u16 version_hi, version_lo;
    u64 ts, last_ts;
    int status, discontinuity = 0;
    pid_t pid;

    if ( running_on_xen(HVM, &version_hi, &version_lo) ) {
        printf("running on Xen v%d.%d as an HVM domain, "
               "pvrdtsc not supported, exiting\n",
               (int)version_hi, (int)version_lo);
        exit(0);
    }

    /* The pv_cpuid sequence starts with an invalid opcode, so try it in a
     * child first: if the child does not exit normally we must not execute
     * the sequence in this process. */
    pid = fork();
    if ( pid == -1 ) {
        fprintf(stderr, "Huh? Fork failed\n");
        return 0;
    }
    else if ( pid == 0 ) { /* child */
        pv_cpuid(0x40000000, 0, &eax, &ebx, &ecx, &edx);
        exit(0);
    }
    /* only inspect status if waitpid() actually filled it in */
    if ( waitpid(pid, &status, 0) != pid || !WIFEXITED(status) )
        exit(0);

    if ( !running_on_xen(PVM, &version_hi, &version_lo) ) {
        printf("not running on Xen, exiting\n");
        exit(0);
    }
    printf("running on Xen v%d.%d as a PV domain\n",
           (int)version_hi, (int)version_lo);
    if ( version_hi <= 3 ) {
        printf("pvrdtscp requires Xen version 4.0 or greater\n");
        /* exit(0); FIXME after xen-unstable is officially v4.0 */
    }
    if ( get_xen_tsc_mode() != 3 ) {
        printf("tsc_mode not pvrdtscp, set tsc_mode=3, exiting\n");
        /* actually exit, as the message says; same status as the other
         * early exits in this program */
        exit(0);
    }

    /* OK, we are on Xen, now loop forever checking timestamps */
    ts = get_pvrdtscp_timestamp(&discontinuity);
    printf("Starting with ts=%lluns 0x%llx (%llusec)\n",
           (unsigned long long)ts, (unsigned long long)ts,
           (unsigned long long)(ts / NSEC_PER_SEC));
    printf("incarn=%lu: vtsc=%d, vtsc_khz=%lu, phys cpu_khz=%lu\n",
           (unsigned long)get_xen_incarnation(),
           get_xen_vtsc(),
           (unsigned long)get_xen_vtsc_khz(),
           (unsigned long)get_xen_cpu_khz());
    ts = get_pvrdtscp_timestamp(&discontinuity);
    last_ts = ts;
    while ( 1 ) {
        ts = get_pvrdtscp_timestamp(&discontinuity);
        if ( discontinuity )
            printf("migrated/restored, incarn=%lu: "
                   "vtsc now %d, vtsc_khz=%lu, phys cpu_khz=%lu\n",
                   (unsigned long)get_xen_incarnation(),
                   get_xen_vtsc(),
                   (unsigned long)get_xen_vtsc_khz(),
                   (unsigned long)get_xen_cpu_khz());
        if ( ts < last_ts )
            /* this should NEVER happen, especially since there
             * is a check for it in get_pvrdtscp_timestamp() */
            printf("Time went backwards: %lluns (%llusec)\n",
                   (unsigned long long)(last_ts - ts),
                   (unsigned long long)((last_ts - ts) / NSEC_PER_SEC));
        if ( ts > last_ts + 200000000LL )
            /* a gap above the 0.2sec threshold is OK: usually about 2sec
             * for save/restore and a fraction of a second for live
             * migrate */
            printf("Time jumped forward %lluns (%llusec)\n",
                   (unsigned long long)(ts - last_ts),
                   (unsigned long long)((ts - last_ts) / NSEC_PER_SEC));
        last_ts = ts;
    }
}
#endif /* PVRDTSCP_NO_MAIN */