/****************************************************************************
 * (C) 2009 - George Dunlap - Citrix Systems R&D UK, Ltd
 ****************************************************************************
 *
 *        File: common/csched_credit2.c
 *      Author: George Dunlap
 *
 * Description: Credit-based SMP CPU scheduler
 * Based on an earlier version by Emmanuel Ackaouy.
 */

#include <xen/config.h>
#include <xen/init.h>
#include <xen/lib.h>
#include <xen/sched.h>
#include <xen/domain.h>
#include <xen/delay.h>
#include <xen/event.h>
#include <xen/time.h>
#include <xen/perfc.h>
#include <xen/sched-if.h>
#include <xen/softirq.h>
#include <asm/atomic.h>
#include <xen/errno.h>
#include <xen/trace.h>
#include <xen/cpu.h>

#if __i386__
#define PRI_stime "lld"
#else
#define PRI_stime "ld"
#endif

#define d2printk(x...)
//#define d2printk printk

#define TRC_CSCHED2_TICK        TRC_SCHED_CLASS + 1
#define TRC_CSCHED2_RUNQ_POS    TRC_SCHED_CLASS + 2
#define TRC_CSCHED2_CREDIT_BURN TRC_SCHED_CLASS + 3
#define TRC_CSCHED2_CREDIT_ADD  TRC_SCHED_CLASS + 4
#define TRC_CSCHED2_TICKLE_CHECK TRC_SCHED_CLASS + 5
#define TRC_CSCHED2_TICKLE       TRC_SCHED_CLASS + 6
#define TRC_CSCHED2_CREDIT_RESET TRC_SCHED_CLASS + 7
#define TRC_CSCHED2_SCHED_TASKLET TRC_SCHED_CLASS + 8
#define TRC_CSCHED2_UPDATE_LOAD   TRC_SCHED_CLASS + 9
#define TRC_CSCHED2_RUNQ_ASSIGN   TRC_SCHED_CLASS + 10
#define TRC_CSCHED2_UPDATE_VCPU_LOAD   TRC_SCHED_CLASS + 11
#define TRC_CSCHED2_UPDATE_RUNQ_LOAD   TRC_SCHED_CLASS + 12

/*
 * WARNING: This is still in an experimental phase.  Status and work can be found at the
 * credit2 wiki page:
 *  http://wiki.xensource.com/xenwiki/Credit2_Scheduler_Development
 * TODO:
 * + Immediate bug-fixes
 *  - Do per-runqueue, grab proper lock for dump debugkey
 * + Multiple sockets
 *  - Detect cpu layout and make runqueue map, one per L2 (make_runq_map())
 *  - Simple load balancer / runqueue assignment
 *  - Runqueue load measurement
 *  - Load-based load balancer
 * + Hyperthreading
 *  - Look for non-busy core if possible
 *  - "Discount" time run on a thread with busy siblings
 * + Algorithm:
 *  - "Mixed work" problem: if a VM is playing audio (5%) but also burning cpu (e.g.,
 *    a flash animation in the background), can we schedule it with low enough latency
 *    so that audio doesn't skip?
 *  - Cap and reservation: How to implement with the current system?
 * + Optimizing
 *  - Profiling, making new algorithms, making math more efficient (no long division)
 */

/*
 * Design:
 *
 * VMs "burn" credits based on their weight; higher weight means
 * credits burn more slowly.  The highest weight vcpu burns credits at
 * a rate of 1 credit per nanosecond.  Others burn proportionally
 * more.
 *
 * vcpus are inserted into the runqueue by credit order.
 *
 * Credits are "reset" when the next vcpu in the runqueue is less than
 * or equal to zero.  At that point, everyone's credits are "clipped"
 * to a small value, and a fixed credit is added to everyone.
 *
 * The plan is for all cores that share an L2 to share the same
 * runqueue.  At the moment, there is one global runqueue for all
 * cores.
 */
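
/*
 * For example, with the defaults below: a vcpu at the maximum weight on
 * its runqueue burns 1 credit per nanosecond of run time, so after a
 * reset (CSCHED_CREDIT_INIT == MILLISECS(10)) it can run for at most
 * about 10ms before triggering the next reset.  A vcpu with half the
 * maximum weight burns credits twice as fast and exhausts the same
 * allocation in about 5ms of run time.
 */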

/*
 * Locking:
 * - Schedule-lock is per-runqueue
 *  + Protects runqueue data, runqueue insertion, &c
 *  + Also protects updates to private sched vcpu structure
 *  + Must be grabbed using vcpu_schedule_lock_irq() to make sure vcpu->processor
 *    doesn't change under our feet.
 * - Private data lock
 *  + Protects access to global domain list
 *  + All other private data is written at init and only read afterwards.
 * Ordering:
 * - We grab the private lock before a schedule lock when updating
 *   domain weight; so we must never grab the private lock while a
 *   schedule lock is held.
 */
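
/*
 * (For instance, csched_dom_cntl() below follows this ordering: it takes
 * the private lock first and then each vcpu's schedule lock in turn while
 * updating weights.)
 */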

/*
 * Basic constants
 */
/* Default weight: How much a new domain starts with */
#define CSCHED_DEFAULT_WEIGHT       256
/* Min timer: Minimum length for which a timer will be set, for
 * efficiency. */
#define CSCHED_MIN_TIMER            MICROSECS(500)
/* Amount of credit VMs begin with, and are reset to.
 * ATM, set so that highest-weight VMs can only run for 10ms
 * before a reset event. */
#define CSCHED_CREDIT_INIT          MILLISECS(10)
/* Carryover: How much "extra" credit may be carried over after
 * a reset. */
#define CSCHED_CARRYOVER_MAX        CSCHED_MIN_TIMER
/* Stickiness: Cross-L2 migration resistance.  Should be less than
 * MIN_TIMER. */
#define CSCHED_MIGRATE_RESIST       ((opt_migrate_resist)*MICROSECS(1))
/* How much to "compensate" a vcpu for L2 migration */
#define CSCHED_MIGRATE_COMPENSATION MICROSECS(50)
/* Reset: Value below which credit will be reset. */
#define CSCHED_CREDIT_RESET         0
/* Max timer: Maximum time a guest can be run for. */
#define CSCHED_MAX_TIMER            MILLISECS(2)


#define CSCHED_IDLE_CREDIT                 (-(1<<30))

/*
 * Flags
 */
/* CSFLAG_scheduled: Is this vcpu either running on, or context-switching off,
 * a physical cpu?
 * + Accessed only with runqueue lock held
 * + Set when chosen as next in csched_schedule().
 * + Cleared after context switch has been saved in csched_context_saved()
 * + Checked in vcpu_wake to see if we can add to the runqueue, or if we should
 *   set CSFLAG_delayed_runq_add
 * + Checked to be false in runq_insert.
 */
#define __CSFLAG_scheduled 1
#define CSFLAG_scheduled (1<<__CSFLAG_scheduled)
/* CSFLAG_delayed_runq_add: Do we need to add this to the runqueue once it's done
 * being context switched out?
 * + Set when scheduling out in csched_schedule() if prev is runnable
 * + Set in csched_vcpu_wake if it finds CSFLAG_scheduled set
 * + Read in csched_context_saved().  If set, it adds prev to the runqueue and
 *   clears the bit.
 */
#define __CSFLAG_delayed_runq_add 2
#define CSFLAG_delayed_runq_add (1<<__CSFLAG_delayed_runq_add)
/* CSFLAG_runq_migrate_request: This vcpu is being migrated as a result of a
 * credit2-initiated runq migrate request; migrate it to the runqueue indicated
 * in the svc struct. 
 */
#define __CSFLAG_runq_migrate_request 3
#define CSFLAG_runq_migrate_request (1<<__CSFLAG_runq_migrate_request)
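
/* Note: the __CSFLAG_* values are bit numbers for use with set_bit(),
 * clear_bit() and test_bit() on svc->flags; the CSFLAG_* values are the
 * corresponding masks. */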


int opt_migrate_resist=500;
integer_param("sched_credit2_migrate_resist", opt_migrate_resist);

/*
 * Useful macros
 */
#define CSCHED_PRIV(_ops)   \
    ((struct csched_private *)((_ops)->sched_data))
#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
#define CSCHED_DOM(_dom)    ((struct csched_dom *) (_dom)->sched_priv)
#define CSCHED_CPUONLINE(_pool)    \
    (((_pool) == NULL) ? &cpupool_free_cpus : &(_pool)->cpu_valid)
/* CPU to runq_id macro */
#define c2r(_ops, _cpu)     (CSCHED_PRIV(_ops)->runq_map[(_cpu)])
/* CPU to runqueue struct macro */
#define RQD(_ops, _cpu)     (&CSCHED_PRIV(_ops)->rqd[c2r(_ops, _cpu)])

/*
 * Shifts for load average.
 * - granularity: Reduce granularity of time by a factor of about 1000 (2^10), so we can use 32-bit maths
 * - window shift: Given granularity shift, make the window about 1 second
 * - scale shift: Shift up load by this amount rather than using fractions; 128 corresponds 
 *   to a load of 1.
 */
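
/*
 * For reference, the decaying averages below are updated as
 * (with W == load_window_shift and delta == time since last update,
 * in granularity units):
 *   avgload' = ( delta * (load << W) + ((1 << W) - delta) * avgload ) >> W
 * i.e. a linear interpolation between the scaled instantaneous load and
 * the previous average, weighted by how much of the 2^W window has
 * elapsed.  If more than a full window has passed, the average is simply
 * set to (load << W).
 */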
#define LOADAVG_GRANULARITY_SHIFT (10)
int opt_load_window_shift=18;
#define  LOADAVG_WINDOW_SHIFT_MIN 4
integer_param("credit2_load_window_shift", opt_load_window_shift);
int opt_underload_balance_tolerance=0;
integer_param("credit2_balance_under", opt_underload_balance_tolerance);
int opt_overload_balance_tolerance=-3;
integer_param("credit2_balance_over", opt_overload_balance_tolerance);

/*
 * Per-runqueue data
 */
struct csched_runqueue_data {
    int id;

    spinlock_t lock;      /* Lock for this runqueue. */
    cpumask_t active;      /* CPUs enabled for this runqueue */

    struct list_head runq; /* Ordered list of runnable vms */
    struct list_head svc;  /* List of all vcpus assigned to this runqueue */
    int max_weight;

    cpumask_t idle,        /* Currently idle */
        tickled;           /* Cpus poked to reschedule that haven't rescheduled yet */
    int load;              /* Instantaneous load: length of queue + num non-idle threads */
    s_time_t load_last_update;  /* Last time average was updated */
    s_time_t avgload;           /* Decaying queue load */
    s_time_t b_avgload;         /* Decaying queue load modified by balancing */
};

/*
 * System-wide private data
 */
struct csched_private {
    spinlock_t lock;
    cpumask_t initialized; /* CPU is initialized for this pool */
    
    struct list_head sdom; /* Used mostly for dump keyhandler. */

    int runq_map[NR_CPUS];
    cpumask_t active_queues; /* Queues which may have active cpus */
    struct csched_runqueue_data rqd[NR_CPUS];

    int load_window_shift;
};

/*
 * Virtual CPU
 */
struct csched_vcpu {
    struct list_head rqd_elem;  /* On the runqueue data list */
    struct list_head sdom_elem; /* On the domain vcpu list */
    struct list_head runq_elem; /* On the runqueue         */
    struct csched_runqueue_data *rqd; /* Up-pointer to the runqueue */

    /* Up-pointers */
    struct csched_dom *sdom;
    struct vcpu *vcpu;

    int weight;

    int credit;
    s_time_t start_time; /* When we were scheduled (used for credit) */
    unsigned flags;      /* 16 bits doesn't seem to play well with clear_bit() */

    /* Individual contribution to load */
    s_time_t load_last_update;  /* Last time average was updated */
    s_time_t avgload;           /* Decaying queue load */

    struct csched_runqueue_data *migrate_rqd; /* Pre-determined rqd to which to migrate */
};

/*
 * Domain
 */
struct csched_dom {
    struct list_head vcpu;
    struct list_head sdom_elem;
    struct domain *dom;
    uint16_t weight;
    uint16_t nr_vcpus;
};


/*
 * Time-to-credit, credit-to-time.
 * FIXME: Do pre-calculated division?
 */
static s_time_t t2c(struct csched_runqueue_data *rqd, s_time_t time, struct csched_vcpu *svc)
{
    return time * rqd->max_weight / svc->weight;
}

static s_time_t c2t(struct csched_runqueue_data *rqd, s_time_t credit, struct csched_vcpu *svc)
{
    return credit * svc->weight / rqd->max_weight;
}
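
/*
 * Example: with rqd->max_weight == 256, a vcpu of weight 256 is charged
 * t2c(1000ns) == 1000 credits for 1us of run time (1 credit per ns),
 * while a vcpu of weight 128 is charged 2000 credits for the same 1us.
 */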

/*
 * Runqueue related code
 */

static /*inline*/ int
__vcpu_on_runq(struct csched_vcpu *svc)
{
    return !list_empty(&svc->runq_elem);
}

static /*inline*/ struct csched_vcpu *
__runq_elem(struct list_head *elem)
{
    return list_entry(elem, struct csched_vcpu, runq_elem);
}

static void
__update_runq_load(const struct scheduler *ops,
                  struct csched_runqueue_data *rqd, int change, s_time_t now)
{
    struct csched_private *prv = CSCHED_PRIV(ops);
    s_time_t delta=-1;

    now >>= LOADAVG_GRANULARITY_SHIFT;

    if ( rqd->load_last_update + (1ULL<<prv->load_window_shift) < now )
    {
        rqd->avgload = (unsigned long long)rqd->load << prv->load_window_shift;
        rqd->b_avgload = (unsigned long long)rqd->load << prv->load_window_shift;
    }
    else
    {
        delta = now - rqd->load_last_update;

        rqd->avgload =
            ( ( delta * ( (unsigned long long)rqd->load << prv->load_window_shift ) )
              + ( ((1ULL<<prv->load_window_shift) - delta) * rqd->avgload ) ) >> prv->load_window_shift;

        rqd->b_avgload =
            ( ( delta * ( (unsigned long long)rqd->load << prv->load_window_shift ) )
              + ( ((1ULL<<prv->load_window_shift) - delta) * rqd->b_avgload ) ) >> prv->load_window_shift;
    }
    rqd->load += change;
    rqd->load_last_update = now;

    {
        struct {
            unsigned rq_load:4, rq_avgload:28;
            unsigned rq_id:4, b_avgload:28;
        } d;
        d.rq_id=rqd->id;
        d.rq_load = rqd->load;
        d.rq_avgload = rqd->avgload;
        d.b_avgload = rqd->b_avgload;
        trace_var(TRC_CSCHED2_UPDATE_RUNQ_LOAD, 1,
                  sizeof(d),
                  (unsigned char *)&d);
    }
}

static void
__update_svc_load(const struct scheduler *ops,
                  struct csched_vcpu *svc, int change, s_time_t now)
{
    struct csched_private *prv = CSCHED_PRIV(ops);
    s_time_t delta=-1;
    int vcpu_load;

    if ( change == -1 )
        vcpu_load = 1;
    else if ( change == 1 )
        vcpu_load = 0;
    else
        vcpu_load = vcpu_runnable(svc->vcpu);

    now >>= LOADAVG_GRANULARITY_SHIFT;

    if ( svc->load_last_update + (1ULL<<prv->load_window_shift) < now )
    {
        svc->avgload = (unsigned long long)vcpu_load << prv->load_window_shift;
    }
    else
    {
        delta = now - svc->load_last_update;

        svc->avgload =
            ( ( delta * ( (unsigned long long)vcpu_load << prv->load_window_shift ) )
              + ( ((1ULL<<prv->load_window_shift) - delta) * svc->avgload ) ) >> prv->load_window_shift;
    }
    svc->load_last_update = now;

    {
        struct {
            unsigned dom:16,vcpu:16;
            unsigned v_avgload:32;
        } d;
        d.dom = svc->vcpu->domain->domain_id;
        d.vcpu = svc->vcpu->vcpu_id;
        d.v_avgload = svc->avgload;
        trace_var(TRC_CSCHED2_UPDATE_VCPU_LOAD, 1,
                  sizeof(d),
                  (unsigned char *)&d);
    }
}

static void
update_load(const struct scheduler *ops,
            struct csched_runqueue_data *rqd,
            struct csched_vcpu *svc, int change, s_time_t now)
{
    __update_runq_load(ops, rqd, change, now);
    if ( svc )
        __update_svc_load(ops, svc, change, now);
}

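/* Insert svc into the runqueue, keeping it sorted by decreasing credit;
 * returns the position at which it was inserted (used for tracing). */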
static int
__runq_insert(struct list_head *runq, struct csched_vcpu *svc)
{
    struct list_head *iter;
    int pos = 0;

    d2printk("rqi d%dv%d\n",
           svc->vcpu->domain->domain_id,
           svc->vcpu->vcpu_id);

    BUG_ON(&svc->rqd->runq != runq);
    /* Idle vcpus not allowed on the runqueue anymore */
    BUG_ON(is_idle_vcpu(svc->vcpu));
    BUG_ON(svc->vcpu->is_running);
    BUG_ON(test_bit(__CSFLAG_scheduled, &svc->flags));

    list_for_each( iter, runq )
    {
        struct csched_vcpu * iter_svc = __runq_elem(iter);

        if ( svc->credit > iter_svc->credit )
        {
            d2printk(" p%d d%dv%d\n",
                   pos,
                   iter_svc->vcpu->domain->domain_id,
                   iter_svc->vcpu->vcpu_id);
            break;
        }
        pos++;
    }

    list_add_tail(&svc->runq_elem, iter);

    return pos;
}

static void
runq_insert(const struct scheduler *ops, unsigned int cpu, struct csched_vcpu *svc)
{
    struct list_head * runq = &RQD(ops, cpu)->runq;
    int pos = 0;

    ASSERT( spin_is_locked(per_cpu(schedule_data, cpu).schedule_lock) );

    BUG_ON( __vcpu_on_runq(svc) );
    BUG_ON( c2r(ops, cpu) != c2r(ops, svc->vcpu->processor) );

    pos = __runq_insert(runq, svc);

    {
        struct {
            unsigned dom:16,vcpu:16;
            unsigned pos;
        } d;
        d.dom = svc->vcpu->domain->domain_id;
        d.vcpu = svc->vcpu->vcpu_id;
        d.pos = pos;
        trace_var(TRC_CSCHED2_RUNQ_POS, 0,
                  sizeof(d),
                  (unsigned char *)&d);
    }

    return;
}

static inline void
__runq_remove(struct csched_vcpu *svc)
{
    BUG_ON( !__vcpu_on_runq(svc) );
    list_del_init(&svc->runq_elem);
}

void burn_credits(struct csched_runqueue_data *rqd, struct csched_vcpu *, s_time_t);

/* Check to see if the item on the runqueue is higher priority than what's
 * currently running; if so, wake up the processor */
static /*inline*/ void
runq_tickle(const struct scheduler *ops, unsigned int cpu, struct csched_vcpu *new, s_time_t now)
{
    int i, ipid=-1;
    s_time_t lowest=(1<<30);
    struct csched_runqueue_data *rqd = RQD(ops, cpu);
    cpumask_t mask;
    struct csched_vcpu * cur;

    d2printk("rqt d%dv%d cd%dv%d\n",
             new->vcpu->domain->domain_id,
             new->vcpu->vcpu_id,
             current->domain->domain_id,
             current->vcpu_id);

    BUG_ON(new->vcpu->processor != cpu);
    BUG_ON(new->rqd != rqd);

    /* Look at the cpu it's running on first */
    cur = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    burn_credits(rqd, cur, now);

    if ( cur->credit < new->credit )
    {
        ipid = cpu;
        goto tickle;
    }
    
    /* Get a mask of idle, but not tickled */
    cpus_andnot(mask, rqd->idle, rqd->tickled);
    
    /* If it's not empty, choose one */
    if ( !cpus_empty(mask) )
    {
        ipid=first_cpu(mask);
        goto tickle;
    }

    /* Otherwise, look for the non-idle cpu with the lowest credit,
     * skipping cpus which have been tickled but not scheduled yet */
    cpus_andnot(mask, rqd->active, rqd->idle);
    cpus_andnot(mask, mask, rqd->tickled);

    for_each_cpu_mask(i, mask)
    {
        struct csched_vcpu * cur;

        /* Already looked at this one above */
        if ( i == cpu )
            continue;

        cur = CSCHED_VCPU(per_cpu(schedule_data, i).curr);

        BUG_ON(is_idle_vcpu(cur->vcpu));

        /* Update credits for current to see if we want to preempt */
        burn_credits(rqd, cur, now);

        if ( cur->credit < lowest )
        {
            ipid = i;
            lowest = cur->credit;
        }

        /* TRACE */ {
            struct {
                unsigned dom:16,vcpu:16;
                unsigned credit;
            } d;
            d.dom = cur->vcpu->domain->domain_id;
            d.vcpu = cur->vcpu->vcpu_id;
            d.credit = cur->credit;
            trace_var(TRC_CSCHED2_TICKLE_CHECK, 1,
                      sizeof(d),
                      (unsigned char *)&d);
        }
    }

    /* Only switch to another processor if the credit difference is greater
     * than the migrate resistance */
    if ( ipid == -1 || lowest + CSCHED_MIGRATE_RESIST > new->credit )
        goto no_tickle;

tickle:
    BUG_ON(ipid == -1);

    /* TRACE */ {
        struct {
            unsigned cpu:8;
        } d;
        d.cpu = ipid;
        trace_var(TRC_CSCHED2_TICKLE, 0,
                  sizeof(d),
                  (unsigned char *)&d);
    }
    cpu_set(ipid, rqd->tickled);
    cpu_raise_softirq(ipid, SCHEDULE_SOFTIRQ);

no_tickle:
    return;
}

/*
 * Credit-related code
 */
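/* Credit reset: clip every vcpu's credit on this runqueue to
 * CSCHED_CARRYOVER_MAX and add CSCHED_CREDIT_INIT, preserving relative
 * order.  Called from csched_schedule() when the vcpu chosen to run next
 * has credit <= CSCHED_CREDIT_RESET. */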
static void reset_credit(const struct scheduler *ops, int cpu, s_time_t now)
{
    struct csched_runqueue_data *rqd = RQD(ops, cpu);
    struct list_head *iter;

    list_for_each( iter, &rqd->svc )
    {
        struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, rqd_elem);

        int start_credit;

        BUG_ON( is_idle_vcpu(svc->vcpu) );
        BUG_ON( svc->rqd != rqd );

        start_credit = svc->credit;

        /* "Clip" credits to max carryover */
        if ( svc->credit > CSCHED_CARRYOVER_MAX )
            svc->credit = CSCHED_CARRYOVER_MAX;
        /* And add INIT */
        svc->credit += CSCHED_CREDIT_INIT;
        svc->start_time = now;

        /* TRACE */ {
            struct {
                unsigned dom:16,vcpu:16;
                unsigned credit_start, credit_end;
            } d;
            d.dom = svc->vcpu->domain->domain_id;
            d.vcpu = svc->vcpu->vcpu_id;
            d.credit_start = start_credit;
            d.credit_end = svc->credit;
            trace_var(TRC_CSCHED2_CREDIT_RESET, 1,
                      sizeof(d),
                      (unsigned char *)&d);
        }
    }

    /* No need to resort runqueue, as everyone's order should be the same. */
}

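/* Charge the currently-running vcpu for the time it has run since
 * svc->start_time, converting time to credits via t2c() (i.e. scaled by
 * weight).  Idle vcpus stay at CSCHED_IDLE_CREDIT and are never charged. */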
void burn_credits(struct csched_runqueue_data *rqd, struct csched_vcpu *svc, s_time_t now)
{
    s_time_t delta;

    /* Assert svc is current */
    ASSERT(svc==CSCHED_VCPU(per_cpu(schedule_data, svc->vcpu->processor).curr));

    if ( is_idle_vcpu(svc->vcpu) )
    {
        BUG_ON(svc->credit != CSCHED_IDLE_CREDIT);
        return;
    }

    delta = now - svc->start_time;

    if ( delta > 0 ) {
        /* This will round down; should we consider rounding up...? */
        svc->credit -= t2c(rqd, delta, svc);
        svc->start_time = now;

        d2printk("b d%dv%d c%d\n",
                 svc->vcpu->domain->domain_id,
                 svc->vcpu->vcpu_id,
                 svc->credit);
    } else {
        d2printk("%s: Time went backwards? now %"PRI_stime" start %"PRI_stime"\n",
               __func__, now, svc->start_time);
    }

    /* TRACE */
    {
        struct {
            unsigned dom:16,vcpu:16;
            unsigned credit;
            int delta;
        } d;
        d.dom = svc->vcpu->domain->domain_id;
        d.vcpu = svc->vcpu->vcpu_id;
        d.credit = svc->credit;
        d.delta = delta;
        trace_var(TRC_CSCHED2_CREDIT_BURN, 1,
                  sizeof(d),
                  (unsigned char *)&d);
    }
}

/* Find the domain with the highest weight. */
void update_max_weight(struct csched_runqueue_data *rqd, int new_weight, int old_weight)
{
    /* Try to avoid brute-force search:
     * - If new_weight is larger, max_weight <- new_weight
     * - If old_weight != max_weight, someone else is still max_weight
     *   (No action required)
     * - If old_weight == max_weight, brute-force search for max weight
     */
    if ( new_weight > rqd->max_weight )
    {
        rqd->max_weight = new_weight;
        d2printk("%s: Runqueue id %d max weight %d\n", __func__, rqd->id, rqd->max_weight);
    }
    else if ( old_weight == rqd->max_weight )
    {
        struct list_head *iter;
        int max_weight = 1;

        list_for_each( iter, &rqd->svc )
        {
            struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, rqd_elem);

            if ( svc->weight > max_weight )
                max_weight = svc->weight;
        }

        rqd->max_weight = max_weight;
        d2printk("%s: Runqueue %d max weight %d\n", __func__, rqd->id, rqd->max_weight);
    }
}

#ifndef NDEBUG
static /*inline*/ void
__csched_vcpu_check(struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( svc->vcpu != vc );
    BUG_ON( sdom != CSCHED_DOM(vc->domain) );
    if ( sdom )
    {
        BUG_ON( is_idle_vcpu(vc) );
        BUG_ON( sdom->dom != vc->domain );
    }
    else
    {
        BUG_ON( !is_idle_vcpu(vc) );
    }
}
#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
#else
#define CSCHED_VCPU_CHECK(_vc)
#endif

static void *
csched_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
{
    struct csched_vcpu *svc;

    /* Allocate per-VCPU info */
    svc = xmalloc(struct csched_vcpu);
    if ( svc == NULL )
        return NULL;
    memset(svc, 0, sizeof(*svc));

    INIT_LIST_HEAD(&svc->rqd_elem);
    INIT_LIST_HEAD(&svc->sdom_elem);
    INIT_LIST_HEAD(&svc->runq_elem);

    svc->sdom = dd;
    svc->vcpu = vc;
    svc->flags = 0U;

    if ( ! is_idle_vcpu(vc) )
    {
        BUG_ON( svc->sdom == NULL );

        svc->credit = CSCHED_CREDIT_INIT;
        svc->weight = svc->sdom->weight;
        /* Starting load of 50% */
        svc->avgload = 1ULL << (CSCHED_PRIV(ops)->load_window_shift - 1);
        svc->load_last_update = NOW();
    }
    else
    {
        BUG_ON( svc->sdom != NULL );
        svc->credit = CSCHED_IDLE_CREDIT;
        svc->weight = 0;
    }

    return svc;
}

/* Add and remove from runqueue assignment (not active run queue) */
static void
__runq_assign(struct csched_vcpu *svc, struct csched_runqueue_data *rqd)
{

    svc->rqd = rqd;
    list_add_tail(&svc->rqd_elem, &svc->rqd->svc);

    update_max_weight(svc->rqd, svc->weight, 0);

    /* Expected new load based on adding this vcpu */
    rqd->b_avgload += svc->avgload;

    /* TRACE */
    {
        struct {
            unsigned dom:16,vcpu:16;
            unsigned rqi:16;
        } d;
        d.dom = svc->vcpu->domain->domain_id;
        d.vcpu = svc->vcpu->vcpu_id;
        d.rqi=rqd->id;
        trace_var(TRC_CSCHED2_RUNQ_ASSIGN, 1,
                  sizeof(d),
                  (unsigned char *)&d);
    }

}

static void
runq_assign(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_vcpu *svc = vc->sched_priv;

    BUG_ON(svc->rqd != NULL);

    __runq_assign(svc, RQD(ops, vc->processor));
}

static void
__runq_deassign(struct csched_vcpu *svc)
{
    BUG_ON(__vcpu_on_runq(svc));

    list_del_init(&svc->rqd_elem);
    update_max_weight(svc->rqd, 0, svc->weight);

    /* Expected new load based on removing this vcpu */
    svc->rqd->b_avgload -= svc->avgload;

    svc->rqd = NULL;
}

static void
runq_deassign(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_vcpu *svc = vc->sched_priv;

    BUG_ON(svc->rqd != RQD(ops, vc->processor));

    __runq_deassign(svc);
}

static void
csched_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_vcpu *svc = vc->sched_priv;
    struct domain * const dom = vc->domain;
    struct csched_dom * const sdom = svc->sdom;

    printk("%s: Inserting d%dv%d\n",
           __func__, dom->domain_id, vc->vcpu_id);

    /* NB: On boot, idle vcpus are inserted before alloc_pdata() has
     * been called for that cpu.
     */
    if ( ! is_idle_vcpu(vc) )
    {
        /* FIXME: Do we need the private lock here? */
        list_add_tail(&svc->sdom_elem, &svc->sdom->vcpu);

        /* Add vcpu to runqueue of initial processor */
        vcpu_schedule_lock_irq(vc);

        runq_assign(ops, vc);

        vcpu_schedule_unlock_irq(vc);

        sdom->nr_vcpus++;
    }

    CSCHED_VCPU_CHECK(vc);
}

static void
csched_free_vdata(const struct scheduler *ops, void *priv)
{
    struct csched_vcpu *svc = priv;

    xfree(svc);
}

static void
csched_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    struct csched_dom * const sdom = svc->sdom;

    BUG_ON( sdom == NULL );
    BUG_ON( !list_empty(&svc->runq_elem) );

    if ( ! is_idle_vcpu(vc) )
    {
        /* Remove from runqueue */
        vcpu_schedule_lock_irq(vc);

        runq_deassign(ops, vc);

        vcpu_schedule_unlock_irq(vc);

        /* Remove from sdom list.  Don't need a lock for this, as it's called
         * synchronously when nothing else can happen. */
        list_del_init(&svc->sdom_elem);

        svc->sdom->nr_vcpus--;
    }
}

static void
csched_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);

    BUG_ON( is_idle_vcpu(vc) );

    if ( per_cpu(schedule_data, vc->processor).curr == vc )
        cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
    else if ( __vcpu_on_runq(svc) )
    {
        BUG_ON(svc->rqd != RQD(ops, vc->processor));
        update_load(ops, svc->rqd, svc, -1, NOW());
        __runq_remove(svc);
    }
    else if ( test_bit(__CSFLAG_delayed_runq_add, &svc->flags) )
        clear_bit(__CSFLAG_delayed_runq_add, &svc->flags);
}

static void
csched_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    s_time_t now = 0;

    /* Schedule lock should be held at this point. */

    d2printk("w d%dv%d\n", vc->domain->domain_id, vc->vcpu_id);

    BUG_ON( is_idle_vcpu(vc) );

    /* Make sure svc priority mod happens before runq check */
    if ( unlikely(per_cpu(schedule_data, vc->processor).curr == vc) )
    {
        goto out;
    }

    if ( unlikely(__vcpu_on_runq(svc)) )
    {
        /* If we've boosted someone that's already on a runqueue, prioritize
         * it and inform the cpu in question. */
        goto out;
    }

    /* If the context hasn't been saved for this vcpu yet, we can't put it on
     * another runqueue.  Instead, we set a flag so that it will be put on the runqueue
     * after the context has been saved. */
    if ( unlikely (test_bit(__CSFLAG_scheduled, &svc->flags) ) )
    {
        set_bit(__CSFLAG_delayed_runq_add, &svc->flags);
        goto out;
    }

    /* Add into the new runqueue if necessary */
    if ( svc->rqd == NULL )
        runq_assign(ops, vc);
    else
        BUG_ON(RQD(ops, vc->processor) != svc->rqd );

    now = NOW();

    update_load(ops, svc->rqd, svc, 1, now);
        
    /* Put the VCPU on the runq */
    runq_insert(ops, vc->processor, svc);
    runq_tickle(ops, vc->processor, svc, now);

out:
    d2printk("w-\n");
    return;
}

static void
csched_context_saved(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    s_time_t now = NOW();

    vcpu_schedule_lock_irq(vc);

    BUG_ON( !is_idle_vcpu(vc) && svc->rqd != RQD(ops, vc->processor));

    /* This vcpu is now eligible to be put on the runqueue again */
    clear_bit(__CSFLAG_scheduled, &svc->flags);

    /* If someone wants it on the runqueue, put it there. */
    /*
     * NB: We can get rid of CSFLAG_scheduled by checking for
     * vc->is_running and __vcpu_on_runq(svc) here.  However,
     * since we're accessing the flags cacheline anyway,
     * it seems a bit pointless; especially as we have plenty of
     * bits free.
     */
    if ( test_and_clear_bit(__CSFLAG_delayed_runq_add, &svc->flags)
         && likely(vcpu_runnable(vc)) )
    {
        BUG_ON(__vcpu_on_runq(svc));

        runq_insert(ops, vc->processor, svc);
        runq_tickle(ops, vc->processor, svc, now);
    }
    else if ( !is_idle_vcpu(vc) )
        update_load(ops, svc->rqd, svc, -1, now);

    vcpu_schedule_unlock_irq(vc);
}

#define MAX_LOAD (1ULL<<60)
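/* Pick a cpu for vc: honour a pending runq migrate request if there is
 * one; otherwise choose the active runqueue with the lowest balancing
 * load average (b_avgload) and return a cpu from it.  Falls back to
 * vc->processor if the necessary locks cannot be taken. */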
static int
choose_cpu(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_private *prv = CSCHED_PRIV(ops);
    int i, min_rqi = -1, new_cpu;
    struct csched_vcpu *svc = CSCHED_VCPU(vc);
    s_time_t min_avgload;

    BUG_ON(cpus_empty(prv->active_queues));

    /* Locking:
     * - vc->processor is already locked
     * - Need to grab prv lock to make sure active runqueues don't
     *   change
     * - Need to grab locks for other runqueues while checking
     *   avgload
     * Locking constraint is:
     * - Lock prv before runqueue locks
     * - Trylock between runqueue locks (no ordering)
     *
     * Since one of the runqueue locks is already held, we can't
     * just grab the prv lock.  Instead, we'll have to trylock, and
     * do something else reasonable if we fail.
     */

    if ( !spin_trylock(&prv->lock) )
    {
        if ( test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) )
        {
            d2printk("d%dv%d -\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id);
            clear_bit(__CSFLAG_runq_migrate_request, &svc->flags);
        }
        /* Leave it where it is for now.  When we actually pay attention
         * to affinity we'll have to figure something out... */
        return vc->processor;
    }

    /* First check to see if we're here because someone else suggested a place
     * for us to move. */
    if ( test_and_clear_bit(__CSFLAG_runq_migrate_request, &svc->flags) )
    {
        if ( unlikely(svc->migrate_rqd->id < 0) )
        {
            printk("%s: Runqueue migrate aborted because target runqueue disappeared!\n",
                   __func__);
            /* Fall-through to normal cpu pick */
        }
        else
        {
            d2printk("d%dv%d +\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id);
            new_cpu = first_cpu(svc->migrate_rqd->active);
            goto out_up;
        }
    }

    /* FIXME: Pay attention to cpu affinity */

    min_avgload = MAX_LOAD;

    /* Find the runqueue with the lowest instantaneous load */
    for_each_cpu_mask(i, prv->active_queues)
    {
        struct csched_runqueue_data *rqd;
        s_time_t rqd_avgload;

        rqd = prv->rqd + i;

        /* If checking a different runqueue, grab the lock,
         * read the avg, and then release the lock.
         *
         * If on our own runqueue, don't grab or release the lock;
         * but subtract our own load from the runqueue load to simulate
         * impartiality */
        if ( rqd == svc->rqd )
        {
            rqd_avgload = rqd->b_avgload - svc->avgload;
        }
        else if ( spin_trylock(&rqd->lock) )
        {
            rqd_avgload = rqd->b_avgload;
            spin_unlock(&rqd->lock);
        }
        else
            continue;

        if ( rqd_avgload < min_avgload )
        {
            min_avgload = rqd_avgload;
            min_rqi=i;
        }
    }

    /* We didn't find anyone (most likely because of spinlock contention); leave it where it is */
    if ( min_rqi == -1 )
        new_cpu = vc->processor;
    else
    {
        BUG_ON(cpus_empty(prv->rqd[min_rqi].active));
        new_cpu = first_cpu(prv->rqd[min_rqi].active);
    }

out_up:
    spin_unlock(&prv->lock);

    return new_cpu;
}

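/* Called after a credit reset: find the active runqueue whose load
 * differs most from ours and, if the difference is large enough, push,
 * pull, or swap vcpus between the two to narrow it. */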
static void balance_load(const struct scheduler *ops, int cpu, s_time_t now)
{
    struct csched_private *prv = CSCHED_PRIV(ops);
    int i, max_delta_rqi = -1;
    struct list_head *push_iter, *pull_iter;

    /* NB: Modified by consider() */
    s_time_t load_delta;
    struct csched_vcpu * best_push_svc=NULL, *best_pull_svc=NULL;
    /* NB: Read by consider() */
    struct csched_runqueue_data *lrqd;
    struct csched_runqueue_data *orqd;
    
    void consider(struct csched_vcpu *push_svc,
                  struct csched_vcpu *pull_svc)
    {
        s_time_t l_load, o_load, delta;

        l_load = lrqd->b_avgload;
        o_load = orqd->b_avgload;
        if ( push_svc )
        {
            /* What happens to the load on both if we push? */
            l_load -= push_svc->avgload;
            o_load += push_svc->avgload;
        }
        if ( pull_svc )
        {
            /* What happens to the load on both if we pull? */
            l_load += pull_svc->avgload;
            o_load -= pull_svc->avgload;
        }
        
        delta = l_load - o_load;
        if ( delta < 0 )
            delta = -delta;

        if ( delta < load_delta )
        {
            load_delta = delta;
            best_push_svc=push_svc;
            best_pull_svc=pull_svc;
        }
    }

    void migrate(struct csched_vcpu *svc, struct csched_runqueue_data *trqd)
    {
        if ( test_bit(__CSFLAG_scheduled, &svc->flags) )
        {
            d2printk("d%dv%d %d-%d a\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id,
                     svc->rqd->id, trqd->id);
            /* It's running; mark it to migrate. */
            svc->migrate_rqd = trqd;
            set_bit(_VPF_migrating, &svc->vcpu->pause_flags);
            set_bit(__CSFLAG_runq_migrate_request, &svc->flags);
        }
        else
        {
            int on_runq=0;
            /* It's not running; just move it */
            d2printk("d%dv%d %d-%d i\n", svc->vcpu->domain->domain_id, svc->vcpu->vcpu_id,
                     svc->rqd->id, trqd->id);
            if ( __vcpu_on_runq(svc) )
            {
                __runq_remove(svc);
                update_load(ops, svc->rqd, svc, -1, now);
                on_runq=1;
            }
            __runq_deassign(svc);
            svc->vcpu->processor = first_cpu(trqd->active);
            __runq_assign(svc, trqd);
            if ( on_runq )
            {
                update_load(ops, svc->rqd, svc, 1, now);
                runq_insert(ops, svc->vcpu->processor, svc);
                runq_tickle(ops, svc->vcpu->processor, svc, now);
            }
        }
    }
                  
    
    /*
     * Basic algorithm: Push, pull, or swap.
     * - Find the runqueue with the furthest load distance
     * - Find a pair that makes the difference the least (where one
     * on either side may be empty).
     */

    /* Locking:
     * - pcpu schedule lock should be already locked
     */
    lrqd = RQD(ops, cpu);

    __update_runq_load(ops, lrqd, 0, now);

retry:
    if ( !spin_trylock(&prv->lock) )
        return;

    load_delta = 0;

    for_each_cpu_mask(i, prv->active_queues)
    {
        s_time_t delta;
        
        orqd = prv->rqd + i;

        if ( orqd == lrqd
             || !spin_trylock(&orqd->lock) )
            continue;

        __update_runq_load(ops, orqd, 0, now);
    
        delta = lrqd->b_avgload - orqd->b_avgload;
        if ( delta < 0 )
            delta = -delta;

        if ( delta > load_delta )
        {
            load_delta = delta;
            max_delta_rqi = i;
        }

        spin_unlock(&orqd->lock);
    }

    /* Minimize holding the big lock */
    spin_unlock(&prv->lock);
    if ( max_delta_rqi == -1 )
        goto out;

    {
        s_time_t load_max;
        int cpus_max;

        
        load_max = lrqd->b_avgload;
        if ( orqd->b_avgload > load_max )
            load_max = orqd->b_avgload;

        cpus_max=cpus_weight(lrqd->active);
        if ( cpus_weight(orqd->active) > cpus_max )
            cpus_max = cpus_weight(orqd->active);

        /* If we're under 100% capacity, only shift if the load difference
         * is > 1.  Otherwise, only shift if the difference is more than 12.5%. */
        if ( load_max < (1ULL<<(prv->load_window_shift))*cpus_max )
        {
            if ( load_delta < (1ULL<<(prv->load_window_shift+opt_underload_balance_tolerance) ) )
                 goto out;
        }
        else
            if ( load_delta < (1ULL<<(prv->load_window_shift+opt_overload_balance_tolerance)) )
                goto out;
    }
             
    /* Try to grab the other runqueue lock; if it's been taken in the
     * meantime, try the process over again.  This can't deadlock
     * because if it doesn't get any other rqd locks, it will simply
     * give up and return. */
    orqd = prv->rqd + max_delta_rqi;
    if ( !spin_trylock(&orqd->lock) )
        goto retry;

    /* Make sure the runqueue hasn't been deactivated since we released prv->lock */
    if ( unlikely(orqd->id < 0) )
        goto out_up;

    /* Look for "swap" which gives the best load average
     * FIXME: O(n^2)! */

    /* Reuse load delta (as we're trying to minimize it) */
    list_for_each( push_iter, &lrqd->svc )
    {
        int inner_load_updated = 0;
        struct csched_vcpu * push_svc = list_entry(push_iter, struct csched_vcpu, rqd_elem);

        __update_svc_load(ops, push_svc, 0, now);

        /* Skip this one if it's already been flagged to migrate */
        if ( test_bit(__CSFLAG_runq_migrate_request, &push_svc->flags) )
            continue;

        list_for_each( pull_iter, &orqd->svc )
        {
            struct csched_vcpu * pull_svc = list_entry(pull_iter, struct csched_vcpu, rqd_elem);
            
            if ( ! inner_load_updated )
            {
                __update_svc_load(ops, pull_svc, 0, now);
            }
        
            /* Skip this one if it's already been flagged to migrate */
            if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
                continue;

            consider(push_svc, pull_svc);
        }

        inner_load_updated = 1;

        /* Consider push only */
        consider(push_svc, NULL);
    }

    list_for_each( pull_iter, &orqd->svc )
    {
        struct csched_vcpu * pull_svc = list_entry(pull_iter, struct csched_vcpu, rqd_elem);
        
        /* Skip this one if it's already been flagged to migrate */
        if ( test_bit(__CSFLAG_runq_migrate_request, &pull_svc->flags) )
            continue;

        /* Consider pull only */
        consider(NULL, pull_svc);
    }

    /* OK, now we have some candidates; do the moving */
    if ( best_push_svc )
        migrate(best_push_svc, orqd);
    if ( best_pull_svc )
        migrate(best_pull_svc, lrqd);

out_up:
    spin_unlock(&orqd->lock);

out:
    return;
}

static int
csched_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
{
    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
    int new_cpu;

    /* The scheduler interface doesn't have an explicit mechanism to
     * involve the choosable scheduler in the migrate process, so we
     * infer that a change may happen by the call to cpu_pick, and
     * remove it from the old runqueue while the lock for the old
     * runqueue is held.  It can't be actively waiting to run.  It
     * will be added to the new runqueue when it next wakes.
     *
     * If we want to be able to call pick() separately, we need to add
     * a mechanism to remove a vcpu from an old processor / runqueue
     * before releasing the lock. */
    BUG_ON(__vcpu_on_runq(svc));

    new_cpu = choose_cpu(ops, vc);
    /* If we're suggesting moving to a different runqueue, remove it
     * from the old runqueue while we have the lock.  It will be added
     * to the new one when it wakes. */
    if ( svc->rqd != NULL
         && RQD(ops, new_cpu) != svc->rqd )
        runq_deassign(ops, vc);

    return new_cpu;
}

static int
csched_dom_cntl(
    const struct scheduler *ops,
    struct domain *d,
    struct xen_domctl_scheduler_op *op)
{
    struct csched_dom * const sdom = CSCHED_DOM(d);
    struct csched_private *prv = CSCHED_PRIV(ops);
    unsigned long flags;

    if ( op->cmd == XEN_DOMCTL_SCHEDOP_getinfo )
    {
        op->u.credit2.weight = sdom->weight;
    }
    else
    {
        ASSERT(op->cmd == XEN_DOMCTL_SCHEDOP_putinfo);

        if ( op->u.credit2.weight != 0 )
        {
            struct list_head *iter;
            int old_weight;

            /* Must hold csched_priv lock to update sdom, runq lock to
             * update csvcs. */
            spin_lock_irqsave(&prv->lock, flags);

            old_weight = sdom->weight;

            sdom->weight = op->u.credit2.weight;

            /* Update weights for vcpus, and max_weight for runqueues on which they reside */
            list_for_each ( iter, &sdom->vcpu )
            {
                struct csched_vcpu *svc = list_entry(iter, struct csched_vcpu, sdom_elem);

                /* NB: Locking order is important here.  Because we grab this lock here, we
                 * must never lock csched_priv.lock if we're holding a runqueue
                 * lock. */
                vcpu_schedule_lock_irq(svc->vcpu);

                BUG_ON(svc->rqd != RQD(ops, svc->vcpu->processor));

                svc->weight = sdom->weight;
                update_max_weight(svc->rqd, svc->weight, old_weight);

                vcpu_schedule_unlock_irq(svc->vcpu);
            }

            spin_unlock_irqrestore(&prv->lock, flags);
        }
    }

    return 0;
}

static void *
csched_alloc_domdata(const struct scheduler *ops, struct domain *dom)
{
    struct csched_dom *sdom;
    unsigned long flags;

    sdom = xmalloc(struct csched_dom);
    if ( sdom == NULL )
        return NULL;
    memset(sdom, 0, sizeof(*sdom));

    /* Initialize credit and weight */
    INIT_LIST_HEAD(&sdom->vcpu);
    INIT_LIST_HEAD(&sdom->sdom_elem);
    sdom->dom = dom;
    sdom->weight = CSCHED_DEFAULT_WEIGHT;
    sdom->nr_vcpus = 0;

    spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags);

    list_add_tail(&sdom->sdom_elem, &CSCHED_PRIV(ops)->sdom);

    spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags);

    return (void *)sdom;
}

static int
csched_dom_init(const struct scheduler *ops, struct domain *dom)
{
    struct csched_dom *sdom;

    printk("%s: Initializing domain %d\n", __func__, dom->domain_id);

    if ( is_idle_domain(dom) )
        return 0;

    sdom = csched_alloc_domdata(ops, dom);
    if ( sdom == NULL )
        return -ENOMEM;

    dom->sched_priv = sdom;

    return 0;
}

static void
csched_free_domdata(const struct scheduler *ops, void *data)
{
    unsigned long flags;
    struct csched_dom *sdom = data;

    spin_lock_irqsave(&CSCHED_PRIV(ops)->lock, flags);

    list_del_init(&sdom->sdom_elem);

    spin_unlock_irqrestore(&CSCHED_PRIV(ops)->lock, flags);

    xfree(data);
}

static void
csched_dom_destroy(const struct scheduler *ops, struct domain *dom)
{
    struct csched_dom *sdom = CSCHED_DOM(dom);

    BUG_ON(!list_empty(&sdom->vcpu));

    csched_free_domdata(ops, CSCHED_DOM(dom));
}

/* How long should we let this vcpu run for? */
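/* Roughly: run until snext's credit would reach zero, or (if someone
 * else is waiting) until it would drop to the level of the next vcpu on
 * the runqueue, clamped to [CSCHED_MIN_TIMER, CSCHED_MAX_TIMER]. */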
static s_time_t
csched_runtime(const struct scheduler *ops, int cpu, struct csched_vcpu *snext)
{
    s_time_t time = CSCHED_MAX_TIMER;
    struct csched_runqueue_data *rqd = RQD(ops, cpu);
    struct list_head *runq = &rqd->runq;

    if ( is_idle_vcpu(snext->vcpu) )
        return CSCHED_MAX_TIMER;

    /* Basic time */
    time = c2t(rqd, snext->credit, snext);

    /* Next guy on runqueue */
    if ( ! list_empty(runq) )
    {
        struct csched_vcpu *svc = __runq_elem(runq->next);
        s_time_t ntime;

        if ( ! is_idle_vcpu(svc->vcpu) )
        {
            ntime = c2t(rqd, snext->credit - svc->credit, snext);

            if ( time > ntime )
                time = ntime;
        }
    }

    /* Check limits */
    if ( time < CSCHED_MIN_TIMER )
        time = CSCHED_MIN_TIMER;
    else if ( time > CSCHED_MAX_TIMER )
        time = CSCHED_MAX_TIMER;

    return time;
}

void __dump_execstate(void *unused);

/*
 * Find a candidate.
 */
static struct csched_vcpu *
runq_candidate(struct csched_runqueue_data *rqd,
               struct csched_vcpu *scurr,
               int cpu, s_time_t now)
{
    struct list_head *iter;
    struct csched_vcpu *snext = NULL;

    /* Default to current if runnable, idle otherwise */
    if ( vcpu_runnable(scurr->vcpu) )
        snext = scurr;
    else
        snext = CSCHED_VCPU(idle_vcpu[cpu]);

    list_for_each( iter, &rqd->runq )
    {
        struct csched_vcpu * svc = list_entry(iter, struct csched_vcpu, runq_elem);

        /* If this is on a different processor, don't pull it unless
         * its credit is at least CSCHED_MIGRATE_RESIST higher. */
        if ( svc->vcpu->processor != cpu
             && snext->credit + CSCHED_MIGRATE_RESIST > svc->credit )
            continue;

        /* If the next one on the list has more credit than current
         * (or idle, if current is not runnable), choose it. */
        if ( svc->credit > snext->credit )
            snext = svc;

        /* In any case, if we got this far, break. */
        break;

    }

    return snext;
}

/*
 * This function is in the critical path. It is designed to be simple and
 * fast for the common case.
 */
static struct task_slice
csched_schedule(
    const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
{
    const int cpu = smp_processor_id();
    struct csched_runqueue_data *rqd;
    struct csched_vcpu * const scurr = CSCHED_VCPU(current);
    struct csched_vcpu *snext = NULL;
    struct task_slice ret;

    CSCHED_VCPU_CHECK(current);

    d2printk("sc p%d c d%dv%d now %"PRI_stime"\n",
             cpu,
             scurr->vcpu->domain->domain_id,
             scurr->vcpu->vcpu_id,
             now);

    BUG_ON(!cpu_isset(cpu, CSCHED_PRIV(ops)->initialized));

    rqd = RQD(ops, cpu);
    BUG_ON(!cpu_isset(cpu, rqd->active));

    /* Protected by runqueue lock */        

    BUG_ON(!is_idle_vcpu(scurr->vcpu) && scurr->rqd != rqd);

    /* Clear "tickled" bit now that we've been scheduled */
    if ( cpu_isset(cpu, rqd->tickled) )
        cpu_clear(cpu, rqd->tickled);

    /* Update credits: charge scurr for the time it has run since it was
     * last scheduled, so candidate selection sees up-to-date values. */
    burn_credits(rqd, scurr, now);

    /*
     * Select the next runnable local VCPU (i.e. the top of the local runq).
     *
     * If the current vcpu is runnable and has higher credit than the
     * next one on the queue (or there is no one else), we want to run
     * it again.
     *
     * If there's tasklet work to do, we want to choose the idle vcpu
     * for this processor, and mark the current one for a delayed
     * runqueue add.
     *
     * If the current vcpu is runnable and there's another runnable
     * candidate, we want to mark the current one for a delayed runqueue
     * add, and remove the next one from the queue.
     *
     * If the current vcpu is not runnable, we want to choose the idle
     * vcpu for this processor.
     */
    if ( tasklet_work_scheduled )
    {
        trace_var(TRC_CSCHED2_SCHED_TASKLET, 0, 0,  NULL);
        snext = CSCHED_VCPU(idle_vcpu[cpu]);
    }
    else
        snext = runq_candidate(rqd, scurr, cpu, now);

    /*
     * If switching away from a non-idle but still runnable vcpu, don't put
     * it straight back on the runqueue: it is still being context switched
     * away from, so flag it for a delayed runqueue add instead, to be
     * handled once its context has been saved.
     */
    if ( snext != scurr
         && !is_idle_vcpu(scurr->vcpu)
         && vcpu_runnable(current) )
        set_bit(__CSFLAG_delayed_runq_add, &scurr->flags);

    ret.migrated = 0;

    /* Accounting for non-idle tasks */
    if ( !is_idle_vcpu(snext->vcpu) )
    {
        /* If switching, remove this from the runqueue and mark it scheduled */
        if ( snext != scurr )
        {
            BUG_ON(snext->rqd != rqd);
    
            __runq_remove(snext);
            if ( snext->vcpu->is_running )
            {
                printk("p%d: snext d%dv%d running on p%d! scurr d%dv%d\n",
                       cpu,
                       snext->vcpu->domain->domain_id, snext->vcpu->vcpu_id,
                       snext->vcpu->processor,
                       scurr->vcpu->domain->domain_id,
                       scurr->vcpu->vcpu_id);
                BUG();
            }
            set_bit(__CSFLAG_scheduled, &snext->flags);
        }

        /* If the incoming vcpu's credit has dropped to the reset threshold,
         * trigger a runqueue-wide credit reset and re-balance the load. */
        if ( snext->credit <= CSCHED_CREDIT_RESET )
        {
            reset_credit(ops, cpu, now);
            balance_load(ops, cpu, now);
        }

        /* Clear the idle mask if necessary */
        if ( cpu_isset(cpu, rqd->idle) )
            cpu_clear(cpu, rqd->idle);

        snext->start_time = now;

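        /* snext may have been pulled over from another cpu's runqueue
         * position; if so, bump its credit by CSCHED_MIGRATE_COMPENSATION
         * (intended to offset the cost of starting with a cold cache) and
         * record the migration for the generic scheduler code. */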
        /* Safe because lock for old processor is held */
        if ( snext->vcpu->processor != cpu )
        {
            snext->credit += CSCHED_MIGRATE_COMPENSATION;
            snext->vcpu->processor = cpu;
            ret.migrated = 1;
        }
    }
    else
    {
        /* Update the idle mask if necessary */
        if ( !cpu_isset(cpu, rqd->idle) )
            cpu_set(cpu, rqd->idle);
        /* Make sure avgload gets updated periodically even
         * if there's no activity */
        update_load(ops, rqd, NULL, 0, now);
    }

    /*
     * Return task to run next...
     */
    ret.time = csched_runtime(ops, cpu, snext);
    ret.task = snext->vcpu;

    CSCHED_VCPU_CHECK(ret.task);
    return ret;
}

static void
csched_dump_vcpu(struct csched_vcpu *svc)
{
    printk("[%i.%i] flags=%x cpu=%i",
            svc->vcpu->domain->domain_id,
            svc->vcpu->vcpu_id,
            svc->flags,
            svc->vcpu->processor);

    printk(" credit=%" PRIi32" [w=%u]", svc->credit, svc->weight);

    printk("\n");
}

static void
csched_dump_pcpu(const struct scheduler *ops, int cpu)
{
    struct list_head *runq, *iter;
    struct csched_vcpu *svc;
    int loop;
    char cpustr[100];

    /* FIXME: Do locking properly for access to runqueue structures */

    runq = &RQD(ops, cpu)->runq;

    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_sibling_map,cpu));
    printk(" sibling=%s, ", cpustr);
    cpumask_scnprintf(cpustr, sizeof(cpustr), per_cpu(cpu_core_map,cpu));
    printk("core=%s\n", cpustr);

    /* current VCPU */
    svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
    if ( svc )
    {
        printk("\trun: ");
        csched_dump_vcpu(svc);
    }

    loop = 0;
    list_for_each( iter, runq )
    {
        svc = __runq_elem(iter);
        if ( svc )
        {
            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

static void
csched_dump(const struct scheduler *ops)
{
    struct list_head *iter_sdom, *iter_svc;
    struct csched_private *prv = CSCHED_PRIV(ops);
    int i, loop;

    printk("Active queues: %d\n"
           "\tdefault-weight     = %d\n",
           cpus_weight(prv->active_queues),
           CSCHED_DEFAULT_WEIGHT);
    for_each_cpu_mask(i, prv->active_queues)
    {
        s_time_t fraction;
        
        fraction = prv->rqd[i].avgload * 100 / (1ULL<<prv->load_window_shift);

        printk("Runqueue %d:\n"
               "\tncpus              = %u\n"
               "\tmax_weight         = %d\n"
               "\tinstload           = %d\n"
               "\taveload            = %3ld\n",
               i,
               cpus_weight(prv->rqd[i].active),
               prv->rqd[i].max_weight,
               prv->rqd[i].load,
               fraction);

    }
    /* FIXME: Locking! */

    printk("Domain info:\n");
    loop = 0;
    list_for_each( iter_sdom, &prv->sdom )
    {
        struct csched_dom *sdom;
        sdom = list_entry(iter_sdom, struct csched_dom, sdom_elem);

       printk("\tDomain: %d w %d v %d\n\t", 
              sdom->dom->domain_id, 
              sdom->weight, 
              sdom->nr_vcpus);

        list_for_each( iter_svc, &sdom->vcpu )
        {
            struct csched_vcpu *svc;
            svc = list_entry(iter_svc, struct csched_vcpu, sdom_elem);

            printk("\t%3d: ", ++loop);
            csched_dump_vcpu(svc);
        }
    }
}

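/*
 * Runqueues are one per socket: init_pcpu() maps each cpu to runqueue
 * cpu_to_socket(cpu) (with cpu 0 hard-coded to runqueue 0).  A runqueue
 * is activated when its first cpu shows up and deactivated again once
 * its last cpu has been removed.
 */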
static void activate_runqueue(struct csched_private *prv, int rqi)
{
    struct csched_runqueue_data *rqd;

    rqd = prv->rqd + rqi;

    BUG_ON(!cpus_empty(rqd->active));

    rqd->max_weight = 1;
    rqd->id = rqi;
    INIT_LIST_HEAD(&rqd->svc);
    INIT_LIST_HEAD(&rqd->runq);
    spin_lock_init(&rqd->lock);

    cpu_set(rqi, prv->active_queues);
}

static void deactivate_runqueue(struct csched_private *prv, int rqi)
{
    struct csched_runqueue_data *rqd;

    rqd = prv->rqd + rqi;

    BUG_ON(!cpus_empty(rqd->active));
    
    rqd->id = -1;

    cpu_clear(rqi, prv->active_queues);
}

static void init_pcpu(const struct scheduler *ops, int cpu)
{
    int rqi, old_rqi;
    unsigned long flags;
    struct csched_private *prv = CSCHED_PRIV(ops);
    struct csched_runqueue_data *rqd;
    spinlock_t *old_lock;

    spin_lock_irqsave(&prv->lock, flags);

    if ( cpu_isset(cpu, prv->initialized) )
    {
        printk("%s: Strange, cpu %d already initialized!\n", __func__, cpu);
        spin_unlock_irqrestore(&prv->lock, flags);
        return;
    }

    old_rqi = prv->runq_map[cpu];

    /* Figure out which runqueue to put it in */
    /* NB: cpu 0 doesn't get a STARTING callback, so we hard-code it to runqueue 0. */
    if ( cpu == 0 )
        rqi = 0;
    else
        rqi = cpu_to_socket(cpu);

    if ( rqi < 0 )
    {
        printk("%s: cpu_to_socket(%d) returned %d!\n",
               __func__, cpu, rqi);
        BUG();
    }

    rqd = prv->rqd + rqi;

    printk("Adding cpu %d to runqueue %d\n", cpu, rqi);
    if ( ! cpu_isset(rqi, prv->active_queues) )
    {
        printk(" First cpu on runqueue, activating\n");
        activate_runqueue(prv, rqi);
    }
    
    /* IRQs already disabled */
    old_lock = pcpu_schedule_lock(cpu);

    /* Redirect this cpu's scheduler lock to the runqueue lock: every cpu
     * on a runqueue shares rqd->lock, so scheduling operations for them
     * all serialize on it from now on. */
    per_cpu(schedule_data, cpu).schedule_lock = &rqd->lock;

    /* Set the runqueue map */
    prv->runq_map[cpu] = rqi;

    cpu_set(cpu, rqd->idle);
    cpu_set(cpu, rqd->active);

    spin_unlock(old_lock);

    cpu_set(cpu, prv->initialized);

    spin_unlock_irqrestore(&prv->lock, flags);

    return;
}

static void *
csched_alloc_pdata(const struct scheduler *ops, int cpu)
{
    /* Check to see if the cpu is online yet */
    /* Note: cpu 0 doesn't get a STARTING callback */
    if ( cpu == 0 || cpu_to_socket(cpu) >= 0 )
        init_pcpu(ops, cpu);
    else
        printk("%s: cpu %d not online yet, deferring initializatgion\n",
               __func__, cpu);

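    /* All per-cpu state lives in the shared csched_private / runqueue
     * structures set up by init_pcpu(), so nothing needs allocating here;
     * return a dummy non-NULL pointer so the caller doesn't treat this as
     * an allocation failure. */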
    return (void *)1;
}

static void
csched_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
{
    unsigned long flags;
    struct csched_private *prv = CSCHED_PRIV(ops);
    struct csched_runqueue_data *rqd;
    int rqi;

    spin_lock_irqsave(&prv->lock, flags);

    BUG_ON( !cpu_isset(cpu, prv->initialized));
    
    /* Find the old runqueue and remove this cpu from it */
    rqi = prv->runq_map[cpu];

    rqd = prv->rqd + rqi;

    /* No need to save IRQs here, they're already disabled */
    spin_lock(&rqd->lock);

    BUG_ON(!cpu_isset(cpu, rqd->idle));

    printk("Removing cpu %d from runqueue %d\n", cpu, rqi);

    cpu_clear(cpu, rqd->idle);
    cpu_clear(cpu, rqd->active);

    if ( cpus_empty(rqd->active) )
    {
        printk(" No cpus left on runqueue, disabling\n");
        deactivate_runqueue(prv, rqi);
    }

    spin_unlock(&rqd->lock);

    cpu_clear(cpu, prv->initialized);

    spin_unlock_irqrestore(&prv->lock, flags);

    return;
}

static int
csched_cpu_starting(int cpu)
{
    struct scheduler *ops;

    /* Hope this is safe from cpupools switching things around. :-) */
    ops = per_cpu(scheduler, cpu);

    init_pcpu(ops, cpu);

    return NOTIFY_DONE;
}

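/*
 * Secondary cpus can't be fully initialized from csched_alloc_pdata(),
 * because topology information (cpu_to_socket()) isn't valid that early;
 * instead, init_pcpu() is run from the CPU_STARTING notifier below.
 */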
static int cpu_credit2_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_STARTING:
        csched_cpu_starting(cpu);
        break;
    default:
        break;
    }

    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
}

static struct notifier_block cpu_credit2_nfb = {
    .notifier_call = cpu_credit2_callback
};

static int
csched_init(struct scheduler *ops)
{
    int i;
    struct csched_private *prv;

    printk("Initializing Credit2 scheduler\n" \
           " WARNING: This is experimental software in development.\n" \
           " Use at your own risk.\n");

    printk(" load_window_shift: %d\n", opt_load_window_shift);
    printk(" underload_balance_tolerance: %d\n", opt_underload_balance_tolerance);
    printk(" overload_balance_tolerance: %d\n", opt_overload_balance_tolerance);

    if ( opt_load_window_shift < LOADAVG_WINDOW_SHIFT_MIN )
    {
        printk("%s: opt_load_window_shift %d below min %d, resetting\n",
               __func__, opt_load_window_shift, LOADAVG_WINDOW_SHIFT_MIN);
        opt_load_window_shift = LOADAVG_WINDOW_SHIFT_MIN;
    }

    /* Basically no CPU information is available at this point; just
     * set up basic structures, and a callback when the CPU info is
     * available. */

    prv = xmalloc(struct csched_private);
    if ( prv == NULL )
        return -ENOMEM;
    memset(prv, 0, sizeof(*prv));
    ops->sched_data = prv;
    spin_lock_init(&prv->lock);
    INIT_LIST_HEAD(&prv->sdom);

    register_cpu_notifier(&cpu_credit2_nfb);

    /* No cpu information is available yet, so mark every runqueue and
     * runq_map entry as unused. */
    for ( i = 0; i < NR_CPUS; i++ )
    {
        prv->runq_map[i] = -1;
        prv->rqd[i].id = -1;
    }

    prv->load_window_shift = opt_load_window_shift;

    return 0;
}

static void
csched_deinit(const struct scheduler *ops)
{
    struct csched_private *prv;

    prv = CSCHED_PRIV(ops);
    if ( prv != NULL )
        xfree(prv);
}


static struct csched_private _csched_priv;

const struct scheduler sched_credit2_def = {
    .name           = "SMP Credit Scheduler rev2",
    .opt_name       = "credit2",
    .sched_id       = XEN_SCHEDULER_CREDIT2,
    .sched_data     = &_csched_priv,

    .init_domain    = csched_dom_init,
    .destroy_domain = csched_dom_destroy,

    .insert_vcpu    = csched_vcpu_insert,
    .remove_vcpu    = csched_vcpu_remove,

    .sleep          = csched_vcpu_sleep,
    .wake           = csched_vcpu_wake,

    .adjust         = csched_dom_cntl,

    .pick_cpu       = csched_cpu_pick,
    .do_schedule    = csched_schedule,
    .context_saved  = csched_context_saved,

    .dump_cpu_state = csched_dump_pcpu,
    .dump_settings  = csched_dump,
    .init           = csched_init,
    .deinit         = csched_deinit,
    .alloc_vdata    = csched_alloc_vdata,
    .free_vdata     = csched_free_vdata,
    .alloc_pdata    = csched_alloc_pdata,
    .free_pdata     = csched_free_pdata,
    .alloc_domdata  = csched_alloc_domdata,
    .free_domdata   = csched_free_domdata,
};