| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| |
| /* |
| * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
| * Use is subject to license terms. |
| */ |
| |
| #include <sys/disp.h> |
| #include <sys/param.h> |
| #include <sys/systm.h> |
| #include <sys/sysmacros.h> |
| #include <sys/atomic.h> |
| #include <sys/cpucaps_impl.h> |
| #include <sys/dtrace.h> |
| #include <sys/sdt.h> |
| #include <sys/debug.h> |
| #include <sys/rctl.h> |
| #include <sys/errno.h> |
| |
| /* |
| * CPU Caps implementation |
| * ======================= |
| * |
| * A CPU cap can be set on any project or any zone. Zone CPU cap limits the CPU |
| * usage for all projects running inside the zone. If the zone CPU cap is set |
| * below the project CPU cap, the latter will have no effect. |
| * |
| * When CPU usage of projects and/or zones reaches specified caps, threads in |
| * them do not get scheduled and instead are placed on wait queues associated |
| * with a cap. Such threads will start running again only when CPU usage drops |
| * below the cap level. Each zone and each project has its own wait queue. |
| * |
| * When CPU cap is set, the kernel continously keeps track of CPU time used by |
| * capped zones and/or projects over a short time interval and calculates their |
| * current CPU usage as a percentage. When the accumulated usage reaches the CPU |
| * cap, LWPs running in the user-land (when they are not holding any critical |
| * kernel locks) are placed on special wait queues until their project's or |
| * zone's CPU usage drops below the cap. |
| * |
| * The system maintains a list of all capped projects and all capped zones. On |
| * every clock tick every active thread belonging to a capped project adds its |
| * CPU usage to its project. Usage from all projects belonging to a capped zone |
| * is aggregated to get the zone usage. |
| * |
| * When the current CPU usage is above the cap, a project or zone is considered |
| * over-capped. Every user thread caught running in an over-capped project or |
| * zone is marked by setting TS_PROJWAITQ flag in thread's t_schedflag field and |
| * is requested to surrender its CPU. This causes scheduling class specific |
| * CL_PREEMPT() callback to be invoked. The callback function places threads |
| * marked as TS_PROJWAIT on a wait queue and calls switch(). |
| * |
| * Threads are only placed on wait queues after trapping from user-land |
| * (they could be holding some user locks, but no kernel locks) and while |
| * returning from the trap back to the user-land when no kernel locks are held. |
| * Putting threads on wait queues in random places while running in the |
| * kernel might lead to all kinds of locking problems. |
| * |
| * Accounting |
| * ========== |
| * |
| * Accounting of CPU usage is based on per-thread micro-state accounting data. |
| * On every clock tick clock() adds new on-CPU time for every thread found on |
| * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU. |
| * New times means time since it was last accounted for. On-CPU times greater |
| * than 1 tick are truncated to 1 tick. |
| * |
| * Project CPU usage is aggregated from all threads within the project. |
| * Zone CPU usage is the sum of usages for all projects within the zone. Zone |
| * CPU usage is calculated on every clock tick by walking list of projects and |
| * adding their usage together. |
| * |
| * Decay |
| * ===== |
| * |
| * CPU usage is decayed by the caps_update() routine which is called once per |
| * every clock tick. It walks lists of project caps and decays their usages by |
| * one per cent. If CPU usage drops below cap levels, threads on the wait queue |
| * are made runnable again, one thread per clock tick. |
| * |
| * Interfaces |
| * ========== |
| * |
| * The CPU Caps facility provides the following interfaces to the rest of the |
| * system: |
| * |
| * cpucaps_project_add(kproject_t *) |
| * |
| * Notifies the framework of a new project. It should be put on the |
| * capped_projects list if its zone has a cap. |
| * |
| * cpucaps_project_remove(kproject_t *) |
| * |
| * Remove the association between the specified project and its cap. |
| * Called right before the project is destroyed. |
| * |
| * cpucaps_project_set(kproject_t *, rctl_qty_t) |
| * |
| * Set project cap of the specified project to the specified value. Setting the |
| * value to NOCAP is equivalent to removing the cap. |
| * |
| * cpucaps_zone_set(zone_t *, rctl_qty_t) |
| * |
| * Set zone cap of the specified zone to the specified value. Setting the value |
| * to NOCAP is equivalent to removing the cap. |
| * |
| * cpucaps_zone_remove(zone_t *) |
| * |
| * Remove the association between the zone and its cap. |
| * |
| * cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t) |
| * |
| * Charges specified thread's project the amount of on-CPU time that it used. |
| * If the third argument is CPUCAPS_CHARGE_ONLY returns False. |
| * Otherwise returns True if project or zone should be penalized because its |
| * project or zone is exceeding its cap. Also sets TS_PROJWAITQ or TS_ZONEWAITQ |
| * bits in t_schedflag in this case. |
| * |
| * CPUCAPS_ENFORCE(kthread_id_t *) |
| * |
| * Enforces CPU caps for a specified thread. Places LWPs running in LWP_USER |
| * state on project or zone wait queues, as requested by TS_PROJWAITQ or |
| * TS_ZONEWAITQ bits in t_schedflag. Returns True if the thread was placed on a |
| * wait queue or False otherwise. |
| * |
| * cpucaps_sc_init(caps_sc_t *) |
| * |
| * Initializes the scheduling-class specific CPU Caps data for a thread. |
| * |
| * LOCKS |
| * ===== |
| * |
| * all the individual caps structures and their lists are protected by a global |
| * caps_lock mutex. The lock is grabbed either by clock() or by events modifying |
| * caps, so it is usually uncontended. We avoid all blocking memory allocations |
| * while holding caps_lock to prevent clock() from blocking. |
| * |
| * Thread state is protected by the thread lock. It protects the association |
| * between a thread and its project and, as a consequence, to its zone. The |
| * association can not break while thread lock is held, so the project or zone |
| * cap are not going to disappear while thread lock is held. |
| * |
| * Cap usage field is protected by high-pil spin-lock cap_usagelock. It is |
| * grabbed by scheduling classes already holding thread lock at high PIL and by |
| * clock thread performing usage decay. We should do as little work as possible |
| * while holding the lock since it may be very hot. All threads in the project |
| * contend for the same cache line doing cap usage updates. |
| */ |
| |
| /* |
| * caps_lock protects list of capped projects and zones, changes in the cap |
| * state and changes of the global cpucaps_enabled flag. |
| * |
| * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is |
| * modified in parallel. This can be per-zone cap flag, but we don't keep any |
| * cap state for now. |
| */ |
| static kmutex_t caps_lock; /* lock to protect: */ |
| static list_t capped_zones; /* - list of zones with caps */ |
| static list_t capped_projects; /* - list of projects with caps */ |
| boolean_t cpucaps_enabled; /* - are there any caps defined? */ |
| boolean_t cpucaps_busy; /* - is framework busy? */ |
| |
| /* |
| * The accounting is based on the number of nanoseconds threads spend running |
| * during a tick which is kept in the cap_tick_cost variable. |
| */ |
| static hrtime_t cap_tick_cost; |
| |
| /* |
| * How much of the usage value is decayed every clock tick |
| * Decay one per cent of value per tick |
| */ |
| #define CAP_DECAY_FACTOR 100 |
| |
| /* |
| * Scale the value and round it to the closest integer value |
| */ |
| #define ROUND_SCALE(x, y) (((x) + (y) / 2) / (y)) |
| |
| static void caps_update(); |
| |
| /* |
| * CAP kstats. |
| */ |
| struct cap_kstat { |
| kstat_named_t cap_value; |
| kstat_named_t cap_usage; |
| kstat_named_t cap_nwait; |
| kstat_named_t cap_below; |
| kstat_named_t cap_above; |
| kstat_named_t cap_maxusage; |
| kstat_named_t cap_zonename; |
| } cap_kstat = { |
| { "value", KSTAT_DATA_UINT64 }, |
| { "usage", KSTAT_DATA_UINT64 }, |
| { "nwait", KSTAT_DATA_UINT64 }, |
| { "below_sec", KSTAT_DATA_UINT64 }, |
| { "above_sec", KSTAT_DATA_UINT64 }, |
| { "maxusage", KSTAT_DATA_UINT64 }, |
| { "zonename", KSTAT_DATA_STRING }, |
| }; |
| |
| |
| static kmutex_t cap_kstat_lock; |
| static int cap_kstat_update(kstat_t *, int); |
| |
| /* |
| * Initialize CPU caps infrastructure. |
| * - Initialize lists of capped zones and capped projects |
| * - Set cpucaps_clock_callout to NULL |
| */ |
| void |
| cpucaps_init() |
| { |
| /* |
| * Initialize global variables |
| */ |
| cap_tick_cost = TICK_TO_NSEC((hrtime_t)1); |
| |
| list_create(&capped_zones, sizeof (cpucap_t), |
| offsetof(cpucap_t, cap_link)); |
| list_create(&capped_projects, sizeof (cpucap_t), |
| offsetof(cpucap_t, cap_link)); |
| |
| cpucaps_enabled = B_FALSE; |
| cpucaps_busy = B_FALSE; |
| cpucaps_clock_callout = NULL; |
| } |
| |
| /* |
| * Initialize scheduling-class specific CPU Caps data. |
| */ |
| void |
| cpucaps_sc_init(caps_sc_t *csc) |
| { |
| csc->csc_cputime = 0; |
| } |
| |
| /* |
| * Allocate and initialize cpucap structure |
| */ |
| static cpucap_t * |
| cap_alloc(void) |
| { |
| cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP); |
| |
| DISP_LOCK_INIT(&cap->cap_usagelock); |
| waitq_init(&cap->cap_waitq); |
| |
| return (cap); |
| } |
| |
| /* |
| * Free cpucap structure |
| */ |
| static void |
| cap_free(cpucap_t *cap) |
| { |
| if (cap == NULL) |
| return; |
| |
| /* |
| * This cap should not be active |
| */ |
| ASSERT(!list_link_active(&cap->cap_link)); |
| ASSERT(cap->cap_value == 0); |
| ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock)); |
| |
| waitq_fini(&cap->cap_waitq); |
| DISP_LOCK_DESTROY(&cap->cap_usagelock); |
| |
| kmem_free(cap, sizeof (cpucap_t)); |
| } |
| |
| /* |
| * Activate cap - insert into active list and unblock its |
| * wait queue. Should be called with caps_lock held. |
| * The cap_value field is set to the value supplied. |
| */ |
| static void |
| cap_enable(list_t *l, cpucap_t *cap, hrtime_t value) |
| { |
| ASSERT(MUTEX_HELD(&caps_lock)); |
| |
| /* |
| * Cap can not be already enabled |
| */ |
| ASSERT(!CAP_ENABLED(cap)); |
| ASSERT(!list_link_active(&cap->cap_link)); |
| |
| list_insert_tail(l, cap); |
| cap->cap_below = cap->cap_above = 0; |
| cap->cap_maxusage = 0; |
| cap->cap_usage = 0; |
| cap->cap_value = value; |
| waitq_unblock(&cap->cap_waitq); |
| if (CPUCAPS_OFF()) { |
| cpucaps_enabled = B_TRUE; |
| cpucaps_clock_callout = caps_update; |
| } |
| } |
| |
| /* |
| * Deactivate cap |
| * - Block its wait queue. This prevents any new threads from being |
| * enqueued there and moves all enqueued threads to the run queue. |
| * - Remove cap from list l. |
| * - Disable CPU caps globally if there are no capped projects or zones |
| * |
| * Should be called with caps_lock held. |
| */ |
| static void |
| cap_disable(list_t *l, cpucap_t *cap) |
| { |
| ASSERT(MUTEX_HELD(&caps_lock)); |
| /* |
| * Cap should be currently active |
| */ |
| ASSERT(CPUCAPS_ON()); |
| ASSERT(list_link_active(&cap->cap_link)); |
| ASSERT(CAP_ENABLED(cap)); |
| |
| waitq_block(&cap->cap_waitq); |
| list_remove(l, cap); |
| if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) { |
| cpucaps_enabled = B_FALSE; |
| cpucaps_clock_callout = NULL; |
| } |
| cap->cap_value = 0; |
| cap->cap_project = NULL; |
| cap->cap_zone = NULL; |
| if (cap->cap_kstat != NULL) { |
| kstat_delete(cap->cap_kstat); |
| cap->cap_kstat = NULL; |
| } |
| |
| } |
| |
| /* |
| * Enable cap for a project kpj |
| * It is safe to enable already enabled project cap. |
| * Should be called with caps_lock held. |
| */ |
| static void |
| cap_project_enable(kproject_t *kpj, hrtime_t value) |
| { |
| cpucap_t *cap = kpj->kpj_cpucap; |
| |
| ASSERT(MUTEX_HELD(&caps_lock)); |
| ASSERT(cap != NULL); |
| |
| if (CAP_DISABLED(cap)) { |
| ASSERT(cap->cap_kstat == NULL); |
| cap_enable(&capped_projects, cap, value); |
| cap->cap_project = kpj; |
| cap->cap_zone = kpj->kpj_zone; |
| |
| /* |
| * Create cap kstats |
| */ |
| if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps", |
| KSTAT_TYPE_NAMED, |
| sizeof (cap_kstat) / sizeof (kstat_named_t), |
| KSTAT_FLAG_VIRTUAL)) != NULL) { |
| cap->cap_kstat->ks_data_size += |
| strlen(cap->cap_zone->zone_name) + 1; |
| cap->cap_kstat->ks_lock = &cap_kstat_lock; |
| cap->cap_kstat->ks_data = &cap_kstat; |
| cap->cap_kstat->ks_update = cap_kstat_update; |
| cap->cap_kstat->ks_private = cap; |
| kstat_install(cap->cap_kstat); |
| } |
| } |
| } |
| |
| /* |
| * Disable project cap. |
| * It is safe to disable already disabled project cap. |
| * Should be called with caps_lock held. |
| */ |
| static void |
| cap_project_disable(kproject_t *kpj) |
| { |
| cpucap_t *cap = kpj->kpj_cpucap; |
| |
| ASSERT(MUTEX_HELD(&caps_lock)); |
| ASSERT(cap != NULL); |
| ASSERT(cap->cap_project == kpj); |
| |
| if (CAP_ENABLED(cap)) |
| cap_disable(&capped_projects, cap); |
| } |
| |
| /* |
| * Enable cap for a zone |
| * It is safe to enable already enabled zone cap. |
| * Should be called with caps_lock held. |
| */ |
| static void |
| cap_zone_enable(zone_t *zone, hrtime_t value) |
| { |
| cpucap_t *cap = zone->zone_cpucap; |
| |
| ASSERT(MUTEX_HELD(&caps_lock)); |
| ASSERT(cap != NULL); |
| |
| if (CAP_DISABLED(cap)) { |
| ASSERT(cap->cap_kstat == NULL); |
| cap_enable(&capped_zones, cap, value); |
| cap->cap_zone = zone; |
| |
| /* |
| * Create cap kstats |
| */ |
| if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps", |
| KSTAT_TYPE_NAMED, |
| sizeof (cap_kstat) / sizeof (kstat_named_t), |
| KSTAT_FLAG_VIRTUAL)) != NULL) { |
| cap->cap_kstat->ks_data_size += |
| strlen(cap->cap_zone->zone_name) + 1; |
| cap->cap_kstat->ks_lock = &cap_kstat_lock; |
| cap->cap_kstat->ks_data = &cap_kstat; |
| cap->cap_kstat->ks_update = cap_kstat_update; |
| cap->cap_kstat->ks_private = cap; |
| kstat_install(cap->cap_kstat); |
| } |
| } |
| } |
| |
| /* |
| * Disable zone cap. |
| * It is safe to disable already disabled zone cap. |
| * Should be called with caps_lock held. |
| */ |
| static void |
| cap_zone_disable(zone_t *zone) |
| { |
| cpucap_t *cap = zone->zone_cpucap; |
| |
| ASSERT(MUTEX_HELD(&caps_lock)); |
| ASSERT(cap != NULL); |
| ASSERT(cap->cap_zone == zone); |
| |
| if (CAP_ENABLED(cap)) |
| cap_disable(&capped_zones, cap); |
| } |
| |
| /* |
| * Apply specified callback to all caps contained in the list `l'. |
| */ |
| static void |
| cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t)) |
| { |
| static uint64_t cpucap_walk_gen; |
| cpucap_t *cap; |
| |
| ASSERT(MUTEX_HELD(&caps_lock)); |
| |
| for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) { |
| (*cb)(cap, cpucap_walk_gen); |
| } |
| |
| atomic_inc_64(&cpucap_walk_gen); |
| } |
| |
| /* |
| * If cap limit is not reached, make one thread from wait queue runnable. |
| * The waitq_isempty check is performed without the waitq lock. If a new thread |
| * is placed on the waitq right after the check, it will be picked up during the |
| * next invocation of cap_poke_waitq(). |
| */ |
| /* ARGSUSED */ |
| static void |
| cap_poke_waitq(cpucap_t *cap, int64_t gen) |
| { |
| ASSERT(MUTEX_HELD(&caps_lock)); |
| |
| if (cap->cap_usage >= cap->cap_value) { |
| cap->cap_above++; |
| } else { |
| waitq_t *wq = &cap->cap_waitq; |
| |
| cap->cap_below++; |
| |
| if (!waitq_isempty(wq)) |
| waitq_runone(wq); |
| } |
| } |
| |
| /* |
| * The callback function called for every cap on capped_projects list. |
| * Decay cap usage by CAP_DECAY_FACTOR |
| * Add this cap project usage to its zone usage. |
| * Kick off a thread from the cap waitq if cap is not reached. |
| */ |
| static void |
| cap_project_usage_walker(cpucap_t *cap, int64_t gen) |
| { |
| zone_t *zone = cap->cap_zone; |
| hrtime_t cap_usage = cap->cap_usage; |
| |
| ASSERT(MUTEX_HELD(&caps_lock)); |
| ASSERT(cap->cap_project->kpj_cpucap == cap); |
| ASSERT(zone == cap->cap_project->kpj_zone); |
| ASSERT(CAP_ENABLED(cap)); |
| |
| /* |
| * Set or clear the CAP_REACHED flag based on the current usage. |
| * Only projects having their own caps are ever marked as CAP_REACHED. |
| */ |
| cap_poke_waitq(cap, 0); |
| |
| /* |
| * Add project's CPU usage to our zone's CPU usage. |
| */ |
| if (ZONE_IS_CAPPED(zone)) { |
| cpucap_t *zcap = zone->zone_cpucap; |
| |
| ASSERT(zcap->cap_zone == zone); |
| |
| /* |
| * If we haven't reset this zone's usage during this clock tick |
| * yet, then do it now. The cap_gen field is used to check |
| * whether this is the first zone's project we see during this |
| * tick or a subsequent one. |
| */ |
| if (zcap->cap_gen != gen) { |
| if (zcap->cap_usage > zcap->cap_maxusage) |
| zcap->cap_maxusage = zcap->cap_usage; |
| zcap->cap_usage = 0; |
| zcap->cap_gen = gen; |
| } |
| DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap, |
| hrtime_t, cap_usage); |
| zcap->cap_usage += cap_usage; |
| /* Check for overflows */ |
| if (zcap->cap_usage < 0) |
| zcap->cap_usage = MAX_USAGE - 1; |
| } |
| |
| /* |
| * Decay project usage. |
| */ |
| disp_lock_enter(&cap->cap_usagelock); |
| cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR); |
| disp_lock_exit(&cap->cap_usagelock); |
| } |
| |
| /* |
| * On every clock tick walk the list of project caps and update the CPU usage. |
| * Also walk the list of zone caps checking whether any threads should |
| * transition from wait queue to run queue. |
| * |
| * This function gets called by the clock thread directly when there are any |
| * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs |
| * caps_lock for long periods of time, so there should be almost no contention |
| * for it. |
| */ |
| static void |
| caps_update() |
| { |
| mutex_enter(&caps_lock); |
| cap_walk(&capped_projects, cap_project_usage_walker); |
| cap_walk(&capped_zones, cap_poke_waitq); |
| mutex_exit(&caps_lock); |
| } |
| |
| /* |
| * The function is called for each project in a zone when the zone cap is |
| * modified. It enables project caps if zone cap is enabled and disables if the |
| * zone cap is disabled and project doesn't have its own cap. |
| * |
| * For each project that does not have cpucap structure allocated it allocates a |
| * new structure and assigns to kpj->cpu_cap. The allocation is performed |
| * without holding caps_lock to avoid using KM_SLEEP allocation with caps_lock |
| * held. |
| */ |
| static int |
| cap_project_zone_modify_walker(kproject_t *kpj, void *arg) |
| { |
| cpucap_t *project_cap = NULL; |
| cpucap_t *zone_cap = (cpucap_t *)arg; |
| |
| ASSERT(zone_cap != NULL); |
| |
| if (kpj->kpj_cpucap == NULL) { |
| /* |
| * This is the first time any cap was established for this |
| * project. Allocate a new cpucap structure for it. |
| */ |
| project_cap = cap_alloc(); |
| } |
| |
| mutex_enter(&caps_lock); |
| |
| /* |
| * Double-check that kpj_cpucap is still NULL - now with caps_lock held |
| * and assign the newly allocated cpucap structure to it. |
| */ |
| if (kpj->kpj_cpucap == NULL) { |
| kpj->kpj_cpucap = project_cap; |
| } else if (project_cap != NULL) { |
| cap_free(project_cap); |
| } |
| |
| project_cap = kpj->kpj_cpucap; |
| |
| if (CAP_DISABLED(zone_cap)) { |
| /* |
| * Remove all projects in this zone without caps |
| * from the capped_projects list. |
| */ |
| if (project_cap->cap_value == MAX_USAGE) { |
| cap_project_disable(kpj); |
| } |
| } else if (CAP_DISABLED(project_cap)) { |
| /* |
| * Add the project to capped_projects list. |
| */ |
| ASSERT(project_cap->cap_value == 0); |
| cap_project_enable(kpj, MAX_USAGE); |
| } |
| mutex_exit(&caps_lock); |
| |
| return (0); |
| } |
| |
| /* |
| * Set zone cap to cap_val |
| * If cap_val is equal to NOCAP, disable zone cap. |
| * |
| * If this is the first time a cap is set on a zone, allocate cpucap structure |
| * without holding caps_lock to avoid KM_SLEEP allocation with caps_lock held. |
| */ |
| int |
| cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val) |
| { |
| cpucap_t *cap = NULL; |
| hrtime_t value; |
| |
| if (cap_val == 0) |
| return (EINVAL); |
| |
| ASSERT(cap_val <= MAXCAP); |
| if (cap_val > MAXCAP) |
| cap_val = MAXCAP; |
| |
| /* |
| * Nothing to do if trying to disable a cap on a zone when caps are off |
| * or a zone which does not have a cap yet. |
| */ |
| if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP)) |
| return (0); |
| |
| if (zone->zone_cpucap == NULL) |
| cap = cap_alloc(); |
| |
| mutex_enter(&caps_lock); |
| |
| if (cpucaps_busy) { |
| mutex_exit(&caps_lock); |
| return (EBUSY); |
| } |
| |
| /* |
| * Double-check whether zone->zone_cpucap is NULL, now with caps_lock |
| * held. If it is still NULL, assign a newly allocated cpucap to it. |
| */ |
| if (zone->zone_cpucap == NULL) { |
| zone->zone_cpucap = cap; |
| } else if (cap != NULL) { |
| cap_free(cap); |
| } |
| |
| cap = zone->zone_cpucap; |
| value = cap_val * cap_tick_cost; |
| if (value < 0) |
| value = MAX_USAGE; |
| |
| /* Nothing to do if the value is staying the same */ |
| if (value == cap->cap_value) { |
| mutex_exit(&caps_lock); |
| return (0); |
| } |
| |
| /* |
| * Clear cap statistics since the cap value itself changes. |
| */ |
| cap->cap_above = cap->cap_below = 0; |
| |
| |
| if (cap_val == NOCAP) { |
| if (CAP_ENABLED(cap)) { |
| /* |
| * Remove cap for the zone |
| */ |
| cap_zone_disable(zone); |
| cpucaps_busy = B_TRUE; |
| mutex_exit(&caps_lock); |
| /* |
| * Disable caps for all project belonging to this zone |
| * unless they have their own cap. |
| */ |
| (void) project_walk_all(zone->zone_id, |
| cap_project_zone_modify_walker, cap); |
| |
| mutex_enter(&caps_lock); |
| cpucaps_busy = B_FALSE; |
| } |
| } else if (CAP_DISABLED(cap)) { |
| /* |
| * Set a cap on a zone which previously was not capped. |
| */ |
| cap_zone_enable(zone, value); |
| cpucaps_busy = B_TRUE; |
| mutex_exit(&caps_lock); |
| |
| /* |
| * Enable cap for all projects belonging to this zone. |
| */ |
| (void) project_walk_all(zone->zone_id, |
| cap_project_zone_modify_walker, cap); |
| |
| mutex_enter(&caps_lock); |
| cpucaps_busy = B_FALSE; |
| } else { |
| /* |
| * No state transitions, just change the value |
| */ |
| cap->cap_value = value; |
| } |
| |
| ASSERT(MUTEX_HELD(&caps_lock)); |
| ASSERT(!cpucaps_busy); |
| mutex_exit(&caps_lock); |
| |
| return (0); |
| } |
| |
| /* |
| * The project is going away so disable its cap. |
| */ |
| void |
| cpucaps_project_remove(kproject_t *kpj) |
| { |
| mutex_enter(&caps_lock); |
| if (PROJECT_IS_CAPPED(kpj)) |
| cap_project_disable(kpj); |
| if (kpj->kpj_cpucap != NULL) { |
| cap_free(kpj->kpj_cpucap); |
| kpj->kpj_cpucap = NULL; |
| } |
| mutex_exit(&caps_lock); |
| } |
| |
| /* |
| * The zone is going away, so disable its cap. |
| */ |
| void |
| cpucaps_zone_remove(zone_t *zone) |
| { |
| mutex_enter(&caps_lock); |
| while (ZONE_IS_CAPPED(zone)) { |
| mutex_exit(&caps_lock); |
| (void) cpucaps_zone_set(zone, NOCAP); |
| mutex_enter(&caps_lock); |
| } |
| if (zone->zone_cpucap != NULL) { |
| cap_free(zone->zone_cpucap); |
| zone->zone_cpucap = NULL; |
| } |
| mutex_exit(&caps_lock); |
| } |
| |
| /* |
| * New project was created. It should be put on the capped_projects list if |
| * its zone has a cap. |
| */ |
| void |
| cpucaps_project_add(kproject_t *kpj) |
| { |
| cpucap_t *cap = NULL; |
| |
| if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone)) |
| return; |
| |
| /* |
| * This project was never capped before, so allocate its cap structure. |
| */ |
| if (kpj->kpj_cpucap == NULL) |
| cap = cap_alloc(); |
| |
| mutex_enter(&caps_lock); |
| /* |
| * Double-check with caps_lock held |
| */ |
| if (kpj->kpj_cpucap == NULL) { |
| kpj->kpj_cpucap = cap; |
| } else if (cap != NULL) { |
| cap_free(cap); |
| } |
| |
| if (ZONE_IS_CAPPED(kpj->kpj_zone)) |
| cap_project_enable(kpj, MAX_USAGE); |
| |
| mutex_exit(&caps_lock); |
| } |
| |
| /* |
| * Set project cap to cap_val |
| * If cap_val is equal to NOCAP, disable project cap. |
| * |
| * If this is the first time a cap is set on a project, allocate cpucap |
| * structure without holding caps_lock to avoid KM_SLEEP allocation with |
| * caps_lock held. |
| */ |
| int |
| cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val) |
| { |
| cpucap_t *cap = NULL; |
| hrtime_t value; |
| |
| if (cap_val == 0) |
| return (EINVAL); |
| |
| ASSERT(cap_val <= MAXCAP); |
| if (cap_val > MAXCAP) |
| cap_val = MAXCAP; |
| |
| /* |
| * Nothing to do if trying to disable project cap and caps are not |
| * enabled or if trying to disable cap on a project that does not have |
| * cap enabled. |
| */ |
| if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj))) |
| return (0); |
| |
| if (kpj->kpj_cpucap == NULL) { |
| /* |
| * This project was never capped before, so allocate its cap |
| * structure. |
| */ |
| cap = cap_alloc(); |
| } |
| |
| mutex_enter(&caps_lock); |
| |
| /* |
| * Double-check with caps_lock held. |
| */ |
| if (kpj->kpj_cpucap == NULL) { |
| kpj->kpj_cpucap = cap; |
| } else if (cap != NULL) { |
| cap_free(cap); |
| } |
| |
| /* |
| * Get the actual pointer to the project cap. |
| */ |
| cap = kpj->kpj_cpucap; |
| value = cap_val * cap_tick_cost; |
| if (value < 0) |
| value = MAX_USAGE; |
| |
| /* |
| * Nothing to do if the value is not changing |
| */ |
| if (value == cap->cap_value) { |
| mutex_exit(&caps_lock); |
| return (0); |
| } |
| |
| /* |
| * Clear cap statistics since the cap value itself changes. |
| */ |
| cap->cap_above = cap->cap_below = 0; |
| cap->cap_maxusage = 0; |
| |
| if (cap_val != NOCAP) { |
| /* |
| * Enable this cap if it is not already enabled. |
| */ |
| if (CAP_DISABLED(cap)) |
| cap_project_enable(kpj, value); |
| else |
| cap->cap_value = value; |
| } else if (CAP_ENABLED(cap)) { |
| /* |
| * User requested to drop a cap on the project. If it is part of |
| * capped zone, keep the cap and set the value to MAX_USAGE, |
| * otherwise disable the cap. |
| */ |
| if (ZONE_IS_CAPPED(kpj->kpj_zone)) { |
| cap->cap_value = MAX_USAGE; |
| } else { |
| cap_project_disable(kpj); |
| } |
| } |
| mutex_exit(&caps_lock); |
| |
| return (0); |
| } |
| |
| /* |
| * Get cap usage. |
| */ |
| static rctl_qty_t |
| cap_get(cpucap_t *cap) |
| { |
| return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0); |
| } |
| |
| /* |
| * Get current project usage. |
| */ |
| rctl_qty_t |
| cpucaps_project_get(kproject_t *kpj) |
| { |
| return (cap_get(kpj->kpj_cpucap)); |
| } |
| |
| /* |
| * Get current zone usage. |
| */ |
| rctl_qty_t |
| cpucaps_zone_get(zone_t *zone) |
| { |
| return (cap_get(zone->zone_cpucap)); |
| } |
| |
| /* |
| * Charge project of thread t the time thread t spent on CPU since previously |
| * adjusted. |
| * |
| * Record the current on-CPU time in the csc structure. |
| * |
| * Do not adjust for more than one tick worth of time. |
| * |
| * It is possible that the project cap is being disabled while this routine is |
| * executed. This should not cause any issues since the association between the |
| * thread and its project is protected by thread lock. |
| */ |
| static void |
| caps_charge_adjust(kthread_id_t t, caps_sc_t *csc) |
| { |
| kproject_t *kpj = ttoproj(t); |
| hrtime_t new_usage; |
| hrtime_t usage_delta; |
| |
| ASSERT(THREAD_LOCK_HELD(t)); |
| ASSERT(kpj->kpj_cpucap != NULL); |
| |
| /* Get on-CPU time since birth of a thread */ |
| new_usage = mstate_thread_onproc_time(t); |
| |
| /* Time spent on CPU since last checked */ |
| usage_delta = new_usage - csc->csc_cputime; |
| |
| /* Save the accumulated on-CPU time */ |
| csc->csc_cputime = new_usage; |
| |
| /* Charge at most one tick worth of on-CPU time */ |
| if (usage_delta > cap_tick_cost) |
| usage_delta = cap_tick_cost; |
| |
| /* Add usage_delta to the project usage value. */ |
| if (usage_delta > 0) { |
| cpucap_t *cap = kpj->kpj_cpucap; |
| |
| DTRACE_PROBE2(cpucaps__project__charge, |
| kthread_id_t, t, hrtime_t, usage_delta); |
| |
| disp_lock_enter_high(&cap->cap_usagelock); |
| cap->cap_usage += usage_delta; |
| |
| /* Check for overflows */ |
| if (cap->cap_usage < 0) |
| cap->cap_usage = MAX_USAGE - 1; |
| |
| disp_lock_exit_high(&cap->cap_usagelock); |
| |
| /* |
| * cap_maxusage is only kept for observability. Move it outside |
| * the lock to reduce the time spent while holding the lock. |
| */ |
| if (cap->cap_usage > cap->cap_maxusage) |
| cap->cap_maxusage = cap->cap_usage; |
| } |
| } |
| |
| /* |
| * Charge thread's project and return True if project or zone should be |
| * penalized because its project or zone is exceeding its cap. Also sets |
| * TS_PROJWAITQ or TS_ZONEWAITQ in this case. |
| * |
| * It is possible that the project cap is being disabled while this routine is |
| * executed. This should not cause any issues since the association between the |
| * thread and its project is protected by thread lock. It will still set |
| * TS_PROJECTWAITQ/TS_ZONEWAITQ in this case but cpucaps_enforce will not place |
| * anything on the blocked wait queue. |
| * |
| */ |
| boolean_t |
| cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type) |
| { |
| kproject_t *kpj = ttoproj(t); |
| klwp_t *lwp = t->t_lwp; |
| zone_t *zone; |
| cpucap_t *project_cap; |
| boolean_t rc = B_FALSE; |
| |
| ASSERT(THREAD_LOCK_HELD(t)); |
| |
| /* Nothing to do for projects that are not capped. */ |
| if (lwp == NULL || !PROJECT_IS_CAPPED(kpj)) |
| return (B_FALSE); |
| |
| caps_charge_adjust(t, csc); |
| |
| /* |
| * The caller only requested to charge the project usage, no enforcement |
| * part. |
| */ |
| if (charge_type == CPUCAPS_CHARGE_ONLY) |
| return (B_FALSE); |
| |
| project_cap = kpj->kpj_cpucap; |
| |
| if (project_cap->cap_usage >= project_cap->cap_value) { |
| t->t_schedflag |= TS_PROJWAITQ; |
| rc = B_TRUE; |
| } else if (t->t_schedflag & TS_PROJWAITQ) { |
| t->t_schedflag &= ~TS_PROJWAITQ; |
| } |
| |
| zone = ttozone(t); |
| if (!ZONE_IS_CAPPED(zone)) { |
| if (t->t_schedflag & TS_ZONEWAITQ) |
| t->t_schedflag &= ~TS_ZONEWAITQ; |
| } else { |
| cpucap_t *zone_cap = zone->zone_cpucap; |
| |
| if (zone_cap->cap_usage >= zone_cap->cap_value) { |
| t->t_schedflag |= TS_ZONEWAITQ; |
| rc = B_TRUE; |
| } else if (t->t_schedflag & TS_ZONEWAITQ) { |
| t->t_schedflag &= ~TS_ZONEWAITQ; |
| } |
| } |
| |
| |
| return (rc); |
| } |
| |
| /* |
| * Enforce CPU caps. If got preempted in the user-land, we know that thread does |
| * not hold any kernel locks, so enqueue ourselves on the waitq, if needed. |
| * |
| * CPU Caps are only enforced for user threads. |
| * |
| * Threads flagged with TS_PROJWAITQ are placed on their project wait queues and |
| * threads marked with TS_ZONEWAITQ are placed on their zone wait queue. |
| * |
| * It is possible that by the time we enter cpucaps_enforce() the cap is already |
| * disabled. In this case waitq_enqueue() fails and doesn't enqueue anything. We |
| * still clear TS_PROJWAITQ/TS_ZONEWAITQ flags in this case since they no longer |
| * apply. |
| */ |
| boolean_t |
| cpucaps_enforce(kthread_t *t) |
| { |
| klwp_t *lwp = t->t_lwp; |
| |
| ASSERT(THREAD_LOCK_HELD(t)); |
| |
| if (lwp != NULL && lwp->lwp_state == LWP_USER) { |
| if (t->t_schedflag & TS_PROJWAITQ) { |
| ASSERT(ttoproj(t)->kpj_cpucap != NULL); |
| t->t_schedflag &= ~TS_ANYWAITQ; |
| if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq), |
| t)) { |
| return (B_TRUE); |
| } |
| } |
| if (t->t_schedflag & TS_ZONEWAITQ) { |
| ASSERT(ttozone(t)->zone_cpucap != NULL); |
| t->t_schedflag &= ~TS_ZONEWAITQ; |
| if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq), |
| t)) { |
| return (B_TRUE); |
| } |
| } |
| } |
| |
| /* |
| * The thread is not enqueued on the wait queue. |
| */ |
| return (B_FALSE); |
| } |
| |
| /* |
| * Convert internal cap statistics into values exported by cap kstat. |
| */ |
| static int |
| cap_kstat_update(kstat_t *ksp, int rw) |
| { |
| struct cap_kstat *capsp = &cap_kstat; |
| cpucap_t *cap = ksp->ks_private; |
| clock_t tick_sec = SEC_TO_TICK(1); |
| char *zonename = cap->cap_zone->zone_name; |
| |
| if (rw == KSTAT_WRITE) |
| return (EACCES); |
| |
| capsp->cap_value.value.ui64 = |
| ROUND_SCALE(cap->cap_value, cap_tick_cost); |
| capsp->cap_usage.value.ui64 = |
| ROUND_SCALE(cap->cap_usage, cap_tick_cost); |
| capsp->cap_maxusage.value.ui64 = |
| ROUND_SCALE(cap->cap_maxusage, cap_tick_cost); |
| capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count; |
| capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec); |
| capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec); |
| kstat_named_setstr(&capsp->cap_zonename, zonename); |
| |
| return (0); |
| } |