| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| /* |
| * Copyright (c) 1996, 2010, Oracle and/or its affiliates. All rights reserved. |
| * Copyright (c) 2017 by Delphix. All rights reserved. |
| */ |
| |
| #include <sys/types.h> |
| #include <sys/systm.h> |
| #include <sys/cmn_err.h> |
| #include <sys/cpuvar.h> |
| #include <sys/thread.h> |
| #include <sys/disp.h> |
| #include <sys/kmem.h> |
| #include <sys/debug.h> |
| #include <sys/cpupart.h> |
| #include <sys/pset.h> |
| #include <sys/var.h> |
| #include <sys/cyclic.h> |
| #include <sys/lgrp.h> |
| #include <sys/pghw.h> |
| #include <sys/loadavg.h> |
| #include <sys/class.h> |
| #include <sys/fss.h> |
| #include <sys/pool.h> |
| #include <sys/pool_pset.h> |
| #include <sys/policy.h> |
| |
| /* |
| * Calling pool_lock() protects the pools configuration, which includes |
| * CPU partitions. cpu_lock protects the CPU partition list, and prevents |
| * partitions from being created or destroyed while the lock is held. |
| * The lock ordering with respect to related locks is: |
| * |
| * pool_lock() ---> cpu_lock ---> pidlock --> p_lock |
| * |
| * Blocking memory allocations may be made while holding "pool_lock" |
| * or cpu_lock. |
| */ |
| |
| /* |
| * The cp_default partition is allocated statically, but its lgroup load average |
| * (lpl) list is allocated dynamically after kmem subsystem is initialized. This |
| * saves some memory since the space allocated reflects the actual number of |
| * lgroups supported by the platform. The lgrp facility provides a temporary |
| * space to hold lpl information during system bootstrap. |
| */ |
| |
| cpupart_t *cp_list_head; |
| cpupart_t cp_default; |
| static cpupartid_t cp_id_next; |
| uint_t cp_numparts; |
| uint_t cp_numparts_nonempty; |
| |
| /* |
| * Need to limit total number of partitions to avoid slowing down the |
| * clock code too much. The clock code traverses the list of |
| * partitions and needs to be able to execute in a reasonable amount |
| * of time (less than 1/hz seconds). The maximum is sized based on |
| * max_ncpus so it shouldn't be a problem unless there are large |
| * numbers of empty partitions. |
| */ |
| static uint_t cp_max_numparts; |
| |
| /* |
| * Processor sets and CPU partitions are different but related concepts. |
| * A processor set is a user-level abstraction allowing users to create |
| * sets of CPUs and bind threads exclusively to those sets. A CPU |
| * partition is a kernel dispatcher object consisting of a set of CPUs |
| * and a global dispatch queue. The processor set abstraction is |
| * implemented via a CPU partition, and currently there is a 1-1 |
| * mapping between processor sets and partitions (excluding the default |
| * partition, which is not visible as a processor set). Hence, the |
| * numbering for processor sets and CPU partitions is identical. This |
| * may not always be true in the future, and these macros could become |
| * less trivial if we support e.g. a processor set containing multiple |
| * CPU partitions. |
| */ |
| #define PSTOCP(psid) ((cpupartid_t)((psid) == PS_NONE ? CP_DEFAULT : (psid))) |
| #define CPTOPS(cpid) ((psetid_t)((cpid) == CP_DEFAULT ? PS_NONE : (cpid))) |
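
/*
 * For illustration (a worked example, not compiled code): the default
 * partition is the only special case in either direction; any other ID maps
 * to itself.
 *
 *	PSTOCP(PS_NONE)    == CP_DEFAULT	PSTOCP(1) == 1
 *	CPTOPS(CP_DEFAULT) == PS_NONE		CPTOPS(1) == 1
 */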
| |
| static int cpupart_unbind_threads(cpupart_t *, boolean_t); |
| |
| /* |
| * Find a CPU partition given a processor set ID. |
| */ |
| static cpupart_t * |
| cpupart_find_all(psetid_t psid) |
| { |
| cpupart_t *cp; |
| cpupartid_t cpid = PSTOCP(psid); |
| |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| |
| /* default partition not visible as a processor set */ |
| if (psid == CP_DEFAULT) |
| return (NULL); |
| |
| if (psid == PS_MYID) |
| return (curthread->t_cpupart); |
| |
| cp = cp_list_head; |
| do { |
| if (cp->cp_id == cpid) |
| return (cp); |
| cp = cp->cp_next; |
| } while (cp != cp_list_head); |
| return (NULL); |
| } |
| |
| /* |
| * Find a CPU partition given a processor set ID if the processor set |
| * should be visible from the calling zone. |
| */ |
| cpupart_t * |
| cpupart_find(psetid_t psid) |
| { |
| cpupart_t *cp; |
| |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| cp = cpupart_find_all(psid); |
| if (cp != NULL && !INGLOBALZONE(curproc) && pool_pset_enabled() && |
| zone_pset_get(curproc->p_zone) != CPTOPS(cp->cp_id)) |
| return (NULL); |
| return (cp); |
| } |
| |
| static int |
| cpupart_kstat_update(kstat_t *ksp, int rw) |
| { |
| cpupart_t *cp = (cpupart_t *)ksp->ks_private; |
| cpupart_kstat_t *cpksp = ksp->ks_data; |
| |
| if (rw == KSTAT_WRITE) |
| return (EACCES); |
| |
| cpksp->cpk_updates.value.ui64 = cp->cp_updates; |
| cpksp->cpk_runnable.value.ui64 = cp->cp_nrunnable_cum; |
| cpksp->cpk_waiting.value.ui64 = cp->cp_nwaiting_cum; |
| cpksp->cpk_ncpus.value.ui32 = cp->cp_ncpus; |
| cpksp->cpk_avenrun_1min.value.ui32 = cp->cp_hp_avenrun[0] >> |
| (16 - FSHIFT); |
| cpksp->cpk_avenrun_5min.value.ui32 = cp->cp_hp_avenrun[1] >> |
| (16 - FSHIFT); |
| cpksp->cpk_avenrun_15min.value.ui32 = cp->cp_hp_avenrun[2] >> |
| (16 - FSHIFT); |
| return (0); |
| } |
| |
| static void |
| cpupart_kstat_create(cpupart_t *cp) |
| { |
| kstat_t *ksp; |
| zoneid_t zoneid; |
| |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| |
| /* |
| * We have a bit of a chicken-egg problem since this code will |
| * get called to create the kstats for CP_DEFAULT before the |
| * pools framework gets initialized. We circumvent the problem |
| * by special-casing cp_default. |
| */ |
| if (cp != &cp_default && pool_pset_enabled()) |
| zoneid = GLOBAL_ZONEID; |
| else |
| zoneid = ALL_ZONES; |
| ksp = kstat_create_zone("unix", cp->cp_id, "pset", "misc", |
| KSTAT_TYPE_NAMED, |
| sizeof (cpupart_kstat_t) / sizeof (kstat_named_t), 0, zoneid); |
| if (ksp != NULL) { |
| cpupart_kstat_t *cpksp = ksp->ks_data; |
| |
| kstat_named_init(&cpksp->cpk_updates, "updates", |
| KSTAT_DATA_UINT64); |
| kstat_named_init(&cpksp->cpk_runnable, "runnable", |
| KSTAT_DATA_UINT64); |
| kstat_named_init(&cpksp->cpk_waiting, "waiting", |
| KSTAT_DATA_UINT64); |
| kstat_named_init(&cpksp->cpk_ncpus, "ncpus", |
| KSTAT_DATA_UINT32); |
| kstat_named_init(&cpksp->cpk_avenrun_1min, "avenrun_1min", |
| KSTAT_DATA_UINT32); |
| kstat_named_init(&cpksp->cpk_avenrun_5min, "avenrun_5min", |
| KSTAT_DATA_UINT32); |
| kstat_named_init(&cpksp->cpk_avenrun_15min, "avenrun_15min", |
| KSTAT_DATA_UINT32); |
| |
| ksp->ks_update = cpupart_kstat_update; |
| ksp->ks_private = cp; |
| |
| kstat_install(ksp); |
| } |
| cp->cp_kstat = ksp; |
| } |
| |
| /* |
| * Initialize the cpupart's lgrp partions (lpls) |
| */ |
| static void |
| cpupart_lpl_initialize(cpupart_t *cp) |
| { |
| int i, sz; |
| |
| sz = cp->cp_nlgrploads = lgrp_plat_max_lgrps(); |
| cp->cp_lgrploads = kmem_zalloc(sizeof (lpl_t) * sz, KM_SLEEP); |
| |
| for (i = 0; i < sz; i++) { |
| /* |
| * The last entry of the lpl's resource set is always NULL |
| * by design (to facilitate iteration)...hence the "oversizing" |
| * by 1. |
| */ |
| cp->cp_lgrploads[i].lpl_rset_sz = sz + 1; |
| cp->cp_lgrploads[i].lpl_rset = |
| kmem_zalloc(sizeof (struct lgrp_ld *) * (sz + 1), KM_SLEEP); |
| cp->cp_lgrploads[i].lpl_id2rset = |
| kmem_zalloc(sizeof (int) * (sz + 1), KM_SLEEP); |
| cp->cp_lgrploads[i].lpl_lgrpid = i; |
| } |
| } |
| |
| /* |
| * Teardown the cpupart's lgrp partitions |
| */ |
| static void |
| cpupart_lpl_teardown(cpupart_t *cp) |
| { |
| int i, sz; |
| lpl_t *lpl; |
| |
| for (i = 0; i < cp->cp_nlgrploads; i++) { |
| lpl = &cp->cp_lgrploads[i]; |
| |
| sz = lpl->lpl_rset_sz; |
| kmem_free(lpl->lpl_rset, sizeof (struct lgrp_ld *) * sz); |
| kmem_free(lpl->lpl_id2rset, sizeof (int) * sz); |
| lpl->lpl_rset = NULL; |
| lpl->lpl_id2rset = NULL; |
| } |
| kmem_free(cp->cp_lgrploads, sizeof (lpl_t) * cp->cp_nlgrploads); |
| cp->cp_lgrploads = NULL; |
| } |
| |
| /* |
| * Initialize the default partition and kpreempt disp queue. |
| */ |
| void |
| cpupart_initialize_default(void) |
| { |
| lgrp_id_t i; |
| |
| cp_list_head = &cp_default; |
| cp_default.cp_next = &cp_default; |
| cp_default.cp_prev = &cp_default; |
| cp_default.cp_id = CP_DEFAULT; |
| cp_default.cp_kp_queue.disp_maxrunpri = -1; |
| cp_default.cp_kp_queue.disp_max_unbound_pri = -1; |
| cp_default.cp_kp_queue.disp_cpu = NULL; |
| cp_default.cp_gen = 0; |
| cp_default.cp_loadavg.lg_cur = 0; |
| cp_default.cp_loadavg.lg_len = 0; |
| cp_default.cp_loadavg.lg_total = 0; |
| for (i = 0; i < S_LOADAVG_SZ; i++) { |
| cp_default.cp_loadavg.lg_loads[i] = 0; |
| } |
| DISP_LOCK_INIT(&cp_default.cp_kp_queue.disp_lock); |
| cp_id_next = CP_DEFAULT + 1; |
| cpupart_kstat_create(&cp_default); |
| cp_numparts = 1; |
| if (cp_max_numparts == 0) /* allow for /etc/system tuning */ |
| cp_max_numparts = max_ncpus * 2 + 1; |
| /* |
| * Allocate space for cp_default list of lgrploads |
| */ |
| cpupart_lpl_initialize(&cp_default); |
| |
| /* |
| * The initial lpl topology is created in a special lpl list |
| * lpl_bootstrap. It should be copied to cp_default. |
| * NOTE: lpl_topo_bootstrap() also updates CPU0 cpu_lpl pointer to point |
| * to the correct lpl in the cp_default.cp_lgrploads list. |
| */ |
| lpl_topo_bootstrap(cp_default.cp_lgrploads, |
| cp_default.cp_nlgrploads); |
| |
| |
| cp_default.cp_attr = PSET_NOESCAPE; |
| cp_numparts_nonempty = 1; |
| /* |
| * Set t0's home |
| */ |
| t0.t_lpl = &cp_default.cp_lgrploads[LGRP_ROOTID]; |
| |
| bitset_init(&cp_default.cp_cmt_pgs); |
| bitset_init_fanout(&cp_default.cp_haltset, cp_haltset_fanout); |
| |
| bitset_resize(&cp_default.cp_haltset, max_ncpus); |
| } |
| |
| |
| static int |
| cpupart_move_cpu(cpu_t *cp, cpupart_t *newpp, int forced) |
| { |
| cpupart_t *oldpp; |
| cpu_t *ncp, *newlist; |
| kthread_t *t; |
| int move_threads = 1; |
| lgrp_id_t lgrpid; |
| proc_t *p; |
| int lgrp_diff_lpl; |
| lpl_t *cpu_lpl; |
| int ret; |
| boolean_t unbind_all_threads = (forced != 0); |
| |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| ASSERT(newpp != NULL); |
| |
| oldpp = cp->cpu_part; |
| ASSERT(oldpp != NULL); |
| ASSERT(oldpp->cp_ncpus > 0); |
| |
| if (newpp == oldpp) { |
| /* |
| * Don't need to do anything. |
| */ |
| return (0); |
| } |
| |
| cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_OUT); |
| |
| if (!disp_bound_partition(cp, 0)) { |
| /* |
| * Don't need to move threads if there are no threads in |
| * the partition. Note that threads can't enter the |
| * partition while we're holding cpu_lock. |
| */ |
| move_threads = 0; |
| } else if (oldpp->cp_ncpus == 1) { |
| /* |
| * The last CPU is removed from a partition which has threads |
| * running in it. Some of these threads may be bound to this |
| * CPU. |
| * |
| * Attempt to unbind threads from the CPU and from the processor |
| * set. Note that no threads should be bound to this CPU since |
| * cpupart_move_threads will refuse to move bound threads to |
| * other CPUs. |
| */ |
| (void) cpu_unbind(oldpp->cp_cpulist->cpu_id, B_FALSE); |
| (void) cpupart_unbind_threads(oldpp, B_FALSE); |
| |
| if (!disp_bound_partition(cp, 0)) { |
| /* |
| * No bound threads in this partition any more |
| */ |
| move_threads = 0; |
| } else { |
| /* |
| * There are still threads bound to the partition |
| */ |
| cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN); |
| return (EBUSY); |
| } |
| } |
| |
| /* |
| * If forced flag is set unbind any threads from this CPU. |
| * Otherwise unbind soft-bound threads only. |
| */ |
| if ((ret = cpu_unbind(cp->cpu_id, unbind_all_threads)) != 0) { |
| cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN); |
| return (ret); |
| } |
| |
| /* |
| * Stop further threads weak binding to this cpu. |
| */ |
| cpu_inmotion = cp; |
| membar_enter(); |
| |
| /* |
| * Notify the Processor Groups subsystem that the CPU |
| * will be moving cpu partitions. This is done before |
| * CPUs are paused to provide an opportunity for any |
| * needed memory allocations. |
| */ |
| pg_cpupart_out(cp, oldpp); |
| pg_cpupart_in(cp, newpp); |
| |
| again: |
| if (move_threads) { |
| int loop_count; |
| /* |
| * Check for threads strong or weak bound to this CPU. |
| */ |
| for (loop_count = 0; disp_bound_threads(cp, 0); loop_count++) { |
| if (loop_count >= 5) { |
| cpu_state_change_notify(cp->cpu_id, |
| CPU_CPUPART_IN); |
| pg_cpupart_out(cp, newpp); |
| pg_cpupart_in(cp, oldpp); |
| cpu_inmotion = NULL; |
| return (EBUSY); /* some threads still bound */ |
| } |
| delay(1); |
| } |
| } |
| |
| /* |
| * Before we actually start changing data structures, notify |
| * the cyclic subsystem that we want to move this CPU out of its |
| * partition. |
| */ |
| if (!cyclic_move_out(cp)) { |
| /* |
| * This CPU must be the last CPU in a processor set with |
| * a bound cyclic. |
| */ |
| cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN); |
| pg_cpupart_out(cp, newpp); |
| pg_cpupart_in(cp, oldpp); |
| cpu_inmotion = NULL; |
| return (EBUSY); |
| } |
| |
| pause_cpus(cp, NULL); |
| |
| if (move_threads) { |
| /* |
| * The thread on cpu before the pause thread may have read |
| * cpu_inmotion before we raised the barrier above. Check |
| * again. |
| */ |
| if (disp_bound_threads(cp, 1)) { |
| start_cpus(); |
| goto again; |
| } |
| |
| } |
| |
| /* |
| * Now that CPUs are paused, let the PG subsystem perform |
| * any necessary data structure updates. |
| */ |
| pg_cpupart_move(cp, oldpp, newpp); |
| |
| /* save this cpu's lgroup -- it'll be the same in the new partition */ |
| lgrpid = cp->cpu_lpl->lpl_lgrpid; |
| |
| cpu_lpl = cp->cpu_lpl; |
| /* |
| * let the lgroup framework know cp has left the partition |
| */ |
| lgrp_config(LGRP_CONFIG_CPUPART_DEL, (uintptr_t)cp, lgrpid); |
| |
| /* move out of old partition */ |
| oldpp->cp_ncpus--; |
| if (oldpp->cp_ncpus > 0) { |
| |
| ncp = cp->cpu_prev_part->cpu_next_part = cp->cpu_next_part; |
| cp->cpu_next_part->cpu_prev_part = cp->cpu_prev_part; |
| if (oldpp->cp_cpulist == cp) { |
| oldpp->cp_cpulist = ncp; |
| } |
| } else { |
| ncp = oldpp->cp_cpulist = NULL; |
| cp_numparts_nonempty--; |
| ASSERT(cp_numparts_nonempty != 0); |
| } |
| oldpp->cp_gen++; |
| |
| /* move into new partition */ |
| newlist = newpp->cp_cpulist; |
| if (newlist == NULL) { |
| newpp->cp_cpulist = cp->cpu_next_part = cp->cpu_prev_part = cp; |
| cp_numparts_nonempty++; |
| ASSERT(cp_numparts_nonempty != 0); |
| } else { |
| cp->cpu_next_part = newlist; |
| cp->cpu_prev_part = newlist->cpu_prev_part; |
| newlist->cpu_prev_part->cpu_next_part = cp; |
| newlist->cpu_prev_part = cp; |
| } |
| cp->cpu_part = newpp; |
| newpp->cp_ncpus++; |
| newpp->cp_gen++; |
| |
| ASSERT(bitset_is_null(&newpp->cp_haltset)); |
| ASSERT(bitset_is_null(&oldpp->cp_haltset)); |
| |
| /* |
| * let the lgroup framework know cp has entered the partition |
| */ |
| lgrp_config(LGRP_CONFIG_CPUPART_ADD, (uintptr_t)cp, lgrpid); |
| |
| /* |
| * If necessary, move threads off processor. |
| */ |
| if (move_threads) { |
| ASSERT(ncp != NULL); |
| |
| /* |
| * Walk thru the active process list to look for |
| * threads that need to have a new home lgroup, |
| * or the last CPU they run on is the same CPU |
| * being moved out of the partition. |
| */ |
| |
| for (p = practive; p != NULL; p = p->p_next) { |
| |
| t = p->p_tlist; |
| |
| if (t == NULL) |
| continue; |
| |
| lgrp_diff_lpl = 0; |
| |
| do { |
| |
| ASSERT(t->t_lpl != NULL); |
| |
| /* |
| * Update the count of how many threads are |
| * in this CPU's lgroup but have a different lpl |
| */ |
| |
| if (t->t_lpl != cpu_lpl && |
| t->t_lpl->lpl_lgrpid == lgrpid) |
| lgrp_diff_lpl++; |
| /* |
| * If the lgroup that t is assigned to no |
| * longer has any CPUs in t's partition, |
| * we'll have to choose a new lgroup for t. |
| */ |
| |
| if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid, |
| t->t_cpupart)) { |
| lgrp_move_thread(t, |
| lgrp_choose(t, t->t_cpupart), 0); |
| } |
| |
| /* |
| * make sure lpl points to our own partition |
| */ |
| ASSERT(t->t_lpl >= t->t_cpupart->cp_lgrploads && |
| (t->t_lpl < t->t_cpupart->cp_lgrploads + |
| t->t_cpupart->cp_nlgrploads)); |
| |
| ASSERT(t->t_lpl->lpl_ncpu > 0); |
| |
| /* Update CPU last ran on if it was this CPU */ |
| if (t->t_cpu == cp && t->t_cpupart == oldpp && |
| t->t_bound_cpu != cp) { |
| t->t_cpu = disp_lowpri_cpu(ncp, |
| t->t_lpl, t->t_pri, NULL); |
| } |
| t = t->t_forw; |
| } while (t != p->p_tlist); |
| |
| /* |
| * Didn't find any threads in the same lgroup as this |
| * CPU with a different lpl, so remove the lgroup from |
| * the process lgroup bitmask. |
| */ |
| |
| if (lgrp_diff_lpl) |
| klgrpset_del(p->p_lgrpset, lgrpid); |
| } |
| |
| /* |
| * Walk thread list looking for threads that need to be |
| * rehomed, since there are some threads that are not in |
| * their process's p_tlist. |
| */ |
| |
| t = curthread; |
| |
| do { |
| ASSERT(t != NULL && t->t_lpl != NULL); |
| |
| /* |
| * If the lgroup that t is assigned to no |
| * longer has any CPUs in t's partition, |
| * we'll have to choose a new lgroup for t. |
| * Also, choose best lgroup for home when |
| * thread has specified lgroup affinities, |
| * since there may be an lgroup with more |
| * affinity available after moving CPUs |
| * around. |
| */ |
| if (!LGRP_CPUS_IN_PART(t->t_lpl->lpl_lgrpid, |
| t->t_cpupart) || t->t_lgrp_affinity) { |
| lgrp_move_thread(t, |
| lgrp_choose(t, t->t_cpupart), 1); |
| } |
| |
| /* make sure lpl points to our own partition */ |
| ASSERT((t->t_lpl >= t->t_cpupart->cp_lgrploads) && |
| (t->t_lpl < t->t_cpupart->cp_lgrploads + |
| t->t_cpupart->cp_nlgrploads)); |
| |
| ASSERT(t->t_lpl->lpl_ncpu > 0); |
| |
| /* Update CPU last ran on if it was this CPU */ |
| if (t->t_cpu == cp && t->t_cpupart == oldpp && |
| t->t_bound_cpu != cp) { |
| t->t_cpu = disp_lowpri_cpu(ncp, t->t_lpl, |
| t->t_pri, NULL); |
| } |
| |
| t = t->t_next; |
| } while (t != curthread); |
| |
| /* |
| * Clear off the CPU's run queue, and the kp queue if the |
| * partition is now empty. |
| */ |
| disp_cpu_inactive(cp); |
| |
| /* |
| * Make cp switch to a thread from the new partition. |
| */ |
| cp->cpu_runrun = 1; |
| cp->cpu_kprunrun = 1; |
| } |
| |
| cpu_inmotion = NULL; |
| start_cpus(); |
| |
| /* |
| * Let anyone interested know that cpu has been added to the set. |
| */ |
| cpu_state_change_notify(cp->cpu_id, CPU_CPUPART_IN); |
| |
| /* |
| * Now let the cyclic subsystem know that it can reshuffle cyclics |
| * bound to the new processor set. |
| */ |
| cyclic_move_in(cp); |
| |
| return (0); |
| } |
| |
| /* |
| * Check if thread can be moved to a new cpu partition. Called by |
| * cpupart_move_thread() and pset_bind_start(). |
| */ |
| int |
| cpupart_movable_thread(kthread_id_t tp, cpupart_t *cp, int ignore) |
| { |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock)); |
| ASSERT(cp != NULL); |
| ASSERT(THREAD_LOCK_HELD(tp)); |
| |
| /* |
| * CPU-bound threads can't be moved. |
| */ |
| if (!ignore) { |
| cpu_t *boundcpu = tp->t_bound_cpu ? tp->t_bound_cpu : |
| tp->t_weakbound_cpu; |
| if (boundcpu != NULL && boundcpu->cpu_part != cp) |
| return (EBUSY); |
| } |
| |
| if (tp->t_cid == sysdccid) { |
| return (EINVAL); /* For now, sysdc threads can't move */ |
| } |
| |
| return (0); |
| } |
| |
| /* |
| * Move thread to new partition. If ignore is non-zero, then CPU |
| * bindings should be ignored (this is used when destroying a |
| * partition). |
| */ |
| static int |
| cpupart_move_thread(kthread_id_t tp, cpupart_t *newpp, int ignore, |
| void *projbuf, void *zonebuf) |
| { |
| cpupart_t *oldpp = tp->t_cpupart; |
| int ret; |
| |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| ASSERT(MUTEX_HELD(&pidlock)); |
| ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock)); |
| ASSERT(newpp != NULL); |
| |
| if (newpp->cp_cpulist == NULL) |
| return (EINVAL); |
| |
| /* |
| * Check for errors first. |
| */ |
| thread_lock(tp); |
| if ((ret = cpupart_movable_thread(tp, newpp, ignore)) != 0) { |
| thread_unlock(tp); |
| return (ret); |
| } |
| |
| /* move the thread */ |
| if (oldpp != newpp) { |
| /* |
| * Make the thread switch to the new partition. |
| */ |
| tp->t_cpupart = newpp; |
| ASSERT(tp->t_lpl != NULL); |
| /* |
| * Leave the thread on the same lgroup if possible; otherwise |
| * choose a new lgroup for it. In either case, update its |
| * t_lpl. |
| */ |
| if (LGRP_CPUS_IN_PART(tp->t_lpl->lpl_lgrpid, newpp) && |
| tp->t_lgrp_affinity == NULL) { |
| /* |
| * The thread's lgroup has CPUs in the thread's new |
| * partition, so the thread can stay assigned to the |
| * same lgroup. Update its t_lpl to point to the |
| * lpl_t for its lgroup in its new partition. |
| */ |
			lgrp_move_thread(tp, &tp->t_cpupart->
			    cp_lgrploads[tp->t_lpl->lpl_lgrpid], 1);
| } else { |
| /* |
| * The thread's lgroup has no cpus in its new |
| * partition or it has specified lgroup affinities, |
| * so choose the best lgroup for the thread and |
| * assign it to that lgroup. |
| */ |
| lgrp_move_thread(tp, lgrp_choose(tp, tp->t_cpupart), |
| 1); |
| } |
| /* |
| * make sure lpl points to our own partition |
| */ |
| ASSERT((tp->t_lpl >= tp->t_cpupart->cp_lgrploads) && |
| (tp->t_lpl < tp->t_cpupart->cp_lgrploads + |
| tp->t_cpupart->cp_nlgrploads)); |
| |
| ASSERT(tp->t_lpl->lpl_ncpu > 0); |
| |
| if (tp->t_state == TS_ONPROC) { |
| cpu_surrender(tp); |
| } else if (tp->t_state == TS_RUN) { |
| (void) dispdeq(tp); |
| setbackdq(tp); |
| } |
| } |
| |
| /* |
| * Our binding has changed; set TP_CHANGEBIND. |
| */ |
| tp->t_proc_flag |= TP_CHANGEBIND; |
| aston(tp); |
| |
| thread_unlock(tp); |
| fss_changepset(tp, newpp, projbuf, zonebuf); |
| |
| return (0); /* success */ |
| } |
| |
| |
| /* |
| * This function binds a thread to a partition. Must be called with the |
| * p_lock of the containing process held (to keep the thread from going |
| * away), and thus also with cpu_lock held (since cpu_lock must be |
| * acquired before p_lock). If ignore is non-zero, then CPU bindings |
| * should be ignored (this is used when destroying a partition). |
| */ |
| int |
| cpupart_bind_thread(kthread_id_t tp, psetid_t psid, int ignore, void *projbuf, |
| void *zonebuf) |
| { |
| cpupart_t *newpp; |
| |
| ASSERT(pool_lock_held()); |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| ASSERT(MUTEX_HELD(&pidlock)); |
| ASSERT(MUTEX_HELD(&ttoproc(tp)->p_lock)); |
| |
| if (psid == PS_NONE) |
| newpp = &cp_default; |
| else { |
| newpp = cpupart_find(psid); |
| if (newpp == NULL) { |
| return (EINVAL); |
| } |
| } |
| return (cpupart_move_thread(tp, newpp, ignore, projbuf, zonebuf)); |
| } |
| |
| |
| /* |
| * Create a new partition. On MP systems, this also allocates a |
| * kpreempt disp queue for that partition. |
| */ |
| int |
| cpupart_create(psetid_t *psid) |
| { |
| cpupart_t *pp; |
| |
| ASSERT(pool_lock_held()); |
| |
| pp = kmem_zalloc(sizeof (cpupart_t), KM_SLEEP); |
| |
| mutex_enter(&cpu_lock); |
| if (cp_numparts == cp_max_numparts) { |
| mutex_exit(&cpu_lock); |
| kmem_free(pp, sizeof (cpupart_t)); |
| return (ENOMEM); |
| } |
| cp_numparts++; |
| /* find the next free partition ID */ |
| while (cpupart_find(CPTOPS(cp_id_next)) != NULL) |
| cp_id_next++; |
| pp->cp_id = cp_id_next++; |
| pp->cp_ncpus = 0; |
| pp->cp_cpulist = NULL; |
| pp->cp_attr = 0; |
| klgrpset_clear(pp->cp_lgrpset); |
| pp->cp_kp_queue.disp_maxrunpri = -1; |
| pp->cp_kp_queue.disp_max_unbound_pri = -1; |
| pp->cp_kp_queue.disp_cpu = NULL; |
| pp->cp_gen = 0; |
| DISP_LOCK_INIT(&pp->cp_kp_queue.disp_lock); |
| *psid = CPTOPS(pp->cp_id); |
| disp_kp_alloc(&pp->cp_kp_queue, v.v_nglobpris); |
| cpupart_kstat_create(pp); |
| cpupart_lpl_initialize(pp); |
| |
| bitset_init(&pp->cp_cmt_pgs); |
| |
| /* |
| * Initialize and size the partition's bitset of halted CPUs. |
| */ |
| bitset_init_fanout(&pp->cp_haltset, cp_haltset_fanout); |
| bitset_resize(&pp->cp_haltset, max_ncpus); |
| |
| /* |
| * Pause all CPUs while changing the partition list, to make sure |
| * the clock thread (which traverses the list without holding |
| * cpu_lock) isn't running. |
| */ |
| pause_cpus(NULL, NULL); |
| pp->cp_next = cp_list_head; |
| pp->cp_prev = cp_list_head->cp_prev; |
| cp_list_head->cp_prev->cp_next = pp; |
| cp_list_head->cp_prev = pp; |
| start_cpus(); |
| mutex_exit(&cpu_lock); |
| |
| return (0); |
| } |
| |
| /* |
| * Move threads from specified partition to cp_default. If `force' is specified, |
| * move all threads, otherwise move only soft-bound threads. |
| */ |
| static int |
| cpupart_unbind_threads(cpupart_t *pp, boolean_t unbind_all) |
| { |
| void *projbuf, *zonebuf; |
| kthread_t *t; |
| proc_t *p; |
| int err = 0; |
| psetid_t psid = pp->cp_id; |
| |
| ASSERT(pool_lock_held()); |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| |
| if (pp == NULL || pp == &cp_default) { |
| return (EINVAL); |
| } |
| |
| /* |
| * Pre-allocate enough buffers for FSS for all active projects and |
| * for all active zones on the system. Unused buffers will be |
| * freed later by fss_freebuf(). |
| */ |
| projbuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_PROJ); |
| zonebuf = fss_allocbuf(FSS_NPROJ_BUF, FSS_ALLOC_ZONE); |
| |
| mutex_enter(&pidlock); |
| t = curthread; |
| do { |
| if (t->t_bind_pset == psid) { |
| again: p = ttoproc(t); |
| mutex_enter(&p->p_lock); |
| if (ttoproc(t) != p) { |
| /* |
| * lwp_exit has changed this thread's process |
| * pointer before we grabbed its p_lock. |
| */ |
| mutex_exit(&p->p_lock); |
| goto again; |
| } |
| |
| /* |
| * Can only unbind threads which have revocable binding |
| * unless force unbinding requested. |
| */ |
| if (unbind_all || TB_PSET_IS_SOFT(t)) { |
| err = cpupart_bind_thread(t, PS_NONE, 1, |
| projbuf, zonebuf); |
| if (err) { |
| mutex_exit(&p->p_lock); |
| mutex_exit(&pidlock); |
| fss_freebuf(projbuf, FSS_ALLOC_PROJ); |
| fss_freebuf(zonebuf, FSS_ALLOC_ZONE); |
| return (err); |
| } |
| t->t_bind_pset = PS_NONE; |
| } |
| mutex_exit(&p->p_lock); |
| } |
| t = t->t_next; |
| } while (t != curthread); |
| |
| mutex_exit(&pidlock); |
| fss_freebuf(projbuf, FSS_ALLOC_PROJ); |
| fss_freebuf(zonebuf, FSS_ALLOC_ZONE); |
| return (err); |
| } |
| |
| /* |
| * Destroy a partition. |
| */ |
| int |
| cpupart_destroy(psetid_t psid) |
| { |
| cpu_t *cp, *first_cp; |
| cpupart_t *pp, *newpp; |
| int err = 0; |
| |
| ASSERT(pool_lock_held()); |
| mutex_enter(&cpu_lock); |
| |
| pp = cpupart_find(psid); |
| if (pp == NULL || pp == &cp_default) { |
| mutex_exit(&cpu_lock); |
| return (EINVAL); |
| } |
| |
| /* |
| * Unbind all the threads currently bound to the partition. |
| */ |
| err = cpupart_unbind_threads(pp, B_TRUE); |
| if (err) { |
| mutex_exit(&cpu_lock); |
| return (err); |
| } |
| |
| newpp = &cp_default; |
| while ((cp = pp->cp_cpulist) != NULL) { |
		if ((err = cpupart_move_cpu(cp, newpp, 0)) != 0) {
| mutex_exit(&cpu_lock); |
| return (err); |
| } |
| } |
| |
| ASSERT(bitset_is_null(&pp->cp_cmt_pgs)); |
| ASSERT(bitset_is_null(&pp->cp_haltset)); |
| |
| /* |
| * Teardown the partition's group of active CMT PGs and halted |
| * CPUs now that they have all left. |
| */ |
| bitset_fini(&pp->cp_cmt_pgs); |
| bitset_fini(&pp->cp_haltset); |
| |
| /* |
| * Reset the pointers in any offline processors so they won't |
| * try to rejoin the destroyed partition when they're turned |
| * online. |
| */ |
| first_cp = cp = CPU; |
| do { |
| if (cp->cpu_part == pp) { |
| ASSERT(cp->cpu_flags & CPU_OFFLINE); |
| cp->cpu_part = newpp; |
| } |
| cp = cp->cpu_next; |
| } while (cp != first_cp); |
| |
| /* |
| * Pause all CPUs while changing the partition list, to make sure |
| * the clock thread (which traverses the list without holding |
| * cpu_lock) isn't running. |
| */ |
| pause_cpus(NULL, NULL); |
| pp->cp_prev->cp_next = pp->cp_next; |
| pp->cp_next->cp_prev = pp->cp_prev; |
| if (cp_list_head == pp) |
| cp_list_head = pp->cp_next; |
| start_cpus(); |
| |
| if (cp_id_next > pp->cp_id) |
| cp_id_next = pp->cp_id; |
| |
| if (pp->cp_kstat) |
| kstat_delete(pp->cp_kstat); |
| |
| cp_numparts--; |
| |
| disp_kp_free(&pp->cp_kp_queue); |
| |
| cpupart_lpl_teardown(pp); |
| |
| kmem_free(pp, sizeof (cpupart_t)); |
| mutex_exit(&cpu_lock); |
| |
| return (err); |
| } |
| |
| |
| /* |
| * Return the ID of the partition to which the specified processor belongs. |
| */ |
| psetid_t |
| cpupart_query_cpu(cpu_t *cp) |
| { |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| |
| return (CPTOPS(cp->cpu_part->cp_id)); |
| } |
| |
| |
| /* |
| * Attach a processor to an existing partition. |
| */ |
| int |
| cpupart_attach_cpu(psetid_t psid, cpu_t *cp, int forced) |
| { |
| cpupart_t *pp; |
| int err; |
| |
| ASSERT(pool_lock_held()); |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| |
| pp = cpupart_find(psid); |
| if (pp == NULL) |
| return (EINVAL); |
| if (cp->cpu_flags & CPU_OFFLINE) |
| return (EINVAL); |
| |
| err = cpupart_move_cpu(cp, pp, forced); |
| return (err); |
| } |
| |
| /* |
| * Get a list of cpus belonging to the partition. If numcpus is NULL, |
| * this just checks for a valid partition. If numcpus is non-NULL but |
| * cpulist is NULL, the current number of cpus is stored in *numcpus. |
| * If both are non-NULL, the current number of cpus is stored in *numcpus, |
| * and a list of those cpus up to the size originally in *numcpus is |
| * stored in cpulist[]. Also, store the processor set id in *psid. |
| * This is useful in case the processor set id passed in was PS_MYID. |
| */ |
| int |
| cpupart_get_cpus(psetid_t *psid, processorid_t *cpulist, uint_t *numcpus) |
| { |
| cpupart_t *pp; |
| uint_t ncpus; |
| cpu_t *c; |
| int i; |
| |
| mutex_enter(&cpu_lock); |
| pp = cpupart_find(*psid); |
| if (pp == NULL) { |
| mutex_exit(&cpu_lock); |
| return (EINVAL); |
| } |
| *psid = CPTOPS(pp->cp_id); |
| ncpus = pp->cp_ncpus; |
| if (numcpus) { |
| if (ncpus > *numcpus) { |
| /* |
| * Only copy as many cpus as were passed in, but |
| * pass back the real number. |
| */ |
| uint_t t = ncpus; |
| ncpus = *numcpus; |
| *numcpus = t; |
| } else |
| *numcpus = ncpus; |
| |
| if (cpulist) { |
| c = pp->cp_cpulist; |
| for (i = 0; i < ncpus; i++) { |
| ASSERT(c != NULL); |
| cpulist[i] = c->cpu_id; |
| c = c->cpu_next_part; |
| } |
| } |
| } |
| mutex_exit(&cpu_lock); |
| return (0); |
| } |
| |
| /* |
| * Reallocate kpreempt queues for each CPU partition. Called from |
| * disp_setup when a new scheduling class is loaded that increases the |
| * number of priorities in the system. |
| */ |
| void |
| cpupart_kpqalloc(pri_t npri) |
| { |
| cpupart_t *cpp; |
| |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| cpp = cp_list_head; |
| do { |
| disp_kp_alloc(&cpp->cp_kp_queue, npri); |
| cpp = cpp->cp_next; |
| } while (cpp != cp_list_head); |
| } |
| |
| int |
| cpupart_get_loadavg(psetid_t psid, int *buf, int nelem) |
| { |
| cpupart_t *cp; |
| int i; |
| |
| ASSERT(nelem >= 0); |
| ASSERT(nelem <= LOADAVG_NSTATS); |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| |
| cp = cpupart_find(psid); |
| if (cp == NULL) |
| return (EINVAL); |
| for (i = 0; i < nelem; i++) |
| buf[i] = cp->cp_hp_avenrun[i] >> (16 - FSHIFT); |
| |
| return (0); |
| } |
| |
| |
| uint_t |
| cpupart_list(psetid_t *list, uint_t nelem, int flag) |
| { |
| uint_t numpart = 0; |
| cpupart_t *cp; |
| |
| ASSERT(MUTEX_HELD(&cpu_lock)); |
| ASSERT(flag == CP_ALL || flag == CP_NONEMPTY); |
| |
| if (list != NULL) { |
| cp = cp_list_head; |
| do { |
| if (((flag == CP_ALL) && (cp != &cp_default)) || |
| ((flag == CP_NONEMPTY) && (cp->cp_ncpus != 0))) { |
| if (numpart == nelem) |
| break; |
| list[numpart++] = CPTOPS(cp->cp_id); |
| } |
| cp = cp->cp_next; |
| } while (cp != cp_list_head); |
| } |
| |
| ASSERT(numpart < cp_numparts); |
| |
| if (flag == CP_ALL) |
| numpart = cp_numparts - 1; /* leave out default partition */ |
| else if (flag == CP_NONEMPTY) |
| numpart = cp_numparts_nonempty; |
| |
| return (numpart); |
| } |
| |
| int |
| cpupart_setattr(psetid_t psid, uint_t attr) |
| { |
| cpupart_t *cp; |
| |
| ASSERT(pool_lock_held()); |
| |
| mutex_enter(&cpu_lock); |
| if ((cp = cpupart_find(psid)) == NULL) { |
| mutex_exit(&cpu_lock); |
| return (EINVAL); |
| } |
| /* |
| * PSET_NOESCAPE attribute for default cpu partition is always set |
| */ |
| if (cp == &cp_default && !(attr & PSET_NOESCAPE)) { |
| mutex_exit(&cpu_lock); |
| return (EINVAL); |
| } |
| cp->cp_attr = attr; |
| mutex_exit(&cpu_lock); |
| return (0); |
| } |
| |
| int |
| cpupart_getattr(psetid_t psid, uint_t *attrp) |
| { |
| cpupart_t *cp; |
| |
| mutex_enter(&cpu_lock); |
| if ((cp = cpupart_find(psid)) == NULL) { |
| mutex_exit(&cpu_lock); |
| return (EINVAL); |
| } |
| *attrp = cp->cp_attr; |
| mutex_exit(&cpu_lock); |
| return (0); |
| } |