| /* |
| * CDDL HEADER START |
| * |
| * The contents of this file are subject to the terms of the |
| * Common Development and Distribution License (the "License"). |
| * You may not use this file except in compliance with the License. |
| * |
| * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE |
| * or http://www.opensolaris.org/os/licensing. |
| * See the License for the specific language governing permissions |
| * and limitations under the License. |
| * |
| * When distributing Covered Code, include this CDDL HEADER in each |
| * file and include the License file at usr/src/OPENSOLARIS.LICENSE. |
| * If applicable, add the following below this CDDL HEADER, with the |
| * fields enclosed by brackets "[]" replaced with your own identifying |
| * information: Portions Copyright [yyyy] [name of copyright owner] |
| * |
| * CDDL HEADER END |
| */ |
| |
| /* |
| * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. |
| * Copyright 2015, Joyent Inc. All rights reserved. |
| * Copyright (c) 2016 by Delphix. All rights reserved. |
| * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. |
| */ |
| |
| /* |
| * Zones |
| * |
| * A zone is a named collection of processes, namespace constraints, |
| * and other system resources which comprise a secure and manageable |
| * application containment facility. |
| * |
| * Zones (represented by the reference counted zone_t) are tracked in |
| * the kernel in the zonehash. Elsewhere in the kernel, Zone IDs |
| * (zoneid_t) are used to track zone association. Zone IDs are |
| * dynamically generated when the zone is created; if a persistent |
| * identifier is needed (core files, accounting logs, audit trail, |
| * etc.), the zone name should be used. |
| * |
| * |
| * Global Zone: |
| * |
| * The global zone (zoneid 0) is automatically associated with all |
| * system resources that have not been bound to a user-created zone. |
| * This means that even systems where zones are not in active use |
| * have a global zone, and all processes, mounts, etc. are |
| * associated with that zone. The global zone is generally |
| * unconstrained in terms of privileges and access, though the usual |
| * credential and privilege based restrictions apply. |
| * |
| * |
| * Zone States: |
| * |
| * The states a zone may be in, and the transitions between them, are |
| * as follows: |
| * |
| * ZONE_IS_UNINITIALIZED: primordial state for a zone. The partially |
| * initialized zone is added to the list of active zones on the system but |
| * isn't accessible. |
| * |
| * ZONE_IS_INITIALIZED: Initialization is complete except that the ZSD |
| * constructor callbacks have not yet run. It is not possible to enter |
| * the zone, but attributes can be retrieved. |
| * |
| * ZONE_IS_READY: zsched (the kernel dummy process for a zone) is |
| * ready. The zone is made visible after the ZSD constructor callbacks are |
| * executed. A zone remains in this state until it transitions into |
| * the ZONE_IS_BOOTING state as a result of a call to zone_boot(). |
| * |
| * ZONE_IS_BOOTING: in this short-lived state, zsched attempts to start |
| * init. Should that fail, the zone proceeds to the ZONE_IS_SHUTTING_DOWN |
| * state. |
| * |
| * ZONE_IS_RUNNING: The zone is open for business: zsched has |
| * successfully started init. A zone remains in this state until |
| * zone_shutdown() is called. |
| * |
| * ZONE_IS_SHUTTING_DOWN: zone_shutdown() has been called, the system is |
| * killing all processes running in the zone. The zone remains |
| * in this state until there are no more user processes running in the zone. |
| * zone_create(), zone_enter(), and zone_destroy() on this zone will fail. |
| * Since zone_shutdown() is restartable, it may be called successfully |
| * multiple times for the same zone_t. Setting of the zone's state to |
| * ZONE_IS_SHUTTING_DOWN is synchronized with mounts, so VFS_MOUNT() may check |
| * the zone's status without worrying about it being a moving target. |
| * |
| * ZONE_IS_EMPTY: zone_shutdown() has been called, and there |
| * are no more user processes in the zone. The zone remains in this |
| * state until there are no more kernel threads associated with the |
| * zone. zone_create(), zone_enter(), and zone_destroy() on this zone will |
| * fail. |
| * |
| * ZONE_IS_DOWN: All kernel threads doing work on behalf of the zone |
| * have exited. zone_shutdown() returns. Henceforth it is not possible to |
| * join the zone or create kernel threads therein. |
| * |
| * ZONE_IS_DYING: zone_destroy() has been called on the zone; zone |
| * remains in this state until zsched exits. Calls to zone_find_by_*() |
| * return NULL from now on. |
| * |
| * ZONE_IS_DEAD: zsched has exited (zone_ntasks == 0). There are no |
| * processes or threads doing work on behalf of the zone. The zone is |
| * removed from the list of active zones. zone_destroy() returns, and |
| * the zone can be recreated. |
| * |
| * ZONE_IS_FREE (internal state): zone_ref goes to 0, ZSD destructor |
| * callbacks are executed, and all memory associated with the zone is |
| * freed. |
| * |
| * Threads can wait for the zone to enter a requested state by using |
| * zone_status_wait() or zone_status_timedwait() with the desired |
| * state passed in as an argument. Zone state transitions are |
| * uni-directional; it is not possible to move back to an earlier state. |
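| * |
| * As a minimal sketch (not copied from any particular caller; "zoneid" |
| * here is just a placeholder), a thread can take a hold on a zone via |
| * zone_find_by_id(), which returns the zone held, and then block until |
| * the zone has finished booting: |
| * |
| *     zone_t *zone = zone_find_by_id(zoneid); |
| * |
| *     if (zone != NULL) { |
| *         zone_status_wait(zone, ZONE_IS_RUNNING); |
| *         ... |
| *         zone_rele(zone); |
| *     } |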
| * |
| * |
| * Zone-Specific Data: |
| * |
| * Subsystems needing to maintain zone-specific data can store that |
| * data using the ZSD mechanism. This provides a zone-specific data |
| * store, similar to thread-specific data (see pthread_getspecific(3C) |
| * or the TSD code in uts/common/disp/thread.c). Also, ZSD can be used |
| * to register callbacks to be invoked when a zone is created, shut |
| * down, or destroyed. This can be used to initialize zone-specific |
| * data for new zones and to clean up when zones go away. |
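| * |
| * As an illustrative sketch (the subsystem, "my_key" and "my_data_t" are |
| * hypothetical, not code from this file), a subsystem typically registers |
| * a key once, lets the create callback allocate per-zone state, and looks |
| * that state up later with zone_getspecific(): |
| * |
| *     static zone_key_t my_key; |
| * |
| *     static void * |
| *     my_zone_create(zoneid_t zoneid) |
| *     { |
| *         return (kmem_zalloc(sizeof (my_data_t), KM_SLEEP)); |
| *     } |
| * |
| *     static void |
| *     my_zone_destroy(zoneid_t zoneid, void *data) |
| *     { |
| *         kmem_free(data, sizeof (my_data_t)); |
| *     } |
| * |
| *     zone_key_create(&my_key, my_zone_create, NULL, my_zone_destroy); |
| *     ... |
| *     my_data_t *mdp = zone_getspecific(my_key, curproc->p_zone); |
| *     ... |
| *     (void) zone_key_delete(my_key); |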
| * |
| * |
| * Data Structures: |
| * |
| * The per-zone structure (zone_t) is reference counted, and freed |
| * when all references are released. zone_hold and zone_rele can be |
| * used to adjust the reference count. In addition, reference counts |
| * associated with the cred_t structure are tracked separately using |
| * zone_cred_hold and zone_cred_rele. |
| * |
| * Pointers to active zone_t's are stored in two hash tables; one |
| * for searching by id, the other for searching by name. Lookups |
| * can be performed on either basis, using zone_find_by_id and |
| * zone_find_by_name. Both return zone_t pointers with the zone |
| * held, so zone_rele should be called when the pointer is no longer |
| * needed. Zones can also be searched by path; zone_find_by_path |
| * returns the zone with which a path name is associated (global |
| * zone if the path is not within some other zone's file system |
| * hierarchy). This currently requires iterating through each zone, |
| * so it is slower than an id or name search via a hash table. |
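| * |
| * For example (a minimal sketch; "my-zone" is just a placeholder name), |
| * a lookup returns the zone held, and the hold must be dropped once the |
| * pointer is no longer needed: |
| * |
| *     zone_t *zone; |
| * |
| *     if ((zone = zone_find_by_name("my-zone")) != NULL) { |
| *         ... examine or use the zone ... |
| *         zone_rele(zone); |
| *     } |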
| * |
| * |
| * Locking: |
| * |
| * zonehash_lock: This is a top-level global lock used to protect the |
| * zone hash tables and lists. Zones cannot be created or destroyed |
| * while this lock is held. |
| * zone_status_lock: This is a global lock protecting zone state. |
| * Zones cannot change state while this lock is held. It also |
| * protects the list of kernel threads associated with a zone. |
| * zone_lock: This is a per-zone lock used to protect several fields of |
| * the zone_t (see <sys/zone.h> for details). In addition, holding |
| * this lock means that the zone cannot go away. |
| * zone_nlwps_lock: This is a per-zone lock used to protect the fields |
| * related to the zone.max-lwps rctl. |
| * zone_mem_lock: This is a per-zone lock used to protect the fields |
| * related to the zone.max-locked-memory and zone.max-swap rctls. |
| * zone_rctl_lock: This is a per-zone lock used to protect other rctls, |
| * currently just max_lofi |
| * zsd_key_lock: This is a global lock protecting the key state for ZSD. |
| * zone_deathrow_lock: This is a global lock protecting the "deathrow" |
| * list (a list of zones in the ZONE_IS_DEAD state). |
| * |
| * Ordering requirements: |
| * pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock --> |
| * zone_lock --> zsd_key_lock --> pidlock --> p_lock |
| * |
| * When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is: |
| * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock |
| * zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_nlwps_lock |
| * |
| * Blocking memory allocations are permitted while holding any of the |
| * zone locks. |
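| * |
| * For example (a sketch), code that needs both zonehash_lock and a |
| * zone's zone_lock must acquire them in the order given above: |
| * |
| *     mutex_enter(&zonehash_lock); |
| *     mutex_enter(&zone->zone_lock); |
| *     ... |
| *     mutex_exit(&zone->zone_lock); |
| *     mutex_exit(&zonehash_lock); |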
| * |
| * |
| * System Call Interface: |
| * |
| * The zone subsystem can be managed and queried from user level with |
| * the following system calls (all subcodes of the primary "zone" |
| * system call): |
| * - zone_create: creates a zone with selected attributes (name, |
| * root path, privileges, resource controls, ZFS datasets) |
| * - zone_enter: allows the current process to enter a zone |
| * - zone_getattr: reports attributes of a zone |
| * - zone_setattr: sets attributes of a zone |
| * - zone_boot: sets 'init' running for the zone |
| * - zone_list: lists all zones active in the system |
| * - zone_lookup: looks up a zone ID based on its name |
| * - zone_shutdown: initiates shutdown process (see states above) |
| * - zone_destroy: completes shutdown process (see states above) |
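| * |
| * These are normally reached from user level through the libc wrappers |
| * rather than invoked directly. As a hedged userland illustration (this |
| * is application code, not part of this file, and assumes ZONENAME_MAX |
| * is visible via <zone.h>), a process can query its own zone: |
| * |
| *     #include <zone.h> |
| *     #include <stdio.h> |
| * |
| *     int |
| *     main(void) |
| *     { |
| *         char name[ZONENAME_MAX]; |
| *         zoneid_t zid = getzoneid(); |
| * |
| *         if (getzonenamebyid(zid, name, sizeof (name)) < 0) |
| *             return (1); |
| *         (void) printf("running in zone %d (%s)\n", (int)zid, name); |
| *         return (0); |
| *     } |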
| * |
| */ |
| |
| #include <sys/priv_impl.h> |
| #include <sys/cred.h> |
| #include <c2/audit.h> |
| #include <sys/debug.h> |
| #include <sys/file.h> |
| #include <sys/kmem.h> |
| #include <sys/kstat.h> |
| #include <sys/mutex.h> |
| #include <sys/note.h> |
| #include <sys/pathname.h> |
| #include <sys/proc.h> |
| #include <sys/project.h> |
| #include <sys/sysevent.h> |
| #include <sys/task.h> |
| #include <sys/systm.h> |
| #include <sys/types.h> |
| #include <sys/utsname.h> |
| #include <sys/vnode.h> |
| #include <sys/vfs.h> |
| #include <sys/systeminfo.h> |
| #include <sys/policy.h> |
| #include <sys/cred_impl.h> |
| #include <sys/contract_impl.h> |
| #include <sys/contract/process_impl.h> |
| #include <sys/class.h> |
| #include <sys/pool.h> |
| #include <sys/pool_pset.h> |
| #include <sys/pset.h> |
| #include <sys/strlog.h> |
| #include <sys/sysmacros.h> |
| #include <sys/callb.h> |
| #include <sys/vmparam.h> |
| #include <sys/corectl.h> |
| #include <sys/ipc_impl.h> |
| #include <sys/klpd.h> |
| |
| #include <sys/door.h> |
| #include <sys/cpuvar.h> |
| #include <sys/sdt.h> |
| |
| #include <sys/uadmin.h> |
| #include <sys/session.h> |
| #include <sys/cmn_err.h> |
| #include <sys/modhash.h> |
| #include <sys/sunddi.h> |
| #include <sys/nvpair.h> |
| #include <sys/rctl.h> |
| #include <sys/fss.h> |
| #include <sys/brand.h> |
| #include <sys/zone.h> |
| #include <net/if.h> |
| #include <sys/cpucaps.h> |
| #include <vm/seg.h> |
| #include <sys/mac.h> |
| |
| /* |
| * This constant specifies the number of seconds that threads waiting for |
| * subsystems to release a zone's general-purpose references will wait before |
| * they log the zone's reference counts. The constant's value shouldn't |
| * be so small that reference counts are unnecessarily reported for zones |
| * whose references are slowly released. On the other hand, it shouldn't be so |
| * large that users reboot their systems out of frustration over hung zones |
| * before the system logs the zones' reference counts. |
| */ |
| #define ZONE_DESTROY_TIMEOUT_SECS 60 |
| |
| /* List of data link IDs which are accessible from the zone */ |
| typedef struct zone_dl { |
| datalink_id_t zdl_id; |
| nvlist_t *zdl_net; |
| list_node_t zdl_linkage; |
| } zone_dl_t; |
| |
| /* |
| * cv used to signal that all references to the zone have been released. This |
| * needs to be global since there may be multiple waiters, and the first to |
| * wake up will free the zone_t, hence we cannot use zone->zone_cv. |
| */ |
| static kcondvar_t zone_destroy_cv; |
| /* |
| * Lock used to serialize access to zone_cv. This could have been per-zone, |
| * but then we'd need another lock for zone_destroy_cv, and why bother? |
| */ |
| static kmutex_t zone_status_lock; |
| |
| /* |
| * ZSD-related global variables. |
| */ |
| static kmutex_t zsd_key_lock; /* protects the following two */ |
| /* |
| * The next caller of zone_key_create() will be assigned a key of ++zsd_keyval. |
| */ |
| static zone_key_t zsd_keyval = 0; |
| /* |
| * Global list of registered keys. We use this when a new zone is created. |
| */ |
| static list_t zsd_registered_keys; |
| |
| int zone_hash_size = 256; |
| static mod_hash_t *zonehashbyname, *zonehashbyid, *zonehashbylabel; |
| static kmutex_t zonehash_lock; |
| static uint_t zonecount; |
| static id_space_t *zoneid_space; |
| |
| /* |
| * The global zone (aka zone0) is the all-seeing, all-knowing zone in which the |
| * kernel proper runs, and which manages all other zones. |
| * |
| * Although not declared as static, the variable "zone0" should not be used |
| * except by code that needs to reference the global zone early in boot, |
| * before it is fully initialized. All other consumers should use |
| * 'global_zone'. |
| */ |
| zone_t zone0; |
| zone_t *global_zone = NULL; /* Set when the global zone is initialized */ |
| |
| /* |
| * List of active zones, protected by zonehash_lock. |
| */ |
| static list_t zone_active; |
| |
| /* |
| * List of destroyed zones that still have outstanding cred references. |
| * Used for debugging. Uses a separate lock to avoid lock ordering |
| * problems in zone_free. |
| */ |
| static list_t zone_deathrow; |
| static kmutex_t zone_deathrow_lock; |
| |
| /* number of zones is limited by virtual interface limit in IP */ |
| uint_t maxzones = 8192; |
| |
| /* Event channel used to send zone state change notifications */ |
| evchan_t *zone_event_chan; |
| |
| /* |
| * This table holds the mapping from kernel zone states to |
| * states visible in the state notification API. |
| * The idea is that we only expose "obvious" states and |
| * do not expose states which are just implementation details. |
| */ |
| const char *zone_status_table[] = { |
| ZONE_EVENT_UNINITIALIZED, /* uninitialized */ |
| ZONE_EVENT_INITIALIZED, /* initialized */ |
| ZONE_EVENT_READY, /* ready */ |
| ZONE_EVENT_READY, /* booting */ |
| ZONE_EVENT_RUNNING, /* running */ |
| ZONE_EVENT_SHUTTING_DOWN, /* shutting_down */ |
| ZONE_EVENT_SHUTTING_DOWN, /* empty */ |
| ZONE_EVENT_SHUTTING_DOWN, /* down */ |
| ZONE_EVENT_SHUTTING_DOWN, /* dying */ |
| ZONE_EVENT_UNINITIALIZED, /* dead */ |
| }; |
| |
| /* |
| * This array contains the names of the subsystems listed in zone_ref_subsys_t |
| * (see sys/zone.h). |
| */ |
| static char *zone_ref_subsys_names[] = { |
| "NFS", /* ZONE_REF_NFS */ |
| "NFSv4", /* ZONE_REF_NFSV4 */ |
| "SMBFS", /* ZONE_REF_SMBFS */ |
| "MNTFS", /* ZONE_REF_MNTFS */ |
| "LOFI", /* ZONE_REF_LOFI */ |
| "VFS", /* ZONE_REF_VFS */ |
| "IPC" /* ZONE_REF_IPC */ |
| }; |
| |
| /* |
| * This isn't static so lint doesn't complain. |
| */ |
| rctl_hndl_t rc_zone_cpu_shares; |
| rctl_hndl_t rc_zone_locked_mem; |
| rctl_hndl_t rc_zone_max_swap; |
| rctl_hndl_t rc_zone_max_lofi; |
| rctl_hndl_t rc_zone_cpu_cap; |
| rctl_hndl_t rc_zone_nlwps; |
| rctl_hndl_t rc_zone_nprocs; |
| rctl_hndl_t rc_zone_shmmax; |
| rctl_hndl_t rc_zone_shmmni; |
| rctl_hndl_t rc_zone_semmni; |
| rctl_hndl_t rc_zone_msgmni; |
| |
| const char * const zone_default_initname = "/sbin/init"; |
| static char * const zone_prefix = "/zone/"; |
| static int zone_shutdown(zoneid_t zoneid); |
| static int zone_add_datalink(zoneid_t, datalink_id_t); |
| static int zone_remove_datalink(zoneid_t, datalink_id_t); |
| static int zone_list_datalink(zoneid_t, int *, datalink_id_t *); |
| static int zone_set_network(zoneid_t, zone_net_data_t *); |
| static int zone_get_network(zoneid_t, zone_net_data_t *); |
| |
| typedef boolean_t zsd_applyfn_t(kmutex_t *, boolean_t, zone_t *, zone_key_t); |
| |
| static void zsd_apply_all_zones(zsd_applyfn_t *, zone_key_t); |
| static void zsd_apply_all_keys(zsd_applyfn_t *, zone_t *); |
| static boolean_t zsd_apply_create(kmutex_t *, boolean_t, zone_t *, zone_key_t); |
| static boolean_t zsd_apply_shutdown(kmutex_t *, boolean_t, zone_t *, |
| zone_key_t); |
| static boolean_t zsd_apply_destroy(kmutex_t *, boolean_t, zone_t *, zone_key_t); |
| static boolean_t zsd_wait_for_creator(zone_t *, struct zsd_entry *, |
| kmutex_t *); |
| static boolean_t zsd_wait_for_inprogress(zone_t *, struct zsd_entry *, |
| kmutex_t *); |
| |
| /* |
| * Bump this number when you alter the zone syscall interfaces; we need |
| * to support previous API versions in libc for patching, and libc calls |
| * into the kernel to determine this number. |
| * |
| * Version 1 of the API is the version originally shipped with Solaris 10. |
| * Version 2 alters the zone_create system call in order to support more |
| * arguments by moving the args into a structure, and to do better |
| * error reporting when zone_create() fails. |
| * Version 3 alters the zone_create system call in order to support the |
| * import of ZFS datasets to zones. |
| * Version 4 alters the zone_create system call in order to support |
| * Trusted Extensions. |
| * Version 5 alters the zone_boot system call, and converts its old |
| * bootargs parameter to be set by the zone_setattr API instead. |
| * Version 6 adds the flag argument to zone_create. |
| */ |
| static const int ZONE_SYSCALL_API_VERSION = 6; |
| |
| /* |
| * Certain filesystems (such as NFS and autofs) need to know which zone |
| * the mount is being placed in. Because of this, we need to be able to |
| * ensure that a zone isn't in the process of being created/destroyed such |
| * that nfs_mount() thinks it is in the global/NGZ zone, while by the time |
| * it gets added to the list of mounted zones, it ends up on the wrong zone's |
| * mount list. Since a zone can't reside on an NFS file system, we don't |
| * have to worry about the zonepath itself. |
| * |
| * The following functions: block_mounts()/resume_mounts() and |
| * mount_in_progress()/mount_completed() are used by zones and the VFS |
| * layer (respectively) to synchronize zone state transitions and new |
| * mounts within a zone. This synchronization is on a per-zone basis, so |
| * activity for one zone will not interfere with activity for another zone. |
| * |
| * The semantics are like a reader-reader lock: any number of mounts (or |
| * zone state transitions, were those not already serialized by |
| * zonehash_lock) may be in progress at the same time, but the two kinds |
| * of operation may not overlap. |
| * |
| * We use cv's so the user can ctrl-C out of the operation if it's |
| * taking too long. |
| * |
| * The semantics are such that there is unfair bias towards the |
| * "current" operation. This means that zone halt may starve if |
| * there is a rapid succession of new mounts coming into the zone. |
| */ |
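| |
| /* |
| * The intended usage pattern is roughly the following (an illustrative |
| * sketch of the two sides rather than a verbatim copy of any caller): |
| * |
| *     Zone state transition (e.g. zone_shutdown): |
| * |
| *         if (block_mounts(zp) == 0) |
| *             return (set_errno(EINTR));    (interrupted by a signal) |
| *         ... perform the state transition ... |
| *         resume_mounts(zp); |
| * |
| *     VFS mount path: |
| * |
| *         mount_in_progress(zp); |
| *         ... VFS_MOUNT() and mount-list bookkeeping ... |
| *         mount_completed(zp); |
| */ |
| |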
| /* |
| * Prevent new mounts from progressing to the point of calling |
| * VFS_MOUNT(). If there are already mounts in this "region", wait for |
| * them to complete. |
| */ |
| static int |
| block_mounts(zone_t *zp) |
| { |
| int retval = 0; |
| |
| /* |
| * Since it may block for a long time, block_mounts() shouldn't be |
| * called with zonehash_lock held. |
| */ |
| ASSERT(MUTEX_NOT_HELD(&zonehash_lock)); |
| mutex_enter(&zp->zone_mount_lock); |
| while (zp->zone_mounts_in_progress > 0) { |
| if (cv_wait_sig(&zp->zone_mount_cv, &zp->zone_mount_lock) == 0) |
| goto signaled; |
| } |
| /* |
| * A negative value of zone_mounts_in_progress indicates that mounts |
| * have been blocked by (-zone_mounts_in_progress) different callers |
| * (remotely possible if two threads enter zone_shutdown at the same |
| * time). |
| */ |
| zp->zone_mounts_in_progress--; |
| retval = 1; |
| signaled: |
| mutex_exit(&zp->zone_mount_lock); |
| return (retval); |
| } |
| |
| /* |
| * The VFS layer may progress with new mounts as far as we're concerned. |
| * Allow them to progress if we were the last obstacle. |
| */ |
| static void |
| resume_mounts(zone_t *zp) |
| { |
| mutex_enter(&zp->zone_mount_lock); |
| if (++zp->zone_mounts_in_progress == 0) |
| cv_broadcast(&zp->zone_mount_cv); |
| mutex_exit(&zp->zone_mount_lock); |
| } |
| |
| /* |
| * The VFS layer is about to perform a mount in this zone; wait until |
| * any block_mounts() callers have finished before letting the mount |
| * proceed, then note that a mount is in progress. |
| */ |
| void |
| mount_in_progress(zone_t *zp) |
| { |
| mutex_enter(&zp->zone_mount_lock); |
| while (zp->zone_mounts_in_progress < 0) |
| cv_wait(&zp->zone_mount_cv, &zp->zone_mount_lock); |
| zp->zone_mounts_in_progress++; |
| mutex_exit(&zp->zone_mount_lock); |
| } |
| |
| /* |
| * VFS is done with one mount; wake up any waiting block_mounts() |
| * callers if this is the last mount. |
| */ |
| void |
| mount_completed(zone_t *zp) |
| { |
| mutex_enter(&zp->zone_mount_lock); |
| if (--zp->zone_mounts_in_progress == 0) |
| cv_broadcast(&zp->zone_mount_cv); |
| mutex_exit(&zp->zone_mount_lock); |
| } |
| |
| /* |
| * ZSD routines. |
| * |
| * Zone Specific Data (ZSD) is modeled after Thread Specific Data as |
| * defined by the pthread_key_create() and related interfaces. |
| * |
| * Kernel subsystems may register one or more data items and/or |
| * callbacks to be executed when a zone is created, shutdown, or |
| * destroyed. |
| * |
| * Unlike the thread counterpart, destructor callbacks will be executed |
| * even if the data pointer is NULL and/or there are no constructor |
| * callbacks, so it is the responsibility of such callbacks to check for |
| * NULL data values if necessary. |
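| * |
| * For example, a subsystem that registers only a destructor (no create |
| * callback) usually installs its per-zone data itself via |
| * zone_setspecific(), and its destructor must tolerate zones for which |
| * it never did so (a sketch; "my_data_t" is hypothetical): |
| * |
| *     static void |
| *     my_zsd_destroy(zoneid_t zoneid, void *data) |
| *     { |
| *         if (data != NULL) |
| *             kmem_free(data, sizeof (my_data_t)); |
| *     } |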
| * |
| * The locking strategy and overall picture is as follows: |
| * |
| * When someone calls zone_key_create(), a template ZSD entry is added to the |
| * global list "zsd_registered_keys", protected by zsd_key_lock. While |
| * holding that lock all the existing zones are marked as |
| * ZSD_CREATE_NEEDED and a copy of the ZSD entry added to the per-zone |
| * zone_zsd list (protected by zone_lock). The global list is updated first |
| * (under zsd_key_lock) to make sure that newly created zones use the |
| * most recent list of keys. Then under zonehash_lock we walk the zones |
| * and mark them. Similar locking is used in zone_key_delete(). |
| * |
| * The actual create, shutdown, and destroy callbacks are done without |
| * holding any lock. The zsd_flags are used to track progress, ensuring |
| * that by the time zone_key_create (and zone_create) or zone_key_delete |
| * (and zone_destroy) returns, all the necessary callbacks have |
| * completed. |
| * |
| * When new zones are created, constructor callbacks for all registered ZSD |
| * entries will be called. That also uses the above two phases of marking |
| * what needs to be done, and then running the callbacks without holding |
| * any locks. |
| * |
| * The framework does not provide any locking around zone_getspecific() and |
| * zone_setspecific() apart from that needed for internal consistency, so |
| * callers interested in atomic "test-and-set" semantics will need to provide |
| * their own locking. |
| */ |
| |
| /* |
| * Helper function to find the zsd_entry associated with the key in the |
| * given list. |
| */ |
| static struct zsd_entry * |
| zsd_find(list_t *l, zone_key_t key) |
| { |
| struct zsd_entry *zsd; |
| |
| for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) { |
| if (zsd->zsd_key == key) { |
| return (zsd); |
| } |
| } |
| return (NULL); |
| } |
| |
| /* |
| * Helper function to find the zsd_entry associated with the key in the |
| * given list. Move it to the front of the list. |
| */ |
| static struct zsd_entry * |
| zsd_find_mru(list_t *l, zone_key_t key) |
| { |
| struct zsd_entry *zsd; |
| |
| for (zsd = list_head(l); zsd != NULL; zsd = list_next(l, zsd)) { |
| if (zsd->zsd_key == key) { |
| /* |
| * Move to head of list to keep list in MRU order. |
| */ |
| if (zsd != list_head(l)) { |
| list_remove(l, zsd); |
| list_insert_head(l, zsd); |
| } |
| return (zsd); |
| } |
| } |
| return (NULL); |
| } |
| |
| void |
| zone_key_create(zone_key_t *keyp, void *(*create)(zoneid_t), |
| void (*shutdown)(zoneid_t, void *), void (*destroy)(zoneid_t, void *)) |
| { |
| struct zsd_entry *zsdp; |
| struct zsd_entry *t; |
| struct zone *zone; |
| zone_key_t key; |
| |
| zsdp = kmem_zalloc(sizeof (*zsdp), KM_SLEEP); |
| zsdp->zsd_data = NULL; |
| zsdp->zsd_create = create; |
| zsdp->zsd_shutdown = shutdown; |
| zsdp->zsd_destroy = destroy; |
| |
| /* |
| * Insert in global list of callbacks. Makes future zone creations |
| * see it. |
| */ |
| mutex_enter(&zsd_key_lock); |
| key = zsdp->zsd_key = ++zsd_keyval; |
| ASSERT(zsd_keyval != 0); |
| list_insert_tail(&zsd_registered_keys, zsdp); |
| mutex_exit(&zsd_key_lock); |
| |
| /* |
| * Insert for all existing zones and mark them as needing |
| * a create callback. |
| */ |
| mutex_enter(&zonehash_lock); /* stop the world */ |
| for (zone = list_head(&zone_active); zone != NULL; |
| zone = list_next(&zone_active, zone)) { |
| zone_status_t status; |
| |
| mutex_enter(&zone->zone_lock); |
| |
| /* Skip zones that are on the way down or not yet up */ |
| status = zone_status_get(zone); |
| if (status >= ZONE_IS_DOWN || |
| status == ZONE_IS_UNINITIALIZED) { |
| mutex_exit(&zone->zone_lock); |
| continue; |
| } |
| |
| t = zsd_find_mru(&zone->zone_zsd, key); |
| if (t != NULL) { |
| /* |
| * zone_zsd_configure() already inserted it after |
| * we dropped zsd_key_lock above. |
| */ |
| mutex_exit(&zone->zone_lock); |
| continue; |
| } |
| t = kmem_zalloc(sizeof (*t), KM_SLEEP); |
| t->zsd_key = key; |
| t->zsd_create = create; |
| t->zsd_shutdown = shutdown; |
| t->zsd_destroy = destroy; |
| if (create != NULL) { |
| t->zsd_flags = ZSD_CREATE_NEEDED; |
| DTRACE_PROBE2(zsd__create__needed, |
| zone_t *, zone, zone_key_t, key); |
| } |
| list_insert_tail(&zone->zone_zsd, t); |
| mutex_exit(&zone->zone_lock); |
| } |
| mutex_exit(&zonehash_lock); |
| |
| if (create != NULL) { |
| /* Now call the create callback for this key */ |
| zsd_apply_all_zones(zsd_apply_create, key); |
| } |
| /* |
| * It is safe for consumers to use the key now; make it |
| * globally visible. Specifically, zone_getspecific() will |
| * always successfully return the zone specific data associated |
| * with the key. |
| */ |
| *keyp = key; |
| } |
| |
| /* |
| * Function called when a module is being unloaded, or otherwise wishes |
| * to unregister its ZSD key and callbacks. |
| * |
| * Remove the key from the global list and, under a global lock, mark |
| * which per-zone callbacks need to be called. Then call the callbacks |
| * without holding any locks. Finally free up the zone_zsd entries. (The |
| * apply functions need to access the zone_zsd entries to find zsd_data |
| * etc.) |
| */ |
| int |
| zone_key_delete(zone_key_t key) |
| { |
| struct zsd_entry *zsdp = NULL; |
| zone_t *zone; |
| |
| mutex_enter(&zsd_key_lock); |
| zsdp = zsd_find_mru(&zsd_registered_keys, key); |
| if (zsdp == NULL) { |
| mutex_exit(&zsd_key_lock); |
| return (-1); |
| } |
| list_remove(&zsd_registered_keys, zsdp); |
| mutex_exit(&zsd_key_lock); |
| |
| mutex_enter(&zonehash_lock); |
| for (zone = list_head(&zone_active); zone != NULL; |
| zone = list_next(&zone_active, zone)) { |
| struct zsd_entry *del; |
| |
| mutex_enter(&zone->zone_lock); |
| del = zsd_find_mru(&zone->zone_zsd, key); |
| if (del == NULL) { |
| /* |
| * Somebody else got here first, e.g. the zone going |
| * away. |
| */ |
| mutex_exit(&zone->zone_lock); |
| continue; |
| } |
| ASSERT(del->zsd_shutdown == zsdp->zsd_shutdown); |
| ASSERT(del->zsd_destroy == zsdp->zsd_destroy); |
| if (del->zsd_shutdown != NULL && |
| (del->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) { |
| del->zsd_flags |= ZSD_SHUTDOWN_NEEDED; |
| DTRACE_PROBE2(zsd__shutdown__needed, |
| zone_t *, zone, zone_key_t, key); |
| } |
| if (del->zsd_destroy != NULL && |
| (del->zsd_flags & ZSD_DESTROY_ALL) == 0) { |
| del->zsd_flags |= ZSD_DESTROY_NEEDED; |
| DTRACE_PROBE2(zsd__destroy__needed, |
| zone_t *, zone, zone_key_t, key); |
| } |
| mutex_exit(&zone->zone_lock); |
| } |
| mutex_exit(&zonehash_lock); |
| kmem_free(zsdp, sizeof (*zsdp)); |
| |
| /* Now call the shutdown and destroy callback for this key */ |
| zsd_apply_all_zones(zsd_apply_shutdown, key); |
| zsd_apply_all_zones(zsd_apply_destroy, key); |
| |
| /* Now we can free up the zsdp structures in each zone */ |
| mutex_enter(&zonehash_lock); |
| for (zone = list_head(&zone_active); zone != NULL; |
| zone = list_next(&zone_active, zone)) { |
| struct zsd_entry *del; |
| |
| mutex_enter(&zone->zone_lock); |
| del = zsd_find(&zone->zone_zsd, key); |
| if (del != NULL) { |
| list_remove(&zone->zone_zsd, del); |
| ASSERT(!(del->zsd_flags & ZSD_ALL_INPROGRESS)); |
| kmem_free(del, sizeof (*del)); |
| } |
| mutex_exit(&zone->zone_lock); |
| } |
| mutex_exit(&zonehash_lock); |
| |
| return (0); |
| } |
| |
| /* |
| * ZSD counterpart of pthread_setspecific(). |
| * |
| * Since every registered key, including those with no create function, |
| * has an entry in zone_zsd, a registered key is always found in the |
| * zone_zsd list. |
| * Return an error if the key wasn't registered. |
| */ |
| int |
| zone_setspecific(zone_key_t key, zone_t *zone, const void *data) |
| { |
| struct zsd_entry *t; |
| |
| mutex_enter(&zone->zone_lock); |
| t = zsd_find_mru(&zone->zone_zsd, key); |
| if (t != NULL) { |
| /* |
| * Replace old value with new |
| */ |
| t->zsd_data = (void *)data; |
| mutex_exit(&zone->zone_lock); |
| return (0); |
| } |
| mutex_exit(&zone->zone_lock); |
| return (-1); |
| } |
| |
| /* |
| * ZSD counterpart of pthread_getspecific(). |
| */ |
| void * |
| zone_getspecific(zone_key_t key, zone_t *zone) |
| { |
| struct zsd_entry *t; |
| void *data; |
| |
| mutex_enter(&zone->zone_lock); |
| t = zsd_find_mru(&zone->zone_zsd, key); |
| data = (t == NULL ? NULL : t->zsd_data); |
| mutex_exit(&zone->zone_lock); |
| return (data); |
| } |
| |
| /* |
| * Function used to initialize a zone's list of ZSD callbacks and data |
| * when the zone is being created. The callbacks are initialized from |
| * the template list (zsd_registered_keys). The constructor callback is |
| * executed later (once the zone exists and with locks dropped). |
| */ |
| static void |
| zone_zsd_configure(zone_t *zone) |
| { |
| struct zsd_entry *zsdp; |
| struct zsd_entry *t; |
| |
| ASSERT(MUTEX_HELD(&zonehash_lock)); |
| ASSERT(list_head(&zone->zone_zsd) == NULL); |
| mutex_enter(&zone->zone_lock); |
| mutex_enter(&zsd_key_lock); |
| for (zsdp = list_head(&zsd_registered_keys); zsdp != NULL; |
| zsdp = list_next(&zsd_registered_keys, zsdp)) { |
| /* |
| * Since this zone is ZONE_IS_UNINITIALIZED, zone_key_create |
| * should not have added anything to it. |
| */ |
| ASSERT(zsd_find(&zone->zone_zsd, zsdp->zsd_key) == NULL); |
| |
| t = kmem_zalloc(sizeof (*t), KM_SLEEP); |
| t->zsd_key = zsdp->zsd_key; |
| t->zsd_create = zsdp->zsd_create; |
| t->zsd_shutdown = zsdp->zsd_shutdown; |
| t->zsd_destroy = zsdp->zsd_destroy; |
| if (zsdp->zsd_create != NULL) { |
| t->zsd_flags = ZSD_CREATE_NEEDED; |
| DTRACE_PROBE2(zsd__create__needed, |
| zone_t *, zone, zone_key_t, zsdp->zsd_key); |
| } |
| list_insert_tail(&zone->zone_zsd, t); |
| } |
| mutex_exit(&zsd_key_lock); |
| mutex_exit(&zone->zone_lock); |
| } |
| |
| enum zsd_callback_type { ZSD_CREATE, ZSD_SHUTDOWN, ZSD_DESTROY }; |
| |
| /* |
| * Helper function to execute shutdown or destructor callbacks. |
| */ |
| static void |
| zone_zsd_callbacks(zone_t *zone, enum zsd_callback_type ct) |
| { |
| struct zsd_entry *t; |
| |
| ASSERT(ct == ZSD_SHUTDOWN || ct == ZSD_DESTROY); |
| ASSERT(ct != ZSD_SHUTDOWN || zone_status_get(zone) >= ZONE_IS_EMPTY); |
| ASSERT(ct != ZSD_DESTROY || zone_status_get(zone) >= ZONE_IS_DOWN); |
| |
| /* |
| * Run the callback solely based on what is registered for the zone |
| * in zone_zsd. The global list can change independently of this |
| * as keys are registered and unregistered and we don't register new |
| * callbacks for a zone that is in the process of going away. |
| */ |
| mutex_enter(&zone->zone_lock); |
| for (t = list_head(&zone->zone_zsd); t != NULL; |
| t = list_next(&zone->zone_zsd, t)) { |
| zone_key_t key = t->zsd_key; |
| |
| /* Skip if no callbacks registered */ |
| |
| if (ct == ZSD_SHUTDOWN) { |
| if (t->zsd_shutdown != NULL && |
| (t->zsd_flags & ZSD_SHUTDOWN_ALL) == 0) { |
| t->zsd_flags |= ZSD_SHUTDOWN_NEEDED; |
| DTRACE_PROBE2(zsd__shutdown__needed, |
| zone_t *, zone, zone_key_t, key); |
| } |
| } else { |
| if (t->zsd_destroy != NULL && |
| (t->zsd_flags & ZSD_DESTROY_ALL) == 0) { |
| t->zsd_flags |= ZSD_DESTROY_NEEDED; |
| DTRACE_PROBE2(zsd__destroy__needed, |
| zone_t *, zone, zone_key_t, key); |
| } |
| } |
| } |
| mutex_exit(&zone->zone_lock); |
| |
| /* Now call the shutdown and destroy callback for this key */ |
| zsd_apply_all_keys(zsd_apply_shutdown, zone); |
| zsd_apply_all_keys(zsd_apply_destroy, zone); |
| } |
| |
| /* |
| * Called when the zone is going away; free ZSD-related memory, and |
| * destroy the zone_zsd list. |
| */ |
| static void |
| zone_free_zsd(zone_t *zone) |
| { |
| struct zsd_entry *t, *next; |
| |
| /* |
| * Free all the zsd_entry's we had on this zone. |
| */ |
| mutex_enter(&zone->zone_lock); |
| for (t = list_head(&zone->zone_zsd); t != NULL; t = next) { |
| next = list_next(&zone->zone_zsd, t); |
| list_remove(&zone->zone_zsd, t); |
| ASSERT(!(t->zsd_flags & ZSD_ALL_INPROGRESS)); |
| kmem_free(t, sizeof (*t)); |
| } |
| list_destroy(&zone->zone_zsd); |
| mutex_exit(&zone->zone_lock); |
| } |
| |
| /* |
| * Apply a function to all zones for particular key value. |
| * |
| * The applyfn has to drop zonehash_lock if it does some work, and |
| * then reacquire it before it returns. |
| * When the lock is dropped we don't follow list_next even |
| * if it is possible to do so without any hazards. This is |
| * because we want the design to allow for the list of zones |
| * to change in any arbitrary way during the time the |
| * lock was dropped. |
| * |
| * It is safe to restart the loop at list_head since the applyfn |
| * changes the zsd_flags as it does work, so a subsequent |
| * pass through will have no effect in applyfn, hence the loop will |
| * terminate after at worst O(N^2) iterations. |
| */ |
| static void |
| zsd_apply_all_zones(zsd_applyfn_t *applyfn, zone_key_t key) |
| { |
| zone_t *zone; |
| |
| mutex_enter(&zonehash_lock); |
| zone = list_head(&zone_active); |
| while (zone != NULL) { |
| if ((applyfn)(&zonehash_lock, B_FALSE, zone, key)) { |
| /* Lock dropped - restart at head */ |
| zone = list_head(&zone_active); |
| } else { |
| zone = list_next(&zone_active, zone); |
| } |
| } |
| mutex_exit(&zonehash_lock); |
| } |
| |
| /* |
| * Apply a function to all keys for a particular zone. |
| * |
| * The applyfn has to drop zonehash_lock if it does some work, and |
| * then reacquire it before it returns. |
| * When the lock is dropped we don't follow list_next even |
| * if it is possible to do so without any hazards. This is |
| * because we want the design to allow for the list of zsd callbacks |
| * to change in any arbitrary way during the time the |
| * lock was dropped. |
| * |
| * It is safe to restart the loop at list_head since the applyfn |
| * changes the zsd_flags as it does work, so a subsequent |
| * pass through will have no effect in applyfn, hence the loop will |
| * terminate after at worst O(N^2) iterations. |
| */ |
| static void |
| zsd_apply_all_keys(zsd_applyfn_t *applyfn, zone_t *zone) |
| { |
| struct zsd_entry *t; |
| |
| mutex_enter(&zone->zone_lock); |
| t = list_head(&zone->zone_zsd); |
| while (t != NULL) { |
| if ((applyfn)(NULL, B_TRUE, zone, t->zsd_key)) { |
| /* Lock dropped - restart at head */ |
| t = list_head(&zone->zone_zsd); |
| } else { |
| t = list_next(&zone->zone_zsd, t); |
| } |
| } |
| mutex_exit(&zone->zone_lock); |
| } |
| |
| /* |
| * Call the create function for the zone and key if CREATE_NEEDED |
| * is set. |
| * If some other thread gets here first and sets CREATE_INPROGRESS, then |
| * we wait for that thread to complete so that we can ensure that |
| * all the callbacks are done when we've looped over all zones/keys. |
| * |
| * When we call the create function, we drop the global lock held by the |
| * caller, and return true to tell the caller it needs to re-evaluate its |
| * state. |
| * If the caller holds zone_lock then zone_lock_held is set, and zone_lock |
| * remains held on exit. |
| */ |
| static boolean_t |
| zsd_apply_create(kmutex_t *lockp, boolean_t zone_lock_held, |
| zone_t *zone, zone_key_t key) |
| { |
| void *result; |
| struct zsd_entry *t; |
| boolean_t dropped; |
| |
| if (lockp != NULL) { |
| ASSERT(MUTEX_HELD(lockp)); |
| } |
| if (zone_lock_held) { |
| ASSERT(MUTEX_HELD(&zone->zone_lock)); |
| } else { |
| mutex_enter(&zone->zone_lock); |
| } |
| |
| t = zsd_find(&zone->zone_zsd, key); |
| if (t == NULL) { |
| /* |
| * Somebody else got here first, e.g. the zone going |
| * away. |
| */ |
| if (!zone_lock_held) |
| mutex_exit(&zone->zone_lock); |
| return (B_FALSE); |
| } |
| dropped = B_FALSE; |
| if (zsd_wait_for_inprogress(zone, t, lockp)) |
| dropped = B_TRUE; |
| |
| if (t->zsd_flags & ZSD_CREATE_NEEDED) { |
| t->zsd_flags &= ~ZSD_CREATE_NEEDED; |
| t->zsd_flags |= ZSD_CREATE_INPROGRESS; |
| DTRACE_PROBE2(zsd__create__inprogress, |
| zone_t *, zone, zone_key_t, key); |
| mutex_exit(&zone->zone_lock); |
| if (lockp != NULL) |
| mutex_exit(lockp); |
| |
| dropped = B_TRUE; |
| ASSERT(t->zsd_create != NULL); |
| DTRACE_PROBE2(zsd__create__start, |
| zone_t *, zone, zone_key_t, key); |
| |
| result = (*t->zsd_create)(zone->zone_id); |
| |
| DTRACE_PROBE2(zsd__create__end, |
| zone_t *, zone, void *, result); |
| |
| ASSERT(result != NULL); |
| if (lockp != NULL) |
| mutex_enter(lockp); |
| mutex_enter(&zone->zone_lock); |
| t->zsd_data = result; |
| t->zsd_flags &= ~ZSD_CREATE_INPROGRESS; |
| t->zsd_flags |= ZSD_CREATE_COMPLETED; |
| cv_broadcast(&t->zsd_cv); |
| DTRACE_PROBE2(zsd__create__completed, |
| zone_t *, zone, zone_key_t, key); |
| } |
| if (!zone_lock_held) |
| mutex_exit(&zone->zone_lock); |
| return (dropped); |
| } |
| |
| /* |
| * Call the shutdown function for the zone and key if SHUTDOWN_NEEDED |
| * is set. |
| * If some other thread gets here first and sets *_INPROGRESS, then |
| * we wait for that thread to complete so that we can ensure that |
| * all the callbacks are done when we've looped over all zones/keys. |
| * |
| * When we call the shutdown function, we drop the global lock held by |
| * the caller, and return true to tell the caller it needs to re-evaluate |
| * its state. |
| * If the caller holds zone_lock then zone_lock_held is set, and zone_lock |
| * remains held on exit. |
| */ |
| static boolean_t |
| zsd_apply_shutdown(kmutex_t *lockp, boolean_t zone_lock_held, |
| zone_t *zone, zone_key_t key) |
| { |
| struct zsd_entry *t; |
| void *data; |
| boolean_t dropped; |
| |
| if (lockp != NULL) { |
| ASSERT(MUTEX_HELD(lockp)); |
| } |
| if (zone_lock_held) { |
| ASSERT(MUTEX_HELD(&zone->zone_lock)); |
| } else { |
| mutex_enter(&zone->zone_lock); |
| } |
| |
| t = zsd_find(&zone->zone_zsd, key); |
| if (t == NULL) { |
| /* |
| * Somebody else got here first, e.g. the zone going |
| * away. |
| */ |
| if (!zone_lock_held) |
| mutex_exit(&zone->zone_lock); |
| return (B_FALSE); |
| } |
| dropped = B_FALSE; |
| if (zsd_wait_for_creator(zone, t, lockp)) |
| dropped = B_TRUE; |
| |
| if (zsd_wait_for_inprogress(zone, t, lockp)) |
| dropped = B_TRUE; |
| |
| if (t->zsd_flags & ZSD_SHUTDOWN_NEEDED) { |
| t->zsd_flags &= ~ZSD_SHUTDOWN_NEEDED; |
| t->zsd_flags |= ZSD_SHUTDOWN_INPROGRESS; |
| DTRACE_PROBE2(zsd__shutdown__inprogress, |
| zone_t *, zone, zone_key_t, key); |
| mutex_exit(&zone->zone_lock); |
| if (lockp != NULL) |
| mutex_exit(lockp); |
| dropped = B_TRUE; |
| |
| ASSERT(t->zsd_shutdown != NULL); |
| data = t->zsd_data; |
| |
| DTRACE_PROBE2(zsd__shutdown__start, |
| zone_t *, zone, zone_key_t, key); |
| |
| (t->zsd_shutdown)(zone->zone_id, data); |
| DTRACE_PROBE2(zsd__shutdown__end, |
| zone_t *, zone, zone_key_t, key); |
| |
| if (lockp != NULL) |
| mutex_enter(lockp); |
| mutex_enter(&zone->zone_lock); |
| t->zsd_flags &= ~ZSD_SHUTDOWN_INPROGRESS; |
| t->zsd_flags |= ZSD_SHUTDOWN_COMPLETED; |
| cv_broadcast(&t->zsd_cv); |
| DTRACE_PROBE2(zsd__shutdown__completed, |
| zone_t *, zone, zone_key_t, key); |
| } |
| if (!zone_lock_held) |
| mutex_exit(&zone->zone_lock); |
| return (dropped); |
| } |
| |
| /* |
| * Call the destroy function for the zone and key if DESTROY_NEEDED |
| * is set. |
| * If some other thread gets here first and sets *_INPROGRESS, then |
| * we wait for that thread to complete so that we can ensure that |
| * all the callbacks are done when we've looped over all zones/keys. |
| * |
| * When we call the destroy function, we drop the global lock held by |
| * the caller, and return true to tell the caller it needs to re-evaluate |
| * its state. |
| * If the caller holds zone_lock then zone_lock_held is set, and zone_lock |
| * remains held on exit. |
| */ |
| static boolean_t |
| zsd_apply_destroy(kmutex_t *lockp, boolean_t zone_lock_held, |
| zone_t *zone, zone_key_t key) |
| { |
| struct zsd_entry *t; |
| void *data; |
| boolean_t dropped; |
| |
| if (lockp != NULL) { |
| ASSERT(MUTEX_HELD(lockp)); |
| } |
| if (zone_lock_held) { |
| ASSERT(MUTEX_HELD(&zone->zone_lock)); |
| } else { |
| mutex_enter(&zone->zone_lock); |
| } |
| |
| t = zsd_find(&zone->zone_zsd, key); |
| if (t == NULL) { |
| /* |
| * Somebody else got here first, e.g. the zone going |
| * away. |
| */ |
| if (!zone_lock_held) |
| mutex_exit(&zone->zone_lock); |
| return (B_FALSE); |
| } |
| dropped = B_FALSE; |
| if (zsd_wait_for_creator(zone, t, lockp)) |
| dropped = B_TRUE; |
| |
| if (zsd_wait_for_inprogress(zone, t, lockp)) |
| dropped = B_TRUE; |
| |
| if (t->zsd_flags & ZSD_DESTROY_NEEDED) { |
| t->zsd_flags &= ~ZSD_DESTROY_NEEDED; |
| t->zsd_flags |= ZSD_DESTROY_INPROGRESS; |
| DTRACE_PROBE2(zsd__destroy__inprogress, |
| zone_t *, zone, zone_key_t, key); |
| mutex_exit(&zone->zone_lock); |
| if (lockp != NULL) |
| mutex_exit(lockp); |
| dropped = B_TRUE; |
| |
| ASSERT(t->zsd_destroy != NULL); |
| data = t->zsd_data; |
| DTRACE_PROBE2(zsd__destroy__start, |
| zone_t *, zone, zone_key_t, key); |
| |
| (t->zsd_destroy)(zone->zone_id, data); |
| DTRACE_PROBE2(zsd__destroy__end, |
| zone_t *, zone, zone_key_t, key); |
| |
| if (lockp != NULL) |
| mutex_enter(lockp); |
| mutex_enter(&zone->zone_lock); |
| t->zsd_data = NULL; |
| t->zsd_flags &= ~ZSD_DESTROY_INPROGRESS; |
| t->zsd_flags |= ZSD_DESTROY_COMPLETED; |
| cv_broadcast(&t->zsd_cv); |
| DTRACE_PROBE2(zsd__destroy__completed, |
| zone_t *, zone, zone_key_t, key); |
| } |
| if (!zone_lock_held) |
| mutex_exit(&zone->zone_lock); |
| return (dropped); |
| } |
| |
| /* |
| * Wait for any CREATE_NEEDED flag to be cleared. |
| * Returns true if lockp was temporarily dropped while waiting. |
| */ |
| static boolean_t |
| zsd_wait_for_creator(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp) |
| { |
| boolean_t dropped = B_FALSE; |
| |
| while (t->zsd_flags & ZSD_CREATE_NEEDED) { |
| DTRACE_PROBE2(zsd__wait__for__creator, |
| zone_t *, zone, struct zsd_entry *, t); |
| if (lockp != NULL) { |
| dropped = B_TRUE; |
| mutex_exit(lockp); |
| } |
| cv_wait(&t->zsd_cv, &zone->zone_lock); |
| if (lockp != NULL) { |
| /* First drop zone_lock to preserve order */ |
| mutex_exit(&zone->zone_lock); |
| mutex_enter(lockp); |
| mutex_enter(&zone->zone_lock); |
| } |
| } |
| return (dropped); |
| } |
| |
| /* |
| * Wait for any INPROGRESS flag to be cleared. |
| * Returns true if lockp was temporarily dropped while waiting. |
| */ |
| static boolean_t |
| zsd_wait_for_inprogress(zone_t *zone, struct zsd_entry *t, kmutex_t *lockp) |
| { |
| boolean_t dropped = B_FALSE; |
| |
| while (t->zsd_flags & ZSD_ALL_INPROGRESS) { |
| DTRACE_PROBE2(zsd__wait__for__inprogress, |
| zone_t *, zone, struct zsd_entry *, t); |
| if (lockp != NULL) { |
| dropped = B_TRUE; |
| mutex_exit(lockp); |
| } |
| cv_wait(&t->zsd_cv, &zone->zone_lock); |
| if (lockp != NULL) { |
| /* First drop zone_lock to preserve order */ |
| mutex_exit(&zone->zone_lock); |
| mutex_enter(lockp); |
| mutex_enter(&zone->zone_lock); |
| } |
| } |
| return (dropped); |
| } |
| |
| /* |
| * Frees memory associated with the zone dataset list. |
| */ |
| static void |
| zone_free_datasets(zone_t *zone) |
| { |
| zone_dataset_t *t, *next; |
| |
| for (t = list_head(&zone->zone_datasets); t != NULL; t = next) { |
| next = list_next(&zone->zone_datasets, t); |
| list_remove(&zone->zone_datasets, t); |
| kmem_free(t->zd_dataset, strlen(t->zd_dataset) + 1); |
| kmem_free(t, sizeof (*t)); |
| } |
| list_destroy(&zone->zone_datasets); |
| } |
| |
| /* |
| * zone.cpu-shares resource control support. |
| */ |
| /*ARGSUSED*/ |
| static rctl_qty_t |
| zone_cpu_shares_usage(rctl_t *rctl, struct proc *p) |
| { |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| return (p->p_zone->zone_shares); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_cpu_shares_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, |
| rctl_qty_t nv) |
| { |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| if (e->rcep_p.zone == NULL) |
| return (0); |
| |
| e->rcep_p.zone->zone_shares = nv; |
| return (0); |
| } |
| |
| static rctl_ops_t zone_cpu_shares_ops = { |
| rcop_no_action, |
| zone_cpu_shares_usage, |
| zone_cpu_shares_set, |
| rcop_no_test |
| }; |
| |
| /* |
| * zone.cpu-cap resource control support. |
| */ |
| /*ARGSUSED*/ |
| static rctl_qty_t |
| zone_cpu_cap_get(rctl_t *rctl, struct proc *p) |
| { |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| return (cpucaps_zone_get(p->p_zone)); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_cpu_cap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, |
| rctl_qty_t nv) |
| { |
| zone_t *zone = e->rcep_p.zone; |
| |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| |
| if (zone == NULL) |
| return (0); |
| |
| /* |
| * Set the cap to the new value. |
| */ |
| return (cpucaps_zone_set(zone, nv)); |
| } |
| |
| static rctl_ops_t zone_cpu_cap_ops = { |
| rcop_no_action, |
| zone_cpu_cap_get, |
| zone_cpu_cap_set, |
| rcop_no_test |
| }; |
| |
| /*ARGSUSED*/ |
| static rctl_qty_t |
| zone_lwps_usage(rctl_t *r, proc_t *p) |
| { |
| rctl_qty_t nlwps; |
| zone_t *zone = p->p_zone; |
| |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| |
| mutex_enter(&zone->zone_nlwps_lock); |
| nlwps = zone->zone_nlwps; |
| mutex_exit(&zone->zone_nlwps_lock); |
| |
| return (nlwps); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_lwps_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl, |
| rctl_qty_t incr, uint_t flags) |
| { |
| rctl_qty_t nlwps; |
| |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| if (e->rcep_p.zone == NULL) |
| return (0); |
| ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock))); |
| nlwps = e->rcep_p.zone->zone_nlwps; |
| |
| if (nlwps + incr > rcntl->rcv_value) |
| return (1); |
| |
| return (0); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_lwps_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv) |
| { |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| if (e->rcep_p.zone == NULL) |
| return (0); |
| e->rcep_p.zone->zone_nlwps_ctl = nv; |
| return (0); |
| } |
| |
| static rctl_ops_t zone_lwps_ops = { |
| rcop_no_action, |
| zone_lwps_usage, |
| zone_lwps_set, |
| zone_lwps_test, |
| }; |
| |
| /*ARGSUSED*/ |
| static rctl_qty_t |
| zone_procs_usage(rctl_t *r, proc_t *p) |
| { |
| rctl_qty_t nprocs; |
| zone_t *zone = p->p_zone; |
| |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| |
| mutex_enter(&zone->zone_nlwps_lock); |
| nprocs = zone->zone_nprocs; |
| mutex_exit(&zone->zone_nlwps_lock); |
| |
| return (nprocs); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_procs_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rcntl, |
| rctl_qty_t incr, uint_t flags) |
| { |
| rctl_qty_t nprocs; |
| |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| if (e->rcep_p.zone == NULL) |
| return (0); |
| ASSERT(MUTEX_HELD(&(e->rcep_p.zone->zone_nlwps_lock))); |
| nprocs = e->rcep_p.zone->zone_nprocs; |
| |
| if (nprocs + incr > rcntl->rcv_value) |
| return (1); |
| |
| return (0); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_procs_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, rctl_qty_t nv) |
| { |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| if (e->rcep_p.zone == NULL) |
| return (0); |
| e->rcep_p.zone->zone_nprocs_ctl = nv; |
| return (0); |
| } |
| |
| static rctl_ops_t zone_procs_ops = { |
| rcop_no_action, |
| zone_procs_usage, |
| zone_procs_set, |
| zone_procs_test, |
| }; |
| |
| /*ARGSUSED*/ |
| static rctl_qty_t |
| zone_shmmax_usage(rctl_t *rctl, struct proc *p) |
| { |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| return (p->p_zone->zone_shmmax); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_shmmax_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, |
| rctl_qty_t incr, uint_t flags) |
| { |
| rctl_qty_t v; |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| v = e->rcep_p.zone->zone_shmmax + incr; |
| if (v > rval->rcv_value) |
| return (1); |
| return (0); |
| } |
| |
| static rctl_ops_t zone_shmmax_ops = { |
| rcop_no_action, |
| zone_shmmax_usage, |
| rcop_no_set, |
| zone_shmmax_test |
| }; |
| |
| /*ARGSUSED*/ |
| static rctl_qty_t |
| zone_shmmni_usage(rctl_t *rctl, struct proc *p) |
| { |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| return (p->p_zone->zone_ipc.ipcq_shmmni); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_shmmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, |
| rctl_qty_t incr, uint_t flags) |
| { |
| rctl_qty_t v; |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| v = e->rcep_p.zone->zone_ipc.ipcq_shmmni + incr; |
| if (v > rval->rcv_value) |
| return (1); |
| return (0); |
| } |
| |
| static rctl_ops_t zone_shmmni_ops = { |
| rcop_no_action, |
| zone_shmmni_usage, |
| rcop_no_set, |
| zone_shmmni_test |
| }; |
| |
| /*ARGSUSED*/ |
| static rctl_qty_t |
| zone_semmni_usage(rctl_t *rctl, struct proc *p) |
| { |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| return (p->p_zone->zone_ipc.ipcq_semmni); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_semmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, |
| rctl_qty_t incr, uint_t flags) |
| { |
| rctl_qty_t v; |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| v = e->rcep_p.zone->zone_ipc.ipcq_semmni + incr; |
| if (v > rval->rcv_value) |
| return (1); |
| return (0); |
| } |
| |
| static rctl_ops_t zone_semmni_ops = { |
| rcop_no_action, |
| zone_semmni_usage, |
| rcop_no_set, |
| zone_semmni_test |
| }; |
| |
| /*ARGSUSED*/ |
| static rctl_qty_t |
| zone_msgmni_usage(rctl_t *rctl, struct proc *p) |
| { |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| return (p->p_zone->zone_ipc.ipcq_msgmni); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_msgmni_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, rctl_val_t *rval, |
| rctl_qty_t incr, uint_t flags) |
| { |
| rctl_qty_t v; |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| v = e->rcep_p.zone->zone_ipc.ipcq_msgmni + incr; |
| if (v > rval->rcv_value) |
| return (1); |
| return (0); |
| } |
| |
| static rctl_ops_t zone_msgmni_ops = { |
| rcop_no_action, |
| zone_msgmni_usage, |
| rcop_no_set, |
| zone_msgmni_test |
| }; |
| |
| /*ARGSUSED*/ |
| static rctl_qty_t |
| zone_locked_mem_usage(rctl_t *rctl, struct proc *p) |
| { |
| rctl_qty_t q; |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| mutex_enter(&p->p_zone->zone_mem_lock); |
| q = p->p_zone->zone_locked_mem; |
| mutex_exit(&p->p_zone->zone_mem_lock); |
| return (q); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_locked_mem_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, |
| rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) |
| { |
| rctl_qty_t q; |
| zone_t *z; |
| |
| z = e->rcep_p.zone; |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(MUTEX_HELD(&z->zone_mem_lock)); |
| q = z->zone_locked_mem; |
| if (q + incr > rcntl->rcv_value) |
| return (1); |
| return (0); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_locked_mem_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, |
| rctl_qty_t nv) |
| { |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| if (e->rcep_p.zone == NULL) |
| return (0); |
| e->rcep_p.zone->zone_locked_mem_ctl = nv; |
| return (0); |
| } |
| |
| static rctl_ops_t zone_locked_mem_ops = { |
| rcop_no_action, |
| zone_locked_mem_usage, |
| zone_locked_mem_set, |
| zone_locked_mem_test |
| }; |
| |
| /*ARGSUSED*/ |
| static rctl_qty_t |
| zone_max_swap_usage(rctl_t *rctl, struct proc *p) |
| { |
| rctl_qty_t q; |
| zone_t *z = p->p_zone; |
| |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| mutex_enter(&z->zone_mem_lock); |
| q = z->zone_max_swap; |
| mutex_exit(&z->zone_mem_lock); |
| return (q); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_max_swap_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, |
| rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) |
| { |
| rctl_qty_t q; |
| zone_t *z; |
| |
| z = e->rcep_p.zone; |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(MUTEX_HELD(&z->zone_mem_lock)); |
| q = z->zone_max_swap; |
| if (q + incr > rcntl->rcv_value) |
| return (1); |
| return (0); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_max_swap_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, |
| rctl_qty_t nv) |
| { |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| if (e->rcep_p.zone == NULL) |
| return (0); |
| e->rcep_p.zone->zone_max_swap_ctl = nv; |
| return (0); |
| } |
| |
| static rctl_ops_t zone_max_swap_ops = { |
| rcop_no_action, |
| zone_max_swap_usage, |
| zone_max_swap_set, |
| zone_max_swap_test |
| }; |
| |
| /*ARGSUSED*/ |
| static rctl_qty_t |
| zone_max_lofi_usage(rctl_t *rctl, struct proc *p) |
| { |
| rctl_qty_t q; |
| zone_t *z = p->p_zone; |
| |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| mutex_enter(&z->zone_rctl_lock); |
| q = z->zone_max_lofi; |
| mutex_exit(&z->zone_rctl_lock); |
| return (q); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_max_lofi_test(rctl_t *r, proc_t *p, rctl_entity_p_t *e, |
| rctl_val_t *rcntl, rctl_qty_t incr, uint_t flags) |
| { |
| rctl_qty_t q; |
| zone_t *z; |
| |
| z = e->rcep_p.zone; |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(MUTEX_HELD(&z->zone_rctl_lock)); |
| q = z->zone_max_lofi; |
| if (q + incr > rcntl->rcv_value) |
| return (1); |
| return (0); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| zone_max_lofi_set(rctl_t *rctl, struct proc *p, rctl_entity_p_t *e, |
| rctl_qty_t nv) |
| { |
| ASSERT(MUTEX_HELD(&p->p_lock)); |
| ASSERT(e->rcep_t == RCENTITY_ZONE); |
| if (e->rcep_p.zone == NULL) |
| return (0); |
| e->rcep_p.zone->zone_max_lofi_ctl = nv; |
| return (0); |
| } |
| |
| static rctl_ops_t zone_max_lofi_ops = { |
| rcop_no_action, |
| zone_max_lofi_usage, |
| zone_max_lofi_set, |
| zone_max_lofi_test |
| }; |
| |
| /* |
| * Helper function to brand the zone with a unique ID. |
| */ |
| static void |
| zone_uniqid(zone_t *zone) |
| { |
| static uint64_t uniqid = 0; |
| |
| ASSERT(MUTEX_HELD(&zonehash_lock)); |
| zone->zone_uniqid = uniqid++; |
| } |
| |
| /* |
| * Returns a held pointer to the "kcred" for the specified zone. |
| */ |
| struct cred * |
| zone_get_kcred(zoneid_t zoneid) |
| { |
| zone_t *zone; |
| cred_t *cr; |
| |
| if ((zone = zone_find_by_id(zoneid)) == NULL) |
| return (NULL); |
| cr = zone->zone_kcred; |
| crhold(cr); |
| zone_rele(zone); |
| return (cr); |
| } |
| |
| static int |
| zone_lockedmem_kstat_update(kstat_t *ksp, int rw) |
| { |
| zone_t *zone = ksp->ks_private; |
| zone_kstat_t *zk = ksp->ks_data; |
| |
| if (rw == KSTAT_WRITE) |
| return (EACCES); |
| |
| zk->zk_usage.value.ui64 = zone->zone_locked_mem; |
| zk->zk_value.value.ui64 = zone->zone_locked_mem_ctl; |
| return (0); |
| } |
| |
| static int |
| zone_nprocs_kstat_update(kstat_t *ksp, int rw) |
| { |
| zone_t *zone = ksp->ks_private; |
| zone_kstat_t *zk = ksp->ks_data; |
| |
| if (rw == KSTAT_WRITE) |
| return (EACCES); |
| |
| zk->zk_usage.value.ui64 = zone->zone_nprocs; |
| zk->zk_value.value.ui64 = zone->zone_nprocs_ctl; |
| return (0); |
| } |
| |
| static int |
| zone_swapresv_kstat_update(kstat_t *ksp, int rw) |
| { |
| zone_t *zone = ksp->ks_private; |
| zone_kstat_t *zk = ksp->ks_data; |
| |
| if (rw == KSTAT_WRITE) |
| return (EACCES); |
| |
| zk->zk_usage.value.ui64 = zone->zone_max_swap; |
| zk->zk_value.value.ui64 = zone->zone_max_swap_ctl; |
| return (0); |
| } |
| |
| static kstat_t * |
| zone_kstat_create_common(zone_t *zone, char *name, |
| int (*updatefunc) (kstat_t *, int)) |
| { |
| kstat_t *ksp; |
| zone_kstat_t *zk; |
| |
| ksp = rctl_kstat_create_zone(zone, name, KSTAT_TYPE_NAMED, |
| sizeof (zone_kstat_t) / sizeof (kstat_named_t), |
| KSTAT_FLAG_VIRTUAL); |
| |
| if (ksp == NULL) |
| return (NULL); |
| |
| zk = ksp->ks_data = kmem_alloc(sizeof (zone_kstat_t), KM_SLEEP); |
| ksp->ks_data_size += strlen(zone->zone_name) + 1; |
| kstat_named_init(&zk->zk_zonename, "zonename", KSTAT_DATA_STRING); |
| kstat_named_setstr(&zk->zk_zonename, zone->zone_name); |
| kstat_named_init(&zk->zk_usage, "usage", KSTAT_DATA_UINT64); |
| kstat_named_init(&zk->zk_value, "value", KSTAT_DATA_UINT64); |
| ksp->ks_update = updatefunc; |
| ksp->ks_private = zone; |
| kstat_install(ksp); |
| return (ksp); |
| } |
| |
| |
| static int |
| zone_mcap_kstat_update(kstat_t *ksp, int rw) |
| { |
| zone_t *zone = ksp->ks_private; |
| zone_mcap_kstat_t *zmp = ksp->ks_data; |
| |
| if (rw == KSTAT_WRITE) |
| return (EACCES); |
| |
| zmp->zm_pgpgin.value.ui64 = zone->zone_pgpgin; |
| zmp->zm_anonpgin.value.ui64 = zone->zone_anonpgin; |
| zmp->zm_execpgin.value.ui64 = zone->zone_execpgin; |
| zmp->zm_fspgin.value.ui64 = zone->zone_fspgin; |
| zmp->zm_anon_alloc_fail.value.ui64 = zone->zone_anon_alloc_fail; |
| |
| return (0); |
| } |
| |
| static kstat_t * |
| zone_mcap_kstat_create(zone_t *zone) |
| { |
| kstat_t *ksp; |
| zone_mcap_kstat_t *zmp; |
| |
| if ((ksp = kstat_create_zone("memory_cap", zone->zone_id, |
| zone->zone_name, "zone_memory_cap", KSTAT_TYPE_NAMED, |
| sizeof (zone_mcap_kstat_t) / sizeof (kstat_named_t), |
| KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) |
| return (NULL); |
| |
| if (zone->zone_id != GLOBAL_ZONEID) |
| kstat_zone_add(ksp, GLOBAL_ZONEID); |
| |
| zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_mcap_kstat_t), KM_SLEEP); |
| ksp->ks_data_size += strlen(zone->zone_name) + 1; |
| ksp->ks_lock = &zone->zone_mcap_lock; |
| zone->zone_mcap_stats = zmp; |
| |
| /* The kstat "name" field is not large enough for a full zonename */ |
| kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); |
| kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); |
| kstat_named_init(&zmp->zm_pgpgin, "pgpgin", KSTAT_DATA_UINT64); |
| kstat_named_init(&zmp->zm_anonpgin, "anonpgin", KSTAT_DATA_UINT64); |
| kstat_named_init(&zmp->zm_execpgin, "execpgin", KSTAT_DATA_UINT64); |
| kstat_named_init(&zmp->zm_fspgin, "fspgin", KSTAT_DATA_UINT64); |
| kstat_named_init(&zmp->zm_anon_alloc_fail, "anon_alloc_fail", |
| KSTAT_DATA_UINT64); |
| |
| ksp->ks_update = zone_mcap_kstat_update; |
| ksp->ks_private = zone; |
| |
| kstat_install(ksp); |
| return (ksp); |
| } |
| |
| static int |
| zone_misc_kstat_update(kstat_t *ksp, int rw) |
| { |
| zone_t *zone = ksp->ks_private; |
| zone_misc_kstat_t *zmp = ksp->ks_data; |
| hrtime_t hrtime; |
| uint64_t tmp; |
| |
| if (rw == KSTAT_WRITE) |
| return (EACCES); |
| |
| tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_STIME); |
| hrtime = UINT64_OVERFLOW_TO_INT64(tmp); |
| scalehrtime(&hrtime); |
| zmp->zm_stime.value.ui64 = hrtime; |
| |
| tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_UTIME); |
| hrtime = UINT64_OVERFLOW_TO_INT64(tmp); |
| scalehrtime(&hrtime); |
| zmp->zm_utime.value.ui64 = hrtime; |
| |
| tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_WTIME); |
| hrtime = UINT64_OVERFLOW_TO_INT64(tmp); |
| scalehrtime(&hrtime); |
| zmp->zm_wtime.value.ui64 = hrtime; |
| |
| zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0]; |
| zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1]; |
| zmp->zm_avenrun15.value.ui32 = zone->zone_avenrun[2]; |
| |
| zmp->zm_ffcap.value.ui32 = zone->zone_ffcap; |
| zmp->zm_ffnoproc.value.ui32 = zone->zone_ffnoproc; |
| zmp->zm_ffnomem.value.ui32 = zone->zone_ffnomem; |
| zmp->zm_ffmisc.value.ui32 = zone->zone_ffmisc; |
| |
| zmp->zm_nested_intp.value.ui32 = zone->zone_nested_intp; |
| |
| zmp->zm_init_pid.value.ui32 = zone->zone_proc_initpid; |
| zmp->zm_boot_time.value.ui64 = (uint64_t)zone->zone_boot_time; |
| |
| return (0); |
| } |
| |
| static kstat_t * |
| zone_misc_kstat_create(zone_t *zone) |
| { |
| kstat_t *ksp; |
| zone_misc_kstat_t *zmp; |
| |
| if ((ksp = kstat_create_zone("zones", zone->zone_id, |
| zone->zone_name, "zone_misc", KSTAT_TYPE_NAMED, |
| sizeof (zone_misc_kstat_t) / sizeof (kstat_named_t), |
| KSTAT_FLAG_VIRTUAL, zone->zone_id)) == NULL) |
| return (NULL); |
| |
| if (zone->zone_id != GLOBAL_ZONEID) |
| kstat_zone_add(ksp, GLOBAL_ZONEID); |
| |
| zmp = ksp->ks_data = kmem_zalloc(sizeof (zone_misc_kstat_t), KM_SLEEP); |
| ksp->ks_data_size += strlen(zone->zone_name) + 1; |
| ksp->ks_lock = &zone->zone_misc_lock; |
| zone->zone_misc_stats = zmp; |
| |
| /* The kstat "name" field is not large enough for a full zonename */ |
| kstat_named_init(&zmp->zm_zonename, "zonename", KSTAT_DATA_STRING); |
| kstat_named_setstr(&zmp->zm_zonename, zone->zone_name); |
| kstat_named_init(&zmp->zm_utime, "nsec_user", KSTAT_DATA_UINT64); |
| kstat_named_init(&zmp->zm_stime, "nsec_sys", KSTAT_DATA_UINT64); |
| kstat_named_init(&zmp->zm_wtime, "nsec_waitrq", KSTAT_DATA_UINT64); |
| kstat_named_init(&zmp->zm_avenrun1, "avenrun_1min", KSTAT_DATA_UINT32); |
| kstat_named_init(&zmp->zm_avenrun5, "avenrun_5min", KSTAT_DATA_UINT32); |
| kstat_named_init(&zmp->zm_avenrun15, "avenrun_15min", |
| KSTAT_DATA_UINT32); |
| kstat_named_init(&zmp->zm_ffcap, "forkfail_cap", KSTAT_DATA_UINT32); |
| kstat_named_init(&zmp->zm_ffnoproc, "forkfail_noproc", |
| KSTAT_DATA_UINT32); |
| kstat_named_init(&zmp->zm_ffnomem, "forkfail_nomem", KSTAT_DATA_UINT32); |
| kstat_named_init(&zmp->zm_ffmisc, "forkfail_misc", KSTAT_DATA_UINT32); |
| kstat_named_init(&zmp->zm_nested_intp, "nested_interp", |
| KSTAT_DATA_UINT32); |
| kstat_named_init(&zmp->zm_init_pid, "init_pid", KSTAT_DATA_UINT32); |
| kstat_named_init(&zmp->zm_boot_time, "boot_time", KSTAT_DATA_UINT64); |
| |
| ksp->ks_update = zone_misc_kstat_update; |
| ksp->ks_private = zone; |
| |
| kstat_install(ksp); |
| return (ksp); |
| } |
| |
| static void |
| zone_kstat_create(zone_t *zone) |
| { |
| zone->zone_lockedmem_kstat = zone_kstat_create_common(zone, |
| "lockedmem", zone_lockedmem_kstat_update); |
| zone->zone_swapresv_kstat = zone_kstat_create_common(zone, |
| "swapresv", zone_swapresv_kstat_update); |
| zone->zone_nprocs_kstat = zone_kstat_create_common(zone, |
| "nprocs", zone_nprocs_kstat_update); |
| |
| if ((zone->zone_mcap_ksp = zone_mcap_kstat_create(zone)) == NULL) { |
| zone->zone_mcap_stats = kmem_zalloc( |
| sizeof (zone_mcap_kstat_t), KM_SLEEP); |
| } |
| |
| if ((zone->zone_misc_ksp = zone_misc_kstat_create(zone)) == NULL) { |
| zone->zone_misc_stats = kmem_zalloc( |
| sizeof (zone_misc_kstat_t), KM_SLEEP); |
| } |
| } |
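| /* |
| * Illustrative only (the zone name "myzone" is assumed): the kstats created |
| * above are visible from userland with kstat(1M).  The memory-cap and |
| * miscellaneous kstats use the module and name passed to kstat_create_zone() |
| * in the functions above, e.g.: |
| * |
| *	$ kstat -m memory_cap -n myzone |
| *	$ kstat -m zones -n myzone |
| * |
| * The lockedmem, swapresv and nprocs kstats are named on the zone's behalf |
| * by rctl_kstat_create_zone(). |
| */ |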
| |
| static void |
| zone_kstat_delete_common(kstat_t **pkstat, size_t datasz) |
| { |
| void *data; |
| |
| if (*pkstat != NULL) { |
| data = (*pkstat)->ks_data; |
| kstat_delete(*pkstat); |
| kmem_free(data, datasz); |
| *pkstat = NULL; |
| } |
| } |
| |
| static void |
| zone_kstat_delete(zone_t *zone) |
| { |
| zone_kstat_delete_common(&zone->zone_lockedmem_kstat, |
| sizeof (zone_kstat_t)); |
| zone_kstat_delete_common(&zone->zone_swapresv_kstat, |
| sizeof (zone_kstat_t)); |
| zone_kstat_delete_common(&zone->zone_nprocs_kstat, |
| sizeof (zone_kstat_t)); |
| zone_kstat_delete_common(&zone->zone_mcap_ksp, |
| sizeof (zone_mcap_kstat_t)); |
| zone_kstat_delete_common(&zone->zone_misc_ksp, |
| sizeof (zone_misc_kstat_t)); |
| } |
| |
| /* |
| * Called very early on in boot to initialize the ZSD list so that |
| * zone_key_create() can be called before zone_init(). It also initializes |
| * portions of zone0 which may be used before zone_init() is called. The |
| * variable "global_zone" will be set when zone0 is fully initialized by |
| * zone_init(). |
| */ |
| void |
| zone_zsd_init(void) |
| { |
| mutex_init(&zonehash_lock, NULL, MUTEX_DEFAULT, NULL); |
| mutex_init(&zsd_key_lock, NULL, MUTEX_DEFAULT, NULL); |
| list_create(&zsd_registered_keys, sizeof (struct zsd_entry), |
| offsetof(struct zsd_entry, zsd_linkage)); |
| list_create(&zone_active, sizeof (zone_t), |
| offsetof(zone_t, zone_linkage)); |
| list_create(&zone_deathrow, sizeof (zone_t), |
| offsetof(zone_t, zone_linkage)); |
| |
| mutex_init(&zone0.zone_lock, NULL, MUTEX_DEFAULT, NULL); |
| mutex_init(&zone0.zone_nlwps_lock, NULL, MUTEX_DEFAULT, NULL); |
| mutex_init(&zone0.zone_mem_lock, NULL, MUTEX_DEFAULT, NULL); |
| zone0.zone_shares = 1; |
| zone0.zone_nlwps = 0; |
| zone0.zone_nlwps_ctl = INT_MAX; |
| zone0.zone_nprocs = 0; |
| zone0.zone_nprocs_ctl = INT_MAX; |
| zone0.zone_locked_mem = 0; |
| zone0.zone_locked_mem_ctl = UINT64_MAX; |
| ASSERT(zone0.zone_max_swap == 0); |
| zone0.zone_max_swap_ctl = UINT64_MAX; |
| zone0.zone_max_lofi = 0; |
| zone0.zone_max_lofi_ctl = UINT64_MAX; |
| zone0.zone_shmmax = 0; |
| zone0.zone_ipc.ipcq_shmmni = 0; |
| zone0.zone_ipc.ipcq_semmni = 0; |
| zone0.zone_ipc.ipcq_msgmni = 0; |
| zone0.zone_name = GLOBAL_ZONENAME; |
| zone0.zone_nodename = utsname.nodename; |
| zone0.zone_domain = srpc_domain; |
| zone0.zone_hostid = HW_INVALID_HOSTID; |
| zone0.zone_fs_allowed = NULL; |
| psecflags_default(&zone0.zone_secflags); |
| zone0.zone_ref = 1; |
| zone0.zone_id = GLOBAL_ZONEID; |
| zone0.zone_status = ZONE_IS_RUNNING; |
| zone0.zone_rootpath = "/"; |
| zone0.zone_rootpathlen = 2; |
| zone0.zone_psetid = ZONE_PS_INVAL; |
| zone0.zone_ncpus = 0; |
| zone0.zone_ncpus_online = 0; |
| zone0.zone_proc_initpid = 1; |
| zone0.zone_initname = initname; |
| zone0.zone_lockedmem_kstat = NULL; |
| zone0.zone_swapresv_kstat = NULL; |
| zone0.zone_nprocs_kstat = NULL; |
| |
| list_create(&zone0.zone_ref_list, sizeof (zone_ref_t), |
| offsetof(zone_ref_t, zref_linkage)); |
| list_create(&zone0.zone_zsd, sizeof (struct zsd_entry), |
| offsetof(struct zsd_entry, zsd_linkage)); |
| list_insert_head(&zone_active, &zone0); |
| |
| /* |
| * The root filesystem is not mounted yet, so zone_rootvp cannot be set |
| * to anything meaningful. It is assigned to be 'rootdir' in |
| * vfs_mountroot(). |
| */ |
| zone0.zone_rootvp = NULL; |
| zone0.zone_vfslist = NULL; |
| zone0.zone_bootargs = initargs; |
| zone0.zone_privset = kmem_alloc(sizeof (priv_set_t), KM_SLEEP); |
| /* |
| * The global zone has all privileges |
| */ |
| priv_fillset(zone0.zone_privset); |
| /* |
| * Add p0 to the global zone |
| */ |
| zone0.zone_zsched = &p0; |
| p0.p_zone = &zone0; |
| } |
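| /* |
| * Illustrative only: a subsystem wanting per-zone state registers a ZSD key, |
| * typically from its own init path, which (per the comment above) may run |
| * before zone_init().  A minimal sketch with hypothetical callbacks and a |
| * hypothetical foo_zone_data_t: |
| * |
| *	static zone_key_t foo_zone_key; |
| * |
| *	static void * |
| *	foo_zone_init(zoneid_t zoneid) |
| *	{ |
| *		return (kmem_zalloc(sizeof (foo_zone_data_t), KM_SLEEP)); |
| *	} |
| * |
| *	static void |
| *	foo_zone_fini(zoneid_t zoneid, void *data) |
| *	{ |
| *		kmem_free(data, sizeof (foo_zone_data_t)); |
| *	} |
| * |
| *	zone_key_create(&foo_zone_key, foo_zone_init, NULL, foo_zone_fini); |
| * |
| * The per-zone data is later retrieved with |
| * zone_getspecific(foo_zone_key, zone). |
| */ |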
| |
| /* |
| * Compute a hash value based on the contents of the label and the DOI. The |
| * hash algorithm is somewhat arbitrary, but is based on the observation that |
| * humans will likely pick labels that differ by amounts that work out to be |
| * multiples of the number of hash chains, and thus stirring in some primes |
| * should help. |
| */ |
| static uint_t |
| hash_bylabel(void *hdata, mod_hash_key_t key) |
| { |
| const ts_label_t *lab = (ts_label_t *)key; |
| const uint32_t *up, *ue; |
| uint_t hash; |
| int i; |
| |
| _NOTE(ARGUNUSED(hdata)); |
| |
| hash = lab->tsl_doi + (lab->tsl_doi << 1); |
| /* we depend on alignment of label, but not representation */ |
| up = (const uint32_t *)&lab->tsl_label; |
| ue = up + sizeof (lab->tsl_label) / sizeof (*up); |
| i = 1; |
| while (up < ue) { |
| /* using 2^n + 1, 1 <= n <= 16 as source of many primes */ |
| hash += *up + (*up << ((i % 16) + 1)); |
| up++; |
| i++; |
| } |
| return (hash); |
| } |
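| /* |
| * Worked detail for the loop above: x + (x << n) is x * (2^n + 1), so each |
| * 32-bit word's contribution to the hash is *up multiplied by one of the |
| * "2^n + 1" constants the comment in the loop refers to. |
| */ |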
| |
| /* |
| * All that mod_hash cares about here is zero (equal) versus non-zero (not |
| * equal). This may need to be changed if less than / greater than is ever |
| * needed. |
| */ |
| static int |
| hash_labelkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2) |
| { |
| ts_label_t *lab1 = (ts_label_t *)key1; |
| ts_label_t *lab2 = (ts_label_t *)key2; |
| |
| return (label_equal(lab1, lab2) ? 0 : 1); |
| } |
| |
| /* |
| * Called by main() to initialize the zones framework. |
| */ |
| void |
| zone_init(void) |
| { |
| rctl_dict_entry_t *rde; |
| rctl_val_t *dval; |
| rctl_set_t *set; |
| rctl_alloc_gp_t *gp; |
| rctl_entity_p_t e; |
| int res; |
| |
| ASSERT(curproc == &p0); |
| |
| /* |
| * Create ID space for zone IDs. ID 0 is reserved for the |
| * global zone. |
| */ |
| zoneid_space = id_space_create("zoneid_space", 1, MAX_ZONEID); |
| |
| /* |
| * Initialize generic zone resource controls, if any. |
| */ |
| rc_zone_cpu_shares = rctl_register("zone.cpu-shares", |
| RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_NEVER | |
| RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER, |
| FSS_MAXSHARES, FSS_MAXSHARES, &zone_cpu_shares_ops); |
| |
| rc_zone_cpu_cap = rctl_register("zone.cpu-cap", |
| RCENTITY_ZONE, RCTL_GLOBAL_SIGNAL_NEVER | RCTL_GLOBAL_DENY_ALWAYS | |
| RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | RCTL_GLOBAL_SYSLOG_NEVER | |
| RCTL_GLOBAL_INFINITE, |
| MAXCAP, MAXCAP, &zone_cpu_cap_ops); |
| |
| rc_zone_nlwps = rctl_register("zone.max-lwps", RCENTITY_ZONE, |
| RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, |
| INT_MAX, INT_MAX, &zone_lwps_ops); |
| |
| rc_zone_nprocs = rctl_register("zone.max-processes", RCENTITY_ZONE, |
| RCTL_GLOBAL_NOACTION | RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT, |
| INT_MAX, INT_MAX, &zone_procs_ops); |
| |
| /* |
| * System V IPC resource controls |
| */ |
| rc_zone_msgmni = rctl_register("zone.max-msg-ids", |
| RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | |
| RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_msgmni_ops); |
| |
| rc_zone_semmni = rctl_register("zone.max-sem-ids", |
| RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | |
| RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_semmni_ops); |
| |
| rc_zone_shmmni = rctl_register("zone.max-shm-ids", |
| RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | |
| RCTL_GLOBAL_COUNT, IPC_IDS_MAX, IPC_IDS_MAX, &zone_shmmni_ops); |
| |
| rc_zone_shmmax = rctl_register("zone.max-shm-memory", |
| RCENTITY_ZONE, RCTL_GLOBAL_DENY_ALWAYS | RCTL_GLOBAL_NOBASIC | |
| RCTL_GLOBAL_BYTES, UINT64_MAX, UINT64_MAX, &zone_shmmax_ops); |
| |
| /* |
| * Create a rctl_val with PRIVILEGED, NOACTION, value = 1. Then attach |
| * this at the head of the rctl_dict_entry for ``zone.cpu-shares''. |
| */ |
| dval = kmem_cache_alloc(rctl_val_cache, KM_SLEEP); |
| bzero(dval, sizeof (rctl_val_t)); |
| dval->rcv_value = 1; |
| dval->rcv_privilege = RCPRIV_PRIVILEGED; |
| dval->rcv_flagaction = RCTL_LOCAL_NOACTION; |
| dval->rcv_action_recip_pid = -1; |
| |
| rde = rctl_dict_lookup("zone.cpu-shares"); |
| (void) rctl_val_list_insert(&rde->rcd_default_value, dval); |
| |
| rc_zone_locked_mem = rctl_register("zone.max-locked-memory", |
| RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | |
| RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, |
| &zone_locked_mem_ops); |
| |
| rc_zone_max_swap = rctl_register("zone.max-swap", |
| RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_BYTES | |
| RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, |
| &zone_max_swap_ops); |
| |
| rc_zone_max_lofi = rctl_register("zone.max-lofi", |
| RCENTITY_ZONE, RCTL_GLOBAL_NOBASIC | RCTL_GLOBAL_COUNT | |
| RCTL_GLOBAL_DENY_ALWAYS, UINT64_MAX, UINT64_MAX, |
| &zone_max_lofi_ops); |
| |
| /* |
| * Initialize the ``global zone''. |
| */ |
| set = rctl_set_create(); |
| gp = rctl_set_init_prealloc(RCENTITY_ZONE); |
| mutex_enter(&p0.p_lock); |
| e.rcep_p.zone = &zone0; |
| e.rcep_t = RCENTITY_ZONE; |
| zone0.zone_rctls = rctl_set_init(RCENTITY_ZONE, &p0, &e, set, |
| gp); |
| |
| zone0.zone_nlwps = p0.p_lwpcnt; |
| zone0.zone_nprocs = 1; |
| zone0.zone_ntasks = 1; |
| mutex_exit(&p0.p_lock); |
| zone0.zone_restart_init = B_TRUE; |
| zone0.zone_brand = &native_brand; |
| rctl_prealloc_destroy(gp); |
| /* |
| * pool_default hasn't been initialized yet, so we let pool_init() |
| * take care of making sure the global zone is in the default pool. |
| */ |
| |
| /* |
| * Initialize global zone kstats |
| */ |
| zone_kstat_create(&zone0); |
| |
| /* |
| * Initialize zone label. |
| * mlp are initialized when tnzonecfg is loaded. |
| */ |
| zone0.zone_slabel = l_admin_low; |
| rw_init(&zone0.zone_mlps.mlpl_rwlock, NULL, RW_DEFAULT, NULL); |
| label_hold(l_admin_low); |
| |
| /* |
| * Initialize the lock for the database structure used by mntfs. |
| */ |
| rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL); |
| |
| zone0.zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP); |
| |
| mutex_enter(&zonehash_lock); |
| zone_uniqid(&zone0); |
| ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID); |
| |
| zonehashbyid = mod_hash_create_idhash("zone_by_id", zone_hash_size, |
| mod_hash_null_valdtor); |
| zonehashbyname = mod_hash_create_strhash("zone_by_name", |
| zone_hash_size, mod_hash_null_valdtor); |
| /* |
| * maintain zonehashbylabel only for labeled systems |
| */ |
| if (is_system_labeled()) |
| zonehashbylabel = mod_hash_create_extended("zone_by_label", |
| zone_hash_size, mod_hash_null_keydtor, |
| mod_hash_null_valdtor, hash_bylabel, NULL, |
| hash_labelkey_cmp, KM_SLEEP); |
| zonecount = 1; |
| |
| (void) mod_hash_insert(zonehashbyid, (mod_hash_key_t)GLOBAL_ZONEID, |
| (mod_hash_val_t)&zone0); |
| (void) mod_hash_insert(zonehashbyname, (mod_hash_key_t)zone0.zone_name, |
| (mod_hash_val_t)&zone0); |
| if (is_system_labeled()) { |
| zone0.zone_flags |= ZF_HASHED_LABEL; |
| (void) mod_hash_insert(zonehashbylabel, |
| (mod_hash_key_t)zone0.zone_slabel, (mod_hash_val_t)&zone0); |
| } |
| mutex_exit(&zonehash_lock); |
| |
| /* |
| * We avoid setting zone_kcred until now, since kcred is initialized |
| * sometime after zone_zsd_init() and before zone_init(). |
| */ |
| zone0.zone_kcred = kcred; |
| /* |
| * The global zone is fully initialized (except for zone_rootvp which |
| * will be set when the root filesystem is mounted). |
| */ |
| global_zone = &zone0; |
| |
| /* |
| * Set up an event channel on which to send zone status change notifications |
| */ |
| res = sysevent_evc_bind(ZONE_EVENT_CHANNEL, &zone_event_chan, |
| EVCH_CREAT); |
| |
| if (res) |
| panic("Sysevent_evc_bind failed during zone setup.\n"); |
| |
| } |
| |
| static void |
| zone_free(zone_t *zone) |
| { |
| ASSERT(zone != global_zone); |
| ASSERT(zone->zone_ntasks == 0); |
| ASSERT(zone->zone_nlwps == 0); |
| ASSERT(zone->zone_nprocs == 0); |
| ASSERT(zone->zone_cred_ref == 0); |
| ASSERT(zone->zone_kcred == NULL); |
| ASSERT(zone_status_get(zone) == ZONE_IS_DEAD || |
| zone_status_get(zone) == ZONE_IS_UNINITIALIZED); |
| ASSERT(list_is_empty(&zone->zone_ref_list)); |
| |
| /* |
| * Remove any zone caps. |
| */ |
| cpucaps_zone_remove(zone); |
| |
| ASSERT(zone->zone_cpucap == NULL); |
| |
| /* remove from deathrow list */ |
| if (zone_status_get(zone) == ZONE_IS_DEAD) { |
| ASSERT(zone->zone_ref == 0); |
| mutex_enter(&zone_deathrow_lock); |
| list_remove(&zone_deathrow, zone); |
| mutex_exit(&zone_deathrow_lock); |
| } |
| |
| list_destroy(&zone->zone_ref_list); |
| zone_free_zsd(zone); |
| zone_free_datasets(zone); |
| list_destroy(&zone->zone_dl_list); |
| |
| cpu_uarray_free(zone->zone_ustate); |
| |
| if (zone->zone_rootvp != NULL) |
| VN_RELE(zone->zone_rootvp); |
| if (zone->zone_rootpath) |
| kmem_free(zone->zone_rootpath, zone->zone_rootpathlen); |
| if (zone->zone_name != NULL) |
| kmem_free(zone->zone_name, ZONENAME_MAX); |
| if (zone->zone_slabel != NULL) |
| label_rele(zone->zone_slabel); |
| if (zone->zone_nodename != NULL) |
| kmem_free(zone->zone_nodename, _SYS_NMLN); |
| if (zone->zone_domain != NULL) |
| kmem_free(zone->zone_domain, _SYS_NMLN); |
| if (zone->zone_privset != NULL) |
| kmem_free(zone->zone_privset, sizeof (priv_set_t)); |
| if (zone->zone_rctls != NULL) |
| rctl_set_free(zone->zone_rctls); |
| if (zone->zone_bootargs != NULL) |
| strfree(zone->zone_bootargs); |
| if (zone->zone_initname != NULL) |
| strfree(zone->zone_initname); |
| if (zone->zone_fs_allowed != NULL) |
| strfree(zone->zone_fs_allowed); |
| if (zone->zone_pfexecd != NULL) |
| klpd_freelist(&zone->zone_pfexecd); |
| id_free(zoneid_space, zone->zone_id); |
| mutex_destroy(&zone->zone_lock); |
| cv_destroy(&zone->zone_cv); |
| rw_destroy(&zone->zone_mlps.mlpl_rwlock); |
| rw_destroy(&zone->zone_mntfs_db_lock); |
| kmem_free(zone, sizeof (zone_t)); |
| } |
| |
| /* |
| * See block comment at the top of this file for information about zone |
| * status values. |
| */ |
| /* |
| * Convenience function for setting zone status. |
| */ |
| static void |
| zone_status_set(zone_t *zone, zone_status_t status) |
| { |
| nvlist_t *nvl = NULL; |
| |
| ASSERT(MUTEX_HELD(&zone_status_lock)); |
| ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE && |
| status >= zone_status_get(zone)); |
| |
| if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) || |
| nvlist_add_string(nvl, ZONE_CB_NAME, zone->zone_name) || |
| nvlist_add_string(nvl, ZONE_CB_NEWSTATE, |
| zone_status_table[status]) || |
| nvlist_add_string(nvl, ZONE_CB_OLDSTATE, |
| zone_status_table[zone->zone_status]) || |
| nvlist_add_int32(nvl, ZONE_CB_ZONEID, zone->zone_id) || |
| nvlist_add_uint64(nvl, ZONE_CB_TIMESTAMP, (uint64_t)gethrtime()) || |
| sysevent_evc_publish(zone_event_chan, ZONE_EVENT_STATUS_CLASS, |
| ZONE_EVENT_STATUS_SUBCLASS, "sun.com", "kernel", nvl, EVCH_SLEEP)) { |
| #ifdef DEBUG |
| (void) printf( |
| "Failed to allocate and send zone state change event.\n"); |
| #endif |
| } |
| nvlist_free(nvl); |
| |
| zone->zone_status = status; |
| |
| cv_broadcast(&zone->zone_cv); |
| } |
| |
| /* |
| * Public function to retrieve the zone status. The zone status may |
| * change after it is retrieved. |
| */ |
| zone_status_t |
| zone_status_get(zone_t *zone) |
| { |
| return (zone->zone_status); |
| } |
| |
| static int |
| zone_set_bootargs(zone_t *zone, const char *zone_bootargs) |
| { |
| char *buf = kmem_zalloc(BOOTARGS_MAX, KM_SLEEP); |
| int err = 0; |
| |
| ASSERT(zone != global_zone); |
| if ((err = copyinstr(zone_bootargs, buf, BOOTARGS_MAX, NULL)) != 0) |
| goto done; /* EFAULT or ENAMETOOLONG */ |
| |
| if (zone->zone_bootargs != NULL) |
| strfree(zone->zone_bootargs); |
| |
| zone->zone_bootargs = strdup(buf); |
| |
| done: |
| kmem_free(buf, BOOTARGS_MAX); |
| return (err); |
| } |
| |
| static int |
| zone_set_brand(zone_t *zone, const char *brand) |
| { |
| struct brand_attr *attrp; |
| brand_t *bp; |
| |
| attrp = kmem_alloc(sizeof (struct brand_attr), KM_SLEEP); |
| if (copyin(brand, attrp, sizeof (struct brand_attr)) != 0) { |
| kmem_free(attrp, sizeof (struct brand_attr)); |
| return (EFAULT); |
| } |
| |
| bp = brand_register_zone(attrp); |
| kmem_free(attrp, sizeof (struct brand_attr)); |
| if (bp == NULL) |
| return (EINVAL); |
| |
| /* |
| * This is the only place where a zone can change its brand. |
| * We already need to hold zone_status_lock to check the zone |
| * status, so we'll just use that lock to serialize zone |
| * branding requests as well. |
| */ |
| mutex_enter(&zone_status_lock); |
| |
| /* Re-branding is not allowed, and the zone must not have booted yet */ |
| if ((ZONE_IS_BRANDED(zone)) || |
| (zone_status_get(zone) >= ZONE_IS_BOOTING)) { |
| mutex_exit(&zone_status_lock); |
| brand_unregister_zone(bp); |
| return (EINVAL); |
| } |
| |
| /* set up the brand specific data */ |
| zone->zone_brand = bp; |
| ZBROP(zone)->b_init_brand_data(zone); |
| |
| mutex_exit(&zone_status_lock); |
| return (0); |
| } |
| |
| static int |
| zone_set_secflags(zone_t *zone, const psecflags_t *zone_secflags) |
| { |
| int err = 0; |
| psecflags_t psf; |
| |
| ASSERT(zone != global_zone); |
| |
| if ((err = copyin(zone_secflags, &psf, sizeof (psf))) != 0) |
| return (err); |
| |
| if (zone_status_get(zone) > ZONE_IS_READY) |
| return (EINVAL); |
| |
| if (!psecflags_validate(&psf)) |
| return (EINVAL); |
| |
| (void) memcpy(&zone->zone_secflags, &psf, sizeof (psf)); |
| |
| /* Set security flags on the zone's zsched */ |
| (void) memcpy(&zone->zone_zsched->p_secflags, &zone->zone_secflags, |
| sizeof (zone->zone_zsched->p_secflags)); |
| |
| return (0); |
| } |
| |
| static int |
| zone_set_fs_allowed(zone_t *zone, const char *zone_fs_allowed) |
| { |
| char *buf = kmem_zalloc(ZONE_FS_ALLOWED_MAX, KM_SLEEP); |
| int err = 0; |
| |
| ASSERT(zone != global_zone); |
| if ((err = copyinstr(zone_fs_allowed, buf, |
| ZONE_FS_ALLOWED_MAX, NULL)) != 0) |
| goto done; |
| |
| if (zone->zone_fs_allowed != NULL) |
| strfree(zone->zone_fs_allowed); |
| |
| zone->zone_fs_allowed = strdup(buf); |
| |
| done: |
| kmem_free(buf, ZONE_FS_ALLOWED_MAX); |
| return (err); |
| } |
| |
| static int |
| zone_set_initname(zone_t *zone, const char *zone_initname) |
| { |
| char initname[INITNAME_SZ]; |
| size_t len; |
| int err = 0; |
| |
| ASSERT(zone != global_zone); |
| if ((err = copyinstr(zone_initname, initname, INITNAME_SZ, &len)) != 0) |
| return (err); /* EFAULT or ENAMETOOLONG */ |
| |
| if (zone->zone_initname != NULL) |
| strfree(zone->zone_initname); |
| |
| zone->zone_initname = kmem_alloc(strlen(initname) + 1, KM_SLEEP); |
| (void) strcpy(zone->zone_initname, initname); |
| return (0); |
| } |
| |
| static int |
| zone_set_phys_mcap(zone_t *zone, const uint64_t *zone_mcap) |
| { |
| uint64_t mcap; |
| int err = 0; |
| |
| if ((err = copyin(zone_mcap, &mcap, sizeof (uint64_t))) == 0) |
| zone->zone_phys_mcap = mcap; |
| |
| return (err); |
| } |
| |
| static int |
| zone_set_sched_class(zone_t *zone, const char *new_class) |
| { |
| char sched_class[PC_CLNMSZ]; |
| id_t classid; |
| int err; |
| |
| ASSERT(zone != global_zone); |
| if ((err = copyinstr(new_class, sched_class, PC_CLNMSZ, NULL)) != 0) |
| return (err); /* EFAULT or ENAMETOOLONG */ |
| |
| if (getcid(sched_class, &classid) != 0 || CLASS_KERNEL(classid)) |
| return (set_errno(EINVAL)); |
| zone->zone_defaultcid = classid; |
| ASSERT(zone->zone_defaultcid > 0 && |
| zone->zone_defaultcid < loaded_classes); |
| |
| return (0); |
| } |
| |
| /* |
| * Block indefinitely waiting for (zone_status >= status) |
| */ |
| void |
| zone_status_wait(zone_t *zone, zone_status_t status) |
| { |
| ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); |
| |
| mutex_enter(&zone_status_lock); |
| while (zone->zone_status < status) { |
| cv_wait(&zone->zone_cv, &zone_status_lock); |
| } |
| mutex_exit(&zone_status_lock); |
| } |
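| /* |
| * Illustrative only: callers typically block on a specific milestone, for |
| * example waiting until zsched has finished readying the zone: |
| * |
| *	zone_status_wait(zone, ZONE_IS_READY); |
| */ |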
| |
| /* |
| * Private CPR-safe version of zone_status_wait(). |
| */ |
| static void |
| zone_status_wait_cpr(zone_t *zone, zone_status_t status, char *str) |
| { |
| callb_cpr_t cprinfo; |
| |
| ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); |
| |
| CALLB_CPR_INIT(&cprinfo, &zone_status_lock, callb_generic_cpr, |
| str); |
| mutex_enter(&zone_status_lock); |
| while (zone->zone_status < status) { |
| CALLB_CPR_SAFE_BEGIN(&cprinfo); |
| cv_wait(&zone->zone_cv, &zone_status_lock); |
| CALLB_CPR_SAFE_END(&cprinfo, &zone_status_lock); |
| } |
| /* |
| * zone_status_lock is implicitly released by the following. |
| */ |
| CALLB_CPR_EXIT(&cprinfo); |
| } |
| |
| /* |
| * Block until zone enters requested state or signal is received. Return (0) |
| * if signaled, non-zero otherwise. |
| */ |
| int |
| zone_status_wait_sig(zone_t *zone, zone_status_t status) |
| { |
| ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); |
| |
| mutex_enter(&zone_status_lock); |
| while (zone->zone_status < status) { |
| if (!cv_wait_sig(&zone->zone_cv, &zone_status_lock)) { |
| mutex_exit(&zone_status_lock); |
| return (0); |
| } |
| } |
| mutex_exit(&zone_status_lock); |
| return (1); |
| } |
| |
| /* |
| * Block until the zone enters the requested state or the timeout expires, |
| * whichever happens first. Return (-1) if operation timed out, time remaining |
| * otherwise. |
| */ |
| clock_t |
| zone_status_timedwait(zone_t *zone, clock_t tim, zone_status_t status) |
| { |
| clock_t timeleft = 0; |
| |
| ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); |
| |
| mutex_enter(&zone_status_lock); |
| while (zone->zone_status < status && timeleft != -1) { |
| timeleft = cv_timedwait(&zone->zone_cv, &zone_status_lock, tim); |
| } |
| mutex_exit(&zone_status_lock); |
| return (timeleft); |
| } |
| |
| /* |
| * Block until the zone enters the requested state, the current process is |
| * signaled, or the timeout expires, whichever happens first. Return (-1) if |
| * operation timed out, 0 if signaled, time remaining otherwise. |
| */ |
| clock_t |
| zone_status_timedwait_sig(zone_t *zone, clock_t tim, zone_status_t status) |
| { |
| clock_t timeleft = tim - ddi_get_lbolt(); |
| |
| ASSERT(status > ZONE_MIN_STATE && status <= ZONE_MAX_STATE); |
| |
| mutex_enter(&zone_status_lock); |
| while (zone->zone_status < status) { |
| timeleft = cv_timedwait_sig(&zone->zone_cv, &zone_status_lock, |
| tim); |
| if (timeleft <= 0) |
| break; |
| } |
| mutex_exit(&zone_status_lock); |
| return (timeleft); |
| } |
| |
| /* |
| * Zones have two reference counts: one for references from credential |
| * structures (zone_cred_ref), and one (zone_ref) for everything else. |
| * This is so we can allow a zone to be rebooted while there are still |
| * outstanding cred references, since certain drivers cache dblks (which |
| * implicitly results in cached creds). We wait for zone_ref to drop to |
| * 0 (actually 1), but not zone_cred_ref. The zone structure itself is |
| * later freed when the zone_cred_ref drops to 0, though nothing other |
| * than the zone id and privilege set should be accessed once the zone |
| * is "dead". |
| * |
| * A debugging flag, zone_wait_for_cred, can be set to a non-zero value |
| * to force halt/reboot to block waiting for the zone_cred_ref to drop |
| * to 0. This can be useful to flush out other sources of cached creds |
| * that may be less innocuous than the driver case. |
| * |
| * Zones also provide a tracked reference counting mechanism in which zone |
| * references are represented by "crumbs" (zone_ref structures). Crumbs help |
| * debuggers determine the sources of leaked zone references. See |
| * zone_hold_ref() and zone_rele_ref() below for more information. |
| */ |
| |
| int zone_wait_for_cred = 0; |
| |
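| /* |
| * Illustrative only: a subsystem taking a long-lived, tracked ("crumb") |
| * reference uses the zone_ref_t interfaces described above; ZONE_REF_NFS |
| * stands in for whichever zone_ref_subsys_t constant the subsystem owns: |
| * |
| *	zone_ref_t ref; |
| * |
| *	zone_init_ref(&ref); |
| *	zone_hold_ref(zone, &ref, ZONE_REF_NFS); |
| *	... |
| *	zone_rele_ref(&ref, ZONE_REF_NFS); |
| */ |
| |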
| static void |
| zone_hold_locked(zone_t *z) |
| { |
| ASSERT(MUTEX_HELD(&z->zone_lock)); |
| z->zone_ref++; |
| ASSERT(z->zone_ref != 0); |
| } |
| |
| /* |
| * Increment the specified zone's reference count. The zone's zone_t structure |
| * will not be freed as long as the zone's reference count is nonzero. |
| * Decrement the zone's reference count via zone_rele(). |
| * |
| * NOTE: This function should only be used to hold zones for short periods of |
| * time. Use zone_hold_ref() if the zone must be held for a long time. |
| */ |
| void |
| zone_hold(zone_t *z) |
| { |
| mutex_enter(&z->zone_lock); |
| zone_hold_locked(z); |
| mutex_exit(&z->zone_lock); |
| } |
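| /* |
| * Illustrative only: a caller that already has a zone_t pointer and needs |
| * it to stay valid across a short operation brackets that operation with |
| * zone_hold() and zone_rele(): |
| * |
| *	zone_hold(z); |
| *	... brief use of z ... |
| *	zone_rele(z); |
| */ |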
| |
| /* |
| * If the non-cred ref count drops to 1 and either the cred ref count |
| * is 0 or we aren't waiting for cred references, the zone is ready to |
| * be destroyed. |
| */ |
| #define ZONE_IS_UNREF(zone) ((zone)->zone_ref == 1 && \ |
| (!zone_wait_for_cred || (zone)->zone_cred_ref == 0)) |
| |
| /* |
| * Common zone reference release function invoked by zone_rele() and |
| * zone_rele_ref(). If subsys is ZONE_REF_NUM_SUBSYS, then the specified |
| * zone's subsystem-specific reference counters are not affected by the |
| * release. If ref is not NULL, then the zone_ref_t to which it refers is |
| * removed from the specified zone's reference list. ref must be non-NULL iff |
| * subsys is not ZONE_REF_NUM_SUBSYS. |
| */ |
| static void |
| zone_rele_common(zone_t *z, zone_ref_t *ref, zone_ref_subsys_t subsys) |
| { |
| boolean_t wakeup; |
| |
| mutex_enter(&z->zone_lock); |
| ASSERT(z->zone_ref != 0); |
| z->zone_ref--; |
| if (subsys != ZONE_REF_NUM_SUBSYS) { |
| ASSERT(z->zone_subsys_ref[subsys] != 0); |
| z->zone_subsys_ref[subsys]--; |
| list_remove(&z->zone_ref_list, ref); |
| } |
| if (z->zone_ref == 0 && z->zone_cred_ref == 0) { |
| /* no more refs, free the structure */ |
| mutex_exit(&z->zone_lock); |
| zone_free(z); |
| return; |
| } |
| /* signal zone_destroy so the zone can finish halting */ |
| wakeup = (ZONE_IS_UNREF(z) && zone_status_get(z) >= ZONE_IS_DEAD); |
| mutex_exit(&z->zone_lock); |
| |
| if (wakeup) { |
| /* |
| * Grabbing zonehash_lock here effectively synchronizes with |
| * zone_destroy() to avoid missed signals. |
| */ |
| mutex_enter(&zonehash_lock); |
| cv_broadcast(&zone_destroy_cv); |
| mutex_exit(&zonehash_lock); |
| } |
| } |
| |
| /* |
| * Decrement the specified zone's reference count. The specified zone will |
| * cease to exist after this function returns if the reference count drops to |
| * zero. This function should be paired with zone_hold(). |
| */ |
| void |
| zone_rele(zone_t *z) |
| { |
| zone_rele_common(z, NULL, ZONE_REF_NUM_SUBSYS); |
| } |
| |
| /* |
| * Initialize a zone reference structure. This function must be invoked for |
| * a reference structure before the structure is passed to zone_hold_ref(). |
| */ |
| void |
| zone_init_ref(zone_ref_t *ref) |
| { |
| ref->zref_zone = NULL; |
| list_link_init(&ref->zref_linkage); |
| } |
| |
| /* |
| * Acquire a reference to zone z. The caller must specify the |
| * zone_ref_subsys_t constant associated with its subsystem. The specified |
| * zone_ref_t structure will represent a reference to the specified zone. Use |
| * zone_rele_ref() to release the reference. |
| * |
| * The referenced zone_t structure will not be freed as long as the zone_t's |
| * zone_status field is not ZONE_IS_DEAD and the zone has outstanding |
| * references. |
| * |
| * NOTE: The zone_ref_t structure must be initialized before it is used. |
| * See zone_init_ref() above. |
| */ |
|