9936 atomic ops in syscall_mstate() induce significant overhead
9942 zone secflags are not initialized correctly
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Andy Fiddaman <andy@omniosce.org>
Reviewed by: Toomas Soome <tsoome@me.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
diff --git a/usr/src/pkg/manifests/system-header.mf b/usr/src/pkg/manifests/system-header.mf
index 3badc24..9130c75 100644
--- a/usr/src/pkg/manifests/system-header.mf
+++ b/usr/src/pkg/manifests/system-header.mf
@@ -856,6 +856,7 @@
file path=usr/include/sys/cpc_pcbe.h
file path=usr/include/sys/cpr.h
file path=usr/include/sys/cpu.h
+file path=usr/include/sys/cpu_uarray.h
file path=usr/include/sys/cpucaps.h
file path=usr/include/sys/cpucaps_impl.h
file path=usr/include/sys/cpupart.h
diff --git a/usr/src/uts/common/Makefile.files b/usr/src/uts/common/Makefile.files
index 370d031..f3ef668 100644
--- a/usr/src/uts/common/Makefile.files
+++ b/usr/src/uts/common/Makefile.files
@@ -51,6 +51,7 @@
cmt.o \
cmt_policy.o \
cpu.o \
+ cpu_uarray.o \
cpu_event.o \
cpu_intr.o \
cpu_pm.o \
diff --git a/usr/src/uts/common/os/cpu_uarray.c b/usr/src/uts/common/os/cpu_uarray.c
new file mode 100644
index 0000000..b26b89e
--- /dev/null
+++ b/usr/src/uts/common/os/cpu_uarray.c
@@ -0,0 +1,79 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018, Joyent, Inc.
+ */
+
+#include <sys/cpu_uarray.h>
+#include <sys/sysmacros.h>
+#include <sys/cpuvar.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+
+/*
+ * Return the number of bytes needed for a cpu_uarray_t that holds nr_items
+ * uint64_t values for each of NCPU CPUs. Each CPU's values are rounded up to
+ * a multiple of CUA_ALIGN bytes, and the size of the header is added on top.
+ */
+static size_t
+cpu_uarray_size(size_t nr_items)
+{
+	size_t per_cpu = P2ROUNDUP(nr_items * sizeof (uint64_t), CUA_ALIGN);
+
+	return (sizeof (cpu_uarray_t) + per_cpu * NCPU);
+}
+
+/*
+ * Allocate a zero-filled cpu_uarray_t with room for nr_items values per CPU.
+ * kmflags is handed straight to kmem_zalloc(); if that returns NULL, so does
+ * this function.
+ */
+cpu_uarray_t *
+cpu_uarray_zalloc(size_t nr_items, int kmflags)
+{
+	cpu_uarray_t *arr = kmem_zalloc(cpu_uarray_size(nr_items), kmflags);
+
+	if (arr == NULL)
+		return (NULL);
+
+	/* The value array must start on a CUA_ALIGN boundary. */
+	VERIFY(IS_P2ALIGNED(arr->cu_vals, CUA_ALIGN));
+	arr->cu_nr_items = nr_items;
+
+	return (arr);
+}
+
+/*
+ * Release a cpu_uarray_t obtained from cpu_uarray_zalloc(). The size handed
+ * to kmem_free() is recomputed from the item count stored in the header.
+ */
+void
+cpu_uarray_free(cpu_uarray_t *cua)
+{
+	size_t nbytes = cpu_uarray_size(cua->cu_nr_items);
+
+	kmem_free(cua, nbytes);
+}
+
+/*
+ * Return the sum, over every per-CPU slot, of the value stored at the given
+ * index. The addition saturates at UINT64_MAX rather than wrapping. index
+ * must be less than the item count the array was allocated with.
+ */
+uint64_t
+cpu_uarray_sum(cpu_uarray_t *cua, size_t index)
+{
+	uint64_t sum = 0;
+
+	VERIFY3U(index, <, cua->cu_nr_items);
+
+	/*
+	 * Writers pick their slot by cpu_id, which is an identifier and not
+	 * a count, so a bound of ncpus can miss CPUs whose id is >= ncpus.
+	 * Walk all NCPU slots that cpu_uarray_size() reserved instead; slots
+	 * that were never written are still zero from kmem_zalloc().
+	 */
+	for (size_t c = 0; c < NCPU; c++) {
+		/* Read the volatile slot only once. */
+		uint64_t addend = CPU_UARRAY_VAL(cua, c, index);
+		sum = UINT64_OVERFLOW_ADD(sum, addend);
+	}
+
+	return (sum);
+}
+
+/*
+ * Return the sum of every value in the array: all items, across every
+ * per-CPU slot. The addition saturates at UINT64_MAX rather than wrapping.
+ */
+uint64_t
+cpu_uarray_sum_all(cpu_uarray_t *cua)
+{
+	uint64_t sum = 0;
+
+	/*
+	 * Writers pick their slot by cpu_id, which is an identifier and not
+	 * a count, so a bound of ncpus can miss CPUs whose id is >= ncpus.
+	 * Walk all NCPU slots that cpu_uarray_size() reserved instead; slots
+	 * that were never written are still zero from kmem_zalloc().
+	 */
+	for (size_t c = 0; c < NCPU; c++) {
+		for (size_t i = 0; i < cua->cu_nr_items; i++) {
+			/* Read the volatile slot only once. */
+			uint64_t addend = CPU_UARRAY_VAL(cua, c, i);
+			sum = UINT64_OVERFLOW_ADD(sum, addend);
+		}
+	}
+
+	return (sum);
+}
diff --git a/usr/src/uts/common/os/msacct.c b/usr/src/uts/common/os/msacct.c
index 6699432..db3ddd7 100644
--- a/usr/src/uts/common/os/msacct.c
+++ b/usr/src/uts/common/os/msacct.c
@@ -21,7 +21,7 @@
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
- * Copyright 2012 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc.
*/
#include <sys/types.h>
@@ -416,16 +416,21 @@
newtime = curtime - ms->ms_state_start;
}
*mstimep += newtime;
- if (fromms == LMS_USER)
- atomic_add_64(&z->zone_utime, newtime);
- else if (fromms == LMS_SYSTEM)
- atomic_add_64(&z->zone_stime, newtime);
t->t_mstate = toms;
ms->ms_state_start = curtime;
ms->ms_prev = fromms;
kpreempt_disable(); /* don't change CPU while changing CPU's state */
cpu = CPU;
ASSERT(cpu == t->t_cpu);
+
+ if (fromms == LMS_USER) {
+ CPU_UARRAY_VAL(z->zone_ustate, cpu->cpu_id,
+ ZONE_USTATE_UTIME) += newtime;
+ } else if (fromms == LMS_SYSTEM) {
+ CPU_UARRAY_VAL(z->zone_ustate, cpu->cpu_id,
+ ZONE_USTATE_STIME) += newtime;
+ }
+
if ((toms != LMS_USER) && (cpu->cpu_mstate != CMS_SYSTEM)) {
NEW_CPU_MSTATE(CMS_SYSTEM);
} else if ((toms == LMS_USER) && (cpu->cpu_mstate != CMS_USER)) {
@@ -653,19 +658,6 @@
oldtime);
/*
- * When the system boots the initial startup thread will have a
- * ms_state_start of 0 which would add a huge system time to the global
- * zone. We want to skip aggregating that initial bit of work.
- */
- if (origstart != 0) {
- z = ttozone(t);
- if (state == LMS_USER)
- atomic_add_64(&z->zone_utime, ztime);
- else if (state == LMS_SYSTEM)
- atomic_add_64(&z->zone_stime, ztime);
- }
-
- /*
* Remember the previous running microstate.
*/
if (state != LMS_SLEEP && state != LMS_STOPPED)
@@ -676,7 +668,25 @@
*/
kpreempt_disable(); /* MUST disable kpreempt before touching t->cpu */
+
ASSERT(t->t_cpu == CPU);
+
+ /*
+ * When the system boots the initial startup thread will have a
+ * ms_state_start of 0 which would add a huge system time to the global
+ * zone. We want to skip aggregating that initial bit of work.
+ */
+ if (origstart != 0) {
+ z = ttozone(t);
+ if (state == LMS_USER) {
+ CPU_UARRAY_VAL(z->zone_ustate, t->t_cpu->cpu_id,
+ ZONE_USTATE_UTIME) += ztime;
+ } else if (state == LMS_SYSTEM) {
+ CPU_UARRAY_VAL(z->zone_ustate, t->t_cpu->cpu_id,
+ ZONE_USTATE_STIME) += ztime;
+ }
+ }
+
if (!CPU_ON_INTR(t->t_cpu) && curthread->t_intr == NULL) {
if (new_state == LMS_USER && t->t_cpu->cpu_mstate != CMS_USER)
new_cpu_mstate(CMS_USER, curtime);
@@ -783,7 +793,13 @@
z = ttozone(t);
waittime = curtime - waitrq;
ms->ms_acct[LMS_WAIT_CPU] += waittime;
- atomic_add_64(&z->zone_wtime, waittime);
+
+ /*
+ * We are in a disp context where we're not going to migrate CPUs.
+ */
+ CPU_UARRAY_VAL(z->zone_ustate, CPU->cpu_id,
+ ZONE_USTATE_WTIME) += waittime;
+
CPU->cpu_waitrq += waittime;
ms->ms_state_start = curtime;
}
diff --git a/usr/src/uts/common/os/zone.c b/usr/src/uts/common/os/zone.c
index 085a9f7..e89cf2c 100644
--- a/usr/src/uts/common/os/zone.c
+++ b/usr/src/uts/common/os/zone.c
@@ -170,7 +170,7 @@
*
* Ordering requirements:
* pool_lock --> cpu_lock --> zonehash_lock --> zone_status_lock -->
- * zone_lock --> zsd_key_lock --> pidlock --> p_lock
+ * zone_lock --> zsd_key_lock --> pidlock --> p_lock
*
* When taking zone_mem_lock or zone_nlwps_lock, the lock ordering is:
* zonehash_lock --> a_lock --> pidlock --> p_lock --> zone_mem_lock
@@ -1912,20 +1912,26 @@
{
zone_t *zone = ksp->ks_private;
zone_misc_kstat_t *zmp = ksp->ks_data;
- hrtime_t tmp;
+ hrtime_t hrtime;
+ uint64_t tmp;
if (rw == KSTAT_WRITE)
return (EACCES);
- tmp = zone->zone_utime;
- scalehrtime(&tmp);
- zmp->zm_utime.value.ui64 = tmp;
- tmp = zone->zone_stime;
- scalehrtime(&tmp);
- zmp->zm_stime.value.ui64 = tmp;
- tmp = zone->zone_wtime;
- scalehrtime(&tmp);
- zmp->zm_wtime.value.ui64 = tmp;
+ tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_STIME);
+ hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
+ scalehrtime(&hrtime);
+ zmp->zm_stime.value.ui64 = hrtime;
+
+ tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_UTIME);
+ hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
+ scalehrtime(&hrtime);
+ zmp->zm_utime.value.ui64 = hrtime;
+
+ tmp = cpu_uarray_sum(zone->zone_ustate, ZONE_USTATE_WTIME);
+ hrtime = UINT64_OVERFLOW_TO_INT64(tmp);
+ scalehrtime(&hrtime);
+ zmp->zm_wtime.value.ui64 = hrtime;
zmp->zm_avenrun1.value.ui32 = zone->zone_avenrun[0];
zmp->zm_avenrun5.value.ui32 = zone->zone_avenrun[1];
@@ -2097,10 +2103,6 @@
zone0.zone_swapresv_kstat = NULL;
zone0.zone_nprocs_kstat = NULL;
- zone0.zone_stime = 0;
- zone0.zone_utime = 0;
- zone0.zone_wtime = 0;
-
list_create(&zone0.zone_ref_list, sizeof (zone_ref_t),
offsetof(zone_ref_t, zref_linkage));
list_create(&zone0.zone_zsd, sizeof (struct zsd_entry),
@@ -2304,6 +2306,8 @@
*/
rw_init(&zone0.zone_mntfs_db_lock, NULL, RW_DEFAULT, NULL);
+ zone0.zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
+
mutex_enter(&zonehash_lock);
zone_uniqid(&zone0);
ASSERT(zone0.zone_uniqid == GLOBAL_ZONEUNIQID);
@@ -2388,6 +2392,8 @@
zone_free_datasets(zone);
list_destroy(&zone->zone_dl_list);
+ cpu_uarray_free(zone->zone_ustate);
+
if (zone->zone_rootvp != NULL)
VN_RELE(zone->zone_rootvp);
if (zone->zone_rootpath)
@@ -3203,12 +3209,13 @@
* Based on loadavg_update(), genloadavg() and calcloadavg() from clock.c.
*/
void
-zone_loadavg_update()
+zone_loadavg_update(void)
{
zone_t *zp;
zone_status_t status;
struct loadavg_s *lavg;
hrtime_t zone_total;
+ uint64_t tmp;
int i;
hrtime_t hr_avg;
int nrun;
@@ -3233,7 +3240,9 @@
*/
lavg = &zp->zone_loadavg;
- zone_total = zp->zone_utime + zp->zone_stime + zp->zone_wtime;
+ tmp = cpu_uarray_sum_all(zp->zone_ustate);
+ zone_total = UINT64_OVERFLOW_TO_INT64(tmp);
+
scalehrtime(&zone_total);
/* The zone_total should always be increasing. */
@@ -4232,8 +4241,8 @@
* Where each element of the nvpair_list_array is of the form:
*
* [(name = "privilege", value = RCPRIV_PRIVILEGED),
- * (name = "limit", value = uint64_t),
- * (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
+ * (name = "limit", value = uint64_t),
+ * (name = "action", value = (RCTL_LOCAL_NOACTION || RCTL_LOCAL_DENY))]
*/
static int
parse_rctls(caddr_t ubuf, size_t buflen, nvlist_t **nvlp)
@@ -4523,10 +4532,7 @@
zone->zone_bootargs = NULL;
zone->zone_fs_allowed = NULL;
- secflags_zero(&zone0.zone_secflags.psf_lower);
- secflags_zero(&zone0.zone_secflags.psf_effective);
- secflags_zero(&zone0.zone_secflags.psf_inherit);
- secflags_fullset(&zone0.zone_secflags.psf_upper);
+ psecflags_default(&zone->zone_secflags);
zone->zone_initname =
kmem_alloc(strlen(zone_default_initname) + 1, KM_SLEEP);
@@ -4544,6 +4550,8 @@
zone0.zone_lockedmem_kstat = NULL;
zone0.zone_swapresv_kstat = NULL;
+ zone->zone_ustate = cpu_uarray_zalloc(ZONE_USTATE_MAX, KM_SLEEP);
+
/*
* Zsched initializes the rctls.
*/
diff --git a/usr/src/uts/common/sys/Makefile b/usr/src/uts/common/sys/Makefile
index 66433e9..8d26a71 100644
--- a/usr/src/uts/common/sys/Makefile
+++ b/usr/src/uts/common/sys/Makefile
@@ -21,7 +21,7 @@
#
# Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved.
-# Copyright 2014, Joyent, Inc. All rights reserved.
+# Copyright (c) 2018, Joyent, Inc.
# Copyright 2013 Garrett D'Amore <garrett@damore.org>
# Copyright 2013 Saso Kiselkov. All rights reserved.
# Copyright 2015 Igor Kozhukhov <ikozhukhov@gmail.com>
@@ -138,6 +138,7 @@
cpc_impl.h \
cpc_pcbe.h \
cpr.h \
+ cpu_uarray.h \
cpupart.h \
cpuvar.h \
crc32.h \
diff --git a/usr/src/uts/common/sys/cpu_uarray.h b/usr/src/uts/common/sys/cpu_uarray.h
new file mode 100644
index 0000000..9cad772
--- /dev/null
+++ b/usr/src/uts/common/sys/cpu_uarray.h
@@ -0,0 +1,81 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018, Joyent, Inc.
+ */
+
+/*
+ * Use a cpu_uarray_t for an array of uint64_t values that are written on a
+ * per-CPU basis. We align each CPU's values on a 128-byte boundary (so two
+ * cachelines). It's not clear why, but this can have a significant effect in
+ * multi-socket systems running certain benchmarks on a relatively current
+ * Intel system.
+ *
+ * So the layout of the value array is like this, for example (byte offsets
+ * from the start of cu_vals, with three stats per CPU):
+ *
+ * 0: STAT1 for CPU 0
+ * 8: STAT2 for CPU 0
+ * 16: STAT3 for CPU 0
+ * 24: padding
+ * 128: STAT1 for CPU 1
+ * 136: STAT2 for CPU 1
+ * ...
+ *
+ * At collection time, cpu_uarray_sum() can be used to sum the given value index
+ * across all CPUs, or cpu_uarray_sum_all() sums all stats across all CPUs.
+ * The summation is done such that it saturates at UINT64_MAX.
+ */
+
+#ifndef _SYS_CPU_UARRAY_H
+#define _SYS_CPU_UARRAY_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+/*
+ * Trying to include sysmacros.h for P2ROUNDUP() here is just too painful.
+ */
+/*
+ * CUA_ROUNDUP rounds x up to a multiple of align; the negate-and-mask form
+ * only gives that result when align is a power of two.
+ */
+#define CUA_ROUNDUP(x, align) (-(-(x) & -(align)))
+/* Size, in bytes, that each CPU's group of values is padded out to. */
+#define CUA_ALIGN (128)
+/*
+ * Number of uint64_t slots occupied by one CPU: nr_items rounded up to a
+ * multiple of the number of uint64_t values that fit in CUA_ALIGN bytes.
+ */
+#define CUA_CPU_STRIDE(nr_items) \
+ CUA_ROUNDUP((nr_items), CUA_ALIGN / sizeof (uint64_t))
+/* Flat index into the value array of item i belonging to CPU c. */
+#define CUA_INDEX(nr_items, c, i) (((c) * CUA_CPU_STRIDE(nr_items)) + (i))
+
+/*
+ * The cu_vals element for the given stat on the given CPU. This expands to an
+ * array element, so it can be read or assigned to. No bounds checking is done
+ * on either index.
+ */
+#define CPU_UARRAY_VAL(cua, cpu_index, stat_index) \
+ ((cua)->cu_vals[CUA_INDEX((cua)->cu_nr_items, cpu_index, stat_index)])
+
+/*
+ * Header of a per-CPU array of uint64_t values. The header is padded to
+ * CUA_ALIGN bytes so that cu_vals begins CUA_ALIGN bytes into the structure,
+ * and the type itself is declared with CUA_ALIGN alignment.
+ */
+typedef struct {
+ /* Number of values kept per CPU; set by cpu_uarray_zalloc(). */
+ uint64_t cu_nr_items;
+ /* Pads the header out to CUA_ALIGN bytes. */
+ char cu_pad[CUA_ALIGN - sizeof (uint64_t)];
+ /* The values themselves, indexed via CPU_UARRAY_VAL(). */
+#ifdef __lint
+ volatile uint64_t cu_vals[1];
+#else
+ volatile uint64_t cu_vals[];
+#endif
+} cpu_uarray_t __aligned(CUA_ALIGN);
+
+/*
+ * cpu_uarray_zalloc(nr_items, kmflags) returns a zeroed array with nr_items
+ * values per CPU, or NULL if the underlying kmem_zalloc() fails.
+ */
+extern cpu_uarray_t *cpu_uarray_zalloc(size_t, int);
+/* Frees an array returned by cpu_uarray_zalloc(). */
+extern void cpu_uarray_free(cpu_uarray_t *);
+/* Saturating sum of the value at one index across CPUs. */
+extern uint64_t cpu_uarray_sum(cpu_uarray_t *, size_t);
+/* Saturating sum of every value at every index across CPUs. */
+extern uint64_t cpu_uarray_sum_all(cpu_uarray_t *);
+
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CPU_UARRAY_H */
diff --git a/usr/src/uts/common/sys/sysmacros.h b/usr/src/uts/common/sys/sysmacros.h
index 6f5882b..5dc6eee 100644
--- a/usr/src/uts/common/sys/sysmacros.h
+++ b/usr/src/uts/common/sys/sysmacros.h
@@ -373,6 +373,19 @@
#define ARRAY_SIZE(x) (sizeof (x) / sizeof (x[0]))
#endif
+/*
+ * Saturating add for uint64_t operands: yields (val) + (add), or UINT64_MAX
+ * when that sum would wrap around. Both arguments are evaluated more than
+ * once, so they must be free of side effects.
+ */
+#define	UINT64_OVERFLOW_ADD(val, add) \
+	(((val) + (add)) < (val) ? (UINT64_MAX) : ((val) + (add)))
+
+/*
+ * Clamp an unsigned 64-bit value into the int64_t range: anything above
+ * INT64_MAX becomes INT64_MAX, everything else is converted unchanged.
+ */
+#define	UINT64_OVERFLOW_TO_INT64(uval) \
+	(((uval) <= INT64_MAX) ? (int64_t)(uval) : INT64_MAX)
+
#ifdef __cplusplus
}
#endif
diff --git a/usr/src/uts/common/sys/zone.h b/usr/src/uts/common/sys/zone.h
index 27f52c5..56fa4b8 100644
--- a/usr/src/uts/common/sys/zone.h
+++ b/usr/src/uts/common/sys/zone.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright 2015 Joyent, Inc. All rights reserved.
+ * Copyright 2018 Joyent, Inc.
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright 2014 Igor Kozhukhov <ikozhukhov@gmail.com>.
*/
@@ -42,6 +42,7 @@
#include <sys/socket_impl.h>
#include <sys/secflags.h>
#include <netinet/in.h>
+#include <sys/cpu_uarray.h>
#ifdef __cplusplus
extern "C" {
@@ -330,6 +331,15 @@
#define GLOBAL_ZONEUNIQID 0 /* uniqid of the global zone */
+/*
+ * Indexes into the per-CPU ->zone_ustate array, which accumulates microstate
+ * accounting time for all threads in a particular zone:
+ *
+ *	ZONE_USTATE_STIME	time accounted to LMS_SYSTEM
+ *	ZONE_USTATE_UTIME	time accounted to LMS_USER
+ *	ZONE_USTATE_WTIME	time accounted to LMS_WAIT_CPU
+ *
+ * ZONE_USTATE_MAX is the number of indexes, and is the item count the array
+ * is allocated with. The stored values are not scaled; readers in zone.c
+ * pass the summed totals through scalehrtime().
+ */
+#define ZONE_USTATE_STIME (0)
+#define ZONE_USTATE_UTIME (1)
+#define ZONE_USTATE_WTIME (2)
+#define ZONE_USTATE_MAX (3)
+
struct pool;
struct brand;
@@ -433,13 +443,13 @@
/* if not emulated */
/*
* zone_lock protects the following fields of a zone_t:
- * zone_ref
- * zone_cred_ref
- * zone_subsys_ref
- * zone_ref_list
- * zone_ntasks
- * zone_flags
- * zone_zsd
+ * zone_ref
+ * zone_cred_ref
+ * zone_subsys_ref
+ * zone_ref_list
+ * zone_ntasks
+ * zone_flags
+ * zone_zsd
* zone_pfexecd
*/
kmutex_t zone_lock;
@@ -543,7 +553,7 @@
boolean_t zone_restart_init; /* Restart init if it dies? */
struct brand *zone_brand; /* zone's brand */
- void *zone_brand_data; /* store brand specific data */
+ void *zone_brand_data; /* store brand specific data */
id_t zone_defaultcid; /* dflt scheduling class id */
kstat_t *zone_swapresv_kstat;
kstat_t *zone_lockedmem_kstat;
@@ -584,22 +594,12 @@
/*
* Misc. kstats and counters for zone cpu-usage aggregation.
- * The zone_Xtime values are the sum of the micro-state accounting
- * values for all threads that are running or have run in the zone.
- * This is tracked in msacct.c as threads change state.
- * The zone_stime is the sum of the LMS_SYSTEM times.
- * The zone_utime is the sum of the LMS_USER times.
- * The zone_wtime is the sum of the LMS_WAIT_CPU times.
- * As with per-thread micro-state accounting values, these values are
- * not scaled to nanosecs. The scaling is done by the
- * zone_misc_kstat_update function when kstats are requested.
*/
kmutex_t zone_misc_lock; /* protects misc statistics */
kstat_t *zone_misc_ksp;
zone_misc_kstat_t *zone_misc_stats;
- uint64_t zone_stime; /* total system time */
- uint64_t zone_utime; /* total user time */
- uint64_t zone_wtime; /* total time waiting in runq */
+ /* Accumulated microstate for all threads in this zone. */
+ cpu_uarray_t *zone_ustate;
/* fork-fail kstat tracking */
uint32_t zone_ffcap; /* hit an rctl cap */
uint32_t zone_ffnoproc; /* get proc/lwp error */
@@ -681,7 +681,7 @@
extern void zone_key_create(zone_key_t *, void *(*)(zoneid_t),
void (*)(zoneid_t, void *), void (*)(zoneid_t, void *));
-extern int zone_key_delete(zone_key_t);
+extern int zone_key_delete(zone_key_t);
extern void *zone_getspecific(zone_key_t, zone_t *);
extern int zone_setspecific(zone_key_t, zone_t *, const void *);
@@ -707,7 +707,7 @@
void (*zsd_shutdown)(zoneid_t, void *);
void (*zsd_destroy)(zoneid_t, void *);
list_node_t zsd_linkage;
- uint16_t zsd_flags; /* See below */
+ uint16_t zsd_flags; /* See below */
kcondvar_t zsd_cv;
};