blob: 43ef57f34eefa49d6e5e46deb6bac1416a161995 [file] [log] [blame]
Robert Mustacchi047043c2020-04-08 21:35:09 -07001/*
2 * This file and its contents are supplied under the terms of the
3 * Common Development and Distribution License ("CDDL"), version 1.0.
4 * You may only use this file in accordance with the terms of version
5 * 1.0 of the CDDL.
6 *
7 * A full copy of the text of the CDDL should have accompanied this
8 * source. A copy of the CDDL is also available via the Internet at
9 * http://www.illumos.org/license/CDDL.
10 */
11
12/*
13 * Copyright 2019, Joyent, Inc.
Keith M Wesolowskiba215ef2022-07-21 06:57:54 -070014 * Copyright 2022 Oxide Computer Company
Robert Mustacchi047043c2020-04-08 21:35:09 -070015 */
16
17/*
18 * This implements a temperature sensor for AMD Zen family products that rely
19 * upon the SMN framework for getting temperature information.
20 */
21
22#include <sys/modctl.h>
23#include <sys/conf.h>
24#include <sys/devops.h>
25#include <sys/types.h>
26#include <sys/cred.h>
27#include <sys/ddi.h>
28#include <sys/sunddi.h>
29#include <sys/cmn_err.h>
30#include <sys/x86_archext.h>
31#include <sys/cpuvar.h>
32#include <sys/sensors.h>
33#include <sys/sysmacros.h>
Keith M Wesolowskiba215ef2022-07-21 06:57:54 -070034#include <sys/amdzen/smn.h>
Robert Mustacchi047043c2020-04-08 21:35:09 -070035#include <amdzen_client.h>
36
37/*
38 * The following are register offsets and the meaning of their bits related to
39 * temperature. These addresses reside in the System Management Network which is
40 * accessed through the northbridge. They are not addresses in PCI configuration
41 * space.
42 */
/* SMN register that holds the current temperature reading. */
#define	SMN_SMU_THERMAL_CURTEMP			SMN_MAKE_REG(0x00059800)
/* The temperature field occupies the register bits from bit 21 upwards. */
#define	SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(x)	((x) >> 21)
/* When this bit is set, the range adjustment below is applied. */
#define	SMN_SMU_THERMAL_CURTEMP_RANGE_SEL	(1 << 19)

/* Whole degrees added to the reading when RANGE_SEL is set. */
#define	SMN_SMU_THERMAL_CURTEMP_RANGE_ADJ	(-49)
/* Number of low bits of the temperature field that hold the fraction. */
#define	SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS	3
/* Mask that selects the fractional bits of the temperature field. */
#define	SMN_SMU_THERMAL_CURTEMP_BITS_MASK	0x7
50
/*
 * The temperature sensor in Family 17h is measured in terms of 0.125 C steps,
 * i.e. reported values are in units of 1/8 of a degree. This is the value
 * handed back as the sensor's granularity.
 */
#define	SMN_THERMAL_GRANULARITY	8
55
/*
 * Per-sensor state flags. These record which pieces of a smntemp_temp_t have
 * been set up so that cleanup only tears down what was initialized.
 */
typedef enum {
	/* stt_mutex has been initialized and must be destroyed on cleanup. */
	SMNTEMP_F_MUTEX	= 1 << 0
} smntemp_flags_t;
59
/*
 * State for a single temperature sensor. One of these exists for each data
 * fabric instance counted at attach time.
 */
typedef struct {
	uint_t stt_dfno;		/* DF number used for SMN reads */
	id_t stt_ksensor;		/* id returned by ksensor_create() */
	struct smntemp *stt_smn;	/* back pointer to the driver state */
	smntemp_flags_t stt_flags;	/* SMNTEMP_F_* initialization flags */
	kmutex_t stt_mutex;		/* held while reading and updating */
	hrtime_t stt_last_read;		/* gethrtime() of last successful read */
	uint32_t stt_reg;		/* raw register value from last read */
	int64_t stt_temp;		/* decoded temperature, 1/8 degree units */
} smntemp_temp_t;
70
/*
 * Driver-wide state.
 */
typedef struct smntemp {
	dev_info_t *smn_dip;		/* attached devinfo; NULL if unattached */
	uint_t smn_ntemps;		/* number of entries in smn_temps */
	int smn_offset;			/* whole degrees added to each reading */
	smntemp_temp_t *smn_temps;	/* array of per-DF sensor state */
} smntemp_t;
77
/*
 * The single instance of driver state. Attach refuses to proceed if smn_dip
 * is already set here.
 */
static smntemp_t smntemp_data;
79
80/*
81 * AMD processors report a control temperature (called Tctl) which may be
82 * different from the junction temperature, which is the value that is actually
83 * measured from the die (sometimes called Tdie or Tjct). This is done so that
84 * socket-based environmental monitoring can be consistent from a platform
85 * perspective, but doesn't help us. Unfortunately, these values aren't in
86 * datasheets that we can find, but have been documented partially in a series
87 * of blog posts by AMD when discussing their 'Ryzen Master' monitoring software
88 * for Windows.
89 *
 * The brand strings below may contain partial matches, such as in the
 * Threadripper cases, so that we can match the entire family of processors.
 * The offset value is the quantity in degrees that we should adjust Tctl by
 * to reach Tdie.
93 */
typedef struct {
	const char *sto_brand;	/* brand string prefix to compare against */
	uint_t sto_family;	/* processor family the entry applies to */
	int sto_off;		/* whole degrees to add to Tctl */
} smntemp_offset_t;
99
100static const smntemp_offset_t smntemp_offsets[] = {
101 { "AMD Ryzen 5 1600X", 0x17, -20 },
102 { "AMD Ryzen 7 1700X", 0x17, -20 },
103 { "AMD Ryzen 7 1800X", 0x17, -20 },
104 { "AMD Ryzen 7 2700X", 0x17, -10 },
105 { "AMD Ryzen Threadripper 19", 0x17, -27 },
106 { "AMD Ryzen Threadripper 29", 0x17, -27 },
107 { NULL }
108};
109
110static int
111smntemp_temp_update(smntemp_t *smn, smntemp_temp_t *stt)
112{
113 int ret;
114 uint32_t reg;
115 int64_t raw, decimal;
116
117 ASSERT(MUTEX_HELD((&stt->stt_mutex)));
118
Keith M Wesolowski4adf43b2022-11-09 07:00:30 +0000119 if ((ret = amdzen_c_smn_read(stt->stt_dfno, SMN_SMU_THERMAL_CURTEMP,
Robert Mustacchi047043c2020-04-08 21:35:09 -0700120 &reg)) != 0) {
121 return (ret);
122 }
123
124 stt->stt_last_read = gethrtime();
125 stt->stt_reg = reg;
126 raw = SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(reg) >>
127 SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS;
128 decimal = SMN_SMU_THERMAL_CURTEMP_TEMPERATURE(reg) &
129 SMN_SMU_THERMAL_CURTEMP_BITS_MASK;
130 if ((reg & SMN_SMU_THERMAL_CURTEMP_RANGE_SEL) != 0) {
131 raw += SMN_SMU_THERMAL_CURTEMP_RANGE_ADJ;
132 }
133 raw += smn->smn_offset;
134 stt->stt_temp = raw << SMN_SMU_THERMAL_CURTEMP_DECIMAL_BITS;
135 stt->stt_temp += decimal;
136
137 return (0);
138}
139
140static int
141smntemp_temp_read(void *arg, sensor_ioctl_scalar_t *temp)
142{
143 int ret;
144 smntemp_temp_t *stt = arg;
145 smntemp_t *smn = stt->stt_smn;
146
147 mutex_enter(&stt->stt_mutex);
148 if ((ret = smntemp_temp_update(smn, stt)) != 0) {
149 mutex_exit(&stt->stt_mutex);
150 return (ret);
151 }
152
153 temp->sis_unit = SENSOR_UNIT_CELSIUS;
154 temp->sis_value = stt->stt_temp;
155 temp->sis_gran = SMN_THERMAL_GRANULARITY;
156 mutex_exit(&stt->stt_mutex);
157
158 return (0);
159}
160
/*
 * Sensor operations handed to ksensor_create() for every sensor this driver
 * registers: a temperature-kind sensor whose scalar value is produced by
 * smntemp_temp_read().
 */
static const ksensor_ops_t smntemp_temp_ops = {
	.kso_kind = ksensor_kind_temperature,
	.kso_scalar = smntemp_temp_read
};
165
166static void
167smntemp_cleanup(smntemp_t *smn)
168{
169 if (smn->smn_temps != NULL) {
170 uint_t i;
171
172 (void) ksensor_remove(smn->smn_dip, KSENSOR_ALL_IDS);
173 for (i = 0; i < smn->smn_ntemps; i++) {
174 if ((smn->smn_temps[i].stt_flags & SMNTEMP_F_MUTEX) !=
175 0) {
176 mutex_destroy(&smn->smn_temps[i].stt_mutex);
177 smn->smn_temps[i].stt_flags &= ~SMNTEMP_F_MUTEX;
178 }
179 }
180 kmem_free(smn->smn_temps, sizeof (smntemp_temp_t) *
181 smn->smn_ntemps);
182 smn->smn_temps = NULL;
183 smn->smn_ntemps = 0;
184 }
185
186 if (smn->smn_dip != NULL) {
187 ddi_remove_minor_node(smn->smn_dip, NULL);
188 ddi_set_driver_private(smn->smn_dip, NULL);
189 smn->smn_dip = NULL;
190 }
191}
192
193static boolean_t
194smntemp_find_offset(smntemp_t *smn)
195{
196 uint_t i, family;
197 char buf[256];
198
199 if (cpuid_getbrandstr(CPU, buf, sizeof (buf)) >= sizeof (buf)) {
200 dev_err(smn->smn_dip, CE_WARN, "!failed to read processor "
201 "brand string, brand larger than internal buffer");
202 return (B_FALSE);
203 }
204
205 family = cpuid_getfamily(CPU);
206
207 for (i = 0; i < ARRAY_SIZE(smntemp_offsets); i++) {
208 if (family != smntemp_offsets[i].sto_family)
209 continue;
210 if (strncmp(buf, smntemp_offsets[i].sto_brand,
211 strlen(smntemp_offsets[i].sto_brand)) == 0) {
212 smn->smn_offset = smntemp_offsets[i].sto_off;
213 break;
214 }
215 }
216
217 return (B_TRUE);
218}
219
220static int
221smntemp_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
222{
223 uint_t i;
224 smntemp_t *smntemp = &smntemp_data;
225
226 if (cmd == DDI_RESUME) {
227 return (DDI_SUCCESS);
228 } else if (cmd != DDI_ATTACH) {
229 return (DDI_FAILURE);
230 }
231
232 if (smntemp->smn_dip != NULL) {
233 dev_err(dip, CE_WARN, "!smntemp already attached");
234 return (DDI_FAILURE);
235 }
236 smntemp->smn_dip = dip;
237 ddi_set_driver_private(dip, smntemp);
238
239 if (!smntemp_find_offset(smntemp)) {
240 goto err;
241 }
242
243 smntemp->smn_ntemps = amdzen_c_df_count();
244 if (smntemp->smn_ntemps == 0) {
245 dev_err(dip, CE_WARN, "!found zero DFs, can't attach smntemp");
246 goto err;
247 }
248 smntemp->smn_temps = kmem_zalloc(sizeof (smntemp_temp_t) *
249 smntemp->smn_ntemps, KM_SLEEP);
250 for (i = 0; i < smntemp->smn_ntemps; i++) {
251 int ret;
252 char buf[128];
253
254 smntemp->smn_temps[i].stt_smn = smntemp;
255 smntemp->smn_temps[i].stt_dfno = i;
256 mutex_init(&smntemp->smn_temps[i].stt_mutex, NULL, MUTEX_DRIVER,
257 NULL);
258 smntemp->smn_temps[i].stt_flags |= SMNTEMP_F_MUTEX;
259
260 if (snprintf(buf, sizeof (buf), "procnode.%u", i) >=
261 sizeof (buf)) {
262 dev_err(dip, CE_WARN, "!unexpected buffer name overrun "
263 "assembling temperature minor %u", i);
264 goto err;
265 }
266
267 if ((ret = ksensor_create(dip, &smntemp_temp_ops,
268 &smntemp->smn_temps[i], buf, DDI_NT_SENSOR_TEMP_CPU,
269 &smntemp->smn_temps[i].stt_ksensor)) != 0) {
270 dev_err(dip, CE_WARN, "!failed to create sensor %s: %d",
271 buf, ret);
272 goto err;
273 }
274 }
275
276 return (DDI_SUCCESS);
277
278err:
279 smntemp_cleanup(smntemp);
280 return (DDI_FAILURE);
281}
282
283static int
284smntemp_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
285{
286 smntemp_t *smntemp = &smntemp_data;
287
288 if (cmd == DDI_SUSPEND) {
289 return (DDI_SUCCESS);
290 } else if (cmd != DDI_DETACH) {
291 return (DDI_FAILURE);
292 }
293
294 if (smntemp->smn_dip == NULL) {
295 dev_err(smntemp->smn_dip, CE_WARN, "!asked to detach smn "
296 "instance %d that was never attached",
297 ddi_get_instance(dip));
298 return (DDI_FAILURE);
299 }
300
301 smntemp_cleanup(smntemp);
302 return (DDI_SUCCESS);
303}
304
/*
 * Device operations for the driver. Only attach and detach have
 * driver-specific implementations; the remaining entry points are filled in
 * with the generic nodev/nulldev routines and quiesce is declared as not
 * needed.
 */
static struct dev_ops smntemp_dev_ops = {
	.devo_rev = DEVO_REV,
	.devo_refcnt = 0,
	.devo_getinfo = nodev,
	.devo_identify = nulldev,
	.devo_probe = nulldev,
	.devo_attach = smntemp_attach,
	.devo_detach = smntemp_detach,
	.devo_reset = nodev,
	.devo_quiesce = ddi_quiesce_not_needed,
};
316
/*
 * Module linkage entry that ties the driver's description string and its
 * dev_ops to the driver module operations.
 */
static struct modldrv smntemp_modldrv = {
	.drv_modops = &mod_driverops,
	.drv_linkinfo = "AMD SMN Temperature Driver",
	.drv_dev_ops = &smntemp_dev_ops
};
322
/*
 * Top-level linkage structure passed to mod_install(), mod_info() and
 * mod_remove() by _init(), _info() and _fini() respectively.
 */
static struct modlinkage smntemp_modlinkage = {
	.ml_rev = MODREV_1,
	.ml_linkage = { &smntemp_modldrv, NULL }
};
327
328int
329_init(void)
330{
331 return (mod_install(&smntemp_modlinkage));
332}
333
334int
335_info(struct modinfo *modinfop)
336{
337 return (mod_info(&smntemp_modlinkage, modinfop));
338}
339
340int
341_fini(void)
342{
343 return (mod_remove(&smntemp_modlinkage));
344}