/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright 2017 Joyent, Inc.
 */

/*
 * Support for the eventfd facility, a Linux-borne facility for user-generated
 * file descriptor-based events.
 */

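/*
 * Illustrative usage (a sketch, not part of the driver proper): userland is
 * assumed here to reach this code through the eventfd(3C) wrapper, which
 * opens /dev/eventfd and applies the EFD_* flags on the caller's behalf.
 * With default (non-semaphore) semantics:
 *
 *	int fd = eventfd(0, 0);
 *	uint64_t val = 3;
 *	(void) write(fd, &val, sizeof (val));	adds 3 to the count
 *	(void) read(fd, &val, sizeof (val));	val == 3; count resets to 0
 *
 * With EFD_SEMAPHORE set, each read() instead returns 1 and decrements the
 * count by one.
 */
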
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/eventfd.h>
#include <sys/conf.h>
#include <sys/vmem.h>
#include <sys/sysmacros.h>
#include <sys/filio.h>
#include <sys/stat.h>
#include <sys/file.h>

struct eventfd_state;
typedef struct eventfd_state eventfd_state_t;

struct eventfd_state {
	kmutex_t efd_lock;		/* lock protecting state */
	boolean_t efd_semaphore;	/* boolean: sema. semantics */
	kcondvar_t efd_cv;		/* condvar */
	pollhead_t efd_pollhd;		/* poll head */
	uint64_t efd_value;		/* value */
	size_t efd_bwriters;		/* count of blocked writers */
	eventfd_state_t *efd_next;	/* next state on global list */
};

/*
 * Internal global variables.
 */
static kmutex_t		eventfd_lock;		/* lock protecting state */
static dev_info_t	*eventfd_devi;		/* device info */
static vmem_t		*eventfd_minor;		/* minor number arena */
static void		*eventfd_softstate;	/* softstate pointer */
static eventfd_state_t	*eventfd_state;		/* global list of state */

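/*
 * Each open of /dev/eventfd clones a new instance: a fresh minor number is
 * drawn from the eventfd_minor arena, zeroed soft state is attached to it,
 * and that state is linked onto the global eventfd_state list.
 */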
/*ARGSUSED*/
static int
eventfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
{
	eventfd_state_t *state;
	major_t major = getemajor(*devp);
	minor_t minor = getminor(*devp);

	if (minor != EVENTFDMNRN_EVENTFD)
		return (ENXIO);

	mutex_enter(&eventfd_lock);

	minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1,
	    VM_BESTFIT | VM_SLEEP);

	if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) {
		vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);
		mutex_exit(&eventfd_lock);
		return (ENOMEM);
	}

	state = ddi_get_soft_state(eventfd_softstate, minor);
	*devp = makedevice(major, minor);

	state->efd_next = eventfd_state;
	eventfd_state = state;

	mutex_exit(&eventfd_lock);

	return (0);
}

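/*
 * read(9E) blocks (unless FNONBLOCK/FNDELAY is set) until the count is
 * non-zero, then returns either the full count (default) or 1 (semaphore
 * semantics), decrementing the count accordingly and waking any blocked
 * writers.
 */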
/*ARGSUSED*/
static int
eventfd_read(dev_t dev, uio_t *uio, cred_t *cr)
{
	eventfd_state_t *state;
	minor_t minor = getminor(dev);
	uint64_t val, oval;
	int err;

	if (uio->uio_resid < sizeof (val))
		return (EINVAL);

	state = ddi_get_soft_state(eventfd_softstate, minor);

	mutex_enter(&state->efd_lock);

	while (state->efd_value == 0) {
		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
			mutex_exit(&state->efd_lock);
			return (EAGAIN);
		}

		if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
			mutex_exit(&state->efd_lock);
			return (EINTR);
		}
	}

	/*
	 * We have a non-zero value and we own the lock; our behavior now
	 * depends on whether or not EFD_SEMAPHORE was set when the eventfd
	 * was created.
	 */
	val = oval = state->efd_value;

	if (state->efd_semaphore) {
		state->efd_value--;
		val = 1;
	} else {
		state->efd_value = 0;
	}

	err = uiomove(&val, sizeof (val), UIO_READ, uio);

	/*
	 * Wake any writers blocked on this eventfd as this read operation may
	 * have created adequate capacity for their values.
	 */
	if (state->efd_bwriters != 0) {
		cv_broadcast(&state->efd_cv);
	}
	mutex_exit(&state->efd_lock);

	/*
	 * It is necessary to emit POLLOUT events only when the eventfd
	 * transitions from EVENTFD_VALMAX to a lower value.  At all other
	 * times, it is already considered writable by poll.
	 */
	if (oval == EVENTFD_VALMAX) {
		pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT);
	}

	return (err);
}

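/*
 * write(9E) adds the supplied 8-byte value to the count, blocking (unless
 * FNONBLOCK/FNDELAY is set) while the addition would push the count past
 * EVENTFD_VALMAX.  A transition away from zero wakes blocked readers and
 * pollers.
 */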
/*ARGSUSED*/
static int
eventfd_write(dev_t dev, struct uio *uio, cred_t *credp)
{
	eventfd_state_t *state;
	minor_t minor = getminor(dev);
	uint64_t val, oval;
	int err;

	if (uio->uio_resid < sizeof (val))
		return (EINVAL);

	if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0)
		return (err);

	if (val > EVENTFD_VALMAX)
		return (EINVAL);

	state = ddi_get_soft_state(eventfd_softstate, minor);

	mutex_enter(&state->efd_lock);

	while (val > EVENTFD_VALMAX - state->efd_value) {
		if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) {
			mutex_exit(&state->efd_lock);
			return (EAGAIN);
		}

		state->efd_bwriters++;
		if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) {
			state->efd_bwriters--;
			mutex_exit(&state->efd_lock);
			return (EINTR);
		}
		state->efd_bwriters--;
	}

	/*
	 * We now know that we can add the value without overflowing.
	 */
	state->efd_value = (oval = state->efd_value) + val;

	/*
	 * If the value was previously "empty", notify blocked readers that
	 * data is available.
	 */
	if (oval == 0) {
		cv_broadcast(&state->efd_cv);
	}
	mutex_exit(&state->efd_lock);

	/*
	 * Notify pollers as well if the eventfd is now readable.
	 */
	if (oval == 0) {
		pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN);
	}

	return (0);
}

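/*
 * chpoll(9E): the descriptor is readable while the count is non-zero and
 * writable while the count remains below EVENTFD_VALMAX.
 */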
/*ARGSUSED*/
static int
eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp)
{
	eventfd_state_t *state;
	minor_t minor = getminor(dev);
	short revents = 0;

	state = ddi_get_soft_state(eventfd_softstate, minor);

	mutex_enter(&state->efd_lock);

	if (state->efd_value > 0)
		revents |= POLLRDNORM | POLLIN;

	if (state->efd_value < EVENTFD_VALMAX)
		revents |= POLLWRNORM | POLLOUT;

	*reventsp = revents & events;
	if ((*reventsp == 0 && !anyyet) || (events & POLLET)) {
		*phpp = &state->efd_pollhd;
	}

	mutex_exit(&state->efd_lock);

	return (0);
}

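/*
 * The only supported ioctl, EVENTFDIOC_SEMAPHORE, toggles semaphore
 * (read-decrements-by-one) semantics on the descriptor.
 */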
/*ARGSUSED*/
static int
eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
	eventfd_state_t *state;
	minor_t minor = getminor(dev);

	state = ddi_get_soft_state(eventfd_softstate, minor);

	switch (cmd) {
	case EVENTFDIOC_SEMAPHORE: {
		mutex_enter(&state->efd_lock);
		state->efd_semaphore ^= 1;
		mutex_exit(&state->efd_lock);

		return (0);
	}

	default:
		break;
	}

	return (ENOTTY);
}

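/*
 * close(9E) flushes any remaining pollers with POLLERR, unlinks this
 * instance from the global state list, and releases its soft state and
 * minor number.
 */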
/*ARGSUSED*/
static int
eventfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
{
	eventfd_state_t *state, **sp;
	minor_t minor = getminor(dev);

	state = ddi_get_soft_state(eventfd_softstate, minor);

	if (state->efd_pollhd.ph_list != NULL) {
		pollwakeup(&state->efd_pollhd, POLLERR);
		pollhead_clean(&state->efd_pollhd);
	}

	mutex_enter(&eventfd_lock);

	/*
	 * Remove our state from our global list.
	 */
	for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next))
		VERIFY(*sp != NULL);

	*sp = (*sp)->efd_next;

	ddi_soft_state_free(eventfd_softstate, minor);
	vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1);

	mutex_exit(&eventfd_lock);

	return (0);
}

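/*
 * Attach initializes the soft state facility, creates the single
 * /dev/eventfd minor node, and builds the identifier arena from which
 * per-open minors are cloned.
 */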
static int
eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_ATTACH:
		break;

	case DDI_RESUME:
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

	mutex_enter(&eventfd_lock);

	if (ddi_soft_state_init(&eventfd_softstate,
	    sizeof (eventfd_state_t), 0) != 0) {
		cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state");
		mutex_exit(&eventfd_lock);
		return (DDI_FAILURE);
	}

	if (ddi_create_minor_node(devi, "eventfd", S_IFCHR,
	    EVENTFDMNRN_EVENTFD, DDI_PSEUDO, NULL) == DDI_FAILURE) {
		cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node");
		ddi_soft_state_fini(&eventfd_softstate);
		mutex_exit(&eventfd_lock);
		return (DDI_FAILURE);
	}

	ddi_report_dev(devi);
	eventfd_devi = devi;

	eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE,
	    UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0,
	    VM_SLEEP | VMC_IDENTIFIER);

	mutex_exit(&eventfd_lock);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
eventfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	switch (cmd) {
	case DDI_DETACH:
		break;

	case DDI_SUSPEND:
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}

	mutex_enter(&eventfd_lock);
	vmem_destroy(eventfd_minor);

	ddi_remove_minor_node(eventfd_devi, NULL);
	eventfd_devi = NULL;

	ddi_soft_state_fini(&eventfd_softstate);
	mutex_exit(&eventfd_lock);

	return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
eventfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)eventfd_devi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

static struct cb_ops eventfd_cb_ops = {
	eventfd_open,		/* open */
	eventfd_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	eventfd_read,		/* read */
	eventfd_write,		/* write */
	eventfd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	eventfd_poll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_NEW | D_MP		/* Driver compatibility flag */
};

static struct dev_ops eventfd_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	eventfd_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	eventfd_attach,		/* attach */
	eventfd_detach,		/* detach */
	nodev,			/* reset */
	&eventfd_cb_ops,	/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed,	/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"eventfd support",	/* name of module */
	&eventfd_ops,		/* driver ops */
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}