| /* |
| * This file and its contents are supplied under the terms of the |
| * Common Development and Distribution License ("CDDL"), version 1.0. |
| * You may only use this file in accordance with the terms of version |
| * 1.0 of the CDDL. |
| * |
| * A full copy of the text of the CDDL should have accompanied this |
| * source. A copy of the CDDL is also available via the Internet at |
| * http://www.illumos.org/license/CDDL. |
| */ |
| |
| /* |
| * Copyright 2017 Joyent, Inc. |
| */ |
| |
| /* |
| * Support for the eventfd facility, a Linux-borne facility for user-generated |
| * file descriptor-based events. |
| */ |
| |
| #include <sys/ddi.h> |
| #include <sys/sunddi.h> |
| #include <sys/eventfd.h> |
| #include <sys/conf.h> |
| #include <sys/vmem.h> |
| #include <sys/sysmacros.h> |
| #include <sys/filio.h> |
| #include <sys/stat.h> |
| #include <sys/file.h> |
| |
| struct eventfd_state; |
| typedef struct eventfd_state eventfd_state_t; |
| |
| struct eventfd_state { |
| kmutex_t efd_lock; /* lock protecting state */ |
| boolean_t efd_semaphore; /* boolean: sema. semantics */ |
| kcondvar_t efd_cv; /* condvar */ |
| pollhead_t efd_pollhd; /* poll head */ |
| uint64_t efd_value; /* value */ |
| size_t efd_bwriters; /* count of blocked writers */ |
| eventfd_state_t *efd_next; /* next state on global list */ |
| }; |
| |
| /* |
| * Internal global variables. |
| */ |
| static kmutex_t eventfd_lock; /* lock protecting state */ |
| static dev_info_t *eventfd_devi; /* device info */ |
| static vmem_t *eventfd_minor; /* minor number arena */ |
| static void *eventfd_softstate; /* softstate pointer */ |
| static eventfd_state_t *eventfd_state; /* global list of state */ |
| |
| /*ARGSUSED*/ |
| static int |
| eventfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) |
| { |
| eventfd_state_t *state; |
| major_t major = getemajor(*devp); |
| minor_t minor = getminor(*devp); |
| |
| if (minor != EVENTFDMNRN_EVENTFD) |
| return (ENXIO); |
| |
| mutex_enter(&eventfd_lock); |
| |
| minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1, |
| VM_BESTFIT | VM_SLEEP); |
| |
| if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) { |
| vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1); |
| mutex_exit(&eventfd_lock); |
| return (NULL); |
| } |
| |
| state = ddi_get_soft_state(eventfd_softstate, minor); |
| *devp = makedevice(major, minor); |
| |
| state->efd_next = eventfd_state; |
| eventfd_state = state; |
| |
| mutex_exit(&eventfd_lock); |
| |
| return (0); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| eventfd_read(dev_t dev, uio_t *uio, cred_t *cr) |
| { |
| eventfd_state_t *state; |
| minor_t minor = getminor(dev); |
| uint64_t val, oval; |
| int err; |
| |
| if (uio->uio_resid < sizeof (val)) |
| return (EINVAL); |
| |
| state = ddi_get_soft_state(eventfd_softstate, minor); |
| |
| mutex_enter(&state->efd_lock); |
| |
| while (state->efd_value == 0) { |
| if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { |
| mutex_exit(&state->efd_lock); |
| return (EAGAIN); |
| } |
| |
| if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) { |
| mutex_exit(&state->efd_lock); |
| return (EINTR); |
| } |
| } |
| |
| /* |
| * We have a non-zero value and we own the lock; our behavior now |
| * depends on whether or not EFD_SEMAPHORE was set when the eventfd |
| * was created. |
| */ |
| val = oval = state->efd_value; |
| |
| if (state->efd_semaphore) { |
| state->efd_value--; |
| val = 1; |
| } else { |
| state->efd_value = 0; |
| } |
| |
| err = uiomove(&val, sizeof (val), UIO_READ, uio); |
| |
| /* |
| * Wake any writers blocked on this eventfd as this read operation may |
| * have created adequate capacity for their values. |
| */ |
| if (state->efd_bwriters != 0) { |
| cv_broadcast(&state->efd_cv); |
| } |
| mutex_exit(&state->efd_lock); |
| |
| /* |
| * It is necessary to emit POLLOUT events only when the eventfd |
| * transitions from EVENTFD_VALMAX to a lower value. At all other |
| * times, it is already considered writable by poll. |
| */ |
| if (oval == EVENTFD_VALMAX) { |
| pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT); |
| } |
| |
| return (err); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) |
| { |
| eventfd_state_t *state; |
| minor_t minor = getminor(dev); |
| uint64_t val, oval; |
| int err; |
| |
| if (uio->uio_resid < sizeof (val)) |
| return (EINVAL); |
| |
| if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) |
| return (err); |
| |
| if (val > EVENTFD_VALMAX) |
| return (EINVAL); |
| |
| state = ddi_get_soft_state(eventfd_softstate, minor); |
| |
| mutex_enter(&state->efd_lock); |
| |
| while (val > EVENTFD_VALMAX - state->efd_value) { |
| if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { |
| mutex_exit(&state->efd_lock); |
| return (EAGAIN); |
| } |
| |
| state->efd_bwriters++; |
| if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) { |
| state->efd_bwriters--; |
| mutex_exit(&state->efd_lock); |
| return (EINTR); |
| } |
| state->efd_bwriters--; |
| } |
| |
| /* |
| * We now know that we can add the value without overflowing. |
| */ |
| state->efd_value = (oval = state->efd_value) + val; |
| |
| /* |
| * If the value was previously "empty", notify blocked readers that |
| * data is available. |
| */ |
| if (oval == 0) { |
| cv_broadcast(&state->efd_cv); |
| } |
| mutex_exit(&state->efd_lock); |
| |
| /* |
| * Notify pollers as well if the eventfd is now readable. |
| */ |
| if (oval == 0) { |
| pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN); |
| } |
| |
| return (0); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, |
| struct pollhead **phpp) |
| { |
| eventfd_state_t *state; |
| minor_t minor = getminor(dev); |
| short revents = 0; |
| |
| state = ddi_get_soft_state(eventfd_softstate, minor); |
| |
| mutex_enter(&state->efd_lock); |
| |
| if (state->efd_value > 0) |
| revents |= POLLRDNORM | POLLIN; |
| |
| if (state->efd_value < EVENTFD_VALMAX) |
| revents |= POLLWRNORM | POLLOUT; |
| |
| *reventsp = revents & events; |
| if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { |
| *phpp = &state->efd_pollhd; |
| } |
| |
| mutex_exit(&state->efd_lock); |
| |
| return (0); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) |
| { |
| eventfd_state_t *state; |
| minor_t minor = getminor(dev); |
| |
| state = ddi_get_soft_state(eventfd_softstate, minor); |
| |
| switch (cmd) { |
| case EVENTFDIOC_SEMAPHORE: { |
| mutex_enter(&state->efd_lock); |
| state->efd_semaphore ^= 1; |
| mutex_exit(&state->efd_lock); |
| |
| return (0); |
| } |
| |
| default: |
| break; |
| } |
| |
| return (ENOTTY); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| eventfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p) |
| { |
| eventfd_state_t *state, **sp; |
| minor_t minor = getminor(dev); |
| |
| state = ddi_get_soft_state(eventfd_softstate, minor); |
| |
| if (state->efd_pollhd.ph_list != NULL) { |
| pollwakeup(&state->efd_pollhd, POLLERR); |
| pollhead_clean(&state->efd_pollhd); |
| } |
| |
| mutex_enter(&eventfd_lock); |
| |
| /* |
| * Remove our state from our global list. |
| */ |
| for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next)) |
| VERIFY(*sp != NULL); |
| |
| *sp = (*sp)->efd_next; |
| |
| ddi_soft_state_free(eventfd_softstate, minor); |
| vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1); |
| |
| mutex_exit(&eventfd_lock); |
| |
| return (0); |
| } |
| |
| static int |
| eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) |
| { |
| switch (cmd) { |
| case DDI_ATTACH: |
| break; |
| |
| case DDI_RESUME: |
| return (DDI_SUCCESS); |
| |
| default: |
| return (DDI_FAILURE); |
| } |
| |
| mutex_enter(&eventfd_lock); |
| |
| if (ddi_soft_state_init(&eventfd_softstate, |
| sizeof (eventfd_state_t), 0) != 0) { |
| cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state"); |
| mutex_exit(&eventfd_lock); |
| return (DDI_FAILURE); |
| } |
| |
| if (ddi_create_minor_node(devi, "eventfd", S_IFCHR, |
| EVENTFDMNRN_EVENTFD, DDI_PSEUDO, NULL) == DDI_FAILURE) { |
| cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node"); |
| ddi_soft_state_fini(&eventfd_softstate); |
| mutex_exit(&eventfd_lock); |
| return (DDI_FAILURE); |
| } |
| |
| ddi_report_dev(devi); |
| eventfd_devi = devi; |
| |
| eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE, |
| UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0, |
| VM_SLEEP | VMC_IDENTIFIER); |
| |
| mutex_exit(&eventfd_lock); |
| |
| return (DDI_SUCCESS); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| eventfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) |
| { |
| switch (cmd) { |
| case DDI_DETACH: |
| break; |
| |
| case DDI_SUSPEND: |
| return (DDI_SUCCESS); |
| |
| default: |
| return (DDI_FAILURE); |
| } |
| |
| mutex_enter(&eventfd_lock); |
| vmem_destroy(eventfd_minor); |
| |
| ddi_remove_minor_node(eventfd_devi, NULL); |
| eventfd_devi = NULL; |
| |
| ddi_soft_state_fini(&eventfd_softstate); |
| mutex_exit(&eventfd_lock); |
| |
| return (DDI_SUCCESS); |
| } |
| |
| /*ARGSUSED*/ |
| static int |
| eventfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) |
| { |
| int error; |
| |
| switch (infocmd) { |
| case DDI_INFO_DEVT2DEVINFO: |
| *result = (void *)eventfd_devi; |
| error = DDI_SUCCESS; |
| break; |
| case DDI_INFO_DEVT2INSTANCE: |
| *result = (void *)0; |
| error = DDI_SUCCESS; |
| break; |
| default: |
| error = DDI_FAILURE; |
| } |
| return (error); |
| } |
| |
| static struct cb_ops eventfd_cb_ops = { |
| eventfd_open, /* open */ |
| eventfd_close, /* close */ |
| nulldev, /* strategy */ |
| nulldev, /* print */ |
| nodev, /* dump */ |
| eventfd_read, /* read */ |
| eventfd_write, /* write */ |
| eventfd_ioctl, /* ioctl */ |
| nodev, /* devmap */ |
| nodev, /* mmap */ |
| nodev, /* segmap */ |
| eventfd_poll, /* poll */ |
| ddi_prop_op, /* cb_prop_op */ |
| 0, /* streamtab */ |
| D_NEW | D_MP /* Driver compatibility flag */ |
| }; |
| |
| static struct dev_ops eventfd_ops = { |
| DEVO_REV, /* devo_rev */ |
| 0, /* refcnt */ |
| eventfd_info, /* get_dev_info */ |
| nulldev, /* identify */ |
| nulldev, /* probe */ |
| eventfd_attach, /* attach */ |
| eventfd_detach, /* detach */ |
| nodev, /* reset */ |
| &eventfd_cb_ops, /* driver operations */ |
| NULL, /* bus operations */ |
| nodev, /* dev power */ |
| ddi_quiesce_not_needed, /* quiesce */ |
| }; |
| |
| static struct modldrv modldrv = { |
| &mod_driverops, /* module type (this is a pseudo driver) */ |
| "eventfd support", /* name of module */ |
| &eventfd_ops, /* driver ops */ |
| }; |
| |
| static struct modlinkage modlinkage = { |
| MODREV_1, |
| (void *)&modldrv, |
| NULL |
| }; |
| |
| int |
| _init(void) |
| { |
| return (mod_install(&modlinkage)); |
| } |
| |
| int |
| _info(struct modinfo *modinfop) |
| { |
| return (mod_info(&modlinkage, modinfop)); |
| } |
| |
| int |
| _fini(void) |
| { |
| return (mod_remove(&modlinkage)); |
| } |