Bryan Cantrill | 1767006 | 2015-08-28 17:45:00 -0700 | [diff] [blame] | 1 | /* |
| 2 | * This file and its contents are supplied under the terms of the |
| 3 | * Common Development and Distribution License ("CDDL"), version 1.0. |
| 4 | * You may only use this file in accordance with the terms of version |
| 5 | * 1.0 of the CDDL. |
| 6 | * |
| 7 | * A full copy of the text of the CDDL should have accompanied this |
| 8 | * source. A copy of the CDDL is also available via the Internet at |
| 9 | * http://www.illumos.org/license/CDDL. |
| 10 | */ |
| 11 | |
| 12 | /* |
Patrick Mooney | 80d5689 | 2017-09-22 23:43:19 +0000 | [diff] [blame] | 13 | * Copyright 2017 Joyent, Inc. |
Bryan Cantrill | 1767006 | 2015-08-28 17:45:00 -0700 | [diff] [blame] | 14 | */ |
| 15 | |
/*
 * Support for the eventfd facility, a Linux-borne facility for user-generated
 * file descriptor-based events.
 */
| 20 | |
| 21 | #include <sys/ddi.h> |
| 22 | #include <sys/sunddi.h> |
| 23 | #include <sys/eventfd.h> |
| 24 | #include <sys/conf.h> |
| 25 | #include <sys/vmem.h> |
| 26 | #include <sys/sysmacros.h> |
| 27 | #include <sys/filio.h> |
| 28 | #include <sys/stat.h> |
| 29 | #include <sys/file.h> |
| 30 | |
struct eventfd_state;
typedef struct eventfd_state eventfd_state_t;

/*
 * State for a single open eventfd instance.  One of these is allocated
 * (zero-filled, via the soft state facility) for each minor number handed
 * out by eventfd_open(), and it is linked onto the global eventfd_state list
 * until eventfd_close() removes and frees it.
 */
struct eventfd_state {
	kmutex_t efd_lock;			/* lock protecting state */
	boolean_t efd_semaphore;		/* boolean: sema. semantics */
	kcondvar_t efd_cv;			/* condvar shared by blocked */
						/* readers and writers */
	pollhead_t efd_pollhd;			/* poll head */
	uint64_t efd_value;			/* current counter value */
	size_t efd_bwriters;			/* count of blocked writers */
	eventfd_state_t *efd_next;		/* next state on global list */
};
| 43 | |
/*
 * Internal global variables.  eventfd_lock is held by open, close, attach
 * and detach while they manipulate the minor number arena, the soft state
 * and the global list of per-instance state.
 */
static kmutex_t		eventfd_lock;		/* lock protecting state */
static dev_info_t	*eventfd_devi;		/* device info */
static vmem_t		*eventfd_minor;		/* minor number arena */
static void		*eventfd_softstate;	/* softstate pointer */
static eventfd_state_t	*eventfd_state;		/* global list of state */
| 52 | |
| 53 | /*ARGSUSED*/ |
| 54 | static int |
| 55 | eventfd_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) |
| 56 | { |
| 57 | eventfd_state_t *state; |
| 58 | major_t major = getemajor(*devp); |
| 59 | minor_t minor = getminor(*devp); |
| 60 | |
| 61 | if (minor != EVENTFDMNRN_EVENTFD) |
| 62 | return (ENXIO); |
| 63 | |
| 64 | mutex_enter(&eventfd_lock); |
| 65 | |
| 66 | minor = (minor_t)(uintptr_t)vmem_alloc(eventfd_minor, 1, |
| 67 | VM_BESTFIT | VM_SLEEP); |
| 68 | |
| 69 | if (ddi_soft_state_zalloc(eventfd_softstate, minor) != DDI_SUCCESS) { |
| 70 | vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1); |
| 71 | mutex_exit(&eventfd_lock); |
| 72 | return (NULL); |
| 73 | } |
| 74 | |
| 75 | state = ddi_get_soft_state(eventfd_softstate, minor); |
| 76 | *devp = makedevice(major, minor); |
| 77 | |
| 78 | state->efd_next = eventfd_state; |
| 79 | eventfd_state = state; |
| 80 | |
| 81 | mutex_exit(&eventfd_lock); |
| 82 | |
| 83 | return (0); |
| 84 | } |
| 85 | |
| 86 | /*ARGSUSED*/ |
| 87 | static int |
| 88 | eventfd_read(dev_t dev, uio_t *uio, cred_t *cr) |
| 89 | { |
| 90 | eventfd_state_t *state; |
| 91 | minor_t minor = getminor(dev); |
| 92 | uint64_t val, oval; |
| 93 | int err; |
| 94 | |
| 95 | if (uio->uio_resid < sizeof (val)) |
| 96 | return (EINVAL); |
| 97 | |
| 98 | state = ddi_get_soft_state(eventfd_softstate, minor); |
| 99 | |
| 100 | mutex_enter(&state->efd_lock); |
| 101 | |
| 102 | while (state->efd_value == 0) { |
| 103 | if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { |
| 104 | mutex_exit(&state->efd_lock); |
| 105 | return (EAGAIN); |
| 106 | } |
| 107 | |
| 108 | if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) { |
| 109 | mutex_exit(&state->efd_lock); |
| 110 | return (EINTR); |
| 111 | } |
| 112 | } |
| 113 | |
| 114 | /* |
| 115 | * We have a non-zero value and we own the lock; our behavior now |
| 116 | * depends on whether or not EFD_SEMAPHORE was set when the eventfd |
| 117 | * was created. |
| 118 | */ |
| 119 | val = oval = state->efd_value; |
| 120 | |
| 121 | if (state->efd_semaphore) { |
| 122 | state->efd_value--; |
| 123 | val = 1; |
| 124 | } else { |
| 125 | state->efd_value = 0; |
| 126 | } |
| 127 | |
| 128 | err = uiomove(&val, sizeof (val), UIO_READ, uio); |
| 129 | |
Patrick Mooney | 860884e | 2016-07-20 23:51:54 +0000 | [diff] [blame] | 130 | /* |
| 131 | * Wake any writers blocked on this eventfd as this read operation may |
| 132 | * have created adequate capacity for their values. |
| 133 | */ |
| 134 | if (state->efd_bwriters != 0) { |
| 135 | cv_broadcast(&state->efd_cv); |
| 136 | } |
Bryan Cantrill | 1767006 | 2015-08-28 17:45:00 -0700 | [diff] [blame] | 137 | mutex_exit(&state->efd_lock); |
| 138 | |
Patrick Mooney | 860884e | 2016-07-20 23:51:54 +0000 | [diff] [blame] | 139 | /* |
| 140 | * It is necessary to emit POLLOUT events only when the eventfd |
| 141 | * transitions from EVENTFD_VALMAX to a lower value. At all other |
| 142 | * times, it is already considered writable by poll. |
| 143 | */ |
Bryan Cantrill | 1767006 | 2015-08-28 17:45:00 -0700 | [diff] [blame] | 144 | if (oval == EVENTFD_VALMAX) { |
Bryan Cantrill | 1767006 | 2015-08-28 17:45:00 -0700 | [diff] [blame] | 145 | pollwakeup(&state->efd_pollhd, POLLWRNORM | POLLOUT); |
| 146 | } |
| 147 | |
| 148 | return (err); |
| 149 | } |
| 150 | |
| 151 | /*ARGSUSED*/ |
| 152 | static int |
| 153 | eventfd_write(dev_t dev, struct uio *uio, cred_t *credp) |
| 154 | { |
| 155 | eventfd_state_t *state; |
| 156 | minor_t minor = getminor(dev); |
| 157 | uint64_t val, oval; |
| 158 | int err; |
| 159 | |
| 160 | if (uio->uio_resid < sizeof (val)) |
| 161 | return (EINVAL); |
| 162 | |
| 163 | if ((err = uiomove(&val, sizeof (val), UIO_WRITE, uio)) != 0) |
| 164 | return (err); |
| 165 | |
| 166 | if (val > EVENTFD_VALMAX) |
| 167 | return (EINVAL); |
| 168 | |
| 169 | state = ddi_get_soft_state(eventfd_softstate, minor); |
| 170 | |
| 171 | mutex_enter(&state->efd_lock); |
| 172 | |
| 173 | while (val > EVENTFD_VALMAX - state->efd_value) { |
| 174 | if (uio->uio_fmode & (FNDELAY|FNONBLOCK)) { |
| 175 | mutex_exit(&state->efd_lock); |
| 176 | return (EAGAIN); |
| 177 | } |
| 178 | |
Patrick Mooney | 860884e | 2016-07-20 23:51:54 +0000 | [diff] [blame] | 179 | state->efd_bwriters++; |
Bryan Cantrill | 1767006 | 2015-08-28 17:45:00 -0700 | [diff] [blame] | 180 | if (!cv_wait_sig_swap(&state->efd_cv, &state->efd_lock)) { |
Patrick Mooney | 860884e | 2016-07-20 23:51:54 +0000 | [diff] [blame] | 181 | state->efd_bwriters--; |
Bryan Cantrill | 1767006 | 2015-08-28 17:45:00 -0700 | [diff] [blame] | 182 | mutex_exit(&state->efd_lock); |
| 183 | return (EINTR); |
| 184 | } |
Patrick Mooney | 860884e | 2016-07-20 23:51:54 +0000 | [diff] [blame] | 185 | state->efd_bwriters--; |
Bryan Cantrill | 1767006 | 2015-08-28 17:45:00 -0700 | [diff] [blame] | 186 | } |
| 187 | |
| 188 | /* |
| 189 | * We now know that we can add the value without overflowing. |
| 190 | */ |
| 191 | state->efd_value = (oval = state->efd_value) + val; |
| 192 | |
Patrick Mooney | 860884e | 2016-07-20 23:51:54 +0000 | [diff] [blame] | 193 | /* |
| 194 | * If the value was previously "empty", notify blocked readers that |
| 195 | * data is available. |
| 196 | */ |
Bryan Cantrill | 1767006 | 2015-08-28 17:45:00 -0700 | [diff] [blame] | 197 | if (oval == 0) { |
| 198 | cv_broadcast(&state->efd_cv); |
Patrick Mooney | 860884e | 2016-07-20 23:51:54 +0000 | [diff] [blame] | 199 | } |
| 200 | mutex_exit(&state->efd_lock); |
| 201 | |
| 202 | /* |
| 203 | * Notify pollers as well if the eventfd is now readable. |
| 204 | */ |
| 205 | if (oval == 0) { |
Bryan Cantrill | 1767006 | 2015-08-28 17:45:00 -0700 | [diff] [blame] | 206 | pollwakeup(&state->efd_pollhd, POLLRDNORM | POLLIN); |
| 207 | } |
| 208 | |
| 209 | return (0); |
| 210 | } |
| 211 | |
| 212 | /*ARGSUSED*/ |
| 213 | static int |
| 214 | eventfd_poll(dev_t dev, short events, int anyyet, short *reventsp, |
| 215 | struct pollhead **phpp) |
| 216 | { |
| 217 | eventfd_state_t *state; |
| 218 | minor_t minor = getminor(dev); |
| 219 | short revents = 0; |
| 220 | |
| 221 | state = ddi_get_soft_state(eventfd_softstate, minor); |
| 222 | |
| 223 | mutex_enter(&state->efd_lock); |
| 224 | |
| 225 | if (state->efd_value > 0) |
| 226 | revents |= POLLRDNORM | POLLIN; |
| 227 | |
| 228 | if (state->efd_value < EVENTFD_VALMAX) |
| 229 | revents |= POLLWRNORM | POLLOUT; |
| 230 | |
Patrick Mooney | 80d5689 | 2017-09-22 23:43:19 +0000 | [diff] [blame] | 231 | *reventsp = revents & events; |
| 232 | if ((*reventsp == 0 && !anyyet) || (events & POLLET)) { |
Bryan Cantrill | 1767006 | 2015-08-28 17:45:00 -0700 | [diff] [blame] | 233 | *phpp = &state->efd_pollhd; |
Patrick Mooney | 80d5689 | 2017-09-22 23:43:19 +0000 | [diff] [blame] | 234 | } |
Bryan Cantrill | 1767006 | 2015-08-28 17:45:00 -0700 | [diff] [blame] | 235 | |
| 236 | mutex_exit(&state->efd_lock); |
| 237 | |
| 238 | return (0); |
| 239 | } |
| 240 | |
| 241 | /*ARGSUSED*/ |
| 242 | static int |
| 243 | eventfd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) |
| 244 | { |
| 245 | eventfd_state_t *state; |
| 246 | minor_t minor = getminor(dev); |
| 247 | |
| 248 | state = ddi_get_soft_state(eventfd_softstate, minor); |
| 249 | |
| 250 | switch (cmd) { |
| 251 | case EVENTFDIOC_SEMAPHORE: { |
| 252 | mutex_enter(&state->efd_lock); |
| 253 | state->efd_semaphore ^= 1; |
| 254 | mutex_exit(&state->efd_lock); |
| 255 | |
| 256 | return (0); |
| 257 | } |
| 258 | |
| 259 | default: |
| 260 | break; |
| 261 | } |
| 262 | |
| 263 | return (ENOTTY); |
| 264 | } |
| 265 | |
| 266 | /*ARGSUSED*/ |
| 267 | static int |
| 268 | eventfd_close(dev_t dev, int flag, int otyp, cred_t *cred_p) |
| 269 | { |
| 270 | eventfd_state_t *state, **sp; |
| 271 | minor_t minor = getminor(dev); |
| 272 | |
| 273 | state = ddi_get_soft_state(eventfd_softstate, minor); |
| 274 | |
| 275 | if (state->efd_pollhd.ph_list != NULL) { |
| 276 | pollwakeup(&state->efd_pollhd, POLLERR); |
| 277 | pollhead_clean(&state->efd_pollhd); |
| 278 | } |
| 279 | |
| 280 | mutex_enter(&eventfd_lock); |
| 281 | |
| 282 | /* |
| 283 | * Remove our state from our global list. |
| 284 | */ |
| 285 | for (sp = &eventfd_state; *sp != state; sp = &((*sp)->efd_next)) |
| 286 | VERIFY(*sp != NULL); |
| 287 | |
| 288 | *sp = (*sp)->efd_next; |
| 289 | |
| 290 | ddi_soft_state_free(eventfd_softstate, minor); |
| 291 | vmem_free(eventfd_minor, (void *)(uintptr_t)minor, 1); |
| 292 | |
| 293 | mutex_exit(&eventfd_lock); |
| 294 | |
| 295 | return (0); |
| 296 | } |
| 297 | |
| 298 | static int |
| 299 | eventfd_attach(dev_info_t *devi, ddi_attach_cmd_t cmd) |
| 300 | { |
| 301 | switch (cmd) { |
| 302 | case DDI_ATTACH: |
| 303 | break; |
| 304 | |
| 305 | case DDI_RESUME: |
| 306 | return (DDI_SUCCESS); |
| 307 | |
| 308 | default: |
| 309 | return (DDI_FAILURE); |
| 310 | } |
| 311 | |
| 312 | mutex_enter(&eventfd_lock); |
| 313 | |
| 314 | if (ddi_soft_state_init(&eventfd_softstate, |
| 315 | sizeof (eventfd_state_t), 0) != 0) { |
| 316 | cmn_err(CE_NOTE, "/dev/eventfd failed to create soft state"); |
| 317 | mutex_exit(&eventfd_lock); |
| 318 | return (DDI_FAILURE); |
| 319 | } |
| 320 | |
| 321 | if (ddi_create_minor_node(devi, "eventfd", S_IFCHR, |
| 322 | EVENTFDMNRN_EVENTFD, DDI_PSEUDO, NULL) == DDI_FAILURE) { |
| 323 | cmn_err(CE_NOTE, "/dev/eventfd couldn't create minor node"); |
| 324 | ddi_soft_state_fini(&eventfd_softstate); |
| 325 | mutex_exit(&eventfd_lock); |
| 326 | return (DDI_FAILURE); |
| 327 | } |
| 328 | |
| 329 | ddi_report_dev(devi); |
| 330 | eventfd_devi = devi; |
| 331 | |
| 332 | eventfd_minor = vmem_create("eventfd_minor", (void *)EVENTFDMNRN_CLONE, |
| 333 | UINT32_MAX - EVENTFDMNRN_CLONE, 1, NULL, NULL, NULL, 0, |
| 334 | VM_SLEEP | VMC_IDENTIFIER); |
| 335 | |
| 336 | mutex_exit(&eventfd_lock); |
| 337 | |
| 338 | return (DDI_SUCCESS); |
| 339 | } |
| 340 | |
| 341 | /*ARGSUSED*/ |
| 342 | static int |
| 343 | eventfd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) |
| 344 | { |
| 345 | switch (cmd) { |
| 346 | case DDI_DETACH: |
| 347 | break; |
| 348 | |
| 349 | case DDI_SUSPEND: |
| 350 | return (DDI_SUCCESS); |
| 351 | |
| 352 | default: |
| 353 | return (DDI_FAILURE); |
| 354 | } |
| 355 | |
| 356 | mutex_enter(&eventfd_lock); |
| 357 | vmem_destroy(eventfd_minor); |
| 358 | |
| 359 | ddi_remove_minor_node(eventfd_devi, NULL); |
| 360 | eventfd_devi = NULL; |
| 361 | |
| 362 | ddi_soft_state_fini(&eventfd_softstate); |
| 363 | mutex_exit(&eventfd_lock); |
| 364 | |
| 365 | return (DDI_SUCCESS); |
| 366 | } |
| 367 | |
| 368 | /*ARGSUSED*/ |
| 369 | static int |
| 370 | eventfd_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) |
| 371 | { |
| 372 | int error; |
| 373 | |
| 374 | switch (infocmd) { |
| 375 | case DDI_INFO_DEVT2DEVINFO: |
| 376 | *result = (void *)eventfd_devi; |
| 377 | error = DDI_SUCCESS; |
| 378 | break; |
| 379 | case DDI_INFO_DEVT2INSTANCE: |
| 380 | *result = (void *)0; |
| 381 | error = DDI_SUCCESS; |
| 382 | break; |
| 383 | default: |
| 384 | error = DDI_FAILURE; |
| 385 | } |
| 386 | return (error); |
| 387 | } |
| 388 | |
/*
 * Character/block entry point table.  This driver supplies its own open,
 * close, read, write, ioctl and poll routines; the remaining slots are
 * filled with the nulldev/nodev placeholders and ddi_prop_op.  The
 * initializer is positional, so the order of entries must not change.
 */
static struct cb_ops eventfd_cb_ops = {
	eventfd_open,		/* open */
	eventfd_close,		/* close */
	nulldev,		/* strategy */
	nulldev,		/* print */
	nodev,			/* dump */
	eventfd_read,		/* read */
	eventfd_write,		/* write */
	eventfd_ioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	eventfd_poll,		/* poll */
	ddi_prop_op,		/* cb_prop_op */
	0,			/* streamtab */
	D_NEW | D_MP		/* Driver compatibility flag */
};
| 406 | |
/*
 * Device operations table tying together the getinfo, attach and detach
 * routines and the cb_ops table above.  The initializer is positional, so
 * the order of entries must not change.
 */
static struct dev_ops eventfd_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	eventfd_info,		/* get_dev_info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	eventfd_attach,		/* attach */
	eventfd_detach,		/* detach */
	nodev,			/* reset */
	&eventfd_cb_ops,	/* driver operations */
	NULL,			/* bus operations */
	nodev,			/* dev power */
	ddi_quiesce_not_needed,	/* quiesce */
};
| 421 | |
/*
 * Loadable driver description: names the module and points at its dev_ops.
 */
static struct modldrv modldrv = {
	&mod_driverops,		/* module type (this is a pseudo driver) */
	"eventfd support",	/* name of module */
	&eventfd_ops,		/* driver ops */
};
| 427 | |
/*
 * Module linkage passed to mod_install(), mod_info() and mod_remove(); it
 * lists the single modldrv above, terminated by NULL.
 */
static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};
| 433 | |
/*
 * Module initialization entry point: hands the module linkage to
 * mod_install() and returns its result.
 */
int
_init(void)
{
	return (mod_install(&modlinkage));
}
| 439 | |
/*
 * Module information entry point: fills in *modinfop via mod_info() and
 * returns its result.
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
| 445 | |
/*
 * Module teardown entry point: hands the module linkage to mod_remove() and
 * returns its result.
 */
int
_fini(void)
{
	return (mod_remove(&modlinkage));
}