/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright 2017 Joyent, Inc.
 */

#include <sys/types.h>
#include <sys/devops.h>
#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/stat.h>
#include <sys/poll_impl.h>
#include <sys/errno.h>
#include <sys/kmem.h>
#include <sys/mkdev.h>
#include <sys/debug.h>
#include <sys/file.h>
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/bitmap.h>
#include <sys/devpoll.h>
#include <sys/rctl.h>
#include <sys/resource.h>
#include <sys/schedctl.h>
#include <sys/epoll.h>

#define	RESERVED	1

/* local data struct */
static	dp_entry_t	**devpolltbl;	/* dev poll entries */
static	size_t		dptblsize;

static	kmutex_t	devpoll_lock;	/* lock protecting dev tbl */
int			devpoll_init;	/* is /dev/poll initialized already */

/* device local functions */

static int dpopen(dev_t *devp, int flag, int otyp, cred_t *credp);
static int dpwrite(dev_t dev, struct uio *uiop, cred_t *credp);
static int dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp);
static int dppoll(dev_t dev, short events, int anyyet, short *reventsp,
    struct pollhead **phpp);
static int dpclose(dev_t dev, int flag, int otyp, cred_t *credp);
static dev_info_t *dpdevi;


static struct cb_ops dp_cb_ops = {
	dpopen,			/* open */
	dpclose,		/* close */
	nodev,			/* strategy */
	nodev,			/* print */
	nodev,			/* dump */
	nodev,			/* read */
	dpwrite,		/* write */
	dpioctl,		/* ioctl */
	nodev,			/* devmap */
	nodev,			/* mmap */
	nodev,			/* segmap */
	dppoll,			/* poll */
	ddi_prop_op,		/* prop_op */
	(struct streamtab *)0,	/* streamtab */
	D_MP,			/* flags */
	CB_REV,			/* cb_ops revision */
	nodev,			/* aread */
	nodev			/* awrite */
};

static int dpattach(dev_info_t *, ddi_attach_cmd_t);
static int dpdetach(dev_info_t *, ddi_detach_cmd_t);
static int dpinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);

static struct dev_ops dp_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* refcnt */
	dpinfo,			/* info */
	nulldev,		/* identify */
	nulldev,		/* probe */
	dpattach,		/* attach */
	dpdetach,		/* detach */
	nodev,			/* reset */
	&dp_cb_ops,		/* driver operations */
	(struct bus_ops *)NULL,	/* bus operations */
	nulldev,		/* power */
	ddi_quiesce_not_needed,	/* quiesce */
};


static struct modldrv modldrv = {
	&mod_driverops,		/* type of module - a driver */
	"/dev/poll driver",
	&dp_ops,
};

static struct modlinkage modlinkage = {
	MODREV_1,
	(void *)&modldrv,
	NULL
};

static void pcachelink_assoc(pollcache_t *, pollcache_t *);
static void pcachelink_mark_stale(pollcache_t *);
static void pcachelink_purge_stale(pollcache_t *);
static void pcachelink_purge_all(pollcache_t *);

/*
 * Locking Design
 *
 * The /dev/poll driver shares most of its code with the poll system call,
 * whose code is in common/syscall/poll.c. In the poll(2) design, the
 * pollcache structure is per lwp. An implicit assumption is made there
 * that some portion of the pollcache will never be touched by other lwps.
 * E.g., in the poll(2) design, no lwp will ever need to grow the bitmap
 * of another lwp. This assumption is not true for /dev/poll; hence the
 * need for extra locking.
 *
 * To allow more parallelism, each /dev/poll file descriptor (indexed by
 * minor number) has its own lock. Since read (dpioctl) is a much more
 * frequent operation than write, we want to allow multiple reads on the
 * same /dev/poll fd. However, we prevent writes from being starved by
 * giving priority to write operations. Theoretically writes can starve
 * reads as well, but in a practical sense this is not important because
 * (1) writes happen less often than reads, and (2) a write operation
 * defines the content of the poll fd cache set. If writes happen so
 * often that they can starve reads, the cached set is very unstable and
 * it may not make sense to read it anyway. Therefore, the
 * writers-starving-readers case is not handled in this design.
 */
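
/*
 * For orientation, a minimal userland usage sketch (illustrative only and
 * not compiled here; "sockfd" stands in for any open descriptor and error
 * handling is omitted). Descriptors are cached by writing pollfd structs
 * to the driver, and ready descriptors are harvested with the DP_POLL
 * ioctl; see poll(7d):
 *
 *	int dpfd = open("/dev/poll", O_RDWR);
 *	struct pollfd pfd;
 *
 *	pfd.fd = sockfd;
 *	pfd.events = POLLIN;
 *	(void) write(dpfd, &pfd, sizeof (pfd));
 *
 *	struct pollfd rfds[8];
 *	struct dvpoll dvp;
 *
 *	dvp.dp_fds = rfds;
 *	dvp.dp_nfds = 8;
 *	dvp.dp_timeout = 1000;
 *	int nready = ioctl(dpfd, DP_POLL, &dvp);
 */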

int
_init()
{
	int	error;

	dptblsize = DEVPOLLSIZE;
	devpolltbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
	mutex_init(&devpoll_lock, NULL, MUTEX_DEFAULT, NULL);
	devpoll_init = 1;
	if ((error = mod_install(&modlinkage)) != 0) {
		kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
		devpoll_init = 0;
	}
	return (error);
}

int
_fini()
{
	int error;

	if ((error = mod_remove(&modlinkage)) != 0) {
		return (error);
	}
	mutex_destroy(&devpoll_lock);
	kmem_free(devpolltbl, sizeof (caddr_t) * dptblsize);
	return (0);
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

/*ARGSUSED*/
static int
dpattach(dev_info_t *devi, ddi_attach_cmd_t cmd)
{
	if (ddi_create_minor_node(devi, "poll", S_IFCHR, 0, DDI_PSEUDO, NULL)
	    == DDI_FAILURE) {
		ddi_remove_minor_node(devi, NULL);
		return (DDI_FAILURE);
	}
	dpdevi = devi;
	return (DDI_SUCCESS);
}

static int
dpdetach(dev_info_t *devi, ddi_detach_cmd_t cmd)
{
	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	ddi_remove_minor_node(devi, NULL);
	return (DDI_SUCCESS);
}

/* ARGSUSED */
static int
dpinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
	int error;

	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*result = (void *)dpdevi;
		error = DDI_SUCCESS;
		break;
	case DDI_INFO_DEVT2INSTANCE:
		*result = (void *)0;
		error = DDI_SUCCESS;
		break;
	default:
		error = DDI_FAILURE;
	}
	return (error);
}

/*
 * dp_pcache_poll has similar logic to pcache_poll() in poll.c. The major
 * differences are: (1) /dev/poll requires scanning the bitmap starting at
 * where it was stopped last time, instead of always starting from 0;
 * (2) since the user may not have cleaned up the cached fds when they
 * were closed, some polldats in the cache may refer to closed or reused
 * fds. We need to check for those cases.
 *
 * NOTE: Upon closing an fd, automatic poll cache cleanup is done for
 *	 poll(2) caches but NOT for /dev/poll caches. So expect some
 *	 stale entries!
 */
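
/*
 * Consumers evict a stale entry by writing a pollfd whose events field is
 * POLLREMOVE. An illustrative sketch (not compiled here; "dpfd" and
 * "closedfd" are hypothetical descriptors):
 *
 *	struct pollfd pfd;
 *
 *	pfd.fd = closedfd;
 *	pfd.events = POLLREMOVE;
 *	(void) write(dpfd, &pfd, sizeof (pfd));
 *
 * Because nothing forces a consumer to do this, the scan below must
 * tolerate entries that refer to closed or reused fds.
 */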
static int
dp_pcache_poll(dp_entry_t *dpep, void *dpbuf,
    pollcache_t *pcp, nfds_t nfds, int *fdcntp)
{
	int		start, ostart, end;
	int		fdcnt, fd;
	boolean_t	done;
	file_t		*fp;
	short		revent;
	boolean_t	no_wrap;
	pollhead_t	*php;
	polldat_t	*pdp;
	pollfd_t	*pfdp;
	epoll_event_t	*epoll;
	int		error = 0;
	short		mask = POLLRDHUP | POLLWRBAND;
	boolean_t	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;

	ASSERT(MUTEX_HELD(&pcp->pc_lock));
	if (pcp->pc_bitmap == NULL) {
		/*
		 * No need to search because no poll fd
		 * has been cached.
		 */
		return (error);
	}

	if (is_epoll) {
		pfdp = NULL;
		epoll = (epoll_event_t *)dpbuf;
	} else {
		pfdp = (pollfd_t *)dpbuf;
		epoll = NULL;
	}
retry:
	start = ostart = pcp->pc_mapstart;
	end = pcp->pc_mapend;
	php = NULL;

	if (start == 0) {
		/*
		 * Started from the very beginning; no need to wrap around.
		 */
		no_wrap = B_TRUE;
	} else {
		no_wrap = B_FALSE;
	}
	done = B_FALSE;
	fdcnt = 0;
	while ((fdcnt < nfds) && !done) {
		php = NULL;
		revent = 0;
		/*
		 * Examine the bit map in a circular fashion
		 * to avoid starvation. Always resume from
		 * last stop. Scan till end of the map. Then
		 * wrap around.
		 */
		fd = bt_getlowbit(pcp->pc_bitmap, start, end);
		ASSERT(fd <= end);
		if (fd >= 0) {
			if (fd == end) {
				if (no_wrap) {
					done = B_TRUE;
				} else {
					start = 0;
					end = ostart - 1;
					no_wrap = B_TRUE;
				}
			} else {
				start = fd + 1;
			}
			pdp = pcache_lookup_fd(pcp, fd);
repoll:
			ASSERT(pdp != NULL);
			ASSERT(pdp->pd_fd == fd);
			if (pdp->pd_fp == NULL) {
				/*
				 * The fd is POLLREMOVEd. This fd is
				 * logically no longer cached. So move
				 * on to the next one.
				 */
				continue;
			}
			if ((fp = getf(fd)) == NULL) {
				/*
				 * The fd has been closed, but the user has
				 * not done a POLLREMOVE on this fd yet.
				 * Instead of cleaning it up here implicitly,
				 * we return POLLNVAL. This is consistent
				 * with poll(2) polling a closed fd. Hope
				 * this will remind the user to do a
				 * POLLREMOVE.
				 */
				if (!is_epoll && pfdp != NULL) {
					pfdp[fdcnt].fd = fd;
					pfdp[fdcnt].revents = POLLNVAL;
					fdcnt++;
					continue;
				}

				/*
				 * In the epoll compatibility case, we actually
				 * perform the implicit removal to remain
				 * closer to the epoll semantics.
				 */
				if (is_epoll) {
					pdp->pd_fp = NULL;
					pdp->pd_events = 0;

					if (pdp->pd_php != NULL) {
						pollhead_delete(pdp->pd_php,
						    pdp);
						pdp->pd_php = NULL;
					}

					BT_CLEAR(pcp->pc_bitmap, fd);
					continue;
				}
			}

			if (fp != pdp->pd_fp) {
				/*
				 * The user is polling on a cached fd which was
				 * closed and then reused. Unfortunately there
				 * is no good way to communicate this fact to
				 * the consumer.
				 *
				 * If the file struct is also reused, we may
				 * not be able to detect the fd reuse at all.
				 * As long as this does not cause system
				 * failure and/or memory leaks, we will play
				 * along. The man page states that if the user
				 * does not clean up closed fds, polling
				 * results will be nondeterministic.
				 *
				 * XXX: perhaps log the detection of fd reuse?
				 */
				pdp->pd_fp = fp;

				/*
				 * When this situation has been detected, it's
				 * likely that any existing pollhead is
				 * ill-suited to perform proper wake-ups.
				 *
				 * Clean up the old entry under the expectation
				 * that a valid one will be provided as part of
				 * the later VOP_POLL.
				 */
				if (pdp->pd_php != NULL) {
					pollhead_delete(pdp->pd_php, pdp);
					pdp->pd_php = NULL;
				}
			}
			/*
			 * XXX - pollrelock() logic needs to know which
			 * pollcache lock to grab. It'd be a cleaner
			 * solution if we could pass pcp as an argument
			 * in the VOP_POLL interface instead of implicitly
			 * passing it via the thread_t struct. On the other
			 * hand, changing the VOP_POLL interface would
			 * require every driver/file system poll routine to
			 * change. May want to revisit the tradeoff later.
			 */
			curthread->t_pollcache = pcp;
			error = VOP_POLL(fp->f_vnode, pdp->pd_events, 0,
			    &revent, &php, NULL);

			/*
			 * Recheck edge-triggered descriptors which lack a
			 * pollhead. While this check is performed when an fd
			 * is added to the pollcache in dpwrite(), subsequent
			 * descriptor manipulation could cause a different
			 * resource to be present now.
			 */
			if ((pdp->pd_events & POLLET) && error == 0 &&
			    pdp->pd_php == NULL && php == NULL &&
			    revent != 0) {
				short levent = 0;

				/*
				 * The same POLLET-only VOP_POLL is used in an
				 * attempt to coax a pollhead from older
				 * driver logic.
				 */
				error = VOP_POLL(fp->f_vnode, POLLET,
				    0, &levent, &php, NULL);
			}

			curthread->t_pollcache = NULL;
			releasef(fd);
			if (error != 0) {
				break;
			}

			/*
			 * Layered devices (e.g. console driver)
			 * may change the vnode and thus the pollhead
			 * pointer out from underneath us.
			 */
			if (php != NULL && pdp->pd_php != NULL &&
			    php != pdp->pd_php) {
				pollhead_delete(pdp->pd_php, pdp);
				pdp->pd_php = php;
				pollhead_insert(php, pdp);
				/*
				 * The bit should still be set.
				 */
				ASSERT(BT_TEST(pcp->pc_bitmap, fd));
				goto retry;
			}
			if (revent != 0) {
				if (pfdp != NULL) {
					pfdp[fdcnt].fd = fd;
					pfdp[fdcnt].events = pdp->pd_events;
					pfdp[fdcnt].revents = revent;
				} else if (epoll != NULL) {
					epoll_event_t *ep = &epoll[fdcnt];

					ASSERT(epoll != NULL);
					ep->data.u64 = pdp->pd_epolldata;

					/*
					 * Since POLLNVAL is a legal event for
					 * VOP_POLL handlers to emit, it must
					 * be translated into an epoll-legal
					 * value.
					 */
					if (revent & POLLNVAL) {
						revent &= ~POLLNVAL;
						revent |= POLLERR;
					}

					/*
					 * If any of the event bits are set for
					 * which poll and epoll representations
					 * differ, swizzle in the native epoll
					 * values.
					 */
					if (revent & mask) {
						ep->events = (revent & ~mask) |
						    ((revent & POLLRDHUP) ?
						    EPOLLRDHUP : 0) |
						    ((revent & POLLWRBAND) ?
						    EPOLLWRBAND : 0);
					} else {
						ep->events = revent;
					}

					/*
					 * We define POLLWRNORM to be POLLOUT,
					 * but epoll has separate definitions
					 * for them; if POLLOUT is set and the
					 * user has asked for EPOLLWRNORM, set
					 * that as well.
					 */
					if ((revent & POLLOUT) &&
					    (pdp->pd_events & EPOLLWRNORM)) {
						ep->events |= EPOLLWRNORM;
					}
				} else {
					pollstate_t *ps =
					    curthread->t_pollstate;
					/*
					 * The devpoll handle itself is being
					 * polled. Notify the caller of any
					 * readable event(s), leaving as much
					 * state as possible untouched.
					 */
					VERIFY(fdcnt == 0);
					VERIFY(ps != NULL);

					/*
					 * If a call to pollunlock() fails
					 * during VOP_POLL, skip over the fd
					 * and continue polling.
					 *
					 * Otherwise, report that there is an
					 * event pending.
					 */
					if ((ps->ps_flags & POLLSTATE_ULFAIL)
					    != 0) {
						ps->ps_flags &=
						    ~POLLSTATE_ULFAIL;
						continue;
					} else {
						fdcnt++;
						break;
					}
				}

				/* Handle special polling modes. */
				if (pdp->pd_events & POLLONESHOT) {
					/*
					 * If POLLONESHOT is set, perform the
					 * implicit POLLREMOVE.
					 */
					pdp->pd_fp = NULL;
					pdp->pd_events = 0;

					if (pdp->pd_php != NULL) {
						pollhead_delete(pdp->pd_php,
						    pdp);
						pdp->pd_php = NULL;
					}

					BT_CLEAR(pcp->pc_bitmap, fd);
				} else if (pdp->pd_events & POLLET) {
					/*
					 * Wire up the pollhead which should
					 * have been provided. Edge-triggered
					 * polling cannot function properly
					 * with drivers which do not emit one.
					 */
					if (php != NULL &&
					    pdp->pd_php == NULL) {
						pollhead_insert(php, pdp);
						pdp->pd_php = php;
					}

					/*
					 * If the driver has emitted a pollhead,
					 * clear the bit in the bitmap which
					 * effectively latches the edge on a
					 * pollwakeup() from the driver.
					 */
					if (pdp->pd_php != NULL) {
						BT_CLEAR(pcp->pc_bitmap, fd);
					}
				}

				fdcnt++;
			} else if (php != NULL) {
				/*
				 * We clear a bit or cache a poll fd if
				 * the driver returns a poll head ptr,
				 * which is expected in the case of 0
				 * revents. Some buggy drivers may return a
				 * NULL php pointer with 0 revents. In
				 * that case, we just treat the driver as
				 * "noncachable" and do not clear the bit
				 * in the bitmap.
				 */
				if ((pdp->pd_php != NULL) &&
				    ((pcp->pc_flag & PC_POLLWAKE) == 0)) {
					BT_CLEAR(pcp->pc_bitmap, fd);
				}
				if (pdp->pd_php == NULL) {
					pollhead_insert(php, pdp);
					pdp->pd_php = php;
					/*
					 * An event of interest may have
					 * arrived between the VOP_POLL() and
					 * the pollhead_insert(); check again.
					 */
					goto repoll;
				}
			}
		} else {
			/*
			 * No bit set in the range. Check for wrap around.
			 */
			if (!no_wrap) {
				start = 0;
				end = ostart - 1;
				no_wrap = B_TRUE;
			} else {
				done = B_TRUE;
			}
		}
	}

	if (!done) {
		pcp->pc_mapstart = start;
	}
	ASSERT(*fdcntp == 0);
	*fdcntp = fdcnt;
	return (error);
}

/*ARGSUSED*/
static int
dpopen(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	minor_t		minordev;
	dp_entry_t	*dpep;
	pollcache_t	*pcp;

	ASSERT(devpoll_init);
	ASSERT(dptblsize <= MAXMIN);
	mutex_enter(&devpoll_lock);
	for (minordev = 0; minordev < dptblsize; minordev++) {
		if (devpolltbl[minordev] == NULL) {
			devpolltbl[minordev] = (dp_entry_t *)RESERVED;
			break;
		}
	}
	if (minordev == dptblsize) {
		dp_entry_t	**newtbl;
		size_t		oldsize;

		/*
		 * Used up every entry in the existing devpoll table.
		 * Grow the table by DEVPOLLSIZE.
		 */
		if ((oldsize = dptblsize) >= MAXMIN) {
			mutex_exit(&devpoll_lock);
			return (ENXIO);
		}
		dptblsize += DEVPOLLSIZE;
		if (dptblsize > MAXMIN) {
			dptblsize = MAXMIN;
		}
		newtbl = kmem_zalloc(sizeof (caddr_t) * dptblsize, KM_SLEEP);
		bcopy(devpolltbl, newtbl, sizeof (caddr_t) * oldsize);
		kmem_free(devpolltbl, sizeof (caddr_t) * oldsize);
		devpolltbl = newtbl;
		devpolltbl[minordev] = (dp_entry_t *)RESERVED;
	}
	mutex_exit(&devpoll_lock);

	dpep = kmem_zalloc(sizeof (dp_entry_t), KM_SLEEP);
	/*
	 * Allocate a pollcache skeleton here. Delay allocating bitmap
	 * structures until dpwrite() time, since we don't know the
	 * optimal size yet. We also delay setting the pid until either
	 * dpwrite() or an attempt to poll on the instance, allowing
	 * parents to create instances of /dev/poll for their children.
	 * (In the epoll compatibility case, this check isn't performed
	 * to maintain semantic compatibility.)
	 */
	pcp = pcache_alloc();
	dpep->dpe_pcache = pcp;
	pcp->pc_pid = -1;
	*devp = makedevice(getmajor(*devp), minordev);	/* clone the driver */
	mutex_enter(&devpoll_lock);
	ASSERT(minordev < dptblsize);
	ASSERT(devpolltbl[minordev] == (dp_entry_t *)RESERVED);
	devpolltbl[minordev] = dpep;
	mutex_exit(&devpoll_lock);
	return (0);
}

/*
 * Write to /dev/poll to add fds to, or remove fds from, a cached poll fd
 * set, or to change the poll events for a watched fd.
 */
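
/*
 * In epoll compatibility mode each record written is a dvpoll_epollfd_t
 * rather than a bare pollfd: a pollfd followed by the 64-bit user datum
 * that is stashed in pd_epolldata below. An illustrative sketch (the
 * dpep_pollfd member name is assumed from sys/devpoll.h; dpep_data is
 * the member consumed in the code below):
 *
 *	dvpoll_epollfd_t epfd;
 *
 *	epfd.dpep_pollfd.fd = sockfd;
 *	epfd.dpep_pollfd.events = POLLIN;
 *	epfd.dpep_data = (uint64_t)(uintptr_t)cookie;
 *	(void) write(dpfd, &epfd, sizeof (epfd));
 */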
/*ARGSUSED*/
static int
dpwrite(dev_t dev, struct uio *uiop, cred_t *credp)
{
	minor_t		minor;
	dp_entry_t	*dpep;
	pollcache_t	*pcp;
	pollfd_t	*pollfdp, *pfdp;
	dvpoll_epollfd_t *epfdp;
	uintptr_t	limit;
	int		error, size;
	ssize_t		uiosize;
	size_t		copysize;
	nfds_t		pollfdnum;
	struct pollhead	*php = NULL;
	polldat_t	*pdp;
	int		fd;
	file_t		*fp;
	boolean_t	is_epoll, fds_added = B_FALSE;

	minor = getminor(dev);

	mutex_enter(&devpoll_lock);
	ASSERT(minor < dptblsize);
	dpep = devpolltbl[minor];
	ASSERT(dpep != NULL);
	mutex_exit(&devpoll_lock);

	mutex_enter(&dpep->dpe_lock);
	pcp = dpep->dpe_pcache;
	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;
	size = (is_epoll) ? sizeof (dvpoll_epollfd_t) : sizeof (pollfd_t);
	mutex_exit(&dpep->dpe_lock);

	if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
		if (pcp->pc_pid != -1) {
			return (EACCES);
		}

		pcp->pc_pid = curproc->p_pid;
	}

	uiosize = uiop->uio_resid;
	pollfdnum = uiosize / size;

	/*
	 * We want to make sure that pollfdnum isn't large enough to DoS us,
	 * but we also don't want to grab p_lock unnecessarily -- so we
	 * perform the full check against our resource limits if and only if
	 * pollfdnum is larger than the known-to-be-sane value of UINT8_MAX.
	 */
	if (pollfdnum > UINT8_MAX) {
		mutex_enter(&curproc->p_lock);
		if (pollfdnum >
		    (uint_t)rctl_enforced_value(rctlproc_legacy[RLIMIT_NOFILE],
		    curproc->p_rctls, curproc)) {
			(void) rctl_action(rctlproc_legacy[RLIMIT_NOFILE],
			    curproc->p_rctls, curproc, RCA_SAFE);
			mutex_exit(&curproc->p_lock);
			return (EINVAL);
		}
		mutex_exit(&curproc->p_lock);
	}

	/*
	 * Copy in the pollfd array. Walk through the array and add
	 * each polled fd to the cached set.
	 */
	pollfdp = kmem_alloc(uiosize, KM_SLEEP);
	limit = (uintptr_t)pollfdp + (pollfdnum * size);
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 763 | |
| 764 | /* |
| 765 | * Although /dev/poll uses the write(2) interface to cache fds, it's |
| 766 | * not supposed to function as a seekable device. To prevent offset |
| 767 | * from growing and eventually exceed the maximum, reset the offset |
| 768 | * here for every call. |
| 769 | */ |
| 770 | uiop->uio_loffset = 0; |
Patrick Mooney | 57a0264 | 2016-03-23 19:35:31 +0000 | [diff] [blame] | 771 | |
| 772 | /* |
| 773 | * Use uiocopy instead of uiomove when populating pollfdp, keeping |
| 774 | * uio_resid untouched for now. Write syscalls will translate EINTR |
| 775 | * into a success if they detect "successfully transfered" data via an |
| 776 | * updated uio_resid. Falsely suppressing such errors is disastrous. |
| 777 | */ |
| 778 | if ((error = uiocopy((caddr_t)pollfdp, uiosize, UIO_WRITE, uiop, |
| 779 | ©size)) != 0) { |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 780 | kmem_free(pollfdp, uiosize); |
| 781 | return (error); |
| 782 | } |
Patrick Mooney | 57a0264 | 2016-03-23 19:35:31 +0000 | [diff] [blame] | 783 | |
	/*
	 * We are about to enter the core portion of dpwrite(). Make sure
	 * this write has exclusive access in this portion of the code,
	 * i.e., no other writers in this code.
	 *
	 * Waiting for all readers to drop their references to the dpe is
	 * unnecessary since the pollcache itself is protected by pc_lock.
	 */
	mutex_enter(&dpep->dpe_lock);
	dpep->dpe_writerwait++;
	while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0) {
		ASSERT(dpep->dpe_refcnt != 0);

		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
			dpep->dpe_writerwait--;
			mutex_exit(&dpep->dpe_lock);
			kmem_free(pollfdp, uiosize);
			return (EINTR);
		}
	}
	dpep->dpe_writerwait--;
	dpep->dpe_flag |= DP_WRITER_PRESENT;
	dpep->dpe_refcnt++;

	if (!is_epoll && (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0) {
		/*
		 * The epoll compat mode was enabled while we were waiting to
		 * establish write access. It is not safe to continue since
		 * state was prepared for non-epoll operation.
		 */
		error = EBUSY;
		goto bypass;
	}
	mutex_exit(&dpep->dpe_lock);

	/*
	 * Since dpwrite() may recursively walk an added /dev/poll handle,
	 * pollstate_enter() deadlock and loop detection must be used.
	 */
	(void) pollstate_create();
	VERIFY(pollstate_enter(pcp) == PSE_SUCCESS);

	if (pcp->pc_bitmap == NULL) {
		pcache_create(pcp, pollfdnum);
	}
	for (pfdp = pollfdp; (uintptr_t)pfdp < limit;
	    pfdp = (pollfd_t *)((uintptr_t)pfdp + size)) {
		fd = pfdp->fd;
		if ((uint_t)fd >= P_FINFO(curproc)->fi_nfiles) {
			/*
			 * epoll semantics demand that we return EBADF if our
			 * specified fd is invalid.
			 */
			if (is_epoll) {
				error = EBADF;
				break;
			}

			continue;
		}

		pdp = pcache_lookup_fd(pcp, fd);
		if (pfdp->events != POLLREMOVE) {

			fp = NULL;

			if (pdp == NULL) {
				/*
				 * If we're in epoll compatibility mode, check
				 * that the fd is valid before allocating
				 * anything for it; epoll semantics demand
				 * that we return EBADF if our specified fd is
				 * invalid.
				 */
				if (is_epoll) {
					if ((fp = getf(fd)) == NULL) {
						error = EBADF;
						break;
					}
				}

				pdp = pcache_alloc_fd(0);
				pdp->pd_fd = fd;
				pdp->pd_pcache = pcp;
				pcache_insert_fd(pcp, pdp, pollfdnum);
			} else {
				/*
				 * epoll semantics demand that we error out if
				 * a file descriptor is added twice, which we
				 * check (imperfectly) by checking if we both
				 * have the file descriptor cached and the
				 * file pointer that corresponds to the file
				 * descriptor matches our cached value. If
				 * there is a pointer mismatch, the file
				 * descriptor was closed without being removed.
				 * The converse is clearly not true, however,
				 * so to narrow the window by which a spurious
				 * EEXIST may be returned, we also check if
				 * this fp has been added to an epoll control
				 * descriptor in the past; if it hasn't, we
				 * know that this is due to fp reuse -- it's
				 * not a true EEXIST case. (By performing this
				 * additional check, we limit the window of
				 * spurious EEXIST to situations where a single
				 * file descriptor is being used across two or
				 * more epoll control descriptors -- and even
				 * then, the file descriptor must be closed and
				 * reused in a relatively tight time span.)
				 */
				if (is_epoll) {
					if (pdp->pd_fp != NULL &&
					    (fp = getf(fd)) != NULL &&
					    fp == pdp->pd_fp &&
					    (fp->f_flag2 & FEPOLLED)) {
						error = EEXIST;
						releasef(fd);
						break;
					}

					/*
					 * We have decided that the cached
					 * information was stale: it either
					 * didn't match, or the fp had never
					 * actually been epoll()'d on before.
					 * We need to now clear our pd_events
					 * to assure that we don't mistakenly
					 * operate on cached event disposition.
					 */
					pdp->pd_events = 0;
				}
			}

			if (is_epoll) {
				epfdp = (dvpoll_epollfd_t *)pfdp;
				pdp->pd_epolldata = epfdp->dpep_data;
			}

			ASSERT(pdp->pd_fd == fd);
			ASSERT(pdp->pd_pcache == pcp);
			if (fd >= pcp->pc_mapsize) {
				mutex_exit(&pcp->pc_lock);
				pcache_grow_map(pcp, fd);
				mutex_enter(&pcp->pc_lock);
			}
			if (fd > pcp->pc_mapend) {
				pcp->pc_mapend = fd;
			}
			if (fp == NULL && (fp = getf(fd)) == NULL) {
				/*
				 * The fd is not valid. Since we can't pass
				 * this error back in the write() call, set
				 * the bit in the bitmap to force the DP_POLL
				 * ioctl to examine it.
				 */
				BT_SET(pcp->pc_bitmap, fd);
				pdp->pd_events |= pfdp->events;
				continue;
			}

			/*
			 * To (greatly) reduce EEXIST false positives, we
			 * denote that this fp has been epoll()'d. We do this
			 * regardless of epoll compatibility mode, as the flag
			 * is harmless if not in epoll compatibility mode.
			 */
			fp->f_flag2 |= FEPOLLED;

			/*
			 * Don't do VOP_POLL for an already cached fd with
			 * the same poll events.
			 */
			if ((pdp->pd_events == pfdp->events) &&
			    (pdp->pd_fp == fp)) {
				/*
				 * The events are already cached.
				 */
				releasef(fd);
				continue;
			}

			/*
			 * Do VOP_POLL and cache this poll fd.
			 *
			 * XXX - pollrelock() logic needs to know which
			 * pollcache lock to grab. It'd be a cleaner
			 * solution if we could pass pcp as an argument
			 * in the VOP_POLL interface instead of implicitly
			 * passing it via the thread_t struct. On the other
			 * hand, changing the VOP_POLL interface would
			 * require every driver/file system poll routine to
			 * change. May want to revisit the tradeoff later.
			 */
			curthread->t_pollcache = pcp;
			error = VOP_POLL(fp->f_vnode, pfdp->events, 0,
			    &pfdp->revents, &php, NULL);

			/*
			 * Edge-triggered polling requires a pollhead in order
			 * to initiate wake-ups properly. Drivers which are
			 * savvy to POLLET presence, which should include
			 * everything in-gate, will always emit one, regardless
			 * of revent status. Older drivers which only emit a
			 * pollhead if 'revents == 0' are given a second chance
			 * here via a second VOP_POLL, with only POLLET set in
			 * the events of interest. These circumstances should
			 * induce any cacheable drivers to emit a pollhead for
			 * wake-ups.
			 *
			 * Drivers which never emit a pollhead will simply
			 * disobey the expectation of edge-triggered behavior.
			 * This includes recursive epoll which, even on Linux,
			 * yields its events in a level-triggered fashion only.
			 */
			if ((pdp->pd_events & POLLET) && error == 0 &&
			    php == NULL) {
				short levent = 0;

				error = VOP_POLL(fp->f_vnode, POLLET, 0,
				    &levent, &php, NULL);
			}

			curthread->t_pollcache = NULL;
			/*
			 * We always set the bit when this fd is cached;
			 * this forces the first DP_POLL to poll this fd.
			 * Real performance gain comes from subsequent
			 * DP_POLL. We also attempt a pollhead_insert();
			 * if it's not possible, we'll do it in dpioctl().
			 */
			BT_SET(pcp->pc_bitmap, fd);
			if (error != 0) {
				releasef(fd);
				break;
			}
			pdp->pd_fp = fp;
			pdp->pd_events |= pfdp->events;
			if (php != NULL) {
				if (pdp->pd_php == NULL) {
					pollhead_insert(php, pdp);
					pdp->pd_php = php;
				} else {
					if (pdp->pd_php != php) {
						pollhead_delete(pdp->pd_php,
						    pdp);
						pollhead_insert(php, pdp);
						pdp->pd_php = php;
					}
				}
			}
			fds_added = B_TRUE;
			releasef(fd);
		} else {
			if (pdp == NULL || pdp->pd_fp == NULL) {
				if (is_epoll) {
					/*
					 * As with the add case (above), epoll
					 * semantics demand that we error out
					 * in this case.
					 */
					error = ENOENT;
					break;
				}

				continue;
			}
			ASSERT(pdp->pd_fd == fd);
			pdp->pd_fp = NULL;
			pdp->pd_events = 0;
			ASSERT(pdp->pd_thread == NULL);
			if (pdp->pd_php != NULL) {
				pollhead_delete(pdp->pd_php, pdp);
				pdp->pd_php = NULL;
			}
			BT_CLEAR(pcp->pc_bitmap, fd);
		}
	}
	/*
	 * Wake any pollcache waiters so they can check the new descriptors.
	 *
	 * Any fds added to a recursive-capable pollcache could themselves be
	 * /dev/poll handles. To ensure that proper event propagation occurs,
	 * parent pollcaches are woken too, so that they can create any
	 * needed pollcache links.
	 */
	if (fds_added) {
		cv_broadcast(&pcp->pc_cv);
		pcache_wake_parents(pcp);
	}
	pollstate_exit(pcp);
	mutex_enter(&dpep->dpe_lock);
bypass:
	dpep->dpe_flag &= ~DP_WRITER_PRESENT;
	dpep->dpe_refcnt--;
	cv_broadcast(&dpep->dpe_cv);
	mutex_exit(&dpep->dpe_lock);
	kmem_free(pollfdp, uiosize);
	if (error == 0) {
		/*
		 * The state of uio_resid is updated only after the pollcache
		 * is successfully modified.
		 */
		uioskip(uiop, copysize);
	}
	return (error);
}

#define	DP_SIGMASK_RESTORE(ksetp) {				\
	if (ksetp != NULL) {					\
		mutex_enter(&p->p_lock);			\
		if (lwp->lwp_cursig == 0) {			\
			t->t_hold = lwp->lwp_sigoldmask;	\
			t->t_flag &= ~T_TOMASK;			\
		}						\
		mutex_exit(&p->p_lock);				\
	}							\
}
| 1102 | |
/*ARGSUSED*/
static int
dpioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp, int *rvalp)
{
	minor_t		minor;
	dp_entry_t	*dpep;
	pollcache_t	*pcp;
	hrtime_t	now;
	int		error = 0;
	boolean_t	is_epoll;
	STRUCT_DECL(dvpoll, dvpoll);

	if (cmd == DP_POLL || cmd == DP_PPOLL) {
		/* do this now, before we sleep on DP_WRITER_PRESENT */
		now = gethrtime();
	}

	minor = getminor(dev);
	mutex_enter(&devpoll_lock);
	ASSERT(minor < dptblsize);
	dpep = devpolltbl[minor];
	mutex_exit(&devpoll_lock);
	ASSERT(dpep != NULL);
	pcp = dpep->dpe_pcache;

	mutex_enter(&dpep->dpe_lock);
	is_epoll = (dpep->dpe_flag & DP_ISEPOLLCOMPAT) != 0;

	if (cmd == DP_EPOLLCOMPAT) {
		if (dpep->dpe_refcnt != 0) {
			/*
			 * We can't turn on epoll compatibility while there
			 * are outstanding operations.
			 */
			mutex_exit(&dpep->dpe_lock);
			return (EBUSY);
		}

		/*
		 * epoll compatibility is a one-way street: there's no way
		 * to turn it off for a particular open.
		 */
		dpep->dpe_flag |= DP_ISEPOLLCOMPAT;
		mutex_exit(&dpep->dpe_lock);

		return (0);
	}

	if (!is_epoll && curproc->p_pid != pcp->pc_pid) {
		if (pcp->pc_pid != -1) {
			mutex_exit(&dpep->dpe_lock);
			return (EACCES);
		}

		pcp->pc_pid = curproc->p_pid;
	}

	/* Wait until all writers have cleared the handle before continuing */
	while ((dpep->dpe_flag & DP_WRITER_PRESENT) != 0 ||
	    (dpep->dpe_writerwait != 0)) {
		if (!cv_wait_sig_swap(&dpep->dpe_cv, &dpep->dpe_lock)) {
			mutex_exit(&dpep->dpe_lock);
			return (EINTR);
		}
	}
	dpep->dpe_refcnt++;
	mutex_exit(&dpep->dpe_lock);

	switch (cmd) {
	case DP_POLL:
	case DP_PPOLL:
	{
		pollstate_t	*ps;
		nfds_t		nfds;
		int		fdcnt = 0;
		size_t		size, fdsize, dpsize;
		hrtime_t	deadline = 0;
		k_sigset_t	*ksetp = NULL;
		k_sigset_t	kset;
		sigset_t	set;
		kthread_t	*t = curthread;
		klwp_t		*lwp = ttolwp(t);
		struct proc	*p = ttoproc(curthread);

		STRUCT_INIT(dvpoll, mode);

		/*
		 * The dp_setp member is only required/consumed for DP_PPOLL,
		 * which otherwise uses the same structure as DP_POLL.
		 */
		if (cmd == DP_POLL) {
			dpsize = (uintptr_t)STRUCT_FADDR(dvpoll, dp_setp) -
			    (uintptr_t)STRUCT_FADDR(dvpoll, dp_fds);
		} else {
			ASSERT(cmd == DP_PPOLL);
			dpsize = STRUCT_SIZE(dvpoll);
		}

		if ((mode & FKIOCTL) != 0) {
			/* Kernel-internal ioctl call */
			bcopy((caddr_t)arg, STRUCT_BUF(dvpoll), dpsize);
			error = 0;
		} else {
			error = copyin((caddr_t)arg, STRUCT_BUF(dvpoll),
			    dpsize);
		}

stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1210 | if (error) { |
| 1211 | DP_REFRELE(dpep); |
| 1212 | return (EFAULT); |
| 1213 | } |
| 1214 | |
Matthew Ahrens | cd1c8b8 | 2012-08-30 05:13:49 -0700 | [diff] [blame] | 1215 | deadline = STRUCT_FGET(dvpoll, dp_timeout); |
| 1216 | if (deadline > 0) { |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1217 | /* |
Matthew Ahrens | cd1c8b8 | 2012-08-30 05:13:49 -0700 | [diff] [blame] | 1218 | * Convert the deadline from relative milliseconds |
| 1219 | * to absolute nanoseconds. They must wait for at |
| 1220 | * least a tick. |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1221 | */ |
Josef 'Jeff' Sipek | 1944925 | 2014-04-29 13:05:25 -0400 | [diff] [blame] | 1222 | deadline = MSEC2NSEC(deadline); |
Matthew Ahrens | cd1c8b8 | 2012-08-30 05:13:49 -0700 | [diff] [blame] | 1223 | deadline = MAX(deadline, nsec_per_tick); |
| 1224 | deadline += now; |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1225 | } |
| 1226 | |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1227 | if (cmd == DP_PPOLL) { |
| 1228 | void *setp = STRUCT_FGETP(dvpoll, dp_setp); |
| 1229 | |
| 1230 | if (setp != NULL) { |
Patrick Mooney | 57a0264 | 2016-03-23 19:35:31 +0000 | [diff] [blame] | 1231 | if ((mode & FKIOCTL) != 0) { |
| 1232 | /* Use the signal set directly */ |
| 1233 | ksetp = (k_sigset_t *)setp; |
| 1234 | } else { |
| 1235 | if (copyin(setp, &set, sizeof (set))) { |
| 1236 | DP_REFRELE(dpep); |
| 1237 | return (EFAULT); |
| 1238 | } |
| 1239 | sigutok(&set, &kset); |
| 1240 | ksetp = &kset; |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1241 | } |
| 1242 | |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1243 | mutex_enter(&p->p_lock); |
| 1244 | schedctl_finish_sigblock(t); |
| 1245 | lwp->lwp_sigoldmask = t->t_hold; |
| 1246 | t->t_hold = *ksetp; |
| 1247 | t->t_flag |= T_TOMASK; |
| 1248 | |
| 1249 | /* |
| 1250 | * Like ppoll() with a non-NULL sigset, we'll |
| 1251 | * call cv_reltimedwait_sig() just to check for |
| 1252 | * signals. This call will return immediately |
| 1253 | * with either 0 (signalled) or -1 (no signal). |
| 1254 | * There are some conditions whereby we can |
| 1255 | * get 0 from cv_reltimedwait_sig() without |
| 1256 | * a true signal (e.g., a directed stop), so |
| 1257 | * we restore our signal mask in the unlikely |
| 1258 | * event that lwp_cursig is 0. |
| 1259 | */ |
| 1260 | if (!cv_reltimedwait_sig(&t->t_delay_cv, |
| 1261 | &p->p_lock, 0, TR_CLOCK_TICK)) { |
| 1262 | if (lwp->lwp_cursig == 0) { |
| 1263 | t->t_hold = lwp->lwp_sigoldmask; |
| 1264 | t->t_flag &= ~T_TOMASK; |
| 1265 | } |
| 1266 | |
| 1267 | mutex_exit(&p->p_lock); |
| 1268 | |
| 1269 | DP_REFRELE(dpep); |
| 1270 | return (EINTR); |
| 1271 | } |
| 1272 | |
| 1273 | mutex_exit(&p->p_lock); |
| 1274 | } |
| 1275 | } |
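	/*
	 * Illustrative userland sketch (not part of this driver) of how
	 * a caller might drive DP_PPOLL, in the spirit of ppoll(2):
	 *
	 *	struct dvpoll dvp;
	 *	sigset_t set;
	 *
	 *	sigemptyset(&set);		<- mask applied while blocked
	 *	dvp.dp_fds = fds;
	 *	dvp.dp_nfds = nfds;
	 *	dvp.dp_timeout = -1;		<- block until an event
	 *	dvp.dp_setp = &set;
	 *	n = ioctl(dpfd, DP_PPOLL, &dvp);
	 *
	 * The temporary mask installed above is what DP_SIGMASK_RESTORE()
	 * undoes on the exit paths below.
	 */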
| 1276 | |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1277 | if ((nfds = STRUCT_FGET(dvpoll, dp_nfds)) == 0) { |
| 1278 | /* |
| 1279 | * We are just using DP_POLL to sleep, so |
 | 1280 |  * we don't need any of the devpoll apparatus. |
| 1281 | * Do not check for signals if we have a zero timeout. |
| 1282 | */ |
| 1283 | DP_REFRELE(dpep); |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1284 | if (deadline == 0) { |
| 1285 | DP_SIGMASK_RESTORE(ksetp); |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1286 | return (0); |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1287 | } |
| 1288 | |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1289 | mutex_enter(&curthread->t_delay_lock); |
Matthew Ahrens | cd1c8b8 | 2012-08-30 05:13:49 -0700 | [diff] [blame] | 1290 | while ((error = |
| 1291 | cv_timedwait_sig_hrtime(&curthread->t_delay_cv, |
| 1292 | &curthread->t_delay_lock, deadline)) > 0) |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1293 | continue; |
| 1294 | mutex_exit(&curthread->t_delay_lock); |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1295 | |
| 1296 | DP_SIGMASK_RESTORE(ksetp); |
| 1297 | |
Matthew Ahrens | cd1c8b8 | 2012-08-30 05:13:49 -0700 | [diff] [blame] | 1298 | return (error == 0 ? EINTR : 0); |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1299 | } |
| 1300 | |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1301 | if (is_epoll) { |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1302 | size = nfds * (fdsize = sizeof (epoll_event_t)); |
| 1303 | } else { |
| 1304 | size = nfds * (fdsize = sizeof (pollfd_t)); |
| 1305 | } |
| 1306 | |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1307 | /* |
Matt Amdur | fe234e7 | 2011-10-20 07:54:20 -0700 | [diff] [blame] | 1308 | * XXX It would be nice not to have to alloc each time, but it |
 | 1309 |  * requires another per-thread structure hook. This can be |
| 1310 | * implemented later if data suggests that it's necessary. |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1311 | */ |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1312 | ps = pollstate_create(); |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1313 | |
| 1314 | if (ps->ps_dpbufsize < size) { |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1315 | /* |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1316 | * If nfds is larger than twice the current maximum |
| 1317 | * open file count, we'll silently clamp it. This |
| 1318 | * only limits our exposure to allocating an |
| 1319 | * inordinate amount of kernel memory; it doesn't |
| 1320 | * otherwise affect the semantics. (We have this |
| 1321 | * check at twice the maximum instead of merely the |
| 1322 | * maximum because some applications pass an nfds that |
| 1323 | * is only slightly larger than their limit.) |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1324 | */ |
| 1325 | mutex_enter(&p->p_lock); |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1326 | if ((nfds >> 1) > p->p_fno_ctl) { |
| 1327 | nfds = p->p_fno_ctl; |
| 1328 | size = nfds * fdsize; |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1329 | } |
| 1330 | mutex_exit(&p->p_lock); |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1331 | |
| 1332 | if (ps->ps_dpbufsize < size) { |
| 1333 | kmem_free(ps->ps_dpbuf, ps->ps_dpbufsize); |
| 1334 | ps->ps_dpbuf = kmem_zalloc(size, KM_SLEEP); |
| 1335 | ps->ps_dpbufsize = size; |
| 1336 | } |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1337 | } |
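	/*
	 * Clamp arithmetic, by example: with p_fno_ctl at, say, 65,536,
	 * an nfds of 200,000 trips the test above (200,000 >> 1 ==
	 * 100,000 > 65,536) and is silently reduced to 65,536, while an
	 * nfds of 100,000 passes (50,000 <= 65,536) and is left untouched
	 * despite exceeding the limit itself.
	 */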
| 1338 | |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1339 | VERIFY(pollstate_enter(pcp) == PSE_SUCCESS); |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1340 | for (;;) { |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1341 | pcp->pc_flag &= ~PC_POLLWAKE; |
| 1342 | |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1343 | /* |
| 1344 | * Mark all child pcachelinks as stale. |
| 1345 | * Those which are still part of the tree will be |
| 1346 | * marked as valid during the poll. |
| 1347 | */ |
| 1348 | pcachelink_mark_stale(pcp); |
| 1349 | |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1350 | error = dp_pcache_poll(dpep, ps->ps_dpbuf, |
| 1351 | pcp, nfds, &fdcnt); |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1352 | if (fdcnt > 0 || error != 0) |
| 1353 | break; |
| 1354 | |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1355 | /* Purge still-stale child pcachelinks */ |
| 1356 | pcachelink_purge_stale(pcp); |
| 1357 | |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1358 | /* |
 | 1359 |  * A pollwake has happened since we last polled the cache. |
| 1360 | */ |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1361 | if (pcp->pc_flag & PC_POLLWAKE) |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1362 | continue; |
| 1363 | |
| 1364 | /* |
amw | da6c28a | 2007-10-25 16:34:29 -0700 | [diff] [blame] | 1365 | * Sleep until we are notified, signaled, or timed out. |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1366 | */ |
Matthew Ahrens | cd1c8b8 | 2012-08-30 05:13:49 -0700 | [diff] [blame] | 1367 | if (deadline == 0) { |
| 1368 | /* immediate timeout; do not check signals */ |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1369 | break; |
Matthew Ahrens | cd1c8b8 | 2012-08-30 05:13:49 -0700 | [diff] [blame] | 1370 | } |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1371 | |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1372 | error = cv_timedwait_sig_hrtime(&pcp->pc_cv, |
| 1373 | &pcp->pc_lock, deadline); |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1374 | |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1375 | /* |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1376 | * If we were awakened by a signal or timeout then |
| 1377 | * break the loop, else poll again. |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1378 | */ |
Matthew Ahrens | cd1c8b8 | 2012-08-30 05:13:49 -0700 | [diff] [blame] | 1379 | if (error <= 0) { |
| 1380 | error = (error == 0) ? EINTR : 0; |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1381 | break; |
Matthew Ahrens | cd1c8b8 | 2012-08-30 05:13:49 -0700 | [diff] [blame] | 1382 | } else { |
| 1383 | error = 0; |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1384 | } |
| 1385 | } |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1386 | pollstate_exit(pcp); |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1387 | |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1388 | DP_SIGMASK_RESTORE(ksetp); |
| 1389 | |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1390 | if (error == 0 && fdcnt > 0) { |
Patrick Mooney | 57a0264 | 2016-03-23 19:35:31 +0000 | [diff] [blame] | 1391 | /* |
| 1392 | * It should be noted that FKIOCTL does not influence |
| 1393 | * the copyout (vs bcopy) of dp_fds at this time. |
| 1394 | */ |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1395 | if (copyout(ps->ps_dpbuf, |
| 1396 | STRUCT_FGETP(dvpoll, dp_fds), fdcnt * fdsize)) { |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1397 | DP_REFRELE(dpep); |
| 1398 | return (EFAULT); |
| 1399 | } |
| 1400 | *rvalp = fdcnt; |
| 1401 | } |
| 1402 | break; |
| 1403 | } |
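	/*
	 * For context, a sketch of the classic /dev/poll sequence served
	 * by the case above (userland, error handling omitted):
	 *
	 *	int dpfd = open("/dev/poll", O_RDWR);
	 *	pollfd_t fds[2] = {
	 *		{ .fd = fd0, .events = POLLIN },
	 *		{ .fd = fd1, .events = POLLOUT },
	 *	};
	 *	struct dvpoll dvp;
	 *
	 *	write(dpfd, fds, sizeof (fds));	<- register interest
	 *	dvp.dp_fds = fds;
	 *	dvp.dp_nfds = 2;
	 *	dvp.dp_timeout = 100;		<- milliseconds
	 *	n = ioctl(dpfd, DP_POLL, &dvp);	<- n entries ready in fds
	 *
	 * With dp_nfds == 0 the same ioctl degenerates into a plain
	 * interruptible sleep, as handled near the top of the case.
	 */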
| 1404 | |
| 1405 | case DP_ISPOLLED: |
| 1406 | { |
| 1407 | pollfd_t pollfd; |
| 1408 | polldat_t *pdp; |
| 1409 | |
| 1410 | STRUCT_INIT(dvpoll, mode); |
| 1411 | error = copyin((caddr_t)arg, &pollfd, sizeof (pollfd_t)); |
| 1412 | if (error) { |
| 1413 | DP_REFRELE(dpep); |
| 1414 | return (EFAULT); |
| 1415 | } |
| 1416 | mutex_enter(&pcp->pc_lock); |
| 1417 | if (pcp->pc_hash == NULL) { |
| 1418 | /* |
 | 1419 |  * No need to search because no poll fd |
| 1420 | * has been cached. |
| 1421 | */ |
| 1422 | mutex_exit(&pcp->pc_lock); |
| 1423 | DP_REFRELE(dpep); |
| 1424 | return (0); |
| 1425 | } |
| 1426 | if (pollfd.fd < 0) { |
| 1427 | mutex_exit(&pcp->pc_lock); |
| 1428 | break; |
| 1429 | } |
| 1430 | pdp = pcache_lookup_fd(pcp, pollfd.fd); |
| 1431 | if ((pdp != NULL) && (pdp->pd_fd == pollfd.fd) && |
| 1432 | (pdp->pd_fp != NULL)) { |
| 1433 | pollfd.revents = pdp->pd_events; |
| 1434 | if (copyout(&pollfd, (caddr_t)arg, sizeof (pollfd_t))) { |
| 1435 | mutex_exit(&pcp->pc_lock); |
| 1436 | DP_REFRELE(dpep); |
| 1437 | return (EFAULT); |
| 1438 | } |
| 1439 | *rvalp = 1; |
| 1440 | } |
| 1441 | mutex_exit(&pcp->pc_lock); |
| 1442 | break; |
| 1443 | } |
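	/*
	 * A hypothetical DP_ISPOLLED check from userland:
	 *
	 *	pollfd_t pfd = { .fd = somefd };
	 *
	 *	rv = ioctl(dpfd, DP_ISPOLLED, &pfd);
	 *
	 * where rv == 1 reports the fd as cached, with the registered
	 * events written back into pfd.revents, and rv == 0 reports it
	 * absent from the cache.
	 */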
| 1444 | |
| 1445 | default: |
| 1446 | DP_REFRELE(dpep); |
| 1447 | return (EINVAL); |
| 1448 | } |
| 1449 | DP_REFRELE(dpep); |
| 1450 | return (error); |
| 1451 | } |
| 1452 | |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1453 | /* |
| 1454 | * Overview of Recursive Polling |
| 1455 | * |
| 1456 | * It is possible for /dev/poll to poll for events on file descriptors which |
| 1457 | * themselves are /dev/poll handles. Pending events in the child handle are |
| 1458 | * represented as readable data via the POLLIN flag. To limit surface area, |
| 1459 | * this recursion is presently allowed on only /dev/poll handles which have |
| 1460 | * been placed in epoll mode via the DP_EPOLLCOMPAT ioctl. Recursion depth is |
| 1461 | * limited to 5 in order to be consistent with Linux epoll. |
| 1462 | * |
| 1463 | * Extending dppoll() for VOP_POLL: |
| 1464 | * |
| 1465 | * The recursive /dev/poll implementation begins by extending dppoll() to |
| 1466 | * report when resources contained in the pollcache have relevant event state. |
 | 1467 |  * At the highest level, this means calling dp_pcache_poll() so that it |
 | 1468 |  * reports whether fd events are present without consuming them or altering |
 | 1469 |  * the pollcache bitmap. This ensures that a subsequent DP_POLL operation will |
| 1470 | * yield the initiating event. Additionally, the VOP_POLL should return in |
| 1471 | * such a way that dp_pcache_poll() does not clear the parent bitmap entry |
| 1472 | * which corresponds to the child /dev/poll fd. This means that child |
| 1473 | * pollcaches will be checked during every poll which facilitates wake-up |
| 1474 | * behavior detailed below. |
| 1475 | * |
| 1476 | * Pollcache Links and Wake Events: |
| 1477 | * |
| 1478 | * Recursive /dev/poll avoids complicated pollcache locking constraints during |
| 1479 | * pollwakeup events by eschewing the traditional pollhead mechanism in favor |
| 1480 | * of a different approach. For each pollcache at the root of a recursive |
| 1481 | * /dev/poll "tree", pcachelink_t structures are established to all child |
| 1482 | * /dev/poll pollcaches. During pollnotify() in a child pollcache, the |
| 1483 | * linked list of pcachelink_t entries is walked, where those marked as valid |
| 1484 | * incur a cv_broadcast to their parent pollcache. Most notably, these |
| 1485 | * pcachelink_t cv wakeups are performed without acquiring pc_lock on the |
| 1486 | * parent pollcache (which would require careful deadlock avoidance). This |
| 1487 | * still allows the woken poll on the parent to discover the pertinent events |
 | 1488 |  * because bitmap entries for the child pollcache are always |
| 1489 | * maintained by the dppoll() logic above. |
| 1490 | * |
| 1491 | * Depth Limiting and Loop Prevention: |
| 1492 | * |
| 1493 | * As each pollcache is encountered (either via DP_POLL or dppoll()), depth and |
| 1494 | * loop constraints are enforced via pollstate_enter(). The pollcache_t |
| 1495 | * pointer is compared against any existing entries in ps_pc_stack and is added |
| 1496 | * to the end if no match (and therefore loop) is found. Once poll operations |
| 1497 | * for a given pollcache_t are complete, pollstate_exit() clears the pointer |
| 1498 | * from the list. The pollstate_enter() and pollstate_exit() functions are |
| 1499 | * responsible for acquiring and releasing pc_lock, respectively. |
| 1500 | * |
| 1501 | * Deadlock Safety: |
| 1502 | * |
| 1503 | * Descending through a tree of recursive /dev/poll handles involves the tricky |
| 1504 | * business of sequentially entering multiple pollcache locks. This tree |
| 1505 | * topology cannot define a lock acquisition order in such a way that it is |
| 1506 | * immune to deadlocks between threads. The pollstate_enter() and |
| 1507 | * pollstate_exit() functions provide an interface for recursive /dev/poll |
| 1508 | * operations to safely lock pollcaches while failing gracefully in the face of |
| 1509 | * deadlocking topologies. (See pollstate_contend() for more detail about how |
| 1510 | * deadlocks are detected and resolved.) |
| 1511 | */ |
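/*
 * To make the above concrete, a sketch of one possible two-level topology:
 *
 *	dpfd A (epoll mode)
 *	   |         \
 *	dpfd B      socket s2
 *	   |
 *	socket s1
 *
 * When s1 sees an event, pollnotify() on B's pollcache walks B's list of
 * pcachelink_t entries and cv_broadcasts A's pollcache, waking any DP_POLL
 * sleeper on A.  Because A's bitmap entry for B is never cleared (see
 * dppoll() below), the awakened poll re-scans B and reports POLLIN for it.
 */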
| 1512 | |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1513 | /*ARGSUSED*/ |
| 1514 | static int |
| 1515 | dppoll(dev_t dev, short events, int anyyet, short *reventsp, |
| 1516 | struct pollhead **phpp) |
| 1517 | { |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1518 | minor_t minor; |
| 1519 | dp_entry_t *dpep; |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1520 | pollcache_t *pcp; |
| 1521 | int res, rc = 0; |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1522 | |
| 1523 | minor = getminor(dev); |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1524 | mutex_enter(&devpoll_lock); |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1525 | ASSERT(minor < dptblsize); |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1526 | dpep = devpolltbl[minor]; |
| 1527 | ASSERT(dpep != NULL); |
| 1528 | mutex_exit(&devpoll_lock); |
| 1529 | |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1530 | mutex_enter(&dpep->dpe_lock); |
| 1531 | if ((dpep->dpe_flag & DP_ISEPOLLCOMPAT) == 0) { |
| 1532 | /* Poll recursion is not yet supported for non-epoll handles */ |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1533 | *reventsp = POLLERR; |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1534 | mutex_exit(&dpep->dpe_lock); |
| 1535 | return (0); |
| 1536 | } else { |
| 1537 | dpep->dpe_refcnt++; |
| 1538 | pcp = dpep->dpe_pcache; |
| 1539 | mutex_exit(&dpep->dpe_lock); |
Bryan Cantrill | a5eb710 | 2015-02-14 16:55:35 -0800 | [diff] [blame] | 1540 | } |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1541 | |
| 1542 | res = pollstate_enter(pcp); |
| 1543 | if (res == PSE_SUCCESS) { |
| 1544 | nfds_t nfds = 1; |
| 1545 | int fdcnt = 0; |
| 1546 | pollstate_t *ps = curthread->t_pollstate; |
| 1547 | |
Patrick Mooney | 80d5689 | 2017-09-22 23:43:19 +0000 | [diff] [blame] | 1548 | /* |
| 1549 | * Recursive polling will only emit certain events. Skip a |
| 1550 | * scan of the pollcache if those events are not of interest. |
| 1551 | */ |
| 1552 | if (events & (POLLIN|POLLRDNORM)) { |
| 1553 | rc = dp_pcache_poll(dpep, NULL, pcp, nfds, &fdcnt); |
| 1554 | } else { |
| 1555 | rc = 0; |
| 1556 | fdcnt = 0; |
| 1557 | } |
| 1558 | |
| 1559 | if (rc == 0 && fdcnt > 0) { |
| 1560 | *reventsp = POLLIN|POLLRDNORM; |
| 1561 | } else { |
| 1562 | *reventsp = 0; |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1563 | } |
| 1564 | pcachelink_assoc(pcp, ps->ps_pc_stack[0]); |
| 1565 | pollstate_exit(pcp); |
| 1566 | } else { |
| 1567 | switch (res) { |
| 1568 | case PSE_FAIL_DEPTH: |
| 1569 | rc = EINVAL; |
| 1570 | break; |
| 1571 | case PSE_FAIL_LOOP: |
| 1572 | case PSE_FAIL_DEADLOCK: |
| 1573 | rc = ELOOP; |
| 1574 | break; |
| 1575 | default: |
| 1576 | /* |
| 1577 | * If anything else has gone awry, such as being polled |
| 1578 | * from an unexpected context, fall back to the |
| 1579 | * recursion-intolerant response. |
| 1580 | */ |
| 1581 | *reventsp = POLLERR; |
| 1582 | rc = 0; |
| 1583 | break; |
| 1584 | } |
| 1585 | } |
| 1586 | |
| 1587 | DP_REFRELE(dpep); |
| 1588 | return (rc); |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1589 | } |
| 1590 | |
| 1591 | /* |
 | 1592 |  * devpoll close should do enough cleanup before the pollcache is deleted, |
 | 1593 |  * i.e., it should ensure that no one still references the pollcache later. |
 | 1594 |  * There is no "permission" check in here; any process holding the last |
 | 1595 |  * reference to this /dev/poll fd can close it. |
| 1596 | */ |
| 1597 | /*ARGSUSED*/ |
| 1598 | static int |
| 1599 | dpclose(dev_t dev, int flag, int otyp, cred_t *credp) |
| 1600 | { |
Matt Amdur | fe234e7 | 2011-10-20 07:54:20 -0700 | [diff] [blame] | 1601 | minor_t minor; |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1602 | dp_entry_t *dpep; |
| 1603 | pollcache_t *pcp; |
| 1604 | int i; |
| 1605 | polldat_t **hashtbl; |
| 1606 | polldat_t *pdp; |
| 1607 | |
| 1608 | minor = getminor(dev); |
| 1609 | |
| 1610 | mutex_enter(&devpoll_lock); |
| 1611 | dpep = devpolltbl[minor]; |
| 1612 | ASSERT(dpep != NULL); |
| 1613 | devpolltbl[minor] = NULL; |
| 1614 | mutex_exit(&devpoll_lock); |
| 1615 | pcp = dpep->dpe_pcache; |
| 1616 | ASSERT(pcp != NULL); |
| 1617 | /* |
| 1618 | * At this point, no other lwp can access this pollcache via the |
 | 1619 |  * /dev/poll fd. This pollcache is going away, so do the |
 | 1620 |  * cleanup without holding pc_lock. |
| 1621 | */ |
| 1622 | hashtbl = pcp->pc_hash; |
| 1623 | for (i = 0; i < pcp->pc_hashsize; i++) { |
| 1624 | for (pdp = hashtbl[i]; pdp; pdp = pdp->pd_hashnext) { |
| 1625 | if (pdp->pd_php != NULL) { |
| 1626 | pollhead_delete(pdp->pd_php, pdp); |
| 1627 | pdp->pd_php = NULL; |
| 1628 | pdp->pd_fp = NULL; |
| 1629 | } |
| 1630 | } |
| 1631 | } |
| 1632 | /* |
| 1633 | * pollwakeup() may still interact with this pollcache. Wait until |
| 1634 | * it is done. |
| 1635 | */ |
| 1636 | mutex_enter(&pcp->pc_no_exit); |
| 1637 | ASSERT(pcp->pc_busy >= 0); |
| 1638 | while (pcp->pc_busy > 0) |
| 1639 | cv_wait(&pcp->pc_busy_cv, &pcp->pc_no_exit); |
| 1640 | mutex_exit(&pcp->pc_no_exit); |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1641 | |
| 1642 | /* Clean up any pollcache links created via recursive /dev/poll */ |
| 1643 | if (pcp->pc_parents != NULL || pcp->pc_children != NULL) { |
| 1644 | /* |
| 1645 | * Because of the locking rules for pcachelink manipulation, |
 | 1646 |  * acquiring pc_lock is required for this step. |
| 1647 | */ |
| 1648 | mutex_enter(&pcp->pc_lock); |
| 1649 | pcachelink_purge_all(pcp); |
| 1650 | mutex_exit(&pcp->pc_lock); |
| 1651 | } |
| 1652 | |
stevel@tonic-gate | 7c478bd | 2005-06-14 00:00:00 -0700 | [diff] [blame] | 1653 | pcache_destroy(pcp); |
| 1654 | ASSERT(dpep->dpe_refcnt == 0); |
| 1655 | kmem_free(dpep, sizeof (dp_entry_t)); |
| 1656 | return (0); |
| 1657 | } |
Patrick Mooney | f3bb54f | 2015-10-05 17:20:33 -0700 | [diff] [blame] | 1658 | |
| 1659 | static void |
| 1660 | pcachelink_locked_rele(pcachelink_t *pl) |
| 1661 | { |
| 1662 | ASSERT(MUTEX_HELD(&pl->pcl_lock)); |
| 1663 | VERIFY(pl->pcl_refcnt >= 1); |
| 1664 | |
| 1665 | pl->pcl_refcnt--; |
| 1666 | if (pl->pcl_refcnt == 0) { |
| 1667 | VERIFY(pl->pcl_state == PCL_INVALID); |
| 1668 | ASSERT(pl->pcl_parent_pc == NULL); |
| 1669 | ASSERT(pl->pcl_child_pc == NULL); |
| 1670 | ASSERT(pl->pcl_parent_next == NULL); |
| 1671 | ASSERT(pl->pcl_child_next == NULL); |
| 1672 | |
| 1673 | pl->pcl_state = PCL_FREE; |
| 1674 | mutex_destroy(&pl->pcl_lock); |
| 1675 | kmem_free(pl, sizeof (pcachelink_t)); |
| 1676 | } else { |
| 1677 | mutex_exit(&pl->pcl_lock); |
| 1678 | } |
| 1679 | } |
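/*
 * As the functions here imply, a pcachelink_t follows a simple lifecycle:
 * PCL_VALID when associated, PCL_STALE once a poll sweep begins, back to
 * PCL_VALID if revalidated during the sweep, PCL_INVALID once purged, and
 * finally PCL_FREE just before pcachelink_locked_rele() releases the memory.
 */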
| 1680 | |
| 1681 | /* |
| 1682 | * Associate parent and child pollcaches via a pcachelink_t. If an existing |
| 1683 | * link (stale or valid) between the two is found, it will be reused. If a |
| 1684 | * suitable link is not found for reuse, a new one will be allocated. |
| 1685 | */ |
| 1686 | static void |
| 1687 | pcachelink_assoc(pollcache_t *child, pollcache_t *parent) |
| 1688 | { |
| 1689 | pcachelink_t *pl, **plpn; |
| 1690 | |
| 1691 | ASSERT(MUTEX_HELD(&child->pc_lock)); |
| 1692 | ASSERT(MUTEX_HELD(&parent->pc_lock)); |
| 1693 | |
| 1694 | /* Search for an existing link we can reuse. */ |
| 1695 | plpn = &child->pc_parents; |
| 1696 | for (pl = child->pc_parents; pl != NULL; pl = *plpn) { |
| 1697 | mutex_enter(&pl->pcl_lock); |
| 1698 | if (pl->pcl_state == PCL_INVALID) { |
| 1699 | /* Clean any invalid links while walking the list */ |
| 1700 | *plpn = pl->pcl_parent_next; |
| 1701 | pl->pcl_child_pc = NULL; |
| 1702 | pl->pcl_parent_next = NULL; |
| 1703 | pcachelink_locked_rele(pl); |
| 1704 | } else if (pl->pcl_parent_pc == parent) { |
| 1705 | /* Successfully found parent link */ |
| 1706 | ASSERT(pl->pcl_state == PCL_VALID || |
| 1707 | pl->pcl_state == PCL_STALE); |
| 1708 | pl->pcl_state = PCL_VALID; |
| 1709 | mutex_exit(&pl->pcl_lock); |
| 1710 | return; |
| 1711 | } else { |
| 1712 | plpn = &pl->pcl_parent_next; |
| 1713 | mutex_exit(&pl->pcl_lock); |
| 1714 | } |
| 1715 | } |
| 1716 | |
| 1717 | /* No existing link to the parent was found. Create a fresh one. */ |
| 1718 | pl = kmem_zalloc(sizeof (pcachelink_t), KM_SLEEP); |
| 1719 | mutex_init(&pl->pcl_lock, NULL, MUTEX_DEFAULT, NULL); |
| 1720 | |
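	/*
	 * The link is threaded onto two lists, so it carries two
	 * references: one for the parent's pc_children list and one for
	 * the child's pc_parents list.  pcachelink_locked_rele() frees
	 * it only after both have been dropped.
	 */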
| 1721 | pl->pcl_parent_pc = parent; |
| 1722 | pl->pcl_child_next = parent->pc_children; |
| 1723 | parent->pc_children = pl; |
| 1724 | pl->pcl_refcnt++; |
| 1725 | |
| 1726 | pl->pcl_child_pc = child; |
| 1727 | pl->pcl_parent_next = child->pc_parents; |
| 1728 | child->pc_parents = pl; |
| 1729 | pl->pcl_refcnt++; |
| 1730 | |
| 1731 | pl->pcl_state = PCL_VALID; |
| 1732 | } |
| 1733 | |
| 1734 | /* |
| 1735 | * Mark all child links in a pollcache as stale. Any invalid child links found |
| 1736 | * during iteration are purged. |
| 1737 | */ |
| 1738 | static void |
| 1739 | pcachelink_mark_stale(pollcache_t *pcp) |
| 1740 | { |
| 1741 | pcachelink_t *pl, **plpn; |
| 1742 | |
| 1743 | ASSERT(MUTEX_HELD(&pcp->pc_lock)); |
| 1744 | |
| 1745 | plpn = &pcp->pc_children; |
| 1746 | for (pl = pcp->pc_children; pl != NULL; pl = *plpn) { |
| 1747 | mutex_enter(&pl->pcl_lock); |
| 1748 | if (pl->pcl_state == PCL_INVALID) { |
| 1749 | /* |
| 1750 | * Remove any invalid links while we are going to the |
| 1751 | * trouble of walking the list. |
| 1752 | */ |
| 1753 | *plpn = pl->pcl_child_next; |
| 1754 | pl->pcl_parent_pc = NULL; |
| 1755 | pl->pcl_child_next = NULL; |
| 1756 | pcachelink_locked_rele(pl); |
| 1757 | } else { |
| 1758 | pl->pcl_state = PCL_STALE; |
| 1759 | plpn = &pl->pcl_child_next; |
| 1760 | mutex_exit(&pl->pcl_lock); |
| 1761 | } |
| 1762 | } |
| 1763 | } |
| 1764 | |
| 1765 | /* |
| 1766 | * Purge all stale (or invalid) child links from a pollcache. |
| 1767 | */ |
| 1768 | static void |
| 1769 | pcachelink_purge_stale(pollcache_t *pcp) |
| 1770 | { |
| 1771 | pcachelink_t *pl, **plpn; |
| 1772 | |
| 1773 | ASSERT(MUTEX_HELD(&pcp->pc_lock)); |
| 1774 | |
| 1775 | plpn = &pcp->pc_children; |
| 1776 | for (pl = pcp->pc_children; pl != NULL; pl = *plpn) { |
| 1777 | mutex_enter(&pl->pcl_lock); |
| 1778 | switch (pl->pcl_state) { |
| 1779 | case PCL_STALE: |
| 1780 | pl->pcl_state = PCL_INVALID; |
| 1781 | /* FALLTHROUGH */ |
| 1782 | case PCL_INVALID: |
| 1783 | *plpn = pl->pcl_child_next; |
| 1784 | pl->pcl_parent_pc = NULL; |
| 1785 | pl->pcl_child_next = NULL; |
| 1786 | pcachelink_locked_rele(pl); |
| 1787 | break; |
| 1788 | default: |
| 1789 | plpn = &pl->pcl_child_next; |
| 1790 | mutex_exit(&pl->pcl_lock); |
| 1791 | } |
| 1792 | } |
| 1793 | } |
| 1794 | |
| 1795 | /* |
| 1796 | * Purge all child and parent links from a pollcache, regardless of status. |
| 1797 | */ |
| 1798 | static void |
| 1799 | pcachelink_purge_all(pollcache_t *pcp) |
| 1800 | { |
| 1801 | pcachelink_t *pl, **plpn; |
| 1802 | |
| 1803 | ASSERT(MUTEX_HELD(&pcp->pc_lock)); |
| 1804 | |
| 1805 | plpn = &pcp->pc_parents; |
| 1806 | for (pl = pcp->pc_parents; pl != NULL; pl = *plpn) { |
| 1807 | mutex_enter(&pl->pcl_lock); |
| 1808 | pl->pcl_state = PCL_INVALID; |
| 1809 | *plpn = pl->pcl_parent_next; |
| 1810 | pl->pcl_child_pc = NULL; |
| 1811 | pl->pcl_parent_next = NULL; |
| 1812 | pcachelink_locked_rele(pl); |
| 1813 | } |
| 1814 | |
| 1815 | plpn = &pcp->pc_children; |
| 1816 | for (pl = pcp->pc_children; pl != NULL; pl = *plpn) { |
| 1817 | mutex_enter(&pl->pcl_lock); |
| 1818 | pl->pcl_state = PCL_INVALID; |
| 1819 | *plpn = pl->pcl_child_next; |
| 1820 | pl->pcl_parent_pc = NULL; |
| 1821 | pl->pcl_child_next = NULL; |
| 1822 | pcachelink_locked_rele(pl); |
| 1823 | } |
| 1824 | |
| 1825 | ASSERT(pcp->pc_parents == NULL); |
| 1826 | ASSERT(pcp->pc_children == NULL); |
| 1827 | } |