/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/signal.h>
#include <sys/cmn_err.h>

#include <sys/stropts.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sodirect.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/atomic.h>

#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/socktpi.h>
#include <sys/ddi.h>
#include <inet/ip.h>
#include <sys/time.h>
#include <sys/cmn_err.h>

#ifdef SOCK_TEST
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

#define	MBLK_PULL_LEN 64
uint32_t so_mblk_pull_len = MBLK_PULL_LEN;

#ifdef DEBUG
boolean_t so_debug_length = B_FALSE;
static boolean_t so_check_length(sonode_t *so);
#endif

int
so_acceptq_enqueue_locked(struct sonode *so, struct sonode *nso)
{
	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
	ASSERT(nso->so_acceptq_next == NULL);

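	/*
	 * so_acceptq_tail always points at the last so_acceptq_next
	 * pointer in the chain (or at so_acceptq_head when the queue is
	 * empty), so appending is a constant-time pointer update.
	 */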
	*so->so_acceptq_tail = nso;
	so->so_acceptq_tail = &nso->so_acceptq_next;
	so->so_acceptq_len++;

	/* Wakeup a single consumer */
	cv_signal(&so->so_acceptq_cv);

	return (so->so_acceptq_len);
}

/*
 * int so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
 *
 * Enqueue an incoming connection on a listening socket.
 *
 * Arguments:
 *	so	- listening socket
 *	nso	- new connection
 *
 * Returns:
 *	Number of queued connections, including the new connection
 */
int
so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
{
	int conns;

	mutex_enter(&so->so_acceptq_lock);
	conns = so_acceptq_enqueue_locked(so, nso);
	mutex_exit(&so->so_acceptq_lock);

	return (conns);
}

static int
so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
	struct sonode *nso = NULL;

	*nsop = NULL;
	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
	while ((nso = so->so_acceptq_head) == NULL) {
		/*
		 * No need to check so_error here, because it is not
		 * possible for a listening socket to be reset or otherwise
		 * disconnected.
		 *
		 * So now we just need to check if it's ok to wait.
		 */
		if (dontblock)
			return (EWOULDBLOCK);
		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

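		/*
		 * cv_wait_sig_swap() returns 0 if the wait was interrupted
		 * by a signal, in which case we return EINTR.
		 */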
		if (cv_wait_sig_swap(&so->so_acceptq_cv,
		    &so->so_acceptq_lock) == 0)
			return (EINTR);
	}

	ASSERT(nso != NULL);
	so->so_acceptq_head = nso->so_acceptq_next;
	nso->so_acceptq_next = NULL;

	if (so->so_acceptq_head == NULL) {
		ASSERT(so->so_acceptq_tail == &nso->so_acceptq_next);
		so->so_acceptq_tail = &so->so_acceptq_head;
	}
	ASSERT(so->so_acceptq_len > 0);
	--so->so_acceptq_len;

	*nsop = nso;

	return (0);
}

/*
 * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
 *
 * Pulls a connection off of the accept queue.
 *
 * Arguments:
 *	so	  - listening socket
 *	dontblock - indicate whether it's ok to sleep if there are no
 *		    connections on the queue
 *	nsop	  - Value-return argument
 *
 * Return values:
 *	0 when a connection is successfully dequeued, in which case nsop
 *	is set to point to the new connection. Upon failure a non-zero
 *	value is returned, and the value of nsop is set to NULL.
 *
 * Note:
 *	so_acceptq_dequeue() may return prematurely if the socket is falling
 *	back to TPI.
 */
int
so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
	int error;

	mutex_enter(&so->so_acceptq_lock);
	error = so_acceptq_dequeue_locked(so, dontblock, nsop);
	mutex_exit(&so->so_acceptq_lock);

	return (error);
}
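
/*
 * Illustrative caller pattern (a sketch only; the real accept path lives
 * elsewhere in sockfs):
 *
 *	struct sonode *nso;
 *	int error = so_acceptq_dequeue(so, dontblock, &nso);
 *
 *	if (error == 0) {
 *		ASSERT(nso != NULL);
 *		... hand nso off to the accepting thread ...
 *	}
 */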

/*
 * void so_acceptq_flush(struct sonode *so)
 *
 * Removes all pending connections from a listening socket, and
 * frees the associated resources.
 *
 * Arguments
 *	so	- listening socket
 *
 * Return values:
 *	None.
 *
 * Note:
 *	The caller has to ensure that no calls to so_acceptq_enqueue() or
 *	so_acceptq_dequeue() occur while the accept queue is being flushed.
 *	So either the socket needs to be in a state where no operations
 *	would come in, or so_lock needs to be obtained.
 */
void
so_acceptq_flush(struct sonode *so)
{
	struct sonode *nso;

	nso = so->so_acceptq_head;

	while (nso != NULL) {
		struct sonode *nnso = NULL;

		nnso = nso->so_acceptq_next;
		nso->so_acceptq_next = NULL;
		/*
		 * Since the socket is on the accept queue, there can
		 * only be one reference. We drop the reference and
		 * just blow off the socket.
		 */
		ASSERT(nso->so_count == 1);
		nso->so_count--;
		socket_destroy(nso);
		nso = nnso;
	}

	so->so_acceptq_head = NULL;
	so->so_acceptq_tail = &so->so_acceptq_head;
	so->so_acceptq_len = 0;
}

int
so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
    sock_connid_t id)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	/*
	 * The protocol has notified us that a connection attempt is being
	 * made, so before we wait for a notification to arrive we must
	 * clear out any errors associated with earlier connection attempts.
	 */
	if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
		so->so_error = 0;

	while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
		if (nonblock)
			return (EINPROGRESS);

		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
			return (EINTR);
	}

	if (so->so_error != 0)
		return (sogeterr(so, B_TRUE));
	/*
	 * Under normal circumstances, so_error should contain an error
	 * in case the connect failed. However, it is possible for another
	 * thread to come in and consume the error, so generate a sensible
	 * error in that case.
	 */
	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ECONNREFUSED);

	return (0);
}

/*
 * int so_wait_connected(struct sonode *so, boolean_t nonblock,
 *    sock_connid_t id)
 *
 * Wait until the socket is connected or an error has occurred.
 *
 * Arguments:
 *	so	 - socket
 *	nonblock - indicate whether it's ok to sleep if the connection has
 *		   not yet been established
 *	id	 - generation number that was returned by the protocol
 *		   when the operation was started
 *
 * Returns:
 *	0 if the connection attempt was successful, or an error indicating why
 *	the connection attempt failed.
 */
int
so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
{
	int error;

	mutex_enter(&so->so_lock);
	error = so_wait_connected_locked(so, nonblock, id);
	mutex_exit(&so->so_lock);

	return (error);
}

int
so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));
	while (so->so_snd_qfull) {
		if (so->so_state & SS_CANTSENDMORE)
			return (EPIPE);
		if (dontblock)
			return (EWOULDBLOCK);

		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (so->so_sndtimeo == 0) {
			/*
			 * Zero means disable timeout.
			 */
			error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
		} else {
			clock_t now;

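			/*
			 * so_sndtimeo is in clock ticks; time_to_wait()
			 * converts it to the absolute wakeup time that
			 * cv_timedwait_sig() expects.
			 */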
			time_to_wait(&now, so->so_sndtimeo);
			error = cv_timedwait_sig(&so->so_snd_cv, &so->so_lock,
			    now);
		}
		if (error == 0)
			return (EINTR);
		else if (error == -1)
			return (EAGAIN);
	}
	return (0);
}

/*
 * int so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
 *
 * Wait for the transport to notify us about send buffers becoming
 * available.
 */
int
so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
{
	int error = 0;

	mutex_enter(&so->so_lock);
	if (so->so_snd_qfull) {
		so->so_snd_wakeup = B_TRUE;
		error = so_snd_wait_qnotfull_locked(so, dontblock);
		so->so_snd_wakeup = B_FALSE;
	}
	mutex_exit(&so->so_lock);

	return (error);
}

void
so_snd_qfull(struct sonode *so)
{
	mutex_enter(&so->so_lock);
	so->so_snd_qfull = B_TRUE;
	mutex_exit(&so->so_lock);
}

void
so_snd_qnotfull(struct sonode *so)
{
	mutex_enter(&so->so_lock);
	so->so_snd_qfull = B_FALSE;
	/* wake up everyone waiting for buffers */
	cv_broadcast(&so->so_snd_cv);
	mutex_exit(&so->so_lock);
}

/*
 * Change the process/process group to which SIGIO is sent.
 */
int
socket_chgpgrp(struct sonode *so, pid_t pid)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));
	if (pid != 0) {
		/*
		 * Permissions check by sending signal 0.
		 * Note that when kill fails it does a
		 * set_errno causing the system call to fail.
		 */
		error = kill(pid, 0);
		if (error != 0) {
			return (error);
		}
	}
	so->so_pgrp = pid;
	return (0);
}


/*
 * Generate a SIGIO, for 'writable' events include siginfo structure,
 * for read events just send the signal.
 */
/*ARGSUSED*/
static void
socket_sigproc(proc_t *proc, int event)
{
	k_siginfo_t info;

	ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));

	if (event & SOCKETSIG_WRITE) {
		info.si_signo = SIGPOLL;
		info.si_code = POLL_OUT;
		info.si_errno = 0;
		info.si_fd = 0;
		info.si_band = 0;
		sigaddq(proc, NULL, &info, KM_NOSLEEP);
	}
	if (event & SOCKETSIG_READ) {
		sigtoproc(proc, NULL, SIGPOLL);
	}
	if (event & SOCKETSIG_URG) {
		sigtoproc(proc, NULL, SIGURG);
	}
}

void
socket_sendsig(struct sonode *so, int event)
{
	proc_t *proc;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
	    event != SOCKETSIG_URG)) {
		return;
	}

	dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));

	if (so->so_pgrp > 0) {
		/*
		 * XXX This unfortunately still generates
		 * a signal when a fd is closed but
		 * the proc is active.
		 */
		mutex_enter(&pidlock);
		proc = prfind(so->so_pgrp);
		if (proc == NULL) {
			mutex_exit(&pidlock);
			return;
		}
		mutex_enter(&proc->p_lock);
		mutex_exit(&pidlock);
		socket_sigproc(proc, event);
		mutex_exit(&proc->p_lock);
	} else {
		/*
		 * Send to process group. Hold pidlock across
		 * calls to socket_sigproc().
		 */
		pid_t pgrp = -so->so_pgrp;

		mutex_enter(&pidlock);
		proc = pgfind(pgrp);
		while (proc != NULL) {
			mutex_enter(&proc->p_lock);
			socket_sigproc(proc, event);
			mutex_exit(&proc->p_lock);
			proc = proc->p_pglink;
		}
		mutex_exit(&pidlock);
	}
}

#define	MIN(a, b) ((a) < (b) ? (a) : (b))
/* Copy userdata into a new mblk_t */
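/*
 * A rough statement of the contract, inferred from the code below: up to
 * 'iosize' bytes are copied from 'uiop' into a chain of mblks, each holding
 * at most 'maxblk' bytes of data, with 'wroff' bytes of headroom and
 * 'tail_len' bytes of tailroom reserved in every block.
 */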
mblk_t *
socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
    size_t tail_len, int *errorp, cred_t *cr)
{
	mblk_t	*head = NULL, **tail = &head;

	ASSERT(iosize == INFPSZ || iosize > 0);

	if (iosize == INFPSZ || iosize > uiop->uio_resid)
		iosize = uiop->uio_resid;

	if (maxblk == INFPSZ)
		maxblk = iosize;

	/* Nothing to do in these cases, so we're done */
	if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
		goto done;

	/*
	 * We will enter the loop below if iosize is 0; it will allocate an
	 * empty message block and call uiomove(9F) which will just return.
	 * We could avoid that with an extra check but would only slow
	 * down the much more likely case where iosize is larger than 0.
	 */
	do {
		ssize_t blocksize;
		mblk_t	*mp;

		blocksize = MIN(iosize, maxblk);
		ASSERT(blocksize >= 0);
		if (is_system_labeled())
			mp = allocb_cred(wroff + blocksize + tail_len,
			    cr, curproc->p_pid);
		else
			mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
		if (mp == NULL) {
			*errorp = ENOMEM;
			return (head);
		}
		mp->b_rptr += wroff;
		mp->b_wptr = mp->b_rptr + blocksize;

		*tail = mp;
		tail = &mp->b_cont;

		/* uiomove(9F) either returns 0 or EFAULT */
		if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
		    UIO_WRITE, uiop)) != 0) {
			ASSERT(*errorp != ENOMEM);
			freemsg(head);
			return (NULL);
		}

		iosize -= blocksize;
	} while (iosize > 0);

done:
	*errorp = 0;
	return (head);
}
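
/*
 * Sketch of a typical call from a send path (illustrative only; the
 * sopp_* values stand in for whatever buffer properties the caller uses):
 *
 *	int error;
 *	mblk_t *mp = socopyinuio(uiop, uiop->uio_resid, sopp_wroff,
 *	    sopp_maxblk, sopp_tail, &error, cr);
 *	if (mp == NULL)
 *		return (error);
 */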

mblk_t *
socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
{
	int error;
	ptrdiff_t n;
	mblk_t *nmp;

	ASSERT(mp->b_wptr >= mp->b_rptr);

	/*
	 * max_read is the offset of the oobmark, and a read cannot go past
	 * the oobmark.
	 */
	if (max_read == INFPSZ || max_read > uiop->uio_resid)
		max_read = uiop->uio_resid;

	do {
		if ((n = MIN(max_read, MBLKL(mp))) != 0) {
			ASSERT(n > 0);

			error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
			if (error != 0) {
				freemsg(mp);
				*errorp = error;
				return (NULL);
			}
		}

		mp->b_rptr += n;
		max_read -= n;
		while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
			/*
			 * get rid of zero length mblks
			 */
			nmp = mp;
			mp = mp->b_cont;
			freeb(nmp);
		}
	} while (mp != NULL && max_read > 0);

	*errorp = 0;
	return (mp);
}

static void
so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
{
	ASSERT(last_tail != NULL);
	mp->b_next = so->so_rcv_q_head;
	mp->b_prev = last_tail;
	ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));

	if (so->so_rcv_q_head == NULL) {
		ASSERT(so->so_rcv_q_last_head == NULL);
		so->so_rcv_q_last_head = mp;
#ifdef DEBUG
	} else {
		ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
#endif
	}
	so->so_rcv_q_head = mp;

#ifdef DEBUG
	if (so_debug_length) {
		mutex_enter(&so->so_lock);
		ASSERT(so_check_length(so));
		mutex_exit(&so->so_lock);
	}
#endif
}

static void
process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
{
	ASSERT(mp_head->b_prev != NULL);
	if (so->so_rcv_q_head == NULL) {
		so->so_rcv_q_head = mp_head;
		so->so_rcv_q_last_head = mp_last_head;
		ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
	} else {
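		/*
		 * Decide whether the new chain can be merged with the
		 * message at the tail of the queue: both must carry the
		 * same DBLK_UIOA (I/OAT) setting.
		 */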
		boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
		    (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));

		if (mp_head->b_next == NULL &&
		    DB_TYPE(mp_head) == M_DATA &&
		    DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
			mp_head->b_prev = NULL;
		} else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
			/*
			 * Append to last_head if there is more than one mblk,
			 * and both mp_head and last_head are I/OAT mblks.
			 */
			ASSERT(mp_head->b_next != NULL);
			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
			mp_head->b_prev = NULL;

			so->so_rcv_q_last_head->b_next = mp_head->b_next;
			mp_head->b_next = NULL;
			so->so_rcv_q_last_head = mp_last_head;
		} else {
#ifdef DEBUG
			{
				mblk_t *tmp_mblk;
				tmp_mblk = mp_head;
				while (tmp_mblk != NULL) {
					ASSERT(tmp_mblk->b_prev != NULL);
					tmp_mblk = tmp_mblk->b_next;
				}
			}
#endif
			so->so_rcv_q_last_head->b_next = mp_head;
			so->so_rcv_q_last_head = mp_last_head;
		}
	}
}

int
so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
    rval_t *rvalp, int flags)
{
	mblk_t	*mp, *nmp;
	mblk_t	*savemp, *savemptail;
	mblk_t	*new_msg_head;
	mblk_t	*new_msg_last_head;
	mblk_t	*last_tail;
	boolean_t partial_read;
	boolean_t reset_atmark = B_FALSE;
	int more = 0;
	int error;
	ssize_t oobmark;
	sodirect_t *sodp = so->so_direct;

	partial_read = B_FALSE;
	*mctlp = NULL;
again:
	mutex_enter(&so->so_lock);
again1:
#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
	/*
	 * First move messages from the dump area to the processing area
	 */
	if (sodp != NULL) {
		/* No need to grab sod_lockp since it points to so_lock */
		if (sodp->sod_state & SOD_ENABLED) {
			ASSERT(sodp->sod_lockp == &so->so_lock);

			if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
				/* nothing to uioamove */
				sodp = NULL;
			} else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
				sodp->sod_uioa.uioa_state &= UIOA_CLR;
				sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
				/*
				 * try to uioamove() the data that
				 * has already been queued.
				 */
				sod_uioa_so_init(so, sodp, uiop);
			}
		} else {
			sodp = NULL;
		}
	}
	new_msg_head = so->so_rcv_head;
	new_msg_last_head = so->so_rcv_last_head;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
	oobmark = so->so_oobmark;
	/*
	 * We can release the lock as there can only be one reader
	 */
	mutex_exit(&so->so_lock);

	if (so->so_state & SS_RCVATMARK) {
		reset_atmark = B_TRUE;
	}
	if (new_msg_head != NULL) {
		process_new_message(so, new_msg_head, new_msg_last_head);
	}
	savemp = savemptail = NULL;
	rvalp->r_val1 = 0;
	error = 0;
	mp = so->so_rcv_q_head;

	if (mp != NULL &&
	    (so->so_rcv_timer_tid == 0 ||
	    so->so_rcv_queued >= so->so_rcv_thresh)) {
		partial_read = B_FALSE;

		if (flags & MSG_PEEK) {
			if ((nmp = dupmsg(mp)) == NULL &&
			    (nmp = copymsg(mp)) == NULL) {
				size_t size = msgsize(mp);

				error = strwaitbuf(size, BPRI_HI);
				if (error) {
					return (error);
				}
				goto again;
			}
			mp = nmp;
		} else {
			ASSERT(mp->b_prev != NULL);
			last_tail = mp->b_prev;
			mp->b_prev = NULL;
			so->so_rcv_q_head = mp->b_next;
			if (so->so_rcv_q_head == NULL) {
				so->so_rcv_q_last_head = NULL;
			}
			mp->b_next = NULL;
		}

		ASSERT(mctlp != NULL);
		/*
		 * First process PROTO or PCPROTO blocks, if any.
		 */
		if (DB_TYPE(mp) != M_DATA) {
			*mctlp = mp;
			savemp = mp;
			savemptail = mp;
			ASSERT(DB_TYPE(mp) == M_PROTO ||
			    DB_TYPE(mp) == M_PCPROTO);
			while (mp->b_cont != NULL &&
			    DB_TYPE(mp->b_cont) != M_DATA) {
				ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
				    DB_TYPE(mp->b_cont) == M_PCPROTO);
				mp = mp->b_cont;
				savemptail = mp;
			}
			mp = savemptail->b_cont;
			savemptail->b_cont = NULL;
		}

		ASSERT(DB_TYPE(mp) == M_DATA);
		/*
		 * Now process DATA blocks, if any. Note that for a
		 * sodirect-enabled socket, uio_resid can be 0.
		 */
		if (uiop->uio_resid >= 0) {
			ssize_t copied = 0;

			if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
				mutex_enter(sodp->sod_lockp);
				ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
				copied = sod_uioa_mblk(so, mp);
				if (copied > 0)
					partial_read = B_TRUE;
				mutex_exit(sodp->sod_lockp);
				/* mark this mblk as processed */
				mp = NULL;
			} else {
				ssize_t oldresid = uiop->uio_resid;

				if (MBLKL(mp) < so_mblk_pull_len) {
					if (pullupmsg(mp, -1) == 1) {
						last_tail = mp;
					}
				}
				/*
				 * Can not read beyond the oobmark
				 */
				mp = socopyoutuio(mp, uiop,
				    oobmark == 0 ? INFPSZ : oobmark, &error);
				if (error != 0) {
					freemsg(*mctlp);
					*mctlp = NULL;
					more = 0;
					goto done;
				}
				ASSERT(oldresid >= uiop->uio_resid);
				copied = oldresid - uiop->uio_resid;
				if (oldresid > uiop->uio_resid)
					partial_read = B_TRUE;
			}
			ASSERT(copied >= 0);
			if (copied > 0 && !(flags & MSG_PEEK)) {
				mutex_enter(&so->so_lock);
				so->so_rcv_queued -= copied;
				ASSERT(so->so_oobmark >= 0);
				if (so->so_oobmark > 0) {
					so->so_oobmark -= copied;
					ASSERT(so->so_oobmark >= 0);
					if (so->so_oobmark == 0) {
						ASSERT(so->so_state &
						    SS_OOBPEND);
						so->so_oobmark = 0;
						so->so_state |= SS_RCVATMARK;
					}
				}
				if (so->so_flowctrld && so->so_rcv_queued <
				    so->so_rcvlowat) {
					so->so_flowctrld = B_FALSE;
					mutex_exit(&so->so_lock);
					/*
					 * open up flow control
					 */
					(*so->so_downcalls->sd_clr_flowctrl)
					    (so->so_proto_handle);
				} else {
					mutex_exit(&so->so_lock);
				}
			}
		}
		if (mp != NULL) { /* more data blocks in msg */
			more |= MOREDATA;
			if ((flags & (MSG_PEEK|MSG_TRUNC))) {
				if (flags & MSG_TRUNC &&
				    ((flags & MSG_PEEK) == 0)) {
					mutex_enter(&so->so_lock);
					so->so_rcv_queued -= msgdsize(mp);
					mutex_exit(&so->so_lock);
				}
				freemsg(mp);
			} else if (partial_read && !somsghasdata(mp)) {
				/*
				 * Avoid queuing a zero-length tail part of
				 * a message. partial_read == 1 indicates that
				 * we read some of the message.
				 */
				freemsg(mp);
				more &= ~MOREDATA;
			} else {
				if (savemp != NULL &&
				    (flags & MSG_DUPCTRL)) {
					mblk_t *nmp;
					/*
					 * There should only be non-data mblks
					 */
					ASSERT(DB_TYPE(savemp) != M_DATA &&
					    DB_TYPE(savemptail) != M_DATA);
try_again:
					if ((nmp = dupmsg(savemp)) == NULL &&
					    (nmp = copymsg(savemp)) == NULL) {

						size_t size = msgsize(savemp);

						error = strwaitbuf(size,
						    BPRI_HI);
						if (error != 0) {
							/*
							 * In case we
							 * cannot copy
							 * control data,
							 * free the remaining
							 * data.
							 */
							freemsg(mp);
							goto done;
						}
						goto try_again;
					}

					ASSERT(nmp != NULL);
					ASSERT(DB_TYPE(nmp) != M_DATA);
					savemptail->b_cont = mp;
					*mctlp = nmp;
					mp = savemp;
				}
				/*
				 * putback mp
				 */
				so_prepend_msg(so, mp, last_tail);
			}
		}

		/* fast check so_rcv_head if there is more data */
		if (partial_read && !(so->so_state & SS_RCVATMARK) &&
		    *mctlp == NULL && uiop->uio_resid > 0 &&
		    !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
			goto again;
		}
	} else if (!partial_read) {
		mutex_enter(&so->so_lock);
		if (so->so_error != 0) {
			error = sogeterr(so, !(flags & MSG_PEEK));
			mutex_exit(&so->so_lock);
			return (error);
		}
		/*
		 * No pending data. Return right away for nonblocking
		 * socket, otherwise sleep waiting for data.
		 */
		if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
			if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
			    (flags & MSG_DONTWAIT)) {
				error = EWOULDBLOCK;
			} else {
				if (so->so_state & (SS_CLOSING |
				    SS_FALLBACK_PENDING)) {
					mutex_exit(&so->so_lock);
					error = EINTR;
					goto done;
				}

				if (so->so_rcv_head != NULL) {
					goto again1;
				}
				so->so_rcv_wakeup = B_TRUE;
				so->so_rcv_wanted = uiop->uio_resid;
				if (so->so_rcvtimeo == 0) {
					/*
					 * Zero means disable timeout.
					 */
					error = cv_wait_sig(&so->so_rcv_cv,
					    &so->so_lock);
				} else {
					clock_t now;
					time_to_wait(&now, so->so_rcvtimeo);
					error = cv_timedwait_sig(&so->so_rcv_cv,
					    &so->so_lock, now);
				}
				so->so_rcv_wakeup = B_FALSE;
				so->so_rcv_wanted = 0;

				if (error == 0) {
					error = EINTR;
				} else if (error == -1) {
					error = EAGAIN;
				} else {
					goto again1;
				}
			}
		}
		mutex_exit(&so->so_lock);
	}
	if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
		/*
		 * We are past the mark; update the state.
		 * 4.3BSD and 4.4BSD clear the mark when peeking across it.
		 * The draft POSIX socket spec states that the mark should
		 * not be cleared when peeking. We follow the latter.
		 */
		mutex_enter(&so->so_lock);
		ASSERT(so_verify_oobstate(so));
		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
		freemsg(so->so_oobmsg);
		so->so_oobmsg = NULL;
		ASSERT(so_verify_oobstate(so));
		mutex_exit(&so->so_lock);
	}
	ASSERT(so->so_rcv_wakeup == B_FALSE);
done:
	if (sodp != NULL) {
		mutex_enter(sodp->sod_lockp);
		if ((sodp->sod_state & SOD_ENABLED) &&
		    (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
			SOD_UIOAFINI(sodp);
			if (sodp->sod_uioa.uioa_mbytes > 0) {
				ASSERT(so->so_rcv_q_head != NULL ||
				    so->so_rcv_head != NULL);
				so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
				if (error == EWOULDBLOCK)
					error = 0;
			}
		}
		mutex_exit(sodp->sod_lockp);
	}
#ifdef DEBUG
	if (so_debug_length) {
		mutex_enter(&so->so_lock);
		ASSERT(so_check_length(so));
		mutex_exit(&so->so_lock);
	}
#endif
	rvalp->r_val1 = more;
	return (error);
}

void
so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
	so->so_rcv_queued += msg_size;

	if (so->so_rcv_head == NULL) {
		ASSERT(so->so_rcv_last_head == NULL);
		so->so_rcv_head = mp;
		so->so_rcv_last_head = mp;
	} else if ((DB_TYPE(mp) == M_DATA &&
	    DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
	    ((DB_FLAGS(mp) & DBLK_UIOA) ==
	    (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
		/* Added to the end */
		ASSERT(so->so_rcv_last_head != NULL);
		ASSERT(so->so_rcv_last_head->b_prev != NULL);
		so->so_rcv_last_head->b_prev->b_cont = mp;
	} else {
		/* Start a new end */
		so->so_rcv_last_head->b_next = mp;
		so->so_rcv_last_head = mp;
	}
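	/*
	 * Record the last mblk of this message in b_prev of the message
	 * head, so the next append can find the tail in constant time.
	 */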
	while (mp->b_cont != NULL)
		mp = mp->b_cont;

	so->so_rcv_last_head->b_prev = mp;
#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
}

/*
 * Return B_TRUE if there is data in the message, B_FALSE otherwise.
 */
boolean_t
somsghasdata(mblk_t *mp)
{
	for (; mp; mp = mp->b_cont)
		if (mp->b_datap->db_type == M_DATA) {
			ASSERT(mp->b_wptr >= mp->b_rptr);
			if (mp->b_wptr > mp->b_rptr)
				return (B_TRUE);
		}
	return (B_FALSE);
}

/*
 * Flush the read side of sockfs.
 *
 * The caller must be sure that a reader is not already active when the
 * buffer is being flushed.
 */
void
so_rcv_flush(struct sonode *so)
{
	mblk_t	*mp;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_oobmsg != NULL) {
		freemsg(so->so_oobmsg);
		so->so_oobmsg = NULL;
		so->so_oobmark = 0;
		so->so_state &=
		    ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
	}

	/*
	 * Free messages sitting in the send and recv queue
	 */
	while (so->so_rcv_q_head != NULL) {
		mp = so->so_rcv_q_head;
		so->so_rcv_q_head = mp->b_next;
		mp->b_next = mp->b_prev = NULL;
		freemsg(mp);
	}
	while (so->so_rcv_head != NULL) {
		mp = so->so_rcv_head;
		so->so_rcv_head = mp->b_next;
		mp->b_next = mp->b_prev = NULL;
		freemsg(mp);
	}
	so->so_rcv_queued = 0;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
}

/*
 * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
 */
int
sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
    boolean_t oob_inline)
{
	mblk_t	*mp, *nmp;
	int error;

	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
	    flags));

	if (msg != NULL) {
		/*
		 * There is never any oob data with addresses or control since
		 * the T_EXDATA_IND does not carry any options.
		 */
		msg->msg_controllen = 0;
		msg->msg_namelen = 0;
		msg->msg_flags = 0;
	}

	mutex_enter(&so->so_lock);
	ASSERT(so_verify_oobstate(so));
	if (oob_inline ||
	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
		mutex_exit(&so->so_lock);
		return (EINVAL);
	}
	if (!(so->so_state & SS_HAVEOOBDATA)) {
		dprintso(so, 1, ("sorecvoob: no data yet\n"));
		mutex_exit(&so->so_lock);
		return (EWOULDBLOCK);
	}
	ASSERT(so->so_oobmsg != NULL);
	mp = so->so_oobmsg;
	if (flags & MSG_PEEK) {
		/*
		 * Since recv* cannot return ENOBUFS we cannot use dupmsg.
		 * Instead we revert to the consolidation private
		 * allocb_wait plus bcopy.
		 */
		mblk_t *mp1;

		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
		ASSERT(mp1);

		while (mp != NULL) {
			ssize_t size;

			size = MBLKL(mp);
			bcopy(mp->b_rptr, mp1->b_wptr, size);
			mp1->b_wptr += size;
			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
			mp = mp->b_cont;
		}
		mp = mp1;
	} else {
		/*
		 * Update the state indicating that the data has been consumed.
		 * Keep SS_OOBPEND set until data is consumed past the mark.
		 */
		so->so_oobmsg = NULL;
		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
	}
	ASSERT(so_verify_oobstate(so));
	mutex_exit(&so->so_lock);

	error = 0;
	nmp = mp;
	while (nmp != NULL && uiop->uio_resid > 0) {
		ssize_t n = MBLKL(nmp);

		n = MIN(n, uiop->uio_resid);
		if (n > 0)
			error = uiomove(nmp->b_rptr, n,
			    UIO_READ, uiop);
		if (error)
			break;
		nmp = nmp->b_cont;
	}
	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
	freemsg(mp);
	return (error);
}

/*
 * Allocate and initialize a sonode
 */
/* ARGSUSED */
struct sonode *
socket_sonode_create(struct sockparams *sp, int family, int type,
    int protocol, int version, int sflags, int *errorp, struct cred *cr)
{
	sonode_t *so;
	int	kmflags;

	/*
	 * Choose the right set of sonodeops based on the upcall and
	 * downcall version that the protocol has provided.
	 */
	if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
	    SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
		/*
		 * mismatch
		 */
#ifdef DEBUG
		cmn_err(CE_CONT, "protocol and socket module version mismatch");
#endif
		*errorp = EINVAL;
		return (NULL);
	}

	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;

	so = kmem_cache_alloc(socket_cache, kmflags);
	if (so == NULL) {
		*errorp = ENOMEM;
		return (NULL);
	}

	sonode_init(so, sp, family, type, protocol, &so_sonodeops);

	if (version == SOV_DEFAULT)
		version = so_default_version;

	so->so_version = (short)version;

	/*
	 * Set the default values to be INFPSZ; if a protocol desires,
	 * it can change the value later.
	 */
	so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
	so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
	so->so_proto_props.sopp_maxpsz = INFPSZ;
	so->so_proto_props.sopp_maxblk = INFPSZ;

	return (so);
}

int
socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
{
	int error = 0;

	if (pso != NULL) {
		/*
		 * We have a passive open, so inherit basic state from
		 * the parent (listener).
		 *
		 * No need to grab the new sonode's lock, since there is no
		 * one that can have a reference to it.
		 */
		mutex_enter(&pso->so_lock);

		so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
		so->so_pgrp = pso->so_pgrp;
		so->so_rcvtimeo = pso->so_rcvtimeo;
		so->so_sndtimeo = pso->so_sndtimeo;
		so->so_xpg_rcvbuf = pso->so_xpg_rcvbuf;
		/*
		 * Make note of the socket level options. TCP and IP level
		 * options are already inherited. We could do all this after
		 * accept is successful, but doing it here simplifies code and
		 * no harm is done for the error case.
		 */
		so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
		    SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
		    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
		so->so_proto_props = pso->so_proto_props;
		so->so_mode = pso->so_mode;
		so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;

		mutex_exit(&pso->so_lock);

		if (uioasync.enabled) {
			sod_sock_init(so, NULL, NULL, NULL, &so->so_lock);
		}
		return (0);
	} else {
		struct sockparams *sp = so->so_sockparams;
		sock_upcalls_t *upcalls_to_use;

		/*
		 * Based on the version number select the right upcalls to
		 * pass down. Currently we only have one version, so choose
		 * the default.
		 */
		upcalls_to_use = &so_upcalls;

		/* active open, so create a lower handle */
		so->so_proto_handle =
		    sp->sp_smod_info->smod_proto_create_func(so->so_family,
		    so->so_type, so->so_protocol, &so->so_downcalls,
		    &so->so_mode, &error, flags, cr);

		if (so->so_proto_handle == NULL) {
			ASSERT(error != 0);
			/*
			 * To be safe; if a lower handle cannot be created, and
			 * the proto does not give a reason why, assume there
			 * was a lack of memory.
			 */
			return ((error == 0) ? ENOMEM : error);
		}
		ASSERT(so->so_downcalls != NULL);
		ASSERT(so->so_downcalls->sd_send != NULL ||
		    so->so_downcalls->sd_send_uio != NULL);
		if (so->so_downcalls->sd_recv_uio != NULL) {
			ASSERT(so->so_downcalls->sd_poll != NULL);
			so->so_pollev |= SO_POLLEV_ALWAYS;
		}

		(*so->so_downcalls->sd_activate)(so->so_proto_handle,
		    (sock_upper_handle_t)so, upcalls_to_use, 0, cr);

		/* Wildcard */

		/*
		 * FIXME No need for this, the protocol can deal with it in
		 * sd_create(). Should update ICMP.
		 */
		if (so->so_protocol != so->so_sockparams->sp_protocol) {
			int protocol = so->so_protocol;
			int error;
			/*
			 * Issue SO_PROTOTYPE setsockopt.
			 */
			error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
			if (error) {
				(void) (*so->so_downcalls->sd_close)
				    (so->so_proto_handle, 0, cr);

				mutex_enter(&so->so_lock);
				so_rcv_flush(so);
				mutex_exit(&so->so_lock);
				/*
				 * Setsockopt often fails with ENOPROTOOPT but
				 * socket() should fail with
				 * EPROTONOSUPPORT/EPROTOTYPE.
				 */
				return (EPROTONOSUPPORT);
			}
		}
		return (0);
	}
}

/*
 * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
 *     struct cred *cr, int32_t *rvalp)
 *
 * Handle ioctls that manipulate basic socket state; non-blocking,
 * async, etc.
 *
 * Returns:
 *	< 0  - ioctl was not handled
 *	>= 0 - ioctl was handled, if > 0, then it is an errno
 *
 * Notes:
 *	Assumes the standard receive buffer is used to obtain info for
 *	NREAD.
 */
/* ARGSUSED */
int
socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	switch (cmd) {
	case SIOCSQPTR:
		/*
		 * SIOCSQPTR is valid only when helper stream is created
		 * by the protocol.
		 */

		return (EOPNOTSUPP);
	case FIONBIO: {
		int32_t value;

		if (so_copyin((void *)arg, &value, sizeof (int32_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		if (value) {
			so->so_state |= SS_NDELAY;
		} else {
			so->so_state &= ~SS_NDELAY;
		}
		mutex_exit(&so->so_lock);
		return (0);
	}
	case FIOASYNC: {
		int32_t value;

		if (so_copyin((void *)arg, &value, sizeof (int32_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);

		if (value) {
			/* Turn on SIGIO */
			so->so_state |= SS_ASYNC;
		} else {
			/* Turn off SIGIO */
			so->so_state &= ~SS_ASYNC;
		}
		mutex_exit(&so->so_lock);

		return (0);
	}

	case SIOCSPGRP:
	case FIOSETOWN: {
		int error;
		pid_t pid;

		if (so_copyin((void *)arg, &pid, sizeof (pid_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
		mutex_exit(&so->so_lock);
		return (error);
	}
	case SIOCGPGRP:
	case FIOGETOWN:
		if (so_copyout(&so->so_pgrp, (void *)arg,
		    sizeof (pid_t), (mode & (int)FKIOCTL)))
			return (EFAULT);

		return (0);
	case SIOCATMARK: {
		int retval;

		/*
		 * Only protocols that support urgent data can handle ATMARK.
		 */
		if ((so->so_mode & SM_EXDATA) == 0)
			return (EINVAL);

		/*
		 * If the protocol is maintaining its own buffer, then the
		 * request must be passed down.
		 */
		if (so->so_downcalls->sd_recv_uio != NULL)
			return (-1);

		retval = (so->so_state & SS_RCVATMARK) != 0;

		if (so_copyout(&retval, (void *)arg, sizeof (int),
		    (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	}

	case FIONREAD: {
		int retval;

		/*
		 * If the protocol is maintaining its own buffer, then the
		 * request must be passed down.
		 */
		if (so->so_downcalls->sd_recv_uio != NULL)
			return (-1);

		retval = MIN(so->so_rcv_queued, INT_MAX);

		if (so_copyout(&retval, (void *)arg,
		    sizeof (retval), (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	}

	case _I_GETPEERCRED: {
		int error = 0;

		if ((mode & FKIOCTL) == 0)
			return (EINVAL);

		mutex_enter(&so->so_lock);
		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
			error = ENOTSUP;
		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
			error = ENOTCONN;
		} else if (so->so_peercred != NULL) {
			k_peercred_t *kp = (k_peercred_t *)arg;
			kp->pc_cr = so->so_peercred;
			kp->pc_cpid = so->so_cpid;
			crhold(so->so_peercred);
		} else {
			error = EINVAL;
		}
		mutex_exit(&so->so_lock);
		return (error);
	}
	default:
		return (-1);
	}
}

/*
 * Process STREAMS related ioctls. If a I_PUSH/POP operation is specified
 * then the socket will fall back to TPI.
 *
 * Returns:
 *	< 0  - ioctl was not handled
 *	>= 0 - ioctl was handled, if > 0, then it is an errno
 */
int
socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	switch (cmd) {
	case _I_INSERT:
	case _I_REMOVE:
	case I_FIND:
	case I_LIST:
		return (EOPNOTSUPP);

	case I_PUSH:
	case I_POP: {
		int retval;

		if ((retval = so_tpi_fallback(so, cr)) == 0) {
			/* Reissue the ioctl */
			ASSERT(so->so_rcv_q_head == NULL);
			return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
		}
		return (retval);
	}
	case I_LOOK:
		if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
		    (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	default:
		return (-1);
	}
}

int
socket_getopt_common(struct sonode *so, int level, int option_name,
    void *optval, socklen_t *optlenp, int flags)
{
	if (level != SOL_SOCKET)
		return (-1);

	switch (option_name) {
	case SO_ERROR:
	case SO_DOMAIN:
	case SO_TYPE:
	case SO_ACCEPTCONN: {
		int32_t value;
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t)) {
			return (EINVAL);
		}

		switch (option_name) {
		case SO_ERROR:
			mutex_enter(&so->so_lock);
			value = sogeterr(so, B_TRUE);
			mutex_exit(&so->so_lock);
			break;
		case SO_DOMAIN:
			value = so->so_family;
			break;
		case SO_TYPE:
			value = so->so_type;
			break;
		case SO_ACCEPTCONN:
			if (so->so_state & SS_ACCEPTCONN)
				value = SO_ACCEPTCONN;
			else
				value = 0;
			break;
		}

		bcopy(&value, optval, sizeof (value));
		*optlenp = sizeof (value);

		return (0);
	}
	case SO_SNDTIMEO:
	case SO_RCVTIMEO: {
		clock_t value;
		socklen_t optlen = *optlenp;

		if (get_udatamodel() == DATAMODEL_NONE ||
		    get_udatamodel() == DATAMODEL_NATIVE) {
			if (optlen < sizeof (struct timeval))
				return (EINVAL);
		} else {
			if (optlen < sizeof (struct timeval32))
				return (EINVAL);
		}
		if (option_name == SO_RCVTIMEO)
			value = drv_hztousec(so->so_rcvtimeo);
		else
			value = drv_hztousec(so->so_sndtimeo);

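		/*
		 * value is now in microseconds; split it into the
		 * seconds/microseconds pair expected in the caller's
		 * timeval.
		 */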
		if (get_udatamodel() == DATAMODEL_NONE ||
		    get_udatamodel() == DATAMODEL_NATIVE) {
			((struct timeval *)(optval))->tv_sec =
			    value / (1000 * 1000);
			((struct timeval *)(optval))->tv_usec =
			    value % (1000 * 1000);
			*optlenp = sizeof (struct timeval);
		} else {
			((struct timeval32 *)(optval))->tv_sec =
			    value / (1000 * 1000);
			((struct timeval32 *)(optval))->tv_usec =
			    value % (1000 * 1000);
			*optlenp = sizeof (struct timeval32);
		}
		return (0);
	}
	case SO_DEBUG:
	case SO_REUSEADDR:
	case SO_KEEPALIVE:
	case SO_DONTROUTE:
	case SO_BROADCAST:
	case SO_USELOOPBACK:
	case SO_OOBINLINE:
	case SO_SNDBUF:
#ifdef notyet
	case SO_SNDLOWAT:
	case SO_RCVLOWAT:
#endif /* notyet */
	case SO_DGRAM_ERRIND: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t))
			return (EINVAL);
		break;
	}
	case SO_RCVBUF: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t))
			return (EINVAL);

		if ((flags & _SOGETSOCKOPT_XPG4_2) && so->so_xpg_rcvbuf != 0) {
			/*
			 * XXX If SO_RCVBUF has been set and this is an
			 * XPG 4.2 application then do not ask the transport
			 * since the transport might adjust the value and not
			 * return exactly what was set by the application.
			 * For non-XPG 4.2 application we return the value
			 * that the transport is actually using.
			 */
			*(int32_t *)optval = so->so_xpg_rcvbuf;
			*optlenp = sizeof (so->so_xpg_rcvbuf);
			return (0);
		}
		/*
		 * If the option has not been set then get a default
		 * value from the transport.
		 */
		break;
	}
	case SO_LINGER: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (struct linger))
			return (EINVAL);
		break;
	}
	case SO_SND_BUFINFO: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
			return (EINVAL);
		((struct so_snd_bufinfo *)(optval))->sbi_wroff =
		    (so->so_proto_props).sopp_wroff;
		((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
		    (so->so_proto_props).sopp_maxblk;
		((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
		    (so->so_proto_props).sopp_maxpsz;
		((struct so_snd_bufinfo *)(optval))->sbi_tail =
		    (so->so_proto_props).sopp_tail;
		*optlenp = sizeof (struct so_snd_bufinfo);
		return (0);
	}
	default:
		break;
	}

	/* Unknown Option */
	return (-1);
}

void
socket_sonode_destroy(struct sonode *so)
{
	sonode_fini(so);
	kmem_cache_free(socket_cache, so);
}

int
so_zcopy_wait(struct sonode *so)
{
	int error = 0;

	mutex_enter(&so->so_lock);
	while (!(so->so_copyflag & STZCNOTIFY)) {
		if (so->so_state & SS_CLOSING) {
			mutex_exit(&so->so_lock);
			return (EINTR);
		}
		if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
			error = EINTR;
			break;
		}
	}
	so->so_copyflag &= ~STZCNOTIFY;
	mutex_exit(&so->so_lock);
	return (error);
}

void
so_timer_callback(void *arg)
{
	struct sonode *so = (struct sonode *)arg;

	mutex_enter(&so->so_lock);

	so->so_rcv_timer_tid = 0;
	if (so->so_rcv_queued > 0) {
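		/* so_notify_data() is responsible for dropping so_lock. */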
		so_notify_data(so, so->so_rcv_queued);
	} else {
		mutex_exit(&so->so_lock);
	}
}

#ifdef DEBUG
/*
 * Verify that the length stored in so_rcv_queued and the length of data blocks
 * queued is the same.
 */
static boolean_t
so_check_length(sonode_t *so)
{
	mblk_t *mp = so->so_rcv_q_head;
	int len = 0;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (mp != NULL) {
		len = msgdsize(mp);
		while ((mp = mp->b_next) != NULL)
			len += msgdsize(mp);
	}
	mp = so->so_rcv_head;
	if (mp != NULL) {
		len += msgdsize(mp);
		while ((mp = mp->b_next) != NULL)
			len += msgdsize(mp);
	}
	return ((len == so->so_rcv_queued) ? B_TRUE : B_FALSE);
}
#endif

int
so_get_mod_version(struct sockparams *sp)
{
	ASSERT(sp != NULL && sp->sp_smod_info != NULL);
	return (sp->sp_smod_info->smod_version);
}

/*
 * so_start_fallback()
 *
 * Block new socket operations from coming in, and wait for active operations
 * to complete. Threads that are sleeping will be woken up so they can get
 * out of the way.
 *
 * The caller must be a reader on so_fallback_rwlock.
 */
static boolean_t
so_start_fallback(struct sonode *so)
{
	ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));

	mutex_enter(&so->so_lock);
	if (so->so_state & SS_FALLBACK_PENDING) {
		mutex_exit(&so->so_lock);
		return (B_FALSE);
	}
	so->so_state |= SS_FALLBACK_PENDING;
	/*
	 * Poke all threads that might be sleeping. Any operation that comes
	 * in after the cv_broadcast will observe the fallback pending flag,
	 * which causes the call to return where it would normally sleep.
	 */
	cv_broadcast(&so->so_state_cv);		/* threads in connect() */
	cv_broadcast(&so->so_rcv_cv);		/* threads in recvmsg() */
	cv_broadcast(&so->so_snd_cv);		/* threads in sendmsg() */
	mutex_enter(&so->so_acceptq_lock);
	cv_broadcast(&so->so_acceptq_cv);	/* threads in accept() */
	mutex_exit(&so->so_acceptq_lock);
	mutex_exit(&so->so_lock);

	/*
	 * The main reason for the rw_tryupgrade call is to provide
	 * observability during the fallback process. We want to
	 * be able to see if there are pending operations.
	 */
	if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
		/*
		 * It is safe to drop and reacquire the fallback lock, because
		 * we are guaranteed that another fallback cannot take place.
		 */
		rw_exit(&so->so_fallback_rwlock);
		DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
		rw_enter(&so->so_fallback_rwlock, RW_WRITER);
		DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
	}

	return (B_TRUE);
}

/*
 * so_end_fallback()
 *
 * Allow socket operations back in.
 *
 * The caller must be a writer on so_fallback_rwlock.
 */
static void
so_end_fallback(struct sonode *so)
{
	ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));

	mutex_enter(&so->so_lock);
	so->so_state &= ~SS_FALLBACK_PENDING;
	mutex_exit(&so->so_lock);

	rw_downgrade(&so->so_fallback_rwlock);
}

/*
 * so_quiesced_cb()
 *
 * Callback passed to the protocol during fallback. It is called once
 * the endpoint is quiescent.
 *
 * No requests from the user, no notifications from the protocol, so it
 * is safe to synchronize the state. Data can also be moved without
 * risk for reordering.
 *
 * NOTE: urgent data is dropped on the floor.
 *
 * We do not need to hold so_lock, since there can be only one thread
 * operating on the sonode.
 */
static void
so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
    struct T_capability_ack *tcap, struct sockaddr *laddr, socklen_t laddrlen,
    struct sockaddr *faddr, socklen_t faddrlen, short opts)
{
	struct sonode *so = (struct sonode *)sock_handle;

	sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);

	mutex_enter(&so->so_lock);
	SOCKET_TIMER_CANCEL(so);
	mutex_exit(&so->so_lock);
	/*
	 * Move data to the STREAM head.
	 */
	if (so->so_rcv_head != NULL) {
		if (so->so_rcv_q_last_head == NULL)
			so->so_rcv_q_head = so->so_rcv_head;
		else
			so->so_rcv_q_last_head->b_next = so->so_rcv_head;
		so->so_rcv_q_last_head = so->so_rcv_last_head;
	}

	while (so->so_rcv_q_head != NULL) {
		mblk_t *mp = so->so_rcv_q_head;
		size_t mlen = msgdsize(mp);

		so->so_rcv_q_head = mp->b_next;
		mp->b_next = NULL;
		mp->b_prev = NULL;
		so->so_rcv_queued -= mlen;
		putnext(q, mp);
	}
	ASSERT(so->so_rcv_queued == 0);
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;

#ifdef DEBUG
	if (so->so_oobmsg != NULL || so->so_oobmark > 0) {
		cmn_err(CE_NOTE, "losing oob data due to tpi fallback\n");
	}
#endif
	if (so->so_oobmsg != NULL) {
		freemsg(so->so_oobmsg);
		so->so_oobmsg = NULL;
	}
	so->so_oobmark = 0;

	ASSERT(so->so_rcv_queued == 0);
}

/*
 * so_tpi_fallback()
 *
 * This is the fallback initiation routine; things start here.
 *
 * Basic strategy:
 *	o Block new socket operations from coming in
 *	o Allocate/initialize info needed by TPI
 *	o Quiesce the connection, at which point we sync
 *	  state and move data
 *	o Change operations (sonodeops) associated with the socket
 *	o Unblock threads waiting for the fallback to finish
 */
int
so_tpi_fallback(struct sonode *so, struct cred *cr)
{
	int error;
	queue_t *q;
	struct sockparams *sp;
	struct sockparams *newsp;
	so_proto_fallback_func_t fbfunc;
	boolean_t direct;

	error = 0;
	sp = so->so_sockparams;
	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;

	/*
	 * Fallback can only happen if there is a device associated
	 * with the sonode, and the socket module has a fallback function.
	 */
	if (!SOCKPARAMS_HAS_DEVICE(sp) || fbfunc == NULL)
		return (EINVAL);

	/*
	 * Initiate fallback; upon success we know that no new requests
	 * will come in from the user.
	 */
	if (!so_start_fallback(so))
		return (EAGAIN);

	newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
	    so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
	    KM_SLEEP, &error);
	if (error != 0)
		goto out;

	if (so->so_direct != NULL) {
		sodirect_t *sodp = so->so_direct;
		mutex_enter(sodp->sod_lockp);

		so->so_direct->sod_state &= ~SOD_ENABLED;
		so->so_state &= ~SS_SODIRECT;
		ASSERT(sodp->sod_uioafh == NULL);
		mutex_exit(sodp->sod_lockp);
	}

	/* Turn sonode into a TPI socket */
	q = sotpi_convert_sonode(so, newsp, &direct, cr);
	if (q == NULL) {
		zcmn_err(getzoneid(), CE_WARN,
		    "Failed to convert socket to TPI. Pid = %d\n",
		    curproc->p_pid);
		SOCKPARAMS_DEC_REF(newsp);
		error = EINVAL;
		goto out;
	}

	/*
	 * Now tell the protocol to start using TPI. so_quiesced_cb() will
	 * be called once it's safe to synchronize state.
	 */
	DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
	/* FIXME assumes this cannot fail. TCP can fail to enter squeue */
	(*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
	DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);

	/*
	 * Free all pending connection indications, i.e., socket_accept() has
	 * not yet pulled the connection off the queue. The transport sent
	 * a T_CONN_IND message for each pending connection to the STREAM head.
	 */
	so_acceptq_flush(so);

	mutex_enter(&so->so_lock);
	so->so_state |= SS_FALLBACK_COMP;
	mutex_exit(&so->so_lock);

	/*
	 * Swap the sonode ops. Socket operations that come in once this
	 * is done will proceed without blocking.
	 */
	so->so_ops = &sotpi_sonodeops;

	/*
	 * No longer a non-STREAMS socket
	 */
	so->so_not_str = B_FALSE;
	/*
	 * Wake up any threads stuck in poll. This is needed since the poll
	 * head changes when the fallback happens (moves from the sonode to
	 * the STREAMS head).
	 */
	pollwakeup(&so->so_poll_list, POLLERR);
out:
	so_end_fallback(so);

	return (error);
}