/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/signal.h>
#include <sys/cmn_err.h>

#include <sys/stropts.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sodirect.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/atomic.h>

#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/socktpi.h>
#include <sys/ddi.h>
#include <inet/ip.h>
#include <sys/time.h>
#include <sys/cmn_err.h>

#ifdef SOCK_TEST
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

#define	MBLK_PULL_LEN 64
uint32_t so_mblk_pull_len = MBLK_PULL_LEN;

#ifdef DEBUG
boolean_t so_debug_length = B_FALSE;
static boolean_t so_check_length(sonode_t *so);
#endif
int
so_acceptq_enqueue_locked(struct sonode *so, struct sonode *nso)
{
	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
	ASSERT(nso->so_acceptq_next == NULL);

	*so->so_acceptq_tail = nso;
	so->so_acceptq_tail = &nso->so_acceptq_next;
	so->so_acceptq_len++;

	/* Wakeup a single consumer */
	cv_signal(&so->so_acceptq_cv);

	return (so->so_acceptq_len);
}

/*
 * int so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
 *
 * Enqueue an incoming connection on a listening socket.
 *
 * Arguments:
 *	so	- listening socket
 *	nso	- new connection
 *
 * Returns:
 *	Number of queued connections, including the new connection
 */
int
so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
{
	int conns;

	mutex_enter(&so->so_acceptq_lock);
	conns = so_acceptq_enqueue_locked(so, nso);
	mutex_exit(&so->so_acceptq_lock);

	return (conns);
}

static int
so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
	struct sonode *nso = NULL;

	*nsop = NULL;
	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
	while ((nso = so->so_acceptq_head) == NULL) {
		/*
		 * No need to check so_error here, because it is not
		 * possible for a listening socket to be reset or otherwise
		 * disconnected.
		 *
		 * So now we just need to check whether it's ok to wait.
		 */
		if (dontblock)
			return (EWOULDBLOCK);
		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (cv_wait_sig_swap(&so->so_acceptq_cv,
		    &so->so_acceptq_lock) == 0)
			return (EINTR);
	}

	ASSERT(nso != NULL);
	so->so_acceptq_head = nso->so_acceptq_next;
	nso->so_acceptq_next = NULL;

	if (so->so_acceptq_head == NULL) {
		ASSERT(so->so_acceptq_tail == &nso->so_acceptq_next);
		so->so_acceptq_tail = &so->so_acceptq_head;
	}
	ASSERT(so->so_acceptq_len > 0);
	--so->so_acceptq_len;

	*nsop = nso;

	return (0);
}

/*
 * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
 *
 * Pulls a connection off of the accept queue.
 *
 * Arguments:
 *	so	- listening socket
 *	dontblock - indicate whether it's ok to sleep if there are no
 *		    connections on the queue
 *	nsop	- Value-return argument
 *
 * Return values:
 *	0 when a connection is successfully dequeued, in which case nsop
 *	is set to point to the new connection. Upon failure a non-zero
 *	value is returned, and the value of nsop is set to NULL.
 *
 * Note:
 *	so_acceptq_dequeue() may return prematurely if the socket is falling
 *	back to TPI.
 */
int
so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
	int error;

	mutex_enter(&so->so_acceptq_lock);
	error = so_acceptq_dequeue_locked(so, dontblock, nsop);
	mutex_exit(&so->so_acceptq_lock);

	return (error);
}
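
/*
 * Illustrative sketch, not part of the original file: a simplified
 * accept path might drive so_acceptq_dequeue() as below, deriving
 * dontblock from the open file's mode. The function name and the
 * SOCK_EXAMPLES guard are assumptions for illustration only.
 */
#ifdef SOCK_EXAMPLES
static int
example_accept_one(struct sonode *lso, int fmode, struct sonode **nsop)
{
	boolean_t dontblock = (fmode & (FNDELAY|FNONBLOCK)) != 0;

	/* EWOULDBLOCK if nonblocking and the accept queue is empty */
	return (so_acceptq_dequeue(lso, dontblock, nsop));
}
#endif /* SOCK_EXAMPLES */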

/*
 * void so_acceptq_flush(struct sonode *so)
 *
 * Removes all pending connections from a listening socket, and
 * frees the associated resources.
 *
 * Arguments
 *	so	- listening socket
 *
 * Return values:
 *	None.
 *
 * Note:
 *	The caller has to ensure that no calls to so_acceptq_enqueue() or
 *	so_acceptq_dequeue() occur while the accept queue is being flushed.
 *	So either the socket needs to be in a state where no operations
 *	would come in, or so_lock needs to be obtained.
 */
void
so_acceptq_flush(struct sonode *so)
{
	struct sonode *nso;

	nso = so->so_acceptq_head;

	while (nso != NULL) {
		struct sonode *nnso = NULL;

		nnso = nso->so_acceptq_next;
		nso->so_acceptq_next = NULL;
		/*
		 * Since the socket is on the accept queue, there can
		 * only be one reference. We drop the reference and
		 * just blow off the socket.
		 */
		ASSERT(nso->so_count == 1);
		nso->so_count--;
		socket_destroy(nso);
		nso = nnso;
	}

	so->so_acceptq_head = NULL;
	so->so_acceptq_tail = &so->so_acceptq_head;
	so->so_acceptq_len = 0;
}

int
so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
    sock_connid_t id)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	/*
	 * The protocol has notified us that a connection attempt is being
	 * made, so before we wait for a notification to arrive we must
	 * clear out any errors associated with earlier connection attempts.
	 */
	if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
		so->so_error = 0;

	while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
		if (nonblock)
			return (EINPROGRESS);

		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
			return (EINTR);
	}

	if (so->so_error != 0)
		return (sogeterr(so, B_TRUE));
	/*
	 * Under normal circumstances, so_error should contain an error
	 * in case the connect failed. However, it is possible for another
	 * thread to come in and consume the error, so generate a sensible
	 * error in that case.
	 */
	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ECONNREFUSED);

	return (0);
}

/*
 * int so_wait_connected(struct sonode *so, boolean_t nonblock,
 *    sock_connid_t id)
 *
 * Wait until the socket is connected or an error has occurred.
 *
 * Arguments:
 *	so	- socket
 *	nonblock - indicate whether it's ok to sleep if the connection has
 *		   not yet been established
 *	id	- generation number that was returned by the protocol
 *		  when the operation was started
 *
 * Returns:
 *	0 if the connection attempt was successful, or an error indicating why
 *	the connection attempt failed.
 */
int
so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
{
	int error;

	mutex_enter(&so->so_lock);
	error = so_wait_connected_locked(so, nonblock, id);
	mutex_exit(&so->so_lock);

	return (error);
}
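
/*
 * Illustrative sketch, not part of the original file: a caller that
 * started a connect through the protocol would wait on the returned
 * connection generation id roughly like this. example_connect_wait()
 * and the SOCK_EXAMPLES guard are assumptions for illustration only.
 */
#ifdef SOCK_EXAMPLES
static int
example_connect_wait(struct sonode *so, sock_connid_t id, int fmode)
{
	boolean_t nonblock = (fmode & (FNDELAY|FNONBLOCK)) != 0;

	/* EINPROGRESS if nonblocking and connid `id' not yet reached */
	return (so_wait_connected(so, nonblock, id));
}
#endif /* SOCK_EXAMPLES */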

int
so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));
	while (so->so_snd_qfull) {
		if (so->so_state & SS_CANTSENDMORE)
			return (EPIPE);
		if (dontblock)
			return (EWOULDBLOCK);

		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (so->so_sndtimeo == 0) {
			/*
			 * Zero means disable timeout.
			 */
			error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
		} else {
			clock_t now;

			time_to_wait(&now, so->so_sndtimeo);
			error = cv_timedwait_sig(&so->so_snd_cv, &so->so_lock,
			    now);
		}
		if (error == 0)
			return (EINTR);
		else if (error == -1)
			return (ETIME);
	}
	return (0);
}

/*
 * int so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
 *
 * Wait for the transport to notify us about send buffers becoming
 * available.
 */
int
so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
{
	int error = 0;

	mutex_enter(&so->so_lock);
	if (so->so_snd_qfull) {
		so->so_snd_wakeup = B_TRUE;
		error = so_snd_wait_qnotfull_locked(so, dontblock);
		so->so_snd_wakeup = B_FALSE;
	}
	mutex_exit(&so->so_lock);

	return (error);
}

void
so_snd_qfull(struct sonode *so)
{
	mutex_enter(&so->so_lock);
	so->so_snd_qfull = B_TRUE;
	mutex_exit(&so->so_lock);
}

void
so_snd_qnotfull(struct sonode *so)
{
	mutex_enter(&so->so_lock);
	so->so_snd_qfull = B_FALSE;
	/* wake up everyone waiting for buffers */
	cv_broadcast(&so->so_snd_cv);
	mutex_exit(&so->so_lock);
}

/*
 * Change the process/process group to which SIGIO is sent.
 */
int
socket_chgpgrp(struct sonode *so, pid_t pid)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));
	if (pid != 0) {
		/*
		 * Permissions check by sending signal 0.
		 * Note that when kill fails it does a
		 * set_errno causing the system call to fail.
		 */
		error = kill(pid, 0);
		if (error != 0) {
			return (error);
		}
	}
	so->so_pgrp = pid;
	return (0);
}


/*
 * Generate a SIGIO; for 'writable' events include a siginfo structure,
 * for read events just send the signal.
 */
/*ARGSUSED*/
static void
socket_sigproc(proc_t *proc, int event)
{
	k_siginfo_t info;

	ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));

	if (event & SOCKETSIG_WRITE) {
		info.si_signo = SIGPOLL;
		info.si_code = POLL_OUT;
		info.si_errno = 0;
		info.si_fd = 0;
		info.si_band = 0;
		sigaddq(proc, NULL, &info, KM_NOSLEEP);
	}
	if (event & SOCKETSIG_READ) {
		sigtoproc(proc, NULL, SIGPOLL);
	}
	if (event & SOCKETSIG_URG) {
		sigtoproc(proc, NULL, SIGURG);
	}
}

void
socket_sendsig(struct sonode *so, int event)
{
	proc_t *proc;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
	    event != SOCKETSIG_URG)) {
		return;
	}

	dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));

	if (so->so_pgrp > 0) {
		/*
		 * XXX This unfortunately still generates
		 * a signal when a fd is closed but
		 * the proc is active.
		 */
		mutex_enter(&pidlock);
		proc = prfind(so->so_pgrp);
		if (proc == NULL) {
			mutex_exit(&pidlock);
			return;
		}
		mutex_enter(&proc->p_lock);
		mutex_exit(&pidlock);
		socket_sigproc(proc, event);
		mutex_exit(&proc->p_lock);
	} else {
		/*
		 * Send to process group. Hold pidlock across
		 * calls to socket_sigproc().
		 */
		pid_t pgrp = -so->so_pgrp;

		mutex_enter(&pidlock);
		proc = pgfind(pgrp);
		while (proc != NULL) {
			mutex_enter(&proc->p_lock);
			socket_sigproc(proc, event);
			mutex_exit(&proc->p_lock);
			proc = proc->p_pglink;
		}
		mutex_exit(&pidlock);
	}
}
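
/*
 * Illustrative userland sketch, not part of this kernel file: an
 * application opts into the SIGPOLL/SIGIO delivery implemented by
 * socket_sendsig() above by naming a signal recipient (FIOSETOWN,
 * which ends up in so_pgrp via socket_chgpgrp()) and enabling async
 * mode (FIOASYNC, which sets SS_ASYNC). Compiled out; shown only to
 * connect the ioctls to the kernel state they set.
 */
#if 0
#include <sys/ioctl.h>
#include <unistd.h>

static int
example_enable_sigio(int sock)
{
	pid_t pid = getpid();
	int on = 1;

	if (ioctl(sock, FIOSETOWN, &pid) == -1)	/* becomes so_pgrp */
		return (-1);
	return (ioctl(sock, FIOASYNC, &on));	/* sets SS_ASYNC */
}
#endif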

#define	MIN(a, b) ((a) < (b) ? (a) : (b))
/* Copy userdata into a new mblk_t */
mblk_t *
socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
    size_t tail_len, int *errorp)
{
	mblk_t *head = NULL, **tail = &head;

	ASSERT(iosize == INFPSZ || iosize > 0);

	if (iosize == INFPSZ || iosize > uiop->uio_resid)
		iosize = uiop->uio_resid;

	if (maxblk == INFPSZ)
		maxblk = iosize;

	/* Nothing to do in these cases, so we're done */
	if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
		goto done;

	/*
	 * We will enter the loop below if iosize is 0; it will allocate an
	 * empty message block and call uiomove(9F) which will just return.
	 * We could avoid that with an extra check but would only slow
	 * down the much more likely case where iosize is larger than 0.
	 */
	do {
		ssize_t blocksize;
		mblk_t *mp;

		blocksize = MIN(iosize, maxblk);
		ASSERT(blocksize >= 0);
		if ((mp = allocb(wroff + blocksize + tail_len,
		    BPRI_MED)) == NULL) {
			*errorp = ENOMEM;
			return (head);
		}
		mp->b_rptr += wroff;
		mp->b_wptr = mp->b_rptr + blocksize;

		*tail = mp;
		tail = &mp->b_cont;

		/* uiomove(9F) either returns 0 or EFAULT */
		if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
		    UIO_WRITE, uiop)) != 0) {
			ASSERT(*errorp != ENOMEM);
			freemsg(head);
			return (NULL);
		}

		iosize -= blocksize;
	} while (iosize > 0);

done:
	*errorp = 0;
	return (head);
}
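
/*
 * Illustrative sketch, not part of the original file: a send path
 * might feed user data to socopyinuio() using the protocol's buffer
 * properties, roughly as below. This is a simplified pattern under
 * the hypothetical SOCK_EXAMPLES guard, not the actual sendmsg
 * implementation.
 */
#ifdef SOCK_EXAMPLES
static mblk_t *
example_copyin_for_send(struct sonode *so, struct uio *uiop, int *errorp)
{
	ssize_t maxblk = so->so_proto_props.sopp_maxblk;
	size_t wroff = so->so_proto_props.sopp_wroff;
	size_t tail = so->so_proto_props.sopp_tail;

	/* copy the whole request; INFPSZ means no fixed size limit */
	return (socopyinuio(uiop, INFPSZ, wroff, maxblk, tail, errorp));
}
#endif /* SOCK_EXAMPLES */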

mblk_t *
socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
{
	int error;
	ptrdiff_t n;
	mblk_t *nmp;

	ASSERT(mp->b_wptr >= mp->b_rptr);

	/*
	 * max_read is the offset of the oobmark and a read cannot go
	 * past the oobmark.
	 */
	if (max_read == INFPSZ || max_read > uiop->uio_resid)
		max_read = uiop->uio_resid;

	do {
		if ((n = MIN(max_read, MBLKL(mp))) != 0) {
			ASSERT(n > 0);

			error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
			if (error != 0) {
				freemsg(mp);
				*errorp = error;
				return (NULL);
			}
		}

		mp->b_rptr += n;
		max_read -= n;
		while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
			/*
			 * get rid of zero length mblks
			 */
			nmp = mp;
			mp = mp->b_cont;
			freeb(nmp);
		}
	} while (mp != NULL && max_read > 0);

	*errorp = 0;
	return (mp);
}

static void
so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
{
	ASSERT(last_tail != NULL);
	mp->b_next = so->so_rcv_q_head;
	mp->b_prev = last_tail;
	ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));

	if (so->so_rcv_q_head == NULL) {
		ASSERT(so->so_rcv_q_last_head == NULL);
		so->so_rcv_q_last_head = mp;
#ifdef DEBUG
	} else {
		ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
#endif
	}
	so->so_rcv_q_head = mp;

#ifdef DEBUG
	if (so_debug_length) {
		mutex_enter(&so->so_lock);
		ASSERT(so_check_length(so));
		mutex_exit(&so->so_lock);
	}
#endif
}

static void
process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
{
	ASSERT(mp_head->b_prev != NULL);
	if (so->so_rcv_q_head == NULL) {
		so->so_rcv_q_head = mp_head;
		so->so_rcv_q_last_head = mp_last_head;
		ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
	} else {
		boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
		    (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));

		if (mp_head->b_next == NULL &&
		    DB_TYPE(mp_head) == M_DATA &&
		    DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
			mp_head->b_prev = NULL;
		} else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
			/*
			 * Append to last_head if there is more than one mblk,
			 * and both mp_head and last_head are I/OAT mblks.
			 */
			ASSERT(mp_head->b_next != NULL);
			so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
			so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
			mp_head->b_prev = NULL;

			so->so_rcv_q_last_head->b_next = mp_head->b_next;
			mp_head->b_next = NULL;
			so->so_rcv_q_last_head = mp_last_head;
		} else {
#ifdef DEBUG
			{
				mblk_t *tmp_mblk;
				tmp_mblk = mp_head;
				while (tmp_mblk != NULL) {
					ASSERT(tmp_mblk->b_prev != NULL);
					tmp_mblk = tmp_mblk->b_next;
				}
			}
#endif
			so->so_rcv_q_last_head->b_next = mp_head;
			so->so_rcv_q_last_head = mp_last_head;
		}
	}
}

int
so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
    rval_t *rvalp, int flags)
{
	mblk_t *mp, *nmp;
	mblk_t *savemp, *savemptail;
	mblk_t *new_msg_head;
	mblk_t *new_msg_last_head;
	mblk_t *last_tail;
	boolean_t partial_read;
	boolean_t reset_atmark = B_FALSE;
	int more = 0;
	int error;
	ssize_t oobmark;
	sodirect_t *sodp = so->so_direct;

	partial_read = B_FALSE;
	*mctlp = NULL;
again:
	mutex_enter(&so->so_lock);
again1:
#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
	/*
	 * First move messages from the dump area to the processing area
	 */
	if (sodp != NULL) {
		/* No need to grab sod_lockp since it points to so_lock */
		if (sodp->sod_state & SOD_ENABLED) {
			ASSERT(sodp->sod_lockp == &so->so_lock);

			if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
				/* nothing to uioamove */
				sodp = NULL;
			} else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
				sodp->sod_uioa.uioa_state &= UIOA_CLR;
				sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
				/*
				 * try to uioamove() the data that
				 * has already been queued.
				 */
				sod_uioa_so_init(so, sodp, uiop);
			}
		} else {
			sodp = NULL;
		}
	}
	new_msg_head = so->so_rcv_head;
	new_msg_last_head = so->so_rcv_last_head;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
	oobmark = so->so_oobmark;
	/*
	 * We can release the lock as there can only be one reader
	 */
	mutex_exit(&so->so_lock);

	if (so->so_state & SS_RCVATMARK) {
		reset_atmark = B_TRUE;
	}
	if (new_msg_head != NULL) {
		process_new_message(so, new_msg_head, new_msg_last_head);
	}
	savemp = savemptail = NULL;
	rvalp->r_val1 = 0;
	error = 0;
	mp = so->so_rcv_q_head;

	if (mp != NULL &&
	    (so->so_rcv_timer_tid == 0 ||
	    so->so_rcv_queued >= so->so_rcv_thresh)) {
		partial_read = B_FALSE;

		if (flags & MSG_PEEK) {
			if ((nmp = dupmsg(mp)) == NULL &&
			    (nmp = copymsg(mp)) == NULL) {
				size_t size = msgsize(mp);

				error = strwaitbuf(size, BPRI_HI);
				if (error) {
					return (error);
				}
				goto again;
			}
			mp = nmp;
		} else {
			ASSERT(mp->b_prev != NULL);
			last_tail = mp->b_prev;
			mp->b_prev = NULL;
			so->so_rcv_q_head = mp->b_next;
			if (so->so_rcv_q_head == NULL) {
				so->so_rcv_q_last_head = NULL;
			}
			mp->b_next = NULL;
		}

		ASSERT(mctlp != NULL);
		/*
		 * First process PROTO or PCPROTO blocks, if any.
		 */
		if (DB_TYPE(mp) != M_DATA) {
			*mctlp = mp;
			savemp = mp;
			savemptail = mp;
			ASSERT(DB_TYPE(mp) == M_PROTO ||
			    DB_TYPE(mp) == M_PCPROTO);
			while (mp->b_cont != NULL &&
			    DB_TYPE(mp->b_cont) != M_DATA) {
				ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
				    DB_TYPE(mp->b_cont) == M_PCPROTO);
				mp = mp->b_cont;
				savemptail = mp;
			}
			mp = savemptail->b_cont;
			savemptail->b_cont = NULL;
		}

		ASSERT(DB_TYPE(mp) == M_DATA);
		/*
		 * Now process DATA blocks, if any. Note that for a
		 * sodirect-enabled socket, uio_resid can be 0.
		 */
		if (uiop->uio_resid >= 0) {
			ssize_t copied = 0;

			if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
				mutex_enter(sodp->sod_lockp);
				ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
				copied = sod_uioa_mblk(so, mp);
				if (copied > 0)
					partial_read = B_TRUE;
				mutex_exit(sodp->sod_lockp);
				/* mark this mblk as processed */
				mp = NULL;
			} else {
				ssize_t oldresid = uiop->uio_resid;

				if (MBLKL(mp) < so_mblk_pull_len) {
					if (pullupmsg(mp, -1) == 1) {
						last_tail = mp;
					}
				}
				/*
				 * Cannot read beyond the oobmark
				 */
				mp = socopyoutuio(mp, uiop,
				    oobmark == 0 ? INFPSZ : oobmark, &error);
				if (error != 0) {
					freemsg(*mctlp);
					*mctlp = NULL;
					more = 0;
					goto done;
				}
				ASSERT(oldresid >= uiop->uio_resid);
				copied = oldresid - uiop->uio_resid;
				if (oldresid > uiop->uio_resid)
					partial_read = B_TRUE;
			}
			ASSERT(copied >= 0);
			if (copied > 0 && !(flags & MSG_PEEK)) {
				mutex_enter(&so->so_lock);
				so->so_rcv_queued -= copied;
				ASSERT(so->so_oobmark >= 0);
				if (so->so_oobmark > 0) {
					so->so_oobmark -= copied;
					ASSERT(so->so_oobmark >= 0);
					if (so->so_oobmark == 0) {
						ASSERT(so->so_state &
						    SS_OOBPEND);
						so->so_oobmark = 0;
						so->so_state |= SS_RCVATMARK;
					}
				}
				if (so->so_flowctrld && so->so_rcv_queued <
				    so->so_rcvlowat) {
					so->so_flowctrld = B_FALSE;
					mutex_exit(&so->so_lock);
					/*
					 * open up flow control
					 */
					(*so->so_downcalls->sd_clr_flowctrl)
					    (so->so_proto_handle);
				} else {
					mutex_exit(&so->so_lock);
				}
			}
		}
		if (mp != NULL) { /* more data blocks in msg */
			more |= MOREDATA;
			if ((flags & (MSG_PEEK|MSG_TRUNC))) {
				if (flags & MSG_TRUNC &&
				    ((flags & MSG_PEEK) == 0)) {
					mutex_enter(&so->so_lock);
					so->so_rcv_queued -= msgdsize(mp);
					mutex_exit(&so->so_lock);
				}
				freemsg(mp);
			} else if (partial_read && !somsghasdata(mp)) {
				/*
				 * Avoid queuing a zero-length tail part of
				 * a message. partial_read == 1 indicates that
				 * we read some of the message.
				 */
				freemsg(mp);
				more &= ~MOREDATA;
			} else {
				if (savemp != NULL &&
				    (flags & MSG_DUPCTRL)) {
					mblk_t *nmp;
					/*
					 * There should only be non data mblks
					 */
					ASSERT(DB_TYPE(savemp) != M_DATA &&
					    DB_TYPE(savemptail) != M_DATA);
try_again:
					if ((nmp = dupmsg(savemp)) == NULL &&
					    (nmp = copymsg(savemp)) == NULL) {

						size_t size = msgsize(savemp);

						error = strwaitbuf(size,
						    BPRI_HI);
						if (error != 0) {
							/*
							 * In case we
							 * cannot copy
							 * control data
							 * free the remaining
							 * data.
							 */
							freemsg(mp);
							goto done;
						}
						goto try_again;
					}

					ASSERT(nmp != NULL);
					ASSERT(DB_TYPE(nmp) != M_DATA);
					savemptail->b_cont = mp;
					*mctlp = nmp;
					mp = savemp;
				}
				/*
				 * putback mp
				 */
				so_prepend_msg(so, mp, last_tail);
			}
		}

		/* fast check so_rcv_head if there is more data */
		if (partial_read && !(so->so_state & SS_RCVATMARK) &&
		    *mctlp == NULL && uiop->uio_resid > 0 &&
		    !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
			goto again;
		}
	} else if (!partial_read) {
		mutex_enter(&so->so_lock);
		if (so->so_error != 0) {
			error = sogeterr(so, !(flags & MSG_PEEK));
			mutex_exit(&so->so_lock);
			return (error);
		}
		/*
		 * No pending data. Return right away for nonblocking
		 * socket, otherwise sleep waiting for data.
		 */
		if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
			if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
			    (flags & MSG_DONTWAIT)) {
				error = EWOULDBLOCK;
			} else {
				if (so->so_state & (SS_CLOSING |
				    SS_FALLBACK_PENDING)) {
					mutex_exit(&so->so_lock);
					error = EINTR;
					goto done;
				}

				if (so->so_rcv_head != NULL) {
					goto again1;
				}
				so->so_rcv_wakeup = B_TRUE;
				so->so_rcv_wanted = uiop->uio_resid;
				if (so->so_rcvtimeo == 0) {
					/*
					 * Zero means disable timeout.
					 */
					error = cv_wait_sig(&so->so_rcv_cv,
					    &so->so_lock);
				} else {
					clock_t now;
					time_to_wait(&now, so->so_rcvtimeo);
					error = cv_timedwait_sig(&so->so_rcv_cv,
					    &so->so_lock, now);
				}
				so->so_rcv_wakeup = B_FALSE;
				so->so_rcv_wanted = 0;

				if (error == 0) {
					error = EINTR;
				} else if (error == -1) {
					error = ETIME;
				} else {
					goto again1;
				}
			}
		}
		mutex_exit(&so->so_lock);
	}
	if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
		/*
		 * We are past the mark; update the state.
		 * 4.3BSD and 4.4BSD clear the mark when peeking across it.
		 * The draft Posix socket spec states that the mark should
		 * not be cleared when peeking. We follow the latter.
		 */
		mutex_enter(&so->so_lock);
		ASSERT(so_verify_oobstate(so));
		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
		freemsg(so->so_oobmsg);
		so->so_oobmsg = NULL;
		ASSERT(so_verify_oobstate(so));
		mutex_exit(&so->so_lock);
	}
	ASSERT(so->so_rcv_wakeup == B_FALSE);
done:
	if (sodp != NULL) {
		mutex_enter(sodp->sod_lockp);
		if ((sodp->sod_state & SOD_ENABLED) &&
		    (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
			SOD_UIOAFINI(sodp);
			if (sodp->sod_uioa.uioa_mbytes > 0) {
				ASSERT(so->so_rcv_q_head != NULL ||
				    so->so_rcv_head != NULL);
				so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
				if (error == EWOULDBLOCK)
					error = 0;
			}
		}
		mutex_exit(sodp->sod_lockp);
	}
#ifdef DEBUG
	if (so_debug_length) {
		mutex_enter(&so->so_lock);
		ASSERT(so_check_length(so));
		mutex_exit(&so->so_lock);
	}
#endif
	rvalp->r_val1 = more;
	return (error);
}

void
so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
	so->so_rcv_queued += msg_size;

	if (so->so_rcv_head == NULL) {
		ASSERT(so->so_rcv_last_head == NULL);
		so->so_rcv_head = mp;
		so->so_rcv_last_head = mp;
	} else if ((DB_TYPE(mp) == M_DATA &&
	    DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
	    ((DB_FLAGS(mp) & DBLK_UIOA) ==
	    (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
		/* Added to the end */
		ASSERT(so->so_rcv_last_head != NULL);
		ASSERT(so->so_rcv_last_head->b_prev != NULL);
		so->so_rcv_last_head->b_prev->b_cont = mp;
	} else {
		/* Start a new end */
		so->so_rcv_last_head->b_next = mp;
		so->so_rcv_last_head = mp;
	}
	while (mp->b_cont != NULL)
		mp = mp->b_cont;

	so->so_rcv_last_head->b_prev = mp;
#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
}
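
/*
 * Illustrative sketch, not part of the original file: a protocol
 * upcall delivering data could combine so_enqueue_msg() with a reader
 * notification along these lines. It assumes, as so_timer_callback()
 * below appears to, that so_notify_data() is called with so_lock held
 * and releases it. The function name and SOCK_EXAMPLES guard are
 * assumptions for illustration only.
 */
#ifdef SOCK_EXAMPLES
static void
example_deliver_data(struct sonode *so, mblk_t *mp)
{
	size_t len = msgdsize(mp);

	mutex_enter(&so->so_lock);
	so_enqueue_msg(so, mp, len);	/* requires so_lock */
	so_notify_data(so, so->so_rcv_queued);	/* assumed to drop so_lock */
}
#endif /* SOCK_EXAMPLES */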

/*
 * Return B_TRUE if there is data in the message, B_FALSE otherwise.
 */
boolean_t
somsghasdata(mblk_t *mp)
{
	for (; mp; mp = mp->b_cont)
		if (mp->b_datap->db_type == M_DATA) {
			ASSERT(mp->b_wptr >= mp->b_rptr);
			if (mp->b_wptr > mp->b_rptr)
				return (B_TRUE);
		}
	return (B_FALSE);
}

/*
 * Flush the read side of sockfs.
 *
 * The caller must be sure that a reader is not already active when the
 * buffer is being flushed.
 */
void
so_rcv_flush(struct sonode *so)
{
	mblk_t *mp;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_oobmsg != NULL) {
		freemsg(so->so_oobmsg);
		so->so_oobmsg = NULL;
		so->so_oobmark = 0;
		so->so_state &=
		    ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
	}

	/*
	 * Free messages sitting in the recv queues
	 */
	while (so->so_rcv_q_head != NULL) {
		mp = so->so_rcv_q_head;
		so->so_rcv_q_head = mp->b_next;
		mp->b_next = mp->b_prev = NULL;
		freemsg(mp);
	}
	while (so->so_rcv_head != NULL) {
		mp = so->so_rcv_head;
		so->so_rcv_head = mp->b_next;
		mp->b_next = mp->b_prev = NULL;
		freemsg(mp);
	}
	so->so_rcv_queued = 0;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
}

/*
 * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
 */
int
sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
    boolean_t oob_inline)
{
	mblk_t *mp, *nmp;
	int error;

	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
	    flags));

	if (msg != NULL) {
		/*
		 * There is never any oob data with addresses or control since
		 * the T_EXDATA_IND does not carry any options.
		 */
		msg->msg_controllen = 0;
		msg->msg_namelen = 0;
		msg->msg_flags = 0;
	}

	mutex_enter(&so->so_lock);
	ASSERT(so_verify_oobstate(so));
	if (oob_inline ||
	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
		mutex_exit(&so->so_lock);
		return (EINVAL);
	}
	if (!(so->so_state & SS_HAVEOOBDATA)) {
		dprintso(so, 1, ("sorecvoob: no data yet\n"));
		mutex_exit(&so->so_lock);
		return (EWOULDBLOCK);
	}
	ASSERT(so->so_oobmsg != NULL);
	mp = so->so_oobmsg;
	if (flags & MSG_PEEK) {
		/*
		 * Since recv* cannot return ENOBUFS we cannot use dupmsg.
		 * Instead we revert to the consolidation private
		 * allocb_wait plus bcopy.
		 */
		mblk_t *mp1;

		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
		ASSERT(mp1);

		while (mp != NULL) {
			ssize_t size;

			size = MBLKL(mp);
			bcopy(mp->b_rptr, mp1->b_wptr, size);
			mp1->b_wptr += size;
			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
			mp = mp->b_cont;
		}
		mp = mp1;
	} else {
		/*
		 * Update the state indicating that the data has been consumed.
		 * Keep SS_OOBPEND set until data is consumed past the mark.
		 */
		so->so_oobmsg = NULL;
		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
	}
	ASSERT(so_verify_oobstate(so));
	mutex_exit(&so->so_lock);

	error = 0;
	nmp = mp;
	while (nmp != NULL && uiop->uio_resid > 0) {
		ssize_t n = MBLKL(nmp);

		n = MIN(n, uiop->uio_resid);
		if (n > 0)
			error = uiomove(nmp->b_rptr, n,
			    UIO_READ, uiop);
		if (error)
			break;
		nmp = nmp->b_cont;
	}
	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
	freemsg(mp);
	return (error);
}
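
/*
 * Illustrative userland sketch, not part of this kernel file: the
 * sorecvoob() path above is what a recv(..., MSG_OOB) against a
 * non-OOBINLINE socket reaches; SIOCATMARK reports whether the read
 * point is at the mark. Compiled out; for illustration only.
 */
#if 0
#include <sys/socket.h>
#include <sys/ioctl.h>

static ssize_t
example_read_oob(int sock, char *buf, size_t len, int *atmarkp)
{
	if (ioctl(sock, SIOCATMARK, atmarkp) == -1)
		return (-1);
	/* EWOULDBLOCK maps back to "no data yet" in sorecvoob() */
	return (recv(sock, buf, len, MSG_OOB));
}
#endif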

/*
 * Allocate and initialize a sonode
 */
/* ARGSUSED */
struct sonode *
socket_sonode_create(struct sockparams *sp, int family, int type,
    int protocol, int version, int sflags, int *errorp, struct cred *cr)
{
	sonode_t *so;
	int kmflags;

	/*
	 * Choose the right set of sonodeops based on the upcall and
	 * downcall versions that the protocol has provided
	 */
	if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
	    SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
		/*
		 * mismatch
		 */
#ifdef DEBUG
		cmn_err(CE_CONT, "protocol and socket module version mismatch");
#endif
		*errorp = EINVAL;
		return (NULL);
	}

	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;

	so = kmem_cache_alloc(socket_cache, kmflags);
	if (so == NULL) {
		*errorp = ENOMEM;
		return (NULL);
	}

	sonode_init(so, sp, family, type, protocol, &so_sonodeops);

	if (version == SOV_DEFAULT)
		version = so_default_version;

	so->so_version = (short)version;

	/*
	 * set the default values to be INFPSZ
	 * if a protocol desires it can change the value later
	 */
	so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
	so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
	so->so_proto_props.sopp_maxpsz = INFPSZ;
	so->so_proto_props.sopp_maxblk = INFPSZ;

	return (so);
}

int
socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
{
	int error = 0;

	if (pso != NULL) {
		/*
		 * We have a passive open, so inherit basic state from
		 * the parent (listener).
		 *
		 * No need to grab the new sonode's lock, since there is no
		 * one that can have a reference to it.
		 */
		mutex_enter(&pso->so_lock);

		so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
		so->so_pgrp = pso->so_pgrp;
		so->so_rcvtimeo = pso->so_rcvtimeo;
		so->so_sndtimeo = pso->so_sndtimeo;
		so->so_xpg_rcvbuf = pso->so_xpg_rcvbuf;
		/*
		 * Make note of the socket level options. TCP and IP level
		 * options are already inherited. We could do all this after
		 * accept is successful but doing it here simplifies code and
		 * no harm is done in the error case.
		 */
		so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
		    SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
		    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
		so->so_proto_props = pso->so_proto_props;
		so->so_mode = pso->so_mode;
		so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;

		mutex_exit(&pso->so_lock);

		if (uioasync.enabled) {
			sod_sock_init(so, NULL, NULL, NULL, &so->so_lock);
		}
		return (0);
	} else {
		struct sockparams *sp = so->so_sockparams;
		sock_upcalls_t *upcalls_to_use;

		/*
		 * Based on the version number select the right upcalls to
		 * pass down. Currently we only have one version so choose
		 * default
		 */
		upcalls_to_use = &so_upcalls;

		/* active open, so create a lower handle */
		so->so_proto_handle =
		    sp->sp_smod_info->smod_proto_create_func(so->so_family,
		    so->so_type, so->so_protocol, &so->so_downcalls,
		    &so->so_mode, &error, flags, cr);

		if (so->so_proto_handle == NULL) {
			ASSERT(error != 0);
			/*
			 * To be safe: if a lower handle cannot be created, and
			 * the proto does not give a reason why, assume there
			 * was a lack of memory.
			 */
			return ((error == 0) ? ENOMEM : error);
		}
		ASSERT(so->so_downcalls != NULL);
		ASSERT(so->so_downcalls->sd_send != NULL ||
		    so->so_downcalls->sd_send_uio != NULL);
		if (so->so_downcalls->sd_recv_uio != NULL) {
			ASSERT(so->so_downcalls->sd_poll != NULL);
			so->so_pollev |= SO_POLLEV_ALWAYS;
		}

		(*so->so_downcalls->sd_activate)(so->so_proto_handle,
		    (sock_upper_handle_t)so, upcalls_to_use, 0, cr);

		/* Wildcard */

		/*
		 * FIXME No need for this, the protocol can deal with it in
		 * sd_create(). Should update ICMP.
		 */
		if (so->so_protocol != so->so_sockparams->sp_protocol) {
			int protocol = so->so_protocol;
			int error;
			/*
			 * Issue SO_PROTOTYPE setsockopt.
			 */
			error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
			if (error) {
				(void) (*so->so_downcalls->sd_close)
				    (so->so_proto_handle, 0, cr);

				mutex_enter(&so->so_lock);
				so_rcv_flush(so);
				mutex_exit(&so->so_lock);
				/*
				 * Setsockopt often fails with ENOPROTOOPT but
				 * socket() should fail with
				 * EPROTONOSUPPORT/EPROTOTYPE.
				 */
				return (EPROTONOSUPPORT);
			}
		}
		return (0);
	}
}

/*
 * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
 *     struct cred *cr, int32_t *rvalp)
 *
 * Handle ioctls that manipulate basic socket state; non-blocking,
 * async, etc.
 *
 * Returns:
 *	< 0  - ioctl was not handled
 *	>= 0 - ioctl was handled, if > 0, then it is an errno
 *
 * Notes:
 *	Assumes the standard receive buffer is used to obtain info for
 *	NREAD.
 */
/* ARGSUSED */
int
socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	switch (cmd) {
	case SIOCSQPTR:
		/*
		 * SIOCSQPTR is valid only when helper stream is created
		 * by the protocol.
		 */

		return (EOPNOTSUPP);
	case FIONBIO: {
		int32_t value;

		if (so_copyin((void *)arg, &value, sizeof (int32_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		if (value) {
			so->so_state |= SS_NDELAY;
		} else {
			so->so_state &= ~SS_NDELAY;
		}
		mutex_exit(&so->so_lock);
		return (0);
	}
	case FIOASYNC: {
		int32_t value;

		if (so_copyin((void *)arg, &value, sizeof (int32_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);

		if (value) {
			/* Turn on SIGIO */
			so->so_state |= SS_ASYNC;
		} else {
			/* Turn off SIGIO */
			so->so_state &= ~SS_ASYNC;
		}
		mutex_exit(&so->so_lock);

		return (0);
	}

	case SIOCSPGRP:
	case FIOSETOWN: {
		int error;
		pid_t pid;

		if (so_copyin((void *)arg, &pid, sizeof (pid_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
		mutex_exit(&so->so_lock);
		return (error);
	}
	case SIOCGPGRP:
	case FIOGETOWN:
		if (so_copyout(&so->so_pgrp, (void *)arg,
		    sizeof (pid_t), (mode & (int)FKIOCTL)))
			return (EFAULT);

		return (0);
	case SIOCATMARK: {
		int retval;

		/*
		 * Only protocols that support urgent data can handle ATMARK.
		 */
		if ((so->so_mode & SM_EXDATA) == 0)
			return (EINVAL);

		/*
		 * If the protocol is maintaining its own buffer, then the
		 * request must be passed down.
		 */
		if (so->so_downcalls->sd_recv_uio != NULL)
			return (-1);

		retval = (so->so_state & SS_RCVATMARK) != 0;

		if (so_copyout(&retval, (void *)arg, sizeof (int),
		    (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	}

	case FIONREAD: {
		int retval;

		/*
		 * If the protocol is maintaining its own buffer, then the
		 * request must be passed down.
		 */
		if (so->so_downcalls->sd_recv_uio != NULL)
			return (-1);

		retval = MIN(so->so_rcv_queued, INT_MAX);

		if (so_copyout(&retval, (void *)arg,
		    sizeof (retval), (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	}

	case _I_GETPEERCRED: {
		int error = 0;

		if ((mode & FKIOCTL) == 0)
			return (EINVAL);

		mutex_enter(&so->so_lock);
		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
			error = ENOTSUP;
		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
			error = ENOTCONN;
		} else if (so->so_peercred != NULL) {
			k_peercred_t *kp = (k_peercred_t *)arg;
			kp->pc_cr = so->so_peercred;
			kp->pc_cpid = so->so_cpid;
			crhold(so->so_peercred);
		} else {
			error = EINVAL;
		}
		mutex_exit(&so->so_lock);
		return (error);
	}
	default:
		return (-1);
	}
}
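
/*
 * Illustrative userland sketch, not part of this kernel file: the
 * FIONBIO and FIONREAD cases above are reached with plain ioctl()
 * calls such as these. Compiled out; for illustration only.
 */
#if 0
#include <sys/ioctl.h>

static int
example_nonblock_and_pending(int sock, int *pendingp)
{
	int on = 1;

	if (ioctl(sock, FIONBIO, &on) == -1)	/* sets SS_NDELAY */
		return (-1);
	return (ioctl(sock, FIONREAD, pendingp));	/* so_rcv_queued */
}
#endif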

/*
 * Process STREAMS related ioctls. If an I_PUSH/POP operation is specified
 * then the socket will fall back to TPI.
 *
 * Returns:
 *	< 0  - ioctl was not handled
 *	>= 0 - ioctl was handled, if > 0, then it is an errno
 */
int
socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	switch (cmd) {
	case _I_INSERT:
	case _I_REMOVE:
	case I_FIND:
	case I_LIST:
		return (EOPNOTSUPP);

	case I_PUSH:
	case I_POP: {
		int retval;

		if ((retval = so_tpi_fallback(so, cr)) == 0) {
			/* Reissue the ioctl */
			ASSERT(so->so_rcv_q_head == NULL);
			return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
		}
		return (retval);
	}
	case I_LOOK:
		if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
		    (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	default:
		return (-1);
	}
}

int
socket_getopt_common(struct sonode *so, int level, int option_name,
    void *optval, socklen_t *optlenp, int flags)
{
	if (level != SOL_SOCKET)
		return (-1);

	switch (option_name) {
	case SO_ERROR:
	case SO_DOMAIN:
	case SO_TYPE:
	case SO_ACCEPTCONN: {
		int32_t value;
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t)) {
			return (EINVAL);
		}

		switch (option_name) {
		case SO_ERROR:
			mutex_enter(&so->so_lock);
			value = sogeterr(so, B_TRUE);
			mutex_exit(&so->so_lock);
			break;
		case SO_DOMAIN:
			value = so->so_family;
			break;
		case SO_TYPE:
			value = so->so_type;
			break;
		case SO_ACCEPTCONN:
			if (so->so_state & SS_ACCEPTCONN)
				value = SO_ACCEPTCONN;
			else
				value = 0;
			break;
		}

		bcopy(&value, optval, sizeof (value));
		*optlenp = sizeof (value);

		return (0);
	}
	case SO_SNDTIMEO:
	case SO_RCVTIMEO: {
		clock_t value;
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (struct timeval)) {
			return (EINVAL);
		}
		if (option_name == SO_RCVTIMEO)
			value = drv_hztousec(so->so_rcvtimeo);
		else
			value = drv_hztousec(so->so_sndtimeo);
		((struct timeval *)(optval))->tv_sec = value / (1000 * 1000);
		((struct timeval *)(optval))->tv_usec = value % (1000 * 1000);
		*optlenp = sizeof (struct timeval);
		return (0);
	}
	case SO_DEBUG:
	case SO_REUSEADDR:
	case SO_KEEPALIVE:
	case SO_DONTROUTE:
	case SO_BROADCAST:
	case SO_USELOOPBACK:
	case SO_OOBINLINE:
	case SO_SNDBUF:
#ifdef notyet
	case SO_SNDLOWAT:
	case SO_RCVLOWAT:
#endif /* notyet */
	case SO_DGRAM_ERRIND: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t))
			return (EINVAL);
		break;
	}
	case SO_RCVBUF: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t))
			return (EINVAL);

		if ((flags & _SOGETSOCKOPT_XPG4_2) && so->so_xpg_rcvbuf != 0) {
			/*
			 * XXX If SO_RCVBUF has been set and this is an
			 * XPG 4.2 application then do not ask the transport
			 * since the transport might adjust the value and not
			 * return exactly what was set by the application.
			 * For non-XPG 4.2 application we return the value
			 * that the transport is actually using.
			 */
			*(int32_t *)optval = so->so_xpg_rcvbuf;
			*optlenp = sizeof (so->so_xpg_rcvbuf);
			return (0);
		}
		/*
		 * If the option has not been set then get a default
		 * value from the transport.
		 */
		break;
	}
	case SO_LINGER: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (struct linger))
			return (EINVAL);
		break;
	}
	case SO_SND_BUFINFO: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
			return (EINVAL);
		((struct so_snd_bufinfo *)(optval))->sbi_wroff =
		    (so->so_proto_props).sopp_wroff;
		((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
		    (so->so_proto_props).sopp_maxblk;
		((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
		    (so->so_proto_props).sopp_maxpsz;
		((struct so_snd_bufinfo *)(optval))->sbi_tail =
		    (so->so_proto_props).sopp_tail;
		*optlenp = sizeof (struct so_snd_bufinfo);
		return (0);
	}
	default:
		break;
	}

	/* Unknown Option */
	return (-1);
}

void
socket_sonode_destroy(struct sonode *so)
{
	sonode_fini(so);
	kmem_cache_free(socket_cache, so);
}

int
so_zcopy_wait(struct sonode *so)
{
	int error = 0;

	mutex_enter(&so->so_lock);
	while (!(so->so_copyflag & STZCNOTIFY)) {
		if (so->so_state & SS_CLOSING) {
			mutex_exit(&so->so_lock);
			return (EINTR);
		}
		if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
			error = EINTR;
			break;
		}
	}
	so->so_copyflag &= ~STZCNOTIFY;
	mutex_exit(&so->so_lock);
	return (error);
}

void
so_timer_callback(void *arg)
{
	struct sonode *so = (struct sonode *)arg;

	mutex_enter(&so->so_lock);

	so->so_rcv_timer_tid = 0;
	if (so->so_rcv_queued > 0) {
		so_notify_data(so, so->so_rcv_queued);
	} else {
		mutex_exit(&so->so_lock);
	}
}

#ifdef DEBUG
/*
 * Verify that the length stored in so_rcv_queued and the length of the data
 * blocks queued are the same.
 */
static boolean_t
so_check_length(sonode_t *so)
{
	mblk_t *mp = so->so_rcv_q_head;
	int len = 0;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (mp != NULL) {
		len = msgdsize(mp);
		while ((mp = mp->b_next) != NULL)
			len += msgdsize(mp);
	}
	mp = so->so_rcv_head;
	if (mp != NULL) {
		len += msgdsize(mp);
		while ((mp = mp->b_next) != NULL)
			len += msgdsize(mp);
	}
	return ((len == so->so_rcv_queued) ? B_TRUE : B_FALSE);
}
#endif

int
so_get_mod_version(struct sockparams *sp)
{
	ASSERT(sp != NULL && sp->sp_smod_info != NULL);
	return (sp->sp_smod_info->smod_version);
}

/*
 * so_start_fallback()
 *
 * Block new socket operations from coming in, and wait for active operations
 * to complete. Threads that are sleeping will be woken up so they can get
 * out of the way.
 *
 * The caller must be a reader on so_fallback_rwlock.
 */
static boolean_t
so_start_fallback(struct sonode *so)
{
	ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));

	mutex_enter(&so->so_lock);
	if (so->so_state & SS_FALLBACK_PENDING) {
		mutex_exit(&so->so_lock);
		return (B_FALSE);
	}
	so->so_state |= SS_FALLBACK_PENDING;
	/*
	 * Poke all threads that might be sleeping. Any operation that comes
	 * in after the cv_broadcast will observe the fallback pending flag
	 * which causes the call to return where it would normally sleep.
	 */
	cv_broadcast(&so->so_state_cv);		/* threads in connect() */
	cv_broadcast(&so->so_rcv_cv);		/* threads in recvmsg() */
	cv_broadcast(&so->so_snd_cv);		/* threads in sendmsg() */
	mutex_enter(&so->so_acceptq_lock);
	cv_broadcast(&so->so_acceptq_cv);	/* threads in accept() */
	mutex_exit(&so->so_acceptq_lock);
	mutex_exit(&so->so_lock);

	/*
	 * The main reason for the rw_tryupgrade call is to provide
	 * observability during the fallback process. We want to
	 * be able to see if there are pending operations.
	 */
	if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
		/*
		 * It is safe to drop and reacquire the fallback lock, because
		 * we are guaranteed that another fallback cannot take place.
		 */
		rw_exit(&so->so_fallback_rwlock);
		DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
		rw_enter(&so->so_fallback_rwlock, RW_WRITER);
		DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
	}

	return (B_TRUE);
}

/*
 * so_end_fallback()
 *
 * Allow socket operations back in.
 *
 * The caller must be a writer on so_fallback_rwlock.
 */
static void
so_end_fallback(struct sonode *so)
{
	ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));

	mutex_enter(&so->so_lock);
	so->so_state &= ~SS_FALLBACK_PENDING;
	mutex_exit(&so->so_lock);

	rw_downgrade(&so->so_fallback_rwlock);
}

/*
 * so_quiesced_cb()
 *
 * Callback passed to the protocol during fallback. It is called once
 * the endpoint is quiescent.
 *
 * No requests from the user, no notifications from the protocol, so it
 * is safe to synchronize the state. Data can also be moved without
 * risk of reordering.
 *
 * NOTE: urgent data is dropped on the floor.
 *
 * We do not need to hold so_lock, since there can be only one thread
 * operating on the sonode.
 */
static void
so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
    struct T_capability_ack *tcap, struct sockaddr *laddr, socklen_t laddrlen,
    struct sockaddr *faddr, socklen_t faddrlen, short opts)
{
	struct sonode *so = (struct sonode *)sock_handle;

	sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);

	mutex_enter(&so->so_lock);
	SOCKET_TIMER_CANCEL(so);
	mutex_exit(&so->so_lock);
	/*
	 * Move data to the STREAM head.
	 */
	if (so->so_rcv_head != NULL) {
		if (so->so_rcv_q_last_head == NULL)
			so->so_rcv_q_head = so->so_rcv_head;
		else
			so->so_rcv_q_last_head->b_next = so->so_rcv_head;
		so->so_rcv_q_last_head = so->so_rcv_last_head;
	}

	while (so->so_rcv_q_head != NULL) {
		mblk_t *mp = so->so_rcv_q_head;
		size_t mlen = msgdsize(mp);

		so->so_rcv_q_head = mp->b_next;
		mp->b_next = NULL;
		mp->b_prev = NULL;
		so->so_rcv_queued -= mlen;
		putnext(q, mp);
	}
	ASSERT(so->so_rcv_queued == 0);
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;

#ifdef DEBUG
	if (so->so_oobmsg != NULL || so->so_oobmark > 0) {
		cmn_err(CE_NOTE, "losing oob data due to tpi fallback\n");
	}
#endif
	if (so->so_oobmsg != NULL) {
		freemsg(so->so_oobmsg);
		so->so_oobmsg = NULL;
	}
	so->so_oobmark = 0;

	ASSERT(so->so_rcv_queued == 0);
}

/*
 * so_tpi_fallback()
 *
 * This is the fallback initiation routine; things start here.
 *
 * Basic strategy:
 *	o Block new socket operations from coming in
 *	o Allocate/initiate info needed by TPI
 *	o Quiesce the connection, at which point we sync
 *	  state and move data
 *	o Change operations (sonodeops) associated with the socket
 *	o Unblock threads waiting for the fallback to finish
 */
int
so_tpi_fallback(struct sonode *so, struct cred *cr)
{
	int error;
	queue_t *q;
	struct sockparams *sp;
	struct sockparams *newsp;
	so_proto_fallback_func_t fbfunc;
	boolean_t direct;

	error = 0;
	sp = so->so_sockparams;
	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;

	/*
	 * Fallback can only happen if there is a device associated
	 * with the sonode, and the socket module has a fallback function.
	 */
	if (!SOCKPARAMS_HAS_DEVICE(sp) || fbfunc == NULL)
		return (EINVAL);

	/*
	 * Initiate fallback; upon success we know that no new requests
	 * will come in from the user.
	 */
	if (!so_start_fallback(so))
		return (EAGAIN);

	newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
	    so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
	    KM_SLEEP, &error);
	if (error != 0)
		goto out;

	if (so->so_direct != NULL) {
		sodirect_t *sodp = so->so_direct;
		mutex_enter(sodp->sod_lockp);

		so->so_direct->sod_state &= ~SOD_ENABLED;
		so->so_state &= ~SS_SODIRECT;
		ASSERT(sodp->sod_uioafh == NULL);
		mutex_exit(sodp->sod_lockp);
	}

	/* Turn sonode into a TPI socket */
	q = sotpi_convert_sonode(so, newsp, &direct, cr);
	if (q == NULL) {
		zcmn_err(getzoneid(), CE_WARN,
		    "Failed to convert socket to TPI. Pid = %d\n",
		    curproc->p_pid);
		SOCKPARAMS_DEC_REF(newsp);
		error = EINVAL;
		goto out;
	}

	/*
	 * Now tell the protocol to start using TPI. so_quiesced_cb will be
	 * called once it's safe to synchronize state.
	 */
	DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
	/* FIXME assumes this cannot fail. TCP can fail to enter squeue */
	(*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
	DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);

	/*
	 * Free all pending connection indications, i.e., socket_accept() has
	 * not yet pulled the connection off the queue. The transport sent
	 * a T_CONN_IND message for each pending connection to the STREAM head.
	 */
	so_acceptq_flush(so);

	mutex_enter(&so->so_lock);
	so->so_state |= SS_FALLBACK_COMP;
	mutex_exit(&so->so_lock);

	/*
	 * Swap the sonode ops. Socket operations that come in once this
	 * is done will proceed without blocking.
	 */
	so->so_ops = &sotpi_sonodeops;

	/*
	 * No longer a non-STREAMS socket
	 */
	so->so_not_str = B_FALSE;
	/*
	 * Wake up any threads stuck in poll. This is needed since the poll
	 * head changes when the fallback happens (moves from the sonode to
	 * the STREAMS head).
	 */
	pollwakeup(&so->so_poll_list, POLLERR);
out:
	so_end_fallback(so);

	return (error);
}