/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/param.h>
#include <sys/signal.h>
#include <sys/cmn_err.h>

#include <sys/stropts.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/atomic.h>
#include <sys/tihdr.h>

#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/socktpi.h>
#include <fs/sockfs/sodirect.h>
#include <sys/ddi.h>
#include <inet/ip.h>
#include <sys/time.h>
#include <sys/cmn_err.h>

#ifdef SOCK_TEST
extern int do_useracc;
extern clock_t sock_test_timelimit;
#endif /* SOCK_TEST */

#define	MBLK_PULL_LEN	64
uint32_t so_mblk_pull_len = MBLK_PULL_LEN;

#ifdef DEBUG
boolean_t so_debug_length = B_FALSE;
static boolean_t so_check_length(sonode_t *so);
#endif

int
so_acceptq_enqueue_locked(struct sonode *so, struct sonode *nso)
{
	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
	ASSERT(nso->so_acceptq_next == NULL);

	*so->so_acceptq_tail = nso;
	so->so_acceptq_tail = &nso->so_acceptq_next;
	so->so_acceptq_len++;

	/* Wakeup a single consumer */
	cv_signal(&so->so_acceptq_cv);

	return (so->so_acceptq_len);
}

/*
 * int so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
 *
 * Enqueue an incoming connection on a listening socket.
 *
 * Arguments:
 *   so	 - listening socket
 *   nso - new connection
 *
 * Returns:
 *   Number of queued connections, including the new connection
 */
int
so_acceptq_enqueue(struct sonode *so, struct sonode *nso)
{
	int conns;

	mutex_enter(&so->so_acceptq_lock);
	conns = so_acceptq_enqueue_locked(so, nso);
	mutex_exit(&so->so_acceptq_lock);

	return (conns);
}
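
/*
 * Illustrative sketch (not part of this file): a protocol's
 * connection-indication upcall might hand a newly created sonode to the
 * listener as follows; `lso' and `nso' are hypothetical names for the
 * listener and the new connection:
 *
 *	int qlen = so_acceptq_enqueue(lso, nso);
 *
 * The returned queue depth can then be compared against whatever
 * backlog limit the caller maintains to decide whether to shed load.
 */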

static int
so_acceptq_dequeue_locked(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
	struct sonode *nso = NULL;

	*nsop = NULL;
	ASSERT(MUTEX_HELD(&so->so_acceptq_lock));
	while ((nso = so->so_acceptq_head) == NULL) {
		/*
		 * No need to check so_error here, because it is not
		 * possible for a listening socket to be reset or otherwise
		 * disconnected.
		 *
		 * So now we just need to check if it's ok to wait.
		 */
		if (dontblock)
			return (EWOULDBLOCK);
		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (cv_wait_sig_swap(&so->so_acceptq_cv,
		    &so->so_acceptq_lock) == 0)
			return (EINTR);
	}

	ASSERT(nso != NULL);
	so->so_acceptq_head = nso->so_acceptq_next;
	nso->so_acceptq_next = NULL;

	if (so->so_acceptq_head == NULL) {
		ASSERT(so->so_acceptq_tail == &nso->so_acceptq_next);
		so->so_acceptq_tail = &so->so_acceptq_head;
	}
	ASSERT(so->so_acceptq_len > 0);
	--so->so_acceptq_len;

	*nsop = nso;

	return (0);
}

/*
 * int so_acceptq_dequeue(struct sonode *, boolean_t, struct sonode **)
 *
 * Pulls a connection off of the accept queue.
 *
 * Arguments:
 *   so	       - listening socket
 *   dontblock - indicate whether it's ok to sleep if there are no
 *		 connections on the queue
 *   nsop      - Value-return argument
 *
 * Return values:
 *   0 when a connection is successfully dequeued, in which case nsop
 *   is set to point to the new connection. Upon failure a non-zero
 *   value is returned, and the value of nsop is set to NULL.
 *
 * Note:
 *   so_acceptq_dequeue() may return prematurely if the socket is falling
 *   back to TPI.
 */
int
so_acceptq_dequeue(struct sonode *so, boolean_t dontblock,
    struct sonode **nsop)
{
	int error;

	mutex_enter(&so->so_acceptq_lock);
	error = so_acceptq_dequeue_locked(so, dontblock, nsop);
	mutex_exit(&so->so_acceptq_lock);

	return (error);
}
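
/*
 * Illustrative sketch (hypothetical caller, not part of this file): an
 * accept-style consumer simply asks for the next connection and maps
 * EWOULDBLOCK/EINTR back to the user:
 *
 *	struct sonode *nso;
 *	int error;
 *
 *	error = so_acceptq_dequeue(lso,
 *	    (fmode & (FNDELAY|FNONBLOCK)) != 0, &nso);
 *	if (error != 0)
 *		return (error);		... nso is NULL here ...
 *	... hand nso to the caller ...
 */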

/*
 * void so_acceptq_flush(struct sonode *so)
 *
 * Removes all pending connections from a listening socket, and
 * frees the associated resources.
 *
 * Arguments:
 *   so - listening socket
 *
 * Return values:
 *   None.
 *
 * Note:
 *   The caller has to ensure that no calls to so_acceptq_enqueue() or
 *   so_acceptq_dequeue() occur while the accept queue is being flushed.
 *   So either the socket needs to be in a state where no operations
 *   would come in, or so_lock needs to be obtained.
 */
void
so_acceptq_flush(struct sonode *so)
{
	struct sonode *nso;

	nso = so->so_acceptq_head;

	while (nso != NULL) {
		struct sonode *nnso = NULL;

		nnso = nso->so_acceptq_next;
		nso->so_acceptq_next = NULL;
		/*
		 * Since the socket is on the accept queue, there can
		 * only be one reference. We drop the reference and
		 * just blow off the socket.
		 */
		ASSERT(nso->so_count == 1);
		nso->so_count--;
		socket_destroy(nso);
		nso = nnso;
	}

	so->so_acceptq_head = NULL;
	so->so_acceptq_tail = &so->so_acceptq_head;
	so->so_acceptq_len = 0;
}

int
so_wait_connected_locked(struct sonode *so, boolean_t nonblock,
    sock_connid_t id)
{
	ASSERT(MUTEX_HELD(&so->so_lock));

	/*
	 * The protocol has notified us that a connection attempt is being
	 * made, so before we wait for a notification to arrive we must
	 * clear out any errors associated with earlier connection attempts.
	 */
	if (so->so_error != 0 && SOCK_CONNID_LT(so->so_proto_connid, id))
		so->so_error = 0;

	while (SOCK_CONNID_LT(so->so_proto_connid, id)) {
		if (nonblock)
			return (EINPROGRESS);

		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (cv_wait_sig_swap(&so->so_state_cv, &so->so_lock) == 0)
			return (EINTR);
	}

	if (so->so_error != 0)
		return (sogeterr(so, B_TRUE));
	/*
	 * Under normal circumstances, so_error should contain an error
	 * in case the connect failed. However, it is possible for another
	 * thread to come in and consume the error, so generate a sensible
	 * error in that case.
	 */
	if ((so->so_state & SS_ISCONNECTED) == 0)
		return (ECONNREFUSED);

	return (0);
}

/*
 * int so_wait_connected(struct sonode *so, boolean_t nonblock,
 *     sock_connid_t id)
 *
 * Wait until the socket is connected or an error has occurred.
 *
 * Arguments:
 *   so	      - socket
 *   nonblock - indicate whether it's ok to sleep if the connection has
 *		not yet been established
 *   id	      - generation number that was returned by the protocol
 *		when the operation was started
 *
 * Returns:
 *   0 if the connection attempt was successful, or an error indicating why
 *   the connection attempt failed.
 */
int
so_wait_connected(struct sonode *so, boolean_t nonblock, sock_connid_t id)
{
	int error;

	mutex_enter(&so->so_lock);
	error = so_wait_connected_locked(so, nonblock, id);
	mutex_exit(&so->so_lock);

	return (error);
}
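
/*
 * Illustrative sketch (hypothetical, for exposition only; the argument
 * plumbing is assumed, not taken from this file): a connect
 * implementation starts the attempt through the protocol's downcall,
 * which hands back a connection id, and then parks here:
 *
 *	sock_connid_t id;
 *
 *	error = (*so->so_downcalls->sd_connect)(so->so_proto_handle,
 *	    name, namelen, &id, cr);
 *	if (error == EINPROGRESS)
 *		error = so_wait_connected(so,
 *		    (so->so_state & SS_NDELAY) != 0, id);
 *
 * The id ties the wait to one specific attempt, so a stale so_error
 * from an earlier attempt is discarded by so_wait_connected_locked().
 */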

int
so_snd_wait_qnotfull_locked(struct sonode *so, boolean_t dontblock)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));
	while (so->so_snd_qfull) {
		if (so->so_state & SS_CANTSENDMORE)
			return (EPIPE);
		if (dontblock)
			return (EWOULDBLOCK);

		if (so->so_state & (SS_CLOSING | SS_FALLBACK_PENDING))
			return (EINTR);

		if (so->so_sndtimeo == 0) {
			/*
			 * Zero means disable timeout.
			 */
			error = cv_wait_sig(&so->so_snd_cv, &so->so_lock);
		} else {
			clock_t now;

			time_to_wait(&now, so->so_sndtimeo);
			error = cv_timedwait_sig(&so->so_snd_cv, &so->so_lock,
			    now);
		}
		if (error == 0)
			return (EINTR);
		else if (error == -1)
			return (EAGAIN);
	}
	return (0);
}

/*
 * int so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
 *
 * Wait for the transport to notify us about send buffers becoming
 * available.
 */
int
so_snd_wait_qnotfull(struct sonode *so, boolean_t dontblock)
{
	int error = 0;

	mutex_enter(&so->so_lock);
	if (so->so_snd_qfull) {
		so->so_snd_wakeup = B_TRUE;
		error = so_snd_wait_qnotfull_locked(so, dontblock);
		so->so_snd_wakeup = B_FALSE;
	}
	mutex_exit(&so->so_lock);

	return (error);
}
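
/*
 * Illustrative sketch (hypothetical send path, not part of this file):
 * a sender checks the queue-full flag before each downcall and blocks
 * until the transport opens the window again:
 *
 *	for (;;) {
 *		if (so->so_snd_qfull) {
 *			error = so_snd_wait_qnotfull(so, dontblock);
 *			if (error != 0)
 *				break;	... EPIPE, EWOULDBLOCK, EINTR ...
 *		}
 *		... pass the next chunk to the protocol ...
 *	}
 *
 * The transport drives the transitions through so_snd_qfull() and
 * so_snd_qnotfull() below.
 */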

void
so_snd_qfull(struct sonode *so)
{
	mutex_enter(&so->so_lock);
	so->so_snd_qfull = B_TRUE;
	mutex_exit(&so->so_lock);
}

void
so_snd_qnotfull(struct sonode *so)
{
	mutex_enter(&so->so_lock);
	so->so_snd_qfull = B_FALSE;
	/* wake up everyone waiting for buffers */
	cv_broadcast(&so->so_snd_cv);
	mutex_exit(&so->so_lock);
}

/*
 * Change the process/process group to which SIGIO is sent.
 */
int
socket_chgpgrp(struct sonode *so, pid_t pid)
{
	int error;

	ASSERT(MUTEX_HELD(&so->so_lock));
	if (pid != 0) {
		/*
		 * Permissions check by sending signal 0.
		 * Note that when kill fails it does a
		 * set_errno causing the system call to fail.
		 */
		error = kill(pid, 0);
		if (error != 0) {
			return (error);
		}
	}
	so->so_pgrp = pid;
	return (0);
}

/*
 * Generate a SIGIO: for 'writable' events include a siginfo structure,
 * for read events just send the signal.
 */
/*ARGSUSED*/
static void
socket_sigproc(proc_t *proc, int event)
{
	k_siginfo_t info;

	ASSERT(event & (SOCKETSIG_WRITE | SOCKETSIG_READ | SOCKETSIG_URG));

	if (event & SOCKETSIG_WRITE) {
		info.si_signo = SIGPOLL;
		info.si_code = POLL_OUT;
		info.si_errno = 0;
		info.si_fd = 0;
		info.si_band = 0;
		sigaddq(proc, NULL, &info, KM_NOSLEEP);
	}
	if (event & SOCKETSIG_READ) {
		sigtoproc(proc, NULL, SIGPOLL);
	}
	if (event & SOCKETSIG_URG) {
		sigtoproc(proc, NULL, SIGURG);
	}
}

void
socket_sendsig(struct sonode *so, int event)
{
	proc_t *proc;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_pgrp == 0 || (!(so->so_state & SS_ASYNC) &&
	    event != SOCKETSIG_URG)) {
		return;
	}

	dprint(3, ("sending sig %d to %d\n", event, so->so_pgrp));

	if (so->so_pgrp > 0) {
		/*
		 * XXX This unfortunately still generates
		 * a signal when a fd is closed but
		 * the proc is active.
		 */
		mutex_enter(&pidlock);
		proc = prfind(so->so_pgrp);
		if (proc == NULL) {
			mutex_exit(&pidlock);
			return;
		}
		mutex_enter(&proc->p_lock);
		mutex_exit(&pidlock);
		socket_sigproc(proc, event);
		mutex_exit(&proc->p_lock);
	} else {
		/*
		 * Send to process group. Hold pidlock across
		 * calls to socket_sigproc().
		 */
		pid_t pgrp = -so->so_pgrp;

		mutex_enter(&pidlock);
		proc = pgfind(pgrp);
		while (proc != NULL) {
			mutex_enter(&proc->p_lock);
			socket_sigproc(proc, event);
			mutex_exit(&proc->p_lock);
			proc = proc->p_pglink;
		}
		mutex_exit(&pidlock);
	}
}
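
/*
 * Illustrative sketch (assumed call sites, for exposition): state
 * transitions map onto socket_sendsig() events roughly like this,
 * always with so_lock held:
 *
 *	mutex_enter(&so->so_lock);
 *	socket_sendsig(so, SOCKETSIG_READ);	... data arrived ...
 *	socket_sendsig(so, SOCKETSIG_WRITE);	... send window opened ...
 *	socket_sendsig(so, SOCKETSIG_URG);	... urgent data pending ...
 *	mutex_exit(&so->so_lock);
 *
 * Only SOCKETSIG_URG is delivered when SS_ASYNC is not set, matching
 * the check at the top of socket_sendsig().
 */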

#define	MIN(a, b) ((a) < (b) ? (a) : (b))
/* Copy userdata into a new mblk_t */
mblk_t *
socopyinuio(uio_t *uiop, ssize_t iosize, size_t wroff, ssize_t maxblk,
    size_t tail_len, int *errorp, cred_t *cr)
{
	mblk_t *head = NULL, **tail = &head;

	ASSERT(iosize == INFPSZ || iosize > 0);

	if (iosize == INFPSZ || iosize > uiop->uio_resid)
		iosize = uiop->uio_resid;

	if (maxblk == INFPSZ)
		maxblk = iosize;

	/* Nothing to do in these cases, so we're done */
	if (iosize < 0 || maxblk < 0 || (maxblk == 0 && iosize > 0))
		goto done;

	/*
	 * We will enter the loop below if iosize is 0; it will allocate an
	 * empty message block and call uiomove(9F) which will just return.
	 * We could avoid that with an extra check but would only slow
	 * down the much more likely case where iosize is larger than 0.
	 */
	do {
		ssize_t blocksize;
		mblk_t *mp;

		blocksize = MIN(iosize, maxblk);
		ASSERT(blocksize >= 0);
		if (is_system_labeled())
			mp = allocb_cred(wroff + blocksize + tail_len,
			    cr, curproc->p_pid);
		else
			mp = allocb(wroff + blocksize + tail_len, BPRI_MED);
		if (mp == NULL) {
			*errorp = ENOMEM;
			return (head);
		}
		mp->b_rptr += wroff;
		mp->b_wptr = mp->b_rptr + blocksize;

		*tail = mp;
		tail = &mp->b_cont;

		/* uiomove(9F) either returns 0 or EFAULT */
		if ((*errorp = uiomove(mp->b_rptr, (size_t)blocksize,
		    UIO_WRITE, uiop)) != 0) {
			ASSERT(*errorp != ENOMEM);
			freemsg(head);
			return (NULL);
		}

		iosize -= blocksize;
	} while (iosize > 0);

done:
	*errorp = 0;
	return (head);
}
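
/*
 * Illustrative sketch (hypothetical sender, parameters assumed): a
 * send path would shape user data into mblks according to the
 * protocol's advertised properties before handing them down:
 *
 *	int error;
 *	mblk_t *mp = socopyinuio(uiop, uiop->uio_resid,
 *	    so->so_proto_props.sopp_wroff, so->so_proto_props.sopp_maxblk,
 *	    so->so_proto_props.sopp_tail, &error, cr);
 *	if (mp == NULL)
 *		return (error);
 *
 * wroff and tail_len reserve headroom/tailroom so the transport can
 * prepend headers or append trailers without another allocation.
 */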

mblk_t *
socopyoutuio(mblk_t *mp, struct uio *uiop, ssize_t max_read, int *errorp)
{
	int error;
	ptrdiff_t n;
	mblk_t *nmp;

	ASSERT(mp->b_wptr >= mp->b_rptr);

	/*
	 * max_read is the offset of the oobmark and the read cannot go
	 * past the oobmark.
	 */
	if (max_read == INFPSZ || max_read > uiop->uio_resid)
		max_read = uiop->uio_resid;

	do {
		if ((n = MIN(max_read, MBLKL(mp))) != 0) {
			ASSERT(n > 0);

			error = uiomove(mp->b_rptr, n, UIO_READ, uiop);
			if (error != 0) {
				freemsg(mp);
				*errorp = error;
				return (NULL);
			}
		}

		mp->b_rptr += n;
		max_read -= n;
		while (mp != NULL && (mp->b_rptr >= mp->b_wptr)) {
			/*
			 * get rid of zero length mblks
			 */
			nmp = mp;
			mp = mp->b_cont;
			freeb(nmp);
		}
	} while (mp != NULL && max_read > 0);

	*errorp = 0;
	return (mp);
}
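
/*
 * Illustrative sketch (hypothetical caller): draining one message into
 * a uio while respecting an urgent-data mark at offset `oobmark':
 *
 *	mp = socopyoutuio(mp, uiop, oobmark == 0 ? INFPSZ : oobmark,
 *	    &error);
 *
 * A non-NULL return is the unread remainder of the chain (to be put
 * back on the queue); NULL with error == 0 means the whole chain was
 * consumed. This is how so_dequeue_msg() below uses it.
 */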

static void
so_prepend_msg(struct sonode *so, mblk_t *mp, mblk_t *last_tail)
{
	ASSERT(last_tail != NULL);
	mp->b_next = so->so_rcv_q_head;
	mp->b_prev = last_tail;
	ASSERT(!(DB_FLAGS(mp) & DBLK_UIOA));

	if (so->so_rcv_q_head == NULL) {
		ASSERT(so->so_rcv_q_last_head == NULL);
		so->so_rcv_q_last_head = mp;
#ifdef DEBUG
	} else {
		ASSERT(!(DB_FLAGS(so->so_rcv_q_head) & DBLK_UIOA));
#endif
	}
	so->so_rcv_q_head = mp;

#ifdef DEBUG
	if (so_debug_length) {
		mutex_enter(&so->so_lock);
		ASSERT(so_check_length(so));
		mutex_exit(&so->so_lock);
	}
#endif
}
601
Anders Perssone4b767e2009-03-26 17:08:33 -0700602/*
603 * Move a mblk chain (mp_head, mp_last_head) to the sonode's rcv queue so it
604 * can be processed by so_dequeue_msg().
605 */
606void
607so_process_new_message(struct sonode *so, mblk_t *mp_head, mblk_t *mp_last_head)
Yu Xiangning0f1702c2008-12-11 20:04:13 -0800608{
609 ASSERT(mp_head->b_prev != NULL);
610 if (so->so_rcv_q_head == NULL) {
611 so->so_rcv_q_head = mp_head;
612 so->so_rcv_q_last_head = mp_last_head;
613 ASSERT(so->so_rcv_q_last_head->b_prev != NULL);
614 } else {
615 boolean_t flag_equal = ((DB_FLAGS(mp_head) & DBLK_UIOA) ==
616 (DB_FLAGS(so->so_rcv_q_last_head) & DBLK_UIOA));
617
618 if (mp_head->b_next == NULL &&
619 DB_TYPE(mp_head) == M_DATA &&
620 DB_TYPE(so->so_rcv_q_last_head) == M_DATA && flag_equal) {
621 so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
622 so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
623 mp_head->b_prev = NULL;
624 } else if (flag_equal && (DB_FLAGS(mp_head) & DBLK_UIOA)) {
625 /*
626 * Append to last_head if more than one mblks, and both
627 * mp_head and last_head are I/OAT mblks.
628 */
629 ASSERT(mp_head->b_next != NULL);
630 so->so_rcv_q_last_head->b_prev->b_cont = mp_head;
631 so->so_rcv_q_last_head->b_prev = mp_head->b_prev;
632 mp_head->b_prev = NULL;
633
634 so->so_rcv_q_last_head->b_next = mp_head->b_next;
635 mp_head->b_next = NULL;
636 so->so_rcv_q_last_head = mp_last_head;
637 } else {
638#ifdef DEBUG
639 {
640 mblk_t *tmp_mblk;
641 tmp_mblk = mp_head;
642 while (tmp_mblk != NULL) {
643 ASSERT(tmp_mblk->b_prev != NULL);
644 tmp_mblk = tmp_mblk->b_next;
645 }
646 }
647#endif
648 so->so_rcv_q_last_head->b_next = mp_head;
649 so->so_rcv_q_last_head = mp_last_head;
650 }
651 }
652}

int
so_dequeue_msg(struct sonode *so, mblk_t **mctlp, struct uio *uiop,
    rval_t *rvalp, int flags)
{
	mblk_t *mp, *nmp;
	mblk_t *savemp, *savemptail;
	mblk_t *new_msg_head;
	mblk_t *new_msg_last_head;
	mblk_t *last_tail;
	boolean_t partial_read;
	boolean_t reset_atmark = B_FALSE;
	int more = 0;
	int error;
	ssize_t oobmark;
	sodirect_t *sodp = so->so_direct;

	partial_read = B_FALSE;
	*mctlp = NULL;
again:
	mutex_enter(&so->so_lock);
again1:
#ifdef DEBUG
	if (so_debug_length) {
		ASSERT(so_check_length(so));
	}
#endif
	/*
	 * First move messages from the dump area to the processing area
	 */
	if (sodp != NULL) {
		if (sodp->sod_enabled) {
			if (sodp->sod_uioa.uioa_state & UIOA_ALLOC) {
				/* nothing to uioamove */
				sodp = NULL;
			} else if (sodp->sod_uioa.uioa_state & UIOA_INIT) {
				sodp->sod_uioa.uioa_state &= UIOA_CLR;
				sodp->sod_uioa.uioa_state |= UIOA_ENABLED;
				/*
				 * try to uioamove() the data that
				 * has already been queued.
				 */
				sod_uioa_so_init(so, sodp, uiop);
			}
		} else {
			sodp = NULL;
		}
	}
	new_msg_head = so->so_rcv_head;
	new_msg_last_head = so->so_rcv_last_head;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
	oobmark = so->so_oobmark;
	/*
	 * We can release the lock as there can only be one reader
	 */
	mutex_exit(&so->so_lock);

	if (so->so_state & SS_RCVATMARK) {
		reset_atmark = B_TRUE;
	}
	if (new_msg_head != NULL) {
		so_process_new_message(so, new_msg_head, new_msg_last_head);
	}
	savemp = savemptail = NULL;
	rvalp->r_val1 = 0;
	error = 0;
	mp = so->so_rcv_q_head;

	if (mp != NULL &&
	    (so->so_rcv_timer_tid == 0 ||
	    so->so_rcv_queued >= so->so_rcv_thresh)) {
		partial_read = B_FALSE;

		if (flags & MSG_PEEK) {
			if ((nmp = dupmsg(mp)) == NULL &&
			    (nmp = copymsg(mp)) == NULL) {
				size_t size = msgsize(mp);

				error = strwaitbuf(size, BPRI_HI);
				if (error) {
					return (error);
				}
				goto again;
			}
			mp = nmp;
		} else {
			ASSERT(mp->b_prev != NULL);
			last_tail = mp->b_prev;
			mp->b_prev = NULL;
			so->so_rcv_q_head = mp->b_next;
			if (so->so_rcv_q_head == NULL) {
				so->so_rcv_q_last_head = NULL;
			}
			mp->b_next = NULL;
		}

		ASSERT(mctlp != NULL);
		/*
		 * First process PROTO or PCPROTO blocks, if any.
		 */
		if (DB_TYPE(mp) != M_DATA) {
			*mctlp = mp;
			savemp = mp;
			savemptail = mp;
			ASSERT(DB_TYPE(mp) == M_PROTO ||
			    DB_TYPE(mp) == M_PCPROTO);
			while (mp->b_cont != NULL &&
			    DB_TYPE(mp->b_cont) != M_DATA) {
				ASSERT(DB_TYPE(mp->b_cont) == M_PROTO ||
				    DB_TYPE(mp->b_cont) == M_PCPROTO);
				mp = mp->b_cont;
				savemptail = mp;
			}
			mp = savemptail->b_cont;
			savemptail->b_cont = NULL;
		}

		ASSERT(DB_TYPE(mp) == M_DATA);
		/*
		 * Now process DATA blocks, if any. Note that for a sodirect
		 * enabled socket, uio_resid can be 0.
		 */
		if (uiop->uio_resid >= 0) {
			ssize_t copied = 0;

			if (sodp != NULL && (DB_FLAGS(mp) & DBLK_UIOA)) {
				mutex_enter(&so->so_lock);
				ASSERT(uiop == (uio_t *)&sodp->sod_uioa);
				copied = sod_uioa_mblk(so, mp);
				if (copied > 0)
					partial_read = B_TRUE;
				mutex_exit(&so->so_lock);
				/* mark this mblk as processed */
				mp = NULL;
			} else {
				ssize_t oldresid = uiop->uio_resid;

				if (MBLKL(mp) < so_mblk_pull_len) {
					if (pullupmsg(mp, -1) == 1) {
						last_tail = mp;
					}
				}
				/*
				 * Cannot read beyond the oobmark
				 */
				mp = socopyoutuio(mp, uiop,
				    oobmark == 0 ? INFPSZ : oobmark, &error);
				if (error != 0) {
					freemsg(*mctlp);
					*mctlp = NULL;
					more = 0;
					goto done;
				}
				ASSERT(oldresid >= uiop->uio_resid);
				copied = oldresid - uiop->uio_resid;
				if (oldresid > uiop->uio_resid)
					partial_read = B_TRUE;
			}
			ASSERT(copied >= 0);
			if (copied > 0 && !(flags & MSG_PEEK)) {
				mutex_enter(&so->so_lock);
				so->so_rcv_queued -= copied;
				ASSERT(so->so_oobmark >= 0);
				if (so->so_oobmark > 0) {
					so->so_oobmark -= copied;
					ASSERT(so->so_oobmark >= 0);
					if (so->so_oobmark == 0) {
						ASSERT(so->so_state &
						    SS_OOBPEND);
						so->so_oobmark = 0;
						so->so_state |= SS_RCVATMARK;
					}
				}
				if (so->so_flowctrld && so->so_rcv_queued <
				    so->so_rcvlowat) {
					so->so_flowctrld = B_FALSE;
					mutex_exit(&so->so_lock);
					/*
					 * Open up flow control. SCTP does
					 * not have any downcalls, and it will
					 * clr flow ctrl in sosctp_recvmsg().
					 */
					if (so->so_downcalls != NULL &&
					    so->so_downcalls->sd_clr_flowctrl !=
					    NULL) {
						(*so->so_downcalls->
						    sd_clr_flowctrl)
						    (so->so_proto_handle);
					}
				} else {
					mutex_exit(&so->so_lock);
				}
			}
		}
		if (mp != NULL) { /* more data blocks in msg */
			more |= MOREDATA;
			if ((flags & (MSG_PEEK|MSG_TRUNC))) {
				if (flags & MSG_TRUNC &&
				    ((flags & MSG_PEEK) == 0)) {
					mutex_enter(&so->so_lock);
					so->so_rcv_queued -= msgdsize(mp);
					mutex_exit(&so->so_lock);
				}
				freemsg(mp);
			} else if (partial_read && !somsghasdata(mp)) {
				/*
				 * Avoid queuing a zero-length tail part of
				 * a message. partial_read == 1 indicates that
				 * we read some of the message.
				 */
				freemsg(mp);
				more &= ~MOREDATA;
			} else {
				if (savemp != NULL &&
				    (flags & MSG_DUPCTRL)) {
					mblk_t *nmp;
					/*
					 * There should only be non data mblks
					 */
					ASSERT(DB_TYPE(savemp) != M_DATA &&
					    DB_TYPE(savemptail) != M_DATA);
try_again:
					if ((nmp = dupmsg(savemp)) == NULL &&
					    (nmp = copymsg(savemp)) == NULL) {

						size_t size = msgsize(savemp);

						error = strwaitbuf(size,
						    BPRI_HI);
						if (error != 0) {
							/*
							 * In case we
							 * cannot copy
							 * control data
							 * free the remaining
							 * data.
							 */
							freemsg(mp);
							goto done;
						}
						goto try_again;
					}

					ASSERT(nmp != NULL);
					ASSERT(DB_TYPE(nmp) != M_DATA);
					savemptail->b_cont = mp;
					*mctlp = nmp;
					mp = savemp;
				}
				/*
				 * putback mp
				 */
				so_prepend_msg(so, mp, last_tail);
			}
		}

		/* fast check so_rcv_head if there is more data */
		if (partial_read && !(so->so_state & SS_RCVATMARK) &&
		    *mctlp == NULL && uiop->uio_resid > 0 &&
		    !(flags & MSG_PEEK) && so->so_rcv_head != NULL) {
			goto again;
		}
	} else if (!partial_read) {
		mutex_enter(&so->so_lock);
		if (so->so_error != 0) {
			error = sogeterr(so, !(flags & MSG_PEEK));
			mutex_exit(&so->so_lock);
			return (error);
		}
		/*
		 * No pending data. Return right away for a nonblocking
		 * socket, otherwise sleep waiting for data.
		 */
		if (!(so->so_state & SS_CANTRCVMORE) && uiop->uio_resid > 0) {
			if ((uiop->uio_fmode & (FNDELAY|FNONBLOCK)) ||
			    (flags & MSG_DONTWAIT)) {
				error = EWOULDBLOCK;
			} else {
				if (so->so_state & (SS_CLOSING |
				    SS_FALLBACK_PENDING)) {
					mutex_exit(&so->so_lock);
					error = EINTR;
					goto done;
				}

				if (so->so_rcv_head != NULL) {
					goto again1;
				}
				so->so_rcv_wakeup = B_TRUE;
				so->so_rcv_wanted = uiop->uio_resid;
				if (so->so_rcvtimeo == 0) {
					/*
					 * Zero means disable timeout.
					 */
					error = cv_wait_sig(&so->so_rcv_cv,
					    &so->so_lock);
				} else {
					clock_t now;
					time_to_wait(&now, so->so_rcvtimeo);
					error = cv_timedwait_sig(&so->so_rcv_cv,
					    &so->so_lock, now);
				}
				so->so_rcv_wakeup = B_FALSE;
				so->so_rcv_wanted = 0;

				if (error == 0) {
					error = EINTR;
				} else if (error == -1) {
					error = EAGAIN;
				} else {
					goto again1;
				}
			}
		}
		mutex_exit(&so->so_lock);
	}
	if (reset_atmark && partial_read && !(flags & MSG_PEEK)) {
		/*
		 * We are past the mark; update the state.
		 * 4.3BSD and 4.4BSD clear the mark when peeking across it.
		 * The draft Posix socket spec states that the mark should
		 * not be cleared when peeking. We follow the latter.
		 */
		mutex_enter(&so->so_lock);
		ASSERT(so_verify_oobstate(so));
		so->so_state &= ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_RCVATMARK);
		freemsg(so->so_oobmsg);
		so->so_oobmsg = NULL;
		ASSERT(so_verify_oobstate(so));
		mutex_exit(&so->so_lock);
	}
	ASSERT(so->so_rcv_wakeup == B_FALSE);
done:
	if (sodp != NULL) {
		mutex_enter(&so->so_lock);
		if (sodp->sod_enabled &&
		    (sodp->sod_uioa.uioa_state & UIOA_ENABLED)) {
			SOD_UIOAFINI(sodp);
			if (sodp->sod_uioa.uioa_mbytes > 0) {
				ASSERT(so->so_rcv_q_head != NULL ||
				    so->so_rcv_head != NULL);
				so->so_rcv_queued -= sod_uioa_mblk(so, NULL);
				if (error == EWOULDBLOCK)
					error = 0;
			}
		}
		mutex_exit(&so->so_lock);
	}
#ifdef DEBUG
	if (so_debug_length) {
		mutex_enter(&so->so_lock);
		ASSERT(so_check_length(so));
		mutex_exit(&so->so_lock);
	}
#endif
	rvalp->r_val1 = more;
	return (error);
}
1012
Anders Perssone4b767e2009-03-26 17:08:33 -07001013/*
1014 * Enqueue data from the protocol on the socket's rcv queue.
1015 *
1016 * We try to hook new M_DATA mblks onto an existing chain, however,
1017 * that cannot be done if the existing chain has already been
1018 * processed by I/OAT. Non-M_DATA mblks are just linked together via
1019 * b_next. In all cases the b_prev of the enqueued mblk is set to
1020 * point to the last mblk in its b_cont chain.
1021 */
Yu Xiangning0f1702c2008-12-11 20:04:13 -08001022void
1023so_enqueue_msg(struct sonode *so, mblk_t *mp, size_t msg_size)
1024{
1025 ASSERT(MUTEX_HELD(&so->so_lock));
1026
1027#ifdef DEBUG
1028 if (so_debug_length) {
1029 ASSERT(so_check_length(so));
1030 }
1031#endif
1032 so->so_rcv_queued += msg_size;
1033
1034 if (so->so_rcv_head == NULL) {
1035 ASSERT(so->so_rcv_last_head == NULL);
1036 so->so_rcv_head = mp;
1037 so->so_rcv_last_head = mp;
1038 } else if ((DB_TYPE(mp) == M_DATA &&
1039 DB_TYPE(so->so_rcv_last_head) == M_DATA) &&
1040 ((DB_FLAGS(mp) & DBLK_UIOA) ==
1041 (DB_FLAGS(so->so_rcv_last_head) & DBLK_UIOA))) {
1042 /* Added to the end */
1043 ASSERT(so->so_rcv_last_head != NULL);
1044 ASSERT(so->so_rcv_last_head->b_prev != NULL);
1045 so->so_rcv_last_head->b_prev->b_cont = mp;
1046 } else {
1047 /* Start a new end */
1048 so->so_rcv_last_head->b_next = mp;
1049 so->so_rcv_last_head = mp;
1050 }
1051 while (mp->b_cont != NULL)
1052 mp = mp->b_cont;
1053
1054 so->so_rcv_last_head->b_prev = mp;
1055#ifdef DEBUG
1056 if (so_debug_length) {
1057 ASSERT(so_check_length(so));
1058 }
1059#endif
1060}
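
/*
 * Illustrative sketch (assumed upcall shape, not from this file): a
 * protocol's data-delivery upcall would typically take so_lock,
 * enqueue, and then wake the reader:
 *
 *	mutex_enter(&so->so_lock);
 *	so_enqueue_msg(so, mp, msg_size);
 *	cv_signal(&so->so_rcv_cv);
 *	mutex_exit(&so->so_lock);
 *
 * The real receive-side upcalls also do flow-control accounting and
 * may defer the wakeup via the receive timer; this is only a skeleton.
 */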

/*
 * Return B_TRUE if there is data in the message, B_FALSE otherwise.
 */
boolean_t
somsghasdata(mblk_t *mp)
{
	for (; mp; mp = mp->b_cont)
		if (mp->b_datap->db_type == M_DATA) {
			ASSERT(mp->b_wptr >= mp->b_rptr);
			if (mp->b_wptr > mp->b_rptr)
				return (B_TRUE);
		}
	return (B_FALSE);
}

/*
 * Flush the read side of sockfs.
 *
 * The caller must be sure that a reader is not already active when the
 * buffer is being flushed.
 */
void
so_rcv_flush(struct sonode *so)
{
	mblk_t *mp;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (so->so_oobmsg != NULL) {
		freemsg(so->so_oobmsg);
		so->so_oobmsg = NULL;
		so->so_oobmark = 0;
		so->so_state &=
		    ~(SS_OOBPEND|SS_HAVEOOBDATA|SS_HADOOBDATA|SS_RCVATMARK);
	}

	/*
	 * Free messages sitting in the recv queues
	 */
	while (so->so_rcv_q_head != NULL) {
		mp = so->so_rcv_q_head;
		so->so_rcv_q_head = mp->b_next;
		mp->b_next = mp->b_prev = NULL;
		freemsg(mp);
	}
	while (so->so_rcv_head != NULL) {
		mp = so->so_rcv_head;
		so->so_rcv_head = mp->b_next;
		mp->b_next = mp->b_prev = NULL;
		freemsg(mp);
	}
	so->so_rcv_queued = 0;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
}

/*
 * Handle recv* calls that set MSG_OOB or MSG_OOB together with MSG_PEEK.
 */
int
sorecvoob(struct sonode *so, struct nmsghdr *msg, struct uio *uiop, int flags,
    boolean_t oob_inline)
{
	mblk_t *mp, *nmp;
	int error;

	dprintso(so, 1, ("sorecvoob(%p, %p, 0x%x)\n", (void *)so, (void *)msg,
	    flags));

	if (msg != NULL) {
		/*
		 * There is never any oob data with addresses or control since
		 * the T_EXDATA_IND does not carry any options.
		 */
		msg->msg_controllen = 0;
		msg->msg_namelen = 0;
		msg->msg_flags = 0;
	}

	mutex_enter(&so->so_lock);
	ASSERT(so_verify_oobstate(so));
	if (oob_inline ||
	    (so->so_state & (SS_OOBPEND|SS_HADOOBDATA)) != SS_OOBPEND) {
		dprintso(so, 1, ("sorecvoob: inline or data consumed\n"));
		mutex_exit(&so->so_lock);
		return (EINVAL);
	}
	if (!(so->so_state & SS_HAVEOOBDATA)) {
		dprintso(so, 1, ("sorecvoob: no data yet\n"));
		mutex_exit(&so->so_lock);
		return (EWOULDBLOCK);
	}
	ASSERT(so->so_oobmsg != NULL);
	mp = so->so_oobmsg;
	if (flags & MSG_PEEK) {
		/*
		 * Since recv* cannot return ENOBUFS we cannot use dupmsg.
		 * Instead we revert to the consolidation private
		 * allocb_wait plus bcopy.
		 */
		mblk_t *mp1;

		mp1 = allocb_wait(msgdsize(mp), BPRI_MED, STR_NOSIG, NULL);
		ASSERT(mp1);

		while (mp != NULL) {
			ssize_t size;

			size = MBLKL(mp);
			bcopy(mp->b_rptr, mp1->b_wptr, size);
			mp1->b_wptr += size;
			ASSERT(mp1->b_wptr <= mp1->b_datap->db_lim);
			mp = mp->b_cont;
		}
		mp = mp1;
	} else {
		/*
		 * Update the state indicating that the data has been consumed.
		 * Keep SS_OOBPEND set until data is consumed past the mark.
		 */
		so->so_oobmsg = NULL;
		so->so_state ^= SS_HAVEOOBDATA|SS_HADOOBDATA;
	}
	ASSERT(so_verify_oobstate(so));
	mutex_exit(&so->so_lock);

	error = 0;
	nmp = mp;
	while (nmp != NULL && uiop->uio_resid > 0) {
		ssize_t n = MBLKL(nmp);

		n = MIN(n, uiop->uio_resid);
		if (n > 0)
			error = uiomove(nmp->b_rptr, n,
			    UIO_READ, uiop);
		if (error)
			break;
		nmp = nmp->b_cont;
	}
	ASSERT(mp->b_next == NULL && mp->b_prev == NULL);
	freemsg(mp);
	return (error);
}
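
/*
 * Illustrative sketch (user-level view, not kernel code): the path
 * above is reached by a receive call such as
 *
 *	char c;
 *	ssize_t n = recv(sock, &c, 1, MSG_OOB);
 *
 * which fails with EINVAL if SO_OOBINLINE is set (oob_inline above)
 * and with EWOULDBLOCK if the urgent byte has not yet arrived.
 */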

/*
 * Allocate and initialize a new sonode
 */
/* ARGSUSED */
struct sonode *
socket_sonode_create(struct sockparams *sp, int family, int type,
    int protocol, int version, int sflags, int *errorp, struct cred *cr)
{
	sonode_t *so;
	int kmflags;

	/*
	 * Choose the right set of sonodeops based on the upcall and
	 * down call version that the protocol has provided
	 */
	if (SOCK_UC_VERSION != sp->sp_smod_info->smod_uc_version ||
	    SOCK_DC_VERSION != sp->sp_smod_info->smod_dc_version) {
		/*
		 * mismatch
		 */
#ifdef DEBUG
		cmn_err(CE_CONT, "protocol and socket module version mismatch");
#endif
		*errorp = EINVAL;
		return (NULL);
	}

	kmflags = (sflags & SOCKET_NOSLEEP) ? KM_NOSLEEP : KM_SLEEP;

	so = kmem_cache_alloc(socket_cache, kmflags);
	if (so == NULL) {
		*errorp = ENOMEM;
		return (NULL);
	}

	sonode_init(so, sp, family, type, protocol, &so_sonodeops);

	if (version == SOV_DEFAULT)
		version = so_default_version;

	so->so_version = (short)version;

	/*
	 * set the default values to be INFPSZ
	 * if a protocol desires it can change the value later
	 */
	so->so_proto_props.sopp_rxhiwat = SOCKET_RECVHIWATER;
	so->so_proto_props.sopp_rxlowat = SOCKET_RECVLOWATER;
	so->so_proto_props.sopp_maxpsz = INFPSZ;
	so->so_proto_props.sopp_maxblk = INFPSZ;

	return (so);
}

int
socket_init_common(struct sonode *so, struct sonode *pso, int flags, cred_t *cr)
{
	int error = 0;

	if (pso != NULL) {
		/*
		 * We have a passive open, so inherit basic state from
		 * the parent (listener).
		 *
		 * No need to grab the new sonode's lock, since there is no
		 * one that can have a reference to it.
		 */
		mutex_enter(&pso->so_lock);

		so->so_state |= SS_ISCONNECTED | (pso->so_state & SS_ASYNC);
		so->so_pgrp = pso->so_pgrp;
		so->so_rcvtimeo = pso->so_rcvtimeo;
		so->so_sndtimeo = pso->so_sndtimeo;
		so->so_xpg_rcvbuf = pso->so_xpg_rcvbuf;
		/*
		 * Make note of the socket level options. TCP and IP level
		 * options are already inherited. We could do all this after
		 * accept is successful but doing it here simplifies code and
		 * no harm done for error case.
		 */
		so->so_options = pso->so_options & (SO_DEBUG|SO_REUSEADDR|
		    SO_KEEPALIVE|SO_DONTROUTE|SO_BROADCAST|SO_USELOOPBACK|
		    SO_OOBINLINE|SO_DGRAM_ERRIND|SO_LINGER);
		so->so_proto_props = pso->so_proto_props;
		so->so_mode = pso->so_mode;
		so->so_pollev = pso->so_pollev & SO_POLLEV_ALWAYS;

		mutex_exit(&pso->so_lock);
	} else {
		struct sockparams *sp = so->so_sockparams;
		sock_upcalls_t *upcalls_to_use;

		/*
		 * Based on the version number select the right upcalls to
		 * pass down. Currently we only have one version so choose
		 * default
		 */
		upcalls_to_use = &so_upcalls;

		/* active open, so create a lower handle */
		so->so_proto_handle =
		    sp->sp_smod_info->smod_proto_create_func(so->so_family,
		    so->so_type, so->so_protocol, &so->so_downcalls,
		    &so->so_mode, &error, flags, cr);

		if (so->so_proto_handle == NULL) {
			ASSERT(error != 0);
			/*
			 * To be safe; if a lower handle cannot be created, and
			 * the proto does not give a reason why, assume there
			 * was a lack of memory.
			 */
			return ((error == 0) ? ENOMEM : error);
		}
		ASSERT(so->so_downcalls != NULL);
		ASSERT(so->so_downcalls->sd_send != NULL ||
		    so->so_downcalls->sd_send_uio != NULL);
		if (so->so_downcalls->sd_recv_uio != NULL) {
			ASSERT(so->so_downcalls->sd_poll != NULL);
			so->so_pollev |= SO_POLLEV_ALWAYS;
		}

		(*so->so_downcalls->sd_activate)(so->so_proto_handle,
		    (sock_upper_handle_t)so, upcalls_to_use, 0, cr);

		/* Wildcard */

		/*
		 * FIXME No need for this, the protocol can deal with it in
		 * sd_create(). Should update ICMP.
		 */
		if (so->so_protocol != so->so_sockparams->sp_protocol) {
			int protocol = so->so_protocol;
			int error;
			/*
			 * Issue SO_PROTOTYPE setsockopt.
			 */
			error = socket_setsockopt(so, SOL_SOCKET, SO_PROTOTYPE,
			    &protocol, (t_uscalar_t)sizeof (protocol), cr);
			if (error) {
				(void) (*so->so_downcalls->sd_close)
				    (so->so_proto_handle, 0, cr);

				mutex_enter(&so->so_lock);
				so_rcv_flush(so);
				mutex_exit(&so->so_lock);
				/*
				 * Setsockopt often fails with ENOPROTOOPT but
				 * socket() should fail with
				 * EPROTONOSUPPORT/EPROTOTYPE.
				 */
				return (EPROTONOSUPPORT);
			}
		}
	}

	if (uioasync.enabled)
		sod_sock_init(so);

	return (0);
}

/*
 * int socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
 *     struct cred *cr, int32_t *rvalp)
 *
 * Handle ioctls that manipulate basic socket state; non-blocking,
 * async, etc.
 *
 * Returns:
 *   < 0  - ioctl was not handled
 *   >= 0 - ioctl was handled; if > 0, then it is an errno
 *
 * Notes:
 *   Assumes the standard receive buffer is used to obtain info for
 *   NREAD.
 */
/* ARGSUSED */
int
socket_ioctl_common(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	switch (cmd) {
	case SIOCSQPTR:
		/*
		 * SIOCSQPTR is valid only when helper stream is created
		 * by the protocol.
		 */

		return (EOPNOTSUPP);
	case FIONBIO: {
		int32_t value;

		if (so_copyin((void *)arg, &value, sizeof (int32_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		if (value) {
			so->so_state |= SS_NDELAY;
		} else {
			so->so_state &= ~SS_NDELAY;
		}
		mutex_exit(&so->so_lock);
		return (0);
	}
	case FIOASYNC: {
		int32_t value;

		if (so_copyin((void *)arg, &value, sizeof (int32_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);

		if (value) {
			/* Turn on SIGIO */
			so->so_state |= SS_ASYNC;
		} else {
			/* Turn off SIGIO */
			so->so_state &= ~SS_ASYNC;
		}
		mutex_exit(&so->so_lock);

		return (0);
	}

	case SIOCSPGRP:
	case FIOSETOWN: {
		int error;
		pid_t pid;

		if (so_copyin((void *)arg, &pid, sizeof (pid_t),
		    (mode & (int)FKIOCTL)))
			return (EFAULT);

		mutex_enter(&so->so_lock);
		error = (pid != so->so_pgrp) ? socket_chgpgrp(so, pid) : 0;
		mutex_exit(&so->so_lock);
		return (error);
	}
	case SIOCGPGRP:
	case FIOGETOWN:
		if (so_copyout(&so->so_pgrp, (void *)arg,
		    sizeof (pid_t), (mode & (int)FKIOCTL)))
			return (EFAULT);

		return (0);
	case SIOCATMARK: {
		int retval;

		/*
		 * Only protocols that support urgent data can handle ATMARK.
		 */
		if ((so->so_mode & SM_EXDATA) == 0)
			return (EINVAL);

		/*
		 * If the protocol is maintaining its own buffer, then the
		 * request must be passed down.
		 */
		if (so->so_downcalls->sd_recv_uio != NULL)
			return (-1);

		retval = (so->so_state & SS_RCVATMARK) != 0;

		if (so_copyout(&retval, (void *)arg, sizeof (int),
		    (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	}

	case FIONREAD: {
		int retval;

		/*
		 * If the protocol is maintaining its own buffer, then the
		 * request must be passed down.
		 */
		if (so->so_downcalls->sd_recv_uio != NULL)
			return (-1);

		retval = MIN(so->so_rcv_queued, INT_MAX);

		if (so_copyout(&retval, (void *)arg,
		    sizeof (retval), (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	}

	case _I_GETPEERCRED: {
		int error = 0;

		if ((mode & FKIOCTL) == 0)
			return (EINVAL);

		mutex_enter(&so->so_lock);
		if ((so->so_mode & SM_CONNREQUIRED) == 0) {
			error = ENOTSUP;
		} else if ((so->so_state & SS_ISCONNECTED) == 0) {
			error = ENOTCONN;
		} else if (so->so_peercred != NULL) {
			k_peercred_t *kp = (k_peercred_t *)arg;
			kp->pc_cr = so->so_peercred;
			kp->pc_cpid = so->so_cpid;
			crhold(so->so_peercred);
		} else {
			error = EINVAL;
		}
		mutex_exit(&so->so_lock);
		return (error);
	}
	default:
		return (-1);
	}
}
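
/*
 * Illustrative sketch (user-level view): the FIONBIO, FIOASYNC and
 * FIOSETOWN cases above correspond to application code along the
 * lines of
 *
 *	int on = 1;
 *	(void) ioctl(sock, FIONBIO, &on);	... sets SS_NDELAY ...
 *	(void) ioctl(sock, FIOASYNC, &on);	... sets SS_ASYNC ...
 *	(void) ioctl(sock, FIOSETOWN, &pid);	... SIGIO recipient ...
 *
 * so_copyin()/so_copyout() handle both user addresses and, with
 * FKIOCTL, kernel callers.
 */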

/*
 * Handle the I_NREAD STREAM ioctl.
 */
static int
so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp)
{
	size_t size = 0;
	int retval;
	int count = 0;
	mblk_t *mp;

	if (so->so_downcalls == NULL ||
	    so->so_downcalls->sd_recv_uio != NULL)
		return (EINVAL);

	mutex_enter(&so->so_lock);
	/* Wait for reader to get out of the way. */
	while (so->so_flag & SOREADLOCKED) {
		/*
		 * If reader is waiting for data, then there should be nothing
		 * on the rcv queue.
		 */
		if (so->so_rcv_wakeup)
			goto out;

		so->so_flag |= SOWANT;
		/* Do a timed sleep, in case the reader goes to sleep. */
		(void) cv_timedwait(&so->so_state_cv, &so->so_lock,
		    lbolt + drv_usectohz(10));
	}

	/*
	 * Since we are holding so_lock no new reader will come in, and the
	 * protocol will not be able to enqueue data. So it's safe to walk
	 * both rcv queues.
	 */
	mp = so->so_rcv_q_head;
	if (mp != NULL) {
		size = msgdsize(so->so_rcv_q_head);
		for (; mp != NULL; mp = mp->b_next)
			count++;
	} else {
		/*
		 * In case the processing list was empty, get the size of the
		 * next msg in line.
		 */
		size = msgdsize(so->so_rcv_head);
	}

	for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
		count++;
out:
	mutex_exit(&so->so_lock);

	/*
	 * Drop down from size_t to the "int" required by the
	 * interface. Cap at INT_MAX.
	 */
	retval = MIN(size, INT_MAX);
	if (so_copyout(&retval, (void *)arg, sizeof (retval),
	    (mode & (int)FKIOCTL))) {
		return (EFAULT);
	} else {
		*rvalp = count;
		return (0);
	}
}

/*
 * Process STREAM ioctls.
 *
 * Returns:
 *   < 0  - ioctl was not handled
 *   >= 0 - ioctl was handled; if > 0, then it is an errno
 */
int
socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
    struct cred *cr, int32_t *rvalp)
{
	int retval;

	/* Only STREAM ioctls are handled here */
	if ((cmd & 0xffffff00U) != STR)
		return (-1);

	switch (cmd) {
	case I_CANPUT:
		/*
		 * We return an error for I_CANPUT so that isastream(3C) will
		 * not report the socket as being a STREAM.
		 */
		return (EOPNOTSUPP);
	case I_NREAD:
		/* Avoid doing a fallback for I_NREAD. */
		return (so_strioc_nread(so, arg, mode, rvalp));
	case I_LOOK:
		/* Avoid doing a fallback for I_LOOK. */
		if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
		    (mode & (int)FKIOCTL))) {
			return (EFAULT);
		}
		return (0);
	default:
		break;
	}

	/*
	 * Try to fall back to TPI, and if successful, reissue the ioctl.
	 */
	if ((retval = so_tpi_fallback(so, cr)) == 0) {
		/* Reissue the ioctl */
		ASSERT(so->so_rcv_q_head == NULL);
		return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
	} else {
		return (retval);
	}
}

int
socket_getopt_common(struct sonode *so, int level, int option_name,
    void *optval, socklen_t *optlenp, int flags)
{
	if (level != SOL_SOCKET)
		return (-1);

	switch (option_name) {
	case SO_ERROR:
	case SO_DOMAIN:
	case SO_TYPE:
	case SO_ACCEPTCONN: {
		int32_t value;
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t)) {
			return (EINVAL);
		}

		switch (option_name) {
		case SO_ERROR:
			mutex_enter(&so->so_lock);
			value = sogeterr(so, B_TRUE);
			mutex_exit(&so->so_lock);
			break;
		case SO_DOMAIN:
			value = so->so_family;
			break;
		case SO_TYPE:
			value = so->so_type;
			break;
		case SO_ACCEPTCONN:
			if (so->so_state & SS_ACCEPTCONN)
				value = SO_ACCEPTCONN;
			else
				value = 0;
			break;
		}

		bcopy(&value, optval, sizeof (value));
		*optlenp = sizeof (value);

		return (0);
	}
	case SO_SNDTIMEO:
	case SO_RCVTIMEO: {
		clock_t value;
		socklen_t optlen = *optlenp;

		if (get_udatamodel() == DATAMODEL_NONE ||
		    get_udatamodel() == DATAMODEL_NATIVE) {
			if (optlen < sizeof (struct timeval))
				return (EINVAL);
		} else {
			if (optlen < sizeof (struct timeval32))
				return (EINVAL);
		}
		if (option_name == SO_RCVTIMEO)
			value = drv_hztousec(so->so_rcvtimeo);
		else
			value = drv_hztousec(so->so_sndtimeo);

		if (get_udatamodel() == DATAMODEL_NONE ||
		    get_udatamodel() == DATAMODEL_NATIVE) {
			((struct timeval *)(optval))->tv_sec =
			    value / (1000 * 1000);
			((struct timeval *)(optval))->tv_usec =
			    value % (1000 * 1000);
			*optlenp = sizeof (struct timeval);
		} else {
			((struct timeval32 *)(optval))->tv_sec =
			    value / (1000 * 1000);
			((struct timeval32 *)(optval))->tv_usec =
			    value % (1000 * 1000);
			*optlenp = sizeof (struct timeval32);
		}
		return (0);
	}
	case SO_DEBUG:
	case SO_REUSEADDR:
	case SO_KEEPALIVE:
	case SO_DONTROUTE:
	case SO_BROADCAST:
	case SO_USELOOPBACK:
	case SO_OOBINLINE:
	case SO_SNDBUF:
#ifdef notyet
	case SO_SNDLOWAT:
	case SO_RCVLOWAT:
#endif /* notyet */
	case SO_DGRAM_ERRIND: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t))
			return (EINVAL);
		break;
	}
	case SO_RCVBUF: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (int32_t))
			return (EINVAL);

		if ((flags & _SOGETSOCKOPT_XPG4_2) && so->so_xpg_rcvbuf != 0) {
			/*
			 * XXX If SO_RCVBUF has been set and this is an
			 * XPG 4.2 application then do not ask the transport
			 * since the transport might adjust the value and not
			 * return exactly what was set by the application.
			 * For non-XPG 4.2 application we return the value
			 * that the transport is actually using.
			 */
			*(int32_t *)optval = so->so_xpg_rcvbuf;
			*optlenp = sizeof (so->so_xpg_rcvbuf);
			return (0);
		}
		/*
		 * If the option has not been set then get a default
		 * value from the transport.
		 */
		break;
	}
	case SO_LINGER: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (struct linger))
			return (EINVAL);
		break;
	}
	case SO_SND_BUFINFO: {
		socklen_t optlen = *optlenp;

		if (optlen < (t_uscalar_t)sizeof (struct so_snd_bufinfo))
			return (EINVAL);
		((struct so_snd_bufinfo *)(optval))->sbi_wroff =
		    (so->so_proto_props).sopp_wroff;
		((struct so_snd_bufinfo *)(optval))->sbi_maxblk =
		    (so->so_proto_props).sopp_maxblk;
		((struct so_snd_bufinfo *)(optval))->sbi_maxpsz =
		    (so->so_proto_props).sopp_maxpsz;
		((struct so_snd_bufinfo *)(optval))->sbi_tail =
		    (so->so_proto_props).sopp_tail;
		*optlenp = sizeof (struct so_snd_bufinfo);
		return (0);
	}
	default:
		break;
	}

	/* Unknown Option */
	return (-1);
}

void
socket_sonode_destroy(struct sonode *so)
{
	sonode_fini(so);
	kmem_cache_free(socket_cache, so);
}

int
so_zcopy_wait(struct sonode *so)
{
	int error = 0;

	mutex_enter(&so->so_lock);
	while (!(so->so_copyflag & STZCNOTIFY)) {
		if (so->so_state & SS_CLOSING) {
			mutex_exit(&so->so_lock);
			return (EINTR);
		}
		if (cv_wait_sig(&so->so_copy_cv, &so->so_lock) == 0) {
			error = EINTR;
			break;
		}
	}
	so->so_copyflag &= ~STZCNOTIFY;
	mutex_exit(&so->so_lock);
	return (error);
}

void
so_timer_callback(void *arg)
{
	struct sonode *so = (struct sonode *)arg;

	mutex_enter(&so->so_lock);

	so->so_rcv_timer_tid = 0;
	if (so->so_rcv_queued > 0) {
		/* so_notify_data() drops so_lock */
		so_notify_data(so, so->so_rcv_queued);
	} else {
		mutex_exit(&so->so_lock);
	}
}

#ifdef DEBUG
/*
 * Verify that the length stored in so_rcv_queued and the length of data blocks
 * queued are the same.
 */
static boolean_t
so_check_length(sonode_t *so)
{
	mblk_t *mp = so->so_rcv_q_head;
	int len = 0;

	ASSERT(MUTEX_HELD(&so->so_lock));

	if (mp != NULL) {
		len = msgdsize(mp);
		while ((mp = mp->b_next) != NULL)
			len += msgdsize(mp);
	}
	mp = so->so_rcv_head;
	if (mp != NULL) {
		len += msgdsize(mp);
		while ((mp = mp->b_next) != NULL)
			len += msgdsize(mp);
	}
	return ((len == so->so_rcv_queued) ? B_TRUE : B_FALSE);
}
#endif

int
so_get_mod_version(struct sockparams *sp)
{
	ASSERT(sp != NULL && sp->sp_smod_info != NULL);
	return (sp->sp_smod_info->smod_version);
}

/*
 * so_start_fallback()
 *
 * Block new socket operations from coming in, and wait for active operations
 * to complete. Threads that are sleeping will be woken up so they can get
 * out of the way.
 *
 * The caller must be a reader on so_fallback_rwlock.
 */
static boolean_t
so_start_fallback(struct sonode *so)
{
	ASSERT(RW_READ_HELD(&so->so_fallback_rwlock));

	mutex_enter(&so->so_lock);
	if (so->so_state & SS_FALLBACK_PENDING) {
		mutex_exit(&so->so_lock);
		return (B_FALSE);
	}
	so->so_state |= SS_FALLBACK_PENDING;
	/*
	 * Poke all threads that might be sleeping. Any operation that comes
	 * in after the cv_broadcast will observe the fallback pending flag
	 * which causes the call to return where it would normally sleep.
	 */
	cv_broadcast(&so->so_state_cv);		/* threads in connect() */
	cv_broadcast(&so->so_rcv_cv);		/* threads in recvmsg() */
	cv_broadcast(&so->so_snd_cv);		/* threads in sendmsg() */
	mutex_enter(&so->so_acceptq_lock);
	cv_broadcast(&so->so_acceptq_cv);	/* threads in accept() */
	mutex_exit(&so->so_acceptq_lock);
	mutex_exit(&so->so_lock);

	/*
	 * The main reason for the rw_tryupgrade call is to provide
	 * observability during the fallback process. We want to
	 * be able to see if there are pending operations.
	 */
	if (rw_tryupgrade(&so->so_fallback_rwlock) == 0) {
		/*
		 * It is safe to drop and reacquire the fallback lock, because
		 * we are guaranteed that another fallback cannot take place.
		 */
		rw_exit(&so->so_fallback_rwlock);
		DTRACE_PROBE1(pending__ops__wait, (struct sonode *), so);
		rw_enter(&so->so_fallback_rwlock, RW_WRITER);
		DTRACE_PROBE1(pending__ops__complete, (struct sonode *), so);
	}

	return (B_TRUE);
}
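
/*
 * Illustrative sketch (assumed caller shape; see so_tpi_fallback() for
 * the real sequence): the fallback bracket is used like
 *
 *	rw_enter(&so->so_fallback_rwlock, RW_READER);
 *	if (!so_start_fallback(so))
 *		... fallback already in progress; bail out ...
 *	... perform the fallback; the rwlock is now held as writer ...
 *	so_end_fallback(so);
 *
 * so_start_fallback() upgrades the lock to writer, and
 * so_end_fallback() below downgrades it back to reader.
 */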

/*
 * so_end_fallback()
 *
 * Allow socket operations back in.
 *
 * The caller must be a writer on so_fallback_rwlock.
 */
static void
so_end_fallback(struct sonode *so)
{
	ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));

	mutex_enter(&so->so_lock);
	so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN);
	mutex_exit(&so->so_lock);

	rw_downgrade(&so->so_fallback_rwlock);
}
1946
1947/*
1948 * so_quiesced_cb()
1949 *
1950 * Callback passed to the protocol during fallback. It is called once
1951 * the endpoint is quiescent.
1952 *
1953 * No requests from the user, no notifications from the protocol, so it
1954 * is safe to synchronize the state. Data can also be moved without
1955 * risk for reordering.
1956 *
Yu Xiangning0f1702c2008-12-11 20:04:13 -08001957 * We do not need to hold so_lock, since there can be only one thread
1958 * operating on the sonode.
1959 */
1960static void
1961so_quiesced_cb(sock_upper_handle_t sock_handle, queue_t *q,
1962 struct T_capability_ack *tcap, struct sockaddr *laddr, socklen_t laddrlen,
1963 struct sockaddr *faddr, socklen_t faddrlen, short opts)
1964{
1965 struct sonode *so = (struct sonode *)sock_handle;
Anders Persson41174432009-02-12 17:35:05 -08001966 boolean_t atmark;
Yu Xiangning0f1702c2008-12-11 20:04:13 -08001967
1968 sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);
1969
Anders Persson41174432009-02-12 17:35:05 -08001970 /*
1971 * Some protocols do not quiece the data path during fallback. Once
1972 * we set the SS_FALLBACK_DRAIN flag any attempt to queue data will
1973 * fail and the protocol is responsible for saving the data for later
1974 * delivery (i.e., once the fallback has completed).
1975 */
Yu Xiangning0f1702c2008-12-11 20:04:13 -08001976 mutex_enter(&so->so_lock);
Anders Persson41174432009-02-12 17:35:05 -08001977 so->so_state |= SS_FALLBACK_DRAIN;
Yu Xiangning0f1702c2008-12-11 20:04:13 -08001978 SOCKET_TIMER_CANCEL(so);
1979 mutex_exit(&so->so_lock);
Anders Persson41174432009-02-12 17:35:05 -08001980
Yu Xiangning0f1702c2008-12-11 20:04:13 -08001981 if (so->so_rcv_head != NULL) {
1982 if (so->so_rcv_q_last_head == NULL)
1983 so->so_rcv_q_head = so->so_rcv_head;
1984 else
1985 so->so_rcv_q_last_head->b_next = so->so_rcv_head;
1986 so->so_rcv_q_last_head = so->so_rcv_last_head;
1987 }
1988
Anders Persson41174432009-02-12 17:35:05 -08001989 atmark = (so->so_state & SS_RCVATMARK) != 0;
1990 /*
1991 * Clear any OOB state having to do with pending data. The TPI
1992 * code path will set the appropriate oob state when we move the
1993 * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
1994 * data has already been consumed.
1995 */
1996 so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA);
1997
1998 ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued);
1999
2000 /*
2001 * Move data to the STREAM head.
2002 */
	while (so->so_rcv_q_head != NULL) {
		mblk_t *mp = so->so_rcv_q_head;
		size_t mlen = msgdsize(mp);

		so->so_rcv_q_head = mp->b_next;
		mp->b_next = NULL;
		mp->b_prev = NULL;

		/*
		 * Send T_EXDATA_IND if we are at the oob mark.
		 */
		if (atmark) {
			struct T_exdata_ind *tei;
			mblk_t *mp1 = SOTOTPI(so)->sti_exdata_mp;

			SOTOTPI(so)->sti_exdata_mp = NULL;
			ASSERT(mp1 != NULL);
			mp1->b_datap->db_type = M_PROTO;
			tei = (struct T_exdata_ind *)mp1->b_rptr;
			tei->PRIM_type = T_EXDATA_IND;
			tei->MORE_flag = 0;
			mp1->b_wptr = (uchar_t *)&tei[1];

			if (IS_SO_OOB_INLINE(so)) {
				mp1->b_cont = mp;
			} else {
				ASSERT(so->so_oobmsg != NULL);
				mp1->b_cont = so->so_oobmsg;
				so->so_oobmsg = NULL;

				/* process current mp next time around */
				mp->b_next = so->so_rcv_q_head;
				so->so_rcv_q_head = mp;
				mlen = 0;
			}
			mp = mp1;

			/* we have consumed the oob mark */
			atmark = B_FALSE;
		} else if (so->so_oobmark > 0) {
			/*
			 * Check if the OOB mark is within the current
			 * mblk chain. In that case we have to split it up.
			 */
			if (so->so_oobmark < mlen) {
				mblk_t *urg_mp = mp;

				atmark = B_TRUE;
				mp = NULL;
				mlen = so->so_oobmark;

				/*
				 * It is assumed that the OOB mark does
				 * not land within a mblk.
				 */
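				/*
				 * Worked example (illustrative): with
				 * so_oobmark == 100 and a chain of mblks
				 * of 60 and 40 bytes, the loop exits after
				 * the second mblk with so_oobmark == 0;
				 * the chain is cut after mp (the 40-byte
				 * mblk) and the remainder, if any, goes
				 * back onto so_rcv_q_head.
				 */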
				do {
					so->so_oobmark -= MBLKL(urg_mp);
					mp = urg_mp;
					urg_mp = urg_mp->b_cont;
				} while (so->so_oobmark > 0);
				mp->b_cont = NULL;
				if (urg_mp != NULL) {
					urg_mp->b_next = so->so_rcv_q_head;
					so->so_rcv_q_head = urg_mp;
				}
			} else {
				so->so_oobmark -= mlen;
				if (so->so_oobmark == 0)
					atmark = B_TRUE;
			}
		}

		/*
		 * Queue data on the STREAM head.
		 */
		so->so_rcv_queued -= mlen;
		putnext(q, mp);
	}
	so->so_rcv_head = NULL;
	so->so_rcv_last_head = NULL;
	so->so_rcv_q_head = NULL;
	so->so_rcv_q_last_head = NULL;

	/*
	 * Check if the oob byte is at the end of the data stream, or if the
	 * oob byte has not yet arrived. In the latter case we have to send a
	 * SIGURG and a mark indicator to the STREAM head. The mark indicator
	 * is needed to guarantee correct behavior for SIOCATMARK. See block
	 * comment in socktpi.h for more details.
	 */
	if (atmark || so->so_oobmark > 0) {
		mblk_t *mp;

		if (atmark && so->so_oobmsg != NULL) {
			struct T_exdata_ind *tei;

			mp = SOTOTPI(so)->sti_exdata_mp;
			SOTOTPI(so)->sti_exdata_mp = NULL;
			ASSERT(mp != NULL);
			mp->b_datap->db_type = M_PROTO;
			tei = (struct T_exdata_ind *)mp->b_rptr;
			tei->PRIM_type = T_EXDATA_IND;
			tei->MORE_flag = 0;
			mp->b_wptr = (uchar_t *)&tei[1];

			mp->b_cont = so->so_oobmsg;
			so->so_oobmsg = NULL;

			putnext(q, mp);
		} else {
			/* Send up the signal */
			mp = SOTOTPI(so)->sti_exdata_mp;
			SOTOTPI(so)->sti_exdata_mp = NULL;
			ASSERT(mp != NULL);
			DB_TYPE(mp) = M_PCSIG;
			*mp->b_wptr++ = (uchar_t)SIGURG;
			putnext(q, mp);

			/* Send up the mark indicator */
			mp = SOTOTPI(so)->sti_urgmark_mp;
			SOTOTPI(so)->sti_urgmark_mp = NULL;
			mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
			putnext(q, mp);

			so->so_oobmark = 0;
		}
	}

	if (SOTOTPI(so)->sti_exdata_mp != NULL) {
		freeb(SOTOTPI(so)->sti_exdata_mp);
		SOTOTPI(so)->sti_exdata_mp = NULL;
	}

	if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
		freeb(SOTOTPI(so)->sti_urgmark_mp);
		SOTOTPI(so)->sti_urgmark_mp = NULL;
	}

	ASSERT(so->so_oobmark == 0);
	ASSERT(so->so_rcv_queued == 0);
}
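
/*
 * Illustrative sketch of the protocol side of the handshake above
 * (hypothetical guard, function, and upper-handle lookup; the callback
 * type name is assumed to match so_quiesced_cb()'s signature): the
 * fallback downcall receives so_quiesced_cb and must invoke it exactly
 * once, after it has quiesced and stopped sending notifications
 * upstream. Sockets on the accept queue are passed a NULL callback and
 * a NULL queue (see so_tpi_fallback() below).
 */
#ifdef SOCK_FALLBACK_EXAMPLE
static int
proto_fallback_example(sock_lower_handle_t handle, queue_t *q,
    boolean_t direct, so_proto_quiesced_cb_t quiesced_cb)
{
	/* Hypothetical: recover the upper handle stored at activation */
	sock_upper_handle_t upper = example_upper_handle(handle);

	/* ... stop the data path and set up the STREAMS plumbing ... */

	if (quiesced_cb != NULL) {
		/*
		 * Placeholder arguments; a real protocol reports its
		 * capabilities, addresses, and options here.
		 */
		(*quiesced_cb)(upper, q, NULL, NULL, 0, NULL, 0, 0);
	}
	return (0);
}
#endif /* SOCK_FALLBACK_EXAMPLE */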

#ifdef DEBUG
/*
 * Do an integrity check of the sonode. This should be done if a
 * fallback fails after the sonode has initially been converted to use
 * TPI and subsequently has to be reverted.
 *
 * Failure to pass the integrity check will panic the system.
 */
void
so_integrity_check(struct sonode *cur, struct sonode *orig)
{
	VERIFY(cur->so_vnode == orig->so_vnode);
	VERIFY(cur->so_ops == orig->so_ops);
	/*
	 * For so_state we can only VERIFY the state flags in CHECK_STATE.
	 * The other state flags might be affected by a notification from the
	 * protocol.
	 */
#define	CHECK_STATE	(SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \
	SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \
	SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG)
	VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) ==
	    (orig->so_state & CHECK_STATE));
	VERIFY(cur->so_mode == orig->so_mode);
	VERIFY(cur->so_flag == orig->so_flag);
	VERIFY(cur->so_count == orig->so_count);
	/* Cannot VERIFY so_proto_connid; proto can update it */
	VERIFY(cur->so_sockparams == orig->so_sockparams);
	/* An error might have been recorded, but it cannot be lost */
	VERIFY(cur->so_error != 0 || orig->so_error == 0);
	VERIFY(cur->so_family == orig->so_family);
	VERIFY(cur->so_type == orig->so_type);
	VERIFY(cur->so_protocol == orig->so_protocol);
	VERIFY(cur->so_version == orig->so_version);
	/* New conns might have arrived, but none should have been lost */
	VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
	VERIFY(cur->so_acceptq_head == orig->so_acceptq_head);
	VERIFY(cur->so_backlog == orig->so_backlog);
	/* New OOB might have arrived, but the mark should not have been lost */
	VERIFY(cur->so_oobmark >= orig->so_oobmark);
	/* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
	VERIFY(cur->so_pgrp == orig->so_pgrp);
	VERIFY(cur->so_peercred == orig->so_peercred);
	VERIFY(cur->so_cpid == orig->so_cpid);
	VERIFY(cur->so_zoneid == orig->so_zoneid);
	/* New data might have arrived, but none should have been lost */
	VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued);
	VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head);
	VERIFY(cur->so_rcv_head == orig->so_rcv_head);
	VERIFY(cur->so_proto_handle == orig->so_proto_handle);
	VERIFY(cur->so_downcalls == orig->so_downcalls);
	/* Cannot VERIFY so_proto_props; they can be updated by proto */
}
#endif

/*
 * so_tpi_fallback()
 *
 * This is the fallback initiation routine; things start here.
 *
 * Basic strategy:
 *   o Block new socket operations from coming in
 *   o Allocate/initiate info needed by TPI
 *   o Quiesce the connection, at which point we sync
 *     state and move data
 *   o Change operations (sonodeops) associated with the socket
 *   o Unblock threads waiting for the fallback to finish
 */
int
so_tpi_fallback(struct sonode *so, struct cred *cr)
{
	int error;
	queue_t *q;
	struct sockparams *sp;
	struct sockparams *newsp = NULL;
	so_proto_fallback_func_t fbfunc;
	boolean_t direct;
	struct sonode *nso;
#ifdef DEBUG
	struct sonode origso;
#endif
	error = 0;
	sp = so->so_sockparams;
	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;

	/*
	 * Fallback can only happen if there is a device associated
	 * with the sonode, and the socket module has a fallback function.
	 */
	if (!SOCKPARAMS_HAS_DEVICE(sp) || fbfunc == NULL)
		return (EINVAL);

	/*
	 * Initiate fallback; upon success we know that no new requests
	 * will come in from the user.
	 */
	if (!so_start_fallback(so))
		return (EAGAIN);
#ifdef DEBUG
	/*
	 * Make a copy of the sonode in case we need to make an integrity
	 * check later on.
	 */
	bcopy(so, &origso, sizeof (*so));
#endif

	sp->sp_stats.sps_nfallback.value.ui64++;

	newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
	    so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
	    KM_SLEEP, &error);
	if (error != 0)
		goto out;

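	/*
	 * Direct receive (sodirect) bypasses the normal receive path; it
	 * is disabled here so that, once the switch to TPI is made, all
	 * data flows through the STREAM head.
	 */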
	if (so->so_direct != NULL) {
		sodirect_t *sodp = so->so_direct;
		mutex_enter(&so->so_lock);

		so->so_direct->sod_enabled = B_FALSE;
		so->so_state &= ~SS_SODIRECT;
		ASSERT(sodp->sod_uioafh == NULL);
		mutex_exit(&so->so_lock);
	}

	/* Turn sonode into a TPI socket */
	error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
	if (error != 0)
		goto out;

	/*
	 * Now tell the protocol to start using TPI. so_quiesced_cb will
	 * be called once it is safe to synchronize state.
	 */
	DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
	error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
	DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);

	if (error != 0) {
		/* protocol was unable to do a fallback, revert the sonode */
		sotpi_revert_sonode(so, cr);
		goto out;
	}

	/*
	 * Walk the accept queue and notify the protocol that each pending
	 * connection should fall back to TPI. The protocol will send up
	 * the T_CONN_IND.
	 */
	nso = so->so_acceptq_head;
	while (nso != NULL) {
		int rval;

		DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
		rval = (*fbfunc)(nso->so_proto_handle, NULL, direct, NULL);
		DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
		if (rval != 0) {
			zcmn_err(getzoneid(), CE_WARN,
			    "Failed to convert socket in accept queue to TPI. "
			    "Pid = %d\n", curproc->p_pid);
		}
		nso = nso->so_acceptq_next;
	}

	/*
	 * Now flush the acceptq; this will destroy all sockets. They will
	 * be recreated in sotpi_accept().
	 */
	so_acceptq_flush(so);

	mutex_enter(&so->so_lock);
	so->so_state |= SS_FALLBACK_COMP;
	mutex_exit(&so->so_lock);

	/*
	 * Swap the sonode ops. Socket operations that come in once this
	 * is done will proceed without blocking.
	 */
	so->so_ops = &sotpi_sonodeops;

	/*
	 * Wake up any threads stuck in poll. This is needed since the poll
	 * head changes when the fallback happens (moves from the sonode to
	 * the STREAMS head).
	 */
	pollwakeup(&so->so_poll_list, POLLERR);
out:
	so_end_fallback(so);

	if (error != 0) {
#ifdef DEBUG
		so_integrity_check(so, &origso);
#endif
		zcmn_err(getzoneid(), CE_WARN,
		    "Failed to convert socket to TPI (err=%d). Pid = %d\n",
		    error, curproc->p_pid);
		if (newsp != NULL)
			SOCKPARAMS_DEC_REF(newsp);
	}

	return (error);
}
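
/*
 * Illustrative sketch (hypothetical caller and guard, not part of the
 * build): a path that requires STREAMS/TPI semantics can key off
 * SS_FALLBACK_COMP, which so_tpi_fallback() sets once the conversion is
 * complete.
 */
#ifdef SOCK_FALLBACK_EXAMPLE
static int
so_require_tpi_example(struct sonode *so, struct cred *cr)
{
	mutex_enter(&so->so_lock);
	if (so->so_state & SS_FALLBACK_COMP) {
		/* Already converted; sotpi_sonodeops are installed */
		mutex_exit(&so->so_lock);
		return (0);
	}
	mutex_exit(&so->so_lock);

	/* EAGAIN indicates another thread is mid-fallback */
	return (so_tpi_fallback(so, cr));
}
#endif /* SOCK_FALLBACK_EXAMPLE */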