6799655 sockets need better handling of STREAMS ioctls
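With this change sockfs answers I_NREAD and I_LOOK itself, rejects I_CANPUT so that isastream(3C) keeps reporting the socket as a non-STREAMS file, and falls back to TPI (now with proper error handling) for the remaining STREAMS ioctls. A minimal user-level sketch of the intended behavior; the test program below is illustrative only and is not part of the patch:

	#include <sys/socket.h>
	#include <stropts.h>
	#include <stdio.h>
	#include <unistd.h>

	int
	main(void)
	{
		int fd = socket(AF_INET, SOCK_STREAM, 0);
		int navail = 0;
		int msgs;

		if (fd == -1) {
			perror("socket");
			return (1);
		}

		/* Served by sockfs itself; no TPI fallback is triggered. */
		msgs = ioctl(fd, I_NREAD, &navail);
		if (msgs == -1)
			perror("I_NREAD");
		else
			(void) printf("%d message(s), %d byte(s) queued\n",
			    msgs, navail);

		/* isastream(3C) uses I_CANPUT, which sockfs now rejects. */
		(void) printf("isastream: %d\n", isastream(fd));

		(void) close(fd);
		return (0);
	}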
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
index 4683188..7b21fac 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
@@ -1203,7 +1203,7 @@
}
mutex_enter(&so->so_lock);
- if (so->so_state & (SS_FALLBACK_PENDING | SS_FALLBACK_COMP)) {
+ if (so->so_state & (SS_FALLBACK_DRAIN | SS_FALLBACK_COMP)) {
SOD_DISABLE(sodp);
mutex_exit(&so->so_lock);
*errorp = EOPNOTSUPP;
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
index dd114db..d01447c 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
@@ -37,6 +37,7 @@
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/atomic.h>
+#include <sys/tihdr.h>
#include <fs/sockfs/sockcommon.h>
#include <fs/sockfs/socktpi.h>
@@ -1515,8 +1516,75 @@
}
/*
- * Process STREAMS related ioctls. If a I_PUSH/POP operation is specified
- * then the socket will fall back to TPI.
+ * Handle the I_NREAD STREAMS ioctl.
+ */
+static int
+so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp)
+{
+ size_t size = 0;
+ int retval;
+ int count = 0;
+ mblk_t *mp;
+
+ if (so->so_downcalls == NULL ||
+ so->so_downcalls->sd_recv_uio != NULL)
+ return (EINVAL);
+
+ mutex_enter(&so->so_lock);
+ /* Wait for reader to get out of the way. */
+ while (so->so_flag & SOREADLOCKED) {
+ /*
+ * If the reader is waiting for data, then there should be nothing
+ * on the rcv queue.
+ */
+ if (so->so_rcv_wakeup)
+ goto out;
+
+ so->so_flag |= SOWANT;
+ /* Do a timed sleep, in case the reader goes to sleep. */
+ (void) cv_timedwait(&so->so_state_cv, &so->so_lock,
+ lbolt + drv_usectohz(10));
+ }
+
+ /*
+ * Since we are holding so_lock no new reader will come in, and the
+ * protocol will not be able to enqueue data. So it's safe to walk
+ * both rcv queues.
+ */
+ mp = so->so_rcv_q_head;
+ if (mp != NULL) {
+ size = msgdsize(so->so_rcv_q_head);
+ for (; mp != NULL; mp = mp->b_next)
+ count++;
+ } else {
+ /*
+ * In case the processing list was empty, get the size of the
+ * next msg in line.
+ */
+ size = msgdsize(so->so_rcv_head);
+ }
+
+ for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
+ count++;
+out:
+ mutex_exit(&so->so_lock);
+
+ /*
+ * Drop down from size_t to the "int" required by the
+ * interface. Cap at INT_MAX.
+ */
+ retval = MIN(size, INT_MAX);
+ if (so_copyout(&retval, (void *)arg, sizeof (retval),
+ (mode & (int)FKIOCTL))) {
+ return (EFAULT);
+ } else {
+ *rvalp = count;
+ return (0);
+ }
+}
+
+/*
+ * Process STREAMS ioctls.
*
* Returns:
* < 0 - ioctl was not handled
@@ -1526,32 +1594,42 @@
socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
struct cred *cr, int32_t *rvalp)
{
+ int retval;
+
+ /* Only STREAMS ioctls are handled here */
+ if ((cmd & 0xffffff00U) != STR)
+ return (-1);
+
switch (cmd) {
- case _I_INSERT:
- case _I_REMOVE:
- case I_FIND:
- case I_LIST:
+ case I_CANPUT:
+ /*
+ * We return an error for I_CANPUT so that isastream(3C) will
+ * not report the socket as being a STREAM.
+ */
return (EOPNOTSUPP);
-
- case I_PUSH:
- case I_POP: {
- int retval;
-
- if ((retval = so_tpi_fallback(so, cr)) == 0) {
- /* Reissue the ioctl */
- ASSERT(so->so_rcv_q_head == NULL);
- return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
- }
- return (retval);
- }
+ case I_NREAD:
+ /* Avoid doing a fallback for I_NREAD. */
+ return (so_strioc_nread(so, arg, mode, rvalp));
case I_LOOK:
+ /* Avoid doing a fallback for I_LOOK. */
if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
(mode & (int)FKIOCTL))) {
return (EFAULT);
}
return (0);
default:
- return (-1);
+ break;
+ }
+
+ /*
+ * Try to fall back to TPI, and if successful, reissue the ioctl.
+ */
+ if ((retval = so_tpi_fallback(so, cr)) == 0) {
+ /* Reissue the ioctl */
+ ASSERT(so->so_rcv_q_head == NULL);
+ return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
+ } else {
+ return (retval);
}
}
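For context, the (cmd & 0xffffff00U) != STR guard above works because STREAMS ioctl commands are built by OR-ing a small ordinal into the STR base value defined in <sys/stropts.h> (the character 'S' in the high byte). A rough sketch of the same test, assuming the stock definitions:

	#include <sys/stropts.h>

	/*
	 * STREAMS ioctls are encoded as STR | n (e.g. I_NREAD is STR|01),
	 * so masking off the low byte identifies the whole command family.
	 */
	static int
	is_streams_ioctl(int cmd)
	{
		return ((cmd & 0xffffff00U) == STR);	/* STR == ('S' << 8) */
	}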
@@ -1851,7 +1929,7 @@
ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));
mutex_enter(&so->so_lock);
- so->so_state &= ~SS_FALLBACK_PENDING;
+ so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN);
mutex_exit(&so->so_lock);
rw_downgrade(&so->so_fallback_rwlock);
@@ -1867,8 +1945,6 @@
* is safe to synchronize the state. Data can also be moved without
* risk for reordering.
*
- * NOTE: urgent data is dropped on the floor.
- *
* We do not need to hold so_lock, since there can be only one thread
* operating on the sonode.
*/
@@ -1878,15 +1954,21 @@
struct sockaddr *faddr, socklen_t faddrlen, short opts)
{
struct sonode *so = (struct sonode *)sock_handle;
+ boolean_t atmark;
sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);
+ /*
+ * Some protocols do not quiesce the data path during fallback. Once
+ * we set the SS_FALLBACK_DRAIN flag, any attempt to queue data will
+ * fail and the protocol is responsible for saving the data for later
+ * delivery (i.e., once the fallback has completed).
+ */
mutex_enter(&so->so_lock);
+ so->so_state |= SS_FALLBACK_DRAIN;
SOCKET_TIMER_CANCEL(so);
mutex_exit(&so->so_lock);
- /*
- * Move data to the STREAM head.
- */
+
if (so->so_rcv_head != NULL) {
if (so->so_rcv_q_last_head == NULL)
so->so_rcv_q_head = so->so_rcv_head;
@@ -1895,6 +1977,20 @@
so->so_rcv_q_last_head = so->so_rcv_last_head;
}
+ atmark = (so->so_state & SS_RCVATMARK) != 0;
+ /*
+ * Clear any OOB state having to do with pending data. The TPI
+ * code path will set the appropriate oob state when we move the
+ * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
+ * data has already been consumed.
+ */
+ so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA);
+
+ ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued);
+
+ /*
+ * Move data to the STREAM head.
+ */
while (so->so_rcv_q_head != NULL) {
mblk_t *mp = so->so_rcv_q_head;
size_t mlen = msgdsize(mp);
@@ -1902,33 +1998,200 @@
so->so_rcv_q_head = mp->b_next;
mp->b_next = NULL;
mp->b_prev = NULL;
+
+ /*
+ * Send T_EXDATA_IND if we are at the oob mark.
+ */
+ if (atmark) {
+ struct T_exdata_ind *tei;
+ mblk_t *mp1 = SOTOTPI(so)->sti_exdata_mp;
+
+ SOTOTPI(so)->sti_exdata_mp = NULL;
+ ASSERT(mp1 != NULL);
+ mp1->b_datap->db_type = M_PROTO;
+ tei = (struct T_exdata_ind *)mp1->b_rptr;
+ tei->PRIM_type = T_EXDATA_IND;
+ tei->MORE_flag = 0;
+ mp1->b_wptr = (uchar_t *)&tei[1];
+
+ if (IS_SO_OOB_INLINE(so)) {
+ mp1->b_cont = mp;
+ } else {
+ ASSERT(so->so_oobmsg != NULL);
+ mp1->b_cont = so->so_oobmsg;
+ so->so_oobmsg = NULL;
+
+ /* process current mp next time around */
+ mp->b_next = so->so_rcv_q_head;
+ so->so_rcv_q_head = mp;
+ mlen = 0;
+ }
+ mp = mp1;
+
+ /* we have consumed the oob mark */
+ atmark = B_FALSE;
+ } else if (so->so_oobmark > 0) {
+ /*
+ * Check if the OOB mark is within the current
+ * mblk chain. In that case we have to split it up.
+ */
+ if (so->so_oobmark < mlen) {
+ mblk_t *urg_mp = mp;
+
+ atmark = B_TRUE;
+ mp = NULL;
+ mlen = so->so_oobmark;
+
+ /*
+ * It is assumed that the OOB mark does
+ * not land within a mblk.
+ */
+ do {
+ so->so_oobmark -= MBLKL(urg_mp);
+ mp = urg_mp;
+ urg_mp = urg_mp->b_cont;
+ } while (so->so_oobmark > 0);
+ mp->b_cont = NULL;
+ if (urg_mp != NULL) {
+ urg_mp->b_next = so->so_rcv_q_head;
+ so->so_rcv_q_head = urg_mp;
+ }
+ } else {
+ so->so_oobmark -= mlen;
+ if (so->so_oobmark == 0)
+ atmark = B_TRUE;
+ }
+ }
+
+ /*
+ * Queue data on the STREAM head.
+ */
so->so_rcv_queued -= mlen;
putnext(q, mp);
}
- ASSERT(so->so_rcv_queued == 0);
so->so_rcv_head = NULL;
so->so_rcv_last_head = NULL;
so->so_rcv_q_head = NULL;
so->so_rcv_q_last_head = NULL;
-#ifdef DEBUG
- if (so->so_oobmsg != NULL || so->so_oobmark > 0) {
- cmn_err(CE_NOTE, "losing oob data due to tpi fallback\n");
- }
-#endif
- if (so->so_oobmsg != NULL) {
- freemsg(so->so_oobmsg);
- so->so_oobmsg = NULL;
- }
- so->so_oobmark = 0;
+ /*
+ * Check if the oob byte is at the end of the data stream, or if the
+ * oob byte has not yet arrived. In the latter case we have to send a
+ * SIGURG and a mark indicator to the STREAM head. The mark indicator
+ * is needed to guarantee correct behavior for SIOCATMARK. See block
+ * comment in socktpi.h for more details.
+ */
+ if (atmark || so->so_oobmark > 0) {
+ mblk_t *mp;
+ if (atmark && so->so_oobmsg != NULL) {
+ struct T_exdata_ind *tei;
+
+ mp = SOTOTPI(so)->sti_exdata_mp;
+ SOTOTPI(so)->sti_exdata_mp = NULL;
+ ASSERT(mp != NULL);
+ mp->b_datap->db_type = M_PROTO;
+ tei = (struct T_exdata_ind *)mp->b_rptr;
+ tei->PRIM_type = T_EXDATA_IND;
+ tei->MORE_flag = 0;
+ mp->b_wptr = (uchar_t *)&tei[1];
+
+ mp->b_cont = so->so_oobmsg;
+ so->so_oobmsg = NULL;
+
+ putnext(q, mp);
+ } else {
+ /* Send up the signal */
+ mp = SOTOTPI(so)->sti_exdata_mp;
+ SOTOTPI(so)->sti_exdata_mp = NULL;
+ ASSERT(mp != NULL);
+ DB_TYPE(mp) = M_PCSIG;
+ *mp->b_wptr++ = (uchar_t)SIGURG;
+ putnext(q, mp);
+
+ /* Send up the mark indicator */
+ mp = SOTOTPI(so)->sti_urgmark_mp;
+ SOTOTPI(so)->sti_urgmark_mp = NULL;
+ mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
+ putnext(q, mp);
+
+ so->so_oobmark = 0;
+ }
+ }
+
+ if (SOTOTPI(so)->sti_exdata_mp != NULL) {
+ freeb(SOTOTPI(so)->sti_exdata_mp);
+ SOTOTPI(so)->sti_exdata_mp = NULL;
+ }
+
+ if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
+ freeb(SOTOTPI(so)->sti_urgmark_mp);
+ SOTOTPI(so)->sti_urgmark_mp = NULL;
+ }
+
+ ASSERT(so->so_oobmark == 0);
ASSERT(so->so_rcv_queued == 0);
}
+#ifdef DEBUG
+/*
+ * Do an integrity check of the sonode. This should be done if a
+ * fallback fails after the sonode has initially been converted to use
+ * TPI and subsequently has to be reverted.
+ *
+ * Failure to pass the integrity check will panic the system.
+ */
+void
+so_integrity_check(struct sonode *cur, struct sonode *orig)
+{
+ VERIFY(cur->so_vnode == orig->so_vnode);
+ VERIFY(cur->so_ops == orig->so_ops);
+ /*
+ * For so_state we can only VERIFY the state flags in CHECK_STATE.
+ * The other state flags might be affected by a notification from the
+ * protocol.
+ */
+#define CHECK_STATE (SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \
+ SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \
+ SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG)
+ VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) ==
+ (orig->so_state & CHECK_STATE));
+ VERIFY(cur->so_mode == orig->so_mode);
+ VERIFY(cur->so_flag == orig->so_flag);
+ VERIFY(cur->so_count == orig->so_count);
+ /* Cannot VERIFY so_proto_connid; proto can update it */
+ VERIFY(cur->so_sockparams == orig->so_sockparams);
+ /* an error might have been recorded, but it cannot be lost */
+ VERIFY(cur->so_error != 0 || orig->so_error == 0);
+ VERIFY(cur->so_family == orig->so_family);
+ VERIFY(cur->so_type == orig->so_type);
+ VERIFY(cur->so_protocol == orig->so_protocol);
+ VERIFY(cur->so_version == orig->so_version);
+ /* New conns might have arrived, but none should have been lost */
+ VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
+ VERIFY(cur->so_acceptq_head == orig->so_acceptq_head);
+ VERIFY(cur->so_backlog == orig->so_backlog);
+ /* New OOB might have arrived, but the mark should not have been lost */
+ VERIFY(cur->so_oobmark >= orig->so_oobmark);
+ /* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
+ VERIFY(cur->so_pgrp == orig->so_pgrp);
+ VERIFY(cur->so_peercred == orig->so_peercred);
+ VERIFY(cur->so_cpid == orig->so_cpid);
+ VERIFY(cur->so_zoneid == orig->so_zoneid);
+ /* New data might have arrived, but none should have been lost */
+ VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued);
+ VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head);
+ VERIFY(cur->so_rcv_head == orig->so_rcv_head);
+ VERIFY(cur->so_proto_handle == orig->so_proto_handle);
+ VERIFY(cur->so_downcalls == orig->so_downcalls);
+ /* Cannot VERIFY so_proto_props; they can be updated by proto */
+}
+#endif
+
/*
* so_tpi_fallback()
*
- * This is fallback initation routine; things start here.
+ * This is the fallback initiation routine; things start here.
*
* Basic strategy:
* o Block new socket operations from coming in
@@ -1944,10 +2207,13 @@
int error;
queue_t *q;
struct sockparams *sp;
- struct sockparams *newsp;
+ struct sockparams *newsp = NULL;
so_proto_fallback_func_t fbfunc;
boolean_t direct;
-
+ struct sonode *nso;
+#ifdef DEBUG
+ struct sonode origso;
+#endif
error = 0;
sp = so->so_sockparams;
fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
@@ -1965,6 +2231,13 @@
*/
if (!so_start_fallback(so))
return (EAGAIN);
+#ifdef DEBUG
+ /*
+ * Make a copy of the sonode in case we need to make an integrity
+ * check later on.
+ */
+ bcopy(so, &origso, sizeof (*so));
+#endif
newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
@@ -1983,29 +2256,47 @@
}
/* Turn sonode into a TPI socket */
- q = sotpi_convert_sonode(so, newsp, &direct, cr);
- if (q == NULL) {
- zcmn_err(getzoneid(), CE_WARN,
- "Failed to convert socket to TPI. Pid = %d\n",
- curproc->p_pid);
- SOCKPARAMS_DEC_REF(newsp);
- error = EINVAL;
+ error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
+ if (error != 0)
goto out;
- }
+
/*
* Now tell the protocol to start using TPI. so_quiesced_cb will be
* called once it's safe to synchronize state.
*/
DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
- /* FIXME assumes this cannot fail. TCP can fail to enter squeue */
- (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
+ error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
+ if (error != 0) {
+ /* the protocol was unable to fall back; revert the sonode */
+ sotpi_revert_sonode(so, cr);
+ goto out;
+ }
+
/*
- * Free all pending connection indications, i.e., socket_accept() has
- * not yet pulled the connection of the queue. The transport sent
- * a T_CONN_IND message for each pending connection to the STREAM head.
+ * Walk the accept queue and notify the proto that each pending
+ * connection should fall back to TPI. The protocol will send up the
+ * T_CONN_IND.
+ */
+ nso = so->so_acceptq_head;
+ while (nso != NULL) {
+ int rval;
+
+ DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
+ rval = (*fbfunc)(nso->so_proto_handle, NULL, direct, NULL);
+ DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
+ if (rval != 0) {
+ zcmn_err(getzoneid(), CE_WARN,
+ "Failed to convert socket in accept queue to TPI. "
+ "Pid = %d\n", curproc->p_pid);
+ }
+ nso = nso->so_acceptq_next;
+ }
+
+ /*
+ * Now flush the acceptq; this will destroy all sockets. They will
+ * be recreated in sotpi_accept().
*/
so_acceptq_flush(so);
@@ -2020,10 +2311,6 @@
so->so_ops = &sotpi_sonodeops;
/*
- * No longer a non streams socket
- */
- so->so_not_str = B_FALSE;
- /*
* Wake up any threads stuck in poll. This is needed since the poll
* head changes when the fallback happens (moves from the sonode to
* the STREAMS head).
@@ -2032,5 +2319,16 @@
out:
so_end_fallback(so);
+ if (error != 0) {
+#ifdef DEBUG
+ so_integrity_check(so, &origso);
+#endif
+ zcmn_err(getzoneid(), CE_WARN,
+ "Failed to convert socket to TPI (err=%d). Pid = %d\n",
+ error, curproc->p_pid);
+ if (newsp != NULL)
+ SOCKPARAMS_DEC_REF(newsp);
+ }
+
return (error);
}
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c
index d801c1e..80738f5 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi.c
+++ b/usr/src/uts/common/fs/sockfs/socktpi.c
@@ -6674,21 +6674,24 @@
* Given a non-TPI sonode, allocate and prep it to be ready for TPI.
*
* Caller must still update state and mode using sotpi_update_state().
- *
- * Returns the STREAM queue that the protocol should use.
*/
-queue_t *
+int
sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
- boolean_t *direct, struct cred *cr)
+ boolean_t *direct, queue_t **qp, struct cred *cr)
{
sotpi_info_t *sti;
struct sockparams *origsp = so->so_sockparams;
sock_lower_handle_t handle = so->so_proto_handle;
- uint_t old_state = so->so_state;
struct stdata *stp;
struct vnode *vp;
queue_t *q;
+ int error = 0;
+ ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
+ SS_FALLBACK_PENDING);
+ ASSERT(SOCK_IS_NONSTR(so));
+
+ *qp = NULL;
*direct = B_FALSE;
so->so_sockparams = newsp;
/*
@@ -6697,11 +6700,10 @@
(void) sotpi_info_create(so, KM_SLEEP);
sotpi_info_init(so);
- if (sotpi_init(so, NULL, cr, SO_FALLBACK) != 0) {
+ if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
sotpi_info_fini(so);
sotpi_info_destroy(so);
- so->so_state = old_state;
- return (NULL);
+ return (error);
}
ASSERT(handle == so->so_proto_handle);
sti = SOTOTPI(so);
@@ -6709,6 +6711,23 @@
*direct = B_TRUE;
/*
+ * When it comes to urgent data we have two cases to deal with:
+ * (1) The oob byte has already arrived, or (2) the protocol has
+ * notified that oob data is pending, but it has not yet arrived.
+ *
+ * For (1) all we need to do is send a T_EXDATA_IND to indicate where
+ * in the byte stream the oob byte is. For (2) we have to send a
+ * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
+ * the oob byte will be the next byte from the protocol.
+ *
+ * So in the worst case we need two mblks, one for the signal, another
+ * for mark indication. In that case we use the exdata_mp for the sig.
+ */
+ sti->sti_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind), BPRI_MED,
+ STR_NOSIG, NULL);
+ sti->sti_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
+
+ /*
* Keep the original sp around so we can properly dispose of the
* sonode when the socket is being closed.
*/
@@ -6728,10 +6747,8 @@
* connection indications.
*/
if (so->so_pgrp != 0) {
- mutex_enter(&so->so_lock);
if (so_set_events(so, so->so_vnode, cr) != 0)
so->so_pgrp = 0;
- mutex_exit(&so->so_lock);
}
/*
@@ -6748,9 +6765,52 @@
*/
while (q->q_next != NULL)
q = q->q_next;
- q = _RD(q);
+ *qp = _RD(q);
- return (q);
+ /* This is now a STREAMS socket */
+ so->so_not_str = B_FALSE;
+
+ return (error);
+}
+
+/*
+ * Revert a TPI sonode. It is only allowed to revert the sonode during
+ * the fallback process.
+ */
+void
+sotpi_revert_sonode(struct sonode *so, struct cred *cr)
+{
+ vnode_t *vp = SOTOV(so);
+
+ ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
+ SS_FALLBACK_PENDING);
+ ASSERT(!SOCK_IS_NONSTR(so));
+ ASSERT(vp->v_stream != NULL);
+
+ if (SOTOTPI(so)->sti_exdata_mp != NULL) {
+ freeb(SOTOTPI(so)->sti_exdata_mp);
+ SOTOTPI(so)->sti_exdata_mp = NULL;
+ }
+
+ if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
+ freeb(SOTOTPI(so)->sti_urgmark_mp);
+ SOTOTPI(so)->sti_urgmark_mp = NULL;
+ }
+
+ strclean(vp);
+ (void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
+
+ /*
+ * Restore the original sockparams. The caller is responsible for
+ * dropping the ref to the new sp.
+ */
+ so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
+
+ sotpi_info_fini(so);
+ sotpi_info_destroy(so);
+
+ /* This is no longer a STREAMS socket */
+ so->so_not_str = B_TRUE;
}
void
@@ -6815,8 +6875,7 @@
{
sotpi_info_t *sti;
- if (so == NULL)
- return (NULL);
+ ASSERT(so != NULL);
sti = (sotpi_info_t *)so->so_priv;
@@ -6845,6 +6904,9 @@
sti->sti_nl7c_uri = NULL;
sti->sti_nl7c_rcv_mp = NULL;
+ sti->sti_exdata_mp = NULL;
+ sti->sti_urgmark_mp = NULL;
+
mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
@@ -6870,6 +6932,9 @@
ASSERT(sti->sti_nl7c_uri == NULL);
ASSERT(sti->sti_nl7c_rcv_mp == NULL);
+ ASSERT(sti->sti_exdata_mp == NULL);
+ ASSERT(sti->sti_urgmark_mp == NULL);
+
mutex_destroy(&sti->sti_plumb_lock);
cv_destroy(&sti->sti_ack_cv);
}
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.h b/usr/src/uts/common/fs/sockfs/socktpi.h
index 4c1a5de..cee3a5d 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi.h
+++ b/usr/src/uts/common/fs/sockfs/socktpi.h
@@ -20,7 +20,7 @@
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -252,6 +252,12 @@
kssl_endpt_type_t sti_kssl_type; /* is proxy/is proxied/none */
kssl_ent_t sti_kssl_ent; /* SSL config entry */
kssl_ctx_t sti_kssl_ctx; /* SSL session context */
+
+ /*
+ * The mblks below are only allocated and used during fallback.
+ */
+ mblk_t *sti_exdata_mp; /* T_EXDATA_IND or SIGURG */
+ mblk_t *sti_urgmark_mp; /* mark indication */
} sotpi_info_t;
struct T_capability_ack;
@@ -259,8 +265,9 @@
extern sonodeops_t sotpi_sonodeops;
extern int socktpi_init(void);
-extern queue_t *sotpi_convert_sonode(struct sonode *, struct sockparams *,
- boolean_t *, struct cred *);
+extern int sotpi_convert_sonode(struct sonode *, struct sockparams *,
+ boolean_t *, queue_t **, struct cred *);
+extern void sotpi_revert_sonode(struct sonode *, struct cred *);
extern void sotpi_update_state(struct sonode *, struct T_capability_ack *,
struct sockaddr *, socklen_t, struct sockaddr *, socklen_t,
short);
diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c
index e26254a..eb0162a 100644
--- a/usr/src/uts/common/inet/ip/icmp.c
+++ b/usr/src/uts/common/inet/ip/icmp.c
@@ -151,6 +151,7 @@
static void icmp_wput_other(queue_t *q, mblk_t *mp);
static void icmp_wput_iocdata(queue_t *q, mblk_t *mp);
static void icmp_wput_restricted(queue_t *q, mblk_t *mp);
+static void icmp_ulp_recv(conn_t *, mblk_t *);
static void *rawip_stack_init(netstackid_t stackid, netstack_t *ns);
static void rawip_stack_fini(netstackid_t stackid, void *arg);
@@ -1131,6 +1132,7 @@
sin = sin_null;
sin.sin_family = AF_INET;
sin.sin_addr.s_addr = ipha->ipha_dst;
+
if (IPCL_IS_NONSTR(connp)) {
rw_enter(&icmp->icmp_rwlock, RW_WRITER);
if (icmp->icmp_state == TS_DATA_XFER) {
@@ -1147,13 +1149,13 @@
}
rw_exit(&icmp->icmp_rwlock);
} else {
-
mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL,
0, error);
if (mp1 != NULL)
putnext(connp->conn_rq, mp1);
}
done:
+ ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
freemsg(mp);
}
@@ -1264,14 +1266,8 @@
* message. Free it, then send our empty message.
*/
freemsg(mp);
- if (!IPCL_IS_NONSTR(connp)) {
- putnext(connp->conn_rq, newmp);
- } else {
- (*connp->conn_upcalls->su_recv)
- (connp->conn_upper_handle, newmp, 0, 0, &error,
- NULL);
- ASSERT(error == 0);
- }
+ icmp_ulp_recv(connp, newmp);
+
return;
}
case ICMP6_TIME_EXCEEDED:
@@ -1322,13 +1318,13 @@
}
rw_exit(&icmp->icmp_rwlock);
} else {
-
mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
NULL, 0, error);
if (mp1 != NULL)
putnext(connp->conn_rq, mp1);
}
done:
+ ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
freemsg(mp);
}
@@ -3339,7 +3335,8 @@
icmppa->icmp_param_value = new_value;
return (0);
}
-static void
+
+static mblk_t *
icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
{
ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
@@ -3356,13 +3353,56 @@
icmp->icmp_fallback_queue_tail->b_next = mp;
icmp->icmp_fallback_queue_tail = mp;
}
- mutex_exit(&icmp->icmp_recv_lock);
+ return (NULL);
} else {
/*
- * no more fallbacks possible, ok to drop lock.
+ * Fallback completed; let the caller putnext() the mblk.
*/
- mutex_exit(&icmp->icmp_recv_lock);
- putnext(icmp->icmp_connp->conn_rq, mp);
+ return (mp);
+ }
+}
+
+/*
+ * Deliver data to the ULP. If the socket is in the middle of a fallback
+ * to TPI, then we'll queue the mp for later processing.
+ */
+static void
+icmp_ulp_recv(conn_t *connp, mblk_t *mp)
+{
+
+ if (IPCL_IS_NONSTR(connp)) {
+ icmp_t *icmp = connp->conn_icmp;
+ int error;
+
+ if ((*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
+ NULL) < 0) {
+ mutex_enter(&icmp->icmp_recv_lock);
+ if (error == ENOSPC) {
+ /*
+ * let's confirm while holding the lock
+ */
+ if ((*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, NULL, 0, 0,
+ &error, NULL) < 0) {
+ ASSERT(error == ENOSPC);
+ if (error == ENOSPC) {
+ connp->conn_flow_cntrld =
+ B_TRUE;
+ }
+ }
+ mutex_exit(&icmp->icmp_recv_lock);
+ } else {
+ ASSERT(error == EOPNOTSUPP);
+ mp = icmp_queue_fallback(icmp, mp);
+ mutex_exit(&icmp->icmp_recv_lock);
+ if (mp != NULL)
+ putnext(connp->conn_rq, mp);
+ }
+ }
+ ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
+ } else {
+ putnext(connp->conn_rq, mp);
}
}
@@ -3391,7 +3431,6 @@
uint_t icmp_opt = 0;
boolean_t icmp_ipv6_recvhoplimit = B_FALSE;
uint_t hopstrip;
- int error;
ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
@@ -4038,35 +4077,8 @@
BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
deliver:
- if (IPCL_IS_NONSTR(connp)) {
- if ((*connp->conn_upcalls->su_recv)
- (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
- NULL) < 0) {
- mutex_enter(&icmp->icmp_recv_lock);
- if (error == ENOSPC) {
- /*
- * let's confirm while holding the lock
- */
- if ((*connp->conn_upcalls->su_recv)
- (connp->conn_upper_handle, NULL, 0, 0,
- &error, NULL) < 0) {
- if (error == ENOSPC) {
- connp->conn_flow_cntrld =
- B_TRUE;
- } else {
- ASSERT(error == EOPNOTSUPP);
- }
- }
- mutex_exit(&icmp->icmp_recv_lock);
- } else {
- ASSERT(error == EOPNOTSUPP);
- icmp_queue_fallback(icmp, mp);
- }
- }
- } else {
- putnext(connp->conn_rq, mp);
- }
- ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
+ icmp_ulp_recv(connp, mp);
+
}
/*
@@ -5968,7 +5980,7 @@
}
/* ARGSUSED */
-void
+int
rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
{
@@ -6032,20 +6044,14 @@
if (icmp->icmp_dontroute)
opts |= SO_DONTROUTE;
- /*
- * Once we grab the drain lock, no data will be send up
- * to the socket. So we notify the socket that the endpoint
- * is quiescent and it's therefore safe move data from
- * the socket to the stream head.
- */
(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
(struct sockaddr *)&laddr, laddrlen,
(struct sockaddr *)&faddr, faddrlen, opts);
/*
- * push up any packets that were queued in icmp_t
+ * Attempts to send data up during fallback will result in it being
+ * queued in icmp_t. Now we push up any queued packets.
*/
-
mutex_enter(&icmp->icmp_recv_lock);
while (icmp->icmp_fallback_queue_head != NULL) {
mblk_t *mp;
@@ -6058,15 +6064,22 @@
mutex_enter(&icmp->icmp_recv_lock);
}
icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
+
/*
* No longer a streams less socket
*/
+ rw_enter(&icmp->icmp_rwlock, RW_WRITER);
connp->conn_flags &= ~IPCL_NONSTR;
+ rw_exit(&icmp->icmp_rwlock);
+
mutex_exit(&icmp->icmp_recv_lock);
+
ASSERT(icmp->icmp_fallback_queue_head == NULL &&
icmp->icmp_fallback_queue_tail == NULL);
ASSERT(connp->conn_ref >= 1);
+
+ return (0);
}
/* ARGSUSED */
diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h
index f818247..241132b 100644
--- a/usr/src/uts/common/inet/rawip_impl.h
+++ b/usr/src/uts/common/inet/rawip_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
@@ -170,7 +170,7 @@
extern sock_lower_handle_t rawip_create(int, int, int, sock_downcalls_t **,
uint_t *, int *, int, cred_t *);
-extern void rawip_fallback(sock_lower_handle_t, queue_t *, boolean_t,
+extern int rawip_fallback(sock_lower_handle_t, queue_t *, boolean_t,
so_proto_quiesced_cb_t);
extern sock_downcalls_t sock_rawip_downcalls;
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index ddbcb82..af748fe 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -14059,6 +14059,11 @@
(*connp->conn_upcalls->su_recv)
(connp->conn_upper_handle, mp, seg_len,
MSG_OOB, &error, NULL);
+ /*
+ * We should never be in the middle of a
+ * fallback; the squeue guarantees that.
+ */
+ ASSERT(error != EOPNOTSUPP);
mp = NULL;
goto update_ack;
} else if (!tcp->tcp_urp_mp) {
@@ -15157,12 +15162,13 @@
if ((*connp->conn_upcalls->su_recv)
(connp->conn_upper_handle, mp,
seg_len, 0, &error, NULL) <= 0) {
- if (error == ENOSPC) {
+ /*
+ * We should never be in the middle of a
+ * fallback; the squeue guarantees that.
+ */
+ ASSERT(error != EOPNOTSUPP);
+ if (error == ENOSPC)
tcp->tcp_rwnd -= seg_len;
- } else if (error == EOPNOTSUPP) {
- tcp_rcv_enqueue(tcp, mp,
- seg_len);
- }
}
} else if (sodp != NULL) {
mutex_enter(sodp->sod_lockp);
@@ -15216,11 +15222,13 @@
if ((*connp->conn_upcalls->su_recv)(
connp->conn_upper_handle,
mp, seg_len, 0, &error, &push) <= 0) {
- if (error == ENOSPC) {
+ /*
+ * We should never be in the middle of a
+ * fallback; the squeue guarantees that.
+ */
+ ASSERT(error != EOPNOTSUPP);
+ if (error == ENOSPC)
tcp->tcp_rwnd -= seg_len;
- } else if (error == EOPNOTSUPP) {
- tcp_rcv_enqueue(tcp, mp, seg_len);
- }
} else if (push) {
/*
* PUSH bit set and sockfs is not
@@ -18169,9 +18177,8 @@
0, &error, &push);
if (space_left < 0) {
/*
- * At this point the eager is not
- * visible to anyone, so fallback
- * can not happen.
+ * We should never be in the middle of a
+ * fallback; the squeue guarantees that.
*/
ASSERT(error != EOPNOTSUPP);
}
@@ -27700,72 +27707,34 @@
/*
* tcp_fallback
*
- * A direct socket is falling back to using STREAMS. Hanging
- * off of the queue is a temporary tcp_t, which was created using
- * tcp_open(). The tcp_open() was called as part of the regular
- * sockfs create path, i.e., the SO_SOCKSTR flag is passed down,
- * and therefore the temporary tcp_t is marked to be a socket
- * (i.e., IPCL_SOCKET, tcp_issocket). So the optimizations
- * introduced by FireEngine will be used.
+ * A direct socket is falling back to using STREAMS. The queue
+ * that is being passed down was created using tcp_open() with
+ * the SO_FALLBACK flag set. As a result, the queue is not
+ * associated with a conn, and the q_ptrs instead contain the
+ * dev and minor area that should be used.
*
- * The tcp_t associated with the socket falling back will
- * still be marked as a socket, although the direct socket flag
- * (IPCL_NONSTR) is removed. A fall back to true TPI semantics
- * will not take place until a _SIOCSOCKFALLBACK ioctl is issued.
- *
- * If the above mentioned behavior, i.e., the tmp tcp_t is created
- * as a STREAMS/TPI endpoint, then we will need to do more work here.
- * Such as inserting the direct socket into the acceptor hash.
+ * The 'direct_sockfs' flag indicates whether the FireEngine
+ * optimizations should be used. The common case would be that
+ * optimizations are enabled, and they might be subsequently
+ * disabled using the _SIOCSOCKFALLBACK ioctl.
+ */
+
+/*
+ * An active connection is falling back to TPI. Gather all the information
+ * required by the STREAM head and TPI sonode and send it up.
*/
void
-tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
+tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
{
- tcp_t *tcp, *eager;
- conn_t *connp = (conn_t *)proto_handle;
- int error;
+ conn_t *connp = tcp->tcp_connp;
+ struct stroptions *stropt;
struct T_capability_ack tca;
struct sockaddr_in6 laddr, faddr;
socklen_t laddrlen, faddrlen;
short opts;
- struct stroptions *stropt;
- mblk_t *stropt_mp;
+ int error;
mblk_t *mp;
- mblk_t *conn_ind_head = NULL;
- mblk_t *conn_ind_tail = NULL;
- mblk_t *ordrel_mp;
- mblk_t *fused_sigurp_mp;
-
- tcp = connp->conn_tcp;
- /*
- * No support for acceptor fallback
- */
- ASSERT(q->q_qinfo != &tcp_acceptor_rinit);
-
- stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
-
- /* Pre-allocate the T_ordrel_ind mblk. */
- ASSERT(tcp->tcp_ordrel_mp == NULL);
- ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
- STR_NOSIG, NULL);
- ordrel_mp->b_datap->db_type = M_PROTO;
- ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
- ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
-
- /* Pre-allocate the M_PCSIG anyway */
- fused_sigurp_mp = allocb_wait(1, BPRI_HI, STR_NOSIG, NULL);
-
- /*
- * Enter the squeue so that no new packets can come in
- */
- error = squeue_synch_enter(connp->conn_sqp, connp, 0);
- if (error != 0) {
- /* failed to enter, free all the pre-allocated messages. */
- freeb(stropt_mp);
- freeb(ordrel_mp);
- freeb(fused_sigurp_mp);
- return;
- }
/* Disable I/OAT during fallback */
tcp->tcp_sodirect = NULL;
@@ -27814,10 +27783,8 @@
tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
laddrlen = faddrlen = sizeof (sin6_t);
- (void) tcp_getsockname(proto_handle, (struct sockaddr *)&laddr,
- &laddrlen, CRED());
- error = tcp_getpeername(proto_handle, (struct sockaddr *)&faddr,
- &faddrlen, CRED());
+ (void) tcp_do_getsockname(tcp, (struct sockaddr *)&laddr, &laddrlen);
+ error = tcp_do_getpeername(tcp, (struct sockaddr *)&faddr, &faddrlen);
if (error != 0)
faddrlen = 0;
@@ -27844,6 +27811,90 @@
tcp->tcp_rcv_last_head = NULL;
tcp->tcp_rcv_last_tail = NULL;
tcp->tcp_rcv_cnt = 0;
+}
+
+/*
+ * An eager is falling back to TPI. All we have to do is send
+ * up a T_CONN_IND.
+ */
+void
+tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs)
+{
+ tcp_t *listener = eager->tcp_listener;
+ mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind;
+
+ ASSERT(listener != NULL);
+ ASSERT(mp != NULL);
+
+ eager->tcp_conn.tcp_eager_conn_ind = NULL;
+
+ /*
+ * TLI/XTI applications will get confused by
+ * sending the eager as an option since it violates
+ * the option semantics. So remove the eager as an
+ * option since the TLI/XTI app doesn't need it anyway.
+ */
+ if (!direct_sockfs) {
+ struct T_conn_ind *conn_ind;
+
+ conn_ind = (struct T_conn_ind *)mp->b_rptr;
+ conn_ind->OPT_length = 0;
+ conn_ind->OPT_offset = 0;
+ }
+
+ /*
+ * Sockfs guarantees that the listener will not be closed
+ * during fallback. So we can safely use the listener's queue.
+ */
+ putnext(listener->tcp_rq, mp);
+}
+
+int
+tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
+ boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
+{
+ tcp_t *tcp;
+ conn_t *connp = (conn_t *)proto_handle;
+ int error;
+ mblk_t *stropt_mp;
+ mblk_t *ordrel_mp;
+ mblk_t *fused_sigurp_mp;
+
+ tcp = connp->conn_tcp;
+
+ stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
+ NULL);
+
+ /* Pre-allocate the T_ordrel_ind mblk. */
+ ASSERT(tcp->tcp_ordrel_mp == NULL);
+ ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
+ STR_NOSIG, NULL);
+ ordrel_mp->b_datap->db_type = M_PROTO;
+ ((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
+ ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
+
+ /* Pre-allocate the M_PCSIG used by fusion */
+ fused_sigurp_mp = allocb_wait(1, BPRI_HI, STR_NOSIG, NULL);
+
+ /*
+ * Enter the squeue so that no new packets can come in
+ */
+ error = squeue_synch_enter(connp->conn_sqp, connp, 0);
+ if (error != 0) {
+ /* failed to enter, free all the pre-allocated messages. */
+ freeb(stropt_mp);
+ freeb(ordrel_mp);
+ freeb(fused_sigurp_mp);
+ /*
+ * We cannot process the eager, so at least send out an
+ * RST so that the peer can reconnect.
+ */
+ if (tcp->tcp_listener != NULL) {
+ (void) tcp_eager_blowoff(tcp->tcp_listener,
+ tcp->tcp_conn_req_seqnum);
+ }
+ return (ENOMEM);
+ }
/*
* No longer a direct socket
@@ -27859,45 +27910,13 @@
freeb(fused_sigurp_mp);
}
- /*
- * Send T_CONN_IND messages for all ESTABLISHED connections.
- */
- mutex_enter(&tcp->tcp_eager_lock);
- for (eager = tcp->tcp_eager_next_q; eager != NULL;
- eager = eager->tcp_eager_next_q) {
- mp = eager->tcp_conn.tcp_eager_conn_ind;
-
- eager->tcp_conn.tcp_eager_conn_ind = NULL;
- ASSERT(mp != NULL);
- /*
- * TLI/XTI applications will get confused by
- * sending eager as an option since it violates
- * the option semantics. So remove the eager as
- * option since TLI/XTI app doesn't need it anyway.
- */
- if (!TCP_IS_SOCKET(tcp)) {
- struct T_conn_ind *conn_ind;
-
- conn_ind = (struct T_conn_ind *)mp->b_rptr;
- conn_ind->OPT_length = 0;
- conn_ind->OPT_offset = 0;
- }
- if (conn_ind_head == NULL) {
- conn_ind_head = mp;
- } else {
- conn_ind_tail->b_next = mp;
- }
- conn_ind_tail = mp;
- }
- mutex_exit(&tcp->tcp_eager_lock);
-
- mp = conn_ind_head;
- while (mp != NULL) {
- mblk_t *nmp = mp->b_next;
- mp->b_next = NULL;
-
- putnext(tcp->tcp_rq, mp);
- mp = nmp;
+ if (tcp->tcp_listener != NULL) {
+ /* The eager will deal with opts when accept() is called */
+ freeb(stropt_mp);
+ tcp_fallback_eager(tcp, direct_sockfs);
+ } else {
+ tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
+ quiesced_cb);
}
/*
@@ -27905,6 +27924,8 @@
*/
ASSERT(connp->conn_ref >= 2);
squeue_synch_exit(connp->conn_sqp, connp);
+
+ return (0);
}
/* ARGSUSED */
diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c
index a8b6780..7ac90ce 100644
--- a/usr/src/uts/common/inet/tcp/tcp_fusion.c
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c
@@ -805,6 +805,7 @@
(*peer_tcp->tcp_connp->conn_upcalls->su_recv)(
peer_tcp->tcp_connp->conn_upper_handle, mp, recv_size,
flags, &error, &push);
+ ASSERT(error != EOPNOTSUPP);
} else {
if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) &&
(tcp->tcp_valid_bits & TCP_URG_VALID) &&
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 97374be..e61c854 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -277,7 +277,7 @@
extern sock_lower_handle_t tcp_create(int, int, int, sock_downcalls_t **,
uint_t *, int *, int, cred_t *);
-extern void tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
+extern int tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
so_proto_quiesced_cb_t);
extern sock_downcalls_t sock_tcp_downcalls;
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index f141ee0..c473afd 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -227,6 +227,7 @@
static int udp_send_connected(conn_t *, mblk_t *, struct nmsghdr *,
cred_t *, pid_t);
+static void udp_ulp_recv(conn_t *, mblk_t *);
/* Common routine for TPI and socket module */
static conn_t *udp_do_open(cred_t *, boolean_t, int);
@@ -1206,7 +1207,7 @@
*/
static void
udp_icmp_error(conn_t *connp, mblk_t *mp)
- {
+{
icmph_t *icmph;
ipha_t *ipha;
int iph_hdr_length;
@@ -1286,7 +1287,6 @@
if (sin.sin_port == udp->udp_dstport &&
sin.sin_addr.s_addr ==
V4_PART_OF_V6(udp->udp_v6dst)) {
-
rw_exit(&udp->udp_rwlock);
(*connp->conn_upcalls->su_set_error)
(connp->conn_upper_handle, error);
@@ -1324,7 +1324,6 @@
}
rw_exit(&udp->udp_rwlock);
} else {
-
mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
NULL, 0, error);
}
@@ -1333,6 +1332,7 @@
if (mp1 != NULL)
putnext(connp->conn_rq, mp1);
done:
+ ASSERT(!RW_ISWRITER(&udp->udp_rwlock));
freemsg(mp);
}
@@ -1444,13 +1444,8 @@
* message. Free it, then send our empty message.
*/
freemsg(mp);
- if (!IPCL_IS_NONSTR(connp)) {
- putnext(connp->conn_rq, newmp);
- } else {
- (*connp->conn_upcalls->su_recv)
- (connp->conn_upper_handle, newmp, 0, 0, &error,
- NULL);
- }
+ udp_ulp_recv(connp, newmp);
+
return;
}
case ICMP6_TIME_EXCEEDED:
@@ -1508,8 +1503,8 @@
if (mp1 != NULL)
putnext(connp->conn_rq, mp1);
}
-
done:
+ ASSERT(!RW_ISWRITER(&udp->udp_rwlock));
freemsg(mp);
}
@@ -3689,7 +3684,7 @@
}
}
-static void
+static mblk_t *
udp_queue_fallback(udp_t *udp, mblk_t *mp)
{
ASSERT(MUTEX_HELD(&udp->udp_recv_lock));
@@ -3706,13 +3701,55 @@
udp->udp_fallback_queue_tail->b_next = mp;
udp->udp_fallback_queue_tail = mp;
}
- mutex_exit(&udp->udp_recv_lock);
+ return (NULL);
} else {
/*
- * no more fallbacks possible, ok to drop lock.
+ * Fallback completed; let the caller putnext() the mblk.
*/
- mutex_exit(&udp->udp_recv_lock);
- putnext(udp->udp_connp->conn_rq, mp);
+ return (mp);
+ }
+}
+
+/*
+ * Deliver data to the ULP. If the socket is in the middle of a fallback
+ * to TPI, then we'll queue the mp for later processing.
+ */
+static void
+udp_ulp_recv(conn_t *connp, mblk_t *mp)
+{
+ if (IPCL_IS_NONSTR(connp)) {
+ udp_t *udp = connp->conn_udp;
+ int error;
+
+ if ((*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
+ NULL) < 0) {
+ mutex_enter(&udp->udp_recv_lock);
+ if (error == ENOSPC) {
+ /*
+ * let's confirm while holding the lock
+ */
+ if ((*connp->conn_upcalls->su_recv)
+ (connp->conn_upper_handle, NULL, 0, 0,
+ &error, NULL) < 0) {
+ ASSERT(error == ENOSPC);
+ if (error == ENOSPC) {
+ connp->conn_flow_cntrld =
+ B_TRUE;
+ }
+ }
+ mutex_exit(&udp->udp_recv_lock);
+ } else {
+ ASSERT(error == EOPNOTSUPP);
+ mp = udp_queue_fallback(udp, mp);
+ mutex_exit(&udp->udp_recv_lock);
+ if (mp != NULL)
+ putnext(connp->conn_rq, mp);
+ }
+ }
+ ASSERT(MUTEX_NOT_HELD(&udp->udp_recv_lock));
+ } else {
+ putnext(connp->conn_rq, mp);
}
}
@@ -4463,37 +4500,8 @@
if (options_mp != NULL)
freeb(options_mp);
- if (IPCL_IS_NONSTR(connp)) {
- int error;
+ udp_ulp_recv(connp, mp);
- if ((*connp->conn_upcalls->su_recv)
- (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
- NULL) < 0) {
- mutex_enter(&udp->udp_recv_lock);
- if (error == ENOSPC) {
- /*
- * let's confirm while holding the lock
- */
- if ((*connp->conn_upcalls->su_recv)
- (connp->conn_upper_handle, NULL, 0, 0,
- &error, NULL) < 0) {
- if (error == ENOSPC) {
- connp->conn_flow_cntrld =
- B_TRUE;
- } else {
- ASSERT(error == EOPNOTSUPP);
- }
- }
- mutex_exit(&udp->udp_recv_lock);
- } else {
- ASSERT(error == EOPNOTSUPP);
- udp_queue_fallback(udp, mp);
- }
- }
- } else {
- putnext(connp->conn_rq, mp);
- }
- ASSERT(MUTEX_NOT_HELD(&udp->udp_recv_lock));
return;
tossit:
@@ -5846,7 +5854,7 @@
/* M_DATA for connected socket */
- ASSERT(udp->udp_issocket || IPCL_IS_NONSTR(connp));
+ ASSERT(udp->udp_issocket);
UDP_DBGSTAT(us, udp_data_conn);
mutex_enter(&connp->conn_lock);
@@ -7990,6 +7998,7 @@
us = udp->udp_us;
ASSERT(us != NULL);
+ udp->udp_issocket = B_TRUE;
connp->conn_flags |= IPCL_NONSTR | IPCL_SOCKET;
/* Set flow control */
@@ -9272,7 +9281,7 @@
return (udp->udp_dgram_errind ? error : 0);
}
-void
+int
udp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
{
@@ -9340,21 +9349,15 @@
if (udp->udp_dontroute)
opts |= SO_DONTROUTE;
- /*
- * Once we grab the drain lock, no data will be send up
- * to the socket. So we notify the socket that the endpoint
- * is quiescent and it's therefore safe move data from
- * the socket to the stream head.
- */
(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
(struct sockaddr *)&laddr, laddrlen,
(struct sockaddr *)&faddr, faddrlen, opts);
- /*
- * push up any packets that were queued in udp_t
- */
-
mutex_enter(&udp->udp_recv_lock);
+ /*
+ * Attempts to send data up during fallback will result in it being
+ * queued in udp_t. Now we push up any queued packets.
+ */
while (udp->udp_fallback_queue_head != NULL) {
mblk_t *mp;
mp = udp->udp_fallback_queue_head;
@@ -9368,10 +9371,15 @@
/*
* No longer a streams less socket
*/
+ rw_enter(&udp->udp_rwlock, RW_WRITER);
connp->conn_flags &= ~IPCL_NONSTR;
+ rw_exit(&udp->udp_rwlock);
+
mutex_exit(&udp->udp_recv_lock);
ASSERT(connp->conn_ref >= 1);
+
+ return (0);
}
static int
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index 96f84e4..ba370cb 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -391,7 +391,7 @@
extern sock_lower_handle_t udp_create(int, int, int, sock_downcalls_t **,
uint_t *, int *, int, cred_t *);
-extern void udp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
+extern int udp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
so_proto_quiesced_cb_t);
extern sock_downcalls_t sock_udp_downcalls;
diff --git a/usr/src/uts/common/sys/socket_proto.h b/usr/src/uts/common/sys/socket_proto.h
index 8f60ea9..12c9547 100644
--- a/usr/src/uts/common/sys/socket_proto.h
+++ b/usr/src/uts/common/sys/socket_proto.h
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -129,7 +129,7 @@
typedef void (*so_proto_quiesced_cb_t)(sock_upper_handle_t, queue_t *,
struct T_capability_ack *, struct sockaddr *, socklen_t,
struct sockaddr *, socklen_t, short);
-typedef void (*so_proto_fallback_func_t)(sock_lower_handle_t, queue_t *,
+typedef int (*so_proto_fallback_func_t)(sock_lower_handle_t, queue_t *,
boolean_t, so_proto_quiesced_cb_t);
/*
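With the typedef above now returning int, a protocol's fallback downcall can report failure and sockfs will revert the sonode (see so_tpi_fallback() earlier in this patch). A hypothetical skeleton conforming to the new signature; the "myproto" name is invented for illustration:

	#include <sys/stream.h>
	#include <sys/socket_proto.h>

	/*
	 * Hypothetical protocol fallback entry point.  A non-zero return
	 * tells sockfs that the protocol could not be quiesced, and the
	 * sonode is reverted to its non-STREAMS form.
	 */
	/* ARGSUSED */
	static int
	myproto_fallback(sock_lower_handle_t proto_handle, queue_t *q,
	    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
	{
		/*
		 * For sockets sitting on the accept queue, sockfs passes a
		 * NULL queue and a NULL quiesced callback; only a T_CONN_IND
		 * has to be sent up in that case.
		 */
		if (q == NULL)
			return (0);

		/*
		 * A real protocol would quiesce its receive path, invoke
		 * quiesced_cb() with the captured addresses and capabilities,
		 * and then drain any data queued during the transition.
		 */
		return (0);
	}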
diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h
index c7c0f0a..f4f026f 100644
--- a/usr/src/uts/common/sys/socketvar.h
+++ b/usr/src/uts/common/sys/socketvar.h
@@ -291,14 +291,12 @@
#define SS_SODIRECT 0x00400000 /* transport supports sodirect */
-/* unused 0x01000000 */ /* was SS_LADDR_VALID */
-/* unused 0x02000000 */ /* was SS_FADDR_VALID */
+#define SS_SENTLASTREADSIG 0x01000000 /* last rx signal has been sent */
+#define SS_SENTLASTWRITESIG 0x02000000 /* last tx signal has been sent */
-#define SS_SENTLASTREADSIG 0x10000000 /* last rx signal has been sent */
-#define SS_SENTLASTWRITESIG 0x20000000 /* last tx signal has been sent */
-
-#define SS_FALLBACK_PENDING 0x40000000
-#define SS_FALLBACK_COMP 0x80000000
+#define SS_FALLBACK_DRAIN 0x20000000 /* data was/is being drained */
+#define SS_FALLBACK_PENDING 0x40000000 /* fallback is pending */
+#define SS_FALLBACK_COMP 0x80000000 /* fallback has completed */
/* Set of states when the socket can't be rebound */
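Since the last-signal flags moved down to make room for the fallback flags, the two groups must stay disjoint. A compile-time sanity sketch, assuming CTASSERT from <sys/debug.h> and the values above:

	#include <sys/debug.h>
	#include <sys/socketvar.h>

	/* The fallback bits must never alias the last-signal bits. */
	CTASSERT(((SS_SENTLASTREADSIG | SS_SENTLASTWRITESIG) &
	    (SS_FALLBACK_DRAIN | SS_FALLBACK_PENDING | SS_FALLBACK_COMP)) == 0);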