6799655 sockets need better handling of STREAMS ioctls
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
index 4683188..7b21fac 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_sops.c
@@ -1203,7 +1203,7 @@
 	}
 
 	mutex_enter(&so->so_lock);
-	if (so->so_state & (SS_FALLBACK_PENDING | SS_FALLBACK_COMP)) {
+	if (so->so_state & (SS_FALLBACK_DRAIN | SS_FALLBACK_COMP)) {
 		SOD_DISABLE(sodp);
 		mutex_exit(&so->so_lock);
 		*errorp = EOPNOTSUPP;
diff --git a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
index dd114db..d01447c 100644
--- a/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
+++ b/usr/src/uts/common/fs/sockfs/sockcommon_subr.c
@@ -37,6 +37,7 @@
 #include <sys/strsubr.h>
 #include <sys/strsun.h>
 #include <sys/atomic.h>
+#include <sys/tihdr.h>
 
 #include <fs/sockfs/sockcommon.h>
 #include <fs/sockfs/socktpi.h>
@@ -1515,8 +1516,75 @@
 }
 
 /*
- * Process STREAMS related ioctls. If a I_PUSH/POP operation is specified
- * then the socket will fall back to TPI.
+ * Handle the I_NREAD STREAM ioctl.
+ */
+static int
+so_strioc_nread(struct sonode *so, intptr_t arg, int mode, int32_t *rvalp)
+{
+	size_t size = 0;
+	int retval;
+	int count = 0;
+	mblk_t *mp;
+
+	if (so->so_downcalls == NULL ||
+	    so->so_downcalls->sd_recv_uio != NULL)
+		return (EINVAL);
+
+	mutex_enter(&so->so_lock);
+	/* Wait for reader to get out of the way. */
+	while (so->so_flag & SOREADLOCKED) {
+		/*
+		 * If reader is waiting for data, then there should be nothing
+		 * on the rcv queue.
+		 */
+		if (so->so_rcv_wakeup)
+			goto out;
+
+		so->so_flag |= SOWANT;
+		/* Do a timed sleep, in case the reader goes to sleep. */
+		(void) cv_timedwait(&so->so_state_cv, &so->so_lock,
+		    lbolt + drv_usectohz(10));
+	}
+
+	/*
+	 * Since we are holding so_lock no new reader will come in, and the
+	 * protocol will not be able to enqueue data. So it's safe to walk
+	 * both rcv queues.
+	 */
+	mp = so->so_rcv_q_head;
+	if (mp != NULL) {
+		size = msgdsize(so->so_rcv_q_head);
+		for (; mp != NULL; mp = mp->b_next)
+			count++;
+	} else {
+		/*
+		 * In case the processing list was empty, get the size of the
+		 * next msg in line.
+		 */
+		size = msgdsize(so->so_rcv_head);
+	}
+
+	for (mp = so->so_rcv_head; mp != NULL; mp = mp->b_next)
+		count++;
+out:
+	mutex_exit(&so->so_lock);
+
+	/*
+	 * Drop down from size_t to the "int" required by the
+	 * interface.  Cap at INT_MAX.
+	 */
+	retval = MIN(size, INT_MAX);
+	if (so_copyout(&retval, (void *)arg, sizeof (retval),
+	    (mode & (int)FKIOCTL))) {
+		return (EFAULT);
+	} else {
+		*rvalp = count;
+		return (0);
+	}
+}
+
+/*
+ * Process STREAM ioctls.
  *
  * Returns:
  *   < 0  - ioctl was not handle
@@ -1526,32 +1594,42 @@
 socket_strioc_common(struct sonode *so, int cmd, intptr_t arg, int mode,
     struct cred *cr, int32_t *rvalp)
 {
+	int retval;
+
+	/* Only STREAM iotcls are handled here */
+	if ((cmd & 0xffffff00U) != STR)
+		return (-1);
+
 	switch (cmd) {
-	case _I_INSERT:
-	case _I_REMOVE:
-	case I_FIND:
-	case I_LIST:
+	case I_CANPUT:
+		/*
+		 * We return an error for I_CANPUT so that isastream(3C) will
+		 * not report the socket as being a STREAM.
+		 */
 		return (EOPNOTSUPP);
-
-	case I_PUSH:
-	case I_POP: {
-		int retval;
-
-		if ((retval = so_tpi_fallback(so, cr)) == 0) {
-			/* Reissue the ioctl */
-			ASSERT(so->so_rcv_q_head == NULL);
-			return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
-		}
-		return (retval);
-	}
+	case I_NREAD:
+		/* Avoid doing a fallback for I_NREAD. */
+		return (so_strioc_nread(so, arg, mode, rvalp));
 	case I_LOOK:
+		/* Avoid doing a fallback for I_LOOK. */
 		if (so_copyout("sockmod", (void *)arg, strlen("sockmod") + 1,
 		    (mode & (int)FKIOCTL))) {
 			return (EFAULT);
 		}
 		return (0);
 	default:
-		return (-1);
+		break;
+	}
+
+	/*
+	 * Try to fall back to TPI, and if successful, reissue the ioctl.
+	 */
+	if ((retval = so_tpi_fallback(so, cr)) == 0) {
+		/* Reissue the ioctl */
+		ASSERT(so->so_rcv_q_head == NULL);
+		return (SOP_IOCTL(so, cmd, arg, mode, cr, rvalp));
+	} else {
+		return (retval);
 	}
 }
 
@@ -1851,7 +1929,7 @@
 	ASSERT(RW_ISWRITER(&so->so_fallback_rwlock));
 
 	mutex_enter(&so->so_lock);
-	so->so_state &= ~SS_FALLBACK_PENDING;
+	so->so_state &= ~(SS_FALLBACK_PENDING|SS_FALLBACK_DRAIN);
 	mutex_exit(&so->so_lock);
 
 	rw_downgrade(&so->so_fallback_rwlock);
@@ -1867,8 +1945,6 @@
  * is safe to synchronize the state. Data can also be moved without
  * risk for reordering.
  *
- * NOTE: urgent data is dropped on the floor.
- *
  * We do not need to hold so_lock, since there can be only one thread
  * operating on the sonode.
  */
@@ -1878,15 +1954,21 @@
     struct sockaddr *faddr, socklen_t faddrlen, short opts)
 {
 	struct sonode *so = (struct sonode *)sock_handle;
+	boolean_t atmark;
 
 	sotpi_update_state(so, tcap, laddr, laddrlen, faddr, faddrlen, opts);
 
+	/*
+	 * Some protocols do not quiece the data path during fallback. Once
+	 * we set the SS_FALLBACK_DRAIN flag any attempt to queue data will
+	 * fail and the protocol is responsible for saving the data for later
+	 * delivery (i.e., once the fallback has completed).
+	 */
 	mutex_enter(&so->so_lock);
+	so->so_state |= SS_FALLBACK_DRAIN;
 	SOCKET_TIMER_CANCEL(so);
 	mutex_exit(&so->so_lock);
-	/*
-	 * Move data to the STREAM head.
-	 */
+
 	if (so->so_rcv_head != NULL) {
 		if (so->so_rcv_q_last_head == NULL)
 			so->so_rcv_q_head = so->so_rcv_head;
@@ -1895,6 +1977,20 @@
 		so->so_rcv_q_last_head = so->so_rcv_last_head;
 	}
 
+	atmark = (so->so_state & SS_RCVATMARK) != 0;
+	/*
+	 * Clear any OOB state having to do with pending data. The TPI
+	 * code path will set the appropriate oob state when we move the
+	 * oob data to the STREAM head. We leave SS_HADOOBDATA since the oob
+	 * data has already been consumed.
+	 */
+	so->so_state &= ~(SS_RCVATMARK|SS_OOBPEND|SS_HAVEOOBDATA);
+
+	ASSERT(so->so_oobmsg != NULL || so->so_oobmark <= so->so_rcv_queued);
+
+	/*
+	 * Move data to the STREAM head.
+	 */
 	while (so->so_rcv_q_head != NULL) {
 		mblk_t *mp = so->so_rcv_q_head;
 		size_t mlen = msgdsize(mp);
@@ -1902,33 +1998,200 @@
 		so->so_rcv_q_head = mp->b_next;
 		mp->b_next = NULL;
 		mp->b_prev = NULL;
+
+		/*
+		 * Send T_EXDATA_IND if we are at the oob mark.
+		 */
+		if (atmark) {
+			struct T_exdata_ind *tei;
+			mblk_t *mp1 = SOTOTPI(so)->sti_exdata_mp;
+
+			SOTOTPI(so)->sti_exdata_mp = NULL;
+			ASSERT(mp1 != NULL);
+			mp1->b_datap->db_type = M_PROTO;
+			tei = (struct T_exdata_ind *)mp1->b_rptr;
+			tei->PRIM_type = T_EXDATA_IND;
+			tei->MORE_flag = 0;
+			mp1->b_wptr = (uchar_t *)&tei[1];
+
+			if (IS_SO_OOB_INLINE(so)) {
+				mp1->b_cont = mp;
+			} else {
+				ASSERT(so->so_oobmsg != NULL);
+				mp1->b_cont = so->so_oobmsg;
+				so->so_oobmsg = NULL;
+
+				/* process current mp next time around */
+				mp->b_next = so->so_rcv_q_head;
+				so->so_rcv_q_head = mp;
+				mlen = 0;
+			}
+			mp = mp1;
+
+			/* we have consumed the oob mark */
+			atmark = B_FALSE;
+		} else if (so->so_oobmark > 0) {
+			/*
+			 * Check if the OOB mark is within the current
+			 * mblk chain. In that case we have to split it up.
+			 */
+			if (so->so_oobmark < mlen) {
+				mblk_t *urg_mp = mp;
+
+				atmark = B_TRUE;
+				mp = NULL;
+				mlen = so->so_oobmark;
+
+				/*
+				 * It is assumed that the OOB mark does
+				 * not land within a mblk.
+				 */
+				do {
+					so->so_oobmark -= MBLKL(urg_mp);
+					mp = urg_mp;
+					urg_mp = urg_mp->b_cont;
+				} while (so->so_oobmark > 0);
+				mp->b_cont = NULL;
+				if (urg_mp != NULL) {
+					urg_mp->b_next = so->so_rcv_q_head;
+					so->so_rcv_q_head = urg_mp;
+				}
+			} else {
+				so->so_oobmark -= mlen;
+				if (so->so_oobmark == 0)
+					atmark = B_TRUE;
+			}
+		}
+
+		/*
+		 * Queue data on the STREAM head.
+		 */
 		so->so_rcv_queued -= mlen;
 		putnext(q, mp);
 	}
-	ASSERT(so->so_rcv_queued == 0);
 	so->so_rcv_head = NULL;
 	so->so_rcv_last_head = NULL;
 	so->so_rcv_q_head = NULL;
 	so->so_rcv_q_last_head = NULL;
 
-#ifdef DEBUG
-	if (so->so_oobmsg != NULL || so->so_oobmark > 0) {
-		cmn_err(CE_NOTE, "losing oob data due to tpi fallback\n");
-	}
-#endif
-	if (so->so_oobmsg != NULL) {
-		freemsg(so->so_oobmsg);
-		so->so_oobmsg = NULL;
-	}
-	so->so_oobmark = 0;
+	/*
+	 * Check if the oob byte is at the end of the data stream, or if the
+	 * oob byte has not yet arrived. In the latter case we have to send a
+	 * SIGURG and a mark indicator to the STREAM head. The mark indicator
+	 * is needed to guarantee correct behavior for SIOCATMARK. See block
+	 * comment in socktpi.h for more details.
+	 */
+	if (atmark || so->so_oobmark > 0) {
+		mblk_t *mp;
 
+		if (atmark && so->so_oobmsg != NULL) {
+			struct T_exdata_ind *tei;
+
+			mp = SOTOTPI(so)->sti_exdata_mp;
+			SOTOTPI(so)->sti_exdata_mp = NULL;
+			ASSERT(mp != NULL);
+			mp->b_datap->db_type = M_PROTO;
+			tei = (struct T_exdata_ind *)mp->b_rptr;
+			tei->PRIM_type = T_EXDATA_IND;
+			tei->MORE_flag = 0;
+			mp->b_wptr = (uchar_t *)&tei[1];
+
+			mp->b_cont = so->so_oobmsg;
+			so->so_oobmsg = NULL;
+
+			putnext(q, mp);
+		} else {
+			/* Send up the signal */
+			mp = SOTOTPI(so)->sti_exdata_mp;
+			SOTOTPI(so)->sti_exdata_mp = NULL;
+			ASSERT(mp != NULL);
+			DB_TYPE(mp) = M_PCSIG;
+			*mp->b_wptr++ = (uchar_t)SIGURG;
+			putnext(q, mp);
+
+			/* Send up the mark indicator */
+			mp = SOTOTPI(so)->sti_urgmark_mp;
+			SOTOTPI(so)->sti_urgmark_mp = NULL;
+			mp->b_flag = atmark ? MSGMARKNEXT : MSGNOTMARKNEXT;
+			putnext(q, mp);
+
+			so->so_oobmark = 0;
+		}
+	}
+
+	if (SOTOTPI(so)->sti_exdata_mp != NULL) {
+		freeb(SOTOTPI(so)->sti_exdata_mp);
+		SOTOTPI(so)->sti_exdata_mp = NULL;
+	}
+
+	if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
+		freeb(SOTOTPI(so)->sti_urgmark_mp);
+		SOTOTPI(so)->sti_urgmark_mp = NULL;
+	}
+
+	ASSERT(so->so_oobmark == 0);
 	ASSERT(so->so_rcv_queued == 0);
 }
 
+#ifdef DEBUG
+/*
+ * Do an integrity check of the sonode. This should be done if a
+ * fallback fails after sonode has initially been converted to use
+ * TPI and subsequently have to be reverted.
+ *
+ * Failure to pass the integrity check will panic the system.
+ */
+void
+so_integrity_check(struct sonode *cur, struct sonode *orig)
+{
+	VERIFY(cur->so_vnode == orig->so_vnode);
+	VERIFY(cur->so_ops == orig->so_ops);
+	/*
+	 * For so_state we can only VERIFY the state flags in CHECK_STATE.
+	 * The other state flags might be affected by a notification from the
+	 * protocol.
+	 */
+#define	CHECK_STATE	(SS_CANTRCVMORE|SS_CANTSENDMORE|SS_NDELAY|SS_NONBLOCK| \
+	SS_ASYNC|SS_ACCEPTCONN|SS_SAVEDEOR|SS_RCVATMARK|SS_OOBPEND| \
+	SS_HAVEOOBDATA|SS_HADOOBDATA|SS_SENTLASTREADSIG|SS_SENTLASTWRITESIG)
+	VERIFY((cur->so_state & (orig->so_state & CHECK_STATE)) ==
+	    (orig->so_state & CHECK_STATE));
+	VERIFY(cur->so_mode == orig->so_mode);
+	VERIFY(cur->so_flag == orig->so_flag);
+	VERIFY(cur->so_count == orig->so_count);
+	/* Cannot VERIFY so_proto_connid; proto can update it */
+	VERIFY(cur->so_sockparams == orig->so_sockparams);
+	/* an error might have been recorded, but it can not be lost */
+	VERIFY(cur->so_error != 0 || orig->so_error == 0);
+	VERIFY(cur->so_family == orig->so_family);
+	VERIFY(cur->so_type == orig->so_type);
+	VERIFY(cur->so_protocol == orig->so_protocol);
+	VERIFY(cur->so_version == orig->so_version);
+	/* New conns might have arrived, but none should have been lost */
+	VERIFY(cur->so_acceptq_len >= orig->so_acceptq_len);
+	VERIFY(cur->so_acceptq_head == orig->so_acceptq_head);
+	VERIFY(cur->so_backlog == orig->so_backlog);
+	/* New OOB migth have arrived, but mark should not have been lost */
+	VERIFY(cur->so_oobmark >= orig->so_oobmark);
+	/* Cannot VERIFY so_oobmsg; the proto might have sent up a new one */
+	VERIFY(cur->so_pgrp == orig->so_pgrp);
+	VERIFY(cur->so_peercred == orig->so_peercred);
+	VERIFY(cur->so_cpid == orig->so_cpid);
+	VERIFY(cur->so_zoneid == orig->so_zoneid);
+	/* New data migth have arrived, but none should have been lost */
+	VERIFY(cur->so_rcv_queued >= orig->so_rcv_queued);
+	VERIFY(cur->so_rcv_q_head == orig->so_rcv_q_head);
+	VERIFY(cur->so_rcv_head == orig->so_rcv_head);
+	VERIFY(cur->so_proto_handle == orig->so_proto_handle);
+	VERIFY(cur->so_downcalls == orig->so_downcalls);
+	/* Cannot VERIFY so_proto_props; they can be updated by proto */
+}
+#endif
+
 /*
  * so_tpi_fallback()
  *
- * This is fallback initation routine; things start here.
+ * This is the fallback initation routine; things start here.
  *
  * Basic strategy:
  *   o Block new socket operations from coming in
@@ -1944,10 +2207,13 @@
 	int error;
 	queue_t *q;
 	struct sockparams *sp;
-	struct sockparams *newsp;
+	struct sockparams *newsp = NULL;
 	so_proto_fallback_func_t fbfunc;
 	boolean_t direct;
-
+	struct sonode *nso;
+#ifdef DEBUG
+	struct sonode origso;
+#endif
 	error = 0;
 	sp = so->so_sockparams;
 	fbfunc = sp->sp_smod_info->smod_proto_fallback_func;
@@ -1965,6 +2231,13 @@
 	 */
 	if (!so_start_fallback(so))
 		return (EAGAIN);
+#ifdef DEBUG
+	/*
+	 * Make a copy of the sonode in case we need to make an integrity
+	 * check later on.
+	 */
+	bcopy(so, &origso, sizeof (*so));
+#endif
 
 	newsp = sockparams_hold_ephemeral_bydev(so->so_family, so->so_type,
 	    so->so_protocol, so->so_sockparams->sp_sdev_info.sd_devpath,
@@ -1983,29 +2256,47 @@
 	}
 
 	/* Turn sonode into a TPI socket */
-	q = sotpi_convert_sonode(so, newsp, &direct, cr);
-	if (q == NULL) {
-		zcmn_err(getzoneid(), CE_WARN,
-		    "Failed to convert socket to TPI. Pid = %d\n",
-		    curproc->p_pid);
-		SOCKPARAMS_DEC_REF(newsp);
-		error = EINVAL;
+	error = sotpi_convert_sonode(so, newsp, &direct, &q, cr);
+	if (error != 0)
 		goto out;
-	}
+
 
 	/*
 	 * Now tell the protocol to start using TPI. so_quiesced_cb be
 	 * called once it's safe to synchronize state.
 	 */
 	DTRACE_PROBE1(proto__fallback__begin, struct sonode *, so);
-	/* FIXME assumes this cannot fail. TCP can fail to enter squeue */
-	(*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
+	error = (*fbfunc)(so->so_proto_handle, q, direct, so_quiesced_cb);
 	DTRACE_PROBE1(proto__fallback__end, struct sonode *, so);
 
+	if (error != 0) {
+		/* protocol was unable to do a fallback, revert the sonode */
+		sotpi_revert_sonode(so, cr);
+		goto out;
+	}
+
 	/*
-	 * Free all pending connection indications, i.e., socket_accept() has
-	 * not yet pulled the connection of the queue. The transport sent
-	 * a T_CONN_IND message for each pending connection to the STREAM head.
+	 * Walk the accept queue and notify the proto that they should
+	 * fall back to TPI. The protocol will send up the T_CONN_IND.
+	 */
+	nso = so->so_acceptq_head;
+	while (nso != NULL) {
+		int rval;
+
+		DTRACE_PROBE1(proto__fallback__begin, struct sonode *, nso);
+		rval = (*fbfunc)(nso->so_proto_handle, NULL, direct, NULL);
+		DTRACE_PROBE1(proto__fallback__end, struct sonode *, nso);
+		if (rval != 0) {
+			zcmn_err(getzoneid(), CE_WARN,
+			    "Failed to convert socket in accept queue to TPI. "
+			    "Pid = %d\n", curproc->p_pid);
+		}
+		nso = nso->so_acceptq_next;
+	}
+
+	/*
+	 * Now flush the acceptq, this will destroy all sockets. They will
+	 * be recreated in sotpi_accept().
 	 */
 	so_acceptq_flush(so);
 
@@ -2020,10 +2311,6 @@
 	so->so_ops = &sotpi_sonodeops;
 
 	/*
-	 * No longer a non streams socket
-	 */
-	so->so_not_str = B_FALSE;
-	/*
 	 * Wake up any threads stuck in poll. This is needed since the poll
 	 * head changes when the fallback happens (moves from the sonode to
 	 * the STREAMS head).
@@ -2032,5 +2319,16 @@
 out:
 	so_end_fallback(so);
 
+	if (error != 0) {
+#ifdef DEBUG
+		so_integrity_check(so, &origso);
+#endif
+		zcmn_err(getzoneid(), CE_WARN,
+		    "Failed to convert socket to TPI (err=%d). Pid = %d\n",
+		    error, curproc->p_pid);
+		if (newsp != NULL)
+			SOCKPARAMS_DEC_REF(newsp);
+	}
+
 	return (error);
 }
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.c b/usr/src/uts/common/fs/sockfs/socktpi.c
index d801c1e..80738f5 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi.c
+++ b/usr/src/uts/common/fs/sockfs/socktpi.c
@@ -6674,21 +6674,24 @@
  * Given a non-TPI sonode, allocate and prep it to be ready for TPI.
  *
  * Caller must still update state and mode using sotpi_update_state().
- *
- * Returns the STREAM queue that the protocol should use.
  */
-queue_t *
+int
 sotpi_convert_sonode(struct sonode *so, struct sockparams *newsp,
-    boolean_t *direct, struct cred *cr)
+    boolean_t *direct, queue_t **qp, struct cred *cr)
 {
 	sotpi_info_t *sti;
 	struct sockparams *origsp = so->so_sockparams;
 	sock_lower_handle_t handle = so->so_proto_handle;
-	uint_t old_state = so->so_state;
 	struct stdata *stp;
 	struct vnode *vp;
 	queue_t *q;
+	int error = 0;
 
+	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
+	    SS_FALLBACK_PENDING);
+	ASSERT(SOCK_IS_NONSTR(so));
+
+	*qp = NULL;
 	*direct = B_FALSE;
 	so->so_sockparams = newsp;
 	/*
@@ -6697,11 +6700,10 @@
 	(void) sotpi_info_create(so, KM_SLEEP);
 	sotpi_info_init(so);
 
-	if (sotpi_init(so, NULL, cr, SO_FALLBACK) != 0) {
+	if ((error = sotpi_init(so, NULL, cr, SO_FALLBACK)) != 0) {
 		sotpi_info_fini(so);
 		sotpi_info_destroy(so);
-		so->so_state = old_state;
-		return (NULL);
+		return (error);
 	}
 	ASSERT(handle == so->so_proto_handle);
 	sti = SOTOTPI(so);
@@ -6709,6 +6711,23 @@
 		*direct = B_TRUE;
 
 	/*
+	 * When it comes to urgent data we have two cases to deal with;
+	 * (1) The oob byte has already arrived, or (2) the protocol has
+	 * notified that oob data is pending, but it has not yet arrived.
+	 *
+	 * For (1) all we need to do is send a T_EXDATA_IND to indicate were
+	 * in the byte stream the oob byte is. For (2) we have to send a
+	 * SIGURG (M_PCSIG), followed by a zero-length mblk indicating whether
+	 * the oob byte will be the next byte from the protocol.
+	 *
+	 * So in the worst case we need two mblks, one for the signal, another
+	 * for mark indication. In that case we use the exdata_mp for the sig.
+	 */
+	sti->sti_exdata_mp = allocb_wait(sizeof (struct T_exdata_ind), BPRI_MED,
+	    STR_NOSIG, NULL);
+	sti->sti_urgmark_mp = allocb_wait(0, BPRI_MED, STR_NOSIG, NULL);
+
+	/*
 	 * Keep the original sp around so we can properly dispose of the
 	 * sonode when the socket is being closed.
 	 */
@@ -6728,10 +6747,8 @@
 	 * connection indications.
 	 */
 	if (so->so_pgrp != 0) {
-		mutex_enter(&so->so_lock);
 		if (so_set_events(so, so->so_vnode, cr) != 0)
 			so->so_pgrp = 0;
-		mutex_exit(&so->so_lock);
 	}
 
 	/*
@@ -6748,9 +6765,52 @@
 	 */
 	while (q->q_next != NULL)
 		q = q->q_next;
-	q = _RD(q);
+	*qp = _RD(q);
 
-	return (q);
+	/* This is now a STREAMS sockets */
+	so->so_not_str = B_FALSE;
+
+	return (error);
+}
+
+/*
+ * Revert a TPI sonode. It is only allowed to revert the sonode during
+ * the fallback process.
+ */
+void
+sotpi_revert_sonode(struct sonode *so, struct cred *cr)
+{
+	vnode_t *vp = SOTOV(so);
+
+	ASSERT((so->so_state & (SS_FALLBACK_PENDING|SS_FALLBACK_COMP)) ==
+	    SS_FALLBACK_PENDING);
+	ASSERT(!SOCK_IS_NONSTR(so));
+	ASSERT(vp->v_stream != NULL);
+
+	if (SOTOTPI(so)->sti_exdata_mp != NULL) {
+		freeb(SOTOTPI(so)->sti_exdata_mp);
+		SOTOTPI(so)->sti_exdata_mp = NULL;
+	}
+
+	if (SOTOTPI(so)->sti_urgmark_mp != NULL) {
+		freeb(SOTOTPI(so)->sti_urgmark_mp);
+		SOTOTPI(so)->sti_urgmark_mp = NULL;
+	}
+
+	strclean(vp);
+	(void) strclose(vp, FREAD|FWRITE|SO_FALLBACK, cr);
+
+	/*
+	 * Restore the original sockparams. The caller is responsible for
+	 * dropping the ref to the new sp.
+	 */
+	so->so_sockparams = SOTOTPI(so)->sti_orig_sp;
+
+	sotpi_info_fini(so);
+	sotpi_info_destroy(so);
+
+	/* This is no longer a STREAMS sockets */
+	so->so_not_str = B_TRUE;
 }
 
 void
@@ -6815,8 +6875,7 @@
 {
 	sotpi_info_t *sti;
 
-	if (so == NULL)
-		return (NULL);
+	ASSERT(so != NULL);
 
 	sti = (sotpi_info_t *)so->so_priv;
 
@@ -6845,6 +6904,9 @@
 	sti->sti_nl7c_uri	= NULL;
 	sti->sti_nl7c_rcv_mp	= NULL;
 
+	sti->sti_exdata_mp	= NULL;
+	sti->sti_urgmark_mp	= NULL;
+
 	mutex_init(&sti->sti_plumb_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&sti->sti_ack_cv, NULL, CV_DEFAULT, NULL);
 
@@ -6870,6 +6932,9 @@
 	ASSERT(sti->sti_nl7c_uri == NULL);
 	ASSERT(sti->sti_nl7c_rcv_mp == NULL);
 
+	ASSERT(sti->sti_exdata_mp == NULL);
+	ASSERT(sti->sti_urgmark_mp == NULL);
+
 	mutex_destroy(&sti->sti_plumb_lock);
 	cv_destroy(&sti->sti_ack_cv);
 }
diff --git a/usr/src/uts/common/fs/sockfs/socktpi.h b/usr/src/uts/common/fs/sockfs/socktpi.h
index 4c1a5de..cee3a5d 100644
--- a/usr/src/uts/common/fs/sockfs/socktpi.h
+++ b/usr/src/uts/common/fs/sockfs/socktpi.h
@@ -20,7 +20,7 @@
  */
 
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -252,6 +252,12 @@
 	kssl_endpt_type_t	sti_kssl_type;	/* is proxy/is proxied/none */
 	kssl_ent_t		sti_kssl_ent;	/* SSL config entry */
 	kssl_ctx_t		sti_kssl_ctx;	/* SSL session context */
+
+	/*
+	 * The mblks below are only allocated and used during fallback.
+	 */
+	mblk_t	*sti_exdata_mp;		/* T_EXDATA_IND or SIGURG */
+	mblk_t	*sti_urgmark_mp;	/* mark indication */
 } sotpi_info_t;
 
 struct T_capability_ack;
@@ -259,8 +265,9 @@
 extern sonodeops_t sotpi_sonodeops;
 
 extern int	socktpi_init(void);
-extern queue_t	*sotpi_convert_sonode(struct sonode *, struct sockparams *,
-		    boolean_t *, struct cred *);
+extern int	sotpi_convert_sonode(struct sonode *, struct sockparams *,
+		    boolean_t *, queue_t **, struct cred *);
+extern void	sotpi_revert_sonode(struct sonode *, struct cred *);
 extern void	sotpi_update_state(struct sonode *, struct T_capability_ack *,
 		    struct sockaddr *, socklen_t, struct sockaddr *, socklen_t,
 		    short);
diff --git a/usr/src/uts/common/inet/ip/icmp.c b/usr/src/uts/common/inet/ip/icmp.c
index e26254a..eb0162a 100644
--- a/usr/src/uts/common/inet/ip/icmp.c
+++ b/usr/src/uts/common/inet/ip/icmp.c
@@ -151,6 +151,7 @@
 static void	icmp_wput_other(queue_t *q, mblk_t *mp);
 static void	icmp_wput_iocdata(queue_t *q, mblk_t *mp);
 static void	icmp_wput_restricted(queue_t *q, mblk_t *mp);
+static void	icmp_ulp_recv(conn_t *, mblk_t *);
 
 static void	*rawip_stack_init(netstackid_t stackid, netstack_t *ns);
 static void	rawip_stack_fini(netstackid_t stackid, void *arg);
@@ -1131,6 +1132,7 @@
 	sin = sin_null;
 	sin.sin_family = AF_INET;
 	sin.sin_addr.s_addr = ipha->ipha_dst;
+
 	if (IPCL_IS_NONSTR(connp)) {
 		rw_enter(&icmp->icmp_rwlock, RW_WRITER);
 		if (icmp->icmp_state == TS_DATA_XFER) {
@@ -1147,13 +1149,13 @@
 		}
 		rw_exit(&icmp->icmp_rwlock);
 	} else {
-
 		mp1 = mi_tpi_uderror_ind((char *)&sin, sizeof (sin_t), NULL,
 		    0, error);
 		if (mp1 != NULL)
 			putnext(connp->conn_rq, mp1);
 	}
 done:
+	ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
 	freemsg(mp);
 }
 
@@ -1264,14 +1266,8 @@
 		 * message.  Free it, then send our empty message.
 		 */
 		freemsg(mp);
-		if (!IPCL_IS_NONSTR(connp)) {
-			putnext(connp->conn_rq, newmp);
-		} else {
-			(*connp->conn_upcalls->su_recv)
-			    (connp->conn_upper_handle, newmp, 0, 0, &error,
-			    NULL);
-			ASSERT(error == 0);
-		}
+		icmp_ulp_recv(connp, newmp);
+
 		return;
 	}
 	case ICMP6_TIME_EXCEEDED:
@@ -1322,13 +1318,13 @@
 		}
 		rw_exit(&icmp->icmp_rwlock);
 	} else {
-
 		mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
 		    NULL, 0, error);
 		if (mp1 != NULL)
 			putnext(connp->conn_rq, mp1);
 	}
 done:
+	ASSERT(!RW_ISWRITER(&icmp->icmp_rwlock));
 	freemsg(mp);
 }
 
@@ -3339,7 +3335,8 @@
 	icmppa->icmp_param_value = new_value;
 	return (0);
 }
-static void
+
+static mblk_t *
 icmp_queue_fallback(icmp_t *icmp, mblk_t *mp)
 {
 	ASSERT(MUTEX_HELD(&icmp->icmp_recv_lock));
@@ -3356,13 +3353,56 @@
 			icmp->icmp_fallback_queue_tail->b_next = mp;
 			icmp->icmp_fallback_queue_tail = mp;
 		}
-		mutex_exit(&icmp->icmp_recv_lock);
+		return (NULL);
 	} else {
 		/*
-		 * no more fallbacks possible, ok to drop lock.
+		 * Fallback completed, let the caller putnext() the mblk.
 		 */
-		mutex_exit(&icmp->icmp_recv_lock);
-		putnext(icmp->icmp_connp->conn_rq, mp);
+		return (mp);
+	}
+}
+
+/*
+ * Deliver data to ULP. In case we have a socket, and it's falling back to
+ * TPI, then we'll queue the mp for later processing.
+ */
+static void
+icmp_ulp_recv(conn_t *connp, mblk_t *mp)
+{
+
+	if (IPCL_IS_NONSTR(connp)) {
+		icmp_t *icmp = connp->conn_icmp;
+		int error;
+
+		if ((*connp->conn_upcalls->su_recv)
+		    (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
+		    NULL) < 0) {
+			mutex_enter(&icmp->icmp_recv_lock);
+			if (error == ENOSPC) {
+				/*
+				 * let's confirm while holding the lock
+				 */
+				if ((*connp->conn_upcalls->su_recv)
+				    (connp->conn_upper_handle, NULL, 0, 0,
+				    &error, NULL) < 0) {
+					ASSERT(error == ENOSPC);
+					if (error == ENOSPC) {
+						connp->conn_flow_cntrld =
+						    B_TRUE;
+					}
+				}
+				mutex_exit(&icmp->icmp_recv_lock);
+			} else {
+				ASSERT(error == EOPNOTSUPP);
+				mp = icmp_queue_fallback(icmp, mp);
+				mutex_exit(&icmp->icmp_recv_lock);
+				if (mp != NULL)
+					putnext(connp->conn_rq, mp);
+			}
+		}
+		ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
+	} else {
+		putnext(connp->conn_rq, mp);
 	}
 }
 
@@ -3391,7 +3431,6 @@
 	uint_t			icmp_opt = 0;
 	boolean_t		icmp_ipv6_recvhoplimit = B_FALSE;
 	uint_t			hopstrip;
-	int			error;
 
 	ASSERT(connp->conn_flags & IPCL_RAWIPCONN);
 
@@ -4038,35 +4077,8 @@
 	BUMP_MIB(&is->is_rawip_mib, rawipInDatagrams);
 
 deliver:
-	if (IPCL_IS_NONSTR(connp)) {
-		if ((*connp->conn_upcalls->su_recv)
-		    (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
-		    NULL) < 0) {
-			mutex_enter(&icmp->icmp_recv_lock);
-			if (error == ENOSPC) {
-				/*
-				 * let's confirm while holding the lock
-				 */
-				if ((*connp->conn_upcalls->su_recv)
-				    (connp->conn_upper_handle, NULL, 0, 0,
-				    &error, NULL) < 0) {
-					if (error == ENOSPC) {
-						connp->conn_flow_cntrld =
-						    B_TRUE;
-					} else {
-						ASSERT(error == EOPNOTSUPP);
-					}
-				}
-				mutex_exit(&icmp->icmp_recv_lock);
-			} else {
-				ASSERT(error == EOPNOTSUPP);
-				icmp_queue_fallback(icmp, mp);
-			}
-		}
-	} else {
-		putnext(connp->conn_rq, mp);
-	}
-	ASSERT(MUTEX_NOT_HELD(&icmp->icmp_recv_lock));
+	icmp_ulp_recv(connp, mp);
+
 }
 
 /*
@@ -5968,7 +5980,7 @@
 }
 
 /* ARGSUSED */
-void
+int
 rawip_fallback(sock_lower_handle_t proto_handle, queue_t *q,
     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
 {
@@ -6032,20 +6044,14 @@
 	if (icmp->icmp_dontroute)
 		opts |= SO_DONTROUTE;
 
-	/*
-	 * Once we grab the drain lock, no data will be send up
-	 * to the socket. So we notify the socket that the endpoint
-	 * is quiescent and it's therefore safe move data from
-	 * the socket to the stream head.
-	 */
 	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
 	    (struct sockaddr *)&laddr, laddrlen,
 	    (struct sockaddr *)&faddr, faddrlen, opts);
 
 	/*
-	 * push up any packets that were queued in icmp_t
+	 * Attempts to send data up during fallback will result in it being
+	 * queued in udp_t. Now we push up any queued packets.
 	 */
-
 	mutex_enter(&icmp->icmp_recv_lock);
 	while (icmp->icmp_fallback_queue_head != NULL) {
 		mblk_t	*mp;
@@ -6058,15 +6064,22 @@
 		mutex_enter(&icmp->icmp_recv_lock);
 	}
 	icmp->icmp_fallback_queue_tail = icmp->icmp_fallback_queue_head;
+
 	/*
 	 * No longer a streams less socket
 	 */
+	rw_enter(&icmp->icmp_rwlock, RW_WRITER);
 	connp->conn_flags &= ~IPCL_NONSTR;
+	rw_exit(&icmp->icmp_rwlock);
+
 	mutex_exit(&icmp->icmp_recv_lock);
+
 	ASSERT(icmp->icmp_fallback_queue_head == NULL &&
 	    icmp->icmp_fallback_queue_tail == NULL);
 
 	ASSERT(connp->conn_ref >= 1);
+
+	return (0);
 }
 
 /* ARGSUSED */
diff --git a/usr/src/uts/common/inet/rawip_impl.h b/usr/src/uts/common/inet/rawip_impl.h
index f818247..241132b 100644
--- a/usr/src/uts/common/inet/rawip_impl.h
+++ b/usr/src/uts/common/inet/rawip_impl.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /* Copyright (c) 1990 Mentat Inc. */
@@ -170,7 +170,7 @@
 
 extern sock_lower_handle_t rawip_create(int, int, int, sock_downcalls_t **,
     uint_t *, int *, int, cred_t *);
-extern void rawip_fallback(sock_lower_handle_t, queue_t *, boolean_t,
+extern int rawip_fallback(sock_lower_handle_t, queue_t *, boolean_t,
     so_proto_quiesced_cb_t);
 
 extern sock_downcalls_t sock_rawip_downcalls;
diff --git a/usr/src/uts/common/inet/tcp/tcp.c b/usr/src/uts/common/inet/tcp/tcp.c
index ddbcb82..af748fe 100644
--- a/usr/src/uts/common/inet/tcp/tcp.c
+++ b/usr/src/uts/common/inet/tcp/tcp.c
@@ -14059,6 +14059,11 @@
 				(*connp->conn_upcalls->su_recv)
 				    (connp->conn_upper_handle, mp, seg_len,
 				    MSG_OOB, &error, NULL);
+				/*
+				 * We should never be in middle of a
+				 * fallback, the squeue guarantees that.
+				 */
+				ASSERT(error != EOPNOTSUPP);
 				mp = NULL;
 				goto update_ack;
 			} else if (!tcp->tcp_urp_mp) {
@@ -15157,12 +15162,13 @@
 				if ((*connp->conn_upcalls->su_recv)
 				    (connp->conn_upper_handle, mp,
 				    seg_len, 0, &error, NULL) <= 0) {
-					if (error == ENOSPC) {
+					/*
+					 * We should never be in middle of a
+					 * fallback, the squeue guarantees that.
+					 */
+					ASSERT(error != EOPNOTSUPP);
+					if (error == ENOSPC)
 						tcp->tcp_rwnd -= seg_len;
-					} else if (error == EOPNOTSUPP) {
-						tcp_rcv_enqueue(tcp, mp,
-						    seg_len);
-					}
 				}
 			} else if (sodp != NULL) {
 				mutex_enter(sodp->sod_lockp);
@@ -15216,11 +15222,13 @@
 			if ((*connp->conn_upcalls->su_recv)(
 			    connp->conn_upper_handle,
 			    mp, seg_len, 0, &error, &push) <= 0) {
-				if (error == ENOSPC) {
+				/*
+				 * We should never be in middle of a
+				 * fallback, the squeue guarantees that.
+				 */
+				ASSERT(error != EOPNOTSUPP);
+				if (error == ENOSPC)
 					tcp->tcp_rwnd -= seg_len;
-				} else if (error == EOPNOTSUPP) {
-					tcp_rcv_enqueue(tcp, mp, seg_len);
-				}
 			} else if (push) {
 				/*
 				 * PUSH bit set and sockfs is not
@@ -18169,9 +18177,8 @@
 				    0, &error, &push);
 				if (space_left < 0) {
 					/*
-					 * At this point the eager is not
-					 * visible to anyone, so fallback
-					 * can not happen.
+					 * We should never be in middle of a
+					 * fallback, the squeue guarantees that.
 					 */
 					ASSERT(error != EOPNOTSUPP);
 				}
@@ -27700,72 +27707,34 @@
 /*
  * tcp_fallback
  *
- * A direct socket is falling back to using STREAMS. Hanging
- * off of the queue is a temporary tcp_t, which was created using
- * tcp_open(). The tcp_open() was called as part of the regular
- * sockfs create path, i.e., the SO_SOCKSTR flag is passed down,
- * and therefore the temporary tcp_t is marked to be a socket
- * (i.e., IPCL_SOCKET, tcp_issocket). So the optimizations
- * introduced by FireEngine will be used.
+ * A direct socket is falling back to using STREAMS. The queue
+ * that is being passed down was created using tcp_open() with
+ * the SO_FALLBACK flag set. As a result, the queue is not
+ * associated with a conn, and the q_ptrs instead contain the
+ * dev and minor area that should be used.
  *
- * The tcp_t associated with the socket falling back will
- * still be marked as a socket, although the direct socket flag
- * (IPCL_NONSTR) is removed. A fall back to true TPI semantics
- * will not take place until a _SIOCSOCKFALLBACK ioctl is issued.
- *
- * If the above mentioned behavior, i.e., the tmp tcp_t is created
- * as a STREAMS/TPI endpoint, then we will need to do more work here.
- * Such as inserting the direct socket into the acceptor hash.
+ * The 'direct_sockfs' flag indicates whether the FireEngine
+ * optimizations should be used. The common case would be that
+ * optimizations are enabled, and they might be subsequently
+ * disabled using the _SIOCSOCKFALLBACK ioctl.
+ */
+
+/*
+ * An active connection is falling back to TPI. Gather all the information
+ * required by the STREAM head and TPI sonode and send it up.
  */
 void
-tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
+tcp_fallback_noneager(tcp_t *tcp, mblk_t *stropt_mp, queue_t *q,
     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
 {
-	tcp_t			*tcp, *eager;
-	conn_t 			*connp = (conn_t *)proto_handle;
-	int			error;
+	conn_t			*connp = tcp->tcp_connp;
+	struct stroptions	*stropt;
 	struct T_capability_ack tca;
 	struct sockaddr_in6	laddr, faddr;
 	socklen_t 		laddrlen, faddrlen;
 	short			opts;
-	struct stroptions	*stropt;
-	mblk_t			*stropt_mp;
+	int			error;
 	mblk_t			*mp;
-	mblk_t			*conn_ind_head = NULL;
-	mblk_t			*conn_ind_tail = NULL;
-	mblk_t			*ordrel_mp;
-	mblk_t			*fused_sigurp_mp;
-
-	tcp = connp->conn_tcp;
-	/*
-	 * No support for acceptor fallback
-	 */
-	ASSERT(q->q_qinfo != &tcp_acceptor_rinit);
-
-	stropt_mp = allocb_wait(sizeof (*stropt), BPRI_HI, STR_NOSIG, NULL);
-
-	/* Pre-allocate the T_ordrel_ind mblk. */
-	ASSERT(tcp->tcp_ordrel_mp == NULL);
-	ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
-	    STR_NOSIG, NULL);
-	ordrel_mp->b_datap->db_type = M_PROTO;
-	((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
-	ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
-
-	/* Pre-allocate the M_PCSIG anyway */
-	fused_sigurp_mp = allocb_wait(1, BPRI_HI, STR_NOSIG, NULL);
-
-	/*
-	 * Enter the squeue so that no new packets can come in
-	 */
-	error = squeue_synch_enter(connp->conn_sqp, connp, 0);
-	if (error != 0) {
-		/* failed to enter, free all the pre-allocated messages. */
-		freeb(stropt_mp);
-		freeb(ordrel_mp);
-		freeb(fused_sigurp_mp);
-		return;
-	}
 
 	/* Disable I/OAT during fallback */
 	tcp->tcp_sodirect = NULL;
@@ -27814,10 +27783,8 @@
 	tcp_do_capability_ack(tcp, &tca, TC1_INFO|TC1_ACCEPTOR_ID);
 
 	laddrlen = faddrlen = sizeof (sin6_t);
-	(void) tcp_getsockname(proto_handle, (struct sockaddr *)&laddr,
-	    &laddrlen, CRED());
-	error = tcp_getpeername(proto_handle, (struct sockaddr *)&faddr,
-	    &faddrlen, CRED());
+	(void) tcp_do_getsockname(tcp, (struct sockaddr *)&laddr, &laddrlen);
+	error = tcp_do_getpeername(tcp, (struct sockaddr *)&faddr, &faddrlen);
 	if (error != 0)
 		faddrlen = 0;
 
@@ -27844,6 +27811,90 @@
 	tcp->tcp_rcv_last_head = NULL;
 	tcp->tcp_rcv_last_tail = NULL;
 	tcp->tcp_rcv_cnt = 0;
+}
+
+/*
+ * An eager is falling back to TPI. All we have to do is send
+ * up a T_CONN_IND.
+ */
+void
+tcp_fallback_eager(tcp_t *eager, boolean_t direct_sockfs)
+{
+	tcp_t *listener = eager->tcp_listener;
+	mblk_t *mp = eager->tcp_conn.tcp_eager_conn_ind;
+
+	ASSERT(listener != NULL);
+	ASSERT(mp != NULL);
+
+	eager->tcp_conn.tcp_eager_conn_ind = NULL;
+
+	/*
+	 * TLI/XTI applications will get confused by
+	 * sending eager as an option since it violates
+	 * the option semantics. So remove the eager as
+	 * option since TLI/XTI app doesn't need it anyway.
+	 */
+	if (!direct_sockfs) {
+		struct T_conn_ind *conn_ind;
+
+		conn_ind = (struct T_conn_ind *)mp->b_rptr;
+		conn_ind->OPT_length = 0;
+		conn_ind->OPT_offset = 0;
+	}
+
+	/*
+	 * Sockfs guarantees that the listener will not be closed
+	 * during fallback. So we can safely use the listener's queue.
+	 */
+	putnext(listener->tcp_rq, mp);
+}
+
+int
+tcp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
+    boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
+{
+	tcp_t			*tcp;
+	conn_t 			*connp = (conn_t *)proto_handle;
+	int			error;
+	mblk_t			*stropt_mp;
+	mblk_t			*ordrel_mp;
+	mblk_t			*fused_sigurp_mp;
+
+	tcp = connp->conn_tcp;
+
+	stropt_mp = allocb_wait(sizeof (struct stroptions), BPRI_HI, STR_NOSIG,
+	    NULL);
+
+	/* Pre-allocate the T_ordrel_ind mblk. */
+	ASSERT(tcp->tcp_ordrel_mp == NULL);
+	ordrel_mp = allocb_wait(sizeof (struct T_ordrel_ind), BPRI_HI,
+	    STR_NOSIG, NULL);
+	ordrel_mp->b_datap->db_type = M_PROTO;
+	((struct T_ordrel_ind *)ordrel_mp->b_rptr)->PRIM_type = T_ORDREL_IND;
+	ordrel_mp->b_wptr += sizeof (struct T_ordrel_ind);
+
+	/* Pre-allocate the M_PCSIG used by fusion */
+	fused_sigurp_mp = allocb_wait(1, BPRI_HI, STR_NOSIG, NULL);
+
+	/*
+	 * Enter the squeue so that no new packets can come in
+	 */
+	error = squeue_synch_enter(connp->conn_sqp, connp, 0);
+	if (error != 0) {
+		/* failed to enter, free all the pre-allocated messages. */
+		freeb(stropt_mp);
+		freeb(ordrel_mp);
+		freeb(fused_sigurp_mp);
+		/*
+		 * We cannot process the eager, so at least send out a
+		 * RST so the peer can reconnect.
+		 */
+		if (tcp->tcp_listener != NULL) {
+			(void) tcp_eager_blowoff(tcp->tcp_listener,
+			    tcp->tcp_conn_req_seqnum);
+		}
+		return (ENOMEM);
+	}
 
 	/*
 	 * No longer a direct socket
@@ -27859,45 +27910,13 @@
 		freeb(fused_sigurp_mp);
 	}
 
-	/*
-	 * Send T_CONN_IND messages for all ESTABLISHED connections.
-	 */
-	mutex_enter(&tcp->tcp_eager_lock);
-	for (eager = tcp->tcp_eager_next_q; eager != NULL;
-	    eager = eager->tcp_eager_next_q) {
-		mp = eager->tcp_conn.tcp_eager_conn_ind;
-
-		eager->tcp_conn.tcp_eager_conn_ind = NULL;
-		ASSERT(mp != NULL);
-		/*
-		 * TLI/XTI applications will get confused by
-		 * sending eager as an option since it violates
-		 * the option semantics. So remove the eager as
-		 * option since TLI/XTI app doesn't need it anyway.
-		 */
-		if (!TCP_IS_SOCKET(tcp)) {
-			struct T_conn_ind *conn_ind;
-
-			conn_ind = (struct T_conn_ind *)mp->b_rptr;
-			conn_ind->OPT_length = 0;
-			conn_ind->OPT_offset = 0;
-		}
-		if (conn_ind_head == NULL) {
-			conn_ind_head = mp;
-		} else {
-			conn_ind_tail->b_next = mp;
-		}
-		conn_ind_tail = mp;
-	}
-	mutex_exit(&tcp->tcp_eager_lock);
-
-	mp = conn_ind_head;
-	while (mp != NULL) {
-		mblk_t *nmp = mp->b_next;
-		mp->b_next = NULL;
-
-		putnext(tcp->tcp_rq, mp);
-		mp = nmp;
+	if (tcp->tcp_listener != NULL) {
+		/* The eager will deal with opts when accept() is called */
+		freeb(stropt_mp);
+		tcp_fallback_eager(tcp, direct_sockfs);
+	} else {
+		tcp_fallback_noneager(tcp, stropt_mp, q, direct_sockfs,
+		    quiesced_cb);
 	}
 
 	/*
@@ -27905,6 +27924,8 @@
 	 */
 	ASSERT(connp->conn_ref >= 2);
 	squeue_synch_exit(connp->conn_sqp, connp);
+
+	return (0);
 }
 
 /* ARGSUSED */
diff --git a/usr/src/uts/common/inet/tcp/tcp_fusion.c b/usr/src/uts/common/inet/tcp/tcp_fusion.c
index a8b6780..7ac90ce 100644
--- a/usr/src/uts/common/inet/tcp/tcp_fusion.c
+++ b/usr/src/uts/common/inet/tcp/tcp_fusion.c
@@ -805,6 +805,7 @@
 		(*peer_tcp->tcp_connp->conn_upcalls->su_recv)(
 		    peer_tcp->tcp_connp->conn_upper_handle, mp, recv_size,
 		    flags, &error, &push);
+		ASSERT(error != EOPNOTSUPP);
 	} else {
 		if (IPCL_IS_NONSTR(peer_tcp->tcp_connp) &&
 		    (tcp->tcp_valid_bits & TCP_URG_VALID) &&
diff --git a/usr/src/uts/common/inet/tcp_impl.h b/usr/src/uts/common/inet/tcp_impl.h
index 97374be..e61c854 100644
--- a/usr/src/uts/common/inet/tcp_impl.h
+++ b/usr/src/uts/common/inet/tcp_impl.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -277,7 +277,7 @@
 
 extern sock_lower_handle_t tcp_create(int, int, int, sock_downcalls_t **,
     uint_t *, int *, int, cred_t *);
-extern void tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
+extern int tcp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
     so_proto_quiesced_cb_t);
 
 extern sock_downcalls_t sock_tcp_downcalls;
diff --git a/usr/src/uts/common/inet/udp/udp.c b/usr/src/uts/common/inet/udp/udp.c
index f141ee0..c473afd 100644
--- a/usr/src/uts/common/inet/udp/udp.c
+++ b/usr/src/uts/common/inet/udp/udp.c
@@ -227,6 +227,7 @@
 
 static int	udp_send_connected(conn_t *, mblk_t *, struct nmsghdr *,
 		    cred_t *, pid_t);
+static void	udp_ulp_recv(conn_t *, mblk_t *);
 
 /* Common routine for TPI and socket module */
 static conn_t	*udp_do_open(cred_t *, boolean_t, int);
@@ -1206,7 +1207,7 @@
  */
 static void
 udp_icmp_error(conn_t *connp, mblk_t *mp)
-			    {
+{
 	icmph_t *icmph;
 	ipha_t	*ipha;
 	int	iph_hdr_length;
@@ -1286,7 +1287,6 @@
 				if (sin.sin_port == udp->udp_dstport &&
 				    sin.sin_addr.s_addr ==
 				    V4_PART_OF_V6(udp->udp_v6dst)) {
-
 					rw_exit(&udp->udp_rwlock);
 					(*connp->conn_upcalls->su_set_error)
 					    (connp->conn_upper_handle, error);
@@ -1324,7 +1324,6 @@
 			}
 			rw_exit(&udp->udp_rwlock);
 		} else {
-
 			mp1 = mi_tpi_uderror_ind((char *)&sin6, sizeof (sin6_t),
 			    NULL, 0, error);
 		}
@@ -1333,6 +1332,7 @@
 	if (mp1 != NULL)
 		putnext(connp->conn_rq, mp1);
 done:
+	ASSERT(!RW_ISWRITER(&udp->udp_rwlock));
 	freemsg(mp);
 }
 
@@ -1444,13 +1444,8 @@
 		 * message.  Free it, then send our empty message.
 		 */
 		freemsg(mp);
-		if (!IPCL_IS_NONSTR(connp)) {
-			putnext(connp->conn_rq, newmp);
-		} else {
-			(*connp->conn_upcalls->su_recv)
-			    (connp->conn_upper_handle, newmp, 0, 0, &error,
-			    NULL);
-		}
+		udp_ulp_recv(connp, newmp);
+
 		return;
 	}
 	case ICMP6_TIME_EXCEEDED:
@@ -1508,8 +1503,8 @@
 		if (mp1 != NULL)
 			putnext(connp->conn_rq, mp1);
 	}
-
 done:
+	ASSERT(!RW_ISWRITER(&udp->udp_rwlock));
 	freemsg(mp);
 }
 
@@ -3689,7 +3684,7 @@
 	}
 }
 
-static void
+static mblk_t *
 udp_queue_fallback(udp_t *udp, mblk_t *mp)
 {
 	ASSERT(MUTEX_HELD(&udp->udp_recv_lock));
@@ -3706,13 +3701,55 @@
 			udp->udp_fallback_queue_tail->b_next = mp;
 			udp->udp_fallback_queue_tail = mp;
 		}
-		mutex_exit(&udp->udp_recv_lock);
+		return (NULL);
 	} else {
 		/*
-		 * no more fallbacks possible, ok to drop lock.
+		 * Fallback completed, let the caller putnext() the mblk.
 		 */
-		mutex_exit(&udp->udp_recv_lock);
-		putnext(udp->udp_connp->conn_rq, mp);
+		return (mp);
+	}
+}
+
+/*
+ * Deliver data to ULP. In case we have a socket, and it's falling back to
+ * TPI, then we'll queue the mp for later processing.
+ */
+static void
+udp_ulp_recv(conn_t *connp, mblk_t *mp)
+{
+	if (IPCL_IS_NONSTR(connp)) {
+		udp_t *udp = connp->conn_udp;
+		int error;
+
+		if ((*connp->conn_upcalls->su_recv)
+		    (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
+		    NULL) < 0) {
+			mutex_enter(&udp->udp_recv_lock);
+			if (error == ENOSPC) {
+				/*
+				 * let's confirm while holding the lock
+				 */
+				if ((*connp->conn_upcalls->su_recv)
+				    (connp->conn_upper_handle, NULL, 0, 0,
+				    &error, NULL) < 0) {
+					ASSERT(error == ENOSPC);
+					if (error == ENOSPC) {
+						connp->conn_flow_cntrld =
+						    B_TRUE;
+					}
+				}
+				mutex_exit(&udp->udp_recv_lock);
+			} else {
+				ASSERT(error == EOPNOTSUPP);
+				mp = udp_queue_fallback(udp, mp);
+				mutex_exit(&udp->udp_recv_lock);
+				if (mp != NULL)
+					putnext(connp->conn_rq, mp);
+			}
+		}
+		ASSERT(MUTEX_NOT_HELD(&udp->udp_recv_lock));
+	} else {
+		putnext(connp->conn_rq, mp);
 	}
 }
 
@@ -4463,37 +4500,8 @@
 	if (options_mp != NULL)
 		freeb(options_mp);
 
-	if (IPCL_IS_NONSTR(connp)) {
-		int error;
+	udp_ulp_recv(connp, mp);
 
-		if ((*connp->conn_upcalls->su_recv)
-		    (connp->conn_upper_handle, mp, msgdsize(mp), 0, &error,
-		    NULL) < 0) {
-			mutex_enter(&udp->udp_recv_lock);
-			if (error == ENOSPC) {
-				/*
-				 * let's confirm while holding the lock
-				 */
-				if ((*connp->conn_upcalls->su_recv)
-				    (connp->conn_upper_handle, NULL, 0, 0,
-				    &error, NULL) < 0) {
-					if (error == ENOSPC) {
-						connp->conn_flow_cntrld =
-						    B_TRUE;
-					} else {
-						ASSERT(error == EOPNOTSUPP);
-					}
-				}
-				mutex_exit(&udp->udp_recv_lock);
-			} else {
-				ASSERT(error == EOPNOTSUPP);
-				udp_queue_fallback(udp, mp);
-			}
-		}
-	} else {
-		putnext(connp->conn_rq, mp);
-	}
-	ASSERT(MUTEX_NOT_HELD(&udp->udp_recv_lock));
 	return;
 
 tossit:
@@ -5846,7 +5854,7 @@
 
 	/* M_DATA for connected socket */
 
-	ASSERT(udp->udp_issocket || IPCL_IS_NONSTR(connp));
+	ASSERT(udp->udp_issocket);
 	UDP_DBGSTAT(us, udp_data_conn);
 
 	mutex_enter(&connp->conn_lock);
@@ -7990,6 +7998,7 @@
 	us = udp->udp_us;
 	ASSERT(us != NULL);
 
+	udp->udp_issocket = B_TRUE;
 	connp->conn_flags |= IPCL_NONSTR | IPCL_SOCKET;
 
 	/* Set flow control */
@@ -9272,7 +9281,7 @@
 	return (udp->udp_dgram_errind ? error : 0);
 }
 
-void
+int
 udp_fallback(sock_lower_handle_t proto_handle, queue_t *q,
     boolean_t direct_sockfs, so_proto_quiesced_cb_t quiesced_cb)
 {
@@ -9340,21 +9349,15 @@
 	if (udp->udp_dontroute)
 		opts |= SO_DONTROUTE;
 
-	/*
-	 * Once we grab the drain lock, no data will be send up
-	 * to the socket. So we notify the socket that the endpoint
-	 * is quiescent and it's therefore safe move data from
-	 * the socket to the stream head.
-	 */
 	(*quiesced_cb)(connp->conn_upper_handle, q, &tca,
 	    (struct sockaddr *)&laddr, laddrlen,
 	    (struct sockaddr *)&faddr, faddrlen, opts);
 
-	/*
-	 * push up any packets that were queued in udp_t
-	 */
-
 	mutex_enter(&udp->udp_recv_lock);
+	/*
+	 * Attempts to send data up during fallback will result in it being
+	 * queued in udp_t. Now we push up any queued packets.
+	 */
 	while (udp->udp_fallback_queue_head != NULL) {
 		mblk_t *mp;
 		mp = udp->udp_fallback_queue_head;
@@ -9368,10 +9371,15 @@
 	/*
 	 * No longer a streams less socket
 	 */
+	rw_enter(&udp->udp_rwlock, RW_WRITER);
 	connp->conn_flags &= ~IPCL_NONSTR;
+	rw_exit(&udp->udp_rwlock);
+
 	mutex_exit(&udp->udp_recv_lock);
 
 	ASSERT(connp->conn_ref >= 1);
+
+	return (0);
 }
 
 static int
diff --git a/usr/src/uts/common/inet/udp_impl.h b/usr/src/uts/common/inet/udp_impl.h
index 96f84e4..ba370cb 100644
--- a/usr/src/uts/common/inet/udp_impl.h
+++ b/usr/src/uts/common/inet/udp_impl.h
@@ -391,7 +391,7 @@
 
 extern sock_lower_handle_t udp_create(int, int, int, sock_downcalls_t **,
     uint_t *, int *, int, cred_t *);
-extern void udp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
+extern int udp_fallback(sock_lower_handle_t, queue_t *, boolean_t,
     so_proto_quiesced_cb_t);
 
 extern sock_downcalls_t sock_udp_downcalls;
diff --git a/usr/src/uts/common/sys/socket_proto.h b/usr/src/uts/common/sys/socket_proto.h
index 8f60ea9..12c9547 100644
--- a/usr/src/uts/common/sys/socket_proto.h
+++ b/usr/src/uts/common/sys/socket_proto.h
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
@@ -129,7 +129,7 @@
 typedef void (*so_proto_quiesced_cb_t)(sock_upper_handle_t, queue_t *,
     struct T_capability_ack *, struct sockaddr *, socklen_t,
     struct sockaddr *, socklen_t, short);
-typedef void (*so_proto_fallback_func_t)(sock_lower_handle_t, queue_t *,
+typedef int (*so_proto_fallback_func_t)(sock_lower_handle_t, queue_t *,
     boolean_t, so_proto_quiesced_cb_t);
 
 /*
diff --git a/usr/src/uts/common/sys/socketvar.h b/usr/src/uts/common/sys/socketvar.h
index c7c0f0a..f4f026f 100644
--- a/usr/src/uts/common/sys/socketvar.h
+++ b/usr/src/uts/common/sys/socketvar.h
@@ -291,14 +291,12 @@
 
 #define	SS_SODIRECT		0x00400000 /* transport supports sodirect */
 
-/*	unused			0x01000000 */	/* was SS_LADDR_VALID */
-/*	unused			0x02000000 */	/* was SS_FADDR_VALID */
+#define	SS_SENTLASTREADSIG	0x01000000 /* last rx signal has been sent */
+#define	SS_SENTLASTWRITESIG	0x02000000 /* last tx signal has been sent */
 
-#define	SS_SENTLASTREADSIG	0x10000000 /* last rx signal has been sent */
-#define	SS_SENTLASTWRITESIG	0x20000000 /* last tx signal has been sent */
-
-#define	SS_FALLBACK_PENDING	0x40000000
-#define	SS_FALLBACK_COMP	0x80000000
+#define	SS_FALLBACK_DRAIN	0x20000000 /* data was/is being drained */
+#define	SS_FALLBACK_PENDING	0x40000000 /* fallback is pending */
+#define	SS_FALLBACK_COMP	0x80000000 /* fallback has completed */
 
 
 /* Set of states when the socket can't be rebound */