blob: 470f6cad1d22c04203c64906ea3bfa775867631b [file] [log] [blame]
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
#pragma ident "%Z%%M% %I% %E% SMI"
const char tcp_version[] = "%Z%%M% %I% %E% SMI";
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#include <sys/strsun.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/timod.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/sdt.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/ethernet.h>
#include <sys/cpuvar.h>
#include <sys/dlpi.h>
#include <sys/multidata.h>
#include <sys/multidata_impl.h>
#include <sys/pattr.h>
#include <sys/policy.h>
#include <sys/priv.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/socket.h>
#include <sys/sockio.h>
#include <sys/isa_defs.h>
#include <sys/md5.h>
#include <sys/random.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <net/if.h>
#include <net/route.h>
#include <inet/ipsec_impl.h>
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/optcom.h>
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <net/pfkeyv2.h>
#include <inet/ipsec_info.h>
#include <inet/ipdrop.h>
#include <inet/tcp_trace.h>
#include <inet/ipclassifier.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_if.h>
#include <inet/ipp_common.h>
#include <inet/ip_netinfo.h>
#include <sys/squeue.h>
#include <inet/kssl/ksslapi.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
#include <rpc/pmap_prot.h>
/*
* TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
*
* (Read the detailed design doc in PSARC case directory)
*
* The entire tcp state is contained in tcp_t and conn_t structure
* which are allocated in tandem using ipcl_conn_create() and passing
* IPCL_CONNTCP as a flag. We use 'conn_ref' and 'conn_lock' to protect
* the references on the tcp_t. The tcp_t structure is never compressed
* and packets always land on the correct TCP perimeter from the time
* eager is created till the time tcp_t dies (as such the old mentat
* TCP global queue is not used for detached state and no IPSEC checking
* is required). The global queue is still allocated to send out resets
* for connection which have no listeners and IP directly calls
* tcp_xmit_listeners_reset() which does any policy check.
*
* Protection and Synchronisation mechanism:
*
* The tcp data structure does not use any kind of lock for protecting
* its state but instead uses 'squeues' for mutual exclusion from various
* read and write side threads. To access a tcp member, the thread should
* always be behind squeue (via squeue_enter, squeue_enter_nodrain, or
* squeue_fill). Since the squeues allow a direct function call, caller
* can pass any tcp function having prototype of edesc_t as argument
* (different from traditional STREAMs model where packets come in only
* designated entry points). The list of functions that can be directly
* called via squeue are listed before the usual function prototype.
*
* Referencing:
*
* TCP is MT-Hot and we use a reference based scheme to make sure that the
* tcp structure doesn't disappear when it is needed. When the application
* creates an outgoing connection or accepts an incoming connection, we
* start out with 2 references on 'conn_ref'. One for TCP and one for IP.
* The IP reference is just a symbolic reference since ip_tcpclose()
* looks at tcp structure after tcp_close_output() returns which could
* have dropped the last TCP reference. So as long as the connection is
* in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
* conn_t. The classifier puts its own reference when the connection is
* inserted in listen or connected hash. Anytime a thread needs to enter
* the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
* on write side or by doing a classify on read side and then puts a
* reference on the conn before doing squeue_enter/tryenter/fill. For
* read side, the classifier itself puts the reference under fanout lock
* to make sure that tcp can't disappear before it gets processed. The
* squeue will drop this reference automatically so the called function
* doesn't have to do a DEC_REF.
*
* Opening a new connection:
*
* The outgoing connection open is pretty simple. tcp_open() does the
* work in creating the conn/tcp structure and initializing it. The
* squeue assignment is done based on the CPU the application
* is running on. So for outbound connections, processing is always done
* on application CPU which might be different from the incoming CPU
* being interrupted by the NIC. An optimal way would be to figure out
* the NIC <-> CPU binding at listen time, and assign the outgoing
* connection to the squeue attached to the CPU that will be interrupted
* for incoming packets (we know the NIC based on the bind IP address).
* This might seem like a problem if more data is going out but the
* fact is that in most cases the transmit is ACK driven transmit where
* the outgoing data normally sits on TCP's xmit queue waiting to be
* transmitted.
*
* Accepting a connection:
*
* This is a more interesting case because of various races involved in
* establishing an eager in its own perimeter. Read the meta comment on
* top of tcp_conn_request(). But briefly, the squeue is picked by
* ip_tcp_input()/ip_fanout_tcp_v6() based on the interrupted CPU.
*
* Closing a connection:
*
* The close is fairly straight forward. tcp_close() calls tcp_close_output()
* via squeue to do the close and mark the tcp as detached if the connection
* was in state TCPS_ESTABLISHED or greater. In the latter case, TCP keeps its
* reference but tcp_close() always drops IP's reference. So if tcp was
* not killed, it is sitting in time_wait list with 2 reference - 1 for TCP
* and 1 because it is in classifier's connected hash. This is the condition
* we use to determine that it is OK to clean up the tcp outside of squeue
* when time wait expires (check the ref under fanout and conn_lock and
* if it is 2, remove it from fanout hash and kill it).
*
* Although close just drops the necessary references and marks the
* tcp_detached state, tcp_close needs to know the tcp_detached has been
* set (under squeue) before letting the STREAM go away (because an
* inbound packet might attempt to go up the STREAM while the close
* has happened and tcp_detached is not set). So a special lock and
* flag is used along with a condition variable (tcp_closelock, tcp_closed,
* and tcp_closecv) to signal tcp_close that tcp_close_out() has marked
* tcp_detached.
*
* Special provisions and fast paths:
*
* We make special provision for (AF_INET, SOCK_STREAM) sockets which
* can't have 'ipv6_recvpktinfo' set and for these type of sockets, IP
* will never send a M_CTL to TCP. As such, ip_tcp_input() which handles
* all TCP packets from the wire makes a IPCL_IS_TCP4_CONNECTED_NO_POLICY
* check to send packets directly to tcp_rput_data via squeue. Everyone
* else comes through tcp_input() on the read side.
*
* We also make special provisions for sockfs by marking tcp_issocket
* whenever we have only sockfs on top of TCP. This allows us to skip
* putting the tcp in acceptor hash since a sockfs listener can never
* become acceptor and also avoid allocating a tcp_t for acceptor STREAM
* since eager has already been allocated and the accept now happens
* on acceptor STREAM. There is a big blob of comment on top of
* tcp_conn_request explaining the new accept. When socket is POP'd,
* sockfs sends us an ioctl to mark the fact and we go back to old
* behaviour. Once tcp_issocket is unset, it is never set for the
* life of that connection.
*
* IPsec notes :
*
* Since a packet is always executed on the correct TCP perimeter
* all IPsec processing is deferred to IP including checking new
* connections and setting IPSEC policies for new connection. The
* only exception is tcp_xmit_listeners_reset() which is called
* directly from IP and needs to do a policy check to see if TH_RST
* can be sent out.
*
* PFHooks notes :
*
* For mdt case, one meta buffer contains multiple packets. Mblks for every
* packet are assembled and passed to the hooks. When packets are blocked,
* or boundary of any packet is changed, the mdt processing is stopped, and
* packets of the meta buffer are sent to the IP path one by one.
*/
/*
* Values for squeue switch:
* 1: squeue_enter_nodrain
* 2: squeue_enter
* 3: squeue_fill
*/
int tcp_squeue_close = 2; /* Setable in /etc/system */
int tcp_squeue_wput = 2;
squeue_func_t tcp_squeue_close_proc;
squeue_func_t tcp_squeue_wput_proc;
/*
* This controls how tiny a write must be before we try to copy it
* into the mblk on the tail of the transmit queue. Not much
* speedup is observed for values larger than sixteen. Zero will
* disable the optimisation.
*/
int tcp_tx_pull_len = 16;
/*
* TCP Statistics.
*
* How TCP statistics work.
*
* There are two types of statistics invoked by two macros.
*
* TCP_STAT(name) does non-atomic increment of a named stat counter. It is
* supposed to be used in non MT-hot paths of the code.
*
* TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is
* supposed to be used for DEBUG purposes and may be used on a hot path.
*
* Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
* (use "kstat tcp" to get them).
*
* There is also additional debugging facility that marks tcp_clean_death()
* instances and saves them in tcp_t structure. It is triggered by
* TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
* tcp_clean_death() calls that counts the number of times each tag was hit. It
* is triggered by TCP_CLD_COUNTERS define.
*
* How to add new counters.
*
* 1) Add a field in the tcp_stat structure describing your counter.
* 2) Add a line in the template in tcp_kstat2_init() with the name
* of the counter.
*
* IMPORTANT!! - make sure that both are in sync !!
* 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
*
* Please avoid using private counters which are not kstat-exported.
*
* TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
* in tcp_t structure.
*
* TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
*/
#ifndef TCP_DEBUG_COUNTER
#ifdef DEBUG
#define TCP_DEBUG_COUNTER 1
#else
#define TCP_DEBUG_COUNTER 0
#endif
#endif
#define TCP_CLD_COUNTERS 0
#define TCP_TAG_CLEAN_DEATH 1
#define TCP_MAX_CLEAN_DEATH_TAG 32
#ifdef lint
static int _lint_dummy_;
#endif
#if TCP_CLD_COUNTERS
static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
#define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
#elif defined(lint)
#define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);
#else
#define TCP_CLD_STAT(x)
#endif
#if TCP_DEBUG_COUNTER
#define TCP_DBGSTAT(tcps, x) \
atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1)
#define TCP_G_DBGSTAT(x) \
atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1)
#elif defined(lint)
#define TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0);
#define TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
#else
#define TCP_DBGSTAT(tcps, x)
#define TCP_G_DBGSTAT(x)
#endif
#define TCP_G_STAT(x) (tcp_g_statistics.x.value.ui64++)
tcp_g_stat_t tcp_g_statistics;
kstat_t *tcp_g_kstat;
/*
 * Call either ip_output or ip_output_v6. This replaces putnext() calls on the
 * tcp write side.
 *
 * Wrapped in do { } while (0) so the macro expands to a single statement
 * and is safe in unbraced if/else bodies; every use of an argument is
 * parenthesized so expression arguments expand correctly.
 */
#define CALL_IP_WPUT(connp, q, mp) do { \
	tcp_stack_t *tcps; \
 \
	tcps = (connp)->conn_netstack->netstack_tcp; \
	ASSERT(((q)->q_flag & QREADR) == 0); \
	TCP_DBGSTAT(tcps, tcp_ip_output); \
	(connp)->conn_send((connp), (mp), (q), IP_WPUT); \
} while (0)
/*
 * Timestamp comparison macros. The subtract-then-sign-test idiom keeps
 * the comparison correct across 32-bit timestamp wraparound, and LT is
 * expressed as the exact negation of GEQ.
 */
#define TSTMP_GEQ(a, b) ((int32_t)((a) - (b)) >= 0)
#define TSTMP_LT(a, b) (!TSTMP_GEQ(a, b))
/*
* Parameters for TCP Initial Send Sequence number (ISS) generation. When
* tcp_strong_iss is set to 1, which is the default, the ISS is calculated
* by adding three components: a time component which grows by 1 every 4096
* nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
* a per-connection component which grows by 125000 for every new connection;
* and an "extra" component that grows by a random amount centered
* approximately on 64000. This causes the ISS generator to cycle every
* 4.89 hours if no TCP connections are made, and faster if connections are
* made.
*
* When tcp_strong_iss is set to 0, ISS is calculated by adding two
* components: a time component which grows by 250000 every second; and
* a per-connection component which grows by 125000 for every new connection.
*
* A third method, when tcp_strong_iss is set to 2, for generating ISS is
* prescribed by Steve Bellovin. This involves adding time, the 125000 per
* connection, and a one-way hash (MD5) of the connection ID <sport, dport,
* src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
* password.
*/
#define ISS_INCR 250000
#define ISS_NSEC_SHT 12
static sin_t sin_null; /* Zero address for quick clears */
static sin6_t sin6_null; /* Zero address for quick clears */
/*
* This implementation follows the 4.3BSD interpretation of the urgent
* pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
* incompatible changes in protocols like telnet and rlogin.
*/
#define TCP_OLD_URP_INTERPRETATION 1
/*
 * True when the tcp is detached and is not a pending eager.
 * NOTE(review): tcp_hard_binding is presumed to be set only while an
 * eager's accept processing is in progress — confirm against its
 * definition in tcp_impl.h.
 */
#define TCP_IS_DETACHED_NONEAGER(tcp) \
(TCP_IS_DETACHED(tcp) && \
(!(tcp)->tcp_hard_binding))
/*
* TCP reassembly macros. We hide starting and ending sequence numbers in
* b_next and b_prev of messages on the reassembly queue. The messages are
* chained using b_cont. These macros are used in tcp_reass() so we don't
* have to see the ugly casts and assignments.
*/
#define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next))
#define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \
(mblk_t *)(uintptr_t)(u))
#define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev))
#define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \
(mblk_t *)(uintptr_t)(u))
/*
* Implementation of TCP Timers.
* =============================
*
* INTERFACE:
*
* There are two basic functions dealing with tcp timers:
*
* timeout_id_t tcp_timeout(connp, func, time)
* clock_t tcp_timeout_cancel(connp, timeout_id)
* TCP_TIMER_RESTART(tcp, intvl)
*
* tcp_timeout() starts a timer for the 'tcp' instance arranging to call 'func'
* after 'time' ticks passed. The function called by timeout() must adhere to
* the same restrictions as a driver soft interrupt handler - it must not sleep
* or call other functions that might sleep. The value returned is the opaque
* non-zero timeout identifier that can be passed to tcp_timeout_cancel() to
* cancel the request. The call to tcp_timeout() may fail in which case it
* returns zero. This is different from the timeout(9F) function which never
* fails.
*
* The call-back function 'func' always receives 'connp' as its single
* argument. It is always executed in the squeue corresponding to the tcp
* structure. The tcp structure is guaranteed to be present at the time the
* call-back is called.
*
* NOTE: The call-back function 'func' is never called if tcp is in
* the TCPS_CLOSED state.
*
* tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
* request. Locks acquired by the call-back routine should not be held across
* the call to tcp_timeout_cancel() or a deadlock may result.
*
* tcp_timeout_cancel() returns -1 if it can not cancel the timeout request.
* Otherwise, it returns an integer value greater than or equal to 0. In
* particular, if the call-back function is already placed on the squeue, it can
* not be canceled.
*
* NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
* within squeue context corresponding to the tcp instance. Since the
* call-back is also called via the same squeue, there are no race
* conditions described in untimeout(9F) manual page since all calls are
* strictly serialized.
*
* TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
* stored in tcp_timer_tid and starts a new one using
* MSEC_TO_TICK(intvl). It always uses tcp_timer() function as a call-back
* and stores the return value of tcp_timeout() in the tcp->tcp_timer_tid
* field.
*
* NOTE: since the timeout cancellation is not guaranteed, the cancelled
* call-back may still be called, so it is possible tcp_timer() will be
* called several times. This should not be a problem since tcp_timer()
* should always check the tcp instance state.
*
*
* IMPLEMENTATION:
*
* TCP timers are implemented using three-stage process. The call to
* tcp_timeout() uses timeout(9F) function to call tcp_timer_callback() function
* when the timer expires. The tcp_timer_callback() arranges the call of the
* tcp_timer_handler() function via squeue corresponding to the tcp
* instance. The tcp_timer_handler() calls actual requested timeout call-back
* and passes tcp instance as an argument to it. Information is passed between
* stages using the tcp_timer_t structure which contains the connp pointer, the
* tcp call-back to call and the timeout id returned by the timeout(9F).
*
* The tcp_timer_t structure is not used directly, it is embedded in an mblk_t -
* like structure that is used to enter an squeue. The mp->b_rptr of this pseudo
* mblk points to the beginning of tcp_timer_t structure. The tcp_timeout()
* returns the pointer to this mblk.
*
* The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
* looks like a normal mblk without actual dblk attached to it.
*
* To optimize performance each tcp instance holds a small cache of timer
* mblocks. In the current implementation it caches up to two timer mblocks per
* tcp instance. The cache is preserved over tcp frees and is only freed when
* the whole tcp structure is destroyed by its kmem destructor. Since all tcp
* timer processing happens on a corresponding squeue, the cache manipulation
* does not require any locks. Experiments show that majority of timer mblocks
* allocations are satisfied from the tcp cache and do not involve kmem calls.
*
* The tcp_timeout() places a refhold on the connp instance which guarantees
* that it will be present at the time the call-back function fires. The
* tcp_timer_handler() drops the reference after calling the call-back, so the
* call-back function does not need to manipulate the references explicitly.
*/
/*
 * Per-timeout state passed between the three stages of a TCP timer
 * (timeout(9F) callback -> squeue -> user call-back); see the
 * "Implementation of TCP Timers" block comment above.
 */
typedef struct tcp_timer_s {
conn_t *connp; /* connection the timer is armed for (refheld by tcp_timeout) */
void (*tcpt_proc)(void *); /* user call-back, run in squeue context */
timeout_id_t tcpt_tid; /* id returned by timeout(9F), used for cancel */
} tcp_timer_t;
static kmem_cache_t *tcp_timercache;
kmem_cache_t *tcp_sack_info_cache;
kmem_cache_t *tcp_iphc_cache;
/*
* For scalability, we must not run a timer for every TCP connection
* in TIME_WAIT state. To see why, consider (for time wait interval of
* 4 minutes):
* 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
*
* This list is ordered by time, so you need only delete from the head
* until you get to entries which aren't old enough to delete yet.
* The list consists of only the detached TIME_WAIT connections.
*
* Note that the timer (tcp_time_wait_expire) is started when the tcp_t
* becomes detached TIME_WAIT (either by changing the state and already
* being detached or the other way around). This means that the TIME_WAIT
* state can be extended (up to doubled) if the connection doesn't become
* detached for a long time.
*
* The list manipulations (including tcp_time_wait_next/prev)
* are protected by the tcp_time_wait_lock. The content of the
* detached TIME_WAIT connections is protected by the normal perimeters.
*
* This list is per squeue and squeues are shared across the tcp_stack_t's.
* Things on tcp_time_wait_head remain associated with the tcp_stack_t
* and conn_netstack.
* The tcp_t's that are added to tcp_free_list are disassociated and
* have NULL tcp_tcps and conn_netstack pointers.
*/
/* Per-squeue TIME_WAIT and tcp_t-cache state; see the block comment above. */
typedef struct tcp_squeue_priv_s {
kmutex_t tcp_time_wait_lock; /* protects the TIME_WAIT list links */
timeout_id_t tcp_time_wait_tid; /* pending expiry timeout id */
tcp_t *tcp_time_wait_head; /* oldest detached TIME_WAIT tcp (list is time-ordered) */
tcp_t *tcp_time_wait_tail; /* most recently added detached TIME_WAIT tcp */
tcp_t *tcp_free_list; /* cached tcp_t's; disassociated, NULL tcp_tcps/conn_netstack */
uint_t tcp_free_list_cnt; /* number of entries on tcp_free_list */
} tcp_squeue_priv_t;
/*
* TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
* Running it every 5 seconds seems to give the best results.
*/
#define TCP_TIME_WAIT_DELAY drv_usectohz(5000000)
/*
* To prevent memory hog, limit the number of entries in tcp_free_list
* to 1% of available memory / number of cpus
*/
uint_t tcp_free_list_max_cnt = 0;
#define TCP_XMIT_LOWATER 4096
#define TCP_XMIT_HIWATER 49152
#define TCP_RECV_LOWATER 2048
#define TCP_RECV_HIWATER 49152
/*
* PAWS needs a timer for 24 days. This is the number of ticks in 24 days
*/
#define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz))
#define TIDUSZ 4096 /* transport interface data unit size */
/*
* Bind hash list size and hash function. It has to be a power of 2 for
* hashing.
*/
#define TCP_BIND_FANOUT_SIZE 512
#define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
/*
* Size of listen and acceptor hash list. It has to be a power of 2 for
* hashing.
*/
#define TCP_FANOUT_SIZE 256
#ifdef _ILP32
#define TCP_ACCEPTOR_HASH(accid) \
(((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1))
#else
#define TCP_ACCEPTOR_HASH(accid) \
((uint_t)(accid) & (TCP_FANOUT_SIZE - 1))
#endif /* _ILP32 */
#define IP_ADDR_CACHE_SIZE 2048
#define IP_ADDR_CACHE_HASH(faddr) \
(ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1))
/* Hash for HSPs uses all 32 bits, since both networks and hosts are in table */
#define TCP_HSP_HASH_SIZE 256

/*
 * Fold all four bytes of the address together, modulo the table size.
 * Every use of the argument is parenthesized so that an expression
 * argument such as TCP_HSP_HASH(a + b) expands correctly (the original
 * left `addr` bare inside the shifts).
 */
#define TCP_HSP_HASH(addr) \
((((addr) >> 24) ^ ((addr) >> 16) ^ \
((addr) >> 8) ^ (addr)) % TCP_HSP_HASH_SIZE)
/*
 * TCP options struct returned from tcp_parse_options.  Each value field
 * is only meaningful when the corresponding TCP_OPT_*_PRESENT flag
 * (defined below) is set in tcp_parse_options' return value.
 */
typedef struct tcp_opt_s {
uint32_t tcp_opt_mss; /* MSS option value */
uint32_t tcp_opt_wscale; /* window scale shift count */
uint32_t tcp_opt_ts_val; /* timestamp value (TSval) */
uint32_t tcp_opt_ts_ecr; /* timestamp echo reply (TSecr) */
tcp_t *tcp; /* connection the options were parsed for */
} tcp_opt_t;
/*
* RFC1323-recommended phrasing of TSTAMP option, for easier parsing
*/
#ifdef _BIG_ENDIAN
#define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
(TCPOPT_TSTAMP << 8) | 10)
#else
#define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
(TCPOPT_NOP << 8) | TCPOPT_NOP)
#endif
/*
* Flags returned from tcp_parse_options.
*/
#define TCP_OPT_MSS_PRESENT 1
#define TCP_OPT_WSCALE_PRESENT 2
#define TCP_OPT_TSTAMP_PRESENT 4
#define TCP_OPT_SACK_OK_PRESENT 8
#define TCP_OPT_SACK_PRESENT 16
/* TCP option length */
#define TCPOPT_NOP_LEN 1
#define TCPOPT_MAXSEG_LEN 4
#define TCPOPT_WS_LEN 3
#define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1)
#define TCPOPT_TSTAMP_LEN 10
#define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2)
#define TCPOPT_SACK_OK_LEN 2
#define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2)
#define TCPOPT_REAL_SACK_LEN 4
#define TCPOPT_MAX_SACK_LEN 36
#define TCPOPT_HEADER_LEN 2
/* TCP cwnd burst factor. */
#define TCP_CWND_INFINITE 65535
#define TCP_CWND_SS 3
#define TCP_CWND_NORMAL 5
/* Maximum TCP initial cwin (start/restart). */
#define TCP_MAX_INIT_CWND 8
/*
 * Initialize cwnd according to RFC 3390. def_max_init_cwnd is
 * either tcp_slow_start_initial or tcp_slow_start_after_idle
 * depending on the caller. If the upper layer has not used the
 * TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
 * should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
 * If the upper layer has set tcp_init_cwnd, just use it to
 * calculate the tcp_cwnd.
 *
 * Wrapped in do { } while (0) so the macro is a single statement
 * (safe in unbraced if/else); all argument uses are parenthesized
 * (the original left `tcp` and `def_max_init_cwnd` bare in places).
 */
#define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd) \
do { \
	if ((tcp)->tcp_init_cwnd == 0) { \
		/* RFC 3390: min(4*MSS, max(2*MSS, 4380 bytes)), capped */ \
		(tcp)->tcp_cwnd = MIN((def_max_init_cwnd) * (mss), \
		    MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
	} else { \
		(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \
	} \
	(tcp)->tcp_cwnd_cnt = 0; \
} while (0)
/* TCP Timer control structure */
typedef struct tcpt_s {
pfv_t tcpt_pfv; /* The routine we are to call */
tcp_t *tcpt_tcp; /* The parameter we are to pass in */
} tcpt_t;
/*
 * Host Specific Parameter structure: per-host/per-network TCP tuning
 * entries, chained in the TCP_HSP_HASH table defined above.
 */
typedef struct tcp_hsp {
struct tcp_hsp *tcp_hsp_next; /* next entry on the same hash chain */
in6_addr_t tcp_hsp_addr_v6; /* host or network address (v4 uses mapped form) */
in6_addr_t tcp_hsp_subnet_v6; /* NOTE(review): presumed subnet mask — confirm */
uint_t tcp_hsp_vers; /* IPV4_VERSION | IPV6_VERSION */
int32_t tcp_hsp_sendspace; /* presumably send buffer size for matches */
int32_t tcp_hsp_recvspace; /* presumably receive buffer size for matches */
int32_t tcp_hsp_tstamp; /* timestamp option setting */
} tcp_hsp_t;
/* IPv4 views of the v6 address fields above */
#define tcp_hsp_addr V4_PART_OF_V6(tcp_hsp_addr_v6)
#define tcp_hsp_subnet V4_PART_OF_V6(tcp_hsp_subnet_v6)
/*
* Functions called directly via squeue having a prototype of edesc_t.
*/
void tcp_conn_request(void *arg, mblk_t *mp, void *arg2);
static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2);
void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2);
static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2);
static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2);
void tcp_input(void *arg, mblk_t *mp, void *arg2);
void tcp_rput_data(void *arg, mblk_t *mp, void *arg2);
static void tcp_close_output(void *arg, mblk_t *mp, void *arg2);
void tcp_output(void *arg, mblk_t *mp, void *arg2);
static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2);
static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2);
static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2);
/* Prototype for TCP functions */
static void tcp_random_init(void);
int tcp_random(void);
static void tcp_accept(tcp_t *tcp, mblk_t *mp);
static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
tcp_t *eager);
static int tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp);
static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
boolean_t user_specified);
static void tcp_closei_local(tcp_t *tcp);
static void tcp_close_detached(tcp_t *tcp);
static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr, tcph_t *tcph,
mblk_t *idmp, mblk_t **defermp);
static void tcp_connect(tcp_t *tcp, mblk_t *mp);
static void tcp_connect_ipv4(tcp_t *tcp, mblk_t *mp, ipaddr_t *dstaddrp,
in_port_t dstport, uint_t srcid);
static void tcp_connect_ipv6(tcp_t *tcp, mblk_t *mp, in6_addr_t *dstaddrp,
in_port_t dstport, uint32_t flowinfo, uint_t srcid,
uint32_t scope_id);
static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
static void tcp_def_q_set(tcp_t *tcp, mblk_t *mp);
static void tcp_disconnect(tcp_t *tcp, mblk_t *mp);
static char *tcp_display(tcp_t *tcp, char *, char);
static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
static void tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
static void tcp_eager_unlink(tcp_t *tcp);
static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
int unixerr);
static void tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
int tlierr, int unixerr);
static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *cr);
static int tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *cr);
static int tcp_tpistate(tcp_t *tcp);
static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
int caller_holds_lock);
static void tcp_bind_hash_remove(tcp_t *tcp);
static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
static void tcp_acceptor_hash_remove(tcp_t *tcp);
static void tcp_capability_req(tcp_t *tcp, mblk_t *mp);
static void tcp_info_req(tcp_t *tcp, mblk_t *mp);
static void tcp_addr_req(tcp_t *tcp, mblk_t *mp);
static void tcp_addr_req_ipv6(tcp_t *tcp, mblk_t *mp);
void tcp_g_q_setup(tcp_stack_t *);
void tcp_g_q_create(tcp_stack_t *);
void tcp_g_q_destroy(tcp_stack_t *);
static int tcp_header_init_ipv4(tcp_t *tcp);
static int tcp_header_init_ipv6(tcp_t *tcp);
int tcp_init(tcp_t *tcp, queue_t *q);
static int tcp_init_values(tcp_t *tcp);
static mblk_t *tcp_ip_advise_mblk(void *addr, int addr_len, ipic_t **ipic);
static mblk_t *tcp_ip_bind_mp(tcp_t *tcp, t_scalar_t bind_prim,
t_scalar_t addr_length);
static void tcp_ip_ire_mark_advice(tcp_t *tcp);
static void tcp_ip_notify(tcp_t *tcp);
static mblk_t *tcp_ire_mp(mblk_t *mp);
static void tcp_iss_init(tcp_t *tcp);
static void tcp_keepalive_killer(void *arg);
static int tcp_parse_options(tcph_t *tcph, tcp_opt_t *tcpopt);
static void tcp_mss_set(tcp_t *tcp, uint32_t size, boolean_t do_ss);
static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
int *do_disconnectp, int *t_errorp, int *sys_errorp);
static boolean_t tcp_allow_connopt_set(int level, int name);
int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
int tcp_opt_get(queue_t *q, int level, int name, uchar_t *ptr);
int tcp_opt_set(queue_t *q, uint_t optset_context, int level,
int name, uint_t inlen, uchar_t *invalp, uint_t *outlenp,
uchar_t *outvalp, void *thisdg_attrs, cred_t *cr,
mblk_t *mblk);
static void tcp_opt_reverse(tcp_t *tcp, ipha_t *ipha);
static int tcp_opt_set_header(tcp_t *tcp, boolean_t checkonly,
uchar_t *ptr, uint_t len);
static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
tcp_stack_t *);
static int tcp_param_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static void tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
static mblk_t *tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
static void tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
static void tcp_reinit(tcp_t *tcp);
static void tcp_reinit_values(tcp_t *tcp);
static void tcp_report_item(mblk_t *mp, tcp_t *tcp, int hashval,
tcp_t *thisstream, cred_t *cr);
static uint_t tcp_rcv_drain(queue_t *q, tcp_t *tcp);
static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
static boolean_t tcp_send_rst_chk(tcp_stack_t *);
static void tcp_ss_rexmit(tcp_t *tcp);
static mblk_t *tcp_rput_add_ancillary(tcp_t *tcp, mblk_t *mp, ip6_pkt_t *ipp);
static void tcp_process_options(tcp_t *, tcph_t *);
static void tcp_rput_common(tcp_t *tcp, mblk_t *mp);
static void tcp_rsrv(queue_t *q);
static int tcp_rwnd_set(tcp_t *tcp, uint32_t rwnd);
static int tcp_snmp_state(tcp_t *tcp);
static int tcp_status_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_bind_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_listen_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_conn_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_acceptor_hash_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_host_param_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int tcp_host_param_set_ipv6(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int tcp_host_param_report(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static void tcp_timer(void *arg);
static void tcp_timer_callback(void *);
static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
boolean_t random);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
static void tcp_wput_sock(queue_t *q, mblk_t *mp);
void tcp_wput_accept(queue_t *q, mblk_t *mp);
static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
static int tcp_send(queue_t *q, tcp_t *tcp, const int mss,
const int tcp_hdr_len, const int tcp_tcp_hdr_len,
const int num_sack_blk, int *usable, uint_t *snxt,
int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
const int mdt_thres);
static int tcp_multisend(queue_t *q, tcp_t *tcp, const int mss,
const int tcp_hdr_len, const int tcp_tcp_hdr_len,
const int num_sack_blk, int *usable, uint_t *snxt,
int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time,
const int mdt_thres);
static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
int num_sack_blk);
static void tcp_wsrv(queue_t *q);
static int tcp_xmit_end(tcp_t *tcp);
static void tcp_ack_timer(void *arg);
static mblk_t *tcp_ack_mp(tcp_t *tcp);
static void tcp_xmit_early_reset(char *str, mblk_t *mp,
uint32_t seq, uint32_t ack, int ctl, uint_t ip_hdr_len,
zoneid_t zoneid, tcp_stack_t *, conn_t *connp);
static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
uint32_t ack, int ctl);
static tcp_hsp_t *tcp_hsp_lookup(ipaddr_t addr, tcp_stack_t *);
static tcp_hsp_t *tcp_hsp_lookup_ipv6(in6_addr_t *addr, tcp_stack_t *);
static int setmaxps(queue_t *q, int maxpsz);
static void tcp_set_rto(tcp_t *, time_t);
static boolean_t tcp_check_policy(tcp_t *, mblk_t *, ipha_t *, ip6_t *,
boolean_t, boolean_t);
static void tcp_icmp_error_ipv6(tcp_t *tcp, mblk_t *mp,
boolean_t ipsec_mctl);
static mblk_t *tcp_setsockopt_mp(int level, int cmd,
char *opt, int optlen);
static int tcp_build_hdrs(queue_t *, tcp_t *);
static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
uint32_t seg_seq, uint32_t seg_ack, int seg_len,
tcph_t *tcph);
boolean_t tcp_paws_check(tcp_t *tcp, tcph_t *tcph, tcp_opt_t *tcpoptp);
boolean_t tcp_reserved_port_add(int, in_port_t *, in_port_t *);
boolean_t tcp_reserved_port_del(in_port_t, in_port_t);
boolean_t tcp_reserved_port_check(in_port_t, tcp_stack_t *);
static tcp_t *tcp_alloc_temp_tcp(in_port_t, tcp_stack_t *);
static int tcp_reserved_port_list(queue_t *, mblk_t *, caddr_t, cred_t *);
static mblk_t *tcp_mdt_info_mp(mblk_t *);
static void tcp_mdt_update(tcp_t *, ill_mdt_capab_t *, boolean_t);
static int tcp_mdt_add_attrs(multidata_t *, const mblk_t *,
const boolean_t, const uint32_t, const uint32_t,
const uint32_t, const uint32_t, tcp_stack_t *);
static void tcp_multisend_data(tcp_t *, ire_t *, const ill_t *, mblk_t *,
const uint_t, const uint_t, boolean_t *);
static mblk_t *tcp_lso_info_mp(mblk_t *);
static void tcp_lso_update(tcp_t *, ill_lso_capab_t *);
static void tcp_send_data(tcp_t *, queue_t *, mblk_t *);
extern mblk_t *tcp_timermp_alloc(int);
extern void tcp_timermp_free(tcp_t *);
static void tcp_timer_free(tcp_t *tcp, mblk_t *mp);
static void tcp_stop_lingering(tcp_t *tcp);
static void tcp_close_linger_timeout(void *arg);
static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns);
static void tcp_stack_shutdown(netstackid_t stackid, void *arg);
static void tcp_stack_fini(netstackid_t stackid, void *arg);
static void *tcp_g_kstat_init(tcp_g_stat_t *);
static void tcp_g_kstat_fini(kstat_t *);
static void *tcp_kstat_init(netstackid_t, tcp_stack_t *);
static void tcp_kstat_fini(netstackid_t, kstat_t *);
static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *);
static void tcp_kstat2_fini(netstackid_t, kstat_t *);
static int tcp_kstat_update(kstat_t *kp, int rw);
void tcp_reinput(conn_t *connp, mblk_t *mp, squeue_t *sqp);
static int tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
tcph_t *tcph, uint_t ipvers, mblk_t *idmp);
static int tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, ipha_t *ipha,
tcph_t *tcph, mblk_t *idmp);
static squeue_func_t tcp_squeue_switch(int);
static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
static int tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);
static int tcp_close(queue_t *, int);
static int tcpclose_accept(queue_t *);
static void tcp_squeue_add(squeue_t *);
static boolean_t tcp_zcopy_check(tcp_t *);
static void tcp_zcopy_notify(tcp_t *);
static mblk_t *tcp_zcopy_disable(tcp_t *, mblk_t *);
static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, int);
static void tcp_ire_ill_check(tcp_t *, ire_t *, ill_t *, boolean_t);
extern void tcp_kssl_input(tcp_t *, mblk_t *);
void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2);
void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2);
/*
* Routines related to the TCP_IOC_ABORT_CONN ioctl command.
*
* TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
* TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
* (defined in tcp.h) needs to be filled in and passed into the kernel
* via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
* structure contains the four-tuple of a TCP connection and a range of TCP
* states (specified by ac_start and ac_end). The use of wildcard addresses
* and ports is allowed. Connections with a matching four tuple and a state
* within the specified range will be aborted. The valid states for the
* ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
* inclusive.
*
* An application which has its connection aborted by this ioctl will receive
* an error that is dependent on the connection state at the time of the abort.
* If the connection state is < TCPS_TIME_WAIT, an application should behave as
* though a RST packet has been received. If the connection state is equal to
* TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
* and all resources associated with the connection will be freed.
*/
static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
static void tcp_ioctl_abort_handler(tcp_t *, mblk_t *);
static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
static void tcp_ioctl_abort_conn(queue_t *, mblk_t *);
static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
boolean_t, tcp_stack_t *);
static struct module_info tcp_rinfo = {
TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
};
static struct module_info tcp_winfo = {
TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
};
/*
* Entry points for TCP as a device. The normal case which supports
* the TCP functionality.
* We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
*/
struct qinit tcp_rinitv4 = {
NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, NULL, &tcp_rinfo
};
struct qinit tcp_rinitv6 = {
NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_close, NULL, &tcp_rinfo
};
struct qinit tcp_winit = {
(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};
/* Initial entry point for TCP in socket mode. */
struct qinit tcp_sock_winit = {
(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};
/*
* Entry points for TCP as a acceptor STREAM opened by sockfs when doing
* an accept. Avoid allocating data structures since eager has already
* been created.
*/
struct qinit tcp_acceptor_rinit = {
NULL, (pfi_t)tcp_rsrv, NULL, tcpclose_accept, NULL, &tcp_winfo
};
struct qinit tcp_acceptor_winit = {
(pfi_t)tcp_wput_accept, NULL, NULL, NULL, NULL, &tcp_winfo
};
/*
* Entry points for TCP loopback (read side only)
* The open routine is only used for reopens, thus no need to
* have a separate one for tcp_openv6.
*/
struct qinit tcp_loopback_rinit = {
(pfi_t)0, (pfi_t)tcp_rsrv, tcp_openv4, tcp_close, (pfi_t)0,
&tcp_rinfo, NULL, tcp_fuse_rrw, tcp_fuse_rinfop, STRUIOT_STANDARD
};
/* For AF_INET aka /dev/tcp */
struct streamtab tcpinfov4 = {
&tcp_rinitv4, &tcp_winit
};
/* For AF_INET6 aka /dev/tcp6 */
struct streamtab tcpinfov6 = {
&tcp_rinitv6, &tcp_winit
};
/*
* Have to ensure that tcp_g_q_close is not done by an
* interrupt thread.
*/
static taskq_t *tcp_taskq;
/*
* TCP has a private interface for other kernel modules to reserve a
* port range for them to use. Once reserved, TCP will not use any ports
* in the range. This interface relies on the TCP_EXCLBIND feature. If
* the semantics of TCP_EXCLBIND is changed, implementation of this interface
* has to be verified.
*
* There can be TCP_RESERVED_PORTS_ARRAY_MAX_SIZE port ranges. Each port
* range can cover at most TCP_RESERVED_PORTS_RANGE_MAX ports. A port
 * range is [port a, port b] inclusive. And each port range is between
 * TCP_SMALLEST_RESERVED_PORT and TCP_LARGEST_RESERVED_PORT inclusive.
 *
 * Note that the default anonymous port range starts from 32768. There is
 * no port "collision" between that and the reserved port range. If there
 * is a port collision (because the default smallest anonymous port is
 * lowered or some apps specifically bind to ports in the reserved port
 * range), the system may not be able to reserve a port range even if there
 * are enough unbound ports, as a reserved port range contains consecutive
 * ports.
*/
#define TCP_RESERVED_PORTS_ARRAY_MAX_SIZE 5
#define TCP_RESERVED_PORTS_RANGE_MAX 1000
#define TCP_SMALLEST_RESERVED_PORT 10240
#define TCP_LARGEST_RESERVED_PORT 20480
/* Structure to represent those reserved port ranges. */
typedef struct tcp_rport_s {
in_port_t lo_port;
in_port_t hi_port;
tcp_t **temp_tcp_array;
} tcp_rport_t;
/* Setable only in /etc/system. Move to ndd? */
boolean_t tcp_icmp_source_quench = B_FALSE;
/*
* Following assumes TPI alignment requirements stay along 32 bit
* boundaries
*/
#define ROUNDUP32(x) \
(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
/* Template for response to info request. */
static struct T_info_ack tcp_g_t_info_ack = {
T_INFO_ACK, /* PRIM_type */
0, /* TSDU_size */
T_INFINITE, /* ETSDU_size */
T_INVALID, /* CDATA_size */
T_INVALID, /* DDATA_size */
sizeof (sin_t), /* ADDR_size */
0, /* OPT_size - not initialized here */
TIDUSZ, /* TIDU_size */
T_COTS_ORD, /* SERV_type */
TCPS_IDLE, /* CURRENT_state */
(XPG4_1|EXPINLINE) /* PROVIDER_flag */
};
static struct T_info_ack tcp_g_t_info_ack_v6 = {
T_INFO_ACK, /* PRIM_type */
0, /* TSDU_size */
T_INFINITE, /* ETSDU_size */
T_INVALID, /* CDATA_size */
T_INVALID, /* DDATA_size */
sizeof (sin6_t), /* ADDR_size */
0, /* OPT_size - not initialized here */
TIDUSZ, /* TIDU_size */
T_COTS_ORD, /* SERV_type */
TCPS_IDLE, /* CURRENT_state */
(XPG4_1|EXPINLINE) /* PROVIDER_flag */
};
#define MS 1L
#define SECONDS (1000 * MS)
#define MINUTES (60 * SECONDS)
#define HOURS (60 * MINUTES)
#define DAYS (24 * HOURS)
#define PARAM_MAX (~(uint32_t)0)
/* Max size IP datagram is 64k - 1 */
#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcph_t)))
#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcph_t)))
/* Max of the above */
#define TCP_MSS_MAX TCP_MSS_MAX_IPV4
/* Largest TCP port number */
#define TCP_MAX_PORT (64 * 1024 - 1)
/*
* tcp_wroff_xtra is the extra space in front of TCP/IP header for link
* layer header. It has to be a multiple of 4.
*/
static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
#define tcps_wroff_xtra tcps_wroff_xtra_param->tcp_param_val
/*
* All of these are alterable, within the min/max values given, at run time.
* Note that the default value of "tcp_time_wait_interval" is four minutes,
* per the TCP spec.
*/
/* BEGIN CSTYLED */
static tcpparam_t lcl_tcp_param_arr[] = {
/*min max value name */
{ 1*SECONDS, 10*MINUTES, 1*MINUTES, "tcp_time_wait_interval"},
{ 1, PARAM_MAX, 128, "tcp_conn_req_max_q" },
{ 0, PARAM_MAX, 1024, "tcp_conn_req_max_q0" },
{ 1, 1024, 1, "tcp_conn_req_min" },
{ 0*MS, 20*SECONDS, 0*MS, "tcp_conn_grace_period" },
{ 128, (1<<30), 1024*1024, "tcp_cwnd_max" },
{ 0, 10, 0, "tcp_debug" },
{ 1024, (32*1024), 1024, "tcp_smallest_nonpriv_port"},
{ 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_cinterval"},
{ 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_linterval"},
{ 500*MS, PARAM_MAX, 8*MINUTES, "tcp_ip_abort_interval"},
{ 1*SECONDS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_cinterval"},
{ 500*MS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_interval"},
{ 1, 255, 64, "tcp_ipv4_ttl"},
{ 10*SECONDS, 10*DAYS, 2*HOURS, "tcp_keepalive_interval"},
{ 0, 100, 10, "tcp_maxpsz_multiplier" },
{ 1, TCP_MSS_MAX_IPV4, 536, "tcp_mss_def_ipv4"},
{ 1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
{ 1, TCP_MSS_MAX, 108, "tcp_mss_min"},
{ 1, (64*1024)-1, (4*1024)-1, "tcp_naglim_def"},
{ 1*MS, 20*SECONDS, 3*SECONDS, "tcp_rexmit_interval_initial"},
{ 1*MS, 2*HOURS, 60*SECONDS, "tcp_rexmit_interval_max"},
{ 1*MS, 2*HOURS, 400*MS, "tcp_rexmit_interval_min"},
{ 1*MS, 1*MINUTES, 100*MS, "tcp_deferred_ack_interval" },
{ 0, 16, 0, "tcp_snd_lowat_fraction" },
{ 0, 128000, 0, "tcp_sth_rcv_hiwat" },
{ 0, 128000, 0, "tcp_sth_rcv_lowat" },
{ 1, 10000, 3, "tcp_dupack_fast_retransmit" },
{ 0, 1, 0, "tcp_ignore_path_mtu" },
{ 1024, TCP_MAX_PORT, 32*1024, "tcp_smallest_anon_port"},
{ 1024, TCP_MAX_PORT, TCP_MAX_PORT, "tcp_largest_anon_port"},
{ TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
{ TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
{ TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
{ 1, 65536, 4, "tcp_recv_hiwat_minmss"},
{ 1*SECONDS, PARAM_MAX, 675*SECONDS, "tcp_fin_wait_2_flush_interval"},
{ 0, TCP_MSS_MAX, 64, "tcp_co_min"},
{ 8192, (1<<30), 1024*1024, "tcp_max_buf"},
/*
* Question: What default value should I set for tcp_strong_iss?
*/
{ 0, 2, 1, "tcp_strong_iss"},
{ 0, 65536, 20, "tcp_rtt_updates"},
{ 0, 1, 1, "tcp_wscale_always"},
{ 0, 1, 0, "tcp_tstamp_always"},
{ 0, 1, 1, "tcp_tstamp_if_wscale"},
{ 0*MS, 2*HOURS, 0*MS, "tcp_rexmit_interval_extra"},
{ 0, 16, 2, "tcp_deferred_acks_max"},
{ 1, 16384, 4, "tcp_slow_start_after_idle"},
{ 1, 4, 4, "tcp_slow_start_initial"},
{ 10*MS, 50*MS, 20*MS, "tcp_co_timer_interval"},
{ 0, 2, 2, "tcp_sack_permitted"},
{ 0, 1, 0, "tcp_trace"},
{ 0, 1, 1, "tcp_compression_enabled"},
{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"},
{ 1, TCP_MSS_MAX_IPV6, 1220, "tcp_mss_def_ipv6"},
{ 1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
{ 0, 1, 0, "tcp_rev_src_routes"},
{ 10*MS, 500*MS, 50*MS, "tcp_local_dack_interval"},
{ 100*MS, 60*SECONDS, 1*SECONDS, "tcp_ndd_get_info_interval"},
{ 0, 16, 8, "tcp_local_dacks_max"},
{ 0, 2, 1, "tcp_ecn_permitted"},
{ 0, 1, 1, "tcp_rst_sent_rate_enabled"},
{ 0, PARAM_MAX, 40, "tcp_rst_sent_rate"},
{ 0, 100*MS, 50*MS, "tcp_push_timer_interval"},
{ 0, 1, 0, "tcp_use_smss_as_mss_opt"},
{ 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"},
};
/* END CSTYLED */
/*
* tcp_mdt_hdr_{head,tail}_min are the leading and trailing spaces of
* each header fragment in the header buffer. Each parameter value has
* to be a multiple of 4 (32-bit aligned).
*/
static tcpparam_t lcl_tcp_mdt_head_param =
{ 32, 256, 32, "tcp_mdt_hdr_head_min" };
static tcpparam_t lcl_tcp_mdt_tail_param =
{ 0, 256, 32, "tcp_mdt_hdr_tail_min" };
#define tcps_mdt_hdr_head_min tcps_mdt_head_param->tcp_param_val
#define tcps_mdt_hdr_tail_min tcps_mdt_tail_param->tcp_param_val
/*
* tcp_mdt_max_pbufs is the upper limit value that tcp uses to figure out
* the maximum number of payload buffers associated per Multidata.
*/
static tcpparam_t lcl_tcp_mdt_max_pbufs_param =
{ 1, MULTIDATA_MAX_PBUFS, MULTIDATA_MAX_PBUFS, "tcp_mdt_max_pbufs" };
#define tcps_mdt_max_pbufs tcps_mdt_max_pbufs_param->tcp_param_val
/* Round up the value to the nearest mss. */
#define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss))
/*
* Set ECN capable transport (ECT) code point in IP header.
*
* Note that there are 2 ECT code points '01' and '10', which are called
* ECT(1) and ECT(0) respectively. Here we follow the original ECT code
* point ECT(0) for TCP as described in RFC 2481.
*/
#define SET_ECT(tcp, iph) \
if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
/* We need to clear the code point first. */ \
((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
} else { \
((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
}
/*
* The format argument to pass to tcp_display().
* DISP_PORT_ONLY means that the returned string has only port info.
* DISP_ADDR_AND_PORT means that the returned string also contains the
* remote and local IP address.
*/
#define DISP_PORT_ONLY 1
#define DISP_ADDR_AND_PORT 2
#define NDD_TOO_QUICK_MSG \
"ndd get info rate too high for non-privileged users, try again " \
"later.\n"
#define NDD_OUT_OF_BUF_MSG "<< Out of buffer >>\n"
#define IS_VMLOANED_MBLK(mp) \
(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
/* Enable or disable b_cont M_MULTIDATA chaining for MDT. */
boolean_t tcp_mdt_chain = B_TRUE;
/*
* MDT threshold in the form of effective send MSS multiplier; we take
* the MDT path if the amount of unsent data exceeds the threshold value
* (default threshold is 1*SMSS).
*/
uint_t tcp_mdt_smss_threshold = 1;
uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */
/*
* Forces all connections to obey the value of the tcps_maxpsz_multiplier
* tunable settable via NDD. Otherwise, the per-connection behavior is
* determined dynamically during tcp_adapt_ire(), which is the default.
*/
boolean_t tcp_static_maxpsz = B_FALSE;
/* Setable in /etc/system */
/* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
uint32_t tcp_random_anon_port = 1;
/*
* To reach to an eager in Q0 which can be dropped due to an incoming
* new SYN request when Q0 is full, a new doubly linked list is
* introduced. This list allows to select an eager from Q0 in O(1) time.
* This is needed to avoid spending too much time walking through the
* long list of eagers in Q0 when tcp_drop_q0() is called. Each member of
* this new list has to be a member of Q0.
* This list is headed by listener's tcp_t. When the list is empty,
* both the pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0,
* of listener's tcp_t point to listener's tcp_t itself.
*
* Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager
* in the list. MAKE_UNDROPPABLE() takes the eager out of the list.
* These macros do not affect the eager's membership to Q0.
*/
#define MAKE_DROPPABLE(listener, eager) \
if ((eager)->tcp_eager_next_drop_q0 == NULL) { \
(listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\
= (eager); \
(eager)->tcp_eager_prev_drop_q0 = (listener); \
(eager)->tcp_eager_next_drop_q0 = \
(listener)->tcp_eager_next_drop_q0; \
(listener)->tcp_eager_next_drop_q0 = (eager); \
}
#define MAKE_UNDROPPABLE(eager) \
if ((eager)->tcp_eager_next_drop_q0 != NULL) { \
(eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 \
= (eager)->tcp_eager_prev_drop_q0; \
(eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0 \
= (eager)->tcp_eager_next_drop_q0; \
(eager)->tcp_eager_prev_drop_q0 = NULL; \
(eager)->tcp_eager_next_drop_q0 = NULL; \
}
/*
* If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
* than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
* data, TCP will not respond with an ACK. RFC 793 requires that
* TCP responds with an ACK for such a bogus ACK. By not following
* the RFC, we prevent TCP from getting into an ACK storm if somehow
* an attacker successfully spoofs an acceptable segment to our
* peer; or when our peer is "confused."
*/
uint32_t tcp_drop_ack_unsent_cnt = 10;
/*
* Hook functions to enable cluster networking
* On non-clustered systems these vectors must always be NULL.
*/
void (*cl_inet_listen)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp, in_port_t lport) = NULL;
void (*cl_inet_unlisten)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp, in_port_t lport) = NULL;
void (*cl_inet_connect)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp, in_port_t lport,
uint8_t *faddrp, in_port_t fport) = NULL;
void (*cl_inet_disconnect)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp, in_port_t lport,
uint8_t *faddrp, in_port_t fport) = NULL;
/*
* The following are defined in ip.c
*/
extern int (*cl_inet_isclusterwide)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp);
extern uint32_t (*cl_inet_ipident)(uint8_t protocol, sa_family_t addr_family,
uint8_t *laddrp, uint8_t *faddrp);
#define CL_INET_CONNECT(tcp) { \
if (cl_inet_connect != NULL) { \
/* \
* Running in cluster mode - register active connection \
* information \
*/ \
if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
if ((tcp)->tcp_ipha->ipha_src != 0) { \
(*cl_inet_connect)(IPPROTO_TCP, AF_INET,\
(uint8_t *)(&((tcp)->tcp_ipha->ipha_src)),\
(in_port_t)(tcp)->tcp_lport, \
(uint8_t *)(&((tcp)->tcp_ipha->ipha_dst)),\
(in_port_t)(tcp)->tcp_fport); \
} \
} else { \
if (!IN6_IS_ADDR_UNSPECIFIED( \
&(tcp)->tcp_ip6h->ip6_src)) {\
(*cl_inet_connect)(IPPROTO_TCP, AF_INET6,\
(uint8_t *)(&((tcp)->tcp_ip6h->ip6_src)),\
(in_port_t)(tcp)->tcp_lport, \
(uint8_t *)(&((tcp)->tcp_ip6h->ip6_dst)),\
(in_port_t)(tcp)->tcp_fport); \
} \
} \
} \
}
#define CL_INET_DISCONNECT(tcp) { \
if (cl_inet_disconnect != NULL) { \
/* \
* Running in cluster mode - deregister active \
* connection information \
*/ \
if ((tcp)->tcp_ipversion == IPV4_VERSION) { \
if ((tcp)->tcp_ip_src != 0) { \
(*cl_inet_disconnect)(IPPROTO_TCP, \
AF_INET, \
(uint8_t *)(&((tcp)->tcp_ip_src)),\
(in_port_t)(tcp)->tcp_lport, \
(uint8_t *) \
(&((tcp)->tcp_ipha->ipha_dst)),\
(in_port_t)(tcp)->tcp_fport); \
} \
} else { \
if (!IN6_IS_ADDR_UNSPECIFIED( \
&(tcp)->tcp_ip_src_v6)) { \
(*cl_inet_disconnect)(IPPROTO_TCP, AF_INET6,\
(uint8_t *)(&((tcp)->tcp_ip_src_v6)),\
(in_port_t)(tcp)->tcp_lport, \
(uint8_t *) \
(&((tcp)->tcp_ip6h->ip6_dst)),\
(in_port_t)(tcp)->tcp_fport); \
} \
} \
} \
}
/*
* Cluster networking hook for traversing current connection list.
* This routine is used to extract the current list of live connections
 * which must continue to be dispatched to this node.
*/
int cl_tcp_walk_list(int (*callback)(cl_tcp_info_t *, void *), void *arg);
static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *),
void *arg, tcp_stack_t *tcps);
/*
 * Figure out the value of the window scale option. Note that the rwnd is
* ASSUMED to be rounded up to the nearest MSS before the calculation.
* We cannot find the scale value and then do a round up of tcp_rwnd
* because the scale value may not be correct after that.
*
* Set the compiler flag to make this function inline.
*/
static void
tcp_set_ws_value(tcp_t *tcp)
{
int i;
uint32_t rwnd = tcp->tcp_rwnd;
for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
i++, rwnd >>= 1)
;
tcp->tcp_rcv_ws = i;
}
/*
* Remove a connection from the list of detached TIME_WAIT connections.
* It returns B_FALSE if it can't remove the connection from the list
* as the connection has already been removed from the list due to an
* earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
*/
static boolean_t
tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
{
boolean_t locked = B_FALSE;
if (tcp_time_wait == NULL) {
tcp_time_wait = *((tcp_squeue_priv_t **)
squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
locked = B_TRUE;
} else {
ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
}
if (tcp->tcp_time_wait_expire == 0) {
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
if (locked)
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
return (B_FALSE);
}
ASSERT(TCP_IS_DETACHED(tcp));
ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
if (tcp == tcp_time_wait->tcp_time_wait_head) {
ASSERT(tcp->tcp_time_wait_prev == NULL);
tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
if (tcp_time_wait->tcp_time_wait_head != NULL) {
tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
NULL;
} else {
tcp_time_wait->tcp_time_wait_tail = NULL;
}
} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
ASSERT(tcp != tcp_time_wait->tcp_time_wait_head);
ASSERT(tcp->tcp_time_wait_next == NULL);
tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
} else {
ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
tcp->tcp_time_wait_prev->tcp_time_wait_next =
tcp->tcp_time_wait_next;
tcp->tcp_time_wait_next->tcp_time_wait_prev =
tcp->tcp_time_wait_prev;
}
tcp->tcp_time_wait_next = NULL;
tcp->tcp_time_wait_prev = NULL;
tcp->tcp_time_wait_expire = 0;
if (locked)
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
return (B_TRUE);
}
/*
* Add a connection to the list of detached TIME_WAIT connections
* and set its time to expire.
*/
static void
tcp_time_wait_append(tcp_t *tcp)
{
tcp_stack_t *tcps = tcp->tcp_tcps;
tcp_squeue_priv_t *tcp_time_wait =
*((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp,
SQPRIVATE_TCP));
tcp_timers_stop(tcp);
/* Freed above */
ASSERT(tcp->tcp_timer_tid == 0);
ASSERT(tcp->tcp_ack_tid == 0);
/* must have happened at the time of detaching the tcp */
ASSERT(tcp->tcp_ptpahn == NULL);
ASSERT(tcp->tcp_flow_stopped == 0);
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
ASSERT(tcp->tcp_time_wait_expire == NULL);
ASSERT(tcp->tcp_listener == NULL);
tcp->tcp_time_wait_expire = ddi_get_lbolt();
/*
* The value computed below in tcp->tcp_time_wait_expire may
* appear negative or wrap around. That is ok since our
* interest is only in the difference between the current lbolt
* value and tcp->tcp_time_wait_expire. But the value should not
* be zero, since it means the tcp is not in the TIME_WAIT list.
* The corresponding comparison in tcp_time_wait_collector() uses
* modular arithmetic.
*/
tcp->tcp_time_wait_expire +=
drv_usectohz(tcps->tcps_time_wait_interval * 1000);
if (tcp->tcp_time_wait_expire == 0)
tcp->tcp_time_wait_expire = 1;
ASSERT(TCP_IS_DETACHED(tcp));
ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
TCP_DBGSTAT(tcps, tcp_time_wait);
mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
if (tcp_time_wait->tcp_time_wait_head == NULL) {
ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
tcp_time_wait->tcp_time_wait_head = tcp;
} else {
ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
TCPS_TIME_WAIT);
tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp;
tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail;
}
tcp_time_wait->tcp_time_wait_tail = tcp;
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}
/* ARGSUSED */
void
tcp_timewait_output(void *arg, mblk_t *mp, void *arg2)
{
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
tcp_stack_t *tcps = tcp->tcp_tcps;
ASSERT(tcp != NULL);
if (tcp->tcp_state == TCPS_CLOSED) {
return;
}
ASSERT((tcp->tcp_family == AF_INET &&
tcp->tcp_ipversion == IPV4_VERSION) ||
(tcp->tcp_family == AF_INET6 &&
(tcp->tcp_ipversion == IPV4_VERSION ||
tcp->tcp_ipversion == IPV6_VERSION)));
ASSERT(!tcp->tcp_listener);
TCP_STAT(tcps, tcp_time_wait_reap);
ASSERT(TCP_IS_DETACHED(tcp));
/*
* Because they have no upstream client to rebind or tcp_close()
* them later, we axe the connection here and now.
*/
tcp_close_detached(tcp);
}
/*
* Remove cached/latched IPsec references.
*/
void
tcp_ipsec_cleanup(tcp_t *tcp)
{
conn_t *connp = tcp->tcp_connp;
ASSERT(connp->conn_flags & IPCL_TCPCONN);
if (connp->conn_latch != NULL) {
IPLATCH_REFRELE(connp->conn_latch,
connp->conn_netstack);
connp->conn_latch = NULL;
}
if (connp->conn_policy != NULL) {
IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
connp->conn_policy = NULL;
}
}
/*
 * Cleanup before placing on free list.
* Disassociate from the netstack/tcp_stack_t since the freelist
* is per squeue and not per netstack.
*/
void
tcp_cleanup(tcp_t *tcp)
{
mblk_t *mp;
char *tcp_iphc;
int tcp_iphc_len;
int tcp_hdr_grown;
tcp_sack_info_t *tcp_sack_info;
conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
netstack_t *ns = tcps->tcps_netstack;
tcp_bind_hash_remove(tcp);
/* Cleanup that which needs the netstack first */
tcp_ipsec_cleanup(tcp);
tcp_free(tcp);
/* Release any SSL context */
if (tcp->tcp_kssl_ent != NULL) {
kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
tcp->tcp_kssl_ent = NULL;
}
if (tcp->tcp_kssl_ctx != NULL) {
kssl_release_ctx(tcp->tcp_kssl_ctx);
tcp->tcp_kssl_ctx = NULL;
}
tcp->tcp_kssl_pending = B_FALSE;
conn_delete_ire(connp, NULL);
/*
* Since we will bzero the entire structure, we need to
* remove it and reinsert it in global hash list. We
* know the walkers can't get to this conn because we
* had set CONDEMNED flag earlier and checked reference
* under conn_lock so walker won't pick it and when we
* go the ipcl_globalhash_remove() below, no walker
* can get to it.
*/
ipcl_globalhash_remove(connp);
/*
* Now it is safe to decrement the reference counts.
* This might be the last reference on the netstack and TCPS
* in which case it will cause the tcp_g_q_close and
* the freeing of the IP Instance.
*/
connp->conn_netstack = NULL;
netstack_rele(ns);
ASSERT(tcps != NULL);
tcp->tcp_tcps = NULL;
TCPS_REFRELE(tcps);
/* Save some state */
mp = tcp->tcp_timercache;
tcp_sack_info = tcp->tcp_sack_info;
tcp_iphc = tcp->tcp_iphc;
tcp_iphc_len = tcp->tcp_iphc_len;
tcp_hdr_grown = tcp->tcp_hdr_grown;
if (connp->conn_cred != NULL) {
crfree(connp->conn_cred);
connp->conn_cred = NULL;
}
if (connp->conn_peercred != NULL) {
crfree(connp->conn_peercred);
connp->conn_peercred = NULL;
}
ipcl_conn_cleanup(connp);
connp->conn_flags = IPCL_TCPCONN;
bzero(tcp, sizeof (tcp_t));
/* restore the state */
tcp->tcp_timercache = mp;
tcp->tcp_sack_info = tcp_sack_info;
tcp->tcp_iphc = tcp_iphc;
tcp->tcp_iphc_len = tcp_iphc_len;
tcp->tcp_hdr_grown = tcp_hdr_grown;
tcp->tcp_connp = connp;
ASSERT(connp->conn_tcp == tcp);
ASSERT(connp->conn_flags & IPCL_TCPCONN);
connp->conn_state_flags = CONN_INCIPIENT;
ASSERT(connp->conn_ulp == IPPROTO_TCP);
ASSERT(connp->conn_ref == 1);
}
/*
 * Blows away all tcps whose TIME_WAIT has expired. List traversal
 * is done forwards from the head.
 * This walks all stack instances since
 * tcp_time_wait remains global across all stacks.
 *
 * Runs as a self-rearming timeout (see the timeout() call at the end)
 * against one squeue's private TIME_WAIT bucket, passed in as 'arg'.
 */
/* ARGSUSED */
void
tcp_time_wait_collector(void *arg)
{
	tcp_t *tcp;
	clock_t now;
	mblk_t *mp;
	conn_t *connp;
	kmutex_t *lock;
	boolean_t removed;
	squeue_t *sqp = (squeue_t *)arg;
	tcp_squeue_priv_t *tcp_time_wait =
	    *((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));

	mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
	/* This timeout has now fired; clear the id for the re-arm below. */
	tcp_time_wait->tcp_time_wait_tid = 0;

	/*
	 * Drain the per-squeue cache of reusable tcp_t's, but only if it
	 * was marked for cleanup on a previous pass (tcp_in_free_list is
	 * set on the list head at the bottom of this function), i.e. the
	 * cached entries survived a full collection interval unused.
	 */
	if (tcp_time_wait->tcp_free_list != NULL &&
	    tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
		TCP_G_STAT(tcp_freelist_cleanup);
		while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
			tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
			tcp->tcp_time_wait_next = NULL;
			tcp_time_wait->tcp_free_list_cnt--;
			/* Cached entries were fully cleaned by tcp_cleanup() */
			ASSERT(tcp->tcp_tcps == NULL);
			CONN_DEC_REF(tcp->tcp_connp);
		}
		ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
	}

	/*
	 * In order to reap time waits reliably, we should use a
	 * source of time that is not adjustable by the user -- hence
	 * the call to ddi_get_lbolt().
	 */
	now = ddi_get_lbolt();
	while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
		/*
		 * Compare times using modular arithmetic, since
		 * lbolt can wrapover.
		 */
		if ((now - tcp->tcp_time_wait_expire) < 0) {
			/* List is expiry-ordered; nothing further is due. */
			break;
		}

		removed = tcp_time_wait_remove(tcp, tcp_time_wait);
		ASSERT(removed);

		connp = tcp->tcp_connp;
		ASSERT(connp->conn_fanout != NULL);
		lock = &connp->conn_fanout->connf_lock;
		/*
		 * This is essentially a TW reclaim fast path optimization for
		 * performance where the timewait collector checks under the
		 * fanout lock (so that no one else can get access to the
		 * conn_t) that the refcnt is 2 i.e. one for TCP and one for
		 * the classifier hash list. If ref count is indeed 2, we can
		 * just remove the conn under the fanout lock and avoid
		 * cleaning up the conn under the squeue, provided that
		 * clustering callbacks are not enabled. If clustering is
		 * enabled, we need to make the clustering callback before
		 * setting the CONDEMNED flag and after dropping all locks and
		 * so we forego this optimization and fall back to the slow
		 * path. Also please see the comments in tcp_closei_local
		 * regarding the refcnt logic.
		 *
		 * Since we are holding the tcp_time_wait_lock, its better
		 * not to block on the fanout_lock because other connections
		 * can't add themselves to time_wait list. So we do a
		 * tryenter instead of mutex_enter.
		 */
		if (mutex_tryenter(lock)) {
			mutex_enter(&connp->conn_lock);
			if ((connp->conn_ref == 2) &&
			    (cl_inet_disconnect == NULL)) {
				/* Fast path: unhash and reclaim directly. */
				ipcl_hash_remove_locked(connp,
				    connp->conn_fanout);
				/*
				 * Set the CONDEMNED flag now itself so that
				 * the refcnt cannot increase due to any
				 * walker. But we have still not cleaned up
				 * conn_ire_cache. This is still ok since
				 * we are going to clean it up in tcp_cleanup
				 * immediately and any interface unplumb
				 * thread will wait till the ire is blown away
				 */
				connp->conn_state_flags |= CONN_CONDEMNED;
				mutex_exit(lock);
				mutex_exit(&connp->conn_lock);
				if (tcp_time_wait->tcp_free_list_cnt <
				    tcp_free_list_max_cnt) {
					/* Add to head of tcp_free_list */
					mutex_exit(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp_cleanup(tcp);
					ASSERT(connp->conn_latch == NULL);
					ASSERT(connp->conn_policy == NULL);
					ASSERT(tcp->tcp_tcps == NULL);
					ASSERT(connp->conn_netstack == NULL);

					mutex_enter(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp->tcp_time_wait_next =
					    tcp_time_wait->tcp_free_list;
					tcp_time_wait->tcp_free_list = tcp;
					tcp_time_wait->tcp_free_list_cnt++;
					continue;
				} else {
					/* Do not add to tcp_free_list */
					mutex_exit(
					    &tcp_time_wait->tcp_time_wait_lock);
					tcp_bind_hash_remove(tcp);
					conn_delete_ire(tcp->tcp_connp, NULL);
					tcp_ipsec_cleanup(tcp);
					CONN_DEC_REF(tcp->tcp_connp);
				}
			} else {
				/*
				 * Slow path: extra refs (or clustering) mean
				 * cleanup must run under the conn's squeue.
				 * Take a ref for the squeue_fill below.
				 */
				CONN_INC_REF_LOCKED(connp);
				mutex_exit(lock);
				mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
				mutex_exit(&connp->conn_lock);
				/*
				 * We can reuse the closemp here since conn has
				 * detached (otherwise we wouldn't even be in
				 * time_wait list). tcp_closemp_used can safely
				 * be changed without taking a lock as no other
				 * thread can concurrently access it at this
				 * point in the connection lifecycle.
				 */
				if (tcp->tcp_closemp.b_prev == NULL)
					tcp->tcp_closemp_used = B_TRUE;
				else
					cmn_err(CE_PANIC,
					    "tcp_timewait_collector: "
					    "concurrent use of tcp_closemp: "
					    "connp %p tcp %p\n", (void *)connp,
					    (void *)tcp);

				TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
				mp = &tcp->tcp_closemp;
				squeue_fill(connp->conn_sqp, mp,
				    tcp_timewait_output, connp,
				    SQTAG_TCP_TIMEWAIT);
			}
		} else {
			/*
			 * Could not get the fanout lock without blocking;
			 * fall back to squeue-based cleanup unconditionally.
			 */
			mutex_enter(&connp->conn_lock);
			CONN_INC_REF_LOCKED(connp);
			mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
			mutex_exit(&connp->conn_lock);
			/*
			 * We can reuse the closemp here since conn has
			 * detached (otherwise we wouldn't even be in
			 * time_wait list). tcp_closemp_used can safely
			 * be changed without taking a lock as no other
			 * thread can concurrently access it at this
			 * point in the connection lifecycle.
			 */
			if (tcp->tcp_closemp.b_prev == NULL)
				tcp->tcp_closemp_used = B_TRUE;
			else
				cmn_err(CE_PANIC, "tcp_timewait_collector: "
				    "concurrent use of tcp_closemp: "
				    "connp %p tcp %p\n", (void *)connp,
				    (void *)tcp);

			TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
			mp = &tcp->tcp_closemp;
			squeue_fill(connp->conn_sqp, mp,
			    tcp_timewait_output, connp, 0);
		}
		/* Re-acquire before re-reading the list head. */
		mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
	}

	/*
	 * Mark the free list so that, if still untouched on the next pass,
	 * the cached tcp_t's get released (see the drain loop above).
	 */
	if (tcp_time_wait->tcp_free_list != NULL)
		tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;

	/* Re-arm ourselves for the next collection interval. */
	tcp_time_wait->tcp_time_wait_tid =
	    timeout(tcp_time_wait_collector, sqp, TCP_TIME_WAIT_DELAY);
	mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}
/*
 * Reply to a clients T_CONN_RES TPI message. This function
 * is used only for TLI/XTI listener. Sockfs sends T_CONN_RES
 * on the acceptor STREAM and processed in tcp_wput_accept().
 * Read the block comment on top of tcp_conn_request().
 *
 * On success, sends a T_OK_ACK (with the eager's local address appended)
 * upstream on the listener and completes the accept on the eager's squeue
 * via tcp_accept_finish(). On any failure, replies with tcp_err_ack().
 */
static void
tcp_accept(tcp_t *listener, mblk_t *mp)
{
	tcp_t	*acceptor;
	tcp_t	*eager;
	tcp_t	*tcp;
	struct T_conn_res	*tcr;
	t_uscalar_t	acceptor_id;
	t_scalar_t	seqnum;
	mblk_t	*opt_mp = NULL;	/* T_OPTMGMT_REQ messages */
	mblk_t	*ok_mp;
	mblk_t	*mp1;
	tcp_stack_t	*tcps = listener->tcp_tcps;

	/* Reject a truncated T_CONN_RES. */
	if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
		tcp_err_ack(listener, mp, TPROTO, 0);
		return;
	}

	tcr = (struct T_conn_res *)mp->b_rptr;

	/*
	 * Under ILP32 the stream head points tcr->ACCEPTOR_id at the
	 * read side queue of the streams device underneath us i.e. the
	 * read side queue of 'ip'. Since we can't dereference QUEUE_ptr we
	 * look it up in the queue_hash. Under LP64 it sends down the
	 * minor_t of the accepting endpoint.
	 *
	 * Once the acceptor/eager are modified (in tcp_accept_swap) the
	 * fanout hash lock is held.
	 * This prevents any thread from entering the acceptor queue from
	 * below (since it has not been hard bound yet i.e. any inbound
	 * packets will arrive on the listener or default tcp queue and
	 * go through tcp_lookup).
	 * The CONN_INC_REF will prevent the acceptor from closing.
	 *
	 * XXX It is still possible for a tli application to send down data
	 * on the accepting stream while another thread calls t_accept.
	 * This should not be a problem for well-behaved applications since
	 * the T_OK_ACK is sent after the queue swapping is completed.
	 *
	 * If the accepting fd is the same as the listening fd, avoid
	 * queue hash lookup since that will return an eager listener in a
	 * already established state.
	 */
	acceptor_id = tcr->ACCEPTOR_id;
	mutex_enter(&listener->tcp_eager_lock);
	if (listener->tcp_acceptor_id == acceptor_id) {
		/* Accept-on-listener: only legal with exactly one eager. */
		eager = listener->tcp_eager_next_q;
		/* only count how many T_CONN_INDs so don't count q0 */
		if ((listener->tcp_conn_req_cnt_q != 1) ||
		    (eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
			mutex_exit(&listener->tcp_eager_lock);
			tcp_err_ack(listener, mp, TBADF, 0);
			return;
		}
		if (listener->tcp_conn_req_cnt_q0 != 0) {
			/* Throw away all the eagers on q0. */
			tcp_eager_cleanup(listener, 1);
		}
		if (listener->tcp_syn_defense) {
			/* Endpoint stops listening; drop SYN-defense state. */
			listener->tcp_syn_defense = B_FALSE;
			if (listener->tcp_ip_addr_cache != NULL) {
				kmem_free(listener->tcp_ip_addr_cache,
				    IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
				listener->tcp_ip_addr_cache = NULL;
			}
		}
		/*
		 * Transfer tcp_conn_req_max to the eager so that when
		 * a disconnect occurs we can revert the endpoint to the
		 * listen state.
		 */
		eager->tcp_conn_req_max = listener->tcp_conn_req_max;
		ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
		/*
		 * Get a reference on the acceptor just like the
		 * tcp_acceptor_hash_lookup below.
		 */
		acceptor = listener;
		CONN_INC_REF(acceptor->tcp_connp);
	} else {
		acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
		if (acceptor == NULL) {
			if (listener->tcp_debug) {
				(void) strlog(TCP_MOD_ID, 0, 1,
				    SL_ERROR|SL_TRACE,
				    "tcp_accept: did not find acceptor 0x%x\n",
				    acceptor_id);
			}
			mutex_exit(&listener->tcp_eager_lock);
			tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
			return;
		}
		/*
		 * Verify acceptor state. The acceptable states for an acceptor
		 * include TCPS_IDLE and TCPS_BOUND.
		 */
		switch (acceptor->tcp_state) {
		case TCPS_IDLE:
			/* FALLTHRU */
		case TCPS_BOUND:
			break;
		default:
			CONN_DEC_REF(acceptor->tcp_connp);
			mutex_exit(&listener->tcp_eager_lock);
			tcp_err_ack(listener, mp, TOUTSTATE, 0);
			return;
		}
	}

	/* The listener must be in TCPS_LISTEN */
	if (listener->tcp_state != TCPS_LISTEN) {
		CONN_DEC_REF(acceptor->tcp_connp);
		mutex_exit(&listener->tcp_eager_lock);
		tcp_err_ack(listener, mp, TOUTSTATE, 0);
		return;
	}

	/*
	 * Rendezvous with an eager connection request packet hanging off
	 * 'tcp' that has the 'seqnum' tag. We tagged the detached open
	 * tcp structure when the connection packet arrived in
	 * tcp_conn_request().
	 */
	seqnum = tcr->SEQ_number;
	eager = listener;
	do {
		eager = eager->tcp_eager_next_q;
		if (eager == NULL) {
			/* No eager carries the requested sequence number. */
			CONN_DEC_REF(acceptor->tcp_connp);
			mutex_exit(&listener->tcp_eager_lock);
			tcp_err_ack(listener, mp, TBADSEQ, 0);
			return;
		}
	} while (eager->tcp_conn_req_seqnum != seqnum);
	mutex_exit(&listener->tcp_eager_lock);

	/*
	 * At this point, both acceptor and listener have 2 ref
	 * that they begin with. Acceptor has one additional ref
	 * we placed in lookup while listener has 3 additional
	 * ref for being behind the squeue (tcp_accept() is
	 * done on listener's squeue); being in classifier hash;
	 * and eager's ref on listener.
	 */
	ASSERT(listener->tcp_connp->conn_ref >= 5);
	ASSERT(acceptor->tcp_connp->conn_ref >= 3);

	/*
	 * The eager at this point is set in its own squeue and
	 * could easily have been killed (tcp_accept_finish will
	 * deal with that) because of a TH_RST so we can only
	 * ASSERT for a single ref.
	 */
	ASSERT(eager->tcp_connp->conn_ref >= 1);

	/* Pre allocate the stroptions mblk also */
	opt_mp = allocb(sizeof (struct stroptions), BPRI_HI);
	if (opt_mp == NULL) {
		CONN_DEC_REF(acceptor->tcp_connp);
		CONN_DEC_REF(eager->tcp_connp);
		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
		return;
	}
	DB_TYPE(opt_mp) = M_SETOPTS;
	opt_mp->b_wptr += sizeof (struct stroptions);

	/*
	 * Prepare for inheriting IPV6_BOUND_IF and IPV6_RECVPKTINFO
	 * from listener to acceptor. The message is chained on opt_mp
	 * which will be sent onto eager's squeue.
	 */
	if (listener->tcp_bound_if != 0) {
		/* allocate optmgmt req */
		mp1 = tcp_setsockopt_mp(IPPROTO_IPV6,
		    IPV6_BOUND_IF, (char *)&listener->tcp_bound_if,
		    sizeof (int));
		if (mp1 != NULL)
			linkb(opt_mp, mp1);
	}
	if (listener->tcp_ipv6_recvancillary & TCP_IPV6_RECVPKTINFO) {
		uint_t on = 1;

		/* allocate optmgmt req */
		mp1 = tcp_setsockopt_mp(IPPROTO_IPV6,
		    IPV6_RECVPKTINFO, (char *)&on, sizeof (on));
		if (mp1 != NULL)
			linkb(opt_mp, mp1);
	}

	/* Re-use mp1 to hold a copy of mp, in case reallocb fails */
	if ((mp1 = copymsg(mp)) == NULL) {
		CONN_DEC_REF(acceptor->tcp_connp);
		CONN_DEC_REF(eager->tcp_connp);
		freemsg(opt_mp);
		tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
		return;
	}

	tcr = (struct T_conn_res *)mp1->b_rptr;

	/*
	 * This is an expanded version of mi_tpi_ok_ack_alloc()
	 * which allocates a larger mblk and appends the new
	 * local address to the ok_ack. The address is copied by
	 * soaccept() for getsockname().
	 */
	{
		int extra;

		/* Address family determines the appended sockaddr size. */
		extra = (eager->tcp_family == AF_INET) ?
		    sizeof (sin_t) : sizeof (sin6_t);

		/*
		 * Try to re-use mp, if possible. Otherwise, allocate
		 * an mblk and return it as ok_mp. In any case, mp
		 * is no longer usable upon return.
		 */
		if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
			CONN_DEC_REF(acceptor->tcp_connp);
			CONN_DEC_REF(eager->tcp_connp);
			freemsg(opt_mp);
			/* Original mp has been freed by now, so use mp1 */
			tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
			return;
		}

		mp = NULL;	/* We should never use mp after this point */

		switch (extra) {
		case sizeof (sin_t): {
			/* IPv4: append a sin_t with the eager's local addr. */
			sin_t *sin = (sin_t *)ok_mp->b_wptr;

			ok_mp->b_wptr += extra;
			sin->sin_family = AF_INET;
			sin->sin_port = eager->tcp_lport;
			sin->sin_addr.s_addr =
			    eager->tcp_ipha->ipha_src;
			break;
		}
		case sizeof (sin6_t): {
			/* IPv6 (possibly v4-mapped): append a sin6_t. */
			sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;

			ok_mp->b_wptr += extra;
			sin6->sin6_family = AF_INET6;
			sin6->sin6_port = eager->tcp_lport;
			if (eager->tcp_ipversion == IPV4_VERSION) {
				sin6->sin6_flowinfo = 0;
				IN6_IPADDR_TO_V4MAPPED(
				    eager->tcp_ipha->ipha_src,
				    &sin6->sin6_addr);
			} else {
				ASSERT(eager->tcp_ip6h != NULL);
				sin6->sin6_flowinfo =
				    eager->tcp_ip6h->ip6_vcf &
				    ~IPV6_VERS_AND_FLOW_MASK;
				sin6->sin6_addr =
				    eager->tcp_ip6h->ip6_src;
			}
			sin6->sin6_scope_id = 0;
			sin6->__sin6_src_id = 0;
			break;
		}
		default:
			break;
		}
		ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
	}

	/*
	 * If there are no options we know that the T_CONN_RES will
	 * succeed. However, we can't send the T_OK_ACK upstream until
	 * the tcp_accept_swap is done since it would be dangerous to
	 * let the application start using the new fd prior to the swap.
	 */
	tcp_accept_swap(listener, acceptor, eager);

	/*
	 * tcp_accept_swap unlinks eager from listener but does not drop
	 * the eager's reference on the listener.
	 */
	ASSERT(eager->tcp_listener == NULL);
	ASSERT(listener->tcp_connp->conn_ref >= 5);

	/*
	 * The eager is now associated with its own queue. Insert in
	 * the hash so that the connection can be reused for a future
	 * T_CONN_RES.
	 */
	tcp_acceptor_hash_insert(acceptor_id, eager);

	/*
	 * We now do the processing of options with T_CONN_RES.
	 * We delay till now since we wanted to have queue to pass to
	 * option processing routines that points back to the right
	 * instance structure which does not happen until after
	 * tcp_accept_swap().
	 *
	 * Note:
	 * The sanity of the logic here assumes that whatever options
	 * are appropriate to inherit from listener=>eager are done
	 * before this point, and whatever were to be overridden (or not)
	 * in transfer logic from eager=>acceptor in tcp_accept_swap().
	 * [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
	 * before its ACCEPTOR_id comes down in T_CONN_RES ]
	 * This may not be true at this point in time but can be fixed
	 * independently. This option processing code starts with
	 * the instantiated acceptor instance and the final queue at
	 * this point.
	 */
	if (tcr->OPT_length != 0) {
		/* Options to process */
		int t_error = 0;
		int sys_error = 0;
		int do_disconnect = 0;

		if (tcp_conprim_opt_process(eager, mp1,
		    &do_disconnect, &t_error, &sys_error) < 0) {
			eager->tcp_accept_error = 1;
			if (do_disconnect) {
				/*
				 * An option failed which does not allow
				 * connection to be accepted.
				 *
				 * We allow T_CONN_RES to succeed and
				 * put a T_DISCON_IND on the eager queue.
				 */
				ASSERT(t_error == 0 && sys_error == 0);
				eager->tcp_send_discon_ind = 1;
			} else {
				ASSERT(t_error != 0);
				freemsg(ok_mp);
				/*
				 * Original mp was either freed or set
				 * to ok_mp above, so use mp1 instead.
				 */
				tcp_err_ack(listener, mp1, t_error, sys_error);
				goto finish;
			}
		}
		/*
		 * Most likely success in setting options (except if
		 * eager->tcp_send_discon_ind set).
		 * mp1 option buffer represented by OPT_length/offset
		 * potentially modified and contains results of setting
		 * options at this point
		 */
	}

	/* We no longer need mp1, since all options processing has passed */
	freemsg(mp1);

	/* Accept committed: send the T_OK_ACK (with address) upstream. */
	putnext(listener->tcp_rq, ok_mp);

	mutex_enter(&listener->tcp_eager_lock);
	if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
		tcp_t *tail;
		mblk_t *conn_ind;

		/*
		 * This path should not be executed if listener and
		 * acceptor streams are the same.
		 */
		ASSERT(listener != acceptor);

		tcp = listener->tcp_eager_prev_q0;
		/*
		 * listener->tcp_eager_prev_q0 points to the TAIL of the
		 * deferred T_conn_ind queue. We need to get to the head of
		 * the queue in order to send up T_conn_ind the same order as
		 * how the 3WHS is completed.
		 */
		while (tcp != listener) {
			if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
				break;
			else
				tcp = tcp->tcp_eager_prev_q0;
		}
		ASSERT(tcp != listener);
		conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
		ASSERT(conn_ind != NULL);
		tcp->tcp_conn.tcp_eager_conn_ind = NULL;

		/* Move from q0 to q */
		ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
		listener->tcp_conn_req_cnt_q0--;
		listener->tcp_conn_req_cnt_q++;
		tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
		    tcp->tcp_eager_prev_q0;
		tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
		    tcp->tcp_eager_next_q0;
		tcp->tcp_eager_prev_q0 = NULL;
		tcp->tcp_eager_next_q0 = NULL;
		tcp->tcp_conn_def_q0 = B_FALSE;

		/* Make sure the tcp isn't in the list of droppables */
		ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
		    tcp->tcp_eager_prev_drop_q0 == NULL);

		/*
		 * Insert at end of the queue because sockfs sends
		 * down T_CONN_RES in chronological order. Leaving
		 * the older conn indications at front of the queue
		 * helps reducing search time.
		 */
		tail = listener->tcp_eager_last_q;
		if (tail != NULL)
			tail->tcp_eager_next_q = tcp;
		else
			listener->tcp_eager_next_q = tcp;
		listener->tcp_eager_last_q = tcp;
		tcp->tcp_eager_next_q = NULL;
		mutex_exit(&listener->tcp_eager_lock);
		putnext(tcp->tcp_rq, conn_ind);
	} else {
		mutex_exit(&listener->tcp_eager_lock);
	}

	/*
	 * Done with the acceptor - free it
	 *
	 * Note: from this point on, no access to listener should be made
	 * as listener can be equal to acceptor.
	 */
finish:
	ASSERT(acceptor->tcp_detached);
	ASSERT(tcps->tcps_g_q != NULL);
	/* Point the dead acceptor at the stack's default queues. */
	acceptor->tcp_rq = tcps->tcps_g_q;
	acceptor->tcp_wq = WR(tcps->tcps_g_q);
	(void) tcp_clean_death(acceptor, 0, 2);
	CONN_DEC_REF(acceptor->tcp_connp);

	/*
	 * In case we already received a FIN we have to make tcp_rput send
	 * the ordrel_ind. This will also send up a window update if the window
	 * has opened up.
	 *
	 * In the normal case of a successful connection acceptance
	 * we give the O_T_BIND_REQ to the read side put procedure as an
	 * indication that this was just accepted. This tells tcp_rput to
	 * pass up any data queued in tcp_rcv_list.
	 *
	 * In the fringe case where options sent with T_CONN_RES failed and
	 * we required, we would be indicating a T_DISCON_IND to blow
	 * away this connection.
	 */

	/*
	 * XXX: we currently have a problem if XTI application closes the
	 * acceptor stream in between. This problem exists in on10-gate also
	 * and is well known but nothing can be done short of major rewrite
	 * to fix it. Now it is possible to take care of it by assigning TLI/XTI
	 * eager same squeue as listener (we can distinguish non socket
	 * listeners at the time of handling a SYN in tcp_conn_request)
	 * and do most of the work that tcp_accept_finish does here itself
	 * and then get behind the acceptor squeue to access the acceptor
	 * queue.
	 */
	/*
	 * We already have a ref on tcp so no need to do one before squeue_fill
	 */
	squeue_fill(eager->tcp_connp->conn_sqp, opt_mp,
	    tcp_accept_finish, eager->tcp_connp, SQTAG_TCP_ACCEPT_FINISH);
}
/*
 * Swap information between the eager and acceptor for a TLI/XTI client.
 * The sockfs accept is done on the acceptor stream and control goes
 * through tcp_wput_accept() and tcp_accept()/tcp_accept_swap() is not
 * called. In either case, both the eager and listener are in their own
 * perimeter (squeue) and the code has to deal with potential race.
 *
 * See the block comment on top of tcp_accept() and tcp_wput_accept().
 *
 * On return the eager owns the acceptor's STREAMS queues, acceptor id,
 * device, and credentials; the acceptor is left detached and its conn
 * reference is dropped.
 */
static void
tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
{
	conn_t *econnp, *aconnp;

	/* Eager must still be riding the listener's queues, unbound. */
	ASSERT(eager->tcp_rq == listener->tcp_rq);
	ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
	ASSERT(!eager->tcp_hard_bound);
	/* This path is TLI/XTI only; sockets use tcp_wput_accept(). */
	ASSERT(!TCP_IS_SOCKET(acceptor));
	ASSERT(!TCP_IS_SOCKET(eager));
	ASSERT(!TCP_IS_SOCKET(listener));

	acceptor->tcp_detached = B_TRUE;
	/*
	 * To permit stream re-use by TLI/XTI, the eager needs a copy of
	 * the acceptor id.
	 */
	eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;

	/* remove eager from listen list... */
	mutex_enter(&listener->tcp_eager_lock);
	tcp_eager_unlink(eager);
	ASSERT(eager->tcp_eager_next_q == NULL &&
	    eager->tcp_eager_last_q == NULL);
	ASSERT(eager->tcp_eager_next_q0 == NULL &&
	    eager->tcp_eager_prev_q0 == NULL);
	mutex_exit(&listener->tcp_eager_lock);

	/* The eager takes over the acceptor's STREAMS queues. */
	eager->tcp_rq = acceptor->tcp_rq;
	eager->tcp_wq = acceptor->tcp_wq;

	econnp = eager->tcp_connp;
	aconnp = acceptor->tcp_connp;

	eager->tcp_rq->q_ptr = econnp;
	eager->tcp_wq->q_ptr = econnp;

	/*
	 * In the TLI/XTI loopback case, we are inside the listener's squeue,
	 * which might be a different squeue from our peer TCP instance.
	 * For TCP Fusion, the peer expects that whenever tcp_detached is
	 * clear, our TCP queues point to the acceptor's queues. Thus, use
	 * membar_producer() to ensure that the assignments of tcp_rq/tcp_wq
	 * above reach global visibility prior to the clearing of tcp_detached.
	 */
	membar_producer();
	eager->tcp_detached = B_FALSE;

	ASSERT(eager->tcp_ack_tid == 0);

	/* Transfer device and credentials from acceptor to eager. */
	econnp->conn_dev = aconnp->conn_dev;
	econnp->conn_minor_arena = aconnp->conn_minor_arena;
	ASSERT(econnp->conn_minor_arena != NULL);
	if (eager->tcp_cred != NULL)
		crfree(eager->tcp_cred);
	/* Cred ref moves from aconnp; aconnp's pointer is cleared below. */
	eager->tcp_cred = econnp->conn_cred = aconnp->conn_cred;
	ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
	ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);

	aconnp->conn_cred = NULL;

	/* Inherit zone/label attributes from the acceptor. */
	econnp->conn_zoneid = aconnp->conn_zoneid;
	econnp->conn_allzones = aconnp->conn_allzones;
	econnp->conn_mac_exempt = aconnp->conn_mac_exempt;
	aconnp->conn_mac_exempt = B_FALSE;

	ASSERT(aconnp->conn_peercred == NULL);

	/* Do the IPC initialization */
	CONN_INC_REF(econnp);

	econnp->conn_multicast_loop = aconnp->conn_multicast_loop;
	econnp->conn_af_isv6 = aconnp->conn_af_isv6;
	econnp->conn_pkt_isv6 = aconnp->conn_pkt_isv6;

	/* Done with old IPC. Drop its ref on its connp */
	CONN_DEC_REF(aconnp);
}
/*
* Adapt to the information, such as rtt and rtt_sd, provided from the
* ire cached in conn_cache_ire. If no ire cached, do a ire lookup.
*
* Checks for multicast and broadcast destination address.
* Returns zero on failure; non-zero if ok.
*
* Note that the MSS calculation here is based on the info given in
* the IRE. We do not do any calculation based on TCP options. They
* will be handled in tcp_rput_other() and tcp_rput_data() when TCP
* knows which options to use.
*
* Note on how TCP gets its parameters for a connection.
*
* When a tcp_t structure is allocated, it gets all the default parameters.
* In tcp_adapt_ire(), it gets those metric parameters, like rtt, rtt_sd,
* spipe, rpipe, ... from the route metrics. Route metric overrides the
* default. But if there is an associated tcp_host_param, it will override
* the metrics.
*
* An incoming SYN with a multicast or broadcast destination address, is dropped
* in 1 of 2 places.
*
* 1. If the packet was received over the wire it is dropped in
* ip_rput_process_broadcast()
*
* 2. If the packet was received through internal IP loopback, i.e. the packet
* was generated and received on the same machine, it is dropped in
* ip_wput_local()
*
* An incoming SYN with a multicast or broadcast source address is always
* dropped in tcp_adapt_ire. The same logic in tcp_adapt_ire also serves to
* reject an attempt to connect to a broadcast or multicast (destination)
* address.
*/
static int
tcp_adapt_ire(tcp_t *tcp, mblk_t *ire_mp)
{
tcp_hsp_t *hsp;
ire_t *ire;
ire_t *sire = NULL;
iulp_t *ire_uinfo = NULL;
uint32_t mss_max;
uint32_t mss;
boolean_t tcp_detached = TCP_IS_DETACHED(tcp);
conn_t *connp = tcp->tcp_connp;
boolean_t ire_cacheable = B_FALSE;
zoneid_t zoneid = connp->conn_zoneid;
int match_flags = MATCH_IRE_RECURSIVE | MATCH_IRE_DEFAULT |
MATCH_IRE_SECATTR;
ts_label_t *tsl = crgetlabel(CONN_CRED(connp));
ill_t *ill = NULL;
boolean_t incoming = (ire_mp == NULL);
tcp_stack_t *tcps = tcp->tcp_tcps;
ip_stack_t *ipst = tcps->tcps_netstack->netstack_ip;
ASSERT(connp->conn_ire_cache == NULL);
if (tcp->tcp_ipversion == IPV4_VERSION) {
if (CLASSD(tcp->tcp_connp->conn_rem)) {
BUMP_MIB(&ipst->ips_ip_mib, ipIfStatsInDiscards);
return (0);
}
/*
* If IP_NEXTHOP is set, then look for an IRE_CACHE
* for the destination with the nexthop as gateway.
* ire_ctable_lookup() is used because this particular
* ire, if it exists, will be marked private.
* If that is not available, use the interface ire
* for the nexthop.
*
* TSol: tcp_update_label will detect label mismatches based
* only on the destination's label, but that would not
* detect label mismatches based on the security attributes
* of routes or next hop gateway. Hence we need to pass the
* label to ire_ftable_lookup below in order to locate the
* right prefix (and/or) ire cache. Similarly we also need
* pass the label to the ire_cache_lookup below to locate
* the right ire that also matches on the label.
*/
if (tcp->tcp_connp->conn_nexthop_set) {
ire = ire_ctable_lookup(tcp->tcp_connp->conn_rem,
tcp->tcp_connp->conn_nexthop_v4, 0, NULL, zoneid,
tsl, MATCH_IRE_MARK_PRIVATE_ADDR | MATCH_IRE_GW,
ipst);
if (ire == NULL) {
ire = ire_ftable_lookup(
tcp->tcp_connp->conn_nexthop_v4,
0, 0, IRE_INTERFACE, NULL, NULL, zoneid, 0,
tsl, match_flags, ipst);
if (ire == NULL)
return (0);
} else {
ire_uinfo = &ire->ire_uinfo;
}
} else {
ire = ire_cache_lookup(tcp->tcp_connp->conn_rem,
zoneid, tsl, ipst);
if (ire != NULL) {
ire_cacheable = B_TRUE;
ire_uinfo = (ire_mp != NULL) ?
&((ire_t *)ire_mp->b_rptr)->ire_uinfo:
&ire->ire_uinfo;
} else {
if (ire_mp == NULL) {
ire = ire_ftable_lookup(
tcp->tcp_connp->conn_rem,
0, 0, 0, NULL, &sire, zoneid, 0,
tsl, (MATCH_IRE_RECURSIVE |
MATCH_IRE_DEFAULT), ipst);
if (ire == NULL)
return (0);
ire_uinfo = (sire != NULL) ?
&sire->ire_uinfo :
&ire->ire_uinfo;
} else {
ire = (ire_t *)ire_mp->b_rptr;
ire_uinfo =
&((ire_t *)
ire_mp->b_rptr)->ire_uinfo;
}
}
}
ASSERT(ire != NULL);
if ((ire->ire_src_addr == INADDR_ANY) ||
(ire->ire_type & IRE_BROADCAST)) {
/*
* ire->ire_mp is non null when ire_mp passed in is used
* ire->ire_mp is set in ip_bind_insert_ire[_v6]().
*/
if (ire->ire_mp == NULL)
ire_refrele(ire);
if (sire != NULL)
ire_refrele(sire);
return (0);
}
if (tcp->tcp_ipha->ipha_src == INADDR_ANY) {
ipaddr_t src_addr;
/*
* ip_bind_connected() has stored the correct source
* address in conn_src.
*/
src_addr = tcp->tcp_connp->conn_src;
tcp->tcp_ipha->ipha_src = src_addr;
/*
* Copy of the src addr. in tcp_t is needed
* for the lookup funcs.
*/
IN6_IPADDR_TO_V4MAPPED(src_addr, &tcp->tcp_ip_src_v6);
}
/*
* Set the fragment bit so that IP will tell us if the MTU
* should change. IP tells us the latest setting of
* ip_path_mtu_discovery through ire_frag_flag.
*/
if (ipst->ips_ip_path_mtu_discovery) {
tcp->tcp_ipha->ipha_fragment_offset_and_flags =
htons(IPH_DF);
}
/*
* If ire_uinfo is NULL, this is the IRE_INTERFACE case
* for IP_NEXTHOP. No cache ire has been found for the
* destination and we are working with the nexthop's
* interface ire. Since we need to forward all packets
* to the nexthop first, we "blindly" set tcp_localnet
* to false, even though the destination may also be
* onlink.
*/
if (ire_uinfo == NULL)
tcp->tcp_localnet = 0;
else
tcp->tcp_localnet = (ire->ire_gateway_addr == 0);
} else {
/*
* For incoming connection ire_mp = NULL
* For outgoing connection ire_mp != NULL
* Technically we should check conn_incoming_ill
* when ire_mp is NULL and conn_outgoing_ill when
* ire_mp is non-NULL. But this is performance
* critical path and for IPV*_BOUND_IF, outgoing
* and incoming ill are always set to the same value.
*/
ill_t *dst_ill = NULL;
ipif_t *dst_ipif = NULL;
ASSERT(connp->conn_outgoing_ill == connp->conn_incoming_ill);
if (connp->conn_outgoing_ill != NULL) {
/* Outgoing or incoming path */
int err;
dst_ill = conn_get_held_ill(connp,
&connp->conn_outgoing_ill, &err);
if (err == ILL_LOOKUP_FAILED || dst_ill == NULL) {
ip1dbg(("tcp_adapt_ire: ill_lookup failed\n"));
return (0);
}
match_flags |= MATCH_IRE_ILL;
dst_ipif = dst_ill->ill_ipif;
}
ire = ire_ctable_lookup_v6(&tcp->tcp_connp->conn_remv6,
0, 0, dst_ipif, zoneid, tsl, match_flags, ipst);
if (ire != NULL) {
ire_cacheable = B_TRUE;
ire_uinfo = (ire_mp != NULL) ?
&((ire_t *)ire_mp->b_rptr)->ire_uinfo:
&ire->ire_uinfo;
} else {
if (ire_mp == NULL) {
ire = ire_ftable_lookup_v6(
&tcp->tcp_connp->conn_remv6,
0, 0, 0, dst_ipif, &sire, zoneid,
0, tsl, match_flags, ipst);
if (ire == NULL) {
if (dst_ill != NULL)
ill_refrele(dst_ill);
return (0);
}
ire_uinfo = (sire != NULL) ? &sire->ire_uinfo :
&ire->ire_uinfo;
} else {
ire = (ire_t *)ire_mp->b_rptr;
ire_uinfo =
&((ire_t *)ire_mp->b_rptr)->ire_uinfo;
}
}
if (dst_ill != NULL)
ill_refrele(dst_ill);
ASSERT(ire != NULL);
ASSERT(ire_uinfo != NULL);
if (IN6_IS_ADDR_UNSPECIFIED(&ire->ire_src_addr_v6) ||
IN6_IS_ADDR_MULTICAST(&ire->ire_addr_v6)) {
/*
* ire->ire_mp is non null when ire_mp passed in is used
* ire->ire_mp is set in ip_bind_insert_ire[_v6]().
*/
if (ire->ire_mp == NULL)
ire_refrele(ire);
if (sire != NULL)
ire_refrele(sire);
return (0);
}
if (IN6_IS_ADDR_UNSPECIFIED(&tcp->tcp_ip6h->ip6_src)) {
in6_addr_t src_addr;
/*
* ip_bind_connected_v6() has stored the correct source
* address per IPv6 addr. selection policy in
* conn_src_v6.
*/
src_addr = tcp->tcp_connp->conn_srcv6;
tcp->tcp_ip6h->ip6_src =