/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
/* Copyright (c) 1990 Mentat Inc. */
#include <sys/types.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#define _SUN_TPI_VERSION 2
#include <sys/tihdr.h>
#include <sys/timod.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/suntpi.h>
#include <sys/xti_inet.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/sdt.h>
#include <sys/vtrace.h>
#include <sys/kmem.h>
#include <sys/ethernet.h>
#include <sys/cpuvar.h>
#include <sys/dlpi.h>
#include <sys/pattr.h>
#include <sys/policy.h>
#include <sys/priv.h>
#include <sys/zone.h>
#include <sys/sunldi.h>
#include <sys/errno.h>
#include <sys/signal.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/isa_defs.h>
#include <sys/md5.h>
#include <sys/random.h>
#include <sys/uio.h>
#include <sys/systm.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>
#include <net/if.h>
#include <net/route.h>
#include <inet/ipsec_impl.h>
#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/proto_set.h>
#include <inet/mib2.h>
#include <inet/nd.h>
#include <inet/optcom.h>
#include <inet/snmpcom.h>
#include <inet/kstatcom.h>
#include <inet/tcp.h>
#include <inet/tcp_impl.h>
#include <inet/udp_impl.h>
#include <net/pfkeyv2.h>
#include <inet/ipdrop.h>
#include <inet/ipclassifier.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_if.h>
#include <inet/ipp_common.h>
#include <inet/ip_rts.h>
#include <inet/ip_netinfo.h>
#include <sys/squeue_impl.h>
#include <sys/squeue.h>
#include <inet/kssl/ksslapi.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>
#include <rpc/pmap_prot.h>
#include <sys/callo.h>
/*
* TCP Notes: aka FireEngine Phase I (PSARC 2002/433)
*
* (Read the detailed design doc in PSARC case directory)
*
* The entire tcp state is contained in the tcp_t and conn_t structures,
* which are allocated in tandem by calling ipcl_conn_create() with the
* IPCL_TCPCONN flag. We use 'conn_ref' and 'conn_lock' to protect
* the references on the tcp_t. The tcp_t structure is never compressed
* and packets always land on the correct TCP perimeter from the time
* the eager is created till the time the tcp_t dies (as such the old mentat
* TCP global queue is not used for the detached state and no IPsec checking
* is required). The global queue is still allocated to send out resets
* for connections which have no listeners, and IP directly calls
* tcp_xmit_listeners_reset() which does any policy check.
*
* Protection and Synchronisation mechanism:
*
* The tcp data structure does not use any kind of lock for protecting
* its state but instead uses 'squeues' for mutual exclusion from various
* read and write side threads. To access a tcp member, the thread should
* always be behind the squeue (via squeue_enter with SQ_FILL, SQ_PROCESS,
* or SQ_NODRAIN as the flag). Since the squeues allow a direct function
* call, the caller can pass any tcp function having the prototype of
* edesc_t as an argument (different from the traditional STREAMS model
* where packets come in only at designated entry points). The list of
* functions that can be directly called via squeue is given before the
* usual function prototypes.
*
* Referencing:
*
* TCP is MT-Hot and we use a reference based scheme to make sure that the
* tcp structure doesn't disappear when it's needed. When the application
* creates an outgoing connection or accepts an incoming connection, we
* start out with 2 references on 'conn_ref'. One for TCP and one for IP.
* The IP reference is just a symbolic reference since ip_tcpclose()
* looks at tcp structure after tcp_close_output() returns which could
* have dropped the last TCP reference. So as long as the connection is
* in attached state i.e. !TCP_IS_DETACHED, we have 2 references on the
* conn_t. The classifier puts its own reference when the connection is
* inserted in listen or connected hash. Anytime a thread needs to enter
* the tcp connection perimeter, it retrieves the conn/tcp from q->ptr
* on write side or by doing a classify on read side and then puts a
* reference on the conn before doing squeue_enter/tryenter/fill. For
* read side, the classifier itself puts the reference under fanout lock
* to make sure that tcp can't disappear before it gets processed. The
* squeue will drop this reference automatically so the called function
* doesn't have to do a DEC_REF.
*
* Opening a new connection:
*
* The outgoing connection open is pretty simple. tcp_open() does the
* work in creating the conn/tcp structure and initializing it. The
* squeue assignment is done based on the CPU the application
* is running on. So for outbound connections, processing is always done
* on the application CPU, which might be different from the CPU that the
* NIC interrupts for incoming packets. An optimal way would be to figure out
* the NIC <-> CPU binding at listen time, and assign the outgoing
* connection to the squeue attached to the CPU that will be interrupted
* for incoming packets (we know the NIC based on the bind IP address).
* This might seem like a problem if more data is going out, but in
* most cases the transmit is ACK driven, and the outgoing data normally
* sits on TCP's xmit queue waiting to be transmitted.
*
* Accepting a connection:
*
* This is a more interesting case because of various races involved in
* establishing an eager in its own perimeter. Read the meta comment on
* top of tcp_input_listener(). But briefly, the squeue is picked by
* ip_fanout based on the ring or the sender (if loopback).
*
* Closing a connection:
*
* The close is fairly straightforward. tcp_close() calls tcp_close_output()
* via the squeue to do the close and mark the tcp as detached if the
* connection was in state TCPS_ESTABLISHED or greater. In the latter case,
* TCP keeps its reference but tcp_close() always drops IP's reference. So
* if the tcp was not killed, it is sitting in the time_wait list with 2
* references - 1 for TCP and 1 because it is in the classifier's connected
* hash. This is the condition we use to determine that it is OK to clean
* up the tcp outside of the squeue when time wait expires (check the ref
* under the fanout and conn_lock and if it is 2, remove it from the fanout
* hash and kill it).
*
* Although close just drops the necessary references and marks the
* tcp_detached state, tcp_close needs to know that tcp_detached has been
* set (under the squeue) before letting the STREAM go away (because an
* inbound packet might attempt to go up the STREAM while the close
* has happened and tcp_detached is not set). So a special lock and
* flag are used along with a condition variable (tcp_closelock, tcp_closed,
* and tcp_closecv) to signal tcp_close that tcp_close_output() has marked
* tcp_detached.
*
* Special provisions and fast paths:
*
* We make special provisions for sockfs by marking tcp_issocket
* whenever we have only sockfs on top of TCP. This allows us to skip
* putting the tcp in the acceptor hash since a sockfs listener can never
* become an acceptor, and also to avoid allocating a tcp_t for the acceptor
* STREAM since the eager has already been allocated and the accept now
* happens on the acceptor STREAM. There is a big blob of comment on top of
* tcp_input_listener explaining the new accept. When the socket is POP'd,
* sockfs sends us an ioctl to mark the fact and we go back to the old
* behaviour. Once tcp_issocket is unset, it is never set again for the
* life of that connection.
*
* IPsec notes:
*
* Since a packet is always processed on the correct TCP perimeter,
* all IPsec processing is deferred to IP, including checking new
* connections and setting IPsec policies for new connections. The
* only exception is tcp_xmit_listeners_reset(), which is called
* directly from IP and needs to do a policy check to see if TH_RST
* can be sent out.
*/
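/*
 * For example, a read-side thread that has classified an inbound segment to
 * a conn_t conceptually does the following before the segment is processed
 * (the entry macro and its arguments are shown only to illustrate the
 * pattern; see squeue.h for the exact interface):
 *
 *	CONN_INC_REF(connp);			reference for the squeue
 *	SQUEUE_ENTER_ONE(connp->conn_sqp, mp, tcp_input_data, connp,
 *	    ira, SQ_FILL, tag);			squeue drops the reference
 */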
/*
* Values for squeue switch:
* 1: SQ_NODRAIN
* 2: SQ_PROCESS
* 3: SQ_FILL
*/
int tcp_squeue_wput = 2; /* /etc/system */
int tcp_squeue_flag;
/*
* This controls how tiny a write must be before we try to copy it
* into the mblk on the tail of the transmit queue. Not much
* speedup is observed for values larger than sixteen. Zero will
* disable the optimisation.
*/
int tcp_tx_pull_len = 16;
/*
* TCP Statistics.
*
* How TCP statistics work.
*
* There are two types of statistics invoked by two macros.
*
* TCP_STAT(name) does non-atomic increment of a named stat counter. It is
* supposed to be used in non MT-hot paths of the code.
*
* TCP_DBGSTAT(name) does atomic increment of a named stat counter. It is
* supposed to be used for DEBUG purposes and may be used on a hot path.
*
* Both TCP_STAT and TCP_DBGSTAT counters are available using kstat
* (use "kstat tcp" to get them).
*
* There is also an additional debugging facility that marks tcp_clean_death()
* instances and saves them in the tcp_t structure. It is triggered by the
* TCP_TAG_CLEAN_DEATH define. Also, there is a global array of counters for
* tcp_clean_death() calls that counts the number of times each tag was hit. It
* is triggered by the TCP_CLD_COUNTERS define.
*
* How to add new counters.
*
* 1) Add a field in the tcp_stat structure describing your counter.
* 2) Add a line in the template in tcp_kstat2_init() with the name
* of the counter.
*
* IMPORTANT!! - make sure that both are in sync !!
* 3) Use either TCP_STAT or TCP_DBGSTAT with the name.
*
* Please avoid using private counters which are not kstat-exported.
*
* TCP_TAG_CLEAN_DEATH set to 1 enables tagging of tcp_clean_death() instances
* in tcp_t structure.
*
* TCP_MAX_CLEAN_DEATH_TAG is the maximum number of possible clean death tags.
*/
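/*
 * Illustrative sketch of the two steps above for a hypothetical counter
 * named tcp_example_cnt (the name, and the exact shape of the kstat
 * template entry, are assumptions for illustration only):
 *
 *	1) in tcp_stat_t:	kstat_named_t	tcp_example_cnt;
 *	2) in the tcp_kstat2_init() template:
 *				{ "tcp_example_cnt", KSTAT_DATA_UINT64 },
 *
 * then bump it with TCP_STAT(tcps, tcp_example_cnt) on a non MT-hot path,
 * or TCP_DBGSTAT(tcps, tcp_example_cnt) for a DEBUG counter on a hot path.
 */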
#ifndef TCP_DEBUG_COUNTER
#ifdef DEBUG
#define TCP_DEBUG_COUNTER 1
#else
#define TCP_DEBUG_COUNTER 0
#endif
#endif
#define TCP_CLD_COUNTERS 0
#define TCP_TAG_CLEAN_DEATH 1
#define TCP_MAX_CLEAN_DEATH_TAG 32
#ifdef lint
static int _lint_dummy_;
#endif
#if TCP_CLD_COUNTERS
static uint_t tcp_clean_death_stat[TCP_MAX_CLEAN_DEATH_TAG];
#define TCP_CLD_STAT(x) tcp_clean_death_stat[x]++
#elif defined(lint)
#define TCP_CLD_STAT(x) ASSERT(_lint_dummy_ == 0);
#else
#define TCP_CLD_STAT(x)
#endif
#if TCP_DEBUG_COUNTER
#define TCP_DBGSTAT(tcps, x) \
atomic_add_64(&((tcps)->tcps_statistics.x.value.ui64), 1)
#define TCP_G_DBGSTAT(x) \
atomic_add_64(&(tcp_g_statistics.x.value.ui64), 1)
#elif defined(lint)
#define TCP_DBGSTAT(tcps, x) ASSERT(_lint_dummy_ == 0);
#define TCP_G_DBGSTAT(x) ASSERT(_lint_dummy_ == 0);
#else
#define TCP_DBGSTAT(tcps, x)
#define TCP_G_DBGSTAT(x)
#endif
#define TCP_G_STAT(x) (tcp_g_statistics.x.value.ui64++)
tcp_g_stat_t tcp_g_statistics;
kstat_t *tcp_g_kstat;
/* Macros for timestamp comparisons */
#define TSTMP_GEQ(a, b) ((int32_t)((a)-(b)) >= 0)
#define TSTMP_LT(a, b) ((int32_t)((a)-(b)) < 0)
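/*
 * Example of why the signed subtraction matters: with a = 0x00000001 and
 * b = 0xFFFFFFFF (i.e. 'a' is two ticks newer than 'b' after the counter
 * wraps), TSTMP_GEQ(a, b) evaluates (int32_t)(a - b) = 2 >= 0, correctly
 * treating 'a' as not older than 'b' despite the unsigned wraparound.
 */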
/*
* Parameters for TCP Initial Send Sequence number (ISS) generation. When
* tcp_strong_iss is set to 1, which is the default, the ISS is calculated
* by adding three components: a time component which grows by 1 every 4096
* nanoseconds (versus every 4 microseconds suggested by RFC 793, page 27);
* a per-connection component which grows by 125000 for every new connection;
* and an "extra" component that grows by a random amount centered
* approximately on 64000. This causes the ISS generator to cycle every
* 4.89 hours if no TCP connections are made, and faster if connections are
* made.
*
* When tcp_strong_iss is set to 0, ISS is calculated by adding two
* components: a time component which grows by 250000 every second; and
* a per-connection component which grows by 125000 for every new connection.
*
* A third method for generating the ISS, used when tcp_strong_iss is set to
* 2, is prescribed by Steve Bellovin. This involves adding time, the 125000 per
* connection, and a one-way hash (MD5) of the connection ID <sport, dport,
* src, dst>, a "truly" random (per RFC 1750) number, and a console-entered
* password.
*/
#define ISS_INCR 250000
#define ISS_NSEC_SHT 12
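/*
 * Illustrative relationship between the components described above and
 * these constants (a sketch, not the exact implementation):
 *
 *   tcp_strong_iss == 1: time component = high-resolution time >> ISS_NSEC_SHT
 *	(i.e. +1 every 4096 ns), plus ISS_INCR/2 (125000) per new connection,
 *	plus a random extra centered approximately on 64000.
 *   tcp_strong_iss == 0: time component grows by ISS_INCR (250000) per
 *	second, plus ISS_INCR/2 per new connection.
 */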
static sin_t sin_null; /* Zero address for quick clears */
static sin6_t sin6_null; /* Zero address for quick clears */
/*
* This implementation follows the 4.3BSD interpretation of the urgent
* pointer and not RFC 1122. Switching to RFC 1122 behavior would cause
* incompatible changes in protocols like telnet and rlogin.
*/
#define TCP_OLD_URP_INTERPRETATION 1
/*
* Since tcp_listener is not cleared atomically with tcp_detached
* being cleared we need this extra bit to tell a detached connection
* apart from one that is in the process of being accepted.
*/
#define TCP_IS_DETACHED_NONEAGER(tcp) \
(TCP_IS_DETACHED(tcp) && \
(!(tcp)->tcp_hard_binding))
/*
* TCP reassembly macros. We hide starting and ending sequence numbers in
* b_next and b_prev of messages on the reassembly queue. The messages are
* chained using b_cont. These macros are used in tcp_reass() so we don't
* have to see the ugly casts and assignments.
*/
#define TCP_REASS_SEQ(mp) ((uint32_t)(uintptr_t)((mp)->b_next))
#define TCP_REASS_SET_SEQ(mp, u) ((mp)->b_next = \
(mblk_t *)(uintptr_t)(u))
#define TCP_REASS_END(mp) ((uint32_t)(uintptr_t)((mp)->b_prev))
#define TCP_REASS_SET_END(mp, u) ((mp)->b_prev = \
(mblk_t *)(uintptr_t)(u))
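/*
 * For example, when a segment is queued for reassembly its sequence range
 * can be stashed in the mblk and recovered later without any extra
 * allocation (variable names below are illustrative):
 *
 *	TCP_REASS_SET_SEQ(mp, seg_seq);
 *	TCP_REASS_SET_END(mp, seg_seq + seg_len);
 *	...
 *	if (SEQ_GEQ(seg_seq, TCP_REASS_SEQ(next_mp)))
 *		(trim or discard the overlapping portion)
 */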
/*
* Implementation of TCP Timers.
* =============================
*
* INTERFACE:
*
* There are two basic functions dealing with tcp timers:
*
* timeout_id_t tcp_timeout(connp, func, time)
* clock_t tcp_timeout_cancel(connp, timeout_id)
* TCP_TIMER_RESTART(tcp, intvl)
*
* tcp_timeout() starts a timer for the 'tcp' instance, arranging to call 'func'
* after 'time' ticks have passed. The function called by timeout() must adhere
* to the same restrictions as a driver soft interrupt handler - it must not
* sleep or call other functions that might sleep. The value returned is the
* opaque non-zero timeout identifier that can be passed to tcp_timeout_cancel()
* to cancel the request. The call to tcp_timeout() may fail, in which case it
* returns zero. This is different from the timeout(9F) function, which never
* fails.
*
* The call-back function 'func' always receives 'connp' as its single
* argument. It is always executed in the squeue corresponding to the tcp
* structure. The tcp structure is guaranteed to be present at the time the
* call-back is called.
*
* NOTE: The call-back function 'func' is never called if tcp is in
* the TCPS_CLOSED state.
*
* tcp_timeout_cancel() attempts to cancel a pending tcp_timeout()
* request. Locks acquired by the call-back routine should not be held across
* the call to tcp_timeout_cancel() or a deadlock may result.
*
* tcp_timeout_cancel() returns -1 if it cannot cancel the timeout request.
* Otherwise, it returns an integer value greater than or equal to 0. In
* particular, if the call-back function has already been placed on the squeue,
* it cannot be canceled.
*
* NOTE: both tcp_timeout() and tcp_timeout_cancel() should always be called
* within the squeue context corresponding to the tcp instance. Since the
* call-back is also called via the same squeue, the race conditions
* described in the untimeout(9F) manual page do not apply because all calls
* are strictly serialized.
*
* TCP_TIMER_RESTART() is a macro that attempts to cancel a pending timeout
* stored in tcp_timer_tid and starts a new one using
* MSEC_TO_TICK(intvl). It always uses the tcp_timer() function as the
* call-back and stores the return value of tcp_timeout() in the
* tcp->tcp_timer_tid field.
*
* NOTE: since the timeout cancellation is not guaranteed, the cancelled
* call-back may still be called, so it is possible tcp_timer() will be
* called several times. This should not be a problem since tcp_timer()
* should always check the tcp instance state.
*
*
* IMPLEMENTATION:
*
* TCP timers are implemented using a three-stage process. The call to
* tcp_timeout() uses the timeout(9F) function to call tcp_timer_callback()
* when the timer expires. tcp_timer_callback() arranges the call of the
* tcp_timer_handler() function via the squeue corresponding to the tcp
* instance. tcp_timer_handler() calls the actual requested timeout call-back
* and passes the tcp instance as an argument to it. Information is passed
* between stages using the tcp_timer_t structure, which contains the connp
* pointer, the tcp call-back to call and the timeout id returned by timeout(9F).
*
* The tcp_timer_t structure is not used directly; it is embedded in an
* mblk_t-like structure that is used to enter a squeue. The mp->b_rptr of this
* pseudo mblk points to the beginning of the tcp_timer_t structure. tcp_timeout()
* returns the pointer to this mblk.
*
* The pseudo mblk is allocated from a special tcp_timer_cache kmem cache. It
* looks like a normal mblk without an actual dblk attached to it.
*
* To optimize performance each tcp instance holds a small cache of timer
* mblocks. In the current implementation it caches up to two timer mblocks per
* tcp instance. The cache is preserved over tcp frees and is only freed when
* the whole tcp structure is destroyed by its kmem destructor. Since all tcp
* timer processing happens on a corresponding squeue, the cache manipulation
* does not require any locks. Experiments show that the majority of timer mblock
* allocations are satisfied from the tcp cache and do not involve kmem calls.
*
* The tcp_timeout() places a refhold on the connp instance which guarantees
* that it will be present at the time the call-back function fires. The
* tcp_timer_handler() drops the reference after calling the call-back, so the
* call-back function does not need to manipulate the references explicitly.
*/
typedef struct tcp_timer_s {
conn_t *connp;
void (*tcpt_proc)(void *);
callout_id_t tcpt_tid;
} tcp_timer_t;
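/*
 * Illustrative use of the interface documented above, from a context already
 * running on the tcp's squeue (the interval 'intvl' is an arbitrary example
 * value; this is a sketch, not code lifted from the implementation):
 *
 *	tcp->tcp_timer_tid = tcp_timeout(connp, tcp_timer, MSEC_TO_TICK(intvl));
 *	...
 *	if (tcp->tcp_timer_tid != 0 &&
 *	    tcp_timeout_cancel(connp, tcp->tcp_timer_tid) >= 0)
 *		tcp->tcp_timer_tid = 0;
 *
 * TCP_TIMER_RESTART(tcp, intvl) packages exactly this cancel-and-rearm
 * sequence, always using tcp_timer() as the call-back and storing the new
 * id in tcp->tcp_timer_tid.
 */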
static kmem_cache_t *tcp_timercache;
kmem_cache_t *tcp_sack_info_cache;
/*
* For scalability, we must not run a timer for every TCP connection
* in TIME_WAIT state. To see why, consider (for time wait interval of
* 4 minutes):
* 1000 connections/sec * 240 seconds/time wait = 240,000 active conn's
*
* This list is ordered by time, so you need only delete from the head
* until you get to entries which aren't old enough to delete yet.
* The list consists of only the detached TIME_WAIT connections.
*
* Note that the timer (tcp_time_wait_expire) is started when the tcp_t
* becomes detached TIME_WAIT (either by changing the state and already
* being detached or the other way around). This means that the TIME_WAIT
* state can be extended (up to doubled) if the connection doesn't become
* detached for a long time.
*
* The list manipulations (including tcp_time_wait_next/prev)
* are protected by the tcp_time_wait_lock. The content of the
* detached TIME_WAIT connections is protected by the normal perimeters.
*
* This list is per squeue and squeues are shared across the tcp_stack_t's.
* Things on tcp_time_wait_head remain associated with the tcp_stack_t
* and conn_netstack.
* The tcp_t's that are added to tcp_free_list are disassociated and
* have NULL tcp_tcps and conn_netstack pointers.
*/
typedef struct tcp_squeue_priv_s {
kmutex_t tcp_time_wait_lock;
callout_id_t tcp_time_wait_tid;
tcp_t *tcp_time_wait_head;
tcp_t *tcp_time_wait_tail;
tcp_t *tcp_free_list;
uint_t tcp_free_list_cnt;
} tcp_squeue_priv_t;
/*
* TCP_TIME_WAIT_DELAY governs how often the time_wait_collector runs.
* Running it every 5 seconds seems to give the best results.
*/
#define TCP_TIME_WAIT_DELAY drv_usectohz(5000000)
/*
* To prevent hogging memory, limit the number of entries in tcp_free_list
* to 1% of available memory / number of cpus.
*/
uint_t tcp_free_list_max_cnt = 0;
#define TCP_XMIT_LOWATER 4096
#define TCP_XMIT_HIWATER 49152
#define TCP_RECV_LOWATER 2048
#define TCP_RECV_HIWATER 49152
/*
* PAWS needs a timer for 24 days. This is the number of ticks in 24 days
*/
#define PAWS_TIMEOUT ((clock_t)(24*24*60*60*hz))
#define TIDUSZ 4096 /* transport interface data unit size */
/*
* Bind hash list size and hash function. It has to be a power of 2 for
* hashing.
*/
#define TCP_BIND_FANOUT_SIZE 512
#define TCP_BIND_HASH(lport) (ntohs(lport) & (TCP_BIND_FANOUT_SIZE - 1))
/*
* Size of listen and acceptor hash list. It has to be a power of 2 for
* hashing.
*/
#define TCP_FANOUT_SIZE 256
#ifdef _ILP32
#define TCP_ACCEPTOR_HASH(accid) \
(((uint_t)(accid) >> 8) & (TCP_FANOUT_SIZE - 1))
#else
#define TCP_ACCEPTOR_HASH(accid) \
((uint_t)(accid) & (TCP_FANOUT_SIZE - 1))
#endif /* _ILP32 */
#define IP_ADDR_CACHE_SIZE 2048
#define IP_ADDR_CACHE_HASH(faddr) \
(ntohl(faddr) & (IP_ADDR_CACHE_SIZE -1))
/*
* TCP options struct returned from tcp_parse_options.
*/
typedef struct tcp_opt_s {
uint32_t tcp_opt_mss;
uint32_t tcp_opt_wscale;
uint32_t tcp_opt_ts_val;
uint32_t tcp_opt_ts_ecr;
tcp_t *tcp;
} tcp_opt_t;
/*
* RFC1323-recommended phrasing of TSTAMP option, for easier parsing
*/
#ifdef _BIG_ENDIAN
#define TCPOPT_NOP_NOP_TSTAMP ((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | \
(TCPOPT_TSTAMP << 8) | 10)
#else
#define TCPOPT_NOP_NOP_TSTAMP ((10 << 24) | (TCPOPT_TSTAMP << 16) | \
(TCPOPT_NOP << 8) | TCPOPT_NOP)
#endif
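/*
 * On the wire the word above is the recommended option prefix
 * NOP (0x01), NOP (0x01), TSTAMP kind (0x08), length (10), so a timestamp
 * option can be recognized with a single 32-bit comparison against
 * TCPOPT_NOP_NOP_TSTAMP.
 */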
/*
* Flags returned from tcp_parse_options.
*/
#define TCP_OPT_MSS_PRESENT 1
#define TCP_OPT_WSCALE_PRESENT 2
#define TCP_OPT_TSTAMP_PRESENT 4
#define TCP_OPT_SACK_OK_PRESENT 8
#define TCP_OPT_SACK_PRESENT 16
/* TCP option length */
#define TCPOPT_NOP_LEN 1
#define TCPOPT_MAXSEG_LEN 4
#define TCPOPT_WS_LEN 3
#define TCPOPT_REAL_WS_LEN (TCPOPT_WS_LEN+1)
#define TCPOPT_TSTAMP_LEN 10
#define TCPOPT_REAL_TS_LEN (TCPOPT_TSTAMP_LEN+2)
#define TCPOPT_SACK_OK_LEN 2
#define TCPOPT_REAL_SACK_OK_LEN (TCPOPT_SACK_OK_LEN+2)
#define TCPOPT_REAL_SACK_LEN 4
#define TCPOPT_MAX_SACK_LEN 36
#define TCPOPT_HEADER_LEN 2
/* TCP cwnd burst factor. */
#define TCP_CWND_INFINITE 65535
#define TCP_CWND_SS 3
#define TCP_CWND_NORMAL 5
/* Maximum TCP initial cwin (start/restart). */
#define TCP_MAX_INIT_CWND 8
/*
* Initialize cwnd according to RFC 3390. def_max_init_cwnd is
* either tcp_slow_start_initial or tcp_slow_start_after_idle
* depending on the caller. If the upper layer has not used the
* TCP_INIT_CWND option to change the initial cwnd, tcp_init_cwnd
* should be 0 and we use the formula in RFC 3390 to set tcp_cwnd.
* If the upper layer has set tcp_init_cwnd, just use
* it to calculate the tcp_cwnd.
*/
#define SET_TCP_INIT_CWND(tcp, mss, def_max_init_cwnd) \
{ \
if ((tcp)->tcp_init_cwnd == 0) { \
(tcp)->tcp_cwnd = MIN(def_max_init_cwnd * (mss), \
MIN(4 * (mss), MAX(2 * (mss), 4380 / (mss) * (mss)))); \
} else { \
(tcp)->tcp_cwnd = (tcp)->tcp_init_cwnd * (mss); \
} \
tcp->tcp_cwnd_cnt = 0; \
}
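/*
 * Worked example of the RFC 3390 branch above, using mss = 1460 and
 * def_max_init_cwnd = 4 (values chosen only for illustration):
 *	MIN(4 * 1460, MIN(4 * 1460, MAX(2 * 1460, 4380 / 1460 * 1460)))
 *	= MIN(5840, MIN(5840, MAX(2920, 4380))) = 4380, i.e. 3 segments.
 * For mss = 536 the same formula yields MIN(4 * 536, 4288) = 2144,
 * i.e. 4 segments.
 */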
/* TCP Timer control structure */
typedef struct tcpt_s {
pfv_t tcpt_pfv; /* The routine we are to call */
tcp_t *tcpt_tcp; /* The parameter we are to pass in */
} tcpt_t;
/*
* Functions called directly via squeue having a prototype of edesc_t.
*/
void tcp_input_listener(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *ira);
static void tcp_wput_nondata(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
void tcp_accept_finish(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
static void tcp_wput_ioctl(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
static void tcp_wput_proto(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
void tcp_input_data(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *ira);
static void tcp_close_output(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
void tcp_output(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
void tcp_output_urgent(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
static void tcp_rsrv_input(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
static void tcp_timer_handler(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
static void tcp_linger_interrupted(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
/* Prototype for TCP functions */
static void tcp_random_init(void);
int tcp_random(void);
static void tcp_tli_accept(tcp_t *tcp, mblk_t *mp);
static void tcp_accept_swap(tcp_t *listener, tcp_t *acceptor,
tcp_t *eager);
static int tcp_set_destination(tcp_t *tcp);
static in_port_t tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
int reuseaddr, boolean_t quick_connect, boolean_t bind_to_req_port_only,
boolean_t user_specified);
static void tcp_closei_local(tcp_t *tcp);
static void tcp_close_detached(tcp_t *tcp);
static boolean_t tcp_conn_con(tcp_t *tcp, uchar_t *iphdr,
mblk_t *idmp, mblk_t **defermp, ip_recv_attr_t *ira);
static void tcp_tpi_connect(tcp_t *tcp, mblk_t *mp);
static int tcp_connect_ipv4(tcp_t *tcp, ipaddr_t *dstaddrp,
in_port_t dstport, uint_t srcid);
static int tcp_connect_ipv6(tcp_t *tcp, in6_addr_t *dstaddrp,
in_port_t dstport, uint32_t flowinfo,
uint_t srcid, uint32_t scope_id);
static int tcp_clean_death(tcp_t *tcp, int err, uint8_t tag);
static void tcp_disconnect(tcp_t *tcp, mblk_t *mp);
static char *tcp_display(tcp_t *tcp, char *, char);
static boolean_t tcp_eager_blowoff(tcp_t *listener, t_scalar_t seqnum);
static void tcp_eager_cleanup(tcp_t *listener, boolean_t q0_only);
static void tcp_eager_unlink(tcp_t *tcp);
static void tcp_err_ack(tcp_t *tcp, mblk_t *mp, int tlierr,
int unixerr);
static void tcp_err_ack_prim(tcp_t *tcp, mblk_t *mp, int primitive,
int tlierr, int unixerr);
static int tcp_extra_priv_ports_get(queue_t *q, mblk_t *mp, caddr_t cp,
cred_t *cr);
static int tcp_extra_priv_ports_add(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *cr);
static int tcp_extra_priv_ports_del(queue_t *q, mblk_t *mp,
char *value, caddr_t cp, cred_t *cr);
static int tcp_tpistate(tcp_t *tcp);
static void tcp_bind_hash_insert(tf_t *tf, tcp_t *tcp,
int caller_holds_lock);
static void tcp_bind_hash_remove(tcp_t *tcp);
static tcp_t *tcp_acceptor_hash_lookup(t_uscalar_t id, tcp_stack_t *);
void tcp_acceptor_hash_insert(t_uscalar_t id, tcp_t *tcp);
static void tcp_acceptor_hash_remove(tcp_t *tcp);
static void tcp_capability_req(tcp_t *tcp, mblk_t *mp);
static void tcp_info_req(tcp_t *tcp, mblk_t *mp);
static void tcp_addr_req(tcp_t *tcp, mblk_t *mp);
static void tcp_init_values(tcp_t *tcp);
static void tcp_ip_notify(tcp_t *tcp);
static void tcp_iss_init(tcp_t *tcp);
static void tcp_keepalive_killer(void *arg);
static int tcp_parse_options(tcpha_t *tcpha, tcp_opt_t *tcpopt);
static void tcp_mss_set(tcp_t *tcp, uint32_t size);
static int tcp_conprim_opt_process(tcp_t *tcp, mblk_t *mp,
int *do_disconnectp, int *t_errorp, int *sys_errorp);
static boolean_t tcp_allow_connopt_set(int level, int name);
int tcp_opt_default(queue_t *q, int level, int name, uchar_t *ptr);
static int tcp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr);
static boolean_t tcp_param_register(IDP *ndp, tcpparam_t *tcppa, int cnt,
tcp_stack_t *);
static int tcp_param_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static int tcp_param_set_aligned(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static void tcp_iss_key_init(uint8_t *phrase, int len, tcp_stack_t *);
static int tcp_1948_phrase_set(queue_t *q, mblk_t *mp, char *value,
caddr_t cp, cred_t *cr);
static void tcp_process_shrunk_swnd(tcp_t *tcp, uint32_t shrunk_cnt);
static void tcp_update_xmit_tail(tcp_t *tcp, uint32_t snxt);
static mblk_t *tcp_reass(tcp_t *tcp, mblk_t *mp, uint32_t start);
static void tcp_reass_elim_overlap(tcp_t *tcp, mblk_t *mp);
static void tcp_reinit(tcp_t *tcp);
static void tcp_reinit_values(tcp_t *tcp);
static uint_t tcp_rwnd_reopen(tcp_t *tcp);
static uint_t tcp_rcv_drain(tcp_t *tcp);
static void tcp_sack_rxmit(tcp_t *tcp, uint_t *flags);
static boolean_t tcp_send_rst_chk(tcp_stack_t *);
static void tcp_ss_rexmit(tcp_t *tcp);
static mblk_t *tcp_input_add_ancillary(tcp_t *tcp, mblk_t *mp, ip_pkt_t *ipp,
ip_recv_attr_t *);
static void tcp_process_options(tcp_t *, tcpha_t *);
static void tcp_rsrv(queue_t *q);
static int tcp_snmp_state(tcp_t *tcp);
static void tcp_timer(void *arg);
static void tcp_timer_callback(void *);
static in_port_t tcp_update_next_port(in_port_t port, const tcp_t *tcp,
boolean_t random);
static in_port_t tcp_get_next_priv_port(const tcp_t *);
static void tcp_wput_sock(queue_t *q, mblk_t *mp);
static void tcp_wput_fallback(queue_t *q, mblk_t *mp);
void tcp_tpi_accept(queue_t *q, mblk_t *mp);
static void tcp_wput_data(tcp_t *tcp, mblk_t *mp, boolean_t urgent);
static void tcp_wput_flush(tcp_t *tcp, mblk_t *mp);
static void tcp_wput_iocdata(tcp_t *tcp, mblk_t *mp);
static int tcp_send(tcp_t *tcp, const int mss,
const int total_hdr_len, const int tcp_hdr_len,
const int num_sack_blk, int *usable, uint_t *snxt,
int *tail_unsent, mblk_t **xmit_tail, mblk_t *local_time);
static void tcp_fill_header(tcp_t *tcp, uchar_t *rptr, clock_t now,
int num_sack_blk);
static void tcp_wsrv(queue_t *q);
static int tcp_xmit_end(tcp_t *tcp);
static void tcp_ack_timer(void *arg);
static mblk_t *tcp_ack_mp(tcp_t *tcp);
static void tcp_xmit_early_reset(char *str, mblk_t *mp,
uint32_t seq, uint32_t ack, int ctl, ip_recv_attr_t *,
ip_stack_t *, conn_t *);
static void tcp_xmit_ctl(char *str, tcp_t *tcp, uint32_t seq,
uint32_t ack, int ctl);
static void tcp_set_rto(tcp_t *, time_t);
static void tcp_icmp_input(void *, mblk_t *, void *, ip_recv_attr_t *);
static void tcp_icmp_error_ipv6(tcp_t *, mblk_t *, ip_recv_attr_t *);
static boolean_t tcp_verifyicmp(conn_t *, void *, icmph_t *, icmp6_t *,
ip_recv_attr_t *);
static int tcp_build_hdrs(tcp_t *);
static void tcp_time_wait_processing(tcp_t *tcp, mblk_t *mp,
uint32_t seg_seq, uint32_t seg_ack, int seg_len, tcpha_t *tcpha,
ip_recv_attr_t *ira);
boolean_t tcp_paws_check(tcp_t *tcp, tcpha_t *tcpha, tcp_opt_t *tcpoptp);
static boolean_t tcp_zcopy_check(tcp_t *);
static void tcp_zcopy_notify(tcp_t *);
static mblk_t *tcp_zcopy_backoff(tcp_t *, mblk_t *, boolean_t);
static void tcp_update_lso(tcp_t *tcp, ip_xmit_attr_t *ixa);
static void tcp_update_pmtu(tcp_t *tcp, boolean_t decrease_only);
static void tcp_update_zcopy(tcp_t *tcp);
static void tcp_notify(void *, ip_xmit_attr_t *, ixa_notify_type_t,
ixa_notify_arg_t);
static void tcp_rexmit_after_error(tcp_t *tcp);
static void tcp_send_data(tcp_t *, mblk_t *);
extern mblk_t *tcp_timermp_alloc(int);
extern void tcp_timermp_free(tcp_t *);
static void tcp_timer_free(tcp_t *tcp, mblk_t *mp);
static void tcp_stop_lingering(tcp_t *tcp);
static void tcp_close_linger_timeout(void *arg);
static void *tcp_stack_init(netstackid_t stackid, netstack_t *ns);
static void tcp_stack_fini(netstackid_t stackid, void *arg);
static void *tcp_g_kstat_init(tcp_g_stat_t *);
static void tcp_g_kstat_fini(kstat_t *);
static void *tcp_kstat_init(netstackid_t, tcp_stack_t *);
static void tcp_kstat_fini(netstackid_t, kstat_t *);
static void *tcp_kstat2_init(netstackid_t, tcp_stat_t *);
static void tcp_kstat2_fini(netstackid_t, kstat_t *);
static int tcp_kstat_update(kstat_t *kp, int rw);
static mblk_t *tcp_conn_create_v6(conn_t *lconnp, conn_t *connp, mblk_t *mp,
ip_recv_attr_t *ira);
static mblk_t *tcp_conn_create_v4(conn_t *lconnp, conn_t *connp, mblk_t *mp,
ip_recv_attr_t *ira);
static int tcp_squeue_switch(int);
static int tcp_open(queue_t *, dev_t *, int, int, cred_t *, boolean_t);
static int tcp_openv4(queue_t *, dev_t *, int, int, cred_t *);
static int tcp_openv6(queue_t *, dev_t *, int, int, cred_t *);
static int tcp_tpi_close(queue_t *, int);
static int tcp_tpi_close_accept(queue_t *);
static void tcp_squeue_add(squeue_t *);
static void tcp_setcred_data(mblk_t *, ip_recv_attr_t *);
extern void tcp_kssl_input(tcp_t *, mblk_t *, cred_t *);
void tcp_eager_kill(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy);
void tcp_clean_death_wrapper(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
static int tcp_accept(sock_lower_handle_t, sock_lower_handle_t,
sock_upper_handle_t, cred_t *);
static int tcp_listen(sock_lower_handle_t, int, cred_t *);
static int tcp_do_listen(conn_t *, struct sockaddr *, socklen_t, int, cred_t *,
boolean_t);
static int tcp_do_connect(conn_t *, const struct sockaddr *, socklen_t,
cred_t *, pid_t);
static int tcp_do_bind(conn_t *, struct sockaddr *, socklen_t, cred_t *,
boolean_t);
static int tcp_do_unbind(conn_t *);
static int tcp_bind_check(conn_t *, struct sockaddr *, socklen_t, cred_t *,
boolean_t);
static void tcp_ulp_newconn(conn_t *, conn_t *, mblk_t *);
/*
* Routines related to the TCP_IOC_ABORT_CONN ioctl command.
*
* TCP_IOC_ABORT_CONN is a non-transparent ioctl command used for aborting
* TCP connections. To invoke this ioctl, a tcp_ioc_abort_conn_t structure
* (defined in tcp.h) needs to be filled in and passed into the kernel
* via an I_STR ioctl command (see streamio(7I)). The tcp_ioc_abort_conn_t
* structure contains the four-tuple of a TCP connection and a range of TCP
* states (specified by ac_start and ac_end). The use of wildcard addresses
* and ports is allowed. Connections with a matching four tuple and a state
* within the specified range will be aborted. The valid states for the
* ac_start and ac_end fields are in the range TCPS_SYN_SENT to TCPS_TIME_WAIT,
* inclusive.
*
* An application which has its connection aborted by this ioctl will receive
* an error that is dependent on the connection state at the time of the abort.
* If the connection state is < TCPS_TIME_WAIT, an application should behave as
* though a RST packet has been received. If the connection state is equal to
* TCPS_TIME_WAIT, the 2MSL timeout will immediately be canceled by the kernel
* and all resources associated with the connection will be freed.
*/
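/*
 * Minimal user-level sketch of invoking the ioctl (illustrative only; the
 * complete layout of tcp_ioc_abort_conn_t is defined in tcp.h - the
 * four-tuple fields are simply left zeroed here to act as wildcards):
 *
 *	tcp_ioc_abort_conn_t conn;
 *	struct strioctl ioc;
 *
 *	bzero(&conn, sizeof (conn));
 *	conn.ac_start = TCPS_SYN_SENT;
 *	conn.ac_end = TCPS_TIME_WAIT;
 *	ioc.ic_cmd = TCP_IOC_ABORT_CONN;
 *	ioc.ic_timout = -1;
 *	ioc.ic_len = sizeof (conn);
 *	ioc.ic_dp = (char *)&conn;
 *	if (ioctl(tcp_fd, I_STR, &ioc) < 0)
 *		perror("TCP_IOC_ABORT_CONN");
 */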
static mblk_t *tcp_ioctl_abort_build_msg(tcp_ioc_abort_conn_t *, tcp_t *);
static void tcp_ioctl_abort_dump(tcp_ioc_abort_conn_t *);
static void tcp_ioctl_abort_handler(void *arg, mblk_t *mp, void *arg2,
ip_recv_attr_t *dummy);
static int tcp_ioctl_abort(tcp_ioc_abort_conn_t *, tcp_stack_t *tcps);
static void tcp_ioctl_abort_conn(queue_t *, mblk_t *);
static int tcp_ioctl_abort_bucket(tcp_ioc_abort_conn_t *, int, int *,
boolean_t, tcp_stack_t *);
static struct module_info tcp_rinfo = {
TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, TCP_RECV_HIWATER, TCP_RECV_LOWATER
};
static struct module_info tcp_winfo = {
TCP_MOD_ID, TCP_MOD_NAME, 0, INFPSZ, 127, 16
};
/*
* Entry points for TCP as a device. The normal case which supports
* the TCP functionality.
* We have separate open functions for the /dev/tcp and /dev/tcp6 devices.
*/
struct qinit tcp_rinitv4 = {
NULL, (pfi_t)tcp_rsrv, tcp_openv4, tcp_tpi_close, NULL, &tcp_rinfo
};
struct qinit tcp_rinitv6 = {
NULL, (pfi_t)tcp_rsrv, tcp_openv6, tcp_tpi_close, NULL, &tcp_rinfo
};
struct qinit tcp_winit = {
(pfi_t)tcp_wput, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};
/* Initial entry point for TCP in socket mode. */
struct qinit tcp_sock_winit = {
(pfi_t)tcp_wput_sock, (pfi_t)tcp_wsrv, NULL, NULL, NULL, &tcp_winfo
};
/* TCP entry point during fallback */
struct qinit tcp_fallback_sock_winit = {
(pfi_t)tcp_wput_fallback, NULL, NULL, NULL, NULL, &tcp_winfo
};
/*
* Entry points for TCP as an acceptor STREAM opened by sockfs when doing
* an accept. Avoid allocating data structures since eager has already
* been created.
*/
struct qinit tcp_acceptor_rinit = {
NULL, (pfi_t)tcp_rsrv, NULL, tcp_tpi_close_accept, NULL, &tcp_winfo
};
struct qinit tcp_acceptor_winit = {
(pfi_t)tcp_tpi_accept, NULL, NULL, NULL, NULL, &tcp_winfo
};
/* For AF_INET aka /dev/tcp */
struct streamtab tcpinfov4 = {
&tcp_rinitv4, &tcp_winit
};
/* For AF_INET6 aka /dev/tcp6 */
struct streamtab tcpinfov6 = {
&tcp_rinitv6, &tcp_winit
};
sock_downcalls_t sock_tcp_downcalls;
/* Settable only in /etc/system. Move to ndd? */
boolean_t tcp_icmp_source_quench = B_FALSE;
/*
* The following assumes TPI alignment requirements stay along 32-bit
* boundaries.
*/
#define ROUNDUP32(x) \
(((x) + (sizeof (int32_t) - 1)) & ~(sizeof (int32_t) - 1))
/* Template for response to info request. */
static struct T_info_ack tcp_g_t_info_ack = {
T_INFO_ACK, /* PRIM_type */
0, /* TSDU_size */
T_INFINITE, /* ETSDU_size */
T_INVALID, /* CDATA_size */
T_INVALID, /* DDATA_size */
sizeof (sin_t), /* ADDR_size */
0, /* OPT_size - not initialized here */
TIDUSZ, /* TIDU_size */
T_COTS_ORD, /* SERV_type */
TCPS_IDLE, /* CURRENT_state */
(XPG4_1|EXPINLINE) /* PROVIDER_flag */
};
static struct T_info_ack tcp_g_t_info_ack_v6 = {
T_INFO_ACK, /* PRIM_type */
0, /* TSDU_size */
T_INFINITE, /* ETSDU_size */
T_INVALID, /* CDATA_size */
T_INVALID, /* DDATA_size */
sizeof (sin6_t), /* ADDR_size */
0, /* OPT_size - not initialized here */
TIDUSZ, /* TIDU_size */
T_COTS_ORD, /* SERV_type */
TCPS_IDLE, /* CURRENT_state */
(XPG4_1|EXPINLINE) /* PROVIDER_flag */
};
#define MS 1L
#define SECONDS (1000 * MS)
#define MINUTES (60 * SECONDS)
#define HOURS (60 * MINUTES)
#define DAYS (24 * HOURS)
#define PARAM_MAX (~(uint32_t)0)
/* Max size IP datagram is 64k - 1 */
#define TCP_MSS_MAX_IPV4 (IP_MAXPACKET - (sizeof (ipha_t) + sizeof (tcpha_t)))
#define TCP_MSS_MAX_IPV6 (IP_MAXPACKET - (sizeof (ip6_t) + sizeof (tcpha_t)))
/* Max of the above */
#define TCP_MSS_MAX TCP_MSS_MAX_IPV4
/* Largest TCP port number */
#define TCP_MAX_PORT (64 * 1024 - 1)
/*
* tcp_wroff_xtra is the extra space in front of the TCP/IP header for the
* link-layer header. It has to be a multiple of 4.
*/
static tcpparam_t lcl_tcp_wroff_xtra_param = { 0, 256, 32, "tcp_wroff_xtra" };
#define tcps_wroff_xtra tcps_wroff_xtra_param->tcp_param_val
/*
* All of these are alterable, within the min/max values given, at run time.
* Note that the default value of "tcp_time_wait_interval" is four minutes,
* per the TCP spec.
*/
/* BEGIN CSTYLED */
static tcpparam_t lcl_tcp_param_arr[] = {
/*min max value name */
{ 1*SECONDS, 10*MINUTES, 1*MINUTES, "tcp_time_wait_interval"},
{ 1, PARAM_MAX, 128, "tcp_conn_req_max_q" },
{ 0, PARAM_MAX, 1024, "tcp_conn_req_max_q0" },
{ 1, 1024, 1, "tcp_conn_req_min" },
{ 0*MS, 20*SECONDS, 0*MS, "tcp_conn_grace_period" },
{ 128, (1<<30), 1024*1024, "tcp_cwnd_max" },
{ 0, 10, 0, "tcp_debug" },
{ 1024, (32*1024), 1024, "tcp_smallest_nonpriv_port"},
{ 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_cinterval"},
{ 1*SECONDS, PARAM_MAX, 3*MINUTES, "tcp_ip_abort_linterval"},
{ 500*MS, PARAM_MAX, 8*MINUTES, "tcp_ip_abort_interval"},
{ 1*SECONDS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_cinterval"},
{ 500*MS, PARAM_MAX, 10*SECONDS, "tcp_ip_notify_interval"},
{ 1, 255, 64, "tcp_ipv4_ttl"},
{ 10*SECONDS, 10*DAYS, 2*HOURS, "tcp_keepalive_interval"},
{ 0, 100, 10, "tcp_maxpsz_multiplier" },
{ 1, TCP_MSS_MAX_IPV4, 536, "tcp_mss_def_ipv4"},
{ 1, TCP_MSS_MAX_IPV4, TCP_MSS_MAX_IPV4, "tcp_mss_max_ipv4"},
{ 1, TCP_MSS_MAX, 108, "tcp_mss_min"},
{ 1, (64*1024)-1, (4*1024)-1, "tcp_naglim_def"},
{ 1*MS, 20*SECONDS, 3*SECONDS, "tcp_rexmit_interval_initial"},
{ 1*MS, 2*HOURS, 60*SECONDS, "tcp_rexmit_interval_max"},
{ 1*MS, 2*HOURS, 400*MS, "tcp_rexmit_interval_min"},
{ 1*MS, 1*MINUTES, 100*MS, "tcp_deferred_ack_interval" },
{ 0, 16, 0, "tcp_snd_lowat_fraction" },
{ 0, 128000, 0, "tcp_sth_rcv_hiwat" },
{ 0, 128000, 0, "tcp_sth_rcv_lowat" },
{ 1, 10000, 3, "tcp_dupack_fast_retransmit" },
{ 0, 1, 0, "tcp_ignore_path_mtu" },
{ 1024, TCP_MAX_PORT, 32*1024, "tcp_smallest_anon_port"},
{ 1024, TCP_MAX_PORT, TCP_MAX_PORT, "tcp_largest_anon_port"},
{ TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_HIWATER,"tcp_xmit_hiwat"},
{ TCP_XMIT_LOWATER, (1<<30), TCP_XMIT_LOWATER,"tcp_xmit_lowat"},
{ TCP_RECV_LOWATER, (1<<30), TCP_RECV_HIWATER,"tcp_recv_hiwat"},
{ 1, 65536, 4, "tcp_recv_hiwat_minmss"},
{ 1*SECONDS, PARAM_MAX, 675*SECONDS, "tcp_fin_wait_2_flush_interval"},
{ 8192, (1<<30), 1024*1024, "tcp_max_buf"},
/*
* Question: What default value should I set for tcp_strong_iss?
*/
{ 0, 2, 1, "tcp_strong_iss"},
{ 0, 65536, 20, "tcp_rtt_updates"},
{ 0, 1, 1, "tcp_wscale_always"},
{ 0, 1, 0, "tcp_tstamp_always"},
{ 0, 1, 1, "tcp_tstamp_if_wscale"},
{ 0*MS, 2*HOURS, 0*MS, "tcp_rexmit_interval_extra"},
{ 0, 16, 2, "tcp_deferred_acks_max"},
{ 1, 16384, 4, "tcp_slow_start_after_idle"},
{ 1, 4, 4, "tcp_slow_start_initial"},
{ 0, 2, 2, "tcp_sack_permitted"},
{ 0, 1, 1, "tcp_compression_enabled"},
{ 0, IPV6_MAX_HOPS, IPV6_DEFAULT_HOPS, "tcp_ipv6_hoplimit"},
{ 1, TCP_MSS_MAX_IPV6, 1220, "tcp_mss_def_ipv6"},
{ 1, TCP_MSS_MAX_IPV6, TCP_MSS_MAX_IPV6, "tcp_mss_max_ipv6"},
{ 0, 1, 0, "tcp_rev_src_routes"},
{ 10*MS, 500*MS, 50*MS, "tcp_local_dack_interval"},
{ 0, 16, 8, "tcp_local_dacks_max"},
{ 0, 2, 1, "tcp_ecn_permitted"},
{ 0, 1, 1, "tcp_rst_sent_rate_enabled"},
{ 0, PARAM_MAX, 40, "tcp_rst_sent_rate"},
{ 0, 100*MS, 50*MS, "tcp_push_timer_interval"},
{ 0, 1, 0, "tcp_use_smss_as_mss_opt"},
{ 0, PARAM_MAX, 8*MINUTES, "tcp_keepalive_abort_interval"},
{ 0, 1, 0, "tcp_dev_flow_ctl"},
};
/* END CSTYLED */
/* Round up the value to the nearest mss. */
#define MSS_ROUNDUP(value, mss) ((((value) - 1) / (mss) + 1) * (mss))
/*
* Set ECN capable transport (ECT) code point in IP header.
*
* Note that there are 2 ECT code points '01' and '10', which are called
* ECT(1) and ECT(0) respectively. Here we follow the original ECT code
* point ECT(0) for TCP as described in RFC 2481.
*/
#define SET_ECT(tcp, iph) \
if ((tcp)->tcp_connp->conn_ipversion == IPV4_VERSION) { \
/* We need to clear the code point first. */ \
((ipha_t *)(iph))->ipha_type_of_service &= 0xFC; \
((ipha_t *)(iph))->ipha_type_of_service |= IPH_ECN_ECT0; \
} else { \
((ip6_t *)(iph))->ip6_vcf &= htonl(0xFFCFFFFF); \
((ip6_t *)(iph))->ip6_vcf |= htonl(IPH_ECN_ECT0 << 20); \
}
/*
* The format argument to pass to tcp_display().
* DISP_PORT_ONLY means that the returned string has only port info.
* DISP_ADDR_AND_PORT means that the returned string also contains the
* remote and local IP address.
*/
#define DISP_PORT_ONLY 1
#define DISP_ADDR_AND_PORT 2
#define IS_VMLOANED_MBLK(mp) \
(((mp)->b_datap->db_struioflag & STRUIO_ZC) != 0)
uint32_t do_tcpzcopy = 1; /* 0: disable, 1: enable, 2: force */
/*
* Forces all connections to obey the value of the tcps_maxpsz_multiplier
* tunable settable via NDD. Otherwise, the per-connection behavior is
* determined dynamically during tcp_set_destination(), which is the default.
*/
boolean_t tcp_static_maxpsz = B_FALSE;
/* Settable in /etc/system */
/* If set to 0, pick ephemeral port sequentially; otherwise randomly. */
uint32_t tcp_random_anon_port = 1;
/*
* To reach an eager in Q0 which can be dropped due to an incoming
* new SYN request when Q0 is full, a new doubly linked list is
* introduced. This list allows an eager to be selected from Q0 in O(1) time.
* This is needed to avoid spending too much time walking through the
* long list of eagers in Q0 when tcp_drop_q0() is called. Each member of
* this new list has to be a member of Q0.
* This list is headed by the listener's tcp_t. When the list is empty,
* both pointers - tcp_eager_next_drop_q0 and tcp_eager_prev_drop_q0 -
* of the listener's tcp_t point to the listener's tcp_t itself.
*
* Given an eager in Q0 and a listener, MAKE_DROPPABLE() puts the eager
* in the list. MAKE_UNDROPPABLE() takes the eager out of the list.
* These macros do not affect the eager's membership to Q0.
*/
#define MAKE_DROPPABLE(listener, eager) \
if ((eager)->tcp_eager_next_drop_q0 == NULL) { \
(listener)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0\
= (eager); \
(eager)->tcp_eager_prev_drop_q0 = (listener); \
(eager)->tcp_eager_next_drop_q0 = \
(listener)->tcp_eager_next_drop_q0; \
(listener)->tcp_eager_next_drop_q0 = (eager); \
}
#define MAKE_UNDROPPABLE(eager) \
if ((eager)->tcp_eager_next_drop_q0 != NULL) { \
(eager)->tcp_eager_next_drop_q0->tcp_eager_prev_drop_q0 \
= (eager)->tcp_eager_prev_drop_q0; \
(eager)->tcp_eager_prev_drop_q0->tcp_eager_next_drop_q0 \
= (eager)->tcp_eager_next_drop_q0; \
(eager)->tcp_eager_prev_drop_q0 = NULL; \
(eager)->tcp_eager_next_drop_q0 = NULL; \
}
/*
* If tcp_drop_ack_unsent_cnt is greater than 0, when TCP receives more
* than tcp_drop_ack_unsent_cnt number of ACKs which acknowledge unsent
* data, TCP will not respond with an ACK. RFC 793 requires that
* TCP respond with an ACK to such a bogus ACK. By not following
* the RFC, we prevent TCP from getting into an ACK storm if somehow
* an attacker successfully spoofs an acceptable segment to our
* peer; or when our peer is "confused."
*/
uint32_t tcp_drop_ack_unsent_cnt = 10;
/*
* Hook functions to enable cluster networking
* On non-clustered systems these vectors must always be NULL.
*/
void (*cl_inet_listen)(netstackid_t stack_id, uint8_t protocol,
sa_family_t addr_family, uint8_t *laddrp,
in_port_t lport, void *args) = NULL;
void (*cl_inet_unlisten)(netstackid_t stack_id, uint8_t protocol,
sa_family_t addr_family, uint8_t *laddrp,
in_port_t lport, void *args) = NULL;
int (*cl_inet_connect2)(netstackid_t stack_id, uint8_t protocol,
boolean_t is_outgoing,
sa_family_t addr_family,
uint8_t *laddrp, in_port_t lport,
uint8_t *faddrp, in_port_t fport,
void *args) = NULL;
void (*cl_inet_disconnect)(netstackid_t stack_id, uint8_t protocol,
sa_family_t addr_family, uint8_t *laddrp,
in_port_t lport, uint8_t *faddrp,
in_port_t fport, void *args) = NULL;
/*
* CL_INET_CONNECT(connp, is_outgoing, err)
*/
#define CL_INET_CONNECT(connp, is_outgoing, err) { \
(err) = 0; \
if (cl_inet_connect2 != NULL) { \
/* \
* Running in cluster mode - register active connection \
* information \
*/ \
if ((connp)->conn_ipversion == IPV4_VERSION) { \
if ((connp)->conn_laddr_v4 != 0) { \
(err) = (*cl_inet_connect2)( \
(connp)->conn_netstack->netstack_stackid,\
IPPROTO_TCP, is_outgoing, AF_INET, \
(uint8_t *)(&((connp)->conn_laddr_v4)),\
(in_port_t)(connp)->conn_lport, \
(uint8_t *)(&((connp)->conn_faddr_v4)),\
(in_port_t)(connp)->conn_fport, NULL); \
} \
} else { \
if (!IN6_IS_ADDR_UNSPECIFIED( \
&(connp)->conn_laddr_v6)) { \
(err) = (*cl_inet_connect2)( \
(connp)->conn_netstack->netstack_stackid,\
IPPROTO_TCP, is_outgoing, AF_INET6, \
(uint8_t *)(&((connp)->conn_laddr_v6)),\
(in_port_t)(connp)->conn_lport, \
(uint8_t *)(&((connp)->conn_faddr_v6)), \
(in_port_t)(connp)->conn_fport, NULL); \
} \
} \
} \
}
#define CL_INET_DISCONNECT(connp) { \
if (cl_inet_disconnect != NULL) { \
/* \
* Running in cluster mode - deregister active \
* connection information \
*/ \
if ((connp)->conn_ipversion == IPV4_VERSION) { \
if ((connp)->conn_laddr_v4 != 0) { \
(*cl_inet_disconnect)( \
(connp)->conn_netstack->netstack_stackid,\
IPPROTO_TCP, AF_INET, \
(uint8_t *)(&((connp)->conn_laddr_v4)),\
(in_port_t)(connp)->conn_lport, \
(uint8_t *)(&((connp)->conn_faddr_v4)),\
(in_port_t)(connp)->conn_fport, NULL); \
} \
} else { \
if (!IN6_IS_ADDR_UNSPECIFIED( \
&(connp)->conn_laddr_v6)) { \
(*cl_inet_disconnect)( \
(connp)->conn_netstack->netstack_stackid,\
IPPROTO_TCP, AF_INET6, \
(uint8_t *)(&((connp)->conn_laddr_v6)),\
(in_port_t)(connp)->conn_lport, \
(uint8_t *)(&((connp)->conn_faddr_v6)), \
(in_port_t)(connp)->conn_fport, NULL); \
} \
} \
} \
}
/*
* Cluster networking hook for traversing current connection list.
* This routine is used to extract the current list of live connections
* which must continue to be dispatched to this node.
*/
int cl_tcp_walk_list(netstackid_t stack_id,
int (*callback)(cl_tcp_info_t *, void *), void *arg);
static int cl_tcp_walk_list_stack(int (*callback)(cl_tcp_info_t *, void *),
void *arg, tcp_stack_t *tcps);
static void
tcp_set_recv_threshold(tcp_t *tcp, uint32_t new_rcvthresh)
{
uint32_t default_threshold = SOCKET_RECVHIWATER >> 3;
if (IPCL_IS_NONSTR(tcp->tcp_connp)) {
conn_t *connp = tcp->tcp_connp;
struct sock_proto_props sopp;
/*
* Only increase rcvthresh up to default_threshold.
*/
if (new_rcvthresh > default_threshold)
new_rcvthresh = default_threshold;
sopp.sopp_flags = SOCKOPT_RCVTHRESH;
sopp.sopp_rcvthresh = new_rcvthresh;
(*connp->conn_upcalls->su_set_proto_props)
(connp->conn_upper_handle, &sopp);
}
}
/*
* Figure out the value of the window scale option. Note that the rwnd is
* ASSUMED to be rounded up to the nearest MSS before the calculation.
* We cannot find the scale value and then do a round up of tcp_rwnd
* because the scale value may not be correct after that.
*
* Set the compiler flag to make this function inline.
*/
static void
tcp_set_ws_value(tcp_t *tcp)
{
int i;
uint32_t rwnd = tcp->tcp_rwnd;
for (i = 0; rwnd > TCP_MAXWIN && i < TCP_MAX_WINSHIFT;
i++, rwnd >>= 1)
;
tcp->tcp_rcv_ws = i;
}
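/*
 * For example, with tcp_rwnd = 1048576 (1 MB) the loop above shifts five
 * times before the window fits in 16 bits (1048576 >> 5 = 32768 <= TCP_MAXWIN),
 * so tcp_rcv_ws becomes 5 and the peer scales the advertised 16-bit window
 * back up by that shift.
 */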
/*
* Remove a connection from the list of detached TIME_WAIT connections.
* It returns B_FALSE if it can't remove the connection from the list
* as the connection has already been removed from the list due to an
* earlier call to tcp_time_wait_remove(); otherwise it returns B_TRUE.
*/
static boolean_t
tcp_time_wait_remove(tcp_t *tcp, tcp_squeue_priv_t *tcp_time_wait)
{
boolean_t locked = B_FALSE;
if (tcp_time_wait == NULL) {
tcp_time_wait = *((tcp_squeue_priv_t **)
squeue_getprivate(tcp->tcp_connp->conn_sqp, SQPRIVATE_TCP));
mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
locked = B_TRUE;
} else {
ASSERT(MUTEX_HELD(&tcp_time_wait->tcp_time_wait_lock));
}
if (tcp->tcp_time_wait_expire == 0) {
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
if (locked)
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
return (B_FALSE);
}
ASSERT(TCP_IS_DETACHED(tcp));
ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
if (tcp == tcp_time_wait->tcp_time_wait_head) {
ASSERT(tcp->tcp_time_wait_prev == NULL);
tcp_time_wait->tcp_time_wait_head = tcp->tcp_time_wait_next;
if (tcp_time_wait->tcp_time_wait_head != NULL) {
tcp_time_wait->tcp_time_wait_head->tcp_time_wait_prev =
NULL;
} else {
tcp_time_wait->tcp_time_wait_tail = NULL;
}
} else if (tcp == tcp_time_wait->tcp_time_wait_tail) {
ASSERT(tcp != tcp_time_wait->tcp_time_wait_head);
ASSERT(tcp->tcp_time_wait_next == NULL);
tcp_time_wait->tcp_time_wait_tail = tcp->tcp_time_wait_prev;
ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = NULL;
} else {
ASSERT(tcp->tcp_time_wait_prev->tcp_time_wait_next == tcp);
ASSERT(tcp->tcp_time_wait_next->tcp_time_wait_prev == tcp);
tcp->tcp_time_wait_prev->tcp_time_wait_next =
tcp->tcp_time_wait_next;
tcp->tcp_time_wait_next->tcp_time_wait_prev =
tcp->tcp_time_wait_prev;
}
tcp->tcp_time_wait_next = NULL;
tcp->tcp_time_wait_prev = NULL;
tcp->tcp_time_wait_expire = 0;
if (locked)
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
return (B_TRUE);
}
/*
* Add a connection to the list of detached TIME_WAIT connections
* and set its time to expire.
*/
static void
tcp_time_wait_append(tcp_t *tcp)
{
tcp_stack_t *tcps = tcp->tcp_tcps;
tcp_squeue_priv_t *tcp_time_wait =
*((tcp_squeue_priv_t **)squeue_getprivate(tcp->tcp_connp->conn_sqp,
SQPRIVATE_TCP));
tcp_timers_stop(tcp);
/* Freed above */
ASSERT(tcp->tcp_timer_tid == 0);
ASSERT(tcp->tcp_ack_tid == 0);
/* must have happened at the time of detaching the tcp */
ASSERT(tcp->tcp_ptpahn == NULL);
ASSERT(tcp->tcp_flow_stopped == 0);
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
ASSERT(tcp->tcp_time_wait_expire == 0);
ASSERT(tcp->tcp_listener == NULL);
tcp->tcp_time_wait_expire = ddi_get_lbolt();
/*
* The value computed below in tcp->tcp_time_wait_expire may
* appear negative or wrap around. That is ok since our
* interest is only in the difference between the current lbolt
* value and tcp->tcp_time_wait_expire. But the value should not
* be zero, since it means the tcp is not in the TIME_WAIT list.
* The corresponding comparison in tcp_time_wait_collector() uses
* modular arithmetic.
*/
tcp->tcp_time_wait_expire +=
drv_usectohz(tcps->tcps_time_wait_interval * 1000);
if (tcp->tcp_time_wait_expire == 0)
tcp->tcp_time_wait_expire = 1;
ASSERT(TCP_IS_DETACHED(tcp));
ASSERT(tcp->tcp_state == TCPS_TIME_WAIT);
ASSERT(tcp->tcp_time_wait_next == NULL);
ASSERT(tcp->tcp_time_wait_prev == NULL);
TCP_DBGSTAT(tcps, tcp_time_wait);
mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
if (tcp_time_wait->tcp_time_wait_head == NULL) {
ASSERT(tcp_time_wait->tcp_time_wait_tail == NULL);
tcp_time_wait->tcp_time_wait_head = tcp;
} else {
ASSERT(tcp_time_wait->tcp_time_wait_tail != NULL);
ASSERT(tcp_time_wait->tcp_time_wait_tail->tcp_state ==
TCPS_TIME_WAIT);
tcp_time_wait->tcp_time_wait_tail->tcp_time_wait_next = tcp;
tcp->tcp_time_wait_prev = tcp_time_wait->tcp_time_wait_tail;
}
tcp_time_wait->tcp_time_wait_tail = tcp;
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}
/* ARGSUSED */
void
tcp_timewait_output(void *arg, mblk_t *mp, void *arg2, ip_recv_attr_t *dummy)
{
conn_t *connp = (conn_t *)arg;
tcp_t *tcp = connp->conn_tcp;
tcp_stack_t *tcps = tcp->tcp_tcps;
ASSERT(tcp != NULL);
if (tcp->tcp_state == TCPS_CLOSED) {
return;
}
ASSERT((connp->conn_family == AF_INET &&
connp->conn_ipversion == IPV4_VERSION) ||
(connp->conn_family == AF_INET6 &&
(connp->conn_ipversion == IPV4_VERSION ||
connp->conn_ipversion == IPV6_VERSION)));
ASSERT(!tcp->tcp_listener);
TCP_STAT(tcps, tcp_time_wait_reap);
ASSERT(TCP_IS_DETACHED(tcp));
/*
* Because they have no upstream client to rebind or tcp_close()
* them later, we axe the connection here and now.
*/
tcp_close_detached(tcp);
}
/*
* Remove cached/latched IPsec references.
*/
void
tcp_ipsec_cleanup(tcp_t *tcp)
{
conn_t *connp = tcp->tcp_connp;
ASSERT(connp->conn_flags & IPCL_TCPCONN);
if (connp->conn_latch != NULL) {
IPLATCH_REFRELE(connp->conn_latch);
connp->conn_latch = NULL;
}
if (connp->conn_latch_in_policy != NULL) {
IPPOL_REFRELE(connp->conn_latch_in_policy);
connp->conn_latch_in_policy = NULL;
}
if (connp->conn_latch_in_action != NULL) {
IPACT_REFRELE(connp->conn_latch_in_action);
connp->conn_latch_in_action = NULL;
}
if (connp->conn_policy != NULL) {
IPPH_REFRELE(connp->conn_policy, connp->conn_netstack);
connp->conn_policy = NULL;
}
}
/*
* Cleanup before placing on the free list.
* Disassociate from the netstack/tcp_stack_t since the freelist
* is per squeue and not per netstack.
*/
void
tcp_cleanup(tcp_t *tcp)
{
mblk_t *mp;
tcp_sack_info_t *tcp_sack_info;
conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
netstack_t *ns = tcps->tcps_netstack;
mblk_t *tcp_rsrv_mp;
tcp_bind_hash_remove(tcp);
/* Cleanup that which needs the netstack first */
tcp_ipsec_cleanup(tcp);
ixa_cleanup(connp->conn_ixa);
if (connp->conn_ht_iphc != NULL) {
kmem_free(connp->conn_ht_iphc, connp->conn_ht_iphc_allocated);
connp->conn_ht_iphc = NULL;
connp->conn_ht_iphc_allocated = 0;
connp->conn_ht_iphc_len = 0;
connp->conn_ht_ulp = NULL;
connp->conn_ht_ulp_len = 0;
tcp->tcp_ipha = NULL;
tcp->tcp_ip6h = NULL;
tcp->tcp_tcpha = NULL;
}
/* We clear any IP_OPTIONS and extension headers */
ip_pkt_free(&connp->conn_xmit_ipp);
tcp_free(tcp);
/* Release any SSL context */
if (tcp->tcp_kssl_ent != NULL) {
kssl_release_ent(tcp->tcp_kssl_ent, NULL, KSSL_NO_PROXY);
tcp->tcp_kssl_ent = NULL;
}
if (tcp->tcp_kssl_ctx != NULL) {
kssl_release_ctx(tcp->tcp_kssl_ctx);
tcp->tcp_kssl_ctx = NULL;
}
tcp->tcp_kssl_pending = B_FALSE;
/*
* Since we will bzero the entire structure, we need to
* remove it and reinsert it in the global hash list. We
* know the walkers can't get to this conn because we
* set the CONDEMNED flag earlier and checked the reference
* under conn_lock, so a walker won't pick it up, and once we
* do the ipcl_globalhash_remove() below, no walker
* can get to it.
*/
ipcl_globalhash_remove(connp);
/* Save some state */
mp = tcp->tcp_timercache;
tcp_sack_info = tcp->tcp_sack_info;
tcp_rsrv_mp = tcp->tcp_rsrv_mp;
if (connp->conn_cred != NULL) {
crfree(connp->conn_cred);
connp->conn_cred = NULL;
}
ipcl_conn_cleanup(connp);
connp->conn_flags = IPCL_TCPCONN;
/*
* Now it is safe to decrement the reference counts.
* This might be the last reference on the netstack
* in which case it will cause the freeing of the IP Instance.
*/
connp->conn_netstack = NULL;
connp->conn_ixa->ixa_ipst = NULL;
netstack_rele(ns);
ASSERT(tcps != NULL);
tcp->tcp_tcps = NULL;
bzero(tcp, sizeof (tcp_t));
/* restore the state */
tcp->tcp_timercache = mp;
tcp->tcp_sack_info = tcp_sack_info;
tcp->tcp_rsrv_mp = tcp_rsrv_mp;
tcp->tcp_connp = connp;
ASSERT(connp->conn_tcp == tcp);
ASSERT(connp->conn_flags & IPCL_TCPCONN);
connp->conn_state_flags = CONN_INCIPIENT;
ASSERT(connp->conn_proto == IPPROTO_TCP);
ASSERT(connp->conn_ref == 1);
}
/*
* Blows away all tcps whose TIME_WAIT has expired. List traversal
* is done forwards from the head.
* This walks all stack instances since
* tcp_time_wait remains global across all stacks.
*/
/* ARGSUSED */
void
tcp_time_wait_collector(void *arg)
{
tcp_t *tcp;
clock_t now;
mblk_t *mp;
conn_t *connp;
kmutex_t *lock;
boolean_t removed;
squeue_t *sqp = (squeue_t *)arg;
tcp_squeue_priv_t *tcp_time_wait =
*((tcp_squeue_priv_t **)squeue_getprivate(sqp, SQPRIVATE_TCP));
mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
tcp_time_wait->tcp_time_wait_tid = 0;
if (tcp_time_wait->tcp_free_list != NULL &&
tcp_time_wait->tcp_free_list->tcp_in_free_list == B_TRUE) {
TCP_G_STAT(tcp_freelist_cleanup);
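/*
* Drain the per-squeue free list; each cached tcp on it still holds
* a conn_t reference, which is dropped here.
*/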
while ((tcp = tcp_time_wait->tcp_free_list) != NULL) {
tcp_time_wait->tcp_free_list = tcp->tcp_time_wait_next;
tcp->tcp_time_wait_next = NULL;
tcp_time_wait->tcp_free_list_cnt--;
ASSERT(tcp->tcp_tcps == NULL);
CONN_DEC_REF(tcp->tcp_connp);
}
ASSERT(tcp_time_wait->tcp_free_list_cnt == 0);
}
/*
* In order to reap time waits reliably, we should use a
* source of time that is not adjustable by the user -- hence
* the call to ddi_get_lbolt().
*/
now = ddi_get_lbolt();
while ((tcp = tcp_time_wait->tcp_time_wait_head) != NULL) {
/*
* Compare times using modular arithmetic, since
* lbolt can wrap around.
*/
if ((now - tcp->tcp_time_wait_expire) < 0) {
break;
}
removed = tcp_time_wait_remove(tcp, tcp_time_wait);
ASSERT(removed);
connp = tcp->tcp_connp;
ASSERT(connp->conn_fanout != NULL);
lock = &connp->conn_fanout->connf_lock;
/*
* This is essentially a TW reclaim fast path optimization for
* performance where the timewait collector checks under the
* fanout lock (so that no one else can get access to the
* conn_t) that the refcnt is 2 i.e. one for TCP and one for
* the classifier hash list. If ref count is indeed 2, we can
* just remove the conn under the fanout lock and avoid
* cleaning up the conn under the squeue, provided that
* clustering callbacks are not enabled. If clustering is
* enabled, we need to make the clustering callback before
* setting the CONDEMNED flag and after dropping all locks and
* so we forego this optimization and fall back to the slow
* path. Also please see the comments in tcp_closei_local
* regarding the refcnt logic.
*
* Since we are holding the tcp_time_wait_lock, it's better
* not to block on the fanout_lock because other connections
* can't add themselves to the time_wait list. So we do a
* tryenter instead of mutex_enter.
*/
if (mutex_tryenter(lock)) {
mutex_enter(&connp->conn_lock);
if ((connp->conn_ref == 2) &&
(cl_inet_disconnect == NULL)) {
ipcl_hash_remove_locked(connp,
connp->conn_fanout);
/*
* Set the CONDEMNED flag right away so that
* the refcnt cannot increase due to any
* walker.
*/
connp->conn_state_flags |= CONN_CONDEMNED;
mutex_exit(lock);
mutex_exit(&connp->conn_lock);
if (tcp_time_wait->tcp_free_list_cnt <
tcp_free_list_max_cnt) {
/* Add to head of tcp_free_list */
mutex_exit(
&tcp_time_wait->tcp_time_wait_lock);
tcp_cleanup(tcp);
ASSERT(connp->conn_latch == NULL);
ASSERT(connp->conn_policy == NULL);
ASSERT(tcp->tcp_tcps == NULL);
ASSERT(connp->conn_netstack == NULL);
mutex_enter(
&tcp_time_wait->tcp_time_wait_lock);
tcp->tcp_time_wait_next =
tcp_time_wait->tcp_free_list;
tcp_time_wait->tcp_free_list = tcp;
tcp_time_wait->tcp_free_list_cnt++;
continue;
} else {
/* Do not add to tcp_free_list */
mutex_exit(
&tcp_time_wait->tcp_time_wait_lock);
tcp_bind_hash_remove(tcp);
ixa_cleanup(tcp->tcp_connp->conn_ixa);
tcp_ipsec_cleanup(tcp);
CONN_DEC_REF(tcp->tcp_connp);
}
} else {
CONN_INC_REF_LOCKED(connp);
mutex_exit(lock);
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
mutex_exit(&connp->conn_lock);
/*
* We can reuse the closemp here since the conn has
* detached (otherwise we wouldn't even be on the
* time_wait list). tcp_closemp_used can safely
* be changed without taking a lock as no other
* thread can concurrently access it at this
* point in the connection lifecycle.
*/
if (tcp->tcp_closemp.b_prev == NULL)
tcp->tcp_closemp_used = B_TRUE;
else
cmn_err(CE_PANIC,
"tcp_timewait_collector: "
"concurrent use of tcp_closemp: "
"connp %p tcp %p\n", (void *)connp,
(void *)tcp);
TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
mp = &tcp->tcp_closemp;
SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
tcp_timewait_output, connp, NULL,
SQ_FILL, SQTAG_TCP_TIMEWAIT);
}
} else {
mutex_enter(&connp->conn_lock);
CONN_INC_REF_LOCKED(connp);
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
mutex_exit(&connp->conn_lock);
/*
* We can reuse the closemp here since the conn has
* detached (otherwise we wouldn't even be on the
* time_wait list). tcp_closemp_used can safely
* be changed without taking a lock as no other
* thread can concurrently access it at this
* point in the connection lifecycle.
*/
if (tcp->tcp_closemp.b_prev == NULL)
tcp->tcp_closemp_used = B_TRUE;
else
cmn_err(CE_PANIC, "tcp_timewait_collector: "
"concurrent use of tcp_closemp: "
"connp %p tcp %p\n", (void *)connp,
(void *)tcp);
TCP_DEBUG_GETPCSTACK(tcp->tcmp_stk, 15);
mp = &tcp->tcp_closemp;
SQUEUE_ENTER_ONE(connp->conn_sqp, mp,
tcp_timewait_output, connp, NULL,
SQ_FILL, SQTAG_TCP_TIMEWAIT);
}
mutex_enter(&tcp_time_wait->tcp_time_wait_lock);
}
if (tcp_time_wait->tcp_free_list != NULL)
tcp_time_wait->tcp_free_list->tcp_in_free_list = B_TRUE;
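/* Arm the next collector run for this squeue. */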
tcp_time_wait->tcp_time_wait_tid =
timeout_generic(CALLOUT_NORMAL, tcp_time_wait_collector, sqp,
TICK_TO_NSEC(TCP_TIME_WAIT_DELAY), CALLOUT_TCP_RESOLUTION,
CALLOUT_FLAG_ROUNDUP);
mutex_exit(&tcp_time_wait->tcp_time_wait_lock);
}
/*
* Reply to a client's T_CONN_RES TPI message. This function
* is used only for TLI/XTI listeners. Sockfs sends T_CONN_RES
* on the acceptor STREAM, and it is processed in tcp_accept_common().
* Read the block comment on top of tcp_input_listener().
*/
static void
tcp_tli_accept(tcp_t *listener, mblk_t *mp)
{
tcp_t *acceptor;
tcp_t *eager;
tcp_t *tcp;
struct T_conn_res *tcr;
t_uscalar_t acceptor_id;
t_scalar_t seqnum;
mblk_t *discon_mp = NULL;
mblk_t *ok_mp;
mblk_t *mp1;
tcp_stack_t *tcps = listener->tcp_tcps;
conn_t *econnp;
if ((mp->b_wptr - mp->b_rptr) < sizeof (*tcr)) {
tcp_err_ack(listener, mp, TPROTO, 0);
return;
}
tcr = (struct T_conn_res *)mp->b_rptr;
/*
* Under ILP32 the stream head points tcr->ACCEPTOR_id at the
* read side queue of the streams device underneath us i.e. the
* read side queue of 'ip'. Since we can't dereference QUEUE_ptr we
* look it up in the queue_hash. Under LP64 it sends down the
* minor_t of the accepting endpoint.
*
* Once the acceptor/eager are modified (in tcp_accept_swap) the
* fanout hash lock is held.
* This prevents any thread from entering the acceptor queue from
* below (since it has not been hard bound yet i.e. any inbound
* packets will arrive on the listener conn_t and
* go through the classifier).
* The CONN_INC_REF will prevent the acceptor from closing.
*
* XXX It is still possible for a tli application to send down data
* on the accepting stream while another thread calls t_accept.
* This should not be a problem for well-behaved applications since
* the T_OK_ACK is sent after the queue swapping is completed.
*
* If the accepting fd is the same as the listening fd, avoid
* queue hash lookup since that will return an eager listener in an
* already established state.
*/
acceptor_id = tcr->ACCEPTOR_id;
mutex_enter(&listener->tcp_eager_lock);
if (listener->tcp_acceptor_id == acceptor_id) {
eager = listener->tcp_eager_next_q;
/* only count how many T_CONN_INDs so don't count q0 */
if ((listener->tcp_conn_req_cnt_q != 1) ||
(eager->tcp_conn_req_seqnum != tcr->SEQ_number)) {
mutex_exit(&listener->tcp_eager_lock);
tcp_err_ack(listener, mp, TBADF, 0);
return;
}
if (listener->tcp_conn_req_cnt_q0 != 0) {
/* Throw away all the eagers on q0. */
tcp_eager_cleanup(listener, 1);
}
if (listener->tcp_syn_defense) {
listener->tcp_syn_defense = B_FALSE;
if (listener->tcp_ip_addr_cache != NULL) {
kmem_free(listener->tcp_ip_addr_cache,
IP_ADDR_CACHE_SIZE * sizeof (ipaddr_t));
listener->tcp_ip_addr_cache = NULL;
}
}
/*
* Transfer tcp_conn_req_max to the eager so that when
* a disconnect occurs we can revert the endpoint to the
* listen state.
*/
eager->tcp_conn_req_max = listener->tcp_conn_req_max;
ASSERT(listener->tcp_conn_req_cnt_q0 == 0);
/*
* Get a reference on the acceptor, just as the
* tcp_acceptor_hash_lookup() below would.
*/
acceptor = listener;
CONN_INC_REF(acceptor->tcp_connp);
} else {
acceptor = tcp_acceptor_hash_lookup(acceptor_id, tcps);
if (acceptor == NULL) {
if (listener->tcp_connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1,
SL_ERROR|SL_TRACE,
"tcp_accept: did not find acceptor 0x%x\n",
acceptor_id);
}
mutex_exit(&listener->tcp_eager_lock);
tcp_err_ack(listener, mp, TPROVMISMATCH, 0);
return;
}
/*
* Verify acceptor state. The acceptable states for an acceptor
* include TCPS_IDLE and TCPS_BOUND.
*/
switch (acceptor->tcp_state) {
case TCPS_IDLE:
/* FALLTHRU */
case TCPS_BOUND:
break;
default:
CONN_DEC_REF(acceptor->tcp_connp);
mutex_exit(&listener->tcp_eager_lock);
tcp_err_ack(listener, mp, TOUTSTATE, 0);
return;
}
}
/* The listener must be in TCPS_LISTEN */
if (listener->tcp_state != TCPS_LISTEN) {
CONN_DEC_REF(acceptor->tcp_connp);
mutex_exit(&listener->tcp_eager_lock);
tcp_err_ack(listener, mp, TOUTSTATE, 0);
return;
}
/*
* Rendezvous with an eager connection request packet hanging off
* 'tcp' that has the 'seqnum' tag. We tagged the detached open
* tcp structure when the connection packet arrived in
* tcp_input_listener().
*/
seqnum = tcr->SEQ_number;
eager = listener;
do {
eager = eager->tcp_eager_next_q;
if (eager == NULL) {
CONN_DEC_REF(acceptor->tcp_connp);
mutex_exit(&listener->tcp_eager_lock);
tcp_err_ack(listener, mp, TBADSEQ, 0);
return;
}
} while (eager->tcp_conn_req_seqnum != seqnum);
mutex_exit(&listener->tcp_eager_lock);
/*
* At this point, both acceptor and listener have the 2 refs
* that they begin with. The acceptor has one additional ref
* that we placed in the lookup, while the listener has 3
* additional refs: for being behind the squeue (tcp_accept()
* is done on the listener's squeue), for being in the
* classifier hash, and for the eager's ref on the listener.
*/
ASSERT(listener->tcp_connp->conn_ref >= 5);
ASSERT(acceptor->tcp_connp->conn_ref >= 3);
/*
* The eager at this point is set in its own squeue and
* could easily have been killed (tcp_accept_finish will
* deal with that) because of a TH_RST so we can only
* ASSERT for a single ref.
*/
ASSERT(eager->tcp_connp->conn_ref >= 1);
/*
* Preallocate the discon_ind mblk as well. tcp_accept_finish will
* use it if something fails.
*/
discon_mp = allocb(MAX(sizeof (struct T_discon_ind),
sizeof (struct stroptions)), BPRI_HI);
if (discon_mp == NULL) {
CONN_DEC_REF(acceptor->tcp_connp);
CONN_DEC_REF(eager->tcp_connp);
tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
return;
}
econnp = eager->tcp_connp;
/* Hold a copy of mp, in case reallocb fails */
if ((mp1 = copymsg(mp)) == NULL) {
CONN_DEC_REF(acceptor->tcp_connp);
CONN_DEC_REF(eager->tcp_connp);
freemsg(discon_mp);
tcp_err_ack(listener, mp, TSYSERR, ENOMEM);
return;
}
tcr = (struct T_conn_res *)mp1->b_rptr;
/*
* This is an expanded version of mi_tpi_ok_ack_alloc()
* which allocates a larger mblk and appends the new
* local address to the ok_ack. The address is copied by
* soaccept() for getsockname().
*/
{
int extra;
extra = (econnp->conn_family == AF_INET) ?
sizeof (sin_t) : sizeof (sin6_t);
/*
* Try to re-use mp, if possible. Otherwise, allocate
* an mblk and return it as ok_mp. In any case, mp
* is no longer usable upon return.
*/
if ((ok_mp = mi_tpi_ok_ack_alloc_extra(mp, extra)) == NULL) {
CONN_DEC_REF(acceptor->tcp_connp);
CONN_DEC_REF(eager->tcp_connp);
freemsg(discon_mp);
/* Original mp has been freed by now, so use mp1 */
tcp_err_ack(listener, mp1, TSYSERR, ENOMEM);
return;
}
mp = NULL; /* We should never use mp after this point */
switch (extra) {
case sizeof (sin_t): {
sin_t *sin = (sin_t *)ok_mp->b_wptr;
ok_mp->b_wptr += extra;
sin->sin_family = AF_INET;
sin->sin_port = econnp->conn_lport;
sin->sin_addr.s_addr = econnp->conn_laddr_v4;
break;
}
case sizeof (sin6_t): {
sin6_t *sin6 = (sin6_t *)ok_mp->b_wptr;
ok_mp->b_wptr += extra;
sin6->sin6_family = AF_INET6;
sin6->sin6_port = econnp->conn_lport;
sin6->sin6_addr = econnp->conn_laddr_v6;
sin6->sin6_flowinfo = econnp->conn_flowinfo;
if (IN6_IS_ADDR_LINKSCOPE(&econnp->conn_laddr_v6) &&
(econnp->conn_ixa->ixa_flags & IXAF_SCOPEID_SET)) {
sin6->sin6_scope_id =
econnp->conn_ixa->ixa_scopeid;
} else {
sin6->sin6_scope_id = 0;
}
sin6->__sin6_src_id = 0;
break;
}
default:
break;
}
ASSERT(ok_mp->b_wptr <= ok_mp->b_datap->db_lim);
}
/*
* If there are no options we know that the T_CONN_RES will
* succeed. However, we can't send the T_OK_ACK upstream until
* the tcp_accept_swap is done since it would be dangerous to
* let the application start using the new fd prior to the swap.
*/
tcp_accept_swap(listener, acceptor, eager);
/*
* tcp_accept_swap unlinks eager from listener but does not drop
* the eager's reference on the listener.
*/
ASSERT(eager->tcp_listener == NULL);
ASSERT(listener->tcp_connp->conn_ref >= 5);
/*
* The eager is now associated with its own queue. Insert in
* the hash so that the connection can be reused for a future
* T_CONN_RES.
*/
tcp_acceptor_hash_insert(acceptor_id, eager);
/*
* We now do the processing of options with T_CONN_RES.
* We delay till now since we wanted to have a queue to pass to
* the option processing routines that points back to the right
* instance structure, which does not happen until after
* tcp_accept_swap().
*
* Note:
* The sanity of the logic here assumes that whatever options
* are appropriate to inherit from listener=>eager are done
* before this point, and whatever were to be overridden (or not)
* in transfer logic from eager=>acceptor in tcp_accept_swap().
* [ Warning: acceptor endpoint can have T_OPTMGMT_REQ done to it
* before its ACCEPTOR_id comes down in T_CONN_RES ]
* This may not be true at this point in time but can be fixed
* independently. This option processing code starts with
* the instantiated acceptor instance and the final queue at
* this point.
*/
if (tcr->OPT_length != 0) {
/* Options to process */
int t_error = 0;
int sys_error = 0;
int do_disconnect = 0;
if (tcp_conprim_opt_process(eager, mp1,
&do_disconnect, &t_error, &sys_error) < 0) {
eager->tcp_accept_error = 1;
if (do_disconnect) {
/*
* An option failed which does not allow
* connection to be accepted.
*
* We allow T_CONN_RES to succeed and
* put a T_DISCON_IND on the eager queue.
*/
ASSERT(t_error == 0 && sys_error == 0);
eager->tcp_send_discon_ind = 1;
} else {
ASSERT(t_error != 0);
freemsg(ok_mp);
/*
* Original mp was either freed or set
* to ok_mp above, so use mp1 instead.
*/
tcp_err_ack(listener, mp1, t_error, sys_error);
goto finish;
}
}
/*
* The options were most likely set successfully (unless
* eager->tcp_send_discon_ind is set).
* The mp1 option buffer described by OPT_length/OPT_offset may
* have been modified and at this point contains the results of
* the option processing.
*/
}
/* We no longer need mp1, since all option processing is done */
freemsg(mp1);
putnext(listener->tcp_connp->conn_rq, ok_mp);
mutex_enter(&listener->tcp_eager_lock);
if (listener->tcp_eager_prev_q0->tcp_conn_def_q0) {
tcp_t *tail;
mblk_t *conn_ind;
/*
* This path should not be executed if listener and
* acceptor streams are the same.
*/
ASSERT(listener != acceptor);
tcp = listener->tcp_eager_prev_q0;
/*
* listener->tcp_eager_prev_q0 points to the TAIL of the
* deferred T_conn_ind queue. We need to get to the head of
* the queue in order to send up T_conn_ind in the same order
* in which the 3WHS completed.
*/
while (tcp != listener) {
if (!tcp->tcp_eager_prev_q0->tcp_conn_def_q0)
break;
else
tcp = tcp->tcp_eager_prev_q0;
}
ASSERT(tcp != listener);
conn_ind = tcp->tcp_conn.tcp_eager_conn_ind;
ASSERT(conn_ind != NULL);
tcp->tcp_conn.tcp_eager_conn_ind = NULL;
/* Move from q0 to q */
ASSERT(listener->tcp_conn_req_cnt_q0 > 0);
listener->tcp_conn_req_cnt_q0--;
listener->tcp_conn_req_cnt_q++;
tcp->tcp_eager_next_q0->tcp_eager_prev_q0 =
tcp->tcp_eager_prev_q0;
tcp->tcp_eager_prev_q0->tcp_eager_next_q0 =
tcp->tcp_eager_next_q0;
tcp->tcp_eager_prev_q0 = NULL;
tcp->tcp_eager_next_q0 = NULL;
tcp->tcp_conn_def_q0 = B_FALSE;
/* Make sure the tcp isn't in the list of droppables */
ASSERT(tcp->tcp_eager_next_drop_q0 == NULL &&
tcp->tcp_eager_prev_drop_q0 == NULL);
/*
* Insert at end of the queue because sockfs sends
* down T_CONN_RES in chronological order. Leaving
* the older conn indications at the front of the queue
* helps reduce search time.
*/
tail = listener->tcp_eager_last_q;
if (tail != NULL)
tail->tcp_eager_next_q = tcp;
else
listener->tcp_eager_next_q = tcp;
listener->tcp_eager_last_q = tcp;
tcp->tcp_eager_next_q = NULL;
mutex_exit(&listener->tcp_eager_lock);
putnext(tcp->tcp_connp->conn_rq, conn_ind);
} else {
mutex_exit(&listener->tcp_eager_lock);
}
/*
* Done with the acceptor - free it
*
* Note: from this point on, no access to listener should be made
* as listener can be equal to acceptor.
*/
finish:
ASSERT(acceptor->tcp_detached);
acceptor->tcp_connp->conn_rq = NULL;
ASSERT(!IPCL_IS_NONSTR(acceptor->tcp_connp));
acceptor->tcp_connp->conn_wq = NULL;
(void) tcp_clean_death(acceptor, 0, 2);
CONN_DEC_REF(acceptor->tcp_connp);
/*
* We pass discon_mp to tcp_accept_finish to get on the right squeue.
*
* It will update the setting for sockfs/stream head and also take
* care of any data that arrived before accept() was called.
* In case we already received a FIN, tcp_accept_finish will send up
* the ordrel. It will also send up a window update if the window
* has opened up.
*/
/*
* XXX: we currently have a problem if an XTI application closes the
* acceptor stream in between. This problem also exists in on10-gate
* and is well known, but nothing can be done short of a major rewrite
* to fix it. It would be possible to take care of it by assigning the
* TLI/XTI eager the same squeue as the listener (we can distinguish
* non-socket listeners at the time of handling a SYN in
* tcp_input_listener), doing most of the work that tcp_accept_finish
* does right here, and then getting behind the acceptor squeue to
* access the acceptor queue.
*/
/*
* We already have a ref on tcp so no need to do one before squeue_enter
*/
SQUEUE_ENTER_ONE(eager->tcp_connp->conn_sqp, discon_mp,
tcp_accept_finish, eager->tcp_connp, NULL, SQ_FILL,
SQTAG_TCP_ACCEPT_FINISH);
}
/*
* Swap information between the eager and acceptor for a TLI/XTI client.
* The sockfs accept is done on the acceptor stream and control goes
* through tcp_tli_accept() and tcp_accept()/tcp_accept_swap() is not
* called. In either case, both the eager and listener are in their own
* perimeter (squeue) and the code has to deal with potential race.
*
* See the block comment on top of tcp_accept() and tcp_tli_accept().
*/
static void
tcp_accept_swap(tcp_t *listener, tcp_t *acceptor, tcp_t *eager)
{
conn_t *econnp, *aconnp;
ASSERT(eager->tcp_connp->conn_rq == listener->tcp_connp->conn_rq);
ASSERT(eager->tcp_detached && !acceptor->tcp_detached);
ASSERT(!TCP_IS_SOCKET(acceptor));
ASSERT(!TCP_IS_SOCKET(eager));
ASSERT(!TCP_IS_SOCKET(listener));
/*
* Trusted Extensions may need to use a security label that is
* different from the acceptor's label on MLP and MAC-Exempt
* sockets. If this is the case, the required security label
* already exists in econnp->conn_ixa->ixa_tsl. Since we make the
* acceptor stream refer to econnp we automatically get that label.
*/
acceptor->tcp_detached = B_TRUE;
/*
* To permit stream re-use by TLI/XTI, the eager needs a copy of
* the acceptor id.
*/
eager->tcp_acceptor_id = acceptor->tcp_acceptor_id;
/* remove eager from listen list... */
mutex_enter(&listener->tcp_eager_lock);
tcp_eager_unlink(eager);
ASSERT(eager->tcp_eager_next_q == NULL &&
eager->tcp_eager_last_q == NULL);
ASSERT(eager->tcp_eager_next_q0 == NULL &&
eager->tcp_eager_prev_q0 == NULL);
mutex_exit(&listener->tcp_eager_lock);
econnp = eager->tcp_connp;
aconnp = acceptor->tcp_connp;
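/*
* The eager takes over the acceptor's stream queues; from here on the
* queues' q_ptr fields point at the eager's conn_t.
*/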
econnp->conn_rq = aconnp->conn_rq;
econnp->conn_wq = aconnp->conn_wq;
econnp->conn_rq->q_ptr = econnp;
econnp->conn_wq->q_ptr = econnp;
/*
* In the TLI/XTI loopback case, we are inside the listener's squeue,
* which might be a different squeue from our peer TCP instance.
* For TCP Fusion, the peer expects that whenever tcp_detached is
* clear, our TCP queues point to the acceptor's queues. Thus, use
* membar_producer() to ensure that the assignments of conn_rq/conn_wq
* above reach global visibility prior to the clearing of tcp_detached.
*/
membar_producer();
eager->tcp_detached = B_FALSE;
ASSERT(eager->tcp_ack_tid == 0);
econnp->conn_dev = aconnp->conn_dev;
econnp->conn_minor_arena = aconnp->conn_minor_arena;
ASSERT(econnp->conn_minor_arena != NULL);
if (econnp->conn_cred != NULL)
crfree(econnp->conn_cred);
econnp->conn_cred = aconnp->conn_cred;
aconnp->conn_cred = NULL;
econnp->conn_cpid = aconnp->conn_cpid;
ASSERT(econnp->conn_netstack == aconnp->conn_netstack);
ASSERT(eager->tcp_tcps == acceptor->tcp_tcps);
econnp->conn_zoneid = aconnp->conn_zoneid;
econnp->conn_allzones = aconnp->conn_allzones;
econnp->conn_ixa->ixa_zoneid = aconnp->conn_ixa->ixa_zoneid;
econnp->conn_mac_mode = aconnp->conn_mac_mode;
econnp->conn_zone_is_global = aconnp->conn_zone_is_global;
aconnp->conn_mac_mode = CONN_MAC_DEFAULT;
/* Do the IPC initialization */
CONN_INC_REF(econnp);
econnp->conn_family = aconnp->conn_family;
econnp->conn_ipversion = aconnp->conn_ipversion;
/* Done with old IPC. Drop its ref on its connp */
CONN_DEC_REF(aconnp);
}
/*
* Adapt to the information, such as rtt and rtt_sd, provided from the
* DCE and IRE maintained by IP.
*
* Checks for multicast and broadcast destination address.
* Returns zero if ok; an errno on failure.
*
* Note that the MSS calculation here is based on the info given in
* the DCE and IRE. We do not do any calculation based on TCP options. They
* will be handled in tcp_input_data() when TCP knows which options to use.
*
* Note on how TCP gets its parameters for a connection.
*
* When a tcp_t structure is allocated, it gets all the default parameters.
* In tcp_set_destination(), it gets those metric parameters, like rtt, rtt_sd,
* spipe, rpipe, ... from the route metrics. Route metric overrides the
* default.
*
* An incoming SYN with a multicast or broadcast destination address is dropped
* in ip_fanout_v4/v6.
*
* An incoming SYN with a multicast or broadcast source address is always
* dropped in tcp_set_destination, since IPDF_ALLOW_MCBC is not set in
* conn_connect.
* The same logic in tcp_set_destination also serves to
* reject an attempt to connect to a broadcast or multicast (destination)
* address.
*/
static int
tcp_set_destination(tcp_t *tcp)
{
uint32_t mss_max;
uint32_t mss;
boolean_t tcp_detached = TCP_IS_DETACHED(tcp);
conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
iulp_t uinfo;
int error;
uint32_t flags;
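/*
* IPDF_LSO and IPDF_ZCOPY ask IP to report, while resolving the
* destination, whether LSO and zero-copy can be used on this path.
*/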
flags = IPDF_LSO | IPDF_ZCOPY;
/*
* Make sure we have a dce for the destination to avoid dce_ident
* contention for connected sockets.
*/
flags |= IPDF_UNIQUE_DCE;
if (!tcps->tcps_ignore_path_mtu)
connp->conn_ixa->ixa_flags |= IXAF_PMTU_DISCOVERY;
/* Use conn_lock to satisfy ASSERT; tcp is already serialized */
mutex_enter(&connp->conn_lock);
error = conn_connect(connp, &uinfo, flags);
mutex_exit(&connp->conn_lock);
if (error != 0)
return (error);
error = tcp_build_hdrs(tcp);
if (error != 0)
return (error);
tcp->tcp_localnet = uinfo.iulp_localnet;
if (uinfo.iulp_rtt != 0) {
clock_t rto;
tcp->tcp_rtt_sa = uinfo.iulp_rtt;
tcp->tcp_rtt_sd = uinfo.iulp_rtt_sd;
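/*
* tcp_rtt_sa and tcp_rtt_sd are kept in scaled fixed-point form
* (tcp_rtt_sa is roughly 8 times the smoothed RTT), so the
* expression below approximates the classic
* SRTT + 4 * mean deviation, plus the configured extra interval
* and a small additional fraction of SRTT.
*/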
rto = (tcp->tcp_rtt_sa >> 3) + tcp->tcp_rtt_sd +
tcps->tcps_rexmit_interval_extra +
(tcp->tcp_rtt_sa >> 5);
if (rto > tcps->tcps_rexmit_interval_max) {
tcp->tcp_rto = tcps->tcps_rexmit_interval_max;
} else if (rto < tcps->tcps_rexmit_interval_min) {
tcp->tcp_rto = tcps->tcps_rexmit_interval_min;
} else {
tcp->tcp_rto = rto;
}
}
if (uinfo.iulp_ssthresh != 0)
tcp->tcp_cwnd_ssthresh = uinfo.iulp_ssthresh;
else
tcp->tcp_cwnd_ssthresh = TCP_MAX_LARGEWIN;
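/*
* A send-pipe metric overrides the default send buffer size, capped
* at tcps_max_buf.
*/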
if (uinfo.iulp_spipe > 0) {
connp->conn_sndbuf = MIN(uinfo.iulp_spipe,
tcps->tcps_max_buf);
if (tcps->tcps_snd_lowat_fraction != 0) {
connp->conn_sndlowat = connp->conn_sndbuf /
tcps->tcps_snd_lowat_fraction;
}
(void) tcp_maxpsz_set(tcp, B_TRUE);
}
/*
* Note that up till now, the acceptor always inherits the receive
* window from the listener. But if there is a metric
* associated with the host, we should use that instead of
* inheriting it from the listener. Thus we need to pass this
* info back to the caller.
*/
if (uinfo.iulp_rpipe > 0) {
tcp->tcp_rwnd = MIN(uinfo.iulp_rpipe,
tcps->tcps_max_buf);
}
if (uinfo.iulp_rtomax > 0) {
tcp->tcp_second_timer_threshold =
uinfo.iulp_rtomax;
}
/*
* Use the metric option settings, iulp_tstamp_ok and
* iulp_wscale_ok, only for active open. What this means
* is that if the other side uses timestamp or window
* scale option, TCP will also use those options. That
* is for passive open. If the application sets a
* large window, window scale is enabled regardless of
* the value in iulp_wscale_ok. This is the behavior
* since 2.6. So we keep it.
* The only case left in passive open processing is the
* check for SACK.
* For ECN, it should probably be like SACK. But the
* current value is binary, so we treat it like the other
* cases. The metric only controls active open. For passive
* open, the ndd param, tcp_ecn_permitted, controls the
* behavior.
*/
if (!tcp_detached) {
/*
* The if check means that the following can only
* be turned on by the metrics only IRE, but not off.
*/
if (uinfo.iulp_tstamp_ok)
tcp->tcp_snd_ts_ok = B_TRUE;
if (uinfo.iulp_wscale_ok)
tcp->tcp_snd_ws_ok = B_TRUE;
if (uinfo.iulp_sack == 2)
tcp->tcp_snd_sack_ok = B_TRUE;
if (uinfo.iulp_ecn_ok)
tcp->tcp_ecn_ok = B_TRUE;
} else {
/*
* Passive open.
*
* As above, the if check means that SACK can only be
* turned on by the metric only IRE.
*/
if (uinfo.iulp_sack > 0) {
tcp->tcp_snd_sack_ok = B_TRUE;
}
}
/*
* XXX Note that currently, iulp_mtu can be as small as 68
* because of PMTUd. So tcp_mss may go negative if the combined
* length of all those options exceeds 28 bytes. But because
* of the tcp_mss_min check below, we may not have a problem if
* tcp_mss_min is of a reasonable value. The default is 1, so
* the negative problem still exists. And the check defeats PMTUd.
* In fact, if PMTUd finds that the MSS should be smaller than
* tcp_mss_min, TCP should turn off PMTUd and use the tcp_mss_min
* value.
*
* We do not deal with that now. All those problems related to
* PMTUd will be fixed later.
*/
ASSERT(uinfo.iulp_mtu != 0);
mss = tcp->tcp_initial_pmtu = uinfo.iulp_mtu;
/* Sanity check for MSS value. */
if (connp->conn_ipversion == IPV4_VERSION)
mss_max = tcps->tcps_mss_max_ipv4;
else
mss_max = tcps->tcps_mss_max_ipv6;
if (tcp->tcp_ipsec_overhead == 0)
tcp->tcp_ipsec_overhead = conn_ipsec_length(connp);
mss -= tcp->tcp_ipsec_overhead;
if (mss < tcps->tcps_mss_min)
mss = tcps->tcps_mss_min;
if (mss > mss_max)
mss = mss_max;
/* Note that this is the maximum MSS, excluding all options. */
tcp->tcp_mss = mss;
/*
* Update the tcp connection with LSO capability.
*/
tcp_update_lso(tcp, connp->conn_ixa);
/*
* Initialize the ISS here now that we have the full connection ID.
* The RFC 1948 method of initial sequence number generation requires
* knowledge of the full connection ID before setting the ISS.
*/
tcp_iss_init(tcp);
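/*
* A destination that is either loopback or local to this node is
* treated as loopback.
*/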
tcp->tcp_loopback = (uinfo.iulp_loopback | uinfo.iulp_local);
/*
* Make sure that the conn is not marked incipient
* for incoming connections. A blind
* removal of the incipient flag is cheaper than
* checking it first and then removing it.
*/
mutex_enter(&connp->conn_lock);
connp->conn_state_flags &= ~CONN_INCIPIENT;
mutex_exit(&connp->conn_lock);
return (0);
}
static void
tcp_tpi_bind(tcp_t *tcp, mblk_t *mp)
{
int error;
conn_t *connp = tcp->tcp_connp;
struct sockaddr *sa;
mblk_t *mp1;
struct T_bind_req *tbr;
int backlog;
socklen_t len;
sin_t *sin;
sin6_t *sin6;
cred_t *cr;
/*
* All Solaris components should pass a db_credp
* for this TPI message, hence we ASSERT.
* But in case there is some other M_PROTO that looks
* like a TPI message sent by some other kernel
* component, we check and return an error.
*/
cr = msg_getcred(mp, NULL);
ASSERT(cr != NULL);
if (cr == NULL) {
tcp_err_ack(tcp, mp, TSYSERR, EINVAL);
return;
}
ASSERT((uintptr_t)(mp->b_wptr - mp->b_rptr) <= (uintptr_t)INT_MAX);
if ((mp->b_wptr - mp->b_rptr) < sizeof (*tbr)) {
if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_tpi_bind: bad req, len %u",
(uint_t)(mp->b_wptr - mp->b_rptr));
}
tcp_err_ack(tcp, mp, TPROTO, 0);
return;
}
/* Make sure the largest address fits */
mp1 = reallocb(mp, sizeof (struct T_bind_ack) + sizeof (sin6_t), 1);
if (mp1 == NULL) {
tcp_err_ack(tcp, mp, TSYSERR, ENOMEM);
return;
}
mp = mp1;
tbr = (struct T_bind_req *)mp->b_rptr;
backlog = tbr->CONIND_number;
len = tbr->ADDR_length;
switch (len) {
case 0: /* request for a generic port */
tbr->ADDR_offset = sizeof (struct T_bind_req);
if (connp->conn_family == AF_INET) {
tbr->ADDR_length = sizeof (sin_t);
sin = (sin_t *)&tbr[1];
*sin = sin_null;
sin->sin_family = AF_INET;
sa = (struct sockaddr *)sin;
len = sizeof (sin_t);
mp->b_wptr = (uchar_t *)&sin[1];
} else {
ASSERT(connp->conn_family == AF_INET6);
tbr->ADDR_length = sizeof (sin6_t);
sin6 = (sin6_t *)&tbr[1];
*sin6 = sin6_null;
sin6->sin6_family = AF_INET6;
sa = (struct sockaddr *)sin6;
len = sizeof (sin6_t);
mp->b_wptr = (uchar_t *)&sin6[1];
}
break;
case sizeof (sin_t): /* Complete IPv4 address */
sa = (struct sockaddr *)mi_offset_param(mp, tbr->ADDR_offset,
sizeof (sin_t));
break;
case sizeof (sin6_t): /* Complete IPv6 address */
sa = (struct sockaddr *)mi_offset_param(mp,
tbr->ADDR_offset, sizeof (sin6_t));
break;
default:
if (connp->conn_debug) {
(void) strlog(TCP_MOD_ID, 0, 1, SL_ERROR|SL_TRACE,
"tcp_tpi_bind: bad address length, %d",
tbr->ADDR_length);
}
tcp_err_ack(tcp, mp, TBADADDR, 0);
return;
}
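/* A non-zero CONIND_number means the caller wants a listener. */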
if (backlog > 0) {
error = tcp_do_listen(connp, sa, len, backlog, DB_CRED(mp),
tbr->PRIM_type != O_T_BIND_REQ);
} else {
error = tcp_do_bind(connp, sa, len, DB_CRED(mp),
tbr->PRIM_type != O_T_BIND_REQ);
}
done:
if (error > 0) {
tcp_err_ack(tcp, mp, TSYSERR, error);
} else if (error < 0) {
tcp_err_ack(tcp, mp, -error, 0);
} else {
/*
* Update port information as sockfs/tpi needs it for checking
*/
if (connp->conn_family == AF_INET) {
sin = (sin_t *)sa;
sin->sin_port = connp->conn_lport;
} else {
sin6 = (sin6_t *)sa;
sin6->sin6_port = connp->conn_lport;
}
mp->b_datap->db_type = M_PCPROTO;
tbr->PRIM_type = T_BIND_ACK;
putnext(connp->conn_rq, mp);
}
}
/*
* If the "bind_to_req_port_only" parameter is set, if the requested port
* number is available, return it, If not return 0
*
* If "bind_to_req_port_only" parameter is not set and
* If the requested port number is available, return it. If not, return
* the first anonymous port we happen across. If no anonymous ports are
* available, return 0. addr is the requested local address, if any.
*
* In either case, on success update the tcp_t to record the port number
* and insert it in the bind hash table.
*
* Note that TCP over IPv4 and IPv6 sockets can use the same port number
* without setting SO_REUSEADDR. This is needed so that they
* can be viewed as two independent transport protocols.
*/
static in_port_t
tcp_bindi(tcp_t *tcp, in_port_t port, const in6_addr_t *laddr,
int reuseaddr, boolean_t quick_connect,
boolean_t bind_to_req_port_only, boolean_t user_specified)
{
/* number of times we have run around the loop */
int count = 0;
/* maximum number of times to run around the loop */
int loopmax;
conn_t *connp = tcp->tcp_connp;
tcp_stack_t *tcps = tcp->tcp_tcps;
/*
* The lookup for a free port is done in a loop, and "loopmax"
* bounds how long we spin in the loop.
*/
if (bind_to_req_port_only) {
/*
* If the requested port is busy, don't bother to look
* for a new one. Setting loop maximum count to 1 has
* that effect.
*/
loopmax = 1;
} else {
/*
* If the requested port is busy, look for a free one
* in the anonymous port range.
* Set loopmax appropriately so that we do not search
* forever in the case that all of the anonymous ports are in use.
*/
if (connp->conn_anon_priv_bind) {
/*
* loopmax =
* (IPPORT_RESERVED-1) - tcp_min_anonpriv_port + 1
*/
loopmax = IPPORT_RESERVED -
tcps->tcps_min_anonpriv_port;
} else {
loopmax = (tcps->tcps_largest_anon_port -
tcps->tcps_smallest_anon_port + 1);
}
}
do {
uint16_t lport;
tf_t *tbf;
tcp_t *ltcp;
conn_t *lconnp;
lport = htons(port);
/*
* Ensure that the tcp_t is not currently in the bind hash.
* Hold the lock on the hash bucket to ensure that
* the duplicate check plus the insertion is an atomic
* operation.
*
* This function does an inline lookup on the bind hash list.
* Make sure that we access only members of tcp_t
* and that we don't look at tcp_tcp, since we are not
* doing a CONN_INC_REF.
*/
tcp_bind_hash_remove(tcp);
tbf = &tcps->tcps_bind_fanout[TCP_BIND_HASH(lport)];
mutex_enter(&tbf->tf_lock);
for (ltcp = tbf->tf_tcp; ltcp != NULL;
ltcp = ltcp->tcp_bind_hash) {
if (lport == ltcp->tcp_connp->conn_lport)
break;
}
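/*
* ltcp, if non-NULL, is the first endpoint bound to this port; walk
* the per-port chain checking each existing binding for a conflict.
*/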
for (; ltcp != NULL; ltcp = ltcp->tcp_bind_hash_port) {
boolean_t not_socket;
boolean_t exclbind;
lconnp = ltcp->tcp_connp;